--- //depot/vendor/freebsd/src/sys/conf/files 2005/07/19 02:10:35 +++ //depot/user/rwatson/netperf/sys/conf/files 2005/07/19 11:47:08 @@ -1239,6 +1239,7 @@ kern/subr_kobj.c standard kern/subr_log.c standard kern/subr_mbpool.c optional libmbpool +kern/subr_mbufqueue.c standard kern/subr_mchain.c optional libmchain kern/subr_module.c standard kern/subr_msgbuf.c standard @@ -1784,6 +1785,7 @@ security/mac_seeotheruids/mac_seeotheruids.c optional mac_seeotheruids security/mac_stub/mac_stub.c optional mac_stub security/mac_test/mac_test.c optional mac_test +test/test_synch_timing.c standard ufs/ffs/ffs_alloc.c optional ffs ufs/ffs/ffs_balloc.c optional ffs ufs/ffs/ffs_inode.c optional ffs --- //depot/vendor/freebsd/src/sys/dev/aic7xxx/aicasm/Makefile 2004/12/21 08:50:46 +++ //depot/user/rwatson/netperf/sys/dev/aic7xxx/aicasm/Makefile 2005/01/01 21:09:05 @@ -34,6 +34,7 @@ CFLAGS+= -I${MAKESRCPATH} .endif NO_MAN= +NOMAN= YFLAGS= -b ${.TARGET:R} ${.TARGET:M*macro*:S/$(.TARGET)/-p mm/} -d LFLAGS+= ${.TARGET:M*macro*:S/$(.TARGET)/-Pmm/} --- //depot/vendor/freebsd/src/sys/dev/em/if_em.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/dev/em/if_em.c 2005/07/05 14:55:33 @@ -584,7 +584,64 @@ return(0); } +/* + * Alternative transmit entry point: accept an mbuf as well as a network + * interface, avoiding queuing overhead if possible. Several paths, depending + * on whether other packets are already queued or not. We must either hand + * off or free the mbuf, as the caller no longer owns it. + * + * XXXRW: For now, simply replicate if_handoff(), avoiding two mutex + * operations, but eventually we want a fast path here with no ifq mutex + * operations for when the ifq is full. I left it with one pair for now to + * maintain consistency on the if send counters. + */ +static int +em_start_mbuf(struct ifnet *ifp, struct mbuf *m, int adjust) +{ + struct adapter *adapter = ifp->if_softc; + struct ifqueue *ifq; + + /* + * First, do a lot of work to make it look as though the packet was + * queued by if_handoff(), stats, etc. + */ + EM_LOCK(adapter); + ifq = (struct ifqueue *)&ifp->if_snd; + IF_LOCK(ifq); + if (_IF_QFULL(ifq)) { + _IF_DROP(ifq); + IF_UNLOCK(ifq); + EM_UNLOCK(adapter); + m_freem(m); + return (0); + } + ifp->if_obytes += m->m_pkthdr.len + adjust; + if (m->m_flags & (M_BCAST|M_MCAST)) + ifp->if_omcasts++; + /* + * Now we see if the queue is empty, in which case we can begin a + * direct dispatch of our packet. Otherwise, we enqueue it as + * if_handoff() would have. NOTE: Not ALTQ-safe. + */ + if (IFQ_IS_EMPTY(ifq)) { + if (em_encap(adapter, &m)) + goto enqueue; + ifp->if_flags |= IFF_OACTIVE; + BPF_MTAP(ifp, m); + ifp->if_timer = EM_TX_TIMEOUT; + IF_UNLOCK(ifq); + EM_UNLOCK(adapter); + return (1); + } +enqueue: + if (m != NULL) + _IF_ENQUEUE(ifq, m); + IF_UNLOCK(ifq); + EM_UNLOCK(adapter); + return (1); +} + /********************************************************************* * Transmit entry point * @@ -1940,6 +1997,7 @@ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = em_ioctl; ifp->if_start = em_start; + ifp->if_start_mbuf = em_start_mbuf; ifp->if_watchdog = em_watchdog; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1; --- //depot/vendor/freebsd/src/sys/dev/fdc/fdc_isa.c 2005/03/15 08:07:19 +++ //depot/user/rwatson/netperf/sys/dev/fdc/fdc_isa.c 2005/06/01 10:10:04 @@ -78,15 +78,14 @@ fdc_isa_alloc_resources(device_t dev, struct fdc_data *fdc) { struct resource *res; - int i, j, rid, newrid, nport; - u_long port; + int nports = 6; + int i, j, rid, newrid; fdc->fdc_dev = dev; rid = 0; for (i = 0; i < FDC_MAXREG; i++) fdc->resio[i] = NULL; - nport = isa_get_logicalid(dev) ? 1 : 6; for (rid = 0; ; rid++) { newrid = rid; res = bus_alloc_resource(dev, SYS_RES_IOPORT, &newrid, @@ -109,28 +108,15 @@ fdc->ioh[i + j] = rman_get_bushandle(res); } } - if (fdc->resio[2] == NULL) { - device_printf(dev, "No FDOUT register!\n"); + if (fdc->resio[2] == NULL) return (ENXIO); - } fdc->iot = rman_get_bustag(fdc->resio[2]); if (fdc->resio[7] == NULL) { - port = (rman_get_start(fdc->resio[2]) & ~0x7) + 7; - newrid = rid; - res = bus_alloc_resource(dev, SYS_RES_IOPORT, &newrid, port, - port, 1, RF_ACTIVE); - if (res == NULL) { - device_printf(dev, "Faking up FDCTL\n"); - fdc->resio[7] = fdc->resio[2]; - fdc->ridio[7] = fdc->ridio[2]; - fdc->ioff[7] = fdc->ioff[2] + 5; - fdc->ioh[7] = fdc->ioh[2]; - } else { - fdc->resio[7] = res; - fdc->ridio[7] = newrid; - fdc->ioff[7] = rman_get_start(res) & 7; - fdc->ioh[7] = rman_get_bushandle(res); - } + /* XXX allocate */ + fdc->resio[7] = fdc->resio[2]; + fdc->ridio[7] = fdc->ridio[2]; + fdc->ioff[7] = fdc->ioff[2] + 5; + fdc->ioh[7] = fdc->ioh[2]; } fdc->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &fdc->rid_irq, --- //depot/vendor/freebsd/src/sys/dev/random/randomdev_soft.c 2005/03/29 11:10:26 +++ //depot/user/rwatson/netperf/sys/dev/random/randomdev_soft.c 2005/06/01 10:10:04 @@ -53,7 +53,7 @@ #include #include -#define RANDOM_FIFO_MAX 256 /* How many events to queue up */ +#define RANDOM_FIFO_MAX 4 /* How many events to queue up */ static void random_kthread(void *); static void --- //depot/vendor/freebsd/src/sys/fs/portalfs/portal_vnops.c 2005/03/28 09:35:18 +++ //depot/user/rwatson/netperf/sys/fs/portalfs/portal_vnops.c 2005/06/01 10:10:04 @@ -188,6 +188,7 @@ unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); + /* XXXRW: Locking? */ if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) sodupsockaddr((struct sockaddr *)unp2->unp_addr, --- //depot/vendor/freebsd/src/sys/i386/conf/GENERIC 2005/07/14 15:40:25 +++ //depot/user/rwatson/netperf/sys/i386/conf/GENERIC 2005/07/19 11:47:08 @@ -71,6 +71,7 @@ options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +options BREAK_TO_DEBUGGER # To make an SMP kernel, the next two lines are needed options SMP # Symmetric MultiProcessor Kernel --- //depot/vendor/freebsd/src/sys/kern/kern_descrip.c 2005/06/25 03:35:21 +++ //depot/user/rwatson/netperf/sys/kern/kern_descrip.c 2005/07/05 14:55:33 @@ -1517,6 +1517,9 @@ /* * Release a filedesc structure. + * + * XXXRW: Requires Giant because it calls into VFS. Status of p_fdtol is + * unclear to me. */ void fdfree(struct thread *td) @@ -1656,10 +1659,16 @@ * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. + * + * XXXRW: Requires Giant due to check of v_vflag. Actually, this can + * probably become a lockless read. */ static int is_unsafe(struct file *fp) { + + GIANT_REQUIRED; + if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; @@ -1671,6 +1680,8 @@ /* * Make this setguid thing safe, if at all possible. + * + * XXXRW: MPSAFE, as knote_fdclose() and closef() are both now MPSAFE. */ void setugidsafety(struct thread *td) @@ -1728,6 +1739,8 @@ /* * Close any files on exec? + * + * XXXRW: Now MPSAFE, as knode_fdclosed() and closef() are now MPSAFE. */ void fdcloseexec(struct thread *td) @@ -1774,6 +1787,8 @@ * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. + * + * XXXRW: Requires Giant for VFS access. do_dup is MPSAFE. */ int fdcheckstd(struct thread *td) @@ -2000,6 +2015,7 @@ * but never have VM objects. The returned vnode will be vref()d. * * XXX: what about the unused flags ? + * XXXRW: _fgetvp() is not MPSAFE because of VFS access. */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) @@ -2049,6 +2065,8 @@ * * We bump the ref count on the returned socket. XXX Also obtain the SX * lock in the future. + * + * XXXRW: fgetsock() is MPSAFE. */ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) @@ -2103,6 +2121,9 @@ * Drop reference on struct file passed in, may call closef if the * reference hits zero. * Expects struct file locked, and will unlock it. + * + * XXXRW: fdrop_locked() is MPSAFE, as fo_close() will acquire Giant if it + * needs it. */ static int fdrop_locked(struct file *fp, struct thread *td) --- //depot/vendor/freebsd/src/sys/kern/kern_poll.c 2005/02/25 22:10:33 +++ //depot/user/rwatson/netperf/sys/kern/kern_poll.c 2005/02/27 16:23:38 @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include /* needed by net/if.h */ #include @@ -177,12 +179,15 @@ static struct pollrec pr[POLL_LIST_LEN]; +static struct mtx pollmtx; + static void init_device_poll(void) { - netisr_register(NETISR_POLL, (netisr_t *)netisr_poll, NULL, 0); - netisr_register(NETISR_POLLMORE, (netisr_t *)netisr_pollmore, NULL, 0); + mtx_init(&pollmtx, "pollmtx", NULL, MTX_DEF); + netisr_register(NETISR_POLL, (netisr_t *)netisr_poll, NULL, NETISR_MPSAFE); + netisr_register(NETISR_POLLMORE, (netisr_t *)netisr_pollmore, NULL, NETISR_MPSAFE); } SYSINIT(device_poll, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, init_device_poll, NULL) @@ -209,6 +214,7 @@ if (poll_handlers == 0) return; + mtx_lock(&pollmtx); microuptime(&t); delta = (t.tv_usec - prev_t.tv_usec) + (t.tv_sec - prev_t.tv_sec)*1000000; @@ -236,6 +242,7 @@ } if (pending_polls++ > 0) lost_polls++; + mtx_unlock(&pollmtx); } /* @@ -246,15 +253,14 @@ { int i; - mtx_lock(&Giant); - + mtx_lock(&pollmtx); if (count > poll_each_burst) count = poll_each_burst; for (i = 0 ; i < poll_handlers ; i++) if (pr[i].handler && (IFF_UP|IFF_RUNNING) == (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) ) pr[i].handler(pr[i].ifp, 0, count); /* quick check */ - mtx_unlock(&Giant); + mtx_unlock(&pollmtx); } /* @@ -280,8 +286,8 @@ { struct timeval t; int kern_load; - /* XXX run at splhigh() or equivalent */ + mtx_lock(&pollmtx); phase = 5; if (residual_burst > 0) { schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE); @@ -316,6 +322,7 @@ schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE); phase = 6; } + mtx_unlock(&pollmtx); } /* @@ -329,8 +336,8 @@ static int reg_frac_count; int i, cycles; enum poll_cmd arg = POLL_ONLY; - mtx_lock(&Giant); + mtx_lock(&pollmtx); phase = 3; if (residual_burst == 0) { /* first call in this tick */ microuptime(&poll_start_t); @@ -390,7 +397,7 @@ } /* on -stable, schednetisr(NETISR_POLLMORE); */ phase = 4; - mtx_unlock(&Giant); + mtx_unlock(&pollmtx); } /* @@ -404,7 +411,6 @@ int ether_poll_register(poll_handler_t *h, struct ifnet *ifp) { - int s; if (polling == 0) /* polling disabled, cannot register */ return 0; @@ -415,7 +421,7 @@ if (ifp->if_flags & IFF_POLLING) /* already polling */ return 0; - s = splhigh(); + mtx_lock(&pollmtx); if (poll_handlers >= POLL_LIST_LEN) { /* * List full, cannot register more entries. @@ -425,7 +431,7 @@ * anyways, so just report a few times and then give up. */ static int verbose = 10 ; - splx(s); + mtx_unlock(&pollmtx); if (verbose >0) { printf("poll handlers list full, " "maybe a broken driver ?\n"); @@ -438,9 +444,9 @@ pr[poll_handlers].ifp = ifp; poll_handlers++; ifp->if_flags |= IFF_POLLING; - splx(s); if (idlepoll_sleeping) wakeup(&idlepoll_sleeping); + mtx_unlock(&pollmtx); return 1; /* polling enabled in next call */ } @@ -455,9 +461,9 @@ { int i; - mtx_lock(&Giant); + mtx_lock(&pollmtx); if ( !ifp || !(ifp->if_flags & IFF_POLLING) ) { - mtx_unlock(&Giant); + mtx_unlock(&pollmtx); return 0; } for (i = 0 ; i < poll_handlers ; i++) @@ -465,7 +471,7 @@ break; ifp->if_flags &= ~IFF_POLLING; /* found or not... */ if (i == poll_handlers) { - mtx_unlock(&Giant); + mtx_unlock(&pollmtx); printf("ether_poll_deregister: ifp not found!!!\n"); return 0; } @@ -474,7 +480,7 @@ pr[i].handler = pr[poll_handlers].handler; pr[i].ifp = pr[poll_handlers].ifp; } - mtx_unlock(&Giant); + mtx_unlock(&pollmtx); return 1; } @@ -492,21 +498,24 @@ pri = td->td_priority; mtx_unlock_spin(&sched_lock); + mtx_lock(&pollmtx); for (;;) { if (poll_in_idle_loop && poll_handlers > 0) { idlepoll_sleeping = 0; - mtx_lock(&Giant); ether_poll(poll_each_burst); - mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); + mtx_unlock(&pollmtx); mtx_lock_spin(&sched_lock); mi_switch(SW_VOL, NULL); mtx_unlock_spin(&sched_lock); + mtx_lock(&pollmtx); } else { idlepoll_sleeping = 1; - tsleep(&idlepoll_sleeping, pri, "pollid", hz * 3); + msleep(&idlepoll_sleeping, &pollmtx, pri, "pollid", + hz * 3); } } + mtx_unlock(&pollmtx); } static struct proc *idlepoll; --- //depot/vendor/freebsd/src/sys/kern/subr_log.c 2005/02/27 22:05:29 +++ //depot/user/rwatson/netperf/sys/kern/subr_log.c 2005/03/12 17:40:56 @@ -81,7 +81,12 @@ struct callout sc_callout; /* callout to wakeup syslog */ } logsoftc; -int log_open; /* also used in log() */ +/* + * log_mtx protects logsoftc, log_open. Note that log_mtx does *not* + * protect the structures associated with msgbuf, which require Giant. + */ +struct mtx log_mtx; +int log_open; /* also used in log() */ /* Times per second to check for a pending syslog wakeup. */ static int log_wakeups_per_second = 5; @@ -92,17 +97,24 @@ static int logopen(struct cdev *dev, int flags, int mode, struct thread *td) { - if (log_open) + + mtx_lock(&log_mtx); + if (log_open) { + mtx_unlock(&log_mtx); return (EBUSY); + } log_open = 1; - callout_init(&logsoftc.sc_callout, 0); + callout_init(&logsoftc.sc_callout, CALLOUT_MPSAFE); + mtx_unlock(&log_mtx); fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */ + mtx_lock(&log_mtx); if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; } callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); return (0); } @@ -111,9 +123,11 @@ logclose(struct cdev *dev, int flag, int mode, struct thread *td) { + mtx_lock(&log_mtx); log_open = 0; callout_stop(&logsoftc.sc_callout); logsoftc.sc_state = 0; + mtx_unlock(&log_mtx); funsetown(&logsoftc.sc_sigio); return (0); } @@ -132,14 +146,18 @@ splx(s); return (EWOULDBLOCK); } + mtx_lock(&log_mtx); logsoftc.sc_state |= LOG_RDWAIT; + mtx_unlock(&log_mtx); if ((error = tsleep(mbp, LOG_RDPRI | PCATCH, "klog", 0))) { splx(s); return (error); } } splx(s); + mtx_lock(&log_mtx); logsoftc.sc_state &= ~LOG_RDWAIT; + mtx_unlock(&log_mtx); while (uio->uio_resid > 0) { l = imin(sizeof(buf), uio->uio_resid); @@ -176,8 +194,11 @@ logtimeout(void *arg) { - if (!log_open) + mtx_lock(&log_mtx); + if (!log_open) { + mtx_unlock(&log_mtx); return; + } if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; @@ -185,6 +206,7 @@ if (msgbuftrigger == 0) { callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); return; } msgbuftrigger = 0; @@ -197,6 +219,7 @@ } callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); } /*ARGSUSED*/ @@ -215,10 +238,12 @@ break; case FIOASYNC: + mtx_lock(&log_mtx); if (*(int *)data) logsoftc.sc_state |= LOG_ASYNC; else logsoftc.sc_state &= ~LOG_ASYNC; + mtx_unlock(&log_mtx); break; case FIOSETOWN: @@ -247,6 +272,7 @@ log_drvinit(void *unused) { + mtx_init(&log_mtx, "log_mtx", NULL, MTX_DEF); make_dev(&log_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "klog"); } --- //depot/vendor/freebsd/src/sys/kern/subr_prf.c 2005/06/06 22:21:14 +++ //depot/user/rwatson/netperf/sys/kern/subr_prf.c 2005/07/05 14:55:33 @@ -84,6 +84,9 @@ size_t remain; }; +/* + * XXXRW: We access subr_log.c's log_open variable unlocked. + */ extern int log_open; static void msglogchar(int c, int pri); --- //depot/vendor/freebsd/src/sys/kern/subr_witness.c 2005/06/04 23:25:18 +++ //depot/user/rwatson/netperf/sys/kern/subr_witness.c 2005/07/05 14:55:33 @@ -316,6 +316,7 @@ */ { "ddp_list_mtx", &lock_class_mtx_sleep }, { "ddp_mtx", &lock_class_mtx_sleep }, + { "at_ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF --- //depot/vendor/freebsd/src/sys/kern/sys_pipe.c 2005/07/01 16:35:11 +++ //depot/user/rwatson/netperf/sys/kern/sys_pipe.c 2005/07/05 14:55:33 @@ -414,6 +414,9 @@ * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. + * + * This version of the call should not be called directly except via + * pipe_create(), as it doesn't assert the pipe lock. */ static int pipespace_new(cpipe, size) --- //depot/vendor/freebsd/src/sys/kern/uipc_socket.c 2005/07/01 16:35:11 +++ //depot/user/rwatson/netperf/sys/kern/uipc_socket.c 2005/07/05 14:55:33 @@ -347,6 +347,7 @@ ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); + /* XXXRW: Why would SS_NOFDREF be unset here? so_count is 0. */ if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 || so->so_count != 0) { SOCK_UNLOCK(so); @@ -409,6 +410,45 @@ sodealloc(so); } +#if 0 +static void +dump_socket(struct socket *so) +{ + + printf("so = %p\n", so); + printf(" so_count = %d\n", so->so_count); + printf(" so_type = %d\n", so->so_type); + printf(" so_options = %d\n", so->so_options); + printf(" so_linger = %d\n", so->so_linger); + printf(" so_state = %d\n", so->so_state); + printf(" so_qstate = %d\n", so->so_qstate); + printf(" so_pcb = %p\n", so->so_pcb); + printf(" so_proto = %p\n", so->so_proto); + printf(" pr_type = %d\n", so->so_proto->pr_type); + printf(" pr_domain = %p\n", so->so_proto->pr_domain); + printf(" dom_family = %d\n", so->so_proto->pr_domain->dom_family); + printf(" dom_name = %s\n", so->so_proto->pr_domain->dom_name); + printf(" pr_protocol = %d\n", so->so_proto->pr_protocol); + printf(" pr_flags = %d\n", so->so_proto->pr_flags); + printf(" so_head = %p\n", so->so_head); + printf(" TAILQ_FIRST(so_incomp) = %p\n", TAILQ_FIRST(&so->so_incomp)); + printf(" TAILQ_FIRST(so_comp) = %p\n", TAILQ_FIRST(&so->so_comp)); + printf(" so_qlen = %d\n", so->so_qlen); + printf(" so_incqlen = %d\n", so->so_incqlen); + printf(" so_qlimit = %d\n", so->so_qlimit); + printf(" so_timeo = %d\n", so->so_timeo); + printf(" so_error = %d\n", so->so_error); + printf(" so_sigio = %p\n", so->so_sigio); + printf(" so_oobmark = %lu\n", so->so_oobmark); + printf(" so_rcv.sb_state = %d\n", so->so_rcv.sb_state); + printf(" so_snd.sb_state = %d\n", so->so_snd.sb_state); + printf(" so_upcall = %p\n", so->so_upcall); + printf(" so_upcallarg = %p\n", so->so_upcallarg); + printf(" so_cred = %p\n", so->so_cred); + printf(" so_gencnt = %d\n", (int)so->so_gencnt); +} +#endif + /* * Close a socket on last file table reference removal. * Initiate disconnect if connected. @@ -427,6 +467,23 @@ KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); funsetown(&so->so_sigio); + /*- + * XXXRW: Lots of locking problems here. For one thing, + * we should probably clear the SO_ACCEPTCONN flag and + * push that down to the protocol layer. Also, the locking + * for the queue draining is probably not right either: + * what prevents new ones from being inserted after we get + * past this? Ideally, the downcall would prevent this, but + * there isn't one. + */ + /* + * XXXRW: soclose() should be split into two parts: one to handle + * listen sockets, and the other to handle the remainder. To + * prevent races with the protocol code, we need to detach before + * draining the queues. For non-listen sockets, the disconnect + * has to happen before the detach. + */ + /* Unlocked read. */ if (so->so_options & SO_ACCEPTCONN) { struct socket *sp; ACCEPT_LOCK(); @@ -452,6 +509,7 @@ } if (so->so_pcb == NULL) goto discard; + /* XXXRW: so_state locking? */ if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); @@ -538,6 +596,7 @@ * This allows user to disconnect by connecting to, e.g., * a null address. */ + /* Unlocked read. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { @@ -569,6 +628,7 @@ { int error; + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) @@ -659,8 +719,6 @@ #define snderr(errno) { error = (errno); goto release; } SOCKBUF_LOCK(&so->so_snd); -restart: - SOCKBUF_LOCK_ASSERT(&so->so_snd); error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out_locked; @@ -699,11 +757,10 @@ (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) snderr(EWOULDBLOCK); - sbunlock(&so->so_snd); error = sbwait(&so->so_snd); if (error) - goto out_locked; - goto restart; + goto release; + continue; } SOCKBUF_UNLOCK(&so->so_snd); mp = ⊤ @@ -812,6 +869,15 @@ break; } } while (space > 0 && atomic); + /* + * XXXRW: There may be a race condition here regarding + * SO_DONTROUTE. We hold the sblock() so in theory + * there won't be other consumers of the socket doing + * a pru_send(), but SO_DONTROUTE is also consumed + * from the protocol side. It sounds like we might + * want to use MSG_DONTROUTE here, if the semantics are + * right. + */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; @@ -840,6 +906,10 @@ /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); + /* + * XXXRW: Second half of above comment on SO_DONTROUTE. + * and so_options. + */ if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; @@ -1004,17 +1074,18 @@ return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; + /* Unlocked read. */ if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) (*pr->pr_usrreqs->pru_rcvd)(so, 0); SOCKBUF_LOCK(&so->so_rcv); -restart: - SOCKBUF_LOCK_ASSERT(&so->so_rcv); error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) goto out; +restart: + SOCKBUF_LOCK_ASSERT(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more @@ -1055,6 +1126,7 @@ m = so->so_rcv.sb_mb; goto dontblock; } + /* XXXRW: so_state locking? */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; @@ -1062,6 +1134,7 @@ } if (uio->uio_resid == 0) goto release; + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; @@ -1069,10 +1142,9 @@ } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); - sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) - goto out; + goto release; goto restart; } dontblock: @@ -1101,10 +1173,25 @@ if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); - orig_resid = 0; - if (psa != NULL) + if (psa != NULL) { *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); + if (*psa == NULL) { + error = ENOMEM; + goto release; + } + /* + * XXXRW: In the rwatson_netperf branch, we don't + * release the socket buffer lock here because + * we always use M_NOWAIT. In the main tree, + * or if we restore conditional waiting, we + * need to refresh nextpacket from the socket + * buffer version of the head mbuf, or we may + * have a stale value if it was previously NULL + * and now isn't. + */ + nextrecord = m->m_nextpkt; + } if (flags & MSG_PEEK) { m = m->m_next; } else { @@ -1113,6 +1200,7 @@ m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } + orig_resid = 0; } /* @@ -1127,6 +1215,14 @@ do { if (flags & MSG_PEEK) { + /* + * XXXRW: In the BSD/OS version of this + * code, m_copym() is called with M_TRYWAIT, + * and we catch allcation failures and + * jump to release with error of ENOBUFS. + * For consistency with the current + * implementation, we don't. + */ if (controlp != NULL) { *controlp = m_copy(m, 0, m->m_len); controlp = &(*controlp)->m_next; @@ -1141,6 +1237,12 @@ m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); + /* + * XXXRW: Since we're dropping the socket buffer locks, and + * may have modified the mbuf list on the socket buffer, push + * out the latest version so it can be seen by any other code + * that accesses the buffer in the mean time. + */ if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { @@ -1162,10 +1264,21 @@ } cm = cmn; } + /* + * XXXRW: During externalization, additional mbuf chains + * may have been added to the socket buffer. Update our + * local cache to avoid using a stale value and corrupting + * the list. + */ nextrecord = so->so_rcv.sb_mb->m_nextpkt; orig_resid = 0; } if (m != NULL) { + /* + * XXXRW: Before the advent of sockbuf_pushsync() above, it + * was necessary to do the equivilent work here. Now, we + * just assert that it is the case. + */ if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); @@ -1180,6 +1293,11 @@ if (type == MT_OOBDATA) flags |= MSG_OOB; } else { + /* + * XXXRW: Before the advent of sockbuf_pushsync() above, it + * was necessary to update sb_mb to nextrecord here. Now, we + * just assert that is the case. + */ if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); @@ -1276,14 +1394,7 @@ so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } - if (m != NULL) { - m->m_nextpkt = nextrecord; - if (nextrecord == NULL) - so->so_rcv.sb_lastrecord = m; - } else { - so->so_rcv.sb_mb = nextrecord; - SB_EMPTY_FIXUP(&so->so_rcv); - } + sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } @@ -1350,6 +1461,12 @@ /* * Notify the protocol that some data has been * drained before blocking. + * + * XXXRW: We drop the socket buffer lock here + * because we know the protocol will need to do its + * own locking. However, after calling pru_rcvd() + * and re-grabbing the buffer mutex, we don't + * re-check the condition before sleeping. */ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); @@ -1359,8 +1476,10 @@ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); - if (error) + if (error) { + error = 0; goto release; + } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; @@ -1393,6 +1512,10 @@ * If soreceive() is being done from the socket callback, then * don't need to generate ACK to peer to update window, since * ACK will be generated on return to TCP. + * + * XXXRW: We drop the socket buffer lock before calling + * down into the protocol. Is that OK in the calling + * context? */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) { @@ -1403,10 +1526,8 @@ } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && - (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { - sbunlock(&so->so_rcv); - goto restart; - } + (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) + goto restart; /* XXX multi-counts msgs */ if (flagsp != NULL) *flagsp |= flags; @@ -1474,6 +1595,7 @@ sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); + /* XXXRW: is passing in sb_mb this way really safe? */ SOCKBUF_LOCK_INIT(&asb, "so_rcv"); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); @@ -1781,6 +1903,7 @@ case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: + /* Unlocked read. */ optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); @@ -1986,6 +2109,10 @@ { int revents = 0; + /* + * XXXRW: We can probably trim this down to acquire only one lock at + * a time, and do it conditional on (events). + */ SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) @@ -2032,6 +2159,7 @@ switch (kn->kn_filter) { case EVFILT_READ: + /* Unlocked read. */ if (so->so_options & SO_ACCEPTCONN) kn->kn_fop = &solisten_filtops; else @@ -2129,6 +2257,7 @@ { struct socket *so = kn->kn_fp->f_data; + /* Unlocked read. */ kn->kn_data = so->so_qlen; return (! TAILQ_EMPTY(&so->so_comp)); } --- //depot/vendor/freebsd/src/sys/kern/uipc_socket2.c 2005/07/01 16:35:11 +++ //depot/user/rwatson/netperf/sys/kern/uipc_socket2.c 2005/07/05 14:55:33 @@ -106,11 +106,22 @@ { SOCK_LOCK(so); + /* + * XXXRW: so_state locking? + * + * XXXRW: Why not clear SS_ISDISCONNECTED, SS_ISCONFIRMING? + */ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } +/* + * soisconnected() transitions a socket to a fully connected state. If + * so->so_head is non-NULL, we transition it from the incompletely + * connected queue to the completely connected queue. Otherwise, we + * just wake up the socket since it was an out-bound connection. + */ void soisconnected(so) struct socket *so; @@ -121,6 +132,12 @@ SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; + + /* + * XXXRW: Maybe an unlocked read of so_head would be desirable + * here? Unlocked read of so_qstate probably not a good idea + * regardless. so_options handling here is a little unsavory. + */ head = so->so_head; if (head != NULL && (so->so_qstate & SQ_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { @@ -135,14 +152,36 @@ sorwakeup(head); wakeup_one(&head->so_timeo); } else { + void (*so_upcall)(struct socket *, void *, int); + void *so_upcallarg; ACCEPT_UNLOCK(); - so->so_upcall = + /* + * XXXRW: Should probably copy the upcall fields to + * local stack variables while holding the socket + * lock (and clear the option), then release the + * lock and make the call. This will prevent lock + * order reversals/recursion between this code and + * the upcall implementation, which will likely + * also want to frob the socket using locks. + * + * NOTE: We keep a local copy of the function + * pointer so we can invoke the upcall without + * holding locks. However, we also have to copy + * in the upcall fields from the head because the + * filter may need to be called more than once. + */ + so_upcall = so->so_upcall = head->so_accf->so_accept_filter->accf_callback; - so->so_upcallarg = head->so_accf->so_accept_filter_arg; + so_upcallarg = so->so_upcallarg = + head->so_accf->so_accept_filter_arg; so->so_rcv.sb_flags |= SB_UPCALL; so->so_options &= ~SO_ACCEPTFILTER; SOCK_UNLOCK(so); - so->so_upcall(so, so->so_upcallarg, M_DONTWAIT); + /* + * Call with our existing reference, but without + * any locks. + */ + so_upcall(so, so_upcallarg, M_DONTWAIT); } return; } @@ -226,6 +265,8 @@ so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; + /* XXXRW: so_state locking? */ + /* XXXRW: should mask additional head->so_state bits here? */ so->so_state = head->so_state | SS_NOFDREF; so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; @@ -402,6 +443,10 @@ } KNOTE_LOCKED(&sb->sb_sel.si_note, 0); SOCKBUF_UNLOCK(sb); + /* + * XXXRW: What locks should be held over what parts of the + * following? Currently we hold none. + */ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); if (sb->sb_flags & SB_UPCALL) @@ -448,7 +493,7 @@ register struct socket *so; u_long sndcc, rcvcc; { - struct thread *td = curthread; + struct thread *td = curthread; /* XXX */ SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); @@ -512,6 +557,13 @@ /* * td will only be NULL when we're in an interrupt * (e.g. in tcp_input()) + * + * XXXRW: This comment is true, but only because the caller passed + * in NULL, not for the 4.x reason that there is no thread + * available. Need to be careful of callers that do this wrong; + * I suspect many do it wrong, and therefore many socket buffers + * end up with the wrong limits, especially via the soreserve() + * path. */ if (cc > sb_max_adj) return (0); @@ -1442,6 +1494,7 @@ xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; + /* Unlocked read. */ xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; --- //depot/vendor/freebsd/src/sys/kern/uipc_syscalls.c 2005/07/05 22:50:36 +++ //depot/user/rwatson/netperf/sys/kern/uipc_syscalls.c 2005/07/19 11:47:08 @@ -317,6 +317,7 @@ error = fgetsock(td, uap->s, &head, &fflag); if (error) goto done2; + /* Unlocked read. */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; @@ -331,6 +332,10 @@ error = falloc(td, &nfp, &fd); if (error) goto done; + /* + * Unlocked reads. + * XXXRW: Dubious use of so->so_error. + */ ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); @@ -379,6 +384,11 @@ /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; + /* + * XXXRW: Might make life simpler to grab the socket buffer lock + * here so it's held when KNOTE() calls back into the socket + * code. + */ /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); @@ -522,6 +532,7 @@ if (error) goto done2; so = fp->f_data; + /* XXXRW: so_state locking? */ if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; @@ -536,11 +547,13 @@ error = soconnect(so, sa, td); if (error) goto bad; + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); + /* XXXRW: so_state locking? */ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); @@ -556,6 +569,7 @@ } SOCK_UNLOCK(so); bad: + /* XXXRW: so_state locking? */ if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) @@ -1534,6 +1548,7 @@ if (error) goto done2; so = fp->f_data; + /* XXXRW: so_state locking? */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done1; @@ -1775,6 +1790,7 @@ error = EINVAL; goto done; } + /* XXXRW: so_state locking? */ if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto done; @@ -1867,6 +1883,7 @@ * before going to the extra work of constituting the sf_buf. */ SOCKBUF_LOCK(&so->so_snd); + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { if (so->so_snd.sb_state & SBS_CANTSENDMORE) error = EPIPE; @@ -1931,6 +1948,7 @@ * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; + mtx_lock(&Giant); /* VFS */ vn_lock(vp, LK_SHARED | LK_RETRY, td); /* * XXXMAC: Because we don't have fp->f_cred here, @@ -1942,6 +1960,7 @@ IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); VOP_UNLOCK(vp, 0, td); + mtx_unlock(&Giant); /* VFS */ VM_OBJECT_LOCK(obj); vm_page_lock_queues(); vm_page_io_finish(pg); @@ -2052,6 +2071,7 @@ * after checking the connection state above in order to avoid * a race condition with sbwait(). */ + /* XXXRW: so_state locking? */ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { m_freem(m); --- //depot/vendor/freebsd/src/sys/kern/uipc_usrreq.c 2005/05/07 00:45:19 +++ //depot/user/rwatson/netperf/sys/kern/uipc_usrreq.c 2005/06/01 10:10:04 @@ -181,6 +181,9 @@ { struct unpcb *unp = sotounpcb(so); + /* + * XXXRW: How could unp be non-NULL here? + */ if (unp != NULL) return (EISCONN); return (unp_attach(so)); @@ -355,6 +358,14 @@ (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_cc = so->so_rcv.sb_cc; + + /* + * XXXRW: It would be desirable to release the UNP lock + * before doing the socket wakeup. However, to do that we + * need to acquire a reference to so2 so it doesn't evaporate + * if the peer closes it; this may be more expensive than the + * wakeup causing contention on the socket mutex. + */ SOCKBUF_UNLOCK(&so->so_rcv); sowwakeup_locked(so2); break; @@ -377,6 +388,12 @@ struct socket *so2; u_long newhiwat; + /* + * XXXRW: We do the basic checks before acquiring the UNP lock in + * order to avoid paying the cost if it's not needed. We have to + * re-do the sotounpcb() call once we do acquire it since that may + * have changed. + */ unp = sotounpcb(so); if (unp == NULL) { error = EINVAL; @@ -444,6 +461,7 @@ * Note: A better implementation would complain * if not equal to the peer's address. */ + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); @@ -477,6 +495,9 @@ * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. + * + * XXXRW: Does it matter that we don't check return value of + * sbappend_locked() here? */ if (control != NULL) { if (sbappendcontrol_locked(&so2->so_rcv, m, control)) @@ -538,6 +559,7 @@ sb->st_blksize = so->so_snd.sb_hiwat; if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { so2 = unp->unp_conn->unp_socket; + /* Unlocked read. */ sb->st_blksize += so2->so_rcv.sb_cc; } sb->st_dev = NODEV; @@ -758,6 +780,7 @@ so->so_pcb = unp; UNP_LOCK(); + KASSERT(so->so_pcb == NULL, ("unp_attach(): so->so_pcb != NULL")); unp->unp_gencnt = ++unp_gencnt; unp_count++; LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead @@ -894,6 +917,12 @@ ASSERT_VOP_LOCKED(vp, "unp_bind"); soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); UNP_LOCK(); + /* + * XXXRW: we actually need to retest unp here and make sure that we + * didn't race with another thread operating on this socket. In + * particular, we must check unp != NULL, and that unp->unp_vnode == + * NULL. + */ vp->v_socket = unp->unp_socket; unp->unp_vnode = vp; unp->unp_addr = soun; @@ -949,6 +978,11 @@ goto bad; mtx_unlock(&Giant); UNP_LOCK(); + /* + * XXXRW: We actually need to retest our assumptions here to make + * sure we didn't race with another thread operating on this UNIX + * domain socket. In particular, that unp != NULL. + */ unp = sotounpcb(so); if (unp == NULL) { error = EINVAL; @@ -970,6 +1004,10 @@ * w/o locks; this avoids a recursive lock * of the head and holding sleep locks across * a (potentially) blocking malloc. + * + * XXXRW: This may introduce potential races if + * close() is called on (so) while the lock is + * releasd. */ UNP_UNLOCK(); so3 = sonewconn(so2, 0); @@ -1678,6 +1716,9 @@ * of a signal. If sbwait does return * an error, we'll go into an infinite * loop. Delete all of this for now. + * + * XXXRW: We will need to lock the socket + * buffer here if we enable this code. */ (void) sbwait(&so->so_rcv); goto restart; @@ -1689,6 +1730,9 @@ * to see if we hold any file descriptors in its * message buffers. Follow those links and mark them * as accessible too. + * + * XXXRW: We hold the socket buffer lock to prevent + * the mbuf chain of interest from changing. */ SOCKBUF_LOCK(&so->so_rcv); unp_scan(so->so_rcv.sb_mb, unp_mark); --- //depot/vendor/freebsd/src/sys/kern/vfs_aio.c 2005/07/01 16:35:11 +++ //depot/user/rwatson/netperf/sys/kern/vfs_aio.c 2005/07/05 14:55:33 @@ -170,6 +170,9 @@ SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process (stored in the process)"); +/* + * struct aiocblist describes a specific AIO I/O request (job). + */ struct aiocblist { TAILQ_ENTRY(aiocblist) list; /* List of jobs */ TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */ @@ -196,6 +199,10 @@ */ #define AIOP_FREE 0x1 /* proc on free queue */ +/* + * struct aiothreadlist is the per-worker thread data structure, and is + * allocated by the thread when it is created. + */ struct aiothreadlist { int aiothreadflags; /* AIO proc flags */ TAILQ_ENTRY(aiothreadlist) list; /* List of processes */ @@ -218,7 +225,9 @@ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ /* - * per process aio data structure + * struct kaiinfo is the per-process aio data structure. This is allocated + * and hung off of p->p_aioinfo by aio_init_aioinfo() the first time a + * process makes use of the AIO facility. */ struct kaioinfo { int kaio_flags; /* per process kaio flags */ @@ -1212,6 +1221,8 @@ /* * Wake up aio requests that may be serviceable now. + * + * XXXRW: This doesn't look MPSAFE at all. */ static void aio_swake_cb(struct socket *so, struct sockbuf *sb) --- //depot/vendor/freebsd/src/sys/net/bpf.c 2005/07/01 16:35:11 +++ //depot/user/rwatson/netperf/sys/net/bpf.c 2005/07/05 14:55:33 @@ -294,6 +294,10 @@ struct bpf_if *bp; struct ifnet *ifp; + /* + * XXXRW: Watch out for bpfdetach() in the other direction: can + * d->bd_bif become NULL here? + */ bp = d->bd_bif; BPFIF_LOCK(bp); BPFD_LOCK(d); @@ -317,6 +321,9 @@ /* * Check if this descriptor had requested promiscuous mode. * If so, turn it off. + * + * XXXRW: Note, ifp might be stale here if the ifnet has been + * removed via bpfdetach(). */ if (d->bd_promisc) { d->bd_promisc = 0; @@ -365,7 +372,6 @@ make_dev(&bpf_cdevsw, minor(dev), UID_ROOT, GID_WHEEL, 0600, "bpf%d", dev2unit(dev)); MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO); - dev->si_drv1 = d; d->bd_bufsize = bpf_bufsize; d->bd_sig = SIGIO; d->bd_seesent = 1; @@ -376,6 +382,7 @@ mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init(&d->bd_callout, NET_CALLOUT_MPSAFE); knlist_init(&d->bd_sel.si_note, &d->bd_mtx, NULL, NULL, NULL); + dev->si_drv1 = d; return (0); } @@ -394,6 +401,9 @@ { struct bpf_d *d = dev->si_drv1; + /* + * XXXRW: Should this be a drain not just a callout? + */ BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); @@ -521,6 +531,9 @@ * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. + * + * XXXRW: What protects bd_hbuf and bd_hlen from changing during this + * operation? Especially given that uiomove() can page fault. */ error = uiomove(d->bd_hbuf, d->bd_hlen, uio); @@ -555,6 +568,10 @@ KNOTE_LOCKED(&d->bd_sel.si_note, 0); } +/* + * XXXRW: What about races against bpf_timed_out() in close? Need to ensure + * that (d) is still valid. + */ static void bpf_timed_out(arg) void *arg; @@ -582,6 +599,11 @@ int error; struct sockaddr dst; + /* + * XXXRW: Need to hold some sort of lock here to prevent races with a + * change of bd_bif? We might want to cache a bunch of these values + * locally so we don't keep dereferencing (d). + */ if (d->bd_bif == NULL) return (ENXIO); @@ -606,6 +628,10 @@ mac_create_mbuf_from_bpfdesc(d, m); BPFD_UNLOCK(d); #endif + /* + * XXXRW: How do we know that (ifp) is still valid here, and also + * still the right type and same interface we had before? + */ NET_LOCK_GIANT(); error = (*ifp->if_output)(ifp, m, &dst, NULL); NET_UNLOCK_GIANT(); @@ -668,6 +694,10 @@ struct bpf_d *d = dev->si_drv1; int error = 0; + /* + * XXXRW: This is a bit odd. Does this properly account for possible + * multiple simultaenous access from more than one thread/process? + */ BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); @@ -701,6 +731,10 @@ { struct ifnet *ifp; + /* + * XXXRW: need to make sure ther interface doesn't + * change during this operation? + */ if (d->bd_bif == NULL) error = EINVAL; else { @@ -721,6 +755,11 @@ * Set buffer length. */ case BIOCSBLEN: + /* + * XXXRW: Probably ought to hold a mutex over the test of + * bd_bif and the set of bd_bufsize to prevent test-and-set + * races. + */ if (d->bd_bif != NULL) error = EINVAL; else { @@ -754,6 +793,12 @@ * Put interface into promiscuous mode. */ case BIOCPROMISC: + /* + * XXXRW: Some atomic test-and-set issues here -- need to + * prevent bd_bif changing simultaneously, as well as deal + * with the call to ifpromisc() safely. Can ifpromisc() + * sleep? + */ if (d->bd_bif == NULL) { /* * No interface attached yet. @@ -774,6 +819,10 @@ * Get current data link type. */ case BIOCGDLT: + /* + * XXXRW: Need to hold a mutex over test and then use of + * bd_bif. + */ if (d->bd_bif == NULL) error = EINVAL; else @@ -784,6 +833,10 @@ * Get a list of supported data link types. */ case BIOCGDLTLIST: + /* + * XXXRW: Need to hold a mutex over test of bd_bif and then + * pass into bpf_getdltlist() to prevent races on bd_bif? + */ if (d->bd_bif == NULL) error = EINVAL; else @@ -794,6 +847,10 @@ * Set data link type. */ case BIOCSDLT: + /* + * XXXRW: Need to hold a mutex over test of bd_bif and then + * pass into bpf_setfltlist() to prevent races on bd_bif? + */ if (d->bd_bif == NULL) error = EINVAL; else @@ -804,6 +861,10 @@ * Get interface name. */ case BIOCGETIF: + /* + * XXXRW: Need to hold a mutex over test of bd_bif and then + * dereferencing of it. + */ if (d->bd_bif == NULL) error = EINVAL; else { @@ -845,6 +906,10 @@ { struct timeval *tv = (struct timeval *)addr; + /* + * XXXRW: Need to hold a mutex over multiple reads to + * prevent inconsistency. + */ tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; @@ -857,6 +922,10 @@ { struct bpf_stat *bs = (struct bpf_stat *)addr; + /* + * XXXRW: Need to hold a mutex over multiple reads to + * prevent inconsistency. + */ bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; @@ -914,20 +983,36 @@ break; case FIOSETOWN: + /* + * XXXRW: This is protected by sigio locking at the next + * layer down, right? + */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: + /* + * XXXRW: This is protected by sigio locking at the next + * layer down, right? + */ *(int *)addr = fgetown(&d->bd_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: + /* + * XXXRW: This is protected by sigio locking at the next + * layer down, right? + */ error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: + /* + * XXXRW: This is protected by sigio locking at the next + * layer down, right? + */ *(int *)addr = -fgetown(&d->bd_sigio); break; @@ -953,6 +1038,10 @@ /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. + * + * XXXRW: There seem to be a number of races in here. Perhaps the copyin and + * malloc should be done up front to make it easier to hold the lock over the + * duration? */ static int bpf_setf(d, fp) @@ -1010,6 +1099,10 @@ int error; struct ifnet *theywant; + /* + * XXXRW: While this might be more efficient, it does open the door + * for more racing. + */ theywant = ifunit(ifr->ifr_name); if (theywant == NULL) return ENXIO; @@ -1034,6 +1127,10 @@ * If we're already attached to requested interface, * just flush the buffer. */ + /* + * XXXRW: This section appears to be a bit racey. Should it + * be protected by the descriptor mutex? + */ if (d->bd_sbuf == NULL) { error = bpf_allocbufs(d); if (error != 0) @@ -1073,6 +1170,10 @@ struct bpf_d *d; int revents; + /* + * XXXRW: mutex should probably be grabbed before the test of bd_bif + * or we might race. + */ d = dev->si_drv1; if (d->bd_bif == NULL) return (ENXIO); @@ -1110,6 +1211,10 @@ if (kn->kn_filter != EVFILT_READ) return (1); + /* + * XXXRW: Shouldn't we hold the bpf descriptor lock over this? Or + * maybe not. + */ kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 0); @@ -1123,6 +1228,9 @@ { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; + /* + * XXXRW: Anything extra locking-wise need to happen here? + */ knlist_remove(&d->bd_sel.si_note, kn, 0); } @@ -1134,6 +1242,9 @@ struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; + /* + * XXXRW: Wow, this locking actually looks mostly correct! + */ BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { @@ -1168,6 +1279,9 @@ /* * Lockless read to avoid cost of locking the interface if there are * no descriptors attached. + * + * XXXRW: Is there a risk that the (bp) pointer here might become + * invalid during or before dereference? */ if (LIST_EMPTY(&bp->bif_dlist)) return; @@ -1229,6 +1343,8 @@ /* * Lockless read to avoid cost of locking the interface if there are * no descriptors attached. + * + * XXXRW: See bpf_tap() comment. */ if (LIST_EMPTY(&bp->bif_dlist)) return; @@ -1275,6 +1391,8 @@ /* * Lockless read to avoid cost of locking the interface if there are * no descriptors attached. + * + * XXXRW: See bpf_tap() comment. */ if (LIST_EMPTY(&bp->bif_dlist)) return; @@ -1292,6 +1410,10 @@ BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { + /* + * XXXRW: For test-and-set reasons, shouldn't bpf descriptor + * be locked *before* this test? + */ if (!d->bd_seesent && (m->m_pkthdr.rcvif == NULL)) continue; BPFD_LOCK(d); @@ -1324,7 +1446,12 @@ { struct bpf_hdr *hp; int totlen, curlen; - int hdrlen = d->bd_bif->bif_hdrlen; + int hdrlen; + + /* + * XXXRW: Assert bpf descriptor lock here? + */ + hdrlen = d->bd_bif->bif_hdrlen; int do_wakeup = 0; BPFD_LOCK_ASSERT(d); @@ -1387,6 +1514,9 @@ /* * Initialize all nonzero fields of a descriptor. + * + * XXXRW: Shouldn't some sort of locking be going on here? This can be + * called "live" by an ioctl(). */ static int bpf_allocbufs(d) @@ -1467,6 +1597,11 @@ bp->bif_dlt = dlt; mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF); + /* + * XXXRW: Seems like this should really happen down at the bottom to + * avoid exposing a mostly initialized bpf interface structure to the + * world. + */ mtx_lock(&bpf_mtx); LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); mtx_unlock(&bpf_mtx); @@ -1498,6 +1633,12 @@ struct bpf_if *bp; struct bpf_d *d; + /* + * XXXRW: Check to make sure races between this code and other code + * aren't too extreme. In particular, that other code doesn't assume + * that a descriptor will point to an interface in the global + * interface list? + */ /* Locate BPF interface information */ mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { @@ -1516,6 +1657,12 @@ mtx_unlock(&bpf_mtx); while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { + /* + * XXXRW: Should bpf descriptor lock be edged above the + * detach call? How come bpf_detachd() doesn't always + * perform a wakeup? Once detached, don't we risk a race with + * wakeup()? + */ bpf_detachd(d); BPFD_LOCK(d); bpf_wakeup(d); @@ -1538,6 +1685,9 @@ struct ifnet *ifp; struct bpf_if *bp; + /* + * XXXRW: Assert descriptor lock here? Why can't we use d->bd_bif? + */ ifp = d->bd_bif->bif_ifp; n = 0; error = 0; @@ -1572,6 +1722,10 @@ struct ifnet *ifp; struct bpf_if *bp; + /* + * XXXRW: Should acquire mutex earlier to prevent the ifnet from + * changing out from under us? + */ if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; @@ -1581,6 +1735,9 @@ break; } mtx_unlock(&bpf_mtx); + /* + * XXXRW: Dropping the mutex here might open up a race? + */ if (bp != NULL) { opromisc = d->bd_promisc; bpf_detachd(d); --- //depot/vendor/freebsd/src/sys/net/if.c 2005/07/19 10:16:50 +++ //depot/user/rwatson/netperf/sys/net/if.c 2005/07/19 11:47:08 @@ -287,6 +287,14 @@ if_clone_init(); } +/* + * Grow the global ifindex_table by a power of two over the current side. + * Note that this call assumes ifindex_table (and hence the if_indexentry + * entries in it) can be relocated in memory without causing a problem. + * + * XXXRW: Locking assertion needed here? What protects against untimely + * indirection through ifindex_table? Note M_WAITOK. + */ static void if_grow(void) { @@ -303,6 +311,10 @@ ifindex_table = e; } +/* + * if_check() scans the list of registered network interfaces to check that + * initialization-time invariants are met. + */ /* ARGSUSED*/ static void if_check(void *dummy __unused) @@ -329,6 +341,19 @@ if_slowtimo(0); } +/* + * if_findindex ... XXXRW: what? + * + * Given a partially initialized ifnet reference, identify an available ifnet + * unit number that can be assigned to it. Try searching for any user + * preferences or prior unit allocations; if there's no match or a collision, + * iterate through searching for the next available unit. Note that the + * returned unit can be beyond the end of the index array, requiring the + * caller to extend the array by calling if_grow(). + * + * XXXRW: Note that no reservation takes place, so as soon as the caller + * releases the lock, the unit returned may be reused. + */ /* XXX: should be locked. */ static int if_findindex(struct ifnet *ifp) @@ -337,12 +362,18 @@ char eaddr[18], devname[32]; const char *name, *p; + IFNET_WASSERT(); + switch (ifp->if_type) { case IFT_ETHER: /* these types use struct arpcom */ case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: + /* + * XXXRW: Assumes that arpcom always has its ifnet at the + * head of its structure. + */ case IFT_BRIDGE: snprintf(eaddr, 18, "%6D", IFP2ENADDR(ifp), ":"); break; @@ -453,6 +484,10 @@ struct sockaddr_dl *sdl; struct ifaddr *ifa; + /* + * XXXRW: Shouldn't we add to the global list only once the ifnet + * is ready for use? + */ if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); @@ -461,9 +496,6 @@ TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); IF_AFDATA_LOCK_INIT(ifp); ifp->if_afdata_initialized = 0; - IFNET_WLOCK(); - TAILQ_INSERT_TAIL(&ifnet, ifp, if_link); - IFNET_WUNLOCK(); /* * XXX - * The old code would work if the interface passed a pre-existing @@ -484,6 +516,9 @@ mac_create_ifnet(ifp); #endif + IFNET_WLOCK(); + TAILQ_INSERT_TAIL(&ifnet, ifp, if_link); + IFNET_WUNLOCK(); ifdev_byindex(ifp->if_index) = make_dev(&net_cdevsw, unit2minor(ifp->if_index), UID_ROOT, GID_WHEEL, 0600, "%s/%s", @@ -557,6 +592,14 @@ SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); +/* + * XXXRW: if_attachdomain1() can only be called on an interface after it has + * an ifindex assigned, as IPv6 assumes that the index is allocated and will + * attempt to use it. + * + * XXXRW: How come we need to check for multiple attachments of domain- + * specific data? + */ static void if_attachdomain1(struct ifnet *ifp) { @@ -2155,10 +2198,38 @@ } int +ifq_handoff(struct ifnet *ifp, struct mbuf *m, int adjust) +{ + int error, len; + short mflags; + + if (ifp->if_start_mbuf != NULL) { + if (ifp->if_start_mbuf(ifp, m, adjust)) + return (0); + else + return (ENOBUFS); + } + + len = m->m_pkthdr.len; + mflags = m->m_flags; + IFQ_ENQUEUE(&ifp->if_snd, m, error); + if (error) + return (error); + ifp->if_obytes += len + adjust; + if (mflags & M_MCAST) + ifp->if_omcasts++; + if ((ifp->if_flags & IFF_OACTIVE) == 0) + if_start(ifp); + return (0); +} + +int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; + if (ifp != NULL && ifp->if_start_mbuf != NULL) + return (ifp->if_start_mbuf(ifp, m, adjust)); IF_LOCK(ifq); if (_IF_QFULL(ifq)) { _IF_DROP(ifq); --- //depot/vendor/freebsd/src/sys/net/if_ethersubr.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/net/if_ethersubr.c 2005/07/05 14:55:33 @@ -634,9 +634,6 @@ if ((m = bridge_in_ptr(ifp, m)) == NULL) return; - /* First chunk of an mbuf contains good entropy */ - if (harvest.ethernet) - random_harvest(m, 16, 3, 0, RANDOM_NET); ether_demux(ifp, m); } --- //depot/vendor/freebsd/src/sys/net/if_gif.c 2005/06/26 18:15:33 +++ //depot/user/rwatson/netperf/sys/net/if_gif.c 2005/07/05 14:55:33 @@ -89,6 +89,10 @@ * gif_mtx protects the global gif_softc_list. * XXX: Per-softc locking is still required. */ +/* + * XXXRW: Note that gif_mtx only protects global gif-related data, not + * per-softc data. See also netinet/in_gif.c for locking needs. + */ static struct mtx gif_mtx; static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static LIST_HEAD(, gif_softc) gif_softc_list; @@ -513,6 +517,9 @@ } /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */ +/* + * XXXRW: per-gif softc locking required. + */ int gif_ioctl(ifp, cmd, data) struct ifnet *ifp; @@ -769,8 +776,10 @@ int s; int error = 0; + /* + * XXXRW: per-gif softc locking required. + */ s = splnet(); - mtx_lock(&gif_mtx); LIST_FOREACH(sc2, &gif_softc_list, gif_list) { if (sc2 == sc) @@ -799,6 +808,9 @@ } mtx_unlock(&gif_mtx); + /* + * XXXRW: Lock gif softc fields. + */ /* XXX we can detach from both, but be polite just in case */ if (sc->gif_psrc) switch (sc->gif_psrc->sa_family) { --- //depot/vendor/freebsd/src/sys/net/if_gre.c 2005/06/28 07:00:51 +++ //depot/user/rwatson/netperf/sys/net/if_gre.c 2005/07/05 14:55:33 @@ -95,7 +95,8 @@ /* * gre_mtx protects all global variables in if_gre.c. - * XXX: gre_softc data not protected yet. + * + * XXXRW: It does not protect softc-specific data. */ struct mtx gre_mtx; static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation"); --- //depot/vendor/freebsd/src/sys/net/if_gre.h 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/net/if_gre.h 2005/07/05 14:55:33 @@ -54,6 +54,12 @@ WCCP_V2 } wccp_ver_t; +/* + * XXXRW: softc fields need locking. + * + * XXXRW: gre's notion of a 'called' count is not MP-safe, as it assumes + * only one packet can be processed at a time. + */ struct gre_softc { struct ifnet *sc_ifp; LIST_ENTRY(gre_softc) sc_list; --- //depot/vendor/freebsd/src/sys/net/if_sl.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/net/if_sl.c 2005/07/05 14:55:33 @@ -162,6 +162,7 @@ #define ABT_WINDOW (ABT_COUNT*2+2) /* in seconds - time to count */ static LIST_HEAD(sl_list, sl_softc) sl_list; +static struct mtx slip_mtx; #define FRAME_END 0xc0 /* Frame End */ #define FRAME_ESCAPE 0xdb /* Frame Esc */ @@ -204,6 +205,7 @@ { switch (type) { case MOD_LOAD: + mtx_init(&slip_mtx, "slip_mtx", NULL, MTX_DEF); ldisc_register(SLIPDISC, &slipdisc); LIST_INIT(&sl_list); break; @@ -225,6 +227,7 @@ DECLARE_MODULE(if_sl, sl_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +/* Locked using slip_mtx. */ static int *st_unit_list; static size_t st_unit_max = 0; @@ -233,6 +236,7 @@ { struct sl_softc *sc; + mtx_assert(&slip_mtx, MA_OWNED); LIST_FOREACH(sc, &sl_list, sl_next) { if (SL2IFP(sc)->if_dunit == unit) return (0); @@ -256,6 +260,7 @@ { size_t i; + mtx_assert(&slip_mtx, MA_OWNED); for (i = 0; i < st_unit_max; i++) if (st_unit_list[i] == unit) return 1; @@ -267,6 +272,7 @@ { int *t; + mtx_assert(&slip_mtx, MA_OWNED); if (slisstatic(unit)) return; @@ -334,10 +340,12 @@ SL2IFP(sc)->if_linkmib = sc; SL2IFP(sc)->if_linkmiblen = sizeof *sc; mtx_init(&sc->sc_fastq.ifq_mtx, "sl_fastq", NULL, MTX_DEF); + mtx_init(&sc->sc_mtx, "slip sc_mtx", NULL, MTX_DEF); /* * Find a suitable unit number. */ + mtx_lock(&slip_mtx); for (unit=0; ; unit++) { if (slisstatic(unit)) continue; @@ -347,6 +355,7 @@ } if_initname(SL2IFP(sc), "sl", unit); LIST_INSERT_HEAD(&sl_list, sc, sl_next); + mtx_unlock(&slip_mtx); if_attach(SL2IFP(sc)); bpfattach(SL2IFP(sc), DLT_SLIP, SLIP_HDRLEN); @@ -403,11 +412,21 @@ static void sldestroy(struct sl_softc *sc) { + + /* + * XXXRW: Slight race here: we may detach bpf/if before we + * attach. This appears to be a property of the unit selection + * process, which might be better handled by the interface + * cloning subsystem? + */ bpfdetach(SL2IFP(sc)); if_detach(SL2IFP(sc)); if_free(SL2IFP(sc)); + mtx_lock(&slip_mtx); LIST_REMOVE(sc, sl_next); + mtx_unlock(&slip_mtx); m_free(sc->sc_mbuf); + mtx_destroy(&sc->sc_mtx); mtx_destroy(&sc->sc_fastq.ifq_mtx); if (sc->bpfbuf) free(sc->bpfbuf, M_SL); @@ -434,6 +453,10 @@ clist_free_cblocks(&tp->t_outq); sc = sl_for_tty(tp); if (sc != NULL) { + /* + * XXXRW: tear-down race between timeout and slclose()? + */ + mtx_lock(&sc->sc_mtx); if (sc->sc_outfill) { sc->sc_outfill = 0; untimeout(sl_outfill, sc, sc->sc_ofhandle); @@ -442,6 +465,7 @@ sc->sc_keepalive = 0; untimeout(sl_keepalive, sc, sc->sc_kahandle); } + mtx_unlock(&sc->sc_mtx); if_down(SL2IFP(sc)); sc->sc_ttyp = NULL; sldestroy(sc); @@ -474,12 +498,21 @@ splx(s); return (ENXIO); } + /* + * XXXRW: we hold the mutex over all of this to protect + * the unit change and global list consistency. However, + * some of these functions probably sleep, making this + * wrong. If we have to support renumbering, we probably + * need a way to reserve both numbers to prevent them + * from being reused during the change, or a way to sleep + * waiting for a change to end (i.e., a CV). + */ + mtx_lock(&slip_mtx); if (SL2IFP(sc)->if_dunit != unit) { if (!slisunitfree(unit)) { - splx(s); + mtx_unlock(&slip_mtx); return (ENXIO); } - wasup = SL2IFP(sc)->if_flags & IFF_UP; bpfdetach(SL2IFP(sc)); if_detach(SL2IFP(sc)); @@ -497,9 +530,11 @@ SLIP_HIWAT + 2 * SL2IFP(sc)->if_mtu + 1); } slmarkstatic(unit); + mtx_unlock(&slip_mtx); break; case SLIOCSKEEPAL: + mtx_lock(&sc->sc_mtx); sc->sc_keepalive = *(u_int *)data * hz; if (sc->sc_keepalive) { sc->sc_flags |= SC_KEEPALIVE; @@ -511,6 +546,7 @@ sc->sc_flags &= ~SC_KEEPALIVE; } } + mtx_unlock(&sc->sc_mtx); break; case SLIOCGKEEPAL: @@ -518,6 +554,7 @@ break; case SLIOCSOUTFILL: + mtx_lock(&sc->sc_mtx); sc->sc_outfill = *(u_int *)data * hz; if (sc->sc_outfill) { sc->sc_flags |= SC_OUTWAIT; @@ -529,9 +566,11 @@ sc->sc_flags &= ~SC_OUTWAIT; } } + mtx_unlock(&sc->sc_mtx); break; case SLIOCGOUTFILL: + /* Unlocked read. */ *(int *)data = sc->sc_outfill / hz; break; @@ -606,9 +645,10 @@ } /* - * Start output on interface. Get another datagram - * to send from the interface queue and map it to - * the interface before starting output. + * Start output on interface. Get another datagram to send from the + * interface queue and map it to the interface before starting output. Due + * to IFF_NEEDSGIANT, this code runs with Giant and can safely interface + * with the tty code. */ static int sltstart(struct tty *tp) @@ -631,8 +671,11 @@ (*tp->t_oproc)(tp); if (tp->t_outq.c_cc != 0) { - if (sc != NULL) + if (sc != NULL) { + mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~SC_OUTWAIT; + mtx_unlock(&sc->sc_mtx); + } if (tp->t_outq.c_cc > SLIP_HIWAT) return 0; } @@ -662,6 +705,7 @@ * queueing, and the connection id compression will get * munged when this happens. */ + mtx_lock(&sc->sc_mtx); if (SL2IFP(sc)->if_bpf) { /* * We need to save the TCP/IP header before it's @@ -706,6 +750,7 @@ bcopy(mtod(m, caddr_t), &sc->bpfbuf[SLX_CHDR], CHDR_LEN); BPF_TAP(SL2IFP(sc), sc->bpfbuf, len + SLIP_HDRLEN); } + mtx_unlock(&sc->sc_mtx); /* * If system is getting low on clists, just flush our @@ -721,7 +766,9 @@ continue; } + mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~SC_OUTWAIT; + mtx_unlock(&sc->sc_mtx); /* * The extra FRAME_END will start up a new packet, and thus * will flush any accumulated garbage. We do this whenever @@ -809,6 +856,8 @@ { struct mbuf *m, *newm; + mtx_assert(&sc->sc_mtx, MA_OWNED); + MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) return (NULL); @@ -862,13 +911,16 @@ if (sc == NULL) return 0; if (c & TTY_ERRORMASK || (tp->t_state & TS_CONNECTED) == 0) { + mtx_lock(&sc->sc_mtx); sc->sc_flags |= SC_ERROR; + mtx_unlock(&sc->sc_mtx); return 0; } c &= TTY_CHARMASK; ++SL2IFP(sc)->if_ibytes; + mtx_lock(&sc->sc_mtx); if (SL2IFP(sc)->if_flags & IFF_DEBUG) { if (c == ABT_ESC) { /* @@ -887,6 +939,7 @@ sc->sc_starttime = time_second; if (sc->sc_abortcount >= ABT_COUNT) { slclose(tp,0); + mtx_unlock(&sc->sc_mtx); return 0; } } @@ -909,6 +962,7 @@ case FRAME_ESCAPE: sc->sc_escape = 1; + mtx_unlock(&sc->sc_mtx); return 0; case FRAME_END: @@ -993,6 +1047,7 @@ if (sc->sc_mp < sc->sc_ep) { *sc->sc_mp++ = c; sc->sc_escape = 0; + mtx_unlock(&sc->sc_mtx); return 0; } @@ -1004,6 +1059,7 @@ newpack: sc->sc_mp = sc->sc_buf = sc->sc_ep - SLRMAX; sc->sc_escape = 0; + mtx_unlock(&sc->sc_mtx); return 0; } @@ -1083,6 +1139,7 @@ { struct sl_softc *sc = chan; + mtx_lock(&sc->sc_mtx); if (sc->sc_keepalive) { if (sc->sc_flags & SC_KEEPALIVE) { if (sc->sc_ttyp->t_pgrp != NULL) { @@ -1096,6 +1153,7 @@ } else { sc->sc_flags &= ~SC_KEEPALIVE; } + mtx_unlock(&sc->sc_mtx); } static void @@ -1105,6 +1163,7 @@ register struct tty *tp = sc->sc_ttyp; int s; + mtx_lock(&sc->sc_mtx); if (sc->sc_outfill && tp != NULL) { if (sc->sc_flags & SC_OUTWAIT) { s = splimp (); @@ -1118,4 +1177,5 @@ } else { sc->sc_flags &= ~SC_OUTWAIT; } + mtx_unlock(&sc->sc_mtx); } --- //depot/vendor/freebsd/src/sys/net/if_slvar.h 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/net/if_slvar.h 2005/07/05 14:55:33 @@ -34,13 +34,16 @@ #ifndef _NET_IF_SLVAR_H_ #define _NET_IF_SLVAR_H_ +#include #include /* * Definitions for SLIP interface data structures * * (This exists so programs like slstats can get at the definition - * of sl_softc.) + * of sl_softc.) Fields owned by the SLIP subsystem are protected + * using sc_mtx, with the exception of sc_next, which is protected + * by the global slip_mtx. */ struct sl_softc { struct ifnet *sc_ifp; /* network-visible interface */ @@ -66,6 +69,7 @@ struct slcompress sc_comp; /* tcp compression data */ LIST_ENTRY(sl_softc) sl_next; u_char *bpfbuf; /* hang buffer for bpf here */ + struct mtx sc_mtx; }; #define SL2IFP(sc) ((sc)->sc_ifp) --- //depot/vendor/freebsd/src/sys/net/if_spppsubr.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/net/if_spppsubr.c 2005/07/05 14:55:33 @@ -256,6 +256,9 @@ void (*scr)(struct sppp *sp); }; +struct mtx sppp_mtx; +MTX_SYSINIT(sppp_mtx, &sppp_mtx, "sppp_mtx", MTX_DEF); + #if defined(__FreeBSD__) && __FreeBSD__ >= 3 && __FreeBSD_version < 501113 #define SPP_FMT "%s%d: " #define SPP_ARGS(ifp) (ifp)->if_name, (ifp)->if_unit @@ -1077,6 +1080,7 @@ (ifp->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE); callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, (void *)sp); + mtx_unlock(&sppp_mtx); ifp->if_mtu = PP_MTU; ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; @@ -1125,6 +1129,7 @@ struct sppp *sp = IFP2SP(ifp); int i; + mtx_lock(&sppp_mtx); KASSERT(mtx_initialized(&sp->mtx), ("sppp mutex is not initialized")); /* Stop keepalive handler. */ @@ -4811,7 +4816,12 @@ struct ifnet *ifp = SP2IFP(sp); int s; + /* + * XXXRW: It would be nice to avoid calling all this stuff while + * holding sppp_mtx, or we risk lock order reversals. + */ s = splimp(); + mtx_lock(&sppp_mtx); SPPP_LOCK(sp); /* Keepalive mode disabled or channel down? */ if (! (sp->pp_flags & PP_KEEPALIVE) || --- //depot/vendor/freebsd/src/sys/net/if_stf.c 2005/06/26 18:15:33 +++ //depot/user/rwatson/netperf/sys/net/if_stf.c 2005/07/05 14:55:33 @@ -139,14 +139,12 @@ #define sc_ro __sc_ro46.__sc_ro4 const struct encaptab *encap_cookie; LIST_ENTRY(stf_softc) sc_list; /* all stf's are linked */ + struct mtx sc_mtx; /* protect sc_ro */ }; #define STF2IFP(sc) ((sc)->sc_ifp) /* * All mutable global variables in if_stf.c are protected by stf_mtx. - * XXXRW: Note that mutable fields in the softc are not currently locked: - * in particular, sc_ro needs to be protected from concurrent entrance - * of stf_output(). */ static struct mtx stf_mtx; static LIST_HEAD(, stf_softc) stf_softc_list; @@ -239,6 +237,7 @@ ifc_free_unit(ifc, unit); return (ENOMEM); } + mtx_init(&sc->sc_mtx, "stf sc_mtx", NULL, MTX_DEF); ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; @@ -263,6 +262,7 @@ if_detach(STF2IFP(sc)); if_free(STF2IFP(sc)); + mtx_destroy(&sc->sc_mtx); free(sc, M_STF); } @@ -439,9 +439,10 @@ struct ip6_hdr *ip6; struct in6_ifaddr *ia6; u_int32_t af; -#ifdef MAC + struct route ro; int error; +#ifdef MAC error = mac_check_ifnet_transmit(ifp, m); if (error) { m_freem(m); @@ -540,9 +541,7 @@ else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); - /* - * XXXRW: Locking of sc_ro required. - */ + mtx_lock(&sc->sc_mtx); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { @@ -561,12 +560,21 @@ if (sc->sc_ro.ro_rt == NULL) { m_freem(m); ifp->if_oerrors++; + mtx_unlock(&sc->sc_mtx); return ENETUNREACH; } } + /* + * XXXRW: Holding mutex over call to ip_output(): potential lock + * order issue? Hard to resolve cleanly with the current route + * caching model, as we have to synchronize access to shared softc + * state. + */ ifp->if_opackets++; - return ip_output(m, NULL, &sc->sc_ro, 0, NULL, NULL); + error = ip_output(m, NULL, &ro, 0, NULL, NULL); + mtx_unlock(&sc->sc_mtx); + return (error); } static int --- //depot/vendor/freebsd/src/sys/net/if_tap.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/net/if_tap.c 2005/07/05 14:55:33 @@ -114,6 +114,10 @@ * All global variables in if_tap.c are locked with tapmtx, with the * exception of tapdebug, which is accessed unlocked; tapclones is * static at runtime. + * + * XXXRW: si_flags appears not to be protected from concurrent access, + * and is written at run-time. + * XXXRW: si_drv1 is also used for test-and-set, and isn't synchronized. */ static struct mtx tapmtx; static int tapdebug = 0; /* debug flag */ @@ -172,6 +176,7 @@ * The EBUSY algorithm here can't quite atomically * guarantee that this is race-free since we have to * release the tap mtx to deregister the clone handler. + * XXXRW: is this true? */ mtx_lock(&tapmtx); SLIST_FOREACH(tp, &taphead, tap_next) { @@ -704,6 +709,7 @@ case SIOCSIFADDR: /* set MAC address of the remote side */ mtx_lock(&tp->tap_mtx); + /* XXXRW: Does this actually do anything? */ bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); mtx_unlock(&tp->tap_mtx); break; @@ -758,6 +764,7 @@ if (flag & O_NONBLOCK) return (EWOULDBLOCK); + /* This looks like a wanna-be condition variable. */ mtx_lock(&tp->tap_mtx); tp->tap_flags |= TAP_RWAIT; mtx_unlock(&tp->tap_mtx); --- //depot/vendor/freebsd/src/sys/net/if_tun.c 2005/06/26 18:15:33 +++ //depot/user/rwatson/netperf/sys/net/if_tun.c 2005/07/05 14:55:33 @@ -60,6 +60,12 @@ * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. + * + * XXXRW: we allocate si_drv1 for the dev_t on demand, rather than when + * the dev_t is instantiated. Nothing serializes the test/set of that + * field. + * + * XXXRW: what serializes access to si_flags? */ struct tun_softc { TAILQ_ENTRY(tun_softc) tun_list; @@ -123,6 +129,9 @@ static d_ioctl_t tunioctl; static d_poll_t tunpoll; +/* + * XXXRW: can remove D_NEEDGIANT? Probably not because of si_drv1 for now. + */ static struct cdevsw tun_cdevsw = { .d_version = D_VERSION, .d_flags = D_PSEUDO | D_NEEDGIANT, @@ -390,6 +399,9 @@ ifp->if_flags |= IFF_UP | IFF_RUNNING; getmicrotime(&ifp->if_lastchange); + /* + * XXXRW: interface locking. + */ for (ifa = TAILQ_FIRST(&ifp->if_addrhead); ifa; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr == NULL) --- //depot/vendor/freebsd/src/sys/net/if_var.h 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/net/if_var.h 2005/07/05 14:55:33 @@ -158,7 +158,8 @@ (void *); int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); - void *if_spare1; /* spare pointer 1 */ + int (*if_start_mbuf) /* Hand off mbuf to driver for send */ + (struct ifnet *, struct mbuf *, int); void *if_spare2; /* spare pointer 2 */ void *if_spare3; /* spare pointer 3 */ u_int if_spare_flags1; /* spare flags 1 */ @@ -270,9 +271,12 @@ } while (0) #define IF_DEQUEUE(ifq, m) do { \ - IF_LOCK(ifq); \ - _IF_DEQUEUE(ifq, m); \ - IF_UNLOCK(ifq); \ + if ((ifq)->ifq_head != NULL) { \ + IF_LOCK(ifq); \ + _IF_DEQUEUE(ifq, m); \ + IF_UNLOCK(ifq); \ + } else \ + (m) = NULL; \ } while (0) #define _IF_POLL(ifq, m) ((m) = (ifq)->ifq_head) @@ -289,9 +293,11 @@ } while (0) #define IF_DRAIN(ifq) do { \ - IF_LOCK(ifq); \ - _IF_DRAIN(ifq); \ - IF_UNLOCK(ifq); \ + if ((ifq)->ifq_head != NULL) { \ + IF_LOCK(ifq); \ + _IF_DRAIN(ifq); \ + IF_UNLOCK(ifq); \ + } \ } while(0) #ifdef _KERNEL @@ -411,22 +417,11 @@ #define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++) #define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len)) +int ifq_handoff(struct ifnet *ifp, struct mbuf *m, int adjust); #define IFQ_HANDOFF_ADJ(ifp, m, adj, err) \ do { \ - int len; \ - short mflags; \ - \ - len = (m)->m_pkthdr.len; \ - mflags = (m)->m_flags; \ - IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \ - if ((err) == 0) { \ - (ifp)->if_obytes += len + (adj); \ - if (mflags & M_MCAST) \ - (ifp)->if_omcasts++; \ - if (((ifp)->if_flags & IFF_OACTIVE) == 0) \ - if_start(ifp); \ - } \ -} while (0) + err = ifq_handoff(ifp, m, adj); \ +} while (0) \ #define IFQ_HANDOFF(ifp, m, err) \ IFQ_HANDOFF_ADJ(ifp, m, 0, err) @@ -584,8 +579,10 @@ mtx_init(&ifnet_lock, "ifnet", NULL, MTX_DEF | MTX_RECURSE) #define IFNET_WLOCK() mtx_lock(&ifnet_lock) #define IFNET_WUNLOCK() mtx_unlock(&ifnet_lock) +#define IFNET_WASSERT() mtx_assert(&ifnet_lock, MA_OWNED) #define IFNET_RLOCK() IFNET_WLOCK() #define IFNET_RUNLOCK() IFNET_WUNLOCK() +#define IFNET_RASSERT() IFNET_WASSERT() struct ifindex_entry { struct ifnet *ife_ifnet; --- //depot/vendor/freebsd/src/sys/net/netisr.c 2004/10/11 20:05:57 +++ //depot/user/rwatson/netperf/sys/net/netisr.c 2004/11/28 13:01:02 @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2004 Robert N. M. Watson * Copyright (c) 2001,2002,2003 Jonathan Lemon * Copyright (c) 1997, Stefan Esser * All rights reserved. @@ -48,6 +49,7 @@ #include #include +#include #include #include @@ -199,11 +201,19 @@ static struct isrstat isrstat; SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr counters"); +SYSCTL_NODE(_net_isr, OID_AUTO, dispatch, CTLFLAG_RW, 0, "direct dispatch"); + +static int netisr_dispatch_enable = 0; +SYSCTL_INT(_net_isr_dispatch, OID_AUTO, enable, CTLFLAG_RW, + &netisr_dispatch_enable, 0, "enable direct dispatch"); +TUNABLE_INT("net.isr.dispatch.enable", &netisr_dispatch_enable); -static int netisr_enable = 0; -SYSCTL_INT(_net_isr, OID_AUTO, enable, CTLFLAG_RW, - &netisr_enable, 0, "enable direct dispatch"); -TUNABLE_INT("net.isr.enable", &netisr_enable); +/* + * XXXRW: net.isr.enable is a compatibility interface, to be removed. + */ +SYSCTL_INT(_net_isr, OID_AUTO, enable, CTLFLAG_RW, &netisr_dispatch_enable, 0, + "enable direct dispatch"); +TUNABLE_INT("net.isr.enable", &netisr_dispatch_enable); SYSCTL_INT(_net_isr, OID_AUTO, count, CTLFLAG_RD, &isrstat.isrs_count, 0, ""); @@ -226,14 +236,21 @@ static void netisr_processqueue(struct netisr *ni) { + struct mbufqueue mbq; + struct ifqueue *ifq; struct mbuf *m; - for (;;) { - IF_DEQUEUE(ni->ni_queue, m); - if (m == NULL) - break; + ifq = ni->ni_queue; + if (ifq->ifq_head == NULL) + return; + mbq_init(&mbq); + IF_LOCK(ifq); + mbq_enqueue_from_ifq(&mbq, ifq); + IF_UNLOCK(ifq); + + while ((m = mbq_dequeue_one(&mbq)) != NULL) ni->ni_handler(m); - } + mbq_assert_empty(&mbq); } /* @@ -265,7 +282,7 @@ * between multiple places in the system (e.g. IP * dispatched from interfaces vs. IP queued from IPSec). */ - if (netisr_enable && (ni->ni_flags & NETISR_MPSAFE)) { + if (netisr_dispatch_enable && (ni->ni_flags & NETISR_MPSAFE)) { isrstat.isrs_directed++; /* * NB: We used to drain the queue before handling --- //depot/vendor/freebsd/src/sys/net/raw_cb.c 2005/01/24 23:00:55 +++ //depot/user/rwatson/netperf/sys/net/raw_cb.c 2005/02/27 16:23:38 @@ -126,6 +126,7 @@ m_freem(dtom(rp->rcb_faddr)); rp->rcb_faddr = 0; #endif + /* Unlocked read. */ if (rp->rcb_socket->so_state & SS_NOFDREF) raw_detach(rp); } --- //depot/vendor/freebsd/src/sys/net/raw_usrreq.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/net/raw_usrreq.c 2005/02/27 16:23:38 @@ -76,6 +76,10 @@ register struct mbuf *m = m0; struct socket *last; + /* + * XXXRW: Potential lock order issues due to holding the + * rawcb_mtx across all this stuff. Need to revisit. + */ last = 0; mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &rawcb_list, list) { --- //depot/vendor/freebsd/src/sys/net/rtsock.c 2005/07/15 09:51:31 +++ //depot/user/rwatson/netperf/sys/net/rtsock.c 2005/07/19 11:47:08 @@ -54,10 +54,18 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ +/* + * XXXRW: It would be really nice to add const to these, but that may + * not be possible due to where they are passed in. We might need + * to const-poison a whole boatload of APIs...? + */ static struct sockaddr route_dst = { 2, PF_ROUTE, }; static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; +/* + * XXXRW: These fields are locked by RTSOCK_LOCK(). + */ static struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ --- //depot/vendor/freebsd/src/sys/netatalk/aarp.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/netatalk/aarp.c 2005/07/05 14:55:33 @@ -104,10 +104,6 @@ #define AARPT_KILLC 20 #define AARPT_KILLI 3 -# if !defined(__FreeBSD__) -extern u_char etherbroadcastaddr[6]; -# endif /* __FreeBSD__ */ - static const u_char atmulticastaddr[ 6 ] = { 0x09, 0x00, 0x07, 0xff, 0xff, 0xff, }; @@ -119,6 +115,9 @@ 0x00, 0x00, 0x00, }; +/* + * XXXRW: Make use callouts, not timeouts. + */ static struct callout_handle aarptimer_ch = CALLOUT_HANDLE_INITIALIZER(&aarptimer_ch); @@ -153,6 +152,7 @@ struct at_ifaddr *aa; struct sockaddr_at *sat2; + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { sat2 = &(aa->aa_addr); if (sat2->sat_addr.s_net == sat->sat_addr.s_net) { @@ -164,6 +164,8 @@ break; } } + IFAREF((struct ifaddr *)aa); + AT_IFADDR_LIST_UNLOCK(); return (aa); } @@ -246,6 +248,7 @@ ntohs(AA_SAT(aa)->sat_addr.s_net), AA_SAT(aa)->sat_addr.s_node); #endif /* NETATALKDEBUG */ + IFAFREE((struct ifaddr *)aa); sa.sa_len = sizeof(struct sockaddr); sa.sa_family = AF_UNSPEC; @@ -274,6 +277,7 @@ bcopy(ifp->if_broadcastaddr, (caddr_t)desten, sizeof(ifp->if_addrlen)); } + IFAFREE((struct ifaddr *)aa); return (1); } @@ -383,6 +387,8 @@ /* * Since we don't know the net, we just look for the first * phase 1 address on the interface. + * + * XXXRW: Note, if_addrhead not locked here. */ for (aa = (struct at_ifaddr *)TAILQ_FIRST(&ifp->if_addrhead); aa; aa = (struct at_ifaddr *)aa->aa_ifa.ifa_link.tqe_next) { @@ -416,6 +422,7 @@ callout_stop(&aa->aa_callout); wakeup(aa); m_freem(m); + IFAFREE((struct ifaddr *)aa); return; } else if (op != AARPOP_PROBE) { /* @@ -428,6 +435,7 @@ ea->aarp_sha[ 0 ], ea->aarp_sha[ 1 ], ea->aarp_sha[ 2 ], ea->aarp_sha[ 3 ], ea->aarp_sha[ 4 ], ea->aarp_sha[ 5 ]); m_freem(m); + IFAFREE((struct ifaddr *)aa); return; } } @@ -444,6 +452,7 @@ aarptfree(aat); AARPTAB_UNLOCK(); m_freem(m); + IFAFREE((struct ifaddr *)aa); return; } @@ -479,6 +488,7 @@ if (tpa.s_net != ma.s_net || tpa.s_node != ma.s_node || op == AARPOP_RESPONSE || (aa->aa_flags & AFA_PROBING)) { m_freem(m); + IFAFREE((struct ifaddr *)aa); return; } @@ -497,6 +507,7 @@ sizeof(struct ether_aarp)); M_PREPEND(m, sizeof(struct llc), M_DONTWAIT); if (m == NULL) { + IFAFREE((struct ifaddr *)aa); return; } llc = mtod(m, struct llc *); @@ -510,6 +521,7 @@ } else { eh->ether_type = htons(ETHERTYPE_AARP); } + IFAFREE((struct ifaddr *)aa); ea->aarp_tpnode = ea->aarp_spnode; ea->aarp_spnode = ma.s_node; @@ -679,6 +691,9 @@ struct aarptab *aat; int i; + /* + * XXXRW: Should grab mutex before untimeout? + */ untimeout(aarptimer, 0, aarptimer_ch); AARPTAB_LOCK(); for (i = 0, aat = aarptab; i < AARPTAB_SIZE; i++, aat++) { --- //depot/vendor/freebsd/src/sys/netatalk/at_control.c 2005/02/22 14:20:44 +++ //depot/user/rwatson/netperf/sys/netatalk/at_control.c 2005/02/27 16:23:38 @@ -42,7 +42,15 @@ #include #include +/* + * Access to at_ifaddr_list and the aa_next pointer in struct at_ifaddr are + * locked with this global mutex. + * + * XXXRW: We'll need something better in the future. + */ +struct mtx at_ifaddr_mtx; struct at_ifaddr *at_ifaddr_list; +MTX_SYSINIT(at_ifaddr_mtx, &at_ifaddr_mtx, "at_ifaddr_mtx", MTX_DEF); static int aa_dorangeroute(struct ifaddr *ifa, u_int first, u_int last, int cmd); @@ -74,10 +82,12 @@ struct at_ifaddr *aa0; struct at_ifaddr *aa = NULL; struct ifaddr *ifa, *ifa0; + int error; /* * If we have an ifp, then find the matching at_ifaddr if it exists */ + mtx_lock(&at_ifaddr_mtx); if (ifp != NULL) { for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if (aa->aa_ifp == ifp) @@ -111,16 +121,20 @@ * If we a retrying to delete an addres but didn't find such, * then rewurn with an error */ - if (cmd == SIOCDIFADDR && aa == NULL) + if (cmd == SIOCDIFADDR && aa == NULL) { + mtx_unlock(&at_ifaddr_mtx); return (EADDRNOTAVAIL); + } /*FALLTHROUGH*/ case SIOCSIFADDR: /* * If we are not superuser, then we don't get to do these ops. */ - if (suser(td)) + if (suser(td)) { + mtx_unlock(&at_ifaddr_mtx); return (EPERM); + } sat = satosat(&ifr->ifr_addr); nr = (struct netrange *)sat->sat_zero; @@ -148,8 +162,7 @@ } } - if (ifp == NULL) - panic("at_control"); + KASSERT(ifp != NULL, ("at_control: ifp == NULL")); /* * If we failed to find an existing at_ifaddr entry, then we @@ -215,6 +228,9 @@ /* * If we DID find one then we clobber any routes * dependent on it.. + * + * XXXRW: Do we really want to hold at_ifaddr_mtx + * over at_scrub()? */ at_scrub(ifp, aa); } @@ -244,8 +260,10 @@ } } - if (aa == NULL) + if (aa == NULL) { + mtx_unlock(&at_ifaddr_mtx); return (EADDRNOTAVAIL); + } break; } @@ -253,6 +271,7 @@ * By the time this switch is run we should be able to assume that * the "aa" pointer is valid when needed. */ + error = 0; switch (cmd) { case SIOCGIFADDR: @@ -274,18 +293,30 @@ break; case SIOCSIFADDR: - return (at_ifinit(ifp, aa, - (struct sockaddr_at *)&ifr->ifr_addr)); + /* + * XXXRW: Do we really want to hold at_ifaddr_mtx over + * at_ifinit(). + */ + error = at_ifinit(ifp, aa, + (struct sockaddr_at *)&ifr->ifr_addr); + break; case SIOCAIFADDR: if (sateqaddr(&ifra->ifra_addr, &aa->aa_addr)) - return (0); - return (at_ifinit(ifp, aa, - (struct sockaddr_at *)&ifr->ifr_addr)); + break; + /* + * XXXRW: Do we really want to hold at_ifaddr_mtx over + * at_ifinit(). + */ + error = at_ifinit(ifp, aa, + (struct sockaddr_at *)&ifr->ifr_addr); + break; case SIOCDIFADDR: /* * scrub all routes.. didn't we just DO this? XXX yes, del it + * + * XXXRW: Do we want to hold at_ifaddr_mtx over at_scrub()? */ at_scrub(ifp, aa); @@ -309,10 +340,9 @@ /* * if we found it, remove it, otherwise we screwed up. */ - if (aa->aa_next) - aa->aa_next = aa0->aa_next; - else - panic("at_control"); + KASSERT(aa->aa_next != NULL, + ("at_control: aa->aa_next == NULL")); + aa->aa_next = aa0->aa_next; } /* @@ -322,11 +352,14 @@ break; default: - if (ifp == NULL || ifp->if_ioctl == NULL) - return (EOPNOTSUPP); - return ((*ifp->if_ioctl)(ifp, cmd, data)); + if (ifp == NULL || ifp->if_ioctl == NULL) { + error = EOPNOTSUPP; + break; + } + error = (*ifp->if_ioctl)(ifp, cmd, data); } - return (0); + mtx_unlock(&at_ifaddr_mtx); + return (error); } /* @@ -340,6 +373,14 @@ { int error; + /* + * XXXRW: We don't asser the at_ifaddr_mtx here, but we do rely on + * the caller holding a reference to 'aa' in such a way that it won't + * go away as at_scrub() runs. In practice this means that either + * the caller has to hold an ifaddr reference or at_ifaddr_mtx. + * + * mtx_assert(&at_ifaddr_mtx, MA_OWNED); + */ if (aa->aa_flags & AFA_ROUTE) { if (ifp->if_flags & IFF_LOOPBACK) { if ((error = aa_delsingleroute(&aa->aa_ifa, @@ -374,6 +415,7 @@ int netinc, nodeinc, nnets; u_short net; + mtx_assert(&at_ifaddr_mtx, MA_OWNED); /* * save the old addresses in the at_ifaddr just in case we need them. */ @@ -517,13 +559,15 @@ /* * start off the probes as an asynchronous * activity. though why wait 200mSec? + * + * XXXRW: This probably opens up a race, but + * the race already existed it and so I + * haven't fixed it. */ - AARPTAB_LOCK(); callout_reset(&aa->aa_callout, hz / 5, aarpprobe, ifp); - if (msleep(aa, &aarptab_mtx, PPAUSE|PCATCH, + if (msleep(aa, &at_ifaddr_mtx, PPAUSE|PCATCH, "at_ifinit", 0)) { - AARPTAB_UNLOCK(); /* * theoretically we shouldn't time * out here so if we returned with an @@ -536,7 +580,6 @@ aa->aa_lastnet = onr.nr_lastnet; return (EINTR); } - AARPTAB_UNLOCK(); /* * The async activity should have woken us @@ -572,6 +615,9 @@ /* * Now that we have selected an address, we need to tell the interface * about it, just in case it needs to adjust something. + * + * XXXRW: Should we release at_ifaddr_mtx before calling into the + * interface ioctl() code? */ if (ifp->if_ioctl != NULL && (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)aa))) { @@ -644,6 +690,10 @@ * getting risky by now XXX */ if (error) { + /* + * XXXRW: Do we want to hold at_ifaddr_mtx over calls to + * at_scrub()? + */ at_scrub(ifp, aa); aa->aa_addr = oldaddr; aa->aa_firstnet = onr.nr_firstnet; @@ -666,6 +716,7 @@ at_broadcast(struct sockaddr_at *sat) { struct at_ifaddr *aa; + int ret; /* * If the node is not right, it can't be a broadcast @@ -682,13 +733,18 @@ /* * failing that, if the net is one we have, it's a broadcast as well. */ + ret = 0; + mtx_lock(&at_ifaddr_mtx); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if ((aa->aa_ifp->if_flags & IFF_BROADCAST) && (ntohs(sat->sat_addr.s_net) >= ntohs(aa->aa_firstnet) - && ntohs(sat->sat_addr.s_net) <= ntohs(aa->aa_lastnet))) - return (1); + && ntohs(sat->sat_addr.s_net) <= ntohs(aa->aa_lastnet))) { + ret = 1; + break; + } } - return (0); + mtx_unlock(&at_ifaddr_mtx); + return (ret); } /* @@ -810,6 +866,10 @@ struct ifaddr *ifa; struct ifnet *ifp; + /* + * XXXRW: #if 0, and hence untested. + */ + mtx_lock(&at_ifaddr_mtx); while ((aa = at_ifaddr_list) != NULL) { ifp = aa->aa_ifp; at_scrub(ifp, aa); @@ -820,13 +880,12 @@ while (ifa->ifa_next && (ifa->ifa_next != (struct ifaddr *)aa)) ifa = ifa->ifa_next; - if (ifa->ifa_next) - ifa->ifa_next = - ((struct ifaddr *)aa)->ifa_next; - else - panic("at_entry"); + KASSERT(ifa->ifa_next != NULL, + ("aa_clean: ifa_next == NULL")); + ifa->ifa_next = ((struct ifaddr *)aa)->ifa_next; } } + mtx_unlock(&at_ifaddr_mtx); } #endif --- //depot/vendor/freebsd/src/sys/netatalk/at_var.h 2005/02/22 14:20:44 +++ //depot/user/rwatson/netperf/sys/netatalk/at_var.h 2005/02/27 16:23:38 @@ -60,6 +60,11 @@ #define AFA_PHASE2 0x0004 #ifdef _KERNEL +extern struct mtx at_ifaddr_mtx; +#define AT_IFADDR_LIST_ASSERT() mtx_assert(&at_ifaddr_mtx, MA_OWNED) +#define AT_IFADDR_LIST_LOCK() mtx_lock(&at_ifaddr_mtx) +#define AT_IFADDR_LIST_UNLOCK() mtx_unlock(&at_ifaddr_mtx) + extern struct at_ifaddr *at_ifaddr_list; #endif --- //depot/vendor/freebsd/src/sys/netatalk/ddp_input.c 2005/01/07 02:37:15 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_input.c 2005/02/27 16:23:38 @@ -76,6 +76,11 @@ static volatile int ddp_firewall = 0; static struct ddpstat ddpstat; +/* + * XXXRW: If we're going to keep this cached route data, we'll need to lock it + * down, and change later function-local use of it to grab an extra reference + * after deciding it is useful. + */ static struct route forwro; static void ddp_input(struct mbuf *, struct ifnet *, struct elaphdr *, int); @@ -170,6 +175,7 @@ * Make sure that we point to the phase1 ifaddr info * and that it's valid for this packet. */ + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if ((aa->aa_ifp == ifp) && ((aa->aa_flags & AFA_PHASE2) == 0) @@ -185,6 +191,8 @@ m_freem(m); return; } + IFAREF((struct ifaddr *)aa); + AT_IFADDR_LIST_UNLOCK(); } else { /* * There was no 'elh' passed on. This could still be @@ -229,6 +237,7 @@ * this node number will match (which may NOT be what we want, * but it's probably safe in 99.999% of cases. */ + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if (phase == 1 && (aa->aa_flags & AFA_PHASE2)) { continue; @@ -243,11 +252,15 @@ break; } } + if (aa != NULL) + IFAREF((struct ifaddr *)aa); + AT_IFADDR_LIST_UNLOCK(); } else { /* * A destination network was given. We just try to find * which ifaddr info matches it. */ + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { /* * This is a kludge. Accept packets that are @@ -281,6 +294,9 @@ } break; } + if (aa != NULL) + IFAREF((struct ifaddr *)aa); + AT_IFADDR_LIST_UNLOCK(); } } @@ -293,6 +309,8 @@ if (mlen < dlen) { ddpstat.ddps_toosmall++; m_freem(m); + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); return; } if (mlen > dlen) { @@ -315,6 +333,8 @@ */ if (ddp_forward == 0) { m_freem(m); + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); return; } /* @@ -359,6 +379,8 @@ && ((forwro.ro_rt == NULL) || (forwro.ro_rt->rt_ifp != ifp))) { m_freem(m); + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); return; } @@ -376,6 +398,8 @@ } else { ddpstat.ddps_forward++; } + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); return; } @@ -440,6 +464,8 @@ DDP_LIST_SUNLOCK(); if (m != NULL) m_freem(m); + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); } #if 0 --- //depot/vendor/freebsd/src/sys/netatalk/ddp_output.c 2005/07/05 23:40:38 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_output.c 2005/07/19 11:47:08 @@ -151,6 +151,7 @@ && (ro->ro_rt->rt_ifa) && (ifp = ro->ro_rt->rt_ifa->ifa_ifp)) { net = ntohs(satosat(ro->ro_rt->rt_gateway)->sat_addr.s_net); + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if (((net == 0) || (aa->aa_ifp == ifp)) && net >= ntohs(aa->aa_firstnet) && @@ -158,6 +159,8 @@ break; } } + IFAREF((struct ifaddr *)aa); + AT_IFADDR_LIST_UNLOCK(); } else { m_freem(m); #ifdef NETATALK_DEBUG @@ -177,6 +180,8 @@ ifp->if_xname); #endif m_freem(m); + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); return (ENETUNREACH); } @@ -203,6 +208,8 @@ MGET(m0, M_DONTWAIT, MT_HEADER); if (m0 == NULL) { m_freem(m); + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); printf("ddp_route: no buffers\n"); return (ENOBUFS); } @@ -236,9 +243,13 @@ if ((satosat(&aa->aa_addr)->sat_addr.s_net == satosat(&ro->ro_dst)->sat_addr.s_net) && (satosat(&aa->aa_addr)->sat_addr.s_node == satosat(&ro->ro_dst)->sat_addr.s_node)) { + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); return (if_simloop(ifp, m, gate.sat_family, 0)); } + if (aa != NULL) + IFAFREE((struct ifaddr *)aa); return ((*ifp->if_output)(ifp, m, (struct sockaddr *)&gate, NULL)); /* XXX */ } --- //depot/vendor/freebsd/src/sys/netatalk/ddp_pcb.c 2005/01/07 02:37:15 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_pcb.c 2005/02/27 16:23:38 @@ -64,6 +64,7 @@ struct sockaddr_at lsat, *sat; struct at_ifaddr *aa; struct ddpcb *ddpp; + int error; /* * We read and write both the ddp passed in, and also ddp_ports. @@ -71,37 +72,46 @@ DDP_LIST_XLOCK_ASSERT(); DDP_LOCK_ASSERT(ddp); - if (ddp->ddp_lsat.sat_port != ATADDR_ANYPORT) { /* shouldn't be bound */ - return (EINVAL); + error = 0; + if (ddp->ddp_lsat.sat_port != ATADDR_ANYPORT) { + /* shouldn't be bound */ + error = EINVAL; + goto out; } if (addr != NULL) { /* validate passed address */ sat = (struct sockaddr_at *)addr; if (sat->sat_family != AF_APPLETALK) { - return (EAFNOSUPPORT); + error = EAFNOSUPPORT; + goto out; } if (sat->sat_addr.s_node != ATADDR_ANYNODE || sat->sat_addr.s_net != ATADDR_ANYNET) { + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if ((sat->sat_addr.s_net == AA_SAT(aa)->sat_addr.s_net) && (sat->sat_addr.s_node == AA_SAT(aa)->sat_addr.s_node)) { break; } } - if (!aa) { - return (EADDRNOTAVAIL); + AT_IFADDR_LIST_UNLOCK(); + if (aa != NULL) { + error = EADDRNOTAVAIL; + goto out; } } if (sat->sat_port != ATADDR_ANYPORT) { if (sat->sat_port < ATPORT_FIRST || sat->sat_port >= ATPORT_LAST) { - return (EINVAL); + error = EINVAL; + goto out; } if (sat->sat_port < ATPORT_RESERVED && suser(td)) { - return (EACCES); + error = EACCES; + goto out; } } } else { @@ -116,7 +126,8 @@ if (sat->sat_addr.s_node == ATADDR_ANYNODE && sat->sat_addr.s_net == ATADDR_ANYNET) { if (at_ifaddr_list == NULL) { - return (EADDRNOTAVAIL); + error = EADDRNOTAVAIL; + goto out; } sat->sat_addr = AA_SAT(at_ifaddr_list)->sat_addr; } @@ -133,7 +144,8 @@ } } if (sat->sat_port == ATPORT_LAST) { - return (EADDRNOTAVAIL); + error = EADDRNOTAVAIL; + goto out; } ddp->ddp_lsat.sat_port = sat->sat_port; ddp_ports[ sat->sat_port - 1 ] = ddp; @@ -146,7 +158,8 @@ } } if (ddpp != NULL) { - return (EADDRINUSE); + error = EADDRINUSE; + goto out; } ddp->ddp_pnext = ddp_ports[ sat->sat_port - 1 ]; ddp_ports[ sat->sat_port - 1 ] = ddp; @@ -155,7 +168,8 @@ } } - return (0); +out: + return (error); } int @@ -201,6 +215,7 @@ } aa = NULL; if ((ifp = ro->ro_rt->rt_ifp) != NULL) { + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if (aa->aa_ifp == ifp && ntohs(net) >= ntohs(aa->aa_firstnet) && @@ -208,6 +223,7 @@ break; } } + AT_IFADDR_LIST_UNLOCK(); } if (aa == NULL || (satosat(&ro->ro_dst)->sat_addr.s_net != (hintnet ? hintnet : sat->sat_addr.s_net) || @@ -238,11 +254,13 @@ */ aa = NULL; if (ro->ro_rt && (ifp = ro->ro_rt->rt_ifp)) { + AT_IFADDR_LIST_LOCK(); for (aa = at_ifaddr_list; aa != NULL; aa = aa->aa_next) { if (aa->aa_ifp == ifp) { break; } } + AT_IFADDR_LIST_UNLOCK(); } if (aa == NULL) { return (ENETUNREACH); @@ -279,6 +297,10 @@ DDP_LOCK_INIT(ddp); ddp->ddp_lsat.sat_port = ATADDR_ANYPORT; + /* + * XXXRW: Is this unlocked assignment payer for socket and + * back-pointer something that needs to be protected? + */ ddp->ddp_socket = so; so->so_pcb = (caddr_t)ddp; --- //depot/vendor/freebsd/src/sys/netatalk/ddp_usrreq.c 2005/02/18 10:57:04 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_usrreq.c 2005/02/27 16:23:38 @@ -44,6 +44,11 @@ #include #include +/* + * XXXRW: These structures are currently not mutable. They're not marked + * with 'const' because I suspect that consumers of this code would like for + * them to be mutable someday. + */ static u_long ddp_sendspace = DDP_MAXSZ; /* Max ddp size + 1 (ddp_type) */ static u_long ddp_recvspace = 10 * (587 + sizeof(struct sockaddr_at)); @@ -170,6 +175,13 @@ return (0); } +/* + * XXXRW: If an explicit address is specified, then we temporarily change + * the address on the pcb for sending. This is inefficient because it + * requires us to perform global rather than pcb-local operations. It + * may also create a race if other users of the socket are simultaneously + * sending. + */ static int ddp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) --- //depot/vendor/freebsd/src/sys/netgraph/ng_base.c 2005/07/05 17:37:36 +++ //depot/user/rwatson/netperf/sys/netgraph/ng_base.c 2005/07/19 11:47:08 @@ -3088,6 +3088,7 @@ ng_qzone = uma_zcreate("NetGraph items", sizeof(struct ng_item), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(ng_qzone, maxalloc); + /* XXX could use NETISR_MPSAFE but need to verify code */ netisr_register(NETISR_NETGRAPH, (netisr_t *)ngintr, NULL, NETISR_MPSAFE); break; --- //depot/vendor/freebsd/src/sys/netgraph/ng_ksocket.c 2005/05/16 17:10:18 +++ //depot/user/rwatson/netperf/sys/netgraph/ng_ksocket.c 2005/06/01 10:10:04 @@ -1005,6 +1005,9 @@ * To decouple stack, we use queue version of ng_send_fn(). */ +/* + * XXXRW: ng_ksocket_incoming() is called without Giant. Is that OK? + */ static void ng_ksocket_incoming(struct socket *so, void *arg, int waitflag) { --- //depot/vendor/freebsd/src/sys/netinet/accf_http.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netinet/accf_http.c 2005/02/27 16:23:38 @@ -161,6 +161,7 @@ sohashttpget(struct socket *so, void *arg, int waitflag) { + /* Unlocked read. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { struct mbuf *m; char *cmp; @@ -214,6 +215,7 @@ struct mbuf *m, *n; int i, cc, spaces, inspaces; + /* Unlocked read. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) goto fallout; @@ -301,6 +303,7 @@ int ccleft, copied; DPRINT("start"); + /* Unlocked read. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) goto gotit; --- //depot/vendor/freebsd/src/sys/netinet/if_ether.c 2005/06/05 03:15:31 +++ //depot/user/rwatson/netperf/sys/netinet/if_ether.c 2005/07/05 14:55:33 @@ -101,6 +101,11 @@ #define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ }; +/* + * XXXRW: Need to document (and/or fix) locking for this. We always + * seem to hold a lock (and assert) when referencing this list, but it's + * not clear it's always the same lock. + */ static LIST_HEAD(, llinfo_arp) llinfo_arp; static struct ifqueue arpintrq; --- //depot/vendor/freebsd/src/sys/netinet/igmp.c 2005/03/26 22:25:15 +++ //depot/user/rwatson/netperf/sys/netinet/igmp.c 2005/06/01 10:10:04 @@ -99,6 +99,9 @@ static u_long igmp_all_hosts_group; static u_long igmp_all_rtrs_group; +/* + * XXXRW: These variables make me vaguely nervous. + */ static struct mbuf *router_alert; static struct route igmprt; @@ -123,6 +126,7 @@ /* * Construct a Router Alert option to use in outgoing packets + * XXXRW: This might actually need a MAC label. */ MGET(router_alert, M_DONTWAIT, MT_DATA); ra = mtod(router_alert, struct ipoption *); --- //depot/vendor/freebsd/src/sys/netinet/in_gif.c 2005/06/20 08:42:36 +++ //depot/user/rwatson/netperf/sys/netinet/in_gif.c 2005/07/05 14:55:33 @@ -174,6 +174,9 @@ } bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); + /* + * XXXRW: locking of gif's softc. + */ if (dst->sin_family != sin_dst->sin_family || dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { /* cache route doesn't match */ @@ -335,6 +338,10 @@ case 0: case 127: case 255: return 0; } + + /* + * XXXRW: Lock in_ifaddrhead walking. + */ /* reject packets with broadcast on source */ TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) @@ -343,6 +350,7 @@ return 0; } + /* XXXRW: unlocked read. */ /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) { struct sockaddr_in sin; @@ -398,6 +406,11 @@ in_gif_attach(sc) struct gif_softc *sc; { + + /* + * XXXRW: Technically, NULL can also be returned for ENOMEM, + * not just EEXIST. + */ sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck, &in_gif_protosw, sc); if (sc->encap_cookie4 == NULL) --- //depot/vendor/freebsd/src/sys/netinet/in_pcb.c 2005/06/01 11:45:18 +++ //depot/user/rwatson/netperf/sys/netinet/in_pcb.c 2005/07/05 14:55:33 @@ -702,6 +702,7 @@ #ifdef IPSEC ipsec_pcbdisconn(inp->inp_sp); #endif + /* Unlocked read. */ if (inp->inp_socket->so_state & SS_NOFDREF) in_pcbdetach(inp); } @@ -774,6 +775,10 @@ struct in_addr addr; in_port_t port; + /* + * XXXRW: Why not use the inpcb lock instead of the info lock + * here? + */ s = splnet(); INP_INFO_RLOCK(pcbinfo); inp = sotoinpcb(so); @@ -782,10 +787,8 @@ splx(s); return ECONNRESET; } - INP_LOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; - INP_UNLOCK(inp); INP_INFO_RUNLOCK(pcbinfo); splx(s); @@ -807,6 +810,10 @@ struct in_addr addr; in_port_t port; + /* + * XXXRW: Why not use the inpcb lock instead of the info lock + * here? + */ s = splnet(); INP_INFO_RLOCK(pcbinfo); inp = sotoinpcb(so); @@ -815,10 +822,8 @@ splx(s); return ECONNRESET; } - INP_LOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; - INP_UNLOCK(inp); INP_INFO_RUNLOCK(pcbinfo); splx(s); --- //depot/vendor/freebsd/src/sys/netinet/ip_divert.c 2005/05/13 11:46:15 +++ //depot/user/rwatson/netperf/sys/netinet/ip_divert.c 2005/06/01 10:10:04 @@ -373,6 +373,9 @@ m->m_pkthdr.rcvif = ifa->ifa_ifp; } #ifdef MAC + /* + * XXXRW: perhaps should be mac_create_mbuf_from_inpcb()? + */ SOCK_LOCK(so); mac_create_mbuf_from_socket(so, m); SOCK_UNLOCK(so); @@ -421,6 +424,7 @@ inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; INP_UNLOCK(inp); + /* XXXRW: Not clear if this is adequate. */ return 0; } --- //depot/vendor/freebsd/src/sys/netinet/ip_encap.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netinet/ip_encap.c 2005/02/27 16:23:38 @@ -106,8 +106,7 @@ LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); /* - * We currently keey encap_init() for source code compatibility reasons -- - * it's referenced by KAME pieces in netinet6. + * XXXRW: encap_init() was entirely useless, so I deleted it. */ void encap_init() @@ -185,6 +184,10 @@ } mtx_unlock(&encapmtx); + /* + * XXXRW: Need drain mechanism to prevent the encapsulation + * entry from being released while in use. + */ if (match) { /* found a match, "match" has the best one */ psw = match->psw; @@ -255,6 +258,10 @@ } mtx_unlock(&encapmtx); + /* + * XXXRW: Need drain mechanism so the encap entry isn't freed + * while in use. + */ if (match) { /* found a match */ psw = (const struct ip6protosw *)match->psw; --- //depot/vendor/freebsd/src/sys/netinet/ip_encap.h 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netinet/ip_encap.h 2005/02/27 16:23:38 @@ -35,6 +35,15 @@ #ifdef _KERNEL +/* + * This structure is entirely static after registration, and other than + * its entry in the encapsulation table, requires no locking. The chain + * field is locked using the global encapmtx. + * + * XXXRW: Need to add a refcount/drain mechanism so that encapsulation + * entries can't be removed while in use. This likely requires a + * refcount and cv to wait for it to drain, or an sx lock. + */ struct encaptab { LIST_ENTRY(encaptab) chain; int af; --- //depot/vendor/freebsd/src/sys/netinet/ip_fastfwd.c 2005/05/04 13:11:32 +++ //depot/user/rwatson/netperf/sys/netinet/ip_fastfwd.c 2005/06/01 10:10:04 @@ -482,6 +482,7 @@ * "ours"-label. */ m->m_flags |= M_FASTFWD_OURS; + /* XXX statistic */ if (ro.ro_rt) RTFREE(ro.ro_rt); return 0; --- //depot/vendor/freebsd/src/sys/netinet/ip_id.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netinet/ip_id.c 2005/02/27 16:23:38 @@ -61,7 +61,10 @@ #include #include #include +#include +#include #include +#include #define RU_OUT 180 /* Time after wich will be reseeded */ #define RU_MAX 30000 /* Uniq cycle, avoid blackjack prediction */ @@ -77,19 +80,24 @@ 2729 }; -static u_int16_t ru_x; -static u_int16_t ru_seed, ru_seed2; -static u_int16_t ru_a, ru_b; -static u_int16_t ru_g; -static u_int16_t ru_counter = 0; -static u_int16_t ru_msb = 0; -static long ru_reseed; -static u_int32_t tmp; /* Storage for unused random */ +struct random_pcpu { + u_int16_t ru_x; + u_int16_t ru_seed, ru_seed2; + u_int16_t ru_a, ru_b; + u_int16_t ru_g; + u_int16_t ru_counter; + u_int16_t ru_msb; + long ru_reseed; + u_int32_t tmp; /* Storage for unused random */ +}; + +static struct random_pcpu random_pcpu[MAXCPU]; static u_int16_t pmod(u_int16_t, u_int16_t, u_int16_t); -static void ip_initid(void); +static void ip_initid(struct random_pcpu *rp); u_int16_t ip_randomid(void); + /* * Do a fast modular exponation, returned value will be in the range * of 0 - (mod-1) @@ -128,32 +136,32 @@ * application does not have to worry about it. */ static void -ip_initid(void) +ip_initid(struct random_pcpu *rp) { u_int16_t j, i; int noprime = 1; struct timeval time; getmicrotime(&time); - read_random((void *) &tmp, sizeof(tmp)); - ru_x = (tmp & 0xFFFF) % RU_M; + read_random((void *) &rp->tmp, sizeof(rp->tmp)); + rp->ru_x = (rp->tmp & 0xFFFF) % RU_M; /* 15 bits of random seed */ - ru_seed = (tmp >> 16) & 0x7FFF; - read_random((void *) &tmp, sizeof(tmp)); - ru_seed2 = tmp & 0x7FFF; + rp->ru_seed = (rp->tmp >> 16) & 0x7FFF; + read_random((void *) &rp->tmp, sizeof(rp->tmp)); + rp->ru_seed2 = rp->tmp & 0x7FFF; - read_random((void *) &tmp, sizeof(tmp)); + read_random((void *) &rp->tmp, sizeof(rp->tmp)); /* Determine the LCG we use */ - ru_b = (tmp & 0xfffe) | 1; - ru_a = pmod(RU_AGEN, (tmp >> 16) & 0xfffe, RU_M); - while (ru_b % 3 == 0) - ru_b += 2; + rp->ru_b = (rp->tmp & 0xfffe) | 1; + rp->ru_a = pmod(RU_AGEN, (rp->tmp >> 16) & 0xfffe, RU_M); + while (rp->ru_b % 3 == 0) + rp->ru_b += 2; - read_random((void *) &tmp, sizeof(tmp)); - j = tmp % RU_N; - tmp = tmp >> 16; + read_random((void *) &rp->tmp, sizeof(rp->tmp)); + j = rp->tmp % RU_N; + rp->tmp = rp->tmp >> 16; /* * Do a fast gcd(j,RU_N-1), so we can find a j with @@ -172,38 +180,46 @@ j = (j+1) % RU_N; } - ru_g = pmod(RU_GEN,j,RU_N); - ru_counter = 0; + rp->ru_g = pmod(RU_GEN,j,RU_N); + rp->ru_counter = 0; - ru_reseed = time.tv_sec + RU_OUT; - ru_msb = ru_msb == 0x8000 ? 0 : 0x8000; + rp->ru_reseed = time.tv_sec + RU_OUT; + rp->ru_msb = rp->ru_msb == 0x8000 ? 0 : 0x8000; } u_int16_t ip_randomid(void) { + struct random_pcpu *rp; + struct timeval time; + u_int16_t retval; int i, n; - struct timeval time; - /* XXX not reentrant */ getmicrotime(&time); - if (ru_counter >= RU_MAX || time.tv_sec > ru_reseed) - ip_initid(); + + critical_enter(); + rp = &random_pcpu[curcpu]; + + if (rp->ru_counter >= RU_MAX || time.tv_sec > rp->ru_reseed) + ip_initid(rp); - if (!tmp) - read_random((void *) &tmp, sizeof(tmp)); + if (!rp->tmp) + read_random((void *) &rp->tmp, sizeof(rp->tmp)); /* Skip a random number of ids */ - n = tmp & 0x3; tmp = tmp >> 2; - if (ru_counter + n >= RU_MAX) - ip_initid(); + n = rp->tmp & 0x3; rp->tmp = rp->tmp >> 2; + if (rp->ru_counter + n >= RU_MAX) + ip_initid(rp); for (i = 0; i <= n; i++) /* Linear Congruential Generator */ - ru_x = (ru_a*ru_x + ru_b) % RU_M; + rp->ru_x = (rp->ru_a*rp->ru_x + rp->ru_b) % RU_M; - ru_counter += i; + rp->ru_counter += i; - return (ru_seed ^ pmod(ru_g,ru_seed2 ^ ru_x,RU_N)) | ru_msb; + retval = (rp->ru_seed ^ pmod(rp->ru_g,rp->ru_seed2 ^ rp->ru_x,RU_N)) + | rp->ru_msb; + critical_exit(); + return (retval); } --- //depot/vendor/freebsd/src/sys/netinet/ip_output.c 2005/07/05 23:40:38 +++ //depot/user/rwatson/netperf/sys/netinet/ip_output.c 2005/07/19 11:47:08 @@ -1149,7 +1149,8 @@ struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); - int error, optval; + struct mbuf *m, *mopt; + int error, optval, tmp; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { @@ -1164,7 +1165,6 @@ case IP_RETOPTS: #endif { - struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; @@ -1290,7 +1290,6 @@ caddr_t req; size_t len = 0; int priv; - struct mbuf *m; int optname; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ @@ -1302,7 +1301,9 @@ req = mtod(m, caddr_t); len = m->m_len; optname = sopt->sopt_name; + INP_LOCK(inp); error = ipsec4_set_policy(inp, optname, req, len, priv); + INP_UNLOCK(inp); m_freem(m); break; } @@ -1318,13 +1319,39 @@ switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: - if (inp->inp_options) - error = sooptcopyout(sopt, - mtod(inp->inp_options, - char *), - inp->inp_options->m_len); - else + /* + * Make a temporary copy of the options in a local + * mbuf while holding the lock so that the copyout() + * can be done without a lock. Optimistically assume + * options will be present, so always allocate the + * mbuf and grab the lock. If there's no data, it + * doesn't matter if the allocation fails. + */ + MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, + MT_HEADER); + INP_LOCK(inp); + mopt = inp->inp_options; + if (mopt == NULL) { + INP_UNLOCK(inp); + if (m != NULL) + m_freem(m); + sopt->sopt_valsize = 0; + break; + } + if (m == NULL) { + INP_UNLOCK(inp); sopt->sopt_valsize = 0; + error = ENOBUFS; + break; + } + KASSERT(mopt->m_len <= MLEN, + ("ip_ctloutput: m_len > MLEN")); + bcopy(mtod(mopt, char *), mtod(m, char *), + mopt->m_len); + m->m_len = mopt->m_len; + INP_UNLOCK(inp); + error = sooptcopyout(sopt, mtod(m, char *), m->m_len); + m_freem(m); break; case IP_TOS: @@ -1370,9 +1397,14 @@ break; case IP_PORTRANGE: - if (inp->inp_flags & INP_HIGHPORT) + /* + * We make a local copy to avoid multiple + * reads without locking. + */ + tmp = inp->inp_flags; + if (tmp & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; - else if (inp->inp_flags & INP_LOWPORT) + else if (tmp & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; @@ -1585,12 +1617,14 @@ imo = inp->inp_moptions; if (imo == NULL) { /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. + * No multicast option buffer attached to the pcb; allocate + * one and initialize to default values. Make sure to handle + * losing this race, as malloc() may sleep. + * + * XXXRW: inpcb locking here. */ imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); - if (imo == NULL) return (ENOBUFS); inp->inp_moptions = imo; @@ -1647,11 +1681,13 @@ error = EADDRNOTAVAIL; break; } + INP_LOCK(inp); imo->imo_multicast_ifp = ifp; if (ifindex) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; + INP_UNLOCK(inp); splx(s); break; @@ -1754,6 +1790,7 @@ * See if the membership already exists or if all the * membership slots are full. */ + INP_LOCK(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if (imo->imo_membership[i]->inm_ifp == ifp && imo->imo_membership[i]->inm_addr.s_addr @@ -1761,11 +1798,13 @@ break; } if (i < imo->imo_num_memberships) { + INP_UNLOCK(inp); error = EADDRINUSE; splx(s); break; } if (i == IP_MAX_MEMBERSHIPS) { + INP_UNLOCK(inp); error = ETOOMANYREFS; splx(s); break; @@ -1774,8 +1813,10 @@ * Everything looks good; add a new record to the multicast * address list for the given interface. */ + INP_UNLOCK(inp); /* XXXRW: in_addmulti() may sleep? */ if ((imo->imo_membership[i] = in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; splx(s); break; @@ -1816,6 +1857,7 @@ /* * Find the membership in the membership array. */ + INP_LOCK(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) && @@ -1824,6 +1866,7 @@ break; } if (i == imo->imo_num_memberships) { + INP_UNLOCK(inp); error = EADDRNOTAVAIL; splx(s); break; @@ -1832,6 +1875,7 @@ * Give up the multicast address record to which the * membership points. */ + INP_UNLOCK(inp); /* XXXRW: in_delmulti() may sleep? */ in_delmulti(imo->imo_membership[i]); /* * Remove the gap in the membership array. @@ -1850,6 +1894,7 @@ /* * If all options have default values, no need to keep the mbuf. */ + INP_LOCK(inp); if (imo->imo_multicast_ifp == NULL && imo->imo_multicast_vif == -1 && imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && @@ -1858,6 +1903,7 @@ free(inp->inp_moptions, M_IPMOPTS); inp->inp_moptions = NULL; } + INP_UNLOCK(inp); return (error); } @@ -1879,7 +1925,9 @@ error = 0; switch (sopt->sopt_name) { - case IP_MULTICAST_VIF: + case IP_MULTICAST_VIF: + INP_LOCK(inp); + imo = inp->inp_moptions; if (imo != NULL) optval = imo->imo_multicast_vif; else @@ -1889,6 +1937,8 @@ break; case IP_MULTICAST_IF: + INP_LOCK(inp); + imo = inp->inp_moptions; if (imo == NULL || imo->imo_multicast_ifp == NULL) addr.s_addr = INADDR_ANY; else if (imo->imo_multicast_addr.s_addr) { @@ -1904,7 +1954,9 @@ break; case IP_MULTICAST_TTL: - if (imo == 0) + INP_LOCK(inp); + imo = inp->inp_moptions; + if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; @@ -1916,7 +1968,9 @@ break; case IP_MULTICAST_LOOP: - if (imo == 0) + INP_LOCK(inp); + imo = inp->inp_moptions; + if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; --- //depot/vendor/freebsd/src/sys/netinet/raw_ip.c 2005/06/01 11:40:22 +++ //depot/user/rwatson/netperf/sys/netinet/raw_ip.c 2005/07/05 14:55:33 @@ -86,6 +86,9 @@ * so leave them not initialized and rely on BSS being set to 0. */ +/* + * XXXRW: Locking for mrouter bits? + */ /* The socket used to communicate with the multicast routing daemon. */ struct socket *ip_mrouter; @@ -660,6 +663,7 @@ } INP_LOCK(inp); soisdisconnected(so); + /* Unlocked read. */ if (so->so_state & SS_NOFDREF) rip_pcbdetach(so, inp); else @@ -671,6 +675,8 @@ static int rip_disconnect(struct socket *so) { + + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return rip_abort(so); @@ -767,6 +773,7 @@ INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); + /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { INP_INFO_WUNLOCK(&ripcbinfo); --- //depot/vendor/freebsd/src/sys/netinet/tcp_debug.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_debug.c 2005/02/27 16:23:38 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include --- //depot/vendor/freebsd/src/sys/netinet/tcp_hostcache.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_hostcache.c 2005/02/27 16:23:38 @@ -250,7 +250,8 @@ * Set up periodic cache cleanup. */ callout_init(&tcp_hc_callout, CALLOUT_MPSAFE); - callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0); + callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, + NULL); } /* @@ -287,6 +288,9 @@ /* * circle through entries in bucket row looking for a match + * + * XXXRW: Wouldn't it be faster to let the compiler do the comparison + * if the data types are approrpiate, rather than using memcmp()? */ TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { if (inc->inc_isipv6) { --- //depot/vendor/freebsd/src/sys/netinet/tcp_input.c 2005/07/05 19:26:06 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_input.c 2005/07/19 11:47:08 @@ -1231,6 +1231,7 @@ tcp_timer_rexmt, tp); sowwakeup(so); + /* Unlocked read. */ if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; @@ -1655,6 +1656,7 @@ * If new data are received on a connection after the * user processes are gone, then RST the other end. */ + /* Unlocked read. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { KASSERT(headlocked, ("trimthenstep6: tcp_close.3: head not " @@ -2159,6 +2161,7 @@ * we should release the tp also, and use a * compressed state. */ + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, @@ -2535,6 +2538,11 @@ if (tp) INP_UNLOCK(inp); + /* + * XXXRW: Added assertion at dropwithreset label because I believe + * the head should always be locked here. If true, this can be + * changed to an unconditional unlock. + */ if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; --- //depot/vendor/freebsd/src/sys/netinet/tcp_output.c 2005/05/21 00:40:18 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_output.c 2005/06/01 10:10:04 @@ -275,6 +275,7 @@ * to send then the probe will be the FIN * itself. */ + /* Unlocked read of sb_cc. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; @@ -381,6 +382,7 @@ len = tp->t_maxseg; sendalot = 1; } + /* Unlocked read of sb_cc. */ if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; @@ -413,6 +415,7 @@ * * note: the len + off check is almost certainly unnecessary. */ + /* Unlocked read of sb_cc. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && @@ -504,6 +507,7 @@ * if window is nonzero, transmit what we can, * otherwise force out a byte. */ + /* Unlocked read of sb_cc. */ if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { tp->t_rxtshift = 0; @@ -786,6 +790,7 @@ * give data to the user when a buffer fills or * a PUSH comes in.) */ + /* Unlocked read of sb_cc. */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); --- //depot/vendor/freebsd/src/sys/netinet/tcp_subr.c 2005/07/01 22:55:27 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_subr.c 2005/07/05 14:55:33 @@ -1466,6 +1466,10 @@ #endif /* INET6 */ INP_LOCK_ASSERT(inp); + + /* + * XXXRW: How often and why is it that tp could be non-NULL here? + */ if (tp != NULL) { #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; --- //depot/vendor/freebsd/src/sys/netinet/tcp_syncache.c 2005/04/21 20:30:38 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_syncache.c 2005/06/01 10:10:04 @@ -556,6 +556,9 @@ goto abort2; } #ifdef MAC + /* + * XXXRW: Would prefer inpcb. + */ SOCK_LOCK(so); mac_set_socket_peer_from_mbuf(m, so); SOCK_UNLOCK(so); @@ -719,6 +722,14 @@ abort: INP_UNLOCK(inp); abort2: + /* + * XXXRW: Technically speaking, this soabort() likely doesn't + * do anything, since we insert sockets into the accept queues + * in an already completed state, and soabort() leaves it to + * the close() on the listen socket to remove completed + * connections. However, this means a TCP socket without + * full TCP state could slip through...? + */ if (so != NULL) (void) soabort(so); return (NULL); --- //depot/vendor/freebsd/src/sys/netinet/tcp_timer.c 2005/05/21 00:40:18 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_timer.c 2005/06/01 10:10:04 @@ -263,6 +263,9 @@ LIST_REMOVE(tw, tw_2msl); tw->tw_time = timeo + ticks; i = timeo > tcp_msl ? 1 : 0; + /* + * XXXRW: Locking? + */ tw_tail = &twl_2msl[i].tw_tail; LIST_INSERT_BEFORE(tw_tail, tw, tw_2msl); } @@ -272,6 +275,7 @@ { INP_INFO_WLOCK_ASSERT(&tcbinfo); + /* XXXRW: Assert tw lock? */ if (tw->tw_time != 0) LIST_REMOVE(tw, tw_2msl); } --- //depot/vendor/freebsd/src/sys/netinet/tcp_usrreq.c 2005/06/01 12:15:19 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_usrreq.c 2005/07/05 14:55:33 @@ -146,6 +146,50 @@ } /* + * Common code to setup and teardown locking. Most + * code begins with a COMMON_START macro and finishes + * with COMMON_END. You indicate whether the inpcb + * and enclosing head are to be locked read or write. + */ +#define INI_NOLOCK 0 /* no head lock */ +#define INI_READ 1 /* read head lock */ +#define INI_WRITE 2 /* write head lock */ + +#define COMMON_START0(_headrw) do { \ + if (_headrw == INI_READ) \ + INP_INFO_RLOCK(&tcbinfo); \ + else if (_headrw == INI_WRITE) \ + INP_INFO_WLOCK(&tcbinfo); \ + inp = sotoinpcb(so); \ + if (inp == 0) { \ + if (_headrw == INI_READ) \ + INP_INFO_RUNLOCK(&tcbinfo); \ + else if (_headrw == INI_WRITE) \ + INP_INFO_WUNLOCK(&tcbinfo); \ + return EINVAL; \ + } \ + INP_LOCK(inp); \ + if (_headrw == INI_READ) \ + INP_INFO_RUNLOCK(&tcbinfo); \ + tp = intotcpcb(inp); \ + TCPDEBUG1(); \ +} while(0) + +#define COMMON_START(_headrw) do { \ + TCPDEBUG0; \ + COMMON_START0(_headrw); \ +} while (0) + +#define COMMON_END(_headrw, req) \ + TCPDEBUG2(req); \ + do { \ + if (tp) \ + INP_UNLOCK(inp); \ + if (_headrw == INI_WRITE) \ + INP_INFO_WUNLOCK(&tcbinfo); \ + } while(0) + +/* * pru_detach() detaches the TCP protocol from the socket. * If the protocol state is non-embryonic, then can't * do this directly: have to initiate a pru_disconnect(), @@ -158,63 +202,13 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - if (inp == NULL) { - INP_INFO_WUNLOCK(&tcbinfo); - return error; - } - INP_LOCK(inp); - tp = intotcpcb(inp); - TCPDEBUG1(); + COMMON_START(INI_WRITE); tp = tcp_disconnect(tp); - - TCPDEBUG2(PRU_DETACH); - if (tp) - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + COMMON_END(INI_WRITE, PRU_DETACH); return error; } -#define INI_NOLOCK 0 -#define INI_READ 1 -#define INI_WRITE 2 - -#define COMMON_START() \ - TCPDEBUG0; \ - do { \ - if (inirw == INI_READ) \ - INP_INFO_RLOCK(&tcbinfo); \ - else if (inirw == INI_WRITE) \ - INP_INFO_WLOCK(&tcbinfo); \ - inp = sotoinpcb(so); \ - if (inp == 0) { \ - if (inirw == INI_READ) \ - INP_INFO_RUNLOCK(&tcbinfo); \ - else if (inirw == INI_WRITE) \ - INP_INFO_WUNLOCK(&tcbinfo); \ - return EINVAL; \ - } \ - INP_LOCK(inp); \ - if (inirw == INI_READ) \ - INP_INFO_RUNLOCK(&tcbinfo); \ - tp = intotcpcb(inp); \ - TCPDEBUG1(); \ -} while(0) - -#define COMMON_END(req) \ -out: TCPDEBUG2(req); \ - do { \ - if (tp) \ - INP_UNLOCK(inp); \ - if (inirw == INI_WRITE) \ - INP_INFO_WUNLOCK(&tcbinfo); \ - return error; \ - goto out; \ -} while(0) - /* * Give the socket an address. */ @@ -225,7 +219,6 @@ struct inpcb *inp; struct tcpcb *tp; struct sockaddr_in *sinp; - const int inirw = INI_WRITE; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) @@ -238,11 +231,10 @@ IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); - COMMON_START(); + COMMON_START(INI_WRITE); error = in_pcbbind(inp, nam, td->td_ucred); - if (error) - goto out; - COMMON_END(PRU_BIND); + COMMON_END(INI_WRITE, PRU_BIND); + return error; } #ifdef INET6 @@ -253,7 +245,6 @@ struct inpcb *inp; struct tcpcb *tp; struct sockaddr_in6 *sin6p; - const int inirw = INI_WRITE; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) @@ -266,7 +257,7 @@ IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); - COMMON_START(); + COMMON_START(INI_WRITE); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { @@ -284,9 +275,9 @@ } } error = in6_pcbbind(inp, nam, td->td_ucred); - if (error) - goto out; - COMMON_END(PRU_BIND); +out: + COMMON_END(INI_WRITE, PRU_BIND); + return error; } #endif /* INET6 */ @@ -299,9 +290,8 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - const int inirw = INI_WRITE; - COMMON_START(); + COMMON_START(INI_WRITE); SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0 && inp->inp_lport == 0) @@ -311,7 +301,8 @@ solisten_proto(so); } SOCK_UNLOCK(so); - COMMON_END(PRU_LISTEN); + COMMON_END(INI_WRITE, PRU_LISTEN); + return error; } #ifdef INET6 @@ -321,9 +312,8 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - const int inirw = INI_WRITE; - COMMON_START(); + COMMON_START(INI_WRITE); SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0 && inp->inp_lport == 0) { @@ -337,7 +327,8 @@ solisten_proto(so); } SOCK_UNLOCK(so); - COMMON_END(PRU_LISTEN); + COMMON_END(INI_WRITE, PRU_LISTEN); + return error; } #endif /* INET6 */ @@ -355,7 +346,6 @@ struct inpcb *inp; struct tcpcb *tp; struct sockaddr_in *sinp; - const int inirw = INI_WRITE; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) @@ -369,11 +359,13 @@ if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); - COMMON_START(); + COMMON_START(INI_WRITE); if ((error = tcp_connect(tp, nam, td)) != 0) goto out; error = tcp_output(tp); - COMMON_END(PRU_CONNECT); +out: + COMMON_END(INI_WRITE, PRU_CONNECT); + return error; } #ifdef INET6 @@ -384,7 +376,6 @@ struct inpcb *inp; struct tcpcb *tp; struct sockaddr_in6 *sin6p; - const int inirw = INI_WRITE; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) @@ -396,7 +387,7 @@ && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); - COMMON_START(); + COMMON_START(INI_WRITE); if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; @@ -419,7 +410,9 @@ if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; error = tcp_output(tp); - COMMON_END(PRU_CONNECT); +out: + COMMON_END(INI_WRITE, PRU_CONNECT); + return error; } #endif /* INET6 */ @@ -440,11 +433,11 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - const int inirw = INI_WRITE; - COMMON_START(); + COMMON_START(INI_WRITE); tp = tcp_disconnect(tp); - COMMON_END(PRU_DISCONNECT); + COMMON_END(INI_WRITE, PRU_DISCONNECT); + return error; } /* @@ -462,33 +455,24 @@ in_port_t port = 0; TCPDEBUG0; + /* Unlocked read. */ if (so->so_state & SS_ISDISCONNECTED) { error = ECONNABORTED; - goto out; + goto out; /* NB: ok 'cuz tp is NULL */ } - INP_INFO_RLOCK(&tcbinfo); - inp = sotoinpcb(so); - if (!inp) { - INP_INFO_RUNLOCK(&tcbinfo); - return (EINVAL); - } - INP_LOCK(inp); - INP_INFO_RUNLOCK(&tcbinfo); - tp = intotcpcb(inp); - TCPDEBUG1(); + COMMON_START0(INI_READ); /* - * We inline in_setpeeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. + * We inline in_setpeeraddr so that we can copy the + * data of interest and defer the malloc until after + * we release the lock. */ port = inp->inp_fport; addr = inp->inp_faddr; -out: TCPDEBUG2(PRU_ACCEPT); - if (tp) - INP_UNLOCK(inp); +out: + COMMON_END(INI_READ, PRU_ACCEPT); if (error == 0) *nam = in_sockaddr(port, &addr); return error; @@ -507,25 +491,17 @@ int v4 = 0; TCPDEBUG0; + /* Unlocked read. */ if (so->so_state & SS_ISDISCONNECTED) { error = ECONNABORTED; - goto out; + goto out; /* NB: ok 'cuz tp is NULL */ } - INP_INFO_RLOCK(&tcbinfo); - inp = sotoinpcb(so); - if (inp == 0) { - INP_INFO_RUNLOCK(&tcbinfo); - return (EINVAL); - } - INP_LOCK(inp); - INP_INFO_RUNLOCK(&tcbinfo); - tp = intotcpcb(inp); - TCPDEBUG1(); + COMMON_START0(INI_READ); /* - * We inline in6_mapped_peeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. + * We inline in6_mapped_peeraddr so that we can + * copy the data of interest and defer the malloc + * until after we release the lock. */ if (inp->inp_vflag & INP_IPV4) { v4 = 1; @@ -536,9 +512,8 @@ addr6 = inp->in6p_faddr; } -out: TCPDEBUG2(PRU_ACCEPT); - if (tp) - INP_UNLOCK(inp); +out: + COMMON_END(INI_READ, PRU_ACCEPT); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); @@ -579,14 +554,14 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - const int inirw = INI_WRITE; - COMMON_START(); + COMMON_START(INI_WRITE); socantsendmore(so); tp = tcp_usrclosed(tp); if (tp) error = tcp_output(tp); - COMMON_END(PRU_SHUTDOWN); + COMMON_END(INI_WRITE, PRU_SHUTDOWN); + return error; } /* @@ -598,11 +573,11 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - const int inirw = INI_READ; - COMMON_START(); + COMMON_START(INI_READ); tcp_output(tp); - COMMON_END(PRU_RCVD); + COMMON_END(INI_READ, PRU_RCVD); + return error; } /* @@ -741,12 +716,14 @@ } INP_INFO_WUNLOCK(&tcbinfo); unlocked = 1; + /* Unlocked read of sb_cc. */ tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; error = tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } out: + COMMON_END(INI_WRITE, (flags & PRUS_OOB) ? PRU_SENDOOB : TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); if (tp) @@ -754,6 +731,7 @@ if (!unlocked) INP_INFO_WUNLOCK(&tcbinfo); return (error); + return error; } /* @@ -765,11 +743,11 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - const int inirw = INI_WRITE; - COMMON_START(); + COMMON_START(INI_WRITE); tp = tcp_drop(tp, ECONNABORTED); - COMMON_END(PRU_ABORT); + COMMON_END(INI_WRITE, PRU_ABORT); + return error; } /* @@ -781,9 +759,9 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp; - const int inirw = INI_READ; - COMMON_START(); + COMMON_START(INI_READ); + /* Unlocked read. */ if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || @@ -799,7 +777,9 @@ *mtod(m, caddr_t) = tp->t_iobc; if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); - COMMON_END(PRU_RCVOOB); +out: + COMMON_END(INI_READ, PRU_RCVOOB); + return error; } struct pr_usrreqs tcp_usrreqs = { @@ -1207,11 +1187,23 @@ #endif inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); - if (tp == 0) { - int nofd = so->so_state & SS_NOFDREF; /* XXX */ + if (tp == NULL) { + int nofd; + /* + * XXXRW: This is a potentially racy scenario: we perform + * a read-update-write on so_state but don't hold a lock, + * not to mention calling out to external code that may + * grab locks. This section requires attention. + * + * XXXRW: Or not, this is in a local socket attach? + */ + SOCK_LOCK(so); + nofd = so->so_state & SS_NOFDREF; + /* don't free the socket yet */ + if (nofd) + so->so_state &= ~SS_NOFDREF; + SOCK_UNLOCK(so); - so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ - INP_LOCK(inp); #ifdef INET6 if (isipv6) @@ -1219,7 +1211,9 @@ else #endif in_pcbdetach(inp); + SOCK_LOCK(so); so->so_state |= nofd; + SOCK_UNLOCK(so); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; --- //depot/vendor/freebsd/src/sys/netinet/udp_usrreq.c 2005/06/01 11:25:19 +++ //depot/user/rwatson/netperf/sys/netinet/udp_usrreq.c 2005/07/05 14:55:33 @@ -691,10 +691,10 @@ int unlock_udbinfo; /* - * udp_output() may need to temporarily bind or connect the current - * inpcb. As such, we don't know up front what inpcb locks we will - * need. Do any work to decide what is needed up front before - * acquiring locks. + * XXXRW: udp_output() may need to temporarily bind or connect the + * current inpcb. As such, we don't know up front what inpcb locks + * we will need. Do any work to decide what is needed up front + * before acquiring locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) @@ -737,6 +737,9 @@ src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; + /* + * XXXRW: alignment? + */ src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; default: @@ -753,6 +756,12 @@ return error; } + /* + * XXXRW: We now have all the information we need in order to + * determine what locks are required. The temporary binding behavior + * here is expensive and undesirable. The current use of a local + * variable to track lock state is also unpretty. + */ if (src.sin_addr.s_addr != INADDR_ANY || addr != NULL) { INP_INFO_WLOCK(&udbinfo); @@ -899,18 +908,47 @@ SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); +/* + * Common code to setup and teardown locking. Most + * code begins with a COMMON_START macro and finishes + * with COMMON_END. You indicate whether the inpcb + * and enclosing head are to be locked read or write . + */ +#define INI_NOLOCK 0 /* no head lock */ +#define INI_READ 1 /* read head lock */ +#define INI_WRITE 2 /* write head lock */ + +#define COMMON_START(_headrw) do { \ + if (_headrw == INI_READ) \ + INP_INFO_RLOCK(&udbinfo); \ + else if (_headrw == INI_WRITE) \ + INP_INFO_WLOCK(&udbinfo); \ + inp = sotoinpcb(so); \ + if (inp == 0) { \ + if (_headrw == INI_READ) \ + INP_INFO_RUNLOCK(&udbinfo); \ + else if (_headrw == INI_WRITE) \ + INP_INFO_WUNLOCK(&udbinfo); \ + return EINVAL; \ + } \ + INP_LOCK(inp); \ + if (_headrw == INI_READ) \ + INP_INFO_RUNLOCK(&udbinfo); \ +} while(0) + +#define COMMON_END(_headrw) \ + do { \ + INP_UNLOCK(inp); \ + if (_headrw == INI_WRITE) \ + INP_INFO_WUNLOCK(&udbinfo); \ + } while(0) + static int udp_abort(struct socket *so) { struct inpcb *inp; - INP_INFO_WLOCK(&udbinfo); - inp = sotoinpcb(so); - if (inp == 0) { - INP_INFO_WUNLOCK(&udbinfo); - return EINVAL; /* ??? possible? panic instead? */ - } - INP_LOCK(inp); + COMMON_START(INI_WRITE); soisdisconnected(so); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); @@ -975,17 +1013,10 @@ int error; struct sockaddr_in *sin; - INP_INFO_WLOCK(&udbinfo); - inp = sotoinpcb(so); - if (inp == 0) { - INP_INFO_WUNLOCK(&udbinfo); - return EINVAL; - } - INP_LOCK(inp); + COMMON_START(INI_WRITE); if (inp->inp_faddr.s_addr != INADDR_ANY) { - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); - return EISCONN; + error = EISCONN; + goto out; } sin = (struct sockaddr_in *)nam; if (jailed(td->td_ucred)) @@ -993,8 +1024,8 @@ error = in_pcbconnect(inp, nam, td->td_ucred); if (error == 0) soisconnected(so); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); +out: + COMMON_END(INI_WRITE); return error; } @@ -1018,19 +1049,13 @@ static int udp_disconnect(struct socket *so) { + int error = 0; struct inpcb *inp; - INP_INFO_WLOCK(&udbinfo); - inp = sotoinpcb(so); - if (inp == 0) { - INP_INFO_WUNLOCK(&udbinfo); - return EINVAL; - } - INP_LOCK(inp); + COMMON_START(INI_WRITE); if (inp->inp_faddr.s_addr == INADDR_ANY) { - INP_INFO_WUNLOCK(&udbinfo); - INP_UNLOCK(inp); - return ENOTCONN; + error = ENOTCONN; + goto out; } in_pcbdisconnect(inp); @@ -1038,7 +1063,8 @@ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); so->so_state &= ~SS_ISCONNECTED; /* XXX */ - return 0; +out: + return error; } static int @@ -1056,16 +1082,9 @@ { struct inpcb *inp; - INP_INFO_RLOCK(&udbinfo); - inp = sotoinpcb(so); - if (inp == 0) { - INP_INFO_RUNLOCK(&udbinfo); - return EINVAL; - } - INP_LOCK(inp); - INP_INFO_RUNLOCK(&udbinfo); + COMMON_START(INI_READ); socantsendmore(so); - INP_UNLOCK(inp); + COMMON_END(INI_READ); return 0; } --- //depot/vendor/freebsd/src/sys/netinet6/in6_gif.c 2005/06/20 20:20:47 +++ //depot/user/rwatson/netperf/sys/netinet6/in6_gif.c 2005/07/05 14:55:33 @@ -82,6 +82,11 @@ &rip6_usrreqs }; +/* + * XXXRW: in6_gif per-softc locking required. Need to lock both the + * members, and also prevent the softc from disappearing during use + * including the route). + */ int in6_gif_output(ifp, family, m) struct ifnet *ifp; @@ -394,6 +399,10 @@ in6_gif_attach(sc) struct gif_softc *sc; { + + /* + * XXXRW: Technically, encap_attach() can return NULL due to ENOMEM? + */ sc->encap_cookie6 = encap_attach_func(AF_INET6, -1, gif_encapcheck, (struct protosw *)&in6_gif_protosw, sc); if (sc->encap_cookie6 == NULL) --- //depot/vendor/freebsd/src/sys/netinet6/in6_ifattach.c 2005/02/22 13:06:15 +++ //depot/user/rwatson/netperf/sys/netinet6/in6_ifattach.c 2005/02/27 16:23:38 @@ -226,8 +226,8 @@ struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; - static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - static u_int8_t allone[8] = + static const u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + static const u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; for (ifa = ifp->if_addrlist.tqh_first; --- //depot/vendor/freebsd/src/sys/netinet6/in6_pcb.c 2005/01/07 02:32:16 +++ //depot/user/rwatson/netperf/sys/netinet6/in6_pcb.c 2005/02/27 16:23:38 @@ -414,6 +414,7 @@ #ifdef IPSEC ipsec_pcbdisconn(inp->inp_sp); #endif + /* Unlocked read. */ if (inp->inp_socket->so_state & SS_NOFDREF) in6_pcbdetach(inp); } --- //depot/vendor/freebsd/src/sys/netinet6/in6_proto.c 2005/02/22 13:06:15 +++ //depot/user/rwatson/netperf/sys/netinet6/in6_proto.c 2005/02/27 16:23:38 @@ -67,6 +67,8 @@ #include "opt_carp.h" #include +#include +#include #include #include #include --- //depot/vendor/freebsd/src/sys/netinet6/in6_rmx.c 2005/01/07 02:32:16 +++ //depot/user/rwatson/netperf/sys/netinet6/in6_rmx.c 2005/02/27 16:23:38 @@ -79,6 +79,8 @@ #include #include #include +#include +#include #include #include #include --- //depot/vendor/freebsd/src/sys/netinet6/ip6_forward.c 2005/01/07 02:32:16 +++ //depot/user/rwatson/netperf/sys/netinet6/ip6_forward.c 2005/02/27 16:23:38 @@ -85,6 +85,9 @@ #include +/* + * XXXRW: The route cache needs synchronization, or removal as in IPv4. + */ struct route_in6 ip6_forward_rt; /* --- //depot/vendor/freebsd/src/sys/netinet6/ip6_input.c 2005/03/16 05:15:18 +++ //depot/user/rwatson/netperf/sys/netinet6/ip6_input.c 2005/06/01 10:10:04 @@ -195,7 +195,7 @@ ip6intrq.ifq_maxlen = ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); - netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, 0); + netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, NETISR_MPSAFE); scope6_init(); addrsel_policy_init(); nd6_init(); @@ -241,7 +241,6 @@ struct in6_addr odst; int srcrt = 0; - GIANT_REQUIRED; /* XXX for now */ #ifdef IPSEC /* * should the inner packet be considered authentic? --- //depot/vendor/freebsd/src/sys/netinet6/ip6_output.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/netinet6/ip6_output.c 2005/07/05 14:55:33 @@ -2101,6 +2101,7 @@ bzero(&sro, sizeof(sro)); + /* Unlocked read. */ if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* --- //depot/vendor/freebsd/src/sys/netinet6/nd6.c 2005/02/22 13:06:15 +++ //depot/user/rwatson/netperf/sys/netinet6/nd6.c 2005/02/27 16:23:38 @@ -97,6 +97,9 @@ /* for debugging? */ static int nd6_inuse, nd6_allocated; +/* + * XXXRW: What follows requires locking. + */ struct llinfo_nd6 llinfo_nd6 = {&llinfo_nd6, &llinfo_nd6}; struct nd_drhead nd_defrouter; struct nd_prhead nd_prefix = { 0 }; @@ -1798,6 +1801,13 @@ nd6_slowtimo, NULL); IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + /* + * XXXRW: Temporary work-around for the fact that the timeout + * can fire during interface initialization, racing with the + * code that is intended to initialize this field. + */ + if (ifp->if_afdata[AF_INET6] == NULL) + continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { --- //depot/vendor/freebsd/src/sys/netinet6/raw_ip6.c 2005/03/29 01:30:18 +++ //depot/user/rwatson/netperf/sys/netinet6/raw_ip6.c 2005/06/01 10:10:04 @@ -639,6 +639,7 @@ { struct inpcb *inp = sotoinpcb(so); + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; inp->in6p_faddr = in6addr_any; --- //depot/vendor/freebsd/src/sys/netipsec/keysock.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netipsec/keysock.c 2005/02/27 16:23:38 @@ -306,6 +306,11 @@ pfkeystat.in_msgtype[msg->sadb_msg_type]++; } + /* + * XXXRW: This will almost certainly cause lock order problems; + * we may need a netisr solution similar to that used in rtsock. + */ + mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &rawcb_list, list) { if (rp->rcb_proto.sp_family != PF_KEY) @@ -354,18 +359,21 @@ continue; if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) { + mtx_unlock(&rawcb_mtx); m_freem(m); pfkeystat.in_nomem++; return ENOBUFS; } if ((error = key_sendup0(rp, n, 0)) != 0) { + mtx_unlock(&rawcb_mtx); m_freem(m); return error; } n = NULL; } + mtx_lock(&rawcb_mtx); if (so) { error = key_sendup0(sotorawcb(so), m, 0); --- //depot/vendor/freebsd/src/sys/netipx/ipx_input.c 2005/04/10 18:05:47 +++ //depot/user/rwatson/netperf/sys/netipx/ipx_input.c 2005/06/01 10:10:04 @@ -82,7 +82,15 @@ .s_host[1] = 0xffff, .s_host[2] = 0xffff }; +/* + * XXXRW: Locking needed here. + */ struct ipxstat ipxstat; + +/* + * XXXRW: These should/could also be const, since they're set only at + * init time. + */ struct sockaddr_ipx ipx_netmask, ipx_hostmask; /* @@ -117,6 +125,9 @@ IPX_LIST_LOCK_INIT(); + /* + * XXXRW: These should be const? + */ ipx_netmask.sipx_len = 6; ipx_netmask.sipx_addr.x_net = ipx_broadnet; @@ -472,6 +483,14 @@ } } +/* + * XXXRW: Herein lies the largest locking problem in netipx: ipx_outputfl() + * loops back into ipx_watch_output() to re-inject packets back into an + * ipxpcb if raw sockets are listening for all packets. Since ipx_outputfl() + * is called from the outbound path with potentially list and ipxpcb locks + * held, things get sticky. We may want to insert a netisr deferal of the + * loop back into the raw socket layer. + */ void ipx_watch_output(m, ifp) struct mbuf *m; --- //depot/vendor/freebsd/src/sys/netipx/ipx_outputfl.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netipx/ipx_outputfl.c 2005/02/27 16:23:38 @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2005 Robert N. M. Watson * Copyright (c) 1995, Mike Mitchell * Copyright (c) 1984, 1985, 1986, 1987, 1993 * The Regents of the University of California. All rights reserved. @@ -129,6 +130,10 @@ if (htons(ipx->ipx_len) <= ifp->if_mtu) { ipxstat.ipxs_localout++; if (ipx_copy_output) { + /* + * XXXRW: We don't know if the caller holds the list + * lock or not. + */ ipx_watch_output(m0, ifp); } error = (*ifp->if_output)(ifp, m0, @@ -140,6 +145,10 @@ } bad: if (ipx_copy_output) { + /* + * XXXRW: We don't know if the caller holds the list lock or + * not. + */ ipx_watch_output(m0, ifp); } m_freem(m0); --- //depot/vendor/freebsd/src/sys/netipx/ipx_pcb.c 2005/01/09 05:10:49 +++ //depot/user/rwatson/netperf/sys/netipx/ipx_pcb.c 2005/02/27 16:23:38 @@ -115,6 +115,9 @@ } ipxp->ipxp_laddr = sipx->sipx_addr; noname: + /* + * XXXRW: I wonder what guarantees this loop will terminate... + */ if (lport == 0) do { ipxpcb_lport_cache++; --- //depot/vendor/freebsd/src/sys/netipx/ipx_usrreq.c 2005/01/09 05:20:42 +++ //depot/user/rwatson/netperf/sys/netipx/ipx_usrreq.c 2005/02/27 16:23:38 @@ -134,11 +134,10 @@ * Construct sockaddr format source address. * Stuff source address and datagram in user buffer. */ + bzero(&ipx_ipx, sizeof(ipx_ipx)); ipx_ipx.sipx_len = sizeof(ipx_ipx); ipx_ipx.sipx_family = AF_IPX; ipx_ipx.sipx_addr = ipx->ipx_sna; - ipx_ipx.sipx_zero[0] = '\0'; - ipx_ipx.sipx_zero[1] = '\0'; if (ipx_neteqnn(ipx->ipx_sna.x_net, ipx_zeronet) && ifp != NULL) { register struct ifaddr *ifa; --- //depot/vendor/freebsd/src/sys/netipx/spx_usrreq.c 2005/02/21 22:00:52 +++ //depot/user/rwatson/netperf/sys/netipx/spx_usrreq.c 2005/02/27 16:23:38 @@ -692,6 +692,9 @@ return (0); } +/* + * XXXRW: This is basically unimplemented -- should we GC? + */ void spx_ctlinput(cmd, arg_as_sa, dummy) int cmd; @@ -1670,6 +1673,7 @@ error = spx_attach(so, proto, td); if (error == 0) { ipxp = sotoipxpcb(so); + /* XXXRW: For now, we accept a race here. */ ((struct spxpcb *)ipxp->ipxp_pcb)->s_flags |= (SF_HI | SF_HO | SF_PI); } --- //depot/vendor/freebsd/src/sys/netkey/keysock.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netkey/keysock.c 2005/02/27 16:23:38 @@ -214,6 +214,11 @@ pfkeystat.in_msgtype[msg->sadb_msg_type]++; } + /* + * XXXRW: This will almost certainly cause lock order problems; + * we may need a netisr solution similar to that used in rtsock. + */ + mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &rawcb_list, list) { if (rp->rcb_proto.sp_family != PF_KEY) continue; @@ -261,6 +266,7 @@ continue; if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) { + mtx_unlock(&rawcb_mtx); m_freem(m); pfkeystat.in_nomem++; return ENOBUFS; @@ -275,6 +281,7 @@ n = NULL; } + mtx_unlock(&rawcb_mtx); if (so) { error = key_sendup0(sotorawcb(so), m, 0); --- //depot/vendor/freebsd/src/sys/netsmb/smb_trantcp.c 2005/01/07 01:52:23 +++ //depot/user/rwatson/netperf/sys/netsmb/smb_trantcp.c 2005/02/27 16:23:38 @@ -184,6 +184,9 @@ return 0; } +/* + * XXXRW: nb_upcall() is called without Giant, which is probably safeish. + */ static void nb_upcall(struct socket *so, void *arg, int waitflag) { --- //depot/vendor/freebsd/src/sys/nfsclient/nfs_socket.c 2005/07/18 02:15:48 +++ //depot/user/rwatson/netperf/sys/nfsclient/nfs_socket.c 2005/07/19 11:47:08 @@ -1241,6 +1241,7 @@ * Set r_rtt to -1 in case we fail to send it now. */ rep->r_rtt = -1; + /* XXXRW: Unlocked reads. */ if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && ((nmp->nm_flag & NFSMNT_DUMBTIMR) || (rep->r_flags & R_SENT) || --- //depot/vendor/freebsd/src/sys/nfsserver/nfs_syscalls.c 2005/03/26 11:30:19 +++ //depot/user/rwatson/netperf/sys/nfsserver/nfs_syscalls.c 2005/06/01 10:10:04 @@ -608,6 +608,10 @@ NFSD_UNLOCK(); slp->ns_fp = NULL; so = slp->ns_so; + /* + * XXXRW: More general locking issues here relating to + * upcalls. + */ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_UPCALL; SOCKBUF_UNLOCK(&so->so_rcv); --- //depot/vendor/freebsd/src/sys/pci/if_dc.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/pci/if_dc.c 2005/07/05 14:55:33 @@ -343,8 +343,6 @@ #define SIO_SET(x) DC_SETBIT(sc, DC_SIO, (x)) #define SIO_CLR(x) DC_CLRBIT(sc, DC_SIO, (x)) -#define IS_MPSAFE 0 - static void dc_delay(struct dc_softc *sc) { @@ -2190,8 +2188,6 @@ /* XXX: bleah, MTU gets overwritten in ether_ifattach() */ ifp->if_mtu = ETHERMTU; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - if (!IS_MPSAFE) - ifp->if_flags |= IFF_NEEDSGIANT; ifp->if_ioctl = dc_ioctl; ifp->if_start = dc_start; ifp->if_watchdog = dc_watchdog; @@ -2272,7 +2268,7 @@ #endif ifp->if_capenable = ifp->if_capabilities; - callout_init(&sc->dc_stat_ch, IS_MPSAFE ? CALLOUT_MPSAFE : 0); + callout_init(&sc->dc_stat_ch, debug_mpsafenet ? CALLOUT_MPSAFE : 0); #ifdef SRM_MEDIA sc->dc_srm_media = 0; @@ -2306,8 +2302,7 @@ ether_ifattach(ifp, eaddr); /* Hook interrupt last to avoid having to lock softc */ - error = bus_setup_intr(dev, sc->dc_irq, INTR_TYPE_NET | - (IS_MPSAFE ? INTR_MPSAFE : 0), + error = bus_setup_intr(dev, sc->dc_irq, INTR_TYPE_NET | INTR_MPSAFE, dc_intr, sc, &sc->dc_intrhand); if (error) { --- //depot/vendor/freebsd/src/sys/pci/if_pcn.c 2005/06/10 16:51:34 +++ //depot/user/rwatson/netperf/sys/pci/if_pcn.c 2005/07/05 14:55:33 @@ -631,7 +631,7 @@ ether_ifattach(ifp, (u_int8_t *) eaddr); /* Hook interrupt last to avoid having to lock softc */ - error = bus_setup_intr(dev, sc->pcn_irq, INTR_TYPE_NET, + error = bus_setup_intr(dev, sc->pcn_irq, INTR_TYPE_NET | INTR_MPSAFE, pcn_intr, sc, &sc->pcn_intrhand); if (error) { --- //depot/vendor/freebsd/src/sys/rpc/rpcclnt.c 2005/03/19 01:22:25 +++ //depot/user/rwatson/netperf/sys/rpc/rpcclnt.c 2005/06/01 10:10:04 @@ -145,7 +145,7 @@ /* XXX ugly debug strings */ #define RPC_ERRSTR_ACCEPTED_SIZE 6 -char *rpc_errstr_accepted[RPC_ERRSTR_ACCEPTED_SIZE] = { +const char *rpc_errstr_accepted[RPC_ERRSTR_ACCEPTED_SIZE] = { "", /* no good message... */ "remote server hasn't exported program.", "remote server can't support version number.", @@ -154,13 +154,13 @@ "remote error. remote side memory allocation failure?" }; -char *rpc_errstr_denied[2] = { +const char *rpc_errstr_denied[2] = { "remote server doesnt support rpc version 2!", "remote server authentication error." }; #define RPC_ERRSTR_AUTH_SIZE 6 -char *rpc_errstr_auth[RPC_ERRSTR_AUTH_SIZE] = { +const char *rpc_errstr_auth[RPC_ERRSTR_AUTH_SIZE] = { "", "auth error: bad credential (seal broken).", "auth error: client must begin new session.", @@ -360,7 +360,7 @@ RPC_RETURN(EFAULT); } - GIANT_REQUIRED; /* XXX until socket locking done */ + NET_ASSERT_GIANT(); /* create the socket */ rpc->rc_so = NULL; @@ -618,7 +618,7 @@ { struct socket *so; - GIANT_REQUIRED; /* XXX until socket locking done */ + NET_ASSERT_GIANT(); if (rpc->rc_so) { so = rpc->rc_so; @@ -669,7 +669,7 @@ #endif int error, soflags, flags; - GIANT_REQUIRED; /* XXX until socket locking done */ + NET_ASSERT_GIANT(); if (rep) { if (rep->r_flags & R_SOFTTERM) { @@ -754,7 +754,7 @@ #endif int error, sotype, rcvflg; - GIANT_REQUIRED; /* XXX until socket locking done */ + NET_ASSERT_GIANT(); /* * Set up arguments for soreceive() @@ -1439,6 +1439,7 @@ * Set r_rtt to -1 in case we fail to send it now. */ rep->r_rtt = -1; + SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && ((rpc->rc_flag & RPCCLNT_DUMBTIMR) || (rep->r_flags & R_SENT) || @@ -1473,6 +1474,7 @@ rep->r_rtt = 0; } } + SOCKBUF_UNLOCK(&so->so_snd); } splx(s); --- //depot/vendor/freebsd/src/sys/security/mac/mac_net.c 2005/07/05 23:40:38 +++ //depot/user/rwatson/netperf/sys/security/mac/mac_net.c 2005/07/19 11:47:08 @@ -271,6 +271,11 @@ MAC_PERFORM(copy_ifnet_label, src, dest); } +/* + * XXXRW: Need to use a routine to copy the ifnet label while holding the + * ifnet mutex, similar to sockets, pipes, vnodes, et al, to avoid holding + * the mutex over the copyin/copyout. + */ static int mac_externalize_ifnet_label(struct label *label, char *elements, char *outbuf, size_t outbuflen) @@ -282,6 +287,9 @@ return (error); } +/* + * XXXRW: See comment for mac_externalize_ifnet_label(). + */ static int mac_internalize_ifnet_label(struct label *label, char *string) { --- //depot/vendor/freebsd/src/sys/sys/pcpu.h 2005/04/26 17:10:17 +++ //depot/user/rwatson/netperf/sys/sys/pcpu.h 2005/06/01 10:10:04 @@ -85,6 +85,7 @@ #ifndef curthread #define curthread PCPU_GET(curthread) #endif +#define curcpu PCPU_GET(cpuid) /* * MI PCPU support functions --- //depot/vendor/freebsd/src/sys/sys/socketvar.h 2005/07/09 12:25:21 +++ //depot/user/rwatson/netperf/sys/sys/socketvar.h 2005/07/19 11:47:08 @@ -356,6 +356,7 @@ SOCK_UNLOCK(so); \ ACCEPT_UNLOCK(); \ } \ + so = NULL; \ } while (0) #define sotryfree(so) do { \ @@ -367,6 +368,7 @@ SOCK_UNLOCK(so); \ ACCEPT_UNLOCK(); \ } \ + so = NULL; \ } while(0) /*