--- //depot/vendor/freebsd/src/sys/fs/fifofs/fifo_vnops.c 2004/06/18 03:02:01 +++ //depot/user/rwatson/netperf/sys/fs/fifofs/fifo_vnops.c 2004/06/18 03:27:04 @@ -461,6 +461,7 @@ if (need_lock) SOCKBUF_LOCK(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc; + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; result = 1; --- //depot/vendor/freebsd/src/sys/fs/portalfs/portal_vnops.c 2004/06/17 22:51:43 +++ //depot/user/rwatson/netperf/sys/fs/portalfs/portal_vnops.c 2004/06/18 00:24:39 @@ -193,6 +193,7 @@ unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); + /* XXXRW: Locking? */ if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) sodupsockaddr((struct sockaddr *)unp2->unp_addr, @@ -284,6 +285,7 @@ * and keep polling the reference count. XXX. */ s = splnet(); + /* XXXRW: Locking? */ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { if (fmp->pm_server->f_count == 1) { error = ECONNREFUSED; --- //depot/vendor/freebsd/src/sys/i386/conf/GENERIC 2004/05/28 00:26:29 +++ //depot/user/rwatson/netperf/sys/i386/conf/GENERIC 2004/05/31 01:41:33 @@ -66,6 +66,7 @@ options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +options BREAK_TO_DEBUGGER # To make an SMP kernel, the next two are needed options SMP # Symmetric MultiProcessor Kernel --- //depot/vendor/freebsd/src/sys/kern/kern_descrip.c 2004/06/19 11:40:38 +++ //depot/user/rwatson/netperf/sys/kern/kern_descrip.c 2004/06/19 15:26:22 @@ -2057,7 +2057,7 @@ struct file *fp; struct thread *td; { - int error; + int error, type; FILE_LOCK_ASSERT(fp, MA_OWNED); @@ -2067,15 +2067,35 @@ } /* We have the last ref so we can proceed without the file lock. */ FILE_UNLOCK(fp); + + /* + * XXXRW: It's not pretty, but this way we can avoid holding Giant + * over operation vectors that don't require it. Note that + * technically, this is slightly conservative as badops doesn't + * need Giant. + */ + type = fp->f_type; + switch(type) { + case DTYPE_SOCKET: + case DTYPE_PIPE: + break; + default: + mtx_lock(&Giant); + } if (fp->f_count < 0) panic("fdrop: count < 0"); - mtx_lock(&Giant); if (fp->f_ops != &badfileops) error = fo_close(fp, td); else error = 0; ffree(fp); - mtx_unlock(&Giant); + switch(type) { + case DTYPE_SOCKET: + case DTYPE_PIPE: + break; + default: + mtx_unlock(&Giant); + } return (error); } --- //depot/vendor/freebsd/src/sys/kern/kern_mbuf.c 2004/06/01 16:20:40 +++ //depot/user/rwatson/netperf/sys/kern/kern_mbuf.c 2004/06/08 21:28:41 @@ -214,7 +214,7 @@ #endif } else m->m_data = m->m_dat; - mbstat.m_mbufs += 1; /* XXX */ + atomic_add_long(&mbstat.m_mbufs, 1); /* return 1; */ } @@ -230,7 +230,7 @@ m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); - mbstat.m_mbufs -= 1; /* XXX */ + atomic_subtract_long(&mbstat.m_mbufs, 1); } /* XXX Only because of stats */ @@ -242,8 +242,8 @@ m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); - mbstat.m_mbufs -= 1; /* XXX */ - mbstat.m_mclusts -= 1; /* XXX */ + atomic_subtract_long(&mbstat.m_mbufs, 1); + atomic_subtract_long(&mbstat.m_mclusts, 1); } /* @@ -268,7 +268,7 @@ m->m_ext.ref_cnt = (u_int *)uma_find_refcnt(zone_clust, m->m_ext.ext_buf); *(m->m_ext.ref_cnt) = 1; - mbstat.m_mclusts += 1; /* XXX */ + atomic_add_long(&mbstat.m_mclusts, 1); /* return 1; */ } @@ -277,7 +277,7 @@ static void mb_dtor_clust(void *mem, int size, void *arg) { - mbstat.m_mclusts -= 1; /* XXX */ + atomic_subtract_long(&mbstat.m_mclusts, 1); } /* @@ -294,7 +294,7 @@ uma_zalloc_arg(zone_clust, m, M_NOWAIT); if (m->m_ext.ext_buf == NULL) /* XXX */ panic("mb_init_pack(): Can't deal with failure yet."); - mbstat.m_mclusts -= 1; /* XXX */ + atomic_subtract_long(&mbstat.m_mclusts, 1); } /* @@ -309,7 +309,7 @@ m = (struct mbuf *)mem; uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); m->m_ext.ext_buf = NULL; - mbstat.m_mclusts += 1; /* XXX */ + atomic_add_long(&mbstat.m_mclusts, 1); } /* @@ -353,8 +353,8 @@ } #endif } - mbstat.m_mbufs += 1; /* XXX */ - mbstat.m_mclusts += 1; /* XXX */ + atomic_add_long(&mbstat.m_mbufs, 1); + atomic_add_long(&mbstat.m_mclusts, 1); /* return 1; */ } @@ -375,7 +375,7 @@ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, "mb_reclaim()"); - mbstat.m_drain++; + atomic_add_long(&mbstat.m_drain, 1); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) --- //depot/vendor/freebsd/src/sys/kern/kern_timeout.c 2004/04/25 04:10:43 +++ //depot/user/rwatson/netperf/sys/kern/kern_timeout.c 2004/05/04 02:32:28 @@ -44,6 +44,7 @@ #include #include #include +#include #include static int avg_depth; @@ -55,6 +56,89 @@ static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); + +/*- + * Sampling buffer of function pointers executed by timeouts and callouts. + * This circular buffer wraps when it fills, and uses an inefficient + * sbuf-based sysctl to dump sample data to userspace. Sysctls can select + * to monitor mpsafe and !mpsafe callouts/timeouts as desired. Suggested + * use is: (1) set sample of interest (mpsafe/notmpsafe), (2) reset the + * buffer, (3) do some benchmark/test, (5) disable sampling, (6) dump + * buffer. + * + * XXX: ifdef TIMEOUT_SAMPLING? + */ + +#define MAXFUNC 200000 +static void * func_array[MAXFUNC]; +static int array_off; + +static void +push_cfunc(void *ptr) +{ + + /* XXX */ + func_array[array_off % MAXFUNC] = ptr; + array_off++; +} + +static int +sysctl_cfunc(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sb; + int error, i; + + if (req->newptr != NULL) + return (EINVAL); + + sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND); + + for (i = 0; i < MAXFUNC; i++) { + if (func_array[i] == NULL) + break; + sbuf_printf(&sb, "%p ", func_array[i]); + } + sbuf_finish(&sb); + + error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb) + 1, req); + + sbuf_delete(&sb); + + return (error); +} + +SYSCTL_PROC(_debug, OID_AUTO, to_cfunc, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, + sysctl_cfunc, "A", "callout/timeout sample"); + +static int +sysctl_cfunc_reset(SYSCTL_HANDLER_ARGS) +{ + int dummy, error; + + dummy = 0; + error = sysctl_handle_int(oidp, &dummy, 0, req); + if (error) + return (error); + + if (dummy != 0) { + bzero(func_array, sizeof(void *) * MAXFUNC); + array_off = 0; + } + + return (0); +} + +SYSCTL_PROC(_debug, OID_AUTO, to_cfunc_reset, CTLTYPE_INT|CTLFLAG_RW, 0, 0, + sysctl_cfunc_reset, "I", "Reset sample"); + +static int cfunc_sample_mpsafe; +static int cfunc_sample_notmpsafe; + +SYSCTL_INT(_debug, OID_AUTO, to_cfunc_mpsafe, CTLFLAG_RW, + &cfunc_sample_mpsafe, 0, "Sample mpsafe callouts"); +SYSCTL_INT(_debug, OID_AUTO, to_cfunc_notmpsafe, CTLFLAG_RW, + &cfunc_sample_notmpsafe, 0, "Sample !mpsafe callouts"); + /* * TODO: * allocate more timeout table slots when table overflows. @@ -245,8 +329,12 @@ if (!(c_flags & CALLOUT_MPSAFE)) { mtx_lock(&Giant); gcalls++; + if (cfunc_sample_mpsafe) + push_cfunc(c_func); } else { mpcalls++; + if (cfunc_sample_notmpsafe) + push_cfunc(c_func); } #ifdef DIAGNOSTIC binuptime(&bt1); --- //depot/vendor/freebsd/src/sys/kern/subr_log.c 2004/06/16 09:52:13 +++ //depot/user/rwatson/netperf/sys/kern/subr_log.c 2004/06/18 00:24:39 @@ -83,7 +83,12 @@ struct callout sc_callout; /* callout to wakeup syslog */ } logsoftc; -int log_open; /* also used in log() */ +/* + * log_mtx protects logsoftc, log_open. Note that log_mtx does *not* + * protect the structures associated with msgbuf, which require Giant. + */ +struct mtx log_mtx; +int log_open; /* also used in log() */ /* Times per second to check for a pending syslog wakeup. */ static int log_wakeups_per_second = 5; @@ -94,17 +99,24 @@ static int logopen(struct cdev *dev, int flags, int mode, struct thread *td) { - if (log_open) + + mtx_lock(&log_mtx); + if (log_open) { + mtx_unlock(&log_mtx); return (EBUSY); + } log_open = 1; - callout_init(&logsoftc.sc_callout, 0); + callout_init(&logsoftc.sc_callout, CALLOUT_MPSAFE); + mtx_unlock(&log_mtx); fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */ + mtx_lock(&log_mtx); if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; } callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); return (0); } @@ -113,9 +125,11 @@ logclose(struct cdev *dev, int flag, int mode, struct thread *td) { + mtx_lock(&log_mtx); log_open = 0; callout_stop(&logsoftc.sc_callout); logsoftc.sc_state = 0; + mtx_unlock(&log_mtx); funsetown(&logsoftc.sc_sigio); return (0); } @@ -134,14 +148,18 @@ splx(s); return (EWOULDBLOCK); } + mtx_lock(&log_mtx); logsoftc.sc_state |= LOG_RDWAIT; + mtx_unlock(&log_mtx); if ((error = tsleep(mbp, LOG_RDPRI | PCATCH, "klog", 0))) { splx(s); return (error); } } splx(s); + mtx_lock(&log_mtx); logsoftc.sc_state &= ~LOG_RDWAIT; + mtx_unlock(&log_mtx); while (uio->uio_resid > 0) { l = imin(sizeof(buf), uio->uio_resid); @@ -178,8 +196,11 @@ logtimeout(void *arg) { - if (!log_open) + mtx_lock(&log_mtx); + if (!log_open) { + mtx_unlock(&log_mtx); return; + } if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; @@ -187,6 +208,7 @@ if (msgbuftrigger == 0) { callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); return; } msgbuftrigger = 0; @@ -199,6 +221,7 @@ } callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); } /*ARGSUSED*/ @@ -217,10 +240,12 @@ break; case FIOASYNC: + mtx_lock(&log_mtx); if (*(int *)data) logsoftc.sc_state |= LOG_ASYNC; else logsoftc.sc_state &= ~LOG_ASYNC; + mtx_unlock(&log_mtx); break; case FIOSETOWN: @@ -249,6 +274,7 @@ log_drvinit(void *unused) { + mtx_init(&log_mtx, "log_mtx", NULL, MTX_DEF); make_dev(&log_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "klog"); } --- //depot/vendor/freebsd/src/sys/kern/subr_prf.c 2004/06/18 20:15:39 +++ //depot/user/rwatson/netperf/sys/kern/subr_prf.c 2004/06/19 04:30:55 @@ -83,6 +83,9 @@ size_t remain; }; +/* + * XXXRW: We access subr_log.c's log_open variable unlocked. + */ extern int log_open; static void msglogchar(int c, int pri); --- //depot/vendor/freebsd/src/sys/kern/sys_socket.c 2004/06/17 22:51:43 +++ //depot/user/rwatson/netperf/sys/kern/sys_socket.c 2004/06/18 00:24:39 @@ -170,6 +170,7 @@ return (0); case FIONREAD: + /* Unlocked read. */ *(int *)data = so->so_rcv.sb_cc; return (0); @@ -188,6 +189,7 @@ return (0); case SIOCATMARK: + /* Unlocked read. */ *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; return (0); } @@ -229,7 +231,11 @@ /* * If SBS_CANTRCVMORE is set, but there's still data left in the * receive buffer, the socket is still readable. + * + * XXXRW: perhaps should lock socket buffer so st_size result + * is consistent. */ + /* Unlocked read. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 || so->so_rcv.sb_cc != 0) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; --- //depot/vendor/freebsd/src/sys/kern/uipc_mbuf.c 2004/06/11 18:20:48 +++ //depot/user/rwatson/netperf/sys/kern/uipc_mbuf.c 2004/06/12 17:06:18 @@ -413,12 +413,12 @@ np = &n->m_next; } if (top == NULL) - mbstat.m_mcfail++; /* XXX: No consistency. */ + atomic_add_long(&mbstat.m_mcfail, 1); return (top); nospace: m_freem(top); - mbstat.m_mcfail++; /* XXX: No consistency. */ + atomic_add_long(&mbstat.m_mcfail, 1); return (NULL); } @@ -478,7 +478,7 @@ return top; nospace: m_freem(top); - mbstat.m_mcfail++; /* XXX: No consistency. */ + atomic_add_long(&mbstat.m_mcfail, 1); return (NULL); } @@ -580,7 +580,7 @@ nospace: m_freem(top); - mbstat.m_mcfail++; /* XXX: No consistency. */ + atomic_add_long(&mbstat.m_mcfail, 1); return (NULL); } @@ -740,7 +740,7 @@ return (m); bad: m_freem(n); - mbstat.m_mpfail++; /* XXX: No consistency. */ + atomic_add_long(&mbstat.m_mpfail, 1); return (NULL); } --- //depot/vendor/freebsd/src/sys/kern/uipc_socket.c 2004/06/19 03:25:35 +++ //depot/user/rwatson/netperf/sys/kern/uipc_socket.c 2004/06/20 17:21:28 @@ -109,7 +109,6 @@ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); - /* * Socket operation routines. * These routines are called by the routines in @@ -266,11 +265,17 @@ int s, error; s = splnet(); + /* Unlocked read. */ if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) { splx(s); return (EINVAL); } + /* + * XXXRW: Ordering issue here -- perhaps we need to set + * SO_ACCEPTCONN before the call to pru_listen()? + * XXXRW: General atomic test-and-set concerns here also. + */ error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td); if (error) { splx(s); @@ -299,6 +304,7 @@ KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); SOCK_LOCK_ASSERT(so); + /* XXXRW: Why would SS_NOFDREF be unset here? so_count is 0. */ if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) { SOCK_UNLOCK(so); return; @@ -343,14 +349,59 @@ SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_NOINTR; (void)sblock(&so->so_snd, M_WAITOK); - socantsendmore(so); + /* + * socantsendmore_locked() drops the socket buffer mutex so that it + * can safely perform wakeups. Re-acquire the mutex before + * continuing. + */ + socantsendmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); - sbrelease(&so->so_snd, so); + sbrelease_locked(&so->so_snd, so); SOCKBUF_UNLOCK(&so->so_snd); sorflush(so); sodealloc(so); } +#if 0 +static void +dump_socket(struct socket *so) +{ + + printf("so = %p\n", so); + printf(" so_count = %d\n", so->so_count); + printf(" so_type = %d\n", so->so_type); + printf(" so_options = %d\n", so->so_options); + printf(" so_linger = %d\n", so->so_linger); + printf(" so_state = %d\n", so->so_state); + printf(" so_qstate = %d\n", so->so_qstate); + printf(" so_pcb = %p\n", so->so_pcb); + printf(" so_proto = %p\n", so->so_proto); + printf(" pr_type = %d\n", so->so_proto->pr_type); + printf(" pr_domain = %p\n", so->so_proto->pr_domain); + printf(" dom_family = %d\n", so->so_proto->pr_domain->dom_family); + printf(" dom_name = %s\n", so->so_proto->pr_domain->dom_name); + printf(" pr_protocol = %d\n", so->so_proto->pr_protocol); + printf(" pr_flags = %d\n", so->so_proto->pr_flags); + printf(" so_head = %p\n", so->so_head); + printf(" TAILQ_FIRST(so_incomp) = %p\n", TAILQ_FIRST(&so->so_incomp)); + printf(" TAILQ_FIRST(so_comp) = %p\n", TAILQ_FIRST(&so->so_comp)); + printf(" so_qlen = %d\n", so->so_qlen); + printf(" so_incqlen = %d\n", so->so_incqlen); + printf(" so_qlimit = %d\n", so->so_qlimit); + printf(" so_timeo = %d\n", so->so_timeo); + printf(" so_error = %d\n", so->so_error); + printf(" so_sigio = %p\n", so->so_sigio); + printf(" so_oobmark = %lu\n", so->so_oobmark); + printf(" so_rcv.sb_state = %d\n", so->so_rcv.sb_state); + printf(" so_snd.sb_state = %d\n", so->so_snd.sb_state); + printf(" so_upcall = %p\n", so->so_upcall); + printf(" so_upcallarg = %p\n", so->so_upcallarg); + printf(" so_cred = %p\n", so->so_cred); + printf(" so_gencnt = %d\n", (int)so->so_gencnt); +} +#endif + /* * Close a socket on last file table reference removal. * Initiate disconnect if connected. @@ -367,7 +418,26 @@ int s = splnet(); /* conservative */ int error = 0; + KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); + funsetown(&so->so_sigio); + /*- + * XXXRW: Lots of locking problems here. For one thing, + * we should probably clear the SO_ACCEPTCONN flag and + * push that down to the protocol layer. Also, the locking + * for the queue draining is probably not right either: + * what prevents new ones from being inserted after we get + * past this? Ideally, the downcall would prevent this, but + * there isn't one. + */ + /* + * XXXRW: soclose() should be split into two parts: one to handle + * listen sockets, and the other to handle the remainder. To + * prevent races with the protocol code, we need to detach before + * draining the queues. For non-listen sockets, the disconnect + * has to happen before the detach. + */ + /* Unlocked read. */ if (so->so_options & SO_ACCEPTCONN) { struct socket *sp; ACCEPT_LOCK(); @@ -393,6 +463,7 @@ } if (so->so_pcb == NULL) goto discard; + /* XXXRW: so_state locking? */ if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); @@ -413,14 +484,14 @@ } drop: if (so->so_pcb != NULL) { - int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); + int error2; + error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) error = error2; } discard: SOCK_LOCK(so); - if (so->so_state & SS_NOFDREF) - panic("soclose: NOFDREF"); + KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); splx(s); @@ -428,7 +499,9 @@ } /* - * Must be called at splnet... + * XXXRW: soabort() must not be called with any locks held, as the protocol + * will need to be able to grab socket and socket buffer locks, and we also + * try to free the socket if the protocol generates an error. */ int soabort(so) @@ -466,25 +539,23 @@ struct sockaddr *nam; struct thread *td; { - int s; int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); - s = splnet(); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. * This allows user to disconnect by connecting to, e.g., * a null address. */ + /* Unlocked read. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) error = EISCONN; else error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); - splx(s); return (error); } @@ -493,11 +564,9 @@ struct socket *so1; struct socket *so2; { - int s = splnet(); int error; error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); - splx(s); return (error); } @@ -507,6 +576,7 @@ { int error; + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) @@ -562,7 +632,7 @@ struct mbuf **mp; struct mbuf *m; long space, len = 0, resid; - int clen = 0, error, s, dontroute; + int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; #ifdef ZERO_COPY_SOCKETS int cow_send; @@ -594,23 +664,19 @@ td->td_proc->p_stats->p_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; -#define snderr(errno) { error = (errno); splx(s); goto release; } +#define snderr(errno) { error = (errno); goto release; } SOCKBUF_LOCK(&so->so_snd); -restart: - SOCKBUF_LOCK_ASSERT(&so->so_snd); error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out_locked; do { SOCKBUF_LOCK_ASSERT(&so->so_snd); - s = splnet(); if (so->so_snd.sb_state & SBS_CANTSENDMORE) snderr(EPIPE); if (so->so_error) { error = so->so_error; so->so_error = 0; - splx(s); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { @@ -639,15 +705,12 @@ (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) snderr(EWOULDBLOCK); - sbunlock(&so->so_snd); error = sbwait(&so->so_snd); - splx(s); if (error) - goto out_locked; - goto restart; + goto release; + continue; } SOCKBUF_UNLOCK(&so->so_snd); - splx(s); mp = ⊤ space -= clen; do { @@ -757,6 +820,15 @@ break; } } while (space > 0 && atomic); + /* + * XXXRW: There may be a race condition here regarding + * SO_DONTROUTE. We hold the sblock() so in theory + * there won't be other consumers of the socket doing + * a pru_send(), but SO_DONTROUTE is also consumed + * from the protocol side. It sounds like we might + * want to use MSG_DONTROUTE here, if the semantics are + * right. + */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; @@ -767,7 +839,7 @@ * done could be out of date. We could have recieved * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We could - * probably recheck again inside the splnet() protection + * probably recheck again inside the locking protection * here, but there are probably other places that this * also happens. We must rethink this. */ @@ -785,7 +857,10 @@ /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); - splx(s); + /* + * XXXRW: Second half of above comment on SO_DONTROUTE. + * and so_options. + */ if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; @@ -843,7 +918,7 @@ int *flagsp; { struct mbuf *m, **mp; - int flags, len, error, s, offset; + int flags, len, error, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; @@ -898,17 +973,17 @@ } if (mp != NULL) *mp = NULL; + /* Unlocked read. */ if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreqs->pru_rcvd)(so, 0); SOCKBUF_LOCK(&so->so_rcv); -restart: - SOCKBUF_LOCK_ASSERT(&so->so_rcv); error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) goto out; - s = splnet(); +restart: + SOCKBUF_LOCK_ASSERT(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more @@ -926,9 +1001,8 @@ (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { - KASSERT(m != NULL || !so->so_rcv.sb_cc, - ("receive: m == %p so->so_rcv.sb_cc == %u", - m, so->so_rcv.sb_cc)); + KASSERT(!(m == NULL && so->so_rcv.sb_cc), + ("m %p so->so_rcv.sb_cc %u", m, so->so_rcv.sb_cc)); if (so->so_error) { if (m != NULL) goto dontblock; @@ -937,6 +1011,7 @@ so->so_error = 0; goto release; } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m) goto dontblock; @@ -948,6 +1023,7 @@ m = so->so_rcv.sb_mb; goto dontblock; } + /* XXXRW: so_state locking? */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; @@ -955,6 +1031,7 @@ } if (uio->uio_resid == 0) goto release; + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; @@ -962,15 +1039,14 @@ } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); - sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); - splx(s); if (error) - goto out; + goto release; goto restart; } dontblock: SOCKBUF_LOCK_ASSERT(&so->so_rcv); + KASSERT(error == 0, ("unexpected state, error %u", error)); if (uio->uio_td) uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); @@ -979,43 +1055,110 @@ if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); - orig_resid = 0; - if (psa != NULL) + if (psa != NULL) { *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); + if (*psa == NULL) { + error = ENOMEM; + goto release; + } + /* + * XXXRW: In the rwatson_netperf branch, we don't + * release the socket buffer lock here because + * we always use M_NOWAIT. In the main tree, + * or if we restore conditional waiting, we + * need to refresh nextpacket from the socket + * buffer version of the head mbuf, or we may + * have a stale value if it was previously NULL + * and now isn't. + */ + nextrecord = m->m_nextpkt; + } if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; + /* + * XXXRW: When running MPSAFE, we may release + * locks after this point, and therefore have + * effective preemption relative to the socket + * buffer mbuf chain. Since we've modified + * the head, we have to make sure the new head + * has the right nextpkt copied from the + * original head. + */ + m->m_nextpkt = nextrecord; } + orig_resid = 0; } - while (m != NULL && m->m_type == MT_CONTROL && error == 0) { - if (flags & MSG_PEEK) { - if (controlp != NULL) - *controlp = m_copy(m, 0, m->m_len); - m = m->m_next; - } else { - sbfree(&so->so_rcv, m); - so->so_rcv.sb_mb = m->m_next; - m->m_next = NULL; - if (pr->pr_domain->dom_externalize) { + if (m != NULL && m->m_type == MT_CONTROL) { + struct mbuf *cm = NULL; + struct mbuf **cme = &cm; + + do { + if (flags & MSG_PEEK) { + /* + * XXXRW: In the BSD/OS version of this + * code, m_copym() is called with M_TRYWAIT, + * and we catch allcation failures and + * jump to release with error of ENOBUFS. + * For consistency with the current + * implementation, we don't. + */ + if (controlp != NULL) { + *controlp = m_copym(m, 0, m->m_len, + M_DONTWAIT); + controlp = &(*controlp)->m_next; + } + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m->m_next; + m->m_next = NULL; + if (controlp) { + /* + * Collect mbufs for processing below. + */ + *cme = m; + cme = &(*cme)->m_next; + } else + m_free(m); + m = so->so_rcv.sb_mb; + } + } while (m != NULL && m->m_type == MT_CONTROL); + /* + * XXXRW: Since we're dropping the socket buffer locks, and + * may have modified the mbuf list on the socket buffer, + * push out the latest version so it can be seen by any + * other code that accesses the buffer in the mean time. + */ + so->so_rcv.sb_mb->m_nextpkt = nextrecord; + if (nextrecord == NULL) + so->so_rcv.sb_lastrecord = so->so_rcv.sb_mb; + if (cm != NULL) { + if (pr->pr_domain->dom_externalize != NULL) { + /* + * NB: drop the lock to avoid potential LORs; + * in particular unix domain sockets grab the + * file descriptor lock which would be a LOR. + */ SOCKBUF_UNLOCK(&so->so_rcv); error = (*pr->pr_domain->dom_externalize) - (m, controlp); + (cm, controlp); SOCKBUF_LOCK(&so->so_rcv); - } else if (controlp != NULL) - *controlp = m; - else - m_freem(m); - m = so->so_rcv.sb_mb; + } else + m_freem(cm); } - if (controlp != NULL) { - orig_resid = 0; - while (*controlp != NULL) - controlp = &(*controlp)->m_next; - } + /* + * XXXRW: During externalization, additional mbuf chains + * may have been added to the socket buffer. Update our + * local cache to avoid using a stale value and corrupting + * the list. + */ + nextrecord = so->so_rcv.sb_mb->m_nextpkt; + orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { @@ -1076,7 +1219,6 @@ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); - splx(s); #ifdef ZERO_COPY_SOCKETS if (so_zero_copy_receive) { vm_page_t pg; @@ -1101,7 +1243,6 @@ #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); - s = splnet(); if (error) goto release; } else @@ -1143,6 +1284,7 @@ *mp = m_copym(m, 0, len, M_TRYWAIT); m->m_data += len; m->m_len -= len; + SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_cc -= len; } } @@ -1177,6 +1319,12 @@ /* * Notify the protocol that some data has been * drained before blocking. + * + * XXXRW: We drop the socket buffer lock here + * because we know the protocol will need to do its + * own locking. However, after calling pru_rcvd() + * and re-grabbing the buffer mutex, we don't + * re-check the condition before sleeping. */ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); @@ -1186,8 +1334,10 @@ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); - if (error) + if (error) { + error = 0; goto release; + } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; @@ -1198,7 +1348,7 @@ flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); - (void) sbdroprecord(&so->so_rcv); + (void) sbdroprecord_locked(&so->so_rcv); } } if ((flags & MSG_PEEK) == 0) { @@ -1217,6 +1367,11 @@ } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); + /* + * XXXRW: We drop the socket buffer lock before calling + * down into the protocol. Is that OK in the calling + * context? + */ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) { SOCKBUF_UNLOCK(&so->so_rcv); (*pr->pr_usrreqs->pru_rcvd)(so, flags); @@ -1225,11 +1380,8 @@ } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && - (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { - sbunlock(&so->so_rcv); - splx(s); - goto restart; - } + (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) + goto restart; /* XXX multi-counts msgs */ if (flagsp != NULL) *flagsp |= flags; @@ -1237,8 +1389,8 @@ SOCKBUF_LOCK_ASSERT(&so->so_rcv); sbunlock(&so->so_rcv); out: + SOCKBUF_LOCK_ASSERT(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); - splx(s); return (error); } @@ -1267,23 +1419,42 @@ struct protosw *pr = so->so_proto; struct sockbuf asb; + /* + * XXXRW: This is quite ugly. The existing code made a copy of the + * socket buffer, then zero'd the original to clear the buffer + * fields. However, with mutexes in the socket buffer, this causes + * problems. We only clear the zeroable bits of the original; + * however, we have to initialize and destroy the mutex in the copy + * so that dom_dispose() and sbrelease() can lock t as needed. + */ SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOINTR; (void) sblock(sb, M_WAITOK); - socantrcvmore(so); + /* + * socantrcvmore_locked() drops the socket buffer mutex so that it + * can safely perform wakeups. Re-acquire the mutex before + * continuing. + */ + socantrcvmore_locked(so); + SOCKBUF_LOCK(sb); sbunlock(sb); - asb = *sb; /* - * Invalidate/clear most of the sockbuf structure, but keep - * its selinfo structure valid. + * Invalidate/clear most of the sockbuf structure, but leave + * selinfo and mutex data unchanged. */ + bzero(&asb, offsetof(struct sockbuf, sb_startzero)); + bcopy(&sb->sb_startzero, &asb.sb_startzero, + sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); + /* XXXRW: is passing in sb_mb this way really safe? */ + SOCKBUF_LOCK_INIT(&asb, "so_rcv"); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease(&asb, so); + SOCKBUF_LOCK_DESTROY(&asb); } #ifdef INET @@ -1292,20 +1463,31 @@ struct socket *so; struct sockopt *sopt; { - struct accept_filter_arg *afap = NULL; + struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *af = so->so_accf; + struct so_accf *newaf; int error = 0; + newaf = NULL; + afap = NULL; + + /* + * XXXRW: Configuring accept filters should be an atomic + * test-and-set operation to prevent races during setup and + * and attach. There may be more general issues of racing + * and ordering here that are not yet addressed by locking. + */ /* do not set/remove accept filters on non listen sockets */ + SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { - error = EINVAL; - goto out; + SOCK_UNLOCK(so); + return (EINVAL); } /* removing the filter */ if (sopt == NULL) { - if (af != NULL) { + if (so->so_accf != NULL) { + struct so_accf *af = so->so_accf; if (af->so_accept_filter != NULL && af->so_accept_filter->accf_destroy != NULL) { af->so_accept_filter->accf_destroy(so); @@ -1317,47 +1499,80 @@ so->so_accf = NULL; } so->so_options &= ~SO_ACCEPTFILTER; + SOCK_UNLOCK(so); return (0); } - /* adding a filter */ - /* must remove previous filter first */ - if (af != NULL) { - error = EINVAL; - goto out; - } + SOCK_UNLOCK(so); + + /*- + * Adding a filter. + * + * Do memory allocation, copyin, and filter lookup now while we're + * not holding any locks. Avoids sleeping with a mutex, as well + * as introducing a lock order between accept filter locks and + * socket locks here. + */ + MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, + M_WAITOK); /* don't put large objects on the kernel stack */ - MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; - if (error) - goto out; + if (error) { + FREE(afap, M_TEMP); + return (error); + } afp = accept_filt_get(afap->af_name); if (afp == NULL) { - error = ENOENT; + FREE(afap, M_TEMP); + return (ENOENT); + } + + /* + * Allocate the new accept filter instance storage. We may have + * to free it again later if we fail to attach it. If attached + * properly, 'newaf' is NULLed to avoid a free() while in use. + */ + MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK | + M_ZERO); + if (afp->accf_create != NULL && afap->af_name[0] != '\0') { + int len = strlen(afap->af_name) + 1; + MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF, + M_WAITOK); + strcpy(newaf->so_accept_filter_str, afap->af_name); + } + + SOCK_LOCK(so); + /* must remove previous filter first */ + if (so->so_accf != NULL) { + error = EINVAL; goto out; } - MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); + /* + * Invoke the accf_create() method of the filter if required. + * XXXRW: the socket mutex is held over this call, so the + * create method cannot block. This may be something we have + * to change, but it would require addressing possible races. + */ if (afp->accf_create != NULL) { - if (afap->af_name[0] != '\0') { - int len = strlen(afap->af_name) + 1; - - MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); - strcpy(af->so_accept_filter_str, afap->af_name); - } - af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); - if (af->so_accept_filter_arg == NULL) { - FREE(af->so_accept_filter_str, M_ACCF); - FREE(af, M_ACCF); - so->so_accf = NULL; + newaf->so_accept_filter_arg = + afp->accf_create(so, afap->af_arg); + if (newaf->so_accept_filter_arg == NULL) { error = EINVAL; goto out; } } - af->so_accept_filter = afp; - so->so_accf = af; + newaf->so_accept_filter = afp; + so->so_accf = newaf; so->so_options |= SO_ACCEPTFILTER; + newaf = NULL; out: + SOCK_UNLOCK(so); + if (newaf != NULL) { + if (newaf->so_accept_filter_str != NULL) + FREE(newaf->so_accept_filter_str, M_ACCF); + FREE(newaf, M_ACCF); + } if (afap != NULL) FREE(afap, M_TEMP); return (error); @@ -1622,23 +1837,33 @@ switch (sopt->sopt_name) { #ifdef INET case SO_ACCEPTFILTER: + /* Unlocked read. */ if ((so->so_options & SO_ACCEPTCONN) == 0) return (EINVAL); MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); + SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTFILTER) != 0) { strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); if (so->so_accf->so_accept_filter_str != NULL) strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); } + SOCK_UNLOCK(so); error = sooptcopyout(sopt, afap, sizeof(*afap)); FREE(afap, M_TEMP); break; #endif case SO_LINGER: + /* + * XXXRW: We grab the lock here to get a consistent + * snapshot of both fields. This may not really + * be necessary. + */ + SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; + SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; @@ -1653,6 +1878,7 @@ case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: + /* Unlocked read. */ optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); @@ -1859,10 +2085,16 @@ int revents = 0; int s = splnet(); + /* + * XXXRW: Lots of unlocked reads, and some writes. Probably + * some more locking is called for here, especially when + * setting the sb_sel flags. + */ if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) revents |= events & (POLLIN | POLLRDNORM); + /* Unlocked read. */ if (events & POLLINIGNEOF) if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || !TAILQ_EMPTY(&so->so_comp) || so->so_error) @@ -1880,6 +2112,9 @@ if (events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND)) { + /* + * XXXRW: Should lock also be held over selrecord()? + */ selrecord(td, &so->so_rcv.sb_sel); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_SEL; @@ -1887,6 +2122,9 @@ } if (events & (POLLOUT | POLLWRNORM)) { + /* + * XXXRW: Should lock also be held over selrecord()? + */ selrecord(td, &so->so_snd.sb_sel); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_SEL; @@ -1906,6 +2144,7 @@ switch (kn->kn_filter) { case EVFILT_READ: + /* Unlocked read. */ if (so->so_options & SO_ACCEPTCONN) kn->kn_fop = &solisten_filtops; else @@ -2022,6 +2261,7 @@ { struct socket *so = kn->kn_fp->f_data; + /* Unlocked read. */ kn->kn_data = so->so_qlen; return (! TAILQ_EMPTY(&so->so_comp)); } --- //depot/vendor/freebsd/src/sys/kern/uipc_socket2.c 2004/06/19 03:25:35 +++ //depot/user/rwatson/netperf/sys/kern/uipc_socket2.c 2004/06/20 17:21:28 @@ -106,21 +106,39 @@ { SOCK_LOCK(so); + /* XXXRW: so_state locking? */ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } +/* + * soisconnected() transitions a socket to a fully connected state. If + * so->so_head is non-NULL, we transition it from the incompletely + * connected queue to the completely connected queue. Otherwise, we + * just wake up the socket since it was an out-bound connection. + */ void soisconnected(so) struct socket *so; { struct socket *head; +#ifdef INVARIANTS + int need_lock = !SOCK_OWNED(so); +#endif + KASSERT(need_lock == 1, ("soisconnected(): called with lock!")); + /* XXXRW: so_state locking? */ SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; SOCK_UNLOCK(so); + + /* + * XXXRW: Maybe an unlocked read of so_head would be desirable + * here? Unlocked read of so_qstate probably not a good idea + * regardless. so_options handling here is a little unsavory. + */ ACCEPT_LOCK(); head = so->so_head; if (head != NULL && (so->so_qstate & SQ_INCOMP)) { @@ -135,20 +153,47 @@ sorwakeup(head); wakeup_one(&head->so_timeo); } else { - ACCEPT_UNLOCK(); + void (*so_upcall)(struct socket *, void *, int); + void *so_upcallarg; + /* + * XXXRW: Should probably copy the upcall fields to + * local stack variables while holding the socket + * lock (and clear the option), then release the + * lock and make the call. This will prevent lock + * order reversals/recursion between this code and + * the upcall implementation, which will likely + * also want to frob the socket using locks. + * + * NOTE: We keep a local copy of the function + * pointer so we can invoke the upcall without + * holding locks. However, we also have to copy + * in the upcall fields from the head because the + * filter may need to be called more than once. + */ SOCK_LOCK(so); - so->so_upcall = + so_upcall = so->so_upcall = head->so_accf->so_accept_filter->accf_callback; - so->so_upcallarg = head->so_accf->so_accept_filter_arg; + so_upcallarg = so->so_upcallarg = + head->so_accf->so_accept_filter_arg; so->so_rcv.sb_flags |= SB_UPCALL; so->so_options &= ~SO_ACCEPTFILTER; SOCK_UNLOCK(so); - so->so_upcall(so, so->so_upcallarg, M_TRYWAIT); + ACCEPT_UNLOCK(); + /* + * Call with our existing reference, but without + * any locks. + */ + so_upcall(so, so_upcallarg, M_TRYWAIT); } return; } ACCEPT_UNLOCK(); wakeup(&so->so_timeo); + /* + * XXXRW: If assuming socket lock is socket buffer receive + * lock, should use a _locked variant of sorwakeup() and + * avoid recursion here. + */ sorwakeup(so); sowwakeup(so); } @@ -174,6 +219,10 @@ so->so_snd.sb_state |= SBS_CANTSENDMORE; SOCKBUF_UNLOCK(&so->so_snd); wakeup(&so->so_timeo); + /* + * XXXRW: Multiple socket buffer lock/unlock here could be avoided + * by coallescing with the above? + */ sowwakeup(so); sorwakeup(so); } @@ -188,6 +237,7 @@ * SOCKBUF_LOCK(&so->so_rcv) even though they are the same mutex to * avoid introducing the assumption that they are the same. */ + /* XXXRW: so_state locking? */ SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; @@ -197,9 +247,13 @@ SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; + sbdrop_locked(&so->so_snd, so->so_snd.sb_cc); SOCKBUF_UNLOCK(&so->so_snd); wakeup(&so->so_timeo); - sbdrop(&so->so_snd, so->so_snd.sb_cc); + /* + * XXXRW: Avoid multiple lock/unlock of socket buffer locks here + * by coallescing with the above? + */ sowwakeup(so); sorwakeup(so); } @@ -236,6 +290,7 @@ so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; + /* XXXRW: so_state locking? */ so->so_state = head->so_state | SS_NOFDREF; so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; @@ -280,6 +335,7 @@ } ACCEPT_UNLOCK(); if (connstatus) { + /* XXXRW: so_state locking? */ so->so_state |= connstatus; sorwakeup(head); wakeup_one(&head->so_timeo); @@ -296,14 +352,36 @@ * protocol when it detects that the peer will send no more data. * Data queued for reading in the socket may yet be read. */ +void +socantsendmore_locked(so) + struct socket *so; +{ + SOCKBUF_LOCK_ASSERT(&so->so_snd); + so->so_snd.sb_state |= SBS_CANTSENDMORE; + sowwakeup_locked(so); + mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); +} + void socantsendmore(so) struct socket *so; { - so->so_snd.sb_state |= SBS_CANTSENDMORE; - sowwakeup(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); +} + +void +socantrcvmore_locked(so) + struct socket *so; +{ + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + + so->so_rcv.sb_state |= SBS_CANTRCVMORE; + sorwakeup_locked(so); + mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void @@ -311,8 +389,9 @@ struct socket *so; { - so->so_rcv.sb_state |= SBS_CANTRCVMORE; - sorwakeup(so); + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } /* @@ -356,9 +435,16 @@ } /* - * Wakeup processes waiting on a socket buffer. - * Do asynchronous notification via SIGIO - * if the socket has the SS_ASYNC flag set. + * Wakeup processes waiting on a socket buffer. Do asynchronous + * notification via SIGIO if the socket has the SS_ASYNC flag set. + * + * Called with the socket buffer lock held; will release the lock by the end + * of the function. This allows the caller to acquire the socket buffer lock + * while testing for the need for various sorts of wakeup and hold it through + * to the point where it's no longer required. We currently hold the lock + * through calls out to other subsystems (with the exception of kqueue), and + * then release it to avoid lock order issues. It's not clear that's + * correct. */ void sowakeup(so, sb) @@ -366,19 +452,31 @@ register struct sockbuf *sb; { + SOCKBUF_LOCK_ASSERT(sb); + selwakeuppri(&sb->sb_sel, PSOCK); sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_cc); } + + KNOTE(&sb->sb_sel.si_note, 0); + + SOCKBUF_UNLOCK(sb); + + /* + * XXXRW: What locks should be held over what parts of the + * following? Currently we hold none. + */ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); if (sb->sb_flags & SB_UPCALL) (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); - if (sb->sb_flags & SB_AIO) + if (sb->sb_flags & SB_AIO) /* XXX locking */ aio_swake(so, sb); - KNOTE(&sb->sb_sel.si_note, 0); + + mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED); } /* @@ -418,7 +516,7 @@ register struct socket *so; u_long sndcc, rcvcc; { - struct thread *td = curthread; + struct thread *td = curthread; /* XXX */ if (sbreserve(&so->so_snd, sndcc, so, td) == 0) goto bad; @@ -478,6 +576,13 @@ /* * td will only be NULL when we're in an interrupt * (e.g. in tcp_input()) + * + * XXXRW: This comment is true, but only because the caller passed + * in NULL, not for the 4.x reason that there is no thread + * available. Need to be careful of callers that do this wrong; + * I suspect many do it wrong, and therefore many socket buffers + * end up with the wrong limits, especially via the soreserve() + * path. */ if (cc > sb_max_adj) return (0); @@ -500,17 +605,29 @@ * Free mbufs held by a socket, and reserved mbuf space. */ void -sbrelease(sb, so) +sbrelease_locked(sb, so) struct sockbuf *sb; struct socket *so; { - sbflush(sb); + SOCKBUF_LOCK_ASSERT(sb); + + sbflush_locked(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } +void +sbrelease(sb, so) + struct sockbuf *sb; + struct socket *so; +{ + + SOCKBUF_LOCK(sb); + sbrelease_locked(sb, so); + SOCKBUF_UNLOCK(sb); +} /* * Routines to add and remove * data from an mbuf queue. @@ -542,6 +659,8 @@ { struct mbuf *m = sb->sb_mb; + SOCKBUF_LOCK_ASSERT(sb); + while (m && m->m_nextpkt) m = m->m_nextpkt; @@ -561,6 +680,8 @@ struct mbuf *m = sb->sb_mb; struct mbuf *n; + SOCKBUF_LOCK_ASSERT(sb); + while (m && m->m_nextpkt) m = m->m_nextpkt; @@ -583,6 +704,7 @@ #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ + SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ @@ -597,14 +719,17 @@ * discarded and mbufs are compacted where possible. */ void -sbappend(sb, m) +sbappend_locked(sb, m) struct sockbuf *sb; struct mbuf *m; { register struct mbuf *n; + SOCKBUF_LOCK_ASSERT(sb); + if (m == 0) return; + SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { @@ -612,7 +737,7 @@ n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { - sbappendrecord(sb, m); /* XXXXXX!!!! */ + sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); @@ -625,7 +750,7 @@ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { - sbappendrecord(sb, m); /* XXXXXX!!!! */ + sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); @@ -642,13 +767,31 @@ } /* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + + SOCKBUF_LOCK(sb); + sbappend_locked(sb, m); + SOCKBUF_UNLOCK(sb); +} + +/* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). */ void -sbappendstream(struct sockbuf *sb, struct mbuf *m) +sbappendstream_locked(struct sockbuf *sb, struct mbuf *m) { + SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); @@ -661,6 +804,20 @@ SBLASTRECORDCHK(sb); } +/* + * This version of sbappend() should only be used when the caller + * absolutely knows that there will never be more than one record + * in the socket buffer, that is, a stream protocol (such as TCP). + */ +void +sbappendstream(struct sockbuf *sb, struct mbuf *m) +{ + + SOCKBUF_LOCK(sb); + sbappendstream_locked(sb, m); + SOCKBUF_UNLOCK(sb); +} + #ifdef SOCKBUF_DEBUG void sbcheck(sb) @@ -670,6 +827,8 @@ struct mbuf *n = 0; u_long len = 0, mbcnt = 0; + SOCKBUF_LOCK_ASSERT(sb); + for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { @@ -692,12 +851,14 @@ * begins a new record. */ void -sbappendrecord(sb, m0) +sbappendrecord_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; + SOCKBUF_LOCK_ASSERT(sb); + if (m0 == 0) return; m = sb->sb_mb; @@ -725,18 +886,35 @@ } /* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + + SOCKBUF_LOCK(sb); + sbappendrecord_locked(sb, m0); + SOCKBUF_UNLOCK(sb); +} + +/* * As above except that OOB data * is inserted at the beginning of the sockbuf, * but after any other OOB data. */ void -sbinsertoob(sb, m0) +sbinsertoob_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; register struct mbuf **mp; + SOCKBUF_LOCK_ASSERT(sb); + if (m0 == 0) return; for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { @@ -771,13 +949,29 @@ } /* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + + SOCKBUF_LOCK(sb); + sbinsertoob_locked(sb, m0); + SOCKBUF_UNLOCK(sb); +} + +/* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int -sbappendaddr(sb, asa, m0, control) +sbappendaddr_locked(sb, asa, m0, control) struct sockbuf *sb; const struct sockaddr *asa; struct mbuf *m0, *control; @@ -785,11 +979,14 @@ struct mbuf *m, *n, *nlast; int space = asa->sa_len; + SOCKBUF_LOCK_ASSERT(sb); + if (m0 && (m0->m_flags & M_PKTHDR) == 0) - panic("sbappendaddr"); + panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &n); + if (space > sbspace(sb)) return (0); #if MSIZE <= 256 @@ -811,25 +1008,47 @@ sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); + sb->sb_mbtail = nlast; - sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); - SBLASTRECORDCHK(sb); return (1); } +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. + */ int -sbappendcontrol(sb, m0, control) +sbappendaddr(sb, asa, m0, control) + struct sockbuf *sb; + const struct sockaddr *asa; + struct mbuf *m0, *control; +{ + int retval; + + SOCKBUF_LOCK(sb); + retval = sbappendaddr_locked(sb, asa, m0, control); + SOCKBUF_UNLOCK(sb); + return (retval); +} + +int +sbappendcontrol_locked(sb, m0, control) struct sockbuf *sb; struct mbuf *control, *m0; { struct mbuf *m, *n, *mlast; int space; + SOCKBUF_LOCK_ASSERT(sb); + if (control == 0) - panic("sbappendcontrol"); + panic("sbappendcontrol_locked"); space = m_length(control, &n) + m_length(m0, NULL); + if (space > sbspace(sb)) return (0); n->m_next = m0; /* concatenate data to control */ @@ -841,14 +1060,27 @@ sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); + sb->sb_mbtail = mlast; - sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); + SBLASTRECORDCHK(sb); - SBLASTRECORDCHK(sb); return (1); } +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + int retval; + + SOCKBUF_LOCK(sb); + retval = sbappendcontrol_locked(sb, m0, control); + SOCKBUF_UNLOCK(sb); + return (retval); +} + /* * Compress mbuf chain m into the socket * buffer sb following mbuf n. If n @@ -862,6 +1094,8 @@ register int eor = 0; register struct mbuf *o; + SOCKBUF_LOCK_ASSERT(sb); + while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && @@ -914,12 +1148,14 @@ * Check that all resources are reclaimed. */ void -sbflush(sb) +sbflush_locked(sb) register struct sockbuf *sb; { + SOCKBUF_LOCK_ASSERT(sb); + if (sb->sb_flags & SB_LOCK) - panic("sbflush: locked"); + panic("sbflush_locked: locked"); while (sb->sb_mbcnt) { /* * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: @@ -927,23 +1163,36 @@ */ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; - sbdrop(sb, (int)sb->sb_cc); + sbdrop_locked(sb, (int)sb->sb_cc); } if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) - panic("sbflush: cc %u || mb %p || mbcnt %u", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); + panic("sbflush_locked: cc %u || mb %p || mbcnt %u", + sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +void +sbflush(sb) + register struct sockbuf *sb; +{ + + SOCKBUF_LOCK(sb); + sbflush_locked(sb); + SOCKBUF_UNLOCK(sb); } /* * Drop data from (the front of) a sockbuf. */ void -sbdrop(sb, len) +sbdrop_locked(sb, len) register struct sockbuf *sb; register int len; { register struct mbuf *m; struct mbuf *next; + SOCKBUF_LOCK_ASSERT(sb); + next = (m = sb->sb_mb) ? m->m_nextpkt : 0; while (len > 0) { if (m == 0) { @@ -990,15 +1239,31 @@ } /* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + + SOCKBUF_LOCK(sb); + sbdrop(sb, len); + SOCKBUF_UNLOCK(sb); +} + +/* * Drop a record off the front of a sockbuf * and move the next record to the front. */ void -sbdroprecord(sb) +sbdroprecord_locked(sb) register struct sockbuf *sb; { register struct mbuf *m; + SOCKBUF_LOCK_ASSERT(sb); + m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; @@ -1011,6 +1276,20 @@ } /* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + + SOCKBUF_LOCK(sb); + sbdroprecord_locked(sb); + SOCKBUF_UNLOCK(sb); +} + +/* * Create a "control" mbuf containing the specified data * with the specified type for presentation on a socket buffer. */ @@ -1142,6 +1421,7 @@ xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; + /* Unlocked read. */ xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; --- //depot/vendor/freebsd/src/sys/kern/uipc_syscalls.c 2004/06/19 03:25:35 +++ //depot/user/rwatson/netperf/sys/kern/uipc_syscalls.c 2004/06/19 04:30:55 @@ -278,6 +278,7 @@ error = fgetsock(td, uap->s, &head, &fflag); if (error) goto done2; + /* Unlocked read. */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; @@ -285,6 +286,10 @@ error = falloc(td, &nfp, &fd); if (error) goto done; + /* + * Unlocked reads. + * XXXRW: Dubious use of so->so_error. + */ ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); @@ -332,6 +337,11 @@ /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; + /* + * XXXRW: Might make life simpler to grab the socket buffer lock + * here so it's held when KNOTE() calls back into the socket + * code. + */ /* connection has been removed from the listen queue */ KNOTE(&head->so_rcv.sb_sel.si_note, 0); @@ -481,6 +491,7 @@ NET_LOCK_GIANT(); if ((error = fgetsock(td, fd, &so, NULL)) != 0) goto done2; + /* XXXRW: so_state locking? */ if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; @@ -495,13 +506,17 @@ error = soconnect(so, sa, td); if (error) goto bad; + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } s = splnet(); + SOCK_LOCK(so); + /* XXXRW: so_state locking? */ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0); + error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, + "connec", 0); if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; @@ -512,8 +527,10 @@ error = so->so_error; so->so_error = 0; } + SOCK_UNLOCK(so); splx(s); bad: + /* XXXRW: so_state locking? */ if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) @@ -1508,6 +1525,7 @@ NET_LOCK_GIANT(); if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) goto done2; + /* XXXRW: so_state locking? */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done1; @@ -1750,6 +1768,7 @@ error = EINVAL; goto done; } + /* XXXRW: so_state locking? */ if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto done; @@ -1844,6 +1863,7 @@ * before going to the extra work of constituting the sf_buf. */ SOCKBUF_LOCK(&so->so_snd); + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { if (so->so_snd.sb_state & SBS_CANTSENDMORE) error = EPIPE; @@ -1909,6 +1929,7 @@ * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; + mtx_lock(&Giant); /* VFS */ vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); /* * XXXMAC: Because we don't have fp->f_cred here, @@ -1920,6 +1941,7 @@ IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); VOP_UNLOCK(vp, 0, td); + mtx_unlock(&Giant); /* VFS */ if (error) VM_OBJECT_LOCK(obj); vm_page_lock_queues(); @@ -2032,6 +2054,7 @@ * after checking the connection state above in order to avoid * a race condition with sbwait(). */ + /* XXXRW: so_state locking? */ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { m_freem(m); --- //depot/vendor/freebsd/src/sys/kern/uipc_usrreq.c 2004/06/17 17:21:12 +++ //depot/user/rwatson/netperf/sys/kern/uipc_usrreq.c 2004/06/20 15:39:13 @@ -280,6 +280,9 @@ if (unp->unp_conn == NULL) break; so2 = unp->unp_conn->unp_socket; + /* NB: careful of order here */ + SOCKBUF_LOCK(&so2->so_snd); + SOCKBUF_LOCK(&so->so_rcv); /* * Adjust backpressure on sender * and wakeup any waiting to write. @@ -291,6 +294,8 @@ (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_cc = so->so_rcv.sb_cc; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_UNLOCK(&so2->so_snd); sowwakeup(so2); break; @@ -349,11 +354,14 @@ from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; - if (sbappendaddr(&so2->so_rcv, from, m, control)) { + SOCKBUF_LOCK(&so2->so_rcv); + if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { + SOCKBUF_UNLOCK(&so2->so_rcv); sorwakeup(so2); m = NULL; control = NULL; } else { + SOCKBUF_UNLOCK(&so2->so_rcv); error = ENOBUFS; } if (nam != NULL) @@ -367,6 +375,7 @@ * Note: A better implementation would complain * if not equal to the peer's address. */ + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); @@ -378,6 +387,7 @@ } } + /* Unlocked read. */ if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; break; @@ -385,16 +395,17 @@ if (unp->unp_conn == NULL) panic("uipc_send connected but no connection?"); so2 = unp->unp_conn->unp_socket; + SOCKBUF_LOCK(&so2->so_rcv); /* * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. */ if (control != NULL) { - if (sbappendcontrol(&so2->so_rcv, m, control)) + if (sbappendcontrol_locked(&so2->so_rcv, m, control)) control = NULL; } else { - sbappend(&so2->so_rcv, m); + sbappend_locked(&so2->so_rcv, m); } so->so_snd.sb_mbmax -= so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt; @@ -404,6 +415,7 @@ (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_conn->unp_cc = so2->so_rcv.sb_cc; + SOCKBUF_UNLOCK(&so2->so_rcv); sorwakeup(so2); m = NULL; break; @@ -445,6 +457,7 @@ sb->st_blksize = so->so_snd.sb_hiwat; if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { so2 = unp->unp_conn->unp_socket; + /* Unlocked read. */ sb->st_blksize += so2->so_rcv.sb_cc; } sb->st_dev = NODEV; @@ -906,6 +919,7 @@ struct unpcb *unp; { register struct unpcb *unp2 = unp->unp_conn; + struct socket *so; UNP_LOCK_ASSERT(); @@ -916,7 +930,10 @@ case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); - unp->unp_socket->so_state &= ~SS_ISCONNECTED; + so = unp->unp_socket; + SOCK_LOCK(so); + so->so_state &= ~SS_ISCONNECTED; + SOCK_UNLOCK(so); break; case SOCK_STREAM: @@ -1388,6 +1405,10 @@ UNP_LOCK_ASSERT(); + /* + * XXXRW: unp_gcing seems like a bad idea. Use a real lock + * instead? + */ if (unp_gcing) return; unp_gcing = 1; @@ -1470,6 +1491,9 @@ * of a signal. If sbwait does return * an error, we'll go into an infinite * loop. Delete all of this for now. + * + * XXXRW: We will need to lock the socket + * buffer here if we enable this code. */ (void) sbwait(&so->so_rcv); goto restart; @@ -1482,7 +1506,9 @@ * message buffers. Follow those links and mark them * as accessible too. */ + SOCKBUF_LOCK(&so->so_rcv); unp_scan(so->so_rcv.sb_mb, unp_mark); + SOCKBUF_UNLOCK(&so->so_rcv); } } while (unp_defer); sx_sunlock(&filelist_lock); --- //depot/vendor/freebsd/src/sys/net/if.c 2004/06/16 09:52:13 +++ //depot/user/rwatson/netperf/sys/net/if.c 2004/06/18 00:24:39 @@ -369,6 +369,10 @@ struct sockaddr_dl *sdl; struct ifaddr *ifa; + /* + * XXXRW: Shouldn't we add to the global list only once the ifnet + * is ready for use? + */ IF_AFDATA_LOCK_INIT(ifp); ifp->if_afdata_initialized = 0; IFNET_WLOCK(); @@ -670,6 +674,7 @@ /* * Create a clone network interface. + * XXXRW: Locking? */ int if_clone_create(char *name, int len) @@ -742,6 +747,7 @@ /* * Destroy a clone network interface. + * XXXRW: Locking? */ int if_clone_destroy(const char *name) @@ -779,6 +785,7 @@ /* * Look up a network interface cloner. + * XXXRW: Locking? */ static struct if_clone * if_clone_lookup(const char *name, int *unitp) @@ -820,6 +827,7 @@ /* * Register a network interface cloner. + * XXXRW: Locking? */ void if_clone_attach(struct if_clone *ifc) @@ -862,6 +870,7 @@ /* * Unregister a network interface cloner. + * XXXRW: Locking? */ void if_clone_detach(struct if_clone *ifc) @@ -874,6 +883,7 @@ /* * Provide list of interface cloners to userspace. + * XXXRW: Locking? */ static int if_clone_list(struct if_clonereq *ifcr) --- //depot/vendor/freebsd/src/sys/net/if_gif.c 2004/05/30 20:25:31 +++ //depot/user/rwatson/netperf/sys/net/if_gif.c 2004/05/31 01:41:33 @@ -88,6 +88,10 @@ * gif_mtx protects the global gif_softc_list. * XXX: Per-softc locking is still required. */ +/* + * XXXRW: Note that gif_mtx only protects global gif-related data, not + * per-softc data. See also netinet/in_gif.c for locking needs. + */ static struct mtx gif_mtx; static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static LIST_HEAD(, gif_softc) gif_softc_list; @@ -500,6 +504,9 @@ } /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */ +/* + * XXXRW: per-gif softc locking required. + */ int gif_ioctl(ifp, cmd, data) struct ifnet *ifp; @@ -756,8 +763,9 @@ int s; int error = 0; - s = splnet(); - + /* + * XXXRW: per-gif softc locking required. + */ mtx_lock(&gif_mtx); LIST_FOREACH(sc2, &gif_softc_list, gif_list) { if (sc2 == sc) @@ -786,6 +794,9 @@ } mtx_unlock(&gif_mtx); + /* + * XXXRW: Lock gif softc fields. + */ /* XXX we can detach from both, but be polite just in case */ if (sc->gif_psrc) switch (sc->gif_psrc->sa_family) { --- //depot/vendor/freebsd/src/sys/net/if_gre.c 2004/05/30 20:25:31 +++ //depot/user/rwatson/netperf/sys/net/if_gre.c 2004/05/31 01:41:33 @@ -94,7 +94,8 @@ /* * gre_mtx protects all global variables in if_gre.c. - * XXX: gre_softc data not protected yet. + * + * XXXRW: It does not protect softc-specific data. */ struct mtx gre_mtx; static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation"); --- //depot/vendor/freebsd/src/sys/net/if_gre.h 2004/03/22 16:06:54 +++ //depot/user/rwatson/netperf/sys/net/if_gre.h 2004/03/22 16:18:59 @@ -54,6 +54,12 @@ WCCP_V2 } wccp_ver_t; +/* + * XXXRW: softc fields need locking. + * + * XXXRW: gre's notion of a 'called' count is not MP-safe, as it assumes + * only one packet can be processed at a time. + */ struct gre_softc { struct ifnet sc_if; LIST_ENTRY(gre_softc) sc_list; --- //depot/vendor/freebsd/src/sys/net/if_sl.c 2004/06/16 09:52:13 +++ //depot/user/rwatson/netperf/sys/net/if_sl.c 2004/06/18 00:24:39 @@ -162,6 +162,7 @@ #define ABT_WINDOW (ABT_COUNT*2+2) /* in seconds - time to count */ static LIST_HEAD(sl_list, sl_softc) sl_list; +static struct mtx slip_mtx; #define FRAME_END 0xc0 /* Frame End */ #define FRAME_ESCAPE 0xdb /* Frame Esc */ @@ -198,6 +199,7 @@ { switch (type) { case MOD_LOAD: + mtx_init(&slip_mtx, "slip_mtx", NULL, MTX_DEF); ldisc_register(SLIPDISC, &slipdisc); LIST_INIT(&sl_list); break; @@ -217,6 +219,7 @@ DECLARE_MODULE(if_sl, sl_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +/* Locked using slip_mtx. */ static int *st_unit_list; static size_t st_unit_max = 0; @@ -225,6 +228,7 @@ { struct sl_softc *nc; + mtx_assert(&slip_mtx, MA_OWNED); LIST_FOREACH(nc, &sl_list, sl_next) { if (nc->sc_if.if_dunit == unit) return (0); @@ -238,6 +242,7 @@ { size_t i; + mtx_assert(&slip_mtx, MA_OWNED); for (i = 0; i < st_unit_max; i++) if (st_unit_list[i] == unit) return 1; @@ -250,6 +255,7 @@ { int *t; + mtx_assert(&slip_mtx, MA_OWNED); if (slisstatic(unit)) return; @@ -312,10 +318,12 @@ sc->sc_if.if_linkmib = sc; sc->sc_if.if_linkmiblen = sizeof *sc; mtx_init(&sc->sc_fastq.ifq_mtx, "sl_fastq", NULL, MTX_DEF); + mtx_init(&sc->sc_mtx, "slip sc_mtx", NULL, MTX_DEF); /* * Find a suitable unit number. */ + mtx_lock(&slip_mtx); for (unit=0; ; unit++) { if (slisstatic(unit)) continue; @@ -325,6 +333,7 @@ } if_initname(&sc->sc_if, "sl", unit); LIST_INSERT_HEAD(&sl_list, sc, sl_next); + mtx_unlock(&slip_mtx); if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_SLIP, SLIP_HDRLEN); @@ -387,10 +396,20 @@ static void sldestroy(struct sl_softc *sc) { + + /* + * XXXRW: Slight race here: we may detach bpf/if before we + * attach. This appears to be a property of the unit selection + * process, which might be better handled by the interface + * cloning subsystem? + */ bpfdetach(&sc->sc_if); if_detach(&sc->sc_if); + mtx_lock(&slip_mtx); LIST_REMOVE(sc, sl_next); + mtx_unlock(&slip_mtx); m_free(sc->sc_mbuf); + mtx_destroy(&sc->sc_mtx); mtx_destroy(&sc->sc_fastq.ifq_mtx); if (sc->bpfbuf) free(sc->bpfbuf, M_SL); @@ -420,6 +439,10 @@ tp->t_line = 0; sc = (struct sl_softc *)tp->t_sc; if (sc != NULL) { + /* + * XXXRW: tear-down race between timeout and slclose()? + */ + mtx_lock(&sc->sc_mtx); if (sc->sc_outfill) { sc->sc_outfill = 0; untimeout(sl_outfill, sc, sc->sc_ofhandle); @@ -428,6 +451,7 @@ sc->sc_keepalive = 0; untimeout(sl_keepalive, sc, sc->sc_kahandle); } + mtx_unlock(&sc->sc_mtx); if_down(&sc->sc_if); sc->sc_ttyp = NULL; tp->t_sc = NULL; @@ -465,12 +489,21 @@ splx(s); return (ENXIO); } + /* + * XXXRW: we hold the mutex over all of this to protect + * the unit change and global list consistency. However, + * some of these functions probably sleep, making this + * wrong. If we have to support renumbering, we probably + * need a way to reserve both numbers to prevent them + * from being reused during the change, or a way to sleep + * waiting for a change to end (i.e., a CV). + */ + mtx_lock(&slip_mtx); if (sc->sc_if.if_dunit != unit) { if (!slisunitfree(unit)) { - splx(s); + mtx_unlock(&slip_mtx); return (ENXIO); } - wasup = sc->sc_if.if_flags & IFF_UP; bpfdetach(&sc->sc_if); if_detach(&sc->sc_if); @@ -488,9 +521,11 @@ SLIP_HIWAT + 2 * sc->sc_if.if_mtu + 1); } slmarkstatic(unit); + mtx_unlock(&slip_mtx); break; case SLIOCSKEEPAL: + mtx_lock(&sc->sc_mtx); sc->sc_keepalive = *(u_int *)data * hz; if (sc->sc_keepalive) { sc->sc_flags |= SC_KEEPALIVE; @@ -502,6 +537,7 @@ sc->sc_flags &= ~SC_KEEPALIVE; } } + mtx_unlock(&sc->sc_mtx); break; case SLIOCGKEEPAL: @@ -509,6 +545,7 @@ break; case SLIOCSOUTFILL: + mtx_lock(&sc->sc_mtx); sc->sc_outfill = *(u_int *)data * hz; if (sc->sc_outfill) { sc->sc_flags |= SC_OUTWAIT; @@ -520,9 +557,11 @@ sc->sc_flags &= ~SC_OUTWAIT; } } + mtx_unlock(&sc->sc_mtx); break; case SLIOCGOUTFILL: + /* Unlocked read. */ *(int *)data = sc->sc_outfill / hz; break; @@ -616,8 +655,11 @@ (*tp->t_oproc)(tp); if (tp->t_outq.c_cc != 0) { - if (sc != NULL) + if (sc != NULL) { + mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~SC_OUTWAIT; + mtx_unlock(&sc->sc_mtx); + } if (tp->t_outq.c_cc > SLIP_HIWAT) return 0; } @@ -647,6 +689,7 @@ * queueing, and the connection id compression will get * munged when this happens. */ + mtx_lock(&sc->sc_mtx); if (sc->sc_if.if_bpf) { /* * We need to save the TCP/IP header before it's @@ -677,9 +720,10 @@ } ip = mtod(m, struct ip *); if (ip->ip_v == IPVERSION && ip->ip_p == IPPROTO_TCP) { - if (sc->sc_if.if_flags & SC_COMPRESS) + if (sc->sc_if.if_flags & SC_COMPRESS) { *mtod(m, u_char *) |= sl_compress_tcp(m, ip, &sc->sc_comp, 1); + } } if (sc->sc_if.if_bpf && sc->bpfbuf) { /* @@ -691,6 +735,7 @@ bcopy(mtod(m, caddr_t), &sc->bpfbuf[SLX_CHDR], CHDR_LEN); BPF_TAP(&sc->sc_if, sc->bpfbuf, len + SLIP_HDRLEN); } + mtx_unlock(&sc->sc_mtx); /* * If system is getting low on clists, just flush our @@ -706,7 +751,9 @@ continue; } + mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~SC_OUTWAIT; + mtx_unlock(&sc->sc_mtx); /* * The extra FRAME_END will start up a new packet, and thus * will flush any accumulated garbage. We do this whenever @@ -796,6 +843,8 @@ { struct mbuf *m, *newm; + mtx_assert(&sc->sc_mtx, MA_OWNED); + MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) return (NULL); @@ -851,13 +900,16 @@ if (sc == NULL) return 0; if (c & TTY_ERRORMASK || (tp->t_state & TS_CONNECTED) == 0) { + mtx_lock(&sc->sc_mtx); sc->sc_flags |= SC_ERROR; + mtx_unlock(&sc->sc_mtx); return 0; } c &= TTY_CHARMASK; ++sc->sc_if.if_ibytes; + mtx_lock(&sc->sc_mtx); if (sc->sc_if.if_flags & IFF_DEBUG) { if (c == ABT_ESC) { /* @@ -876,6 +928,7 @@ sc->sc_starttime = time_second; if (sc->sc_abortcount >= ABT_COUNT) { slclose(tp,0); + mtx_unlock(&sc->sc_mtx); return 0; } } @@ -898,6 +951,7 @@ case FRAME_ESCAPE: sc->sc_escape = 1; + mtx_unlock(&sc->sc_mtx); return 0; case FRAME_END: @@ -982,6 +1036,7 @@ if (sc->sc_mp < sc->sc_ep) { *sc->sc_mp++ = c; sc->sc_escape = 0; + mtx_unlock(&sc->sc_mtx); return 0; } @@ -993,6 +1048,7 @@ newpack: sc->sc_mp = sc->sc_buf = sc->sc_ep - SLRMAX; sc->sc_escape = 0; + mtx_unlock(&sc->sc_mtx); return 0; } @@ -1076,6 +1132,7 @@ { struct sl_softc *sc = chan; + mtx_lock(&sc->sc_mtx); if (sc->sc_keepalive) { if (sc->sc_flags & SC_KEEPALIVE) { if (sc->sc_ttyp->t_pgrp != NULL) { @@ -1089,6 +1146,7 @@ } else { sc->sc_flags &= ~SC_KEEPALIVE; } + mtx_unlock(&sc->sc_mtx); } static void @@ -1099,6 +1157,7 @@ register struct tty *tp = sc->sc_ttyp; int s; + mtx_lock(&sc->sc_mtx); if (sc->sc_outfill && tp != NULL) { if (sc->sc_flags & SC_OUTWAIT) { s = splimp (); @@ -1112,4 +1171,5 @@ } else { sc->sc_flags &= ~SC_OUTWAIT; } + mtx_unlock(&sc->sc_mtx); } --- //depot/vendor/freebsd/src/sys/net/if_slvar.h 2004/04/07 20:52:05 +++ //depot/user/rwatson/netperf/sys/net/if_slvar.h 2004/04/08 03:11:34 @@ -34,13 +34,16 @@ #ifndef _NET_IF_SLVAR_H_ #define _NET_IF_SLVAR_H_ +#include #include /* * Definitions for SLIP interface data structures * * (This exists so programs like slstats can get at the definition - * of sl_softc.) + * of sl_softc.) Fields owned by the SLIP subsystem are protected + * using sc_mtx, with the exception of sc_next, which is protected + * by the global slip_mtx. */ struct sl_softc { struct ifnet sc_if; /* network-visible interface */ @@ -66,6 +69,7 @@ struct slcompress sc_comp; /* tcp compression data */ LIST_ENTRY(sl_softc) sl_next; u_char *bpfbuf; /* hang buffer for bpf here */ + struct mtx sc_mtx; }; /* internal flags */ --- //depot/vendor/freebsd/src/sys/net/if_spppsubr.c 2004/06/16 00:02:04 +++ //depot/user/rwatson/netperf/sys/net/if_spppsubr.c 2004/06/18 00:24:39 @@ -92,12 +92,8 @@ #include #if defined(__FreeBSD__) && __FreeBSD__ >= 3 -# define UNTIMEOUT(fun, arg, handle) untimeout(fun, arg, handle) -# define TIMEOUT(fun, arg1, arg2, handle) handle = timeout(fun, arg1, arg2) # define IOCTL_CMD_T u_long #else -# define UNTIMEOUT(fun, arg, handle) untimeout(fun, arg) -# define TIMEOUT(fun, arg1, arg2, handle) timeout(fun, arg1, arg2) # define IOCTL_CMD_T int #endif @@ -259,10 +255,11 @@ void (*scr)(struct sppp *sp); }; +struct mtx sppp_mtx; +MTX_SYSINIT(sppp_mtx, &sppp_mtx, "sppp_mtx", MTX_DEF); + static struct sppp *spppq; -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 -static struct callout_handle keepalive_ch; -#endif +static struct callout keepalive_callout; #if defined(__FreeBSD__) && __FreeBSD__ >= 3 && __FreeBSD_version < 501113 #define SPP_FMT "%s%d: " @@ -965,13 +962,18 @@ { struct sppp *sp = (struct sppp*) ifp; + mtx_lock(&sppp_mtx); /* Initialize keepalive handler. */ - if (spppq == NULL) - TIMEOUT(sppp_keepalive, 0, hz * 10, keepalive_ch); + if (spppq == NULL) { + callout_init(&keepalive_callout, 0); + callout_reset(&keepalive_callout, hz * 10, sppp_keepalive, + NULL); + } /* Insert new entry into the keepalive list. */ sp->pp_next = spppq; spppq = sp; + mtx_unlock(&sppp_mtx); sp->pp_if.if_mtu = PP_MTU; sp->pp_if.if_flags = IFF_POINTOPOINT | IFF_MULTICAST; @@ -1017,6 +1019,7 @@ struct sppp **q, *p, *sp = (struct sppp*) ifp; int i; + mtx_lock(&sppp_mtx); /* Remove the entry from the keepalive list. */ for (q = &spppq; (p = *q); q = &p->pp_next) if (p == sp) { @@ -1026,11 +1029,12 @@ /* Stop keepalive handler. */ if (spppq == NULL) - UNTIMEOUT(sppp_keepalive, 0, keepalive_ch); + callout_stop(&keepalive_callout); + mtx_unlock(&sppp_mtx); for (i = 0; i < IDX_COUNT; i++) - UNTIMEOUT((cps[i])->TO, (void *)sp, sp->ch[i]); - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout((cps[i])->TO, (void *)sp, sp->ch[i]); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); mtx_destroy(&sp->pp_cpq.ifq_mtx); mtx_destroy(&sp->pp_fastq.ifq_mtx); } @@ -2009,8 +2013,8 @@ case STATE_STOPPING: sppp_cp_send(sp, cp->proto, TERM_REQ, ++sp->pp_seq[cp->protoidx], 0, 0); - TIMEOUT(cp->TO, (void *)sp, sp->lcp.timeout, - sp->ch[cp->protoidx]); + sp->ch[cp->protoidx] = timeout(cp->TO, (void *)sp, + sp->lcp.timeout); break; case STATE_REQ_SENT: case STATE_ACK_RCVD: @@ -2020,8 +2024,8 @@ break; case STATE_ACK_SENT: (cp->scr)(sp); - TIMEOUT(cp->TO, (void *)sp, sp->lcp.timeout, - sp->ch[cp->protoidx]); + sp->ch[cp->protoidx] = timeout(cp->TO, (void *)sp, + sp->lcp.timeout); break; } @@ -2037,7 +2041,7 @@ { sp->state[cp->protoidx] = newstate; - UNTIMEOUT(cp->TO, (void *)sp, sp->ch[cp->protoidx]); + untimeout(cp->TO, (void *)sp, sp->ch[cp->protoidx]); switch (newstate) { case STATE_INITIAL: case STATE_STARTING: @@ -2050,8 +2054,8 @@ case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: - TIMEOUT(cp->TO, (void *)sp, sp->lcp.timeout, - sp->ch[cp->protoidx]); + sp->ch[cp->protoidx] = timeout(cp->TO, (void *)sp, + sp->lcp.timeout); break; } } @@ -4147,7 +4151,7 @@ * a number between 300 and 810 seconds. */ i = 300 + ((unsigned)(random() & 0xff00) >> 7); - TIMEOUT(chap.TO, (void *)sp, i * hz, sp->ch[IDX_CHAP]); + sp->ch[IDX_CHAP] = timeout(chap.TO, (void *)sp, i * hz); } if (debug) { @@ -4191,7 +4195,7 @@ if (debug) log(LOG_DEBUG, SPP_FMT "chap tld\n", SPP_ARGS(ifp)); - UNTIMEOUT(chap.TO, (void *)sp, sp->ch[IDX_CHAP]); + untimeout(chap.TO, (void *)sp, sp->ch[IDX_CHAP]); sp->lcp.protos &= ~(1 << IDX_CHAP); lcp.Close(sp); @@ -4327,7 +4331,7 @@ /* ack and nak are his authproto */ case PAP_ACK: - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); if (debug) { log(LOG_DEBUG, SPP_FMT "pap success", SPP_ARGS(ifp)); @@ -4356,7 +4360,7 @@ break; case PAP_NAK: - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); if (debug) { log(LOG_INFO, SPP_FMT "pap failure", SPP_ARGS(ifp)); @@ -4413,8 +4417,8 @@ if (sp->myauth.proto == PPP_PAP) { /* we are peer, send a request, and start a timer */ pap.scr(sp); - TIMEOUT(sppp_pap_my_TO, (void *)sp, sp->lcp.timeout, - sp->pap_my_to_ch); + sp->pap_my_to_ch = timeout(sppp_pap_my_TO, (void *)sp, + sp->lcp.timeout); } } @@ -4517,8 +4521,8 @@ if (debug) log(LOG_DEBUG, SPP_FMT "pap tld\n", SPP_ARGS(ifp)); - UNTIMEOUT(pap.TO, (void *)sp, sp->ch[IDX_PAP]); - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout(pap.TO, (void *)sp, sp->ch[IDX_PAP]); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); sp->lcp.protos &= ~(1 << IDX_PAP); lcp.Close(sp); @@ -4645,7 +4649,12 @@ struct sppp *sp; int s; + /* + * XXXRW: It would be nice to avoid calling all this stuff while + * holding sppp_mtx, or we risk lock order reversals. + */ s = splimp(); + mtx_lock(&sppp_mtx); for (sp=spppq; sp; sp=sp->pp_next) { struct ifnet *ifp = &sp->pp_if; @@ -4684,8 +4693,9 @@ sp->lcp.echoid, 4, &nmagic); } } + callout_reset(&keepalive_callout, hz * 10, sppp_keepalive, NULL); + mtx_unlock(&sppp_mtx); splx(s); - TIMEOUT(sppp_keepalive, 0, hz * 10, keepalive_ch); } /* --- //depot/vendor/freebsd/src/sys/net/if_stf.c 2004/05/30 20:25:31 +++ //depot/user/rwatson/netperf/sys/net/if_stf.c 2004/05/31 01:41:33 @@ -137,13 +137,11 @@ #define sc_ro __sc_ro46.__sc_ro4 const struct encaptab *encap_cookie; LIST_ENTRY(stf_softc) sc_list; /* all stf's are linked */ + struct mtx sc_mtx; /* protect sc_ro */ }; /* * All mutable global variables in if_stf.c are protected by stf_mtx. - * XXXRW: Note that mutable fields in the softc are not currently locked: - * in particular, sc_ro needs to be protected from concurrent entrance - * of stf_output(). */ static struct mtx stf_mtx; static LIST_HEAD(, stf_softc) stf_softc_list; @@ -199,6 +197,7 @@ free(sc, M_STF); return (ENOMEM); } + mtx_init(&sc->sc_mtx, "stf sc_mtx", NULL, MTX_DEF); ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; @@ -223,6 +222,7 @@ bpfdetach(&sc->sc_if); if_detach(&sc->sc_if); + mtx_destroy(&sc->sc_mtx); free(sc, M_STF); } @@ -394,9 +394,10 @@ struct ip *ip; struct ip6_hdr *ip6; struct in6_ifaddr *ia6; -#ifdef MAC + struct route ro; int error; +#ifdef MAC error = mac_check_ifnet_transmit(ifp, m); if (error) { m_freem(m); @@ -498,9 +499,7 @@ else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); - /* - * XXXRW: Locking of sc_ro required. - */ + mtx_lock(&sc->sc_mtx); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { @@ -519,12 +518,21 @@ if (sc->sc_ro.ro_rt == NULL) { m_freem(m); ifp->if_oerrors++; + mtx_unlock(&sc->sc_mtx); return ENETUNREACH; } } + /* + * XXXRW: Holding mutex over call to ip_output(): potential lock + * order issue? Hard to resolve cleanly with the current route + * caching model, as we have to synchronize access to shared softc + * state. + */ ifp->if_opackets++; - return ip_output(m, NULL, &sc->sc_ro, 0, NULL, NULL); + error = ip_output(m, NULL, &ro, 0, NULL, NULL); + mtx_unlock(&sc->sc_mtx); + return (error); } static int --- //depot/vendor/freebsd/src/sys/net/if_tap.c 2004/06/17 17:21:12 +++ //depot/user/rwatson/netperf/sys/net/if_tap.c 2004/06/18 00:24:39 @@ -113,6 +113,10 @@ * All global variables in if_tap.c are locked with tapmtx, with the * exception of tapdebug, which is accessed unlocked; tapclones is * static at runtime. + * + * XXXRW: si_flags appears not to be protected from concurrent access, + * and is written at run-time. + * XXXRW: si_drv1 is also used for test-and-set, and isn't synchronized. */ static struct mtx tapmtx; static int tapdebug = 0; /* debug flag */ @@ -162,6 +166,7 @@ * The EBUSY algorithm here can't quite atomically * guarantee that this is race-free since we have to * release the tap mtx to deregister the clone handler. + * XXXRW: is this true? */ mtx_lock(&tapmtx); SLIST_FOREACH(tp, &taphead, tap_next) { @@ -693,6 +698,7 @@ case SIOCSIFADDR: /* set MAC address of the remote side */ mtx_lock(&tp->tap_mtx); + /* XXXRW: Does this actually do anything? */ bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); mtx_unlock(&tp->tap_mtx); break; @@ -747,6 +753,7 @@ if (flag & IO_NDELAY) return (EWOULDBLOCK); + /* This looks like a wanna-be condition variable. */ mtx_lock(&tp->tap_mtx); tp->tap_flags |= TAP_RWAIT; mtx_unlock(&tp->tap_mtx); --- //depot/vendor/freebsd/src/sys/net/if_tun.c 2004/06/17 17:21:12 +++ //depot/user/rwatson/netperf/sys/net/if_tun.c 2004/06/18 00:24:39 @@ -59,6 +59,12 @@ * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. + * + * XXXRW: we allocate si_drv1 for the dev_t on demand, rather than when + * the dev_t is instantiated. Nothing serializes the test/set of that + * field. + * + * XXXRW: what serializes access to si_flags? */ struct tun_softc { TAILQ_ENTRY(tun_softc) tun_list; @@ -121,6 +127,9 @@ static d_ioctl_t tunioctl; static d_poll_t tunpoll; +/* + * XXXRW: can remove D_NEEDGIANT? Probably not because of si_drv1 for now. + */ static struct cdevsw tun_cdevsw = { .d_version = D_VERSION, .d_flags = D_PSEUDO | D_NEEDGIANT, @@ -364,6 +373,9 @@ ifp->if_flags |= IFF_UP | IFF_RUNNING; getmicrotime(&ifp->if_lastchange); + /* + * XXXRW: interface locking. + */ for (ifa = TAILQ_FIRST(&ifp->if_addrhead); ifa; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr == NULL) --- //depot/vendor/freebsd/src/sys/net/raw_cb.c 2004/06/15 04:15:33 +++ //depot/user/rwatson/netperf/sys/net/raw_cb.c 2004/06/18 00:24:39 @@ -123,6 +123,7 @@ m_freem(dtom(rp->rcb_faddr)); rp->rcb_faddr = 0; #endif + /* Unlocked read. */ if (rp->rcb_socket->so_state & SS_NOFDREF) raw_detach(rp); } --- //depot/vendor/freebsd/src/sys/net/raw_usrreq.c 2004/06/15 04:15:33 +++ //depot/user/rwatson/netperf/sys/net/raw_usrreq.c 2004/06/18 00:24:39 @@ -76,6 +76,10 @@ register struct mbuf *m = m0; struct socket *last; + /* + * XXXRW: Potential lock order issues due to holding the + * rawcb_mtx across all this stuff. Need to revisit. + */ last = 0; mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &rawcb_list, list) { --- //depot/vendor/freebsd/src/sys/net/rtsock.c 2004/06/09 02:50:37 +++ //depot/user/rwatson/netperf/sys/net/rtsock.c 2004/06/09 03:06:49 @@ -54,10 +54,18 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ +/* + * XXXRW: It would be really nice to add const to these, but that may + * not be possible due to where they are passed in. We might need + * to const-poison a whole boatload of APIs...? + */ static struct sockaddr route_dst = { 2, PF_ROUTE, }; static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; +/* + * XXXRW: These fields are locked by RTSOCK_LOCK(). + */ static struct { int ip_count; /* attacked w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ --- //depot/vendor/freebsd/src/sys/netatalk/aarp.c 2004/04/25 09:25:44 +++ //depot/user/rwatson/netperf/sys/netatalk/aarp.c 2004/05/05 03:19:17 @@ -63,6 +63,9 @@ #define AARPT_KILLC 20 #define AARPT_KILLI 3 +/* + * XXXRW: wot? + */ # if !defined(__FreeBSD__) extern u_char etherbroadcastaddr[6]; # endif /* __FreeBSD__ */ @@ -72,7 +75,7 @@ }; /* - * Not used? + * XXXRW: unused? */ u_char at_org_code[ 3 ] = { 0x08, 0x00, 0x07, @@ -81,6 +84,9 @@ 0x00, 0x00, 0x00, }; +/* + * XXXRW: Make use callouts, not timeouts. + */ static struct callout_handle aarptimer_ch = CALLOUT_HANDLE_INITIALIZER(&aarptimer_ch); @@ -639,6 +645,9 @@ struct aarptab *aat; int i; + /* + * XXXRW: Should grab mutex before untimeout? + */ untimeout(aarptimer, 0, aarptimer_ch); AARPTAB_LOCK(); for (i = 0, aat = aarptab; i < AARPTAB_SIZE; i++, aat++) { --- //depot/vendor/freebsd/src/sys/netatalk/at_control.c 2004/03/22 04:56:26 +++ //depot/user/rwatson/netperf/sys/netatalk/at_control.c 2004/03/22 05:14:09 @@ -21,6 +21,9 @@ #include #include +/* + * XXXRW: Requires synchronization. + */ struct at_ifaddr *at_ifaddr_list; static int aa_dorangeroute(struct ifaddr *ifa, --- //depot/vendor/freebsd/src/sys/netatalk/at_rmx.c 2004/03/22 04:01:33 +++ //depot/user/rwatson/netperf/sys/netatalk/at_rmx.c 2004/03/22 04:08:45 @@ -40,11 +40,23 @@ int at_inithead(void **head, int off); -static char hexbuf[256]; +/* + * XXXRW: hexdump was a static global variable, but I moved it into the + * stack rather than stick a mutex around it. 256 bytes is smaller than + * it used to be, but this still might be a problem. Needs to be + * revisited. Should probably just use the new hexdump(9). + * + * XXXRW: All this appears to be present just so as to printf debugging + * information. Assuming that this code is known to work, we could just + * scrap all this. In fact, this code isn't even used as it stands, it's + * here for debugging purposes only and requires modifications to + * at_proto.c. + */ static char * prsockaddr(void *v) { + static char hexbuf[256]; char *bp = &hexbuf[0]; u_char *cp = v; --- //depot/vendor/freebsd/src/sys/netatalk/ddp_input.c 2004/06/13 02:50:44 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_input.c 2004/06/13 04:03:52 @@ -29,6 +29,12 @@ static volatile int ddp_forward = 1; static volatile int ddp_firewall = 0; static struct ddpstat ddpstat; + +/* + * XXXRW: If we're going to keep this cached route data, we'll need to lock it + * down, and change later function-local use of it to grab an extra reference + * after deciding it is useful. + */ static struct route forwro; static void ddp_input(struct mbuf *, struct ifnet *, struct elaphdr *, int); --- //depot/vendor/freebsd/src/sys/netatalk/ddp_pcb.c 2004/06/12 20:50:42 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_pcb.c 2004/06/12 21:17:12 @@ -22,12 +22,18 @@ #include #include +struct mtx ddp_list_mtx; static struct ddpcb *ddp_ports[ ATPORT_LAST ]; -struct ddpcb *ddpcb_list = NULL; +struct ddpcb *ddpcb_list = NULL; void at_sockaddr(struct ddpcb *ddp, struct sockaddr **addr) { + + /* + * Prevent modification of ddp during copy of addr. + */ + DDP_LOCK_ASSERT(ddp); *addr = sodupsockaddr((struct sockaddr *)&ddp->ddp_lsat, M_NOWAIT); } @@ -38,6 +44,12 @@ struct at_ifaddr *aa; struct ddpcb *ddpp; + /* + * We read and write both the ddp passed in, and also ddp_ports. + */ + DDP_LIST_XLOCK_ASSERT(); + DDP_LOCK_ASSERT(ddp); + if (ddp->ddp_lsat.sat_port != ATADDR_ANYPORT) { /* shouldn't be bound */ return (EINVAL); } @@ -134,6 +146,9 @@ struct ifnet *ifp; u_short hintnet = 0, net; + DDP_LIST_XLOCK_ASSERT(); + DDP_LOCK_ASSERT(ddp); + if (sat->sat_family != AF_APPLETALK) { return (EAFNOSUPPORT); } @@ -222,6 +237,9 @@ void at_pcbdisconnect(struct ddpcb *ddp) { + + DDP_LOCK_ASSERT(ddp); + ddp->ddp_fsat.sat_addr.s_net = ATADDR_ANYNET; ddp->ddp_fsat.sat_addr.s_node = ATADDR_ANYNODE; ddp->ddp_fsat.sat_port = ATADDR_ANYPORT; @@ -233,8 +251,17 @@ struct ddpcb *ddp; MALLOC(ddp, struct ddpcb *, sizeof *ddp, M_PCB, M_WAITOK | M_ZERO); + DDP_LOCK_INIT(ddp); ddp->ddp_lsat.sat_port = ATADDR_ANYPORT; + /* + * XXXRW: Is this unlocked assignment payer for socket and + * back-pointer something that needs to be protected? + */ + ddp->ddp_socket = so; + so->so_pcb = (caddr_t)ddp; + + DDP_LIST_XLOCK(); ddp->ddp_next = ddpcb_list; ddp->ddp_prev = NULL; ddp->ddp_pprev = NULL; @@ -243,15 +270,21 @@ ddpcb_list->ddp_prev = ddp; } ddpcb_list = ddp; + DDP_LIST_XUNLOCK(); - ddp->ddp_socket = so; - so->so_pcb = (caddr_t)ddp; - return (0); + return(0); } void at_pcbdetach(struct socket *so, struct ddpcb *ddp) { + + /* + * We modify ddp, ddp_ports, and the global list. + */ + DDP_LIST_XLOCK_ASSERT(); + DDP_LOCK_ASSERT(ddp); + soisdisconnected(so); SOCK_LOCK(so); so->so_pcb = NULL; @@ -282,6 +315,8 @@ if (ddp->ddp_next) { ddp->ddp_next->ddp_prev = ddp->ddp_prev; } + DDP_UNLOCK(ddp); + DDP_LOCK_DESTROY(ddp); FREE(ddp, M_PCB); } @@ -297,6 +332,8 @@ { struct ddpcb *ddp; + DDP_LIST_SLOCK_ASSERT(); + /* * Check for bad ports. */ @@ -309,11 +346,13 @@ * the interface? */ for (ddp = ddp_ports[ to->sat_port - 1 ]; ddp; ddp = ddp->ddp_pnext) { + DDP_LOCK(ddp); /* XXX should we handle 0.YY? */ /* XXXX.YY to socket on destination interface */ if (to->sat_addr.s_net == ddp->ddp_lsat.sat_addr.s_net && to->sat_addr.s_node == ddp->ddp_lsat.sat_addr.s_node) { + DDP_UNLOCK(ddp); break; } @@ -321,6 +360,7 @@ if (to->sat_addr.s_node == ATADDR_BCAST && (to->sat_addr.s_net == 0 || to->sat_addr.s_net == ddp->ddp_lsat.sat_addr.s_net) && ddp->ddp_lsat.sat_addr.s_net == AA_SAT(aa)->sat_addr.s_net) { + DDP_UNLOCK(ddp); break; } @@ -331,8 +371,10 @@ ntohs(aa->aa_firstnet) && ntohs(ddp->ddp_lsat.sat_addr.s_net) <= ntohs(aa->aa_lastnet)) { + DDP_UNLOCK(ddp); break; } + DDP_UNLOCK(ddp); } return (ddp); } --- //depot/vendor/freebsd/src/sys/netatalk/ddp_pcb.h 2004/03/19 07:25:31 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_pcb.h 2004/03/22 04:20:48 @@ -17,4 +17,23 @@ struct thread *td); void at_sockaddr(struct ddpcb *ddp, struct sockaddr **addr); +/* Lock macros for per-pcb locks. */ +#define DDP_LOCK_INIT(ddp) mtx_init(&(ddp)->ddp_mtx, "ddp_mtx", \ + NULL, MTX_DEF) +#define DDP_LOCK_DESTROY(ddp) mtx_destroy(&(ddp)->ddp_mtx) +#define DDP_LOCK(ddp) mtx_lock(&(ddp)->ddp_mtx) +#define DDP_UNLOCK(ddp) mtx_unlock(&(ddp)->ddp_mtx) +#define DDP_LOCK_ASSERT(ddp) mtx_assert(&(ddp)->ddp_mtx, MA_OWNED) + +/* Lock macros for global pcb list lock. */ +#define DDP_LIST_LOCK_INIT() mtx_init(&ddp_list_mtx, "ddp_list_mtx", \ + NULL, MTX_DEF) +#define DDP_LIST_LOCK_DESTROY() mtx_destroy(&ddp_list_mtx) +#define DDP_LIST_XLOCK() mtx_lock(&ddp_list_mtx) +#define DDP_LIST_XUNLOCK() mtx_unlock(&ddp_list_mtx) +#define DDP_LIST_XLOCK_ASSERT() mtx_assert(&ddp_list_mtx, MA_OWNED) +#define DDP_LIST_SLOCK() mtx_lock(&ddp_list_mtx) +#define DDP_LIST_SUNLOCK() mtx_unlock(&ddp_list_mtx) +#define DDP_LIST_SLOCK_ASSERT() mtx_assert(&ddp_list_mtx, MA_OWNED) + #endif --- //depot/vendor/freebsd/src/sys/netatalk/ddp_usrreq.c 2004/05/05 03:36:28 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_usrreq.c 2004/05/23 16:56:02 @@ -22,6 +22,9 @@ #include #include +/* + * XXXRW: These structures are currently not mutable. + */ static u_long ddp_sendspace = DDP_MAXSZ; /* Max ddp size + 1 (ddp_type) */ static u_long ddp_recvspace = 10 * (587 + sizeof(struct sockaddr_at)); @@ -32,36 +35,38 @@ { struct ddpcb *ddp; int error = 0; - int s; + ddp = sotoddpcb(so); + if (ddp != NULL) + return (EINVAL); - ddp = sotoddpcb(so); - if (ddp != NULL) { - return (EINVAL); - } + /* + * Allocate socket buffer space first so that it's present + * before first use. + */ + error = soreserve(so, ddp_sendspace, ddp_recvspace); + if (error) + return (error); - s = splnet(); + DDP_LIST_XLOCK(); error = at_pcballoc(so); - splx(s); - if (error) { - return (error); - } - return (soreserve(so, ddp_sendspace, ddp_recvspace)); + DDP_LIST_XUNLOCK(); + return (error); } static int ddp_detach(struct socket *so) { struct ddpcb *ddp; - int s; ddp = sotoddpcb(so); - if (ddp == NULL) { + if (ddp == NULL) return (EINVAL); - } - s = splnet(); + + DDP_LIST_XLOCK(); + DDP_LOCK(ddp); at_pcbdetach(so, ddp); - splx(s); + DDP_LIST_XUNLOCK(); return (0); } @@ -70,15 +75,16 @@ { struct ddpcb *ddp; int error = 0; - int s; ddp = sotoddpcb(so); if (ddp == NULL) { return (EINVAL); } - s = splnet(); + DDP_LIST_XLOCK(); + DDP_LOCK(ddp); error = at_pcbsetaddr(ddp, nam, td); - splx(s); + DDP_UNLOCK(ddp); + DDP_LIST_XUNLOCK(); return (error); } @@ -87,20 +93,23 @@ { struct ddpcb *ddp; int error = 0; - int s; ddp = sotoddpcb(so); if (ddp == NULL) { return (EINVAL); } + DDP_LIST_XLOCK(); + DDP_LOCK(ddp); if (ddp->ddp_fsat.sat_port != ATADDR_ANYPORT) { + DDP_UNLOCK(ddp); + DDP_LIST_XUNLOCK(); return (EISCONN); } - s = splnet(); - error = at_pcbconnect(ddp, nam, td); - splx(s); + error = at_pcbconnect( ddp, nam, td ); + DDP_UNLOCK(ddp); + DDP_LIST_XUNLOCK(); if (error == 0) soisconnected(so); return (error); @@ -111,20 +120,20 @@ { struct ddpcb *ddp; - int s; ddp = sotoddpcb(so); if (ddp == NULL) { return (EINVAL); } + DDP_LOCK(ddp); if (ddp->ddp_fsat.sat_addr.s_node == ATADDR_ANYNODE) { + DDP_UNLOCK(ddp); return (ENOTCONN); } - s = splnet(); at_pcbdisconnect(ddp); ddp->ddp_fsat.sat_addr.s_node = ATADDR_ANYNODE; - splx(s); + DDP_UNLOCK(ddp); soisdisconnected(so); return (0); } @@ -142,13 +151,19 @@ return (0); } +/* + * XXXRW: If an explicit address is specified, then we temporarily change + * the address on the pcb for sending. This is inefficient because it + * requires us to perform global rather than pcb-local operations. It + * may also create a race if other users of the socket are simultaneously + * sending. + */ static int ddp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ddpcb *ddp; int error = 0; - int s; ddp = sotoddpcb(so); if (ddp == NULL) { @@ -160,28 +175,29 @@ } if (addr != NULL) { + DDP_LIST_XLOCK(); + DDP_LOCK(ddp); if (ddp->ddp_fsat.sat_port != ATADDR_ANYPORT) { - return (EISCONN); + error = EISCONN; + goto out; } - s = splnet(); error = at_pcbconnect(ddp, addr, td); - splx(s); - if (error) { - return (error); + if (error == 0) { + error = ddp_output(m, so); + at_pcbdisconnect(ddp); } +out: + DDP_UNLOCK(ddp); + DDP_LIST_XUNLOCK(); } else { - if (ddp->ddp_fsat.sat_port == ATADDR_ANYPORT) { - return (ENOTCONN); - } + DDP_LOCK(ddp); + if (ddp->ddp_fsat.sat_port == ATADDR_ANYPORT) + error = ENOTCONN; + else + error = ddp_output(m, so); + DDP_UNLOCK(ddp); } - - s = splnet(); - error = ddp_output(m, so); - if (addr != NULL) { - at_pcbdisconnect(ddp); - } - splx(s); return (error); } @@ -189,28 +205,28 @@ ddp_abort(struct socket *so) { struct ddpcb *ddp; - int s; ddp = sotoddpcb(so); if (ddp == NULL) { return (EINVAL); } - s = splnet(); + DDP_LIST_XLOCK(); + DDP_LOCK(ddp); at_pcbdetach(so, ddp); - splx(s); + DDP_LIST_XUNLOCK(); return (0); } void ddp_init(void) { - atintrq1.ifq_maxlen = IFQ_MAXLEN; atintrq2.ifq_maxlen = IFQ_MAXLEN; aarpintrq.ifq_maxlen = IFQ_MAXLEN; mtx_init(&atintrq1.ifq_mtx, "at1_inq", NULL, MTX_DEF); mtx_init(&atintrq2.ifq_mtx, "at2_inq", NULL, MTX_DEF); mtx_init(&aarpintrq.ifq_mtx, "aarp_inq", NULL, MTX_DEF); + DDP_LIST_LOCK_INIT(); netisr_register(NETISR_ATALK1, at1intr, &atintrq1, 0); netisr_register(NETISR_ATALK2, at2intr, &atintrq2, 0); netisr_register(NETISR_AARP, aarpintr, &aarpintrq, 0); @@ -225,6 +241,7 @@ for (ddp = ddpcb_list; ddp != NULL; ddp = ddp->ddp_next) { at_pcbdetach(ddp->ddp_socket, ddp); } + DDP_LIST_LOCK_DESTROY(); } #endif @@ -243,7 +260,9 @@ if (ddp == NULL) { return (EINVAL); } + DDP_LOCK(ddp); at_sockaddr(ddp, nam); + DDP_UNLOCK(ddp); return (0); } --- //depot/vendor/freebsd/src/sys/netatalk/ddp_var.h 2004/03/22 04:56:26 +++ //depot/user/rwatson/netperf/sys/netatalk/ddp_var.h 2004/03/22 05:14:09 @@ -13,6 +13,7 @@ struct socket *ddp_socket; struct ddpcb *ddp_prev, *ddp_next; struct ddpcb *ddp_pprev, *ddp_pnext; + struct mtx ddp_mtx; }; #define sotoddpcb(so) ((struct ddpcb *)(so)->so_pcb) @@ -34,5 +35,6 @@ extern int ddp_cksum; extern struct ddpcb *ddpcb_list; extern struct pr_usrreqs ddp_usrreqs; +extern struct mtx ddp_list_mtx; #endif #endif /* _NETATALK_DDP_VAR_H_ */ --- //depot/vendor/freebsd/src/sys/netgraph/ng_base.c 2004/05/29 07:25:30 +++ //depot/user/rwatson/netperf/sys/netgraph/ng_base.c 2004/05/31 01:41:33 @@ -170,6 +170,7 @@ #define NG_IDHASH_FN(ID) ((ID) % (NG_ID_HASH_SIZE)) #define NG_IDHASH_FIND(ID, node) \ do { \ + mtx_assert(&ng_idhash_mtx, MA_OWNED); \ LIST_FOREACH(node, &ng_ID_hash[NG_IDHASH_FN(ID)], \ nd_idnodes) { \ if (NG_NODE_IS_VALID(node) \ @@ -3231,10 +3232,12 @@ { node_p node; int i = 1; + mtx_lock(&ng_nodelist_mtx); SLIST_FOREACH(node, &ng_allnodes, nd_all) { printf("[%d] ", i++); dumpnode(node, NULL, 0); } + mtx_unlock(&ng_nodelist_mtx); } static void @@ -3242,10 +3245,12 @@ { hook_p hook; int i = 1; + mtx_lock(&ng_nodelist_mtx); SLIST_FOREACH(hook, &ng_allhooks, hk_all) { printf("[%d] ", i++); dumphook(hook, NULL, 0); } + mtx_unlock(&ng_nodelist_mtx); } static int --- //depot/vendor/freebsd/src/sys/netgraph/ng_ksocket.c 2004/06/17 22:51:43 +++ //depot/user/rwatson/netperf/sys/netgraph/ng_ksocket.c 2004/06/18 00:24:39 @@ -1013,6 +1013,9 @@ * before dereferencing the socket pointer. */ +/* + * XXXRW: ng_ksocket_incoming() is called without Giant. Is that OK? + */ static void ng_ksocket_incoming(struct socket *so, void *arg, int waitflag) { --- //depot/vendor/freebsd/src/sys/netgraph/ng_socket.c 2004/05/29 00:56:26 +++ //depot/user/rwatson/netperf/sys/netgraph/ng_socket.c 2004/05/31 01:41:33 @@ -151,6 +151,9 @@ SYSCTL_INT(_net_graph, OID_AUTO, recvspace, CTLFLAG_RW, &ngpdg_recvspace , 0, "Maximum space for incoming Netgraph datagrams"); +/* + * XXXRW: Locking? + */ /* List of all sockets */ static LIST_HEAD(, ngpcb) ngsocklist; --- //depot/vendor/freebsd/src/sys/netinet/accf_http.c 2004/06/14 18:20:38 +++ //depot/user/rwatson/netperf/sys/netinet/accf_http.c 2004/06/14 19:35:13 @@ -161,6 +161,7 @@ sohashttpget(struct socket *so, void *arg, int waitflag) { + /* Unlocked read. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { struct mbuf *m; char *cmp; @@ -214,6 +215,7 @@ struct mbuf *m, *n; int i, cc, spaces, inspaces; + /* Unlocked read. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) goto fallout; @@ -301,6 +303,7 @@ int ccleft, copied; DPRINT("start"); + /* Unlocked read. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) goto gotit; --- //depot/vendor/freebsd/src/sys/netinet/if_ether.c 2004/06/13 10:56:09 +++ //depot/user/rwatson/netperf/sys/netinet/if_ether.c 2004/06/13 20:00:27 @@ -98,6 +98,11 @@ #define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ }; +/* + * XXXRW: Need to document (and/or fix) locking for this. We always + * seem to hold a lock (and assert) when referencing this list, but it's + * not clear it's always the same lock. + */ static LIST_HEAD(, llinfo_arp) llinfo_arp; static struct ifqueue arpintrq; --- //depot/vendor/freebsd/src/sys/netinet/igmp.c 2004/06/11 03:45:34 +++ //depot/user/rwatson/netperf/sys/netinet/igmp.c 2004/06/11 13:24:13 @@ -99,6 +99,9 @@ static u_long igmp_all_hosts_group; static u_long igmp_all_rtrs_group; +/* + * XXXRW: These variables make me vaguely nervous. + */ static struct mbuf *router_alert; static struct route igmprt; @@ -123,6 +126,7 @@ /* * Construct a Router Alert option to use in outgoing packets + * XXXRW: This might actually need a MAC label. */ MGET(router_alert, M_DONTWAIT, MT_DATA); ra = mtod(router_alert, struct ipoption *); --- //depot/vendor/freebsd/src/sys/netinet/in_gif.c 2004/06/18 02:06:42 +++ //depot/user/rwatson/netperf/sys/netinet/in_gif.c 2004/06/18 03:27:04 @@ -174,6 +174,9 @@ } bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); + /* + * XXXRW: locking of gif's softc. + */ if (dst->sin_family != sin_dst->sin_family || dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { /* cache route doesn't match */ @@ -321,6 +324,10 @@ case 0: case 127: case 255: return 0; } + + /* + * XXXRW: Lock in_ifaddrhead walking. + */ /* reject packets with broadcast on source */ TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) @@ -329,6 +336,7 @@ return 0; } + /* XXXRW: unlocked read. */ /* ingress filters on outer source */ if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && ifp) { struct sockaddr_in sin; @@ -384,6 +392,11 @@ in_gif_attach(sc) struct gif_softc *sc; { + + /* + * XXXRW: Technically, NULL can also be returned for ENOMEM, + * not just EEXIST. + */ sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck, &in_gif_protosw, sc); if (sc->encap_cookie4 == NULL) --- //depot/vendor/freebsd/src/sys/netinet/in_pcb.c 2004/06/16 10:05:58 +++ //depot/user/rwatson/netperf/sys/netinet/in_pcb.c 2004/06/18 00:24:39 @@ -671,6 +671,7 @@ #ifdef IPSEC ipsec_pcbdisconn(inp->inp_sp); #endif + /* Unlocked read. */ if (inp->inp_socket->so_state & SS_NOFDREF) in_pcbdetach(inp); } --- //depot/vendor/freebsd/src/sys/netinet/in_pcb.h 2004/04/07 20:52:05 +++ //depot/user/rwatson/netperf/sys/netinet/in_pcb.h 2004/04/08 03:11:34 @@ -244,9 +244,14 @@ #define INP_LOCK(inp) mtx_lock(&(inp)->inp_mtx) #define INP_UNLOCK(inp) mtx_unlock(&(inp)->inp_mtx) #ifndef INET6 -#define INP_LOCK_ASSERT(inp) mtx_assert(&(inp)->inp_mtx, MA_OWNED) +#define INP_LOCK_ASSERT(inp) do { \ + mtx_assert(&(inp)->inp_mtx, MA_OWNED); \ + NET_ASSERT_GIANT(); \ +} while (0) #else -#define INP_LOCK_ASSERT(inp) +#define INP_LOCK_ASSERT(inp) do { \ + NET_ASSERT_GIANT(); \ +} while (0) #endif #define INP_INFO_LOCK_INIT(ipi, d) \ @@ -256,11 +261,21 @@ #define INP_INFO_RUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) #ifndef INET6 -#define INP_INFO_RLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_mtx, MA_OWNED) -#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_mtx, MA_OWNED) +#define INP_INFO_RLOCK_ASSERT(ipi) do { \ + mtx_assert(&(ipi)->ipi_mtx, MA_OWNED); \ + NET_ASSERT_GIANT(); \ +} while (0) +#define INP_INFO_WLOCK_ASSERT(ipi) do { \ + mtx_assert(&(ipi)->ipi_mtx, MA_OWNED); \ + NET_ASSERT_GIANT(); \ +} while (0) #else -#define INP_INFO_RLOCK_ASSERT(ipi) -#define INP_INFO_WLOCK_ASSERT(ipi) +#define INP_INFO_RLOCK_ASSERT(ipi) do { \ + NET_ASSERT_GIANT(); \ +} while (0) +#define INP_INFO_WLOCK_ASSERT(ipi) do { \ + NET_ASSERT_GIANT(); \ +} while (0) #endif #define INP_PCBHASH(faddr, lport, fport, mask) \ --- //depot/vendor/freebsd/src/sys/netinet/ip_divert.c 2004/06/13 02:50:44 +++ //depot/user/rwatson/netperf/sys/netinet/ip_divert.c 2004/06/13 04:03:52 @@ -262,12 +262,6 @@ KASSERT(m->m_pkthdr.rcvif == NULL, ("rcvif not null")); -#ifdef MAC - SOCK_LOCK(so); - mac_create_mbuf_from_socket(so, m); - SOCK_UNLOCK(so); -#endif - if (control) m_freem(control); /* XXX */ @@ -324,6 +318,9 @@ /* Send packet to output processing */ ipstat.ips_rawout++; /* XXX */ +#ifdef MAC + mac_create_mbuf_from_inpcb(inp, m); +#endif error = ip_output(m, inp->inp_options, NULL, (so->so_options & SO_DONTROUTE) | @@ -350,6 +347,14 @@ } m->m_pkthdr.rcvif = ifa->ifa_ifp; } +#ifdef MAC + /* + * XXXRW: perhaps should be mac_create_mbuf_from_inpcb()? + */ + SOCK_LOCK(so); + mac_create_mbuf_from_socket(so, m); + SOCK_UNLOCK(so); +#endif /* Send packet to input processing */ ip_input(m); } @@ -417,6 +422,7 @@ /* The socket is always "connected" because we always know "where" to send the packet */ INP_UNLOCK(inp); + /* XXXRW: so_state locking. */ so->so_state |= SS_ISCONNECTED; return 0; } @@ -459,6 +465,8 @@ static int div_disconnect(struct socket *so) { + + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return div_abort(so); --- //depot/vendor/freebsd/src/sys/netinet/ip_dummynet.c 2004/03/03 01:35:21 +++ //depot/user/rwatson/netperf/sys/netinet/ip_dummynet.c 2004/03/04 04:00:06 @@ -171,7 +171,10 @@ #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx) #define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx) #define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx) -#define DUMMYNET_LOCK_ASSERT() mtx_assert(&dummynet_mtx, MA_OWNED) +#define DUMMYNET_LOCK_ASSERT() do { \ + mtx_assert(&dummynet_mtx, MA_OWNED); \ + NET_ASSERT_GIANT(); \ +} while (0) static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); --- //depot/vendor/freebsd/src/sys/netinet/ip_encap.c 2004/03/10 02:50:38 +++ //depot/user/rwatson/netperf/sys/netinet/ip_encap.c 2004/03/10 03:47:45 @@ -1,4 +1,4 @@ -/* $FreeBSD: src/sys/netinet/ip_encap.c,v 1.19 2004/03/10 02:48:50 rwatson Exp $ */ +/* $FreeBSD: src/sys/netinet/ip_encap.c,v 1.18 2003/06/01 09:20:38 phk Exp $ */ /* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */ /* @@ -106,8 +106,7 @@ LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); /* - * We currently keey encap_init() for source code compatibility reasons -- - * it's referenced by KAME pieces in netinet6. + * XXXRW: encap_init() was entirely useless, so I deleted it. */ void encap_init() @@ -185,6 +184,10 @@ } mtx_unlock(&encapmtx); + /* + * XXXRW: Need drain mechanism to prevent the encapsulation + * entry from being released while in use. + */ if (match) { /* found a match, "match" has the best one */ psw = match->psw; @@ -255,6 +258,10 @@ } mtx_unlock(&encapmtx); + /* + * XXXRW: Need drain mechanism so the encap entry isn't freed + * while in use. + */ if (match) { /* found a match */ psw = (const struct ip6protosw *)match->psw; --- //depot/vendor/freebsd/src/sys/netinet/ip_encap.h 2002/03/19 21:30:39 +++ //depot/user/rwatson/netperf/sys/netinet/ip_encap.h 2004/03/12 06:46:13 @@ -35,6 +35,15 @@ #ifdef _KERNEL +/* + * This structure is entirely static after registration, and other than + * its entry in the encapsulation table, requires no locking. The chain + * field is locked using the global encapmtx. + * + * XXXRW: Need to add a refcount/drain mechanism so that encapsulation + * entries can't be removed while in use. This likely requires a + * refcount and cv to wait for it to drain, or an sx lock. + */ struct encaptab { LIST_ENTRY(encaptab) chain; int af; --- //depot/vendor/freebsd/src/sys/netinet/ip_fastfwd.c 2004/05/06 18:50:39 +++ //depot/user/rwatson/netperf/sys/netinet/ip_fastfwd.c 2004/05/23 16:56:02 @@ -644,6 +644,7 @@ /* * Return packet for processing by ip_input */ + /* XXX statistic */ if (ro.ro_rt) RTFREE(ro.ro_rt); return 0; --- //depot/vendor/freebsd/src/sys/netinet/ip_fw2.c 2004/06/11 22:20:41 +++ //depot/user/rwatson/netperf/sys/netinet/ip_fw2.c 2004/06/12 17:06:18 @@ -136,7 +136,10 @@ #define IPFW_LOCK_DESTROY(_chain) mtx_destroy(&(_chain)->mtx) #define IPFW_LOCK(_chain) mtx_lock(&(_chain)->mtx) #define IPFW_UNLOCK(_chain) mtx_unlock(&(_chain)->mtx) -#define IPFW_LOCK_ASSERT(_chain) mtx_assert(&(_chain)->mtx, MA_OWNED) +#define IPFW_LOCK_ASSERT(_chain) do { \ + mtx_assert(&(_chain)->mtx, MA_OWNED); \ + NET_ASSERT_GIANT(); \ +} while (0) /* * list of rules for layer 3 @@ -1538,7 +1541,8 @@ } static int -check_uidgid(ipfw_insn_u32 *insn, +check_uidgid(struct ip_fw_chain *chain, + ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, @@ -1568,7 +1572,9 @@ return 0; match = 0; if (*lookup == 0) { + IPFW_UNLOCK(chain); INP_INFO_RLOCK(pi); /* XXX LOR with IPFW */ + IPFW_LOCK(chain); pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), @@ -1933,7 +1939,7 @@ break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) - match = check_uidgid( + match = check_uidgid(chain, (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, --- //depot/vendor/freebsd/src/sys/netinet/ip_id.c 2004/02/26 03:55:40 +++ //depot/user/rwatson/netperf/sys/netinet/ip_id.c 2004/03/12 03:03:57 @@ -79,6 +79,9 @@ 2729 }; +/* + * XXXRW: Locking? + */ static u_int16_t ru_x; static u_int16_t ru_seed, ru_seed2; static u_int16_t ru_a, ru_b; --- //depot/vendor/freebsd/src/sys/netinet/ip_input.c 2004/06/18 13:01:11 +++ //depot/user/rwatson/netperf/sys/netinet/ip_input.c 2004/06/19 04:30:55 @@ -938,8 +938,10 @@ /* attach next hop info for TCP */ struct m_tag *mtag = m_tag_get(PACKET_TAG_IPFORWARD, sizeof(struct sockaddr_in *), M_NOWAIT); - if (mtag == NULL) + if (mtag == NULL) { + /* XXX statistic */ goto bad; + } *(struct sockaddr_in **)(mtag+1) = args.next_hop; m_tag_prepend(m, mtag); } @@ -1871,6 +1873,7 @@ struct m_tag *mtag = m_tag_get(PACKET_TAG_IPFORWARD, sizeof(struct sockaddr_in *), M_NOWAIT); if (mtag == NULL) { + /* XXX statistic */ m_freem(m); return; } --- //depot/vendor/freebsd/src/sys/netinet/ip_mroute.c 2004/05/30 20:25:31 +++ //depot/user/rwatson/netperf/sys/netinet/ip_mroute.c 2004/05/31 01:41:33 @@ -104,7 +104,10 @@ static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) -#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) +#define MFC_LOCK_ASSERT() do { \ + mtx_assert(&mfc_mtx, MA_OWNED); \ + NET_ASSERT_GIANT(); \ +} while (0) #define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) @@ -1304,13 +1307,10 @@ socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { - mtx_lock(&Giant); /* XXX until sockets are locked */ if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup(s); - mtx_unlock(&Giant); return 0; } - mtx_unlock(&Giant); } m_freem(mm); return -1; --- //depot/vendor/freebsd/src/sys/netinet/ip_output.c 2004/06/13 17:30:39 +++ //depot/user/rwatson/netperf/sys/netinet/ip_output.c 2004/06/18 20:24:57 @@ -157,6 +157,12 @@ M_ASSERTPKTHDR(m); + /* + * When packet comes from dummynet restore state from + * previous processing instead of the header. Yech! + * + * XXX add conditional compilation? + */ args.next_hop = m_claim_next(m, PACKET_TAG_IPFORWARD); dummytag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL); if (dummytag != NULL) { @@ -878,6 +884,7 @@ PACKET_TAG_IPFORWARD, sizeof(struct sockaddr_in *), M_NOWAIT); if (mtag == NULL) { + /* XXX statistic */ error = ENOBUFS; goto bad; } @@ -896,6 +903,7 @@ CSUM_IP_CHECKED | CSUM_IP_VALID; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); + /* XXX netisr_queue(NETISR_IP, m); */ ip_input(m); goto done; } @@ -1410,9 +1418,11 @@ m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); - - return (ip_pcbopts(sopt->sopt_name, &inp->inp_options, - m)); + INP_LOCK(inp); + error = ip_pcbopts(sopt->sopt_name, &inp->inp_options, + m); + INP_UNLOCK(inp); + return (error); } case IP_TOS: @@ -1437,11 +1447,14 @@ case IP_TTL: inp->inp_ip_ttl = optval; break; -#define OPTSET(bit) \ - if (optval) \ - inp->inp_flags |= bit; \ - else \ - inp->inp_flags &= ~bit; +#define OPTSET(bit) do { \ + INP_LOCK(inp); \ + if (optval) \ + inp->inp_flags |= bit; \ + else \ + inp->inp_flags &= ~bit; \ + INP_UNLOCK(inp); \ +} while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); @@ -1480,7 +1493,15 @@ case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: + /* + * XXXRW: ip_setmoptions() will calling a blocking + * memory allocation, so the inpcb lock should + * really be acquired in ip_setmoptions(), which + * isn't possible with the current API. + */ + INP_LOCK(inp); error = ip_setmoptions(sopt, &inp->inp_moptions); + INP_UNLOCK(inp); break; case IP_PORTRANGE: @@ -1489,6 +1510,7 @@ if (error) break; + INP_LOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); @@ -1509,6 +1531,7 @@ error = EINVAL; break; } + INP_UNLOCK(inp); break; #if defined(IPSEC) || defined(FAST_IPSEC) @@ -1529,7 +1552,9 @@ req = mtod(m, caddr_t); len = m->m_len; optname = sopt->sopt_name; + INP_LOCK(inp); error = ipsec4_set_policy(inp, optname, req, len, priv); + INP_UNLOCK(inp); m_freem(m); break; } @@ -1545,6 +1570,12 @@ switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: + /* + * XXXRW: We should make a local copy of the + * options while holding the inpcb lock, and then + * copy out without holding the lock. Currently, + * we race. + */ if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, @@ -1622,7 +1653,15 @@ case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: + /* + * XXXRW: ip_getmoptions() may perform a copy out + * to user space, and should do that without + * holding the inpcb lock. However, we don't pass + * the inpcb down, so it can't do that. + */ + INP_LOCK(inp); error = ip_getmoptions(sopt, inp->inp_moptions); + INP_UNLOCK(inp); break; #if defined(IPSEC) || defined(FAST_IPSEC) --- //depot/vendor/freebsd/src/sys/netinet/raw_ip.c 2004/06/09 20:10:43 +++ //depot/user/rwatson/netperf/sys/netinet/raw_ip.c 2004/06/10 22:28:54 @@ -87,6 +87,9 @@ * so leave them not initialized and rely on BSS being set to 0. */ +/* + * XXXRW: Locking for mrouter bits? + */ /* The socket used to communicate with the multicast routing daemon. */ struct socket *ip_mrouter; @@ -628,6 +631,7 @@ } INP_LOCK(inp); soisdisconnected(so); + /* Unlocked read. */ if (so->so_state & SS_NOFDREF) rip_pcbdetach(so, inp); else @@ -639,6 +643,8 @@ static int rip_disconnect(struct socket *so) { + + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return rip_abort(so); @@ -735,6 +741,7 @@ INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); + /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { INP_INFO_WUNLOCK(&ripcbinfo); --- //depot/vendor/freebsd/src/sys/netinet/tcp_debug.c 2004/04/07 20:52:05 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_debug.c 2004/04/08 03:11:34 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include --- //depot/vendor/freebsd/src/sys/netinet/tcp_input.c 2004/06/16 09:35:21 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_input.c 2004/06/20 15:39:13 @@ -355,6 +355,7 @@ flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else @@ -424,7 +425,7 @@ struct tcpopt to; /* options in this segment */ struct rmxp_tao tao; /* our TAO cache entry */ int headlocked = 0; - struct sockaddr_in *next_hop = NULL; + struct sockaddr_in *next_hop; int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; @@ -750,6 +751,7 @@ tiwin = th->th_win; #ifdef MAC + INP_LOCK_ASSERT(inp); if (mac_check_inpcb_deliver(inp, m)) goto drop; #endif @@ -1165,7 +1167,8 @@ acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; - sbdrop(&so->so_snd, acked); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; @@ -1202,7 +1205,9 @@ tp->t_rxtcur, tcp_timer_rexmt, tp); + SOCKBUF_UNLOCK(&so->so_snd); sowwakeup(so); + /* Unlocked read. */ if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; @@ -1240,6 +1245,7 @@ #endif * Add data to socket buffer. */ + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { @@ -1712,6 +1718,7 @@ * If new data are received on a connection after the * user processes are gone, then RST the other end. */ + /* Unlocked read. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); @@ -2104,15 +2111,17 @@ incr = incr * incr / cw; tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); } + SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; - sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { - sbdrop(&so->so_snd, acked); + sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } + SOCKBUF_UNLOCK(&so->so_snd); sowwakeup(so); /* detect una wraparound */ if (tcp_do_newreno && !IN_FASTRECOVERY(tp) && @@ -2146,6 +2155,7 @@ * we should release the tp also, and use a * compressed state. */ + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, @@ -2229,6 +2239,7 @@ * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ + /* Unlocked read. */ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ @@ -2248,6 +2259,8 @@ * of data past the urgent section as the original * spec states (in one of two places). */ + /* Unlocked read of sb_cc. */ + /* XXXRW: Unlocked assignment of so_oobmark, sb_state. */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + @@ -2317,6 +2330,7 @@ tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); + /* Unlocked read. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else @@ -2943,6 +2957,10 @@ } tp->t_maxseg = mss; + /* + * XXXRW: read-modify-write on socket buffer without acquiring + * the socket buffer lock. + */ if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else --- //depot/vendor/freebsd/src/sys/netinet/tcp_output.c 2004/06/18 09:56:12 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_output.c 2004/06/19 04:30:55 @@ -210,6 +210,7 @@ * to send then the probe will be the FIN * itself. */ + /* Unlocked read of sb_cc. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; @@ -231,6 +232,7 @@ * be set to snd_una, the offset will be 0, and the length may * wind up 0. */ + /* Unlocked read of sb_cc. */ len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off; @@ -292,6 +294,7 @@ len = tp->t_maxseg; sendalot = 1; } + /* Unlocked read of sb_cc. */ if (off + len < so->so_snd.sb_cc) flags &= ~TH_FIN; @@ -319,6 +322,7 @@ * * note: the len + off check is almost certainly unnecessary. */ + /* Unlocked read of sb_cc. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && @@ -397,6 +401,7 @@ * if window is nonzero, transmit what we can, * otherwise force out a byte. */ + /* Unlocked read of sb_cc. */ if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { tp->t_rxtshift = 0; @@ -664,6 +669,7 @@ * give data to the user when a buffer fills or * a PUSH comes in.) */ + /* Unlocked read of sb_cc. */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; } else { --- //depot/vendor/freebsd/src/sys/netinet/tcp_subr.c 2004/06/12 20:50:42 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_subr.c 2004/06/12 21:17:12 @@ -576,6 +576,7 @@ #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ + int callout_flag; tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) @@ -589,11 +590,17 @@ tcp_mssdflt; /* Set up our timeouts. */ - callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0); - callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0); - callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0); - callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0); - callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0); + /* + * XXXRW: Are these actually MPSAFE? I think so, but need to + * review the timed wait code, as it has some list variables, + * etc, that are global. + */ + callout_flag = debug_mpsafenet ? CALLOUT_MPSAFE : 0; + callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, callout_flag); + callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, callout_flag); + callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, callout_flag); + callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, callout_flag); + callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, callout_flag); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); @@ -1595,7 +1602,7 @@ /* * Move a TCP connection into TIME_WAIT state. - * tcbinfo is unlocked. + * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void @@ -1607,6 +1614,11 @@ int tw_time, acknow; struct socket *so; + INP_INFO_WLOCK_ASSERT(&tcbinfo); +#if 0 + INP_LOCK_ASSERT(tp); +#endif + tw = uma_zalloc(tcptw_zone, M_NOWAIT); if (tw == NULL) { tw = tcp_timer_2msl_tw(1); @@ -1740,6 +1752,8 @@ int isipv6 = inp->inp_inc.inc_isipv6; #endif + INP_LOCK_ASSERT(inp); + m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return (ENOBUFS); --- //depot/vendor/freebsd/src/sys/netinet/tcp_syncache.c 2004/06/16 03:38:33 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_syncache.c 2004/06/18 00:24:39 @@ -559,6 +559,9 @@ goto abort2; } #ifdef MAC + /* + * XXXRW: Would prefer inpcb. + */ SOCK_LOCK(so); mac_set_socket_peer_from_mbuf(m, so); SOCK_UNLOCK(so); @@ -726,6 +729,14 @@ abort: INP_UNLOCK(inp); abort2: + /* + * XXXRW: Technically speaking, this soabort() likely doesn't + * do anything, since we insert sockets into the accept queues + * in an already completed state, and soabort() leaves it to + * the close() on the listen socket to remove completed + * connections. However, this means a TCP socket without + * full TCP state could slip through...? + */ if (so != NULL) (void) soabort(so); return (NULL); --- //depot/vendor/freebsd/src/sys/netinet/tcp_timer.c 2004/04/07 20:52:05 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_timer.c 2004/04/08 03:11:34 @@ -269,6 +269,9 @@ } } +/* + * XXXRW: This doesn't look MPSAFE. + */ void tcp_timer_2msl_reset(struct tcptw *tw, int timeo) { @@ -283,6 +286,9 @@ LIST_INSERT_BEFORE(tw_tail, tw, tw_2msl); } +/* + * XXXRW: This doesn't look MPSAFE. + */ void tcp_timer_2msl_stop(struct tcptw *tw) { @@ -291,6 +297,9 @@ LIST_REMOVE(tw, tw_2msl); } +/* + * XXXRW: This doesn't look MPSAFE. + */ struct tcptw * tcp_timer_2msl_tw(int reuse) { --- //depot/vendor/freebsd/src/sys/netinet/tcp_usrreq.c 2004/06/18 20:25:29 +++ //depot/user/rwatson/netperf/sys/netinet/tcp_usrreq.c 2004/06/19 23:42:46 @@ -116,7 +116,6 @@ static int tcp_usr_attach(struct socket *so, int proto, struct thread *td) { - int s = splnet(); int error; struct inpcb *inp; struct tcpcb *tp = 0; @@ -142,11 +141,54 @@ out: TCPDEBUG2(PRU_ATTACH); INP_INFO_WUNLOCK(&tcbinfo); - splx(s); return error; } /* + * Common code to setup and teardown locking. Most + * code begins with a COMMON_START macro and finishes + * with COMMON_END. You indicate whether the inpcb + * and enclosing head are to be locked read or write. + */ +#define INI_NOLOCK 0 /* no head lock */ +#define INI_READ 1 /* read head lock */ +#define INI_WRITE 2 /* write head lock */ + +#define COMMON_START0(_headrw) do { \ + if (_headrw == INI_READ) \ + INP_INFO_RLOCK(&tcbinfo); \ + else if (_headrw == INI_WRITE) \ + INP_INFO_WLOCK(&tcbinfo); \ + inp = sotoinpcb(so); \ + if (inp == 0) { \ + if (_headrw == INI_READ) \ + INP_INFO_RUNLOCK(&tcbinfo); \ + else if (_headrw == INI_WRITE) \ + INP_INFO_WUNLOCK(&tcbinfo); \ + return EINVAL; \ + } \ + INP_LOCK(inp); \ + if (_headrw == INI_READ) \ + INP_INFO_RUNLOCK(&tcbinfo); \ + tp = intotcpcb(inp); \ + TCPDEBUG1(); \ +} while(0) + +#define COMMON_START(_headrw) do { \ + TCPDEBUG0; \ + COMMON_START0(_headrw); \ +} while (0) + +#define COMMON_END(_headrw, req) \ + TCPDEBUG2(req); \ + do { \ + if (tp) \ + INP_UNLOCK(inp); \ + if (_headrw == INI_WRITE) \ + INP_INFO_WUNLOCK(&tcbinfo); \ + } while(0) + +/* * pru_detach() detaches the TCP protocol from the socket. * If the protocol state is non-embryonic, then can't * do this directly: have to initiate a pru_disconnect(), @@ -156,83 +198,26 @@ static int tcp_usr_detach(struct socket *so) { - int s = splnet(); int error = 0; struct inpcb *inp; struct tcpcb *tp; - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - if (inp == 0) { - INP_INFO_WUNLOCK(&tcbinfo); - splx(s); - return EINVAL; /* XXX */ - } - INP_LOCK(inp); - tp = intotcpcb(inp); - TCPDEBUG1(); + COMMON_START(INI_WRITE); tp = tcp_disconnect(tp); - - TCPDEBUG2(PRU_DETACH); - if (tp) - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - splx(s); + COMMON_END(INI_WRITE, PRU_DETACH); return error; } -#define INI_NOLOCK 0 -#define INI_READ 1 -#define INI_WRITE 2 - -#define COMMON_START() \ - TCPDEBUG0; \ - do { \ - if (inirw == INI_READ) \ - INP_INFO_RLOCK(&tcbinfo); \ - else if (inirw == INI_WRITE) \ - INP_INFO_WLOCK(&tcbinfo); \ - inp = sotoinpcb(so); \ - if (inp == 0) { \ - if (inirw == INI_READ) \ - INP_INFO_RUNLOCK(&tcbinfo); \ - else if (inirw == INI_WRITE) \ - INP_INFO_WUNLOCK(&tcbinfo); \ - splx(s); \ - return EINVAL; \ - } \ - INP_LOCK(inp); \ - if (inirw == INI_READ) \ - INP_INFO_RUNLOCK(&tcbinfo); \ - tp = intotcpcb(inp); \ - TCPDEBUG1(); \ -} while(0) - -#define COMMON_END(req)