--- //depot/vendor/freebsd/src/sys/compat/svr4/svr4_stream.c 2003/10/20 03:40:41 +++ //depot/user/rwatson/netperf/sys/compat/svr4/svr4_stream.c 2004/05/24 18:58:02 @@ -171,7 +171,9 @@ return (error); #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); + SOCK_UNLOCK(so); if (error) goto done1; #endif @@ -275,7 +277,9 @@ return (error); #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_receive(td->td_ucred, so); + SOCK_UNLOCK(so); if (error) goto done1; #endif --- //depot/vendor/freebsd/src/sys/fs/fifofs/fifo_vnops.c 2004/05/31 18:20:38 +++ //depot/user/rwatson/netperf/sys/fs/fifofs/fifo_vnops.c 2004/05/31 20:01:56 @@ -211,7 +211,9 @@ } fip->fi_readers = fip->fi_writers = 0; wso->so_snd.sb_lowat = PIPE_BUF; - rso->so_state |= SS_CANTRCVMORE; + SOCKBUF_LOCK(&rso->so_rcv); + rso->so_rcv.sb_state |= SBS_CANTRCVMORE; + SOCKBUF_UNLOCK(&rso->so_rcv); vp->v_fifoinfo = fip; } @@ -229,7 +231,9 @@ if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { - fip->fi_writesock->so_state &= ~SS_CANTSENDMORE; + SOCKBUF_LOCK(&fip->fi_writesock->so_snd); + fip->fi_writesock->so_snd.sb_state &= ~SBS_CANTSENDMORE; + SOCKBUF_UNLOCK(&fip->fi_writesock->so_snd); if (fip->fi_writers > 0) { wakeup(&fip->fi_writers); sowwakeup(fip->fi_writesock); @@ -243,7 +247,9 @@ } fip->fi_writers++; if (fip->fi_writers == 1) { - fip->fi_readsock->so_state &= ~SS_CANTRCVMORE; + SOCKBUF_LOCK(&fip->fi_writesock->so_rcv); + fip->fi_readsock->so_rcv.sb_state &= ~SBS_CANTRCVMORE; + SOCKBUF_UNLOCK(&fip->fi_writesock->so_rcv); if (fip->fi_readers > 0) { wakeup(&fip->fi_readers); sorwakeup(fip->fi_writesock); @@ -425,8 +431,10 @@ ap->a_kn->kn_hook = (caddr_t)so; + SOCKBUF_LOCK(sb); SLIST_INSERT_HEAD(&sb->sb_sel.si_note, ap->a_kn, kn_selnext); sb->sb_flags |= SB_KNOTE; + SOCKBUF_UNLOCK(sb); return (0); } @@ -436,23 +444,34 @@ { struct socket *so = (struct socket *)kn->kn_hook; + SOCKBUF_LOCK(&so->so_rcv); SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; + SOCKBUF_UNLOCK(&so->so_rcv); } static int filt_fiforead(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; + int needlock, result; + needlock = !SOCKBUF_OWNED(&so->so_rcv); + if (needlock) + SOCKBUF_LOCK(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc; - if (so->so_state & SS_CANTRCVMORE) { + /* Unlocked read. */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; - return (1); + result = 1; + } else { + kn->kn_flags &= ~EV_EOF; + result = (kn->kn_data > 0); } - kn->kn_flags &= ~EV_EOF; - return (kn->kn_data > 0); + if (needlock) + SOCKBUF_UNLOCK(&so->so_rcv); + return (result); } static void @@ -460,23 +479,34 @@ { struct socket *so = (struct socket *)kn->kn_hook; + SOCKBUF_LOCK(&so->so_snd); SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; + SOCKBUF_UNLOCK(&so->so_snd); } static int filt_fifowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; + int needlock, result; + needlock = !SOCKBUF_OWNED(&so->so_snd); + if (needlock) + SOCKBUF_LOCK(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); - if (so->so_state & SS_CANTSENDMORE) { + /* Unlocked read. */ + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; - return (1); + result = 1; + } else { + kn->kn_flags &= ~EV_EOF; + result = (kn->kn_data >= so->so_snd.sb_lowat); } - kn->kn_flags &= ~EV_EOF; - return (kn->kn_data >= so->so_snd.sb_lowat); + if (needlock) + SOCKBUF_UNLOCK(&so->so_snd); + return (result); } /* ARGSUSED */ --- //depot/vendor/freebsd/src/sys/fs/portalfs/portal_vnops.c 2004/04/07 13:52:05 +++ //depot/user/rwatson/netperf/sys/fs/portalfs/portal_vnops.c 2004/05/30 21:42:57 @@ -193,6 +193,7 @@ unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); + /* XXXRW: Locking? */ if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) sodupsockaddr((struct sockaddr *)unp2->unp_addr, @@ -284,6 +285,7 @@ * and keep polling the reference count. XXX. */ s = splnet(); + /* XXXRW: Locking? */ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { if (fmp->pm_server->f_count == 1) { error = ECONNREFUSED; @@ -304,8 +306,12 @@ */ so->so_rcv.sb_timeo = 0; so->so_snd.sb_timeo = 0; + SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_NOINTR; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_NOINTR; + SOCKBUF_UNLOCK(&so->so_snd); pcred.pcr_flag = ap->a_mode; --- //depot/vendor/freebsd/src/sys/i386/conf/GENERIC 2004/05/27 17:26:29 +++ //depot/user/rwatson/netperf/sys/i386/conf/GENERIC 2004/05/30 18:41:33 @@ -66,6 +66,7 @@ options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +options BREAK_TO_DEBUGGER # To make an SMP kernel, the next two are needed options SMP # Symmetric MultiProcessor Kernel --- //depot/vendor/freebsd/src/sys/kern/kern_descrip.c 2004/04/05 14:06:48 +++ //depot/user/rwatson/netperf/sys/kern/kern_descrip.c 2004/05/24 13:43:27 @@ -2024,7 +2024,9 @@ *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; + SOCK_LOCK(*spp); soref(*spp); + SOCK_UNLOCK(*spp); } FILEDESC_UNLOCK(td->td_proc->p_fd); return (error); @@ -2039,6 +2041,7 @@ { NET_ASSERT_GIANT(); + SOCK_LOCK(so); sorele(so); } @@ -2054,7 +2057,7 @@ { struct flock lf; struct vnode *vp; - int error; + int error, type; FILE_LOCK_ASSERT(fp, MA_OWNED); @@ -2064,7 +2067,21 @@ } /* We have the last ref so we can proceed without the file lock. */ FILE_UNLOCK(fp); - mtx_lock(&Giant); + + /* + * XXXRW: It's not pretty, but this way we can avoid holding Giant + * over operation vectors that don't require it. Note that + * technically, this is slightly conservative as badops doesn't + * need Giant. + */ + type = fp->f_type; + switch(type) { + case DTYPE_SOCKET: + case DTYPE_PIPE: + break; + default: + mtx_lock(&Giant); + } if (fp->f_count < 0) panic("fdrop: count < 0"); if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { @@ -2080,7 +2097,13 @@ else error = 0; ffree(fp); - mtx_unlock(&Giant); + switch(type) { + case DTYPE_SOCKET: + case DTYPE_PIPE: + break; + default: + mtx_unlock(&Giant); + } return (error); } --- //depot/vendor/freebsd/src/sys/kern/kern_prot.c 2004/04/05 14:06:48 +++ //depot/user/rwatson/netperf/sys/kern/kern_prot.c 2004/05/07 19:12:27 @@ -1685,7 +1685,9 @@ if (error) return (ENOENT); #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_visible(cred, so); + SOCK_UNLOCK(so); if (error) return (error); #endif --- //depot/vendor/freebsd/src/sys/kern/kern_timeout.c 2004/04/24 21:10:43 +++ //depot/user/rwatson/netperf/sys/kern/kern_timeout.c 2004/05/03 19:32:28 @@ -44,6 +44,7 @@ #include #include #include +#include #include static int avg_depth; @@ -55,6 +56,89 @@ static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); + +/*- + * Sampling buffer of function pointers executed by timeouts and callouts. + * This circular buffer wraps when it fills, and uses an inefficient + * sbuf-based sysctl to dump sample data to userspace. Sysctls can select + * to monitor mpsafe and !mpsafe callouts/timeouts as desired. Suggested + * use is: (1) set sample of interest (mpsafe/notmpsafe), (2) reset the + * buffer, (3) do some benchmark/test, (5) disable sampling, (6) dump + * buffer. + * + * XXX: ifdef TIMEOUT_SAMPLING? + */ + +#define MAXFUNC 200000 +static void * func_array[MAXFUNC]; +static int array_off; + +static void +push_cfunc(void *ptr) +{ + + /* XXX */ + func_array[array_off % MAXFUNC] = ptr; + array_off++; +} + +static int +sysctl_cfunc(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sb; + int error, i; + + if (req->newptr != NULL) + return (EINVAL); + + sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND); + + for (i = 0; i < MAXFUNC; i++) { + if (func_array[i] == NULL) + break; + sbuf_printf(&sb, "%p ", func_array[i]); + } + sbuf_finish(&sb); + + error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb) + 1, req); + + sbuf_delete(&sb); + + return (error); +} + +SYSCTL_PROC(_debug, OID_AUTO, to_cfunc, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, + sysctl_cfunc, "A", "callout/timeout sample"); + +static int +sysctl_cfunc_reset(SYSCTL_HANDLER_ARGS) +{ + int dummy, error; + + dummy = 0; + error = sysctl_handle_int(oidp, &dummy, 0, req); + if (error) + return (error); + + if (dummy != 0) { + bzero(func_array, sizeof(void *) * MAXFUNC); + array_off = 0; + } + + return (0); +} + +SYSCTL_PROC(_debug, OID_AUTO, to_cfunc_reset, CTLTYPE_INT|CTLFLAG_RW, 0, 0, + sysctl_cfunc_reset, "I", "Reset sample"); + +static int cfunc_sample_mpsafe; +static int cfunc_sample_notmpsafe; + +SYSCTL_INT(_debug, OID_AUTO, to_cfunc_mpsafe, CTLFLAG_RW, + &cfunc_sample_mpsafe, 0, "Sample mpsafe callouts"); +SYSCTL_INT(_debug, OID_AUTO, to_cfunc_notmpsafe, CTLFLAG_RW, + &cfunc_sample_notmpsafe, 0, "Sample !mpsafe callouts"); + /* * TODO: * allocate more timeout table slots when table overflows. @@ -245,8 +329,12 @@ if (!(c_flags & CALLOUT_MPSAFE)) { mtx_lock(&Giant); gcalls++; + if (cfunc_sample_mpsafe) + push_cfunc(c_func); } else { mpcalls++; + if (cfunc_sample_notmpsafe) + push_cfunc(c_func); } #ifdef DIAGNOSTIC binuptime(&bt1); --- //depot/vendor/freebsd/src/sys/kern/subr_log.c 2004/04/05 14:06:48 +++ //depot/user/rwatson/netperf/sys/kern/subr_log.c 2004/04/06 20:50:45 @@ -83,7 +83,12 @@ struct callout sc_callout; /* callout to wakeup syslog */ } logsoftc; -int log_open; /* also used in log() */ +/* + * log_mtx protects logsoftc, log_open. Note that log_mtx does *not* + * protect the structures associated with msgbuf, which require Giant. + */ +struct mtx log_mtx; +int log_open; /* also used in log() */ /* Times per second to check for a pending syslog wakeup. */ static int log_wakeups_per_second = 5; @@ -94,17 +99,24 @@ static int logopen(dev_t dev, int flags, int mode, struct thread *td) { - if (log_open) + + mtx_lock(&log_mtx); + if (log_open) { + mtx_unlock(&log_mtx); return (EBUSY); + } log_open = 1; - callout_init(&logsoftc.sc_callout, 0); + callout_init(&logsoftc.sc_callout, CALLOUT_MPSAFE); + mtx_unlock(&log_mtx); fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */ + mtx_lock(&log_mtx); if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; } callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); return (0); } @@ -113,9 +125,11 @@ logclose(dev_t dev, int flag, int mode, struct thread *td) { + mtx_lock(&log_mtx); log_open = 0; callout_stop(&logsoftc.sc_callout); logsoftc.sc_state = 0; + mtx_unlock(&log_mtx); funsetown(&logsoftc.sc_sigio); return (0); } @@ -134,14 +148,18 @@ splx(s); return (EWOULDBLOCK); } + mtx_lock(&log_mtx); logsoftc.sc_state |= LOG_RDWAIT; + mtx_unlock(&log_mtx); if ((error = tsleep(mbp, LOG_RDPRI | PCATCH, "klog", 0))) { splx(s); return (error); } } splx(s); + mtx_lock(&log_mtx); logsoftc.sc_state &= ~LOG_RDWAIT; + mtx_unlock(&log_mtx); while (uio->uio_resid > 0) { l = imin(sizeof(buf), uio->uio_resid); @@ -178,8 +196,11 @@ logtimeout(void *arg) { - if (!log_open) + mtx_lock(&log_mtx); + if (!log_open) { + mtx_unlock(&log_mtx); return; + } if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; @@ -187,6 +208,7 @@ if (msgbuftrigger == 0) { callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); return; } msgbuftrigger = 0; @@ -199,6 +221,7 @@ } callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, logtimeout, NULL); + mtx_unlock(&log_mtx); } /*ARGSUSED*/ @@ -217,10 +240,12 @@ break; case FIOASYNC: + mtx_lock(&log_mtx); if (*(int *)data) logsoftc.sc_state |= LOG_ASYNC; else logsoftc.sc_state &= ~LOG_ASYNC; + mtx_unlock(&log_mtx); break; case FIOSETOWN: @@ -249,6 +274,7 @@ log_drvinit(void *unused) { + mtx_init(&log_mtx, "log_mtx", NULL, MTX_DEF); make_dev(&log_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "klog"); } --- //depot/vendor/freebsd/src/sys/kern/subr_prf.c 2004/04/05 14:06:48 +++ //depot/user/rwatson/netperf/sys/kern/subr_prf.c 2004/04/06 20:50:45 @@ -83,6 +83,9 @@ size_t remain; }; +/* + * XXXRW: We access subr_log.c's log_open variable unlocked. + */ extern int log_open; static void msglogchar(int c, int pri); --- //depot/vendor/freebsd/src/sys/kern/subr_witness.c 2004/03/22 16:35:21 +++ //depot/user/rwatson/netperf/sys/kern/subr_witness.c 2004/05/24 17:08:42 @@ -268,6 +268,49 @@ { "allprison", &lock_class_mtx_sleep }, { NULL, NULL }, /* + * Sockets + */ + { "filedesc structure", &lock_class_mtx_sleep }, + { "accept", &lock_class_mtx_sleep }, + { "so_snd", &lock_class_mtx_sleep }, + { "so_rcv", &lock_class_mtx_sleep }, + { "sellck", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* + * Routing + */ + { "so_rcv", &lock_class_mtx_sleep }, + { "radix node head", &lock_class_mtx_sleep }, + { "rtentry", &lock_class_mtx_sleep }, + { "ifaddr", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* + * UNIX Domain Sockets + */ + { "unp head", &lock_class_mtx_sleep }, + { "unp", &lock_class_mtx_sleep }, + { "so_snd", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* + * UDP/IP + */ + { "udp", &lock_class_mtx_sleep }, + { "udpinp", &lock_class_mtx_sleep }, + { "so_snd", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* + * TCP/IP + */ + { "tcp", &lock_class_mtx_sleep }, + { "tcpinp", &lock_class_mtx_sleep }, + { "so_snd", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* + * SLIP + */ + { "slip_mtx", &lock_class_mtx_sleep }, + { "slip sc_mtx", &lock_class_mtx_sleep }, + /* * spin locks */ #ifdef SMP --- //depot/vendor/freebsd/src/sys/kern/sys_socket.c 2004/04/05 14:06:48 +++ //depot/user/rwatson/netperf/sys/kern/sys_socket.c 2004/05/30 11:22:04 @@ -77,7 +77,9 @@ NET_LOCK_GIANT(); #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_receive(active_cred, so); + SOCK_UNLOCK(so); if (error) { NET_UNLOCK_GIANT(); return (error); @@ -102,7 +104,9 @@ NET_LOCK_GIANT(); #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_send(active_cred, so); + SOCK_UNLOCK(so); if (error) { NET_UNLOCK_GIANT(); return (error); @@ -127,25 +131,40 @@ switch (cmd) { case FIONBIO: + SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; + SOCK_UNLOCK(so); return (0); case FIOASYNC: + /* + * XXXRW: Implicit assumption that SOCK_LOCK(so) + * == SOCKBUF_LOCK(&so->so_rcv); + */ if (*(int *)data) { + SOCK_LOCK(so); so->so_state |= SS_ASYNC; so->so_rcv.sb_flags |= SB_ASYNC; + SOCK_UNLOCK(so); + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); } else { + SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; so->so_rcv.sb_flags &= ~SB_ASYNC; + SOCK_UNLOCK(so); + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); } return (0); case FIONREAD: + /* Unlocked read. */ *(int *)data = so->so_rcv.sb_cc; return (0); @@ -164,7 +183,8 @@ return (0); case SIOCATMARK: - *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + /* Unlocked read. */ + *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; return (0); } /* @@ -203,13 +223,17 @@ bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; /* - * If SS_CANTRCVMORE is set, but there's still data left in the + * If SBS_CANTRCVMORE is set, but there's still data left in the * receive buffer, the socket is still readable. + * + * XXXRW: perhaps should lock socket buffer so st_size result + * is consistent. */ - if ((so->so_state & SS_CANTRCVMORE) == 0 || + /* Unlocked read. */ + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 || so->so_rcv.sb_cc != 0) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; - if ((so->so_state & SS_CANTSENDMORE) == 0) + if ((so->so_snd.sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; ub->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; ub->st_uid = so->so_cred->cr_uid; --- //depot/vendor/freebsd/src/sys/kern/uipc_accf.c 2004/05/30 11:02:05 +++ //depot/user/rwatson/netperf/sys/kern/uipc_accf.c 2004/05/30 18:41:33 @@ -35,15 +35,23 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include #include +struct mtx accept_filter_mtx; +MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx", + MTX_DEF); +#define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx) +#define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx) + static SLIST_HEAD(, accept_filter) accept_filtlsthd = SLIST_HEAD_INITIALIZER(&accept_filtlsthd); @@ -67,12 +75,15 @@ { struct accept_filter *p; + ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, filt->accf_name) == 0) { if (p->accf_callback != NULL) { + ACCEPT_FILTER_UNLOCK(); return (EEXIST); } else { p->accf_callback = filt->accf_callback; + ACCEPT_FILTER_UNLOCK(); FREE(filt, M_ACCF); return (0); } @@ -80,6 +91,7 @@ if (p == NULL) SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); + ACCEPT_FILTER_UNLOCK(); return (0); } @@ -101,11 +113,13 @@ { struct accept_filter *p; + ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, name) == 0) - return (p); + break; + ACCEPT_FILTER_UNLOCK(); - return (NULL); + return (p); } int @@ -113,15 +127,13 @@ { struct accept_filter *p; struct accept_filter *accfp = (struct accept_filter *) data; - int s, error; + int error; switch (event) { case MOD_LOAD: MALLOC(p, struct accept_filter *, sizeof(*p), M_ACCF, M_WAITOK); bcopy(accfp, p, sizeof(*p)); - s = splnet(); error = accept_filt_add(p); - splx(s); break; case MOD_UNLOAD: @@ -132,9 +144,7 @@ * in the struct accept_filter. */ if (unloadable != 0) { - s = splnet(); error = accept_filt_del(accfp->accf_name); - splx(s); } else error = EOPNOTSUPP; break; --- //depot/vendor/freebsd/src/sys/kern/uipc_socket.c 2004/05/31 19:45:32 +++ //depot/user/rwatson/netperf/sys/kern/uipc_socket.c 2004/05/31 20:01:56 @@ -106,6 +106,8 @@ &so_zero_copy_send, 0, "Enable zero copy send"); #endif /* ZERO_COPY_SOCKETS */ +struct mtx accept_mtx; +MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * Socket operation routines. @@ -141,6 +143,8 @@ return so; } #endif + SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); + SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); /* XXX race condition for reentrant kernel */ so->so_gencnt = ++so_gencnt; /* sx_init(&so->so_sxlock, "socket sxlock"); */ @@ -196,9 +200,12 @@ #ifdef MAC mac_create_socket(cred, so); #endif + SOCK_LOCK(so); soref(so); + SOCK_UNLOCK(so); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); if (error) { + SOCK_LOCK(so); so->so_state |= SS_NOFDREF; sorele(so); return (error); @@ -242,6 +249,8 @@ mac_destroy_socket(so); #endif crfree(so->so_cred); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); /* sx_destroy(&so->so_sxlock); */ uma_zfree(socket_zone, so); --numopensockets; @@ -256,21 +265,32 @@ int s, error; s = splnet(); + /* Unlocked read. */ if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) { splx(s); return (EINVAL); } + /* + * XXXRW: Ordering issue here -- perhaps we need to set + * SO_ACCEPTCONN before the call to pru_listen()? + * XXXRW: General atomic test-and-set concerns here also. + */ error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td); if (error) { splx(s); return (error); } - if (TAILQ_EMPTY(&so->so_comp)) + ACCEPT_LOCK(); + if (TAILQ_EMPTY(&so->so_comp)) { + SOCK_LOCK(so); so->so_options |= SO_ACCEPTCONN; + SOCK_UNLOCK(so); + } if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->so_qlimit = backlog; + ACCEPT_UNLOCK(); splx(s); return (0); } @@ -280,42 +300,103 @@ struct socket *so; { struct socket *head; - int s; KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); + SOCK_LOCK_ASSERT(so); - if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) + /* XXXRW: Why would SS_NOFDREF be unset here? so_count is 0. */ + if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) { + SOCK_UNLOCK(so); return; - if (so->so_head != NULL) { - head = so->so_head; - if (so->so_qstate & SQ_INCOMP) { - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - } else if (so->so_qstate & SQ_COMP) { - /* - * We must not decommission a socket that's - * on the accept(2) queue. If we do, then - * accept(2) may hang after select(2) indicated - * that the listening socket was ready. - */ + } + SOCK_UNLOCK(so); + + ACCEPT_LOCK(); + head = so->so_head; + if (head != NULL) { + KASSERT((so->so_qstate & SQ_COMP) != 0 || + (so->so_qstate & SQ_INCOMP) != 0, + ("sofree: so_head != NULL, but neither SQ_COMP nor " + "SQ_INCOMP")); + KASSERT((so->so_qstate & SQ_COMP) == 0 || + (so->so_qstate & SQ_INCOMP) == 0, + ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); + /* + * accept(2) is responsible draining the completed + * connection queue and freeing those sockets, so + * we just return here if this socket is currently + * on the completed connection queue. Otherwise, + * accept(2) may hang after select(2) has indicating + * that a listening socket was ready. If it's an + * incomplete connection, we remove it from the queue + * and free it; otherwise, it won't be released until + * the listening socket is closed. + */ + if ((so->so_qstate & SQ_COMP) != 0) { + ACCEPT_UNLOCK(); return; - } else { - panic("sofree: not queued"); } + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; so->so_qstate &= ~SQ_INCOMP; so->so_head = NULL; } + KASSERT((so->so_qstate & SQ_COMP) == 0 && + (so->so_qstate & SQ_INCOMP) == 0, + ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", + so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); + ACCEPT_UNLOCK(); + + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_NOINTR; (void)sblock(&so->so_snd, M_WAITOK); - s = splimp(); - socantsendmore(so); - splx(s); + socantsendmore_locked(so); sbunlock(&so->so_snd); sbrelease(&so->so_snd, so); + SOCKBUF_UNLOCK(&so->so_snd); sorflush(so); sodealloc(so); } +#if 0 +static void +dump_socket(struct socket *so) +{ + + printf("so = %p\n", so); + printf(" so_count = %d\n", so->so_count); + printf(" so_type = %d\n", so->so_type); + printf(" so_options = %d\n", so->so_options); + printf(" so_linger = %d\n", so->so_linger); + printf(" so_state = %d\n", so->so_state); + printf(" so_qstate = %d\n", so->so_qstate); + printf(" so_pcb = %p\n", so->so_pcb); + printf(" so_proto = %p\n", so->so_proto); + printf(" pr_type = %d\n", so->so_proto->pr_type); + printf(" pr_domain = %p\n", so->so_proto->pr_domain); + printf(" dom_family = %d\n", so->so_proto->pr_domain->dom_family); + printf(" dom_name = %s\n", so->so_proto->pr_domain->dom_name); + printf(" pr_protocol = %d\n", so->so_proto->pr_protocol); + printf(" pr_flags = %d\n", so->so_proto->pr_flags); + printf(" so_head = %p\n", so->so_head); + printf(" TAILQ_FIRST(so_incomp) = %p\n", TAILQ_FIRST(&so->so_incomp)); + printf(" TAILQ_FIRST(so_comp) = %p\n", TAILQ_FIRST(&so->so_comp)); + printf(" so_qlen = %d\n", so->so_qlen); + printf(" so_incqlen = %d\n", so->so_incqlen); + printf(" so_qlimit = %d\n", so->so_qlimit); + printf(" so_timeo = %d\n", so->so_timeo); + printf(" so_error = %d\n", so->so_error); + printf(" so_sigio = %p\n", so->so_sigio); + printf(" so_oobmark = %lu\n", so->so_oobmark); + printf(" so_rcv.sb_state = %d\n", so->so_rcv.sb_state); + printf(" so_snd.sb_state = %d\n", so->so_snd.sb_state); + printf(" so_upcall = %p\n", so->so_upcall); + printf(" so_upcallarg = %p\n", so->so_upcallarg); + printf(" so_cred = %p\n", so->so_cred); + printf(" so_gencnt = %d\n", (int)so->so_gencnt); +} +#endif + /* * Close a socket on last file table reference removal. * Initiate disconnect if connected. @@ -332,27 +413,52 @@ int s = splnet(); /* conservative */ int error = 0; + KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); + funsetown(&so->so_sigio); + /*- + * XXXRW: Lots of locking problems here. For one thing, + * we should probably clear the SO_ACCEPTCONN flag and + * push that down to the protocol layer. Also, the locking + * for the queue draining is probably not right either: + * what prevents new ones from being inserted after we get + * past this? Ideally, the downcall would prevent this, but + * there isn't one. + */ + /* + * XXXRW: soclose() should be split into two parts: one to handle + * listen sockets, and the other to handle the remainder. To + * prevent races with the protocol code, we need to detach before + * draining the queues. For non-listen sockets, the disconnect + * has to happen before the detach. + */ + /* Unlocked read. */ if (so->so_options & SO_ACCEPTCONN) { - struct socket *sp, *sonext; - - sp = TAILQ_FIRST(&so->so_incomp); - for (; sp != NULL; sp = sonext) { - sonext = TAILQ_NEXT(sp, so_list); + struct socket *sp; + ACCEPT_LOCK(); + while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { + TAILQ_REMOVE(&so->so_incomp, sp, so_list); + so->so_incqlen--; + sp->so_qstate &= ~SQ_INCOMP; + sp->so_head = NULL; + ACCEPT_UNLOCK(); (void) soabort(sp); + ACCEPT_LOCK(); } - for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) { - sonext = TAILQ_NEXT(sp, so_list); - /* Dequeue from so_comp since sofree() won't do it */ + while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { TAILQ_REMOVE(&so->so_comp, sp, so_list); so->so_qlen--; sp->so_qstate &= ~SQ_COMP; sp->so_head = NULL; + ACCEPT_UNLOCK(); (void) soabort(sp); + ACCEPT_LOCK(); } + ACCEPT_UNLOCK(); } if (so->so_pcb == NULL) goto discard; + /* XXXRW: so_state locking? */ if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); @@ -373,13 +479,14 @@ } drop: if (so->so_pcb != NULL) { - int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); + int error2; + error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) error = error2; } discard: - if (so->so_state & SS_NOFDREF) - panic("soclose: NOFDREF"); + SOCK_LOCK(so); + KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); splx(s); @@ -387,7 +494,9 @@ } /* - * Must be called at splnet... + * XXXRW: soabort() must not be called with any locks held, as the protocol + * will need to be able to grab socket and socket buffer locks, and we also + * try to free the socket if the protocol generates an error. */ int soabort(so) @@ -397,6 +506,7 @@ error = (*so->so_proto->pr_usrreqs->pru_abort)(so); if (error) { + SOCK_LOCK(so); sotryfree(so); /* note: does not decrement the ref count */ return error; } @@ -408,14 +518,13 @@ struct socket *so; struct sockaddr **nam; { - int s = splnet(); int error; - if ((so->so_state & SS_NOFDREF) == 0) - panic("soaccept: !NOFDREF"); + SOCK_LOCK(so); + KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; + SOCK_UNLOCK(so); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); - splx(s); return (error); } @@ -425,25 +534,23 @@ struct sockaddr *nam; struct thread *td; { - int s; int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); - s = splnet(); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. * This allows user to disconnect by connecting to, e.g., * a null address. */ + /* Unlocked read. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) error = EISCONN; else error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); - splx(s); return (error); } @@ -452,11 +559,9 @@ struct socket *so1; struct socket *so2; { - int s = splnet(); int error; error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); - splx(s); return (error); } @@ -464,20 +569,14 @@ sodisconnect(so) struct socket *so; { - int s = splnet(); int error; - if ((so->so_state & SS_ISCONNECTED) == 0) { - error = ENOTCONN; - goto bad; - } - if (so->so_state & SS_ISDISCONNECTING) { - error = EALREADY; - goto bad; - } + /* Unlocked read. */ + if ((so->so_state & SS_ISCONNECTED) == 0) + return ENOTCONN; + if (so->so_state & SS_ISDISCONNECTING) + return EALREADY; error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); -bad: - splx(s); return (error); } @@ -528,7 +627,7 @@ struct mbuf **mp; struct mbuf *m; long space, len = 0, resid; - int clen = 0, error, s, dontroute; + int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; #ifdef ZERO_COPY_SOCKETS int cow_send; @@ -560,20 +659,19 @@ td->td_proc->p_stats->p_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; -#define snderr(errno) { error = (errno); splx(s); goto release; } +#define snderr(errno) { error = (errno); goto release; } -restart: + SOCKBUF_LOCK(&so->so_snd); error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; do { - s = splnet(); - if (so->so_state & SS_CANTSENDMORE) + SOCKBUF_LOCK_ASSERT(&so->so_snd); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) snderr(EPIPE); if (so->so_error) { error = so->so_error; so->so_error = 0; - splx(s); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { @@ -602,14 +700,12 @@ (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) snderr(EWOULDBLOCK); - sbunlock(&so->so_snd); error = sbwait(&so->so_snd); - splx(s); if (error) - goto out; - goto restart; + goto release; + continue; } - splx(s); + SOCKBUF_UNLOCK(&so->so_snd); mp = ⊤ space -= clen; do { @@ -630,6 +726,7 @@ MGETHDR(m, M_TRYWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; + SOCKBUF_LOCK(&so->so_snd); goto release; } m->m_pkthdr.len = 0; @@ -638,6 +735,7 @@ MGET(m, M_TRYWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; + SOCKBUF_LOCK(&so->so_snd); goto release; } } @@ -691,6 +789,7 @@ } if (m == NULL) { error = ENOBUFS; + SOCKBUF_LOCK(&so->so_snd); goto release; } @@ -705,8 +804,10 @@ m->m_len = len; *mp = m; top->m_pkthdr.len += len; - if (error) + if (error) { + SOCKBUF_LOCK(&so->so_snd); goto release; + } mp = &m->m_next; if (resid <= 0) { if (flags & MSG_EOR) @@ -714,15 +815,26 @@ break; } } while (space > 0 && atomic); - if (dontroute) + /* + * XXXRW: There may be a race condition here regarding + * SO_DONTROUTE. We hold the sblock() so in theory + * there won't be other consumers of the socket doing + * a pru_send(), but SO_DONTROUTE is also consumed + * from the protocol side. It sounds like we might + * want to use MSG_DONTROUTE here, if the semantics are + * right. + */ + if (dontroute) { + SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; - s = splnet(); /* XXX */ + SOCK_UNLOCK(so); + } /* - * XXX all the SS_CANTSENDMORE checks previously + * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have recieved * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We could - * probably recheck again inside the splnet() protection + * probably recheck again inside the locking protection * here, but there are probably other places that this * also happens. We must rethink this. */ @@ -740,21 +852,32 @@ /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); - splx(s); - if (dontroute) + /* + * XXXRW: Second half of above comment on SO_DONTROUTE. + * and so_options. + */ + if (dontroute) { + SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; + SOCK_UNLOCK(so); + } clen = 0; control = NULL; top = NULL; mp = ⊤ - if (error) + if (error) { + SOCKBUF_LOCK(&so->so_snd); goto release; + } } while (resid && space > 0); + SOCKBUF_LOCK(&so->so_snd); } while (resid); release: + SOCKBUF_LOCK_ASSERT(&so->so_snd); sbunlock(&so->so_snd); out: + SOCKBUF_UNLOCK(&so->so_snd); if (top != NULL) m_freem(top); if (control != NULL) @@ -788,7 +911,7 @@ int *flagsp; { struct mbuf *m, **mp; - int flags, len, error, s, offset; + int flags, len, error, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; @@ -843,15 +966,17 @@ } if (mp != NULL) *mp = NULL; + /* Unlocked read. */ if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreqs->pru_rcvd)(so, 0); -restart: + SOCKBUF_LOCK(&so->so_rcv); error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) - return (error); - s = splnet(); + goto out; +restart: + SOCKBUF_LOCK_ASSERT(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more @@ -869,9 +994,8 @@ (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { - KASSERT(m != NULL || !so->so_rcv.sb_cc, - ("receive: m == %p so->so_rcv.sb_cc == %u", - m, so->so_rcv.sb_cc)); + KASSERT(!(m == NULL && so->so_rcv.sb_cc), + ("m %p so->so_rcv.sb_cc %u", m, so->so_rcv.sb_cc)); if (so->so_error) { if (m != NULL) goto dontblock; @@ -880,7 +1004,8 @@ so->so_error = 0; goto release; } - if (so->so_state & SS_CANTRCVMORE) { + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m) goto dontblock; else @@ -891,6 +1016,7 @@ m = so->so_rcv.sb_mb; goto dontblock; } + /* XXXRW: so_state locking? */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; @@ -898,6 +1024,7 @@ } if (uio->uio_resid == 0) goto release; + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; @@ -905,14 +1032,14 @@ } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); - sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); - splx(s); if (error) - return (error); + goto release; goto restart; } dontblock: + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + KASSERT(error == 0, ("unexpected state, error %u", error)); if (uio->uio_td) uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); @@ -921,41 +1048,110 @@ if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); - orig_resid = 0; - if (psa != NULL) + if (psa != NULL) { *psa = sodupsockaddr(mtod(m, struct sockaddr *), - mp0 == NULL ? M_WAITOK : M_NOWAIT); + M_NOWAIT); + if (*psa == NULL) { + error = ENOMEM; + goto release; + } + /* + * XXXRW: In the rwatson_netperf branch, we don't + * release the socket buffer lock here because + * we always use M_NOWAIT. In the main tree, + * or if we restore conditional waiting, we + * need to refresh nextpacket from the socket + * buffer version of the head mbuf, or we may + * have a stale value if it was previously NULL + * and now isn't. + */ + nextrecord = m->m_nextpkt; + } if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; + /* + * XXXRW: When running MPSAFE, we may release + * locks after this point, and therefore have + * effective preemption relative to the socket + * buffer mbuf chain. Since we've modified + * the head, we have to make sure the new head + * has the right nextpkt copied from the + * original head. + */ + m->m_nextpkt = nextrecord; } + orig_resid = 0; } - while (m != NULL && m->m_type == MT_CONTROL && error == 0) { - if (flags & MSG_PEEK) { - if (controlp != NULL) - *controlp = m_copy(m, 0, m->m_len); - m = m->m_next; - } else { - sbfree(&so->so_rcv, m); - so->so_rcv.sb_mb = m->m_next; - m->m_next = NULL; - if (pr->pr_domain->dom_externalize) - error = - (*pr->pr_domain->dom_externalize)(m, controlp); - else if (controlp != NULL) - *controlp = m; - else - m_freem(m); - m = so->so_rcv.sb_mb; + if (m != NULL && m->m_type == MT_CONTROL) { + struct mbuf *cm = NULL; + struct mbuf **cme = &cm; + + do { + if (flags & MSG_PEEK) { + /* + * XXXRW: In the BSD/OS version of this + * code, m_copym() is called with M_TRYWAIT, + * and we catch allcation failures and + * jump to release with error of ENOBUFS. + * For consistency with the current + * implementation, we don't. + */ + if (controlp != NULL) { + *controlp = m_copym(m, 0, m->m_len, + M_DONTWAIT); + controlp = &(*controlp)->m_next; + } + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m->m_next; + m->m_next = NULL; + if (controlp) { + /* + * Collect mbufs for processing below. + */ + *cme = m; + cme = &(*cme)->m_next; + } else + m_free(m); + m = so->so_rcv.sb_mb; + } + } while (m != NULL && m->m_type == MT_CONTROL); + /* + * XXXRW: Since we're dropping the socket buffer locks, and + * may have modified the mbuf list on the socket buffer, + * push out the latest version so it can be seen by any + * other code that accesses the buffer in the mean time. + */ + so->so_rcv.sb_mb->m_nextpkt = nextrecord; + if (nextrecord == NULL) + so->so_rcv.sb_lastrecord = so->so_rcv.sb_mb; + if (cm != NULL) { + if (pr->pr_domain->dom_externalize != NULL) { + /* + * NB: drop the lock to avoid potential LORs; + * in particular unix domain sockets grab the + * file descriptor lock which would be a LOR. + */ + SOCKBUF_UNLOCK(&so->so_rcv); + error = (*pr->pr_domain->dom_externalize) + (cm, controlp); + SOCKBUF_LOCK(&so->so_rcv); + } else + m_freem(cm); } - if (controlp != NULL) { - orig_resid = 0; - while (*controlp != NULL) - controlp = &(*controlp)->m_next; - } + /* + * XXXRW: During externalization, additional mbuf chains + * may have been added to the socket buffer. Update our + * local cache to avoid using a stale value and corrupting + * the list. + */ + nextrecord = so->so_rcv.sb_mb->m_nextpkt; + orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { @@ -981,6 +1177,7 @@ SB_EMPTY_FIXUP(&so->so_rcv); } } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); @@ -995,7 +1192,7 @@ else KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, ("m->m_type == %d", m->m_type)); - so->so_state &= ~SS_RCVATMARK; + so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; @@ -1012,7 +1209,7 @@ if (mp == NULL) { SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); - splx(s); + SOCKBUF_UNLOCK(&so->so_rcv); #ifdef ZERO_COPY_SOCKETS if (so_zero_copy_receive) { vm_page_t pg; @@ -1036,7 +1233,7 @@ } else #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, char *) + moff, (int)len, uio); - s = splnet(); + SOCKBUF_LOCK(&so->so_rcv); if (error) goto release; } else @@ -1078,6 +1275,7 @@ *mp = m_copym(m, 0, len, M_TRYWAIT); m->m_data += len; m->m_len -= len; + SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_cc -= len; } } @@ -1085,7 +1283,9 @@ if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { - so->so_state |= SS_RCVATMARK; + /* XXXRW: so_state locking? */ + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { @@ -1105,11 +1305,13 @@ */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { - if (so->so_error || so->so_state & SS_CANTRCVMORE) + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. + * XXXRW: Drop socket buffer lock here? */ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) (*pr->pr_usrreqs->pru_rcvd)(so, flags); @@ -1117,9 +1319,8 @@ SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { - sbunlock(&so->so_rcv); - splx(s); - return (0); + error = 0; + goto release; } m = so->so_rcv.sb_mb; if (m != NULL) @@ -1148,21 +1349,23 @@ } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); + /* + * XXXRW: Drop socket buffer lock here? + */ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreqs->pru_rcvd)(so, flags); } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && - (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { - sbunlock(&so->so_rcv); - splx(s); - goto restart; - } + (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) + goto restart; /* XXX multi-counts msgs */ if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); - splx(s); +out: + SOCKBUF_UNLOCK(&so->so_rcv); return (error); } @@ -1189,23 +1392,23 @@ { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; - int s; struct sockbuf asb; + SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOINTR; (void) sblock(sb, M_WAITOK); - s = splimp(); - socantrcvmore(so); + socantrcvmore_locked(so); sbunlock(sb); asb = *sb; /* - * Invalidate/clear most of the sockbuf structure, but keep - * its selinfo structure valid. + * Invalidate/clear most of the sockbuf structure, but leave + * selinfo and mutex data unchanged. */ bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); - splx(s); + SOCKBUF_UNLOCK(sb); + /* XXXRW: is passing in sb_mb this way really safe? */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease(&asb, so); @@ -1217,20 +1420,31 @@ struct socket *so; struct sockopt *sopt; { - struct accept_filter_arg *afap = NULL; + struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *af = so->so_accf; + struct so_accf *newaf; int error = 0; + newaf = NULL; + afap = NULL; + + /* + * XXXRW: Configuring accept filters should be an atomic + * test-and-set operation to prevent races during setup and + * and attach. There may be more general issues of racing + * and ordering here that are not yet addressed by locking. + */ /* do not set/remove accept filters on non listen sockets */ + SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { - error = EINVAL; - goto out; + SOCK_UNLOCK(so); + return (EINVAL); } /* removing the filter */ if (sopt == NULL) { - if (af != NULL) { + if (so->so_accf != NULL) { + struct so_accf *af = so->so_accf; if (af->so_accept_filter != NULL && af->so_accept_filter->accf_destroy != NULL) { af->so_accept_filter->accf_destroy(so); @@ -1242,47 +1456,80 @@ so->so_accf = NULL; } so->so_options &= ~SO_ACCEPTFILTER; + SOCK_UNLOCK(so); return (0); } - /* adding a filter */ - /* must remove previous filter first */ - if (af != NULL) { - error = EINVAL; - goto out; - } + SOCK_UNLOCK(so); + + /*- + * Adding a filter. + * + * Do memory allocation, copyin, and filter lookup now while we're + * not holding any locks. Avoids sleeping with a mutex, as well + * as introducing a lock order between accept filter locks and + * socket locks here. + */ + MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, + M_WAITOK); /* don't put large objects on the kernel stack */ - MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; - if (error) - goto out; + if (error) { + FREE(afap, M_TEMP); + return (error); + } afp = accept_filt_get(afap->af_name); if (afp == NULL) { - error = ENOENT; + FREE(afap, M_TEMP); + return (ENOENT); + } + + /* + * Allocate the new accept filter instance storage. We may have + * to free it again later if we fail to attach it. If attached + * properly, 'newaf' is NULLed to avoid a free() while in use. + */ + MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK | + M_ZERO); + if (afp->accf_create != NULL && afap->af_name[0] != '\0') { + int len = strlen(afap->af_name) + 1; + MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF, + M_WAITOK); + strcpy(newaf->so_accept_filter_str, afap->af_name); + } + + SOCK_LOCK(so); + /* must remove previous filter first */ + if (so->so_accf != NULL) { + error = EINVAL; goto out; } - MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); + /* + * Invoke the accf_create() method of the filter if required. + * XXXRW: the socket mutex is held over this call, so the + * create method cannot block. This may be something we have + * to change, but it would require addressing possible races. + */ if (afp->accf_create != NULL) { - if (afap->af_name[0] != '\0') { - int len = strlen(afap->af_name) + 1; - - MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); - strcpy(af->so_accept_filter_str, afap->af_name); - } - af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); - if (af->so_accept_filter_arg == NULL) { - FREE(af->so_accept_filter_str, M_ACCF); - FREE(af, M_ACCF); - so->so_accf = NULL; + newaf->so_accept_filter_arg = + afp->accf_create(so, afap->af_arg); + if (newaf->so_accept_filter_arg == NULL) { error = EINVAL; goto out; } } - af->so_accept_filter = afp; - so->so_accf = af; + newaf->so_accept_filter = afp; + so->so_accf = newaf; so->so_options |= SO_ACCEPTFILTER; + newaf = NULL; out: + SOCK_UNLOCK(so); + if (newaf != NULL) { + if (newaf->so_accept_filter_str != NULL) + FREE(newaf->so_accept_filter_str, M_ACCF); + FREE(newaf, M_ACCF); + } if (afap != NULL) FREE(afap, M_TEMP); return (error); @@ -1356,11 +1603,13 @@ if (error) goto bad; + SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; + SOCK_UNLOCK(so); break; case SO_DEBUG: @@ -1378,10 +1627,12 @@ sizeof optval); if (error) goto bad; + SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; + SOCK_UNLOCK(so); break; case SO_SNDBUF: @@ -1543,23 +1794,33 @@ switch (sopt->sopt_name) { #ifdef INET case SO_ACCEPTFILTER: + /* Unlocked read. */ if ((so->so_options & SO_ACCEPTCONN) == 0) return (EINVAL); MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); + SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTFILTER) != 0) { strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); if (so->so_accf->so_accept_filter_str != NULL) strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); } + SOCK_UNLOCK(so); error = sooptcopyout(sopt, afap, sizeof(*afap)); FREE(afap, M_TEMP); break; #endif case SO_LINGER: + /* + * XXXRW: We grab the lock here to get a consistent + * snapshot of both fields. This may not really + * be necessary. + */ + SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; + SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; @@ -1574,6 +1835,7 @@ case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: + /* Unlocked read. */ optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); @@ -1780,10 +2042,16 @@ int revents = 0; int s = splnet(); + /* + * XXXRW: Lots of unlocked reads, and some writes. Probably + * some more locking is called for here, especially when + * setting the sb_sel flags. + */ if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) revents |= events & (POLLIN | POLLRDNORM); + /* Unlocked read. */ if (events & POLLINIGNEOF) if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || !TAILQ_EMPTY(&so->so_comp) || so->so_error) @@ -1794,20 +2062,30 @@ revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if (revents == 0) { if (events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND)) { + /* + * XXXRW: Should lock also be held over selrecord()? + */ selrecord(td, &so->so_rcv.sb_sel); + SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_SEL; + SOCKBUF_UNLOCK(&so->so_rcv); } if (events & (POLLOUT | POLLWRNORM)) { + /* + * XXXRW: Should lock also be held over selrecord()? + */ selrecord(td, &so->so_snd.sb_sel); + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_SEL; + SOCKBUF_UNLOCK(&so->so_snd); } } @@ -1820,10 +2098,10 @@ { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; - int s; switch (kn->kn_filter) { case EVFILT_READ: + /* Unlocked read. */ if (so->so_options & SO_ACCEPTCONN) kn->kn_fop = &solisten_filtops; else @@ -1838,10 +2116,10 @@ return (1); } - s = splnet(); + SOCKBUF_LOCK(sb); SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); sb->sb_flags |= SB_KNOTE; - splx(s); + SOCKBUF_UNLOCK(sb); return (0); } @@ -1849,12 +2127,12 @@ filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - int s = splnet(); + SOCKBUF_LOCK(&so->so_rcv); SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; - splx(s); + SOCKBUF_UNLOCK(&so->so_rcv); } /*ARGSUSED*/ @@ -1862,10 +2140,13 @@ filt_soread(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; - int result; + int needlock, result; + needlock = !SOCKBUF_OWNED(&so->so_rcv); + if (needlock) + SOCKBUF_LOCK(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; - if (so->so_state & SS_CANTRCVMORE) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; result = 1; @@ -1875,6 +2156,8 @@ result = (kn->kn_data >= kn->kn_sdata); else result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); + if (needlock) + SOCKBUF_UNLOCK(&so->so_rcv); return (result); } @@ -1882,12 +2165,12 @@ filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - int s = splnet(); + SOCKBUF_LOCK(&so->so_snd); SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; - splx(s); + SOCKBUF_UNLOCK(&so->so_snd); } /*ARGSUSED*/ @@ -1895,10 +2178,13 @@ filt_sowrite(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; - int result; + int needlock, result; + needlock = !SOCKBUF_OWNED(&so->so_snd); + if (needlock) + SOCKBUF_LOCK(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); - if (so->so_state & SS_CANTSENDMORE) { + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; result = 1; @@ -1911,6 +2197,8 @@ result = (kn->kn_data >= kn->kn_sdata); else result = (kn->kn_data >= so->so_snd.sb_lowat); + if (needlock) + SOCKBUF_UNLOCK(&so->so_snd); return (result); } @@ -1920,6 +2208,7 @@ { struct socket *so = kn->kn_fp->f_data; + /* Unlocked read. */ kn->kn_data = so->so_qlen; return (! TAILQ_EMPTY(&so->so_comp)); } --- //depot/vendor/freebsd/src/sys/kern/uipc_socket2.c 2004/05/31 19:45:32 +++ //depot/user/rwatson/netperf/sys/kern/uipc_socket2.c 2004/05/31 20:01:56 @@ -105,37 +105,95 @@ register struct socket *so; { + SOCK_LOCK(so); + /* XXXRW: so_state locking? */ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; + SOCK_UNLOCK(so); } +/* + * soisconnected() transitions a socket to a fully connected state. If + * so->so_head is non-NULL, we transition it from the incompletely + * connected queue to the completely connected queue. Otherwise, we + * just wake up the socket since it was an out-bound connection. + */ void soisconnected(so) struct socket *so; { - struct socket *head = so->so_head; + struct socket *head; + int need_lock = !SOCK_OWNED(so); + + KASSERT(need_lock == 1, ("soisconnected(): called with lock!")); + /* XXXRW: so_state locking? */ + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - if (head && (so->so_qstate & SQ_INCOMP)) { - if ((so->so_options & SO_ACCEPTFILTER) != 0) { - so->so_upcall = head->so_accf->so_accept_filter->accf_callback; - so->so_upcallarg = head->so_accf->so_accept_filter_arg; + SOCK_UNLOCK(so); + + /* + * XXXRW: Maybe an unlocked read of so_head would be desirable + * here? Unlocked read of so_qstate probably not a good idea + * regardless. so_options handling here is a little unsavory. + */ + ACCEPT_LOCK(); + head = so->so_head; + if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if ((so->so_options & SO_ACCEPTFILTER) == 0) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_qstate &= ~SQ_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + head->so_qlen++; + so->so_qstate |= SQ_COMP; + ACCEPT_UNLOCK(); + sorwakeup(head); + wakeup_one(&head->so_timeo); + return; + } else { + void (*so_upcall)(struct socket *, void *, int); + void *so_upcallarg; + /* + * XXXRW: Should probably copy the upcall fields to + * local stack variables while holding the socket + * lock (and clear the option), then release the + * lock and make the call. This will prevent lock + * order reversals/recursion between this code and + * the upcall implementation, which will likely + * also want to frob the socket using locks. + * + * NOTE: We keep a local copy of the function + * pointer so we can invoke the upcall without + * holding locks. However, we also have to copy + * in the upcall fields from the head because the + * filter may need to be called more than once. + */ + SOCK_LOCK(so); + so_upcall = so->so_upcall = + head->so_accf->so_accept_filter->accf_callback; + so_upcallarg = so->so_upcallarg = + head->so_accf->so_accept_filter_arg; so->so_rcv.sb_flags |= SB_UPCALL; so->so_options &= ~SO_ACCEPTFILTER; - so->so_upcall(so, so->so_upcallarg, M_TRYWAIT); - return; + SOCK_UNLOCK(so); + ACCEPT_UNLOCK(); + /* + * Call with our existing reference, but without + * any locks. + */ + so_upcall(so, so_upcallarg, M_TRYWAIT); } - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; - so->so_qstate |= SQ_COMP; - sorwakeup(head); - wakeup_one(&head->so_timeo); - } else { + } else + ACCEPT_UNLOCK(); + if (head == NULL) { wakeup(&so->so_timeo); + /* + * XXXRW: If assuming socket lock is socket buffer receive + * lock, should use a _locked variant of sorwakeup() and + * avoid recursion here. + */ sorwakeup(so); sowwakeup(so); } @@ -146,9 +204,27 @@ register struct socket *so; { + /* + * XXXRW: Assuming we do need SOCK_LOCK(so) here, and the receive + * and base socket lock remain identical, then we should combine + * the SOCK_LOCK() and SOCKBUF_LOCK(...rcv) sections here. + */ + /* XXXRW: so_state locking? */ + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; - so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + so->so_state |= SS_ISDISCONNECTING; + SOCK_UNLOCK(so); + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_state |= SBS_CANTRCVMORE; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_state |= SBS_CANTSENDMORE; + SOCKBUF_UNLOCK(&so->so_snd); wakeup(&so->so_timeo); + /* + * XXXRW: Multiple socket buffer lock/unlock here could be avoided + * by coallescing with the above? + */ sowwakeup(so); sorwakeup(so); } @@ -157,11 +233,30 @@ soisdisconnected(so) register struct socket *so; { - + + /* + * XXXRW: Assuming we do need SOCK_LOCK(so) here, and the receive + * and base socket lock remain identical, then we should combine + * the SOCK_LOCK() and SOCKBUF_LOCK(...rcv) sections here. + */ + /* XXXRW: so_state locking? */ + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); - so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + SOCK_UNLOCK(so); + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_state |= SBS_CANTRCVMORE; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_state |= SBS_CANTSENDMORE; + SOCKBUF_UNLOCK(&so->so_snd); + so->so_state |= SS_ISDISCONNECTED; wakeup(&so->so_timeo); + /* Unlocked read of sb_cc. */ sbdrop(&so->so_snd, so->so_snd.sb_cc); + /* + * XXXRW: Avoid multiple lock/unlock of socket buffer locks here + * by coallescing with the above? + */ sowwakeup(so); sorwakeup(so); } @@ -182,8 +277,12 @@ int connstatus; { register struct socket *so; + int over; - if (head->so_qlen > 3 * head->so_qlimit / 2) + ACCEPT_LOCK(); + over = (head->so_qlen > 3 * head->so_qlimit / 2); + ACCEPT_UNLOCK(); + if (over) return ((struct socket *)0); so = soalloc(M_NOWAIT); if (so == NULL) @@ -194,37 +293,56 @@ so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; + /* XXXRW: so_state locking? */ so->so_state = head->so_state | SS_NOFDREF; so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; so->so_cred = crhold(head->so_cred); #ifdef MAC + SOCK_LOCK(head); mac_create_socket_from_socket(head, so); + SOCK_UNLOCK(head); #endif + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); return ((struct socket *)0); } - + ACCEPT_LOCK(); if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); so->so_qstate |= SQ_COMP; head->so_qlen++; } else { - if (head->so_incqlen > head->so_qlimit) { + /* + * XXXRW: Keep removing sockets from the head until there's room + * for us to insert on the tail. In pre-locking revisions, this + * was a simple if(), but as we could be racing with other + * threads and soabort() requires dropping locks, we must loop + * waiting for the condition to be true. + */ + while (head->so_incqlen > head->so_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->so_incomp); + TAILQ_REMOVE(&so->so_incomp, sp, so_list); + head->so_incqlen--; + sp->so_qstate &= ~SQ_INCOMP; + sp->so_head = NULL; + ACCEPT_UNLOCK(); (void) soabort(sp); + ACCEPT_LOCK(); } TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); so->so_qstate |= SQ_INCOMP; head->so_incqlen++; } + ACCEPT_UNLOCK(); if (connstatus) { + /* XXXRW: so_state locking? */ + so->so_state |= connstatus; sorwakeup(head); wakeup_one(&head->so_timeo); - so->so_state |= connstatus; } return (so); } @@ -244,8 +362,20 @@ struct socket *so; { - so->so_state |= SS_CANTSENDMORE; - sowwakeup(so); + /* XXXRW: so_state locking? */ + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + SOCKBUF_UNLOCK(&so->so_snd); +} + +void +socantsendmore_locked(so) + struct socket *so; +{ + SOCKBUF_LOCK_ASSERT(&so->so_snd); + + so->so_snd.sb_state |= SBS_CANTSENDMORE; + sowwakeup_locked(so); } void @@ -253,8 +383,20 @@ struct socket *so; { - so->so_state |= SS_CANTRCVMORE; - sorwakeup(so); + /* XXXRW: so_state locking? */ + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_UNLOCK(&so->so_rcv); +} + +void +socantrcvmore_locked(so) + struct socket *so; +{ + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + + so->so_rcv.sb_state |= SBS_CANTRCVMORE; + sorwakeup_locked(so); } /* @@ -264,9 +406,10 @@ sbwait(sb) struct sockbuf *sb; { + SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags |= SB_WAIT; - return (tsleep(&sb->sb_cc, + return (msleep(&sb->sb_cc, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo)); } @@ -281,9 +424,11 @@ { int error; + SOCKBUF_LOCK_ASSERT(sb); + while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; - error = tsleep(&sb->sb_flags, + error = msleep(&sb->sb_flags, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, "sblock", 0); if (error) @@ -294,29 +439,87 @@ } /* + * The part of sowakeup that must be done while + * holding the sockbuf lock. + */ +static __inline void +sowakeup_under_lock(struct socket *so, struct sockbuf *sb) +{ + SOCKBUF_LOCK_ASSERT(sb); + + selwakeuppri(&sb->sb_sel, PSOCK); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup(&sb->sb_cc); + } +} + +/* * Wakeup processes waiting on a socket buffer. * Do asynchronous notification via SIGIO * if the socket has the SS_ASYNC flag set. + * + * The caller is assumed to hold the necessary + * sockbuf lock. */ void +sowakeup_locked(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + + SOCKBUF_LOCK_ASSERT(sb); + + sowakeup_under_lock(so, sb); + + /* XXXRW: so_state locking? */ + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(&so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); + if (sb->sb_flags & SB_AIO) /* XXX locking */ + aio_swake(so, sb); + KNOTE(&sb->sb_sel.si_note, 0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + * + * The caller does not hold the sockbuf lock. + */ +void sowakeup(so, sb) register struct socket *so; register struct sockbuf *sb; { - selwakeuppri(&sb->sb_sel, PSOCK); - sb->sb_flags &= ~SB_SEL; - if (sb->sb_flags & SB_WAIT) { - sb->sb_flags &= ~SB_WAIT; - wakeup(&sb->sb_cc); - } + SOCKBUF_LOCK(sb); + sowakeup_under_lock(so, sb); + SOCKBUF_UNLOCK(sb); + + /* Unlocked read. */ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); + /* + * XXXRW: Need to hold a lock over so_upcall to prevent it from + * changing while in process? + */ if (sb->sb_flags & SB_UPCALL) (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); - if (sb->sb_flags & SB_AIO) + if (sb->sb_flags & SB_AIO) /* XXX locking */ aio_swake(so, sb); + + /* + * XXXRW: More efficient to do this with the sowakeup_under_lock() + * code above to avoid multiple lock/unlock. Does order relative + * to so_upcall(), et al, matter? + */ + SOCKBUF_LOCK(sb); KNOTE(&sb->sb_sel.si_note, 0); + SOCKBUF_UNLOCK(sb); } /* @@ -356,18 +559,22 @@ register struct socket *so; u_long sndcc, rcvcc; { - struct thread *td = curthread; + struct thread *td = curthread; /* XXX */ if (sbreserve(&so->so_snd, sndcc, so, td) == 0) goto bad; if (sbreserve(&so->so_rcv, rcvcc, so, td) == 0) goto bad2; + SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + SOCKBUF_UNLOCK(&so->so_snd); return (0); bad2: sbrelease(&so->so_snd, so); @@ -412,6 +619,13 @@ /* * td will only be NULL when we're in an interrupt * (e.g. in tcp_input()) + * + * XXXRW: This comment is true, but only because the caller passed + * in NULL, not for the 4.x reason that there is no thread + * available. Need to be careful of callers that do this wrong; + * I suspect many do it wrong, and therefore many socket buffers + * end up with the wrong limits, especially via the soreserve() + * path. */ if (cc > sb_max_adj) return (0); @@ -476,6 +690,8 @@ { struct mbuf *m = sb->sb_mb; + SOCKBUF_LOCK_ASSERT(sb); + while (m && m->m_nextpkt) m = m->m_nextpkt; @@ -495,6 +711,8 @@ struct mbuf *m = sb->sb_mb; struct mbuf *n; + SOCKBUF_LOCK_ASSERT(sb); + while (m && m->m_nextpkt) m = m->m_nextpkt; @@ -517,6 +735,7 @@ #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ + SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ @@ -531,7 +750,7 @@ * discarded and mbufs are compacted where possible. */ void -sbappend(sb, m) +sbappend_locked(sb, m) struct sockbuf *sb; struct mbuf *m; { @@ -539,6 +758,9 @@ if (m == 0) return; + + SOCKBUF_LOCK_ASSERT(sb); + SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { @@ -546,7 +768,7 @@ n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { - sbappendrecord(sb, m); /* XXXXXX!!!! */ + sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); @@ -559,7 +781,7 @@ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { - sbappendrecord(sb, m); /* XXXXXX!!!! */ + sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); @@ -576,13 +798,33 @@ } /* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + if (!SOCKBUF_OWNED(sb)) { + SOCKBUF_LOCK(sb); + sbappend_locked(sb, m); + SOCKBUF_UNLOCK(sb); + } else + sbappend_locked(sb, m); +} + +/* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). */ void -sbappendstream(struct sockbuf *sb, struct mbuf *m) +sbappendstream_locked(struct sockbuf *sb, struct mbuf *m) { + SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); @@ -595,6 +837,22 @@ SBLASTRECORDCHK(sb); } +/* + * This version of sbappend() should only be used when the caller + * absolutely knows that there will never be more than one record + * in the socket buffer, that is, a stream protocol (such as TCP). + */ +void +sbappendstream(struct sockbuf *sb, struct mbuf *m) +{ + if (!SOCKBUF_OWNED(sb)) { + SOCKBUF_LOCK(sb); + sbappendstream_locked(sb, m); + SOCKBUF_UNLOCK(sb); + } else + sbappendstream_locked(sb, m); +} + #ifdef SOCKBUF_DEBUG void sbcheck(sb) @@ -604,6 +862,8 @@ struct mbuf *n = 0; u_long len = 0, mbcnt = 0; + SOCKBUF_LOCK_ASSERT(sb); + for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { @@ -626,12 +886,14 @@ * begins a new record. */ void -sbappendrecord(sb, m0) +sbappendrecord_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; + SOCKBUF_LOCK_ASSERT(sb); + if (m0 == 0) return; m = sb->sb_mb; @@ -659,18 +921,37 @@ } /* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + if (!SOCKBUF_OWNED(sb)) { + SOCKBUF_LOCK(sb); + sbappendrecord_locked(sb, m0); + SOCKBUF_UNLOCK(sb); + } else + sbappendrecord_locked(sb, m0); +} + +/* * As above except that OOB data * is inserted at the beginning of the sockbuf, * but after any other OOB data. */ void -sbinsertoob(sb, m0) +sbinsertoob_locked(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; register struct mbuf **mp; + SOCKBUF_LOCK_ASSERT(sb); + if (m0 == 0) return; for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { @@ -705,13 +986,31 @@ } /* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + if (!SOCKBUF_OWNED(sb)) { + SOCKBUF_LOCK(sb); + sbinsertoob_locked(sb, m0); + SOCKBUF_UNLOCK(sb); + } else + sbinsertoob_locked(sb, m0); +} + +/* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int -sbappendaddr(sb, asa, m0, control) +sbappendaddr_locked(sb, asa, m0, control) struct sockbuf *sb; struct sockaddr *asa; struct mbuf *m0, *control; @@ -719,11 +1018,14 @@ struct mbuf *m, *n, *nlast; int space = asa->sa_len; + SOCKBUF_LOCK_ASSERT(sb); + if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &n); + if (space > sbspace(sb)) return (0); #if MSIZE <= 256 @@ -745,25 +1047,50 @@ sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); + sb->sb_mbtail = nlast; - sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); - SBLASTRECORDCHK(sb); return (1); } +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. + */ int -sbappendcontrol(sb, m0, control) +sbappendaddr(sb, asa, m0, control) + struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + int retval; + + if (!SOCKBUF_OWNED(sb)) { + SOCKBUF_LOCK(sb); + retval = sbappendaddr_locked(sb, asa, m0, control); + SOCKBUF_UNLOCK(sb); + } else + retval = sbappendaddr_locked(sb, asa, m0, control); + return (retval); +} + +int +sbappendcontrol_locked(sb, m0, control) struct sockbuf *sb; struct mbuf *control, *m0; { struct mbuf *m, *n, *mlast; int space; + SOCKBUF_LOCK_ASSERT(sb); + if (control == 0) panic("sbappendcontrol"); space = m_length(control, &n) + m_length(m0, NULL); + if (space > sbspace(sb)) return (0); n->m_next = m0; /* concatenate data to control */ @@ -775,14 +1102,30 @@ sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); + sb->sb_mbtail = mlast; - sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); + SBLASTRECORDCHK(sb); - SBLASTRECORDCHK(sb); return (1); } +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + int retval; + + if (!SOCKBUF_OWNED(sb)) { + SOCKBUF_LOCK(sb); + retval = sbappendcontrol_locked(sb, m0, control); + SOCKBUF_UNLOCK(sb); + } else + retval = sbappendcontrol_locked(sb, m0, control); + return (retval); +} + /* * Compress mbuf chain m into the socket * buffer sb following mbuf n. If n @@ -796,6 +1139,8 @@ register int eor = 0; register struct mbuf *o; + SOCKBUF_LOCK_ASSERT(sb); + while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && @@ -852,6 +1197,8 @@ register struct sockbuf *sb; { + SOCKBUF_LOCK_ASSERT(sb); + if (sb->sb_flags & SB_LOCK) panic("sbflush: locked"); while (sb->sb_mbcnt) { @@ -864,7 +1211,8 @@ sbdrop(sb, (int)sb->sb_cc); } if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) - panic("sbflush: cc %u || mb %p || mbcnt %u", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); + panic("sbflush: cc %u || mb %p || mbcnt %u", + sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); } /* @@ -877,7 +1225,11 @@ { register struct mbuf *m; struct mbuf *next; + int need_lock = !SOCKBUF_OWNED(sb); + if (need_lock) + SOCKBUF_LOCK(sb); + next = (m = sb->sb_mb) ? m->m_nextpkt : 0; while (len > 0) { if (m == 0) { @@ -921,6 +1273,9 @@ } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } + + if (need_lock) + SOCKBUF_UNLOCK(sb); } /* @@ -932,7 +1287,11 @@ register struct sockbuf *sb; { register struct mbuf *m; + int need_lock = !SOCKBUF_OWNED(sb); + if (need_lock) + SOCKBUF_LOCK(sb); + m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; @@ -942,6 +1301,9 @@ } while (m); } SB_EMPTY_FIXUP(sb); + + if (need_lock) + SOCKBUF_UNLOCK(sb); } /* @@ -1076,6 +1438,7 @@ xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; + /* Unlocked read. */ xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; --- //depot/vendor/freebsd/src/sys/kern/uipc_syscalls.c 2004/05/31 19:45:32 +++ //depot/user/rwatson/netperf/sys/kern/uipc_syscalls.c 2004/05/31 20:01:56 @@ -190,7 +190,9 @@ if ((error = fgetsock(td, fd, &so, NULL)) != 0) goto done2; #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_bind(td->td_ucred, so, sa); + SOCK_UNLOCK(so); if (error) goto done1; #endif @@ -223,7 +225,9 @@ NET_LOCK_GIANT(); if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_listen(td->td_ucred, so); + SOCK_UNLOCK(so); if (error) goto done; #endif @@ -255,7 +259,7 @@ struct file *nfp = NULL; struct sockaddr *sa; socklen_t namelen; - int error, s; + int error; struct socket *head, *so; int fd; u_int fflag; @@ -266,84 +270,86 @@ if (uap->name) { error = copyin(uap->anamelen, &namelen, sizeof (namelen)); if(error) - goto done3; - if (namelen < 0) { - error = EINVAL; - goto done3; - } + return (error); + if (namelen < 0) + return (EINVAL); } NET_LOCK_GIANT(); error = fgetsock(td, uap->s, &head, &fflag); if (error) goto done2; - s = splnet(); + /* Unlocked read. */ if ((head->so_options & SO_ACCEPTCONN) == 0) { - splx(s); error = EINVAL; goto done; } + error = falloc(td, &nfp, &fd); + if (error) + goto done; + /* + * Unlocked reads. + * XXXRW: Dubious use of so->so_error. + */ + ACCEPT_LOCK(); + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { + ACCEPT_UNLOCK(); + error = EWOULDBLOCK; + goto done; + } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - if (head->so_state & SS_CANTRCVMORE) { + if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } - if ((head->so_state & SS_NBIO) != 0) { - head->so_error = EWOULDBLOCK; - break; - } - error = tsleep(&head->so_timeo, PSOCK | PCATCH, + error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error) { - splx(s); + ACCEPT_UNLOCK(); goto done; } } if (head->so_error) { error = head->so_error; head->so_error = 0; - splx(s); + ACCEPT_UNLOCK(); goto done; } + so = TAILQ_FIRST(&head->so_comp); + KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); + KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* - * At this point we know that there is at least one connection - * ready to be accepted. Remove it from the queue prior to - * allocating the file descriptor for it since falloc() may - * block allowing another process to accept the connection - * instead. + * XXXRW: Before changing the flags on the socket, we have to bump + * the reference count. Otherwise, if the protocol calls sofree(), + * the socket will be released due to a zero refcount. */ - so = TAILQ_FIRST(&head->so_comp); + SOCK_LOCK(so); + soref(so); /* file descriptor reference */ + SOCK_UNLOCK(so); + TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; + so->so_qstate &= ~SQ_COMP; + so->so_head = NULL; + + ACCEPT_UNLOCK(); - error = falloc(td, &nfp, &fd); - if (error) { - /* - * Probably ran out of file descriptors. Put the - * unaccepted connection back onto the queue and - * do another wakeup so some other process might - * have a chance at it. - */ - TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); - head->so_qlen++; - wakeup_one(&head->so_timeo); - splx(s); - goto done; - } /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; + /* + * XXXRW: Might make life simpler to grab the socket buffer lock + * here so it's held when KNOTE() calls back into the socket + * code. + */ /* connection has been removed from the listen queue */ KNOTE(&head->so_rcv.sb_sel.si_note, 0); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); FILE_LOCK(nfp); - soref(so); /* file descriptor reference */ nfp->f_data = so; /* nfp has ref count from falloc */ nfp->f_flag = fflag; nfp->f_ops = &socketops; @@ -372,7 +378,6 @@ namelen = 0; if (uap->name) goto gotnoname; - splx(s); error = 0; goto done; } @@ -410,7 +415,6 @@ FILEDESC_UNLOCK(fdp); } } - splx(s); /* * Release explicitly held references before returning. @@ -421,7 +425,6 @@ fputsock(head); done2: NET_UNLOCK_GIANT(); -done3: return (error); } @@ -488,25 +491,32 @@ NET_LOCK_GIANT(); if ((error = fgetsock(td, fd, &so, NULL)) != 0) goto done2; + /* XXXRW: so_state locking? */ if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_connect(td->td_ucred, so, sa); + SOCK_UNLOCK(so); if (error) goto bad; #endif error = soconnect(so, sa, td); if (error) goto bad; + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } s = splnet(); + SOCK_LOCK(so); + /* XXXRW: so_state locking? */ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0); + error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, + "connec", 0); if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; @@ -517,8 +527,10 @@ error = so->so_error; so->so_error = 0; } + SOCK_UNLOCK(so); splx(s); bad: + /* XXXRW: so_state locking? */ if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) @@ -712,7 +724,9 @@ goto bad2; #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); + SOCK_UNLOCK(so); if (error) goto bad; #endif @@ -955,7 +969,9 @@ } #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_receive(td->td_ucred, so); + SOCK_UNLOCK(so); if (error) { fputsock(so); NET_UNLOCK_GIANT(); @@ -1509,6 +1525,7 @@ NET_LOCK_GIANT(); if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) goto done2; + /* XXXRW: so_state locking? */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done1; @@ -1743,6 +1760,7 @@ error = EINVAL; goto done; } + /* XXXRW: so_state locking? */ if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto done; @@ -1753,7 +1771,9 @@ } #ifdef MAC + SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); + SOCK_UNLOCK(so); if (error) goto done; #endif @@ -1792,7 +1812,9 @@ /* * Protect against multiple writers to the socket. */ + SOCKBUF_LOCK(&so->so_snd); (void) sblock(&so->so_snd, M_WAITOK); + SOCKBUF_UNLOCK(&so->so_snd); /* * Loop through the pages in the file, starting with the requested @@ -1832,14 +1854,18 @@ * Optimize the non-blocking case by looking at the socket space * before going to the extra work of constituting the sf_buf. */ + SOCKBUF_LOCK(&so->so_snd); + /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { - if (so->so_state & SS_CANTSENDMORE) + if (so->so_snd.sb_state & SBS_CANTSENDMORE) error = EPIPE; else error = EAGAIN; sbunlock(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); goto done; } + SOCKBUF_UNLOCK(&so->so_snd); VM_OBJECT_LOCK(obj); /* * Attempt to look up the page. @@ -1895,6 +1921,7 @@ * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; + mtx_lock(&Giant); /* VFS */ vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); /* * XXXMAC: Because we don't have fp->f_cred here, @@ -1906,6 +1933,7 @@ IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); VOP_UNLOCK(vp, 0, td); + mtx_unlock(&Giant); /* VFS */ if (error) VM_OBJECT_LOCK(obj); vm_page_lock_queues(); @@ -1927,7 +1955,9 @@ } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); + SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); goto done; } vm_page_unlock_queues(); @@ -1943,7 +1973,9 @@ if (pg->wire_count == 0 && pg->object == NULL) vm_page_free(pg); vm_page_unlock_queues(); + SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); error = EINTR; goto done; } @@ -1980,6 +2012,7 @@ * Add the buffer to the socket buffer chain. */ s = splnet(); + SOCKBUF_LOCK(&so->so_snd); retry_space: /* * Make sure that the socket is still able to take more data. @@ -1992,8 +2025,9 @@ * blocks before the pru_send (or more accurately, any blocking * results in a loop back to here to re-check). */ - if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { - if (so->so_state & SS_CANTSENDMORE) { + SOCKBUF_LOCK_ASSERT(&so->so_snd); + if ((so->so_snd.sb_state & SBS_CANTSENDMORE) || so->so_error) { + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; } else { error = so->so_error; @@ -2001,6 +2035,7 @@ } m_freem(m); sbunlock(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); splx(s); goto done; } @@ -2009,10 +2044,12 @@ * after checking the connection state above in order to avoid * a race condition with sbwait(). */ + /* XXXRW: so_state locking? */ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { m_freem(m); sbunlock(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); splx(s); error = EAGAIN; goto done; @@ -2031,15 +2068,20 @@ } goto retry_space; } + SOCKBUF_UNLOCK(&so->so_snd); error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td); splx(s); if (error) { + SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); goto done; } headersent = 1; } + SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); /* * Send trailers. Wimp out and use writev(2). --- //depot/vendor/freebsd/src/sys/kern/uipc_usrreq.c 2004/04/05 14:06:48 +++ //depot/user/rwatson/netperf/sys/kern/uipc_usrreq.c 2004/05/30 11:22:04 @@ -81,6 +81,39 @@ static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; static ino_t unp_ino; /* prototype for fake inode numbers */ +static struct mtx unp_mtx; +#define UNP_HEAD_LOCK_INIT() \ + mtx_init(&unp_mtx, "unp head", NULL, MTX_DEF) +#define UNP_HEAD_LOCK() mtx_lock(&unp_mtx) +#define UNP_HEAD_UNLOCK() mtx_unlock(&unp_mtx) +#define UNP_HEAD_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED) + +/* NB: DUPOK is to cover the connect2 case XXX */ +#define UNP_LOCK_INIT(_unp) \ + mtx_init(&(_unp)->unp_mtx, "unp", NULL, MTX_DEF | MTX_DUPOK) +#define UNP_LOCK_DESTROY(_unp) mtx_destroy(&(_unp)->unp_mtx) +#define UNP_LOCK(_unp) mtx_lock(&(_unp)->unp_mtx) +#define UNP_UNLOCK(_unp) mtx_unlock(&(_unp)->unp_mtx) +#define UNP_LOCK_ASSERT(_unp) mtx_assert(&(_unp)->unp_mtx, MA_OWNED) + +/* + * A unp lock is always preceded by locking the head. + * Since this occurs often we define convenienc macros + * for entry, exist, and validation in lower-level routines. + */ +#define UNP_ENTER(_unp) do { \ + UNP_HEAD_LOCK(); \ + UNP_LOCK(_unp); \ +} while (0) +#define UNP_EXIT(_unp) do { \ + UNP_UNLOCK(_unp); \ + UNP_HEAD_UNLOCK(); \ +} while (0) +#define UNP_ASSERT(_unp) do { \ + UNP_HEAD_LOCK_ASSERT(); \ + UNP_LOCK_ASSERT(_unp); \ +} while (0) + static int unp_attach(struct socket *); static void unp_detach(struct unpcb *); static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *); @@ -104,8 +137,10 @@ if (unp == NULL) return (EINVAL); + UNP_ENTER(unp); unp_drop(unp, ECONNABORTED); - unp_detach(unp); + unp_detach(unp); /* NB: unlocks unp + head */ + SOCK_LOCK(so); sotryfree(so); return (0); } @@ -114,6 +149,7 @@ uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); + struct sockaddr *sa; if (unp == NULL) return (EINVAL); @@ -123,13 +159,14 @@ * if it was bound and we are still connected * (our peer may have closed already!). */ - if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) { - *nam = sodupsockaddr( - (struct sockaddr *)unp->unp_conn->unp_addr, M_WAITOK); - } else { - *nam = sodupsockaddr((struct sockaddr *)&sun_noname, - M_WAITOK); - } + *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); + UNP_ENTER(unp); + if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) + sa = (struct sockaddr *) unp->unp_conn->unp_addr; + else + sa = &sun_noname; + bcopy(sa, *nam, sa->sa_len); + UNP_EXIT(unp); return (0); } @@ -150,7 +187,6 @@ if (unp == NULL) return (EINVAL); - return (unp_bind(unp, nam, td)); } @@ -158,21 +194,30 @@ uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct unpcb *unp = sotounpcb(so); + int retval; - if (unp == NULL) - return (EINVAL); - return (unp_connect(so, nam, curthread)); + if (unp != NULL) { + UNP_ENTER(unp); + retval = unp_connect(so, nam, curthread); + UNP_EXIT(unp); + } else + retval = EINVAL; + return (retval); } int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp = sotounpcb(so1); + int retval; - if (unp == NULL) - return (EINVAL); - - return (unp_connect2(so1, so2)); + if (unp != NULL) { + UNP_ENTER(unp); + retval = unp_connect2(so1, so2); + UNP_EXIT(unp); + } else + retval = EINVAL; + return (retval); } /* control is EOPNOTSUPP */ @@ -184,8 +229,8 @@ if (unp == NULL) return (EINVAL); - - unp_detach(unp); + UNP_ENTER(unp); + unp_detach(unp); /* NB: unlocks unp + head */ return (0); } @@ -194,41 +239,52 @@ { struct unpcb *unp = sotounpcb(so); - if (unp == NULL) + if (unp != NULL) { + UNP_ENTER(unp); + unp_disconnect(unp); + UNP_EXIT(unp); + return (0); + } else return (EINVAL); - unp_disconnect(unp); - return (0); } static int uipc_listen(struct socket *so, struct thread *td) { struct unpcb *unp = sotounpcb(so); + int retval; - if (unp == NULL || unp->unp_vnode == NULL) + if (unp != NULL && unp->unp_vnode != NULL) { + UNP_ENTER(unp); + retval = unp_listen(unp, td); + UNP_EXIT(unp); + } else return (EINVAL); - return (unp_listen(unp, td)); + return (retval); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); + struct sockaddr *sa; if (unp == NULL) return (EINVAL); - if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) - *nam = sodupsockaddr( - (struct sockaddr *)unp->unp_conn->unp_addr, M_WAITOK); + *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); + UNP_ENTER(unp); + if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) + sa = (struct sockaddr *) unp->unp_conn->unp_addr; else { /* * XXX: It seems that this test always fails even when * connection is established. So, this else clause is * added as workaround to return PF_LOCAL sockaddr. */ - *nam = sodupsockaddr((struct sockaddr *)&sun_noname, - M_WAITOK); + sa = &sun_noname; } + bcopy(sa, *nam, sa->sa_len); + UNP_EXIT(unp); return (0); } @@ -241,15 +297,27 @@ if (unp == NULL) return (EINVAL); + /* + * Reorder locks to avoid LORs. Note that we + * delay re-locking so_rcv to below so it can + * be done only once. + */ + SOCKBUF_UNLOCK(&so->so_rcv); + UNP_ENTER(unp); switch (so->so_type) { case SOCK_DGRAM: panic("uipc_rcvd DGRAM?"); /*NOTREACHED*/ case SOCK_STREAM: - if (unp->unp_conn == NULL) + if (unp->unp_conn == NULL) { + SOCKBUF_LOCK(&so->so_rcv); break; + } so2 = unp->unp_conn->unp_socket; + /* NB: careful of order here */ + SOCKBUF_LOCK(&so2->so_snd); + SOCKBUF_LOCK(&so->so_rcv); /* * Adjust backpressure on sender * and wakeup any waiting to write. @@ -261,12 +329,14 @@ (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_cc = so->so_rcv.sb_cc; - sowwakeup(so2); + sowwakeup_locked(so2); + SOCKBUF_UNLOCK(&so2->so_snd); break; default: panic("uipc_rcvd unknown socktype"); } + UNP_EXIT(unp); return (0); } @@ -293,6 +363,11 @@ if (control != NULL && (error = unp_internalize(&control, td))) goto release; + /* + * Reorder locks to avoid LORs. + */ + UNP_ENTER(unp); + switch (so->so_type) { case SOCK_DGRAM: { @@ -317,13 +392,14 @@ from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; - if (sbappendaddr(&so2->so_rcv, from, m, control)) { - sorwakeup(so2); + SOCKBUF_LOCK(&so2->so_rcv); + if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { + sorwakeup_locked(so2); m = NULL; control = NULL; - } else { + } else error = ENOBUFS; - } + SOCKBUF_UNLOCK(&so2->so_rcv); if (nam != NULL) unp_disconnect(unp); break; @@ -335,6 +411,7 @@ * Note: A better implementation would complain * if not equal to the peer's address. */ + /* Unlocked read. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); @@ -346,24 +423,25 @@ } } - if (so->so_state & SS_CANTSENDMORE) { + /* Unlocked read. */ + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; break; } if (unp->unp_conn == NULL) panic("uipc_send connected but no connection?"); so2 = unp->unp_conn->unp_socket; + SOCKBUF_LOCK(&so2->so_rcv); /* * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. */ if (control != NULL) { - if (sbappendcontrol(&so2->so_rcv, m, control)) + if (sbappendcontrol_locked(&so2->so_rcv, m, control)) control = NULL; - } else { - sbappend(&so2->so_rcv, m); - } + } else + sbappend_locked(&so2->so_rcv, m); so->so_snd.sb_mbmax -= so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt; unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt; @@ -372,7 +450,8 @@ (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_conn->unp_cc = so2->so_rcv.sb_cc; - sorwakeup(so2); + sorwakeup_locked(so2); + SOCKBUF_UNLOCK(&so2->so_rcv); m = NULL; break; @@ -388,10 +467,10 @@ socantsendmore(so); unp_shutdown(unp); } + UNP_EXIT(unp); if (control != NULL && error != 0) - unp_dispose(control); - + unp_dispose(control); /* XXX need head lock? */ release: if (control != NULL) m_freem(control); @@ -408,15 +487,18 @@ if (unp == NULL) return (EINVAL); + UNP_ENTER(unp); sb->st_blksize = so->so_snd.sb_hiwat; if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { so2 = unp->unp_conn->unp_socket; + /* Unlocked read. */ sb->st_blksize += so2->so_rcv.sb_cc; } sb->st_dev = NOUDEV; if (unp->unp_ino == 0) unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; sb->st_ino = unp->unp_ino; + UNP_EXIT(unp); return (0); } @@ -427,8 +509,11 @@ if (unp == NULL) return (EINVAL); + UNP_ENTER(unp); + /* XXX socket lock? */ socantsendmore(so); unp_shutdown(unp); + UNP_EXIT(unp); return (0); } @@ -436,15 +521,18 @@ uipc_sockaddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); + struct sockaddr *sa; if (unp == NULL) return (EINVAL); + *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); + UNP_ENTER(unp); if (unp->unp_addr != NULL) - *nam = sodupsockaddr((struct sockaddr *)unp->unp_addr, - M_WAITOK); + sa = (struct sockaddr *) unp->unp_addr; else - *nam = sodupsockaddr((struct sockaddr *)&sun_noname, - M_WAITOK); + sa = &sun_noname; + bcopy(sa, *nam, sa->sa_len); + UNP_EXIT(unp); return (0); } @@ -466,6 +554,7 @@ switch (sopt->sopt_dir) { case SOPT_GET: + UNP_ENTER(unp); switch (sopt->sopt_name) { case LOCAL_PEERCRED: if (unp->unp_flags & UNP_HAVEPC) @@ -482,6 +571,7 @@ error = EOPNOTSUPP; break; } + UNP_EXIT(unp); break; case SOPT_SET: default: @@ -554,8 +644,13 @@ unp_count++; LIST_INIT(&unp->unp_refs); unp->unp_socket = so; + UNP_LOCK_INIT(unp); + + UNP_HEAD_LOCK(); LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, unp, unp_link); + UNP_HEAD_UNLOCK(); + so->so_pcb = unp; return (0); } @@ -564,18 +659,25 @@ unp_detach(unp) register struct unpcb *unp; { + struct vnode *vp; + + UNP_ASSERT(unp); + LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; - if (unp->unp_vnode != NULL) { + if ((vp = unp->unp_vnode) != NULL) { unp->unp_vnode->v_socket = NULL; - vrele(unp->unp_vnode); unp->unp_vnode = NULL; } if (unp->unp_conn != NULL) unp_disconnect(unp); - while (!LIST_EMPTY(&unp->unp_refs)) - unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET); + while (!LIST_EMPTY(&unp->unp_refs)) { + struct unpcb *ref = LIST_FIRST(&unp->unp_refs); + UNP_LOCK(ref); + unp_drop(ref, ECONNRESET); + UNP_UNLOCK(ref); + } soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = NULL; if (unp_rights) { @@ -591,7 +693,11 @@ } if (unp->unp_addr != NULL) FREE(unp->unp_addr, M_SONAME); + UNP_LOCK_DESTROY(unp); + UNP_HEAD_UNLOCK(); uma_zfree(unp_zone, unp); + if (vp) + vrele(vp); } static int @@ -618,15 +724,14 @@ buf = malloc(namelen + 1, M_TEMP, M_WAITOK); strlcpy(buf, soun->sun_path, namelen + 1); + mtx_lock(&Giant); restart: NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, buf, td); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); - if (error) { - free(buf, M_TEMP); - return (error); - } + if (error) + goto done; vp = nd.ni_vp; if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); @@ -636,14 +741,12 @@ vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); - free(buf, M_TEMP); - return (EADDRINUSE); + error = EADDRINUSE; + goto done; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); - if (error) { - free(buf, M_TEMP); - return (error); - } + if (error) + goto done; goto restart; } VATTR_NULL(&vattr); @@ -659,18 +762,21 @@ } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); - if (error) { - free(buf, M_TEMP); - return (error); - } + if (error) + goto done; vp = nd.ni_vp; + soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); + UNP_ENTER(unp); vp->v_socket = unp->unp_socket; unp->unp_vnode = vp; - unp->unp_addr = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); + unp->unp_addr = soun; + UNP_EXIT(unp); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); +done: + mtx_unlock(&Giant); free(buf, M_TEMP); - return (0); + return (error); } static int @@ -682,10 +788,14 @@ register struct sockaddr_un *soun = (struct sockaddr_un *)nam; register struct vnode *vp; register struct socket *so2, *so3; - struct unpcb *unp, *unp2, *unp3; + struct unpcb *unp = sotounpcb(so); + struct unpcb *unp2, *unp3; int error, len; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; + struct sockaddr *sa; + + UNP_ASSERT(unp); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) @@ -693,9 +803,17 @@ strlcpy(buf, soun->sun_path, len + 1); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); + /* drop locks across namei */ + UNP_EXIT(unp); + mtx_lock(&Giant); error = namei(&nd); + if (!error) + sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); + else + sa = NULL; + UNP_ENTER(unp); if (error) - return (error); + goto bad2; vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VSOCK) { @@ -715,19 +833,30 @@ goto bad; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { - if ((so2->so_options & SO_ACCEPTCONN) == 0 || - (so3 = sonewconn(so2, 0)) == NULL) { + if (so2->so_options & SO_ACCEPTCONN) { + /* + * NB: drop locks here so unp_attach is entered + * w/o locks; this avoids a recursive lock + * of the head and holding sleep locks across + * a (potentially) blocking malloc. + */ + UNP_EXIT(unp); + so3 = sonewconn(so2, 0); + UNP_ENTER(unp); + } else + so3 = NULL; + if (so3 == NULL) { error = ECONNREFUSED; goto bad; } unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); - if (unp2->unp_addr != NULL) - unp3->unp_addr = (struct sockaddr_un *) - sodupsockaddr((struct sockaddr *)unp2->unp_addr, - M_WAITOK); - + if (unp2->unp_addr != NULL) { + bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); + unp3->unp_addr = (struct sockaddr_un *) sa; + sa = NULL; + } /* * unp_peercred management: * @@ -750,8 +879,10 @@ sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; #ifdef MAC + SOCK_LOCK(so); mac_set_socket_peer_from_socket(so, so3); mac_set_socket_peer_from_socket(so3, so); + SOCK_UNLOCK(so); #endif so2 = so3; @@ -759,6 +890,10 @@ error = unp_connect2(so, so2); bad: vput(vp); +bad2: + if (sa) + free(sa, M_SONAME); + mtx_unlock(&Giant); return (error); } @@ -770,9 +905,12 @@ register struct unpcb *unp = sotounpcb(so); register struct unpcb *unp2; + UNP_ASSERT(unp); + if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); + UNP_LOCK(unp2); unp->unp_conn = unp2; switch (so->so_type) { @@ -790,6 +928,7 @@ default: panic("unp_connect2"); } + UNP_UNLOCK(unp2); return (0); } @@ -799,6 +938,8 @@ { register struct unpcb *unp2 = unp->unp_conn; + UNP_ASSERT(unp); + if (unp2 == NULL) return; unp->unp_conn = NULL; @@ -806,15 +947,19 @@ case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); + /* XXXRW: so_state locking? */ unp->unp_socket->so_state &= ~SS_ISCONNECTED; break; case SOCK_STREAM: soisdisconnected(unp->unp_socket); + UNP_LOCK(unp2); unp2->unp_conn = NULL; soisdisconnected(unp2->unp_socket); + UNP_UNLOCK(unp2); break; } + return; } #ifdef notdef @@ -857,8 +1002,10 @@ * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); + UNP_HEAD_LOCK(); gencnt = unp_gencnt; n = unp_count; + UNP_HEAD_UNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; @@ -872,6 +1019,7 @@ unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); + UNP_HEAD_LOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { if (unp->unp_gencnt <= gencnt) { @@ -881,6 +1029,7 @@ unp_list[i++] = unp; } } + UNP_HEAD_UNLOCK(); n = i; /* in case we lost some during malloc */ error = 0; @@ -939,6 +1088,8 @@ { struct socket *so; + UNP_ASSERT(unp); + if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && (so = unp->unp_conn->unp_socket)) socantrcvmore(so); @@ -951,6 +1102,8 @@ { struct socket *so = unp->unp_socket; + UNP_ASSERT(unp); + so->so_error = errno; unp_disconnect(unp); } @@ -1102,6 +1255,8 @@ uma_zone_set_max(unp_zone, nmbclusters); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); + + UNP_HEAD_LOCK_INIT(); } static int @@ -1258,6 +1413,8 @@ struct file **extra_ref, **fpp; int nunref, i; + UNP_HEAD_LOCK_ASSERT(); /* NB: this serializes entry */ + if (unp_gcing) return; unp_gcing = 1; @@ -1348,7 +1505,9 @@ * message buffers. Follow those links and mark them * as accessible too. */ + SOCKBUF_LOCK(&so->so_rcv); unp_scan(so->so_rcv.sb_mb, unp_mark); + SOCKBUF_UNLOCK(&so->so_rcv); } } while (unp_defer); sx_sunlock(&filelist_lock); @@ -1452,6 +1611,7 @@ struct unpcb *unp; struct thread *td; { + UNP_ASSERT(unp); cru2x(td->td_ucred, &unp->unp_peercred); unp->unp_flags |= UNP_HAVEPCCACHED; --- //depot/vendor/freebsd/src/sys/kern/vfs_aio.c 2004/05/30 13:35:40 +++ //depot/user/rwatson/netperf/sys/kern/vfs_aio.c 2004/05/30 22:13:02 @@ -566,8 +566,12 @@ so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); if (TAILQ_EMPTY(&so->so_aiojobq)) { + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_AIO; + SOCKBUF_UNLOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_AIO; + SOCKBUF_UNLOCK(&so->so_rcv); } } TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); @@ -1231,10 +1235,14 @@ if (sb == &so->so_snd) { opcode = LIO_WRITE; + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_AIO; + SOCKBUF_UNLOCK(&so->so_snd); } else { opcode = LIO_READ; + SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_AIO; + SOCKBUF_UNLOCK(&so->so_rcv); } for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { @@ -1443,10 +1451,15 @@ LIO_WRITE) && (!sowriteable(so)))) { TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); - if (opcode == LIO_READ) + if (opcode == LIO_READ) { + SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_AIO; - else + SOCKBUF_UNLOCK(&so->so_rcv); + } else { + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_AIO; + SOCKBUF_UNLOCK(&so->so_snd); + } aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ ki->kaio_queue_count++; num_queue_count++; --- //depot/vendor/freebsd/src/sys/net/bpf.c 2004/04/07 13:52:05 +++ //depot/user/rwatson/netperf/sys/net/bpf.c 2004/04/07 20:11:34 @@ -553,7 +553,7 @@ struct ifnet *ifp; struct mbuf *m; int error; - static struct sockaddr dst; + struct sockaddr dst; int datlen; if (d->bd_bif == 0) @@ -564,6 +564,7 @@ if (uio->uio_resid == 0) return (0); + bzero(&dst, sizeof(dst)); error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, &m, &dst, &datlen); if (error) return (error); @@ -579,12 +580,10 @@ mac_create_mbuf_from_bpfdesc(d, m); BPFD_UNLOCK(d); #endif - mtx_lock(&Giant); + /* NB: the driver frees the mbuf */ + NET_LOCK_GIANT(); error = (*ifp->if_output)(ifp, m, &dst, (struct rtentry *)0); - mtx_unlock(&Giant); - /* - * The driver frees the mbuf. - */ + NET_UNLOCK_GIANT(); return (error); } --- //depot/vendor/freebsd/src/sys/net/if.c 2004/04/24 15:25:43 +++ //depot/user/rwatson/netperf/sys/net/if.c 2004/05/03 19:32:28 @@ -367,6 +367,10 @@ struct sockaddr_dl *sdl; struct ifaddr *ifa; + /* + * XXXRW: Shouldn't we add to the global list only once the ifnet + * is ready for use? + */ IF_AFDATA_LOCK_INIT(ifp); ifp->if_afdata_initialized = 0; IFNET_WLOCK(); @@ -657,6 +661,7 @@ /* * Create a clone network interface. + * XXXRW: Locking? */ int if_clone_create(char *name, int len) @@ -729,6 +734,7 @@ /* * Destroy a clone network interface. + * XXXRW: Locking? */ int if_clone_destroy(const char *name) @@ -766,6 +772,7 @@ /* * Look up a network interface cloner. + * XXXRW: Locking? */ static struct if_clone * if_clone_lookup(const char *name, int *unitp) @@ -807,6 +814,7 @@ /* * Register a network interface cloner. + * XXXRW: Locking? */ void if_clone_attach(struct if_clone *ifc) @@ -849,6 +857,7 @@ /* * Unregister a network interface cloner. + * XXXRW: Locking? */ void if_clone_detach(struct if_clone *ifc) @@ -861,6 +870,7 @@ /* * Provide list of interface cloners to userspace. + * XXXRW: Locking? */ static int if_clone_list(struct if_clonereq *ifcr) --- //depot/vendor/freebsd/src/sys/net/if_gif.c 2004/05/30 13:25:31 +++ //depot/user/rwatson/netperf/sys/net/if_gif.c 2004/05/30 18:41:33 @@ -88,6 +88,10 @@ * gif_mtx protects the global gif_softc_list. * XXX: Per-softc locking is still required. */ +/* + * XXXRW: Note that gif_mtx only protects global gif-related data, not + * per-softc data. See also netinet/in_gif.c for locking needs. + */ static struct mtx gif_mtx; static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static LIST_HEAD(, gif_softc) gif_softc_list; @@ -500,6 +504,9 @@ } /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */ +/* + * XXXRW: per-gif softc locking required. + */ int gif_ioctl(ifp, cmd, data) struct ifnet *ifp; @@ -756,8 +763,9 @@ int s; int error = 0; - s = splnet(); - + /* + * XXXRW: per-gif softc locking required. + */ mtx_lock(&gif_mtx); LIST_FOREACH(sc2, &gif_softc_list, gif_list) { if (sc2 == sc) @@ -786,6 +794,9 @@ } mtx_unlock(&gif_mtx); + /* + * XXXRW: Lock gif softc fields. + */ /* XXX we can detach from both, but be polite just in case */ if (sc->gif_psrc) switch (sc->gif_psrc->sa_family) { --- //depot/vendor/freebsd/src/sys/net/if_gre.c 2004/05/30 13:25:31 +++ //depot/user/rwatson/netperf/sys/net/if_gre.c 2004/05/30 18:41:33 @@ -94,7 +94,8 @@ /* * gre_mtx protects all global variables in if_gre.c. - * XXX: gre_softc data not protected yet. + * + * XXXRW: It does not protect softc-specific data. */ struct mtx gre_mtx; static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation"); --- //depot/vendor/freebsd/src/sys/net/if_gre.h 2004/03/22 08:06:54 +++ //depot/user/rwatson/netperf/sys/net/if_gre.h 2004/03/22 08:18:59 @@ -54,6 +54,12 @@ WCCP_V2 } wccp_ver_t; +/* + * XXXRW: softc fields need locking. + * + * XXXRW: gre's notion of a 'called' count is not MP-safe, as it assumes + * only one packet can be processed at a time. + */ struct gre_softc { struct ifnet sc_if; LIST_ENTRY(gre_softc) sc_list; --- //depot/vendor/freebsd/src/sys/net/if_sl.c 2004/04/07 13:52:05 +++ //depot/user/rwatson/netperf/sys/net/if_sl.c 2004/04/07 20:11:34 @@ -162,6 +162,7 @@ #define ABT_WINDOW (ABT_COUNT*2+2) /* in seconds - time to count */ static LIST_HEAD(sl_list, sl_softc) sl_list; +static struct mtx slip_mtx; #define FRAME_END 0xc0 /* Frame End */ #define FRAME_ESCAPE 0xdb /* Frame Esc */ @@ -197,9 +198,10 @@ sl_modevent(module_t mod, int type, void *data) { switch (type) { - case MOD_LOAD: + case MOD_LOAD: + mtx_init(&slip_mtx, "slip_mtx", NULL, MTX_DEF); + LIST_INIT(&sl_list); linesw[SLIPDISC] = slipdisc; - LIST_INIT(&sl_list); break; case MOD_UNLOAD: printf("if_sl module unload - not possible for this module type\n"); @@ -216,6 +218,7 @@ DECLARE_MODULE(if_sl, sl_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +/* Locked using slip_mtx. */ static int *st_unit_list; static size_t st_unit_max = 0; @@ -224,6 +227,7 @@ { struct sl_softc *nc; + mtx_assert(&slip_mtx, MA_OWNED); LIST_FOREACH(nc, &sl_list, sl_next) { if (nc->sc_if.if_dunit == unit) return (0); @@ -237,6 +241,7 @@ { size_t i; + mtx_assert(&slip_mtx, MA_OWNED); for (i = 0; i < st_unit_max; i++) if (st_unit_list[i] == unit) return 1; @@ -249,6 +254,7 @@ { int *t; + mtx_assert(&slip_mtx, MA_OWNED); if (slisstatic(unit)) return; @@ -311,10 +317,12 @@ sc->sc_if.if_linkmib = sc; sc->sc_if.if_linkmiblen = sizeof *sc; mtx_init(&sc->sc_fastq.ifq_mtx, "sl_fastq", NULL, MTX_DEF); + mtx_init(&sc->sc_mtx, "slip sc_mtx", NULL, MTX_DEF); /* * Find a suitable unit number. */ + mtx_lock(&slip_mtx); for (unit=0; ; unit++) { if (slisstatic(unit)) continue; @@ -324,6 +332,7 @@ } if_initname(&sc->sc_if, "sl", unit); LIST_INSERT_HEAD(&sl_list, sc, sl_next); + mtx_unlock(&slip_mtx); if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_SLIP, SLIP_HDRLEN); @@ -386,10 +395,20 @@ static void sldestroy(struct sl_softc *sc) { + + /* + * XXXRW: Slight race here: we may detach bpf/if before we + * attach. This appears to be a property of the unit selection + * process, which might be better handled by the interface + * cloning subsystem? + */ bpfdetach(&sc->sc_if); if_detach(&sc->sc_if); + mtx_lock(&slip_mtx); LIST_REMOVE(sc, sl_next); + mtx_unlock(&slip_mtx); m_free(sc->sc_mbuf); + mtx_destroy(&sc->sc_mtx); mtx_destroy(&sc->sc_fastq.ifq_mtx); if (sc->bpfbuf) free(sc->bpfbuf, M_SL); @@ -419,6 +438,10 @@ tp->t_line = 0; sc = (struct sl_softc *)tp->t_sc; if (sc != NULL) { + /* + * XXXRW: tear-down race between timeout and slclose()? + */ + mtx_lock(&sc->sc_mtx); if (sc->sc_outfill) { sc->sc_outfill = 0; untimeout(sl_outfill, sc, sc->sc_ofhandle); @@ -427,6 +450,7 @@ sc->sc_keepalive = 0; untimeout(sl_keepalive, sc, sc->sc_kahandle); } + mtx_unlock(&sc->sc_mtx); if_down(&sc->sc_if); sc->sc_ttyp = NULL; tp->t_sc = NULL; @@ -464,12 +488,21 @@ splx(s); return (ENXIO); } + /* + * XXXRW: we hold the mutex over all of this to protect + * the unit change and global list consistency. However, + * some of these functions probably sleep, making this + * wrong. If we have to support renumbering, we probably + * need a way to reserve both numbers to prevent them + * from being reused during the change, or a way to sleep + * waiting for a change to end (i.e., a CV). + */ + mtx_lock(&slip_mtx); if (sc->sc_if.if_dunit != unit) { if (!slisunitfree(unit)) { - splx(s); + mtx_unlock(&slip_mtx); return (ENXIO); } - wasup = sc->sc_if.if_flags & IFF_UP; bpfdetach(&sc->sc_if); if_detach(&sc->sc_if); @@ -487,9 +520,11 @@ SLIP_HIWAT + 2 * sc->sc_if.if_mtu + 1); } slmarkstatic(unit); + mtx_unlock(&slip_mtx); break; case SLIOCSKEEPAL: + mtx_lock(&sc->sc_mtx); sc->sc_keepalive = *(u_int *)data * hz; if (sc->sc_keepalive) { sc->sc_flags |= SC_KEEPALIVE; @@ -501,6 +536,7 @@ sc->sc_flags &= ~SC_KEEPALIVE; } } + mtx_unlock(&sc->sc_mtx); break; case SLIOCGKEEPAL: @@ -508,6 +544,7 @@ break; case SLIOCSOUTFILL: + mtx_lock(&sc->sc_mtx); sc->sc_outfill = *(u_int *)data * hz; if (sc->sc_outfill) { sc->sc_flags |= SC_OUTWAIT; @@ -519,9 +556,11 @@ sc->sc_flags &= ~SC_OUTWAIT; } } + mtx_unlock(&sc->sc_mtx); break; case SLIOCGOUTFILL: + /* Unlocked read. */ *(int *)data = sc->sc_outfill / hz; break; @@ -614,8 +653,11 @@ (*tp->t_oproc)(tp); if (tp->t_outq.c_cc != 0) { - if (sc != NULL) + if (sc != NULL) { + mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~SC_OUTWAIT; + mtx_unlock(&sc->sc_mtx); + } if (tp->t_outq.c_cc > SLIP_HIWAT) return 0; } @@ -645,6 +687,7 @@ * queueing, and the connection id compression will get * munged when this happens. */ + mtx_lock(&sc->sc_mtx); if (sc->sc_if.if_bpf) { /* * We need to save the TCP/IP header before it's @@ -675,9 +718,10 @@ } ip = mtod(m, struct ip *); if (ip->ip_v == IPVERSION && ip->ip_p == IPPROTO_TCP) { - if (sc->sc_if.if_flags & SC_COMPRESS) + if (sc->sc_if.if_flags & SC_COMPRESS) { *mtod(m, u_char *) |= sl_compress_tcp(m, ip, &sc->sc_comp, 1); + } } if (sc->sc_if.if_bpf && sc->bpfbuf) { /* @@ -689,6 +733,7 @@ bcopy(mtod(m, caddr_t), &sc->bpfbuf[SLX_CHDR], CHDR_LEN); BPF_TAP(&sc->sc_if, sc->bpfbuf, len + SLIP_HDRLEN); } + mtx_unlock(&sc->sc_mtx); /* * If system is getting low on clists, just flush our @@ -704,7 +749,9 @@ continue; } + mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~SC_OUTWAIT; + mtx_unlock(&sc->sc_mtx); /* * The extra FRAME_END will start up a new packet, and thus * will flush any accumulated garbage. We do this whenever @@ -794,6 +841,8 @@ { struct mbuf *m, *newm; + mtx_assert(&sc->sc_mtx, MA_OWNED); + MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) return (NULL); @@ -849,13 +898,16 @@ if (sc == NULL) return 0; if (c & TTY_ERRORMASK || (tp->t_state & TS_CONNECTED) == 0) { + mtx_lock(&sc->sc_mtx); sc->sc_flags |= SC_ERROR; + mtx_unlock(&sc->sc_mtx); return 0; } c &= TTY_CHARMASK; ++sc->sc_if.if_ibytes; + mtx_lock(&sc->sc_mtx); if (sc->sc_if.if_flags & IFF_DEBUG) { if (c == ABT_ESC) { /* @@ -874,6 +926,7 @@ sc->sc_starttime = time_second; if (sc->sc_abortcount >= ABT_COUNT) { slclose(tp,0); + mtx_unlock(&sc->sc_mtx); return 0; } } @@ -896,6 +949,7 @@ case FRAME_ESCAPE: sc->sc_escape = 1; + mtx_unlock(&sc->sc_mtx); return 0; case FRAME_END: @@ -980,6 +1034,7 @@ if (sc->sc_mp < sc->sc_ep) { *sc->sc_mp++ = c; sc->sc_escape = 0; + mtx_unlock(&sc->sc_mtx); return 0; } @@ -991,6 +1046,7 @@ newpack: sc->sc_mp = sc->sc_buf = sc->sc_ep - SLRMAX; sc->sc_escape = 0; + mtx_unlock(&sc->sc_mtx); return 0; } @@ -1074,6 +1130,7 @@ { struct sl_softc *sc = chan; + mtx_lock(&sc->sc_mtx); if (sc->sc_keepalive) { if (sc->sc_flags & SC_KEEPALIVE) { if (sc->sc_ttyp->t_pgrp != NULL) { @@ -1087,6 +1144,7 @@ } else { sc->sc_flags &= ~SC_KEEPALIVE; } + mtx_unlock(&sc->sc_mtx); } static void @@ -1097,6 +1155,7 @@ register struct tty *tp = sc->sc_ttyp; int s; + mtx_lock(&sc->sc_mtx); if (sc->sc_outfill && tp != NULL) { if (sc->sc_flags & SC_OUTWAIT) { s = splimp (); @@ -1110,4 +1169,5 @@ } else { sc->sc_flags &= ~SC_OUTWAIT; } + mtx_unlock(&sc->sc_mtx); } --- //depot/vendor/freebsd/src/sys/net/if_slvar.h 2004/04/07 13:52:05 +++ //depot/user/rwatson/netperf/sys/net/if_slvar.h 2004/04/07 20:11:34 @@ -34,13 +34,16 @@ #ifndef _NET_IF_SLVAR_H_ #define _NET_IF_SLVAR_H_ +#include #include /* * Definitions for SLIP interface data structures * * (This exists so programs like slstats can get at the definition - * of sl_softc.) + * of sl_softc.) Fields owned by the SLIP subsystem are protected + * using sc_mtx, with the exception of sc_next, which is protected + * by the global slip_mtx. */ struct sl_softc { struct ifnet sc_if; /* network-visible interface */ @@ -66,6 +69,7 @@ struct slcompress sc_comp; /* tcp compression data */ LIST_ENTRY(sl_softc) sl_next; u_char *bpfbuf; /* hang buffer for bpf here */ + struct mtx sc_mtx; }; /* internal flags */ --- //depot/vendor/freebsd/src/sys/net/if_spppsubr.c 2004/05/25 14:55:56 +++ //depot/user/rwatson/netperf/sys/net/if_spppsubr.c 2004/05/30 18:41:33 @@ -92,12 +92,8 @@ #include #if defined(__FreeBSD__) && __FreeBSD__ >= 3 -# define UNTIMEOUT(fun, arg, handle) untimeout(fun, arg, handle) -# define TIMEOUT(fun, arg1, arg2, handle) handle = timeout(fun, arg1, arg2) # define IOCTL_CMD_T u_long #else -# define UNTIMEOUT(fun, arg, handle) untimeout(fun, arg) -# define TIMEOUT(fun, arg1, arg2, handle) timeout(fun, arg1, arg2) # define IOCTL_CMD_T int #endif @@ -259,10 +255,11 @@ void (*scr)(struct sppp *sp); }; +struct mtx sppp_mtx; +MTX_SYSINIT(sppp_mtx, &sppp_mtx, "sppp_mtx", MTX_DEF); + static struct sppp *spppq; -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 -static struct callout_handle keepalive_ch; -#endif +static struct callout keepalive_callout; #if defined(__FreeBSD__) && __FreeBSD__ >= 3 && __FreeBSD_version < 501113 #define SPP_FMT "%s%d: " @@ -960,13 +957,18 @@ { struct sppp *sp = (struct sppp*) ifp; + mtx_lock(&sppp_mtx); /* Initialize keepalive handler. */ - if (spppq == NULL) - TIMEOUT(sppp_keepalive, 0, hz * 10, keepalive_ch); + if (spppq == NULL) { + callout_init(&keepalive_callout, 0); + callout_reset(&keepalive_callout, hz * 10, sppp_keepalive, + NULL); + } /* Insert new entry into the keepalive list. */ sp->pp_next = spppq; spppq = sp; + mtx_unlock(&sppp_mtx); sp->pp_if.if_mtu = PP_MTU; sp->pp_if.if_flags = IFF_POINTOPOINT | IFF_MULTICAST; @@ -1012,6 +1014,7 @@ struct sppp **q, *p, *sp = (struct sppp*) ifp; int i; + mtx_lock(&sppp_mtx); /* Remove the entry from the keepalive list. */ for (q = &spppq; (p = *q); q = &p->pp_next) if (p == sp) { @@ -1021,11 +1024,12 @@ /* Stop keepalive handler. */ if (spppq == NULL) - UNTIMEOUT(sppp_keepalive, 0, keepalive_ch); + callout_stop(&keepalive_callout); + mtx_unlock(&sppp_mtx); for (i = 0; i < IDX_COUNT; i++) - UNTIMEOUT((cps[i])->TO, (void *)sp, sp->ch[i]); - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout((cps[i])->TO, (void *)sp, sp->ch[i]); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); mtx_destroy(&sp->pp_cpq.ifq_mtx); mtx_destroy(&sp->pp_fastq.ifq_mtx); } @@ -2004,8 +2008,8 @@ case STATE_STOPPING: sppp_cp_send(sp, cp->proto, TERM_REQ, ++sp->pp_seq[cp->protoidx], 0, 0); - TIMEOUT(cp->TO, (void *)sp, sp->lcp.timeout, - sp->ch[cp->protoidx]); + sp->ch[cp->protoidx] = timeout(cp->TO, (void *)sp, + sp->lcp.timeout); break; case STATE_REQ_SENT: case STATE_ACK_RCVD: @@ -2015,8 +2019,8 @@ break; case STATE_ACK_SENT: (cp->scr)(sp); - TIMEOUT(cp->TO, (void *)sp, sp->lcp.timeout, - sp->ch[cp->protoidx]); + sp->ch[cp->protoidx] = timeout(cp->TO, (void *)sp, + sp->lcp.timeout); break; } @@ -2032,7 +2036,7 @@ { sp->state[cp->protoidx] = newstate; - UNTIMEOUT(cp->TO, (void *)sp, sp->ch[cp->protoidx]); + untimeout(cp->TO, (void *)sp, sp->ch[cp->protoidx]); switch (newstate) { case STATE_INITIAL: case STATE_STARTING: @@ -2045,8 +2049,8 @@ case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: - TIMEOUT(cp->TO, (void *)sp, sp->lcp.timeout, - sp->ch[cp->protoidx]); + sp->ch[cp->protoidx] = timeout(cp->TO, (void *)sp, + sp->lcp.timeout); break; } } @@ -4142,7 +4146,7 @@ * a number between 300 and 810 seconds. */ i = 300 + ((unsigned)(random() & 0xff00) >> 7); - TIMEOUT(chap.TO, (void *)sp, i * hz, sp->ch[IDX_CHAP]); + sp->ch[IDX_CHAP] = timeout(chap.TO, (void *)sp, i * hz); } if (debug) { @@ -4186,7 +4190,7 @@ if (debug) log(LOG_DEBUG, SPP_FMT "chap tld\n", SPP_ARGS(ifp)); - UNTIMEOUT(chap.TO, (void *)sp, sp->ch[IDX_CHAP]); + untimeout(chap.TO, (void *)sp, sp->ch[IDX_CHAP]); sp->lcp.protos &= ~(1 << IDX_CHAP); lcp.Close(sp); @@ -4322,7 +4326,7 @@ /* ack and nak are his authproto */ case PAP_ACK: - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); if (debug) { log(LOG_DEBUG, SPP_FMT "pap success", SPP_ARGS(ifp)); @@ -4351,7 +4355,7 @@ break; case PAP_NAK: - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); if (debug) { log(LOG_INFO, SPP_FMT "pap failure", SPP_ARGS(ifp)); @@ -4408,8 +4412,8 @@ if (sp->myauth.proto == PPP_PAP) { /* we are peer, send a request, and start a timer */ pap.scr(sp); - TIMEOUT(sppp_pap_my_TO, (void *)sp, sp->lcp.timeout, - sp->pap_my_to_ch); + sp->pap_my_to_ch = timeout(sppp_pap_my_TO, (void *)sp, + sp->lcp.timeout); } } @@ -4512,8 +4516,8 @@ if (debug) log(LOG_DEBUG, SPP_FMT "pap tld\n", SPP_ARGS(ifp)); - UNTIMEOUT(pap.TO, (void *)sp, sp->ch[IDX_PAP]); - UNTIMEOUT(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); + untimeout(pap.TO, (void *)sp, sp->ch[IDX_PAP]); + untimeout(sppp_pap_my_TO, (void *)sp, sp->pap_my_to_ch); sp->lcp.protos &= ~(1 << IDX_PAP); lcp.Close(sp); @@ -4640,7 +4644,12 @@ struct sppp *sp; int s; + /* + * XXXRW: It would be nice to avoid calling all this stuff while + * holding sppp_mtx, or we risk lock order reversals. + */ s = splimp(); + mtx_lock(&sppp_mtx); for (sp=spppq; sp; sp=sp->pp_next) { struct ifnet *ifp = &sp->pp_if; @@ -4679,8 +4688,9 @@ sp->lcp.echoid, 4, &nmagic); } } + callout_reset(&keepalive_callout, hz * 10, sppp_keepalive, NULL); + mtx_unlock(&sppp_mtx); splx(s); - TIMEOUT(sppp_keepalive, 0, hz * 10, keepalive_ch); } /* --- //depot/vendor/freebsd/src/sys/net/if_stf.c 2004/05/30 13:25:31 +++ //depot/user/rwatson/netperf/sys/net/if_stf.c 2004/05/30 18:41:33 @@ -137,13 +137,11 @@ #define sc_ro __sc_ro46.__sc_ro4 const struct encaptab *encap_cookie; LIST_ENTRY(stf_softc) sc_list; /* all stf's are linked */ + struct mtx sc_mtx; /* protect sc_ro */ }; /* * All mutable global variables in if_stf.c are protected by stf_mtx. - * XXXRW: Note that mutable fields in the softc are not currently locked: - * in particular, sc_ro needs to be protected from concurrent entrance - * of stf_output(). */ static struct mtx stf_mtx; static LIST_HEAD(, stf_softc) stf_softc_list; @@ -199,6 +197,7 @@ free(sc, M_STF); return (ENOMEM); } + mtx_init(&sc->sc_mtx, "stf sc_mtx", NULL, MTX_DEF); ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; @@ -223,6 +222,7 @@ bpfdetach(&sc->sc_if); if_detach(&sc->sc_if); + mtx_destroy(&sc->sc_mtx); free(sc, M_STF); } @@ -394,9 +394,10 @@ struct ip *ip; struct ip6_hdr *ip6; struct in6_ifaddr *ia6; -#ifdef MAC + struct route ro; int error; +#ifdef MAC error = mac_check_ifnet_transmit(ifp, m); if (error) { m_freem(m); @@ -498,9 +499,7 @@ else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); - /* - * XXXRW: Locking of sc_ro required. - */ + mtx_lock(&sc->sc_mtx); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { @@ -519,12 +518,21 @@ if (sc->sc_ro.ro_rt == NULL) { m_freem(m); ifp->if_oerrors++; + mtx_unlock(&sc->sc_mtx); return ENETUNREACH; } } + /* + * XXXRW: Holding mutex over call to ip_output(): potential lock + * order issue? Hard to resolve cleanly with the current route + * caching model, as we have to synchronize access to shared softc + * state. + */ ifp->if_opackets++; - return ip_output(m, NULL, &sc->sc_ro, 0, NULL, NULL); + error = ip_output(m, NULL, &ro, 0, NULL, NULL); + mtx_unlock(&sc->sc_mtx); + return (error); } static int --- //depot/vendor/freebsd/src/sys/net/if_tap.c 2004/05/30 13:25:31 +++ //depot/user/rwatson/netperf/sys/net/if_tap.c 2004/05/30 18:41:33 @@ -113,6 +113,10 @@ * All global variables in if_tap.c are locked with tapmtx, with the * exception of tapdebug, which is accessed unlocked; tapclones is * static at runtime. + * + * XXXRW: si_flags appears not to be protected from concurrent access, + * and is written at run-time. + * XXXRW: si_drv1 is also used for test-and-set, and isn't synchronized. */ static struct mtx tapmtx; static int tapdebug = 0; /* debug flag */ @@ -162,6 +166,7 @@ * The EBUSY algorithm here can't quite atomically * guarantee that this is race-free since we have to * release the tap mtx to deregister the clone handler. + * XXXRW: is this true? */ mtx_lock(&tapmtx); SLIST_FOREACH(tp, &taphead, tap_next) { @@ -693,6 +698,7 @@ case SIOCSIFADDR: /* set MAC address of the remote side */ mtx_lock(&tp->tap_mtx); + /* XXXRW: Does this actually do anything? */ bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); mtx_unlock(&tp->tap_mtx); break; @@ -747,6 +753,7 @@ if (flag & IO_NDELAY) return (EWOULDBLOCK); + /* This looks like a wanna-be condition variable. */ mtx_lock(&tp->tap_mtx); tp->tap_flags |= TAP_RWAIT; mtx_unlock(&tp->tap_mtx); --- //depot/vendor/freebsd/src/sys/net/if_tun.c 2004/03/29 14:20:33 +++ //depot/user/rwatson/netperf/sys/net/if_tun.c 2004/03/29 16:17:24 @@ -59,6 +59,12 @@ * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. + * + * XXXRW: we allocate si_drv1 for the dev_t on demand, rather than when + * the dev_t is instantiated. Nothing serializes the test/set of that + * field. + * + * XXXRW: what serializes access to si_flags? */ struct tun_softc { TAILQ_ENTRY(tun_softc) tun_list; @@ -121,6 +127,9 @@ static d_ioctl_t tunioctl; static d_poll_t tunpoll; +/* + * XXXRW: can remove D_NEEDGIANT? Probably not because of si_drv1 for now. + */ static struct cdevsw tun_cdevsw = { .d_version = D_VERSION, .d_flags = D_PSEUDO | D_NEEDGIANT, @@ -364,6 +373,9 @@ ifp->if_flags |= IFF_UP | IFF_RUNNING; getmicrotime(&ifp->if_lastchange); + /* + * XXXRW: interface locking. + */ for (ifa = TAILQ_FIRST(&ifp->if_addrhead); ifa; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr == NULL) --- //depot/vendor/freebsd/src/sys/net/raw_cb.c 2004/04/07 13:52:05 +++ //depot/user/rwatson/netperf/sys/net/raw_cb.c 2004/04/22 18:08:25 @@ -32,7 +32,9 @@ #include #include +#include #include +#include #include #include #include @@ -49,10 +51,11 @@ * redo address binding to allow wildcards */ +struct mtx rawcb_mtx; struct rawcb_list_head rawcb_list; -static u_long raw_sendspace = RAWSNDQ; -static u_long raw_recvspace = RAWRCVQ; +static const u_long raw_sendspace = RAWSNDQ; +static const u_long raw_recvspace = RAWRCVQ; /* * Allocate a control block and a nominal amount @@ -79,7 +82,9 @@ rp->rcb_socket = so; rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; rp->rcb_proto.sp_protocol = proto; + mtx_lock(&rawcb_mtx); LIST_INSERT_HEAD(&rawcb_list, rp, list); + mtx_unlock(&rawcb_mtx); return (0); } @@ -93,6 +98,7 @@ { struct socket *so = rp->rcb_socket; + SOCK_LOCK(so); so->so_pcb = 0; sotryfree(so); LIST_REMOVE(rp, list); @@ -117,6 +123,7 @@ m_freem(dtom(rp->rcb_faddr)); rp->rcb_faddr = 0; #endif + /* Unlocked read. */ if (rp->rcb_socket->so_state & SS_NOFDREF) raw_detach(rp); } --- //depot/vendor/freebsd/src/sys/net/raw_cb.h 2004/04/07 13:52:05 +++ //depot/user/rwatson/netperf/sys/net/raw_cb.h 2004/04/07 20:11:34 @@ -57,6 +57,7 @@ #ifdef _KERNEL extern LIST_HEAD(rawcb_list_head, rawcb) rawcb_list; +extern struct mtx rawcb_mtx; /* protosw entries */ pr_ctlinput_t raw_ctlinput; --- //depot/vendor/freebsd/src/sys/net/raw_usrreq.c 2004/04/07 13:52:05 +++ //depot/user/rwatson/netperf/sys/net/raw_usrreq.c 2004/05/28 21:16:19 @@ -31,9 +31,12 @@ */ #include +#include #include #include #include +#include +#include #include #include #include @@ -43,12 +46,15 @@ #include +MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF); + /* * Initialize raw connection block q. */ void raw_init() { + LIST_INIT(&rawcb_list); } @@ -71,7 +77,12 @@ register struct mbuf *m = m0; struct socket *last; + /* + * XXXRW: Potential lock order issues due to holding the + * rawcb_mtx across all this stuff. Need to revisit. + */ last = 0; + mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &rawcb_list, list) { if (rp->rcb_proto.sp_family != proto->sp_family) continue; @@ -116,6 +127,7 @@ } } else m_freem(m); + mtx_unlock(&rawcb_mtx); } /*ARGSUSED*/ @@ -139,8 +151,8 @@ if (rp == 0) return EINVAL; raw_disconnect(rp); + soisdisconnected(so); sotryfree(so); - soisdisconnected(so); /* XXX huh? called after the sofree()? */ return 0; } --- //depot/vendor/freebsd/src/sys/net/rtsock.c 2004/05/10 08:11:13 +++ //depot/user/rwatson/netperf/sys/net/rtsock.c 2004/05/23 09:56:02 @@ -53,10 +53,18 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ +/* + * XXXRW: It would be really nice to add const to these, but that may + * not be possible due to where they are passed in. We might need + * to const-poison a whole boatload of APIs...? + */ static struct sockaddr route_dst = { 2, PF_ROUTE, }; static struct soc