--- //depot/user/rwatson/netperf/sys/fs/fifofs/fifo_vnops.c 2004/05/23 09:56:02 +++ //depot/user/rwatson/net2/sys/fs/fifofs/fifo_vnops.c 2004/05/23 10:05:43 @@ -211,8 +211,9 @@ } fip->fi_readers = fip->fi_writers = 0; wso->so_snd.sb_lowat = PIPE_BUF; - /* XXXRW: so_state locking. */ - rso->so_state |= SS_CANTRCVMORE; + SOCKBUF_LOCK(&rso->so_rcv); + rso->so_rcv.sb_state |= SS_CANTRCVMORE; + SOCKBUF_UNLOCK(&rso->so_rcv); vp->v_fifoinfo = fip; } @@ -230,8 +231,9 @@ if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { - /* XXXRW: so_state locking. */ - fip->fi_writesock->so_state &= ~SS_CANTSENDMORE; + SOCKBUF_LOCK(&fip->fi_writesock->so_snd); + fip->fi_writesock->so_snd.sb_state &= ~SS_CANTSENDMORE; + SOCKBUF_UNLOCK(&fip->fi_writesock->so_snd); if (fip->fi_writers > 0) { wakeup(&fip->fi_writers); sowwakeup(fip->fi_writesock); @@ -245,8 +247,9 @@ } fip->fi_writers++; if (fip->fi_writers == 1) { - /* XXXRW: so_state locking? */ - fip->fi_readsock->so_state &= ~SS_CANTRCVMORE; + SOCKBUF_LOCK(&fip->fi_writesock->so_rcv); + fip->fi_readsock->so_rcv.sb_state &= ~SS_CANTRCVMORE; + SOCKBUF_UNLOCK(&fip->fi_writesock->so_rcv); if (fip->fi_readers > 0) { wakeup(&fip->fi_readers); sorwakeup(fip->fi_writesock); @@ -469,7 +472,7 @@ SOCKBUF_LOCK(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc; /* Unlocked read. */ - if (so->so_state & SS_CANTRCVMORE) { + if (so->so_rcv.sb_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; result = 1; } else { @@ -504,7 +507,7 @@ SOCKBUF_LOCK(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); /* Unlocked read. */ - if (so->so_state & SS_CANTSENDMORE) { + if (so->so_snd.sb_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; result = 1; } else { --- //depot/user/rwatson/netperf/sys/i386/i386/intr_machdep.c 2004/05/23 09:56:02 +++ //depot/user/rwatson/net2/sys/i386/i386/intr_machdep.c 2004/05/23 10:05:43 @@ -193,8 +193,8 @@ */ TAILQ_FOREACH(ih, &it->it_handlers, ih_next) { MPASS(ih->ih_flags & IH_FAST); - CTR3(KTR_INTR, "%s: executing handler %p(%p)", - __func__, ih->ih_handler, + CTR4(KTR_INTR, "%s: irq %d executing handler %p(%p)", + __func__, vector, ih->ih_handler, ih->ih_argument == NULL ? iframe : ih->ih_argument); if (ih->ih_argument == NULL) @@ -209,6 +209,8 @@ * For stray and threaded interrupts, we mask and EOI the * source. */ + CTR2(KTR_INTR, "%s: irq %d scheduling ithread", __func__, + vector); isrc->is_pic->pic_disable_source(isrc); isrc->is_pic->pic_eoi_source(isrc); if (ih == NULL) --- //depot/user/rwatson/netperf/sys/kern/kern_descrip.c 2004/04/06 20:50:45 +++ //depot/user/rwatson/net2/sys/kern/kern_descrip.c 2004/05/22 13:21:16 @@ -2055,7 +2055,7 @@ { struct flock lf; struct vnode *vp; - int error; + int error, type; FILE_LOCK_ASSERT(fp, MA_OWNED); @@ -2065,11 +2065,21 @@ } /* We have the last ref so we can proceed without the file lock. */ FILE_UNLOCK(fp); + /* - * XXXRW: With sockets locked, can we push this Giant grab into - * the purely VFS case? + * XXXRW: It's not pretty, but this way we can avoid holding Giant + * over operation vectors that don't require it. Note that + * technically, this is slightly conservative as badops doesn't + * need Giant. */ - mtx_lock(&Giant); + type = fp->f_type; + switch(type) { + case DTYPE_SOCKET: + case DTYPE_PIPE: + break; + default: + mtx_lock(&Giant); + } if (fp->f_count < 0) panic("fdrop: count < 0"); if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { @@ -2085,7 +2095,13 @@ else error = 0; ffree(fp); - mtx_unlock(&Giant); + switch(type) { + case DTYPE_SOCKET: + case DTYPE_PIPE: + break; + default: + mtx_unlock(&Giant); + } return (error); } --- //depot/user/rwatson/netperf/sys/kern/subr_witness.c 2004/04/27 21:47:20 +++ //depot/user/rwatson/net2/sys/kern/subr_witness.c 2004/05/22 11:11:25 @@ -277,6 +277,7 @@ /* * Routing */ + { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_mtx_sleep }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, @@ -303,6 +304,11 @@ { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* + * SLIP + */ + { "slip_mtx", &lock_class_mtx_sleep }, + { "slip sc_mtx", &lock_class_mtx_sleep }, + /* * spin locks */ #ifdef SMP --- //depot/user/rwatson/netperf/sys/kern/sys_socket.c 2004/05/07 19:12:27 +++ //depot/user/rwatson/net2/sys/kern/sys_socket.c 2004/05/12 20:10:54 @@ -219,10 +219,10 @@ * is consistent. */ /* Unlocked read. */ - if ((so->so_state & SS_CANTRCVMORE) == 0 || + if ((so->so_rcv.sb_state & SS_CANTRCVMORE) == 0 || so->so_rcv.sb_cc != 0) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; - if ((so->so_state & SS_CANTSENDMORE) == 0) + if ((so->so_snd.sb_state & SS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; ub->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; ub->st_uid = so->so_cred->cr_uid; --- //depot/user/rwatson/netperf/sys/kern/uipc_accf.c 2004/05/22 21:24:30 +++ //depot/user/rwatson/net2/sys/kern/uipc_accf.c 2004/05/23 00:59:29 @@ -48,6 +48,9 @@ struct mtx accept_filter_mtx; MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx", MTX_DEF); +#define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx) +#define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx) + static SLIST_HEAD(, accept_filter) accept_filtlsthd = SLIST_HEAD_INITIALIZER(&accept_filtlsthd); @@ -71,23 +74,23 @@ { struct accept_filter *p; - mtx_lock(&accept_filter_mtx); + ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, filt->accf_name) == 0) { if (p->accf_callback != NULL) { - mtx_unlock(&accept_filter_mtx); + ACCEPT_FILTER_UNLOCK(); return (EEXIST); } else { p->accf_callback = filt->accf_callback; + ACCEPT_FILTER_UNLOCK(); FREE(filt, M_ACCF); - mtx_unlock(&accept_filter_mtx); return (0); } } if (p == NULL) SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); - mtx_unlock(&accept_filter_mtx); + ACCEPT_FILTER_UNLOCK(); return (0); } @@ -109,15 +112,13 @@ { struct accept_filter *p; - mtx_lock(&accept_filter_mtx); + ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) - if (strcmp(p->accf_name, name) == 0) { - mtx_unlock(&accept_filter_mtx); - return (p); - } + if (strcmp(p->accf_name, name) == 0) + break; + ACCEPT_FILTER_UNLOCK(); - mtx_unlock(&accept_filter_mtx); - return (NULL); + return (p); } int --- //depot/user/rwatson/netperf/sys/kern/uipc_socket.c 2004/04/22 18:08:25 +++ //depot/user/rwatson/net2/sys/kern/uipc_socket.c 2004/05/23 01:53:57 @@ -106,6 +106,8 @@ &so_zero_copy_send, 0, "Enable zero copy send"); #endif /* ZERO_COPY_SOCKETS */ +struct mtx accept_mtx; +MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * Socket operation routines. @@ -204,6 +206,7 @@ SOCK_LOCK(so); /* XXXRW: so_state locking? */ so->so_state |= SS_NOFDREF; + /* printf("NOFDREF -> %p %d\n", so, (int)so->so_gencnt); */ sorele(so); return (error); } @@ -273,13 +276,13 @@ splx(s); return (error); } - SOCKBUF_LOCK(&so->so_rcv); + ACCEPT_LOCK(); if (TAILQ_EMPTY(&so->so_comp)) so->so_options |= SO_ACCEPTCONN; if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->so_qlimit = backlog; - SOCKBUF_UNLOCK(&so->so_rcv); + ACCEPT_UNLOCK(); splx(s); return (0); } @@ -293,34 +296,46 @@ KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); SOCK_LOCK_ASSERT(so); + /* XXXRW: Why would SS_NOFDREF be unset here? so_count is 0. */ /* XXXRW: so_state locking? */ if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) { SOCK_UNLOCK(so); return; } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - if ((head = so->so_head) != NULL) { - /* XXXRW: so_state locking? */ - if (so->so_state & SS_INCOMP) { - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - } else if (so->so_state & SS_COMP) { - /* - * We must not decommission a socket that's - * on the accept(2) queue. If we do, then - * accept(2) may hang after select(2) indicated - * that the listening socket was ready. - */ - SOCKBUF_UNLOCK(&so->so_rcv); + + ACCEPT_LOCK(); + head = so->so_head; + if (head != NULL) { + KASSERT((so->so_qstate & SS_COMP) != 0 || + (so->so_qstate & SS_INCOMP) != 0, + ("sofree: so_head != NULL, but neither SS_COMP nor " + "SS_INCOMP")); + /* + * accept(2) is responsible draining the completed + * connection queue and freeing those sockets, so + * we just return here if this socket is currently + * on the completed connection queue. Otherwise, + * accept(2) may hang after select(2) has indicating + * that a listening socket was ready. If it's an + * incomplete connection, we remove it from the queue + * and free it; otherwise, it won't be released until + * the listening socket is closed. + */ + if (so->so_qstate && SS_COMP) { + ACCEPT_UNLOCK(); return; - } else { - panic("sofree: not queued"); } - so->so_state &= ~SS_INCOMP; - so->so_head = NULL; + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_qstate &= ~SS_INCOMP; } - SOCKBUF_UNLOCK(&so->so_rcv); + KASSERT((so->so_qstate & SS_COMP) == 0 && + (so->so_qstate & SS_INCOMP) == 0, + ("sofree: so_head == NULL, but still SS_COMP(%d) or SS_INCOMP(%d)", + so->so_qstate & SS_COMP, so->so_qstate & SS_INCOMP)); + ACCEPT_UNLOCK(); + SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_NOINTR; (void)sblock(&so->so_snd, M_WAITOK); @@ -349,34 +364,54 @@ int error = 0; funsetown(&so->so_sigio); + /*- + * XXXRW: Lots of locking problems here. For one thing, + * we should probably clear the SO_ACCEPTCONN flag and + * push that down to the protocol layer. Also, the locking + * for the queue draining is probably not right either: + * what prevents new ones from being inserted after we get + * past this? Ideally, the downcall would prevent this, but + * there isn't one. + * + * NOTE: sp->so_{incomp,comp} and sp->so_head are protected + * with SOCKBUF_LOCK(&so->so_rcv). + */ + /* + * XXXRW: soclose() should be split into two parts: one to handle + * listen sockets, and the other to handle the remainder. To + * prevent races with the protocol code, we need to detach before + * draining the queues. For non-listen sockets, the disconnect + * has to happen before the detach. + */ if (so->so_options & SO_ACCEPTCONN) { - struct socket *sp, *sonext; - - sp = TAILQ_FIRST(&so->so_incomp); - for (; sp != NULL; sp = sonext) { - sonext = TAILQ_NEXT(sp, so_list); + struct socket *sp; + ACCEPT_LOCK(); + while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { + TAILQ_REMOVE(&so->so_incomp, sp, so_list); + so->so_incqlen--; + sp->so_qstate &= ~SS_INCOMP; + sp->so_head = NULL; + ACCEPT_UNLOCK(); (void) soabort(sp); + ACCEPT_LOCK(); } - for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) { - sonext = TAILQ_NEXT(sp, so_list); - /* Dequeue from so_comp since sofree() won't do it */ + while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { TAILQ_REMOVE(&so->so_comp, sp, so_list); so->so_qlen--; - /* XXXRW: so_state locking? */ - sp->so_state &= ~SS_COMP; + sp->so_qstate &= ~SS_COMP; sp->so_head = NULL; + ACCEPT_UNLOCK(); (void) soabort(sp); + ACCEPT_LOCK(); } + ACCEPT_UNLOCK(); } - SOCK_LOCK(so); if (so->so_pcb == NULL) goto discard; /* XXXRW: so_state locking? */ if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { - SOCK_UNLOCK(so); error = sodisconnect(so); - SOCK_LOCK(so); if (error) goto drop; } @@ -385,7 +420,7 @@ (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { - error = msleep(&so->so_timeo, SOCK_MTX(so), + error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; @@ -395,24 +430,60 @@ drop: if (so->so_pcb != NULL) { int error2; - SOCK_UNLOCK(so); error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); - SOCK_LOCK(so); if (error == 0) error = error2; } discard: /* XXXRW: so_state locking? */ - if (so->so_state & SS_NOFDREF) + SOCK_LOCK(so); + if (so->so_state & SS_NOFDREF) { + printf("soclose: NOFDEREF\n"); + printf("so = %p\n", so); + printf(" so_count = %d\n", so->so_count); + printf(" so_type = %d\n", so->so_type); + printf(" so_options = %d\n", so->so_options); + printf(" so_linger = %d\n", so->so_linger); + printf(" so_state = %d\n", so->so_state); + printf(" so_pcb = %p\n", so->so_pcb); + printf(" so_proto = %p\n", so->so_proto); + printf(" pr_type = %d\n", so->so_proto->pr_type); + printf(" pr_domain = %p\n", so->so_proto->pr_domain); + printf(" dom_family = %d\n", so->so_proto->pr_domain->dom_family); + printf(" dom_name = %s\n", so->so_proto->pr_domain->dom_name); + printf(" pr_protocol = %d\n", so->so_proto->pr_protocol); + printf(" pr_flags = %d\n", so->so_proto->pr_flags); + printf(" so_head = %p\n", so->so_head); + printf(" TAILQ_FIRST(so_incomp) = %p\n", TAILQ_FIRST(&so->so_incomp)); + printf(" TAILQ_FIRST(so_comp) = %p\n", TAILQ_FIRST(&so->so_comp)); + printf(" so_qlen = %d\n", so->so_qlen); + printf(" so_incqlen = %d\n", so->so_incqlen); + printf(" so_qlimit = %d\n", so->so_qlimit); + printf(" so_timeo = %d\n", so->so_timeo); + printf(" so_error = %d\n", so->so_error); + printf(" so_sigio = %p\n", so->so_sigio); + printf(" so_oobmark = %lu\n", so->so_oobmark); + printf(" so_rcv.sb_state = %d\n", so->so_rcv.sb_state); + printf(" so_snd.sb_state = %d\n", so->so_snd.sb_state); + printf(" so_upcall = %p\n", so->so_upcall); + printf(" so_upcallarg = %p\n", so->so_upcallarg); + printf(" so_cred = %p\n", so->so_cred); + printf(" so_gencnt = %d\n", (int)so->so_gencnt); + SOCK_UNLOCK(so); + printf("doh\n"); panic("soclose: NOFDREF"); + } so->so_state |= SS_NOFDREF; + /* printf("NOFDREF -> %p %d\n", so, (int)so->so_gencnt); */ sorele(so); splx(s); return (error); } /* - * Must be called at splnet... + * XXXRW: soabort() must not be called with any locks held, as the protocol + * will need to be able to grab socket and socket buffer locks, and we also + * try to free the socket if the protocol generates an error. */ int soabort(so) @@ -437,9 +508,11 @@ int error; /* XXXRW: so_state locking? */ + SOCK_LOCK(so); if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; + SOCK_UNLOCK(so); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); return (error); } @@ -582,8 +655,8 @@ if (error) goto out; do { - /* XXXRW: so_state locking? */ - if (so->so_state & SS_CANTSENDMORE) + SOCKBUF_LOCK_ASSERT(&so->so_snd); + if (so->so_snd.sb_state & SS_CANTSENDMORE) snderr(EPIPE); if (so->so_error) { error = so->so_error; @@ -878,8 +951,8 @@ so->so_error = 0; goto release; } - /* XXXRW: so_state locking? */ - if (so->so_state & SS_CANTRCVMORE) { + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (so->so_rcv.sb_state & SS_CANTRCVMORE) { if (m) goto dontblock; else @@ -1177,8 +1250,8 @@ */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { - /* XXXRW: so_state locking? */ - if (so->so_error || so->so_state & SS_CANTRCVMORE) + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (so->so_error || so->so_rcv.sb_state & SS_CANTRCVMORE) break; /* * Notify the protocol that some data has been @@ -1223,9 +1296,9 @@ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreqs->pru_rcvd)(so, flags); } - /* XXXRW: so_state locking? */ + SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && - (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) + (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) goto restart; /* XXX multi-counts msgs */ if (flagsp != NULL) @@ -1288,21 +1361,23 @@ struct socket *so; struct sockopt *sopt; { - struct accept_filter_arg *afap = NULL; + struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *af = so->so_accf; + struct so_accf *newaf; int error = 0; -/* XXX locking */ + newaf = NULL; + afap = NULL; + /* do not set/remove accept filters on non listen sockets */ - if ((so->so_options & SO_ACCEPTCONN) == 0) { - error = EINVAL; - goto out; - } + if ((so->so_options & SO_ACCEPTCONN) == 0) + return (EINVAL); /* removing the filter */ if (sopt == NULL) { - if (af != NULL) { + SOCK_LOCK(so); + if (so->so_accf != NULL) { + struct so_accf *af = so->so_accf; if (af->so_accept_filter != NULL && af->so_accept_filter->accf_destroy != NULL) { af->so_accept_filter->accf_destroy(so); @@ -1314,47 +1389,79 @@ so->so_accf = NULL; } so->so_options &= ~SO_ACCEPTFILTER; + SOCK_UNLOCK(so); return (0); } - /* adding a filter */ - /* must remove previous filter first */ - if (af != NULL) { - error = EINVAL; - goto out; - } + + /*- + * Adding a filter. + * + * Do memory allocation, copyin, and filter lookup now while we're + * not holding any locks. Avoids sleeping with a mutex, as well + * as introducing a lock order between accept filter locks and + * socket locks here. + */ + MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, + M_WAITOK); /* don't put large objects on the kernel stack */ - MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; - if (error) - goto out; + if (error) { + FREE(afap, M_TEMP); + return (error); + } afp = accept_filt_get(afap->af_name); if (afp == NULL) { - error = ENOENT; + FREE(afap, M_TEMP); + return (ENOENT); + } + + /* + * Allocate the new accept filter instance storage. We may have + * to free it again later if we fail to attach it. If attached + * properly, 'newaf' is NULLed to avoid a free() while in use. + */ + MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK | + M_ZERO); + if (afp->accf_create != NULL && afap->af_name[0] != '\0') { + int len = strlen(afap->af_name) + 1; + MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF, + M_WAITOK); + strcpy(newaf->so_accept_filter_str, afap->af_name); + } + + SOCK_LOCK(so); + /* must remove previous filter first */ + if (so->so_accf != NULL) { + error = EINVAL; goto out; } - MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); + /* + * Invoke the accf_create() method of the filter if required. + * WARNING: the socket mutex is held over this call, so the + * create method cannot block. This may be something we have + * to change, but it would require addressing possible races. + */ if (afp->accf_create != NULL) { - if (afap->af_name[0] != '\0') { - int len = strlen(afap->af_name) + 1; - - MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); - strcpy(af->so_accept_filter_str, afap->af_name); - } - af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); - if (af->so_accept_filter_arg == NULL) { - FREE(af->so_accept_filter_str, M_ACCF); - FREE(af, M_ACCF); - so->so_accf = NULL; + newaf->so_accept_filter_arg = + afp->accf_create(so, afap->af_arg); + if (newaf->so_accept_filter_arg == NULL) { error = EINVAL; goto out; } } - af->so_accept_filter = afp; - so->so_accf = af; + newaf->so_accept_filter = afp; + so->so_accf = newaf; so->so_options |= SO_ACCEPTFILTER; + newaf = NULL; out: + SOCK_UNLOCK(so); + if (newaf != NULL) { + if (newaf->so_accept_filter_str != NULL) + FREE(newaf->so_accept_filter_str, M_ACCF); + FREE(newaf, M_ACCF); + } if (afap != NULL) FREE(afap, M_TEMP); return (error); @@ -1861,6 +1968,7 @@ if (soreadable(so)) revents |= events & (POLLIN | POLLRDNORM); + /* Unlocked read. */ if (events & POLLINIGNEOF) if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || !TAILQ_EMPTY(&so->so_comp) || so->so_error) @@ -1944,8 +2052,7 @@ if (needlock) SOCKBUF_LOCK(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; - /* XXXRW: so_state locking? */ - if (so->so_state & SS_CANTRCVMORE) { + if (so->so_rcv.sb_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; result = 1; @@ -1983,8 +2090,7 @@ if (needlock) SOCKBUF_LOCK(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); - /* XXXRW: so_state locking? */ - if (so->so_state & SS_CANTSENDMORE) { + if (so->so_snd.sb_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; result = 1; @@ -2008,6 +2114,7 @@ { struct socket *so = kn->kn_fp->f_data; + /* Unlocked read. */ kn->kn_data = so->so_qlen; return (! TAILQ_EMPTY(&so->so_comp)); } --- //depot/user/rwatson/netperf/sys/kern/uipc_socket2.c 2004/05/23 09:56:02 +++ //depot/user/rwatson/net2/sys/kern/uipc_socket2.c 2004/05/23 10:05:43 @@ -112,6 +112,12 @@ SOCK_UNLOCK(so); } +/* + * soisconnected() transitions a socket to a fully connected state. If + * so->so_head is non-NULL, we transition it from the incompletely + * connected queue to the completely connected queue. Otherwise, we + * just wake up the socket since it was an out-bound connection. + */ void soisconnected(so) struct socket *so; @@ -119,40 +125,75 @@ struct socket *head = so->so_head; int need_lock = !SOCK_OWNED(so); - if (need_lock) - SOCK_LOCK(so); + KASSERT(need_lock == 1, ("soisconnected(): called with lock!")); + /* XXXRW: so_state locking? */ + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - if (head && (so->so_state & SS_INCOMP)) { + SOCK_UNLOCK(so); + + /* + * XXXRW: Maybe an unlocked read of so_head would be desirable + * here? Unlocked read of so_qstate probably not a good idea + * regardless. + */ + ACCEPT_LOCK(); + head = so->so_head; + if (head != NULL && (so->so_qstate & SS_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { - if (need_lock) - SOCK_UNLOCK(so); - SOCK_LOCK(head); TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; - so->so_state &= ~SS_INCOMP; + so->so_qstate &= ~SS_INCOMP; TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); head->so_qlen++; - so->so_state |= SS_COMP; - sorwakeup_locked(head); + so->so_qstate |= SS_COMP; + ACCEPT_UNLOCK(); + sorwakeup(head); wakeup_one(&head->so_timeo); - SOCK_UNLOCK(head); return; } else { -/* XXX locking */ - so->so_upcall = head->so_accf->so_accept_filter->accf_callback; - so->so_upcallarg = head->so_accf->so_accept_filter_arg; + void (*so_upcall)(struct socket *, void *, int); + void *so_upcallarg; + /* + * XXXRW: Should probably copy the upcall fields to + * local stack variables while holding the socket + * lock (and clear the option), then release the + * lock and make the call. This will prevent lock + * order reversals/recursion between this code and + * the upcall implementation, which will likely + * also want to frob the socket using locks. + * + * NOTE: We keep a local copy of the function + * pointer so we can invoke the upcall without + * holding locks. However, we also have to copy + * in the upcall fields from the head because the + * filter may need to be called more than once. + */ + SOCK_LOCK(so); + so_upcall = so->so_upcall = + head->so_accf->so_accept_filter->accf_callback; + so_upcallarg = so->so_upcallarg = + head->so_accf->so_accept_filter_arg; so->so_rcv.sb_flags |= SB_UPCALL; so->so_options &= ~SO_ACCEPTFILTER; - so->so_upcall(so, so->so_upcallarg, M_TRYWAIT); - if (need_lock) - SOCK_UNLOCK(so); + SOCK_UNLOCK(so); + ACCEPT_UNLOCK(); + /* + * Call with our existing reference, but without + * any locks. + */ + so_upcall(so, so_upcallarg, M_TRYWAIT); } - } else { + } else + ACCEPT_UNLOCK(); + if (head == NULL) { wakeup(&so->so_timeo); - if (need_lock) - SOCK_UNLOCK(so); + /* + * XXXRW: If assuming socket lock is socket buffer receive + * lock, should use a _locked variant of sorwakeup() and + * avoid recursion here. + */ sorwakeup(so); sowwakeup(so); } @@ -162,40 +203,62 @@ soisdisconnecting(so) register struct socket *so; { - int need_lock = !SOCK_OWNED(so); - if (need_lock) - SOCK_LOCK(so); + /* + * XXXRW: Assuming we do need SOCK_LOCK(so) here, and the receive + * and base socket lock remain identical, then we should combine + * the SOCK_LOCK() and SOCKBUF_LOCK(...rcv) sections here. + */ /* XXXRW: so_state locking? */ + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; - so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + so->so_state |= SS_ISDISCONNECTING; + SOCK_UNLOCK(so); + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_state |= SS_CANTRCVMORE; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_state |= SS_CANTSENDMORE; + SOCKBUF_UNLOCK(&so->so_snd); wakeup(&so->so_timeo); - SOCK_UNLOCK(so); + /* + * XXXRW: Multiple socket buffer lock/unlock here could be avoided + * by coallescing with the above? + */ sowwakeup(so); sorwakeup(so); - if (!need_lock) - SOCK_LOCK(so); } void soisdisconnected(so) register struct socket *so; { - int need_lock = !SOCK_OWNED(so); - - if (need_lock) - SOCK_LOCK(so); + + /* + * XXXRW: Assuming we do need SOCK_LOCK(so) here, and the receive + * and base socket lock remain identical, then we should combine + * the SOCK_LOCK() and SOCKBUF_LOCK(...rcv) sections here. + */ /* XXXRW: so_state locking? */ + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); - so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + SOCK_UNLOCK(so); + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_state |= SS_CANTRCVMORE; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_state |= SS_CANTSENDMORE; + SOCKBUF_UNLOCK(&so->so_snd); + so->so_state |= SS_ISDISCONNECTED; wakeup(&so->so_timeo); - SOCK_UNLOCK(so); - /* Unlocked read. */ + /* Unlocked read of sb_cc. */ sbdrop(&so->so_snd, so->so_snd.sb_cc); + /* + * XXXRW: Avoid multiple lock/unlock of socket buffer locks here + * by coallescing with the above? + */ sowwakeup(so); sorwakeup(so); - if (!need_lock) - SOCK_LOCK(so); } /* @@ -216,9 +279,9 @@ register struct socket *so; int over; - SOCK_LOCK(head); + ACCEPT_LOCK(); over = (head->so_qlen > 3 * head->so_qlimit / 2); - SOCK_UNLOCK(head); + ACCEPT_UNLOCK(); if (over) return ((struct socket *)0); so = soalloc(M_NOWAIT); @@ -232,6 +295,7 @@ so->so_linger = head->so_linger; /* XXXRW: so_state locking? */ so->so_state = head->so_state | SS_NOFDREF; + /* printf("NOFDREF -> %p %d\n", so, (int)so->so_gencnt); */ so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; so->so_cred = crhold(head->so_cred); @@ -246,24 +310,35 @@ sodealloc(so); return ((struct socket *)0); } - SOCKBUF_LOCK(&head->so_rcv); + ACCEPT_LOCK(); if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - /* XXXRW: so_state locking? */ - so->so_state |= SS_COMP; + so->so_qstate |= SS_COMP; head->so_qlen++; } else { - if (head->so_incqlen > head->so_qlimit) { + /* + * XXXRW: Keep removing sockets from the head until there's room + * for us to insert on the tail. In pre-locking revisions, this + * was a simple if(), but as we could be racing with other + * threads and soabort() requires dropping locks, we must loop + * waiting for the condition to be true. + */ + while (head->so_incqlen > head->so_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->so_incomp); + TAILQ_REMOVE(&so->so_incomp, sp, so_list); + head->so_incqlen--; + sp->so_qstate &= ~SS_INCOMP; + sp->so_head = NULL; + ACCEPT_UNLOCK(); (void) soabort(sp); + ACCEPT_LOCK(); } TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); - /* XXXRW: so_state locking? */ - so->so_state |= SS_INCOMP; + so->so_qstate |= SS_INCOMP; head->so_incqlen++; } - SOCKBUF_UNLOCK(&head->so_rcv); + ACCEPT_UNLOCK(); if (connstatus) { /* XXXRW: so_state locking? */ so->so_state |= connstatus; @@ -289,8 +364,9 @@ { /* XXXRW: so_state locking? */ - so->so_state |= SS_CANTSENDMORE; - sowwakeup(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + SOCKBUF_UNLOCK(&so->so_snd); } void @@ -299,8 +375,7 @@ { SOCKBUF_LOCK_ASSERT(&so->so_snd); - /* XXXRW: so_state locking? */ - so->so_state |= SS_CANTSENDMORE; + so->so_snd.sb_state |= SS_CANTSENDMORE; sowwakeup_locked(so); } @@ -310,8 +385,9 @@ { /* XXXRW: so_state locking? */ - so->so_state |= SS_CANTRCVMORE; - sorwakeup(so); + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_UNLOCK(&so->so_rcv); } void @@ -320,8 +396,7 @@ { SOCKBUF_LOCK_ASSERT(&so->so_rcv); - /* XXXRW: so_state locking? */ - so->so_state |= SS_CANTRCVMORE; + so->so_rcv.sb_state |= SS_CANTRCVMORE; sorwakeup_locked(so); } @@ -438,6 +513,11 @@ if (sb->sb_flags & SB_AIO) /* XXX locking */ aio_swake(so, sb); + /* + * XXXRW: More efficient to do this with the sowakeup_under_lock() + * code above to avoid multiple lock/unlock. Does order relative + * to so_upcall(), et al, matter? + */ SOCKBUF_LOCK(sb); KNOTE(&sb->sb_sel.si_note, 0); SOCKBUF_UNLOCK(sb); --- //depot/user/rwatson/netperf/sys/kern/uipc_syscalls.c 2004/05/23 09:56:02 +++ //depot/user/rwatson/net2/sys/kern/uipc_syscalls.c 2004/05/23 10:05:43 @@ -243,7 +243,7 @@ struct file *nfp = NULL; struct sockaddr *sa; socklen_t namelen; - int error, s; + int error; struct socket *head, *so; int fd; u_int fflag; @@ -254,93 +254,70 @@ if (uap->name) { error = copyin(uap->anamelen, &namelen, sizeof (namelen)); if(error) - goto done3; - if (namelen < 0) { - error = EINVAL; - goto done3; - } + return (error); + if (namelen < 0) + return (EINVAL); } NET_LOCK_GIANT(); error = fgetsock(td, uap->s, &head, &fflag); if (error) goto done2; - s = splnet(); - SOCK_LOCK(head); + /* Unlocked read. */ if ((head->so_options & SO_ACCEPTCONN) == 0) { - SOCK_UNLOCK(head); - splx(s); error = EINVAL; goto done; } + error = falloc(td, &nfp, &fd); + if (error) + goto done; + /* + * Unlocked reads. + * XXXRW: Dubious use of so->so_error. + */ + ACCEPT_LOCK(); + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { + ACCEPT_UNLOCK(); + error = EWOULDBLOCK; + goto done; + } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - /* XXXRW: so_state locking? */ - if (head->so_state & SS_CANTRCVMORE) { + if (head->so_rcv.sb_state & SS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } - /* XXXRW: so_state locking? */ - if ((head->so_state & SS_NBIO) != 0) { - head->so_error = EWOULDBLOCK; - break; - } - error = msleep(&head->so_timeo, SOCK_MTX(head), PSOCK | PCATCH, + error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error) { - SOCK_UNLOCK(head); - splx(s); + ACCEPT_UNLOCK(); goto done; } } if (head->so_error) { error = head->so_error; head->so_error = 0; - SOCK_UNLOCK(head); - splx(s); + ACCEPT_UNLOCK(); goto done; } - - /* - * At this point we know that there is at least one connection - * ready to be accepted. Remove it from the queue prior to - * allocating the file descriptor for it since falloc() may - * block allowing another process to accept the connection - * instead. The reference previously owned by the socket queue - * is now thread-local, letting us release the lock on the head. - */ so = TAILQ_FIRST(&head->so_comp); + KASSERT(!(so->so_qstate & SS_INCOMP), ("accept1: so SS_INCOMP")); + KASSERT(so->so_qstate & SS_COMP, ("accept1: so not SS_COMP")); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; - SOCK_UNLOCK(head); + so->so_qstate &= ~SS_COMP; + so->so_head = NULL; + ACCEPT_UNLOCK(); - error = falloc(td, &nfp, &fd); - if (error) { - /* - * Probably ran out of file descriptors. Put the - * unaccepted connection back onto the queue and - * do another wakeup so some other process might - * have a chance at it. Note that strict ordering - * is lost. - */ - SOCK_LOCK(head); - TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); - head->so_qlen++; - wakeup_one(&head->so_timeo); - SOCK_UNLOCK(head); - splx(s); - goto done; - } /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; - /* XXX lock? */ + /* + * XXXRW: Might make life simpler to grab the socket buffer lock + * here so it's held when KNOTE() calls back into the socket + * code. + */ /* connection has been removed from the listen queue */ KNOTE(&head->so_rcv.sb_sel.si_note, 0); - /* - * XXXRW: so should be locked to modify so_state here? - */ - so->so_state &= ~SS_COMP; - so->so_head = NULL; pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); @@ -375,7 +352,6 @@ namelen = 0; if (uap->name) goto gotnoname; - splx(s); error = 0; goto done; } @@ -413,7 +389,6 @@ FILEDESC_UNLOCK(fdp); } } - splx(s); /* * Release explicitly held references before returning. @@ -424,7 +399,6 @@ fputsock(head); done2: NET_UNLOCK_GIANT(); -done3: return (error); } @@ -1857,7 +1831,7 @@ SOCKBUF_LOCK(&so->so_snd); /* XXXRW: so_state locking? */ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { - if (so->so_state & SS_CANTSENDMORE) + if (so->so_snd.sb_state & SS_CANTSENDMORE) error = EPIPE; else error = EAGAIN; @@ -2025,9 +1999,9 @@ * blocks before the pru_send (or more accurately, any blocking * results in a loop back to here to re-check). */ - /* XXXRW: so_state locking? */ - if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { - if (so->so_state & SS_CANTSENDMORE) { + SOCKBUF_LOCK_ASSERT(&so->so_snd); + if ((so->so_snd.sb_state & SS_CANTSENDMORE) || so->so_error) { + if (so->so_snd.sb_state & SS_CANTSENDMORE) { error = EPIPE; } else { error = so->so_error; --- //depot/user/rwatson/netperf/sys/kern/uipc_usrreq.c 2004/05/07 19:12:27 +++ //depot/user/rwatson/net2/sys/kern/uipc_usrreq.c 2004/05/07 21:50:38 @@ -430,7 +430,7 @@ } /* Unlocked read. */ - if (so->so_state & SS_CANTSENDMORE) { + if (so->so_snd.sb_state & SS_CANTSENDMORE) { error = EPIPE; break; } --- //depot/user/rwatson/netperf/sys/net/raw_usrreq.c 2004/04/07 20:11:34 +++ //depot/user/rwatson/net2/sys/net/raw_usrreq.c 2004/05/07 17:47:59 @@ -151,12 +151,8 @@ if (rp == 0) return EINVAL; raw_disconnect(rp); - SOCK_LOCK(so); - if (so->so_count != 0) { - soisdisconnected(so); - SOCK_UNLOCK(so); - } else - sofree(so); + soisdisconnected(so); + sotryfree(so); return 0; } --- //depot/user/rwatson/netperf/sys/netatm/atm_aal5.c 2004/02/28 14:29:37 +++ //depot/user/rwatson/net2/sys/netatm/atm_aal5.c 2004/04/25 13:30:32 @@ -767,7 +767,7 @@ * that there's room in the socket buffer */ if (((so->so_state & SS_ISCONNECTED) == 0) || - (so->so_state & SS_CANTRCVMORE) || + (so->so_rcv.sb_state & SS_CANTRCVMORE) || (len > sbspace(&so->so_rcv))) { atm_sock_stat.as_indrop[atp->atp_type]++; KB_FREEALL(m); --- //depot/user/rwatson/netperf/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c 2004/05/03 19:32:28 +++ //depot/user/rwatson/net2/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c 2004/05/22 10:04:53 @@ -1023,7 +1023,7 @@ { mtx_assert(&s->session_mtx, MA_OWNED); - if (s->l2so->so_state & SS_CANTRCVMORE) { + if (s->l2so->so_rcv.sb_state & SS_CANTRCVMORE) { NG_BTSOCKET_RFCOMM_INFO( "%s: L2CAP connection has been terminated, so=%p, so_state=%#x, so_count=%d, " \ "state=%d, flags=%#x\n", __func__, s->l2so, s->l2so->so_state, @@ -1353,10 +1353,11 @@ return (error); } + ACCEPT_LOCK(); if (TAILQ_EMPTY(&s0->l2so->so_comp)) { - if (s0->l2so->so_state & SS_CANTRCVMORE) + ACCEPT_UNLOCK(); + if (s0->l2so->so_rcv.sb_state & SS_CANTRCVMORE) return (ECONNABORTED); - return (EWOULDBLOCK); } @@ -1368,10 +1369,12 @@ TAILQ_REMOVE(&s0->l2so->so_comp, l2so, so_list); s0->l2so->so_qlen --; - soref(l2so); - l2so->so_state &= ~SS_COMP; + l2so->so_qstate &= ~SS_COMP; l2so->so_state |= SS_NBIO; l2so->so_head = NULL; + ACCEPT_UNLOCK(); + + soref(l2so); error = soaccept(l2so, (struct sockaddr **) &l2sa); if (error != 0) { --- //depot/user/rwatson/netperf/sys/netgraph/ng_ksocket.c 2004/04/04 14:05:41 +++ //depot/user/rwatson/net2/sys/netgraph/ng_ksocket.c 2004/05/22 10:04:53 @@ -1149,7 +1149,7 @@ * If the peer has closed the connection, forward a 0-length mbuf * to indicate end-of-file. */ - if (so->so_state & SS_CANTRCVMORE && !(priv->flags & KSF_EOFSEEN)) { + if (so->so_rcv.sb_state & SS_CANTRCVMORE && !(priv->flags & KSF_EOFSEEN)) { MGETHDR(m, waitflag, MT_DATA); if (m != NULL) { m->m_len = m->m_pkthdr.len = 0; @@ -1174,8 +1174,9 @@ head->so_error = 0; return error; } + /* Unlocked read. */ if (TAILQ_EMPTY(&head->so_comp)) { - if (head->so_state & SS_CANTRCVMORE) + if (head->so_rcv.sb_state & SS_CANTRCVMORE) return ECONNABORTED; return EWOULDBLOCK; } @@ -1199,17 +1200,21 @@ int len; int error; + ACCEPT_LOCK(); so = TAILQ_FIRST(&head->so_comp); - if (so == NULL) /* Should never happen */ + if (so == NULL) { /* Should never happen */ + ACCEPT_UNLOCK(); return; + } TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; + so->so_qstate &= ~SS_COMP; + ACCEPT_UNLOCK(); /* XXX KNOTE(&head->so_rcv.sb_sel.si_note, 0); */ soref(so); - so->so_state &= ~SS_COMP; so->so_state |= SS_NBIO; so->so_head = NULL; --- //depot/user/rwatson/netperf/sys/netinet/accf_http.c 2004/04/22 18:08:25 +++ //depot/user/rwatson/net2/sys/netinet/accf_http.c 2004/04/25 13:30:32 @@ -161,7 +161,7 @@ { /* Unlocked read. */ - if ((so->so_state & SS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { + if ((so->so_rcv.sb_state & SS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { struct mbuf *m; char *cmp; int cmplen, cc; @@ -215,7 +215,7 @@ int i, cc, spaces, inspaces; /* Unlocked read. */ - if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + if ((so->so_rcv.sb_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) goto fallout; m = so->so_rcv.sb_mb; @@ -303,7 +303,7 @@ DPRINT("start"); /* Unlocked read. */ - if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + if ((so->so_rcv.sb_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) goto gotit; /* --- //depot/user/rwatson/netperf/sys/netinet/tcp_input.c 2004/05/07 19:12:27 +++ //depot/user/rwatson/net2/sys/netinet/tcp_input.c 2004/05/07 21:50:38 @@ -356,7 +356,7 @@ nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); /* Unlocked read. */ - if (so->so_state & SS_CANTRCVMORE) + if (so->so_rcv.sb_state & SS_CANTRCVMORE) m_freem(q->tqe_m); else sbappendstream(&so->so_rcv, q->tqe_m); @@ -1243,7 +1243,7 @@ * Add data to socket buffer. */ /* Unlocked read. */ - if (so->so_state & SS_CANTRCVMORE) { + if (so->so_rcv.sb_state & SS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ @@ -2154,7 +2154,7 @@ * compressed state. */ /* Unlocked read. */ - if (so->so_state & SS_CANTRCVMORE) { + if (so->so_rcv.sb_state & SS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); @@ -2327,7 +2327,7 @@ tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* Unlocked read. */ - if (so->so_state & SS_CANTRCVMORE) + if (so->so_rcv.sb_state & SS_CANTRCVMORE) m_freem(m); else sbappendstream(&so->so_rcv, m); --- //depot/user/rwatson/netperf/sys/netinet/tcp_usrreq.c 2004/04/22 18:08:25 +++ //depot/user/rwatson/net2/sys/netinet/tcp_usrreq.c 2004/05/22 10:27:12 @@ -1172,6 +1172,8 @@ if (tp == 0) { int nofd = so->so_state & SS_NOFDREF; /* XXX */ + printf("tcp_attach(): possible SS_NOFDREF race\n"); + /* XXXRW: so_state locking. */ so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ #ifdef INET6 --- //depot/user/rwatson/netperf/sys/netipx/ipx_usrreq.c 2004/03/19 02:34:30 +++ //depot/user/rwatson/net2/sys/netipx/ipx_usrreq.c 2004/05/07 17:49:02 @@ -423,12 +423,8 @@ s = splnet(); ipx_pcbdetach(ipxp); splx(s); - SOCK_LOCK(so); - if (so->so_count != 0) { - soisdisconnected(so); - SOCK_UNLOCK(so); - } else - sofree(so); + soisdisconnected(so); + sotryfree(so); return (0); } --- //depot/user/rwatson/netperf/sys/netsmb/smb_trantcp.c 2004/04/06 13:53:21 +++ //depot/user/rwatson/net2/sys/netsmb/smb_trantcp.c 2004/04/25 13:30:32 @@ -417,8 +417,8 @@ * If we don't have one waiting, return. */ error = nbssn_recvhdr(nbp, &len, &rpcode, MSG_DONTWAIT, td); - if (so->so_state & - (SS_ISDISCONNECTING | SS_ISDISCONNECTED | SS_CANTRCVMORE)) { + if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED) || + so->so_rcv.sb_state & SS_CANTRCVMORE) { nbp->nbp_state = NBST_CLOSED; NBDEBUG("session closed by peer\n"); return ECONNRESET; --- //depot/user/rwatson/netperf/sys/sys/socketvar.h 2004/05/23 09:37:14 +++ //depot/user/rwatson/net2/sys/sys/socketvar.h 2004/05/23 10:05:43 @@ -46,14 +46,25 @@ */ typedef u_quad_t so_gen_t; +/*- + * Locking key to struct socket: + * (a) constant after allocation, no locking required. + * (b) locked by SOCK_LOCK(so). + * (c) locked by SOCKBUF_LOCK(&so->so_rcv). + * (d) locked by SOCKBUF_LOCK(&so->so_snd). + * (e) locked by ACCEPT_LOCK(). + * (f) not locked since integer reads/writes are atomic. + * (g) used only as a sleep/wakeup address, no value. + */ struct socket { - int so_count; /* reference count */ - short so_type; /* generic type, see socket.h */ + int so_count; /* (b) reference count */ + short so_type; /* (a) generic type, see socket.h */ short so_options; /* from socket call, see socket.h */ short so_linger; /* time to linger while closing */ - short so_state; /* internal state flags SS_*, below */ + short so_state; /* (b) internal state flags SS_*, below */ + int so_qstate; /* (e) SS_ flags for accept queue's */ void *so_pcb; /* protocol control block */ - struct protosw *so_proto; /* protocol handle */ + struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. * Socket where accepts occur is so_head in all subsidiary sockets. @@ -65,16 +76,16 @@ * We allow connections to queue up based on current queue lengths * and limit on number of queued connections for this socket. */ - struct socket *so_head; /* back pointer to accept socket */ - TAILQ_HEAD(, socket) so_incomp; /* queue of partial unaccepted connections */ - TAILQ_HEAD(, socket) so_comp; /* queue of complete unaccepted connections */ - TAILQ_ENTRY(socket) so_list; /* list of unaccepted connections */ - short so_qlen; /* number of unaccepted connections */ - short so_incqlen; /* number of unaccepted incomplete + struct socket *so_head; /* (e) back pointer to accept socket */ + TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ + TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ + TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ + short so_qlen; /* (e) number of unaccepted connections */ + short so_incqlen; /* (e) number of unaccepted incomplete connections */ - short so_qlimit; /* max number queued connections */ - short so_timeo; /* connection timeout */ - u_short so_error; /* error affecting connection */ + short so_qlimit; /* (e) max number queued connections */ + short so_timeo; /* (g)connection timeout */ + u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ u_long so_oobmark; /* chars to oob mark */ @@ -85,19 +96,20 @@ struct sockbuf { struct selinfo sb_sel; /* process selecting read/write */ struct mtx sb_mtx; /* sockbuf lock */ + int sb_state; /* (c/d) socket-related state locked on sockbuf */ #define sb_startzero sb_mb - struct mbuf *sb_mb; /* the mbuf chain */ - struct mbuf *sb_mbtail; /* the last mbuf in the chain */ - struct mbuf *sb_lastrecord; /* first mbuf of last record in + struct mbuf *sb_mb; /* (c/d) the mbuf chain */ + struct mbuf *sb_mbtail; /* (c/d) the last mbuf in the chain */ + struct mbuf *sb_lastrecord; /* (c/d) first mbuf of last record in * socket buffer */ - u_int sb_cc; /* actual chars in buffer */ - u_int sb_hiwat; /* max actual char count */ - u_int sb_mbcnt; /* chars of mbufs used */ - u_int sb_mbmax; /* max chars of mbufs to use */ - u_int sb_ctl; /* non-data chars in buffer */ - int sb_lowat; /* low water mark */ - int sb_timeo; /* timeout for read/write */ - short sb_flags; /* flags, see below */ + u_int sb_cc; /* (c/d) actual chars in buffer */ + u_int sb_hiwat; /* (c/d) max actual char count */ + u_int sb_mbcnt; /* (c/d) chars of mbufs used */ + u_int sb_mbmax; /* (c/d) max chars of mbufs to use */ + u_int sb_ctl; /* (c/d) non-data chars in buffer */ + int sb_lowat; /* (c/d) low water mark */ + int sb_timeo; /* (c/d) timeout for read/write */ + short sb_flags; /* (c/d) flags, see below */ } so_rcv, so_snd; #define SB_MAX (256*1024) /* default for max chars in sockbuf */ #define SB_LOCK 0x01 /* lock on data queue */ @@ -112,9 +124,9 @@ void (*so_upcall)(struct socket *, void *, int); void *so_upcallarg; - struct ucred *so_cred; /* user credentials */ - struct label *so_label; /* MAC label for socket */ - struct label *so_peerlabel; /* cached MAC label for socket peer */ + struct ucred *so_cred; /* (a) user credentials */ + struct label *so_label; /* (b) MAC label for socket */ + struct label *so_peerlabel; /* (b) cached MAC label for socket peer */ /* NB: generation count must not be first; easiest to make it last. */ so_gen_t so_gencnt; /* generation count */ void *so_emuldata; /* private data for emulators */ @@ -132,6 +144,16 @@ } \ } while (/*CONSTCOND*/0) +/* + * Global accept mutex to serialize access to accept queues and + * fields associated with multiple sockets. This allows us to + * avoid defining a lock order between listen and accept sockets + * until such time as it proves to be a good idea. + */ +extern struct mtx accept_mtx; +#define ACCEPT_LOCK() mtx_lock(&accept_mtx) +#define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) + #define SOCKBUF_MTX(_sb) (&(_sb)->sb_mtx) #define SOCKBUF_LOCK_INIT(_sb, _name) \ mtx_init(SOCKBUF_MTX(_sb), _name, NULL, MTX_DEF) @@ -148,24 +170,39 @@ #define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) #define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) -/* +/*- * Socket state bits. + * + * Historically, this bits were all kept in the so_state field. For + * locking reasons, they are now in multiple fields, as they are + * locked differently. so_state maintains basic socket state protected + * by the socket lock. so_qstate holds information about the socket + * accept queues. Each socket bufer also has a state field holding + * information relevant to that socket buffer (can't send, rcv). Many + * fields will be read without locks to improve performance and avoid + * lock order issues. However, this approach must be used with caution. */ #define SS_NOFDREF 0x0001 /* no file table ref any more */ #define SS_ISCONNECTED 0x0002 /* socket connected to a peer */ #define SS_ISCONNECTING 0x0004 /* in process of connecting to peer */ #define SS_ISDISCONNECTING 0x0008 /* in process of disconnecting */ +#define SS_NBIO 0x0100 /* non-blocking ops */ +#define SS_ASYNC 0x0200 /* async i/o notify */ +#define SS_ISCONFIRMING 0x0400 /* deciding to accept connection req */ +#define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ + +/* + * Socket state bits now stored in the socket buffer state field. + */ #define SS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SS_RCVATMARK 0x0040 /* at mark on input */ -#define SS_NBIO 0x0100 /* non-blocking ops */ -#define SS_ASYNC 0x0200 /* async i/o notify */ -#define SS_ISCONFIRMING 0x0400 /* deciding to accept connection req */ - +/* + * Socket state bits now stored in so_qstate. + */ #define SS_INCOMP 0x0800 /* unaccepted, incomplete connection */ #define SS_COMP 0x1000 /* unaccepted, complete connection */ -#define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ /* * Externalized form of struct socket used by the sysctl(3) interface. @@ -227,7 +264,7 @@ /* can we read something from so? */ #define soreadable(so) \ ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \ - ((so)->so_state & SS_CANTRCVMORE) || \ + ((so)->so_rcv.sb_state & SS_CANTRCVMORE) || \ !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) /* can we write something to so? */ @@ -235,7 +272,7 @@ ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ (((so)->so_state&SS_ISCONNECTED) || \ ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \ - ((so)->so_state & SS_CANTSENDMORE) || \ + ((so)->so_snd.sb_state & SS_CANTSENDMORE) || \ (so)->so_error) /* adjust counters in sb reflecting allocation of m */