--- kern/uipc_usrreq.c.old 2006/06/26 16:20:56 +++ kern/uipc_usrreq.c 2006/06/29 23:09:34 @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -88,32 +89,124 @@ struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); /* - * Currently, UNIX domain sockets are protected by a single subsystem lock, - * which covers global data structures and variables, the contents of each - * per-socket unpcb structure, and the so_pcb field in sockets attached to - * the UNIX domain. This provides for a moderate degree of paralellism, as - * receive operations on UNIX domain sockets do not need to acquire the - * subsystem lock. Finer grained locking to permit send() without acquiring - * a global lock would be a logical next step. + * Both send and receive buffers are allocated PIPSIZ bytes of buffering + * for stream sockets, although the total for sender and receiver is + * actually only PIPSIZ. + * Datagram sockets really use the sendspace as the maximum datagram size, + * and don't really want to reserve the sendspace. Their recvspace should + * be large enough for at least one max-size datagram plus address. + */ +#ifndef PIPSIZ +#define PIPSIZ 8192 +#endif +static u_long unpst_sendspace = PIPSIZ; +static u_long unpst_recvspace = PIPSIZ; +static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +static u_long unpdg_recvspace = 4*1024; + +static int unp_rights; /* file descriptors in flight */ + +SYSCTL_DECL(_net_local_stream); +SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, + &unpst_sendspace, 0, ""); +SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, + &unpst_recvspace, 0, ""); +SYSCTL_DECL(_net_local_dgram); +SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, + &unpdg_sendspace, 0, ""); +SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, + &unpdg_recvspace, 0, ""); +SYSCTL_DECL(_net_local); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); + +/*- + * Locking and synchronization: + * + * A global UNIX domain socket rwlock protects all global variables in the + * implementation, as well as the linked lists tracking the set of allocated + * UNIX domain sockets. These variables/fields may be read lockless using + * atomic operations if stale or mutually inconsistent values are + * permissible; otherwise the global rwlock is required to read or + * read-modify-write. The global rwlock also serves to prevent deadlock when + * multiple PCB locks may be acquired at once (see below). Finally, the + * global rwlock protects uncounted references from vnodes to sockets bound + * to those vnodes: to safely dereference the v_socket pointer, the global + * rwlock must be held while a full reference is acquired. Some cases: + * + * - For consistent read-modify-write of global state, hold the global lock + * writable. + * + * - For consistent multiple-read and non-staleness of global state, hold the + * global lock writable. + * + * - To prevent changes to the global linkage, hold the global lock readable. + * + * - When modifying global linkage, hold the global lock writable. + * + * - When acquiring multiple unpcb locks at a time, hold the global lock + * writable. + * + * UNIX domain sockets each have one unpcb PCB associated with them from + * pru_attach() to pru_detach() via the so_pcb pointer. The validity of that + * reference is an invariant for the lifetime of the socket, so no lock is + * required to dereference the so_pcb pointer if a valid socket reference is + * held. + * + * Each PCB has a back-pointer to its socket, unp_socket. This pointer may + * only be safely dereferenced as long as a valid reference to the PCB is + * held. Typically, this reference will be from the socket, or from another + * PCB when the referring PCB's lock is held (in order that the reference not + * be invalidated during use). In particular, to follow + * unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn. + * + * Fields of PCBs are locked using a per-unpcb lock, unp_mtx. Individual + * atomic reads without the lock may be performed "lockless", but more + * complex reads and read-modify-writes require the mutex to be held. No + * lock order is defined between PCB locks -- multiple PCB locks may be + * acquired at the same time only when holding the global UNIX domain socket + * rwlock, which prevents deadlocks. To prevent inter-PCB references from + * becoming invalid, either the per-unpcb lock of the unpcb holding the + * reference must be held, or the global rwlock must be held readable for the + * lifetime of the use of the reference. * - * The UNIX domain socket lock preceds all socket layer locks, including the - * socket lock and socket buffer lock, permitting UNIX domain socket code to - * call into socket support routines without releasing its locks. + * Blocking with UNIX domain sockets is a tricky issue: unlike most network + * protocols, bind() is a non-atomic operation, and connect() requires + * potential sleeping in the protocol, due to potentially waiting on local or + * distributed file systems. We try to separate "lookup" operations, which + * may sleep, and the IPC operations themselves, which typically can occur + * with relative atomicity as locks can be held over the entire operation. * - * Some caution is required in areas where the UNIX domain socket code enters - * VFS in order to create or find rendezvous points. This results in - * dropping of the UNIX domain socket subsystem lock, acquisition of the - * Giant lock, and potential sleeping. This increases the chances of races, - * and exposes weaknesses in the socket->protocol API by offering poor - * failure modes. + * Another tricky issue is simultaneous multi-threaded or multi-process + * access to a single UNIX domain socket. These are handled by the flags + * UNP_CONNECTING and UNP_BINDING. */ -static struct mtx unp_mtx; -#define UNP_LOCK_INIT() \ - mtx_init(&unp_mtx, "unp", NULL, MTX_DEF) -#define UNP_LOCK() mtx_lock(&unp_mtx) -#define UNP_UNLOCK() mtx_unlock(&unp_mtx) -#define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED) -#define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED) +static struct rwlock unp_global_rwlock; + +#define UNP_GLOBAL_LOCK_INIT() rw_init(&unp_global_rwlock, \ + "unp_global_rwlock") + +#define UNP_GLOBAL_LOCK_ASSERT() rw_assert(&unp_global_rwlock, \ + RA_LOCKED) +#define UNP_GLOBAL_UNLOCK_ASSERT() rw_assert(&unp_global_rwlock, \ + RA_UNLOCKED) + +#define UNP_GLOBAL_WLOCK() rw_wlock(&unp_global_rwlock) +#define UNP_GLOBAL_WUNLOCK() rw_wunlock(&unp_global_rwlock) +#define UNP_GLOBAL_WLOCK_ASSERT() rw_assert(&unp_global_rwlock, \ + RA_WLOCKED) + +#define UNP_GLOBAL_RLOCK() rw_rlock(&unp_global_rwlock) +#define UNP_GLOBAL_RUNLOCK() rw_runlock(&unp_global_rwlock) +#define UNP_GLOBAL_RLOCK_ASSERT() rw_assert(&unp_global_rwlock, \ + RA_RLOCKED) + +#define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \ + "unp_mtx", "unp_mtx", \ + MTX_DUPOK|MTX_DEF|MTX_RECURSE) +#define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx) +#define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx) +#define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx) +#define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) /* * Garbage collection of cyclic file descriptor/socket references occurs @@ -123,12 +216,10 @@ */ static struct task unp_gc_task; -static int unp_attach(struct socket *); static void unp_detach(struct unpcb *); -static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *); static int unp_connect(struct socket *,struct sockaddr *, struct thread *); static int unp_connect2(struct socket *so, struct socket *so2, int); -static void unp_disconnect(struct unpcb *); +static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *, int); static void unp_gc(__unused void *, int); @@ -137,8 +228,6 @@ static void unp_discard(struct file *); static void unp_freerights(struct file **, int); static int unp_internalize(struct mbuf **, struct thread *); -static int unp_listen(struct socket *, struct unpcb *, int, - struct thread *); static void uipc_abort(struct socket *so) @@ -147,54 +236,203 @@ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); - UNP_LOCK(); + + UNP_GLOBAL_WLOCK(); + UNP_PCB_LOCK(unp); unp_drop(unp, ECONNABORTED); unp_detach(unp); - UNP_UNLOCK_ASSERT(); + UNP_GLOBAL_UNLOCK_ASSERT(); } static int uipc_accept(struct socket *so, struct sockaddr **nam) { - struct unpcb *unp; + struct unpcb *unp, *unp2; const struct sockaddr *sa; /* - * Pass back name of connected socket, - * if it was bound and we are still connected - * (our peer may have closed already!). + * Pass back name of connected socket, if it was bound and we are + * still connected (our peer may have closed already!). */ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); + *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); - UNP_LOCK(); - if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) - sa = (struct sockaddr *) unp->unp_conn->unp_addr; - else + UNP_GLOBAL_RLOCK(); + unp2 = unp->unp_conn; + if (unp2 != NULL && unp2->unp_addr != NULL) { + UNP_PCB_LOCK(unp2); + sa = (struct sockaddr *) unp2->unp_addr; + bcopy(sa, *nam, sa->sa_len); + UNP_PCB_UNLOCK(unp2); + } else { sa = &sun_noname; - bcopy(sa, *nam, sa->sa_len); - UNP_UNLOCK(); + bcopy(sa, *nam, sa->sa_len); + } + UNP_GLOBAL_RUNLOCK(); return (0); } static int uipc_attach(struct socket *so, int proto, struct thread *td) { + u_long sendspace, recvspace; + struct unpcb *unp; + int error; + + KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + switch (so->so_type) { + case SOCK_STREAM: + sendspace = unpst_sendspace; + recvspace = unpst_recvspace; + break; + + case SOCK_DGRAM: + sendspace = unpdg_sendspace; + recvspace = unpdg_recvspace; + break; - return (unp_attach(so)); + default: + panic("uipc_attach"); + } + error = soreserve(so, sendspace, recvspace); + if (error) + return (error); + } + unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO); + if (unp == NULL) + return (ENOBUFS); + LIST_INIT(&unp->unp_refs); + UNP_PCB_LOCK_INIT(unp); + unp->unp_socket = so; + so->so_pcb = unp; + + UNP_GLOBAL_WLOCK(); + unp->unp_gencnt = ++unp_gencnt; + unp_count++; + LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead + : &unp_shead, unp, unp_link); + UNP_GLOBAL_WUNLOCK(); + + return (0); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + struct sockaddr_un *soun = (struct sockaddr_un *)nam; + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + int error, namelen; + struct nameidata nd; struct unpcb *unp; - int error; + char *buf; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); - UNP_LOCK(); - error = unp_bind(unp, nam, td); - UNP_UNLOCK(); + + namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); + if (namelen <= 0) + return (EINVAL); + + /* + * We don't allow simultaneous bind() calls on a single UNIX domain + * socket, so flag in-progress operations, and return an error if an + * operation is already in progress. + * + * Historically, we have not allowed a socket to be rebound, so this + * also returns an error. Not allowing re-binding certainly + * simplifies the implementation and avoids a great many possible + * failure modes. + */ + UNP_PCB_LOCK(unp); + if (unp->unp_vnode != NULL) { + UNP_PCB_UNLOCK(unp); + return (EINVAL); + } + if (unp->unp_flags & UNP_BINDING) { + UNP_PCB_UNLOCK(unp); + return (EALREADY); + } + unp->unp_flags |= UNP_BINDING; + UNP_PCB_UNLOCK(unp); + + buf = malloc(namelen + 1, M_TEMP, M_WAITOK); + strlcpy(buf, soun->sun_path, namelen + 1); + + mtx_lock(&Giant); +restart: + mtx_assert(&Giant, MA_OWNED); + NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, + buf, td); +/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ + error = namei(&nd); + if (error) + goto error; + vp = nd.ni_vp; + if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULL) { + vrele(vp); + error = EADDRINUSE; + goto error; + } + error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); + if (error) + goto error; + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VSOCK; + vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); +#ifdef MAC + error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, + &vattr); +#endif + if (error == 0) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (error) { + vn_finished_write(mp); + goto error; + } + vp = nd.ni_vp; + ASSERT_VOP_LOCKED(vp, "uipc_bind"); + soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); + + /* + * XXXRW: handle race against another consumer also frobbing + * v_socket? Or not. + */ + UNP_GLOBAL_WLOCK(); + UNP_PCB_LOCK(unp); + vp->v_socket = unp->unp_socket; + unp->unp_vnode = vp; + unp->unp_addr = soun; + unp->unp_flags &= ~UNP_BINDING; + UNP_PCB_UNLOCK(unp); + UNP_GLOBAL_WUNLOCK(); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + mtx_unlock(&Giant); + free(buf, M_TEMP); + return (0); + +error: + UNP_PCB_LOCK(unp); + unp->unp_flags &= ~UNP_BINDING; + UNP_PCB_UNLOCK(unp); + mtx_unlock(&Giant); + free(buf, M_TEMP); return (error); } @@ -204,23 +442,30 @@ int error; KASSERT(td == curthread, ("uipc_connect: td != curthread")); - UNP_LOCK(); + + UNP_GLOBAL_WLOCK(); error = unp_connect(so, nam, td); - UNP_UNLOCK(); + UNP_GLOBAL_WUNLOCK(); return (error); } int uipc_connect2(struct socket *so1, struct socket *so2) { - struct unpcb *unp; + struct unpcb *unp, *unp2; int error; - unp = sotounpcb(so1); + UNP_GLOBAL_WLOCK(); + unp = so1->so_pcb; KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); - UNP_LOCK(); + UNP_PCB_LOCK(unp); + unp2 = so2->so_pcb; + KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL")); + UNP_PCB_LOCK(unp2); error = unp_connect2(so1, so2, PRU_CONNECT2); - UNP_UNLOCK(); + UNP_PCB_UNLOCK(unp2); + UNP_PCB_UNLOCK(unp); + UNP_GLOBAL_WUNLOCK(); return (error); } @@ -233,21 +478,31 @@ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); - UNP_LOCK(); + + UNP_GLOBAL_WLOCK(); + UNP_PCB_LOCK(unp); unp_detach(unp); - UNP_UNLOCK_ASSERT(); + UNP_GLOBAL_UNLOCK_ASSERT(); } static int uipc_disconnect(struct socket *so) { - struct unpcb *unp; + struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); - UNP_LOCK(); - unp_disconnect(unp); - UNP_UNLOCK(); + + UNP_GLOBAL_WLOCK(); + UNP_PCB_LOCK(unp); + unp2 = unp->unp_conn; + if (unp2 != NULL) { + UNP_PCB_LOCK(unp2); + unp_disconnect(unp, unp2); + UNP_PCB_UNLOCK(unp2); + } + UNP_PCB_UNLOCK(unp); + UNP_GLOBAL_WUNLOCK(); return (0); } @@ -259,81 +514,108 @@ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); - UNP_LOCK(); + + UNP_PCB_LOCK(unp); if (unp->unp_vnode == NULL) { - UNP_UNLOCK(); + UNP_PCB_UNLOCK(unp); return (EINVAL); } - error = unp_listen(so, unp, backlog, td); - UNP_UNLOCK(); + + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0) { + cru2x(td->td_ucred, &unp->unp_peercred); + unp->unp_flags |= UNP_HAVEPCCACHED; + solisten_proto(so, backlog); + } + SOCK_UNLOCK(so); + UNP_PCB_UNLOCK(unp); return (error); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { - struct unpcb *unp; + struct unpcb *unp, *unp2; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); + *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); - UNP_LOCK(); - if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) - sa = (struct sockaddr *) unp->unp_conn->unp_addr; - else { - /* - * XXX: It seems that this test always fails even when - * connection is established. So, this else clause is - * added as workaround to return PF_LOCAL sockaddr. - */ + UNP_PCB_LOCK(unp); + /* + * XXX: It seems that this test always fails even when connection is + * established. So, this else clause is added as workaround to + * return PF_LOCAL sockaddr. + */ + unp2 = unp->unp_conn; + if (unp2 != NULL) { + UNP_PCB_LOCK(unp2); + if (unp2->unp_addr != NULL) + sa = (struct sockaddr *) unp->unp_conn->unp_addr; + else + sa = &sun_noname; + bcopy(sa, *nam, sa->sa_len); + UNP_PCB_UNLOCK(unp2); + } else { sa = &sun_noname; + bcopy(sa, *nam, sa->sa_len); } - bcopy(sa, *nam, sa->sa_len); - UNP_UNLOCK(); + UNP_PCB_UNLOCK(unp); return (0); } static int uipc_rcvd(struct socket *so, int flags) { - struct unpcb *unp; + struct unpcb *unp, *unp2; struct socket *so2; + u_int mbcnt, sbcc; u_long newhiwat; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); - UNP_LOCK(); - switch (so->so_type) { - case SOCK_DGRAM: + + if (so->so_type == SOCK_DGRAM) panic("uipc_rcvd DGRAM?"); - /*NOTREACHED*/ - case SOCK_STREAM: - if (unp->unp_conn == NULL) - break; - so2 = unp->unp_conn->unp_socket; - SOCKBUF_LOCK(&so2->so_snd); - SOCKBUF_LOCK(&so->so_rcv); - /* - * Adjust backpressure on sender - * and wakeup any waiting to write. - */ - so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt; - unp->unp_mbcnt = so->so_rcv.sb_mbcnt; - newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - - so->so_rcv.sb_cc; - (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, - newhiwat, RLIM_INFINITY); - unp->unp_cc = so->so_rcv.sb_cc; - SOCKBUF_UNLOCK(&so->so_rcv); - sowwakeup_locked(so2); - break; + if (so->so_type != SOCK_STREAM) + panic("uipc_rcvd unknown socktype"); - default: - panic("uipc_rcvd unknown socktype"); + /* + * Adjust backpressure on sender and wakeup any waiting to write. + * + * The consistency requirements here are a bit complex: we must + * acquire the lock for our own unpcb in order to prevent it from + * disconnecting while in use, changing the unp_conn peer. We do not + * need unp2's lock, since the unp2->unp_socket pointer will remain + * static as long as the unp2 pcb is valid, which it will be until we + * release unp's lock to allow a disconnect. We do need socket + * mutexes for both socket endpoints since we manipulate fields in + * both; we hold both locks at once since we access both + * simultaneously. + */ + SOCKBUF_LOCK(&so->so_rcv); + mbcnt = so->so_rcv.sb_mbcnt; + sbcc = so->so_rcv.sb_cc; + SOCKBUF_UNLOCK(&so->so_rcv); + UNP_PCB_LOCK(unp); + unp2 = unp->unp_conn; + if (unp2 == NULL) { + UNP_PCB_UNLOCK(unp); + return (0); } - UNP_UNLOCK(); + so2 = unp2->unp_socket; + SOCKBUF_LOCK(&so2->so_snd); + so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; + newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; + (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, + newhiwat, RLIM_INFINITY); + sowwakeup_locked(so2); + unp->unp_cc = sbcc; + unp->unp_mbcnt = mbcnt; + UNP_PCB_UNLOCK(unp); return (0); } @@ -343,13 +625,15 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { - int error = 0; - struct unpcb *unp; + struct unpcb *unp, *unp2; struct socket *so2; + u_int mbcnt, sbcc; u_long newhiwat; + int error = 0; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_send: unp == NULL")); + if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; @@ -358,32 +642,44 @@ if (control != NULL && (error = unp_internalize(&control, td))) goto release; - UNP_LOCK(); + if ((nam != NULL) || (flags & PRUS_EOF)) + UNP_GLOBAL_WLOCK(); + else + UNP_GLOBAL_RLOCK(); + switch (so->so_type) { case SOCK_DGRAM: { const struct sockaddr *from; + unp2 = unp->unp_conn; if (nam != NULL) { - if (unp->unp_conn != NULL) { + if (unp2 != NULL) { error = EISCONN; + UNP_PCB_LOCK(unp); break; } error = unp_connect(so, nam, td); + UNP_PCB_LOCK(unp); if (error) break; + unp2 = unp->unp_conn; } else { - if (unp->unp_conn == NULL) { + UNP_PCB_LOCK(unp); + if (unp2 == NULL) { error = ENOTCONN; + UNP_PCB_LOCK(unp); break; } } - so2 = unp->unp_conn->unp_socket; + UNP_PCB_LOCK_ASSERT(unp); + UNP_PCB_LOCK(unp2); + so2 = unp2->unp_socket; if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; - if (unp->unp_conn->unp_flags & UNP_WANTCRED) + if (unp2->unp_flags & UNP_WANTCRED) control = unp_addsockcred(td, control); SOCKBUF_LOCK(&so2->so_rcv); if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { @@ -395,43 +691,57 @@ error = ENOBUFS; } if (nam != NULL) - unp_disconnect(unp); + unp_disconnect(unp, unp2); + UNP_PCB_UNLOCK(unp2); break; } case SOCK_STREAM: /* Connect if not connected yet. */ /* - * Note: A better implementation would complain - * if not equal to the peer's address. + * Note: A better implementation would complain if not equal + * to the peer's address. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); + UNP_PCB_LOCK(unp); if (error) break; /* XXX */ } else { error = ENOTCONN; + UNP_PCB_LOCK(unp); break; } - } + } else + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK_ASSERT(unp); - SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { - SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; break; } - if (unp->unp_conn == NULL) - panic("uipc_send connected but no connection?"); - so2 = unp->unp_conn->unp_socket; + /* + * Lock order here has to be handled carefully: we hold the + * global lock, so acquiring two unpcb locks is OK. We must + * acquire both before acquiring any socket mutexes. We must + * also acquire the local socket send mutex before the remote + * socket receive mutex. The only tricky thing is making + * sure to acquire the unp2 lock before the local socket send + * lock, or we will experience deadlocks. + */ + unp2 = unp->unp_conn; + KASSERT(unp2 != NULL, + ("uipc_send connected but no connection?")); + UNP_PCB_LOCK(unp2); + so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); - if (unp->unp_conn->unp_flags & UNP_WANTCRED) { + if (unp2->unp_flags & UNP_WANTCRED) { /* * Credentials are passed only once on * SOCK_STREAM. */ - unp->unp_conn->unp_flags &= ~UNP_WANTCRED; + unp2->unp_flags &= ~UNP_WANTCRED; control = unp_addsockcred(td, control); } /* @@ -442,19 +752,20 @@ if (control != NULL) { if (sbappendcontrol_locked(&so2->so_rcv, m, control)) control = NULL; - } else { + } else sbappend_locked(&so2->so_rcv, m); - } - so->so_snd.sb_mbmax -= - so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt; - unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt; - newhiwat = so->so_snd.sb_hiwat - - (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc); + mbcnt = so2->so_rcv.sb_mbcnt; + sbcc = so2->so_rcv.sb_cc; + sorwakeup_locked(so2); + SOCKBUF_LOCK(&so->so_snd); + newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc); (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); + so->so_snd.sb_mbmax -= mbcnt - unp->unp_conn->unp_mbcnt; SOCKBUF_UNLOCK(&so->so_snd); - unp->unp_conn->unp_cc = so2->so_rcv.sb_cc; - sorwakeup_locked(so2); + unp2->unp_mbcnt = mbcnt; + unp2->unp_cc = sbcc; + UNP_PCB_UNLOCK(unp2); m = NULL; break; @@ -470,7 +781,12 @@ socantsendmore(so); unp_shutdown(unp); } - UNP_UNLOCK(); + UNP_PCB_UNLOCK(unp); + + if ((nam != NULL) || (flags & PRUS_EOF)) + UNP_GLOBAL_WUNLOCK(); + else + UNP_GLOBAL_RUNLOCK(); if (control != NULL && error != 0) unp_dispose(control); @@ -486,22 +802,26 @@ static int uipc_sense(struct socket *so, struct stat *sb) { - struct unpcb *unp; + struct unpcb *unp, *unp2; struct socket *so2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); - UNP_LOCK(); + sb->st_blksize = so->so_snd.sb_hiwat; - if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { - so2 = unp->unp_conn->unp_socket; + UNP_GLOBAL_RLOCK(); + UNP_PCB_LOCK(unp); + unp2 = unp->unp_conn; + if (so->so_type == SOCK_STREAM && unp2 != NULL) { + so2 = unp2->unp_socket; sb->st_blksize += so2->so_rcv.sb_cc; } sb->st_dev = NODEV; if (unp->unp_ino == 0) unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; sb->st_ino = unp->unp_ino; - UNP_UNLOCK(); + UNP_PCB_UNLOCK(unp); + UNP_GLOBAL_RUNLOCK(); return (0); } @@ -512,10 +832,13 @@ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); - UNP_LOCK(); + + UNP_GLOBAL_WLOCK(); + UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); - UNP_UNLOCK(); + UNP_PCB_UNLOCK(unp); + UNP_GLOBAL_WUNLOCK(); return (0); } @@ -527,14 +850,15 @@ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); + *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); - UNP_LOCK(); + UNP_PCB_LOCK(unp); if (unp->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); - UNP_UNLOCK(); + UNP_PCB_UNLOCK(unp); return (0); } @@ -571,12 +895,13 @@ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); - UNP_LOCK(); + error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: + UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { @@ -585,22 +910,31 @@ else error = EINVAL; } + UNP_PCB_UNLOCK(unp); if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); break; + case LOCAL_CREDS: + UNP_PCB_LOCK(unp); optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; + UNP_PCB_UNLOCK(unp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; + case LOCAL_CONNWAIT: + UNP_PCB_LOCK(unp); optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; + UNP_PCB_UNLOCK(unp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; + default: error = EOPNOTSUPP; break; } break; + case SOPT_SET: switch (sopt->sopt_name) { case LOCAL_CREDS: @@ -610,19 +944,24 @@ if (error) break; -#define OPTSET(bit) \ - if (optval) \ - unp->unp_flags |= bit; \ - else \ - unp->unp_flags &= ~bit; +#define OPTSET(bit) do { \ + UNP_PCB_LOCK(unp); \ + if (optval) \ + unp->unp_flags |= bit; \ + else \ + unp->unp_flags &= ~bit; \ + UNP_PCB_UNLOCK(unp); \ +} while (0) switch (sopt->sopt_name) { case LOCAL_CREDS: OPTSET(UNP_WANTCRED); break; + case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT); break; + default: break; } @@ -633,117 +972,60 @@ break; } break; + default: error = EOPNOTSUPP; break; } - UNP_UNLOCK(); return (error); } -/* - * Both send and receive buffers are allocated PIPSIZ bytes of buffering - * for stream sockets, although the total for sender and receiver is - * actually only PIPSIZ. - * Datagram sockets really use the sendspace as the maximum datagram size, - * and don't really want to reserve the sendspace. Their recvspace should - * be large enough for at least one max-size datagram plus address. - */ -#ifndef PIPSIZ -#define PIPSIZ 8192 -#endif -static u_long unpst_sendspace = PIPSIZ; -static u_long unpst_recvspace = PIPSIZ; -static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ -static u_long unpdg_recvspace = 4*1024; - -static int unp_rights; /* file descriptors in flight */ - -SYSCTL_DECL(_net_local_stream); -SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, - &unpst_sendspace, 0, ""); -SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, - &unpst_recvspace, 0, ""); -SYSCTL_DECL(_net_local_dgram); -SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, - &unpdg_sendspace, 0, ""); -SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, - &unpdg_recvspace, 0, ""); -SYSCTL_DECL(_net_local); -SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); - -static int -unp_attach(struct socket *so) -{ - struct unpcb *unp; - int error; - - KASSERT(so->so_pcb == NULL, ("unp_attach: so_pcb != NULL")); - if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - switch (so->so_type) { - - case SOCK_STREAM: - error = soreserve(so, unpst_sendspace, unpst_recvspace); - break; - - case SOCK_DGRAM: - error = soreserve(so, unpdg_sendspace, unpdg_recvspace); - break; - - default: - panic("unp_attach"); - } - if (error) - return (error); - } - unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO); - if (unp == NULL) - return (ENOBUFS); - LIST_INIT(&unp->unp_refs); - unp->unp_socket = so; - so->so_pcb = unp; - - UNP_LOCK(); - unp->unp_gencnt = ++unp_gencnt; - unp_count++; - LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead - : &unp_shead, unp, unp_link); - UNP_UNLOCK(); - - return (0); -} - static void unp_detach(struct unpcb *unp) { + int local_unp_rights; struct vnode *vp; - int local_unp_rights; + struct unpcb *unp2; - UNP_LOCK_ASSERT(); + UNP_GLOBAL_WLOCK_ASSERT(); + UNP_PCB_LOCK_ASSERT(unp); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; + + /* + * XXXRW: What if v_socket != our soket? + */ if ((vp = unp->unp_vnode) != NULL) { - /* - * XXXRW: should v_socket be frobbed only while holding - * Giant? - */ unp->unp_vnode->v_socket = NULL; unp->unp_vnode = NULL; } - if (unp->unp_conn != NULL) - unp_disconnect(unp); + unp2 = unp->unp_conn; + if (unp2 != NULL) { + UNP_PCB_LOCK(unp2); + unp_disconnect(unp, unp2); + UNP_PCB_UNLOCK(unp2); + } + + /* + * We hold the global lock, so it's OK to acquire multiple pcb locks + * at a time. + */ while (!LIST_EMPTY(&unp->unp_refs)) { struct unpcb *ref = LIST_FIRST(&unp->unp_refs); + + UNP_PCB_LOCK(ref); unp_drop(ref, ECONNRESET); + UNP_PCB_UNLOCK(ref); } + UNP_GLOBAL_WUNLOCK(); soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = NULL; local_unp_rights = unp_rights; - UNP_UNLOCK(); if (unp->unp_addr != NULL) FREE(unp->unp_addr, M_SONAME); + UNP_PCB_LOCK_DESTROY(unp); uma_zfree(unp_zone, unp); if (vp) { int vfslocked; @@ -757,97 +1039,6 @@ } static int -unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td) -{ - struct sockaddr_un *soun = (struct sockaddr_un *)nam; - struct vnode *vp; - struct mount *mp; - struct vattr vattr; - int error, namelen; - struct nameidata nd; - char *buf; - - UNP_LOCK_ASSERT(); - - /* - * XXXRW: This test-and-set of unp_vnode is non-atomic; the - * unlocked read here is fine, but the value of unp_vnode needs - * to be tested again after we do all the lookups to see if the - * pcb is still unbound? - */ - if (unp->unp_vnode != NULL) - return (EINVAL); - - namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); - if (namelen <= 0) - return (EINVAL); - - UNP_UNLOCK(); - - buf = malloc(namelen + 1, M_TEMP, M_WAITOK); - strlcpy(buf, soun->sun_path, namelen + 1); - - mtx_lock(&Giant); -restart: - mtx_assert(&Giant, MA_OWNED); - NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, - buf, td); -/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ - error = namei(&nd); - if (error) - goto done; - vp = nd.ni_vp; - if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { - NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (vp != NULL) { - vrele(vp); - error = EADDRINUSE; - goto done; - } - error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); - if (error) - goto done; - goto restart; - } - VATTR_NULL(&vattr); - vattr.va_type = VSOCK; - vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); -#ifdef MAC - error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, - &vattr); -#endif - if (error == 0) { - VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); - error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); - } - NDFREE(&nd, NDF_ONLY_PNBUF); - vput(nd.ni_dvp); - if (error) { - vn_finished_write(mp); - goto done; - } - vp = nd.ni_vp; - ASSERT_VOP_LOCKED(vp, "unp_bind"); - soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); - UNP_LOCK(); - vp->v_socket = unp->unp_socket; - unp->unp_vnode = vp; - unp->unp_addr = soun; - UNP_UNLOCK(); - VOP_UNLOCK(vp, 0, td); - vn_finished_write(mp); -done: - mtx_unlock(&Giant); - free(buf, M_TEMP); - UNP_LOCK(); - return (error); -} - -static int unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; @@ -859,15 +1050,25 @@ char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; - UNP_LOCK_ASSERT(); + UNP_GLOBAL_WLOCK_ASSERT(); + UNP_GLOBAL_WUNLOCK(); unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); + len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); strlcpy(buf, soun->sun_path, len + 1); - UNP_UNLOCK(); + + UNP_PCB_LOCK(unp); + if (unp->unp_flags & UNP_CONNECTING) { + UNP_PCB_UNLOCK(unp); + return (EALREADY); + } + unp->unp_flags |= UNP_CONNECTING; + UNP_PCB_UNLOCK(unp); + sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); @@ -889,9 +1090,15 @@ if (error) goto bad; mtx_unlock(&Giant); - UNP_LOCK(); + unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); + + /* + * Lock global lock for two reasons: make sure v_socket is stable, + * and to protect simultaneous locking of multiple pcbs. + */ + UNP_GLOBAL_WLOCK(); so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; @@ -904,14 +1111,18 @@ if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { /* - * NB: drop locks here so unp_attach is entered + * NB: drop locks here so uipc_attach is entered * w/o locks; this avoids a recursive lock * of the head and holding sleep locks across * a (potentially) blocking malloc. + * + * XXXRW: This is actually a non-blocking alloc. + * Lock order issue is real. Releasing lock here + * may invalidate so2 pointer, however. */ - UNP_UNLOCK(); + UNP_GLOBAL_WUNLOCK(); so3 = sonewconn(so2, 0); - UNP_LOCK(); + UNP_GLOBAL_WLOCK(); } else so3 = NULL; if (so3 == NULL) { @@ -921,6 +1132,9 @@ unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); + UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; @@ -938,7 +1152,7 @@ /* * The receiver's (server's) credentials are copied * from the unp_peercred member of socket on which the - * former called listen(); unp_listen() cached that + * former called listen(); uipc_listen() cached that * process's credentials at that time so we can use * them now. */ @@ -949,6 +1163,9 @@ unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; + UNP_PCB_UNLOCK(unp3); + UNP_PCB_UNLOCK(unp2); + UNP_PCB_UNLOCK(unp); #ifdef MAC SOCK_LOCK(so); mac_set_socket_peer_from_socket(so, so3); @@ -958,9 +1175,17 @@ so2 = so3; } + unp = sotounpcb(so); + KASSERT(unp != NULL, ("unp_connect: unp == NULL")); + unp2 = sotounpcb(so2); + KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); error = unp_connect2(so, so2, PRU_CONNECT); + UNP_PCB_UNLOCK(unp2); + UNP_PCB_UNLOCK(unp); bad2: - UNP_UNLOCK(); + UNP_GLOBAL_WUNLOCK(); mtx_lock(&Giant); bad: mtx_assert(&Giant, MA_OWNED); @@ -968,25 +1193,33 @@ vput(vp); mtx_unlock(&Giant); free(sa, M_SONAME); - UNP_LOCK(); + UNP_GLOBAL_WLOCK(); + UNP_PCB_LOCK(unp); + unp->unp_flags &= ~UNP_CONNECTING; + UNP_PCB_UNLOCK(unp); return (error); } static int unp_connect2(struct socket *so, struct socket *so2, int req) { - struct unpcb *unp = sotounpcb(so); + struct unpcb *unp; struct unpcb *unp2; - UNP_LOCK_ASSERT(); + unp = sotounpcb(so); + KASSERT(unp != NULL, ("unp_connect2: unp == NULL")); + unp2 = sotounpcb(so2); + KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); + + UNP_GLOBAL_WLOCK_ASSERT(); + UNP_PCB_LOCK_ASSERT(unp); + UNP_PCB_LOCK_ASSERT(unp2); if (so2->so_type != so->so_type) return (EPROTOTYPE); - unp2 = sotounpcb(so2); - KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); unp->unp_conn = unp2; + switch (so->so_type) { - case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); soisconnected(so); @@ -1009,15 +1242,16 @@ } static void -unp_disconnect(struct unpcb *unp) +unp_disconnect(struct unpcb *unp, struct unpcb *unp2) { - struct unpcb *unp2 = unp->unp_conn; struct socket *so; - UNP_LOCK_ASSERT(); + KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL")); + + UNP_GLOBAL_WLOCK_ASSERT(); + UNP_PCB_LOCK_ASSERT(unp); + UNP_PCB_LOCK_ASSERT(unp2); - if (unp2 == NULL) - return; unp->unp_conn = NULL; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: @@ -1074,10 +1308,10 @@ * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); - UNP_LOCK(); + UNP_GLOBAL_RLOCK(); gencnt = unp_gencnt; n = unp_count; - UNP_UNLOCK(); + UNP_GLOBAL_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; @@ -1091,23 +1325,34 @@ unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); - UNP_LOCK(); + /* + * XXXRW: Note, this code relies very explicitly in pcb's being type + * stable. + */ + UNP_GLOBAL_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { + UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) continue; unp_list[i++] = unp; } + UNP_PCB_UNLOCK(unp); } - UNP_UNLOCK(); + UNP_GLOBAL_RUNLOCK(); n = i; /* in case we lost some during malloc */ + /* + * XXXRW: The logic below asumes that it is OK to lock a mutex in + * an unpcb that may have been freed. + */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < n; i++) { unp = unp_list[i]; + UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = unp; @@ -1125,8 +1370,10 @@ unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); sotoxsocket(unp->unp_socket, &xu->xu_socket); + UNP_PCB_UNLOCK(unp); error = SYSCTL_OUT(req, xu, sizeof *xu); - } + } else + UNP_PCB_UNLOCK(unp); } free(xu, M_TEMP); if (!error) { @@ -1157,24 +1404,37 @@ static void unp_shutdown(struct unpcb *unp) { + struct unpcb *unp2; struct socket *so; - UNP_LOCK_ASSERT(); + UNP_GLOBAL_WLOCK_ASSERT(); + UNP_PCB_LOCK_ASSERT(unp); - if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && - (so = unp->unp_conn->unp_socket)) - socantrcvmore(so); + unp2 = unp->unp_conn; + if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) { + so = unp2->unp_socket; + if (so != NULL) + socantrcvmore(so); + } } static void unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; + struct unpcb *unp2; - UNP_LOCK_ASSERT(); + UNP_GLOBAL_WLOCK_ASSERT(); + UNP_PCB_LOCK_ASSERT(unp); so->so_error = errno; - unp_disconnect(unp); + unp2 = unp->unp_conn; + if (unp2 == NULL) + return; + + UNP_PCB_LOCK(unp2); + unp_disconnect(unp, unp2); + UNP_PCB_UNLOCK(unp2); } static void @@ -1184,7 +1444,6 @@ struct file *fp; for (i = 0; i < fdcount; i++) { - fp = *rp; /* * zero the pointer before calling * unp_discard since it may end up @@ -1192,7 +1451,8 @@ * * XXXRW: This is less true than it used to be. */ - *rp++ = 0; + fp = *rp; + *rp++ = NULL; unp_discard(fp); } } @@ -1212,7 +1472,7 @@ int f; u_int newlen; - UNP_UNLOCK_ASSERT(); + UNP_GLOBAL_UNLOCK_ASSERT(); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ @@ -1317,6 +1577,7 @@ void unp_init(void) { + unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); if (unp_zone == NULL) @@ -1327,7 +1588,7 @@ LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); - UNP_LOCK_INIT(); + UNP_GLOBAL_LOCK_INIT(); } static int @@ -1347,7 +1608,7 @@ int error, oldfds; u_int newlen; - UNP_UNLOCK_ASSERT(); + UNP_GLOBAL_UNLOCK_ASSERT(); error = 0; *controlp = NULL; @@ -1741,25 +2002,6 @@ unp_scan(m, unp_discard); } -static int -unp_listen(struct socket *so, struct unpcb *unp, int backlog, - struct thread *td) -{ - int error; - - UNP_LOCK_ASSERT(); - - SOCK_LOCK(so); - error = solisten_proto_check(so); - if (error == 0) { - cru2x(td->td_ucred, &unp->unp_peercred); - unp->unp_flags |= UNP_HAVEPCCACHED; - solisten_proto(so, backlog); - } - SOCK_UNLOCK(so); - return (error); -} - static void unp_scan(struct mbuf *m0, void (*op)(struct file *)) { @@ -1812,6 +2054,9 @@ static void unp_mark(struct file *fp) { + + /* XXXRW: Should probably assert file list lock here. */ + if (fp->f_gcflag & FMARK) return; unp_defer++; @@ -1821,11 +2066,12 @@ static void unp_discard(struct file *fp) { - UNP_LOCK(); + + UNP_GLOBAL_WLOCK(); FILE_LOCK(fp); fp->f_msgcount--; unp_rights--; FILE_UNLOCK(fp); - UNP_UNLOCK(); + UNP_GLOBAL_WUNLOCK(); (void) closef(fp, (struct thread *)NULL); } --- sys/unpcb.h.old 2005/04/13 00:05:41 +++ sys/unpcb.h 2006/04/27 20:47:29 @@ -78,6 +78,7 @@ unp_gen_t unp_gencnt; /* generation count of this instance */ int unp_flags; /* flags */ struct xucred unp_peercred; /* peer credentials, if applicable */ + struct mtx unp_mtx; /* mutex */ }; /* @@ -98,6 +99,14 @@ #define UNP_WANTCRED 0x004 /* credentials wanted */ #define UNP_CONNWAIT 0x008 /* connect blocks until accepted */ +/* + * These flags are used to handle non-atomicity in connect() and bind() + * operations on a socket: in particular, to avoid races between multiple + * threads or processes operating simultaneously on the same socket. + */ +#define UNP_CONNECTING 0x010 /* Currently connecting. */ +#define UNP_BINDING 0x020 /* Currently binding. */ + #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) /* Hack alert -- this structure depends on . */