--- //depot/vendor/freebsd/src/sys/netinet/in_pcb.c 2008/11/29 14:40:26 +++ //depot/user/rwatson/udp/src/sys/netinet/in_pcb.c 2008/11/29 22:17:14 @@ -1,7 +1,7 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. - * Copyright (c) 2007 Robert N. M. Watson + * Copyright (c) 2007-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -236,6 +236,7 @@ #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + inp->inp_refcount = 1; /* Effectively the inpcbinfo reference. */ #if defined(IPSEC) || defined(MAC) out: @@ -870,6 +871,25 @@ } /* + * Occasionally protocols (TCP) need to drop the inpcb lock in order to + * acquire or upgrade the global lock. Allow the protocol to prevent the + * inpcb from being freed by bumping the refcount, although this doesn't + * prevent the state of the connection from changing (i.e., closing) so the + * caller must re-validate any cached connection state after relocking, + * regardless of the refcount. Drop using in_pcbrele(). + */ +void +in_pcbref(struct inpcb *inp) +{ + + INP_WLOCK_ASSERT(inp); + + KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + + inp->inp_refcount++; +} + +/* * Historically, in_pcbdetach() included the functionality now found in * in_pcbfree() and in_pcbdrop(). They are now broken out to reflect the * more complex life cycle of TCP. @@ -890,15 +910,23 @@ } /* - * in_pcbfree() is responsible for freeing an already-detached inpcb, as well - * as removing it from any global inpcb lists it might be on. + * in_pcbfree_internal() is responsible for freeing an already-detached + * inpcb, as well as removing it from any global inpcb lists it might be on. 
+ * + * It has two wrappers: in_pcbfree(), which is called when a protocol wishes + * to discard the inpcb, and in_pcbrele(), which is called when a protocol + * (TCP) wishes to decrement an elevated refcount on an inpcb acquired using + * in_pcbref(). */ -void -in_pcbfree(struct inpcb *inp) +static void +in_pcbfree_internal(struct inpcb *inp) { struct inpcbinfo *ipi = inp->inp_pcbinfo; - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + KASSERT(inp->inp_socket == NULL, + ("%s: inp_socket != NULL", __func__)); + KASSERT(inp->inp_refcount == 0, + ("%s: refcount !0", __func__)); INP_INFO_WLOCK_ASSERT(ipi); INP_WLOCK_ASSERT(inp); @@ -930,6 +958,59 @@ } /* + * Public interface to in_pcbfree_internal() to be called once the protocol + * has detached the inpcb from a socket. If another thread holds a temporary + * reference, such as in tcp_input(), we defer the actual free until the last + * reference goes away. Even if not immediately freeing, we return with inp + * unlocked. + */ +void +in_pcbfree(struct inpcb *inp) +{ +#ifdef INVARIANTS + struct inpcbinfo *ipi = inp->inp_pcbinfo; +#endif + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", + __func__)); + KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + + INP_INFO_WLOCK_ASSERT(ipi); + INP_WLOCK_ASSERT(inp); + + inp->inp_refcount--; + if (inp->inp_refcount > 0) { + INP_WUNLOCK(inp); + return; + } + in_pcbfree_internal(inp); +} + +/* + * Variation of in_pcbfree() that: (a) returns true if the inpcb was freed + * and false if not, and (b) returns the inpcb in a locked state if it wasn't + * freed. See comment on in_pcbref() for details. 
+ */ +int +in_pcbrele(struct inpcb *inp) +{ +#ifdef INVARIANTS + struct inpcbinfo *ipi = inp->inp_pcbinfo; +#endif + + KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + + INP_INFO_WLOCK_ASSERT(ipi); + INP_WLOCK_ASSERT(inp); + + inp->inp_refcount--; + if (inp->inp_refcount > 0) + return (0); + in_pcbfree_internal(inp); + return (1); +} + +/* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * @@ -1032,6 +1113,10 @@ { struct inpcb *inp, *inp_temp; + /* + * We acquire a write lock on pcbinfo here because TCP will need one, + * but for UDP we could be using a read lock. + */ INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); --- //depot/vendor/freebsd/src/sys/netinet/in_pcb.h 2008/11/19 09:45:46 +++ //depot/user/rwatson/udp/src/sys/netinet/in_pcb.h 2008/11/26 09:51:19 @@ -170,6 +170,7 @@ u_char inp_ip_p; /* (c) protocol proto */ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_ispare1; /* (x) connection id / queue id */ + u_int inp_refcount; /* (i) refcount */ void *inp_pspare[2]; /* (x) rtentry / general use */ /* Local and foreign ports, local and foreign addr. */ @@ -311,6 +312,10 @@ void *ipi_pspare[2]; }; +/* + * XXXRW: rwlock(9) doesn't support classes, only names, unlike mutexes, so + * we use the class as the name. What should we be doing? 
+ */ #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -478,7 +483,9 @@ struct in_addr, u_int, int, struct ifnet *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); +void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); +int in_pcbrele(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); --- //depot/vendor/freebsd/src/sys/netinet/tcp_input.c 2008/11/26 22:35:45 +++ //depot/user/rwatson/udp/src/sys/netinet/tcp_input.c 2008/11/29 22:17:14 @@ -161,6 +161,34 @@ CTLFLAG_RW, tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); +int tcp_read_locking = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW, + &tcp_read_locking, 0, "Enable read locking strategy"); + +int tcp_rlock_atfirst; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rlock_atfirst, CTLFLAG_RD, + &tcp_rlock_atfirst, 0, ""); + +int tcp_wlock_atfirst; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_atfirst, CTLFLAG_RD, + &tcp_wlock_atfirst, 0, ""); + +int tcp_rlock_downgraded; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rlock_downgraded, CTLFLAG_RD, + &tcp_rlock_downgraded, 0, ""); + +int tcp_wlock_upgraded; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_upgraded, CTLFLAG_RD, + &tcp_wlock_upgraded, 0, ""); + +int tcp_wlock_relocked; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_relocked, CTLFLAG_RD, + &tcp_wlock_relocked, 0, ""); + +int tcp_wlock_looped; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_looped, CTLFLAG_RD, + &tcp_wlock_looped, 0, ""); + #ifdef VIMAGE_GLOBALS struct inpcbhead tcb; struct inpcbinfo tcbinfo; @@ -169,7 +197,8 @@ static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, 
uint8_t); + struct socket *, struct tcpcb *, int, int, uint8_t, + int); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, @@ -293,6 +322,10 @@ #endif struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ + int ti_locked; +#define TI_UNLOCKED 1 +#define TI_RLOCKED 2 +#define TI_WLOCKED 3 #ifdef TCPDEBUG /* @@ -445,11 +478,38 @@ drop_hdrlen = off0 + off; /* - * Locate pcb for segment. + * Locate pcb for segment, which requires a lock on tcbinfo. + * Optimistically acquire a global read lock unless header flags imply + * a state change. There are two cases where we might discover we + * need a write lock despite the flags: ACKs moving a connection out + * of the syncache, and ACKs relating to a connection in TIMEWAIT. We + * will handle it later if we got the wrong one here. */ - INP_INFO_WLOCK(&V_tcbinfo); + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || + tcp_read_locking == 0) { + INP_INFO_WLOCK(&V_tcbinfo); + ti_locked = TI_WLOCKED; + tcp_wlock_atfirst++; + } else { + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; + tcp_rlock_atfirst++; + } + findpcb: - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + switch (ti_locked) { + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + break; + + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + break; + + default: + panic("tcp_input: findpcb ti_locked %d", ti_locked); + } + #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. @@ -556,14 +616,48 @@ } /* - * A previous connection in TIMEWAIT state is supposed to catch - * stray or duplicate segments arriving late. If this segment - * was a legitimate new connection attempt the old INPCB gets - * removed and we can try again to find a listening socket. + * A previous connection in TIMEWAIT state is supposed to catch stray + * or duplicate segments arriving late. 
If this segment was a + * legitimate new connection attempt the old INPCB gets removed and + * we can try again to find a listening socket. + * + * We may need to upgrade a global read lock to a global write lock + * for the segment. */ if (inp->inp_vflag & INP_TIMEWAIT) { + switch (ti_locked) { + case TI_RLOCKED: + if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) { + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); + ti_locked = TI_WLOCKED; + INP_WLOCK(inp); + if (in_pcbrele(inp)) { + tcp_wlock_looped++; + inp = NULL; + goto findpcb; + } + tcp_wlock_relocked++; + } else { + ti_locked = TI_WLOCKED; + tcp_wlock_upgraded++; + } + break; + + case TI_WLOCKED: + break; + + default: + panic("tcp_input: INP_TIMEWAIT ti_locked %d", + ti_locked); + } + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); + /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ @@ -572,6 +666,7 @@ INP_INFO_WUNLOCK(&V_tcbinfo); return; } + /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the @@ -583,6 +678,77 @@ goto dropwithreset; } + /* + * Now that we've found an inpcb, we get to find out whether or not + * we needed a global write lock. The rule we use once we have a + * connection is that a global read lock is sufficient if the packet + * doesn't have a SYN/FIN/RST flag set, and that it's in the + * established state. This is somewhat conservative, but by the time + * we know for sure, we can no longer restart processing. + * + * The inpcb lock is required in order to check the connection state, + * but since it follows the pcbinfo lock, we have to try the upgrade + * and, if that fails, handle failure. Currently we do this by + * bumping the refcount on the inpcb, dropping all locks, + * re-acquiring the pcbinfo lock with a write lock, and then dropping + * the inpcb refcount. 
If the refcount hits zero, the connection + * closed while we were re-acquiring the global lock, so we need to + * re-lookup the inpcb to see if we might deliver the packet to a new + * inpcb with the same connection tuple. + */ + if (tp->t_state != TCPS_ESTABLISHED || + (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || + tcp_read_locking == 0) { + switch (ti_locked) { + case TI_RLOCKED: + if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) { + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); + ti_locked = TI_WLOCKED; + INP_WLOCK(inp); + if (in_pcbrele(inp)) { + tcp_wlock_looped++; + inp = NULL; + goto findpcb; + } + tcp_wlock_relocked++; + } else { + ti_locked = TI_WLOCKED; + tcp_wlock_upgraded++; + } + break; + + case TI_WLOCKED: + break; + + default: + panic("tcp_input: upgrade check ti_locked %d", + ti_locked); + } + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + } else { + /* + * It's possible that because of looping, we may have + * conservatively acquired a write lock. If so, downgrade. + */ + switch (ti_locked) { + case TI_RLOCKED: + break; + + case TI_WLOCKED: + tcp_rlock_downgraded++; + rw_downgrade(&V_tcbinfo.ipi_lock); + ti_locked = TI_RLOCKED; + break; + + default: + panic("tcp_input: downgrade check ti_locked %d", + ti_locked); + } + } + #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) @@ -695,7 +861,7 @@ * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, - iptos); + iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } @@ -895,13 +1061,25 @@ * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. 
*/ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); + tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; dropwithreset: - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - INP_INFO_WUNLOCK(&V_tcbinfo); + switch (ti_locked) { + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + break; + + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + break; + + default: + panic("tcp_input: dropwithreset ti_locked %d", ti_locked); + } if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); @@ -912,10 +1090,22 @@ goto drop; dropunlock: - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + switch (ti_locked) { + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + break; + + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + break; + + default: + panic("tcp_input: dropunlock ti_locked %d", ti_locked); + } if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); drop: INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); @@ -927,11 +1117,11 @@ static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) + struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, + int ti_locked) { INIT_VNET_INET(tp->t_vnet); int thflags, acked, ourfinisacked, needoutput = 0; - int headlocked = 1; int rstreason, todrop, win; u_long tiwin; struct tcpopt to; @@ -947,7 +1137,41 @@ #endif thflags = th->th_flags; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + /* + * If this is either a state-changing packet or current state isn't + * established, we require a write lock on tcbinfo. Otherwise, we + * allow either a read lock or a write lock, as we may have acquired + * a write lock due to a race. 
+ * + * Require a global write lock for SYN/FIN/RST segments or + * non-established connections; otherwise accept either a read or + * write lock, as we may have conservatively acquired a write lock in + * certain cases in tcp_input() (is this still true?). Currently we + * will never enter with no lock, so we try to drop it quickly in the + * common pure ack/pure data cases. + */ + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || + tp->t_state != TCPS_ESTABLISHED) { + KASSERT(ti_locked == TI_WLOCKED, ("tcp_do_segment: ti_locked" + " %d for SYN/FIN/RST/!EST", ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + } else { +#ifdef INVARIANTS + switch (ti_locked) { + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + break; + + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + break; + + default: + panic("tcp_do_segment: ti_locked %d for EST", + ti_locked); + } +#endif + } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); @@ -1076,7 +1300,6 @@ LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { - /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. @@ -1101,14 +1324,27 @@ !IN_FASTRECOVERY(tp) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { - KASSERT(headlocked, - ("%s: headlocked", __func__)); - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; /* * This is a pure ack for outstanding data. */ + switch (ti_locked) { + case TI_WLOCKED: + INP_INFO_WUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + break; + + case TI_RLOCKED: + INP_INFO_RUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + break; + + default: + panic("tcp_do_segment: ti_locked %d " + "on pure ACK", ti_locked); + } + ++V_tcpstat.tcps_predack; + /* * "bad retransmit" recovery. 
*/ @@ -1195,14 +1431,27 @@ tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ - KASSERT(headlocked, ("%s: headlocked", __func__)); - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; /* - * This is a pure, in-sequence data packet - * with nothing on the reassembly queue and - * we have enough buffer space to take it. + * This is a pure, in-sequence data packet with + * nothing on the reassembly queue and we have enough + * buffer space to take it. */ + switch (ti_locked) { + case TI_WLOCKED: + INP_INFO_WUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + break; + + case TI_RLOCKED: + INP_INFO_RUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + break; + + default: + panic("tcp_do_segment: ti_locked %d on pure " + "data segment", ti_locked); + } + /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); @@ -1429,8 +1678,9 @@ tp->t_state = TCPS_SYN_RECEIVED; } - KASSERT(headlocked, ("%s: trimthenstep6: head not locked", - __func__)); + KASSERT(ti_locked == TI_WLOCKED, + ("trimthenstep6: ti_locked %d", ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -1558,17 +1808,23 @@ case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: + KASSERT(ti_locked == TI_WLOCKED, + ("tcp_do_segment: TH_RST 1 ti_locked %d", + ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + tp->t_state = TCPS_CLOSED; V_tcpstat.tcps_drops++; - KASSERT(headlocked, ("%s: trimthenstep6: " - "tcp_close: head not locked", __func__)); tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: - KASSERT(headlocked, ("%s: trimthenstep6: " - "tcp_close.2: head not locked", __func__)); + KASSERT(ti_locked == TI_WLOCKED, + ("tcp_do_segment: TH_RST 2 ti_locked %d", + ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + tp = tcp_close(tp); break; } @@ -1673,8 +1929,11 @@ tp->t_state > TCPS_CLOSE_WAIT && tlen) { char *s; - KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " - 
"not locked", __func__)); + KASSERT(ti_locked == TI_WLOCKED, ("tcp_do_segment: " + "SS_NOFDEREF && CLOSE_WAIT && tlen ti_locked %d", + ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " "was closed, sending RST and removing tcpcb\n", @@ -1746,8 +2005,10 @@ * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { - KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " - "head not locked", __func__)); + KASSERT(ti_locked == TI_WLOCKED, + ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; @@ -2034,8 +2295,9 @@ } process_ACK: - KASSERT(headlocked, ("%s: process_ACK: head not locked", - __func__)); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; @@ -2192,11 +2454,9 @@ */ case TCPS_CLOSING: if (ourfinisacked) { - KASSERT(headlocked, ("%s: process_ACK: " - "head not locked", __func__)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; m_freem(m); return; } @@ -2210,8 +2470,7 @@ */ case TCPS_LAST_ACK: if (ourfinisacked) { - KASSERT(headlocked, ("%s: process_ACK: " - "tcp_close: head not locked", __func__)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } @@ -2220,7 +2479,9 @@ } step6: - KASSERT(headlocked, ("%s: step6: head not locked", __func__)); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_do_segment: step6 ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2306,7 +2567,9 @@ tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ - KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); + 
INP_INFO_LOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_do_segment: dodata ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2425,15 +2688,32 @@ * standard timers. */ case TCPS_FIN_WAIT_2: - KASSERT(headlocked == 1, ("%s: dodata: " - "TCP_FIN_WAIT_2: head not locked", __func__)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_WLOCKED, ("tcp_do_segment: " + "dodata TCP_FIN_WAIT_2 ti_locked: %d", + ti_locked)); + tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); return; } } - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; + switch (ti_locked) { + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + break; + + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + break; + + default: + panic("tcp_do_segment: dodata epilogue ti_locked %d", + ti_locked); + } + ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, @@ -2447,10 +2727,11 @@ (void) tcp_output(tp); check_delack: - KASSERT(headlocked == 0, ("%s: check_delack: head locked", - __func__)); + KASSERT(ti_locked == TI_UNLOCKED, ("tcp_do_segment: check_delack " + "ti_locked %d", ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); + if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); @@ -2459,7 +2740,9 @@ return; dropafterack: - KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); + /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. 
@@ -2486,8 +2769,22 @@ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); - INP_INFO_WUNLOCK(&V_tcbinfo); + switch (ti_locked) { + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + break; + + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + break; + + default: + panic("tcp_do_segment: dropafterack epilogue ti_locked %d", + ti_locked); + } + ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); @@ -2495,8 +2792,20 @@ return; dropwithreset: - KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); - INP_INFO_WUNLOCK(&V_tcbinfo); + switch (ti_locked) { + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + break; + + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + break; + + default: + panic("tcp_do_segment: dropwithreset ti_locked %d", ti_locked); + } if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); @@ -2506,6 +2815,22 @@ return; drop: + switch (ti_locked) { + case TI_RLOCKED: + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + break; + + case TI_WLOCKED: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + break; + + case TI_UNLOCKED: + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + break; + } + /* * Drop space held by incoming segment and return. 
*/ @@ -2516,8 +2841,6 @@ #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); - if (headlocked) - INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); } --- //depot/vendor/freebsd/src/sys/netinet6/in6_pcb.h 2008/11/27 12:06:10 +++ //depot/user/rwatson/udp/src/sys/netinet6/in6_pcb.h 2008/11/29 22:14:02 @@ -87,6 +87,7 @@ void in6_pcbnotify __P((struct inpcbinfo *, struct sockaddr *, u_int, const struct sockaddr *, u_int, int, void *, struct inpcb *(*)(struct inpcb *, int))); +int in6_pcbrele __P((struct inpcb *)); struct inpcb * in6_rtchange __P((struct inpcb *, int)); struct sockaddr *