Index: tools/tools/netrate/tcpp/README =================================================================== --- tools/tools/netrate/tcpp/README (revision 207113) +++ tools/tools/netrate/tcpp/README (working copy) @@ -50,15 +50,8 @@ Typical use: - ./tcpp -c 192.168.100.201 -p 4 -t 100000 -m 10000 -b 100000 \ - -l 192.168.100.101 -M 4 + ./tcpp -c 192.168.100.201 -p 4 -t 100000 -m 10000 -b 100000 -M 4 -This creates four workers, each of which will (over its lifetime) set up and -use 100,000 TCP connections carrying 100K of data, up to 10,000 simultaneous -connection at any given moment. tcpp will use four source IP addresses, -starting with 192.168.100.101, and all connections will be to the single -destination IP of 192.168.100.201. - Having (p) <= the number of cores is advisable. When multiple IPs are used on the client, they will be sequential starting with the localIPbase set with -l. @@ -94,6 +87,3 @@ # Consider turning off TSO and/or adjusting the MTU for some scenarios: ifconfig cxgb0 -tso ifconfig cxgb0 mtu 1500 - - -$FreeBSD$ Index: sys/conf/files =================================================================== --- sys/conf/files (revision 207113) +++ sys/conf/files (working copy) @@ -2492,6 +2492,7 @@ netinet/ip_id.c optional inet netinet/in_mcast.c optional inet netinet/in_pcb.c optional inet +netinet/in_pcbgroup.c optional inet netinet/in_proto.c optional inet \ compile-with "${NORMAL_C} -I$S/contrib/pf" netinet/in_rmx.c optional inet Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c (revision 207113) +++ sys/netinet/tcp_input.c (working copy) @@ -176,10 +176,6 @@ &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); -int tcp_read_locking = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW, - &tcp_read_locking, 0, "Enable read locking strategy"); - VNET_DEFINE(struct inpcbhead, tcb); VNET_DEFINE(struct inpcbinfo, tcbinfo); #define tcb6 tcb 
/* for KAME src sync over BSD*'s */ @@ -320,8 +316,7 @@ char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 -#define TI_RLOCKED 2 -#define TI_WLOCKED 3 +#define TI_WLOCKED 2 #ifdef TCPDEBUG /* @@ -481,23 +476,19 @@ * despite the flags: ACKs moving a connection out of the syncache, * and ACKs for a connection in TIMEWAIT. */ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; - } else { - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - } + } else + ti_locked = TI_UNLOCKED; findpcb: #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: findpcb ti_locked %d\n", __func__, ti_locked); + } else { + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif #ifdef IPFIREWALL_FORWARD @@ -514,20 +505,16 @@ * Transparently forwarded. Pretend to be the destination. * already got one like this? */ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - 0, m->m_pkthdr.rcvif); + inp = in_pcblookup_group(&V_tcbinfo, ip->ip_src, + th->th_sport, ip->ip_dst, th->th_dport, + INPLOOKUP_LOCKPCB, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - next_hop->sin_addr, - next_hop->sin_port ? - ntohs(next_hop->sin_port) : - th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + inp = in_pcblookup_group(&V_tcbinfo, ip->ip_src, + th->th_sport, next_hop->sin_addr, + next_hop->sin_port ? ntohs(next_hop->sin_port) : + th->th_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_LOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. 
*/ m_tag_delete(m, fwd_tag); @@ -536,18 +523,16 @@ { if (isipv6) { #ifdef INET6 - inp = in6_pcblookup_hash(&V_tcbinfo, - &ip6->ip6_src, th->th_sport, - &ip6->ip6_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + inp = in6_pcblookup_group(&V_tcbinfo, &ip6->ip6_src, + th->th_sport, &ip6->ip6_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_LOCKPCB, + m->m_pkthdr.rcvif); #endif } else - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + inp = in_pcblookup_group(&V_tcbinfo, ip->ip_src, + th->th_sport, ip->ip_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_LOCKPCB, + m->m_pkthdr.rcvif); } /* @@ -577,7 +562,7 @@ rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } - INP_WLOCK(inp); + INP_WLOCK_ASSERT(inp); if (!(inp->inp_flags & INP_HW_FLOWID) && (m->m_flags & M_FLOWID) && ((inp->inp_socket == NULL) @@ -618,24 +603,20 @@ * legitimate new connection attempt the old INPCB gets removed and * we can try again to find a listening socket. * - * At this point, due to earlier optimism, we may hold a read lock on - * the inpcbinfo, rather than a write lock. If so, we need to - * upgrade, or if that fails, acquire a reference on the inpcb, drop - * all locks, acquire a global write lock, and then re-acquire the - * inpcb lock. We may at that point discover that another thread has - * tried to free the inpcb, in which case we need to loop back and - * try to find a new inpcb to deliver to. + * At this point, due to earlier optimism, we may hold only an inpcb + * lock, and not the inpcbinfo write lock. If so, we need to try to + * acquire it, or if that fails, acquire a reference on the inpcb, + * drop all locks, acquire a global write lock, and then re-acquire + * the inpcb lock. We may at that point discover that another thread + * has tried to free the inpcb, in which case we need to loop back + * and try to find a new inpcb to deliver to. 
*/ relocked: if (inp->inp_flags & INP_TIMEWAIT) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); @@ -671,22 +652,20 @@ /* * We've identified a valid inpcb, but it could be that we need an - * inpcbinfo write lock and have only a read lock. In this case, - * attempt to upgrade/relock using the same strategy as the TIMEWAIT - * case above. If we relock, we have to jump back to 'relocked' as - * the connection might now be in TIMEWAIT. + * inpcbinfo write lock but don't hold it. In this case, attempt to + * acquire using the same strategy as the TIMEWAIT case above. If we + * relock, we have to jump back to 'relocked' as the connection might + * now be in TIMEWAIT. */ - if (tp->t_state != TCPS_ESTABLISHED || - (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: upgrade check ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { +#ifdef INVARIANTS + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); +#endif + if (tp->t_state != TCPS_ESTABLISHED) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); @@ -723,13 +702,16 @@ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection - * attempt or the completion of a previous one. + * attempt or the completion of a previous one. 
Because listen + * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be + * held in this case. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); bzero(&inc, sizeof(inc)); #ifdef INET6 @@ -1022,12 +1004,15 @@ return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif ti_locked = TI_UNLOCKED; if (inp != NULL) { @@ -1039,12 +1024,15 @@ goto drop; dropunlock: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropunlock ti_locked %d", __func__, ti_locked); +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif ti_locked = TI_UNLOCKED; if (inp != NULL) @@ -1099,13 +1087,13 @@ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: ti_locked %d for EST", __func__, - ti_locked); + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); @@ -1262,13 +1250,8 @@ /* * This is a pure ack for outstanding data. 
*/ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure ACK", - __func__, ti_locked); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1364,13 +1347,8 @@ * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure data " - "segment", __func__, ti_locked); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -2218,9 +2196,6 @@ } process_ACK: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; @@ -2443,9 +2418,6 @@ } step6: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: step6 ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2531,9 +2503,6 @@ tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dodata ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2662,13 +2631,8 @@ return; } } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dodata epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG @@ -2697,9 +2661,6 @@ return; dropafterack: - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); - /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where 
the ACK reflects our state. @@ -2726,13 +2687,8 @@ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropafterack epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; @@ -2742,12 +2698,8 @@ return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); ti_locked = TI_UNLOCKED; if (tp != NULL) { @@ -2758,9 +2710,7 @@ return; drop: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); #ifdef INVARIANTS else Index: sys/netinet/in_pcbgroup.c =================================================================== --- sys/netinet/in_pcbgroup.c (revision 0) +++ sys/netinet/in_pcbgroup.c (revision 0) @@ -0,0 +1,366 @@ +/*- + * Copyright (c) 2010 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#include +#endif /* INET6 */ + + +void +in_pcbgroup_init(struct inpcbinfo *pcbinfo, int hash_nelements, + u_int numpcbgroups, const char *grouplockname) +{ + struct inpcbgroup *pcbgroup; + u_int pgn; + + if (numpcbgroups == 0) + return; + + pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * + sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); + pcbinfo->ipi_npcbgroups = numpcbgroups; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, + &pcbgroup->ipg_hashmask); + INP_GROUP_LOCK_INIT(pcbgroup, grouplockname); + } +} + +void +in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) +{ + struct inpcbgroup *pcbgroup; + u_int pgn; + + if (pcbinfo->ipi_npcbgroups == 0) + return; + + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 
+ pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead), + ("in_pcbgroup_destroy: listhead not empty")); + INP_GROUP_LOCK_DESTROY(pcbgroup); + hashdestroy(pcbgroup->ipg_hashbase, M_PCB, + pcbgroup->ipg_hashmask); + } + free(pcbinfo->ipi_pcbgroups, M_PCB); + pcbinfo->ipi_pcbgroups = NULL; + pcbinfo->ipi_npcbgroups = 0; +} + +/* + * Select a pcbgroup given a pcbinfo and an IPv4 4-tuple. Once we can share + * RSS key and configuration information vertically here, we should use + * toeplitz here. + */ +static struct inpcbgroup * +in_pcbgroup_select4(struct inpcbinfo *pcbinfo, in_addr_t laddr, + u_short lport, in_addr_t faddr, u_short fport) +{ + u_int pgn; + + /* XXXRW: not what we eventually want to do. */ + pgn = (faddr ^ fport) % pcbinfo->ipi_npcbgroups; + return (&pcbinfo->ipi_pcbgroups[pgn]); +} + +static struct inpcbgroup * +in_pcbgroup_select2(struct inpcbinfo *pcbinfo, in_addr_t laddr, + in_addr_t faddr) +{ + u_int pgn; + + /* XXXRW: not what we eventually want to do. */ + pgn = (faddr ^ laddr) % pcbinfo->ipi_npcbgroups; + return (&pcbinfo->ipi_pcbgroups[pgn]); +} + +struct inpcbgroup * +in_pcbgroup_byinpcb(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + + /* + * XXXRW: pcbinfos should declare what their preferred policy is. + */ + if (pcbinfo == &V_tcbinfo) + return (in_pcbgroup_select4(pcbinfo, inp->inp_laddr.s_addr, + inp->inp_lport, inp->inp_faddr.s_addr, inp->inp_fport)); + if (pcbinfo == &V_udbinfo) + return (in_pcbgroup_select2(pcbinfo, inp->inp_laddr.s_addr, + inp->inp_faddr.s_addr)); + return (&pcbinfo->ipi_pcbgroups[0]); +} + +struct inpcbgroup * +in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, in_addr_t laddr, + u_short lport, in_addr_t faddr, u_short fport) +{ + + /* + * XXXRW: pcbinfos should declare what their preferred policy is.
+ */ + if (pcbinfo == &V_tcbinfo) + return (in_pcbgroup_select4(pcbinfo, laddr, lport, faddr, + fport)); + if (pcbinfo == &V_udbinfo) + return (in_pcbgroup_select2(pcbinfo, laddr, faddr)); + return (&pcbinfo->ipi_pcbgroups[0]); +} + +#ifdef INET6 +static struct inpcbgroup * +in6_pcbgroup_select4(struct inpcbinfo *pcbinfo, const struct in6_addr *laddr, + u_short lport, const struct in6_addr *faddr, u_short fport) +{ + u_int pgn, faddr_hashkey; + + /* XXXRW: not what we eventually want to do. */ + faddr_hashkey = faddr->s6_addr32[3]; + pgn = ((faddr_hashkey >> 24) ^ fport) % pcbinfo->ipi_npcbgroups; + return (&pcbinfo->ipi_pcbgroups[pgn]); +} + +static struct inpcbgroup * +in6_pcbgroup_select2(struct inpcbinfo *pcbinfo, const struct in6_addr *laddr, + const struct in6_addr *faddr) +{ + u_int pgn, faddr_hashkey, laddr_hashkey; + + /* XXXRW: not what we eventually want to do. */ + faddr_hashkey = faddr->s6_addr32[3]; + laddr_hashkey = laddr->s6_addr32[3]; + pgn = (faddr_hashkey ^ laddr_hashkey) % pcbinfo->ipi_npcbgroups; + return (&pcbinfo->ipi_pcbgroups[pgn]); +} + +struct inpcbgroup * +in6_pcbgroup_byinpcb(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + + /* + * XXXRW: pcbinfos should declare what their preferred policy is. + */ + if (pcbinfo == &V_tcbinfo) + return (in6_pcbgroup_select4(pcbinfo, &inp->in6p_laddr, + inp->inp_lport, &inp->in6p_faddr, inp->inp_fport)); + if (pcbinfo == &V_udbinfo) + return (in6_pcbgroup_select2(pcbinfo, &inp->in6p_laddr, + &inp->in6p_faddr)); + return (&pcbinfo->ipi_pcbgroups[0]); +} + +struct inpcbgroup * +in6_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, const struct in6_addr *laddr, + u_short lport, const struct in6_addr *faddr, u_short fport) +{ + + /* + * XXXRW: pcbinfos should declare what their preferred policy is.
+ */ + if (pcbinfo == &V_tcbinfo) + return (in6_pcbgroup_select4(pcbinfo, laddr, lport, faddr, + fport)); + if (pcbinfo == &V_udbinfo) + return (in6_pcbgroup_select2(pcbinfo, laddr, faddr)); + return (&pcbinfo->ipi_pcbgroups[0]); +} +#endif + +static void +in_pcbwild_remove(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbgroup *pcbgroup; + struct inpcbwild *pcbwild; + u_int pgn; + + INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_WLOCK_ASSERT(inp); + + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + pcbwild = &inp->inp_pcbwild[pgn]; + INP_GROUP_LOCK(pcbgroup); + LIST_REMOVE(pcbwild, ipw_entry); + INP_GROUP_UNLOCK(pcbgroup); + } + free(inp->inp_pcbwild, M_PCB); + inp->inp_pcbwild = NULL; +} + +/* + * Install or update pcbgroup entry and optional pcbgroup wildcard entry for + * this inpcb. + */ +void +in_pcbgroup_update(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbgroup *newpcbgroup, *oldpcbgroup; + struct inpcbhead *pcbhash; + uint32_t hashkey_faddr; + int wildcard_needed; + u_int pgn; + + INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_WLOCK_ASSERT(inp); + + oldpcbgroup = inp->inp_pcbgroup; + newpcbgroup = in_pcbgroup_byinpcb(inp); + if (oldpcbgroup != NULL) { + INP_GROUP_LOCK(oldpcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(oldpcbgroup); + } + if (newpcbgroup != NULL && !(inp->inp_vflag & INP_DROPPED)) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */ + else +#endif + hashkey_faddr = inp->inp_faddr.s_addr; + INP_GROUP_LOCK(newpcbgroup); + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, + newpcbgroup->ipg_hashmask)]; + LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); + inp->inp_pcbgroup = newpcbgroup; + INP_GROUP_UNLOCK(newpcbgroup); + } + + /* + * If this is a wildcard
inpcb, allocate wildcard storage and hook + * up to every pcbgroup. We try to avoid hitting each pcbgroup lock + * twice when a connection moves (considered a rare event anyway). + * + * XXXRW: How to handle allocation failure? + */ +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + wildcard_needed = IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); + else +#endif + wildcard_needed = (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); + + /* + * If it's a wildcard socket and didn't already have wildcard state + * allocated and hooked up to all of the pcbgroups, add it now. If + * it's no longer required, remove it. No action is required if it + * was already present. + * + * XXXRW: How to handle allocation failure here? + */ + if (wildcard_needed && inp->inp_pcbwild == NULL) { + inp->inp_pcbwild = malloc(sizeof(*inp->inp_pcbwild) * + pcbinfo->ipi_npcbgroups, M_PCB, M_ZERO | M_NOWAIT); + if (inp->inp_pcbwild == NULL) + return; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + struct inpcbgroup *pcbgroup; + struct inpcbwild *pcbwild; + + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + pcbwild = &inp->inp_pcbwild[pgn]; + pcbwild->ipw_inpcb = inp; + INP_GROUP_LOCK(pcbgroup); + LIST_INSERT_HEAD(&pcbgroup->ipg_wildlist, pcbwild, + ipw_entry); + INP_GROUP_UNLOCK(pcbgroup); + } + } else if (!wildcard_needed && inp->inp_pcbwild != NULL) + in_pcbwild_remove(inp); +} + +/* + * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. 
+ */ +void +in_pcbgroup_remove(struct inpcb *inp) +{ +#ifdef INVARIANTS + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; +#endif + struct inpcbgroup *pcbgroup; + + INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_WLOCK_ASSERT(inp); + + pcbgroup = inp->inp_pcbgroup; + if (pcbgroup == NULL) + return; + + INP_GROUP_LOCK(pcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(pcbgroup); + + if (inp->inp_pcbwild != NULL) + in_pcbwild_remove(inp); +} Property changes on: sys/netinet/in_pcbgroup.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: sys/netinet/raw_ip.c =================================================================== --- sys/netinet/raw_ip.c (revision 207113) +++ sys/netinet/raw_ip.c (working copy) @@ -186,7 +186,7 @@ { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, - 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE); + 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, 0, NULL); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c (revision 207113) +++ sys/netinet/tcp_subr.c (working copy) @@ -282,6 +282,13 @@ #endif /* + * Number of connection groups. + * + * XXXRW: This should be tunable and other good/useful stuff. + */ +#define TCP_NUMPCBGROUPS 16 + +/* * XXX * Callouts should be moved into struct tcp directly. 
They are currently * separate because the tcpcb structure is exported to userland for sysctl @@ -383,7 +390,8 @@ hashsize = 512; /* safe default */ } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, + TCP_NUMPCBGROUPS, "tcppcbgroup"); /* * These have to be type stable for the benefit of the timers. @@ -1215,7 +1223,8 @@ return (error); INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, - addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, + NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); @@ -1271,8 +1280,7 @@ *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], - addrs[0].sin6_port, - 0, NULL); + addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, @@ -1287,10 +1295,9 @@ if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { + } else INP_INFO_RUNLOCK(&V_tcbinfo); error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); Index: sys/netinet/ip_divert.c =================================================================== --- sys/netinet/ip_divert.c (revision 207113) +++ sys/netinet/ip_divert.c (working copy) @@ -153,7 +153,7 @@ * place for hashbase == NULL. 
*/ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", - div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE); + div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, 0, NULL); } static void Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c (revision 207113) +++ sys/netinet/in_pcb.c (working copy) @@ -2,8 +2,12 @@ * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson + * Copyright (c) 2010 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -48,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +77,7 @@ #include #ifdef INET6 #include +#include #include #endif /* INET6 */ @@ -191,19 +197,23 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, - uint32_t inpcbzone_flags) + uint32_t inpcbzone_flags, u_int numpcbgroups, const char *grouplockname) { INP_INFO_LOCK_INIT(pcbinfo, name); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif + pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); + pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); + in_pcbgroup_init(pcbinfo, hash_nelements, numpcbgroups, + grouplockname); pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); @@ -217,9 +227,13 @@ in_pcbinfo_destroy(struct inpcbinfo 
*pcbinfo) { + KASSERT(pcbinfo->ipi_count == 0, + ("in_pcbinfo_destroy: ipi_count = %u", pcbinfo->ipi_count)); + hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); + in_pcbgroup_destroy(pcbinfo); uma_zdestroy(pcbinfo->ipi_zone); INP_INFO_LOCK_DESTROY(pcbinfo); } @@ -275,7 +289,7 @@ #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - inp->inp_refcount = 1; /* Reference from the inpcbinfo */ + refcount_init(&inp->inp_refcount, 1); /* Reference from pcbinfo. */ #if defined(IPSEC) || defined(MAC) out: if (error != 0) { @@ -331,7 +345,7 @@ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; - int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; int dorandom; @@ -348,7 +362,7 @@ if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) - wild = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); @@ -429,7 +443,7 @@ return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, - lport, wild, cred); + lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait @@ -523,7 +537,7 @@ *lastport = first; lport = htons(*lastport); } while (in_pcblookup_local(pcbinfo, laddr, - lport, wild, cred)); + lport, lookupflags, cred)); } *laddrp = laddr.s_addr; *lportp = lport; @@ -935,6 +949,7 @@ inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); + in_pcbgroup_remove(inp); } /* @@ -954,54 +969,18 @@ } /* - * in_pcbfree_internal() frees an inpcb that has been detached from its - * socket, and whose reference count has reached 0. It will also remove the - * inpcb from any global lists it might remain on. 
- */ -static void -in_pcbfree_internal(struct inpcb *inp) -{ - struct inpcbinfo *ipi = inp->inp_pcbinfo; - - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__)); - - INP_INFO_WLOCK_ASSERT(ipi); - INP_WLOCK_ASSERT(inp); - -#ifdef IPSEC - if (inp->inp_sp != NULL) - ipsec_delete_pcbpolicy(inp); -#endif /* IPSEC */ - inp->inp_gencnt = ++ipi->ipi_gencnt; - in_pcbremlists(inp); -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6PROTO) { - ip6_freepcbopts(inp->in6p_outputopts); - if (inp->in6p_moptions != NULL) - ip6_freemoptions(inp->in6p_moptions); - } -#endif - if (inp->inp_options) - (void)m_free(inp->inp_options); - if (inp->inp_moptions != NULL) - inp_freemoptions(inp->inp_moptions); - inp->inp_vflag = 0; - crfree(inp->inp_cred); - -#ifdef MAC - mac_inpcb_destroy(inp); -#endif - INP_WUNLOCK(inp); - uma_zfree(ipi->ipi_zone, inp); -} - -/* - * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock is already held. + * but where the inpcb lock may already be held, or when acquiring a reference + * via a pcbgroup. * + * in_pcbref() should be used only to provide brief memory stability, and + * must always be followed by a call to INP_WLOCK() and in_pcbrele() to + * garbage collect the inpcb if it has been freed from another context. + * Until in_pcbrele() has returned that the inpcb is still valid, lock and + * rele are the *only* safe operations that may be performed on the inpcb. + * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock.
Drop the reference @@ -1011,11 +990,9 @@ in_pcbref(struct inpcb *inp) { - INP_WLOCK_ASSERT(inp); - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); - inp->inp_refcount++; + refcount_acquire(&inp->inp_refcount); } /* @@ -1023,23 +1000,29 @@ * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. + * + * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a + * reference on an inpcb. Historically more work was done here (actually, in + * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the + * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely + * about memory stability (and continued use of the write lock). */ int in_pcbrele(struct inpcb *inp) { -#ifdef INVARIANTS - struct inpcbinfo *ipi = inp->inp_pcbinfo; -#endif + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); - INP_INFO_WLOCK_ASSERT(ipi); INP_WLOCK_ASSERT(inp); - inp->inp_refcount--; - if (inp->inp_refcount > 0) + if (refcount_release(&inp->inp_refcount) == 0) return (0); - in_pcbfree_internal(inp); + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + + INP_WUNLOCK(inp); + uma_zfree(pcbinfo->ipi_zone, inp); return (1); } @@ -1048,21 +1031,45 @@ * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is - * released using in_pcbrele(), but the inpcb is still unlocked. + * released using in_pcbrele(), but the inpcb is still unlocked. Almost all + * work, including removal from global lists, is done in this context, where + * the pcbinfo lock is held.
*/ void in_pcbfree(struct inpcb *inp) { -#ifdef INVARIANTS - struct inpcbinfo *ipi = inp->inp_pcbinfo; -#endif + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - INP_INFO_WLOCK_ASSERT(ipi); + INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); + /* XXXRW: Do as much as possible here. */ +#ifdef IPSEC + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif /* IPSEC */ + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + in_pcbgroup_remove(inp); + in_pcbremlists(inp); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + if (inp->in6p_moptions != NULL) + ip6_freemoptions(inp->in6p_moptions); + } +#endif + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (inp->inp_moptions != NULL) + inp_freemoptions(inp->inp_moptions); + inp->inp_vflag = 0; + crfree(inp->inp_cred); +#ifdef MAC + mac_inpcb_destroy(inp); +#endif if (!in_pcbrele(inp)) INP_WUNLOCK(inp); } @@ -1236,7 +1243,7 @@ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, - u_short lport, int wild_okay, struct ucred *cred) + u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 @@ -1246,9 +1253,12 @@ #endif int wildcard; + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + INP_INFO_LOCK_ASSERT(pcbinfo); - if (!wild_okay) { + if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that @@ -1350,17 +1360,161 @@ #undef INP_LOOKUP_MAPPED_PCB_COST /* - * Lookup PCB in hash list. + * Lookup PCB in hash list, using pcbgroup tables. 
*/ struct inpcb * +in_pcblookup_group(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, + struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcbgroup *pcbgroup; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_LOCKPCB)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & INPLOOKUP_LOCKPCB) != 0, + ("%s: INPLOOKUP_LOCKPCB not set", __func__)); + + /* + * First look for an exact match. + */ + tmpinp = NULL; + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr.s_addr, lport, + faddr.s_addr, fport); + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP4)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + + /* + * Then look for a wildcard match, if requested. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbwild *pcbwild; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. 
+ */ + + LIST_FOREACH(pcbwild, &pcbgroup->ipg_wildlist, ipw_entry) { + inp = pcbwild->ipw_inpcb; +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif /* INET6 */ + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif /* defined(INET6) */ + if (inp != NULL) + goto found; + } /* if (lookupflags & INPLOOKUP_WILDCARD) */ + INP_GROUP_UNLOCK(pcbgroup); + return (NULL); + +found: + in_pcbref(inp); + INP_GROUP_UNLOCK(pcbgroup); + INP_WLOCK(inp); + if (in_pcbrele(inp)) + return (NULL); + return (inp); +} + +/* + * Lookup PCB in hash list, using pcbinfo tables. 
+ */ +struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, - u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, + u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + INP_INFO_LOCK_ASSERT(pcbinfo); /* @@ -1396,7 +1550,7 @@ /* * Then look for a wildcard match, if requested. */ - if (wildcard == INPLOOKUP_WILDCARD) { + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; @@ -1457,18 +1611,18 @@ local_wild = inp; } } /* LIST_FOREACH */ - if (jail_wild != NULL) - return (jail_wild); - if (local_exact != NULL) - return (local_exact); - if (local_wild != NULL) - return (local_wild); + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; #ifdef INET6 - if (local_wild_mapped != NULL) - return (local_wild_mapped); + if (inp == NULL) + inp = local_wild_mapped; #endif /* defined(INET6) */ - } /* if (wildcard == INPLOOKUP_WILDCARD) */ - + if (inp != NULL) + return (inp); + } /* if (lookupflags & INPLOOKUP_WILDCARD) */ return (NULL); } Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h (revision 207113) +++ sys/netinet/in_pcb.h (working copy) @@ -34,6 +34,7 @@ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ +#include #include #include #include @@ -137,6 +138,7 @@ * * Key: * (c) - Constant after initialization + * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (s) - Protected by another subsystem's locks @@ -156,9 +158,12 @@ */ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list 
*/ + LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i/p) hash list */ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ + struct inpcbgroup *inp_pcbgroup; /* (g/i/p) PCB group list */ + struct inpcbwild *inp_pcbwild; /* (g/i/p) PCB group wildcard list */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -272,6 +277,24 @@ u_int ipi_count; /* + * Fields associated with port lookup and allocation. + */ + u_short ipi_lastport; + u_short ipi_lastlow; + u_short ipi_lasthi; + + /* + * UMA zone from which inpcbs are allocated for this protocol. + */ + struct uma_zone *ipi_zone; + + /* + * Connection groups associated with this protocol. + */ + struct inpcbgroup *ipi_pcbgroups; + u_int ipi_npcbgroups; + + /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ @@ -285,18 +308,6 @@ u_long ipi_porthashmask; /* - * Fields associated with port lookup and allocation. - */ - u_short ipi_lastport; - u_short ipi_lastlow; - u_short ipi_lasthi; - - /* - * UMA zone from which inpcbs are allocated for this protocol. - */ - struct uma_zone *ipi_zone; - - /* * Generation count--incremented each time a connection is allocated * or freed. */ @@ -314,6 +325,41 @@ void *ipi_pspare[2]; }; +/* + * Data structure to represent membership of an inpcb in the wildcard lists + * across all pcbgroups. These are allocated in arrays and hung off of the + * inpcb when required. + */ +struct inpcbwild { + struct inpcb *ipw_inpcb; + LIST_ENTRY(inpcbwild) ipw_entry; +}; + +/* + * Connection groups hold sets of connections that have similar CPU/thread + * affinity. Each connection belongs to exactly one connection group. 
+ */ +struct inpcbgroup { + /* + * Per-connection group hash of inpcbs, hashed by local and foreign + * addresses and port numbers. + */ + struct inpcbhead *ipg_hashbase; + u_long ipg_hashmask; + + /* + * List of wildcards in the pcbgroup. + * + * XXXRW: Should be a hash table. + */ + LIST_HEAD(, inpcbwild) ipg_wildlist; + + /* + * Per-connection group lock, not to be confused with ipi_lock. + */ + struct mtx ipg_lock; +} __aligned(CACHE_LINE_SIZE); + #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -385,6 +431,14 @@ #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) +#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ + MTX_DEF) +#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) + +#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) +#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) +#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) + #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ @@ -445,7 +499,13 @@ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ -#define INPLOOKUP_WILDCARD 1 +/* + * Flags passed to in_pcblookup_local(), in_pcblookup_group(), and + * in_pcblookup_hash(). 
+ */ +#define INPLOOKUP_WILDCARD 0x00000001 +#define INPLOOKUP_LOCKPCB 0x00000002 + #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ @@ -486,8 +546,19 @@ void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, - int, int, char *, uma_init, uma_fini, uint32_t); + int, int, char *, uma_init, uma_fini, uint32_t, u_int, + const char *); +struct inpcbgroup * + in_pcbgroup_byinpcb(struct inpcb *inp); +struct inpcbgroup * + in_pcbgroup_bytuple(struct inpcbinfo *, in_addr_t, u_short, + in_addr_t, u_short); +void in_pcbgroup_destroy(struct inpcbinfo *); +void in_pcbgroup_init(struct inpcbinfo *, int, u_int, const char *); +void in_pcbgroup_remove(struct inpcb *inp); +void in_pcbgroup_update(struct inpcb *inp); + void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); @@ -506,6 +577,9 @@ in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * + in_pcblookup_group(struct inpcbinfo *, struct in_addr, u_int, + struct in_addr, u_int, int, struct ifnet *); +struct inpcb * in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c (revision 207113) +++ sys/netinet/tcp_syncache.c (working copy) @@ -761,6 +761,7 @@ goto abort; } } + in_pcbgroup_update(inp); tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; Index: sys/netinet/ipfw/ip_fw2.c =================================================================== --- sys/netinet/ipfw/ip_fw2.c (revision 207113) +++ sys/netinet/ipfw/ip_fw2.c (working copy) @@ -656,7 +656,7 @@ 
(struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct inpcbinfo *pi; - int wildcard; + int lookupflags; struct inpcb *pcb; int match; @@ -681,30 +681,31 @@ if (*ugid_lookupp == -1) return (0); if (proto == IPPROTO_TCP) { - wildcard = 0; + lookupflags = 0; pi = &V_tcbinfo; } else if (proto == IPPROTO_UDP) { - wildcard = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else return 0; + lookupflags |= INPLOOKUP_LOCKPCB; match = 0; if (*ugid_lookupp == 0) { - INP_INFO_RLOCK(pi); pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), - wildcard, oif) : + lookupflags, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), - wildcard, NULL); + lookupflags, NULL); if (pcb != NULL) { + INP_WLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; + INP_WUNLOCK(pcb); } - INP_INFO_RUNLOCK(pi); if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c (revision 207113) +++ sys/netinet/tcp_usrreq.c (working copy) @@ -355,6 +355,7 @@ if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); + in_pcbgroup_update(inp); tcp_offload_listen_open(tp); } SOCK_UNLOCK(so); @@ -395,6 +396,7 @@ } if (error == 0) { tp->t_state = TCPS_LISTEN; + in_pcbgroup_update(inp); solisten_proto(so, backlog); } SOCK_UNLOCK(so); @@ -920,6 +922,7 @@ SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } + in_pcbgroup_update(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); } @@ -959,6 +962,7 @@ SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } + in_pcbgroup_update(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); } @@ -1090,6 +1094,7 @@ return EADDRINUSE; inp->inp_laddr = laddr; in_pcbrehash(inp); + in_pcbgroup_update(inp); /* * Compute window scaling to request: @@ -1157,6 +1162,7 @@ inp->inp_flow |= 
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); + in_pcbgroup_update(inp); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c (revision 207113) +++ sys/netinet/udp_usrreq.c (working copy) @@ -141,6 +141,8 @@ #define UDBHASHSIZE 128 #endif +#define UDP_NUMPCBGROUPS 16 + SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &VNET_NAME(udpstat), udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); @@ -181,7 +183,8 @@ V_udp_blackhole = 0; in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, + UDP_NUMPCBGROUPS, "udppcbgroup"); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udpcb_zone, maxsockets); @@ -257,7 +260,7 @@ #endif #endif - INP_RLOCK_ASSERT(inp); + INP_LOCK_ASSERT(inp); #ifdef IPSEC /* Check AH/ESP integrity. */ @@ -451,12 +454,12 @@ } #endif - INP_INFO_RLOCK(&V_udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; + INP_INFO_RLOCK(&V_udbinfo); last = NULL; LIST_FOREACH(inp, &V_udb, inp_list) { if (inp->inp_lport != uh->uh_dport) @@ -570,8 +573,9 @@ /* * Locate pcb for datagram. 
*/ - inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, - ip->ip_dst, uh->uh_dport, 1, ifp); + inp = in_pcblookup_group(&V_udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_LOCKPCB, + ifp); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; @@ -585,27 +589,30 @@ UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); - goto badheadlocked; + goto badunlocked; } if (V_udp_blackhole) - goto badheadlocked; + goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) - goto badheadlocked; + goto badunlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Check the minimum TTL for socket. + * + * XXXRW: Would it make sense to allow a read inpcb lock to be + * returned here? Probably not due to in_pcbrele() inside of hashed + * lookup. */ - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_WLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { - INP_RUNLOCK(inp); - goto badunlocked; + INP_WUNLOCK(inp); + m_freem(m); + return; } up = intoudpcb(inp); if (up->u_tun_func == NULL) { @@ -617,7 +624,7 @@ (*up->u_tun_func)(m, iphlen, inp); } - INP_RUNLOCK(inp); + INP_WUNLOCK(inp); return; badheadlocked: @@ -680,17 +687,15 @@ return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, - ip->ip_src, uh->uh_sport, 0, NULL); + inp = in_pcblookup_group(&V_udbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, INPLOOKUP_LOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } - INP_RUNLOCK(inp); + INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_udbinfo); } else in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], udp_notify); @@ -821,8 +826,9 @@ if 
(error) return (error); INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, - addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, + addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, + INPLOOKUP_WILDCARD, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); @@ -1398,6 +1404,7 @@ if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + in_pcbgroup_update(inp); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -1471,6 +1478,7 @@ INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); + in_pcbgroup_update(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (error); @@ -1488,6 +1496,7 @@ if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + in_pcbgroup_remove(inp); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -1520,6 +1529,7 @@ error = in_pcbconnect(inp, nam, td->td_ucred); if (error == 0) soisconnected(so); + in_pcbgroup_update(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (error); @@ -1563,6 +1573,7 @@ in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + in_pcbgroup_update(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); Index: sys/net/netisr.c =================================================================== --- sys/net/netisr.c (revision 207113) +++ sys/net/netisr.c (working copy) @@ -1,6 +1,6 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson - * Copyright (c) 2010 Juniper Networks, Inc. + * Copyright (c) 2009-2010 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. 
Watson under contract @@ -94,6 +94,31 @@ #include #include +/* + * Locking strategy: three types of locks protect netisr processing: + * + * netisr configuration lock - serializes "rethreading" events, in which the + * number of worker threads is changed. + * + * netisr_rmlock - stabilizes the netisr system for network processing, + * almost always acquired as a read lock (except during configuration + * changes). + * + * nws_mtx - per-workstream lock that serializes access to queues. + */ + +/* + * netisr configuration lock: serialize rethread events, in which the thread + * count may be increased and decreased, to avoid interlacing of these + * events, which might expose incompletely started or stopped threads, etc. + * This is a sleep lock so that it can be held over ithread start/stop. + */ +static struct sx netisr_config_sx; +#define NETISR_CONFIG_LOCK_INIT() sx_init(&netisr_config_sx, \ + "netisr_config_sx") +#define NETISR_CONFIG_LOCK() sx_xlock(&netisr_config_sx) +#define NETISR_CONFIG_UNLOCK() sx_xunlock(&netisr_config_sx) + /*- * Synchronize use and modification of the registered netisr data structures; * acquire a read lock while modifying the set of registered protocols to @@ -114,22 +139,32 @@ * * XXXRW: rmlocks don't support assertions. 
*/ +#define NETISR_RMLOCKING + +#ifdef NETISR_RMLOCKING static struct rmlock netisr_rmlock; #define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ - RM_NOWITNESS) + RM_NOWITNESS | RM_RECURSE) #define NETISR_LOCK_ASSERT() #define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) #define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) #define NETISR_WLOCK() rm_wlock(&netisr_rmlock) #define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) -/* #define NETISR_LOCKING */ +#else +#define NETISR_LOCK_INIT() +#define NETISR_LOCK_ASSERT() +#define NETISR_RLOCK(x) +#define NETISR_RUNLOCK(x) +#define NETISR_WLOCK() +#define NETISR_WUNLOCK() +#endif SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); /*- - * Three direct dispatch policies are supported: + * Three global direct dispatch policies are supported: * - * - Always defer: all work is scheduled for a netisr, regardless of context. + * - Always defer: all work is deferred for a netisr, regardless of context. * (!direct) * * - Hybrid: if the executing context allows direct dispatch, and we're @@ -155,10 +190,11 @@ &netisr_direct, 0, "Enable direct dispatch"); /* - * Allow the administrator to limit the number of threads (CPUs) to use for - * netisr. We don't check netisr_maxthreads before creating the thread for - * CPU 0, so in practice we ignore values <= 1. This must be set at boot. - * We will create at most one thread per CPU. + * Maximum number of threads to be used by netisr; will be capped to the + * number of CPUs, or set to the number of CPUs if the default of '-1' is + * present. In the future, if we allow the stack/schedule to adjust the + * number of threads dynamically then this will allow the administrator to + * tune that behavior. */ static int netisr_maxthreads = -1; /* Max number of threads.
*/ TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads); @@ -166,15 +202,30 @@ &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); +/* + * The default number of threads to use at boot, which will be capped to + * maxthreads. We default to 1 for now. + */ +#define NETISR_DEFAULT_DEFAULTTHREADS 1 +static int netisr_defaultthreads = NETISR_DEFAULT_DEFAULTTHREADS; +TUNABLE_INT("net.isr.defaultthreads", &netisr_defaultthreads); +SYSCTL_INT(_net_isr, OID_AUTO, defaultthreads, CTLFLAG_RD, + &netisr_defaultthreads, 0, + "Use this many CPUs for netisr processing by default"); + +/* + * Bind workstream threads to CPUs. For now, we allow things to float + * around by default, but as our notions of CPU affinity get stronger, we may + * want to change this policy. + */ static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads); SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); /* - * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit, - * both for initial configuration and later modification using - * netisr_setqlimit(). + * Limit per-workstream mbuf queue limits to at most net.isr.maxqlimit, both + * for initial configuration and later modification using netisr_setqlimit(). */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; @@ -225,11 +276,10 @@ /* * Number of registered workstreams. Will be at most the number of running - * CPUs once fully started. + * CPUs once fully started. To modify this, must hold both of + * netisr_config_sx and netisr_rmlock for write.
*/ static u_int nws_count; -SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, - &nws_count, 0, "Number of extant netisr threads."); /* * Synchronization for each workstream: a mutex protects all mutable fields @@ -389,7 +439,9 @@ netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) { struct netisr_work *npwp; +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; +#endif #ifdef INVARIANTS const char *name; #endif @@ -423,7 +475,9 @@ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) { +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; +#endif #ifdef INVARIANTS const char *name; #endif @@ -680,7 +734,7 @@ static void swi_net(void *arg) { -#ifdef NETISR_LOCKING +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; @@ -693,9 +747,7 @@ ("%s: device_polling but nws_count != 1", __func__)); netisr_poll(); #endif -#ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); -#endif NWS_LOCK(nwsp); KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); if (nwsp->nws_flags & NWS_DISPATCHING) @@ -712,9 +764,7 @@ nwsp->nws_flags &= ~NWS_RUNNING; out: NWS_UNLOCK(nwsp); -#ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); -#endif #ifdef DEVICE_POLLING netisr_pollmore(); #endif @@ -767,9 +817,7 @@ struct netisr_work *npwp; int dosignal, error; -#ifdef NETISR_LOCKING NETISR_LOCK_ASSERT(); -#endif KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, cpuid, mp_maxid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); @@ -789,7 +837,7 @@ int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) { -#ifdef NETISR_LOCKING +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; #endif u_int cpuid; @@ -798,9 +846,7 @@ KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); -#ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); -#endif KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); @@ -811,9 +857,7 @@ error = 
netisr_queue_internal(proto, m, cpuid); } else error = ENOBUFS; -#ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); -#endif return (error); } @@ -831,7 +875,7 @@ int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) { -#ifdef NETISR_LOCKING +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; @@ -847,9 +891,7 @@ KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); -#ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); -#endif KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); @@ -942,9 +984,7 @@ out_unpin: sched_unpin(); out_unlock: -#ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); -#endif return (error); } @@ -971,63 +1011,126 @@ } #endif +/* + * Given a status quo, adjust the number of threads to match the requested + * configuration. Any policy blending thread resource limits, CPU count, + * etc, must be imposed by the caller and is assumed already done here. + */ static void -netisr_start_swi(u_int cpuid, struct pcpu *pc) +netisr_adjust_threads(int nws_desired) { + struct netisr_workstream *nwsp; + u_int cpuid, nwsid, nws_oldcount; char swiname[12]; - struct netisr_workstream *nwsp; int error; - KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); + NETISR_CONFIG_LOCK(); + nws_oldcount = nws_count; - nwsp = DPCPU_ID_PTR(cpuid, nws); - mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); - nwsp->nws_cpu = cpuid; - snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); - error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, - SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); - if (error) - panic("%s: swi_add %d", __func__, error); - pc->pc_netisr = nwsp->nws_intr_event; - if (netisr_bindthreads) { - error = intr_event_bind(nwsp->nws_intr_event, cpuid); - if (error != 0) - printf("%s: cpu %u: intr_event_bind: %d", __func__, - cpuid, error); + /* + * Add new threads required before hooking them up. 
+ */ + for (nwsid = nws_oldcount; nwsid < nws_desired; nwsid++) { + cpuid = nws_array[nwsid]; + nwsp = DPCPU_ID_PTR(cpuid, nws); + KASSERT(nwsp->nws_intr_event == NULL, + ("%s: nws_intr_event CPU %u non-NULL", __func__, cpuid)); + snprintf(swiname, sizeof(swiname), "netisr %u", nwsid); + error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, + nwsp, SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); + if (error) + panic("%s: swi_add %d", __func__, error); + if (netisr_bindthreads) { + error = intr_event_bind(nwsp->nws_intr_event, cpuid); + if (error) + printf("%s: cpu %u: intr_event_bind: %d", + __func__, cpuid, error); + } } + + /* + * Suspend netisr processing; mostly drains queues, prevents new work + * from being scheduled. + */ NETISR_WLOCK(); - nws_array[nws_count] = nwsp->nws_cpu; - nws_count++; + + /* + * XXXRW: Is there any drain activity we can perform here to address + * the race between swi_sched() and netisr_lock() in the swi? We + * can't recurse the rmlock by acquiring it read after write, so we + * can't just walk the workstreams and process pending packets. This + * could occur if the protocol calls back into netisr to schedule new + * work, for example. Possibly we need a more complex solution? Or + * should we just drain the queues and drop the packets... + */ + + /* + * Update worker count and resume processing. + */ + nws_count = nws_desired; NETISR_WUNLOCK(); + + /* + * Remove threads that are now no longer in use. + */ + for (nwsid = nws_desired; nwsid < nws_oldcount; nwsid++) { + cpuid = nws_array[nwsid]; + nwsp = DPCPU_ID_PTR(cpuid, nws); + error = swi_remove(nwsp->nws_swi_cookie); + if (error) + panic("%s: swi_remove %d", __func__, error); + nwsp->nws_intr_event = NULL; + } + + NETISR_CONFIG_UNLOCK(); } /* * Initialize the netisr subsystem. We rely on BSS and static initialization * of most fields in global data structures. 
* - * Start a worker thread for the boot CPU so that we can support network - * traffic immediately in case the network stack is used before additional - * CPUs are started (for example, diskless boot). + * Initialize workstream state for all CPUs, but start a worker only for the + * boot CPU. That way we can support network traffic immediately in case the + * stack is used before additional CPUs are started (for example, diskless + * boot). */ static void netisr_init(void *arg) { + struct netisr_workstream *nwsp; + u_int cpuid, nwsid; KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); + /* + * Initialize global locks. + */ NETISR_LOCK_INIT(); + NETISR_CONFIG_LOCK_INIT(); + + /* + * Digest pre-boot policy and configuration. + */ if (netisr_maxthreads < 1) - netisr_maxthreads = 1; + netisr_maxthreads = mp_ncpus; + if (netisr_defaultthreads < 1) + netisr_defaultthreads = NETISR_DEFAULT_DEFAULTTHREADS; if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); netisr_maxthreads = mp_ncpus; } + if (netisr_defaultthreads > netisr_maxthreads) { + printf("netisr_init: forcing defaultthreads from %d to %d\n", + netisr_defaultthreads, netisr_maxthreads); + netisr_defaultthreads = netisr_maxthreads; + } if (netisr_defaultqlimit > netisr_maxqlimit) { printf("netisr_init: forcing defaultqlimit from %d to %d\n", netisr_defaultqlimit, netisr_maxqlimit); netisr_defaultqlimit = netisr_maxqlimit; } + #ifdef DEVICE_POLLING /* * The device polling code is not yet aware of how to deal with @@ -1042,30 +1145,35 @@ } #endif - netisr_start_swi(curcpu, pcpu_find(curcpu)); + /* + * Initialize workstream data structures, populate nws_array, but + * don't start threads yet as the APs aren't started yet. 
+ */ + for (cpuid = 0, nwsid = 0; cpuid < MAXCPU; cpuid++) { + if (CPU_ABSENT(cpuid)) + continue; + nwsp = DPCPU_ID_PTR(cpuid, nws); + mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); + nwsp->nws_cpu = cpuid; + nws_array[nwsid] = cpuid; + nwsid++; + } + + /* + * Start a boot CPU netisr to get us going. + */ + netisr_adjust_threads(1); } SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); /* - * Start worker threads for additional CPUs. No attempt to gracefully handle - * work reassignment, we don't yet support dynamic reconfiguration. + * Now that SMP is going, create any additional threads we may require. */ static void netisr_start(void *arg) { - struct pcpu *pc; - SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { - if (nws_count >= netisr_maxthreads) - break; - /* XXXRW: Is skipping absent CPUs still required here? */ - if (CPU_ABSENT(pc->pc_cpuid)) - continue; - /* Worker will already be present for boot CPU. */ - if (pc->pc_netisr != NULL) - continue; - netisr_start_swi(pc->pc_cpuid, pc); - } + netisr_adjust_threads(netisr_defaultthreads); } SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); @@ -1233,6 +1341,29 @@ "S,sysctl_netisr_work", "Return list of per-workstream, per-protocol work in netisr"); +/* + * Run-time query and adjustment of thread count using a sysctl. 
+ */
+static int
+sysctl_net_isr_numthreads(SYSCTL_HANDLER_ARGS)
+{
+	u_int numthreads;
+	int error;
+
+	numthreads = nws_count;
+	error = sysctl_handle_int(oidp, &numthreads, 0, req);
+	if (error || req->newptr == NULL)
+		return (error);
+	if (numthreads < 1 || numthreads > netisr_maxthreads)
+		return (EINVAL);
+	netisr_adjust_threads(numthreads);
+	return (0);
+}
+
+SYSCTL_PROC(_net_isr, OID_AUTO, numthreads, CTLTYPE_UINT|CTLFLAG_RW, 0, 0,
+    sysctl_net_isr_numthreads, "IU",
+    "Number of threads used by the netisr framework");
+
 #ifdef DDB
 DB_SHOW_COMMAND(netisr, db_show_netisr)
 {
Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c	(revision 207113)
+++ sys/netinet6/udp6_usrreq.c	(working copy)
@@ -231,11 +231,11 @@
 	init_sin6(&fromsa, m);
 	fromsa.sin6_port = uh->uh_sport;
 
-	INP_INFO_RLOCK(&V_udbinfo);
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		struct inpcb *last;
 		struct ip6_moptions *imo;
 
+		INP_INFO_RLOCK(&V_udbinfo);
 		/*
 		 * In the event that laddr should be set to the link-local
 		 * address (this happens in RIPng), the multicast address
@@ -366,8 +366,9 @@
 	/*
 	 * Locate pcb for datagram.
*/ - inp = in6_pcblookup_hash(&V_udbinfo, &ip6->ip6_src, uh->uh_sport, - &ip6->ip6_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); + inp = in6_pcblookup_group(&V_udbinfo, &ip6->ip6_src, uh->uh_sport, + &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_LOCKPCB, m->m_pkthdr.rcvif); if (inp == NULL) { if (udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; @@ -384,9 +385,8 @@ if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); - goto badheadlocked; + goto badunlocked; } - INP_INFO_RUNLOCK(&V_udbinfo); if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) @@ -394,8 +394,7 @@ icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); } - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_WLOCK_ASSERT(inp); up = intoudpcb(inp); if (up->u_tun_func == NULL) { udp6_append(inp, m, off, &fromsa); @@ -406,7 +405,7 @@ (*up->u_tun_func)(m, off, inp); } - INP_RUNLOCK(inp); + INP_WUNLOCK(inp); return (IPPROTO_DONE); badheadlocked: @@ -507,8 +506,8 @@ } INP_INFO_RLOCK(&V_udbinfo); inp = in6_pcblookup_hash(&V_udbinfo, &addrs[1].sin6_addr, - addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 1, - NULL); + addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, + INPLOOKUP_WILDCARD, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); @@ -519,7 +518,7 @@ inp->inp_socket); if (error == 0) cru2x(inp->inp_cred, &xuc); - INP_RUNLOCK(inp); + INP_WUNLOCK(inp); } else { INP_INFO_RUNLOCK(&V_udbinfo); error = ENOENT; @@ -774,6 +773,7 @@ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + in_pcbgroup_update(inp); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -860,6 +860,7 @@ error = in6_pcbbind(inp, nam, td->td_ucred); out: + in_pcbgroup_update(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (error); @@ -887,6 +888,7 @@ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) 
{ in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + in_pcbgroup_update(inp); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -943,6 +945,7 @@ } soisconnected(so); } + in_pcbgroup_update(inp); out: INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); @@ -997,6 +1000,7 @@ in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + in_pcbgroup_update(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); Index: sys/netinet6/in6_src.c =================================================================== --- sys/netinet6/in6_src.c (revision 207113) +++ sys/netinet6/in6_src.c (working copy) @@ -837,7 +837,7 @@ { struct socket *so = inp->inp_socket; u_int16_t lport = 0, first, last, *lastport; - int count, error, wild = 0, dorandom; + int count, error, lookupflags = 0, dorandom; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); @@ -850,7 +850,7 @@ /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) - wild = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; @@ -921,7 +921,7 @@ *lastport = first; lport = htons(*lastport); } while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, - lport, wild, cred)); + lport, lookupflags, cred)); inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { Index: sys/netinet6/in6_pcb.c =================================================================== --- sys/netinet6/in6_pcb.c (revision 207113) +++ sys/netinet6/in6_pcb.c (working copy) @@ -1,7 +1,11 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * Copyright (c) 2010 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -94,6 +98,8 @@ #include #include #include +#include +#include #include #include @@ -111,7 +117,8 @@ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; - int error, wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error, lookupflags = 0; + int reuseport = (so->so_options & SO_REUSEPORT); INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); @@ -121,7 +128,7 @@ if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) - wild = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) @@ -224,7 +231,7 @@ } } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, - lport, wild, cred); + lport, lookupflags, cred); if (t && (reuseport & ((t->inp_flags & INP_TIMEWAIT) ? 
intotw(t)->tw_so_options : t->inp_socket->so_options)) == 0) @@ -235,7 +242,7 @@ in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, - lport, wild, cred); + lport, lookupflags, cred); if (t && t->inp_flags & INP_TIMEWAIT) { if ((reuseport & intotw(t)->tw_so_options) == 0 && @@ -640,14 +647,17 @@ */ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, - u_short lport, int wild_okay, struct ucred *cred) + u_short lport, int lookupflags, struct ucred *cred) { register struct inpcb *inp; int matchwild = 3, wildcard; + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + INP_INFO_WLOCK_ASSERT(pcbinfo); - if (!wild_okay) { + if (!(lookupflags & INPLOOKUP_WILDCARD)) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that @@ -802,15 +812,158 @@ * Lookup PCB in hash list. */ struct inpcb * +in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, + u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, + int lookupflags, struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcbgroup *pcbgroup; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + int faith; + + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_LOCKPCB)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & INPLOOKUP_LOCKPCB) != 0, + ("%s: INPLOOKUP_LOCKPCB not set", __func__)); + + if (faithprefix_p != NULL) + faith = (*faithprefix_p)(laddr); + else + faith = 0; + + /* + * First look for an exact match. 
+ */ + tmpinp = NULL; + pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[ + INP_PCBHASH(faddr->s6_addr32[3] /* XXX */, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && + IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP6)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + INP_GROUP_UNLOCK(pcbgroup); + + /* + * Then look for a wildcard match, if requested. + */ + pcbgroup = in6_pcbgroup_bytuple(pcbinfo, &in6addr_any, lport, + &in6addr_any, 0); + INP_GROUP_LOCK(pcbgroup); + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; + struct inpcb *jail_wild = NULL; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. 
+		 */
+		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport,
+		    0, pcbgroup->ipg_hashmask)];
+		LIST_FOREACH(inp, head, inp_hash) {
+			/* XXX inp locking */
+			if ((inp->inp_vflag & INP_IPV6) == 0)
+				continue;
+
+			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
+			    inp->inp_lport != lport) {
+				continue;
+			}
+
+			/* XXX inp locking */
+			if (faith && (inp->inp_flags & INP_FAITH) == 0)
+				continue;
+
+			injail = prison_flag(inp->inp_cred, PR_IP6);
+			if (injail) {
+				if (prison_check_ip6(inp->inp_cred,
+				    laddr) != 0)
+					continue;
+			} else {
+				if (local_exact != NULL)
+					continue;
+			}
+
+			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
+				if (injail)
+					goto found;
+				else
+					local_exact = inp;
+			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
+				if (injail)
+					jail_wild = inp;
+				else
+					local_wild = inp;
+			}
+		} /* LIST_FOREACH */
+
+		/* Fall back in order: jail_wild, local_exact, local_wild. */
+		if (jail_wild != NULL)
+			inp = jail_wild;
+		else if (local_exact != NULL)
+			inp = local_exact;
+		else
+			inp = local_wild;
+		if (inp != NULL)
+			goto found;
+	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
+	INP_GROUP_UNLOCK(pcbgroup);
+	return (NULL);
+
+found:
+	if ((lookupflags & INPLOOKUP_LOCKPCB) != 0) {
+		in_pcbref(inp);
+		INP_GROUP_UNLOCK(pcbgroup);
+		INP_WLOCK(inp);
+		if (in_pcbrele(inp))
+			return (NULL);
+	} else
+		INP_GROUP_UNLOCK(pcbgroup);
+	return (inp);
+}
+
+/*
+ * Lookup PCB in hash list.
+ */ +struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, - u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int wildcard, - struct ifnet *ifp) + u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, + int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; int faith; + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + INP_INFO_LOCK_ASSERT(pcbinfo); if (faithprefix_p != NULL) @@ -850,7 +1003,7 @@ /* * Then look for a wildcard match, if requested. */ - if (wildcard == INPLOOKUP_WILDCARD) { + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; @@ -901,17 +1054,16 @@ } } /* LIST_FOREACH */ - if (jail_wild != NULL) - return (jail_wild); - if (local_exact != NULL) - return (local_exact); - if (local_wild != NULL) - return (local_wild); - } /* if (wildcard == INPLOOKUP_WILDCARD) */ - - /* - * Not found. 
-	 */
+		/* Fall back in order: jail_wild, local_exact, local_wild. */
+		if (jail_wild != NULL)
+			inp = jail_wild;
+		else if (local_exact != NULL)
+			inp = local_exact;
+		else
+			inp = local_wild;
+		if (inp != NULL)
+			return (inp);
+	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 	return (NULL);
 }
 
Index: sys/netinet6/in6_pcb.h
===================================================================
--- sys/netinet6/in6_pcb.h	(revision 207113)
+++ sys/netinet6/in6_pcb.h	(working copy)
@@ -69,6 +69,12 @@
 #define	sin6tosa(sin6)	((struct sockaddr *)(sin6))
 #define	ifatoia6(ifa)	((struct in6_ifaddr *)(ifa))
 
+struct inpcbgroup *
+	in6_pcbgroup_byinpcb __P((struct inpcb *));
+struct inpcbgroup *
+	in6_pcbgroup_bytuple __P((struct inpcbinfo *, const struct in6_addr *,
+	    u_short, const struct in6_addr *, u_short));
+
 void	in6_pcbpurgeif0 __P((struct inpcbinfo *, struct ifnet *));
 void	in6_losing __P((struct inpcb *));
 int	in6_pcbbind __P((struct inpcb *, struct sockaddr *, struct ucred *));
@@ -80,6 +86,10 @@
 			     struct in6_addr *, u_short, int,
 			     struct ucred *));
 struct	inpcb *
+	in6_pcblookup_group __P((struct inpcbinfo *, struct in6_addr *,
+			   u_int, struct in6_addr *, u_int, int,
+			   struct ifnet *ifp));
+struct	inpcb *
 	in6_pcblookup_hash __P((struct inpcbinfo *, struct in6_addr *,
 			   u_int, struct in6_addr *, u_int, int,
 			   struct ifnet *));
Index: sys/contrib/pf/net/pf.c
===================================================================
--- sys/contrib/pf/net/pf.c	(revision 207113)
+++ sys/contrib/pf/net/pf.c	(working copy)
@@ -3032,16 +3032,14 @@
 #ifdef INET
 	case AF_INET:
 #ifdef __FreeBSD__
-		INP_INFO_RLOCK(pi);	/* XXX LOR */
-		inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4,
-			dport, 0, NULL);
+		inp = in_pcblookup_group(pi, saddr->v4, sport, daddr->v4,
+			dport, INPLOOKUP_LOCKPCB, NULL);
 		if (inp == NULL) {
-			inp = in_pcblookup_hash(pi, saddr->v4, sport,
-			   daddr->v4, dport, INPLOOKUP_WILDCARD, NULL);
-			if(inp == NULL) {
-				INP_INFO_RUNLOCK(pi);
+			inp = in_pcblookup_group(pi, saddr->v4, sport,
+			   daddr->v4, dport, INPLOOKUP_WILDCARD |
+			   INPLOOKUP_LOCKPCB, NULL);
+			if (inp == NULL)
 				return (-1);
-			}
 		}
 #else
 		inp = in_pcbhashlookup(tb,
saddr->v4, sport, daddr->v4, dport);
@@ -3056,16 +3054,14 @@
 #ifdef INET6
 	case AF_INET6:
 #ifdef __FreeBSD__
-		INP_INFO_RLOCK(pi);
-		inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
-			&daddr->v6, dport, 0, NULL);
+		inp = in6_pcblookup_group(pi, &saddr->v6, sport,
+			&daddr->v6, dport, INPLOOKUP_LOCKPCB, NULL);
 		if (inp == NULL) {
-			inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
-			&daddr->v6, dport, INPLOOKUP_WILDCARD, NULL);
-			if (inp == NULL) {
-				INP_INFO_RUNLOCK(pi);
+			inp = in6_pcblookup_group(pi, &saddr->v6, sport,
+			&daddr->v6, dport, INPLOOKUP_WILDCARD |
+			INPLOOKUP_LOCKPCB, NULL);
+			if (inp == NULL)
 				return (-1);
-			}
 		}
 #else
 		inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6,
@@ -3083,9 +3079,10 @@
 		return (-1);
 	}
 #ifdef __FreeBSD__
+	INP_WLOCK_ASSERT(inp);
 	pd->lookup.uid = inp->inp_cred->cr_uid;
 	pd->lookup.gid = inp->inp_cred->cr_groups[0];
-	INP_INFO_RUNLOCK(pi);
+	INP_WUNLOCK(inp);
 #else
 	pd->lookup.uid = inp->inp_socket->so_euid;
 	pd->lookup.gid = inp->inp_socket->so_egid;
Index: sys/amd64/conf/GENERIC
===================================================================
--- sys/amd64/conf/GENERIC	(revision 207113)
+++ sys/amd64/conf/GENERIC	(working copy)
@@ -68,6 +68,8 @@
 options 	INCLUDE_CONFIG_FILE	# Include this file in kernel
 
 # Debugging for use in -current
+options 	ALT_BREAK_TO_DEBUGGER
+options 	BREAK_TO_DEBUGGER
 options 	KDB			# Enable kernel debugger support.
 options 	DDB			# Support DDB.
 options 	GDB			# Support remote GDB.
Index: sys/sys/pcpu.h
===================================================================
--- sys/sys/pcpu.h	(revision 207113)
+++ sys/sys/pcpu.h	(working copy)
@@ -142,7 +142,7 @@
 	struct vmmeter	pc_cnt;		/* VM stats counters */
 	long		pc_cp_time[CPUSTATES];	/* statclock ticks */
 	struct device	*pc_device;
-	void		*pc_netisr;		/* netisr SWI cookie */
+	void		*_pc_spareptr;		/* unused */
 
 	/*
 	 * Stuff for read mostly lock