diff -urN -x compile -x LINT vendor/freebsd/src/sys/amd64/conf/BENCHMARK user/rwatson/tcp/src/sys/amd64/conf/BENCHMARK --- vendor/freebsd/src/sys/amd64/conf/BENCHMARK 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/amd64/conf/BENCHMARK 2010-04-12 20:00:13.845348626 +0100 @@ -0,0 +1,8 @@ +include GENERIC +ident BENCHMARK + +nooptions DEADLKRES +nooptions INVARIANTS +nooptions INVARIANT_SUPPORT +nooptions WITNESS +nooptions WITNESS_SKIPSPIN diff -urN -x compile -x LINT vendor/freebsd/src/sys/amd64/conf/GENERIC user/rwatson/tcp/src/sys/amd64/conf/GENERIC --- vendor/freebsd/src/sys/amd64/conf/GENERIC 2011-02-01 09:35:49.147243136 +0000 +++ user/rwatson/tcp/src/sys/amd64/conf/GENERIC 2010-05-22 11:56:29.963634527 +0100 @@ -68,6 +68,8 @@ options INCLUDE_CONFIG_FILE # Include this file in kernel # Debugging for use in -current +options ALT_BREAK_TO_DEBUGGER +options BREAK_TO_DEBUGGER options KDB # Enable kernel debugger support. options DDB # Support DDB. options GDB # Support remote GDB. 
diff -urN -x compile -x LINT vendor/freebsd/src/sys/amd64/conf/PCBGROUP user/rwatson/tcp/src/sys/amd64/conf/PCBGROUP --- vendor/freebsd/src/sys/amd64/conf/PCBGROUP 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/amd64/conf/PCBGROUP 2011-01-30 14:25:50.013663875 +0000 @@ -0,0 +1,4 @@ +include GENERIC +ident PCBGROUP + +options PCBGROUP diff -urN -x compile -x LINT vendor/freebsd/src/sys/amd64/conf/RSS user/rwatson/tcp/src/sys/amd64/conf/RSS --- vendor/freebsd/src/sys/amd64/conf/RSS 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/amd64/conf/RSS 2011-01-30 14:25:50.013663875 +0000 @@ -0,0 +1,4 @@ +include PCBGROUP +ident RSS + +options RSS diff -urN -x compile -x LINT vendor/freebsd/src/sys/conf/files user/rwatson/tcp/src/sys/conf/files --- vendor/freebsd/src/sys/conf/files 2011-02-01 09:36:10.043414577 +0000 +++ user/rwatson/tcp/src/sys/conf/files 2011-02-02 16:48:26.283953244 +0000 @@ -2499,9 +2499,11 @@ netinet/ip_id.c optional inet netinet/in_mcast.c optional inet netinet/in_pcb.c optional inet +netinet/in_pcbgroup.c optional inet netinet/in_proto.c optional inet \ compile-with "${NORMAL_C} -I$S/contrib/pf" netinet/in_rmx.c optional inet +netinet/in_rss.c optional inet rss netinet/ip_divert.c optional inet ipdivert ipfirewall netinet/ipfw/dn_heap.c optional inet dummynet netinet/ipfw/dn_sched_fifo.c optional inet dummynet @@ -2557,6 +2559,7 @@ netinet/tcp_timer.c optional inet netinet/tcp_timewait.c optional inet netinet/tcp_usrreq.c optional inet +netinet/toeplitz.c optional inet rss netinet/udp_usrreq.c optional inet netinet/libalias/alias.c optional libalias inet | netgraph_nat inet netinet/libalias/alias_db.c optional libalias inet | netgraph_nat inet @@ -2573,6 +2576,7 @@ netinet6/in6_ifattach.c optional inet6 netinet6/in6_mcast.c optional inet6 netinet6/in6_pcb.c optional inet6 +netinet6/in6_pcbgroup.c optional inet6 pcbgroup netinet6/in6_proto.c optional inet6 netinet6/in6_rmx.c optional inet6 netinet6/in6_src.c optional 
inet6 diff -urN -x compile -x LINT vendor/freebsd/src/sys/conf/options user/rwatson/tcp/src/sys/conf/options --- vendor/freebsd/src/sys/conf/options 2011-02-01 09:36:10.273637133 +0000 +++ user/rwatson/tcp/src/sys/conf/options 2011-01-30 14:25:50.823549489 +0000 @@ -386,6 +386,7 @@ ETHER_II opt_ef.h ETHER_SNAP opt_ef.h INET opt_inet.h +INET_NOSTATS opt_inet.h INET6 opt_inet6.h IPDIVERT IPFILTER opt_ipfilter.h @@ -414,8 +415,10 @@ NCP NETATALK opt_atalk.h NFSLOCKD +PCBGROUP opt_pcbgroup.h RADIX_MPATH opt_mpath.h ROUTETABLES opt_route.h +RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCP_OFFLOAD_DISABLE opt_inet.h #Disable code to dispatch tcp offloading diff -urN -x compile -x LINT vendor/freebsd/src/sys/contrib/pf/net/pf.c user/rwatson/tcp/src/sys/contrib/pf/net/pf.c --- vendor/freebsd/src/sys/contrib/pf/net/pf.c 2011-02-01 09:36:26.883481122 +0000 +++ user/rwatson/tcp/src/sys/contrib/pf/net/pf.c 2011-02-01 09:49:38.183422874 +0000 @@ -3032,16 +3032,18 @@ #ifdef INET case AF_INET: #ifdef __FreeBSD__ - INP_INFO_RLOCK(pi); /* XXX LOR */ - inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, - dport, 0, NULL); + /* + * XXXRW: would be nice if we had an mbuf here so that we + * could use in_pcblookup_mbuf(). + */ + inp = in_pcblookup(pi, saddr->v4, sport, daddr->v4, + dport, INPLOOKUP_RLOCKPCB, NULL); if (inp == NULL) { - inp = in_pcblookup_hash(pi, saddr->v4, sport, - daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); - if(inp == NULL) { - INP_INFO_RUNLOCK(pi); + inp = in_pcblookup(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, NULL); + if (inp == NULL) return (-1); - } } #else inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport); @@ -3056,16 +3058,18 @@ #ifdef INET6 case AF_INET6: #ifdef __FreeBSD__ - INP_INFO_RLOCK(pi); - inp = in6_pcblookup_hash(pi, &saddr->v6, sport, - &daddr->v6, dport, 0, NULL); + /* + * XXXRW: would be nice if we had an mbuf here so that we + * could use in6_pcblookup_mbuf(). 
+ */ + inp = in6_pcblookup(pi, &saddr->v6, sport, + &daddr->v6, dport, INPLOOKUP_RLOCKPCB, NULL); if (inp == NULL) { - inp = in6_pcblookup_hash(pi, &saddr->v6, sport, - &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL); - if (inp == NULL) { - INP_INFO_RUNLOCK(pi); + inp = in6_pcblookup(pi, &saddr->v6, sport, + &daddr->v6, dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, NULL); + if (inp == NULL) return (-1); - } } #else inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6, @@ -3083,9 +3087,10 @@ return (-1); } #ifdef __FreeBSD__ + INP_RLOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; - INP_INFO_RUNLOCK(pi); + INP_RUNLOCK(inp); #else pd->lookup.uid = inp->inp_socket->so_euid; pd->lookup.gid = inp->inp_socket->so_egid; diff -urN -x compile -x LINT vendor/freebsd/src/sys/dev/cxgb/common/cxgb_common.h user/rwatson/tcp/src/sys/dev/cxgb/common/cxgb_common.h --- vendor/freebsd/src/sys/dev/cxgb/common/cxgb_common.h 2011-02-01 09:36:55.213488427 +0000 +++ user/rwatson/tcp/src/sys/dev/cxgb/common/cxgb_common.h 2010-06-11 21:32:48.862294497 +0100 @@ -735,6 +735,8 @@ void t3_config_rss(adapter_t *adapter, unsigned int rss_config, const u8 *cpus, const u16 *rspq); int t3_read_rss(adapter_t *adapter, u8 *lkup, u16 *map); +void t3_config_rss_secret(adapter_t *adapter, u8 *keyp); +void t3_read_rss_secret(adapter_t *adapter, u8 *keyp); int t3_set_proto_sram(adapter_t *adap, const u8 *data); int t3_mps_set_active_ports(adapter_t *adap, unsigned int port_mask); void t3_port_failover(adapter_t *adapter, int port); diff -urN -x compile -x LINT vendor/freebsd/src/sys/dev/cxgb/common/cxgb_t3_hw.c user/rwatson/tcp/src/sys/dev/cxgb/common/cxgb_t3_hw.c --- vendor/freebsd/src/sys/dev/cxgb/common/cxgb_t3_hw.c 2011-02-01 09:36:55.873447165 +0000 +++ user/rwatson/tcp/src/sys/dev/cxgb/common/cxgb_t3_hw.c 2010-06-11 21:32:49.610301855 +0100 @@ -2990,6 +2990,41 @@ } /** + * t3_config_rss_secret - write RSS key + * @adapter: the adapter + * 
@keyp: the 16-byte RSS key + * + * Writes the RSS hash key. + */ +void t3_config_rss_secret(adapter_t *adapter, u8 *keyp) +{ + const u32 *buf = (const u32 *)keyp; + + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY3, cpu_to_be32(*buf++)); + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY2, cpu_to_be32(*buf++)); + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY1, cpu_to_be32(*buf++)); + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY0, cpu_to_be32(*buf++)); +} + +/** + * t3_read_rss_secret - read RSS key + * @adapter: the adapter + * @keyp: storage for the retrieved 16-byte RSS key + * + * Reads the RSS hash key. + */ +void t3_read_rss_secret(adapter_t *adapter, u8 *keyp) +{ + u32 *buf = (u32 *)keyp; + + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY3)); + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY2)); + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY1)); + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY0)); +} + + +/** * t3_tp_set_offload_mode - put TP in NIC/offload mode * @adap: the adapter * @enable: 1 to select offload mode, 0 for regular NIC diff -urN -x compile -x LINT vendor/freebsd/src/sys/dev/cxgb/cxgb_main.c user/rwatson/tcp/src/sys/dev/cxgb/cxgb_main.c --- vendor/freebsd/src/sys/dev/cxgb/cxgb_main.c 2011-02-01 09:36:56.167156002 +0000 +++ user/rwatson/tcp/src/sys/dev/cxgb/cxgb_main.c 2011-01-30 14:25:51.403405030 +0000 @@ -27,6 +27,8 @@ ***************************************************************************/ +#include "opt_rss.h" + #include __FBSDID("$FreeBSD: src/sys/dev/cxgb/cxgb_main.c,v 1.112 2010/06/07 08:23:16 np Exp $"); @@ -66,6 +68,7 @@ #include #include +#include #include #include #include @@ -1425,9 +1428,24 @@ { int i; u_int nq[2]; +#ifdef RSS + uint8_t key[RSS_KEYSIZE]; +#endif uint8_t cpus[SGE_QSETS + 1]; uint16_t rspq_map[RSS_TABLE_SIZE]; - + +#ifdef RSS + /* + * XXXRW: Implicit assumption that RSS_KEYSIZE >= the size of the key + * written by t3_config_rss_secret(). 
+ * + * XXXRW: need to program rspq array. + * + * XXXRW: need to program cpu array. + */ + rss_getkey(key); + t3_config_rss_secret(adap, key); +#endif for (i = 0; i < SGE_QSETS; ++i) cpus[i] = i; cpus[SGE_QSETS] = 0xff; diff -urN -x compile -x LINT vendor/freebsd/src/sys/dev/cxgb/cxgb_osdep.h user/rwatson/tcp/src/sys/dev/cxgb/cxgb_osdep.h --- vendor/freebsd/src/sys/dev/cxgb/cxgb_osdep.h 2011-02-01 09:36:56.203398318 +0000 +++ user/rwatson/tcp/src/sys/dev/cxgb/cxgb_osdep.h 2010-06-11 21:32:49.740313631 +0100 @@ -201,6 +201,7 @@ #define max_t(type, a, b) (type)max((a), (b)) #define net_device ifnet #define cpu_to_be32 htobe32 +#define be32_to_cpu be32toh /* Standard PHY definitions */ #define BMCR_LOOPBACK BMCR_LOOP diff -urN -x compile -x LINT vendor/freebsd/src/sys/dev/cxgb/cxgb_sge.c user/rwatson/tcp/src/sys/dev/cxgb/cxgb_sge.c --- vendor/freebsd/src/sys/dev/cxgb/cxgb_sge.c 2011-02-01 09:36:56.334801890 +0000 +++ user/rwatson/tcp/src/sys/dev/cxgb/cxgb_sge.c 2010-10-19 06:25:31.591604613 +0100 @@ -2975,6 +2975,7 @@ uint32_t flags = ntohl(r->flags); uint32_t rss_csum = *(const uint32_t *)r; uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); + uint8_t hash_type = r->rss_hdr.hash_type; eth = (r->rss_hdr.opcode == CPL_RX_PKT); @@ -3024,8 +3025,31 @@ eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r); if (eop) { - rspq->rspq_mh.mh_head->m_flags |= M_FLOWID; - rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash; + struct mbuf *m = rspq->rspq_mh.mh_head; + + if (hash_type) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = rss_hash; + } + switch (hash_type) { + case 0: + M_HASHTYPE_SET(m, M_HASHTYPE_NONE); + break; + + case 1: + M_HASHTYPE_SET(m, + M_HASHTYPE_RSS_2TUPLE); + break; + + case 2: + M_HASHTYPE_SET(m, + M_HASHTYPE_RSS_4TUPLE); + break; + + case 3: + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); + break; + } } ethpad = 2; diff -urN -x compile -x LINT vendor/freebsd/src/sys/i386/conf/LINT-VIMAGE user/rwatson/tcp/src/sys/i386/conf/LINT-VIMAGE --- 
vendor/freebsd/src/sys/i386/conf/LINT-VIMAGE 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/i386/conf/LINT-VIMAGE 2010-11-13 21:16:38.914821461 +0000 @@ -0,0 +1,3 @@ +include LINT +ident LINT-VIMAGE +options VIMAGE diff -urN -x compile -x LINT vendor/freebsd/src/sys/i386/conf/PCBGROUP user/rwatson/tcp/src/sys/i386/conf/PCBGROUP --- vendor/freebsd/src/sys/i386/conf/PCBGROUP 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/i386/conf/PCBGROUP 2011-01-30 14:25:51.513676866 +0000 @@ -0,0 +1,4 @@ +include GENERIC +ident PCBGROUP + +options PCBGROUP diff -urN -x compile -x LINT vendor/freebsd/src/sys/i386/conf/RSS user/rwatson/tcp/src/sys/i386/conf/RSS --- vendor/freebsd/src/sys/i386/conf/RSS 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/i386/conf/RSS 2011-02-01 09:02:14.733492227 +0000 @@ -0,0 +1,4 @@ +include PCBGROUP +ident RSS + +options RSS diff -urN -x compile -x LINT vendor/freebsd/src/sys/modules/cxgb/cxgb/Makefile user/rwatson/tcp/src/sys/modules/cxgb/cxgb/Makefile --- vendor/freebsd/src/sys/modules/cxgb/cxgb/Makefile 2011-02-01 09:38:33.174677349 +0000 +++ user/rwatson/tcp/src/sys/modules/cxgb/cxgb/Makefile 2010-10-18 17:08:38.081813566 +0100 @@ -8,7 +8,7 @@ SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c cxgb_aq100x.c SRCS+= cxgb_sge.c cxgb_offload.c cxgb_tn1010.c SRCS+= device_if.h bus_if.h pci_if.h -SRCS+= opt_inet.h opt_zero.h opt_sched.h +SRCS+= opt_inet.h opt_rss.h opt_zero.h opt_sched.h SRCS+= uipc_mvec.c CFLAGS+= -g -DDEFAULT_JUMBO -I${CXGB} diff -urN -x compile -x LINT vendor/freebsd/src/sys/net/if_ethersubr.c user/rwatson/tcp/src/sys/net/if_ethersubr.c --- vendor/freebsd/src/sys/net/if_ethersubr.c 2011-02-01 09:38:34.773490048 +0000 +++ user/rwatson/tcp/src/sys/net/if_ethersubr.c 2011-02-01 08:30:17.343401866 +0000 @@ -37,6 +37,7 @@ #include "opt_netgraph.h" #include "opt_carp.h" #include "opt_mbuf_profiling.h" +#include "opt_rss.h" #include #include @@ -70,6 +71,7 @@ #include #include 
#include +#include #include #include #include @@ -112,6 +114,9 @@ CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); + /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); @@ -566,7 +571,7 @@ * mbuf chain m with the ethernet header at the front. */ static void -ether_input(struct ifnet *ifp, struct mbuf *m) +ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; @@ -761,6 +766,74 @@ CURVNET_RESTORE(); } +#ifdef RSS +/* + * Ethernet input dispatch; by default, direct dispatch here regardless of + * global configuration. However, if RSS is enabled, hook up RSS affinity + * so that when deferred or hybrid dispatch is enabled, we can redistribute + * load based on RSS. + * + * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or + * not it had already done work distribution via multi-queue. Then we could + * direct dispatch in the event load balancing was already complete and + * handle the case of interfaces with different capabilities better. + * + * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions + * at multiple layers? + * + * XXXRW: For now, enable all this only if RSS is compiled in, although it + * works fine without RSS. Need to characterise the performance overhead + * of the detour through the netisr code in the event the result is always + * direct dispatch. 
+ */ +static void +ether_nh_input(struct mbuf *m) +{ + + ether_input_internal(m->m_pkthdr.rcvif, m); +} + +static struct netisr_handler ether_nh = { + .nh_name = "ether", + .nh_handler = ether_nh_input, + .nh_proto = NETISR_ETHER, +#ifdef RSS + .nh_policy = NETISR_POLICY_CPU, + .nh_dispatch = NETISR_DISPATCH_DIRECT, + .nh_m2cpuid = rss_m2cpuid, +#else + .nh_policy = NETISR_POLICY_SOURCE, + .nh_dispatch = NETISR_DISPATCH_DIRECT, +#endif +}; + +static void +ether_init(__unused void *arg) +{ + + netisr_register(&ether_nh); +} +SYSINIT(ether, SI_SUB_PSEUDO, SI_ORDER_ANY, ether_init, NULL); +#endif /* RSS */ + +static void +ether_input(struct ifnet *ifp, struct mbuf *m) +{ + +#ifdef RSS + /* + * We will rely on rcvif being set properly in the deferred context, + * so assert it is correct here. Arguably, this should be asserted + * for all cases, not just the RSS case. + */ + KASSERT(m->m_pkthdr.rcvif == ifp, ("ether_input: ifnet mismatch")); + + netisr_dispatch(NETISR_ETHER, m); +#else + ether_input_internal(ifp, m); +#endif +} + /* * Upper layer processing for a received Ethernet packet. */ @@ -993,8 +1066,6 @@ if_detach(ifp); } -SYSCTL_DECL(_net_link); -SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if defined(INET) || defined(INET6) SYSCTL_VNET_INT(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, &VNET_NAME(ether_ipfw), 0, "Pass ether pkts through firewall"); diff -urN -x compile -x LINT vendor/freebsd/src/sys/net/netisr.c user/rwatson/tcp/src/sys/net/netisr.c --- vendor/freebsd/src/sys/net/netisr.c 2011-02-01 09:38:35.429142931 +0000 +++ user/rwatson/tcp/src/sys/net/netisr.c 2011-02-02 15:57:49.833777034 +0000 @@ -1,6 +1,6 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson - * Copyright (c) 2010 Juniper Networks, Inc. + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. 
Watson under contract @@ -94,6 +94,31 @@ #include #include +/* + * Locking strategy: three types of locks protect netisr processing: + * + * netisr configuration lock - serializes "rethreading" events, in which the + * number of worker threads is changed. + * + * netisr_rmlock - stabilizes the netisr system for network processing, + * almost always acquired as a read lock (except during configuration + * changes). + * + * nws_mtx - per-workstream lock that serializes access to queues. + */ + +/* + * netisr configuration lock: serialize rethread events, in which the thread + * count may be increased and decreased, to avoid interlacing of these + * events, which might expose incompletely started or stopped threads, etc. + * This is a sleep lock so that it can be held over ithread start/stop. + */ +static struct sx netisr_config_sx; +#define NETISR_CONFIG_LOCK_INIT() sx_init(&netisr_config_sx, \ + "netisr_config_sx") +#define NETISR_CONFIG_LOCK() sx_xlock(&netisr_config_sx) +#define NETISR_CONFIG_UNLOCK() sx_xunlock(&netisr_config_sx) + /*- * Synchronize use and modification of the registered netisr data structures; * acquire a read lock while modifying the set of registered protocols to @@ -114,51 +139,74 @@ * * XXXRW: rmlocks don't support assertions. 
*/ +#define NETISR_RMLOCKING + +#ifdef NETISR_RMLOCKING static struct rmlock netisr_rmlock; #define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ - RM_NOWITNESS) + RM_NOWITNESS | RM_RECURSE) #define NETISR_LOCK_ASSERT() #define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) #define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) #define NETISR_WLOCK() rm_wlock(&netisr_rmlock) #define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) -/* #define NETISR_LOCKING */ +#else +#define NETISR_LOCK_INIT() +#define NETISR_LOCK_ASSERT() +#define NETISR_RLOCK(x) +#define NETISR_RUNLOCK(x) +#define NETISR_WLOCK() +#define NETISR_WUNLOCK() +#endif SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); /*- - * Three direct dispatch policies are supported: + * Three global direct dispatch policies are supported: * - * - Always defer: all work is scheduled for a netisr, regardless of context. - * (!direct) + * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of + * context (may be overridden by protocols). * - * - Hybrid: if the executing context allows direct dispatch, and we're - * running on the CPU the work would be done on, then direct dispatch if it - * wouldn't violate ordering constraints on the workstream. - * (direct && !direct_force) + * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch, + * and we're running on the CPU the work would be performed on, then direct + * dispatch it if it wouldn't violate ordering constraints on the workstream. * - * - Always direct: if the executing context allows direct dispatch, always - * direct dispatch. (direct && direct_force) + * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch, + * always direct dispatch. (The default.) * * Notice that changing the global policy could lead to short periods of * misordered processing, but this is considered acceptable as compared to - * the complexity of enforcing ordering during policy changes. 
- */ -static int netisr_direct_force = 1; /* Always direct dispatch. */ -TUNABLE_INT("net.isr.direct_force", &netisr_direct_force); -SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RW, - &netisr_direct_force, 0, "Force direct dispatch"); - -static int netisr_direct = 1; /* Enable direct dispatch. */ -TUNABLE_INT("net.isr.direct", &netisr_direct); -SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW, - &netisr_direct, 0, "Enable direct dispatch"); - -/* - * Allow the administrator to limit the number of threads (CPUs) to use for - * netisr. We don't check netisr_maxthreads before creating the thread for - * CPU 0, so in practice we ignore values <= 1. This must be set at boot. - * We will create at most one thread per CPU. + * the complexity of enforcing ordering during policy changes. Protocols can + * override the global policy (when they're not doing that, they select + * NETISR_DISPATCH_DEFAULT). + */ +#define NETISR_DISPATCH_POLICY_DEFAULT NETISR_DISPATCH_DIRECT +#define NETISR_DISPATCH_POLICY_MAXSTR 20 /* Used for temporary buffers. */ +static u_int netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT; +static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RW | + CTLFLAG_TUN, 0, 0, sysctl_netisr_dispatch_policy, "A", + "netisr dispatch policy"); + +/* + * These sysctls were used in previous versions to control and export + * dispatch policy state. Now, we provide read-only export via them so that + * older netstat binaries work. At some point they can be garbage collected. 
+ */ +static int netisr_direct_force; +SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RD, + &netisr_direct_force, 0, "compat: force direct dispatch"); + +static int netisr_direct; +SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RD, &netisr_direct, 0, + "compat: enable direct dispatch"); + +/* + * Maximum number of threads to be used by netisr; will be capped to the + * number of CPUs, or set to the number of CPUs if the default of '-1' is + * present. In the future, if we allow the stack/schedule to adjust the + * number of threads dynamically then this will allow the administrator to + * tune that behavior. */ static int netisr_maxthreads = -1; /* Max number of threads. */ TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads); @@ -166,15 +214,30 @@ &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); +/* + * The default number of threads to use at boot, which will be capped to + * maxthreads. We default to 1 for now. + */ +#define NETISR_DEFAULT_DEFAULTTHREADS 1 +static int netisr_defaultthreads = NETISR_DEFAULT_DEFAULTTHREADS; +TUNABLE_INT("net.isr.defaultthreads", &netisr_defaultthreads); +SYSCTL_INT(_net_isr, OID_AUTO, defaultthreads, CTLFLAG_RD, + &netisr_defaultthreads, 0, + "Use this many CPUs for netisr processing by default"); + +/* + * Bind workstream threads to CPUs. For now, we allow things to float + * around by default, but as our notions of CPU affinity get stronger, we may + * want to change this policy. + */ static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads); SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); /* - * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit, - * both for initial configuration and later modification using - * netisr_setqlimit(). 
+ * Limit per-workstream mbuf queue limits to at most net.isr.maxqlimit, both + * for initial configuration and later modification using netisr_setqlimit(). */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; @@ -225,11 +288,10 @@ /* * Number of registered workstreams. Will be at most the number of running - * CPUs once fully started. + * CPUs once fully started. To modify this, must hold both of + * netisr_config_sx and netisr_rmlock for write. */ static u_int nws_count; -SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, - &nws_count, 0, "Number of extant netisr threads."); /* * Synchronization for each workstream: a mutex protects all mutable fields @@ -276,6 +338,103 @@ } /* + * Dispatch tunable and sysctl configuration. + */ +struct netisr_dispatch_table_entry { + u_int ndte_policy; + const char *ndte_policy_str; +}; +static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = { + { NETISR_DISPATCH_DEFAULT, "default" }, + { NETISR_DISPATCH_DEFERRED, "deferred" }, + { NETISR_DISPATCH_HYBRID, "hybrid" }, + { NETISR_DISPATCH_DIRECT, "direct" }, +}; +static const u_int netisr_dispatch_table_len = + (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0])); + +static void +netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer, + u_int buflen) +{ + const struct netisr_dispatch_table_entry *ndtep; + const char *str; + u_int i; + + str = "unknown"; + for (i = 0; i < netisr_dispatch_table_len; i++) { + ndtep = &netisr_dispatch_table[i]; + if (ndtep->ndte_policy == dispatch_policy) { + str = ndtep->ndte_policy_str; + break; + } + } + snprintf(buffer, buflen, "%s", str); +} + +static int +netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp) +{ + const struct netisr_dispatch_table_entry *ndtep; + u_int i; + + for (i = 0; i < netisr_dispatch_table_len; i++) { + ndtep = &netisr_dispatch_table[i]; + if (strcmp(ndtep->ndte_policy_str, str) == 0) { + *dispatch_policyp = 
ndtep->ndte_policy; + return (0); + } + } + return (EINVAL); +} + +static void +netisr_dispatch_policy_compat(void) +{ + + switch (netisr_dispatch_policy) { + case NETISR_DISPATCH_DEFERRED: + netisr_direct_force = 0; + netisr_direct = 0; + break; + + case NETISR_DISPATCH_HYBRID: + netisr_direct_force = 0; + netisr_direct = 1; + break; + + case NETISR_DISPATCH_DIRECT: + netisr_direct_force = 1; + netisr_direct = 1; + break; + + default: + panic("netisr_dispatch_policy_compat: unknown policy"); + } +} + +static int +sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) +{ + char tmp[NETISR_DISPATCH_POLICY_MAXSTR]; + u_int dispatch_policy; + int error; + + netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp, + sizeof(tmp)); + error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req); + if (error == 0 && req->newptr != NULL) { + error = netisr_dispatch_policy_from_str(tmp, + &dispatch_policy); + if (error == 0) { + netisr_dispatch_policy = dispatch_policy; + netisr_dispatch_policy_compat(); + } + } + return (error); +} + +/* * Register a new netisr handler, which requires initializing per-protocol * fields for each workstream. All netisr work is briefly suspended while * the protocol is installed. 
@@ -312,6 +471,12 @@ KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL, ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__, name)); + KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT || + nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED || + nhp->nh_dispatch == NETISR_DISPATCH_HYBRID || + nhp->nh_dispatch == NETISR_DISPATCH_DIRECT, + ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch)); + KASSERT(proto < NETISR_MAXPROT, ("%s(%u, %s): protocol too big", __func__, proto, name)); @@ -339,6 +504,7 @@ } else netisr_proto[proto].np_qlimit = nhp->nh_qlimit; netisr_proto[proto].np_policy = nhp->nh_policy; + netisr_proto[proto].np_dispatch = nhp->nh_dispatch; for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) continue; @@ -389,7 +555,9 @@ netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) { struct netisr_work *npwp; +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; +#endif #ifdef INVARIANTS const char *name; #endif @@ -423,7 +591,9 @@ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) { +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; +#endif #ifdef INVARIANTS const char *name; #endif @@ -551,15 +721,35 @@ } /* + * Compose the global and per-protocol policies on dispatch, and return the + * dispatch policy to use. + * + * XXXRW: Rename these sysctls to make it clear they are selecting a default, + * and to use the same policy naming scheme. + */ +static u_int +netisr_get_dispatch(struct netisr_proto *npp) +{ + + /* + * Protocol-specific configuration overrides the global default. + */ + if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT) + return (npp->np_dispatch); + return (netisr_dispatch_policy); +} + +/* * Look up the workstream given a packet and source identifier. Do this by * checking the protocol's policy, and optionally call out to the protocol * for assistance if required. 
*/ static struct mbuf * -netisr_select_cpuid(struct netisr_proto *npp, uintptr_t source, - struct mbuf *m, u_int *cpuidp) +netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, + uintptr_t source, struct mbuf *m, u_int *cpuidp) { struct ifnet *ifp; + u_int policy; NETISR_LOCK_ASSERT(); @@ -577,11 +767,30 @@ * If we want to support per-interface policies, we should do that * here first. */ - switch (npp->np_policy) { - case NETISR_POLICY_CPU: - return (npp->np_m2cpuid(m, source, cpuidp)); + policy = npp->np_policy; + if (policy == NETISR_POLICY_CPU) { + m = npp->np_m2cpuid(m, source, cpuidp); + if (m == NULL) + return (NULL); + + /* + * It's possible for a protocol not to have a good idea about + * where to process a packet, in which case we fall back on + * the netisr code to decide. In the hybrid case, return the + * current CPU ID, which will force an immediate direct + * dispatch. In the queued case, fall back on the SOURCE + * policy. + */ + if (*cpuidp != NETISR_CPUID_NONE) + return (m); + if (dispatch_policy == NETISR_DISPATCH_HYBRID) { + *cpuidp = curcpu; + return (m); + } + policy = NETISR_POLICY_SOURCE; + } - case NETISR_POLICY_FLOW: + if (policy == NETISR_POLICY_FLOW) { if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) { m = npp->np_m2flow(m, source); if (m == NULL) @@ -592,21 +801,19 @@ netisr_default_flow2cpu(m->m_pkthdr.flowid); return (m); } - /* FALLTHROUGH */ - - case NETISR_POLICY_SOURCE: - ifp = m->m_pkthdr.rcvif; - if (ifp != NULL) - *cpuidp = nws_array[(ifp->if_index + source) % - nws_count]; - else - *cpuidp = nws_array[source % nws_count]; - return (m); - - default: - panic("%s: invalid policy %u for %s", __func__, - npp->np_policy, npp->np_name); + policy = NETISR_POLICY_SOURCE; } + + KASSERT(policy == NETISR_POLICY_SOURCE, + ("%s: invalid policy %u for %s", __func__, npp->np_policy, + npp->np_name)); + + ifp = m->m_pkthdr.rcvif; + if (ifp != NULL) + *cpuidp = nws_array[(ifp->if_index + source) % nws_count]; + else + 
*cpuidp = nws_array[source % nws_count]; + return (m); } /* @@ -680,7 +887,7 @@ static void swi_net(void *arg) { -#ifdef NETISR_LOCKING +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; @@ -693,9 +900,7 @@ ("%s: device_polling but nws_count != 1", __func__)); netisr_poll(); #endif -#ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); -#endif NWS_LOCK(nwsp); KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); if (nwsp->nws_flags & NWS_DISPATCHING) @@ -712,9 +917,7 @@ nwsp->nws_flags &= ~NWS_RUNNING; out: NWS_UNLOCK(nwsp); -#ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); -#endif #ifdef DEVICE_POLLING netisr_pollmore(); #endif @@ -767,9 +970,7 @@ struct netisr_work *npwp; int dosignal, error; -#ifdef NETISR_LOCKING NETISR_LOCK_ASSERT(); -#endif KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, cpuid, mp_maxid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); @@ -789,7 +990,7 @@ int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) { -#ifdef NETISR_LOCKING +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; #endif u_int cpuid; @@ -798,22 +999,19 @@ KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); -#ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); -#endif KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); - m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid); + m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED, + source, m, &cpuid); if (m != NULL) { KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); error = netisr_queue_internal(proto, m, cpuid); } else error = ENOBUFS; -#ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); -#endif return (error); } @@ -831,27 +1029,27 @@ int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) { -#ifdef NETISR_LOCKING +#ifdef NETISR_RMLOCKING struct rm_priotracker tracker; #endif struct netisr_workstream 
*nwsp; + struct netisr_proto *npp; struct netisr_work *npwp; int dosignal, error; - u_int cpuid; - - /* - * If direct dispatch is entirely disabled, fall back on queueing. - */ - if (!netisr_direct) - return (netisr_queue_src(proto, source, m)); + u_int cpuid, dispatch_policy; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); -#ifdef NETISR_LOCKING + NETISR_RLOCK(&tracker); -#endif - KASSERT(netisr_proto[proto].np_handler != NULL, - ("%s: invalid proto %u", __func__, proto)); + npp = &netisr_proto[proto]; + + KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__, + proto)); + + dispatch_policy = netisr_get_dispatch(npp); + if (dispatch_policy == NETISR_DISPATCH_DEFERRED) + return (netisr_queue_src(proto, source, m)); /* * If direct dispatch is forced, then unconditionally dispatch @@ -860,7 +1058,7 @@ * nws_flags because all netisr processing will be source ordered due * to always being forced to directly dispatch. */ - if (netisr_direct_force) { + if (dispatch_policy == NETISR_DISPATCH_DIRECT) { nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; npwp->nw_dispatched++; @@ -870,18 +1068,22 @@ goto out_unlock; } + KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID, + ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy)); + /* * Otherwise, we execute in a hybrid mode where we will try to direct * dispatch if we're on the right CPU and the netisr worker isn't * already running. 
*/ - m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid); + sched_pin(); + m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID, + source, m, &cpuid); if (m == NULL) { error = ENOBUFS; - goto out_unlock; + goto out_unpin; } KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); - sched_pin(); if (cpuid != curcpu) goto queue_fallback; nwsp = DPCPU_PTR(nws); @@ -942,9 +1144,7 @@ out_unpin: sched_unpin(); out_unlock: -#ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); -#endif return (error); } @@ -971,63 +1171,127 @@ } #endif +/* + * Given a status quo, adjust the number of threads to match the requested + * configuration. Any policy blending thread resource limits, CPU count, + * etc, must be imposed by the caller and is assumed already done here. + */ static void -netisr_start_swi(u_int cpuid, struct pcpu *pc) +netisr_adjust_threads(int nws_desired) { - char swiname[12]; struct netisr_workstream *nwsp; + u_int cpuid, nwsid, nws_oldcount; + char swiname[12]; int error; - KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); + NETISR_CONFIG_LOCK(); + nws_oldcount = nws_count; - nwsp = DPCPU_ID_PTR(cpuid, nws); - mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); - nwsp->nws_cpu = cpuid; - snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); - error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, - SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); - if (error) - panic("%s: swi_add %d", __func__, error); - pc->pc_netisr = nwsp->nws_intr_event; - if (netisr_bindthreads) { - error = intr_event_bind(nwsp->nws_intr_event, cpuid); - if (error != 0) - printf("%s: cpu %u: intr_event_bind: %d", __func__, - cpuid, error); + /* + * Add new threads required before hooking them up. 
+ */ + for (nwsid = nws_oldcount; nwsid < nws_desired; nwsid++) { + cpuid = nws_array[nwsid]; + nwsp = DPCPU_ID_PTR(cpuid, nws); + KASSERT(nwsp->nws_intr_event == NULL, + ("%s: nws_intr_event CPU %u non-NULL", __func__, cpuid)); + snprintf(swiname, sizeof(swiname), "netisr %u", nwsid); + error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, + nwsp, SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); + if (error) + panic("%s: swi_add %d", __func__, error); + if (netisr_bindthreads) { + error = intr_event_bind(nwsp->nws_intr_event, cpuid); + if (error) + printf("%s: cpu %u: intr_event_bind: %d", + __func__, cpuid, error); + } } + + /* + * Suspend netisr processing; mostly drains queues, prevents new work + * from being scheduled. + */ NETISR_WLOCK(); - nws_array[nws_count] = nwsp->nws_cpu; - nws_count++; + + /* + * XXXRW: Is there any drain activity we can perform here to address + * the race between swi_sched() and netisr_lock() in the swi? We + * can't recurse the rmlock by acquiring it read after write, so we + * can't just walk the workstreams and process pending packets. This + * could occur if the protocol calls back into netisr to schedule new + * work, for example. Possibly we need a more complex solution? Or + * should we just drain the queues and drop the packets... + */ + + /* + * Update worker count and resume processing. + */ + nws_count = nws_desired; NETISR_WUNLOCK(); + + /* + * Remove threads that are now no longer in use. + */ + for (nwsid = nws_desired; nwsid < nws_oldcount; nwsid++) { + cpuid = nws_array[nwsid]; + nwsp = DPCPU_ID_PTR(cpuid, nws); + error = swi_remove(nwsp->nws_swi_cookie); + if (error) + panic("%s: swi_remove %d", __func__, error); + nwsp->nws_intr_event = NULL; + } + + NETISR_CONFIG_UNLOCK(); } /* * Initialize the netisr subsystem. We rely on BSS and static initialization * of most fields in global data structures. 
* - * Start a worker thread for the boot CPU so that we can support network - * traffic immediately in case the network stack is used before additional - * CPUs are started (for example, diskless boot). + * Initialize workstream state for all CPUs, but start a worker only for the + * boot CPU. That way we can support network traffic immediately in case the + * stack is used before additional CPUs are started (for example, diskless + * boot). */ static void netisr_init(void *arg) { + char tmp[NETISR_DISPATCH_POLICY_MAXSTR]; + struct netisr_workstream *nwsp; + u_int cpuid, dispatch_policy, nwsid; KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); + /* + * Initialize global locks. + */ NETISR_LOCK_INIT(); + NETISR_CONFIG_LOCK_INIT(); + + /* + * Digest pre-boot policy and configuration. + */ if (netisr_maxthreads < 1) - netisr_maxthreads = 1; + netisr_maxthreads = mp_ncpus; + if (netisr_defaultthreads < 1) + netisr_defaultthreads = NETISR_DEFAULT_DEFAULTTHREADS; if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); netisr_maxthreads = mp_ncpus; } + if (netisr_defaultthreads > netisr_maxthreads) { + printf("netisr_init: forcing defaultthreads from %d to %d\n", + netisr_defaultthreads, netisr_maxthreads); + netisr_defaultthreads = netisr_maxthreads; + } if (netisr_defaultqlimit > netisr_maxqlimit) { printf("netisr_init: forcing defaultqlimit from %d to %d\n", netisr_defaultqlimit, netisr_maxqlimit); netisr_defaultqlimit = netisr_maxqlimit; } + #ifdef DEVICE_POLLING /* * The device polling code is not yet aware of how to deal with @@ -1042,30 +1306,47 @@ } #endif - netisr_start_swi(curcpu, pcpu_find(curcpu)); + if (TUNABLE_STR_FETCH("net.isr.dispatch", tmp, sizeof(tmp)) && + strlen(tmp) != 0) { + if (netisr_dispatch_policy_from_str(tmp, &dispatch_policy) + == 0 && dispatch_policy != NETISR_DISPATCH_DEFAULT) { + netisr_dispatch_policy = dispatch_policy; + netisr_dispatch_policy_compat(); + } else + printf( + "%s: invalid dispatch policy %s, using
default\n", + __func__, tmp); + } + + /* + * Initialize workstream data structures, populate nws_array, but + * don't start threads yet as the APs aren't started yet. + */ + for (cpuid = 0, nwsid = 0; cpuid < MAXCPU; cpuid++) { + if (CPU_ABSENT(cpuid)) + continue; + nwsp = DPCPU_ID_PTR(cpuid, nws); + mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); + nwsp->nws_cpu = cpuid; + nws_array[nwsid] = cpuid; + nwsid++; + } + + /* + * Start a boot CPU netisr to get us going. + */ + netisr_adjust_threads(1); } SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); /* - * Start worker threads for additional CPUs. No attempt to gracefully handle - * work reassignment, we don't yet support dynamic reconfiguration. + * Now that SMP is going, create any additional threads we may require. */ static void netisr_start(void *arg) { - struct pcpu *pc; - SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { - if (nws_count >= netisr_maxthreads) - break; - /* XXXRW: Is skipping absent CPUs still required here? */ - if (CPU_ABSENT(pc->pc_cpuid)) - continue; - /* Worker will already be present for boot CPU. */ - if (pc->pc_netisr != NULL) - continue; - netisr_start_swi(pc->pc_cpuid, pc); - } + netisr_adjust_threads(netisr_defaultthreads); } SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); @@ -1097,6 +1378,7 @@ snpp->snp_proto = proto; snpp->snp_qlimit = npp->np_qlimit; snpp->snp_policy = npp->np_policy; + snpp->snp_dispatch = npp->np_dispatch; if (npp->np_m2flow != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW; if (npp->np_m2cpuid != NULL) @@ -1233,6 +1515,29 @@ "S,sysctl_netisr_work", "Return list of per-workstream, per-protocol work in netisr"); +/* + * Run-time query and adjustment of thread count using a sysctl. 
+ */ +static int +sysctl_net_isr_numthreads(SYSCTL_HANDLER_ARGS) +{ + u_int numthreads; + int error; + + numthreads = nws_count; + error = sysctl_handle_int(oidp, &numthreads, 0, req); + if (error || req->newptr == NULL) + return (error); + if (numthreads < 1 || numthreads > netisr_maxthreads) + return (EINVAL); + netisr_adjust_threads(numthreads); + return (0); +} + +SYSCTL_PROC(_net_isr, OID_AUTO, numthreads, CTLTYPE_UINT|CTLFLAG_RW, 0, 0, + sysctl_net_isr_numthreads, "I", + "Number of threads used by the netisr framework"); + #ifdef DDB DB_SHOW_COMMAND(netisr, db_show_netisr) { diff -urN -x compile -x LINT vendor/freebsd/src/sys/net/netisr.h user/rwatson/tcp/src/sys/net/netisr.h --- vendor/freebsd/src/sys/net/netisr.h 2011-02-01 09:38:35.433566119 +0000 +++ user/rwatson/tcp/src/sys/net/netisr.h 2011-02-02 16:48:26.313571254 +0000 @@ -1,6 +1,6 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson - * Copyright (c) 2010 Juniper Networks, Inc. + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract @@ -71,6 +71,15 @@ #define NETISR_POLICY_CPU 3 /* Protocol determines CPU placement. */ /* + * Protocol dispatch policy constants; selects whether and when direct + * dispatch is permitted. + */ +#define NETISR_DISPATCH_DEFAULT 0 /* Use global default. */ +#define NETISR_DISPATCH_DEFERRED 1 /* Always defer dispatch. */ +#define NETISR_DISPATCH_HYBRID 2 /* Allow hybrid dispatch. */ +#define NETISR_DISPATCH_DIRECT 3 /* Always direct dispatch. */ + +/* * Monitoring data structures, exported by sysctl(2). * * Three sysctls are defined. First, a per-protocol structure exported by @@ -84,7 +93,8 @@ u_int snp_qlimit; /* nh_qlimit */ u_int snp_policy; /* nh_policy */ u_int snp_flags; /* Various flags. */ - u_int _snp_ispare[7]; + u_int snp_dispatch; /* Dispatch policy. 
*/ + u_int _snp_ispare[6]; }; /* @@ -173,6 +183,8 @@ typedef struct mbuf *netisr_m2flow_t(struct mbuf *m, uintptr_t source); typedef void netisr_drainedcpu_t(u_int cpuid); +#define NETISR_CPUID_NONE ((u_int)-1) /* No affinity returned. */ + /* * Data structure describing a protocol handler. */ @@ -185,7 +197,8 @@ u_int nh_proto; /* Integer protocol ID. */ u_int nh_qlimit; /* Maximum per-CPU queue depth. */ u_int nh_policy; /* Work placement policy. */ - u_int nh_ispare[5]; /* For future use. */ + u_int nh_dispatch; /* Dispatch policy. */ + u_int nh_ispare[4]; /* For future use. */ void *nh_pspare[4]; /* For future use. */ }; diff -urN -x compile -x LINT vendor/freebsd/src/sys/net/netisr_internal.h user/rwatson/tcp/src/sys/net/netisr_internal.h --- vendor/freebsd/src/sys/net/netisr_internal.h 2011-02-01 09:38:35.433566119 +0000 +++ user/rwatson/tcp/src/sys/net/netisr_internal.h 2011-02-01 08:30:18.237856888 +0000 @@ -1,6 +1,6 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson - * Copyright (c) 2010 Juniper Networks, Inc. + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract @@ -64,6 +64,7 @@ netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. */ u_int np_qlimit; /* Maximum per-CPU queue depth. */ u_int np_policy; /* Work placement policy. */ + u_int np_dispatch; /* Work dispatch policy. */ }; #define NETISR_MAXPROT 16 /* Compile-time limit. */ diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/in_pcb.c user/rwatson/tcp/src/sys/netinet/in_pcb.c --- vendor/freebsd/src/sys/netinet/in_pcb.c 2011-02-01 09:38:49.713848948 +0000 +++ user/rwatson/tcp/src/sys/netinet/in_pcb.c 2011-02-01 08:30:20.093483511 +0000 @@ -2,8 +2,12 @@ * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. 
* + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -37,6 +41,8 @@ #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet6.h" +#include "opt_pcbgroup.h" +#include "opt_rss.h" /* XXXRW: possibly a bug. */ #include #include @@ -48,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +72,7 @@ #include #include +#include #include #include #include @@ -72,6 +80,7 @@ #include #ifdef INET6 #include +#include #include #endif /* INET6 */ @@ -116,7 +125,11 @@ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } -static void in_pcbremlists(struct inpcb *inp); +static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, + struct in_addr faddr, u_int fport_arg, + struct in_addr laddr, u_int lport_arg, + int lookupflags, struct ifnet *ifp); +static void in_pcbremlists(struct inpcb *inp); static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) @@ -191,19 +204,23 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, - uint32_t inpcbzone_flags) + uint32_t inpcbzone_flags, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); + INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? 
*/ #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif + pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); + pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); + in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); @@ -217,10 +234,15 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { + KASSERT(pcbinfo->ipi_count == 0, + ("in_pcbinfo_destroy: ipi_count = %u", pcbinfo->ipi_count)); + hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); + in_pcbgroup_destroy(pcbinfo); uma_zdestroy(pcbinfo->ipi_zone); + INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } @@ -275,7 +297,7 @@ #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - inp->inp_refcount = 1; /* Reference from the inpcbinfo */ + refcount_init(&inp->inp_refcount, 1); /* Reference from pcbinfo. */ #if defined(IPSEC) || defined(MAC) out: if (error != 0) { @@ -291,8 +313,8 @@ { int anonport, error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); @@ -331,16 +353,15 @@ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; - int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; int dorandom; /* - * Because no actual state changes occur here, a global write lock on - * the pcbinfo isn't required. + * No state changes, so read locks are sufficient here. 
*/ - INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); @@ -348,7 +369,7 @@ if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) - wild = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); @@ -429,7 +450,7 @@ return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, - lport, wild, cred); + lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait @@ -523,7 +544,7 @@ *lastport = first; lport = htons(*lastport); } while (in_pcblookup_local(pcbinfo, laddr, - lport, wild, cred)); + lport, lookupflags, cred)); } *laddrp = laddr.s_addr; *lportp = lport; @@ -537,14 +558,15 @@ * then pick one. */ int -in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) +in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, + struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; @@ -570,13 +592,20 @@ inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; - in_pcbrehash(inp); + in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } +int +in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) +{ + + return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); +} + /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. @@ -832,8 +861,8 @@ * Because a global state change doesn't actually occur here, a read * lock is sufficient. 
*/ - INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; @@ -905,8 +934,8 @@ } } - oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport, - 0, NULL); + oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, + laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; @@ -929,8 +958,8 @@ in_pcbdisconnect(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; @@ -949,58 +978,24 @@ KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); + /* XXXRW: lock assertions? */ + inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* - * in_pcbfree_internal() frees an inpcb that has been detached from its - * socket, and whose reference count has reached 0. It will also remove the - * inpcb from any global lists it might remain on. 
- */ -static void -in_pcbfree_internal(struct inpcb *inp) -{ - struct inpcbinfo *ipi = inp->inp_pcbinfo; - - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__)); - - INP_INFO_WLOCK_ASSERT(ipi); - INP_WLOCK_ASSERT(inp); - -#ifdef IPSEC - if (inp->inp_sp != NULL) - ipsec_delete_pcbpolicy(inp); -#endif /* IPSEC */ - inp->inp_gencnt = ++ipi->ipi_gencnt; - in_pcbremlists(inp); -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6PROTO) { - ip6_freepcbopts(inp->in6p_outputopts); - if (inp->in6p_moptions != NULL) - ip6_freemoptions(inp->in6p_moptions); - } -#endif - if (inp->inp_options) - (void)m_free(inp->inp_options); - if (inp->inp_moptions != NULL) - inp_freemoptions(inp->inp_moptions); - inp->inp_vflag = 0; - crfree(inp->inp_cred); - -#ifdef MAC - mac_inpcb_destroy(inp); -#endif - INP_WUNLOCK(inp); - uma_zfree(ipi->ipi_zone, inp); -} - -/* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock is already held. + * but where the inpcb lock may already be held, or when acquiring a reference + * via a pcbgroup. + * + * in_pcbref() should be used only to provide brief memory stability, and + * must always be followed by a call to INP_WLOCK() and in_pcbrele() to + * garbage collect the inpcb if it has been freed from another context. + * Until in_pcbrele() has returned that the inpcb is still valid, lock and + * rele are the *only* safe operations that may be performed on the inpcb.
* * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to @@ -1011,11 +1006,9 @@ in_pcbref(struct inpcb *inp) { - INP_WLOCK_ASSERT(inp); - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); - inp->inp_refcount++; + refcount_acquire(&inp->inp_refcount); } /* @@ -1023,47 +1016,105 @@ * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. + * + * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a + * reference on an inpcb. Historically more work was done here (actually, in + * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the + * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely + * about memory stability (and continued use of the write lock). */ int -in_pcbrele(struct inpcb *inp) +in_pcbrele_rlocked(struct inpcb *inp) { -#ifdef INVARIANTS - struct inpcbinfo *ipi = inp->inp_pcbinfo; -#endif + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + + KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + + INP_RLOCK_ASSERT(inp); + + if (refcount_release(&inp->inp_refcount) == 0) + return (0); + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + + INP_RUNLOCK(inp); + uma_zfree(pcbinfo->ipi_zone, inp); + return (1); +} + +int +in_pcbrele_wlocked(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); - INP_INFO_WLOCK_ASSERT(ipi); INP_WLOCK_ASSERT(inp); - inp->inp_refcount--; - if (inp->inp_refcount > 0) + if (refcount_release(&inp->inp_refcount) == 0) return (0); - in_pcbfree_internal(inp); + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + + INP_WUNLOCK(inp); + uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* + * XXXRW: Temporary.
+ */ +int +in_pcbrele(struct inpcb *inp) +{ + + return (in_pcbrele_wlocked(inp)); +} + +/* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is - * released using in_pcbrele(), but the inpcb is still unlocked. + * released using in_pcbrele(), but the inpcb is still unlocked. Almost all + * work, including removal from global lists, is done in this context, where + * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { -#ifdef INVARIANTS - struct inpcbinfo *ipi = inp->inp_pcbinfo; -#endif + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - INP_INFO_WLOCK_ASSERT(ipi); + INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); - if (!in_pcbrele(inp)) + /* XXXRW: Do as much as possible here. */ +#ifdef IPSEC + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif /* IPSEC */ + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + in_pcbremlists(inp); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + if (inp->in6p_moptions != NULL) + ip6_freemoptions(inp->in6p_moptions); + } +#endif + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (inp->inp_moptions != NULL) + inp_freemoptions(inp->inp_moptions); + inp->inp_vflag = 0; + crfree(inp->inp_cred); +#ifdef MAC + mac_inpcb_destroy(inp); +#endif + if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } @@ -1091,20 +1142,28 @@ in_pcbdrop(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + /* + * XXXRW: Possibly we should protect the setting of INP_DROPPED with + * the hash lock...? + * + * XXXRW: inline of in_pcbremlists? 
+ */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; + INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } + INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; + in_pcbgroup_remove(inp); } } @@ -1231,12 +1290,13 @@ } /* - * Lookup a PCB based on the local address and port. + * Lookup a PCB based on the local address and port. Caller must hold the + * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, - u_short lport, int wild_okay, struct ucred *cred) + u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 @@ -1246,9 +1306,12 @@ #endif int wildcard; - INP_INFO_LOCK_ASSERT(pcbinfo); + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); - if (!wild_okay) { + INP_HASH_LOCK_ASSERT(pcbinfo); + + if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that @@ -1349,19 +1412,166 @@ } #undef INP_LOOKUP_MAPPED_PCB_COST +#ifdef PCBGROUP /* - * Lookup PCB in hash list. + * Lookup PCB in hash list, using pcbgroup tables. */ -struct inpcb * -in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, - u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, +static struct inpcb * +in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, + struct in_addr faddr, u_int fport_arg, struct in_addr laddr, + u_int lport_arg, int lookupflags, struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + + /* + * First look for an exact match. 
+ */ + tmpinp = NULL; + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP4)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + + /* + * Then look for a wildcard match, if requested. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. 
+ */ + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_FOREACH(inp, head, ipw_entry) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif /* INET6 */ + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif /* defined(INET6) */ + if (inp != NULL) + goto found; + } /* if (lookupflags & INPLOOKUP_WILDCARD) */ + INP_GROUP_UNLOCK(pcbgroup); + return (NULL); + +found: + in_pcbref(inp); + INP_GROUP_UNLOCK(pcbgroup); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + return (inp); +} +#endif /* PCBGROUP */ + +/* + * Lookup PCB in hash list, using pcbinfo tables. This variation assumes + * that the caller has locked the hash list, and will not perform any further + * locking or reference operations on either the hash list or the connection. 
+ */ +static struct inpcb * +in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; - INP_INFO_LOCK_ASSERT(pcbinfo); + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + + INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. @@ -1396,7 +1606,7 @@ /* * Then look for a wildcard match, if requested. */ - if (wildcard == INPLOOKUP_WILDCARD) { + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; @@ -1457,26 +1667,137 @@ local_wild = inp; } } /* LIST_FOREACH */ - if (jail_wild != NULL) - return (jail_wild); - if (local_exact != NULL) - return (local_exact); - if (local_wild != NULL) - return (local_wild); + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; #ifdef INET6 - if (local_wild_mapped != NULL) - return (local_wild_mapped); + if (inp == NULL) + inp = local_wild_mapped; #endif /* defined(INET6) */ - } /* if (wildcard == INPLOOKUP_WILDCARD) */ - + if (inp != NULL) + return (inp); + } /* if (lookupflags & INPLOOKUP_WILDCARD) */ return (NULL); } /* + * Lookup PCB in hash list, using pcbinfo tables. This variation locks the + * hash list lock, and will return the inpcb locked (i.e., requires + * INPLOOKUP_LOCKPCB). 
+ */ +static struct inpcb * +in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport, struct in_addr laddr, u_int lport, int lookupflags, + struct ifnet *ifp) +{ + struct inpcb *inp; + + INP_HASH_RLOCK(pcbinfo); + inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + if (inp != NULL) { + if (lookupflags & INPLOOKUP_WLOCKPCB) { + in_pcbref(inp); + INP_HASH_RUNLOCK(pcbinfo); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + in_pcbref(inp); + INP_HASH_RUNLOCK(pcbinfo); + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + } else + INP_HASH_RUNLOCK(pcbinfo); + return (inp); +} + +/* + * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf + * from which a pre-calculated hash value may be extracted. + * + * Possibly more of this logic should be in in_pcbgroup.c. + */ +struct inpcb * +in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, + struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) +{ +#if defined(PCBGROUP) && !defined(RSS) + struct inpcbgroup *pcbgroup; +#endif + + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, + ("%s: LOCKPCB not set", __func__)); + + /* + * When not using RSS, use connection groups in preference to the + * reservation table when looking up 4-tuples. When using RSS, just + * use the reservation table, due to the cost of the Toeplitz hash + * in software. + * + * XXXRW: This policy belongs in the pcbgroup code, as in principle + * we could be doing RSS with a non-Toeplitz hash that is affordable + * in software. 
+ */ +#if defined(PCBGROUP) && !defined(RSS) + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif + return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); +} + +struct inpcb * +in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport, struct in_addr laddr, u_int lport, int lookupflags, + struct ifnet *ifp, struct mbuf *m) +{ +#ifdef PCBGROUP + struct inpcbgroup *pcbgroup; + + /* + * If we can use a hardware-generated hash to look up the connection + * group, use that connection group to find the inpcb. Otherwise + * fall back on a software hash -- or the reservation table if we're + * using RSS. + * + * XXXRW: As above, that policy belongs in the pcbgroup code. + */ + if (in_pcbgroup_enabled(pcbinfo) && + !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { + pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid); + if (pcbgroup != NULL) + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, + fport, laddr, lport, lookupflags, ifp)); +#ifndef RSS + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); +#endif + } +#endif + return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); +} + +/* * Insert PCB onto various hash lists. 
*/ -int -in_pcbinshash(struct inpcb *inp) +static int +in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; @@ -1484,7 +1805,7 @@ struct inpcbport *phd; u_int32_t hashkey_faddr; - INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_HASH_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); @@ -1525,23 +1846,50 @@ LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; + if (do_pcbgroup_update) + in_pcbgroup_update(inp); return (0); } /* + * For now, two public interfaces to insert an inpcb into the hash lists -- + * one that does update pcbgroups, and one that doesn't. The latter is used + * only in the TCP syncache, where in_pcbinshash is called before the full + * 4-tuple is set for the inpcb, and we don't want to install in the pcbgroup + * until later. + * + * XXXRW: This seems like a misfeature. in_pcbinshash should always update + * connection groups, and partially initialised inpcbs should not be exposed + * to either reservation hash tables or pcbgroups. + */ +int +in_pcbinshash(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 1)); +} + +int +in_pcbinshash_nopcbgroup(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 0)); +} + +/* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. 
*/ void -in_pcbrehash(struct inpcb *inp) +in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; - INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_HASH_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); @@ -1558,6 +1906,18 @@ LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); + + if (m != NULL) + in_pcbgroup_update_mbuf(inp, m); + else + in_pcbgroup_update(inp); +} + +void +in_pcbrehash(struct inpcb *inp) +{ + + in_pcbrehash_mbuf(inp, NULL); } /* @@ -1575,16 +1935,19 @@ if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; + INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } + INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; + in_pcbgroup_remove(inp); } /* diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/in_pcb.h user/rwatson/tcp/src/sys/netinet/in_pcb.h --- vendor/freebsd/src/sys/netinet/in_pcb.h 2011-02-01 09:38:49.713848948 +0000 +++ user/rwatson/tcp/src/sys/netinet/in_pcb.h 2011-02-02 15:57:49.833777034 +0000 @@ -1,8 +1,12 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -34,6 +38,7 @@ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ +#include #include #include #include @@ -137,6 +142,7 @@ * * Key: * (c) - Constant after initialization + * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (s) - Protected by another subsystem's locks @@ -156,9 +162,12 @@ */ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ + struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ + LIST_ENTRY(inpcb) ipw_entry; /* (g/i/p) group wildcard list entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -260,53 +269,93 @@ u_short phd_port; }; -/* +/*- * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. + * + * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock, + * the former covering mutable global fields (such as the global pcb list), + * and the latter covering the hashed lookup tables. The lock order is: + * + * ipi_lock (before) inpcb locks (before) ipi_hash_lock + * + * Locking key: + * + * (c) Constant or nearly constant after initialisation. + * (g) Locked by ipi_lock. + * (h) Read using either ipi_hash_lock or inpcb lock; write requires both. + * (p) Protected by one or more pcbgroup locks. */ struct inpcbinfo { /* - * Global list of inpcbs on the protocol. + * Global lock protecting global inpcb list, inpcb count, etc. 
*/ - struct inpcbhead *ipi_listhead; - u_int ipi_count; + struct rwlock ipi_lock; /* - * Global hash of inpcbs, hashed by local and foreign addresses and - * port numbers. + * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_hashbase; - u_long ipi_hashmask; + struct inpcbhead *ipi_listhead; /* (g) */ + u_int ipi_count; /* (g) */ /* - * Global hash of inpcbs, hashed by only local port number. + * Generation count--incremented each time a connection is allocated + * or freed. */ - struct inpcbporthead *ipi_porthashbase; - u_long ipi_porthashmask; + u_quad_t ipi_gencnt; /* (g) */ /* - * Fields associated with port lookup and allocation. + * Fields associated with port lookup and allocation. Essentially + * read-only. */ - u_short ipi_lastport; - u_short ipi_lastlow; - u_short ipi_lasthi; + u_short ipi_lastport; /* (c) */ + u_short ipi_lastlow; /* (c) */ + u_short ipi_lasthi; /* (c) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ - struct uma_zone *ipi_zone; + struct uma_zone *ipi_zone; /* (c) */ /* - * Generation count--incremented each time a connection is allocated - * or freed. + * Connection groups associated with this protocol. These fields are + * constant, but pcbgroup structures themselves are protected by + * per-pcbgroup locks. */ - u_quad_t ipi_gencnt; - struct rwlock ipi_lock; + struct inpcbgroup *ipi_pcbgroups; /* (c) */ + u_int ipi_npcbgroups; /* (c) */ + u_int ipi_hashfields; /* (c) */ + + /* + * Global lock protecting non-pcbgroup hash lookup tables. + */ + struct rwlock ipi_hash_lock; + + /* + * Global hash of inpcbs, hashed by local and foreign addresses and + * port numbers. + */ + struct inpcbhead *ipi_hashbase; /* (h) */ + u_long ipi_hashmask; /* (h) */ + + /* + * Global hash of inpcbs, hashed by only local port number. + */ + struct inpcbporthead *ipi_porthashbase; /* (h) */ + u_long ipi_porthashmask; /* (h) */ + + /* + * List of wildcard inpcbs for use with pcbgroups. 
In the past, was + * per-pcbgroup but is now global. All pcbgroup locks must be held + * to modify the list, so any is sufficient to read it. + */ + struct inpcbhead *ipi_wildbase; /* (p) */ + u_long ipi_wildmask; /* (p) */ /* * Pointer to network stack instance */ - struct vnet *ipi_vnet; + struct vnet *ipi_vnet; /* (c) */ /* * general use 2 @@ -314,6 +363,31 @@ void *ipi_pspare[2]; }; +/* + * Connection groups hold sets of connections that have similar CPU/thread + * affinity. Each connection belongs to exactly one connection group. + */ +struct inpcbgroup { + /* + * Per-connection group hash of inpcbs, hashed by local and foreign + * addresses and port numbers. + */ + struct inpcbhead *ipg_hashbase; /* (c) */ + u_long ipg_hashmask; /* (c) */ + + /* + * Notional affinity of this pcbgroup. + */ + u_int ipg_cpu; /* (p) */ + + /* + * Per-connection group lock, not to be confused with ipi_lock. + * Protects the hash table hung off the group, but also the global + * wildcard list in inpcbinfo. 
+ */ + struct mtx ipg_lock; +} __aligned(CACHE_LINE_SIZE); + #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -385,6 +459,26 @@ #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) +#define INP_HASH_LOCK_INIT(ipi, d) \ + rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) +#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) +#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ + RA_LOCKED) +#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ + RA_WLOCKED) + +#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ + MTX_DEF | MTX_DUPOK) +#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) + +#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) +#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) +#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) + #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ @@ -444,8 +538,18 @@ */ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ +#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ + +/* + * Flags passed to in_pcblookup*() functions. + */ +#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ +#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ +#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. 
*/ + +#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ + INPLOOKUP_WLOCKPCB) -#define INPLOOKUP_WILDCARD 1 #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ @@ -453,6 +557,13 @@ #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) +/* + * Constants for pcbinfo.ipi_hashfields. + */ +#define IPI_HASHFIELDS_NONE 0 +#define IPI_HASHFIELDS_2TUPLE 1 +#define IPI_HASHFIELDS_4TUPLE 2 + #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); @@ -486,7 +597,21 @@ void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, - int, int, char *, uma_init, uma_fini, uint32_t); + int, int, char *, uma_init, uma_fini, uint32_t, u_int); + +struct inpcbgroup * + in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); +struct inpcbgroup * + in_pcbgroup_byinpcb(struct inpcb *inp); +struct inpcbgroup * + in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, + struct in_addr, u_short); +void in_pcbgroup_destroy(struct inpcbinfo *); +int in_pcbgroup_enabled(struct inpcbinfo *); +void in_pcbgroup_init(struct inpcbinfo *, u_int, int); +void in_pcbgroup_remove(struct inpcb *inp); +void in_pcbgroup_update(struct inpcb *inp); +void in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); @@ -494,6 +619,8 @@ int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); +int in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *, + struct mbuf *m); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); @@ -502,17 +629,24 @@ void in_pcbdrop(struct inpcb *); void in_pcbfree(struct 
inpcb *); int in_pcbinshash(struct inpcb *); +int in_pcbinshash_nopcbgroup(struct inpcb *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * - in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, + in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); +struct inpcb * + in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, + struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); +void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); +int in_pcbrele_rlocked(struct inpcb *); +int in_pcbrele_wlocked(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/in_pcbgroup.c user/rwatson/tcp/src/sys/netinet/in_pcbgroup.c --- vendor/freebsd/src/sys/netinet/in_pcbgroup.c 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/netinet/in_pcbgroup.c 2011-02-04 20:43:28.163813385 +0000 @@ -0,0 +1,585 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" +#include "opt_pcbgroup.h" +#include "opt_rss.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#include +#endif /* INET6 */ + +/* + * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's + * 2006 USENIX paper, An Evaluation of Network Stack Parallelization + * Strategies in Modern Operating Systems. This implementation differs + * significantly from that described in the paper, in that it attempts to + * introduce not just notions of affinity for connections and distribute work + * so as to reduce lock contention, but also align those notions with + * hardware work distribution strategies such as RSS. 
In this construction, + * connection groups supplement, rather than replace, existing reservation + * tables for protocol 4-tuples, offering CPU-affine lookup tables with + * minimal cache line migration and lock contention during steady state + * operation. + * + * Hardware-offloaded checksums are often inefficient in software -- for + * example, Toeplitz, specified by RSS, introduced a significant overhead if + * performed during per-packet processing. It is therefore desirable to fall + * back on traditional reservation table lookups without affinity where + * hardware-offloaded checksums aren't available, such as for traffic over + * non-RSS interfaces. + * + * Internet protocols, such as UDP and TCP, register to use connection groups + * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this + * indicates to the connection group code whether a 2-tuple or 4-tuple is + * used as an argument to hashes that assign a connection to a particular + * group. This must be aligned with any hardware offloaded distribution + * model, such as RSS or similar approaches taken in embedded network boards. + * Wildcard sockets require special handling, as in Willman 2006, and are + * shared between connection groups -- while being protected by group-local + * locks. This means that connection establishment and teardown can be + * significantly more expensive than without connection groups, but that + * steady-state processing can be significantly faster. + * + * When RSS is used, certain connection group parameters, such as the number + * of groups, are provided by the RSS implementation, found in in_rss.c. + * Otherwise, in_pcbgroup.c selects possible sensible parameters + * corresponding to the degree of parallelism exposed by netisr. + * + * Most of the implementation of connection groups is in this file; however, + * connection group lookup is implemented in in_pcb.c alongside reservation + * table lookups -- see in_pcblookup_group(). 
+ * + * TODO: + * + * Implement dynamic rebalancing of buckets with connection groups; when + * load is unevenly distributed, search for more optimal balancing on + * demand. This might require scaling up the number of connection groups + * by <<1. + * + * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection + * groups for ip_input, allowing non-offloaded work distribution. + * + * Expose effective CPU affinity of connections to userspace using socket + * options. + * + * Investigate per-connection affinity overrides based on socket options; an + * option could be set, certainly resulting in work being distributed + * differently in software, and possibly propagated to supporting hardware + * with TCAMs or hardware hash tables. This might require connections to + * exist in more than one connection group at a time. + * + * Hook netisr thread reconfiguration events, and propagate those to RSS so + * that rebalancing can occur when the thread pool grows or shrinks. + * + * Expose per-pcbgroup statistics to userspace monitoring tools such as + * netstat, in order to allow better debugging and profiling. + */ + +#ifdef PCBGROUP + +void +in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, + int hash_nelements) +{ + struct inpcbgroup *pcbgroup; + u_int numpcbgroups, pgn; + + /* + * Only enable connection groups for a protocol if it has been + * specifically requested. + */ + if (hashfields == IPI_HASHFIELDS_NONE) + return; + + /* + * Connection groups are about multi-processor load distribution, + * lock contention, and connection CPU affinity. As such, no point + * in turning them on for a uniprocessor machine, it only wastes + * memory. + */ + if (mp_ncpus == 1) + return; + +#ifdef RSS + /* + * If we're using RSS, then RSS determines the number of connection + * groups to use: one connection group per RSS bucket. If for some + * reason RSS isn't able to provide a number of buckets, disable + * connection groups entirely. 
+ * + * XXXRW: Can this ever happen? + */ + numpcbgroups = rss_getnumbuckets(); + if (numpcbgroups == 0) + return; +#else + /* + * Otherwise, we'll just use one per CPU for now. If we decide to + * do dynamic rebalancing a la RSS, we'll need similar logic here. + */ + numpcbgroups = mp_ncpus; +#endif + + pcbinfo->ipi_hashfields = hashfields; + pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * + sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); + pcbinfo->ipi_npcbgroups = numpcbgroups; + pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_wildmask); + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, + &pcbgroup->ipg_hashmask); + INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); + + /* + * Initialise notional affinity of the pcbgroup -- for RSS, + * we want the same notion of affinity as NICs will use. + * In the non-RSS case, just round robin for the time being. + * + * XXXRW: The notion of a bucket to CPU mapping is common at + * both pcbgroup and RSS layers -- does that mean that we + * should migrate it all from RSS to here, and just leave RSS + * responsible only for providing hashing and mapping functions? 
+ */ +#ifdef RSS + pcbgroup->ipg_cpu = rss_getcpu(pgn); +#else + pcbgroup->ipg_cpu = (pgn % mp_ncpus); +#endif + } +} + +void +in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) +{ + struct inpcbgroup *pcbgroup; + u_int pgn; + + if (pcbinfo->ipi_npcbgroups == 0) + return; + + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead), + ("%s: listhead not empty", __func__)); + INP_GROUP_LOCK_DESTROY(pcbgroup); + hashdestroy(pcbgroup->ipg_hashbase, M_PCB, + pcbgroup->ipg_hashmask); + } + hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); + free(pcbinfo->ipi_pcbgroups, M_PCB); + pcbinfo->ipi_pcbgroups = NULL; + pcbinfo->ipi_npcbgroups = 0; + pcbinfo->ipi_hashfields = IPI_HASHFIELDS_NONE; +} + +/* + * Given a hash of whatever the covered tuple might be, return a pcbgroup + * index. Where RSS is supported, try to align bucket selection with RSS CPU + * affinity strategy. + */ +static __inline u_int +in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) +{ + +#ifdef RSS + return (rss_getbucket(hash)); +#else + return (hash % pcbinfo->ipi_npcbgroups); +#endif +} + +/* + * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash + * information is insufficient to identify the pcbgroup. This might occur if + * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but + * RSS is not compiled into the kernel. 
+ */ +struct inpcbgroup * +in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) +{ + +#ifdef RSS + if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && + hashtype == M_HASHTYPE_RSS_4TUPLE) || + (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && + hashtype == M_HASHTYPE_RSS_2TUPLE)) + return (&pcbinfo->ipi_pcbgroups[ + in_pcbgroup_getbucket(pcbinfo, hash)]); +#endif + return (NULL); +} + +static struct inpcbgroup * +in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) +{ + + return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid)); +} + +struct inpcbgroup * +in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, + u_short lport, struct in_addr faddr, u_short fport) +{ + uint32_t hash; + + /* + * RSS note: we pass foreign addr/port as source, and local addr/port + * as destination, as we want to align with what the hardware is + * doing. + */ + switch (pcbinfo->ipi_hashfields) { + case IPI_HASHFIELDS_4TUPLE: +#ifdef RSS + hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport); +#else + hash = faddr.s_addr ^ fport; +#endif + break; + + case IPI_HASHFIELDS_2TUPLE: +#ifdef RSS + hash = rss_hash_ip4_2tuple(faddr, laddr); +#else + hash = faddr.s_addr ^ laddr.s_addr; +#endif + break; + default: + hash = 0; + } + return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, + hash)]); +} + +struct inpcbgroup * +in_pcbgroup_byinpcb(struct inpcb *inp) +{ + + return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, + inp->inp_lport, inp->inp_faddr, inp->inp_fport)); +} + +static void +in_pcbwild_add(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbhead *head; + u_int pgn; + + INP_WLOCK_ASSERT(inp); + KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), + ("in_pcbwild_add: is wild")); + + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, + 0, 
pcbinfo->ipi_wildmask)]; + LIST_INSERT_HEAD(head, inp, ipw_entry); + inp->inp_flags2 |= INP_PCBGROUPWILD; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); +} + +static void +in_pcbwild_remove(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + u_int pgn; + + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD), + ("in_pcbwild_remove: not wild")); + + /* + * Modifying the wildcard list requires all group locks. + */ + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); + LIST_REMOVE(inp, ipw_entry); + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); + inp->inp_flags2 &= ~INP_PCBGROUPWILD; +} + +static __inline int +in_pcbwild_needed(struct inpcb *inp) +{ + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)); + else +#endif + return (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); +} + +static void +in_pcbwild_update_internal(struct inpcb *inp) +{ + int wildcard_needed; + + wildcard_needed = in_pcbwild_needed(inp); + if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD)) + in_pcbwild_add(inp); + else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD)) + in_pcbwild_remove(inp); +} + +/* + * Update the pcbgroup of an inpcb, which might include removing an old + * pcbgroup reference and/or adding a new one. Wildcard processing is not + * performed here, although ideally we'll never install a pcbgroup for a + * wildcard inpcb (asserted below). 
+ */ +static void +in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, + struct inpcbgroup *newpcbgroup, struct inpcb *inp) +{ + struct inpcbgroup *oldpcbgroup; + struct inpcbhead *pcbhash; + uint32_t hashkey_faddr; + + INP_WLOCK_ASSERT(inp); + + oldpcbgroup = inp->inp_pcbgroup; + if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) { + INP_GROUP_LOCK(oldpcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(oldpcbgroup); + } + if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */ + else +#endif + hashkey_faddr = inp->inp_faddr.s_addr; + INP_GROUP_LOCK(newpcbgroup); + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, + newpcbgroup->ipg_hashmask)]; + LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); + inp->inp_pcbgroup = newpcbgroup; + INP_GROUP_UNLOCK(newpcbgroup); + } + + KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)), + ("in_pcbgroup_update_internal: pcbgroup and wildcard!")); +} + +/* + * Two update paths: one in which the 4-tuple on an inpcb has been updated + * and therefore connection groups may need to change (or a wildcard entry + * may needed to be installed), and another in which the 4-tuple has been + * set as a result of a packet received, in which case we may be able to use + * the hash on the mbuf to avoid doing a software hash calculation for RSS. + * + * In each case: first, let the wildcard code have a go at placing it as a + * wildcard socket. If it was a wildcard, or if the connection has been + * dropped, then no pcbgroup is required (so potentially clear it); + * otherwise, calculate and update the pcbgroup for the inpcb. 
+ */ +void +in_pcbgroup_update(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbgroup *newpcbgroup; + + INP_WLOCK_ASSERT(inp); + + if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) + return; + + in_pcbwild_update_internal(inp); + if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && + !(inp->inp_flags & INP_DROPPED)) { + if (inp->inp_vflag & INP_IPV6) + newpcbgroup = in6_pcbgroup_byinpcb(inp); + else + newpcbgroup = in_pcbgroup_byinpcb(inp); + } else + newpcbgroup = NULL; + in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); +} + +void +in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbgroup *newpcbgroup; + + INP_WLOCK_ASSERT(inp); + + if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) + return; + + /* + * Possibly should assert !INP_PCBGROUPWILD rather than testing for + * it; presumably this function should never be called for anything + * other than non-wildcard socket? + */ + in_pcbwild_update_internal(inp); + if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && + !(inp->inp_flags & INP_DROPPED)) { + if (inp->inp_vflag & INP_IPV6) { + newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); + if (newpcbgroup == NULL) + newpcbgroup = in6_pcbgroup_byinpcb(inp); + } else { + newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); + if (newpcbgroup == NULL) + newpcbgroup = in_pcbgroup_byinpcb(inp); + } + } else + newpcbgroup = NULL; + in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); +} + +/* + * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. 
+ */ +void +in_pcbgroup_remove(struct inpcb *inp) +{ + struct inpcbgroup *pcbgroup; + + INP_WLOCK_ASSERT(inp); + + if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) + return; + + if (inp->inp_flags2 & INP_PCBGROUPWILD) + in_pcbwild_remove(inp); + + pcbgroup = inp->inp_pcbgroup; + if (pcbgroup != NULL) { + INP_GROUP_LOCK(pcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(pcbgroup); + } +} + +/* + * Query whether or not it is appropriate to use pcbgroups to look up inpcbs + * for a protocol. + */ +int +in_pcbgroup_enabled(struct inpcbinfo *pcbinfo) +{ + + return (pcbinfo->ipi_npcbgroups > 0); +} + +#else /* !PCBGROUP */ + +/* + * No-op versions of pcbgroup functions for when options PCBGROUP isn't + * compiled into the kernel. + */ +void +in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, + int hash_nelements) +{ + +} + +void +in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) +{ + +} + +void +in_pcbgroup_update(struct inpcb *inp) +{ + +} + +void +in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) +{ + +} + +void +in_pcbgroup_remove(struct inpcb *inp) +{ + +} + +int +in_pcbgroup_enabled(struct inpcbinfo *pcbinfo) +{ + + return (0); +} + +#endif /* !PCBGROUP */ diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/in_rss.c user/rwatson/tcp/src/sys/netinet/in_rss.c --- vendor/freebsd/src/sys/netinet/in_rss.c 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/netinet/in_rss.c 2011-02-01 08:30:20.600604108 +0000 @@ -0,0 +1,525 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" +#include "opt_pcbgroup.h" + +#ifndef PCBGROUP +#error "options RSS depends on options PCBGROUP" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#include +#endif /* INET6 */ + +/*- + * Operating system parts of receiver-side steering (RSS), which allows + * network cards to direct flows to particular receive queues based on hashes + * if header tuples. 
This implementation aligns RSS buckets with connection + * groups at the TCP/IP layer, so each bucket is associated with exactly one + * group. As a result, the group lookup structures (and lock) should have an + * effective affinity with exactly one CPU. + * + * Network device drivers needing to configure RSS will query this framework + * for parameters, such as the current RSS key, hashing policies, number of + * bits, and indirection table mapping hashes to buckets and CPUs. They may + * provide their own supplementary information, such as queue<->CPU bindings. + * It is the responsibility of the network device driver to inject packets + * into the stack on as close to the right CPU as possible, if playing by RSS + * rules. + * + * TODO: + * + * - Synchronization for rss_key and other future-configurable parameters. + * - Event handler drivers can register to pick up RSS configuration changes. + * - Should we allow rss_basecpu to be configured? + * - Randomize key on boot. + * - IPv6 support. + * - Statistics on how often there's a misalignment between hardware + * placement and pcbgroup expectations. + */ + +SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering"); + +/* + * Toeplitz is the only required hash function in the RSS spec, so use it by + * default. + */ +static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; +SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RD, &rss_hashalgo, 0, + "RSS hash algorithm"); +TUNABLE_INT("net.inet.rss.hashalgo", &rss_hashalgo); + +/* + * Size of the indirection table; at most 128 entries per the RSS spec. We + * size it to at least 2 times the number of CPUs by default to allow useful + * rebalancing. If not set explicitly with a loader tunable, we tune based + * on the number of CPUs present. + * + * XXXRW: buckets might be better to use for the tunable than bits. 
+ */ +static u_int rss_bits; +SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RD, &rss_bits, 0, + "RSS bits"); +TUNABLE_INT("net.inet.rss.bits", &rss_bits); + +static u_int rss_mask; +SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, + "RSS mask"); + +static const u_int rss_maxbits = RSS_MAXBITS; +SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, + __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); + +/* + * RSS's own count of the number of CPUs it could be using for processing. + * Bounded to 64 by RSS constants. + */ +static u_int rss_ncpus; +SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, + "Number of CPUs available to RSS"); + +#define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) +static const u_int rss_maxcpus = RSS_MAXCPUS; +SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, + __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); + +/* + * Variable exists just for reporting rss_bits in a user-friendly way. + */ +static u_int rss_buckets; +SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, + "RSS buckets"); + +/* + * Base CPU number; devices will add this to all CPU numbers returned by the + * RSS indirection table. Currently unmodifable in FreeBSD. + */ +static const u_int rss_basecpu; +SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, + __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); + +/* + * RSS secret key, intended to prevent attacks on load-balancing. Its + * effectiveness may be limited by algorithm choice and available entropy + * during the boot. + * + * XXXRW: And that we don't randomize it yet! + * + * XXXRW: This default is actually the default key from Chelsio T3 cards, as + * it offers reasonable distribution, unlike all-0 keys which always + * generate a hash of 0 (upsettingly). 
+ */
+static uint8_t rss_key[RSS_KEYSIZE] = {
+	0x43, 0xa3, 0x8f, 0xb0, 0x41, 0x67, 0x25, 0x3d,
+	0x25, 0x5b, 0x0e, 0xc2, 0x6d, 0x5a, 0x56, 0xda,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+/*
+ * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
+ * Drivers may supplement this table with a separate CPU<->queue table when
+ * programming devices.
+ */
+struct rss_table_entry {
+	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
+};
+static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
+
+static void
+rss_init(__unused void *arg)
+{
+	u_int i;
+
+	/*
+	 * Validate tunables, coerce to sensible values.
+	 */
+	switch (rss_hashalgo) {
+	case RSS_HASH_TOEPLITZ:
+	case RSS_HASH_NAIVE:
+		break;
+
+	default:
+		printf("rss_init: invalid RSS hashalgo %u, coercing to %u\n",
+		    rss_hashalgo, RSS_HASH_TOEPLITZ);
+		rss_hashalgo = RSS_HASH_TOEPLITZ;
+	}
+
+	/*
+	 * Count available CPUs.
+	 *
+	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
+	 * elsewhere.
+	 */
+	rss_ncpus = 0;
+	for (i = 0; i <= mp_maxid; i++) {
+		if (CPU_ABSENT(i))
+			continue;
+		rss_ncpus++;
+	}
+	if (rss_ncpus > RSS_MAXCPUS)
+		rss_ncpus = RSS_MAXCPUS;
+
+	/*
+	 * Tune RSS table entries to be no less than 2x the number of CPUs
+	 * -- unless we're running uniprocessor, in which case there's not
+	 * much point in having buckets to rearrange for load-balancing!
+	 */
+	if (rss_ncpus > 1) {
+		if (rss_bits == 0)
+			rss_bits = fls(rss_ncpus - 1) + 1;
+
+		/*
+		 * Microsoft limits RSS table entries to 128, so apply that
+		 * limit to both auto-detected CPU counts and user-configured
+		 * ones.
+		 */
+		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
+			printf("rss_init: RSS bits %u not valid, coercing to "
+			    "%u\n", rss_bits, RSS_MAXBITS);
+			rss_bits = RSS_MAXBITS;
+		}
+
+		/*
+		 * Figure out how many buckets to use; warn if less than the
+		 * number of configured CPUs, although this is not a fatal
+		 * problem.
+		 */
+		rss_buckets = (1 << rss_bits);
+		if (rss_buckets < rss_ncpus)
+			printf("rss_init: WARNING: rss_buckets (%u) less than "
+			    "rss_ncpus (%u)\n", rss_buckets, rss_ncpus);
+		rss_mask = rss_buckets - 1;
+	} else {
+		rss_bits = 0;
+		rss_buckets = 1;
+		rss_mask = 0;
+	}
+
+	/*
+	 * Set up initial CPU assignments: round-robin by default.
+	 *
+	 * XXXRW: Need a mapping to non-contiguous IDs here.
+	 */
+	for (i = 0; i < rss_buckets; i++)
+		rss_table[i].rte_cpu = i % rss_ncpus;
+
+	/*
+	 * Randomize rss_key.
+	 *
+	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
+	 * loop to check for "bad" RSS keys.
+	 */
+}
+SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
+
+static uint32_t
+rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
+    const uint8_t *data)
+{
+	uint32_t v;
+	u_int i;
+
+	v = 0;
+	for (i = 0; i < keylen; i++)
+		v += key[i];
+	for (i = 0; i < datalen; i++)
+		v += data[i];
+	return (v);
+}
+
+static uint32_t
+rss_hash(u_int datalen, const uint8_t *data)
+{
+
+	switch (rss_hashalgo) {
+	case RSS_HASH_TOEPLITZ:
+		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
+		    data));
+
+	case RSS_HASH_NAIVE:
+		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
+		    data));
+
+	default:
+		panic("rss_hash: unsupported/unknown hashalgo %u",
+		    rss_hashalgo);
+	}
+}
+
+/*
+ * Hash an IPv4 2-tuple.
+ */ +uint32_t +rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst) +{ + uint8_t data[sizeof(src) + sizeof(dst)]; + u_int datalen; + + datalen = 0; + bcopy(&src, &data[datalen], sizeof(src)); + datalen += sizeof(src); + bcopy(&dst, &data[datalen], sizeof(dst)); + datalen += sizeof(dst); + return (rss_hash(datalen, data)); +} + +/* + * Hash an IPv4 4-tuple. + */ +uint32_t +rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst, + u_short dstport) +{ + uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + + sizeof(dstport)]; + u_int datalen; + + datalen = 0; + bcopy(&src, &data[datalen], sizeof(src)); + datalen += sizeof(src); + bcopy(&dst, &data[datalen], sizeof(dst)); + datalen += sizeof(dst); + bcopy(&srcport, &data[datalen], sizeof(srcport)); + datalen += sizeof(srcport); + bcopy(&dstport, &data[datalen], sizeof(dstport)); + datalen += sizeof(dstport); + return (rss_hash(datalen, data)); +} + +#ifdef INET6 +/* + * Hash an IPv6 2-tuple. + */ +uint32_t +rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst) +{ + uint8_t data[sizeof(src) + sizeof(dst)]; + u_int datalen; + + datalen = 0; + bcopy(&src, &data[datalen], sizeof(src)); + datalen += sizeof(src); + bcopy(&dst, &data[datalen], sizeof(dst)); + datalen += sizeof(dst); + return (rss_hash(datalen, data)); +} + +/* + * Hash an IPv6 4-tuple. + */ +uint32_t +rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport, + struct in6_addr dst, u_short dstport) +{ + uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + + sizeof(dstport)]; + u_int datalen; + + datalen = 0; + bcopy(&src, &data[datalen], sizeof(src)); + datalen += sizeof(src); + bcopy(&dst, &data[datalen], sizeof(dst)); + datalen += sizeof(dst); + bcopy(&srcport, &data[datalen], sizeof(srcport)); + datalen += sizeof(srcport); + bcopy(&dstport, &data[datalen], sizeof(dstport)); + datalen += sizeof(dstport); + return (rss_hash(datalen, data)); +} +#endif /* INET6 */ + +/* + * Query the number of RSS bits in use. 
+ */ +u_int +rss_getbits(void) +{ + + return (rss_bits); +} + +/* + * Query the RSS bucket associated with an RSS hash. + */ +u_int +rss_getbucket(u_int hash) +{ + + return (hash & rss_mask); +} + +/* + * Query the RSS CPU associated with an RSS bucket. + */ +u_int +rss_getcpu(u_int bucket) +{ + + return (rss_table[bucket].rte_cpu); +} + +/* + * netisr CPU affinity lookup routine for use by protocols. + */ +struct mbuf * +rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) +{ + + M_ASSERTPKTHDR(m); + + switch (M_HASHTYPE_GET(m)) { + case M_HASHTYPE_RSS_2TUPLE: + case M_HASHTYPE_RSS_4TUPLE: + *cpuid = rss_getcpu(rss_getbucket(m->m_pkthdr.flowid)); + return (m); + + default: + *cpuid = NETISR_CPUID_NONE; + return (m); + } +} + +/* + * Query the RSS hash algorithm. + */ +u_int +rss_gethashalgo(void) +{ + + return (rss_hashalgo); +} + +/* + * Query the current RSS key; likely to be used by device drivers when + * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. + * + * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? + */ +void +rss_getkey(uint8_t *key) +{ + + bcopy(rss_key, key, sizeof(rss_key)); +} + +/* + * Query the number of buckets; this may be used by both network device + * drivers, which will need to populate hardware shadows of the software + * indirection table, and the network stack itself (such as when deciding how + * many connection groups to allocate). + */ +u_int +rss_getnumbuckets(void) +{ + + return (rss_buckets); +} + +/* + * Query the number of CPUs in use by RSS; may be useful to device drivers + * trying to figure out how to map a larger number of CPUs into a smaller + * number of receive queues. + */ +u_int +rss_getnumcpus(void) +{ + + return (rss_ncpus); +} + +/* + * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want + * it appearing in debugging output unnecessarily. 
+ */ +static int +sysctl_rss_key(SYSCTL_HANDLER_ARGS) +{ + uint8_t temp_rss_key[RSS_KEYSIZE]; + int error; + + error = priv_check(req->td, PRIV_NETINET_HASHKEY); + if (error) + return (error); + + bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); + error = sysctl_handle_opaque(oidp, temp_rss_key, + sizeof(temp_rss_key), req); + if (error) + return (error); + if (req->newptr != NULL) { + /* XXXRW: Not yet. */ + return (EINVAL); + } + return (0); +} +SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, + "", "RSS keying material"); diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/in_rss.h user/rwatson/tcp/src/sys/netinet/in_rss.h --- vendor/freebsd/src/sys/netinet/in_rss.h 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/netinet/in_rss.h 2011-02-01 08:30:20.723504365 +0000 @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETINET_IN_RSS_H_ +#define _NETINET_IN_RSS_H_ + +#include /* in_addr_t */ + +/* + * Supported RSS hash functions. + */ +#define RSS_HASH_NAIVE 0x00000001 /* Poor but fast hash. */ +#define RSS_HASH_TOEPLITZ 0x00000002 /* Required by RSS. */ +#define RSS_HASH_CRC32 0x00000004 /* Future; some NICs do it. */ + +#define RSS_HASH_MASK (RSS_HASH_NAIVE | RSS_HASH_TOEPLITZ) + +/* + * Instances of struct inpcbinfo declare an RSS hash type indicating what + * header fields are covered. + */ +#define RSS_HASHFIELDS_NONE 0 +#define RSS_HASHFIELDS_4TUPLE 1 +#define RSS_HASHFIELDS_2TUPLE 2 + +/* + * Compile-time limits on the size of the indirection table. + */ +#define RSS_MAXBITS 7 +#define RSS_TABLE_MAXLEN (1 << RSS_MAXBITS) + +/* + * Maximum key size used throughout. It's OK for hardware to use only the + * first 16 bytes, which is all that's required for IPv4. + */ +#define RSS_KEYSIZE 40 + +/* + * Device driver interfaces to query RSS properties that must be programmed + * into hardware. + */ +u_int rss_getbits(void); +u_int rss_getbucket(u_int hash); +u_int rss_getcpu(u_int bucket); +void rss_getkey(uint8_t *key); +u_int rss_gethashalgo(void); +u_int rss_getnumbuckets(void); +u_int rss_getnumcpus(void); + +/* + * Network stack interface to generate a hash for a protocol tuple. 
+ */ +uint32_t rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, + struct in_addr dst, u_short dstport); +uint32_t rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst); +uint32_t rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport, + struct in6_addr dst, u_short dstport); +uint32_t rss_hash_ip6_2tuple(struct in6_addr src, + struct in6_addr dst); + +/* + * Network stack interface to query desired CPU affinity of a packet. + */ +struct mbuf *rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid); + +#endif /* !_NETINET_IN_RSS_H_ */ diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/ip_divert.c user/rwatson/tcp/src/sys/netinet/ip_divert.c --- vendor/freebsd/src/sys/netinet/ip_divert.c 2011-02-01 09:38:50.196618406 +0000 +++ user/rwatson/tcp/src/sys/netinet/ip_divert.c 2011-02-01 08:30:20.943440139 +0000 @@ -153,7 +153,8 @@ * place for hashbase == NULL. */ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", - div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE); + div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); } static void @@ -659,9 +660,9 @@ INP_INFO_WLOCK(&V_divcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_divcbinfo); diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/ip_var.h user/rwatson/tcp/src/sys/netinet/ip_var.h --- vendor/freebsd/src/sys/netinet/ip_var.h 2011-02-01 09:38:50.613527818 +0000 +++ user/rwatson/tcp/src/sys/netinet/ip_var.h 2010-05-22 11:56:55.700208030 +0100 @@ -135,8 +135,13 @@ * In-kernel consumers can use these accessor macros directly to update * stats. 
*/ +#ifdef INET_NOSTATS +#define IPSTAT_ADD(name, val) +#define IPSTAT_SUB(name, val) +#else #define IPSTAT_ADD(name, val) V_ipstat.name += (val) #define IPSTAT_SUB(name, val) V_ipstat.name -= (val) +#endif #define IPSTAT_INC(name) IPSTAT_ADD(name, 1) #define IPSTAT_DEC(name) IPSTAT_SUB(name, 1) diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/ipfw/ip_fw2.c user/rwatson/tcp/src/sys/netinet/ipfw/ip_fw2.c --- vendor/freebsd/src/sys/netinet/ipfw/ip_fw2.c 2011-02-01 09:38:51.215359894 +0000 +++ user/rwatson/tcp/src/sys/netinet/ipfw/ip_fw2.c 2011-01-30 14:25:51.823414835 +0000 @@ -656,7 +656,7 @@ (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct inpcbinfo *pi; - int wildcard; + int lookupflags; struct inpcb *pcb; int match; @@ -681,30 +681,34 @@ if (*ugid_lookupp == -1) return (0); if (proto == IPPROTO_TCP) { - wildcard = 0; + lookupflags = 0; pi = &V_tcbinfo; } else if (proto == IPPROTO_UDP) { - wildcard = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else return 0; + lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { - INP_INFO_RLOCK(pi); + /* + * XXXRW: If we had the mbuf here, could use in_pcblookupm(). + */ pcb = (oif) ? 
- in_pcblookup_hash(pi, + in_pcblookup(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), - wildcard, oif) : - in_pcblookup_hash(pi, + lookupflags, oif) : + in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), - wildcard, NULL); + lookupflags, NULL); if (pcb != NULL) { + INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; + INP_RUNLOCK(pcb); } - INP_INFO_RUNLOCK(pi); if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/raw_ip.c user/rwatson/tcp/src/sys/netinet/raw_ip.c --- vendor/freebsd/src/sys/netinet/raw_ip.c 2011-02-01 09:38:52.243772723 +0000 +++ user/rwatson/tcp/src/sys/netinet/raw_ip.c 2011-02-01 08:30:20.973383889 +0000 @@ -186,7 +186,8 @@ { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, - 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE); + 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -206,7 +207,7 @@ { int policyfail = 0; - INP_RLOCK_ASSERT(last); + INP_LOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. 
*/ @@ -814,16 +815,18 @@ static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_WLOCK_ASSERT(inp); - + INP_INFO_WLOCK(pcbinfo); + INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(pcbinfo); } static void @@ -834,11 +837,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } static void @@ -849,11 +848,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } static int @@ -867,11 +862,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } @@ -1056,9 +1047,9 @@ INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/tcp_input.c user/rwatson/tcp/src/sys/netinet/tcp_input.c --- vendor/freebsd/src/sys/netinet/tcp_input.c 2011-02-01 09:38:55.223432987 +0000 +++ user/rwatson/tcp/src/sys/netinet/tcp_input.c 2011-02-04 19:39:28.873581247 +0000 @@ -185,10 +185,6 @@ &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); -int tcp_read_locking = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW, - &tcp_read_locking, 0, "Enable read locking strategy"); - VNET_DEFINE(struct inpcbhead, 
tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); @@ -329,8 +325,7 @@ char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 -#define TI_RLOCKED 2 -#define TI_WLOCKED 3 +#define TI_WLOCKED 2 #ifdef TCPDEBUG /* @@ -483,30 +478,25 @@ drop_hdrlen = off0 + off; /* - * Locate pcb for segment, which requires a lock on tcbinfo. - * Optimisticaly acquire a global read lock rather than a write lock - * unless header flags necessarily imply a state change. There are - * two cases where we might discover later we need a write lock - * despite the flags: ACKs moving a connection out of the syncache, - * and ACKs for a connection in TIMEWAIT. + * Locate pcb for segment; if we're likely to add or remove a + * connection then first acquire pcbinfo lock. There are two cases + * where we might discover later we need a write lock despite the + * flags: ACKs moving a connection out of the syncache, and ACKs for + * a connection in TIMEWAIT. */ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; - } else { - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - } + } else + ti_locked = TI_UNLOCKED; findpcb: #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: findpcb ti_locked %d\n", __func__, ti_locked); + } else { + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif #ifdef IPFIREWALL_FORWARD @@ -523,20 +513,22 @@ * Transparently forwarded. Pretend to be the destination. * already got one like this? 
*/ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - 0, m->m_pkthdr.rcvif); + inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); if (!inp) { - /* It's new. Try to find the ambushing socket. */ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - next_hop->sin_addr, - next_hop->sin_port ? - ntohs(next_hop->sin_port) : - th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + /* + * It's new. Try to find the ambushing socket. + * Because we've rewritten the destination address, + * any hardware-generated hash is ignored. + */ + inp = in_pcblookup(&V_tcbinfo, + ip->ip_src, th->th_sport, + next_hop->sin_addr, + next_hop->sin_port ? ntohs(next_hop->sin_port) : + th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); @@ -545,18 +537,19 @@ { if (isipv6) { #ifdef INET6 - inp = in6_pcblookup_hash(&V_tcbinfo, - &ip6->ip6_src, th->th_sport, - &ip6->ip6_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + inp = in6_pcblookup_mbuf(&V_tcbinfo, + &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); #endif - } else - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + } else { + inp = in_pcblookup_mbuf(&V_tcbinfo, + ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); + } } /* @@ -586,7 +579,7 @@ rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } - INP_WLOCK(inp); + INP_WLOCK_ASSERT(inp); if (!(inp->inp_flags & INP_HW_FLOWID) && (m->m_flags & M_FLOWID) && ((inp->inp_socket == NULL) @@ -627,28 +620,26 @@ * legitimate new connection attempt the old INPCB gets removed and * we 
can try again to find a listening socket. * - * At this point, due to earlier optimism, we may hold a read lock on - * the inpcbinfo, rather than a write lock. If so, we need to - * upgrade, or if that fails, acquire a reference on the inpcb, drop - * all locks, acquire a global write lock, and then re-acquire the - * inpcb lock. We may at that point discover that another thread has - * tried to free the inpcb, in which case we need to loop back and - * try to find a new inpcb to deliver to. + * At this point, due to earlier optimism, we may hold only an inpcb + * lock, and not the inpcbinfo write lock. If so, we need to try to + * acquire it, or if that fails, acquire a reference on the inpcb, + * drop all locks, acquire a global write lock, and then re-acquire + * the inpcb lock. We may at that point discover that another thread + * has tried to free the inpcb, in which case we need to loop back + * and try to find a new inpcb to deliver to. + * + * XXXRW: It may be time to rethink timewait locking. */ relocked: if (inp->inp_flags & INP_TIMEWAIT) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } @@ -680,26 +671,24 @@ /* * We've identified a valid inpcb, but it could be that we need an - * inpcbinfo write lock and have only a read lock. In this case, - * attempt to upgrade/relock using the same strategy as the TIMEWAIT - * case above. If we relock, we have to jump back to 'relocked' as - * the connection might now be in TIMEWAIT. 
- */ - if (tp->t_state != TCPS_ESTABLISHED || - (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: upgrade check ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { + * inpcbinfo write lock but don't hold it. In this case, attempt to + * acquire using the same strategy as the TIMEWAIT case above. If we + * relock, we have to jump back to 'relocked' as the connection might + * now be in TIMEWAIT. + */ +#ifdef INVARIANTS + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); +#endif + if (tp->t_state != TCPS_ESTABLISHED) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } @@ -732,13 +721,16 @@ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection - * attempt or the completion of a previous one. + * attempt or the completion of a previous one. Because listen + * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be + * held in this case. 
*/ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); bzero(&inc, sizeof(inc)); #ifdef INET6 @@ -1031,12 +1023,15 @@ return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif ti_locked = TI_UNLOCKED; if (inp != NULL) { @@ -1048,12 +1043,15 @@ goto drop; dropunlock: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropunlock ti_locked %d", __func__, ti_locked); +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif ti_locked = TI_UNLOCKED; if (inp != NULL) @@ -1108,13 +1106,13 @@ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: ti_locked %d for EST", __func__, - ti_locked); + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); @@ -1271,13 +1269,8 @@ /* * This is a pure ack for outstanding data. 
*/ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure ACK", - __func__, ti_locked); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1373,13 +1366,8 @@ * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure data " - "segment", __func__, ti_locked); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -2227,9 +2215,6 @@ } process_ACK: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; @@ -2452,9 +2437,6 @@ } step6: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: step6 ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2540,9 +2522,6 @@ tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dodata ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2671,13 +2650,8 @@ return; } } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dodata epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG @@ -2706,9 +2680,6 @@ return; dropafterack: - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); - /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where 
the ACK reflects our state. @@ -2735,13 +2706,8 @@ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropafterack epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; @@ -2751,12 +2717,8 @@ return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); ti_locked = TI_UNLOCKED; if (tp != NULL) { @@ -2767,9 +2729,7 @@ return; drop: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); #ifdef INVARIANTS else diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/tcp_subr.c user/rwatson/tcp/src/sys/netinet/tcp_subr.c --- vendor/freebsd/src/sys/netinet/tcp_subr.c 2011-02-01 09:38:55.443852063 +0000 +++ user/rwatson/tcp/src/sys/netinet/tcp_subr.c 2011-02-01 08:30:21.543567250 +0000 @@ -334,7 +334,8 @@ hashsize = 512; /* safe default */ } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_4TUPLE); V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; @@ -1127,9 +1128,9 @@ INP_INFO_WLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); @@ -1169,12 +1170,9 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, - 
addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1182,10 +1180,8 @@ if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1221,21 +1217,18 @@ return (EINVAL); } - INP_INFO_RLOCK(&V_tcbinfo); if (mapped == 1) - inp = in_pcblookup_hash(&V_tcbinfo, + inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], - addrs[0].sin6_port, - 0, NULL); + addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else - inp = in6_pcblookup_hash(&V_tcbinfo, + inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, - &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); + &addrs[0].sin6_addr, addrs[0].sin6_port, + INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1243,10 +1236,8 @@ if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1306,10 +1297,9 @@ th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { - INP_WLOCK(inp); if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && 
!(inp->inp_socket == NULL)) { @@ -2181,18 +2171,17 @@ switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: - inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr, - fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0, - NULL); + inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, + fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, + INPLOOKUP_WLOCKPCB, NULL); break; #endif case AF_INET: - inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, - fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, + lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; } if (inp != NULL) { - INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/tcp_syncache.c user/rwatson/tcp/src/sys/netinet/tcp_syncache.c --- vendor/freebsd/src/sys/netinet/tcp_syncache.c 2011-02-01 09:38:55.535313408 +0000 +++ user/rwatson/tcp/src/sys/netinet/tcp_syncache.c 2011-02-01 08:30:21.543567250 +0000 @@ -36,6 +36,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_pcbgroup.h" #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #ifdef INET6 @@ -660,6 +662,7 @@ inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); + INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; @@ -674,8 +677,20 @@ #ifdef INET6 } #endif + + /* + * Install in the reservation hash table for now, but don't yet + * install a connection group since the full 4-tuple isn't yet + * configured. + * + * XXXRW: Why install in the hash now, why not let the pcbconnect + * call later do it all then? + * + * XXXRW: Why do the IPSEC copy between the two phases of 4-tuple + * setup, with the hash lock held? 
+ */ inp->inp_lport = sc->sc_inc.inc_lport; - if (in_pcbinshash(inp) != 0) { + if (in_pcbinshash_nopcbgroup(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. @@ -687,6 +702,7 @@ #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } #ifdef IPSEC @@ -721,9 +737,10 @@ laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; - if (in6_pcbconnect(inp, (struct sockaddr *)&sin6, - thread0.td_ucred)) { + if (in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, + thread0.td_ucred, m)) { inp->in6p_laddr = laddr6; + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } /* Override flowlabel from in6_pcbconnect. */ @@ -750,12 +767,14 @@ laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; - if (in_pcbconnect(inp, (struct sockaddr *)&sin, - thread0.td_ucred)) { + if (in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, + thread0.td_ucred, m)) { inp->inp_laddr = laddr; + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } } + INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/tcp_timer.c user/rwatson/tcp/src/sys/netinet/tcp_timer.c --- vendor/freebsd/src/sys/netinet/tcp_timer.c 2011-02-01 09:38:55.543439859 +0000 +++ user/rwatson/tcp/src/sys/netinet/tcp_timer.c 2011-01-27 21:36:20.163577632 +0000 @@ -490,7 +490,7 @@ INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/tcp_usrreq.c user/rwatson/tcp/src/sys/netinet/tcp_usrreq.c --- vendor/freebsd/src/sys/netinet/tcp_usrreq.c 2011-02-01 09:38:55.673411406 +0000 +++ user/rwatson/tcp/src/sys/netinet/tcp_usrreq.c 2011-01-30 14:25:51.954979322 +0000 @@ -252,7 
+252,6 @@ return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); @@ -262,11 +261,12 @@ } tp = intotcpcb(inp); TCPDEBUG1(); + INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -292,7 +292,6 @@ return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); @@ -302,6 +301,7 @@ } tp = intotcpcb(inp); TCPDEBUG1(); + INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { @@ -315,14 +315,15 @@ inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } error = in6_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -338,7 +339,6 @@ struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); @@ -350,8 +350,10 @@ TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); + INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); @@ -362,7 +364,6 @@ out: TCPDEBUG2(PRU_LISTEN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -375,7 +376,6 @@ struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); @@ -387,12 +387,14 @@ TCPDEBUG1(); SOCK_LOCK(so); error = 
solisten_proto_check(so); + INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); } + INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); @@ -402,7 +404,6 @@ out: TCPDEBUG2(PRU_LISTEN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -435,7 +436,6 @@ return (error); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); @@ -451,7 +451,6 @@ out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -476,7 +475,6 @@ && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); @@ -486,6 +484,11 @@ } tp = intotcpcb(inp); TCPDEBUG1(); + /* + * XXXRW: Some confusion: V4/V6 flags relate to binding, and + * therefore probably require the hash lock, which isn't held here. + * Is this a significant problem? 
+ */ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; @@ -517,7 +520,6 @@ out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -629,6 +631,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -654,6 +657,7 @@ out: TCPDEBUG2(PRU_ACCEPT); INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); @@ -740,25 +744,16 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - int headlocked = 0; #ifdef INET6 int isipv6; #endif TCPDEBUG0; /* - * We require the pcbinfo lock in two cases: - * - * (1) An implied connect is taking place, which can result in - * binding IPs and ports and hence modification of the pcb hash - * chains. - * - * (2) PRUS_EOF is set, resulting in explicit close on the send. + * We require the pcbinfo lock if we will close the socket as part of * this call. */ - if ((nam != NULL) || (flags & PRUS_EOF)) { + if (flags & PRUS_EOF) INP_INFO_WLOCK(&V_tcbinfo); - headlocked = 1; - } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); @@ -795,7 +790,6 @@ * initialize maxseg/maxopd using peer's cached * MSS. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); @@ -816,10 +810,6 @@ socantsendmore(so); tcp_usrclosed(tp); } - if (headlocked) { - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; - } if (!(inp->inp_flags & INP_DROPPED)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; @@ -855,7 +845,6 @@ * initialize maxseg/maxopd using peer's cached * MSS. 
*/ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); @@ -866,11 +855,6 @@ goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; - } else if (nam) { - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; @@ -881,7 +865,7 @@ TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); - if (headlocked) + if (flags & PRUS_EOF) INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -1066,13 +1050,13 @@ u_short lport; int error; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) - return error; + goto out; } /* @@ -1085,11 +1069,14 @@ error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) - return error; - if (oinp) - return EADDRINUSE; + goto out; + if (oinp) { + error = EADDRINUSE; + goto out; + } inp->inp_laddr = laddr; in_pcbrehash(inp); + INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: @@ -1109,6 +1096,10 @@ tcp_sendseqinit(tp); return 0; + +out: + INP_HASH_WUNLOCK(&V_tcbinfo); + return (error); } #ifdef INET6 @@ -1121,13 +1112,13 @@ struct in6_addr addr6; int error; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) - return error; + goto out; } /* @@ -1135,18 +1126,23 @@ * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. * in6_pcbladdr() also handles scope zone IDs. + * + * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked() + * outside of in6_pcb.c if there were an in6_pcbconnect_setup(). 
+ */ error = in6_pcbladdr(inp, nam, &addr6); if (error) - return error; + goto out; /* leave via out: so the pcb hash write lock taken earlier in this function is released */ - oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL); - if (oinp) - return EADDRINUSE; + if (oinp) { + error = EADDRINUSE; + goto out; + } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = addr6; inp->in6p_faddr = sin6->sin6_addr; @@ -1157,6 +1153,7 @@ inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); + INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && @@ -1172,6 +1169,10 @@ tcp_sendseqinit(tp); return 0; + +out: + INP_HASH_WUNLOCK(&V_tcbinfo); + return error; } #endif /* INET6 */ diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/tcp_var.h user/rwatson/tcp/src/sys/netinet/tcp_var.h --- vendor/freebsd/src/sys/netinet/tcp_var.h 2011-02-01 09:38:55.703619156 +0000 +++ user/rwatson/tcp/src/sys/netinet/tcp_var.h 2010-05-22 11:56:58.029927031 +0100 @@ -478,7 +478,11 @@ * In-kernel consumers can use these accessor macros directly to update * stats. */ +#ifdef INET_NOSTATS +#define TCPSTAT_ADD(name, val) +#else #define TCPSTAT_ADD(name, val) V_tcpstat.name += (val) +#endif #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* Files vendor/freebsd/src/sys/netinet/test and user/rwatson/tcp/src/sys/netinet/test differ diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/test.c user/rwatson/tcp/src/sys/netinet/test.c --- vendor/freebsd/src/sys/netinet/test.c 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/netinet/test.c 2010-06-11 21:01:04.915021589 +0100 @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 2010 David Malone + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#include +#endif + +#ifndef _KERNEL +static uint8_t tkey8[RSS_KEYSIZE] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa +}; + +static uint8_t zkey8[RSS_KEYSIZE]; + +static uint8_t okey8[RSS_KEYSIZE] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; + +/* Default Chelsio hardware key. */ +static uint8_t ckey8[] = { + 0x6d, 0x5a, 0x56, 0xda, + 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, + 0x43, 0xa3, 0x8f, 0xb0, +}; + +#define min(x, y) ((x) < (y) ? (x) : (y)) +#endif + +uint32_t +toeplitz_hash(u_int keylen, const uint8_t *key, u_int netperf_datalen, + const uint8_t *netperf_data) /* Toeplitz hash of netperf_data[0..netperf_datalen) under key[0..keylen); key must hold at least 4 bytes */ +{ + uint32_t hash = 0, v; + u_int i, b; + + /* XXXRW: Perhaps an assertion about key length vs. netperf_data length? */ + + v = ((uint32_t)key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; /* cast keeps the top-byte shift out of signed int */ + for (i = 0; i < netperf_datalen; i++) { + for (b = 0; b < 8; b++) { + if (netperf_data[i] & (1<<(7-b))) + hash ^= v; + v <<= 1; + if ((i + 4) < keylen && /* never read key bits past the caller-supplied key length */ + (key[i+4] & (1<<(7-b)))) + v |= 1; + } + } + return (hash); +} + +/* + * Remaining code in this file consists of test vectors and a userspace main() + * function to validate the hash implementation.
+ */ +#ifndef _KERNEL +static u_int hash1 = 0x323e8fc2; + +static uint8_t netperf_data2[] = { 199, 92, 111, 2, 65, 69, 140, 83 }; +static u_int hash2 = 0xd718262a; + +static uint8_t netperf_data3[] = { 24, 19, 198, 95, 12, 22, 207, 184 }; +static u_int hash3 = 0xd2d0a5de; + +static uint8_t netperf_data4[] = { 38, 27, 205, 30, 209, 142, 163, 6 }; +static u_int hash4 = 0x82989176; + +static uint8_t netperf_data5[] = { 153, 39, 163, 191, 202, 188, 127, 2 }; +static u_int hash5 = 0x5d1809c5; + +#define PORT(n) (((n)>>8)&0xff), ((n)&0xff) + +static uint8_t netperf_data1[] = { 66, 9, 149, 187, 161, 142, 100, 80 }; +static uint8_t netperf_data1p[] = { + 66, 9, 149, 187, 161, 142, 100, 80, PORT(2794), PORT(1766) +}; + +static u_int hash1p = 0x51ccc178; + +static uint8_t netperf_data2p[] = { + 199, 92, 111, 2, 65, 69, 140, 83, PORT(14230), PORT(4739) +}; +static u_int hash2p = 0xc626b0ea; + +static uint8_t netperf_data3p[] = { + 24, 19, 198, 95, 12, 22, 207, 184, PORT(12898), PORT(38024) +}; +static u_int hash3p = 0x5c2b394a; + +static uint8_t netperf_data4p[] = { + 38, 27, 205, 30, 209, 142, 163, 6, PORT(48228), PORT(2217) +}; +static u_int hash4p = 0xafc7327f; + +static uint8_t netperf_data5p[] = { + 153, 39, 163, 191, 202, 188, 127, 2, PORT(44251), PORT(1303) +}; +static u_int hash5p = 0x10e828a2; + +#define IPv6(a1,a2,a3,a4,a5,a6,a7,a8) \ + (((0x##a1)>>8)&0xff), ((0x##a1)&0xff), \ + (((0x##a2)>>8)&0xff), ((0x##a2)&0xff), \ + (((0x##a3)>>8)&0xff), ((0x##a3)&0xff), \ + (((0x##a4)>>8)&0xff), ((0x##a4)&0xff), \ + (((0x##a5)>>8)&0xff), ((0x##a5)&0xff), \ + (((0x##a6)>>8)&0xff), ((0x##a6)&0xff), \ + (((0x##a7)>>8)&0xff), ((0x##a7)&0xff), \ + (((0x##a8)>>8)&0xff), ((0x##a8)&0xff) + +static uint8_t netperf_data1v6[] = { + IPv6(3ffe, 2501, 200, 1fff, 0, 0, 0, 7), + IPv6(3ffe, 2501, 200, 3, 0, 0, 0, 1) +}; +static u_int hash1v6 = 0x2cc18cd5; + +static uint8_t netperf_data2v6[] = { + IPv6(3ffe, 501, 8, 0, 260, 97ff, fe40, efab), + IPv6(ff02, 0, 0, 0, 0, 0, 0, 1) +}; +static u_int 
hash2v6 = 0x0f0c461c; + +static uint8_t netperf_data3v6[] = { + IPv6(3ffe, 1900, 4545, 3, 200, f8ff, fe21, 67cf), + IPv6(fe80, 0, 0, 0, 200, f8ff, fe21, 67cf) +}; +static u_int hash3v6 = 0x4b61e985; + +static uint8_t netperf_data1v6p[] = { + IPv6(3ffe, 2501, 200, 1fff, 0, 0, 0, 7), + IPv6(3ffe, 2501, 200, 3, 0, 0, 0, 1), + PORT(2794), PORT(1766) +}; +static u_int hash1v6p = 0x40207d3d; + +static uint8_t netperf_data2v6p[] = { + IPv6(3ffe, 501, 8, 0, 260, 97ff, fe40, efab), + IPv6(ff02, 0, 0, 0, 0, 0, 0, 1), + PORT(14230), PORT(4739) +}; +static u_int hash2v6p = 0xdde51bbf; + +static uint8_t netperf_data3v6p[] = { + IPv6(3ffe, 1900, 4545, 3, 200, f8ff, fe21, 67cf), + IPv6(fe80, 0, 0, 0, 200, f8ff, fe21, 67cf), + PORT(44251), PORT(38024) +}; +static u_int hash3v6p = 0x02d1feef; + +/* + * Netperf cluster. + */ +static uint8_t netperf_data[] = { + 192, 168, 100, 102, 192, 168, 100, 101, PORT(22), PORT(42658) +}; + +#define TEST(tag) do { \ + unsigned int v1, v2; \ + v1 = hash ## tag; \ + v2 = toeplitz_hash(sizeof(tkey8), tkey8, sizeof(netperf_data ## tag), \ + netperf_data ## tag); \ + if (v1 != v2) \ + errx(-1, "FAIL vector failed:%08x %08x %08x " \ + "%zd\n", v1, v2, v1 ^ v2, sizeof(netperf_data ## tag)); \ +} while (0) + +int +main(int argc, char *argv[]) +{ + unsigned int v; + TEST(1); + TEST(2); + TEST(3); + TEST(4); + TEST(5); + + TEST(1p); + TEST(2p); + TEST(3p); + TEST(4p); + TEST(5p); + + TEST(1v6); + TEST(2v6); + TEST(3v6); + + TEST(1v6p); + TEST(2v6p); + TEST(3v6p); + + printf("PASS\n"); + + printf("Sample hashes:\n"); + + printf("Test key: %08x\n", + toeplitz_hash(sizeof(tkey8), tkey8, sizeof(netperf_data1), netperf_data1)); + + printf("All zeroes: %08x\n", + toeplitz_hash(sizeof(zkey8), zkey8, sizeof(netperf_data1), netperf_data1)); + + printf("All ones: %08x\n", + toeplitz_hash(sizeof(okey8), okey8, sizeof(netperf_data1), netperf_data1)); + + printf("Chelsio key: %08x\n", + toeplitz_hash(sizeof(ckey8), ckey8, sizeof(netperf_data), + netperf_data)); + + 
return (0); +} +#endif Files vendor/freebsd/src/sys/netinet/toeplitz and user/rwatson/tcp/src/sys/netinet/toeplitz differ diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/toeplitz.c user/rwatson/tcp/src/sys/netinet/toeplitz.c --- vendor/freebsd/src/sys/netinet/toeplitz.c 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/netinet/toeplitz.c 2010-06-07 13:14:58.623200617 +0100 @@ -0,0 +1,211 @@ +/*- + * Copyright (c) 2010 David Malone + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#include +#endif + +#ifndef _KERNEL +static uint8_t tkey8[RSS_KEYSIZE] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa +}; + +#define min(x, y) ((x) < (y) ? (x) : (y)) +#endif + +uint32_t +toeplitz_hash(u_int keylen, const uint8_t *key, u_int datalen, + const uint8_t *data) +{ + uint32_t hash = 0, v; + u_int i, b; + + /* XXXRW: Perhaps an assertion about key length vs. data length? */ + + v = ((uint32_t)key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; + for (i = 0; i < datalen; i++) { + for (b = 0; b < 8; b++) { + if (data[i] & (1<<(7-b))) + hash ^= v; + v <<= 1; + if ((i + 4) < keylen && + (key[i+4] & (1<<(7-b)))) + v |= 1; + } + } + return (hash); +} + +/* + * Remaining code in this file consists of test vectors and a userspace main() + * function to validate the hash implementation. 
+ */ +#ifndef _KERNEL +static u_int hash1 = 0x323e8fc2; + +static uint8_t data2[] = { 199, 92, 111, 2, 65, 69, 140, 83 }; +static u_int hash2 = 0xd718262a; + +static uint8_t data3[] = { 24, 19, 198, 95, 12, 22, 207, 184 }; +static u_int hash3 = 0xd2d0a5de; + +static uint8_t data4[] = { 38, 27, 205, 30, 209, 142, 163, 6 }; +static u_int hash4 = 0x82989176; + +static uint8_t data5[] = { 153, 39, 163, 191, 202, 188, 127, 2 }; +static u_int hash5 = 0x5d1809c5; + +#define PORT(n) (((n)>>8)&0xff), ((n)&0xff) + +static uint8_t data1[] = { 66, 9, 149, 187, 161, 142, 100, 80 }; +static uint8_t data1p[] = { + 66, 9, 149, 187, 161, 142, 100, 80, PORT(2794), PORT(1766) +}; + +static u_int hash1p = 0x51ccc178; + +static uint8_t data2p[] = { + 199, 92, 111, 2, 65, 69, 140, 83, PORT(14230), PORT(4739) +}; +static u_int hash2p = 0xc626b0ea; + +static uint8_t data3p[] = { + 24, 19, 198, 95, 12, 22, 207, 184, PORT(12898), PORT(38024) +}; +static u_int hash3p = 0x5c2b394a; + +static uint8_t data4p[] = { + 38, 27, 205, 30, 209, 142, 163, 6, PORT(48228), PORT(2217) +}; +static u_int hash4p = 0xafc7327f; + +static uint8_t data5p[] = { + 153, 39, 163, 191, 202, 188, 127, 2, PORT(44251), PORT(1303) +}; +static u_int hash5p = 0x10e828a2; + +#define IPv6(a1,a2,a3,a4,a5,a6,a7,a8) \ + (((0x##a1)>>8)&0xff), ((0x##a1)&0xff), \ + (((0x##a2)>>8)&0xff), ((0x##a2)&0xff), \ + (((0x##a3)>>8)&0xff), ((0x##a3)&0xff), \ + (((0x##a4)>>8)&0xff), ((0x##a4)&0xff), \ + (((0x##a5)>>8)&0xff), ((0x##a5)&0xff), \ + (((0x##a6)>>8)&0xff), ((0x##a6)&0xff), \ + (((0x##a7)>>8)&0xff), ((0x##a7)&0xff), \ + (((0x##a8)>>8)&0xff), ((0x##a8)&0xff) + +static uint8_t data1v6[] = { + IPv6(3ffe, 2501, 200, 1fff, 0, 0, 0, 7), + IPv6(3ffe, 2501, 200, 3, 0, 0, 0, 1) +}; +static u_int hash1v6 = 0x2cc18cd5; + +static uint8_t data2v6[] = { + IPv6(3ffe, 501, 8, 0, 260, 97ff, fe40, efab), + IPv6(ff02, 0, 0, 0, 0, 0, 0, 1) +}; +static u_int hash2v6 = 0x0f0c461c; + +static uint8_t data3v6[] = { + IPv6(3ffe, 1900, 4545, 3, 200, f8ff, 
fe21, 67cf), + IPv6(fe80, 0, 0, 0, 200, f8ff, fe21, 67cf) +}; +static u_int hash3v6 = 0x4b61e985; + +static uint8_t data1v6p[] = { + IPv6(3ffe, 2501, 200, 1fff, 0, 0, 0, 7), + IPv6(3ffe, 2501, 200, 3, 0, 0, 0, 1), + PORT(2794), PORT(1766) +}; +static u_int hash1v6p = 0x40207d3d; + +static uint8_t data2v6p[] = { + IPv6(3ffe, 501, 8, 0, 260, 97ff, fe40, efab), + IPv6(ff02, 0, 0, 0, 0, 0, 0, 1), + PORT(14230), PORT(4739) +}; +static u_int hash2v6p = 0xdde51bbf; + +static uint8_t data3v6p[] = { + IPv6(3ffe, 1900, 4545, 3, 200, f8ff, fe21, 67cf), + IPv6(fe80, 0, 0, 0, 200, f8ff, fe21, 67cf), + PORT(44251), PORT(38024) +}; +static u_int hash3v6p = 0x02d1feef; + +#define TEST(tag) do { \ + unsigned int v1, v2; \ + v1 = hash ## tag; \ + v2 = toeplitz_hash(sizeof(tkey8), tkey8, sizeof(data ## tag), \ + data ## tag); \ + if (v1 != v2) \ + errx(-1, "FAIL vector failed:%08x %08x %08x " \ + "%zd\n", v1, v2, v1 ^ v2, sizeof(data ## tag)); \ +} while (0) + +int +main(int argc, char *argv[]) +{ + TEST(1); + TEST(2); + TEST(3); + TEST(4); + TEST(5); + + TEST(1p); + TEST(2p); + TEST(3p); + TEST(4p); + TEST(5p); + + TEST(1v6); + TEST(2v6); + TEST(3v6); + + TEST(1v6p); + TEST(2v6p); + TEST(3v6p); + + printf("PASS\n"); + + return (0); +} +#endif diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/toeplitz.h user/rwatson/tcp/src/sys/netinet/toeplitz.h --- vendor/freebsd/src/sys/netinet/toeplitz.h 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/netinet/toeplitz.h 2010-06-07 13:14:58.623200617 +0100 @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2010 David Malone + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETINET_TOEPLITZ_H_ +#define _NETINET_TOEPLITZ_H_ + +/* + * Toeplitz (RSS) hash algorithm; possibly we should cache intermediate + * results between runs, in which case we'll need explicit init/destroy and + * state management. 
+ */ +uint32_t toeplitz_hash(u_int keylen, const uint8_t *key, + u_int datalen, const uint8_t *data); + +#endif /* !_NETINET_TOEPLITZ_H_ */ diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet/udp_usrreq.c user/rwatson/tcp/src/sys/netinet/udp_usrreq.c --- vendor/freebsd/src/sys/netinet/udp_usrreq.c 2011-02-01 09:38:55.724042981 +0000 +++ user/rwatson/tcp/src/sys/netinet/udp_usrreq.c 2011-02-01 08:30:21.553409283 +0000 @@ -139,6 +139,7 @@ #endif VNET_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ + SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &VNET_NAME(udpstat), udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); @@ -178,7 +179,8 @@ { in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udpcb_zone, maxsockets); @@ -254,7 +256,7 @@ #endif #endif - INP_RLOCK_ASSERT(inp); + INP_LOCK_ASSERT(inp); #ifdef IPSEC /* Check AH/ESP integrity. */ @@ -448,12 +450,12 @@ } #endif - INP_INFO_RLOCK(&V_udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; + INP_INFO_RLOCK(&V_udbinfo); last = NULL; LIST_FOREACH(inp, &V_udb, inp_list) { if (inp->inp_lport != uh->uh_dport) @@ -567,8 +569,9 @@ /* * Locate pcb for datagram. 
*/ - inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, - ip->ip_dst, uh->uh_dport, 1, ifp); + inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, + ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; @@ -582,27 +585,26 @@ UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); - goto badheadlocked; + goto badunlocked; } if (V_udp_blackhole) - goto badheadlocked; + goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) - goto badheadlocked; + goto badunlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Check the minimum TTL for socket. */ - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); - goto badunlocked; + m_freem(m); + return; } up = intoudpcb(inp); if (up->u_tun_func == NULL) { @@ -677,17 +679,15 @@ return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, - ip->ip_src, uh->uh_sport, 0, NULL); + inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } - INP_RUNLOCK(inp); + INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_udbinfo); } else in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], udp_notify); @@ -776,9 +776,9 @@ INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); @@ -817,12 +817,11 @@ error = SYSCTL_IN(req, addrs, 
sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, - addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, + INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -830,10 +829,8 @@ if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_udbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1028,14 +1025,16 @@ * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. + * + * XXXRW: Check that hash locking update here is correct. 
*/ sin = (struct sockaddr_in *)addr; INP_RLOCK(inp); if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_udbinfo); unlock_udbinfo = 2; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || @@ -1043,11 +1042,7 @@ (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) { - INP_RUNLOCK(inp); - INP_INFO_RLOCK(&V_udbinfo); - INP_RLOCK(inp); - } + INP_HASH_RLOCK(&V_udbinfo); unlock_udbinfo = 1; } else unlock_udbinfo = 0; @@ -1060,7 +1055,7 @@ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { - INP_INFO_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(&V_udbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { @@ -1111,7 +1106,7 @@ inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { - INP_INFO_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(&V_udbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); @@ -1125,8 +1120,8 @@ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { - INP_INFO_WLOCK_ASSERT(&V_udbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(&V_udbinfo); /* * Remember addr if jailed, to prevent * rebinding. 
@@ -1222,9 +1217,9 @@ UDPSTAT_INC(udps_opackets); if (unlock_udbinfo == 2) - INP_INFO_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(&V_udbinfo); else if (unlock_udbinfo == 1) - INP_INFO_RUNLOCK(&V_udbinfo); + INP_HASH_RUNLOCK(&V_udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); if (unlock_udbinfo == 2) @@ -1235,11 +1230,11 @@ release: if (unlock_udbinfo == 2) { + INP_HASH_WUNLOCK(&V_udbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } else if (unlock_udbinfo == 1) { + INP_HASH_RUNLOCK(&V_udbinfo); INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); } else INP_RUNLOCK(inp); m_freem(m); @@ -1390,15 +1385,15 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -1465,11 +1460,11 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_udbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -1480,15 +1475,15 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -1500,25 +1495,23 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (EISCONN); } sin = (struct 
sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } + INP_HASH_WLOCK(&V_udbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -1550,21 +1543,19 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (ENOTCONN); } - + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (0); } diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet6/in6_pcb.c user/rwatson/tcp/src/sys/netinet6/in6_pcb.c --- vendor/freebsd/src/sys/netinet6/in6_pcb.c 2011-02-01 09:38:56.213433672 +0000 +++ user/rwatson/tcp/src/sys/netinet6/in6_pcb.c 2011-02-01 08:30:21.673495216 +0000 @@ -1,7 +1,11 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -66,6 +70,8 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_pcbgroup.h" +#include "opt_rss.h" #include #include @@ -94,6 +100,8 @@ #include #include #include +#include +#include #include #include @@ -111,17 +119,18 @@ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; - int error, wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error, lookupflags = 0; + int reuseport = (so->so_options & SO_REUSEPORT); - INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in6_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) - wild = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) @@ -224,7 +233,7 @@ } } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, - lport, wild, cred); + lport, lookupflags, cred); if (t && (reuseport & ((t->inp_flags & INP_TIMEWAIT) ? intotw(t)->tw_so_options : t->inp_socket->so_options)) == 0) @@ -235,7 +244,7 @@ in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, - lport, wild, cred); + lport, lookupflags, cred); if (t && t->inp_flags & INP_TIMEWAIT) { if ((reuseport & intotw(t)->tw_so_options) == 0 && @@ -290,8 +299,8 @@ int scope_ambiguous = 0; struct in6_addr in6a; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); /* XXXRW: why? */ if (nam->sa_len != sizeof (*sin6)) return (EINVAL); @@ -352,15 +361,16 @@ * then pick one. 
*/ int -in6_pcbconnect(register struct inpcb *inp, struct sockaddr *nam, - struct ucred *cred) +in6_pcbconnect_mbuf(register struct inpcb *inp, struct sockaddr *nam, + struct ucred *cred, struct mbuf *m) { + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr addr6; int error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Call inner routine, to assign local interface address. @@ -369,7 +379,7 @@ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return (error); - if (in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, + if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, @@ -392,17 +402,24 @@ inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); - in_pcbrehash(inp); + in_pcbrehash_mbuf(inp, m); return (0); } +int +in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) +{ + + return (in6_pcbconnect_mbuf(inp, nam, cred, NULL)); +} + void in6_pcbdisconnect(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; @@ -576,7 +593,7 @@ notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; - INP_INFO_WLOCK(pcbinfo); + INP_HASH_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); if ((inp->inp_vflag & INP_IPV6) == 0) { @@ -632,22 +649,26 @@ } else INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(pcbinfo); + INP_HASH_WUNLOCK(pcbinfo); } /* - * Lookup a PCB based on the local address and port. + * Lookup a PCB based on the local address and port. Caller must hold the + * hash lock. No inpcb locks or references are acquired. 
*/ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, - u_short lport, int wild_okay, struct ucred *cred) + u_short lport, int lookupflags, struct ucred *cred) { register struct inpcb *inp; int matchwild = 3, wildcard; - INP_INFO_WLOCK_ASSERT(pcbinfo); + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); - if (!wild_okay) { + INP_HASH_WLOCK_ASSERT(pcbinfo); + + if (!(lookupflags & INPLOOKUP_WILDCARD)) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that @@ -798,20 +819,159 @@ return inp; } +#ifdef PCBGROUP +/* + * Lookup PCB in hash list, using pcbgroup tables. + */ +static struct inpcb * +in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, + struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, + u_int lport_arg, int lookupflags, struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + int faith; + + if (faithprefix_p != NULL) + faith = (*faithprefix_p)(laddr); + else + faith = 0; + + /* + * First look for an exact match. + */ + tmpinp = NULL; + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[ + INP_PCBHASH(faddr->s6_addr32[3] /* XXX */, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && + IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP6)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + + /* + * Then look for a wildcard match, if requested. 
+ */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. + */ + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_FOREACH(inp, head, ipw_entry) { + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || + inp->inp_lport != lport) { + continue; + } + + /* XXX inp locking */ + if (faith && (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP6); + if (injail) { + if (prison_check_ip6(inp->inp_cred, + laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { + if (injail) + goto found; + else + local_exact = inp; + } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + + inp = jail_wild; + if (inp == NULL) + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; + if (inp != NULL) + goto found; + } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ + INP_GROUP_UNLOCK(pcbgroup); + return (NULL); + +found: + in_pcbref(inp); + INP_GROUP_UNLOCK(pcbgroup); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + return (inp); +} +#endif /* PCBGROUP */ + /* * Lookup PCB in hash list. 
*/ struct inpcb * -in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, - u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int wildcard, - struct ifnet *ifp) +in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, + u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, + int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; int faith; - INP_INFO_LOCK_ASSERT(pcbinfo); + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + + INP_HASH_LOCK_ASSERT(pcbinfo); if (faithprefix_p != NULL) faith = (*faithprefix_p)(laddr); @@ -850,7 +1010,7 @@ /* * Then look for a wildcard match, if requested. */ - if (wildcard == INPLOOKUP_WILDCARD) { + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; @@ -901,18 +1061,128 @@ } } /* LIST_FOREACH */ - if (jail_wild != NULL) - return (jail_wild); - if (local_exact != NULL) - return (local_exact); - if (local_wild != NULL) - return (local_wild); - } /* if (wildcard == INPLOOKUP_WILDCARD) */ + inp = jail_wild; + if (inp == NULL) + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; + if (inp != NULL) + return (inp); + } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ + return (NULL); +} + +/* + * Lookup PCB in hash list, using pcbinfo tables. This variation locks the + * hash list lock, and will return the inpcb locked (i.e., requires + * INPLOOKUP_LOCKPCB). 
+ */ +static struct inpcb * +in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, + u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, + struct ifnet *ifp) +{ + struct inpcb *inp; + + INP_HASH_RLOCK(pcbinfo); + inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + if (inp != NULL) { + if (lookupflags & INPLOOKUP_WLOCKPCB) { + in_pcbref(inp); + INP_HASH_RUNLOCK(pcbinfo); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + in_pcbref(inp); + INP_HASH_RUNLOCK(pcbinfo); + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + } else + INP_HASH_RUNLOCK(pcbinfo); + return (inp); +} + +/* + * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf + * from which a pre-calculated hash value may be extracted. + * + * Possibly more of this logic should be in in6_pcbgroup.c. + */ +struct inpcb * +in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, + struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) +{ +#if defined(PCBGROUP) && !defined(RSS) + struct inpcbgroup *pcbgroup; +#endif + + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, + ("%s: LOCKPCB not set", __func__)); /* - * Not found. + * When not using RSS, use connection groups in preference to the + * reservation table when looking up 4-tuples. When using RSS, just + * use the reservation table, due to the cost of the Toeplitz hash + * in software. + * + * XXXRW: This policy belongs in the pcbgroup code, as in principle + * we could be doing RSS with a non-Toeplitz hash that is affordable + * in software. 
*/ - return (NULL); +#if defined(PCBGROUP) && !defined(RSS) + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif + return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); +} + +struct inpcb * +in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, + u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, + struct ifnet *ifp, struct mbuf *m) +{ +#ifdef PCBGROUP + struct inpcbgroup *pcbgroup; + + /* + * If we can use a hardware-generated hash to look up the connection + * group, use that connection group to find the inpcb. Otherwise + * fall back on a software hash -- or the reservation table if we're + * using RSS. + * + * XXXRW: As above, that policy belongs in the pcbgroup code. + */ + if (in_pcbgroup_enabled(pcbinfo) && + !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { + pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid); + if (pcbgroup != NULL) + return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, + fport, laddr, lport, lookupflags, ifp)); +#ifndef RSS + pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); +#endif + } +#endif + return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); } void diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet6/in6_pcb.h user/rwatson/tcp/src/sys/netinet6/in6_pcb.h --- vendor/freebsd/src/sys/netinet6/in6_pcb.h 2011-02-01 09:38:56.213433672 +0000 +++ user/rwatson/tcp/src/sys/netinet6/in6_pcb.h 2011-02-04 20:43:28.203678508 +0000 @@ -69,10 +69,22 @@ #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) +struct inpcbgroup * + in6_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); 
+struct inpcbgroup * + in6_pcbgroup_byinpcb __P((struct inpcb *)); +struct inpcbgroup * + in6_pcbgroup_bymbuf(struct inpcbinfo *, struct mbuf *); +struct inpcbgroup * + in6_pcbgroup_bytuple __P((struct inpcbinfo *, const struct in6_addr *, + u_short, const struct in6_addr *, u_short)); + void in6_pcbpurgeif0 __P((struct inpcbinfo *, struct ifnet *)); void in6_losing __P((struct inpcb *)); int in6_pcbbind __P((struct inpcb *, struct sockaddr *, struct ucred *)); int in6_pcbconnect __P((struct inpcb *, struct sockaddr *, struct ucred *)); +int in6_pcbconnect_mbuf __P((struct inpcb *, struct sockaddr *, + struct ucred *, struct mbuf *)); void in6_pcbdisconnect __P((struct inpcb *)); int in6_pcbladdr(struct inpcb *, struct sockaddr *, struct in6_addr *); struct inpcb * @@ -80,9 +92,17 @@ struct in6_addr *, u_short, int, struct ucred *)); struct inpcb * - in6_pcblookup_hash __P((struct inpcbinfo *, - struct in6_addr *, u_int, struct in6_addr *, - u_int, int, struct ifnet *)); + in6_pcblookup __P((struct inpcbinfo *, struct in6_addr *, + u_int, struct in6_addr *, u_int, int, + struct ifnet *ifp)); +struct inpcb * + in6_pcblookup_hash_locked __P((struct inpcbinfo *, struct in6_addr *, + u_int, struct in6_addr *, u_int, int, + struct ifnet *ifp)); +struct inpcb * + in6_pcblookup_mbuf __P((struct inpcbinfo *, struct in6_addr *, + u_int, struct in6_addr *, u_int, int, + struct ifnet *ifp, struct mbuf *)); void in6_pcbnotify __P((struct inpcbinfo *, struct sockaddr *, u_int, const struct sockaddr *, u_int, int, void *, struct inpcb *(*)(struct inpcb *, int))); diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet6/in6_pcbgroup.c user/rwatson/tcp/src/sys/netinet6/in6_pcbgroup.c --- vendor/freebsd/src/sys/netinet6/in6_pcbgroup.c 1970-01-01 01:00:00.000000000 +0100 +++ user/rwatson/tcp/src/sys/netinet6/in6_pcbgroup.c 2011-02-04 20:43:28.232596708 +0000 @@ -0,0 +1,158 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. 
+ * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" +#include "opt_rss.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#include +#endif /* INET6 */ + +/* + * Given a hash of whatever the covered tuple might be, return a pcbgroup + * index. 
Where RSS is supported, try to align bucket selection with RSS CPU + * affinity strategy. + */ +static __inline u_int +in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) +{ + +#ifdef RSS + return (rss_getbucket(hash)); +#else + return (hash % pcbinfo->ipi_npcbgroups); +#endif +} + +/* + * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash + * information is insufficient to identify the pcbgroup. This might occur if + * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but + * RSS is not compiled into the kernel. + */ +struct inpcbgroup * +in6_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) +{ + +#ifdef RSS + if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && + hashtype == M_HASHTYPE_RSS_4TUPLE) || + (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && + hashtype == M_HASHTYPE_RSS_2TUPLE)) + return (&pcbinfo->ipi_pcbgroups[ + in_pcbgroup_getbucket(pcbinfo, hash)]); +#endif + return (NULL); +} + +struct inpcbgroup * +in6_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) +{ + + return (in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid)); +} + +struct inpcbgroup * +in6_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, const struct in6_addr *laddrp, + u_short lport, const struct in6_addr *faddrp, u_short fport) +{ + uint32_t hash; + + /* + * RSS note: we pass foreign addr/port as source, and local addr/port + * as destination, as we want to align with what the hardware is + * doing.
+ */ + switch (pcbinfo->ipi_hashfields) { + case IPI_HASHFIELDS_4TUPLE: +#ifdef RSS + hash = rss_hash_ip6_4tuple(*faddrp, fport, *laddrp, lport); +#else + hash = faddrp->s6_addr32[3] ^ fport; +#endif + break; + + case IPI_HASHFIELDS_2TUPLE: +#ifdef RSS + hash = rss_hash_ip6_2tuple(*faddrp, *laddrp); +#else + hash = faddrp->s6_addr32[3] ^ laddrp->s6_addr32[3]; +#endif + break; /* keep the 2-tuple hash; don't fall into default */ + default: + hash = 0; + } + return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, + hash)]); +} + +struct inpcbgroup * +in6_pcbgroup_byinpcb(struct inpcb *inp) +{ + + return (in6_pcbgroup_bytuple(inp->inp_pcbinfo, &inp->in6p_laddr, + inp->inp_lport, &inp->in6p_faddr, inp->inp_fport)); +} diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet6/in6_src.c user/rwatson/tcp/src/sys/netinet6/in6_src.c --- vendor/freebsd/src/sys/netinet6/in6_src.c 2011-02-01 09:38:56.323453800 +0000 +++ user/rwatson/tcp/src/sys/netinet6/in6_src.c 2011-01-27 21:36:20.353444221 +0000 @@ -851,11 +851,11 @@ { struct socket *so = inp->inp_socket; u_int16_t lport = 0, first, last, *lastport; - int count, error, wild = 0, dorandom; + int count, error, lookupflags = 0, dorandom; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); error = prison_local_ip6(cred, laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)); @@ -864,7 +864,7 @@ /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) - wild = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; @@ -935,7 +935,7 @@ *lastport = first; lport = htons(*lastport); } while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, - lport, wild, cred)); + lport, lookupflags, cred)); inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { diff -urN -x compile -x LINT vendor/freebsd/src/sys/netinet6/udp6_usrreq.c user/rwatson/tcp/src/sys/netinet6/udp6_usrreq.c --- vendor/freebsd/src/sys/netinet6/udp6_usrreq.c
2011-02-01 09:38:57.156388712 +0000 +++ user/rwatson/tcp/src/sys/netinet6/udp6_usrreq.c 2011-02-01 08:30:21.773457362 +0000 @@ -231,11 +231,11 @@ init_sin6(&fromsa, m); fromsa.sin6_port = uh->uh_sport; - INP_INFO_RLOCK(&V_udbinfo); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct inpcb *last; struct ip6_moptions *imo; + INP_INFO_RLOCK(&V_udbinfo); /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address @@ -363,11 +363,13 @@ INP_RUNLOCK(last); return (IPPROTO_DONE); } + /* * Locate pcb for datagram. */ - inp = in6_pcblookup_hash(&V_udbinfo, &ip6->ip6_src, uh->uh_sport, - &ip6->ip6_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); + inp = in6_pcblookup_mbuf(&V_udbinfo, &ip6->ip6_src, uh->uh_sport, + &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { if (udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; @@ -384,9 +386,8 @@ if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); - goto badheadlocked; + goto badunlocked; } - INP_INFO_RUNLOCK(&V_udbinfo); if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) @@ -394,8 +395,7 @@ icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); } - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_RLOCK_ASSERT(inp); up = intoudpcb(inp); if (up->u_tun_func == NULL) { udp6_append(inp, m, off, &fromsa); @@ -505,13 +505,11 @@ (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } - INP_INFO_RLOCK(&V_udbinfo); - inp = in6_pcblookup_hash(&V_udbinfo, &addrs[1].sin6_addr, - addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 1, - NULL); + inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, + addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, + INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - 
INP_INFO_RUNLOCK(&V_udbinfo); + INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -520,10 +518,8 @@ if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_udbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -552,6 +548,7 @@ struct sockaddr_in6 tmp; INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (addr6) { /* addr6 has been validated in udp6_send(). */ @@ -769,15 +766,15 @@ } #endif - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + INP_HASH_WLOCK(&V_udbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -835,8 +832,8 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_udbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { @@ -860,8 +857,8 @@ error = in6_pcbbind(inp, nam, td->td_ucred); out: + INP_HASH_WUNLOCK(&V_udbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -882,15 +879,15 @@ return; } #endif - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + INP_HASH_WLOCK(&V_udbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -904,7 +901,9 @@ sin6 = (struct sockaddr_in6 *)nam; KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); + /* + * XXXRW: Need to clarify locking of v4/v6 flags. 
+ */ INP_WLOCK(inp); if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { @@ -918,8 +917,10 @@ error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; + INP_HASH_WLOCK(&V_udbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); if (error == 0) { inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; @@ -934,7 +935,9 @@ error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; + INP_HASH_WLOCK(&V_udbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); if (error == 0) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { /* should be non mapped addr */ @@ -945,7 +948,6 @@ } out: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -977,32 +979,31 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); - INP_WLOCK(inp); - #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; - error = (*pru->pru_disconnect)(so); - goto out; + return ((*pru->pru_disconnect)(so)); } #endif + INP_WLOCK(inp); + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto out; } + INP_HASH_WLOCK(&V_udbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + INP_HASH_WUNLOCK(&V_udbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); out: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (0); } @@ -1016,7 +1017,10 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); + /* + * XXXRW: UDPv6 locking during output is much more conservative than + * UDPv4 locking. This should be fixed. + */ INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { @@ -1065,7 +1069,6 @@ * select the UDPv4 output routine are invalidated? 
*/ INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); if (sin6) in6_sin6_2_sin_in_sock(addr); pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; @@ -1078,17 +1081,17 @@ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif + INP_HASH_WLOCK(&V_udbinfo); error = udp6_output(inp, m, addr, control, td); + INP_HASH_WUNLOCK(&V_udbinfo); #ifdef INET out: #endif INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); bad: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); m_freem(m); return (error); } diff -urN -x compile -x LINT vendor/freebsd/src/sys/sys/mbuf.h user/rwatson/tcp/src/sys/sys/mbuf.h --- vendor/freebsd/src/sys/sys/mbuf.h 2011-02-01 09:39:10.063486681 +0000 +++ user/rwatson/tcp/src/sys/sys/mbuf.h 2011-02-01 09:49:38.473420346 +0000 @@ -200,6 +200,9 @@ #define M_PROTO7 0x00100000 /* protocol-specific */ #define M_PROTO8 0x00200000 /* protocol-specific */ #define M_FLOWID 0x00400000 /* flowid is valid */ +#define M_DISTRIBUTED 0x00800000 /* at least one layer has load balanced */ +#define M_HASHTYPEBITS 0x07000000 /* mask of bits holding flowid hash type */ + /* * For RELENG_{6,7} steal these flags for limited multiple routing table * support. In RELENG_8 and beyond, use just one flag and a tag. @@ -215,11 +218,39 @@ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8) /* + * Increasingly, network interface cards are able to hash higher-level + * protocol fields (such as IP addresses and port numbers) to identify + * possible flows that received packets may belong to. NICs use these hashes + * to maintain ordering while load balancing, as well as (in many cases) + * provide a stateless affinity model. When NICs pass up the resulting hash, + * the driver will store it in m->m_pkthdr.flowid, and set m_flag bits to + * indicate how the hash should be interpreted. + * + * Most NICs support RSS, which provides ordering and explicit affinity, and + * use the hash m_flag bits to indicate what header fields were covered by + * the hash. 
M_HASHTYPE_OPAQUE can be set by non-RSS cards or configurations + * that provide a flow identifier, allowing for ordering and distribution + * without explicit affinity. + */ +#define M_HASHTYPE_SHIFT 24 +#define M_HASHTYPE_NONE 0x0 +#define M_HASHTYPE_RSS_2TUPLE 0x1 +#define M_HASHTYPE_RSS_4TUPLE 0x2 +#define M_HASHTYPE_OPAQUE 0x7 + +#define M_HASHTYPE_CLEAR(m) (m)->m_flags &= ~(M_HASHTYPEBITS) +#define M_HASHTYPE_GET(m) (((m)->m_flags & M_HASHTYPEBITS) >> \ + M_HASHTYPE_SHIFT) +#define M_HASHTYPE_SET(m, v) (m)->m_flags |= (((v) << M_HASHTYPE_SHIFT) & \ + M_HASHTYPEBITS) +#define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) + +/* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_PROTOFLAGS|M_SKIP_FIREWALL|M_BCAST|M_MCAST|\ - M_FRAG|M_FIRSTFRAG|M_LASTFRAG|M_VLANTAG|M_PROMISC|M_FIB) + M_FRAG|M_FIRSTFRAG|M_LASTFRAG|M_VLANTAG|M_PROMISC|M_FIB|M_HASHTYPEBITS) /* * External buffer types: identify ext_buf type. diff -urN -x compile -x LINT vendor/freebsd/src/sys/sys/pcpu.h user/rwatson/tcp/src/sys/sys/pcpu.h --- vendor/freebsd/src/sys/sys/pcpu.h 2011-02-01 09:39:11.207128960 +0000 +++ user/rwatson/tcp/src/sys/sys/pcpu.h 2010-06-02 18:27:06.525396180 +0100 @@ -142,7 +142,6 @@ struct vmmeter pc_cnt; /* VM stats counters */ long pc_cp_time[CPUSTATES]; /* statclock ticks */ struct device *pc_device; - void *pc_netisr; /* netisr SWI cookie */ int pc_dnweight; /* vm_page_dontneed() */ /* diff -urN -x compile -x LINT vendor/freebsd/src/sys/sys/priv.h user/rwatson/tcp/src/sys/sys/priv.h --- vendor/freebsd/src/sys/sys/priv.h 2011-02-01 09:39:11.811069405 +0000 +++ user/rwatson/tcp/src/sys/sys/priv.h 2010-06-05 20:11:07.527371933 +0100 @@ -387,6 +387,7 @@ #define PRIV_NETINET_REUSEPORT 504 /* Allow [rapid] port/address reuse. */ #define PRIV_NETINET_SETHDROPTS 505 /* Set certain IPv4/6 header options. */ #define PRIV_NETINET_BINDANY 506 /* Allow bind to any address. 
*/ +#define PRIV_NETINET_HASHKEY 507 /* Get and set hash keys for IPv4/6 */ /* * IPX/SPX privileges.