--- //depot/vendor/freebsd/src/sys/conf/files 2011-06-06 13:00:32.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/conf/files 2011-06-06 13:20:23.000000000 0000 @@ -2752,6 +2752,7 @@ netinet/in_proto.c optional inet | inet6 \ compile-with "${NORMAL_C} -I$S/contrib/pf" netinet/in_rmx.c optional inet +netinet/in_rss.c optional inet rss | inet6 rss netinet/ip_divert.c optional inet ipdivert ipfirewall netinet/ipfw/dn_heap.c optional inet dummynet netinet/ipfw/dn_sched_fifo.c optional inet dummynet @@ -2810,6 +2811,7 @@ netinet/tcp_timer.c optional inet | inet6 netinet/tcp_timewait.c optional inet | inet6 netinet/tcp_usrreq.c optional inet | inet6 +netinet/toeplitz.c optional inet rss | inet6 rss netinet/udp_usrreq.c optional inet | inet6 netinet/libalias/alias.c optional libalias inet | netgraph_nat inet netinet/libalias/alias_db.c optional libalias inet | netgraph_nat inet --- //depot/vendor/freebsd/src/sys/conf/options 2011-06-06 13:00:32.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/conf/options 2011-06-06 13:20:23.000000000 0000 @@ -422,6 +423,7 @@ PCBGROUP opt_pcbgroup.h RADIX_MPATH opt_mpath.h ROUTETABLES opt_route.h +RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCP_OFFLOAD_DISABLE opt_inet.h #Disable code to dispatch tcp offloading --- //depot/vendor/freebsd/src/sys/dev/cxgb/common/cxgb_common.h 2011-03-25 20:55:29.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/dev/cxgb/common/cxgb_common.h 2011-05-21 20:03:25.000000000 0000 @@ -733,6 +733,8 @@ void t3_config_rss(adapter_t *adapter, unsigned int rss_config, const u8 *cpus, const u16 *rspq); int t3_read_rss(adapter_t *adapter, u8 *lkup, u16 *map); +void t3_config_rss_secret(adapter_t *adapter, u8 *keyp); +void t3_read_rss_secret(adapter_t *adapter, u8 *keyp); int t3_set_proto_sram(adapter_t *adap, const u8 *data); int t3_mps_set_active_ports(adapter_t *adap, unsigned int port_mask); void t3_port_failover(adapter_t *adapter, int port); --- 
//depot/vendor/freebsd/src/sys/dev/cxgb/common/cxgb_t3_hw.c 2011-03-24 01:15:28.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/dev/cxgb/common/cxgb_t3_hw.c 2011-05-21 20:03:25.000000000 0000 @@ -3010,6 +3010,41 @@ } /** + * t3_config_rss_secret - write RSS key + * @adapter: the adapter + * @keyp: the 16-byte RSS key + * + * Writes the RSS hash key. + */ +void t3_config_rss_secret(adapter_t *adapter, u8 *keyp) +{ + const u32 *buf = (const u32 *)keyp; + + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY3, cpu_to_be32(*buf++)); + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY2, cpu_to_be32(*buf++)); + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY1, cpu_to_be32(*buf++)); + t3_write_reg(adapter, A_TP_RSS_SECRET_KEY0, cpu_to_be32(*buf++)); +} + +/** + * t3_read_rss_secret - read RSS key + * @adapter: the adapter + * @keyp: storage for the retrieved 16-byte RSS key + * + * Reads the RSS hash key. + */ +void t3_read_rss_secret(adapter_t *adapter, u8 *keyp) +{ + u32 *buf = (u32 *)keyp; + + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY3)); + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY2)); + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY1)); + *buf++ = be32_to_cpu(t3_read_reg(adapter, A_TP_RSS_SECRET_KEY0)); +} + + +/** * t3_tp_set_offload_mode - put TP in NIC/offload mode * @adap: the adapter * @enable: 1 to select offload mode, 0 for regular NIC --- //depot/vendor/freebsd/src/sys/dev/cxgb/cxgb_main.c 2011-03-24 01:20:16.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/dev/cxgb/cxgb_main.c 2011-05-21 20:03:25.000000000 0000 @@ -27,6 +27,8 @@ ***************************************************************************/ +#include "opt_rss.h" + #include __FBSDID("$FreeBSD: src/sys/dev/cxgb/cxgb_main.c,v 1.122 2011/03/24 01:16:48 np Exp $"); @@ -66,6 +68,7 @@ #include #include +#include #include #include #include @@ -1404,9 +1407,24 @@ { int i; u_int nq[2]; +#ifdef RSS + uint8_t key[RSS_KEYSIZE]; +#endif uint8_t cpus[SGE_QSETS + 1]; 
uint16_t rspq_map[RSS_TABLE_SIZE]; - + +#ifdef RSS + /* + * XXXRW: Implicit assumption that RSS_KEYSIZE >= the size of the key + * written by t3_config_rss_secret(). + * + * XXXRW: need to program rspq array. + * + * XXXRW: need to program cpu array. + */ + rss_getkey(key); + t3_config_rss_secret(adap, key); +#endif for (i = 0; i < SGE_QSETS; ++i) cpus[i] = i; cpus[SGE_QSETS] = 0xff; --- //depot/vendor/freebsd/src/sys/dev/cxgb/cxgb_osdep.h 2010-02-24 10:20:14.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/dev/cxgb/cxgb_osdep.h 2010-06-12 00:54:09.000000000 0000 @@ -201,6 +201,7 @@ #define max_t(type, a, b) (type)max((a), (b)) #define net_device ifnet #define cpu_to_be32 htobe32 +#define be32_to_cpu be32toh /* Standard PHY definitions */ #define BMCR_LOOPBACK BMCR_LOOP --- //depot/vendor/freebsd/src/sys/dev/cxgb/cxgb_sge.c 2011-03-24 01:20:16.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/dev/cxgb/cxgb_sge.c 2011-06-04 21:25:59.000000000 0000 @@ -2974,6 +2974,7 @@ uint32_t flags = ntohl(r->flags); uint32_t rss_csum = *(const uint32_t *)r; uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); + uint8_t hash_type = r->rss_hdr.hash_type; eth = (r->rss_hdr.opcode == CPL_RX_PKT); @@ -3022,10 +3023,32 @@ int drop_thresh = eth ? 
SGE_RX_DROP_THRES : 0; eop = get_packet(adap, drop_thresh, qs, mh, r); - if (eop) { - if (r->rss_hdr.hash_type && !adap->timestamp) - mh->mh_head->m_flags |= M_FLOWID; - mh->mh_head->m_pkthdr.flowid = rss_hash; + if (eop && !adap->timestamp) { + struct mbuf *m = rspq->rspq_mh.mh_head; + + if (hash_type) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = rss_hash; + } + switch (hash_type) { + case 0: + M_HASHTYPE_SET(m, M_HASHTYPE_NONE); + break; + + case 1: + M_HASHTYPE_SET(m, + M_HASHTYPE_RSS_IPV4); + break; + + case 2: + M_HASHTYPE_SET(m, + M_HASHTYPE_RSS_TCP_IPV4); + break; + + case 3: + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); + break; + } } ethpad = 2; --- //depot/vendor/freebsd/src/sys/modules/cxgb/cxgb/Makefile 2010-02-24 10:20:14.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/modules/cxgb/cxgb/Makefile 2010-11-29 22:02:57.000000000 0000 @@ -8,7 +8,7 @@ SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c cxgb_aq100x.c SRCS+= cxgb_sge.c cxgb_offload.c cxgb_tn1010.c SRCS+= device_if.h bus_if.h pci_if.h -SRCS+= opt_inet.h opt_zero.h opt_sched.h +SRCS+= opt_inet.h opt_rss.h opt_zero.h opt_sched.h SRCS+= uipc_mvec.c CFLAGS+= -g -DDEFAULT_JUMBO -I${CXGB} --- //depot/vendor/freebsd/src/sys/net/if_ethersubr.c 2011-06-01 20:05:15.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/net/if_ethersubr.c 2011-06-01 20:29:53.000000000 0000 @@ -36,6 +36,7 @@ #include "opt_ipx.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" +#include "opt_rss.h" #include #include @@ -69,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +108,9 @@ CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); + /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); @@ -756,7 +761,22 @@ /* * Ethernet input dispatch; by default, 
direct dispatch here regardless of - * global configuration. + * global configuration. However, if RSS is enabled, hook up RSS affinity + * so that when deferred or hybrid dispatch is enabled, we can redistribute + * load based on RSS. + * + * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or + * not it had already done work distribution via multi-queue. Then we could + * direct dispatch in the event load balancing was already complete and + * handle the case of interfaces with different capabilities better. + * + * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions + * at multiple layers? + * + * XXXRW: For now, enable all this only if RSS is compiled in, although it + * works fine without RSS. Need to characterise the performance overhead + * of the detour through the netisr code in the event the result is always + * direct dispatch. */ static void ether_nh_input(struct mbuf *m) @@ -769,8 +789,14 @@ .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, +#ifdef RSS + .nh_policy = NETISR_POLICY_CPU, + .nh_dispatch = NETISR_DISPATCH_DIRECT, + .nh_m2cpuid = rss_m2cpuid, +#else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, +#endif }; static void @@ -1048,8 +1074,6 @@ } #endif -SYSCTL_DECL(_net_link); -SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if defined(INET) || defined(INET6) SYSCTL_VNET_INT(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, &VNET_NAME(ether_ipfw), 0, "Pass ether pkts through firewall"); --- //depot/vendor/freebsd/src/sys/netinet/in_pcb.c 2011-06-06 13:00:32.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/netinet/in_pcb.c 2011-06-06 13:20:23.000000000 0000 @@ -43,6 +43,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pcbgroup.h" +#include "opt_rss.h" /* XXXRW: possibly a bug. 
*/ #include #include @@ -74,6 +75,7 @@ #if defined(INET) || defined(INET6) #include #include +#include #include #include #include @@ -1796,7 +1802,7 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { -#if defined(PCBGROUP) +#if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif @@ -1805,7 +1811,17 @@ KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); -#if defined(PCBGROUP) + /* + * When not using RSS, use connection groups in preference to the + * reservation table when looking up 4-tuples. When using RSS, just + * use the reservation table, due to the cost of the Toeplitz hash + * in software. + * + * XXXRW: This policy belongs in the pcbgroup code, as in principle + * we could be doing RSS with a non-Toeplitz hash that is affordable + * in software. + */ +#if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); @@ -1832,16 +1848,27 @@ ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP - if (in_pcbgroup_enabled(pcbinfo)) { + /* + * If we can use a hardware-generated hash to look up the connection + * group, use that connection group to find the inpcb. Otherwise + * fall back on a software hash -- or the reservation table if we're + * using RSS. + * + * XXXRW: As above, that policy belongs in the pcbgroup code. 
+ */ + if (in_pcbgroup_enabled(pcbinfo) && + !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); +#ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); +#endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, --- //depot/vendor/freebsd/src/sys/netinet/in_pcb.h 2011-06-06 13:00:32.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/netinet/in_pcb.h 2011-06-06 13:20:23.000000000 0000 @@ -38,6 +38,7 @@ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ +#include #include #include #include --- //depot/vendor/freebsd/src/sys/netinet/in_pcbgroup.c 2011-06-06 13:00:32.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/netinet/in_pcbgroup.c 2011-06-06 13:20:23.000000000 0000 @@ -29,9 +29,10 @@ #include -__FBSDID("$FreeBSD: src/sys/netinet/in_pcbgroup.c,v 1.1 2011/06/06 12:55:02 rwatson Exp $"); +__FBSDID("$FreeBSD$"); #include "opt_inet6.h" +#include "opt_rss.h" #include #include @@ -43,6 +44,7 @@ #include #include +#include #ifdef INET6 #include #endif /* INET6 */ @@ -60,6 +62,13 @@ * minimal cache line migration and lock contention during steady state * operation. * + * Hardware-offloaded hashes are often inefficient in software -- for + * example, Toeplitz, specified by RSS, introduces a significant overhead if + * performed during per-packet processing. It is therefore desirable to fall + * back on traditional reservation table lookups without affinity where + * hardware-offloaded hashes aren't available, such as for traffic over + * non-RSS interfaces. 
+ * * Internet protocols, such as UDP and TCP, register to use connection groups * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this * indicates to the connection group code whether a 2-tuple or 4-tuple is @@ -72,6 +81,11 @@ * signficantly more expensive than without connection groups, but that * steady-state processing can be significantly faster. * + * When RSS is used, certain connection group parameters, such as the number + * of groups, are provided by the RSS implementation, found in in_rss.c. + * Otherwise, in_pcbgroup.c selects possible sensible parameters + * corresponding to the degree of parallelism exposed by netisr. + * * Most of the implementation of connection groups is in this file; however, * connection group lookup is implemented in in_pcb.c alongside reservation * table lookups -- see in_pcblookup_group(). @@ -126,11 +140,25 @@ if (mp_ncpus == 1) return; +#ifdef RSS + /* + * If we're using RSS, then RSS determines the number of connection + * groups to use: one connection group per RSS bucket. If for some + * reason RSS isn't able to provide a number of buckets, disable + * connection groups entirely. + * + * XXXRW: Can this ever happen? + */ + numpcbgroups = rss_getnumbuckets(); + if (numpcbgroups == 0) + return; +#else /* - * Use one group per CPU for now. If we decide to do dynamic - * rebalancing a la RSS, we'll need to shift left by at least 1. + * Otherwise, we'll just use one per CPU for now. If we decide to + * do dynamic rebalancing a la RSS, we'll need similar logic here. */ numpcbgroups = mp_ncpus; +#endif pcbinfo->ipi_hashfields = hashfields; pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * @@ -146,10 +174,19 @@ /* * Initialise notional affinity of the pcbgroup -- for RSS, - * we want the same notion of affinity as NICs to be used. - * Just round robin for the time being. + * we want the same notion of affinity as NICs to be used. In + * the non-RSS case, just round robin for the time being. 
+ * + * XXXRW: The notion of a bucket to CPU mapping is common at + * both pcbgroup and RSS layers -- does that mean that we + * should migrate it all from RSS to here, and just leave RSS + * responsible only for providing hashing and mapping functions? */ +#ifdef RSS + pcbgroup->ipg_cpu = rss_getcpu(pgn); +#else pcbgroup->ipg_cpu = (pgn % mp_ncpus); +#endif } } @@ -179,23 +216,38 @@ /* * Given a hash of whatever the covered tuple might be, return a pcbgroup - * index. + * index. Where RSS is supported, try to align bucket selection with RSS CPU + * affinity strategy. */ static __inline u_int in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) { +#ifdef RSS + return (rss_getbucket(hash)); +#else return (hash % pcbinfo->ipi_npcbgroups); +#endif } /* * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash - * information is insufficient to identify the pcbgroup. + * information is insufficient to identify the pcbgroup. This might occur if + * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but + * RSS is not compiled into the kernel. */ struct inpcbgroup * in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) { +#ifdef RSS + if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && + hashtype == M_HASHTYPE_RSS_TCP_IPV4) || + (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && + hashtype == M_HASHTYPE_RSS_IPV4)) + return (&pcbinfo->ipi_pcbgroups[ + in_pcbgroup_getbucket(pcbinfo, hash)]); +#endif return (NULL); } @@ -213,13 +265,26 @@ { uint32_t hash; + /* + * RSS note: we pass foreign addr/port as source, and local addr/port + * as destination, as we want to align with what the hardware is + * doing. 
+ */ switch (pcbinfo->ipi_hashfields) { case IPI_HASHFIELDS_4TUPLE: +#ifdef RSS + hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport); +#else hash = faddr.s_addr ^ fport; +#endif break; case IPI_HASHFIELDS_2TUPLE: +#ifdef RSS + hash = rss_hash_ip4_2tuple(faddr, laddr); +#else hash = faddr.s_addr ^ laddr.s_addr; +#endif break; default: --- //depot/vendor/freebsd/src/sys/netinet6/in6_pcb.c 2011-06-06 13:00:32.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/netinet6/in6_pcb.c 2011-06-06 13:20:23.000000000 0000 @@ -71,6 +71,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" +#include "opt_rss.h" #include #include @@ -1126,7 +1127,7 @@ in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { -#if defined(PCBGROUP) +#if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif @@ -1135,7 +1136,17 @@ KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); -#if defined(PCBGROUP) + /* + * When not using RSS, use connection groups in preference to the + * reservation table when looking up 4-tuples. When using RSS, just + * use the reservation table, due to the cost of the Toeplitz hash + * in software. + * + * XXXRW: This policy belongs in the pcbgroup code, as in principle + * we could be doing RSS with a non-Toeplitz hash that is affordable + * in software. + */ +#if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); @@ -1162,16 +1173,27 @@ ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP - if (in_pcbgroup_enabled(pcbinfo)) { + /* + * If we can use a hardware-generated hash to look up the connection + * group, use that connection group to find the inpcb. Otherwise + * fall back on a software hash -- or the reservation table if we're + * using RSS. 
+ * + * XXXRW: As above, that policy belongs in the pcbgroup code. + */ + if (in_pcbgroup_enabled(pcbinfo) && + !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); +#ifndef RSS pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); +#endif } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, --- //depot/vendor/freebsd/src/sys/netinet6/in6_pcbgroup.c 2011-06-06 13:00:32.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/netinet6/in6_pcbgroup.c 2011-06-06 13:20:23.000000000 0000 @@ -29,38 +29,55 @@ #include -__FBSDID("$FreeBSD: src/sys/netinet6/in6_pcbgroup.c,v 1.1 2011/06/06 12:55:02 rwatson Exp $"); +__FBSDID("$FreeBSD$"); #include "opt_inet6.h" +#include "opt_rss.h" #include #include #include #include +#include #ifdef INET6 #include #endif /* INET6 */ /* * Given a hash of whatever the covered tuple might be, return a pcbgroup - * index. + * index. Where RSS is supported, try to align bucket selection with RSS CPU + * affinity strategy. */ static __inline u_int in6_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) { +#ifdef RSS + return (rss_getbucket(hash)); +#else return (hash % pcbinfo->ipi_npcbgroups); +#endif } /* * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash - * information is insufficient to identify the pcbgroup. + * information is insufficient to identify the pcbgroup. This might occur if + * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but + * RSS is not compiled into the kernel. 
*/ struct inpcbgroup * in6_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) { +#ifdef RSS + if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && + hashtype == M_HASHTYPE_RSS_TCP_IPV6) || + (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && + hashtype == M_HASHTYPE_RSS_IPV6)) + return (&pcbinfo->ipi_pcbgroups[ + in6_pcbgroup_getbucket(pcbinfo, hash)]); +#endif return (NULL); } @@ -78,13 +95,26 @@ { uint32_t hash; + /* + * RSS note: we pass foreign addr/port as source, and local addr/port + * as destination, as we want to align with what the hardware is + * doing. + */ switch (pcbinfo->ipi_hashfields) { case IPI_HASHFIELDS_4TUPLE: +#ifdef RSS + hash = rss_hash_ip6_4tuple(*faddrp, fport, *laddrp, lport); +#else hash = faddrp->s6_addr32[3] ^ fport; +#endif break; case IPI_HASHFIELDS_2TUPLE: +#ifdef RSS + hash = rss_hash_ip6_2tuple(*faddrp, *laddrp); +#else hash = faddrp->s6_addr32[3] ^ laddrp->s6_addr32[3]; +#endif break; default: --- //depot/vendor/freebsd/src/sys/sys/mbuf.h 2011-06-05 10:10:22.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/sys/mbuf.h 2011-06-06 13:20:23.000000000 0000 @@ -200,6 +200,7 @@ #define M_PROTO7 0x00100000 /* protocol-specific */ #define M_PROTO8 0x00200000 /* protocol-specific */ #define M_FLOWID 0x00400000 /* deprecated: flowid is valid */ +#define M_DISTRIBUTED 0x00800000 /* at least one layer has load balanced */ #define M_HASHTYPEBITS 0x0F000000 /* mask of bits holding flowid hash type */ /* --- //depot/vendor/freebsd/src/sys/sys/priv.h 2011-04-10 18:36:22.000000000 0000 +++ //depot/user/rwatson/tcp/src/sys/sys/priv.h 2011-05-21 20:03:25.000000000 0000 @@ -388,6 +388,7 @@ #define PRIV_NETINET_REUSEPORT 504 /* Allow [rapid] port/address reuse. */ #define PRIV_NETINET_SETHDROPTS 505 /* Set certain IPv4/6 header options. */ #define PRIV_NETINET_BINDANY 506 /* Allow bind to any address. */ +#define PRIV_NETINET_HASHKEY 507 /* Get and set hash keys for IPv4/6 */ /* * IPX/SPX privileges.