Index: sys/netinet/in_rss.c =================================================================== --- sys/netinet/in_rss.c (revision 269448) +++ sys/netinet/in_rss.c (working copy) @@ -57,6 +57,11 @@ #include #include +/* for software rss hash support */ +#include +#include +#include + /*- * Operating system parts of receiver-side scaling (RSS), which allows * network cards to direct flows to particular receive queues based on hashes @@ -170,6 +175,8 @@ }; static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; +static inline u_int rss_gethashconfig_local(void); + static void rss_init(__unused void *arg) { @@ -491,6 +498,191 @@ } /* + * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given + * IPv4 source/destination address, UDP or TCP source/destination ports + * and the protocol type. + * + * The protocol code may wish to do a software hash of the given + * tuple. This depends upon the currently configured RSS hash types. + * + * dir is 0 for in, 1 for out. + * proto is the IPv4 protocol type. + */ +int +rss_software_hash_proto_v4(struct in_addr src, struct in_addr dst, + u_short src_port, u_short dst_port, int proto, int dir, + uint32_t *hashval, uint32_t *hashtype) +{ + struct in_addr s, d; + u_short sp, dp; + uint32_t hash; + + /* first, assign data appropriately */ + if (dir == 0) { + s = src; + d = dst; + sp = src_port; + dp = dst_port; + } else { + s = dst; + d = src; + sp = dst_port; + dp = src_port; + } + + /* + * Next, choose the hash type depending upon the protocol + * identifier. 
+ */ + if ((proto == IPPROTO_TCP) && + (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) { + hash = rss_hash_ip4_4tuple(s, sp, d, dp); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_TCP_IPV4; + return (0); + } else if ((proto == IPPROTO_UDP) && + (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) { + hash = rss_hash_ip4_4tuple(s, sp, d, dp); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_UDP_IPV4; + return (0); + } else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) { + /* RSS doesn't hash on other protocols like SCTP; so 2-tuple */ + hash = rss_hash_ip4_2tuple(s, d); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_IPV4; + return (0); + } + + /* No configured available hashtypes! */ + return (-1); +} + +/* + * Do a software calculation of the RSS for the given mbuf. + * + * This is typically used by the input path to recalculate the RSS after + * some form of packet processing (eg de-capsulation, IP fragment reassembly.) + * + * dir is 0 for in, 1 for out. + * + * Returns 0 if a hash was done, -1 if no hash was done, +1 if + * the mbuf already had a valid RSS flowid. + * + * This function doesn't modify the mbuf. It's up to the caller to + * assign flowid/flowtype as appropriate. + * + * TODO: Make this more efficient! + */ +int +rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval, + uint32_t *hashtype) +{ + const struct ip *ip; + const struct tcphdr *th; + const struct udphdr *uh; + uint8_t proto; + int iphlen; + + /* + * First, validate that the mbuf we have is long enough + * to have an IPv4 header in it. + */ + + if (m->m_pkthdr.len < (sizeof(struct ip))) + return (-1); + if (m->m_len < (sizeof(struct ip))) + return (-1); + + /* Ok, let's dereference that */ + ip = mtod(m, struct ip *); + proto = ip->ip_p; + /* XXX unaligned access! */ + iphlen = ip->ip_hl << 2; + + /* + * If the mbuf flowid/flowtype matches the packet type, + * then signal to the owner that it can trust the flowid/flowtype + * details. 
+ */ + if (m->m_flags & M_FLOWID) { + uint32_t flowid, flowtype; + + flowid = m->m_pkthdr.flowid; + flowtype = M_HASHTYPE_GET(m); + + switch (proto) { + case IPPROTO_UDP: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) && + flowtype == M_HASHTYPE_RSS_UDP_IPV4) { + return (1); + } + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + case IPPROTO_TCP: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) && + flowtype == M_HASHTYPE_RSS_TCP_IPV4) { + return (1); + } + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + default: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + } + } + + /* + * Decode enough information to make a hash decision. + */ + if (proto == IPPROTO_TCP) { + if (m->m_len < iphlen + sizeof(struct tcphdr)) + return (-1); + th = (struct tcphdr *)((caddr_t)ip + iphlen); + return rss_software_hash_proto_v4(ip->ip_src, ip->ip_dst, + th->th_sport, + th->th_dport, + proto, + dir, + hashval, + hashtype); + } else if (proto == IPPROTO_UDP) { + uh = (struct udphdr *)((caddr_t)ip + iphlen); + if (m->m_len < iphlen + sizeof(struct udphdr)) + return (-1); + return rss_software_hash_proto_v4(ip->ip_src, ip->ip_dst, + uh->uh_sport, + uh->uh_dport, + proto, + dir, + hashval, + hashtype); + } else { + /* Default to 2-tuple hash */ + return rss_software_hash_proto_v4(ip->ip_src, ip->ip_dst, + 0, /* source port */ + 0, /* destination port */ + 0, /* IPPROTO_IP */ + dir, + hashval, + hashtype); + } + + /* Default (shouldn't get here) - no hashing done */ + printf("%s: .. eep!\n", __func__); + return (-1); +} + +/* * Query the RSS hash algorithm. */ u_int @@ -538,15 +730,10 @@ return (rss_ncpus); } -/* - * Return the supported RSS hash configuration. 
- * - * NICs should query this to determine what to configure in their redirection - * matching table. - */ -u_int -rss_gethashconfig(void) +static inline u_int +rss_gethashconfig_local(void) { + /* Return 4-tuple for TCP; 2-tuple for others */ /* * UDP may fragment more often than TCP and thus we'll end up with @@ -573,6 +760,19 @@ } /* + * Return the supported RSS hash configuration. + * + * NICs should query this to determine what to configure in their redirection + * matching table. + */ +u_int +rss_gethashconfig(void) +{ + + return (rss_gethashconfig_local()); +} + +/* * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want * it appearing in debugging output unnecessarily. */ Index: sys/netinet/in_rss.h =================================================================== --- sys/netinet/in_rss.h (revision 269448) +++ sys/netinet/in_rss.h (working copy) @@ -116,4 +116,15 @@ uint32_t *bucket_id); int rss_m2bucket(struct mbuf *m, uint32_t *bucket_id); +/* + * Functions to calculate a software RSS hash for a given mbuf or + * packet detail. 
+ */ +int rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, + uint32_t *hashval, uint32_t *hashtype); +int rss_software_hash_proto_v4(struct in_addr src, + struct in_addr dst, u_short src_port, u_short dst_port, + int proto, int dir, uint32_t *hashval, + uint32_t *hashtype); + #endif /* !_NETINET_IN_RSS_H_ */ Index: sys/netinet/ip_input.c =================================================================== --- sys/netinet/ip_input.c (revision 269448) +++ sys/netinet/ip_input.c (working copy) @@ -37,6 +37,7 @@ #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" +#include "opt_rss.h" #include #include @@ -77,6 +78,7 @@ #ifdef IPSEC #include #endif /* IPSEC */ +#include #include @@ -119,6 +121,15 @@ &VNET_NAME(ip_do_randomid), 0, "Assign random ip_id values"); +#ifdef RSS +static int ip_reass_netisr_dispatch = 1; +#else +static int ip_reass_netisr_dispatch = 0; +#endif +SYSCTL_INT(_net_inet_ip, OID_AUTO, reass_netisr_dispatch, CTLFLAG_RW, + &ip_reass_netisr_dispatch, 0, + "IP fragment reassembly - direct=0, netisr dispatch=1, queue=2"); + /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table @@ -140,11 +151,21 @@ VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ +/* + * We may need to re-inject packets into the IP stack for further work. + * In this instance, use the CPU policy and query the RSS layer for the + * relevant CPU ID to use. + */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, +#ifdef RSS + .nh_m2cpuid = rss_m2cpuid, + .nh_policy = NETISR_POLICY_CPU, +#else .nh_policy = NETISR_POLICY_FLOW, +#endif }; extern struct domain inetdomain; @@ -817,6 +838,9 @@ int i, hlen, next; u_int8_t ecn, ecn0; u_short hash; +#ifdef RSS + uint32_t rss_hash, rss_type; +#endif /* If maxnipq or maxfragsperpacket are 0, never accept fragments. 
*/ if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { @@ -1106,6 +1130,51 @@ m_fixhdr(m); IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(); + +#ifdef RSS + /* + * Query the RSS layer for the flowid / flowtype for the + * mbuf payload. + * + * We then queue into the relevant netisr so it can be dispatched + * to the correct CPU. + * + * Note - this may return 1, which means the flowid in the mbuf + * is correct for the configured RSS hash types and can be used. + */ + if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { + m->m_pkthdr.flowid = rss_hash; + M_HASHTYPE_SET(m, rss_type); + m->m_flags |= M_FLOWID; + } +#endif + + /* + * Queue/dispatch for reprocessing. + * + * When doing queue to the same CPU, the netisr and the NIC queue + * end up taking most of the CPU, starving it of time for userland. + * This means that most of the packets do get dropped. + * + * If another CPU handles the netisr side - and/or another CPU handles + * the userland side - then performance is much, much better. + * + * So use dispatch for now. + * + * Note: Doing a dispatch with the nh_m2cpuid method and netisr + * versus handling the fragment via the normal path gives some + * pretty spectacularly crappy performance in comparison. + * That needs to be addressed. 
+ */ + if (ip_reass_netisr_dispatch == 1) { + netisr_dispatch(NETISR_IP, m); + return (NULL); + } else if (ip_reass_netisr_dispatch == 2) { + netisr_queue(NETISR_IP, m); + return (NULL); + } + + /* No netisr dispatch; handle inline */ return (m); dropfrag: Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c (revision 269448) +++ sys/netinet/ip_output.c (working copy) @@ -145,7 +145,9 @@ if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + + if (((flags & IP_NODEFAULTFLOWID) == 0) && + inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); m->m_flags |= M_FLOWID; Index: sys/netinet/ip_var.h =================================================================== --- sys/netinet/ip_var.h (revision 269448) +++ sys/netinet/ip_var.h (working copy) @@ -161,6 +161,7 @@ #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ +#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c (revision 269448) +++ sys/netinet/udp_usrreq.c (working copy) @@ -43,6 +43,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_rss.h" #include #include @@ -89,6 +90,7 @@ #include #include #include +#include #ifdef IPSEC #include @@ -206,6 +208,13 @@ udp_init(void) { + /* + * For now default to 2-tuple UDP hashing - until the fragment + * reassembly code can also update the flowid. + * + * Once we can calculate the flowid that way and re-establish + * a 4-tuple, flip this to 4-tuple. 
+ */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); @@ -1393,6 +1402,42 @@ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); +#ifdef RSS + if (pr == IPPROTO_UDP) { + uint32_t hash_val, hash_type; + /* + * Calculate an appropriate RSS hash. + * + * The called function will take care of figuring out + * whether a 2-tuple or 4-tuple hash is required based + * on the currently configured scheme. + * + * Later on, connected socket values should be + * cached in the inpcb and reused, rather than constantly + * re-calculating them. + */ + if (rss_software_hash_proto_v4(laddr, faddr, lport, fport, + IPPROTO_UDP, 1, &hash_val, &hash_type) == 0) { + m->m_pkthdr.flowid = hash_val; + m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, hash_type); + } + } +#endif + + /* + * Don't override with the inp cached flowid value. + * + * Depending upon the kind of send being done, the inp + * flowid/flowtype values may actually not be appropriate + * for this particular socket send. + * + * We should either leave the flowid at zero (which is what is + * currently done) or set it to some software generated + * hash value based on the packet contents. + */ + ipflags |= IP_NODEFAULTFLOWID; + if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c (revision 269448) +++ sys/netinet6/ip6_output.c (working copy) @@ -227,6 +227,9 @@ * * ifpp - XXX: just for statistics */ +/* + * XXX TODO: no flowid is assigned for outbound flows? 
+ */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, @@ -260,8 +263,14 @@ goto bad; } - if (inp != NULL) + if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); + if (((flags & IP_NODEFAULTFLOWID) == 0) && + (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID))) { + m->m_pkthdr.flowid = inp->inp_flowid; + m->m_flags |= M_FLOWID; + } + } finaldst = ip6->ip6_dst; bzero(&exthdrs, sizeof(exthdrs)); Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c (revision 269448) +++ sys/netinet6/udp6_usrreq.c (working copy) @@ -74,6 +74,7 @@ #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" +#include "opt_rss.h" #include #include @@ -111,6 +112,7 @@ #include #include #include +#include #include #include @@ -850,8 +852,28 @@ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } + /* + * XXX for now assume UDP is 2-tuple. + * Later on this may become configurable as 4-tuple; + * we should support that. + * + * XXX .. and we should likely cache this in the inpcb. + */ +#ifdef RSS + m->m_pkthdr.flowid = rss_hash_ip6_2tuple(*faddr, *laddr); + m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV6); +#endif flags = 0; + /* + * Don't override with the inp cached flowid. + * + * Until the whole UDP path is vetted, it may actually + * be incorrect. + */ + flags |= IP_NODEFAULTFLOWID; + UDP_PROBE(send, NULL, inp, ip6, inp, udp6); UDPSTAT_INC(udps_opackets); error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,