diff --git a/sys/netinet/in.h b/sys/netinet/in.h index 254401f..4776278 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -492,6 +492,8 @@ __END_DECLS #define IP_FLOWID 90 /* get flow id for the given socket/inp */ #define IP_FLOWTYPE 91 /* get flow type (M_HASHTYPE) */ #define IP_RSSBUCKETID 92 /* get RSS flowid -> bucket mapping */ +#define IP_RECVFLOWID 93 /* bool; receive IP flowid/flowtype w/ datagram */ +#define IP_RECVRSSBUCKETID 94 /* bool; receive IP RSS bucket id w/ datagram */ /* * Defaults and limits for options diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 8c73f2d..6207ddd 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -549,6 +549,8 @@ short inp_so_options(const struct inpcb *inp); #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ +#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ +#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ /* * Flags passed to in_pcblookup*() functions. diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 3a24296..28216ac 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" +#include "opt_rss.h" #include #include @@ -77,6 +78,7 @@ __FBSDID("$FreeBSD$"); #ifdef IPSEC #include #endif /* IPSEC */ +#include #include @@ -806,6 +808,9 @@ SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, * mbuf returned for further processing. Only m_tags attached * to the first packet/fragment are preserved. * The IP header is *NOT* adjusted out of iplen. + * + * XXX TODO: re-calculate the RSS flowid upon completing the received + * IP packet. 
*/ struct mbuf * ip_reass(struct mbuf *m) @@ -836,6 +841,11 @@ ip_reass(struct mbuf *m) /* * Look for queue of fragments * of this datagram. + * + * XXX TODO: make this a hash? + * + * XXX TODO: for UDP this may actually be very frequently used, + * so perhaps we need to lock this stuff better? */ TAILQ_FOREACH(fp, head, ipq_list) if (ip->ip_id == fp->ipq_id && @@ -1106,6 +1116,9 @@ found: m_fixhdr(m); IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(); + + /* XXX TODO: update flowid/flowtype details */ + return (m); dropfrag: @@ -1662,6 +1675,43 @@ makedummy: if (*mp) mp = &(*mp)->m_next; } + + if (inp->inp_flags2 & INP_RECVFLOWID) { + uint32_t flowid, flow_type; + + flowid = m->m_pkthdr.flowid; + flow_type = M_HASHTYPE_GET(m); + + /* + * XXX should handle the failure of one or the + * other - don't populate both? + */ + *mp = sbcreatecontrol((caddr_t) &flowid, + sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + *mp = sbcreatecontrol((caddr_t) &flow_type, + sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + +#ifdef RSS + if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { + uint32_t flowid, flow_type; + uint32_t rss_bucketid; + + flowid = m->m_pkthdr.flowid; + flow_type = M_HASHTYPE_GET(m); + + if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { + *mp = sbcreatecontrol((caddr_t) &rss_bucketid, + sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + } +#endif } /* diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 4aea44f..bbc7981 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -145,7 +145,23 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + + /* + * Force the flowid / flowtype for outbound data to match the + * inp. 
+ * + * For UDP that's going to be a bit special because we may be + * sending with a different source/destination address + * than the initial bind (which may be a global bind.) + * + * I'm not sure what to do about this just yet. + * Maybe the correct thing to do is to do the flowid + * assignment in the callers of ip_output() since they'll + * know if the inp flowid details are supposed to be + * used. + */ + if( ((flags & IP_NODEFAULTFLOWID) == 0) && + inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); m->m_flags |= M_FLOWID; @@ -158,6 +174,11 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, } #ifdef FLOWTABLE + /* + * Ugh - if there's no flowid assigned at this point the + * lookup call below will /also/ assign the flowtable hash + * value to the mbuf flowid. + */ if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET, m, ro); #endif @@ -720,6 +741,8 @@ bad: * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) + * + * XXX TODO: ensure the flowid/flowtype/M_FLOWID is copied into each fragment. 
*/ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, @@ -1016,6 +1039,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: + case IP_RECVFLOWID: +#ifdef RSS + case IP_RECVRSSBUCKETID: +#endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1094,6 +1121,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; + case IP_RECVFLOWID: + OPTSET2(INP_RECVFLOWID, optval); + break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && @@ -1104,6 +1134,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) error = EINVAL; } break; + case IP_RECVRSSBUCKETID: + OPTSET2(INP_RECVRSSBUCKETID, optval); + break; #endif } break; @@ -1219,8 +1252,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: + case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: + case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { @@ -1290,6 +1325,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_FLOWTYPE: optval = inp->inp_flowtype; break; + case IP_RECVFLOWID: + optval = OPTBIT2(INP_RECVFLOWID); + break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, @@ -1300,6 +1338,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) else error = EINVAL; break; + case IP_RECVRSSBUCKETID: + optval = OPTBIT2(INP_RECVRSSBUCKETID); + break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index b2251ac..fb5138d7 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -161,6 +161,7 @@ void kmod_ipstat_dec(int statnum); #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ +#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid 
from inp */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 4b565fc..be1511b 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -206,6 +206,13 @@ void udp_init(void) { + /* + * For now default to 2-tuple UDP hashing - until the fragment + * reassembly code can also update the flowid. + * + * Once we can calculate the flowid that way and re-establish + * a 4-tuple, flip this to 4-tuple. + */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); @@ -1082,6 +1089,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, u_char tos; uint8_t pr; uint16_t cscov = 0; + uint32_t flowid = 0; + int flowid_type = 0; + int use_flowid = 0; /* * udp_output() may need to temporarily bind or connect the current @@ -1145,6 +1155,34 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, tos = *(u_char *)CMSG_DATA(cm); break; + case IP_FLOWID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowid = *(uint32_t *) CMSG_DATA(cm); + break; + + case IP_FLOWTYPE: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowid_type = *(uint32_t *) CMSG_DATA(cm); + use_flowid = 1; + break; + + case IP_RSSBUCKETID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + /* + * XXX don't error out for now, but don't + * do anything + */ + break; + default: error = ENOPROTOOPT; break; @@ -1393,6 +1431,37 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); + /* + * Setup flowid / RSS information for outbound socket. + * + * Once the UDP code decides to set a flowid some other way, + * this allows the flowid to be overridden by userland. 
+ * + * Remember ip_output() overrides with the inp flowid details + * if they exist. + * + * .. and ip_output() -> flowtable_lookup() also assigns + * a flowid too. Ugh. + */ + if (use_flowid) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = flowid; + M_HASHTYPE_SET(m, flowid_type); + } + + /* + * Don't override with the inp cached flowid value. + * + * Depending upon the kind of send being done, the inp + * flowid/flowtype values may actually not be appropriate + * for this particular socket send. + * + * We should either leave the flowid at zero (which is what is + * currently done) or set it to some software generated + * hash value based on the packet contents. + */ + ipflags |= IP_NODEFAULTFLOWID; + if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED)