/*
 * Patch overview (text ahead of the first "Index:" line is commentary only).
 *
 * Unified diff touching in.h, in_pcb.c, in_pcb.h, in_pcbgroup.c, in_rss.c,
 * in_rss.h and ip_output.c:
 *
 *  - in.h: defines the IP_BINDMULTI (25) and IP_RSS_LISTEN_BUCKET (26) socket
 *    options and the IP_RSSBUCKETID (72) query option; rewords the comments
 *    on IP_FLOWID / IP_FLOWTYPE / IP_RSSCPUID.
 *  - in_pcb.h: replaces one element of inp_ispare[] with the new
 *    inp_rss_listen_bucket field and adds the INP_BINDMULTI and
 *    INP_RSS_BUCKET_SET bits for inp_flags2.
 *  - in_pcb.c: the EADDRINUSE checks are skipped when the already-bound pcb
 *    found by lookup ("t") has INP_BINDMULTI set.  Under RSS, a wildcard
 *    search of the selected pcbgroup (hash of INADDR_ANY, lport, 0) is added
 *    ahead of the existing wildcard lookup; preference order in the code is
 *    jailed exact, jailed wild, local exact, local wild, then (INET6) a
 *    v4-mapped wild match.
 *  - in_pcbgroup.c: with INP_RSS_BUCKET_SET, in_pcbgroup_byinpcb() returns
 *    ipi_pcbgroups[inp_rss_listen_bucket], in_pcbwild_needed() returns 0, and
 *    the pcbgroup hash insertion uses (INADDR_ANY, inp_lport, 0).
 *  - in_rss.c / in_rss.h: adds rss_hash2bucket(), which returns 0 and stores
 *    rss_getbucket(hash_val) for M_HASHTYPE_RSS_IPV4 / M_HASHTYPE_RSS_TCP_IPV4
 *    and returns -1 for any other hash type, plus the mbuf wrapper
 *    rss_m2bucket().
 *  - ip_output.c: adds set handling for IP_BINDMULTI and (under RSS)
 *    IP_RSS_LISTEN_BUCKET, and get handling for IP_BINDMULTI and (under RSS)
 *    IP_RSSBUCKETID, via new OPTSET2 / OPTBIT2 helpers that operate on
 *    inp_flags2.
 *
 * NOTE(review) - each item is visible in the hunks below; confirm before
 * this is committed:
 *  - Debug printf() calls are added unconditionally in in_pcb.c,
 *    in_pcbgroup.c and ip_output.c; two of them are keyed on local port 6969.
 *  - One debug printf() formats "(pcbgroup == NULL) ? -1 : ipg_cpu" with %u.
 *  - The bind-path checks test INP_BINDMULTI only on the existing pcb, not
 *    on the socket being bound; the patch's own XXX comment says the
 *    permitted cases still need to be enumerated.
 *  - IP_RSS_LISTEN_BUCKET stores inp_rss_listen_bucket before OPTSET2 takes
 *    INP_WLOCK, and the following printf() reads inp_flags2 after the lock
 *    is dropped.
 *  - in_pcbgroup_byinpcb() indexes ipi_pcbgroups[] with
 *    inp_rss_listen_bucket without a bounds check (see its XXX comment).
 *  - The INP_RSS_BUCKET_SET test in the pcbgroup hash insertion is not under
 *    #ifdef RSS, unlike the other uses in in_pcbgroup.c.
 *  - IP_RSS_LISTEN_BUCKET is added to the option-copyin case list outside
 *    #ifdef RSS while its handler is inside it; presumably the option is
 *    then accepted with no effect on non-RSS kernels - TODO confirm.
 *  - IP_FLOWTYPE keeps value 70, the same value as IP_ADD_SOURCE_MEMBERSHIP
 *    in the trailing context; whether 71 and 72 also overlap later
 *    definitions cannot be seen from this hunk.
 *  - IP_RSSBUCKETID reports 0 when rss_hash2bucket() fails, which cannot be
 *    told apart from bucket 0 (see its XXX TODO comment).
 */
Index: in.h =================================================================== --- in.h (revision 266429) +++ in.h (working copy) @@ -432,6 +432,8 @@ #define IP_ONESBCAST 23 /* bool: send all-ones broadcast */ #define IP_BINDANY 24 /* bool: allow bind to any address */ +#define IP_BINDMULTI 25 /* bool: allow multiple listeners on a tuple */ +#define IP_RSS_LISTEN_BUCKET 26 /* int; set RSS listen bucket */ /* * Options for controlling the firewall and dummynet. @@ -468,9 +470,10 @@ #define IP_MINTTL 66 /* minimum TTL for packet or drop */ #define IP_DONTFRAG 67 /* don't fragment packet */ #define IP_RECVTOS 68 /* bool; receive IP TOS w/dgram */ -#define IP_FLOWID 69 /* flow id for the given socket/inp */ -#define IP_FLOWTYPE 70 /* flow type (M_HASHTYPE) */ -#define IP_RSSCPUID 71 /* RSS flowid -> CPU id mapping */ +#define IP_FLOWID 69 /* get flow id for the given socket/inp */ +#define IP_FLOWTYPE 70 /* get flow type (M_HASHTYPE) */ +#define IP_RSSCPUID 71 /* get RSS flowid -> CPU id mapping */ +#define IP_RSSBUCKETID 72 /* get RSS flowid -> bucket mapping */ /* IPv4 Source Filter Multicast API [RFC3678] */ #define IP_ADD_SOURCE_MEMBERSHIP 70 /* join a source-specific group */ Index: in_pcb.c =================================================================== --- in_pcb.c (revision 266429) +++ in_pcb.c (working copy) @@ -588,7 +588,19 @@ * XXX * This entire block sorely needs a rewrite. */ + + /* + * XXX for now just allow INP_BINDMULTI to override all of the + * in use address checks. + * + * Later on, before this stuff goes into the upstream kernel, + * actually enumerate and tidy up the EADDRINUSE cases and + * what's permissible for BINDMULTI. E.g. - that the user credentials + * allow it (ie, you don't have some other uid deciding to listen + * on a daemon port.) 
+ */ if (t && + ((t->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && @@ -601,7 +613,9 @@ } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); - if (t && (t->inp_flags & INP_TIMEWAIT)) { + if (t && + ((t->inp_flags2 & INP_BINDMULTI) == 0) && + (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as @@ -612,7 +626,9 @@ if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); - } else if (t && (reuseport & inp_so_options(t)) == 0) { + } else if (t && + ((t->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || @@ -1559,7 +1575,89 @@ goto found; } +#ifdef RSS /* + * For incoming connections, we may wish to do a wildcard + * match for an RSS-local socket. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. 
+ */ + + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, + lport, 0, pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif + if (inp != NULL) + goto found; + } +#endif + + /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { @@ -1871,6 +1969,13 @@ KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); + if (ntohs(lport) == 6969) { + printf("%s: lport=6969, m=%p, hashtype=%d, flowid=0x%08x\n", + __func__, + m, + M_HASHTYPE_GET(m), + m->m_pkthdr.flowid); + } #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection @@ -1884,6 +1989,12 @@ !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); + if (ntohs(lport) == 6969) { + printf("%s: pcbgroup=%p, pcg_cpu=%u\n", + __func__, + pcbgroup, + (pcbgroup == NULL) ? 
-1 : pcbgroup->ipg_cpu); + } if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); Index: in_pcb.h =================================================================== --- in_pcb.h (revision 266429) +++ in_pcb.h (working copy) @@ -181,7 +181,8 @@ u_int inp_refcount; /* (i) refcount */ void *inp_pspare[5]; /* (x) route caching / general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ - u_int inp_ispare[5]; /* (x) route caching / user cookie / + uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ + u_int inp_ispare[4]; /* (x) route caching / user cookie / * general use */ /* Local and foreign ports, local and foreign addr. */ @@ -546,6 +547,8 @@ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ +#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ +#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ /* * Flags passed to in_pcblookup*() functions. Index: in_pcbgroup.c =================================================================== --- in_pcbgroup.c (revision 266429) +++ in_pcbgroup.c (working copy) @@ -297,6 +297,24 @@ struct inpcbgroup * in_pcbgroup_byinpcb(struct inpcb *inp) { + printf("%s: %p; called; rssflag=%d, listenbucket=%d\n", __func__, inp, !! (inp->inp_flags2 & INP_RSS_BUCKET_SET), inp->inp_rss_listen_bucket); +#ifdef RSS + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) + printf("%s: %p: RSS_BUCKET_SET (%d)\n", + __func__, + inp, + inp->inp_rss_listen_bucket); + /* + * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined + * RSS bucket and thus we should use this pcbgroup, rather than + * using a tuple or hash. + * + * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket + * fits in that! 
+ */ + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) + return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]); +#endif return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, inp->inp_lport, inp->inp_faddr, inp->inp_fport)); @@ -346,6 +364,21 @@ static __inline int in_pcbwild_needed(struct inpcb *inp) { + printf("%s: %p: called; rssflag=%d\n", __func__, inp, inp->inp_flags2 & INP_RSS_BUCKET_SET); +#ifdef RSS + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) + printf("%s: %p: RSS_BUCKET_SET (%d)\n", + __func__, + inp, + inp->inp_rss_listen_bucket); + /* + * If it's a listen socket and INP_RSS_BUCKET_SET is set, + * it's a wildcard socket _but_ it's in a specific pcbgroup. + * Thus we don't treat it as a pcbwild inp. + */ + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) + return (0); +#endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) @@ -398,9 +431,21 @@ #endif hashkey_faddr = inp->inp_faddr.s_addr; INP_GROUP_LOCK(newpcbgroup); - pcbhash = &newpcbgroup->ipg_hashbase[ - INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, - newpcbgroup->ipg_hashmask)]; + /* + * XXX TODO: this doesn't work for wildcard matches; + * as the wildcard address can't know about the far + * address (it has to be INADDR_ANY) or fport (it + * has to be 0.) 
+ */ + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) { + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0, + newpcbgroup->ipg_hashmask)]; + } else { + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, + newpcbgroup->ipg_hashmask)]; + } LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); inp->inp_pcbgroup = newpcbgroup; INP_GROUP_UNLOCK(newpcbgroup); @@ -430,6 +475,8 @@ INP_WLOCK_ASSERT(inp); + printf("%s: called; inp=%p\n", __func__, inp); + pcbinfo = inp->inp_pcbinfo; if (!in_pcbgroup_enabled(pcbinfo)) return; @@ -445,6 +492,13 @@ newpcbgroup = in_pcbgroup_byinpcb(inp); } else newpcbgroup = NULL; + if (newpcbgroup) { + printf("%s: %p: newpcbgroup=%p, ipg_cpu=%d\n", + __func__, + inp, + newpcbgroup, + (newpcbgroup == NULL) ? -1 : newpcbgroup->ipg_cpu); + } in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); } Index: in_rss.c =================================================================== --- in_rss.c (revision 266537) +++ in_rss.c (working copy) @@ -425,6 +425,24 @@ } /* + * Query the RSS bucket associated with the given hash value and + * type. + */ +int +rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) +{ + + switch (hash_type) { + case M_HASHTYPE_RSS_IPV4: + case M_HASHTYPE_RSS_TCP_IPV4: + *bucket_id = rss_getbucket(hash_val); + return (0); + default: + return (-1); + } +} + +/* * netisr CPU affinity lookup routine for use by protocols. */ struct mbuf * @@ -436,6 +454,16 @@ return (m); } +int +rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) +{ + + M_ASSERTPKTHDR(m); + + return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), + bucket_id)); +} + /* * Query the RSS hash algorithm. 
 */ Index: in_rss.h =================================================================== --- in_rss.h (revision 266429) +++ in_rss.h (working copy) @@ -91,5 +91,8 @@ */ struct mbuf *rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid); u_int rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type); +int rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, + uint32_t *bucket_id); +int rss_m2bucket(struct mbuf *m, uint32_t *bucket_id); #endif /* !_NETINET_IN_RSS_H_ */ Index: ip_output.c =================================================================== --- ip_output.c (revision 266429) +++ ip_output.c (working copy) @@ -908,6 +908,10 @@ { struct inpcb *inp = sotoinpcb(so); int error, optval; +#ifdef RSS + uint32_t rss_bucket; + int retval; +#endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { @@ -986,6 +990,8 @@ break; } /* FALLTHROUGH */ + case IP_BINDMULTI: + case IP_RSS_LISTEN_BUCKET: case IP_TOS: case IP_TTL: case IP_MINTTL: @@ -1028,6 +1034,15 @@ INP_WUNLOCK(inp); \ } while (0) +#define OPTSET2(bit, val) do { \ + INP_WLOCK(inp); \ + if (val) \ + inp->inp_flags2 |= bit; \ + else \ + inp->inp_flags2 &= ~bit; \ + INP_WUNLOCK(inp); \ +} while (0) + case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; @@ -1064,9 +1079,29 @@ case IP_RECVTOS: OPTSET(INP_RECVTOS); break; + case IP_BINDMULTI: + OPTSET2(INP_BINDMULTI, optval); + break; +#ifdef RSS + case IP_RSS_LISTEN_BUCKET: + if ((optval >= 0) && + (optval < rss_getnumbuckets())) { + inp->inp_rss_listen_bucket = optval; + OPTSET2(INP_RSS_BUCKET_SET, 1); + printf("%s: %p: listen_bucket=%d, flags2=0x%08x\n", + __func__, + inp, + inp->inp_rss_listen_bucket, + inp->inp_flags2); + } else { + error = EINVAL; + } + break; +#endif } break; #undef OPTSET +#undef OPTSET2 /* * Multicast socket options are processed by the in_mcast @@ -1174,10 +1209,12 @@ case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: + case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: #ifdef RSS case IP_RSSCPUID: + case IP_RSSBUCKETID: #endif 
switch (sopt->sopt_name) { @@ -1194,6 +1231,7 @@ break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) +#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); @@ -1251,7 +1289,20 @@ optval = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); break; + case IP_RSSBUCKETID: + retval = rss_hash2bucket(inp->inp_flowid, + inp->inp_flowtype, + &rss_bucket); + /* XXX TODO: return some suitable error? */ + if (retval == 0) + optval = rss_bucket; + else + optval = 0; + break; #endif + case IP_BINDMULTI: + optval = OPTBIT2(INP_BINDMULTI); + break; } error = sooptcopyout(sopt, &optval, sizeof optval); break;