Index: share/man/man4/tcp.4 =================================================================== --- share/man/man4/tcp.4 (revision 366063) +++ share/man/man4/tcp.4 (working copy) @@ -34,7 +34,7 @@ .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd July 23, 2020 +.Dd December 3, 2020 .Dt TCP 4 .Os .Sh NAME @@ -380,7 +380,22 @@ of a socket. The available modes are the same as for .Dv TCP_TXTLS_MODE . +.It Dv TCP_REUSPORT_LB_NUMA +Changes NUMA affinity filtering for an established TCP listen +socket. +This option takes a single integer argument which specifies +the NUMA domain to filter on for this listen socket. +The argument can also have the following special values: +.Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA" +.It Dv TCP_REUSPORT_LB_NUMA_NODOM +Remove NUMA filtering for this listen socket. +.It Dv TCP_REUSPORT_LB_NUMA_CURDOM +Filter traffic associated with the domain where the calling thread is +currently executing. +This is typically used after a process or thread inherits a listen +socket from its parent, and sets its CPU affinity to a particular core. 
.El +.El .Pp The option level for the .Xr setsockopt 2 Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h (revision 366063) +++ sys/netinet/in_pcb.h (working copy) @@ -565,7 +565,7 @@ struct epoch_context il_epoch_ctx; uint16_t il_lport; /* (c) */ u_char il_vflag; /* (c) */ - u_char il_pad; + u_int8_t il_numa_domain; uint32_t il_pad2; union in_dependaddr il_dependladdr; /* (c) */ #define il_laddr il_dependladdr.id46_addr.ia46_addr4 @@ -845,6 +845,7 @@ int in_pcbinshash_mbuf(struct inpcb *, struct mbuf *); int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *); +int in_pcblbgroup_numa(struct inpcb *, int arg); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c (revision 366063) +++ sys/netinet/in_pcb.c (working copy) @@ -74,6 +74,7 @@ #endif #include +#include #include #include @@ -149,7 +150,8 @@ static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, - int lookupflags, struct ifnet *ifp); + int lookupflags, struct ifnet *ifp, + uint8_t numa_domain); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ @@ -247,7 +249,8 @@ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, - uint16_t port, const union in_dependaddr *addr, int size) + uint16_t port, const union in_dependaddr *addr, int size, + uint8_t numa_domain) { struct inpcblbgroup *grp; size_t bytes; @@ -258,6 +261,7 @@ return (NULL); grp->il_vflag = vflag; grp->il_lport = port; + grp->il_numa_domain = numa_domain; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); @@ -289,7 +293,8 @@ int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, - 
old_grp->il_lport, &old_grp->il_dependladdr, size); + old_grp->il_lport, &old_grp->il_dependladdr, size, + old_grp->il_numa_domain); if (grp == NULL) return (NULL); @@ -332,7 +337,7 @@ * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int -in_pcbinslbgrouphash(struct inpcb *inp) +in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; @@ -368,6 +373,7 @@ CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && + grp->il_numa_domain == numa_domain && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) @@ -377,7 +383,7 @@ /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, - INPCBLBGROUP_SIZMIN); + INPCBLBGROUP_SIZMIN, numa_domain); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { @@ -438,6 +444,56 @@ } } +int +in_pcblbgroup_numa(struct inpcb *inp, int arg) +{ + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + int err, i; + uint8_t numa_domain; + + switch (arg) { + case TCP_REUSPORT_LB_NUMA_NODOM: + numa_domain = M_NODOM; + break; + case TCP_REUSPORT_LB_NUMA_CURDOM: + numa_domain = PCPU_GET(domain); + break; + default: + if (arg < 0 || arg >= vm_ndomains) + return (EINVAL); + else + numa_domain = arg; + } + + err = 0; + pcbinfo = inp->inp_pcbinfo; + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(pcbinfo); + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; + CK_LIST_FOREACH(grp, hdr, il_list) { + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] != inp) + continue; + + if (grp->il_numa_domain == numa_domain) { + goto abort_with_hash_wlock; + } + /* Remove it from the old group. 
*/ + in_pcbremlbgrouphash(inp); + /* Add it to the new group based on numa domain. */ + in_pcbinslbgrouphash(inp, numa_domain); + goto abort_with_hash_wlock; + } + } + err = ENOENT; +abort_with_hash_wlock: + INP_HASH_WUNLOCK(pcbinfo); + return (err); +} + /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. @@ -730,7 +786,7 @@ if (lsa->sa_family == AF_INET) { tmpinp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags, - NULL); + NULL, M_NODOM); } #endif #ifdef INET6 @@ -737,7 +793,7 @@ if (lsa->sa_family == AF_INET6) { tmpinp = in6_pcblookup_hash_locked(pcbinfo, faddr6, fport, laddr6, lport, lookupflags, - NULL); + NULL, M_NODOM); } #endif } else { @@ -1390,7 +1446,7 @@ } if (lport != 0) { oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, - fport, laddr, lport, 0, NULL); + fport, laddr, lport, 0, NULL, M_NODOM); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; @@ -2008,9 +2064,9 @@ static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, - uint16_t fport, int lookupflags) + uint16_t fport, int lookupflags, int numa_domain) { - struct inpcb *local_wild; + struct inpcb *local_wild, *numa_wild; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; @@ -2030,6 +2086,7 @@ * - Load balanced group does not contain IPv4 mapped INET6 wild sockets */ local_wild = NULL; + numa_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) @@ -2040,12 +2097,24 @@ idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % grp->il_inpcnt; - if (grp->il_laddr.s_addr == laddr->s_addr) - return (grp->il_inp[idx]); + if (grp->il_laddr.s_addr == laddr->s_addr) { + if (numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain) { + return (grp->il_inp[idx]); + } else { + numa_wild = grp->il_inp[idx]; + } + } if 
(grp->il_laddr.s_addr == INADDR_ANY && - (lookupflags & INPLOOKUP_WILDCARD) != 0) + (lookupflags & INPLOOKUP_WILDCARD) != 0 && + (local_wild == NULL || numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain)) { local_wild = grp->il_inp[idx]; + } } + if (numa_wild != NULL) + return (numa_wild); + return (local_wild); } @@ -2292,7 +2361,7 @@ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, - struct ifnet *ifp) + struct ifnet *ifp, uint8_t numa_domain) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; @@ -2337,7 +2406,7 @@ */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, - fport, lookupflags); + fport, lookupflags, numa_domain); if (inp != NULL) return (inp); } @@ -2424,12 +2493,13 @@ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, - struct ifnet *ifp) + struct ifnet *ifp, uint8_t numa_domain) { struct inpcb *inp; inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, - (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp, + numa_domain); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); @@ -2496,7 +2566,7 @@ } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, M_NODOM)); } struct inpcb * @@ -2538,7 +2608,7 @@ } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, m->m_pkthdr.numa_domain)); } #endif /* INET */ @@ -2580,7 +2650,7 @@ */ so_options = inp_so_options(inp); if (so_options & SO_REUSEPORT_LB) { - int ret = in_pcbinslbgrouphash(inp); + int ret = in_pcbinslbgrouphash(inp, M_NODOM); if (ret) { /* pcb lb group malloc fail (ret=ENOBUFS). 
*/ return (ret); Index: sys/netinet/tcp.h =================================================================== --- sys/netinet/tcp.h (revision 366063) +++ sys/netinet/tcp.h (working copy) @@ -190,9 +190,11 @@ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ #define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ #define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */ +#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */ #define TCP_PCAP_OUT 2048 /* number of output packets to keep */ #define TCP_PCAP_IN 4096 /* number of input packets to keep */ #define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ + /* Options for Rack and BBR */ #define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */ #define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */ @@ -391,6 +393,12 @@ #define TLS_GET_RECORD 2 /* + * TCP_REUSPORT_LB_NUMA args + */ +#define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */ +#define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */ + +/* * TCP specific variables of interest for tp->t_stats stats(9) accounting. 
*/ #define VOI_TCP_TXPB 0 /* Transmit payload bytes */ Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c (revision 366063) +++ sys/netinet/tcp_usrreq.c (working copy) @@ -2143,6 +2143,16 @@ INP_WUNLOCK(inp); break; + case TCP_REUSPORT_LB_NUMA: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + INP_WLOCK_RECHECK(inp); + if (!error) + error = in_pcblbgroup_numa(inp, optval); + INP_WUNLOCK(inp); + break; + #ifdef KERN_TLS case TCP_TXTLS_ENABLE: INP_WUNLOCK(inp); Index: sys/netinet6/in6_pcb.c =================================================================== --- sys/netinet6/in6_pcb.c (revision 366063) +++ sys/netinet6/in6_pcb.c (working copy) @@ -434,7 +434,7 @@ sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &laddr6.sin6_addr : &inp->in6p_laddr, - inp->inp_lport, 0, NULL) != NULL) { + inp->inp_lport, 0, NULL, M_NODOM) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { @@ -890,9 +890,9 @@ static struct inpcb * in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr, - uint16_t fport, int lookupflags) + uint16_t fport, int lookupflags, uint8_t numa_domain) { - struct inpcb *local_wild; + struct inpcb *local_wild, *numa_wild; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; @@ -912,6 +912,7 @@ * - Load balanced does not contain IPv4 mapped INET6 wild sockets. 
*/ local_wild = NULL; + numa_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET if (!(grp->il_vflag & INP_IPV6)) @@ -922,12 +923,23 @@ idx = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport, fport) % grp->il_inpcnt; - if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) - return (grp->il_inp[idx]); + if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) { + if (numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain) { + return (grp->il_inp[idx]); + } + else + numa_wild = grp->il_inp[idx]; + } if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) && - (lookupflags & INPLOOKUP_WILDCARD) != 0) + (lookupflags & INPLOOKUP_WILDCARD) != 0 && + (local_wild == NULL || numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain)) { local_wild = grp->il_inp[idx]; + } } + if (numa_wild != NULL) + return (numa_wild); return (local_wild); } @@ -1138,7 +1150,7 @@ struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, - int lookupflags, struct ifnet *ifp) + int lookupflags, struct ifnet *ifp, uint8_t numa_domain) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; @@ -1182,7 +1194,7 @@ */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr, - fport, lookupflags); + fport, lookupflags, numa_domain); if (inp != NULL) return (inp); } @@ -1260,12 +1272,13 @@ static struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, - struct ifnet *ifp) + struct ifnet *ifp, uint8_t numa_domain) { struct inpcb *inp; inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, - (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp, + numa_domain); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); @@ -1331,7 +1344,7 @@ } #endif return 
(in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, M_NODOM)); } struct inpcb * @@ -1373,7 +1386,7 @@ } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, m->m_pkthdr.numa_domain)); } void