TX rate limiting: replace the SIOC[ASD]RATECTL ioctls with per-ifnet
callbacks (struct ifnet_txrtlmt_ops) and a driver-allocated struct txrtlmt
that is attached to the inpcb and handed to the driver via m_pkthdr.rcvif.

Review fixes folded into this revision of the patch:
 - ip_output.c / ip6_output.c: test INP_TXRTLMT_CHANGED with '&'.  The
   previous '|' made the condition true for every inpcb.
 - in_pcbattach_txrtlmt(): return the ito_alloc error instead of 0 so that
   in_pcboutput_txrtlmt() can tell failure from success and retry.

NOTE(review): in_pcboutput_txrtlmt() clears INP_TXRTLMT_CHANGED after
INP_DOWNGRADE() when the inpcb was not write-locked on entry -- confirm that
updating inp_flags2 at that point is allowed by the inpcb locking rules.

diff -r e6ac5bd286f7 -r 75a81ee0da11 sbin/ifconfig/ifconfig.c
--- a/sbin/ifconfig/ifconfig.c	Tue Oct 11 17:05:52 2016 -0700
+++ b/sbin/ifconfig/ifconfig.c	Mon Nov 07 13:00:58 2016 -0800
@@ -1145,7 +1145,7 @@ unsetifdescr(const char *val, int value,
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
 "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\30TXRTLMT"
 
 /*
  * Print the status of the interface.  If an address family was
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/kern/uipc_socket.c
--- a/sys/kern/uipc_socket.c	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/kern/uipc_socket.c	Mon Nov 07 13:00:58 2016 -0800
@@ -2501,6 +2501,9 @@ sosetopt(struct socket *so, struct socko
 #ifdef	MAC
 	struct	mac extmac;
 #endif
+#ifdef RATELIMIT
+	u_int uval;
+#endif
 
 	CURVNET_SET(so->so_vnet);
 	error = 0;
@@ -2686,11 +2689,11 @@ sosetopt(struct socket *so, struct socko
 
 		case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
-			error = sooptcopyin(sopt, &val32, sizeof(val32),
-			    sizeof(val32));
+			error = sooptcopyin(sopt, &uval, sizeof(uval),
+			    sizeof(uval));
 			if (error)
 				goto bad;
-			so->so_max_pacing_rate = val32;
+			so->so_max_pacing_rate = uval;
 #else
 			error = EOPNOTSUPP;
 #endif
@@ -2752,7 +2755,7 @@ sogetopt(struct socket *so, struct socko
 	struct	mac extmac;
 #endif
 #ifdef RATELIMIT
-	uint32_t val32;
+	u_int uval;
 #endif
 
 	CURVNET_SET(so->so_vnet);
@@ -2888,8 +2891,8 @@ integer:
 
 		case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
-			val32 = so->so_max_pacing_rate;
-			error = sooptcopyout(sopt, &val32, sizeof(val32));
+			uval = so->so_max_pacing_rate;
+			error = sooptcopyout(sopt, &uval, sizeof(uval));
 #else
 			error = EOPNOTSUPP;
 #endif
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/net/if.c
--- a/sys/net/if.c	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/net/if.c	Mon Nov 07 13:00:58 2016 -0800
@@ -2767,17 +2767,6 @@ ifioctl(struct socket *so, u_long cmd, c
 	ifr = (struct ifreq *)data;
 
 	switch (cmd) {
-	/*
-	 * The TX rate limiting IOCTLs should only be used
-	 * within the kernel. Prevent user-space from using
-	 * them:
-	 */
-	case SIOCARATECTL:
-	case SIOCSRATECTL:
-	case SIOCDRATECTL:
-		CURVNET_RESTORE();
-		return (EOPNOTSUPP);
-
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/net/if.h
--- a/sys/net/if.h	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/net/if.h	Mon Nov 07 13:00:58 2016 -0800
@@ -372,16 +372,6 @@ struct ifreq_buffer {
 };
 
 /*
- * Interface to create/delete/modify TX rate limiting.
- */
-struct ifreq_txrtlmt {
-	uint32_t txring_max_rate;	/* limit in bytes per second */
-	uint32_t txring_id;		/* driver specific value */
-	uint32_t txring_flow_id;	/* current flowid */
-	uint32_t txring_flow_type;	/* current flowtype */
-};
-
-/*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
  * definitions which begin with ifr_name.  The
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/net/if_var.h
--- a/sys/net/if_var.h	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/net/if_var.h	Mon Nov 07 13:00:58 2016 -0800
@@ -177,6 +177,28 @@ struct if_encap_req {
 
 
 /*
+ * Storage for this comes from the ifnet driver and it's free to allocate as
+ * much additional space as it wants for its own use.  The spares are for the
+ * kernel.
+ *
+ * XXXNP: need a better name before commit.
+ */
+struct txrtlmt {
+	struct ifnet *ifp;
+	void *pspare[3];	/* One for rt_ifp/pseudo_ifp, rest TBD */
+	u_int max_rate;
+	u_int spare[3];
+
+	int drv_data[];
+};
+
+struct ifnet_txrtlmt_ops {
+	int (*ito_alloc)(struct ifnet *, u_int, struct txrtlmt **);
+	int (*ito_update)(struct ifnet *, u_int, struct txrtlmt **);
+	int (*ito_free)(struct ifnet *, struct txrtlmt *);
+};
+
+/*
  * Structure defining a network interface.
  */
 struct ifnet {
@@ -303,12 +325,14 @@ struct ifnet {
 	u_int	if_hw_tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	if_hw_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 
+	struct ifnet_txrtlmt_ops *if_txrtlmt_ops;
+
 	/*
 	 * Spare fields to be added before branching a stable branch, so
 	 * that structure can be enhanced without changing the kernel
 	 * binary interface.
 	 */
-	void	*if_pspare[4];		/* packet pacing / general use */
+	void	*if_pspare[3];		/* packet pacing / general use */
 	int	if_ispare[4];		/* packet pacing / general use */
 };
 
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/netinet/in_pcb.c
--- a/sys/netinet/in_pcb.c	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/netinet/in_pcb.c	Mon Nov 07 13:00:58 2016 -0800
@@ -1146,7 +1146,7 @@ in_pcbdetach(struct inpcb *inp)
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 #ifdef RATELIMIT
-	if (inp->inp_txring_ifp != NULL)
+	if (inp->inp_txrtlmt != NULL)
 		in_pcbdetach_txrtlmt(inp);
 #endif
 	inp->inp_socket->so_pcb = NULL;
@@ -2689,197 +2689,148 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 
 #ifdef RATELIMIT
 /*
- * Modify existing TX rate limit on inp_txring_ifp and update
- * inpcb info:
+ * Create a TX rate limit on ifp and attach it to inpcb.
  */
 static int
-in_pcbmodify_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
-    uint32_t max_pacing_rate)
+in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+    u_int max_pacing_rate)
 {
-	struct ifreq_txrtlmt req;
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
-
-	req.txring_max_rate = max_pacing_rate;
-	req.txring_id = inp->inp_txring_id;
-	req.txring_flow_id = inp->inp_flowid;
-	req.txring_flow_type = inp->inp_flowtype;
+	MPASS(ifp != NULL);
+	MPASS(inp->inp_txrtlmt == NULL);
 
-	error = ifp->if_ioctl(ifp, SIOCSRATECTL, (caddr_t)&req);
+	if (!(ifp->if_capenable & IFCAP_TXRTLMT))
+		return (ENOTSUP);
+	MPASS(ifp->if_txrtlmt_ops != NULL);
+	MPASS(ifp->if_txrtlmt_ops->ito_alloc != NULL);
+
+	if_ref(ifp);
+	error = (*ifp->if_txrtlmt_ops->ito_alloc)(ifp, max_pacing_rate,
+	    &inp->inp_txrtlmt);
 
 	if (error)
-		return (error);
+		if_rele(ifp);
 
-	inp->inp_txring_max_rate = max_pacing_rate;
-	return (0);
+	return (error);
 }
 
 /*
- * Create a TX rate limit on ifp and attach it to inpcb:
- */
-static int
-in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
-    uint32_t max_pacing_rate)
-{
-	struct ifreq_txrtlmt req;
-	int error;
-
-	INP_WLOCK_ASSERT(inp);
-	KASSERT(inp->inp_txring_ifp == NULL,
-	    ("%s: inp_txring_ifp != NULL", __func__));
-
-	req.txring_max_rate = max_pacing_rate;
-	req.txring_flow_id = inp->inp_flowid;
-	req.txring_flow_type = inp->inp_flowtype;
-
-	if_ref(ifp);
-	error = ifp->if_ioctl(ifp, SIOCARATECTL, (caddr_t)&req);
-
-	if (error) {
-		if_rele(ifp);
-		return (error);
-	}
-
-	inp->inp_txring_ifp = ifp;
-	inp->inp_txring_max_rate = max_pacing_rate;
-	inp->inp_txring_id = req.txring_id;
-	return (0);
-}
-
-/*
- * Remove TX rate limit from inp_txring_ifp and detach it from
- * the inpcb:
+ * Release any tx rate limiting resources associated with this inpcb.
  */
 static void
 in_pcbdetach_txrtlmt(struct inpcb *inp)
 {
-	struct ifreq_txrtlmt req;
+	struct txrtlmt *lmt;
 	struct ifnet *ifp;
 
 	INP_WLOCK_ASSERT(inp);
 
-	KASSERT(inp->inp_txring_ifp != NULL,
-	    ("%s: inp->inp_txring_ifp == NULL", __func__));
+	lmt = inp->inp_txrtlmt;
+	if (lmt == NULL)
+		return;	/* Nothing to do */
+	ifp = lmt->ifp;
+	MPASS(ifp != NULL);
+	MPASS(ifp->if_txrtlmt_ops != NULL);
+	MPASS(ifp->if_txrtlmt_ops->ito_free != NULL);
+	(*ifp->if_txrtlmt_ops->ito_free)(ifp, lmt);
+	if_rele(ifp);
+	inp->inp_txrtlmt = NULL;
+}
 
-	ifp = inp->inp_txring_ifp;
-	req.txring_id = inp->inp_txring_id;
-	req.txring_flow_id = inp->inp_flowid;
-	req.txring_flow_type = inp->inp_flowtype;
+/*
+ * Update the tx rate limiting information associated with this inp.  Resources
+ * may be allocated or updated depending on existing state.
+ */
+int
+in_pcbmodify_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+    u_int max_pacing_rate)
+{
+	struct txrtlmt *lmt;
 
-	inp->inp_txring_ifp = NULL;
-	inp->inp_txring_id = 0;
-	inp->inp_txring_max_rate = 0;
+	INP_WLOCK_ASSERT(inp);
+
+	/* 0 means no rate limiting, release any existing resources. */
+	if (max_pacing_rate == 0) {
+		in_pcbdetach_txrtlmt(inp);
+		return (0);
+	}
+
+	lmt = inp->inp_txrtlmt;
+	if (lmt != NULL) {
+		MPASS(lmt->ifp != NULL);
+
+		if (ifp == NULL || ifp == lmt->ifp) {
+			/*
+			 * Rate limit change on the same ifnet.
+			 */
+
+			ifp = lmt->ifp;
+			MPASS(ifp->if_txrtlmt_ops != NULL);
+			MPASS(ifp->if_txrtlmt_ops->ito_update != NULL);
+			return ((*ifp->if_txrtlmt_ops->ito_update)(ifp,
+			    max_pacing_rate, &inp->inp_txrtlmt));
+		} else {
+			/*
+			 * ifnet change.  Release any existing resources first.
+			 */
+			in_pcbdetach_txrtlmt(inp);
+		}
+	}
 
 	/*
-	 * If the device was detached while we still had reference on
-	 * ifp, we assume if_dead() was called and replaced callbacks
-	 * with stubs.
+	 * Brand new rate limiter being allocated for an inpcb that doesn't have
+	 * one already.
 	 */
-	ifp->if_ioctl(ifp, SIOCDRATECTL, (caddr_t)&req);
-	if_rele(ifp);
+	MPASS(inp->inp_txrtlmt == NULL);
+	if (ifp == NULL) {
+		/*
+		 * The ifnet isn't known at this point.
+		 */
+		inp->inp_flags2 |= INP_TXRTLMT_CHANGED;
+		return (0);
+	}
+
+	return (in_pcbattach_txrtlmt(inp, ifp, max_pacing_rate));
 }
 
+
 /*
  * Track route changes and modify the TX rate limit hint in the given
  * mbuf to match what the network driver expects.
  */
 void
-in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m)
 {
-	struct socket *socket;
-	uint32_t max_pacing_rate;
-	int error;
-
-	if (inp == NULL)
-		return;
-
-	socket = inp->inp_socket;
-	if (socket == NULL)
-		return;
-
-	/*
-	 * NOTE: The so_max_pacing_rate value is read unlocked,
-	 * because atomic updates are not required since the variable
-	 * is checked at every mbuf we send. It is assumed that the
-	 * variable read itself will be atomic.
-	 */
-	max_pacing_rate = socket->so_max_pacing_rate;
-	if (max_pacing_rate == 0 && inp->inp_txring_ifp == NULL)
-		return;
-
-	/*
-	 * NOTE: When attaching to a network interface a reference is
-	 * made to ensure the network interface doesn't go away until
-	 * all ratelimit connections are gone. The network interface
-	 * pointers compared below represent valid network interfaces,
-	 * except when comparing towards NULL.
-	 */
-	if (ifp != inp->inp_txring_ifp) {
-		bool wlocked = INP_WLOCKED(inp);
+	MPASS(inp != NULL);
+	INP_LOCK_ASSERT(inp);
 
-		if (!wlocked) {
-			/*
-			 * NOTE: If the write locking fails, we need
-			 * to bail out and use the non-ratelimited
-			 * ring for the transmit until there is a new
-			 * chance to get the write lock.
-			 */
-			if (!INP_TRY_UPGRADE(inp))
-				return;
-		}
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.rcvif = NULL;
 
-		if (inp->inp_txring_ifp != NULL)
-			in_pcbdetach_txrtlmt(inp);
+	if (inp->inp_flags2 & INP_TXRTLMT_CHANGED) {
+		int error;
+		bool wlocked = INP_WLOCKED(inp);
+		struct socket *so = inp->inp_socket;
 
 		/*
-		 * In order to utilize packet pacing with RSS, we need
-		 * to wait until there is a valid RSS hash before we
-		 * can proceed:
+		 * Ensure that the inpcb is wlocked and let in_pcbmodify_txrtlmt
+		 * do the actual work.
 		 */
-		if (inp->inp_flowtype == M_HASHTYPE_NONE) {
-			if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
-				if (!wlocked)
-					INP_DOWNGRADE(inp);
-				return;
-			}
-			/* typically UDP ends up here */
-			inp->inp_flowid = mb->m_pkthdr.flowid;
-			inp->inp_flowtype = M_HASHTYPE_GET(mb);
-		}
-
-		error = in_pcbattach_txrtlmt(inp, ifp, max_pacing_rate);
-
+		if (!wlocked && !INP_TRY_UPGRADE(inp))
+			return;
+		error = in_pcbmodify_txrtlmt(inp, ifp, so->so_max_pacing_rate);
 		if (!wlocked)
 			INP_DOWNGRADE(inp);
-		if (error)
-			return;
-
-	} else if (inp->inp_txring_max_rate != max_pacing_rate) {
-		bool wlocked = INP_WLOCKED(inp);
 
-		if (!wlocked) {
-			/*
-			 * NOTE: If the write locking fails, use the
-			 * current pacing rate until there is a new
-			 * chance to write lock:
-			 */
-			if (!INP_TRY_UPGRADE(inp))
-				goto done;
+		if (error == 0 || error == ENOTSUP) {
+			/* Success or permanent failure.  Won't retry. */
+			inp->inp_flags2 &= ~INP_TXRTLMT_CHANGED;
 		}
+	}
 
-		error = in_pcbmodify_txrtlmt(inp, ifp, max_pacing_rate);
-		if (!wlocked)
-			INP_DOWNGRADE(inp);
-		if (error)
-			goto done;	/* use old rate */
-	}
-done:
-	/*
-	 * Update the flow ID and RSS hash for the transmitted mbuf.
-	 */
-	mb->m_pkthdr.flowid = inp->inp_txring_id;
-	M_HASHTYPE_SET(mb, M_HASHTYPE_TXRTLMT);
+	m->m_pkthdr.rcvif = (void *)inp->inp_txrtlmt;
 }
 #endif /* RATELIMIT */
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/netinet/in_pcb.h
--- a/sys/netinet/in_pcb.h	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/netinet/in_pcb.h	Mon Nov 07 13:00:58 2016 -0800
@@ -202,14 +202,11 @@ struct inpcb {
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
 	u_int	inp_refcount;		/* (i) refcount */
-	struct ifnet *inp_txring_ifp;	/* (i) ifp of TX ring */
-	void	*inp_pspare[4];		/* (x) packet pacing / general use */
+	struct txrtlmt *inp_txrtlmt;	/* (i) tx rate limiting information */
+	void	*inp_pspare[4];		/* (x) general use */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
-	uint32_t inp_txring_max_rate;	/* (i) driver TX ring rate */
-	uint32_t inp_txring_id;		/* (i) driver TX ring ID */
-	u_int	inp_ispare[2];		/* (x) packet pacing / user cookie /
-					 *     general use */
+	u_int	inp_ispare[4];		/* (x) user cookie / general use */
 
 	/* Local and foreign ports, local and foreign addr. */
 	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
@@ -619,6 +616,7 @@ short	inp_so_options(const struct inpcb 
 #define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 #define	INP_RECVFLOWID		0x00000100 /* populate recv datagram with flow info */
 #define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
+#define	INP_TXRTLMT_CHANGED	0x00000400 /* inp_txrtlmt needs attention */
 
 /*
  * Flags passed to in_pcblookup*() functions.
@@ -741,6 +739,7 @@ struct sockaddr *
 void	in_pcbsosetlabel(struct socket *so);
 #ifdef RATELIMIT
 void	in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
+int	in_pcbmodify_txrtlmt(struct inpcb *, struct ifnet *, u_int);
 #endif
 #endif /* _KERNEL */
 
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/netinet/ip_output.c
--- a/sys/netinet/ip_output.c	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/netinet/ip_output.c	Mon Nov 07 13:00:58 2016 -0800
@@ -663,8 +663,10 @@ sendit:
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 #ifdef RATELIMIT
-		if (ifp->if_capabilities & IFCAP_TXRTLMT)
+		if (inp != NULL && (inp->inp_txrtlmt != NULL ||
+		    inp->inp_flags2 & INP_TXRTLMT_CHANGED)) {
 			in_pcboutput_txrtlmt(inp, ifp, m);
+		}
 #endif
 		error = (*ifp->if_output)(ifp, m,
 		    (const struct sockaddr *)gw, ro);
@@ -703,8 +705,10 @@ sendit:
 
 			IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 #ifdef RATELIMIT
-			if (ifp->if_capabilities & IFCAP_TXRTLMT)
+			if (inp != NULL && (inp->inp_txrtlmt != NULL ||
+			    inp->inp_flags2 & INP_TXRTLMT_CHANGED)) {
 				in_pcboutput_txrtlmt(inp, ifp, m);
+			}
 #endif
 			error = (*ifp->if_output)(ifp, m,
 			    (const struct sockaddr *)gw, ro);
@@ -982,6 +986,17 @@ ip_ctloutput(struct socket *so, struct s
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
+			case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+				INP_WLOCK(inp);
+				in_pcbmodify_txrtlmt(inp, NULL,
+				    so->so_max_pacing_rate);
+				INP_WUNLOCK(inp);
+				error = 0;
+#else
+				error = ENOTSUP;
+#endif
+				break;
 			default:
 				break;
 			}
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/netinet6/ip6_output.c
--- a/sys/netinet6/ip6_output.c	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/netinet6/ip6_output.c	Mon Nov 07 13:00:58 2016 -0800
@@ -956,8 +956,10 @@ passout:
 				ifa_free(&ia6->ia_ifa);
 		}
 #ifdef RATELIMIT
-		if (ifp->if_capabilities & IFCAP_TXRTLMT)
+		if (inp != NULL && (inp->inp_txrtlmt != NULL ||
+		    inp->inp_flags2 & INP_TXRTLMT_CHANGED)) {
 			in_pcboutput_txrtlmt(inp, ifp, m);
+		}
 #endif
 		error = nd6_output_ifp(ifp, origifp, m, dst,
 		    (struct route *)ro);
@@ -1060,8 +1062,10 @@ sendorfree:
 				    m->m_pkthdr.len);
 			}
 #ifdef RATELIMIT
-			if (ifp->if_capabilities & IFCAP_TXRTLMT)
+			if (inp != NULL && (inp->inp_txrtlmt != NULL ||
+			    inp->inp_flags2 & INP_TXRTLMT_CHANGED)) {
 				in_pcboutput_txrtlmt(inp, ifp, m);
+			}
 #endif
 			error = nd6_output_ifp(ifp, origifp, m, dst,
 			    (struct route *)ro);
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/sys/mbuf.h
--- a/sys/sys/mbuf.h	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/sys/mbuf.h	Mon Nov 07 13:00:58 2016 -0800
@@ -345,7 +345,6 @@ struct mbuf {
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple +
 							    * ext hdrs */
 
-#define	M_HASHTYPE_TXRTLMT		62	/* rate limited TX traffic */
 #define	M_HASHTYPE_OPAQUE		63	/* ordering, not affinity */
 #define	M_HASHTYPE_OPAQUE_HASH		M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE)
 						/* ordering+hash, not affinity*/
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/sys/socketvar.h
--- a/sys/sys/socketvar.h	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/sys/socketvar.h	Mon Nov 07 13:00:58 2016 -0800
@@ -79,7 +79,6 @@ struct socket {
 	void	*so_pcb;		/* protocol control block */
 	struct	vnet *so_vnet;		/* (a) network stack instance */
 	struct	protosw *so_proto;	/* (a) protocol handle */
-	uint32_t so_max_pacing_rate;	/* (f) TX pacing rate info */
 /*
  * Variables for connection queuing.
  * Socket where accepts occur is so_head in all subsidiary sockets.
@@ -129,7 +128,8 @@ struct socket {
 	uint32_t so_user_cookie;
 
 	void *so_pspare[2];	/* packet pacing / general use */
-	int so_ispare[2];	/* packet pacing / general use */
+	u_int so_max_pacing_rate;/* (f) TX pacing rate info */
+	int so_ispare[1];	/* packet pacing / general use */
 };
 
 /*
diff -r e6ac5bd286f7 -r 75a81ee0da11 sys/sys/sockio.h
--- a/sys/sys/sockio.h	Tue Oct 11 17:05:52 2016 -0700
+++ b/sys/sys/sockio.h	Mon Nov 07 13:00:58 2016 -0800
@@ -133,8 +133,4 @@
 #define	SIOCGIFGMEMB	_IOWR('i', 138, struct ifgroupreq) /* get members */
 #define	SIOCGIFXMEDIA	_IOWR('i', 139, struct ifmediareq) /* get net xmedia */
 
-#define	SIOCARATECTL	_IOWR('i', 140, struct ifreq_txrtlmt) /* add TX rate limit */
-#define	SIOCSRATECTL	_IOWR('i', 141, struct ifreq_txrtlmt) /* set TX rate limit */
-#define	SIOCDRATECTL	_IOW('i', 142, struct ifreq_txrtlmt) /* del TX rate limit */
-
 #endif /* !_SYS_SOCKIO_H_ */