diff -r 54d8e9872bb6 sys/conf/files --- a/sys/conf/files Sat Jan 28 20:54:43 2017 +0000 +++ b/sys/conf/files Sun Jan 29 21:15:45 2017 -0800 @@ -1347,6 +1347,8 @@ dev/cxgbe/t4_sge.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_l2t.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" +dev/cxgbe/t4_lro.c optional cxgbe pci \ + compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_tracer.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_vf.c optional cxgbev pci \ diff -r 54d8e9872bb6 sys/dev/cxgbe/adapter.h --- a/sys/dev/cxgbe/adapter.h Sat Jan 28 20:54:43 2017 +0000 +++ b/sys/dev/cxgbe/adapter.h Sun Jan 29 21:15:45 2017 -0800 @@ -52,6 +52,7 @@ #include <net/if_media.h> #include <netinet/in.h> #include <netinet/tcp_lro.h> +#include "t4_lro.h" #include "offload.h" #include "t4_ioctl.h" @@ -322,6 +323,8 @@ enum { IQ_HAS_FL = (1 << 1), /* iq associated with a freelist */ IQ_INTR = (1 << 2), /* iq takes direct interrupt */ IQ_LRO_ENABLED = (1 << 3), /* iq is an eth rxq with LRO enabled */ + IQ_LRO2_ENABLED = (1 << 4), /* iq is an eth rxq with LRO2 enabled */ + IQ_ADJ_CREDIT = (1 << 5), /* hw is off by 1 credit for this iq */ /* iq state */ IQS_DISABLED = 0, @@ -543,6 +546,7 @@ struct sge_rxq { struct ifnet *ifp; /* the interface this rxq belongs to */ #if defined(INET) || defined(INET6) struct lro_ctrl lro; /* LRO state */ + struct t4_lro_ctrl t4_lro; #endif /* stats for common events first */ @@ -1144,7 +1148,9 @@ int t4_setup_vi_queues(struct vi_info *) int t4_teardown_vi_queues(struct vi_info *); void t4_intr_all(void *); void t4_intr(void *); +void t4_intr2(void *); void t4_vi_intr(void *); +void t4_vi_intr2(void *); void t4_intr_err(void *); void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_lro.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/dev/cxgbe/t4_lro.c Sun Jan 29 21:15:45 2017 -0800 @@ -0,0 +1,982 @@ +/*- + * Copyright (c) 2007, Myricom Inc. + * Copyright (c) 2008, Intel Corporation. + * Copyright (c) 2012 The FreeBSD Foundation + * Copyright (c) 2016 Mellanox Technologies. + * All rights reserved. + * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/ethernet.h> +#include <net/vnet.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip6.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include "t4_lro.h" +#include <netinet/tcp_var.h> + +#include <netinet6/ip6_var.h> + +#include <machine/in_cksum.h> + +MALLOC_DECLARE(M_CXGBE); + +#define TCP_LRO_UPDATE_CSUM 1 +#ifndef TCP_LRO_UPDATE_CSUM +#define TCP_LRO_INVALID_CSUM 0x0000 +#endif + +static void t4_lro_rx_done(struct t4_lro_ctrl *lc); +static int t4_lro_rx2(struct t4_lro_ctrl *lc, struct mbuf *m, + uint32_t csum, int use_hash); + +#if 0 +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "TCP LRO"); + +static unsigned t4_lro_entries = T4_LRO_ENTRIES; +SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, t4entries, + CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &t4_lro_entries, 0, + "default number of T4 LRO entries"); +#endif + +static __inline void +t4_lro_active_insert(struct t4_lro_ctrl *lc, struct t4_lro_head *bucket, + struct t4_lro_entry *le) +{ + + LIST_INSERT_HEAD(&lc->lro_active, le, next); + LIST_INSERT_HEAD(bucket, le, hash_next); +} + +static __inline void +t4_lro_active_remove(struct t4_lro_entry *le) +{ + + LIST_REMOVE(le, next); /* active list */ + LIST_REMOVE(le, hash_next); /* hash bucket */ +} + +int +t4_lro_init(struct t4_lro_ctrl *lc) +{ + return (t4_lro_init_args(lc, NULL, T4_LRO_ENTRIES, 0)); +} + +int +t4_lro_init_args(struct t4_lro_ctrl *lc, struct ifnet *ifp, + unsigned lro_entries, unsigned lro_mbufs) +{ + struct t4_lro_entry *le; + size_t size; + unsigned i, elements; + + lc->lro_bad_csum = 0; + lc->lro_queued = 0; + lc->lro_flushed = 0; + lc->lro_cnt = 0; + lc->lro_mbuf_count = 0; + lc->lro_mbuf_max = lro_mbufs; + lc->lro_cnt = lro_entries; + lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; + lc->lro_length_lim = TCP_LRO_LENGTH_MAX; + lc->ifp = ifp; + LIST_INIT(&lc->lro_free); + LIST_INIT(&lc->lro_active); + + /* create hash table to accelerate entry lookup */ + if (lro_entries > lro_mbufs) + elements = lro_entries; + else + elements = lro_mbufs; + lc->lro_hash = phashinit_flags(elements, M_CXGBE, &lc->lro_hashsz, + HASH_NOWAIT); + if (lc->lro_hash == NULL) { + memset(lc, 0, sizeof(*lc)); + return (ENOMEM); + } + + /* compute size to allocate */ + size = (lro_mbufs * sizeof(struct t4_lro_mbuf_sort)) + + (lro_entries * sizeof(*le)); + lc->lro_mbuf_data = (struct t4_lro_mbuf_sort *) + malloc(size, M_CXGBE, M_NOWAIT | M_ZERO); + + /* check for out of memory */ + if (lc->lro_mbuf_data == NULL) { + memset(lc, 0, sizeof(*lc)); + return (ENOMEM); + } + /* compute offset for LRO entries */ + le = (struct t4_lro_entry *) + (lc->lro_mbuf_data + lro_mbufs); + 
+ /* setup linked list */ + for (i = 0; i != lro_entries; i++) + LIST_INSERT_HEAD(&lc->lro_free, le + i, next); + + return (0); +} + +void +t4_lro_free(struct t4_lro_ctrl *lc) +{ + struct t4_lro_entry *le; + unsigned x; + + /* reset LRO free list */ + LIST_INIT(&lc->lro_free); + + /* free active mbufs, if any */ + while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { + t4_lro_active_remove(le); + m_freem(le->m_head); + } + + /* free hash table */ + if (lc->lro_hash != NULL) { + free(lc->lro_hash, M_CXGBE); + lc->lro_hash = NULL; + } + lc->lro_hashsz = 0; + + /* free mbuf array, if any */ + for (x = 0; x != lc->lro_mbuf_count; x++) + m_freem(lc->lro_mbuf_data[x].mb); + lc->lro_mbuf_count = 0; + + /* free allocated memory, if any */ + free(lc->lro_mbuf_data, M_CXGBE); + lc->lro_mbuf_data = NULL; +} + +#ifdef TCP_LRO_UPDATE_CSUM +static uint16_t +t4_lro_csum_th(struct tcphdr *th) +{ + uint32_t ch; + uint16_t *p, l; + + ch = th->th_sum = 0x0000; + l = th->th_off; + p = (uint16_t *)th; + while (l > 0) { + ch += *p; + p++; + ch += *p; + p++; + l--; + } + while (ch > 0xffff) + ch = (ch >> 16) + (ch & 0xffff); + + return (ch & 0xffff); +} + +static uint16_t +t4_lro_rx_csum_fixup(struct t4_lro_entry *le, void *l3hdr, struct tcphdr *th, + uint16_t tcp_data_len, uint16_t csum) +{ + uint32_t c; + uint16_t cs; + + c = csum; + + /* Remove length from checksum. */ + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)l3hdr; + if (le->append_cnt == 0) + cs = ip6->ip6_plen; + else { + uint32_t cx; + + cx = ntohs(ip6->ip6_plen); + cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); + } + break; + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + struct ip *ip4; + + ip4 = (struct ip *)l3hdr; + if (le->append_cnt == 0) + cs = ip4->ip_len; + else { + cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), + IPPROTO_TCP); + cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, + htons(cs)); + } + break; + } +#endif + default: + cs = 0; /* Keep compiler happy. */ + } + + cs = ~cs; + c += cs; + + /* Remove TCP header csum. */ + cs = ~t4_lro_csum_th(th); + c += cs; + while (c > 0xffff) + c = (c >> 16) + (c & 0xffff); + + return (c & 0xffff); +} +#endif + +static void +t4_lro_rx_done(struct t4_lro_ctrl *lc) +{ + struct t4_lro_entry *le; + + while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { + t4_lro_active_remove(le); + t4_lro_flush(lc, le); + } +} + +void +t4_lro_flush_inactive(struct t4_lro_ctrl *lc, const struct timeval *timeout) +{ + struct t4_lro_entry *le, *le_tmp; + struct timeval tv; + + if (LIST_EMPTY(&lc->lro_active)) + return; + + getmicrotime(&tv); + timevalsub(&tv, timeout); + LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { + if (timevalcmp(&tv, &le->mtime, >=)) { + t4_lro_active_remove(le); + t4_lro_flush(lc, le); + } + } +} + +void +t4_lro_flush(struct t4_lro_ctrl *lc, struct t4_lro_entry *le) +{ + + if (le->append_cnt > 0) { + struct tcphdr *th; + uint16_t p_len; + + p_len = htons(le->p_len); + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + struct ip6_hdr *ip6; + + ip6 = le->le_ip6; + ip6->ip6_plen = p_len; + th = (struct tcphdr *)(ip6 + 1); + le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | + CSUM_PSEUDO_HDR; + le->p_len += ETHER_HDR_LEN + sizeof(*ip6); + break; + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + struct ip *ip4; +#ifdef TCP_LRO_UPDATE_CSUM + uint32_t cl; + uint16_t c; +#endif + + ip4 = le->le_ip4; +#ifdef TCP_LRO_UPDATE_CSUM + /* Fix IP header checksum for new length. 
*/ + c = ~ip4->ip_sum; + cl = c; + c = ~ip4->ip_len; + cl += c + p_len; + while (cl > 0xffff) + cl = (cl >> 16) + (cl & 0xffff); + c = cl; + ip4->ip_sum = ~c; +#else + ip4->ip_sum = TCP_LRO_INVALID_CSUM; +#endif + ip4->ip_len = p_len; + th = (struct tcphdr *)(ip4 + 1); + le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | + CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; + le->p_len += ETHER_HDR_LEN; + break; + } +#endif + default: + th = NULL; /* Keep compiler happy. */ + } + le->m_head->m_pkthdr.csum_data = 0xffff; + le->m_head->m_pkthdr.len = le->p_len; + + /* Incorporate the latest ACK into the TCP header. */ + th->th_ack = le->ack_seq; + th->th_win = le->window; + /* Incorporate latest timestamp into the TCP header. */ + if (le->timestamp != 0) { + uint32_t *ts_ptr; + + ts_ptr = (uint32_t *)(th + 1); + ts_ptr[1] = htonl(le->tsval); + ts_ptr[2] = le->tsecr; + } +#ifdef TCP_LRO_UPDATE_CSUM + /* Update the TCP header checksum. */ + le->ulp_csum += p_len; + le->ulp_csum += t4_lro_csum_th(th); + while (le->ulp_csum > 0xffff) + le->ulp_csum = (le->ulp_csum >> 16) + + (le->ulp_csum & 0xffff); + th->th_sum = (le->ulp_csum & 0xffff); + th->th_sum = ~th->th_sum; +#else + th->th_sum = TCP_LRO_INVALID_CSUM; +#endif + } + + le->m_head->m_pkthdr.lro_nsegs = le->append_cnt + 1; + (*lc->ifp->if_input)(lc->ifp, le->m_head); + lc->lro_queued += le->append_cnt + 1; + lc->lro_flushed++; + bzero(le, sizeof(*le)); + LIST_INSERT_HEAD(&lc->lro_free, le, next); +} + +#ifdef HAVE_INLINE_FLSLL +#define t4_lro_msb_64(x) (1ULL << (flsll(x) - 1)) +#else +static inline uint64_t +t4_lro_msb_64(uint64_t x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + x |= (x >> 32); + return (x & ~(x >> 1)); +} +#endif + +/* + * The t4_lro_sort() routine is comparable to qsort(), except it has + * a worst case complexity limit of O(MIN(N,64)*N), where N is the + * number of elements to sort and 64 is the number of sequence bits + * available. The algorithm is bit-slicing the 64-bit sequence number, + * sorting one bit at a time from the most significant bit until the + * least significant one, skipping the constant bits. This is + * typically called a radix sort. 
+ */ +static void +t4_lro_sort(struct t4_lro_mbuf_sort *parray, uint32_t size) +{ + struct t4_lro_mbuf_sort temp; + uint64_t ones; + uint64_t zeros; + uint32_t x; + uint32_t y; + +repeat: + /* for small arrays insertion sort is faster */ + if (size <= 12) { + for (x = 1; x < size; x++) { + temp = parray[x]; + for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) + parray[y] = parray[y - 1]; + parray[y] = temp; + } + return; + } + + /* compute sequence bits which are constant */ + ones = 0; + zeros = 0; + for (x = 0; x != size; x++) { + ones |= parray[x].seq; + zeros |= ~parray[x].seq; + } + + /* compute bits which are not constant into "ones" */ + ones &= zeros; + if (ones == 0) + return; + + /* pick the most significant bit which is not constant */ + ones = t4_lro_msb_64(ones); + + /* + * Move entries having cleared sequence bits to the beginning + * of the array: + */ + for (x = y = 0; y != size; y++) { + /* skip set bits */ + if (parray[y].seq & ones) + continue; + /* swap entries */ + temp = parray[x]; + parray[x] = parray[y]; + parray[y] = temp; + x++; + } + + KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); + + /* sort zeros */ + t4_lro_sort(parray, x); + + /* sort ones */ + parray += x; + size -= x; + goto repeat; +} + +void +t4_lro_flush_all(struct t4_lro_ctrl *lc) +{ + uint64_t seq; + uint64_t nseq; + unsigned x; + + /* check if no mbufs to flush */ + if (lc->lro_mbuf_count == 0) + goto done; + + /* sort all mbufs according to stream */ + t4_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); + + /* input data into LRO engine, stream by stream */ + seq = 0; + for (x = 0; x != lc->lro_mbuf_count; x++) { + struct mbuf *mb; + + /* get mbuf */ + mb = lc->lro_mbuf_data[x].mb; + + /* get sequence number, masking away the packet index */ + nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); + + /* check for new stream */ + if (seq != nseq) { + seq = nseq; + + /* flush active streams */ + t4_lro_rx_done(lc); + } + + /* add packet to LRO engine */ + if (t4_lro_rx2(lc, mb, 0, 0) != 0) { + /* input packet to network layer */ + (*lc->ifp->if_input)(lc->ifp, mb); + lc->lro_queued++; + lc->lro_flushed++; + } + } +done: + /* flush active streams */ + t4_lro_rx_done(lc); + + lc->lro_mbuf_count = 0; +} + +#ifdef INET6 +static int +t4_lro_rx_ipv6(struct t4_lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, + struct tcphdr **th) +{ + + /* XXX-BZ we should check the flow-label. */ + + /* XXX-BZ We do not yet support ext. hdrs. */ + if (ip6->ip6_nxt != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Find the TCP header. */ + *th = (struct tcphdr *)(ip6 + 1); + + return (0); +} +#endif + +#ifdef INET +static int +t4_lro_rx_ipv4(struct t4_lro_ctrl *lc, struct mbuf *m, struct ip *ip4, + struct tcphdr **th) +{ + int csum_flags; + uint16_t csum; + + if (ip4->ip_p != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Ensure there are no options. */ + if ((ip4->ip_hl << 2) != sizeof (*ip4)) + return (TCP_LRO_CANNOT); + + /* .. and the packet is not fragmented. */ + if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) + return (TCP_LRO_CANNOT); + + /* Legacy IP has a header checksum that needs to be correct. */ + csum_flags = m->m_pkthdr.csum_flags; + if (csum_flags & CSUM_IP_CHECKED) { + if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); + } + } else { + csum = in_cksum_hdr(ip4); + if (__predict_false((csum) != 0)) { + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); + } + } + + /* Find the TCP header (we assured there are no IP options). 
*/ + *th = (struct tcphdr *)(ip4 + 1); + + return (0); +} +#endif + +static int +t4_lro_rx2(struct t4_lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) +{ + struct t4_lro_entry *le; + struct ether_header *eh; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip4 = NULL; /* Keep compiler happy. */ +#endif + struct tcphdr *th; + void *l3hdr = NULL; /* Keep compiler happy. */ + uint32_t *ts_ptr; + tcp_seq seq; + int error, ip_len, l; + uint16_t eh_type, tcp_data_len; + struct t4_lro_head *bucket; + int force_flush = 0; + + /* We expect a contiguous header [eh, ip, tcp]. */ + + eh = mtod(m, struct ether_header *); + eh_type = ntohs(eh->ether_type); + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + CURVNET_SET(lc->ifp->if_vnet); + if (V_ip6_forwarding != 0) { + /* XXX-BZ stats but changing lro_ctrl is a problem. */ + CURVNET_RESTORE(); + return (TCP_LRO_CANNOT); + } + CURVNET_RESTORE(); + l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); + error = t4_lro_rx_ipv6(lc, m, ip6, &th); + if (error != 0) + return (error); + tcp_data_len = ntohs(ip6->ip6_plen); + ip_len = sizeof(*ip6) + tcp_data_len; + break; + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + CURVNET_SET(lc->ifp->if_vnet); + if (V_ipforwarding != 0) { + /* XXX-BZ stats but changing lro_ctrl is a problem. */ + CURVNET_RESTORE(); + return (TCP_LRO_CANNOT); + } + CURVNET_RESTORE(); + l3hdr = ip4 = (struct ip *)(eh + 1); + error = t4_lro_rx_ipv4(lc, m, ip4, &th); + if (error != 0) + return (error); + ip_len = ntohs(ip4->ip_len); + tcp_data_len = ip_len - sizeof(*ip4); + break; + } +#endif + /* XXX-BZ what happens in case of VLAN(s)? */ + default: + return (TCP_LRO_NOT_SUPPORTED); + } + + /* + * If the frame is padded beyond the end of the IP packet, then we must + * trim the extra bytes off. + */ + l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); + if (l != 0) { + if (l < 0) + /* Truncated packet. */ + return (TCP_LRO_CANNOT); + + m_adj(m, -l); + } + + /* + * Check TCP header constraints. + */ + /* Ensure no bits set besides ACK or PSH. */ + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { + if (th->th_flags & TH_SYN) + return (TCP_LRO_CANNOT); + /* + * Make sure that previously seen segements/ACKs are delivered + * before this segement, e.g. FIN. + */ + force_flush = 1; + } + + /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */ + /* XXX-BZ Ideally we'd flush on PUSH? */ + + /* + * Check for timestamps. + * Since the only option we handle are timestamps, we only have to + * handle the simple case of aligned timestamps. + */ + l = (th->th_off << 2); + tcp_data_len -= l; + l -= sizeof(*th); + ts_ptr = (uint32_t *)(th + 1); + if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * Make sure that previously seen segements/ACKs are delivered + * before this segement. + */ + force_flush = 1; + } + + /* If the driver did not pass in the checksum, set it now. 
*/ + if (csum == 0x0000) + csum = th->th_sum; + + seq = ntohl(th->th_seq); + + if (!use_hash) { + bucket = &lc->lro_hash[0]; + } else if (M_HASHTYPE_ISHASH(m)) { + bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz]; + } else { + uint32_t hash; + + switch (eh_type) { +#ifdef INET + case ETHERTYPE_IP: + hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + hash = ip6->ip6_src.s6_addr32[0] + + ip6->ip6_dst.s6_addr32[0]; + hash += ip6->ip6_src.s6_addr32[1] + + ip6->ip6_dst.s6_addr32[1]; + hash += ip6->ip6_src.s6_addr32[2] + + ip6->ip6_dst.s6_addr32[2]; + hash += ip6->ip6_src.s6_addr32[3] + + ip6->ip6_dst.s6_addr32[3]; + break; +#endif + default: + hash = 0; + break; + } + hash += th->th_sport + th->th_dport; + bucket = &lc->lro_hash[hash % lc->lro_hashsz]; + } + + /* Try to find a matching previous segment. */ + LIST_FOREACH(le, bucket, hash_next) { + if (le->eh_type != eh_type) + continue; + if (le->source_port != th->th_sport || + le->dest_port != th->th_dport) + continue; + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + if (bcmp(&le->source_ip6, &ip6->ip6_src, + sizeof(struct in6_addr)) != 0 || + bcmp(&le->dest_ip6, &ip6->ip6_dst, + sizeof(struct in6_addr)) != 0) + continue; + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + if (le->source_ip4 != ip4->ip_src.s_addr || + le->dest_ip4 != ip4->ip_dst.s_addr) + continue; + break; +#endif + } + + if (force_flush) { + /* Timestamps mismatch; this is a FIN, etc */ + t4_lro_active_remove(le); + t4_lro_flush(lc, le); + return (TCP_LRO_CANNOT); + } + + /* Flush now if appending will result in overflow. */ + if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { + t4_lro_active_remove(le); + t4_lro_flush(lc, le); + break; + } + + /* Try to append the new segment. */ + if (__predict_false(seq != le->next_seq || + (tcp_data_len == 0 && le->ack_seq == th->th_ack))) { + /* Out of order packet or duplicate ACK. */ + t4_lro_active_remove(le); + t4_lro_flush(lc, le); + return (TCP_LRO_CANNOT); + } + + if (l != 0) { + uint32_t tsval = ntohl(*(ts_ptr + 1)); + /* Make sure timestamp values are increasing. */ + /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ + if (__predict_false(le->tsval > tsval || + *(ts_ptr + 2) == 0)) + return (TCP_LRO_CANNOT); + le->tsval = tsval; + le->tsecr = *(ts_ptr + 2); + } + + le->next_seq += tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + le->append_cnt++; + +#ifdef TCP_LRO_UPDATE_CSUM + le->ulp_csum += t4_lro_rx_csum_fixup(le, l3hdr, th, + tcp_data_len, ~csum); +#endif + + if (tcp_data_len == 0) { + m_freem(m); + /* + * Flush this LRO entry, if this ACK should not + * be further delayed. + */ + if (le->append_cnt >= lc->lro_ackcnt_lim) { + t4_lro_active_remove(le); + t4_lro_flush(lc, le); + } + return (0); + } + + le->p_len += tcp_data_len; + + /* + * Adjust the mbuf so that m_data points to the first byte of + * the ULP payload. Adjust the mbuf to avoid complications and + * append new segment to existing mbuf chain. + */ + m_adj(m, m->m_pkthdr.len - tcp_data_len); + m_demote_pkthdr(m); + + le->m_tail->m_next = m; + le->m_tail = m_last(m); + + /* + * If a possible next full length packet would cause an + * overflow, pro-actively flush now. + */ + if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) { + t4_lro_active_remove(le); + t4_lro_flush(lc, le); + } else + getmicrotime(&le->mtime); + + return (0); + } + + if (force_flush) { + /* + * Nothing to flush, but this segment can not be further + * aggregated/delayed. 
+ */ + return (TCP_LRO_CANNOT); + } + + /* Try to find an empty slot. */ + if (LIST_EMPTY(&lc->lro_free)) + return (TCP_LRO_NO_ENTRIES); + + /* Start a new segment chain. */ + le = LIST_FIRST(&lc->lro_free); + LIST_REMOVE(le, next); + t4_lro_active_insert(lc, bucket, le); + getmicrotime(&le->mtime); + + /* Start filling in details. */ + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + le->le_ip6 = ip6; + le->source_ip6 = ip6->ip6_src; + le->dest_ip6 = ip6->ip6_dst; + le->eh_type = eh_type; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + le->le_ip4 = ip4; + le->source_ip4 = ip4->ip_src.s_addr; + le->dest_ip4 = ip4->ip_dst.s_addr; + le->eh_type = eh_type; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; + break; +#endif + } + le->source_port = th->th_sport; + le->dest_port = th->th_dport; + + le->next_seq = seq + tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + if (l != 0) { + le->timestamp = 1; + le->tsval = ntohl(*(ts_ptr + 1)); + le->tsecr = *(ts_ptr + 2); + } + +#ifdef TCP_LRO_UPDATE_CSUM + /* + * Do not touch the csum of the first packet. However save the + * "adjusted" checksum of just the source and destination addresses, + * the next header and the TCP payload. The length and TCP header + * parts may change, so we remove those from the saved checksum and + * re-add with final values on t4_lro_flush() if needed. + */ + KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", + __func__, le, le->ulp_csum)); + + le->ulp_csum = t4_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, + ~csum); + th->th_sum = csum; /* Restore checksum on first packet. */ +#endif + + le->m_head = m; + le->m_tail = m_last(m); + + return (0); +} + +int +t4_lro_rx(struct t4_lro_ctrl *lc, struct mbuf *m, uint32_t csum) +{ + + return t4_lro_rx2(lc, m, csum, 1); +} + +void +t4_lro_queue_mbuf(struct t4_lro_ctrl *lc, struct mbuf *mb) +{ + /* sanity checks */ + if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || + lc->lro_mbuf_max == 0)) { + /* packet drop */ + m_freem(mb); + return; + } + + /* check if packet is not LRO capable */ + if (__predict_false(mb->m_pkthdr.csum_flags == 0 || + (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { + lc->lro_flushed++; + lc->lro_queued++; + + /* input packet to network layer */ + (*lc->ifp->if_input) (lc->ifp, mb); + return; + } + + /* check if array is full */ + if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max)) + t4_lro_flush_all(lc); + + /* create sequence number */ + lc->lro_mbuf_data[lc->lro_mbuf_count].seq = + (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | + (((uint64_t)mb->m_pkthdr.flowid) << 24) | + ((uint64_t)lc->lro_mbuf_count); + + /* enter mbuf */ + lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb; +} + +/* end */ diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_lro.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/dev/cxgbe/t4_lro.h Sun Jan 29 21:15:45 2017 -0800 @@ -0,0 +1,126 @@ +/*- + * Copyright (c) 2006, Myricom Inc. + * Copyright (c) 2008, Intel Corporation. + * Copyright (c) 2016 Mellanox Technologies. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _T4_LRO_H_ +#define _T4_LRO_H_ + +#include <sys/time.h> + +#ifndef T4_LRO_ENTRIES +/* Define default number of LRO entries per RX queue */ +#define T4_LRO_ENTRIES 8 +#endif + +struct t4_lro_entry { + LIST_ENTRY(t4_lro_entry) next; + LIST_ENTRY(t4_lro_entry) hash_next; + struct mbuf *m_head; + struct mbuf *m_tail; + union { + struct ip *ip4; + struct ip6_hdr *ip6; + } leip; + union { + in_addr_t s_ip4; + struct in6_addr s_ip6; + } lesource; + union { + in_addr_t d_ip4; + struct in6_addr d_ip6; + } ledest; + uint16_t source_port; + uint16_t dest_port; + uint16_t eh_type; /* EthernetHeader type. */ + uint16_t append_cnt; + uint32_t p_len; /* IP header payload length. */ + uint32_t ulp_csum; /* TCP, etc. checksum. */ + uint32_t next_seq; /* tcp_seq */ + uint32_t ack_seq; /* tcp_seq */ + uint32_t tsval; + uint32_t tsecr; + uint16_t window; + uint16_t timestamp; /* flag, not a TCP hdr field. */ + struct timeval mtime; +}; +#if 0 +LIST_HEAD(t4_lro_head, t4_lro_entry); +#endif +struct t4_lro_head { + struct t4_lro_entry *lh_first; +}; + +#define le_ip4 leip.ip4 +#define le_ip6 leip.ip6 +#define source_ip4 lesource.s_ip4 +#define dest_ip4 ledest.d_ip4 +#define source_ip6 lesource.s_ip6 +#define dest_ip6 ledest.d_ip6 + +struct t4_lro_mbuf_sort { + uint64_t seq; + struct mbuf *mb; +}; + +/* NB: This is part of driver structs. 
*/ +struct t4_lro_ctrl { + struct ifnet *ifp; + struct t4_lro_mbuf_sort *lro_mbuf_data; + uint64_t lro_queued; + uint64_t lro_flushed; + uint64_t lro_bad_csum; + unsigned lro_cnt; + unsigned lro_mbuf_count; + unsigned lro_mbuf_max; + unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */ + unsigned lro_length_lim; /* max len of aggregated data */ + + u_long lro_hashsz; + struct t4_lro_head *lro_hash; + struct t4_lro_head lro_active; + struct t4_lro_head lro_free; +}; + +#define TCP_LRO_LENGTH_MAX 65535 +#define TCP_LRO_ACKCNT_MAX 65535 /* unlimited */ + +int t4_lro_init(struct t4_lro_ctrl *); +int t4_lro_init_args(struct t4_lro_ctrl *, struct ifnet *, unsigned, unsigned); +void t4_lro_free(struct t4_lro_ctrl *); +void t4_lro_flush_inactive(struct t4_lro_ctrl *, const struct timeval *); +void t4_lro_flush(struct t4_lro_ctrl *, struct t4_lro_entry *); +void t4_lro_flush_all(struct t4_lro_ctrl *); +int t4_lro_rx(struct t4_lro_ctrl *, struct mbuf *, uint32_t); +void t4_lro_queue_mbuf(struct t4_lro_ctrl *, struct mbuf *); + +#define TCP_LRO_NO_ENTRIES -2 +#define TCP_LRO_CANNOT -1 +#define TCP_LRO_NOT_SUPPORTED 1 + +#endif /* _TCP_LRO_H_ */ diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_main.c --- a/sys/dev/cxgbe/t4_main.c Sat Jan 28 20:54:43 2017 +0000 +++ b/sys/dev/cxgbe/t4_main.c Sun Jan 29 21:15:45 2017 -0800 @@ -1728,10 +1728,13 @@ redo_sifflags: ifp->if_capenable ^= IFCAP_LRO; for_each_rxq(vi, i, rxq) { - if (ifp->if_capenable & IFCAP_LRO) + if (ifp->if_capenable & IFCAP_LRO) { rxq->iq.flags |= IQ_LRO_ENABLED; - else + rxq->iq.flags |= IQ_LRO2_ENABLED; + } else { rxq->iq.flags &= ~IQ_LRO_ENABLED; + rxq->iq.flags &= ~IQ_LRO2_ENABLED; + } } #endif } @@ -4100,7 +4103,7 @@ cxgbe_uninit_synchronized(struct vi_info int t4_setup_intr_handlers(struct adapter *sc) { - int rc, rid, p, q, v; + int rc, rid, p, q, v, direct; char s[8]; struct irq *irq; struct port_info *pi; @@ -4154,6 +4157,12 @@ t4_setup_intr_handlers(struct adapter *s for_each_vi(pi, v, vi) { vi->first_intr = rid - 1; + if (vi->flags & INTR_RXQ && + (vi->nofldrxq == 0 || vi->flags & INTR_OFLD_RXQ)) + direct = 1; + else + direct = 0; + if (vi->nnmrxq > 0) { int n = max(vi->nrxq, vi->nnmrxq); @@ -4173,7 +4182,8 @@ t4_setup_intr_handlers(struct adapter *s irq->nm_rxq = nm_rxq++; #endif rc = t4_alloc_irq(sc, irq, rid, - t4_vi_intr, irq, s); + direct ? t4_vi_intr2 : t4_vi_intr, + irq, s); if (rc != 0) return (rc); irq++; @@ -4185,7 +4195,8 @@ t4_setup_intr_handlers(struct adapter *s snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); rc = t4_alloc_irq(sc, irq, rid, - t4_intr, rxq, s); + direct ? t4_intr2 : t4_intr, rxq, + s); if (rc != 0) return (rc); #ifdef RSS @@ -4203,7 +4214,8 @@ t4_setup_intr_handlers(struct adapter *s snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q); rc = t4_alloc_irq(sc, irq, rid, - t4_intr, ofld_rxq, s); + direct ? 
t4_intr2 : t4_intr, + ofld_rxq, s); if (rc != 0) return (rc); irq++; diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_sge.c --- a/sys/dev/cxgbe/t4_sge.c Sat Jan 28 20:54:43 2017 +0000 +++ b/sys/dev/cxgbe/t4_sge.c Sun Jan 29 21:15:45 2017 -0800 @@ -164,6 +164,7 @@ struct sgl { }; static int service_iq(struct sge_iq *, int); +static int service_iq2(struct sge_iq *); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); @@ -1344,6 +1345,17 @@ t4_intr(void *arg) } void +t4_intr2(void *arg) +{ + struct sge_iq *iq = arg; + + if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { + service_iq2(iq); + atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); + } +} + +void t4_vi_intr(void *arg) { struct irq *irq = arg; @@ -1358,6 +1370,21 @@ t4_vi_intr(void *arg) t4_intr(irq->rxq); } +void +t4_vi_intr2(void *arg) +{ + struct irq *irq = arg; + +#ifdef DEV_NETMAP + if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) { + t4_nm_intr(irq->nm_rxq); + atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON); + } +#endif + if (irq->rxq != NULL) + t4_intr2(irq->rxq); +} + /* * Deals with anything and everything on the given ingress queue. */ @@ -1565,6 +1592,170 @@ process_iql: return (0); } +/* + * Specialized version of service_iq that is used when all queues are taking + * direct interrupts. + */ +static int +service_iq2(struct sge_iq *iq) +{ + struct sge_rxq *rxq = iq_to_rxq(iq); + struct sge_fl *fl = &rxq->fl; + struct adapter *sc = iq->adapter; + struct iq_desc *d = &iq->desc[iq->cidx]; + int ndescs = 0, limit; + int rsp_type; + uint32_t lq; + uint16_t fl_hw_cidx; + struct mbuf *m0; +#if 0 +#if defined(INET) || defined(INET6) + const struct timeval lro_timeout = {0, sc->lro_timeout}; +#endif +#endif + + KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); + + /* service_iq2 isn't as flexible as service_iq */ + MPASS(iq->flags & IQ_HAS_FL); + + limit = iq->qsize >> 3; + fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ + if (iq->flags & IQ_ADJ_CREDIT) { + ndescs++; + iq->flags &= ~IQ_ADJ_CREDIT; + if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) { + t4_lro_flush_all(&rxq->t4_lro); + t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | + V_INGRESSQID((u32)iq->cntxt_id) | + V_SEINTARM(iq->intr_params)); + return (0); + } + } + + while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { + + rmb(); + + m0 = NULL; + rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); + lq = be32toh(d->rsp.pldbuflen_qid); + + switch (rsp_type) { + case X_RSPD_TYPE_FLBUF: + + KASSERT(iq->flags & IQ_HAS_FL, + ("%s: data for an iq (%p) with no freelist", + __func__, iq)); + + m0 = get_fl_payload(sc, fl, lq); + if (__predict_false(m0 == NULL)) + break; + + /* fall through */ + + case X_RSPD_TYPE_CPL: + KASSERT(d->rss.opcode < NUM_CPL_CMDS, + ("%s: bad opcode %02x.", __func__, + d->rss.opcode)); + t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0); + break; + + case X_RSPD_TYPE_INTR: + + /* + * There are 1K interrupt-capable queues (qids 0 + * through 1023). A response type indicating a + * forwarded interrupt with a qid >= 1K is an + * iWARP async notification. 
+ */ + if (lq >= 1024) { + t4_an_handler(iq, &d->rsp); + break; + } + + KASSERT(0, ("%s: indirect interrupt on iq %p", + __func__, iq)); + log(LOG_ERR, + "%s: unexpected interrupt on iq %p", + __func__, iq); + break; + + default: + KASSERT(0, + ("%s: illegal response type %d on iq %p", + __func__, rsp_type, iq)); + log(LOG_ERR, + "%s: illegal response type %d on iq %p", + device_get_nameunit(sc->dev), rsp_type, iq); + break; + } + + d++; + if (__predict_false(++iq->cidx == iq->sidx)) { + iq->cidx = 0; + iq->gen ^= F_RSPD_GEN; + d = &iq->desc[0]; + } + if (__predict_false(++ndescs == limit)) { + t4_write_reg(sc, sc->sge_gts_reg, + V_CIDXINC(ndescs) | + V_INGRESSQID(iq->cntxt_id) | + V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); + ndescs = 0; + +#if 0 +#if defined(INET) || defined(INET6) + if (iq->flags & IQ_LRO2_ENABLED && + sc->lro_timeout != 0) { + t4_lro_flush_inactive(&rxq->t4_lro, + &lro_timeout); + } +#endif +#endif + } + if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2) { + FL_LOCK(fl); + refill_fl(sc, fl, 32); + FL_UNLOCK(fl); + fl_hw_cidx = fl->hw_cidx; + } + } + +#if defined(INET) || defined(INET6) + if (iq->flags & IQ_LRO2_ENABLED) { + if (ndescs > 0 && rxq->t4_lro.lro_mbuf_count > 0) { + /* hold back one credit and don't flush LRO state */ + iq->flags |= IQ_ADJ_CREDIT; + t4_write_reg(sc, sc->sge_gts_reg, + V_CIDXINC(ndescs - 1) | + V_INGRESSQID((u32)iq->cntxt_id) | + V_SEINTARM(V_QINTR_TIMER_IDX(SGE_NTIMERS - 1))); + } else { + t4_lro_flush_all(&rxq->t4_lro); + t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | + V_INGRESSQID((u32)iq->cntxt_id) | + V_SEINTARM(iq->intr_params)); + } + } +#else + t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | + V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); +#endif + + if (iq->flags & IQ_HAS_FL) { + int starved; + + FL_LOCK(fl); + starved = refill_fl(sc, fl, 64); + FL_UNLOCK(fl); + if (__predict_false(starved != 0)) + add_fl_to_sfl(sc, fl); + } + + return (0); +} + static inline int cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { @@ -1787,9 +1978,6 @@ t4_eth_rx(struct sge_iq *iq, const struc struct ifnet *ifp = rxq->ifp; struct adapter *sc = iq->adapter; const struct cpl_rx_pkt *cpl = (const void *)(rss + 1); -#if defined(INET) || defined(INET6) - struct lro_ctrl *lro = &rxq->lro; -#endif static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, @@ -1834,13 +2022,18 @@ t4_eth_rx(struct sge_iq *iq, const struc } #if defined(INET) || defined(INET6) - if (iq->flags & IQ_LRO_ENABLED && - tcp_lro_rx(lro, m0, 0) == 0) { - /* queued for LRO */ - } else + if (iq->flags & IQ_LRO2_ENABLED) { + t4_lro_queue_mbuf(&rxq->t4_lro, m0); + return (0); + } + if (iq->flags & IQ_LRO_ENABLED) { + if (tcp_lro_rx(&rxq->lro, m0, 0) != 0) + ifp->if_input(ifp, m0); + return (0); + } +#else + ifp->if_input(ifp, m0); #endif - ifp->if_input(ifp, m0); - return (0); } @@ -3005,6 +3198,9 @@ alloc_rxq(struct vi_info *vi, struct sge struct adapter *sc = vi->pi->adapter; struct sysctl_oid_list *children; char name[16]; +#if defined(INET) || defined(INET6) + int lro_entries; +#endif rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(vi->pi, cong_drop)); @@ -3033,8 +3229,19 @@ alloc_rxq(struct vi_info *vi, struct sge return (rc); rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */ - if (vi->ifp->if_capenable & IFCAP_LRO) + lro_entries = 512; + TUNABLE_INT_FETCH("hw.cxgbe.lro_entries", &lro_entries); + if (lro_entries < 8 || lro_entries > 8192) + 
lro_entries = 512; + rc = t4_lro_init_args(&rxq->t4_lro, vi->ifp, lro_entries, lro_entries); + if (rc != 0) + return (rc); + rxq->t4_lro.ifp = vi->ifp; /* also indicates LRO init'ed */ + + if (vi->ifp->if_capenable & IFCAP_LRO) { rxq->iq.flags |= IQ_LRO_ENABLED; + rxq->iq.flags |= IQ_LRO2_ENABLED; + } #endif rxq->ifp = vi->ifp; @@ -3063,6 +3270,10 @@ alloc_rxq(struct vi_info *vi, struct sge &rxq->lro.lro_queued, 0, NULL); SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rxq->lro.lro_flushed, 0, NULL); + SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "t4_lro_queued", CTLFLAG_RD, + &rxq->t4_lro.lro_queued, 0, NULL); + SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "t4_lro_flushed", CTLFLAG_RD, + &rxq->t4_lro.lro_flushed, 0, NULL); #endif SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, &rxq->rxcsum, "# of times hardware assisted with checksum"); @@ -3085,6 +3296,10 @@ free_rxq(struct vi_info *vi, struct sge_ tcp_lro_free(&rxq->lro); rxq->lro.ifp = NULL; } + if (rxq->t4_lro.ifp) { + t4_lro_free(&rxq->t4_lro); + rxq->t4_lro.ifp = NULL; + } #endif rc = free_iq_fl(vi, &rxq->iq, &rxq->fl); diff -r 54d8e9872bb6 sys/modules/cxgbe/if_cxgbe/Makefile --- a/sys/modules/cxgbe/if_cxgbe/Makefile Sat Jan 28 20:54:43 2017 +0000 +++ b/sys/modules/cxgbe/if_cxgbe/Makefile Sun Jan 29 21:15:45 2017 -0800 @@ -23,6 +23,7 @@ SRCS+= t4_mp_ring.c SRCS+= t4_netmap.c SRCS+= t4_sge.c SRCS+= t4_tracer.c +SRCS+= t4_lro.c # Provide the timestamp of a packet in its header mbuf. #CFLAGS+= -DT4_PKT_TIMESTAMP
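
A minimal usage sketch, not part of the diff itself: it condenses how the new t4_lro_* entry points introduced above are wired into the rx path, that is, t4_lro_init_args() from alloc_rxq(), t4_lro_queue_mbuf() from t4_eth_rx() when IQ_LRO2_ENABLED is set, and t4_lro_flush_all() at the tail of service_iq2(). The example_* wrapper names are hypothetical and exist only to show the call order; the t4_lro_* API and field types are taken from the patch.

/* Assumed kernel headers; the real call sites in the patch already have these. */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>
#include "t4_lro.h"

/* alloc_rxq(): size the engine, e.g. from the hw.cxgbe.lro_entries tunable. */
static int
example_lro_setup(struct t4_lro_ctrl *lc, struct ifnet *ifp, unsigned entries)
{
	/* the same count is used for the LRO entries and the mbuf sort array */
	return (t4_lro_init_args(lc, ifp, entries, entries));
}

/* t4_eth_rx(): queue the frame; no header parsing or aggregation happens here. */
static void
example_lro_rx(struct t4_lro_ctrl *lc, struct mbuf *m)
{
	t4_lro_queue_mbuf(lc, m);
}

/* end of service_iq2(): sort queued mbufs by flow, aggregate, and deliver. */
static void
example_lro_done(struct t4_lro_ctrl *lc)
{
	t4_lro_flush_all(lc);
}

The deferred model above is the point of the patch: frames are only stashed per rxq during the interrupt, then t4_lro_flush_all() radix-sorts them by flowid and feeds each stream to the aggregation code in one pass, instead of doing a hash lookup per packet as tcp_lro_rx() does on the IQ_LRO_ENABLED path.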