diff -r 54d8e9872bb6 sys/conf/files
--- a/sys/conf/files	Sat Jan 28 20:54:43 2017 +0000
+++ b/sys/conf/files	Sun Jan 29 21:15:45 2017 -0800
@@ -1347,6 +1347,8 @@ dev/cxgbe/t4_sge.c		optional cxgbe pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
 dev/cxgbe/t4_l2t.c		optional cxgbe pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
+dev/cxgbe/t4_lro.c		optional cxgbe pci \
+	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
 dev/cxgbe/t4_tracer.c		optional cxgbe pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
 dev/cxgbe/t4_vf.c		optional cxgbev pci \
diff -r 54d8e9872bb6 sys/dev/cxgbe/adapter.h
--- a/sys/dev/cxgbe/adapter.h	Sat Jan 28 20:54:43 2017 +0000
+++ b/sys/dev/cxgbe/adapter.h	Sun Jan 29 21:15:45 2017 -0800
@@ -52,6 +52,7 @@
 #include <net/if_media.h>
 #include <netinet/in.h>
 #include <netinet/tcp_lro.h>
+#include "t4_lro.h"
 
 #include "offload.h"
 #include "t4_ioctl.h"
@@ -322,6 +323,8 @@ enum {
 	IQ_HAS_FL	= (1 << 1),	/* iq associated with a freelist */
 	IQ_INTR		= (1 << 2),	/* iq takes direct interrupt */
 	IQ_LRO_ENABLED	= (1 << 3),	/* iq is an eth rxq with LRO enabled */
+	IQ_LRO2_ENABLED	= (1 << 4),	/* iq is an eth rxq with LRO2 enabled */
+	IQ_ADJ_CREDIT	= (1 << 5),	/* hw is off by 1 credit for this iq */
 
 	/* iq state */
 	IQS_DISABLED	= 0,
@@ -543,6 +546,7 @@ struct sge_rxq {
 	struct ifnet *ifp;	/* the interface this rxq belongs to */
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl lro;	/* LRO state */
+	struct t4_lro_ctrl t4_lro;
 #endif
 
 	/* stats for common events first */
@@ -1144,7 +1148,9 @@ int t4_setup_vi_queues(struct vi_info *)
 int t4_teardown_vi_queues(struct vi_info *);
 void t4_intr_all(void *);
 void t4_intr(void *);
+void t4_intr2(void *);
 void t4_vi_intr(void *);
+void t4_vi_intr2(void *);
 void t4_intr_err(void *);
 void t4_intr_evt(void *);
 void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *);
diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_lro.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/t4_lro.c	Sun Jan 29 21:15:45 2017 -0800
@@ -0,0 +1,982 @@
+/*-
+ * Copyright (c) 2007, Myricom Inc.
+ * Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2016 Mellanox Technologies.
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Bjoern Zeeb
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+#include <net/vnet.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include "t4_lro.h"
+#include <netinet/tcp_var.h>
+
+#include <netinet6/ip6_var.h>
+
+#include <machine/in_cksum.h>
+
+MALLOC_DECLARE(M_CXGBE);
+
+#define	TCP_LRO_UPDATE_CSUM	1
+#ifndef	TCP_LRO_UPDATE_CSUM
+#define	TCP_LRO_INVALID_CSUM	0x0000
+#endif
+
+static void	t4_lro_rx_done(struct t4_lro_ctrl *lc);
+static int	t4_lro_rx2(struct t4_lro_ctrl *lc, struct mbuf *m,
+		    uint32_t csum, int use_hash);
+
+#if 0
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "TCP LRO");
+
+static unsigned	t4_lro_entries = T4_LRO_ENTRIES;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, t4entries,
+    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &t4_lro_entries, 0,
+    "default number of T4 LRO entries");
+#endif
+
+static __inline void
+t4_lro_active_insert(struct t4_lro_ctrl *lc, struct t4_lro_head *bucket,
+    struct t4_lro_entry *le)
+{
+
+	LIST_INSERT_HEAD(&lc->lro_active, le, next);
+	LIST_INSERT_HEAD(bucket, le, hash_next);
+}
+
+static __inline void
+t4_lro_active_remove(struct t4_lro_entry *le)
+{
+
+	LIST_REMOVE(le, next);		/* active list */
+	LIST_REMOVE(le, hash_next);	/* hash bucket */
+}
+
+int
+t4_lro_init(struct t4_lro_ctrl *lc)
+{
+	return (t4_lro_init_args(lc, NULL, T4_LRO_ENTRIES, 0));
+}
+
+int
+t4_lro_init_args(struct t4_lro_ctrl *lc, struct ifnet *ifp,
+    unsigned lro_entries, unsigned lro_mbufs)
+{
+	struct t4_lro_entry *le;
+	size_t size;
+	unsigned i, elements;
+
+	lc->lro_bad_csum = 0;
+	lc->lro_queued = 0;
+	lc->lro_flushed = 0;
+	lc->lro_cnt = 0;
+	lc->lro_mbuf_count = 0;
+	lc->lro_mbuf_max = lro_mbufs;
+	lc->lro_cnt = lro_entries;
+	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
+	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
+	lc->ifp = ifp;
+	LIST_INIT(&lc->lro_free);
+	LIST_INIT(&lc->lro_active);
+
+	/* create hash table to accelerate entry lookup */
+	if (lro_entries > lro_mbufs)
+		elements = lro_entries;
+	else
+		elements = lro_mbufs;
+	lc->lro_hash = phashinit_flags(elements, M_CXGBE, &lc->lro_hashsz,
+	    HASH_NOWAIT);
+	if (lc->lro_hash == NULL) {
+		memset(lc, 0, sizeof(*lc));
+		return (ENOMEM);
+	}
+
+	/* compute size to allocate */
+	size = (lro_mbufs * sizeof(struct t4_lro_mbuf_sort)) +
+	    (lro_entries * sizeof(*le));
+	lc->lro_mbuf_data = (struct t4_lro_mbuf_sort *)
+	    malloc(size, M_CXGBE, M_NOWAIT | M_ZERO);
+
+	/* check for out of memory */
+	if (lc->lro_mbuf_data == NULL) {
+		memset(lc, 0, sizeof(*lc));
+		return (ENOMEM);
+	}
+	/* compute offset for LRO entries */
+	le = (struct t4_lro_entry *)
+	    (lc->lro_mbuf_data + lro_mbufs);
+
+	/* setup linked list */
+	for (i = 0; i != lro_entries; i++)
+		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
+
+	return (0);
+}
+
+void
+t4_lro_free(struct t4_lro_ctrl *lc)
+{
+	struct t4_lro_entry *le;
+	unsigned x;
+
+	/* reset LRO free list */
+	LIST_INIT(&lc->lro_free);
+
+	/* free active mbufs, if any */
+	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+		t4_lro_active_remove(le);
+		m_freem(le->m_head);
+	}
+
+	/* free hash table */
+	if (lc->lro_hash != NULL) {
+		free(lc->lro_hash, M_CXGBE);
+		lc->lro_hash = NULL;
+	}
+	lc->lro_hashsz = 0;
+
+	/* free mbuf array, if any */
+	for (x = 0; x != lc->lro_mbuf_count; x++)
+		m_freem(lc->lro_mbuf_data[x].mb);
+	lc->lro_mbuf_count = 0;
+
+	/* free allocated memory, if any */
+	free(lc->lro_mbuf_data, M_CXGBE);
+	lc->lro_mbuf_data = NULL;
+}
+
+#ifdef TCP_LRO_UPDATE_CSUM
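+/*
+ * Sum the TCP header as 16-bit words in one's complement arithmetic, with
+ * th_sum cleared first so the stored checksum does not contribute.  th_off
+ * counts 32-bit words, hence two 16-bit additions per iteration.  Callers
+ * use the folded result to add or remove the header's contribution to the
+ * aggregated checksum.
+ */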
+static uint16_t
+t4_lro_csum_th(struct tcphdr *th)
+{
+	uint32_t ch;
+	uint16_t *p, l;
+
+	ch = th->th_sum = 0x0000;
+	l = th->th_off;
+	p = (uint16_t *)th;
+	while (l > 0) {
+		ch += *p;
+		p++;
+		ch += *p;
+		p++;
+		l--;
+	}
+	while (ch > 0xffff)
+		ch = (ch >> 16) + (ch & 0xffff);
+
+	return (ch & 0xffff);
+}
+
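+/*
+ * Given the checksum passed in for this segment, back out the length/
+ * pseudo-header contribution and the TCP header checksum so that only the
+ * payload contribution remains; that is what accumulates in le->ulp_csum
+ * as segments are appended.
+ */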
+static uint16_t
+t4_lro_rx_csum_fixup(struct t4_lro_entry *le, void *l3hdr, struct tcphdr *th,
+    uint16_t tcp_data_len, uint16_t csum)
+{
+	uint32_t c;
+	uint16_t cs;
+
+	c = csum;
+
+	/* Remove length from checksum. */
+	switch (le->eh_type) {
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+	{
+		struct ip6_hdr *ip6;
+
+		ip6 = (struct ip6_hdr *)l3hdr;
+		if (le->append_cnt == 0)
+			cs = ip6->ip6_plen;
+		else {
+			uint32_t cx;
+
+			cx = ntohs(ip6->ip6_plen);
+			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
+		}
+		break;
+	}
+#endif
+#ifdef INET
+	case ETHERTYPE_IP:
+	{
+		struct ip *ip4;
+
+		ip4 = (struct ip *)l3hdr;
+		if (le->append_cnt == 0)
+			cs = ip4->ip_len;
+		else {
+			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
+			    IPPROTO_TCP);
+			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
+			    htons(cs));
+		}
+		break;
+	}
+#endif
+	default:
+		cs = 0;		/* Keep compiler happy. */
+	}
+
+	cs = ~cs;
+	c += cs;
+
+	/* Remove TCP header csum. */
+	cs = ~t4_lro_csum_th(th);
+	c += cs;
+	while (c > 0xffff)
+		c = (c >> 16) + (c & 0xffff);
+
+	return (c & 0xffff);
+}
+#endif
+
+static void
+t4_lro_rx_done(struct t4_lro_ctrl *lc)
+{
+	struct t4_lro_entry *le;
+
+	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+		t4_lro_active_remove(le);
+		t4_lro_flush(lc, le);
+	}
+}
+
+void
+t4_lro_flush_inactive(struct t4_lro_ctrl *lc, const struct timeval *timeout)
+{
+	struct t4_lro_entry *le, *le_tmp;
+	struct timeval tv;
+
+	if (LIST_EMPTY(&lc->lro_active))
+		return;
+
+	getmicrotime(&tv);
+	timevalsub(&tv, timeout);
+	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
+		if (timevalcmp(&tv, &le->mtime, >=)) {
+			t4_lro_active_remove(le);
+			t4_lro_flush(lc, le);
+		}
+	}
+}
+
+void
+t4_lro_flush(struct t4_lro_ctrl *lc, struct t4_lro_entry *le)
+{
+
+	if (le->append_cnt > 0) {
+		struct tcphdr *th;
+		uint16_t p_len;
+
+		p_len = htons(le->p_len);
+		switch (le->eh_type) {
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+		{
+			struct ip6_hdr *ip6;
+
+			ip6 = le->le_ip6;
+			ip6->ip6_plen = p_len;
+			th = (struct tcphdr *)(ip6 + 1);
+			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
+			    CSUM_PSEUDO_HDR;
+			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
+			break;
+		}
+#endif
+#ifdef INET
+		case ETHERTYPE_IP:
+		{
+			struct ip *ip4;
+#ifdef TCP_LRO_UPDATE_CSUM
+			uint32_t cl;
+			uint16_t c;
+#endif
+
+			ip4 = le->le_ip4;
+#ifdef TCP_LRO_UPDATE_CSUM
+			/* Fix IP header checksum for new length. */
+			c = ~ip4->ip_sum;
+			cl = c;
+			c = ~ip4->ip_len;
+			cl += c + p_len;
+			while (cl > 0xffff)
+				cl = (cl >> 16) + (cl & 0xffff);
+			c = cl;
+			ip4->ip_sum = ~c;
+#else
+			ip4->ip_sum = TCP_LRO_INVALID_CSUM;
+#endif
+			ip4->ip_len = p_len;
+			th = (struct tcphdr *)(ip4 + 1);
+			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
+			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
+			le->p_len += ETHER_HDR_LEN;
+			break;
+		}
+#endif
+		default:
+			th = NULL;	/* Keep compiler happy. */
+		}
+		le->m_head->m_pkthdr.csum_data = 0xffff;
+		le->m_head->m_pkthdr.len = le->p_len;
+
+		/* Incorporate the latest ACK into the TCP header. */
+		th->th_ack = le->ack_seq;
+		th->th_win = le->window;
+		/* Incorporate latest timestamp into the TCP header. */
+		if (le->timestamp != 0) {
+			uint32_t *ts_ptr;
+
+			ts_ptr = (uint32_t *)(th + 1);
+			ts_ptr[1] = htonl(le->tsval);
+			ts_ptr[2] = le->tsecr;
+		}
+#ifdef TCP_LRO_UPDATE_CSUM
+		/* Update the TCP header checksum. */
+		le->ulp_csum += p_len;
+		le->ulp_csum += t4_lro_csum_th(th);
+		while (le->ulp_csum > 0xffff)
+			le->ulp_csum = (le->ulp_csum >> 16) +
+			    (le->ulp_csum & 0xffff);
+		th->th_sum = (le->ulp_csum & 0xffff);
+		th->th_sum = ~th->th_sum;
+#else
+		th->th_sum = TCP_LRO_INVALID_CSUM;
+#endif
+	}
+
+	le->m_head->m_pkthdr.lro_nsegs = le->append_cnt + 1;
+	(*lc->ifp->if_input)(lc->ifp, le->m_head);
+	lc->lro_queued += le->append_cnt + 1;
+	lc->lro_flushed++;
+	bzero(le, sizeof(*le));
+	LIST_INSERT_HEAD(&lc->lro_free, le, next);
+}
+
+#ifdef HAVE_INLINE_FLSLL
+#define	t4_lro_msb_64(x) (1ULL << (flsll(x) - 1))
+#else
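+/* Isolate the most significant set bit by smearing all bits downward. */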
+static inline uint64_t
+t4_lro_msb_64(uint64_t x)
+{
+	x |= (x >> 1);
+	x |= (x >> 2);
+	x |= (x >> 4);
+	x |= (x >> 8);
+	x |= (x >> 16);
+	x |= (x >> 32);
+	return (x & ~(x >> 1));
+}
+#endif
+
+/*
+ * The t4_lro_sort() routine is comparable to qsort(), except it has
+ * a worst case complexity limit of O(MIN(N,64)*N), where N is the
+ * number of elements to sort and 64 is the number of sequence bits
+ * available. The algorithm is bit-slicing the 64-bit sequence number,
+ * sorting one bit at a time from the most significant bit until the
+ * least significant one, skipping the constant bits. This is
+ * typically called a radix sort.
+ */
+static void
+t4_lro_sort(struct t4_lro_mbuf_sort *parray, uint32_t size)
+{
+	struct t4_lro_mbuf_sort temp;
+	uint64_t ones;
+	uint64_t zeros;
+	uint32_t x;
+	uint32_t y;
+
+repeat:
+	/* for small arrays insertion sort is faster */
+	if (size <= 12) {
+		for (x = 1; x < size; x++) {
+			temp = parray[x];
+			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
+				parray[y] = parray[y - 1];
+			parray[y] = temp;
+		}
+		return;
+	}
+
+	/* compute sequence bits which are constant */
+	ones = 0;
+	zeros = 0;
+	for (x = 0; x != size; x++) {
+		ones |= parray[x].seq;
+		zeros |= ~parray[x].seq;
+	}
+
+	/* compute bits which are not constant into "ones" */
+	ones &= zeros;
+	if (ones == 0)
+		return;
+
+	/* pick the most significant bit which is not constant */
+	ones = t4_lro_msb_64(ones);
+
+	/*
+	 * Move entries having cleared sequence bits to the beginning
+	 * of the array:
+	 */
+	for (x = y = 0; y != size; y++) {
+		/* skip set bits */
+		if (parray[y].seq & ones)
+			continue;
+		/* swap entries */
+		temp = parray[x];
+		parray[x] = parray[y];
+		parray[y] = temp;
+		x++;
+	}
+
+	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
+
+	/* sort zeros */
+	t4_lro_sort(parray, x);
+
+	/* sort ones */
+	parray += x;
+	size -= x;
+	goto repeat;
+}
+
+void
+t4_lro_flush_all(struct t4_lro_ctrl *lc)
+{
+	uint64_t seq;
+	uint64_t nseq;
+	unsigned x;
+
+	/* check if no mbufs to flush */
+	if (lc->lro_mbuf_count == 0)
+		goto done;
+
+	/* sort all mbufs according to stream */
+	t4_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
+
+	/* input data into LRO engine, stream by stream */
+	seq = 0;
+	for (x = 0; x != lc->lro_mbuf_count; x++) {
+		struct mbuf *mb;
+
+		/* get mbuf */
+		mb = lc->lro_mbuf_data[x].mb;
+
+		/* get sequence number, masking away the packet index */
+		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
+
+		/* check for new stream */
+		if (seq != nseq) {
+			seq = nseq;
+
+			/* flush active streams */
+			t4_lro_rx_done(lc);
+		}
+
+		/* add packet to LRO engine */
+		if (t4_lro_rx2(lc, mb, 0, 0) != 0) {
+			/* input packet to network layer */
+			(*lc->ifp->if_input)(lc->ifp, mb);
+			lc->lro_queued++;
+			lc->lro_flushed++;
+		}
+	}
+done:
+	/* flush active streams */
+	t4_lro_rx_done(lc);
+
+	lc->lro_mbuf_count = 0;
+}
+
+#ifdef INET6
+static int
+t4_lro_rx_ipv6(struct t4_lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
+    struct tcphdr **th)
+{
+
+	/* XXX-BZ we should check the flow-label. */
+
+	/* XXX-BZ We do not yet support ext. hdrs. */
+	if (ip6->ip6_nxt != IPPROTO_TCP)
+		return (TCP_LRO_NOT_SUPPORTED);
+
+	/* Find the TCP header. */
+	*th = (struct tcphdr *)(ip6 + 1);
+
+	return (0);
+}
+#endif
+
+#ifdef INET
+static int
+t4_lro_rx_ipv4(struct t4_lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
+    struct tcphdr **th)
+{
+	int csum_flags;
+	uint16_t csum;
+
+	if (ip4->ip_p != IPPROTO_TCP)
+		return (TCP_LRO_NOT_SUPPORTED);
+
+	/* Ensure there are no options. */
+	if ((ip4->ip_hl << 2) != sizeof (*ip4))
+		return (TCP_LRO_CANNOT);
+
+	/* .. and the packet is not fragmented. */
+	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
+		return (TCP_LRO_CANNOT);
+
+	/* Legacy IP has a header checksum that needs to be correct. */
+	csum_flags = m->m_pkthdr.csum_flags;
+	if (csum_flags & CSUM_IP_CHECKED) {
+		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
+			lc->lro_bad_csum++;
+			return (TCP_LRO_CANNOT);
+		}
+	} else {
+		csum = in_cksum_hdr(ip4);
+		if (__predict_false((csum) != 0)) {
+			lc->lro_bad_csum++;
+			return (TCP_LRO_CANNOT);
+		}
+	}
+
+	/* Find the TCP header (we assured there are no IP options). */
+	*th = (struct tcphdr *)(ip4 + 1);
+
+	return (0);
+}
+#endif
+
+static int
+t4_lro_rx2(struct t4_lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
+{
+	struct t4_lro_entry *le;
+	struct ether_header *eh;
+#ifdef INET6
+	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
+#endif
+#ifdef INET
+	struct ip *ip4 = NULL;		/* Keep compiler happy. */
+#endif
+	struct tcphdr *th;
+	void *l3hdr = NULL;		/* Keep compiler happy. */
+	uint32_t *ts_ptr;
+	tcp_seq seq;
+	int error, ip_len, l;
+	uint16_t eh_type, tcp_data_len;
+	struct t4_lro_head *bucket;
+	int force_flush = 0;
+
+	/* We expect a contiguous header [eh, ip, tcp]. */
+
+	eh = mtod(m, struct ether_header *);
+	eh_type = ntohs(eh->ether_type);
+	switch (eh_type) {
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+	{
+		CURVNET_SET(lc->ifp->if_vnet);
+		if (V_ip6_forwarding != 0) {
+			/* XXX-BZ stats but changing lro_ctrl is a problem. */
+			CURVNET_RESTORE();
+			return (TCP_LRO_CANNOT);
+		}
+		CURVNET_RESTORE();
+		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
+		error = t4_lro_rx_ipv6(lc, m, ip6, &th);
+		if (error != 0)
+			return (error);
+		tcp_data_len = ntohs(ip6->ip6_plen);
+		ip_len = sizeof(*ip6) + tcp_data_len;
+		break;
+	}
+#endif
+#ifdef INET
+	case ETHERTYPE_IP:
+	{
+		CURVNET_SET(lc->ifp->if_vnet);
+		if (V_ipforwarding != 0) {
+			/* XXX-BZ stats but changing lro_ctrl is a problem. */
+			CURVNET_RESTORE();
+			return (TCP_LRO_CANNOT);
+		}
+		CURVNET_RESTORE();
+		l3hdr = ip4 = (struct ip *)(eh + 1);
+		error = t4_lro_rx_ipv4(lc, m, ip4, &th);
+		if (error != 0)
+			return (error);
+		ip_len = ntohs(ip4->ip_len);
+		tcp_data_len = ip_len - sizeof(*ip4);
+		break;
+	}
+#endif
+	/* XXX-BZ what happens in case of VLAN(s)? */
+	default:
+		return (TCP_LRO_NOT_SUPPORTED);
+	}
+
+	/*
+	 * If the frame is padded beyond the end of the IP packet, then we must
+	 * trim the extra bytes off.
+	 */
+	l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
+	if (l != 0) {
+		if (l < 0)
+			/* Truncated packet. */
+			return (TCP_LRO_CANNOT);
+
+		m_adj(m, -l);
+	}
+
+	/*
+	 * Check TCP header constraints.
+	 */
+	/* Ensure no bits set besides ACK or PSH. */
+	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
+		if (th->th_flags & TH_SYN)
+			return (TCP_LRO_CANNOT);
+		/*
+		 * Make sure that previously seen segments/ACKs are delivered
+		 * before this segment, e.g. FIN.
+		 */
+		force_flush = 1;
+	}
+
+	/* XXX-BZ We lose an ACK|PUSH flag concatenating multiple segments. */
+	/* XXX-BZ Ideally we'd flush on PUSH? */
+
+	/*
+	 * Check for timestamps.
+	 * Since the only option we handle is timestamps, we only have to
+	 * handle the simple case of aligned timestamps.
+	 */
+	l = (th->th_off << 2);
+	tcp_data_len -= l;
+	l -= sizeof(*th);
+	ts_ptr = (uint32_t *)(th + 1);
+	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
+	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
+	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
+		/*
+		 * Make sure that previously seen segments/ACKs are delivered
+		 * before this segment.
+		 */
+		force_flush = 1;
+	}
+
+	/* If the driver did not pass in the checksum, set it now. */
+	if (csum == 0x0000)
+		csum = th->th_sum;
+
+	seq = ntohl(th->th_seq);
+
+	if (!use_hash) {
+		bucket = &lc->lro_hash[0];
+	} else if (M_HASHTYPE_ISHASH(m)) {
+		bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
+	} else {
+		uint32_t hash;
+
+		switch (eh_type) {
+#ifdef INET
+		case ETHERTYPE_IP:
+			hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
+			break;
+#endif
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+			hash = ip6->ip6_src.s6_addr32[0] +
+			    ip6->ip6_dst.s6_addr32[0];
+			hash += ip6->ip6_src.s6_addr32[1] +
+			    ip6->ip6_dst.s6_addr32[1];
+			hash += ip6->ip6_src.s6_addr32[2] +
+			    ip6->ip6_dst.s6_addr32[2];
+			hash += ip6->ip6_src.s6_addr32[3] +
+			    ip6->ip6_dst.s6_addr32[3];
+			break;
+#endif
+		default:
+			hash = 0;
+			break;
+		}
+		hash += th->th_sport + th->th_dport;
+		bucket = &lc->lro_hash[hash % lc->lro_hashsz];
+	}
+
+	/* Try to find a matching previous segment. */
+	LIST_FOREACH(le, bucket, hash_next) {
+		if (le->eh_type != eh_type)
+			continue;
+		if (le->source_port != th->th_sport ||
+		    le->dest_port != th->th_dport)
+			continue;
+		switch (eh_type) {
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+			if (bcmp(&le->source_ip6, &ip6->ip6_src,
+			    sizeof(struct in6_addr)) != 0 ||
+			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
+			    sizeof(struct in6_addr)) != 0)
+				continue;
+			break;
+#endif
+#ifdef INET
+		case ETHERTYPE_IP:
+			if (le->source_ip4 != ip4->ip_src.s_addr ||
+			    le->dest_ip4 != ip4->ip_dst.s_addr)
+				continue;
+			break;
+#endif
+		}
+
+		if (force_flush) {
+			/* Timestamps mismatch; this is a FIN, etc */
+			t4_lro_active_remove(le);
+			t4_lro_flush(lc, le);
+			return (TCP_LRO_CANNOT);
+		}
+
+		/* Flush now if appending will result in overflow. */
+		if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
+			t4_lro_active_remove(le);
+			t4_lro_flush(lc, le);
+			break;
+		}
+
+		/* Try to append the new segment. */
+		if (__predict_false(seq != le->next_seq ||
+		    (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
+			/* Out of order packet or duplicate ACK. */
+			t4_lro_active_remove(le);
+			t4_lro_flush(lc, le);
+			return (TCP_LRO_CANNOT);
+		}
+
+		if (l != 0) {
+			uint32_t tsval = ntohl(*(ts_ptr + 1));
+			/* Make sure timestamp values are increasing. */
+			/* XXX-BZ flip and use TSTMP_GEQ macro for this? */
+			if (__predict_false(le->tsval > tsval ||
+			    *(ts_ptr + 2) == 0))
+				return (TCP_LRO_CANNOT);
+			le->tsval = tsval;
+			le->tsecr = *(ts_ptr + 2);
+		}
+
+		le->next_seq += tcp_data_len;
+		le->ack_seq = th->th_ack;
+		le->window = th->th_win;
+		le->append_cnt++;
+
+#ifdef TCP_LRO_UPDATE_CSUM
+		le->ulp_csum += t4_lro_rx_csum_fixup(le, l3hdr, th,
+		    tcp_data_len, ~csum);
+#endif
+
+		if (tcp_data_len == 0) {
+			m_freem(m);
+			/*
+			 * Flush this LRO entry, if this ACK should not
+			 * be further delayed.
+			 */
+			if (le->append_cnt >= lc->lro_ackcnt_lim) {
+				t4_lro_active_remove(le);
+				t4_lro_flush(lc, le);
+			}
+			return (0);
+		}
+
+		le->p_len += tcp_data_len;
+
+		/*
+		 * Adjust the mbuf so that m_data points to the first byte of
+		 * the ULP payload, drop its packet header, and append it to
+		 * the existing mbuf chain.
+		 */
+		m_adj(m, m->m_pkthdr.len - tcp_data_len);
+		m_demote_pkthdr(m);
+
+		le->m_tail->m_next = m;
+		le->m_tail = m_last(m);
+
+		/*
+		 * If a possible next full length packet would cause an
+		 * overflow, pro-actively flush now.
+		 */
+		if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
+			t4_lro_active_remove(le);
+			t4_lro_flush(lc, le);
+		} else
+			getmicrotime(&le->mtime);
+
+		return (0);
+	}
+
+	if (force_flush) {
+		/*
+		 * Nothing to flush, but this segment can not be further
+		 * aggregated/delayed.
+		 */
+		return (TCP_LRO_CANNOT);
+	}
+
+	/* Try to find an empty slot. */
+	if (LIST_EMPTY(&lc->lro_free))
+		return (TCP_LRO_NO_ENTRIES);
+
+	/* Start a new segment chain. */
+	le = LIST_FIRST(&lc->lro_free);
+	LIST_REMOVE(le, next);
+	t4_lro_active_insert(lc, bucket, le);
+	getmicrotime(&le->mtime);
+
+	/* Start filling in details. */
+	switch (eh_type) {
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+		le->le_ip6 = ip6;
+		le->source_ip6 = ip6->ip6_src;
+		le->dest_ip6 = ip6->ip6_dst;
+		le->eh_type = eh_type;
+		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
+		break;
+#endif
+#ifdef INET
+	case ETHERTYPE_IP:
+		le->le_ip4 = ip4;
+		le->source_ip4 = ip4->ip_src.s_addr;
+		le->dest_ip4 = ip4->ip_dst.s_addr;
+		le->eh_type = eh_type;
+		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
+		break;
+#endif
+	}
+	le->source_port = th->th_sport;
+	le->dest_port = th->th_dport;
+
+	le->next_seq = seq + tcp_data_len;
+	le->ack_seq = th->th_ack;
+	le->window = th->th_win;
+	if (l != 0) {
+		le->timestamp = 1;
+		le->tsval = ntohl(*(ts_ptr + 1));
+		le->tsecr = *(ts_ptr + 2);
+	}
+
+#ifdef TCP_LRO_UPDATE_CSUM
+	/*
+	 * Do not touch the csum of the first packet.  However save the
+	 * "adjusted" checksum of just the source and destination addresses,
+	 * the next header and the TCP payload.  The length and TCP header
+	 * parts may change, so we remove those from the saved checksum and
+	 * re-add with final values on t4_lro_flush() if needed.
+	 */
+	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
+	    __func__, le, le->ulp_csum));
+
+	le->ulp_csum = t4_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
+	    ~csum);
+	th->th_sum = csum;	/* Restore checksum on first packet. */
+#endif
+
+	le->m_head = m;
+	le->m_tail = m_last(m);
+
+	return (0);
+}
+
+int
+t4_lro_rx(struct t4_lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+{
+
+	return (t4_lro_rx2(lc, m, csum, 1));
+}
+
+void
+t4_lro_queue_mbuf(struct t4_lro_ctrl *lc, struct mbuf *mb)
+{
+	/* sanity checks */
+	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
+	    lc->lro_mbuf_max == 0)) {
+		/* packet drop */
+		m_freem(mb);
+		return;
+	}
+
+	/* check if packet is not LRO capable */
+	if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
+	    (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
+		lc->lro_flushed++;
+		lc->lro_queued++;
+
+		/* input packet to network layer */
+		(*lc->ifp->if_input) (lc->ifp, mb);
+		return;
+	}
+
+	/* check if array is full */
+	if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
+		t4_lro_flush_all(lc);
+
+	/* create sequence number */
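+	/*
+	 * Key layout: bits 63-56 hash type, bits 55-24 flowid, bits 23-0
+	 * arrival index.  Sorting on this key groups packets by flow while
+	 * preserving arrival order within a flow; t4_lro_flush_all() masks
+	 * off the low 24 bits to detect flow boundaries.
+	 */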
+	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
+	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
+	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
+	    ((uint64_t)lc->lro_mbuf_count);
+
+	/* enter mbuf */
+	lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb;
+}
+
+/* end */
diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_lro.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/t4_lro.h	Sun Jan 29 21:15:45 2017 -0800
@@ -0,0 +1,126 @@
+/*-
+ * Copyright (c) 2006, Myricom Inc.
+ * Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2016 Mellanox Technologies.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _T4_LRO_H_
+#define _T4_LRO_H_
+
+#include <sys/time.h>
+
+#ifndef T4_LRO_ENTRIES
+/* Define default number of LRO entries per RX queue */
+#define	T4_LRO_ENTRIES	8
+#endif
+
+struct t4_lro_entry {
+	LIST_ENTRY(t4_lro_entry)	next;
+	LIST_ENTRY(t4_lro_entry)	hash_next;
+	struct mbuf		*m_head;
+	struct mbuf		*m_tail;
+	union {
+		struct ip	*ip4;
+		struct ip6_hdr	*ip6;
+	} leip;
+	union {
+		in_addr_t	s_ip4;
+		struct in6_addr	s_ip6;
+	} lesource;
+	union {
+		in_addr_t	d_ip4;
+		struct in6_addr	d_ip6;
+	} ledest;
+	uint16_t		source_port;
+	uint16_t		dest_port;
+	uint16_t		eh_type;	/* Ethernet header type. */
+	uint16_t		append_cnt;
+	uint32_t		p_len;		/* IP header payload length. */
+	uint32_t		ulp_csum;	/* TCP, etc. checksum. */
+	uint32_t		next_seq;	/* tcp_seq */
+	uint32_t		ack_seq;	/* tcp_seq */
+	uint32_t		tsval;
+	uint32_t		tsecr;
+	uint16_t		window;
+	uint16_t		timestamp;	/* flag, not a TCP hdr field. */
+	struct timeval		mtime;
+};
+#if 0
+LIST_HEAD(t4_lro_head, t4_lro_entry);
+#endif
+struct t4_lro_head {
+	struct t4_lro_entry *lh_first;
+};
+
+#define	le_ip4			leip.ip4
+#define	le_ip6			leip.ip6
+#define	source_ip4		lesource.s_ip4
+#define	dest_ip4		ledest.d_ip4
+#define	source_ip6		lesource.s_ip6
+#define	dest_ip6		ledest.d_ip6
+
+struct t4_lro_mbuf_sort {
+	uint64_t seq;
+	struct mbuf *mb;
+};
+
+/* NB: This is part of driver structs. */
+struct t4_lro_ctrl {
+	struct ifnet	*ifp;
+	struct t4_lro_mbuf_sort *lro_mbuf_data;
+	uint64_t	lro_queued;
+	uint64_t	lro_flushed;
+	uint64_t	lro_bad_csum;
+	unsigned	lro_cnt;
+	unsigned	lro_mbuf_count;
+	unsigned	lro_mbuf_max;
+	unsigned short	lro_ackcnt_lim;		/* max # of aggregated ACKs */
+	unsigned 	lro_length_lim;		/* max len of aggregated data */
+
+	u_long		lro_hashsz;
+	struct t4_lro_head	*lro_hash;
+	struct t4_lro_head	lro_active;
+	struct t4_lro_head	lro_free;
+};
+
+#define	TCP_LRO_LENGTH_MAX	65535
+#define	TCP_LRO_ACKCNT_MAX	65535		/* unlimited */
+
+int t4_lro_init(struct t4_lro_ctrl *);
+int t4_lro_init_args(struct t4_lro_ctrl *, struct ifnet *, unsigned, unsigned);
+void t4_lro_free(struct t4_lro_ctrl *);
+void t4_lro_flush_inactive(struct t4_lro_ctrl *, const struct timeval *);
+void t4_lro_flush(struct t4_lro_ctrl *, struct t4_lro_entry *);
+void t4_lro_flush_all(struct t4_lro_ctrl *);
+int t4_lro_rx(struct t4_lro_ctrl *, struct mbuf *, uint32_t);
+void t4_lro_queue_mbuf(struct t4_lro_ctrl *, struct mbuf *);
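+
+/*
+ * Typical batched use (sketch): the rx path queues each frame with
+ * t4_lro_queue_mbuf() and calls t4_lro_flush_all() once per interrupt.
+ * t4_lro_rx() plus t4_lro_flush()/t4_lro_flush_inactive() remain available
+ * for unsorted, per-packet operation.
+ */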
+
+#define	TCP_LRO_NO_ENTRIES	-2
+#define	TCP_LRO_CANNOT		-1
+#define	TCP_LRO_NOT_SUPPORTED	1
+
+#endif /* _T4_LRO_H_ */
diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_main.c
--- a/sys/dev/cxgbe/t4_main.c	Sat Jan 28 20:54:43 2017 +0000
+++ b/sys/dev/cxgbe/t4_main.c	Sun Jan 29 21:15:45 2017 -0800
@@ -1728,10 +1728,13 @@ redo_sifflags:
 
 			ifp->if_capenable ^= IFCAP_LRO;
 			for_each_rxq(vi, i, rxq) {
-				if (ifp->if_capenable & IFCAP_LRO)
+				if (ifp->if_capenable & IFCAP_LRO) {
 					rxq->iq.flags |= IQ_LRO_ENABLED;
-				else
+					rxq->iq.flags |= IQ_LRO2_ENABLED;
+				} else {
 					rxq->iq.flags &= ~IQ_LRO_ENABLED;
+					rxq->iq.flags &= ~IQ_LRO2_ENABLED;
+				}
 			}
 #endif
 		}
@@ -4100,7 +4103,7 @@ cxgbe_uninit_synchronized(struct vi_info
 int
 t4_setup_intr_handlers(struct adapter *sc)
 {
-	int rc, rid, p, q, v;
+	int rc, rid, p, q, v, direct;
 	char s[8];
 	struct irq *irq;
 	struct port_info *pi;
@@ -4154,6 +4157,12 @@ t4_setup_intr_handlers(struct adapter *s
 		for_each_vi(pi, v, vi) {
 			vi->first_intr = rid - 1;
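+			/*
+			 * Use the streamlined interrupt handlers only when
+			 * every rx queue behind this vector takes a direct
+			 * interrupt, i.e. there are no forwarded interrupts
+			 * to service.
+			 */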
 
+			if (vi->flags & INTR_RXQ &&
+			    (vi->nofldrxq == 0 || vi->flags & INTR_OFLD_RXQ))
+				direct = 1;
+			else
+				direct = 0;
+
 			if (vi->nnmrxq > 0) {
 				int n = max(vi->nrxq, vi->nnmrxq);
 
@@ -4173,7 +4182,8 @@ t4_setup_intr_handlers(struct adapter *s
 						irq->nm_rxq = nm_rxq++;
 #endif
 					rc = t4_alloc_irq(sc, irq, rid,
-					    t4_vi_intr, irq, s);
+					    direct ? t4_vi_intr2 : t4_vi_intr,
+					    irq, s);
 					if (rc != 0)
 						return (rc);
 					irq++;
@@ -4185,7 +4195,8 @@ t4_setup_intr_handlers(struct adapter *s
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'a' + v, q);
 					rc = t4_alloc_irq(sc, irq, rid,
-					    t4_intr, rxq, s);
+					    direct ? t4_intr2 : t4_intr, rxq,
+					    s);
 					if (rc != 0)
 						return (rc);
 #ifdef RSS
@@ -4203,7 +4214,8 @@ t4_setup_intr_handlers(struct adapter *s
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'A' + v, q);
 					rc = t4_alloc_irq(sc, irq, rid,
-					    t4_intr, ofld_rxq, s);
+					    direct ? t4_intr2 : t4_intr,
+					    ofld_rxq, s);
 					if (rc != 0)
 						return (rc);
 					irq++;
diff -r 54d8e9872bb6 sys/dev/cxgbe/t4_sge.c
--- a/sys/dev/cxgbe/t4_sge.c	Sat Jan 28 20:54:43 2017 +0000
+++ b/sys/dev/cxgbe/t4_sge.c	Sun Jan 29 21:15:45 2017 -0800
@@ -164,6 +164,7 @@ struct sgl {
 };
 
 static int service_iq(struct sge_iq *, int);
+static int service_iq2(struct sge_iq *);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
@@ -1344,6 +1345,17 @@ t4_intr(void *arg)
 }
 
 void
+t4_intr2(void *arg)
+{
+	struct sge_iq *iq = arg;
+
+	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
+		service_iq2(iq);
+		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
+	}
+}
+
+void
 t4_vi_intr(void *arg)
 {
 	struct irq *irq = arg;
@@ -1358,6 +1370,21 @@ t4_vi_intr(void *arg)
 		t4_intr(irq->rxq);
 }
 
+void
+t4_vi_intr2(void *arg)
+{
+	struct irq *irq = arg;
+
+#ifdef DEV_NETMAP
+	if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) {
+		t4_nm_intr(irq->nm_rxq);
+		atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON);
+	}
+#endif
+	if (irq->rxq != NULL)
+		t4_intr2(irq->rxq);
+}
+
 /*
  * Deals with anything and everything on the given ingress queue.
  */
@@ -1565,6 +1592,170 @@ process_iql:
 	return (0);
 }
 
+/*
+ * Specialized version of service_iq that is used when all queues are taking
+ * direct interrupts.
+ */
+static int
+service_iq2(struct sge_iq *iq)
+{
+	struct sge_rxq *rxq = iq_to_rxq(iq);
+	struct sge_fl *fl = &rxq->fl;
+	struct adapter *sc = iq->adapter;
+	struct iq_desc *d = &iq->desc[iq->cidx];
+	int ndescs = 0, limit;
+	int rsp_type;
+	uint32_t lq;
+	uint16_t fl_hw_cidx;
+	struct mbuf *m0;
+#if 0
+#if defined(INET) || defined(INET6)
+	const struct timeval lro_timeout = {0, sc->lro_timeout};
+#endif
+#endif
+
+	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
+
+	/* service_iq2 isn't as flexible as service_iq */
+	MPASS(iq->flags & IQ_HAS_FL);
+
+	limit = iq->qsize >> 3;
+	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
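+	/*
+	 * A previous run held back one credit (IQ_ADJ_CREDIT) instead of
+	 * flushing its LRO state.  Account for that descriptor now; if
+	 * nothing new has arrived, flush the pending LRO state and re-arm.
+	 */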
+#if defined(INET) || defined(INET6)
+	if (iq->flags & IQ_ADJ_CREDIT) {
+		ndescs++;
+		iq->flags &= ~IQ_ADJ_CREDIT;
+		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
+			t4_lro_flush_all(&rxq->t4_lro);
+			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
+			    V_INGRESSQID((u32)iq->cntxt_id) |
+			    V_SEINTARM(iq->intr_params));
+			return (0);
+		}
+	}
+#endif
+
+	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
+
+		rmb();
+
+		m0 = NULL;
+		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
+		lq = be32toh(d->rsp.pldbuflen_qid);
+
+		switch (rsp_type) {
+		case X_RSPD_TYPE_FLBUF:
+
+			KASSERT(iq->flags & IQ_HAS_FL,
+			    ("%s: data for an iq (%p) with no freelist",
+			    __func__, iq));
+
+			m0 = get_fl_payload(sc, fl, lq);
+			if (__predict_false(m0 == NULL))
+				break;
+
+			/* fall through */
+
+		case X_RSPD_TYPE_CPL:
+			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
+			    ("%s: bad opcode %02x.", __func__,
+			    d->rss.opcode));
+			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
+			break;
+
+		case X_RSPD_TYPE_INTR:
+
+			/*
+			 * There are 1K interrupt-capable queues (qids 0
+			 * through 1023).  A response type indicating a
+			 * forwarded interrupt with a qid >= 1K is an
+			 * iWARP async notification.
+			 */
+			if (lq >= 1024) {
+				t4_an_handler(iq, &d->rsp);
+				break;
+			}
+
+			KASSERT(0, ("%s: indirect interrupt on iq %p",
+			    __func__, iq));
+			log(LOG_ERR,
+			    "%s: unexpected interrupt on iq %p",
+			    __func__, iq);
+			break;
+
+		default:
+			KASSERT(0,
+			    ("%s: illegal response type %d on iq %p",
+			    __func__, rsp_type, iq));
+			log(LOG_ERR,
+			    "%s: illegal response type %d on iq %p",
+			    device_get_nameunit(sc->dev), rsp_type, iq);
+			break;
+		}
+
+		d++;
+		if (__predict_false(++iq->cidx == iq->sidx)) {
+			iq->cidx = 0;
+			iq->gen ^= F_RSPD_GEN;
+			d = &iq->desc[0];
+		}
+		if (__predict_false(++ndescs == limit)) {
+			t4_write_reg(sc, sc->sge_gts_reg,
+			    V_CIDXINC(ndescs) |
+			    V_INGRESSQID(iq->cntxt_id) |
+			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
+			ndescs = 0;
+
+#if 0
+#if defined(INET) || defined(INET6)
+			if (iq->flags & IQ_LRO2_ENABLED &&
+			    sc->lro_timeout != 0) {
+				t4_lro_flush_inactive(&rxq->t4_lro,
+				    &lro_timeout);
+			}
+#endif
+#endif
+		}
+		if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2) {
+			FL_LOCK(fl);
+			refill_fl(sc, fl, 32);
+			FL_UNLOCK(fl);
+			fl_hw_cidx = fl->hw_cidx;
+		}
+	}
+
+#if defined(INET) || defined(INET6)
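+	/*
+	 * If LRO data is still queued, hold back one credit and arm a long
+	 * holdoff timer so that this queue interrupts again soon and the
+	 * pending data gets flushed; otherwise flush now and re-arm normally.
+	 */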
+	if (iq->flags & IQ_LRO2_ENABLED) {
+		if (ndescs > 0 && rxq->t4_lro.lro_mbuf_count > 0) {
+			/* hold back one credit and don't flush LRO state */
+			iq->flags |= IQ_ADJ_CREDIT;
+			t4_write_reg(sc, sc->sge_gts_reg,
+			    V_CIDXINC(ndescs - 1) |
+			    V_INGRESSQID((u32)iq->cntxt_id) |
+			    V_SEINTARM(V_QINTR_TIMER_IDX(SGE_NTIMERS - 1)));
+		} else {
+			t4_lro_flush_all(&rxq->t4_lro);
+			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
+			    V_INGRESSQID((u32)iq->cntxt_id) |
+			    V_SEINTARM(iq->intr_params));
+		}
+	} else {
+		/* LRO2 not in use on this iq; just update cidx and re-arm. */
+		t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
+		    V_INGRESSQID((u32)iq->cntxt_id) |
+		    V_SEINTARM(iq->intr_params));
+	}
+#else
+	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
+	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
+#endif
+
+	if (iq->flags & IQ_HAS_FL) {
+		int starved;
+
+		FL_LOCK(fl);
+		starved = refill_fl(sc, fl, 64);
+		FL_UNLOCK(fl);
+		if (__predict_false(starved != 0))
+			add_fl_to_sfl(sc, fl);
+	}
+
+	return (0);
+}
+
 static inline int
 cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
 {
@@ -1787,9 +1978,6 @@ t4_eth_rx(struct sge_iq *iq, const struc
 	struct ifnet *ifp = rxq->ifp;
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
-#if defined(INET) || defined(INET6)
-	struct lro_ctrl *lro = &rxq->lro;
-#endif
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
@@ -1834,13 +2022,18 @@ t4_eth_rx(struct sge_iq *iq, const struc
 	}
 
 #if defined(INET) || defined(INET6)
-	if (iq->flags & IQ_LRO_ENABLED &&
-	    tcp_lro_rx(lro, m0, 0) == 0) {
-		/* queued for LRO */
-	} else
+	if (iq->flags & IQ_LRO2_ENABLED) {
+		t4_lro_queue_mbuf(&rxq->t4_lro, m0);
+		return (0);
+	}
+	if (iq->flags & IQ_LRO_ENABLED) {
+		if (tcp_lro_rx(&rxq->lro, m0, 0) != 0)
+			ifp->if_input(ifp, m0);
+		return (0);
+	}
 #endif
 	ifp->if_input(ifp, m0);
-
 	return (0);
 }
 
@@ -3005,6 +3198,9 @@ alloc_rxq(struct vi_info *vi, struct sge
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid_list *children;
 	char name[16];
+#if defined(INET) || defined(INET6)
+	int lro_entries;
+#endif
 
 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
 	    tnl_cong(vi->pi, cong_drop));
@@ -3033,8 +3229,19 @@ alloc_rxq(struct vi_info *vi, struct sge
 		return (rc);
 	rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */
 
-	if (vi->ifp->if_capenable & IFCAP_LRO)
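+	/* Per-queue entry/mbuf count; tunable hw.cxgbe.lro_entries (8-8192). */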
+	lro_entries = 512;
+	TUNABLE_INT_FETCH("hw.cxgbe.lro_entries", &lro_entries);
+	if (lro_entries < 8 || lro_entries > 8192)
+		lro_entries = 512;
+	rc = t4_lro_init_args(&rxq->t4_lro, vi->ifp, lro_entries, lro_entries);
+	if (rc != 0)
+		return (rc);
+	rxq->t4_lro.ifp = vi->ifp; /* also indicates LRO init'ed */
+
+	if (vi->ifp->if_capenable & IFCAP_LRO) {
 		rxq->iq.flags |= IQ_LRO_ENABLED;
+		rxq->iq.flags |= IQ_LRO2_ENABLED;
+	}
 #endif
 	rxq->ifp = vi->ifp;
 
@@ -3063,6 +3270,10 @@ alloc_rxq(struct vi_info *vi, struct sge
 	    &rxq->lro.lro_queued, 0, NULL);
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
+	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "t4_lro_queued", CTLFLAG_RD,
+	    &rxq->t4_lro.lro_queued, 0, NULL);
+	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "t4_lro_flushed", CTLFLAG_RD,
+	    &rxq->t4_lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
@@ -3085,6 +3296,10 @@ free_rxq(struct vi_info *vi, struct sge_
 		tcp_lro_free(&rxq->lro);
 		rxq->lro.ifp = NULL;
 	}
+	if (rxq->t4_lro.ifp) {
+		t4_lro_free(&rxq->t4_lro);
+		rxq->t4_lro.ifp = NULL;
+	}
 #endif
 
 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
diff -r 54d8e9872bb6 sys/modules/cxgbe/if_cxgbe/Makefile
--- a/sys/modules/cxgbe/if_cxgbe/Makefile	Sat Jan 28 20:54:43 2017 +0000
+++ b/sys/modules/cxgbe/if_cxgbe/Makefile	Sun Jan 29 21:15:45 2017 -0800
@@ -23,6 +23,7 @@ SRCS+=	t4_mp_ring.c
 SRCS+=	t4_netmap.c
 SRCS+=	t4_sge.c
 SRCS+=	t4_tracer.c
+SRCS+=	t4_lro.c
 
 # Provide the timestamp of a packet in its header mbuf.
 #CFLAGS+= -DT4_PKT_TIMESTAMP