diff -r 7cec8c20120e sbin/ifconfig/ifconfig.c
--- a/sbin/ifconfig/ifconfig.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sbin/ifconfig/ifconfig.c	Mon Jun 11 00:15:24 2012 -0700
@@ -916,7 +916,7 @@
 #define	IFCAPBITS \
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
-"\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
+"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
 "\26RXCSUM_IPV6\27TXCSUM_IPV6"
 
 /*
@@ -1212,6 +1212,8 @@
 	DEF_CMD("-tso4",	-IFCAP_TSO4,	setifcap),
 	DEF_CMD("tso",		IFCAP_TSO,	setifcap),
 	DEF_CMD("-tso",		-IFCAP_TSO,	setifcap),
+	DEF_CMD("toe",		IFCAP_TOE,	setifcap),
+	DEF_CMD("-toe",		-IFCAP_TOE,	setifcap),
 	DEF_CMD("lro",		IFCAP_LRO,	setifcap),
 	DEF_CMD("-lro",		-IFCAP_LRO,	setifcap),
 	DEF_CMD("wol",		IFCAP_WOL,	setifcap),
diff -r 7cec8c20120e sys/amd64/conf/GENERIC
--- a/sys/amd64/conf/GENERIC	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/amd64/conf/GENERIC	Mon Jun 11 00:15:24 2012 -0700
@@ -28,6 +28,7 @@
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
+options 	TCP_OFFLOAD		# TCP offload
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
diff -r 7cec8c20120e sys/conf/NOTES
--- a/sys/conf/NOTES	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/conf/NOTES	Mon Jun 11 00:15:24 2012 -0700
@@ -545,6 +545,8 @@
 
 options 	ROUTETABLES=2		# max 16. 1 is back compatible.
 
+options 	TCP_OFFLOAD		# TCP offload support.
+
 # In order to enable IPSEC you MUST also add device crypto to 
 # your kernel configuration
 options 	IPSEC			#IP security (requires device crypto)
diff -r 7cec8c20120e sys/conf/files
--- a/sys/conf/files	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/conf/files	Mon Jun 11 00:15:24 2012 -0700
@@ -1038,8 +1038,6 @@
 dev/cs/if_cs_pccard.c		optional cs pccard
 dev/cxgb/cxgb_main.c		optional cxgb pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgb"
-dev/cxgb/cxgb_offload.c		optional cxgb pci \
-	compile-with "${NORMAL_C} -I$S/dev/cxgb"
 dev/cxgb/cxgb_sge.c		optional cxgb pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgb"
 dev/cxgb/common/cxgb_mc5.c	optional cxgb pci \
@@ -3037,7 +3035,7 @@
 netinet/tcp_input.c		optional inet | inet6
 netinet/tcp_lro.c		optional inet | inet6
 netinet/tcp_output.c		optional inet | inet6
-netinet/tcp_offload.c		optional inet | inet6
+netinet/tcp_offload.c		optional tcp_offload inet | tcp_offload inet6
 netinet/tcp_reass.c		optional inet | inet6
 netinet/tcp_sack.c		optional inet | inet6
 netinet/tcp_subr.c		optional inet | inet6
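
NOTE: in files(5), an "optional" clause is an OR of |-separated alternatives,
each alternative an AND-list of option/device names, so the rewritten line
reads "(tcp_offload AND inet) OR (tcp_offload AND inet6)": the TOE dispatch
layer is now compiled only into kernels that opt in, instead of into every
INET kernel.  A config(8) sketch for a custom kernel (only needed where the
base config does not already set the option, unlike the GENERIC hunk above):

	include 	GENERIC
	ident		MYKERNEL-TOE
	options 	TCP_OFFLOAD	# pulls netinet/tcp_offload.c into the build
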
diff -r 7cec8c20120e sys/conf/options
--- a/sys/conf/options	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/conf/options	Mon Jun 11 00:15:24 2012 -0700
@@ -434,7 +434,7 @@
 ROUTETABLES		opt_route.h
 SLIP_IFF_OPTS		opt_slip.h
 TCPDEBUG
-TCP_OFFLOAD_DISABLE	opt_inet.h #Disable code to dispatch tcp offloading
+TCP_OFFLOAD		opt_inet.h # Enable code to dispatch TCP offloading
 TCP_SIGNATURE		opt_inet.h
 VLAN_ARRAY		opt_vlan.h
 XBONEHACK
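
NOTE: entries in sys/conf/options tie a kernel option to the generated header
that carries its #define; TCP_OFFLOAD lands in opt_inet.h and replaces the
opt-out TCP_OFFLOAD_DISABLE knob with an opt-in one.  Code that tests the
option must include that header first, as cxgb_main.c now does.  A minimal
sketch:

	/* Sketch: how driver code sees an option declared in sys/conf/options. */
	#include "opt_inet.h"		/* defines TCP_OFFLOAD when configured */

	#ifdef TCP_OFFLOAD
	/* TOE-only code: ULD registration, CPL handlers, toe_capability(), ... */
	#endif
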
diff -r 7cec8c20120e sys/contrib/rdma/krping/krping.c
--- a/sys/contrib/rdma/krping/krping.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/contrib/rdma/krping/krping.c	Mon Jun 11 00:15:24 2012 -0700
@@ -41,7 +41,6 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
-#include <sys/module.h>
 #include <sys/endian.h>
 #include <sys/limits.h>
 #include <sys/proc.h>
@@ -53,11 +52,13 @@
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/syslog.h>
+#include <netinet/in.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
-#include <contrib/rdma/rdma_cm.h>
+#include <linux/types.h>
+#include <rdma/rdma_cm.h>
 
 #include "getopt.h"
 #include "krping.h"
@@ -83,6 +84,7 @@
 	{"bw", OPT_NOPARAM, 'B'},
 	{"tx-depth", OPT_INT, 't'},
   	{"poll", OPT_NOPARAM, 'P'},
+  	{"memlimit", OPT_INT, 'm'},
 	{NULL, 0, 0}
 };
 
@@ -254,10 +256,14 @@
 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
 		if (wc.status) {
-			if (wc.status != IB_WC_WR_FLUSH_ERR)
-				log(LOG_ERR, "cq completion failed status %d\n",
+			if (wc.status == IB_WC_WR_FLUSH_ERR) {
+				DEBUG_LOG("cq flushed\n");
+				continue;
+			} else {
+				log(LOG_CRIT, "cq completion failed status %d\n",
 					wc.status);
-			goto error;
+				goto error;
+			}
 		}
 
 		switch (wc.opcode) {
@@ -432,8 +438,17 @@
 		}
 	}
 
-	cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
-		PAGE_SIZE, 0);
+	/* RNIC adapters have a limit up to which they can register physical
+	 * memory.  If the DMA-MR memory mode is set, the driver normally
+	 * registers the maximum supported memory.  If contigmalloc then
+	 * returns memory beyond that RNIC limit, krping may not work.
+	 */
+	if (cb->use_dmamr && cb->memlimit)
+		cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit,
+					    PAGE_SIZE, 0);
+	else
+		cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
+					    PAGE_SIZE, 0);
 
 	if (!cb->rdma_buf) {
 		log(LOG_ERR, "rdma_buf malloc failed\n");
@@ -458,8 +473,12 @@
 	}
 
 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
-		cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
-			0, -1UL, PAGE_SIZE, 0);
+		if (cb->use_dmamr && cb->memlimit)
+			cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
+						     0, cb->memlimit, PAGE_SIZE, 0);
+		else
+			cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
+						     0, -1UL, PAGE_SIZE, 0);
 		if (!cb->start_buf) {
 			log(LOG_ERR, "start_buf malloc failed\n");
 			ret = ENOMEM;
@@ -1636,6 +1655,8 @@
 	cb->state = IDLE;
 	cb->size = 64;
 	cb->txdepth = RPING_SQ_DEPTH;
+	cb->use_dmamr = 1;
+	cb->memlimit = 0;
 	mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
 
 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
@@ -1713,6 +1734,15 @@
 		case 'd':
 			debug++;
 			break;
+		case 'm':
+			cb->memlimit = optint;
+			if (cb->memlimit < 1) {
+				log(LOG_ERR, "Invalid memory limit %ju\n",
+				    (uintmax_t)cb->memlimit);
+				ret = EINVAL;
+			} else
+				DEBUG_LOG(PFX "memory limit %d\n", (int)optint);
+			break;
 		default:
 			log(LOG_ERR, "unknown opt %s\n", optarg);
 			ret = EINVAL;
diff -r 7cec8c20120e sys/contrib/rdma/krping/krping.h
--- a/sys/contrib/rdma/krping/krping.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/contrib/rdma/krping/krping.h	Mon Jun 11 00:15:24 2012 -0700
@@ -1,7 +1,7 @@
 /*
  * $FreeBSD$
  */
-#include <contrib/rdma/ib_verbs.h>
+#include <rdma/ib_verbs.h>
 #include <netinet/in.h>
 
 /*
@@ -92,6 +92,8 @@
 	int count;			/* ping count */
 	int size;			/* ping data size */
 	int validate;			/* validate ping data */
+	uint64_t memlimit;		/* limit of the physical memory that
+					   can be registered with dma_mr mode */
 
 	/* CM stuff */
 	struct rdma_cm_id *cm_id;	/* connection on client side,*/
diff -r 7cec8c20120e sys/contrib/rdma/krping/krping_dev.c
--- a/sys/contrib/rdma/krping/krping_dev.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/contrib/rdma/krping/krping_dev.c	Mon Jun 11 00:15:24 2012 -0700
@@ -14,7 +14,6 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
-#include <sys/module.h>
 #include <sys/systm.h>  /* uprintf */
 #include <sys/errno.h>
 #include <sys/param.h>  /* defines used in kernel.h */
@@ -51,6 +50,9 @@
 /* vars */
 static struct cdev *krping_dev;
 
+#undef MODULE_VERSION
+#include <sys/module.h>
+
 static int
 krping_loader(struct module *m, int what, void *arg)
 {
@@ -175,6 +177,4 @@
 	return(err);
 }
 
-MODULE_DEPEND(krping, rdma_core, 1, 1, 1);
-MODULE_DEPEND(krping, rdma_cma, 1, 1, 1);
 DEV_MODULE(krping,krping_loader,NULL);
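
NOTE: the include shuffle here works around an apparent macro collision: the
rdma/Linux-compat headers now pulled in (assumed, via krping.h and
<rdma/ib_verbs.h>) define their own MODULE_VERSION, shadowing the FreeBSD
kernel macro, so <sys/module.h> is included last, after an #undef.  The stale
MODULE_DEPENDs on the old rdma_core/rdma_cma module names are dropped along
the way.  The pattern, as a sketch:

	#include "krping.h"		/* pulls rdma headers that (assumed)
					   define a Linux-style MODULE_VERSION */

	#undef MODULE_VERSION		/* discard the compat definition ... */
	#include <sys/module.h>		/* ... so DEV_MODULE/MODULE_VERSION
					   below are FreeBSD's */

	DEV_MODULE(krping, krping_loader, NULL);
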
diff -r 7cec8c20120e sys/contrib/rdma/rdma_addr.c
--- a/sys/contrib/rdma/rdma_addr.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/contrib/rdma/rdma_addr.c	Mon Jun 11 00:15:24 2012 -0700
@@ -117,7 +117,8 @@
 		     const unsigned char *dst_dev_addr)
 {
 	dev_addr->dev_type = RDMA_NODE_RNIC;
-	memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), MAX_ADDR_LEN);
+	memset(dev_addr->src_dev_addr, 0, MAX_ADDR_LEN);
+	memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen);
 	memcpy(dev_addr->broadcast, dev->if_broadcastaddr, MAX_ADDR_LEN);
 	if (dst_dev_addr)
 		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
@@ -207,7 +208,7 @@
 		goto put;
 	}
  	ret = arpresolve(iproute.ro_rt->rt_ifp, iproute.ro_rt, NULL, 
-		rt_key(iproute.ro_rt), dmac, &lle);
+		(struct sockaddr *)dst_in, dmac, &lle);
 	if (ret) {
 		goto put;
 	}
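
NOTE: IF_LLADDR() points at a link-level address that is only dev->if_addrlen
bytes long (6 for Ethernet), so the old MAX_ADDR_LEN-sized memcpy read past
the end of the source; the fix zeroes the destination and copies only the
valid bytes.  The arpresolve() change resolves the caller's destination
sockaddr instead of rt_key() of the route, which for a network route is the
masked network address rather than the host being pinged.  A sketch of the
safe-copy pattern:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <net/if_dl.h>

	#define MAX_ADDR_LEN 32	/* matches the rdma compat definition
				   (assumption) */

	/* Copy a variable-length link-level address into a fixed buffer. */
	static void
	copy_lladdr(struct ifnet *ifp, unsigned char dst[MAX_ADDR_LEN])
	{
		memset(dst, 0, MAX_ADDR_LEN);			/* zero-pad */
		memcpy(dst, IF_LLADDR(ifp), ifp->if_addrlen);	/* valid bytes */
	}
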
diff -r 7cec8c20120e sys/contrib/rdma/rdma_cache.c
--- a/sys/contrib/rdma/rdma_cache.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/contrib/rdma/rdma_cache.c	Mon Jun 11 00:15:24 2012 -0700
@@ -132,7 +132,7 @@
 	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
 		cache = device->cache.gid_cache[p];
 		for (i = 0; i < cache->table_len; ++i) {
-			if (!memcmp(gid, &cache->table[i], 6)) { /* XXX */
+			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
 				*port_num = p + start_port(device);
 				if (index)
 					*index = i;
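
NOTE: an InfiniBand GID is 16 bytes, so the old 6-byte memcmp (already flagged
XXX) could report a match for two distinct GIDs that merely share a prefix;
comparing sizeof *gid bytes closes that.  A sketch, with the layout reduced to
its size:

	#include <string.h>

	/* Layout reduced for illustration; the real union ib_gid also has
	   subnet-prefix/interface-id views of the same 16 bytes. */
	union ib_gid { unsigned char raw[16]; };

	static int
	gid_equal(const union ib_gid *a, const union ib_gid *b)
	{
		return (memcmp(a, b, sizeof(*a)) == 0);
	}
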
diff -r 7cec8c20120e sys/dev/cxgb/common/cxgb_ctl_defs.h
--- a/sys/dev/cxgb/common/cxgb_ctl_defs.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/common/cxgb_ctl_defs.h	Mon Jun 11 00:15:24 2012 -0700
@@ -60,14 +60,12 @@
 	const unsigned short *mtus; /* the MTU table values */
 };
 
-struct net_device;
-
 /*
- * Structure used to request the adapter net_device owning a given MAC address.
+ * Structure used to request the ifnet that owns a given MAC address.
  */
 struct iff_mac {
-	struct net_device *dev;          /* the net_device */
-	const unsigned char *mac_addr;   /* MAC address to lookup */
+	struct ifnet *dev;
+	const unsigned char *mac_addr;
 	u16 vlan_tag;
 };
 
@@ -85,7 +83,7 @@
 
 struct adap_ports {
 	unsigned int nports;     /* number of ports on this adapter */
-	struct net_device *lldevs[MAX_NPORTS];
+	struct ifnet *lldevs[MAX_NPORTS];
 };
 
 /*
diff -r 7cec8c20120e sys/dev/cxgb/cxgb_adapter.h
--- a/sys/dev/cxgb/cxgb_adapter.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/cxgb_adapter.h	Mon Jun 11 00:15:24 2012 -0700
@@ -57,7 +57,6 @@
 #include <dev/pci/pcivar.h>
 
 #include <cxgb_osdep.h>
-#include <t3cdev.h>
 #include <sys/mbufq.h>
 
 struct adapter;
@@ -130,6 +129,7 @@
 	CXGB_OFLD_INIT	= (1 << 7),
 	TP_PARITY_INIT	= (1 << 8),
 	CXGB_BUSY	= (1 << 9),
+	TOM_INIT_DONE	= (1 << 10),
 
 	/* port flags */
 	DOOMED		= (1 << 0),
@@ -179,7 +179,6 @@
 	uint32_t        async_notif;
 	uint32_t	cntxt_id;
 	uint32_t        offload_pkts;
-	uint32_t        offload_bundles;
 	uint32_t        pure_rsps;
 	uint32_t        unhandled_irqs;
 	uint32_t        starved;
@@ -291,6 +290,7 @@
 	uint32_t                txq_stopped;       /* which Tx queues are stopped */
 	uint64_t                port_stats[SGE_PSTAT_MAX];
 	struct port_info        *port;
+	struct adapter          *adap;
 	int                     idx; /* qset # */
 	int                     qs_flags;
 	int			coalescing;
@@ -307,10 +307,13 @@
 
 struct filter_info;
 
+typedef int (*cpl_handler_t)(struct sge_qset *, struct rsp_desc *,
+    struct mbuf *);
+
 struct adapter {
+	SLIST_ENTRY(adapter)	link;
 	device_t		dev;
 	int			flags;
-	TAILQ_ENTRY(adapter)    adapter_entry;
 
 	/* PCI register resources */
 	int			regs_rid;
@@ -376,11 +379,16 @@
 
 	struct port_info	port[MAX_NPORTS];
 	device_t		portdev[MAX_NPORTS];
-	struct t3cdev           tdev;
+#ifdef TCP_OFFLOAD
+	void 			*tom_softc;
+	void 			*iwarp_softc;
+#endif
 	char                    fw_version[64];
 	char                    port_types[MAX_NPORTS + 1];
 	uint32_t                open_device_map;
-	uint32_t                registered_device_map;
+#ifdef TCP_OFFLOAD
+	int			offload_map;
+#endif
 	struct mtx              lock;
 	driver_intr_t           *cxgb_intr;
 	int                     msi_count;
@@ -392,6 +400,11 @@
 	char                    elmerlockbuf[ADAPTER_LOCK_NAME_LEN];
 
 	int			timestamp;
+
+#ifdef TCP_OFFLOAD
+#define NUM_CPL_HANDLERS	0xa7
+	cpl_handler_t cpl_handler[NUM_CPL_HANDLERS] __aligned(CACHE_LINE_SIZE);
+#endif
 };
 
 struct t3_rx_mode {
@@ -502,10 +515,12 @@
 			int speed, int duplex, int fc, int mac_was_reset);
 void t3_os_phymod_changed(struct adapter *adap, int port_id);
 void t3_sge_err_intr_handler(adapter_t *adapter);
-int t3_offload_tx(struct t3cdev *, struct mbuf *);
+#ifdef TCP_OFFLOAD
+int t3_offload_tx(struct adapter *, struct mbuf *);
+#endif
 void t3_os_set_hw_addr(adapter_t *adapter, int port_idx, u8 hw_addr[]);
 int t3_mgmt_tx(adapter_t *adap, struct mbuf *m);
-
+int t3_register_cpl_handler(struct adapter *, int, cpl_handler_t);
 
 int t3_sge_alloc(struct adapter *);
 int t3_sge_free(struct adapter *);
@@ -556,15 +571,9 @@
 	return container_of(q, struct sge_qset, txq[qidx]);
 }
 
-static __inline struct adapter *
-tdev2adap(struct t3cdev *d)
-{
-	return container_of(d, struct adapter, tdev);
-}
-
 #undef container_of
 
-#define OFFLOAD_DEVMAP_BIT 15
+#define OFFLOAD_DEVMAP_BIT (1 << MAX_NPORTS)
 static inline int offload_running(adapter_t *adapter)
 {
         return isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT);
@@ -573,4 +582,5 @@
 void cxgb_tx_watchdog(void *arg);
 int cxgb_transmit(struct ifnet *ifp, struct mbuf *m);
 void cxgb_qflush(struct ifnet *ifp);
+void t3_iterate(void (*)(struct adapter *, void *), void *);
 #endif
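
NOTE: CPL dispatch moves from the global 256-entry table in the deleted
cxgb_offload.c into the per-adapter cpl_handler[] array indexed by opcode;
t3_register_cpl_handler() (implemented in cxgb_main.c below) installs a
pointer with atomic_store_rel_ptr() so the SGE rx path can dispatch without a
lock, and unclaimed opcodes fall through to cpl_not_handled().  A sketch of
both sides under those assumptions (do_rx_data and dispatch_cpl are
illustrative names, not part of this patch):

	/* A hypothetical TOM handler for one CPL opcode. */
	static int
	do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
	{
		/* ... hand the CPL_RX_DATA payload to the offloaded tid ... */
		m_freem(m);
		return (0);
	}

	static void
	tom_install_handlers(struct adapter *sc)
	{
		t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
	}

	/* What the SGE rx path is expected to do per response descriptor. */
	static int
	dispatch_cpl(struct adapter *sc, struct sge_qset *qs,
	    struct rsp_desc *r, struct mbuf *m, unsigned int opcode)
	{
		return (sc->cpl_handler[opcode](qs, r, m));
	}
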
diff -r 7cec8c20120e sys/dev/cxgb/cxgb_main.c
--- a/sys/dev/cxgb/cxgb_main.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/cxgb_main.c	Mon Jun 11 00:15:24 2012 -0700
@@ -30,6 +30,8 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -107,6 +109,9 @@
     unsigned int, u64, u64);
 static inline void set_tcb_field_ulp(struct cpl_set_tcb_field *, unsigned int,
     unsigned int, u64, u64);
+#ifdef TCP_OFFLOAD
+static int cpl_not_handled(struct sge_qset *, struct rsp_desc *, struct mbuf *);
+#endif
 
 /* Attachment glue for the PCI controller end of the device.  Each port of
  * the device is attached separately, as defined later.
@@ -119,10 +124,11 @@
     unsigned int end);
 static void cxgb_get_regs(adapter_t *sc, struct ch_ifconf_regs *regs, uint8_t *buf);
 static int cxgb_get_regs_len(void);
-static int offload_open(struct port_info *pi);
 static void touch_bars(device_t dev);
-static int offload_close(struct t3cdev *tdev);
 static void cxgb_update_mac_settings(struct port_info *p);
+#ifdef TCP_OFFLOAD
+static int toe_capability(struct port_info *, int);
+#endif
 
 static device_method_t cxgb_controller_methods[] = {
 	DEVMETHOD(device_probe,		cxgb_controller_probe),
@@ -138,8 +144,11 @@
 	sizeof(struct adapter)
 };
 
+static int cxgbc_mod_event(module_t, int, void *);
 static devclass_t	cxgb_controller_devclass;
-DRIVER_MODULE(cxgbc, pci, cxgb_controller_driver, cxgb_controller_devclass, 0, 0);
+DRIVER_MODULE(cxgbc, pci, cxgb_controller_driver, cxgb_controller_devclass,
+    cxgbc_mod_event, 0);
+MODULE_VERSION(cxgbc, 1);
 
 /*
  * Attachment glue for the ports.  Attachment is done directly to the
@@ -177,6 +186,14 @@
 
 static devclass_t	cxgb_port_devclass;
 DRIVER_MODULE(cxgb, cxgbc, cxgb_port_driver, cxgb_port_devclass, 0, 0);
+MODULE_VERSION(cxgb, 1);
+
+static struct mtx t3_list_lock;
+static SLIST_HEAD(, adapter) t3_list;
+#ifdef TCP_OFFLOAD
+static struct mtx t3_uld_list_lock;
+static SLIST_HEAD(, uld_info) t3_uld_list;
+#endif
 
 /*
  * The driver uses the best interrupt scheme available on a platform in the
@@ -195,15 +212,6 @@
     "MSI-X, MSI, INTx selector");
 
 /*
- * The driver enables offload as a default.
- * To disable it, use ofld_disable = 1.
- */
-static int ofld_disable = 0;
-TUNABLE_INT("hw.cxgb.ofld_disable", &ofld_disable);
-SYSCTL_INT(_hw_cxgb, OID_AUTO, ofld_disable, CTLFLAG_RDTUN, &ofld_disable, 0,
-    "disable ULP offload");
-
-/*
  * The driver uses an auto-queue algorithm by default.
  * To disable it and force a single queue-set per port, use multiq = 0
  */
@@ -445,6 +453,25 @@
 	sc->msi_count = 0;
 	ai = cxgb_get_adapter_info(dev);
 
+	snprintf(sc->lockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb controller lock %d",
+	    device_get_unit(dev));
+	ADAPTER_LOCK_INIT(sc, sc->lockbuf);
+
+	snprintf(sc->reglockbuf, ADAPTER_LOCK_NAME_LEN, "SGE reg lock %d",
+	    device_get_unit(dev));
+	snprintf(sc->mdiolockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb mdio lock %d",
+	    device_get_unit(dev));
+	snprintf(sc->elmerlockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb elmer lock %d",
+	    device_get_unit(dev));
+
+	MTX_INIT(&sc->sge.reg_lock, sc->reglockbuf, NULL, MTX_SPIN);
+	MTX_INIT(&sc->mdio_lock, sc->mdiolockbuf, NULL, MTX_DEF);
+	MTX_INIT(&sc->elmer_lock, sc->elmerlockbuf, NULL, MTX_DEF);
+
+	mtx_lock(&t3_list_lock);
+	SLIST_INSERT_HEAD(&t3_list, sc, link);
+	mtx_unlock(&t3_list_lock);
+
 	/* find the PCIe link width and set max read request to 4KB*/
 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
 		uint16_t lnk;
@@ -471,24 +498,10 @@
 	if ((sc->regs_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
 	    &sc->regs_rid, RF_ACTIVE)) == NULL) {
 		device_printf(dev, "Cannot allocate BAR region 0\n");
-		return (ENXIO);
+		error = ENXIO;
+		goto out;
 	}
 
-	snprintf(sc->lockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb controller lock %d",
-	    device_get_unit(dev));
-	ADAPTER_LOCK_INIT(sc, sc->lockbuf);
-
-	snprintf(sc->reglockbuf, ADAPTER_LOCK_NAME_LEN, "SGE reg lock %d",
-	    device_get_unit(dev));
-	snprintf(sc->mdiolockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb mdio lock %d",
-	    device_get_unit(dev));
-	snprintf(sc->elmerlockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb elmer lock %d",
-	    device_get_unit(dev));
-	
-	MTX_INIT(&sc->sge.reg_lock, sc->reglockbuf, NULL, MTX_SPIN);
-	MTX_INIT(&sc->mdio_lock, sc->mdiolockbuf, NULL, MTX_DEF);
-	MTX_INIT(&sc->elmer_lock, sc->elmerlockbuf, NULL, MTX_DEF);
-	
 	sc->bt = rman_get_bustag(sc->regs_res);
 	sc->bh = rman_get_bushandle(sc->regs_res);
 	sc->mmio_len = rman_get_size(sc->regs_res);
@@ -604,7 +617,7 @@
 	} else {
 		sc->flags |= TPS_UPTODATE;
 	}
-	
+
 	/*
 	 * Create a child device for each MAC.  The ethernet attachment
 	 * will be done in these children.
@@ -636,12 +649,7 @@
 	t3_sge_init_adapter(sc);
 
 	t3_led_ready(sc);
-	
-	cxgb_offload_init();
-	if (is_offload(sc)) {
-		setbit(&sc->registered_device_map, OFFLOAD_DEVMAP_BIT);
-		cxgb_adapter_ofld(sc);
-        }
+
 	error = t3_get_fw_version(sc, &vers);
 	if (error)
 		goto out;
@@ -662,6 +670,11 @@
 	device_printf(sc->dev, "Firmware Version %s\n", &sc->fw_version[0]);
 	callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc);
 	t3_add_attach_sysctls(sc);
+
+#ifdef TCP_OFFLOAD
+	for (i = 0; i < NUM_CPL_HANDLERS; i++)
+		sc->cpl_handler[i] = cpl_not_handled;
+#endif
 out:
 	if (error)
 		cxgb_free(sc);
@@ -775,20 +788,9 @@
 		sc->tq = NULL;
 	}
 	
-	if (is_offload(sc)) {
-		clrbit(&sc->registered_device_map, OFFLOAD_DEVMAP_BIT);
-		cxgb_adapter_unofld(sc);
-	}
-
-#ifdef notyet
-	if (sc->flags & CXGB_OFLD_INIT)
-		cxgb_offload_deactivate(sc);
-#endif
 	free(sc->filters, M_DEVBUF);
 	t3_sge_free(sc);
 
-	cxgb_offload_exit();
-
 	if (sc->udbs_res != NULL)
 		bus_release_resource(sc->dev, SYS_RES_MEMORY, sc->udbs_rid,
 		    sc->udbs_res);
@@ -800,6 +802,9 @@
 	MTX_DESTROY(&sc->mdio_lock);
 	MTX_DESTROY(&sc->sge.reg_lock);
 	MTX_DESTROY(&sc->elmer_lock);
+	mtx_lock(&t3_list_lock);
+	SLIST_REMOVE(&t3_list, sc, adapter, link);
+	mtx_unlock(&t3_list_lock);
 	ADAPTER_LOCK_DEINIT(sc);
 }
 
@@ -1017,6 +1022,10 @@
 	ifp->if_qflush = cxgb_qflush;
 
 	ifp->if_capabilities = CXGB_CAP;
+#ifdef TCP_OFFLOAD
+	if (is_offload(sc))
+		ifp->if_capabilities |= IFCAP_TOE4;
+#endif
 	ifp->if_capenable = CXGB_CAP_ENABLE;
 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO;
 
@@ -1420,65 +1429,6 @@
 	              cpus, rspq_map);
 
 }
-
-/*
- * Sends an mbuf to an offload queue driver
- * after dealing with any active network taps.
- */
-static inline int
-offload_tx(struct t3cdev *tdev, struct mbuf *m)
-{
-	int ret;
-
-	ret = t3_offload_tx(tdev, m);
-	return (ret);
-}
-
-static int
-write_smt_entry(struct adapter *adapter, int idx)
-{
-	struct port_info *pi = &adapter->port[idx];
-	struct cpl_smt_write_req *req;
-	struct mbuf *m;
-
-	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
-		return (ENOMEM);
-
-	req = mtod(m, struct cpl_smt_write_req *);
-	m->m_pkthdr.len = m->m_len = sizeof(struct cpl_smt_write_req);
-	
-	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx));
-	req->mtu_idx = NMTUS - 1;  /* should be 0 but there's a T3 bug */
-	req->iff = idx;
-	memset(req->src_mac1, 0, sizeof(req->src_mac1));
-	memcpy(req->src_mac0, pi->hw_addr, ETHER_ADDR_LEN);
-
-	m_set_priority(m, 1);
-
-	offload_tx(&adapter->tdev, m);
-
-	return (0);
-}
-
-static int
-init_smt(struct adapter *adapter)
-{
-	int i;
-
-	for_each_port(adapter, i)
-		write_smt_entry(adapter, i);
-	return 0;
-}
-
-static void
-init_port_mtus(adapter_t *adapter)
-{
-	unsigned int mtus = ETHERMTU | (ETHERMTU << 16);
-
-	t3_write_reg(adapter, A_TP_MTU_PORT_TABLE, mtus);
-}
-
 static void
 send_pktsched_cmd(struct adapter *adap, int sched, int qidx, int lo,
 			      int hi, int port)
@@ -1705,45 +1655,6 @@
 	t3_intr_disable(sc);
 }
 
-static int
-offload_open(struct port_info *pi)
-{
-	struct adapter *sc = pi->adapter;
-	struct t3cdev *tdev = &sc->tdev;
-
-	setbit(&sc->open_device_map, OFFLOAD_DEVMAP_BIT);
-
-	t3_tp_set_offload_mode(sc, 1);
-	tdev->lldev = pi->ifp;
-	init_port_mtus(sc);
-	t3_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd,
-		     sc->params.rev == 0 ?  sc->port[0].ifp->if_mtu : 0xffff);
-	init_smt(sc);
-	cxgb_add_clients(tdev);
-
-	return (0);
-}
-
-static int
-offload_close(struct t3cdev *tdev)
-{
-	struct adapter *adapter = tdev2adap(tdev);
-
-	if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT))
-		return (0);
-
-	/* Call back all registered clients */
-	cxgb_remove_clients(tdev);
-
-	tdev->lldev = NULL;
-	cxgb_set_dummy_ops(tdev);
-	t3_tp_set_offload_mode(adapter, 0);
-
-	clrbit(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT);
-
-	return (0);
-}
-
 /*
  * if_init for cxgb ports.
  */
@@ -1793,15 +1704,9 @@
 		ADAPTER_UNLOCK(sc);
 	}
 
-	if (sc->open_device_map == 0) {
-		if ((rc = cxgb_up(sc)) != 0)
+	if (sc->open_device_map == 0 && ((rc = cxgb_up(sc)) != 0))
 			goto done;
 
-		if (is_offload(sc) && !ofld_disable && offload_open(p))
-			log(LOG_WARNING,
-			    "Could not initialize offload capabilities\n");
-	}
-
 	PORT_LOCK(p);
 	if (isset(&sc->open_device_map, p->port_id) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
@@ -1929,7 +1834,6 @@
 	DELAY(100 * 1000);
 	t3_mac_disable(&pi->mac, MAC_DIRECTION_RX);
 
-
 	pi->phy.ops->power_down(&pi->phy, 1);
 
 	PORT_UNLOCK(pi);
@@ -1937,9 +1841,6 @@
 	pi->link_config.link_ok = 0;
 	t3_os_link_changed(sc, pi->port_id, 0, 0, 0, 0, 0);
 
-	if ((sc->open_device_map & PORT_MASK) == 0)
-		offload_close(&sc->tdev);
-
 	if (sc->open_device_map == 0)
 		cxgb_down(pi->adapter);
 
@@ -2081,6 +1982,15 @@
 			/* Safe to do this even if cxgb_up not called yet */
 			cxgb_set_lro(p, ifp->if_capenable & IFCAP_LRO);
 		}
+#ifdef TCP_OFFLOAD
+		if (mask & IFCAP_TOE4) {
+			int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE4;
+
+			error = toe_capability(p, enable);
+			if (error == 0)
+				ifp->if_capenable ^= mask;
+		}
+#endif
 		if (mask & IFCAP_VLAN_HWTAGGING) {
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
@@ -3362,3 +3272,235 @@
 	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
 	mk_set_tcb_field(req, tid, word, mask, val);
 }
+
+void
+t3_iterate(void (*func)(struct adapter *, void *), void *arg)
+{
+	struct adapter *sc;
+
+	mtx_lock(&t3_list_lock);
+	SLIST_FOREACH(sc, &t3_list, link) {
+		/*
+		 * func should not make any assumptions about what state sc is
+		 * in - the only guarantee is that sc->lock is a valid lock.
+		 */
+		func(sc, arg);
+	}
+	mtx_unlock(&t3_list_lock);
+}
+
+#ifdef TCP_OFFLOAD
+static int
+toe_capability(struct port_info *pi, int enable)
+{
+	int rc;
+	struct adapter *sc = pi->adapter;
+
+	ADAPTER_LOCK_ASSERT_OWNED(sc);
+
+	if (!is_offload(sc))
+		return (ENODEV);
+
+	if (enable) {
+		if (!(sc->flags & FULL_INIT_DONE)) {
+			log(LOG_WARNING,
+			    "You must enable a cxgb interface first\n");
+			return (EAGAIN);
+		}
+
+		if (isset(&sc->offload_map, pi->port_id))
+			return (0);
+
+		if (!(sc->flags & TOM_INIT_DONE)) {
+			rc = t3_activate_uld(sc, ULD_TOM);
+			if (rc == EAGAIN) {
+				log(LOG_WARNING,
+				    "You must kldload t3_tom.ko before trying "
+				    "to enable TOE on a cxgb interface.\n");
+			}
+			if (rc != 0)
+				return (rc);
+			KASSERT(sc->tom_softc != NULL,
+			    ("%s: TOM activated but softc NULL", __func__));
+			KASSERT(sc->flags & TOM_INIT_DONE,
+			    ("%s: TOM activated but flag not set", __func__));
+		}
+
+		setbit(&sc->offload_map, pi->port_id);
+
+		/*
+		 * XXX: Temporary code to allow iWARP to be enabled when TOE is
+		 * enabled on any port.  Need to figure out how to enable,
+		 * disable, load, and unload iWARP cleanly.
+		 */
+		if (!isset(&sc->offload_map, MAX_NPORTS) &&
+		    t3_activate_uld(sc, ULD_IWARP) == 0)
+			setbit(&sc->offload_map, MAX_NPORTS);
+	} else {
+		if (!isset(&sc->offload_map, pi->port_id))
+			return (0);
+
+		KASSERT(sc->flags & TOM_INIT_DONE,
+		    ("%s: TOM never initialized?", __func__));
+		clrbit(&sc->offload_map, pi->port_id);
+	}
+
+	return (0);
+}
+
+/*
+ * Add an upper layer driver to the global list.
+ */
+int
+t3_register_uld(struct uld_info *ui)
+{
+	int rc = 0;
+	struct uld_info *u;
+
+	mtx_lock(&t3_uld_list_lock);
+	SLIST_FOREACH(u, &t3_uld_list, link) {
+		if (u->uld_id == ui->uld_id) {
+			rc = EEXIST;
+			goto done;
+		}
+	}
+
+	SLIST_INSERT_HEAD(&t3_uld_list, ui, link);
+	ui->refcount = 0;
+done:
+	mtx_unlock(&t3_uld_list_lock);
+	return (rc);
+}
+
+int
+t3_unregister_uld(struct uld_info *ui)
+{
+	int rc = EINVAL;
+	struct uld_info *u;
+
+	mtx_lock(&t3_uld_list_lock);
+
+	SLIST_FOREACH(u, &t3_uld_list, link) {
+		if (u == ui) {
+			if (ui->refcount > 0) {
+				rc = EBUSY;
+				goto done;
+			}
+
+			SLIST_REMOVE(&t3_uld_list, ui, uld_info, link);
+			rc = 0;
+			goto done;
+		}
+	}
+done:
+	mtx_unlock(&t3_uld_list_lock);
+	return (rc);
+}
+
+int
+t3_activate_uld(struct adapter *sc, int id)
+{
+	int rc = EAGAIN;
+	struct uld_info *ui;
+
+	mtx_lock(&t3_uld_list_lock);
+
+	SLIST_FOREACH(ui, &t3_uld_list, link) {
+		if (ui->uld_id == id) {
+			rc = ui->activate(sc);
+			if (rc == 0)
+				ui->refcount++;
+			goto done;
+		}
+	}
+done:
+	mtx_unlock(&t3_uld_list_lock);
+
+	return (rc);
+}
+
+int
+t3_deactivate_uld(struct adapter *sc, int id)
+{
+	int rc = EINVAL;
+	struct uld_info *ui;
+
+	mtx_lock(&t3_uld_list_lock);
+
+	SLIST_FOREACH(ui, &t3_uld_list, link) {
+		if (ui->uld_id == id) {
+			rc = ui->deactivate(sc);
+			if (rc == 0)
+				ui->refcount--;
+			goto done;
+		}
+	}
+done:
+	mtx_unlock(&t3_uld_list_lock);
+
+	return (rc);
+}
+
+static int
+cpl_not_handled(struct sge_qset *qs __unused, struct rsp_desc *r __unused,
+    struct mbuf *m)
+{
+	m_freem(m);
+	return (EDOOFUS);
+}
+
+int
+t3_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h)
+{
+	uintptr_t *loc, new;
+
+	if (opcode >= NUM_CPL_HANDLERS)
+		return (EINVAL);
+
+	new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled;
+	loc = (uintptr_t *) &sc->cpl_handler[opcode];
+	atomic_store_rel_ptr(loc, new);
+
+	return (0);
+}
+#endif
+
+static int
+cxgbc_mod_event(module_t mod, int cmd, void *arg)
+{
+	int rc = 0;
+
+	switch (cmd) {
+	case MOD_LOAD:
+		mtx_init(&t3_list_lock, "T3 adapters", 0, MTX_DEF);
+		SLIST_INIT(&t3_list);
+#ifdef TCP_OFFLOAD
+		mtx_init(&t3_uld_list_lock, "T3 ULDs", 0, MTX_DEF);
+		SLIST_INIT(&t3_uld_list);
+#endif
+		break;
+
+	case MOD_UNLOAD:
+#ifdef TCP_OFFLOAD
+		mtx_lock(&t3_uld_list_lock);
+		if (!SLIST_EMPTY(&t3_uld_list)) {
+			rc = EBUSY;
+			mtx_unlock(&t3_uld_list_lock);
+			break;
+		}
+		mtx_unlock(&t3_uld_list_lock);
+		mtx_destroy(&t3_uld_list_lock);
+#endif
+		mtx_lock(&t3_list_lock);
+		if (!SLIST_EMPTY(&t3_list)) {
+			rc = EBUSY;
+			mtx_unlock(&t3_list_lock);
+			break;
+		}
+		mtx_unlock(&t3_list_lock);
+		mtx_destroy(&t3_list_lock);
+		break;
+	}
+
+	return (rc);
+}
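
NOTE: cxgbc now keeps module-global lists of adapters (t3_list) and
upper-layer drivers (t3_uld_list).  A ULD such as t3_tom registers a
struct uld_info at MOD_LOAD; the first time TOE is enabled on a port
("ifconfig cxgbN toe"), toe_capability() calls t3_activate_uld(sc, ULD_TOM),
which bumps the refcount and gives the ULD its per-adapter hook.  A sketch of
the ULD side (struct tom_data and all function names here are illustrative;
the real t3_tom is not part of this patch):

	static int
	tom_activate(struct adapter *sc)
	{
		/* Allocate per-adapter state and install CPL handlers here. */
		sc->tom_softc = malloc(sizeof(struct tom_data), M_DEVBUF,
		    M_ZERO | M_WAITOK);
		sc->flags |= TOM_INIT_DONE;	/* toe_capability() asserts this */
		return (0);
	}

	static int
	tom_deactivate(struct adapter *sc)
	{
		return (EBUSY);		/* e.g. refuse while connections exist */
	}

	static struct uld_info tom_uld_info = {
		.uld_id = ULD_TOM,
		.activate = tom_activate,
		.deactivate = tom_deactivate,
	};

	static int
	t3_tom_mod_event(module_t mod, int cmd, void *arg)
	{
		switch (cmd) {
		case MOD_LOAD:
			return (t3_register_uld(&tom_uld_info));
		case MOD_UNLOAD:
			/* EBUSY while any adapter still holds a reference. */
			return (t3_unregister_uld(&tom_uld_info));
		default:
			return (EOPNOTSUPP);
		}
	}
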
diff -r 7cec8c20120e sys/dev/cxgb/cxgb_offload.c
--- a/sys/dev/cxgb/cxgb_offload.c	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,465 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007-2008, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-***************************************************************************/
-
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/bus.h>
-#include <sys/module.h>
-#include <sys/pciio.h>
-#include <sys/conf.h>
-#include <machine/bus.h>
-#include <machine/resource.h>
-#include <sys/bus_dma.h>
-#include <sys/rman.h>
-#include <sys/ioccom.h>
-#include <sys/mbuf.h>
-#include <sys/linker.h>
-#include <sys/firmware.h>
-#include <sys/socket.h>
-#include <sys/sockio.h>
-#include <sys/smp.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/queue.h>
-#include <sys/taskqueue.h>
-#include <sys/proc.h>
-
-#include <cxgb_include.h>
-
-#include <net/route.h>
-
-#define VALIDATE_TID 0
-MALLOC_DEFINE(M_CXGB, "cxgb", "Chelsio 10 Gigabit Ethernet and services");
-
-TAILQ_HEAD(, cxgb_client) client_list;
-TAILQ_HEAD(, t3cdev) ofld_dev_list;
-
-
-static struct mtx cxgb_db_lock;
-
-
-static int inited = 0;
-
-static inline int
-offload_activated(struct t3cdev *tdev)
-{
-	struct adapter *adapter = tdev2adap(tdev);
-	
-	return (isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT));
-}
-
-static inline void
-register_tdev(struct t3cdev *tdev)
-{
-	static int unit;
-
-	mtx_lock(&cxgb_db_lock);
-	snprintf(tdev->name, sizeof(tdev->name), "ofld_dev%d", unit++);
-	TAILQ_INSERT_TAIL(&ofld_dev_list, tdev, entry);
-	mtx_unlock(&cxgb_db_lock);
-}
-
-static inline void
-unregister_tdev(struct t3cdev *tdev)
-{
-	if (!inited)
-		return;
-
-	mtx_lock(&cxgb_db_lock);
-	TAILQ_REMOVE(&ofld_dev_list, tdev, entry);
-	mtx_unlock(&cxgb_db_lock);	
-}
-
-#ifndef TCP_OFFLOAD_DISABLE
-/**
- *	cxgb_register_client - register an offload client
- *	@client: the client
- *
- *	Add the client to the client list,
- *	and call backs the client for each activated offload device
- */
-void
-cxgb_register_client(struct cxgb_client *client)
-{
-	struct t3cdev *tdev;
-
-	mtx_lock(&cxgb_db_lock);
-	TAILQ_INSERT_TAIL(&client_list, client, client_entry);
-
-	if (client->add) {
-		TAILQ_FOREACH(tdev, &ofld_dev_list, entry) {
-			if (offload_activated(tdev)) {
-				client->add(tdev);
-			} else
-				CTR1(KTR_CXGB,
-				    "cxgb_register_client: %p not activated", tdev);
-			
-		}
-	}
-	mtx_unlock(&cxgb_db_lock);
-}
-
-/**
- *	cxgb_unregister_client - unregister an offload client
- *	@client: the client
- *
- *	Remove the client to the client list,
- *	and call backs the client for each activated offload device.
- */
-void
-cxgb_unregister_client(struct cxgb_client *client)
-{
-	struct t3cdev *tdev;
-
-	mtx_lock(&cxgb_db_lock);
-	TAILQ_REMOVE(&client_list, client, client_entry);
-
-	if (client->remove) {
-		TAILQ_FOREACH(tdev, &ofld_dev_list, entry) {
-			if (offload_activated(tdev))
-				client->remove(tdev);
-		}
-	}
-	mtx_unlock(&cxgb_db_lock);
-}
-
-/**
- *	cxgb_add_clients - activate register clients for an offload device
- *	@tdev: the offload device
- *
- *	Call backs all registered clients once a offload device is activated 
- */
-void
-cxgb_add_clients(struct t3cdev *tdev)
-{
-	struct cxgb_client *client;
-
-	mtx_lock(&cxgb_db_lock);
-	TAILQ_FOREACH(client, &client_list, client_entry) {
-		if (client->add)
-			client->add(tdev);
-	}
-	mtx_unlock(&cxgb_db_lock);
-}
-
-/**
- *	cxgb_remove_clients - activate register clients for an offload device
- *	@tdev: the offload device
- *
- *	Call backs all registered clients once a offload device is deactivated 
- */
-void
-cxgb_remove_clients(struct t3cdev *tdev)
-{
-	struct cxgb_client *client;
-
-	mtx_lock(&cxgb_db_lock);
-	TAILQ_FOREACH(client, &client_list, client_entry) {
-		if (client->remove)
-			client->remove(tdev);
-	}
-	mtx_unlock(&cxgb_db_lock);
-}
-#endif
-
-/**
- * cxgb_ofld_recv - process n received offload packets
- * @dev: the offload device
- * @m: an array of offload packets
- * @n: the number of offload packets
- *
- * Process an array of ingress offload packets.  Each packet is forwarded
- * to any active network taps and then passed to the offload device's receive
- * method.  We optimize passing packets to the receive method by passing
- * it the whole array at once except when there are active taps.
- */
-int
-cxgb_ofld_recv(struct t3cdev *dev, struct mbuf **m, int n)
-{
-
-	return dev->recv(dev, m, n);
-}
-
-/*
- * Dummy handler for Rx offload packets in case we get an offload packet before
- * proper processing is setup.  This complains and drops the packet as it isn't
- * normal to get offload packets at this stage.
- */
-static int
-rx_offload_blackhole(struct t3cdev *dev, struct mbuf **m, int n)
-{
-	while (n--)
-		m_freem(m[n]);
-	return 0;
-}
-
-static void
-dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr,
-    struct sockaddr *sa)
-{
-}
-
-void
-cxgb_set_dummy_ops(struct t3cdev *dev)
-{
-	dev->recv         = rx_offload_blackhole;
-	dev->arp_update = dummy_neigh_update;
-}
-
-static int
-do_smt_write_rpl(struct t3cdev *dev, struct mbuf *m)
-{
-	struct cpl_smt_write_rpl *rpl = cplhdr(m);
-
-	if (rpl->status != CPL_ERR_NONE)
-		log(LOG_ERR,
-		       "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
-		       rpl->status, GET_TID(rpl));
-
-	return CPL_RET_BUF_DONE;
-}
-
-static int
-do_l2t_write_rpl(struct t3cdev *dev, struct mbuf *m)
-{
-	struct cpl_l2t_write_rpl *rpl = cplhdr(m);
-
-	if (rpl->status != CPL_ERR_NONE)
-		log(LOG_ERR,
-		       "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
-		       rpl->status, GET_TID(rpl));
-
-	return CPL_RET_BUF_DONE;
-}
-
-static int
-do_rte_write_rpl(struct t3cdev *dev, struct mbuf *m)
-{
-	struct cpl_rte_write_rpl *rpl = cplhdr(m);
-
-	if (rpl->status != CPL_ERR_NONE)
-		log(LOG_ERR,
-		       "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
-		       rpl->status, GET_TID(rpl));
-
-	return CPL_RET_BUF_DONE;
-}
-
-static int
-do_set_tcb_rpl(struct t3cdev *dev, struct mbuf *m)
-{
-	struct cpl_set_tcb_rpl *rpl = cplhdr(m);
-
-	if (rpl->status != CPL_ERR_NONE)
-		log(LOG_ERR,
-		    "Unexpected SET_TCB_RPL status %u for tid %u\n",
-			rpl->status, GET_TID(rpl));
-	return CPL_RET_BUF_DONE;
-}
-
-static int
-do_trace(struct t3cdev *dev, struct mbuf *m)
-{
-#if 0
-	struct cpl_trace_pkt *p = cplhdr(m);
-
-
-	skb->protocol = 0xffff;
-	skb->dev = dev->lldev;
-	skb_pull(skb, sizeof(*p));
-	skb->mac.raw = mtod(m, (char *));
-	netif_receive_skb(skb);
-#endif	
-	return 0;
-}
-
-/*
- * Process a received packet with an unknown/unexpected CPL opcode.
- */
-static int
-do_bad_cpl(struct t3cdev *dev, struct mbuf *m)
-{
-	log(LOG_ERR, "%s: received bad CPL command 0x%x\n", dev->name,
-	    0xFF & *mtod(m, uint32_t *));
-	return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG);
-}
-
-/*
- * Handlers for each CPL opcode
- */
-static cpl_handler_func cpl_handlers[256];
-
-/*
- * T3CDEV's receive method.
- */
-int
-process_rx(struct t3cdev *dev, struct mbuf **m, int n)
-{
-	while (n--) {
-		struct mbuf *m0 = *m++;
-		unsigned int opcode = G_OPCODE(ntohl(m0->m_pkthdr.csum_data));
-		int ret;
-
-		DPRINTF("processing op=0x%x m=%p data=%p\n", opcode, m0, m0->m_data);
-		
-		ret = cpl_handlers[opcode] (dev, m0);
-
-#if VALIDATE_TID
-		if (ret & CPL_RET_UNKNOWN_TID) {
-			union opcode_tid *p = cplhdr(m0);
-
-			log(LOG_ERR, "%s: CPL message (opcode %u) had "
-			       "unknown TID %u\n", dev->name, opcode,
-			       G_TID(ntohl(p->opcode_tid)));
-		}
-#endif
-		if (ret & CPL_RET_BUF_DONE)
-			m_freem(m0);
-	}
-	return 0;
-}
-
-/*
- * Add a new handler to the CPL dispatch table.  A NULL handler may be supplied
- * to unregister an existing handler.
- */
-void
-t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h)
-{
-	if (opcode < NUM_CPL_CMDS)
-		cpl_handlers[opcode] = h ? h : do_bad_cpl;
-	else
-		log(LOG_ERR, "T3C: handler registration for "
-		       "opcode %x failed\n", opcode);
-}
-
-/*
- * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc.
- * The allocated memory is cleared.
- */
-void *
-cxgb_alloc_mem(unsigned long size)
-{
-
-	return malloc(size, M_CXGB, M_ZERO|M_NOWAIT);
-}
-
-/*
- * Free memory allocated through t3_alloc_mem().
- */
-void
-cxgb_free_mem(void *addr)
-{
-	free(addr, M_CXGB);
-}
-
-static __inline int
-adap2type(struct adapter *adapter) 
-{ 
-        int type = 0; 
- 
-        switch (adapter->params.rev) { 
-        case T3_REV_A: 
-                type = T3A; 
-                break; 
-        case T3_REV_B: 
-        case T3_REV_B2: 
-                type = T3B; 
-                break; 
-        case T3_REV_C: 
-                type = T3C; 
-                break; 
-        } 
-        return type; 
-}
-
-void
-cxgb_adapter_ofld(struct adapter *adapter)
-{
-	struct t3cdev *tdev = &adapter->tdev;
-
-	cxgb_set_dummy_ops(tdev);
-	tdev->type = adap2type(adapter);
-	tdev->adapter = adapter;
-	register_tdev(tdev);	
-
-}
-
-void
-cxgb_adapter_unofld(struct adapter *adapter)
-{
-	struct t3cdev *tdev = &adapter->tdev;
-
-	tdev->recv = NULL;
-	tdev->arp_update = NULL;
-	unregister_tdev(tdev);	
-}
-
-void
-cxgb_offload_init(void)
-{
-	int i;
-
-	if (inited++)
-		return;
-	
-	mtx_init(&cxgb_db_lock, "ofld db", NULL, MTX_DEF);
-
-	TAILQ_INIT(&client_list);
-	TAILQ_INIT(&ofld_dev_list);
-	
-	for (i = 0; i < 0x100; ++i)
-		cpl_handlers[i] = do_bad_cpl;
-	
-	t3_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl);
-	t3_register_cpl_handler(CPL_RTE_WRITE_RPL, do_rte_write_rpl);
-	t3_register_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl);
-
-	t3_register_cpl_handler(CPL_SET_TCB_RPL, do_set_tcb_rpl);
-	t3_register_cpl_handler(CPL_TRACE_PKT, do_trace);
-	
-}
-
-void 
-cxgb_offload_exit(void)
-{
-
-	if (--inited)
-		return;
-
-	mtx_destroy(&cxgb_db_lock);
-}
-
-MODULE_VERSION(if_cxgb, 1);
diff -r 7cec8c20120e sys/dev/cxgb/cxgb_offload.h
--- a/sys/dev/cxgb/cxgb_offload.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/cxgb_offload.h	Mon Jun 11 00:15:24 2012 -0700
@@ -1,4 +1,3 @@
-
 /**************************************************************************
 
 Copyright (c) 2007-2008, Chelsio Inc.
@@ -33,221 +32,93 @@
 #ifndef _CXGB_OFFLOAD_H
 #define _CXGB_OFFLOAD_H
 
-#include <common/cxgb_tcb.h>
-#include <t3cdev.h>
-
-MALLOC_DECLARE(M_CXGB);
+#ifdef TCP_OFFLOAD
+enum {
+	ULD_TOM = 1,
+	ULD_IWARP = 2,
+};
 
 struct adapter;
-struct cxgb_client;
-
-void cxgb_offload_init(void);
-void cxgb_offload_exit(void);
-
-void cxgb_adapter_ofld(struct adapter *adapter);
-void cxgb_adapter_unofld(struct adapter *adapter);
-int cxgb_offload_activate(struct adapter *adapter);
-void cxgb_offload_deactivate(struct adapter *adapter);
-int cxgb_ofld_recv(struct t3cdev *dev, struct mbuf **m, int n);
-
-void cxgb_set_dummy_ops(struct t3cdev *dev);
-
-
-/*
- * Client registration.  Users of T3 driver must register themselves.
- * The T3 driver will call the add function of every client for each T3
- * adapter activated, passing up the t3cdev ptr.  Each client fills out an
- * array of callback functions to process CPL messages.
- */
-
-void cxgb_register_client(struct cxgb_client *client);
-void cxgb_unregister_client(struct cxgb_client *client);
-void cxgb_add_clients(struct t3cdev *tdev);
-void cxgb_remove_clients(struct t3cdev *tdev);
-
-typedef int (*cxgb_cpl_handler_func)(struct t3cdev *dev,
-				      struct mbuf *m, void *ctx);
-
-struct l2t_entry;
-struct cxgb_client {
-	char 			*name;
-	void 			(*add) (struct t3cdev *);
-	void 			(*remove) (struct t3cdev *);
-	cxgb_cpl_handler_func 	*handlers;
-	int			(*redirect)(void *ctx, struct rtentry *old,
-					    struct rtentry *new,
-					    struct l2t_entry *l2t);
-	TAILQ_ENTRY(cxgb_client)         client_entry;
+struct uld_info {
+	SLIST_ENTRY(uld_info) link;
+	int refcount;
+	int uld_id;
+	int (*activate)(struct adapter *);
+	int (*deactivate)(struct adapter *);
 };
 
-/*
- * TID allocation services.
- */
-int cxgb_alloc_atid(struct t3cdev *dev, struct cxgb_client *client,
-		     void *ctx);
-int cxgb_alloc_stid(struct t3cdev *dev, struct cxgb_client *client,
-		     void *ctx);
-void *cxgb_free_atid(struct t3cdev *dev, int atid);
-void cxgb_free_stid(struct t3cdev *dev, int stid);
-void *cxgb_get_lctx(struct t3cdev *tdev, int stid);
-void cxgb_insert_tid(struct t3cdev *dev, struct cxgb_client *client,
-		      void *ctx,
-	unsigned int tid);
-void cxgb_queue_tid_release(struct t3cdev *dev, unsigned int tid);
-void cxgb_remove_tid(struct t3cdev *dev, void *ctx, unsigned int tid);
-
-struct toe_tid_entry {
-	struct cxgb_client 	*client;
-	void 			*ctx;
+struct tom_tunables {
+	int sndbuf;
+	int ddp;
+	int indsz;
+	int ddp_thres;
 };
 
 /* CPL message priority levels */
 enum {
 	CPL_PRIORITY_DATA = 0,     /* data messages */
-	CPL_PRIORITY_SETUP = 1,	   /* connection setup messages */
-	CPL_PRIORITY_TEARDOWN = 0, /* connection teardown messages */
-	CPL_PRIORITY_LISTEN = 1,   /* listen start/stop messages */
-	CPL_PRIORITY_ACK = 1,      /* RX ACK messages */
 	CPL_PRIORITY_CONTROL = 1   /* offload control messages */
 };
 
-/* Flags for return value of CPL message handlers */
-enum {
-	CPL_RET_BUF_DONE = 1,   // buffer processing done, buffer may be freed
-	CPL_RET_BAD_MSG = 2,    // bad CPL message (e.g., unknown opcode)
-	CPL_RET_UNKNOWN_TID = 4	// unexpected unknown TID
-};
+#define S_HDR_NDESC	0
+#define M_HDR_NDESC	0xf
+#define V_HDR_NDESC(x)	((x) << S_HDR_NDESC)
+#define G_HDR_NDESC(x)	(((x) >> S_HDR_NDESC) & M_HDR_NDESC)
 
-typedef int (*cpl_handler_func)(struct t3cdev *dev, struct mbuf *m);
+#define S_HDR_QSET	4
+#define M_HDR_QSET	0xf
+#define V_HDR_QSET(x)	((x) << S_HDR_QSET)
+#define G_HDR_QSET(x)	(((x) >> S_HDR_QSET) & M_HDR_QSET)
 
-/*
- * Returns a pointer to the first byte of the CPL header in an sk_buff that
- * contains a CPL message.
- */
-static inline void *cplhdr(struct mbuf *m)
+#define S_HDR_CTRL	8
+#define V_HDR_CTRL(x)	((x) << S_HDR_CTRL)
+#define F_HDR_CTRL	V_HDR_CTRL(1U)
+
+#define S_HDR_DF	9
+#define V_HDR_DF(x)	((x) << S_HDR_DF)
+#define F_HDR_DF	V_HDR_DF(1U)
+
+#define S_HDR_SGL	10
+#define V_HDR_SGL(x)	((x) << S_HDR_SGL)
+#define F_HDR_SGL	V_HDR_SGL(1U)
+
+struct ofld_hdr
 {
-	return mtod(m, uint8_t *);
-}
-
-void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h);
-
-union listen_entry {
-	struct toe_tid_entry toe_tid;
-	union listen_entry *next;
-};
-
-union active_open_entry {
-	struct toe_tid_entry toe_tid;
-	union active_open_entry *next;
+	void *sgl;	/* SGL, if F_HDR_SGL set in flags */
+	int plen;	/* amount of payload (in bytes) */
+	int flags;
 };
 
 /*
- * Holds the size, base address, free list start, etc of the TID, server TID,
- * and active-open TID tables for a offload device.
- * The tables themselves are allocated dynamically.
+ * Convenience function for fixed size CPLs that fit in 1 desc.
  */
-struct tid_info {
-	struct toe_tid_entry *tid_tab;
-	unsigned int ntids;
-	volatile unsigned int tids_in_use;
+#define M_GETHDR_OFLD(qset, ctrl, cpl) \
+    m_gethdr_ofld(qset, ctrl, sizeof(*cpl), (void **)&cpl)
+static inline struct mbuf *
+m_gethdr_ofld(int qset, int ctrl, int cpllen, void **cpl)
+{
+	struct mbuf *m;
+	struct ofld_hdr *oh;
 
-	union listen_entry *stid_tab;
-	unsigned int nstids;
-	unsigned int stid_base;
+	m = m_gethdr(M_NOWAIT, MT_DATA);
+	if (m == NULL)
+		return (NULL);
 
-	union active_open_entry *atid_tab;
-	unsigned int natids;
-	unsigned int atid_base;
+	oh = mtod(m, struct ofld_hdr *);
+	oh->flags = V_HDR_NDESC(1) | V_HDR_QSET(qset) | V_HDR_CTRL(ctrl);
+	*cpl = (void *)(oh + 1);
+	m->m_pkthdr.len = m->m_len = sizeof(*oh) + cpllen;
 
-	/*
-	 * The following members are accessed R/W so we put them in their own
-	 * cache lines.
-	 *
-	 * XXX We could combine the atid fields above with the lock here since
-	 * atids are use once (unlike other tids).  OTOH the above fields are
-	 * usually in cache due to tid_tab.
-	 */
-	struct mtx atid_lock /* ____cacheline_aligned_in_smp */;
-	union active_open_entry *afree;
-	unsigned int atids_in_use;
-
-	struct mtx stid_lock /*____cacheline_aligned */;
-	union listen_entry *sfree;
-	unsigned int stids_in_use;
-};
-
-struct t3c_data {
-	struct t3cdev *dev;
-	unsigned int tx_max_chunk;  /* max payload for TX_DATA */
-	unsigned int max_wrs;       /* max in-flight WRs per connection */
-	unsigned int nmtus;
-	const unsigned short *mtus;
-	struct tid_info tid_maps;
-
-	struct toe_tid_entry *tid_release_list;
-	struct mtx tid_release_lock;
-	struct task tid_release_task;
-};
-
-/*
- * t3cdev -> toe_data accessor
- */
-#define T3C_DATA(dev) (*(struct t3c_data **)&(dev)->l4opt)
-
-/*
- * Map an ATID or STID to their entries in the corresponding TID tables.
- */
-static inline union active_open_entry *atid2entry(const struct tid_info *t,
-                                                  unsigned int atid)
-{
-        return &t->atid_tab[atid - t->atid_base];
+	return (m);
 }
 
+int t3_register_uld(struct uld_info *);
+int t3_unregister_uld(struct uld_info *);
+int t3_activate_uld(struct adapter *, int);
+int t3_deactivate_uld(struct adapter *, int);
+#endif	/* TCP_OFFLOAD */
 
-static inline union listen_entry *stid2entry(const struct tid_info *t,
-                                             unsigned int stid)
-{
-        return &t->stid_tab[stid - t->stid_base];
-}
+#define CXGB_UNIMPLEMENTED() \
+    panic("IMPLEMENT: %s:%s:%d", __FUNCTION__, __FILE__, __LINE__)
 
-/*
- * Find the connection corresponding to a TID.
- */
-static inline struct toe_tid_entry *lookup_tid(const struct tid_info *t,
-                                               unsigned int tid)
-{
-        return tid < t->ntids ? &(t->tid_tab[tid]) : NULL;
-}
-
-/*
- * Find the connection corresponding to a server TID.
- */
-static inline struct toe_tid_entry *lookup_stid(const struct tid_info *t,
-                                                unsigned int tid)
-{
-        if (tid < t->stid_base || tid >= t->stid_base + t->nstids)
-                return NULL;
-        return &(stid2entry(t, tid)->toe_tid);
-}
-
-/*
- * Find the connection corresponding to an active-open TID.
- */
-static inline struct toe_tid_entry *lookup_atid(const struct tid_info *t,
-                                                unsigned int tid)
-{
-        if (tid < t->atid_base || tid >= t->atid_base + t->natids)
-                return NULL;
-        return &(atid2entry(t, tid)->toe_tid);
-}
-
-void *cxgb_alloc_mem(unsigned long size);
-void cxgb_free_mem(void *addr);
-void cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa);
-void cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa);
-int process_rx(struct t3cdev *dev, struct mbuf **m, int n);
-int attach_t3cdev(struct t3cdev *dev);
-void detach_t3cdev(struct t3cdev *dev);
-
-#define CXGB_UNIMPLEMENTED() panic("IMPLEMENT: %s:%s:%d", __FUNCTION__, __FILE__, __LINE__)
 #endif
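
NOTE: every offload mbuf now begins with a struct ofld_hdr carrying the
descriptor count, destination qset, and flags, replacing the old practice
(removed from cxgb_osdep.h below) of smuggling that state through m_pkthdr
fields; M_GETHDR_OFLD() sizes the mbuf for one fixed CPL and leaves *cpl
pointing just past the header.  A usage sketch (the CPL chosen and its field
values are illustrative only):

	static int
	send_tid_release(struct adapter *sc, int qset, unsigned int tid)
	{
		struct mbuf *m;
		struct cpl_tid_release *cpl;

		m = M_GETHDR_OFLD(qset, 0 /* not a ctrl WR */, cpl);
		if (m == NULL)
			return (ENOMEM);

		/* The ofld_hdr already says: 1 descriptor, this qset, no SGL. */
		cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
		OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));

		return (t3_offload_tx(sc, m));
	}
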
diff -r 7cec8c20120e sys/dev/cxgb/cxgb_osdep.h
--- a/sys/dev/cxgb/cxgb_osdep.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/cxgb_osdep.h	Mon Jun 11 00:15:24 2012 -0700
@@ -67,27 +67,6 @@
 } while (0)
 #endif
 
-#define m_get_priority(m) ((uintptr_t)(m)->m_pkthdr.rcvif)
-#define m_set_priority(m, pri) ((m)->m_pkthdr.rcvif = (struct ifnet *)((uintptr_t)pri))
-#define m_set_sgl(m, sgl) ((m)->m_pkthdr.header = (sgl))
-#define m_get_sgl(m) ((bus_dma_segment_t *)(m)->m_pkthdr.header)
-#define m_set_sgllen(m, len) ((m)->m_pkthdr.ether_vtag = len)
-#define m_get_sgllen(m) ((m)->m_pkthdr.ether_vtag)
-
-/*
- * XXX FIXME
- */
-#define m_set_toep(m, a) ((m)->m_pkthdr.header = (a))
-#define m_get_toep(m) ((m)->m_pkthdr.header)
-#define m_set_handler(m, handler) ((m)->m_pkthdr.header = (handler))
-
-#define m_set_socket(m, a) ((m)->m_pkthdr.header = (a))
-#define m_get_socket(m) ((m)->m_pkthdr.header)
-
-#define	KTR_CXGB	KTR_SPARE2
-
-#define MT_DONTFREE  128
-
 #if __FreeBSD_version < 800054
 #if defined (__GNUC__)
   #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
@@ -123,13 +102,6 @@
 
 #define CXGB_TX_CLEANUP_THRESHOLD        32
 
-
-#ifdef DEBUG_PRINT
-#define DPRINTF printf
-#else 
-#define DPRINTF(...)
-#endif
-
 #define TX_MAX_SIZE                (1 << 16)    /* 64KB                          */
 #define TX_MAX_SEGS                      36     /* maximum supported by card     */
 
@@ -199,7 +171,6 @@
 #define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | (1<<bit)), ((*(p)) & ~(1<<bit)))
 
 #define max_t(type, a, b) (type)max((a), (b))
-#define net_device ifnet
 #define cpu_to_be32            htobe32
 
 /* Standard PHY definitions */
diff -r 7cec8c20120e sys/dev/cxgb/cxgb_sge.c
--- a/sys/dev/cxgb/cxgb_sge.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/cxgb_sge.c	Mon Jun 11 00:15:24 2012 -0700
@@ -54,6 +54,7 @@
 #include <sys/systm.h>
 #include <sys/syslog.h>
 #include <sys/socket.h>
+#include <sys/sglist.h>
 
 #include <net/bpf.h>	
 #include <net/ethernet.h>
@@ -78,6 +79,10 @@
 int	txq_fills = 0;
 int	multiq_tx_enable = 1;
 
+#ifdef TCP_OFFLOAD
+CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
+#endif
+
 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
@@ -471,10 +476,17 @@
 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
 {
 
-	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
+	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
+		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
+		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
+	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
+		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
+		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
+	} else
+		m->m_len = IMMED_PKT_SIZE;
 	m->m_ext.ext_buf = NULL;
 	m->m_ext.ext_type = 0;
-	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE); 
+	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len); 
 	return (0);	
 }
 
@@ -703,7 +715,8 @@
 	cb_arg.error = 0;
 	while (n--) {
 		/*
-		 * We only allocate a cluster, mbuf allocation happens after rx
+		 * We allocate an uninitialized mbuf + cluster; the mbuf is
+		 * initialized after rx.
 		 */
 		if (q->zone == zone_pack) {
 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
@@ -1170,57 +1183,6 @@
 	return flits_to_desc(flits);
 }
 
-static unsigned int
-busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
-    struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
-{
-	struct mbuf *m0;
-	int err, pktlen, pass = 0;
-	bus_dma_tag_t tag = txq->entry_tag;
-
-retry:
-	err = 0;
-	m0 = *m;
-	pktlen = m0->m_pkthdr.len;
-#if defined(__i386__) || defined(__amd64__)
-	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
-		goto done;
-	} else
-#endif
-		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
-
-	if (err == 0) {
-		goto done;
-	}
-	if (err == EFBIG && pass == 0) {
-		pass = 1;
-		/* Too many segments, try to defrag */
-		m0 = m_defrag(m0, M_DONTWAIT);
-		if (m0 == NULL) {
-			m_freem(*m);
-			*m = NULL;
-			return (ENOBUFS);
-		}
-		*m = m0;
-		goto retry;
-	} else if (err == ENOMEM) {
-		return (err);
-	} if (err) {
-		if (cxgb_debug)
-			printf("map failure err=%d pktlen=%d\n", err, pktlen);
-		m_freem(m0);
-		*m = NULL;
-		return (err);
-	}
-done:
-#if !defined(__i386__) && !defined(__amd64__)
-	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
-#endif	
-	txsd->flags |= TX_SW_DESC_MAPPED;
-
-	return (0);
-}
-
 /**
  *	make_sgl - populate a scatter/gather list for a packet
  *	@sgp: the SGL to populate
@@ -1328,10 +1290,10 @@
 	
 	if (__predict_true(ndesc == 1)) {
 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
-			V_WR_SGLSFLT(flits)) | wr_hi,
-		    htonl(V_WR_LEN(flits + sgl_flits) |
-			V_WR_GEN(txqs->gen)) | wr_lo);
-		/* XXX gen? */
+		    V_WR_SGLSFLT(flits)) | wr_hi,
+		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
+		    wr_lo);
+
 		wr_gen2(txd, txqs->gen);
 		
 	} else {
@@ -1813,34 +1775,23 @@
  *	its entirety.
  */
 static __inline void
-write_imm(struct tx_desc *d, struct mbuf *m,
+write_imm(struct tx_desc *d, caddr_t src,
 	  unsigned int len, unsigned int gen)
 {
-	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
+	struct work_request_hdr *from = (struct work_request_hdr *)src;
 	struct work_request_hdr *to = (struct work_request_hdr *)d;
 	uint32_t wr_hi, wr_lo;
 
-	if (len > WR_LEN)
-		panic("len too big %d\n", len);
-	if (len < sizeof(*from))
-		panic("len too small %d", len);
+	KASSERT(len <= WR_LEN && len >= sizeof(*from),
+	    ("%s: invalid len %d", __func__, len));
 	
 	memcpy(&to[1], &from[1], len - sizeof(*from));
 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
-					V_WR_BCNTLFLT(len & 7));
-	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
-					V_WR_LEN((len + 7) / 8));
+	    V_WR_BCNTLFLT(len & 7));
+	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
 	set_wr_hdr(to, wr_hi, wr_lo);
 	wmb();
 	wr_gen2(d, gen);
-
-	/*
-	 * This check is a hack we should really fix the logic so
-	 * that this can't happen
-	 */
-	if (m->m_type != MT_DONTFREE)
-		m_freem(m);
-	
 }
 
 /**
@@ -1908,12 +1859,6 @@
 	q->cleaned += reclaim;
 }
 
-static __inline int
-immediate(const struct mbuf *m)
-{
-	return m->m_len <= WR_LEN  && m->m_pkthdr.len <= WR_LEN ;
-}
-
 /**
  *	ctrl_xmit - send a packet through an SGE control Tx queue
  *	@adap: the adapter
@@ -1931,11 +1876,8 @@
 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
 	
-	if (__predict_false(!immediate(m))) {
-		m_freem(m);
-		return 0;
-	}
-	
+	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
+
 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
 
@@ -1950,7 +1892,7 @@
 		}
 		goto again;
 	}
-	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
+	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
 	
 	q->in_use++;
 	if (++q->pidx >= q->size) {
@@ -1960,7 +1902,9 @@
 	TXQ_UNLOCK(qs);
 	wmb();
 	t3_write_reg(adap, A_SG_KDOORBELL,
-		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+
+	m_free(m);
 	return (0);
 }
 
@@ -1985,7 +1929,8 @@
 	while (q->in_use < q->size &&
 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
 
-		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
+		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
+		m_free(m);
 
 		if (++q->pidx >= q->size) {
 			q->pidx = 0;
@@ -2239,6 +2184,7 @@
 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
 #define NOMEM_INTR_DELAY 2500
 
+#ifdef TCP_OFFLOAD
 /**
  *	write_ofld_wr - write an offload work request
  *	@adap: the adapter
@@ -2252,71 +2198,66 @@
  *	data already carry the work request with most fields populated.
  */
 static void
-write_ofld_wr(adapter_t *adap, struct mbuf *m,
-    struct sge_txq *q, unsigned int pidx,
-    unsigned int gen, unsigned int ndesc,
-    bus_dma_segment_t *segs, unsigned int nsegs)
+write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
+    unsigned int pidx, unsigned int gen, unsigned int ndesc)
 {
 	unsigned int sgl_flits, flits;
+	int i, idx, nsegs, wrlen;
 	struct work_request_hdr *from;
-	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
+	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
 	struct tx_desc *d = &q->desc[pidx];
 	struct txq_state txqs;
-	
-	if (immediate(m) && nsegs == 0) {
-		write_imm(d, m, m->m_len, gen);
+	struct sglist_seg *segs;
+	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
+	struct sglist *sgl;
+
+	from = (void *)(oh + 1);	/* Start of WR within mbuf */
+	wrlen = m->m_len - sizeof(*oh);
+
+	if (!(oh->flags & F_HDR_SGL)) {
+		write_imm(d, (caddr_t)from, wrlen, gen);
+
+		/*
+		 * An mbuf with "real" immediate tx data will be enqueue_wr'd by
+		 * t3_push_frames and freed in wr_ack.  Others, like those sent
+		 * down by close_conn, t3_send_reset, etc., should be freed here.
+		 */
+		if (!(oh->flags & F_HDR_DF))
+			m_free(m);
 		return;
 	}
 
-	/* Only TX_DATA builds SGLs */
-	from = mtod(m, struct work_request_hdr *);
-	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
-
-	flits = m->m_len / 8;
-	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
-
-	make_sgl(sgp, segs, nsegs);
+	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
+
+	sgl = oh->sgl;
+	flits = wrlen / 8;
+	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
+
+	nsegs = sgl->sg_nseg;
+	segs = sgl->sg_segs;
+	for (idx = 0, i = 0; i < nsegs; i++) {
+		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
+		if (i && idx == 0)
+			++sgp;
+		sgp->len[idx] = htobe32(segs[i].ss_len);
+		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
+		idx ^= 1;
+	}
+	if (idx) {
+		sgp->len[idx] = 0;
+		sgp->addr[idx] = 0;
+	}
+
 	sgl_flits = sgl_len(nsegs);
-
 	txqs.gen = gen;
 	txqs.pidx = pidx;
 	txqs.compl = 0;
 
-	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
+	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
 	    from->wrh_hi, from->wrh_lo);
 }
 
 /**
- *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
- *	@m: the packet
- *
- * 	Returns the number of Tx descriptors needed for the given offload
- * 	packet.  These packets are already fully constructed.
- */
-static __inline unsigned int
-calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
-{
-	unsigned int flits, cnt = 0;
-	int ndescs;
-
-	if (m->m_len <= WR_LEN && nsegs == 0)
-		return (1);                 /* packet fits as immediate data */
-
-	/*
-	 * This needs to be re-visited for TOE
-	 */
-
-	cnt = nsegs;
-		
-	/* headers */
-	flits = m->m_len / 8;
-
-	ndescs = flits_to_desc(flits + sgl_len(cnt));
-
-	return (ndescs);
-}
-
-/**
  *	ofld_xmit - send a packet through an offload queue
  *	@adap: the adapter
  *	@q: the Tx offload queue
@@ -2327,28 +2268,19 @@
 static int
 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
 {
-	int ret, nsegs;
+	int ret;
 	unsigned int ndesc;
 	unsigned int pidx, gen;
 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
-	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
-	struct tx_sw_desc *stx;
-
-	nsegs = m_get_sgllen(m);
-	vsegs = m_get_sgl(m);
-	ndesc = calc_tx_descs_ofld(m, nsegs);
-	busdma_map_sgl(vsegs, segs, nsegs);
-
-	stx = &q->sdesc[q->pidx];
-	
+	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
+
+	ndesc = G_HDR_NDESC(oh->flags);
+
 	TXQ_LOCK(qs);
 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
 	if (__predict_false(ret)) {
 		if (ret == 1) {
-			printf("no ofld desc avail\n");
-			
-			m_set_priority(m, ndesc);     /* save for restart */
 			TXQ_UNLOCK(qs);
 			return (EINTR);
 		}
@@ -2363,16 +2295,11 @@
 		q->pidx -= q->size;
 		q->gen ^= 1;
 	}
-#ifdef T3_TRACE
-	T3_TRACE5(adap->tb[q->cntxt_id & 7],
-		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
-		  ndesc, pidx, skb->len, skb->len - skb->data_len,
-		  skb_shinfo(skb)->nr_frags);
-#endif
+
+	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
+	check_ring_tx_db(adap, q, 1);
 	TXQ_UNLOCK(qs);
 
-	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
-	check_ring_tx_db(adap, q, 1);
 	return (0);
 }
 
@@ -2389,16 +2316,15 @@
 	struct sge_qset *qs = data;
 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
 	adapter_t *adap = qs->port->adapter;
-	bus_dma_segment_t segs[TX_MAX_SEGS];
-	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
-	int nsegs, cleaned;
+	int cleaned;
 		
 	TXQ_LOCK(qs);
 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
 
 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
 		unsigned int gen, pidx;
-		unsigned int ndesc = m_get_priority(m);
+		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
+		unsigned int ndesc = G_HDR_NDESC(oh->flags);
 
 		if (__predict_false(q->size - q->in_use < ndesc)) {
 			setbit(&qs->txq_stopped, TXQ_OFLD);
@@ -2419,9 +2345,8 @@
 		}
 		
 		(void)mbufq_dequeue(&q->sendq);
-		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
 		TXQ_UNLOCK(qs);
-		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
+		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
 		TXQ_LOCK(qs);
 	}
 #if USE_GTS
@@ -2435,34 +2360,7 @@
 }
 
 /**
- *	queue_set - return the queue set a packet should use
- *	@m: the packet
- *
- *	Maps a packet to the SGE queue set it should use.  The desired queue
- *	set is carried in bits 1-3 in the packet's priority.
- */
-static __inline int
-queue_set(const struct mbuf *m)
-{
-	return m_get_priority(m) >> 1;
-}
-
-/**
- *	is_ctrl_pkt - return whether an offload packet is a control packet
- *	@m: the packet
- *
- *	Determines whether an offload packet should use an OFLD or a CTRL
- *	Tx queue.  This is indicated by bit 0 in the packet's priority.
- */
-static __inline int
-is_ctrl_pkt(const struct mbuf *m)
-{
-	return m_get_priority(m) & 1;
-}
-
-/**
  *	t3_offload_tx - send an offload packet
- *	@tdev: the offload device to send to
  *	@m: the packet
  *
  *	Sends an offload packet.  We use the packet priority to select the
@@ -2470,77 +2368,35 @@
  *	should be sent as regular or control, bits 1-3 select the queue set.
  */
 int
-t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
+t3_offload_tx(struct adapter *sc, struct mbuf *m)
 {
-	adapter_t *adap = tdev2adap(tdev);
-	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
-
-	if (__predict_false(is_ctrl_pkt(m))) 
-		return ctrl_xmit(adap, qs, m);
-
-	return ofld_xmit(adap, qs, m);
+	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
+	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
+
+	if (oh->flags & F_HDR_CTRL) {
+		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
+		return (ctrl_xmit(sc, qs, m));
+	} else
+		return (ofld_xmit(sc, qs, m));
 }
-
-/**
- *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
- *	@tdev: the offload device that will be receiving the packets
- *	@q: the SGE response queue that assembled the bundle
- *	@m: the partial bundle
- *	@n: the number of packets in the bundle
- *
- *	Delivers a (partial) bundle of Rx offload packets to an offload device.
- */
-static __inline void
-deliver_partial_bundle(struct t3cdev *tdev,
-			struct sge_rspq *q,
-			struct mbuf *mbufs[], int n)
-{
-	if (n) {
-		q->offload_bundles++;
-		cxgb_ofld_recv(tdev, mbufs, n);
-	}
-}
-
-static __inline int
-rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
-    struct mbuf *m, struct mbuf *rx_gather[],
-    unsigned int gather_idx)
-{
-	
-	rq->offload_pkts++;
-	m->m_pkthdr.header = mtod(m, void *);
-	rx_gather[gather_idx++] = m;
-	if (gather_idx == RX_BUNDLE_SIZE) {
-		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
-		gather_idx = 0;
-		rq->offload_bundles++;
-	}
-	return (gather_idx);
-}
+#endif
 
 static void
 restart_tx(struct sge_qset *qs)
 {
 	struct adapter *sc = qs->port->adapter;
-	
-	
+
 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
 		qs->txq[TXQ_OFLD].restarts++;
-		DPRINTF("restarting TXQ_OFLD\n");
 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
 	}
-	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
-	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
-	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
-	    qs->txq[TXQ_CTRL].in_use);
-	
+
 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
 		qs->txq[TXQ_CTRL].restarts++;
-		DPRINTF("restarting TXQ_CTRL\n");
 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
 	}
 }
@@ -2569,6 +2425,7 @@
 
 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
 	q->port = pi;
+	q->adap = sc;
 
 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
@@ -2630,8 +2487,10 @@
 		q->txq[i].gen = 1;
 		q->txq[i].size = p->txq_size[i];
 	}
-	
+
+#ifdef TCP_OFFLOAD
 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
+#endif
 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
@@ -2736,8 +2595,7 @@
 	
 	mtx_unlock_spin(&sc->sge.reg_lock);
 	t3_update_qset_coalesce(q, p);
-	q->port = pi;
-	
+
 	refill_fl(sc, &q->fl[0], q->fl[0].size);
 	refill_fl(sc, &q->fl[1], q->fl[1].size);
 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
@@ -2768,8 +2626,6 @@
 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
 	struct ifnet *ifp = pi->ifp;
 	
-	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
-
 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
 	    cpl->csum_valid && cpl->csum == 0xffff) {
 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
@@ -2967,8 +2823,6 @@
 	int skip_lro;
 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
 #endif
-	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
-	int ngathered = 0;
 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
 #ifdef DEBUG	
 	static int last_holdoff = 0;
@@ -2982,10 +2836,10 @@
 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
 		int eth, eop = 0, ethpad = 0;
 		uint32_t flags = ntohl(r->flags);
-		uint32_t rss_csum = *(const uint32_t *)r;
 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
+		uint8_t opcode = r->rss_hdr.opcode;
 		
-		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
+		eth = (opcode == CPL_RX_PKT);
 		
 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
 			struct mbuf *m;
@@ -3005,27 +2859,27 @@
                         memcpy(mtod(m, char *), r, AN_PKT_SIZE);
 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
                         *mtod(m, char *) = CPL_ASYNC_NOTIF;
-			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
+			opcode = CPL_ASYNC_NOTIF;
 			eop = 1;
                         rspq->async_notif++;
 			goto skip;
 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
-			struct mbuf *m = NULL;
-
-			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
-			    r->rss_hdr.opcode, rspq->cidx);
-			if (mh->mh_head == NULL)
-				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
-                        else 
-				m = m_gethdr(M_DONTWAIT, MT_DATA);
-
-			if (mh->mh_head == NULL &&  m == NULL) {	
+			struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
+
+			if (m == NULL) {
 		no_mem:
 				rspq->next_holdoff = NOMEM_INTR_DELAY;
 				budget_left--;
 				break;
 			}
-			get_imm_packet(adap, r, mh->mh_head);
+			if (mh->mh_head == NULL)
+				mh->mh_head = m;
+                        else 
+				mh->mh_tail->m_next = m;
+			mh->mh_tail = m;
+
+			get_imm_packet(adap, r, m);
+			mh->mh_head->m_pkthdr.len += m->m_len;
 			eop = 1;
 			rspq->imm_data++;
 		} else if (r->len_cq) {
@@ -3048,30 +2902,14 @@
 			handle_rsp_cntrl_info(qs, flags);
 		}
 
-		r++;
-		if (__predict_false(++rspq->cidx == rspq->size)) {
-			rspq->cidx = 0;
-			rspq->gen ^= 1;
-			r = rspq->desc;
-		}
-
-		if (++rspq->credits >= 64) {
-			refill_rspq(adap, rspq, rspq->credits);
-			rspq->credits = 0;
-		}
 		if (!eth && eop) {
-			mh->mh_head->m_pkthdr.csum_data = rss_csum;
-			/*
-			 * XXX size mismatch
-			 */
-			m_set_priority(mh->mh_head, rss_hash);
-
-			
-			ngathered = rx_offload(&adap->tdev, rspq,
-			    mh->mh_head, offload_mbufs, ngathered);
+			rspq->offload_pkts++;
+#ifdef TCP_OFFLOAD
+			adap->cpl_handler[opcode](qs, r, mh->mh_head);
+#else
+			m_freem(mh->mh_head);
+#endif
 			mh->mh_head = NULL;
-			DPRINTF("received offload packet\n");
-			
 		} else if (eth && eop) {
 			struct mbuf *m = mh->mh_head;
 
@@ -3106,13 +2944,23 @@
 			mh->mh_head = NULL;
 
 		}
+
+		r++;
+		if (__predict_false(++rspq->cidx == rspq->size)) {
+			rspq->cidx = 0;
+			rspq->gen ^= 1;
+			r = rspq->desc;
+		}
+
+		if (++rspq->credits >= 64) {
+			refill_rspq(adap, rspq, rspq->credits);
+			rspq->credits = 0;
+		}
 		__refill_fl_lt(adap, &qs->fl[0], 32);
 		__refill_fl_lt(adap, &qs->fl[1], 32);
 		--budget_left;
 	}
 
-	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
-
 #if defined(INET6) || defined(INET)
 	/* Flush LRO */
 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
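
Note on the new contract: t3_offload_tx(), ofld_xmit() and write_ofld_wr()
above all expect an offload mbuf to begin with a struct ofld_hdr carrying
the target queue set, the descriptor count, and an optional sglist.  A
minimal sketch of a conforming sender follows; the V_HDR_NDESC()/
V_HDR_QSET() setter macros and the exact ofld_hdr layout are assumptions
here (they mirror the G_HDR_NDESC()/G_HDR_QSET() getters used above and
live in the driver headers).

static struct mbuf *
alloc_imm_wr_mbuf(int qset, int ndesc, int wrlen)
{
	struct mbuf *m;
	struct ofld_hdr *oh;

	m = m_gethdr(M_DONTWAIT, MT_DATA);	/* wrlen must fit in MHLEN */
	if (m == NULL)
		return (NULL);

	m->m_len = m->m_pkthdr.len = sizeof(*oh) + wrlen;
	oh = mtod(m, struct ofld_hdr *);
	oh->sgl = NULL;		/* F_HDR_SGL clear: immediate data only */
	oh->flags = V_HDR_NDESC(ndesc) | V_HDR_QSET(qset);

	/* The caller writes the WR at (oh + 1), then calls t3_offload_tx(). */
	return (m);
}
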
diff -r 7cec8c20120e sys/dev/cxgb/sys/mvec.h
--- a/sys/dev/cxgb/sys/mvec.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/sys/mvec.h	Mon Jun 11 00:15:24 2012 -0700
@@ -31,15 +31,6 @@
 #define _MVEC_H_
 #include <machine/bus.h>
 
-#define	M_DDP		0x200000	/* direct data placement mbuf */
-#define	EXT_PHYS	10		/* physical/bus address  */
-
-#define m_cur_offset	m_ext.ext_size		/* override to provide ddp offset */
-#define m_seq		m_pkthdr.csum_data	/* stored sequence */
-#define m_ddp_gl	m_ext.ext_buf		/* ddp list	*/
-#define m_ddp_flags	m_pkthdr.csum_flags	/* ddp flags	*/
-#define m_ulp_mode	m_pkthdr.tso_segsz	/* upper level protocol	*/
-
 static __inline void
 busdma_map_mbuf_fast(bus_dma_tag_t tag, bus_dmamap_t map,
     struct mbuf *m, bus_dma_segment_t *seg)
@@ -58,17 +49,6 @@
     struct mbuf **m, bus_dma_segment_t *segs, int *nsegs);
 void busdma_map_sg_vec(bus_dma_tag_t tag, bus_dmamap_t map,
     struct mbuf *m, bus_dma_segment_t *segs, int *nsegs);
-static __inline int
-busdma_map_sgl(bus_dma_segment_t *vsegs, bus_dma_segment_t *segs, int count) 
-{
-	while (count--) {
-		segs->ds_addr = pmap_kextract((vm_offset_t)vsegs->ds_addr);
-		segs->ds_len = vsegs->ds_len;
-		segs++;
-		vsegs++;
-	}
-	return (0);
-}
 
 static __inline void
 m_freem_list(struct mbuf *m)
@@ -84,5 +64,4 @@
 	}	
 }
 
-
 #endif /* _MVEC_H_ */
diff -r 7cec8c20120e sys/dev/cxgb/t3cdev.h
--- a/sys/dev/cxgb/t3cdev.h	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,62 +0,0 @@
-/*-
- * Copyright (c) 2007-2008, Chelsio Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-#ifndef _T3CDEV_H_
-#define _T3CDEV_H_
-
-#define T3CNAMSIZ 16
-
-/* Get the t3cdev associated with an ifnet */
-#define T3CDEV(ifp) (&(((struct port_info *)(ifp)->if_softc))->adapter->tdev)
-
-struct cxgb3_client;
-
-enum t3ctype {
-        T3A = 0,
-        T3B,
-	T3C
-};
-
-struct t3cdev {
-	char name[T3CNAMSIZ];		    /* T3C device name */
-	enum t3ctype type;
-	TAILQ_ENTRY(t3cdev) entry;  /* for list linking */
-        struct ifnet *lldev;     /* LL dev associated with T3C messages */
-	struct adapter *adapter;			    
-	int (*send)(struct t3cdev *dev, struct mbuf *m);
-	int (*recv)(struct t3cdev *dev, struct mbuf **m, int n);
-	int (*ctl)(struct t3cdev *dev, unsigned int req, void *data);
-	void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr, struct sockaddr *sa);
-	void *priv;                         /* driver private data */
-	void *l2opt;                        /* optional layer 2 data */
-	void *l3opt;                        /* optional layer 3 data */
-	void *l4opt;                        /* optional layer 4 data */
-	void *ulp;			    /* ulp stuff */
-};
-
-#endif /* _T3CDEV_H_ */
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c	Mon Jun 11 00:15:24 2012 -0700
@@ -29,11 +29,12 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -54,20 +55,14 @@
 #include <sys/proc.h>
 #include <sys/eventhandler.h>
 
-#if __FreeBSD_version < 800044
-#define V_ifnet ifnet
-#endif
+#include <netinet/in.h>
+#include <netinet/toecore.h>
 
-#include <net/if.h>
-#include <net/if_var.h>
-#if __FreeBSD_version >= 800056
-#include <net/vnet.h>
-#endif
+#include <rdma/ib_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
-#include <netinet/in.h>
-
-#include <contrib/rdma/ib_verbs.h>
-
+#ifdef TCP_OFFLOAD
 #include <cxgb_include.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
 #include <ulp/iw_cxgb/iw_cxgb_hal.h>
@@ -75,26 +70,21 @@
 #include <ulp/iw_cxgb/iw_cxgb_cm.h>
 #include <ulp/iw_cxgb/iw_cxgb.h>
 
-/*
- * XXX :-/
- * 
- */
+static int iwch_mod_load(void);
+static int iwch_mod_unload(void);
+static int iwch_activate(struct adapter *);
+static int iwch_deactivate(struct adapter *);
 
-#define idr_init(x)
-
-cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
-
-static void open_rnic_dev(struct t3cdev *);
-static void close_rnic_dev(struct t3cdev *);
-
-static TAILQ_HEAD( ,iwch_dev) dev_list;
-static struct mtx dev_mutex;
-static eventhandler_tag event_tag;
+static struct uld_info iwch_uld_info = {
+	.uld_id = ULD_IWARP,
+	.activate = iwch_activate,
+	.deactivate = iwch_deactivate,
+};
 
 static void
 rnic_init(struct iwch_dev *rnicp)
 {
-	CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__,  rnicp);
+
 	idr_init(&rnicp->cqidr);
 	idr_init(&rnicp->qpidr);
 	idr_init(&rnicp->mmidr);
@@ -103,15 +93,16 @@
 	rnicp->attr.vendor_id = 0x168;
 	rnicp->attr.vendor_part_id = 7;
 	rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
-	rnicp->attr.max_wrs = (1UL << 24) - 1;
+	rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
 	rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
 	rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
 	rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
-	rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1;
+	rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH;
 	rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
 	rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
 	rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
-	rnicp->attr.mem_pgsizes_bitmask = 0x7FFF;	/* 4KB-128MB */
+	rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK;
+	rnicp->attr.max_mr_size = T3_MAX_MR_SIZE;
 	rnicp->attr.can_resize_wq = 0;
 	rnicp->attr.max_rdma_reads_per_qp = 8;
 	rnicp->attr.max_rdma_read_resources =
@@ -127,170 +118,183 @@
 	rnicp->attr.zbva_support = 1;
 	rnicp->attr.local_invalidate_fence = 1;
 	rnicp->attr.cq_overflow_detection = 1;
+
 	return;
 }
 
 static void
-open_rnic_dev(struct t3cdev *tdev)
+rnic_uninit(struct iwch_dev *rnicp)
+{
+	idr_destroy(&rnicp->cqidr);
+	idr_destroy(&rnicp->qpidr);
+	idr_destroy(&rnicp->mmidr);
+	mtx_destroy(&rnicp->lock);
+}
+
+static int
+iwch_activate(struct adapter *sc)
 {
 	struct iwch_dev *rnicp;
-	static int vers_printed;
+	int rc;
 
-	CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__,  tdev);
-	if (!vers_printed++)
-		printf("Chelsio T3 RDMA Driver - version x.xx\n");
+	KASSERT(!isset(&sc->offload_map, MAX_NPORTS),
+	    ("%s: iWARP already activated on %s", __func__,
+	    device_get_nameunit(sc->dev)));
+
 	rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
-	if (!rnicp) {
-		printf("Cannot allocate ib device\n");
-		return;
-	}
-	rnicp->rdev.ulp = rnicp;
-	rnicp->rdev.t3cdev_p = tdev;
+	if (rnicp == NULL)
+		return (ENOMEM);
 
-	mtx_lock(&dev_mutex);
+	sc->iwarp_softc = rnicp;
+	rnicp->rdev.adap = sc;
 
-	if (cxio_rdev_open(&rnicp->rdev)) {
-		mtx_unlock(&dev_mutex);
+	cxio_hal_init(sc);
+	iwch_cm_init_cpl(sc);
+
+	rc = cxio_rdev_open(&rnicp->rdev);
+	if (rc != 0) {
 		printf("Unable to open CXIO rdev\n");
-		ib_dealloc_device(&rnicp->ibdev);
-		return;
+		goto err1;
 	}
 
 	rnic_init(rnicp);
 
-	TAILQ_INSERT_TAIL(&dev_list, rnicp, entry);
-	mtx_unlock(&dev_mutex);
+	rc = iwch_register_device(rnicp);
+	if (rc != 0) {
+		printf("Unable to register device\n");
+		goto err2;
+	}
 
-	if (iwch_register_device(rnicp)) {
-		printf("Unable to register device\n");
-		close_rnic_dev(tdev);
-	}
-#ifdef notyet	
-	printf("Initialized device %s\n",
-	       pci_name(rnicp->rdev.rnic_info.pdev));
-#endif	
-	return;
+	return (0);
+
+err2:
+	rnic_uninit(rnicp);
+	cxio_rdev_close(&rnicp->rdev);
+err1:
+	cxio_hal_uninit(sc);
+	iwch_cm_term_cpl(sc);
+	sc->iwarp_softc = NULL;
+
+	return (rc);
+}
+
+static int
+iwch_deactivate(struct adapter *sc)
+{
+	struct iwch_dev *rnicp;
+
+	rnicp = sc->iwarp_softc;
+
+	iwch_unregister_device(rnicp);
+	rnic_uninit(rnicp);
+	cxio_rdev_close(&rnicp->rdev);
+	cxio_hal_uninit(sc);
+	iwch_cm_term_cpl(sc);
+	ib_dealloc_device(&rnicp->ibdev);
+
+	sc->iwarp_softc = NULL;
+
+	return (0);
 }
 
 static void
-close_rnic_dev(struct t3cdev *tdev)
+iwch_activate_all(struct adapter *sc, void *arg __unused)
 {
-	struct iwch_dev *dev, *tmp;
-	CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__,  tdev);
-	mtx_lock(&dev_mutex);
-
-	TAILQ_FOREACH_SAFE(dev, &dev_list, entry, tmp) {
-		if (dev->rdev.t3cdev_p == tdev) {
-#ifdef notyet			
-			list_del(&dev->entry);
-			iwch_unregister_device(dev);
-			cxio_rdev_close(&dev->rdev);
-			idr_destroy(&dev->cqidr);
-			idr_destroy(&dev->qpidr);
-			idr_destroy(&dev->mmidr);
-			ib_dealloc_device(&dev->ibdev);
-#endif			
-			break;
-		}
-	}
-	mtx_unlock(&dev_mutex);
-}
-
-static ifaddr_event_handler_t
-ifaddr_event_handler(void *arg, struct ifnet *ifp)
-{
-	printf("%s if name %s \n", __FUNCTION__, ifp->if_xname);
-	if (ifp->if_capabilities & IFCAP_TOE4) {
-		KASSERT(T3CDEV(ifp) != NULL, ("null t3cdev ptr!"));
-		if (cxio_hal_find_rdev_by_t3cdev(T3CDEV(ifp)) == NULL)
-			open_rnic_dev(T3CDEV(ifp));
-	}
-	return 0;
-}
-
-
-static int
-iwch_init_module(void)
-{
-	VNET_ITERATOR_DECL(vnet_iter);
-	int err;
-	struct ifnet *ifp;
-
-	printf("%s enter\n", __FUNCTION__);
-	TAILQ_INIT(&dev_list);
-	mtx_init(&dev_mutex, "iwch dev_list lock", NULL, MTX_DEF);
-	
-	err = cxio_hal_init();
-	if (err)
-		return err;
-	err = iwch_cm_init();
-	if (err)
-		return err;
-	cxio_register_ev_cb(iwch_ev_dispatch);
-
-	/* Register for ifaddr events to dynamically add TOE devs */
-	event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_event_handler,
-			NULL, EVENTHANDLER_PRI_ANY);
-
-	/* Register existing TOE interfaces by walking the ifnet chain */
-	IFNET_RLOCK();
-	VNET_LIST_RLOCK();
-	VNET_FOREACH(vnet_iter) {
-		CURVNET_SET(vnet_iter); /* XXX CURVNET_SET_QUIET() ? */
-		TAILQ_FOREACH(ifp, &V_ifnet, if_link)
-			(void)ifaddr_event_handler(NULL, ifp);
-		CURVNET_RESTORE();
-	}
-	VNET_LIST_RUNLOCK();
-	IFNET_RUNLOCK();
-	return 0;
+	ADAPTER_LOCK(sc);
+	if ((sc->open_device_map & sc->offload_map) != 0 &&
+	    t3_activate_uld(sc, ULD_IWARP) == 0)
+		setbit(&sc->offload_map, MAX_NPORTS);
+	ADAPTER_UNLOCK(sc);
 }
 
 static void
-iwch_exit_module(void)
+iwch_deactivate_all(struct adapter *sc, void *arg __unused)
 {
-	EVENTHANDLER_DEREGISTER(ifaddr_event, event_tag);
-	cxio_unregister_ev_cb(iwch_ev_dispatch);
-	iwch_cm_term();
-	cxio_hal_exit();
+	ADAPTER_LOCK(sc);
+	if (isset(&sc->offload_map, MAX_NPORTS) &&
+	    t3_deactivate_uld(sc, ULD_IWARP) == 0)
+		clrbit(&sc->offload_map, MAX_NPORTS);
+	ADAPTER_UNLOCK(sc);
 }
 
-static int 
-iwch_load(module_t mod, int cmd, void *arg)
+static int
+iwch_mod_load(void)
 {
-        int err = 0;
+	int rc;
 
-        switch (cmd) {
-        case MOD_LOAD:
-                printf("Loading iw_cxgb.\n");
+	rc = iwch_cm_init();
+	if (rc != 0)
+		return (rc);
 
-                iwch_init_module();
-                break;
-        case MOD_QUIESCE:
-                break;
-        case MOD_UNLOAD:
-                printf("Unloading iw_cxgb.\n");
-		iwch_exit_module();
-                break;
-        case MOD_SHUTDOWN:
-                break;
-        default:
-                err = EOPNOTSUPP;
-                break;
-        }
+	rc = t3_register_uld(&iwch_uld_info);
+	if (rc != 0) {
+		iwch_cm_term();
+		return (rc);
+	}
 
-        return (err);
+	t3_iterate(iwch_activate_all, NULL);
+
+	return (rc);
 }
 
-static moduledata_t mod_data = {
+static int
+iwch_mod_unload(void)
+{
+	t3_iterate(iwch_deactivate_all, NULL);
+
+	if (t3_unregister_uld(&iwch_uld_info) == EBUSY)
+		return (EBUSY);
+
+	iwch_cm_term();
+
+	return (0);
+}
+#endif	/* TCP_OFFLOAD */
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+
+static int
+iwch_modevent(module_t mod, int cmd, void *arg)
+{
+	int rc = 0;
+
+#ifdef TCP_OFFLOAD
+	switch (cmd) {
+	case MOD_LOAD:
+		rc = iwch_mod_load();
+		if (rc)
+			printf("iw_cxgb: Chelsio T3 RDMA Driver failed to load\n");
+		else
+			printf("iw_cxgb: Chelsio T3 RDMA Driver loaded\n");
+		break;
+
+	case MOD_UNLOAD:
+		rc = iwch_mod_unload();
+		if (rc)
+			printf("iw_cxgb: Chelsio T3 RDMA Driver failed to unload\n");
+		else
+			printf("iw_cxgb: Chelsio T3 RDMA Driver unloaded\n");
+		break;
+
+	default:
+		rc = EINVAL;
+	}
+#else
+	printf("iw_cxgb: compiled without TCP_OFFLOAD support.\n");
+	rc = EOPNOTSUPP;
+#endif
+	return (rc);
+}
+
+static moduledata_t iwch_mod_data = {
 	"iw_cxgb",
-	iwch_load,
+	iwch_modevent,
 	0
 };
 
 MODULE_VERSION(iw_cxgb, 1);
-DECLARE_MODULE(iw_cxgb, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
-MODULE_DEPEND(iw_cxgb, rdma_core, 1, 1, 1);
-MODULE_DEPEND(iw_cxgb, if_cxgb, 1, 1, 1);
+DECLARE_MODULE(iw_cxgb, iwch_mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
+MODULE_DEPEND(iw_cxgb, cxgbc, 1, 1, 1);
+MODULE_DEPEND(iw_cxgb, toecore, 1, 1, 1);
 MODULE_DEPEND(iw_cxgb, t3_tom, 1, 1, 1);
-
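
Activation state worth noting: alongside the per-port TOE bits, the patch
uses bit MAX_NPORTS of sc->offload_map to record whether the iWARP ULD is
active on the adapter (see iwch_activate_all()/iwch_deactivate_all()
above).  A hypothetical query helper, assuming the same bitmap macros and
that the caller holds the adapter lock:

static __inline int
iwch_is_active(struct adapter *sc)
{
	/* caller is assumed to hold ADAPTER_LOCK(sc) */
	return (isset(&sc->offload_map, MAX_NPORTS));
}
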
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h	Mon Jun 11 00:15:24 2012 -0700
@@ -37,6 +37,13 @@
 struct iwch_qp;
 struct iwch_mr;
 
+enum t3ctype {
+	T3A = 0,
+	T3B,
+	T3C
+};
+
+#define PAGE_MASK_IWARP (~(PAGE_SIZE-1))
 
 struct iwch_rnic_attributes {
 	u32 vendor_id;
@@ -57,6 +64,7 @@
 	 * size (4k)^i.  Phys block list mode unsupported.
 	 */
 	u32 mem_pgsizes_bitmask;
+	u64 max_mr_size;
 	u8 can_resize_wq;
 
 	/*
@@ -97,9 +105,9 @@
 	struct cxio_rdev rdev;
 	u32 device_cap_flags;
 	struct iwch_rnic_attributes attr;
-	struct kvl cqidr;
-	struct kvl qpidr;
-	struct kvl mmidr;
+	struct idr cqidr;
+	struct idr qpidr;
+	struct idr mmidr;
 	struct mtx lock;
 	TAILQ_ENTRY(iwch_dev) entry;
 };
@@ -113,40 +121,43 @@
 	return container_of(ibdev, struct iwch_dev, ibdev);
 }
 
-static inline int t3b_device(const struct iwch_dev *rhp)
+static inline int t3b_device(const struct iwch_dev *rhp __unused)
 {
-	return rhp->rdev.t3cdev_p->type == T3B;
+	return (0);
 }
 
-static inline int t3a_device(const struct iwch_dev *rhp)
+static inline int t3a_device(const struct iwch_dev *rhp __unused)
 {
-	return rhp->rdev.t3cdev_p->type == T3A;
+	return (0);
 }
 
 static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid)
 {
-	return kvl_lookup(&rhp->cqidr, cqid);
+	return idr_find(&rhp->cqidr, cqid);
 }
 
 static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid)
 {
-	return kvl_lookup(&rhp->qpidr, qpid);
+	return idr_find(&rhp->qpidr, qpid);
 }
 
 static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid)
 {
-	return kvl_lookup(&rhp->mmidr, mmid);
+	return idr_find(&rhp->mmidr, mmid);
 }
 
-static inline int insert_handle(struct iwch_dev *rhp, struct kvl *kvlp,
+static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr,
 				void *handle, u32 id)
 {
 	int ret;
 	u32 newid;
 
 	do {
+		if (!idr_pre_get(idr, GFP_KERNEL))
+			return -ENOMEM;
 		mtx_lock(&rhp->lock);
-		ret = kvl_alloc_above(kvlp, handle, id, &newid);
+		ret = idr_get_new_above(idr, handle, id, &newid);
 		WARN_ON(ret != 0);
 		WARN_ON(!ret && newid != id);
 		mtx_unlock(&rhp->lock);
@@ -155,14 +166,12 @@
 	return ret;
 }
 
-static inline void remove_handle(struct iwch_dev *rhp, struct kvl *kvlp, u32 id)
+static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id)
 {
 	mtx_lock(&rhp->lock);
-	kvl_delete(kvlp, id);
+	idr_remove(idr, id);
 	mtx_unlock(&rhp->lock);
 }
 
-extern struct cxgb_client t3c_client;
-extern cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
-extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m);
+void iwch_ev_dispatch(struct iwch_dev *, struct mbuf *);
 #endif
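
The kvl-to-idr conversion above preserves the caller contract:
insert_handle() retries idr_pre_get()/idr_get_new_above() until the
requested id is stored, and lookups go through idr_find() under
rhp->lock.  A short hypothetical caller showing the intended pairing
(register_cq_handle() is illustration only, not part of the patch):

static int
register_cq_handle(struct iwch_dev *rhp, struct iwch_cq *chp, u32 cqid)
{
	int rc;

	rc = insert_handle(rhp, &rhp->cqidr, chp, cqid);
	if (rc)
		return rc;	/* -ENOMEM from idr_pre_get() */
	KASSERT(get_chp(rhp, cqid) == chp,
	    ("%s: cqidr lookup mismatch", __func__));
	return 0;
}
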
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c	Mon Jun 11 00:15:24 2012 -0700
@@ -29,11 +29,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -66,13 +68,17 @@
 #include <netinet/tcp.h>
 #include <netinet/tcpip.h>
 
-#include <contrib/rdma/ib_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
 #include <cxgb_include.h>
 #include <ulp/tom/cxgb_tom.h>
-#include <ulp/tom/cxgb_t3_ddp.h>
-#include <ulp/tom/cxgb_defs.h>
 #include <ulp/tom/cxgb_toepcb.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
+#include <rdma/ib_verbs.h>
+#include <linux/idr.h>
+
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
 #include <ulp/iw_cxgb/iw_cxgb_hal.h>
 #include <ulp/iw_cxgb/iw_cxgb_provider.h>
@@ -97,46 +103,46 @@
 };
 #endif
 
-SYSCTL_NODE(_hw, OID_AUTO, cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters");
+SYSCTL_NODE(_hw, OID_AUTO, iw_cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters");
 
-static int ep_timeout_secs = 10;
+static int ep_timeout_secs = 60;
 TUNABLE_INT("hw.iw_cxgb.ep_timeout_secs", &ep_timeout_secs);
-SYSCTL_INT(_hw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RDTUN, &ep_timeout_secs, 0,
-    "CM Endpoint operation timeout in seconds (default=10)");
+SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RW, &ep_timeout_secs, 0,
+    "CM Endpoint operation timeout in seconds (default=60)");
 
 static int mpa_rev = 1;
 TUNABLE_INT("hw.iw_cxgb.mpa_rev", &mpa_rev);
-SYSCTL_INT(_hw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RDTUN, &mpa_rev, 0,
+SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RW, &mpa_rev, 0,
     "MPA Revision, 0 supports amso1100, 1 is spec compliant. (default=1)");
 
 static int markers_enabled = 0;
 TUNABLE_INT("hw.iw_cxgb.markers_enabled", &markers_enabled);
-SYSCTL_INT(_hw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RDTUN, &markers_enabled, 0,
+SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RW, &markers_enabled, 0,
     "Enable MPA MARKERS (default(0)=disabled)");
 
 static int crc_enabled = 1;
 TUNABLE_INT("hw.iw_cxgb.crc_enabled", &crc_enabled);
-SYSCTL_INT(_hw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RDTUN, &crc_enabled, 0,
+SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RW, &crc_enabled, 0,
     "Enable MPA CRC (default(1)=enabled)");
 
 static int rcv_win = 256 * 1024;
 TUNABLE_INT("hw.iw_cxgb.rcv_win", &rcv_win);
-SYSCTL_INT(_hw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RDTUN, &rcv_win, 0,
+SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RW, &rcv_win, 0,
     "TCP receive window in bytes (default=256KB)");
 
 static int snd_win = 32 * 1024;
 TUNABLE_INT("hw.iw_cxgb.snd_win", &snd_win);
-SYSCTL_INT(_hw_cxgb, OID_AUTO, snd_win, CTLFLAG_RDTUN, &snd_win, 0,
+SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, snd_win, CTLFLAG_RW, &snd_win, 0,
     "TCP send window in bytes (default=32KB)");
 
 static unsigned int nocong = 0;
 TUNABLE_INT("hw.iw_cxgb.nocong", &nocong);
-SYSCTL_UINT(_hw_cxgb, OID_AUTO, nocong, CTLFLAG_RDTUN, &nocong, 0,
+SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, nocong, CTLFLAG_RW, &nocong, 0,
     "Turn off congestion control (default=0)");
 
 static unsigned int cong_flavor = 1;
 TUNABLE_INT("hw.iw_cxgb.cong_flavor", &cong_flavor);
-SYSCTL_UINT(_hw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RDTUN, &cong_flavor, 0,
+SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RW, &cong_flavor, 0,
     "TCP Congestion control flavor (default=1)");
 
 static void ep_timeout(void *arg);
@@ -174,42 +180,44 @@
 stop_ep_timer(struct iwch_ep *ep)
 {
 	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
+	if (!callout_pending(&ep->timer)) {
+		CTR3(KTR_IW_CXGB, "%s timer stopped when it's not running!  ep %p state %u",
+		    __func__, ep, ep->com.state);
+		return;
+	}
 	callout_drain(&ep->timer);
 	put_ep(&ep->com);
 }
 
-static int set_tcpinfo(struct iwch_ep *ep)
+static int
+set_tcpinfo(struct iwch_ep *ep)
 {
-	struct tcp_info ti;
-	struct sockopt sopt;
-	int err;
+	struct socket *so = ep->com.so;
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp;
+	struct toepcb *toep;
+	int rc = 0;
 
-	sopt.sopt_dir = SOPT_GET;
-	sopt.sopt_level = IPPROTO_TCP;
-	sopt.sopt_name = TCP_INFO;
-	sopt.sopt_val = (caddr_t)&ti;
-	sopt.sopt_valsize = sizeof ti;
-	sopt.sopt_td = NULL;
-	
-	err = sogetopt(ep->com.so, &sopt);
-	if (err) {
-		printf("%s can't get tcpinfo\n", __FUNCTION__);
-		return -err;
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+
+	if ((tp->t_flags & TF_TOE) == 0) {
+		rc = EINVAL;
+		printf("%s: connection NOT OFFLOADED!\n", __func__);
+		goto done;
 	}
-	if (!(ti.tcpi_options & TCPI_OPT_TOE)) {
-		printf("%s connection NOT OFFLOADED!\n", __FUNCTION__);
-		return -EINVAL;
-	}
+	toep = tp->t_toe;
 
-	ep->snd_seq = ti.tcpi_snd_nxt;
-	ep->rcv_seq = ti.tcpi_rcv_nxt;
-	ep->emss = ti.tcpi_snd_mss - sizeof(struct tcpiphdr);
-	ep->hwtid = TOEPCB(ep->com.so)->tp_tid; /* XXX */
-	if (ti.tcpi_options & TCPI_OPT_TIMESTAMPS)
-		ep->emss -= 12;
+	ep->hwtid = toep->tp_tid;
+	ep->snd_seq = tp->snd_nxt;
+	ep->rcv_seq = tp->rcv_nxt;
+	ep->emss = tp->t_maxseg;
 	if (ep->emss < 128)
 		ep->emss = 128;
-	return 0;
+done:
+	INP_WUNLOCK(inp);
+	return (rc);
 }
 
 static enum iwch_ep_state
@@ -264,56 +272,6 @@
 	free(epc, M_DEVBUF);
 }
 
-int
-iwch_quiesce_tid(struct iwch_ep *ep)
-{
-#ifdef notyet
-	struct cpl_set_tcb_field *req;
-	struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT);
-
-	if (m == NULL)
-		return (-ENOMEM);
-	req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
-	req->reply = 0;
-	req->cpu_idx = 0;
-	req->word = htons(W_TCB_RX_QUIESCE);
-	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
-	req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE);
-
-	m_set_priority(m, CPL_PRIORITY_DATA); 
-	cxgb_ofld_send(ep->com.tdev, m);
-#endif
-	return 0;
-}
-
-int
-iwch_resume_tid(struct iwch_ep *ep)
-{
-#ifdef notyet
-	struct cpl_set_tcb_field *req;
-	struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT);
-
-	if (m == NULL)
-		return (-ENOMEM);
-	req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
-	req->reply = 0;
-	req->cpu_idx = 0;
-	req->word = htons(W_TCB_RX_QUIESCE);
-	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
-	req->val = 0;
-
-	m_set_priority(m, CPL_PRIORITY_DATA);
-	cxgb_ofld_send(ep->com.tdev, m);
-#endif
-	return 0;
-}
-
 static struct rtentry *
 find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port,
     __be16 peer_port, u8 tos)
@@ -331,13 +289,16 @@
 }
 
 static void
-close_socket(struct iwch_ep_common *epc)
+close_socket(struct iwch_ep_common *epc, int close)
 {
 	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]);
 	SOCK_LOCK(epc->so);
 	soupcall_clear(epc->so, SO_RCV);
 	SOCK_UNLOCK(epc->so);
-	soshutdown(epc->so, SHUT_WR|SHUT_RD);
+	if (close)
+		soclose(epc->so);
+	else
+		soshutdown(epc->so, SHUT_WR|SHUT_RD);
 	epc->so = NULL;
 }
 
@@ -500,7 +461,7 @@
 	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
 	state_set(&ep->com, ABORTING);
 	abort_socket(ep);
-	close_socket(&ep->com);
+	close_socket(&ep->com, 0);
 	close_complete_upcall(ep);
 	state_set(&ep->com, DEAD);
 	put_ep(&ep->com);
@@ -582,12 +543,13 @@
 	event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
 	event.provider_data = ep;
 	event.so = ep->com.so;
-	if (state_read(&ep->parent_ep->com) != DEAD)
+	if (state_read(&ep->parent_ep->com) != DEAD) {
+		get_ep(&ep->com);
 		ep->parent_ep->com.cm_id->event_handler(
 						ep->parent_ep->com.cm_id,
 						&event);
+	}
 	put_ep(&ep->parent_ep->com);
-	ep->parent_ep = NULL;
 }
 
 static void
@@ -729,6 +691,7 @@
 	 */
 	CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__);
 	state_set(&ep->com, FPDU_MODE);
+	ep->mpa_attr.initiator = 1;
 	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
 	ep->mpa_attr.recv_marker_enabled = markers_enabled;
 	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
@@ -885,6 +848,7 @@
 	 * If we get here we have accumulated the entire mpa
 	 * start reply message including private data.
 	 */
+	ep->mpa_attr.initiator = 0;
 	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
 	ep->mpa_attr.recv_marker_enabled = markers_enabled;
 	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
@@ -934,7 +898,6 @@
 		 * rejects the CR.
 		 */
 		__state_set(&ep->com, CLOSING);
-		get_ep(&ep->com);
 		break;
 	case MPA_REP_SENT:
 		__state_set(&ep->com, CLOSING);
@@ -961,7 +924,7 @@
 			iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
 				       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
 		}
-		close_socket(&ep->com);
+		close_socket(&ep->com, 0);
 		close_complete_upcall(ep);
 		__state_set(&ep->com, DEAD);
 		release = 1;
@@ -986,11 +949,10 @@
 {
 	struct iwch_qp_attributes attrs;
 	int ret;
-	int state;
 
-	state = state_read(&ep->com);
-	CTR5(KTR_IW_CXGB, "%s ep %p so %p so->so_error %u state %s", __FUNCTION__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]);
-	switch (state) {
+	mtx_lock(&ep->com.lock);
+	CTR3(KTR_IW_CXGB, "%s ep %p state %u", __func__, ep, ep->com.state);
+	switch (ep->com.state) {
 	case MPA_REQ_WAIT:
 		stop_ep_timer(ep);
 		break;
@@ -1009,7 +971,6 @@
 		 * the reference on it until the ULP accepts or
 		 * rejects the CR.
 		 */
-		get_ep(&ep->com);
 		break;
 	case MORIBUND:
 	case CLOSING:
@@ -1031,6 +992,7 @@
 	case ABORTING:
 		break;
 	case DEAD:
+		mtx_unlock(&ep->com.lock);
 		CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__, 
 			ep->com.so->so_error);
 		return;
@@ -1039,11 +1001,12 @@
 		break;
 	}
 
-	if (state != ABORTING) {
-		close_socket(&ep->com);
-		state_set(&ep->com, DEAD);
+	if (ep->com.state != ABORTING) {
+		close_socket(&ep->com, 0);
+		__state_set(&ep->com, DEAD);
 		put_ep(&ep->com);
 	}
+	mtx_unlock(&ep->com.lock);
 	return;
 }
 
@@ -1071,7 +1034,10 @@
 					     IWCH_QP_ATTR_NEXT_STATE,
 					     &attrs, 1);
 		}
-		close_socket(&ep->com);
+		if (ep->parent_ep)
+			close_socket(&ep->com, 1);
+		else
+			close_socket(&ep->com, 0);
 		close_complete_upcall(ep);
 		__state_set(&ep->com, DEAD);
 		release = 1;
@@ -1102,77 +1068,59 @@
  * terminate() handles case (1)...
  */
 static int
-terminate(struct t3cdev *tdev, struct mbuf *m, void *ctx)
+terminate(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct toepcb *toep = (struct toepcb *)ctx;
-	struct socket *so = toeptoso(toep);
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	uint32_t hash = *((uint32_t *)r + 1);
+	unsigned int tid = (ntohl(hash) >> 8) & 0xfffff;
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
+	struct socket *so = toep->tp_inp->inp_socket;
 	struct iwch_ep *ep = so->so_rcv.sb_upcallarg;
 
-	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
+	if (state_read(&ep->com) != FPDU_MODE)
+		goto done;
+
 	m_adj(m, sizeof(struct cpl_rdma_terminate));
-	CTR2(KTR_IW_CXGB, "%s saving %d bytes of term msg", __FUNCTION__, m->m_len);
+
+	CTR4(KTR_IW_CXGB, "%s: tid %u, ep %p, saved %d bytes",
+	    __func__, tid, ep, m->m_len);
+
 	m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer);
 	ep->com.qp->attr.terminate_msg_len = m->m_len;
 	ep->com.qp->attr.is_terminate_local = 0;
-	return CPL_RET_BUF_DONE;
+
+done:
+	m_freem(m);
+	return (0);
 }
 
 static int
-ec_status(struct t3cdev *tdev, struct mbuf *m, void *ctx)
+ec_status(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct toepcb *toep = (struct toepcb *)ctx;
-	struct socket *so = toeptoso(toep);
-	struct cpl_rdma_ec_status *rep = cplhdr(m);
-	struct iwch_ep *ep;
-	struct iwch_qp_attributes attrs;
-	int release = 0;
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_rdma_ec_status *rep = mtod(m, void *);
+	unsigned int tid = GET_TID(rep);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
+	struct socket *so = toep->tp_inp->inp_socket;
+	struct iwch_ep *ep = so->so_rcv.sb_upcallarg;
 
-	ep = so->so_rcv.sb_upcallarg;
-	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s ec_status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], rep->status);
-	if (!so || !ep) {
-		panic("bogosity ep %p state %d, so %p state %x\n", ep, ep ? ep->com.state : -1, so, so ? so->so_state : -1); 
-	}
-	mtx_lock(&ep->com.lock);
-	switch (ep->com.state) {
-	case CLOSING:
-		if (!rep->status)
-			__state_set(&ep->com, MORIBUND);
-		else
-			__state_set(&ep->com, ABORTING);
-		break;
-	case MORIBUND:
+	if (rep->status) {
+		struct iwch_qp_attributes attrs;
+
+		CTR1(KTR_IW_CXGB, "%s BAD CLOSE - Aborting", __FUNCTION__);
 		stop_ep_timer(ep);
-		if (!rep->status) {
-			if ((ep->com.cm_id) && (ep->com.qp)) {
-				attrs.next_state = IWCH_QP_STATE_IDLE;
-				iwch_modify_qp(ep->com.qp->rhp,
-					     ep->com.qp,
-					     IWCH_QP_ATTR_NEXT_STATE,
-					     &attrs, 1);
-			}
-			close_socket(&ep->com);
-			close_complete_upcall(ep);
-			__state_set(&ep->com, DEAD);
-			release = 1;
-		}
-		break;
-	case DEAD:
-		break;
-	default:
-		panic("unknown state: %d\n", ep->com.state);
-	}
-	mtx_unlock(&ep->com.lock);
-	if (rep->status) {
-		log(LOG_ERR, "%s BAD CLOSE - Aborting tid %u\n",
-		       __FUNCTION__, ep->hwtid);
 		attrs.next_state = IWCH_QP_STATE_ERROR;
 		iwch_modify_qp(ep->com.qp->rhp,
-			       ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
-			       &attrs, 1);
+			     ep->com.qp,
+			     IWCH_QP_ATTR_NEXT_STATE,
+			     &attrs, 1);
+		abort_connection(ep);
 	}
-	if (release)
-		put_ep(&ep->com);
-	return CPL_RET_BUF_DONE;
+
+	m_freem(m);
+	return (0);
 }
 
 static void
@@ -1181,24 +1129,29 @@
 	struct iwch_ep *ep = (struct iwch_ep *)arg;
 	struct iwch_qp_attributes attrs;
 	int err = 0;
+	int abort = 1;
 
 	mtx_lock(&ep->com.lock);
 	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
 	switch (ep->com.state) {
 	case MPA_REQ_SENT:
+		__state_set(&ep->com, ABORTING);
 		connect_reply_upcall(ep, -ETIMEDOUT);
 		break;
 	case MPA_REQ_WAIT:
+		__state_set(&ep->com, ABORTING);
 		break;
 	case CLOSING:
 	case MORIBUND:
 		if (ep->com.cm_id && ep->com.qp)
 			err = 1;
+		__state_set(&ep->com, ABORTING);
 		break;
 	default:
-		panic("unknown state: %d\n", ep->com.state);
+		CTR3(KTR_IW_CXGB, "%s unexpected state ep %p state %u",
+		    __func__, ep, ep->com.state);
+		abort = 0;
 	}
-	__state_set(&ep->com, ABORTING);
 	mtx_unlock(&ep->com.lock);
 	if (err){
 		attrs.next_state = IWCH_QP_STATE_ERROR;
@@ -1206,7 +1159,8 @@
 			     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
 			     &attrs, 1);
 	}
-	abort_connection(ep);
+	if (abort)
+		abort_connection(ep);
 	put_ep(&ep->com);
 }
 
@@ -1228,6 +1182,7 @@
 		err = send_mpa_reject(ep, pdata, pdata_len);
 		err = soshutdown(ep->com.so, 3);
 	}
+	put_ep(&ep->com);
 	return 0;
 }
 
@@ -1242,8 +1197,10 @@
 	struct iwch_qp *qp = get_qhp(h, conn_param->qpn);
 
 	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
-	if (state_read(&ep->com) == DEAD)
-		return (-ECONNRESET);
+	if (state_read(&ep->com) == DEAD) {
+		err = -ECONNRESET;
+		goto err;
+	}
 
 	PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD);
 	PANIC_IF(!qp);
@@ -1251,7 +1208,8 @@
 	if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) ||
 	    (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) {
 		abort_connection(ep);
-		return (-EINVAL);
+		err = -EINVAL;
+		goto err;
 	}
 
 	cm_id->add_ref(cm_id);
@@ -1263,11 +1221,10 @@
 	ep->ird = conn_param->ird;
 	ep->ord = conn_param->ord;
 	CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord);
-	get_ep(&ep->com);
 
 	/* bind QP to EP and move to RTS */
 	attrs.mpa_attr = ep->mpa_attr;
-	attrs.max_ird = ep->ord;
+	attrs.max_ird = ep->ird;
 	attrs.max_ord = ep->ord;
 	attrs.llp_stream_handle = ep;
 	attrs.next_state = IWCH_QP_STATE_RTS;
@@ -1283,20 +1240,21 @@
 			     ep->com.qp, mask, &attrs, 1);
 
 	if (err) 
-		goto err;
+		goto err1;
 
 	err = send_mpa_reply(ep, conn_param->private_data,
  			     conn_param->private_data_len);
 	if (err)
-		goto err;
+		goto err1;
 	state_set(&ep->com, FPDU_MODE);
 	established_upcall(ep);
 	put_ep(&ep->com);
 	return 0;
-err:
+err1:
 	ep->com.cm_id = NULL;
 	ep->com.qp = NULL;
 	cm_id->rem_ref(cm_id);
+err:
 	put_ep(&ep->com);
 	return err;
 }
@@ -1312,15 +1270,6 @@
 	epc->so->so_state |= SS_NBIO;
 	SOCK_UNLOCK(epc->so);
 	sopt.sopt_dir = SOPT_SET;
-	sopt.sopt_level = SOL_SOCKET;
-	sopt.sopt_name = SO_NO_DDP;
-	sopt.sopt_val = (caddr_t)&on;
-	sopt.sopt_valsize = sizeof on;
-	sopt.sopt_td = NULL;
-	err = sosetopt(epc->so, &sopt);
-	if (err) 
-		printf("%s can't set SO_NO_DDP err %d\n", __FUNCTION__, err);
-	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = IPPROTO_TCP;
 	sopt.sopt_name = TCP_NODELAY;
 	sopt.sopt_val = (caddr_t)&on;
@@ -1400,16 +1349,14 @@
 
 	if (!(rt->rt_ifp->if_flags & IFCAP_TOE)) {
 		printf("%s - interface not TOE capable.\n", __FUNCTION__);
-		goto fail3;
+		RTFREE(rt);
+		goto fail2;
 	}
 	tdev = TOEDEV(rt->rt_ifp);
 	if (tdev == NULL) {
 		printf("%s - No toedev for interface.\n", __FUNCTION__);
-		goto fail3;
-	}
-	if (!tdev->tod_can_offload(tdev, ep->com.so)) {
-		printf("%s - interface cannot offload!.\n", __FUNCTION__);
-		goto fail3;
+		RTFREE(rt);
+		goto fail2;
 	}
 	RTFREE(rt);
 
@@ -1420,8 +1367,6 @@
 		ep->com.thread);
 	if (!err)
 		goto out;
-fail3:
-	RTFREE(ep->dst);
 fail2:
 	put_ep(&ep->com);
 out:
@@ -1458,7 +1403,7 @@
 		cm_id->provider_data = ep;
 		goto out;
 	}
-	close_socket(&ep->com);
+	close_socket(&ep->com, 0);
 fail:
 	cm_id->rem_ref(cm_id);
 	put_ep(&ep->com);
@@ -1474,7 +1419,7 @@
 	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
 
 	state_set(&ep->com, DEAD);
-	close_socket(&ep->com);
+	close_socket(&ep->com, 0);
 	cm_id->rem_ref(cm_id);
 	put_ep(&ep->com);
 	return 0;
@@ -1493,47 +1438,48 @@
 	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__, ep,
 	     ep->com.so, states[ep->com.state], abrupt);
 
-	if (ep->com.state == DEAD) {
-		CTR2(KTR_IW_CXGB, "%s already dead ep %p", __FUNCTION__, ep);
-		goto out;
-	}
-
-	if (abrupt) {
-		if (ep->com.state != ABORTING) {
-			ep->com.state = ABORTING;
-			close = 1;
-		}
-		goto out;
-	}
-
 	switch (ep->com.state) {
 	case MPA_REQ_WAIT:
 	case MPA_REQ_SENT:
 	case MPA_REQ_RCVD:
 	case MPA_REP_SENT:
 	case FPDU_MODE:
-		start_ep_timer(ep);
-		ep->com.state = CLOSING;
 		close = 1;
+		if (abrupt)
+			ep->com.state = ABORTING;
+		else {
+			ep->com.state = CLOSING;
+			start_ep_timer(ep);
+		}
 		break;
 	case CLOSING:
-		ep->com.state = MORIBUND;
 		close = 1;
+		if (abrupt) {
+			stop_ep_timer(ep);
+			ep->com.state = ABORTING;
+		} else
+			ep->com.state = MORIBUND;
 		break;
 	case MORIBUND:
 	case ABORTING:
+	case DEAD:
+		CTR3(KTR_IW_CXGB, "%s ignoring disconnect ep %p state %u",
+		    __func__, ep, ep->com.state);
 		break;
 	default:
 		panic("unknown state: %d\n", ep->com.state);
 		break;
 	}
-out:
+
 	mtx_unlock(&ep->com.lock);
 	if (close) {
 		if (abrupt)
 			abort_connection(ep);
-		else
+		else {
+			if (!ep->parent_ep)
+				__state_set(&ep->com, MORIBUND);
 			shutdown_socket(&ep->com);
+		}
 	}
 	return 0;
 }
@@ -1587,7 +1533,7 @@
 		send_mpa_req(ep);
 	} else {
 		connect_reply_upcall(ep, -ep->com.so->so_error);
-		close_socket(&ep->com);
+		close_socket(&ep->com, 0);
 		state_set(&ep->com, DEAD);
 		put_ep(&ep->com);
 	}
@@ -1643,10 +1589,20 @@
 	}
 	CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__, 
 		inet_ntoa(remote->sin_addr), ntohs(remote->sin_port));
+	child_ep->com.tdev = parent_ep->com.tdev;
+	child_ep->com.local_addr = parent_ep->com.local_addr;
+	child_ep->com.remote_addr = *remote;
 	child_ep->com.so = child_so;
 	child_ep->com.cm_id = NULL;
 	child_ep->com.thread = parent_ep->com.thread;
 	child_ep->parent_ep = parent_ep;
+
 	free(remote, M_SONAME);
 	get_ep(&parent_ep->com);
 	child_ep->parent_ep = parent_ep;
@@ -1747,17 +1703,30 @@
         }
         taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq");
         TASK_INIT(&iw_cxgb_task, 0, process_req, NULL);
-	t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, terminate);
-	t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, ec_status);
-	return 0;
+	return (0);
 }
 
 void
 iwch_cm_term(void)
 {
-	t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, NULL);
-	t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, NULL);
+
 	taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task);
 	taskqueue_free(iw_cxgb_taskq);
 }
 
+void
+iwch_cm_init_cpl(struct adapter *sc)
+{
+
+	t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, terminate);
+	t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, ec_status);
+}
+
+void
+iwch_cm_term_cpl(struct adapter *sc)
+{
+
+	t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, NULL);
+	t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, NULL);
+}
+#endif
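
With the t3cdev dispatch gone, CPL handlers are registered per adapter
via t3_register_cpl_handler() and use the (sge_qset, rsp_desc, mbuf)
signature seen in terminate() and ec_status() above; a handler owns the
mbuf and must free it.  A bare-bones hypothetical handler, registered
from an init hook the same way iwch_cm_init_cpl() registers its two:

static int
example_cpl_handler(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	/* decode the CPL at mtod(m, ...); adapter state is at qs->adap */
	m_freem(m);		/* handlers consume the mbuf */
	return (0);
}
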
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h	Mon Jun 11 00:15:24 2012 -0700
@@ -31,8 +31,8 @@
 
 #ifndef _IWCH_CM_H_
 #define _IWCH_CM_H_
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
 #include <sys/refcount.h>
 #include <sys/condvar.h>
 #include <sys/proc.h>
@@ -42,21 +42,21 @@
 #define MPA_KEY_REP "MPA ID Rep Frame"
 
 #define MPA_MAX_PRIVATE_DATA	256
-#define MPA_REV		o0	/* XXX - amso1100 uses rev 0 ! */
+#define MPA_REV			0	/* XXX - amso1100 uses rev 0 ! */
 #define MPA_REJECT		0x20
 #define MPA_CRC			0x40
 #define MPA_MARKERS		0x80
 #define MPA_FLAGS_MASK		0xE0
 
 #define put_ep(ep) { \
-	CTR4(KTR_IW_CXGB, "put_ep (via %s:%u) ep %p refcnt %d\n", __FUNCTION__, __LINE__,  \
+	CTR4(KTR_IW_CXGB, "put_ep (via %s:%u) ep %p refcnt %d", __FUNCTION__, __LINE__,  \
 	     ep, atomic_load_acq_int(&((ep)->refcount))); \
 	if (refcount_release(&((ep)->refcount)))  \
 		__free_ep(ep); \
 }
 
 #define get_ep(ep) { \
-	CTR4(KTR_IW_CXGB, "get_ep (via %s:%u) ep %p, refcnt %d\n", __FUNCTION__, __LINE__, \
+	CTR4(KTR_IW_CXGB, "get_ep (via %s:%u) ep %p, refcnt %d", __FUNCTION__, __LINE__, \
 	     ep, atomic_load_acq_int(&((ep)->refcount))); \
 	refcount_acquire(&((ep)->refcount));	  \
 }
@@ -148,7 +148,7 @@
 	TAILQ_ENTRY(iwch_ep_common) entry;
 	struct iw_cm_id *cm_id;
 	struct iwch_qp *qp;
-	struct t3cdev *tdev;
+	struct toedev *tdev;
 	enum iwch_ep_state state;
 	u_int refcount;
 	struct cv waitq;
@@ -176,7 +176,6 @@
 	u32 snd_seq;
 	u32 rcv_seq;
 	struct l2t_entry *l2t;
-	struct rtentry *dst;
 	struct mbuf *mpa_mbuf;
 	struct iwch_mpa_attributes mpa_attr;
 	unsigned int mpa_pkt_len;
@@ -237,13 +236,13 @@
 int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
 int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
 int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags);
-int iwch_quiesce_tid(struct iwch_ep *ep);
-int iwch_resume_tid(struct iwch_ep *ep);
 void __free_ep(struct iwch_ep_common *ep);
 void iwch_rearp(struct iwch_ep *ep);
 int iwch_ep_redirect(void *ctx, struct rtentry *old, struct rtentry *new, struct l2t_entry *l2t);
 
 int iwch_cm_init(void);
 void iwch_cm_term(void);
+void iwch_cm_init_cpl(struct adapter *);
+void iwch_cm_term_cpl(struct adapter *);
 
 #endif				/* _IWCH_CM_H_ */
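
One consequence of the reworked reference counting in iw_cxgb_cm.c is
easier-to-follow get_ep()/put_ep() pairing: take a reference before any
upcall or deferred use of an endpoint, release it when done, and
__free_ep() runs on the final release.  A hedged illustration of the
discipline (deliver_cm_event() is hypothetical):

static void
deliver_cm_event(struct iwch_ep *ep, struct iw_cm_event *event)
{
	get_ep(&ep->com);	/* pin ep across the upcall */
	ep->com.cm_id->event_handler(ep->com.cm_id, event);
	put_ep(&ep->com);	/* may invoke __free_ep() */
}
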
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c	Mon Jun 11 00:15:24 2012 -0700
@@ -30,11 +30,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -59,9 +61,11 @@
 #include <sys/libkern.h>
 
 #include <netinet/in.h>
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/ib_umem.h>
-#include <contrib/rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
 #include <cxgb_include.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
@@ -261,4 +265,4 @@
 		return npolled;
 	}
 }
-
+#endif
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c	Mon Jun 11 00:15:24 2012 -0700
@@ -30,11 +30,12 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -60,11 +61,13 @@
 
 #include <netinet/in.h>
 
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/ib_umem.h>
-#include <contrib/rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
-#ifdef DEBUG
+#if defined(INVARIANTS) && defined(TCP_OFFLOAD)
 #include <cxgb_include.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
 #include <ulp/iw_cxgb/iw_cxgb_hal.h>
@@ -74,75 +77,100 @@
 #include <ulp/iw_cxgb/iw_cxgb_resource.h>
 #include <ulp/iw_cxgb/iw_cxgb_user.h>
 
+static int
+cxio_rdma_get_mem(struct cxio_rdev *rdev, struct ch_mem_range *m)
+{
+	struct adapter *sc = rdev->adap;
+	struct mc7 *mem;
+
+	if ((m->addr & 7) || (m->len & 7))
+		return (EINVAL);
+	if (m->mem_id == MEM_CM)
+		mem = &sc->cm;
+	else if (m->mem_id == MEM_PMRX)
+		mem = &sc->pmrx;
+	else if (m->mem_id == MEM_PMTX)
+		mem = &sc->pmtx;
+	else
+		return (EINVAL);
+
+	return (t3_mc7_bd_read(mem, m->addr/8, m->len/8, (u64 *)m->buf));
+}
+
 void cxio_dump_tpt(struct cxio_rdev *rdev, uint32_t stag)
 {
-	struct ch_mem_range *m;
+	struct ch_mem_range m;
 	u64 *data;
+	u32 addr;
 	int rc;
 	int size = 32;
 
-	m = kmalloc(sizeof(*m) + size, M_NOWAIT);
-	if (!m) {
+	m.buf = malloc(size, M_DEVBUF, M_NOWAIT);
+	if (m.buf == NULL) {
 		CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__);
 		return;
 	}
-	m->mem_id = MEM_PMRX;
-	m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base;
-	m->len = size;
-	CTR3(KTR_IW_CXGB, "%s TPT addr 0x%x len %d", __FUNCTION__, m->addr, m->len);
-	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+	m.mem_id = MEM_PMRX;
+	m.addr = (stag >> 8) * 32 + rdev->rnic_info.tpt_base;
+	m.len = size;
+	CTR3(KTR_IW_CXGB, "%s TPT addr 0x%x len %d", __FUNCTION__, m.addr, m.len);
+
+	rc = cxio_rdma_get_mem(rdev, &m);
 	if (rc) {
 		CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc);
-		free(m, M_DEVBUF);
+		free(m.buf, M_DEVBUF);
 		return;
 	}
 
-	data = (u64 *)m->buf;
+	data = (u64 *)m.buf;
+	addr = m.addr;
 	while (size > 0) {
-		CTR2(KTR_IW_CXGB, "TPT %08x: %016llx", m->addr, (unsigned long long) *data);
+		CTR2(KTR_IW_CXGB, "TPT %08x: %016llx", addr, (unsigned long long) *data);
 		size -= 8;
 		data++;
-		m->addr += 8;
+		addr += 8;
 	}
-	free(m, M_DEVBUF);
+	free(m.buf, M_DEVBUF);
 }
 
 void cxio_dump_pbl(struct cxio_rdev *rdev, uint32_t pbl_addr, uint32_t len, u8 shift)
 {
-	struct ch_mem_range *m;
+	struct ch_mem_range m;
 	u64 *data;
+	u32 addr;
 	int rc;
 	int size, npages;
 
 	shift += 12;
 	npages = (len + (1ULL << shift) - 1) >> shift;
 	size = npages * sizeof(u64);
-
-	m = kmalloc(sizeof(*m) + size, M_NOWAIT);
-	if (!m) {
+	m.buf = malloc(size, M_DEVBUF, M_NOWAIT);
+	if (m.buf == NULL) {
 		CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__);
 		return;
 	}
-	m->mem_id = MEM_PMRX;
-	m->addr = pbl_addr;
-	m->len = size;
+	m.mem_id = MEM_PMRX;
+	m.addr = pbl_addr;
+	m.len = size;
 	CTR4(KTR_IW_CXGB, "%s PBL addr 0x%x len %d depth %d",
-		__FUNCTION__, m->addr, m->len, npages);
-	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+		__FUNCTION__, m.addr, m.len, npages);
+
+	rc = cxio_rdma_get_mem(rdev, &m);
 	if (rc) {
 		CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc);
-		free(m, M_DEVBUF);
+		free(m.buf, M_DEVBUF);
 		return;
 	}
 
-	data = (u64 *)m->buf;
+	data = (u64 *)m.buf;
+	addr = m.addr;
 	while (size > 0) {
-		CTR2(KTR_IW_CXGB, "PBL %08x: %016llx", m->addr, (unsigned long long) *data);
+		CTR2(KTR_IW_CXGB, "PBL %08x: %016llx", addr, (unsigned long long) *data);
 		size -= 8;
 		data++;
-		m->addr += 8;
+		addr += 8;
 	}
-	free(m, M_DEVBUF);
+	free(m.buf, M_DEVBUF);
 }
 
 void cxio_dump_wqe(union t3_wr *wqe)
@@ -175,70 +203,76 @@
 
 void cxio_dump_rqt(struct cxio_rdev *rdev, uint32_t hwtid, int nents)
 {
-	struct ch_mem_range *m;
+	struct ch_mem_range m;
 	int size = nents * 64;
 	u64 *data;
+	u32 addr;
 	int rc;
 
-	m = kmalloc(sizeof(*m) + size, M_NOWAIT);
-	if (!m) {
+	m.buf = malloc(size, M_DEVBUF, M_NOWAIT);
+	if (m.buf == NULL) {
 		CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__);
 		return;
 	}
-	m->mem_id = MEM_PMRX;
-	m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base;
-	m->len = size;
-	CTR3(KTR_IW_CXGB, "%s RQT addr 0x%x len %d", __FUNCTION__, m->addr, m->len);
-	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+	m.mem_id = MEM_PMRX;
+	m.addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base;
+	m.len = size;
+	CTR3(KTR_IW_CXGB, "%s RQT addr 0x%x len %d", __FUNCTION__, m.addr, m.len);
+
+	rc = cxio_rdma_get_mem(rdev, &m);
 	if (rc) {
 		CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc);
-		free(m, M_DEVBUF);
+		free(m.buf, M_DEVBUF);
 		return;
 	}
 
-	data = (u64 *)m->buf;
+	data = (u64 *)m.buf;
+	addr = m.addr;
 	while (size > 0) {
-		CTR2(KTR_IW_CXGB, "RQT %08x: %016llx", m->addr, (unsigned long long) *data);
+		CTR2(KTR_IW_CXGB, "RQT %08x: %016llx", addr, (unsigned long long) *data);
 		size -= 8;
 		data++;
-		m->addr += 8;
+		addr += 8;
 	}
-	free(m, M_DEVBUF);
+	free(m.buf, M_DEVBUF);
 }
 
 void cxio_dump_tcb(struct cxio_rdev *rdev, uint32_t hwtid)
 {
-	struct ch_mem_range *m;
+	struct ch_mem_range m;
 	int size = TCB_SIZE;
 	uint32_t *data;
+	uint32_t addr;
 	int rc;
 
-	m = kmalloc(sizeof(*m) + size, M_NOWAIT);
-	if (!m) {
+	m.buf = malloc(size, M_DEVBUF, M_NOWAIT);
+	if (m.buf == NULL) {
 		CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__);
 		return;
 	}
-	m->mem_id = MEM_CM;
-	m->addr = hwtid * size;
-	m->len = size;
-	CTR3(KTR_IW_CXGB, "%s TCB %d len %d", __FUNCTION__, m->addr, m->len);
-	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+	m.mem_id = MEM_CM;
+	m.addr = hwtid * size;
+	m.len = size;
+	CTR3(KTR_IW_CXGB, "%s TCB %d len %d", __FUNCTION__, m.addr, m.len);
+
+	rc = cxio_rdma_get_mem(rdev, &m);
 	if (rc) {
 		CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc);
-		free(m, M_DEVBUF);
+		free(m.buf, M_DEVBUF);
 		return;
 	}
 
-	data = (uint32_t *)m->buf;
+	data = (uint32_t *)m.buf;
+	addr = m.addr;
 	while (size > 0) {
 		printf("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n",
-			m->addr,
+			addr,
 			*(data+2), *(data+3), *(data),*(data+1),
 			*(data+6), *(data+7), *(data+4), *(data+5));
 		size -= 32;
 		data += 8;
-		m->addr += 32;
+		addr += 32;
 	}
-	free(m, M_DEVBUF);
+	free(m.buf, M_DEVBUF);
 }
 #endif
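
Note: the dumpers above no longer round-trip through rdev->t3cdev_p->ctl(..., RDMA_GET_MEM, ...). The new file-local helper cxio_rdma_get_mem() reads the adapter's MC7 memories (CM, PMRX, PMTX) directly via t3_mc7_bd_read(), and struct ch_mem_range now lives on the stack with only the data buffer heap-allocated. The block is also re-gated from DEBUG to INVARIANTS && TCP_OFFLOAD, matching kernel options that actually exist. The resulting read path, condensed from the hunks (illustrative only; the helper is static to this file):

    struct ch_mem_range m;

    m.mem_id = MEM_PMRX;            /* or MEM_CM / MEM_PMTX */
    m.addr = addr;                  /* must be 8-byte aligned */
    m.len = size;                   /* must be a multiple of 8 */
    m.buf = malloc(size, M_DEVBUF, M_NOWAIT);
    if (m.buf != NULL) {
            rc = cxio_rdma_get_mem(rdev, &m);   /* 0 on success */
            free(m.buf, M_DEVBUF);
    }
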
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c	Mon Jun 11 00:15:24 2012 -0700
@@ -29,11 +29,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -59,9 +61,11 @@
 
 #include <netinet/in.h>
 
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/ib_umem.h>
-#include <contrib/rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
 #include <cxgb_include.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
@@ -81,11 +85,22 @@
 	struct ib_event event;
 	struct iwch_qp_attributes attrs;
 
+	mtx_lock(&rnicp->lock);
+
+	if (!qhp) {
+		CTR3(KTR_IW_CXGB, "%s unaffiliated error 0x%x qpid 0x%x",
+		    __func__, CQE_STATUS(rsp_msg->cqe),
+		    CQE_QPID(rsp_msg->cqe));
+		mtx_unlock(&rnicp->lock);
+		return;
+	}
+
 	if ((qhp->attr.state == IWCH_QP_STATE_ERROR) ||
 	    (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) {
 		CTR4(KTR_IW_CXGB, "%s AE received after RTS - "
 		     "qp state %d qpid 0x%x status 0x%x", __FUNCTION__,
 		     qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe));
+		mtx_unlock(&rnicp->lock);
 		return;
 	}
 
@@ -95,6 +110,15 @@
 	       CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
 	       CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
 
+	mtx_unlock(&rnicp->lock);
+
+	if (qhp->attr.state == IWCH_QP_STATE_RTS) {
+		attrs.next_state = IWCH_QP_STATE_TERMINATE;
+		iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE,
+		    &attrs, 1);
+		if (send_term)
+			iwch_post_terminate(qhp, rsp_msg);
+	}
 
 	event.event = ib_event;
 	event.device = chp->ibcq.device;
@@ -106,25 +130,17 @@
 	if (qhp->ibqp.event_handler)
 		(*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
 
-	if (qhp->attr.state == IWCH_QP_STATE_RTS) {
-		attrs.next_state = IWCH_QP_STATE_TERMINATE;
-		iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE,
-			       &attrs, 1);
-		if (send_term)
-			iwch_post_terminate(qhp, rsp_msg);
-	}
+	(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
 }
 
 void
-iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m)
+iwch_ev_dispatch(struct iwch_dev *rnicp, struct mbuf *m)
 {
-	struct iwch_dev *rnicp;
 	struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data;
 	struct iwch_cq *chp;
 	struct iwch_qp *qhp;
 	u32 cqid = RSPQ_CQID(rsp_msg);
 
-	rnicp = (struct iwch_dev *) rdev_p->ulp;
 	mtx_lock(&rnicp->lock);
 	chp = get_chp(rnicp, cqid);
 	qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
@@ -136,7 +152,7 @@
 		       CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe),
 		       CQE_WRID_LOW(rsp_msg->cqe));
 		mtx_unlock(&rnicp->lock);
-		goto out;
+		return;
 	}
 	iwch_qp_add_ref(&qhp->ibqp);
 	mtx_lock(&chp->lock);
@@ -200,12 +216,6 @@
 	case TPT_ERR_BOUND:
 	case TPT_ERR_INVALIDATE_SHARED_MR:
 	case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
-		log(LOG_ERR, "%s - CQE Err qpid 0x%x opcode %d status 0x%x "
-		       "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__,
-		       CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe),
-		       CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
-		       CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
-		(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
 		post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1);
 		break;
 
@@ -248,6 +258,5 @@
 	        wakeup(chp);
 	mtx_unlock(&chp->lock);
 	iwch_qp_rem_ref(&qhp->ibqp);
-out:
-	m_free(m);
 }
+#endif
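
Note: two behavioral fixes ride along with the mechanical conversion in iw_cxgb_ev.c. post_qp_event() now catches an unaffiliated CQE (qhp == NULL) while still holding rnicp->lock, and the RTS -> TERMINATE transition runs before the affiliated event and completion are delivered rather than after. Separately, iwch_ev_dispatch() takes the iwch_dev directly and no longer frees the mbuf; its only caller, cxio_hal_ev_handler() in iw_cxgb_hal.c below, frees it unconditionally. The ownership rule, condensed:

    /* in the CPL handler: the caller owns the mbuf across the dispatch */
    iwch_ev_dispatch(rnicp, m);     /* borrows m, must not free it */
    m_freem(m);                     /* freed exactly once, by the caller */
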
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,4 +1,3 @@
-
 /**************************************************************************
 
 Copyright (c) 2007, Chelsio Inc.
@@ -30,11 +29,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -47,6 +48,8 @@
 #include <sys/linker.h>
 #include <sys/firmware.h>
 #include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockopt.h>
 #include <sys/sockio.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
@@ -59,12 +62,25 @@
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
+#include <net/route.h>
+#include <netinet/in_systm.h>
 #include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp_var.h>
+#include <netinet/toecore.h>
+#include <netinet/tcp.h>
+#include <netinet/tcpip.h>
 
-#include <contrib/rdma/ib_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
 #include <cxgb_include.h>
 #include <ulp/tom/cxgb_l2t.h>
+#include <ulp/tom/cxgb_tom.h>
+#include <ulp/tom/cxgb_toepcb.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
 #include <ulp/iw_cxgb/iw_cxgb_hal.h>
 #include <ulp/iw_cxgb/iw_cxgb_provider.h>
@@ -72,29 +88,21 @@
 #include <ulp/iw_cxgb/iw_cxgb.h>
 #include <ulp/iw_cxgb/iw_cxgb_resource.h>
 
-static TAILQ_HEAD( ,cxio_rdev) rdev_list;
-static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL;
+/* Response queue used for RDMA events. */
+#define ASYNC_NOTIF_RSPQ 0
+static inline int
+cxio_rdma_cq_setup(struct cxio_rdev *rdev_p, unsigned id, uint64_t base_addr,
+    unsigned size, unsigned ovfl_mode, unsigned credits, unsigned credit_thres)
+{
+	struct adapter *sc = rdev_p->adap;
+	int rc;
 
-static struct cxio_rdev *
-cxio_hal_find_rdev_by_name(char *dev_name)
-{
-	struct cxio_rdev *rdev;
+	mtx_lock_spin(&sc->sge.reg_lock);
+	rc = -t3_sge_init_cqcntxt(sc, id, base_addr, size, ASYNC_NOTIF_RSPQ,
+	    ovfl_mode, credits, credit_thres);
+	mtx_unlock_spin(&sc->sge.reg_lock);
 
-	TAILQ_FOREACH(rdev, &rdev_list, entry)
-		if (!strcmp(rdev->dev_name, dev_name))
-			return rdev;
-	return NULL;
-}
-
-struct cxio_rdev *
-cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev)
-{
-	struct cxio_rdev *rdev;
-
-	TAILQ_FOREACH(rdev, &rdev_list, entry)
-		if (rdev->t3cdev_p == tdev)
-			return rdev;
-	return NULL;
+	return (rc);
 }
 
 int
@@ -104,12 +112,14 @@
 	int ret;
 	struct t3_cqe *cqe;
 	u32 rptr;
+	struct adapter *sc = rdev_p->adap;
 
-	struct rdma_cq_op setup;
-	setup.id = cq->cqid;
-	setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0;
-	setup.op = op;
-	ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup);
+	if (op != CQ_CREDIT_UPDATE)
+		credit = 0;
+
+	mtx_lock_spin(&sc->sge.reg_lock);
+	ret = t3_sge_cqcntxt_op(sc, cq->cqid, op, credit);
+	mtx_unlock_spin(&sc->sge.reg_lock);
 
 	if ((ret < 0) || (op == CQ_CREDIT_UPDATE))
 		return (ret);
@@ -140,30 +150,26 @@
 		while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
 			DELAY(1);
 			if (i++ > 1000000) {
+				struct adapter *sc = rdev_p->adap;
+
+				log(LOG_ERR, "%s: stalled rnic\n",
+				    device_get_nameunit(sc->dev));
 				PANIC_IF(1);
-				log(LOG_ERR, "%s: stalled rnic\n",
-				       rdev_p->dev_name);
 				return (-EIO);
 			}
 		}
 
-		return 1;
+		return (1);
 	}
 
-	return 0;
+	return (0);
 }
 
 static int
 cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid)
 {
-	struct rdma_cq_setup setup;
-	setup.id = cqid;
-	setup.base_addr = 0;	/* NULL address */
-	setup.size = 0;		/* disaable the CQ */
-	setup.credits = 0;
-	setup.credit_thres = 0;
-	setup.ovfl_mode = 0;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+
+	return (cxio_rdma_cq_setup(rdev_p, cqid, 0, 0, 0, 0, 0));
 }
 
 static int
@@ -171,43 +177,38 @@
 {
 	u64 sge_cmd;
 	struct t3_modify_qp_wr *wqe;
-	struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT);
+	struct mbuf *m;
+
+	m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, wqe);
 	if (m == NULL) {
 		CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__);
 		return (-ENOMEM);
 	}
 	wqe = mtod(m, struct t3_modify_qp_wr *);
-	m->m_len = m->m_pkthdr.len = sizeof(*wqe);
 	memset(wqe, 0, sizeof(*wqe));
 	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, qpid, 7);
 	wqe->flags = htobe32(MODQP_WRITE_EC);
 	sge_cmd = qpid << 8 | 3;
 	wqe->sge_cmd = htobe64(sge_cmd);
-	m_set_priority(m, CPL_PRIORITY_CONTROL);
-	m_set_sgl(m, NULL);
-	m_set_sgllen(m, 0);
-	return (cxgb_ofld_send(rdev_p->t3cdev_p, m));
+	return t3_offload_tx(rdev_p->adap, m);
 }
 
 int
-cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
 {
-	struct rdma_cq_setup setup;
 	int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
 	
+	size += 1; /* one extra page for storing cq-in-err state */
 	cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
 	if (!cq->cqid)
 		return (-ENOMEM);
-	cq->sw_queue = malloc(size, M_DEVBUF, M_NOWAIT|M_ZERO);
-	if (!cq->sw_queue)
-		return (-ENOMEM);
-#if 0	
-	cq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev,
-					     (1UL << (cq->size_log2)) *
-					     sizeof(struct t3_cqe),
-					     &(cq->dma_addr), M_NOWAIT);
-#else
-	cq->queue = contigmalloc((1UL << (cq->size_log2))*sizeof(struct t3_cqe),
+	if (kernel) {
+		cq->sw_queue = malloc(size, M_DEVBUF, M_NOWAIT|M_ZERO);
+		if (!cq->sw_queue)
+			return (-ENOMEM);
+	}
+
+	cq->queue = contigmalloc(size,
 	    M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0);
 	if (cq->queue)
 		cq->dma_addr = vtophys(cq->queue);
@@ -215,35 +216,10 @@
 		free(cq->sw_queue, M_DEVBUF);
 		return (-ENOMEM);
 	}
-#endif
-	
-#ifdef notyet	
-	pci_unmap_addr_set(cq, mapping, cq->dma_addr);
-#endif
 	memset(cq->queue, 0, size);
-	setup.id = cq->cqid;
-	setup.base_addr = (u64) (cq->dma_addr);
-	setup.size = 1UL << cq->size_log2;
-	setup.credits = 65535;
-	setup.credit_thres = 1;
-	if (rdev_p->t3cdev_p->type != T3A)
-		setup.ovfl_mode = 0;
-	else
-		setup.ovfl_mode = 1;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
-}
 
-int
-cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
-{
-	struct rdma_cq_setup setup;
-	setup.id = cq->cqid;
-	setup.base_addr = (u64) (cq->dma_addr);
-	setup.size = 1UL << cq->size_log2;
-	setup.credits = setup.size;
-	setup.credit_thres = setup.size;	/* TBD: overflow recovery */
-	setup.ovfl_mode = 1;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+	return (cxio_rdma_cq_setup(rdev_p, cq->cqid, cq->dma_addr,
+	    1UL << cq->size_log2, 0, 65535, 1));
 }
 
 static u32
@@ -325,7 +301,7 @@
 	if (!wq->qpid)
 		return (-ENOMEM);
 
-	wq->rq = malloc(depth * sizeof(u64), M_DEVBUF, M_NOWAIT|M_ZERO);
+	wq->rq = malloc(depth * sizeof(struct t3_swrq), M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (!wq->rq)
 		goto err1;
 
@@ -336,28 +312,19 @@
 	wq->sq = malloc(depth * sizeof(struct t3_swsq), M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (!wq->sq)
 		goto err3;
-#if 0
-	wq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev,
-					     depth * sizeof(union t3_wr),
-					     &(wq->dma_addr), M_NOWAIT);
-#else
 	wq->queue = contigmalloc(depth *sizeof(union t3_wr),
 	    M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0);
 	if (wq->queue)
 		wq->dma_addr = vtophys(wq->queue);
-
-#endif
-	if (!wq->queue)
+	else
 		goto err4;
 
 	memset(wq->queue, 0, depth * sizeof(union t3_wr));
-#ifdef notyet	
-	pci_unmap_addr_set(wq, mapping, wq->dma_addr);
-#endif
 	wq->doorbell = rdev_p->rnic_info.kdb_addr;
 	if (!kernel_domain)
 		wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
 					(wq->qpid << rdev_p->qpshift);
+	wq->rdev = rdev_p;
 	CTR4(KTR_IW_CXGB, "%s qpid 0x%x doorbell 0x%p udb 0x%llx", __FUNCTION__,
 	     wq->qpid, wq->doorbell, (unsigned long long) wq->udb);
 	return 0;
@@ -431,10 +398,11 @@
 	cq->sw_wptr++;
 }
 
-void
+int
 cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
 {
 	u32 ptr;
+	int flushed = 0;
 
 	CTR3(KTR_IW_CXGB, "%s wq %p cq %p", __FUNCTION__, wq, cq);
 
@@ -442,8 +410,11 @@
 	CTR4(KTR_IW_CXGB, "%s rq_rptr %u rq_wptr %u skip count %u", __FUNCTION__,
 	    wq->rq_rptr, wq->rq_wptr, count);
 	ptr = wq->rq_rptr + count;
-	while (ptr++ != wq->rq_wptr)
+	while (ptr++ != wq->rq_wptr) {
 		insert_recv_cqe(wq, cq);
+		flushed++;
+	}
+	return (flushed);
 }
 
 static void
@@ -468,19 +439,22 @@
 	cq->sw_wptr++;
 }
 
-void
+int
 cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
 {
 	__u32 ptr;
+	int flushed = 0;
 	struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2);
 
 	ptr = wq->sq_rptr + count;
-	sqp += count;
+	sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
 	while (ptr != wq->sq_wptr) {
 		insert_sq_cqe(wq, cq, sqp);
-		sqp++;
 		ptr++;
+		sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+		flushed++;
 	}
+	return (flushed);
 }
 
 /*
@@ -516,7 +490,7 @@
 	if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe))
 		return 0;
 
-	if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) &&
+	if (CQE_OPCODE(*cqe) && RQ_TYPE(*cqe) &&
 	    Q_EMPTY(wq->rq_rptr, wq->rq_wptr))
 		return 0;
 
@@ -563,16 +537,8 @@
 static int
 cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p)
 {
-	struct rdma_cq_setup setup;
-	setup.id = 0;
-	setup.base_addr = 0;	/* NULL address */
-	setup.size = 1;		/* enable the CQ */
-	setup.credits = 0;
 
-	/* force SGE to redirect to RspQ and interrupt */
-	setup.credit_thres = 0;
-	setup.ovfl_mode = 1;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+	return (cxio_rdma_cq_setup(rdev_p, 0, 0, 1, 1, 0, 0));
 }
 
 static int
@@ -584,41 +550,28 @@
 	struct t3_modify_qp_wr *wqe;
 	struct mbuf *m;
 
-	m = m_gethdr(MT_DATA, M_NOWAIT);
+	m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, wqe);
 	if (m == NULL) {
 		CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__);
-		return (-ENOMEM);
+		return (ENOMEM);
 	}
 	err = cxio_hal_init_ctrl_cq(rdev_p);
 	if (err) {
 		CTR2(KTR_IW_CXGB, "%s err %d initializing ctrl_cq", __FUNCTION__, err);
 		goto err;
 	}
-#if 0	
-	rdev_p->ctrl_qp.workq = dma_alloc_coherent(
-		rdev_p->rnic_info.pdev,
-		    (1 << T3_CTRL_QP_SIZE_LOG2) *
-		    sizeof(union t3_wr),
-		    &(rdev_p->ctrl_qp.dma_addr),
-		    M_NOWAIT);
-#else
+
 	rdev_p->ctrl_qp.workq = contigmalloc((1 << T3_CTRL_QP_SIZE_LOG2) 
 	    *sizeof(union t3_wr), M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0);
 	if (rdev_p->ctrl_qp.workq)
 		rdev_p->ctrl_qp.dma_addr = vtophys(rdev_p->ctrl_qp.workq);
-
-#endif	
-	
-	if (!rdev_p->ctrl_qp.workq) {
+	else {
 		CTR1(KTR_IW_CXGB, "%s dma_alloc_coherent failed", __FUNCTION__);
-		err = -ENOMEM;
+		err = ENOMEM;
 		goto err;
 	}
-#if 0	
-	pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
-			   rdev_p->ctrl_qp.dma_addr);
-#endif	
-	rdev_p->ctrl_qp.doorbell = (void /*__iomem */ *)rdev_p->rnic_info.kdb_addr;
+
+	rdev_p->ctrl_qp.doorbell = rdev_p->rnic_info.kdb_addr;
 	memset(rdev_p->ctrl_qp.workq, 0,
 	       (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr));
 
@@ -637,10 +590,8 @@
 	ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) |
 			V_EC_TYPE(0) | V_EC_GEN(1) |
 			V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32;
-	wqe = mtod(m, struct t3_modify_qp_wr *);
-	m->m_len = m->m_pkthdr.len = sizeof(*wqe);
 	memset(wqe, 0, sizeof(*wqe));
-	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0,
+	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0,
 		       T3_CTL_QP_TID, 7);
 	wqe->flags = htobe32(MODQP_WRITE_EC);
 	sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3;
@@ -650,12 +601,9 @@
 	CTR3(KTR_IW_CXGB, "CtrlQP dma_addr 0x%llx workq %p size %d",
 	     (unsigned long long) rdev_p->ctrl_qp.dma_addr,
 	     rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2);
-	m_set_priority(m, CPL_PRIORITY_CONTROL);
-	m_set_sgl(m, NULL);
-	m_set_sgllen(m, 0);
-	return (cxgb_ofld_send(rdev_p->t3cdev_p, m));
+	return t3_offload_tx(rdev_p->adap, m);
 err:
-	m_free(m);
+	m_freem(m);
 	return err;
 }
 
@@ -681,7 +629,7 @@
  */
 static int
 cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
-				      u32 len, void *data, int completion)
+				      u32 len, void *data)
 {
 	u32 i, nr_wqe, copy_len;
 	u8 *copy_data;
@@ -718,7 +666,7 @@
 		flag = 0;
 		if (i == (nr_wqe - 1)) {
 			/* last WQE */
-			flag = completion ? T3_COMPLETION_FLAG : 0;
+			flag = T3_COMPLETION_FLAG;
 			if (len % 32)
 				utx_len = len / 32 + 1;
 			else
@@ -786,14 +734,13 @@
 __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
 			 u32 *stag, u8 stag_state, u32 pdid,
 			 enum tpt_mem_type type, enum tpt_mem_perm perm,
-			 u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl,
-			 u32 *pbl_size, u32 *pbl_addr)
+			 u32 zbva, u64 to, u32 len, u8 page_size,
+			 u32 pbl_size, u32 pbl_addr)
 {
 	int err;
 	struct tpt_entry tpt;
 	u32 stag_idx;
 	u32 wptr;
-	int rereg = (*stag != T3_STAG_UNSET);
 
 	stag_state = stag_state > 0;
 	stag_idx = (*stag) >> 8;
@@ -807,30 +754,8 @@
 	CTR5(KTR_IW_CXGB, "%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x",
 	     __FUNCTION__, stag_state, type, pdid, stag_idx);
 
-	if (reset_tpt_entry)
-		cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3);
-	else if (!rereg) {
-		*pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3);
-		if (!*pbl_addr) {
-			return (-ENOMEM);
-		}
-	}
-
 	mtx_lock(&rdev_p->ctrl_qp.lock);
 
-	/* write PBL first if any - update pbl only if pbl list exist */
-	if (pbl) {
-
-		CTR4(KTR_IW_CXGB, "%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d",
-		     __FUNCTION__, *pbl_addr, rdev_p->rnic_info.pbl_base,
-		     *pbl_size);
-		err = cxio_hal_ctrl_qp_write_mem(rdev_p,
-				(*pbl_addr >> 5),
-				(*pbl_size << 3), pbl, 0);
-		if (err)
-			goto ret;
-	}
-
 	/* write TPT entry */
 	if (reset_tpt_entry)
 		memset(&tpt, 0, sizeof(tpt));
@@ -845,23 +770,23 @@
 				V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) |
 				V_TPT_PAGE_SIZE(page_size));
 		tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 :
-				    htobe32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3));
+				    htobe32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3));
 		tpt.len = htobe32(len);
 		tpt.va_hi = htobe32((u32) (to >> 32));
 		tpt.va_low_or_fbo = htobe32((u32) (to & 0xFFFFFFFFULL));
 		tpt.rsvd_bind_cnt_or_pstag = 0;
 		tpt.rsvd_pbl_size = reset_tpt_entry ? 0 :
-				  htobe32(V_TPT_PBL_SIZE((*pbl_size) >> 2));
+				  htobe32(V_TPT_PBL_SIZE((pbl_size) >> 2));
 	}
 	err = cxio_hal_ctrl_qp_write_mem(rdev_p,
 				       stag_idx +
 				       (rdev_p->rnic_info.tpt_base >> 5),
-				       sizeof(tpt), &tpt, 1);
+				       sizeof(tpt), &tpt);
 
 	/* release the stag index to free pool */
 	if (reset_tpt_entry)
 		cxio_hal_put_stag(rdev_p->rscp, stag_idx);
-ret:
+
 	wptr = rdev_p->ctrl_qp.wptr;
 	mtx_unlock(&rdev_p->ctrl_qp.lock);
 	if (!err)
@@ -872,61 +797,90 @@
 	return err;
 }
 
+int
+cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, u32 pbl_addr, u32 pbl_size)
+{
+	u32 wptr;
+	int err;
+
+	CTR4(KTR_IW_CXGB, "%s pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d",
+	    __func__, pbl_addr, rdev_p->rnic_info.pbl_base,
+	    pbl_size);
+
+	mtx_lock(&rdev_p->ctrl_qp.lock);
+	err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3,
+					pbl);
+	wptr = rdev_p->ctrl_qp.wptr;
+	mtx_unlock(&rdev_p->ctrl_qp.lock);
+	if (err)
+		return err;
+
+	if (cxio_wait(&rdev_p->ctrl_qp,
+	    &rdev_p->ctrl_qp.lock,
+	    SEQ32_GE(rdev_p->ctrl_qp.rptr, wptr)))
+		return ERESTART;
+
+	return 0;
+}
+
 int
 cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
 			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, __be64 *pbl, u32 *pbl_size,
-			   u32 *pbl_addr)
+			   u8 page_size, u32 pbl_size, u32 pbl_addr)
 {
 	*stag = T3_STAG_UNSET;
 	return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
-			     zbva, to, len, page_size, pbl, pbl_size, pbl_addr);
+			     zbva, to, len, page_size, pbl_size, pbl_addr);
 }
 
 int
 cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
 			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, __be64 *pbl, u32 *pbl_size,
-			   u32 *pbl_addr)
+			   u8 page_size, u32 pbl_size, u32 pbl_addr)
 {
 	return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
-			     zbva, to, len, page_size, pbl, pbl_size, pbl_addr);
+			     zbva, to, len, page_size, pbl_size, pbl_addr);
 }
 
 int
 cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size,
 		   u32 pbl_addr)
 {
-	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL,
-			     &pbl_size, &pbl_addr);
+	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
+			     pbl_size, pbl_addr);
 }
 
 int
 cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid)
 {
-	u32 pbl_size = 0;
 	*stag = T3_STAG_UNSET;
 	return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0,
-			     NULL, &pbl_size, NULL);
+			     0, 0);
 }
 
 int
 cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
 {
-	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL,
-			     NULL, NULL);
+	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
+			     0, 0);
 }
 
 int
-cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
+cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr,
+    struct socket *so)
 {
 	struct t3_rdma_init_wr *wqe;
-	struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT);
+	struct mbuf *m;
+	struct ofld_hdr *oh;
+	int rc;
+	struct tcpcb *tp;
+	struct inpcb *inp;
+	struct toepcb *toep;
+
+	m = M_GETHDR_OFLD(0, CPL_PRIORITY_DATA, wqe);
 	if (m == NULL)
 		return (-ENOMEM);
 	CTR2(KTR_IW_CXGB, "%s rdev_p %p", __FUNCTION__, rdev_p);
-	wqe = mtod(m, struct t3_rdma_init_wr *);
-	m->m_len = m->m_pkthdr.len = sizeof(*wqe);
 	wqe->wrh.op_seop_flags = htobe32(V_FW_RIWR_OP(T3_WR_INIT));
 	wqe->wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(attr->tid) |
 					   V_FW_RIWR_LEN(sizeof(*wqe) >> 3));
@@ -940,36 +894,41 @@
 	wqe->mpaattrs = attr->mpaattrs;
 	wqe->qpcaps = attr->qpcaps;
 	wqe->ulpdu_size = htobe16(attr->tcp_emss);
-	wqe->flags = htobe32(attr->flags);
+	wqe->rqe_count = htobe16(attr->rqe_count);
+	wqe->flags_rtr_type = htobe16(attr->flags |
+					V_RTR_TYPE(attr->rtr_type) |
+					V_CHAN(attr->chan));
 	wqe->ord = htobe32(attr->ord);
 	wqe->ird = htobe32(attr->ird);
 	wqe->qp_dma_addr = htobe64(attr->qp_dma_addr);
 	wqe->qp_dma_size = htobe32(attr->qp_dma_size);
 	wqe->irs = htobe32(attr->irs);
-	m_set_priority(m, 0);	/* 0=>ToeQ; 1=>CtrlQ */
-	m_set_sgl(m, NULL);
-	m_set_sgllen(m, 0);
-	return (cxgb_ofld_send(rdev_p->t3cdev_p, m));
-}
 
-void
-cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
-{
-	cxio_ev_cb = ev_cb;
-}
+	/* XXX: bad form, fix later */
+	inp = sotoinpcb(so);
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+	toep = tp->t_toe;
+	oh = mtod(m, struct ofld_hdr *);
+	oh->plen = 0;
+	oh->flags |= F_HDR_DF;
+	enqueue_wr(toep, m);
+	toep->tp_wr_avail--;
+	toep->tp_wr_unacked++;
+	rc = t3_offload_tx(rdev_p->adap, m);
+	INP_WUNLOCK(inp);
 
-void
-cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
-{
-	cxio_ev_cb = NULL;
+	return (rc);
 }
 
 static int
-cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct mbuf *m)
+cxio_hal_ev_handler(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	static int cnt;
-	struct cxio_rdev *rdev_p = NULL;
+	struct adapter *sc = qs->adap;
+	struct iwch_dev *rnicp = sc->iwarp_softc;
+	struct cxio_rdev *rdev_p = &rnicp->rdev;
 	struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data;
+	int qpid = CQE_QPID(rsp_msg->cqe);
 	
 	CTR6(KTR_IW_CXGB, "%s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x",
 	     __FUNCTION__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg),
@@ -978,80 +937,50 @@
 	     RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg),
 	     RSPQ_CREDIT_THRESH(rsp_msg));
 	CTR4(KTR_IW_CXGB, "CQE: QPID 0x%0x type 0x%0x status 0x%0x opcode %d",
-	     CQE_QPID(rsp_msg->cqe), 
-	     CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
-	     CQE_OPCODE(rsp_msg->cqe));
+	    qpid, CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
+	    CQE_OPCODE(rsp_msg->cqe));
 	CTR3(KTR_IW_CXGB, "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x",
 	     CQE_LEN(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
-	rdev_p = (struct cxio_rdev *)t3cdev_p->ulp;
-	if (!rdev_p) {
-		CTR2(KTR_IW_CXGB, "%s called by t3cdev %p with null ulp", __FUNCTION__,
-		     t3cdev_p);
-		return 0;
-	}
-	if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) {
+
+	switch(qpid) {
+	case T3_CTRL_QP_ID:
 		mtx_lock(&rdev_p->ctrl_qp.lock);
 		rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1;
 		wakeup(&rdev_p->ctrl_qp);
 		mtx_unlock(&rdev_p->ctrl_qp.lock);
-		m_free(m);
-	} else if (CQE_QPID(rsp_msg->cqe) == 0xfff8)
-		m_free(m);
-	else if (cxio_ev_cb)
-		(*cxio_ev_cb) (rdev_p, m);
-	else
-		m_free(m);
-	cnt++;
-	return 0;
+		break;
+	case 0xfff8:
+		break;
+	default:
+		iwch_ev_dispatch(rnicp, m);
+	}
+
+	m_freem(m);
+	return (0);
 }
 
 /* Caller takes care of locking if needed */
 int
 cxio_rdev_open(struct cxio_rdev *rdev_p)
 {
-	struct ifnet *ifp;
 	int err = 0;
+	struct rdma_info *ri = &rdev_p->rnic_info;
+	struct adapter *sc = rdev_p->adap;
 
-	if (strlen(rdev_p->dev_name)) {
-		if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) {
-			return (-EBUSY);
-		}
-		ifp = rdev_p->ifp; 
-		if (ifp == NULL) 
-			return (-EINVAL);
-		if_free(ifp);
-	} else if (rdev_p->t3cdev_p) {
-		if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) 
-			return (-EBUSY);
-		ifp = rdev_p->t3cdev_p->lldev;
-		strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name,
-			T3_MAX_DEV_NAME_LEN);
-	} else {
-		CTR1(KTR_IW_CXGB, "%s t3cdev_p or dev_name must be set", __FUNCTION__);
-		return (-EINVAL);
-	}
+	KASSERT(rdev_p->adap, ("%s: adap is NULL", __func__));
 
-	TAILQ_INSERT_TAIL(&rdev_list, rdev_p, entry);
+	memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp));
 
-	CTR2(KTR_IW_CXGB, "%s opening rnic dev %s", __FUNCTION__, rdev_p->dev_name);
-	memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp));
-	if (!rdev_p->t3cdev_p)
-		rdev_p->t3cdev_p = T3CDEV(ifp);
-	rdev_p->t3cdev_p->ulp = (void *) rdev_p;
-	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS,
-					 &(rdev_p->rnic_info));
-	if (err) {
-		log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n",
-		     __FUNCTION__, rdev_p->t3cdev_p, err);
-		goto err1;
-	}
-	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS,
-				    &(rdev_p->port_info));
-	if (err) {
-		log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n",
-		     __FUNCTION__, rdev_p->t3cdev_p, err);
-		goto err1;
-	}
+	ri->udbell_physbase = rman_get_start(sc->udbs_res);
+	ri->udbell_len = rman_get_size(sc->udbs_res);
+	ri->tpt_base = t3_read_reg(sc, A_ULPTX_TPT_LLIMIT);
+	ri->tpt_top  = t3_read_reg(sc, A_ULPTX_TPT_ULIMIT);
+	ri->pbl_base = t3_read_reg(sc, A_ULPTX_PBL_LLIMIT);
+	ri->pbl_top  = t3_read_reg(sc, A_ULPTX_PBL_ULIMIT);
+	ri->rqt_base = t3_read_reg(sc, A_ULPRX_RQ_LLIMIT);
+	ri->rqt_top  = t3_read_reg(sc, A_ULPRX_RQ_ULIMIT);
+	ri->kdb_addr = (void *)((unsigned long)
+	    rman_get_virtual(sc->regs_res) + A_SG_KDOORBELL);
 
 	/*
 	 * qpshift is the number of bits to shift the qpid left in order
@@ -1064,8 +993,8 @@
 					      PAGE_SHIFT));
 	rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT;
 	rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1;
-	CTR4(KTR_IW_CXGB, "cxio_rdev_open rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d",
-	     rdev_p->dev_name, rdev_p->rnic_info.tpt_base,
+	CTR4(KTR_IW_CXGB, "cxio_rdev_open rnic %p info: tpt_base 0x%0x tpt_top 0x%0x num stags %d",
+	     rdev_p->adap, rdev_p->rnic_info.tpt_base,
 	     rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p));
 	CTR4(KTR_IW_CXGB, "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x",
 	     rdev_p->rnic_info.pbl_base,
@@ -1111,43 +1040,34 @@
 err2:
 	cxio_hal_destroy_ctrl_qp(rdev_p);
 err1:
-	TAILQ_REMOVE(&rdev_list, rdev_p, entry);
 	return err;
 }
 
 void
 cxio_rdev_close(struct cxio_rdev *rdev_p)
 {
-	if (rdev_p) {
-		cxio_hal_pblpool_destroy(rdev_p);
-		cxio_hal_rqtpool_destroy(rdev_p);
-		TAILQ_REMOVE(&rdev_list, rdev_p, entry);
-		rdev_p->t3cdev_p->ulp = NULL;
-		cxio_hal_destroy_ctrl_qp(rdev_p);
-		cxio_hal_destroy_resource(rdev_p->rscp);
-	}
+	cxio_hal_pblpool_destroy(rdev_p);
+	cxio_hal_rqtpool_destroy(rdev_p);
+	cxio_hal_destroy_ctrl_qp(rdev_p);
+	cxio_hal_destroy_resource(rdev_p->rscp);
 }
 
 int
-cxio_hal_init(void)
+cxio_hal_init(struct adapter *sc)
 {
-	TAILQ_INIT(&rdev_list);
 #ifdef needed
 	if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI))
-		return (-ENOMEM);
+		return (ENOMEM);
 #endif
-	t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler);
-	return 0;
+	t3_register_cpl_handler(sc, CPL_ASYNC_NOTIF, cxio_hal_ev_handler);
+
+	return (0);
 }
 
 void
-cxio_hal_exit(void)
+cxio_hal_uninit(struct adapter *sc)
 {
-	struct cxio_rdev *rdev, *tmp;
-
-	t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL);
-	TAILQ_FOREACH_SAFE(rdev, &rdev_list, entry, tmp)
-		cxio_rdev_close(rdev);
+	t3_register_cpl_handler(sc, CPL_ASYNC_NOTIF, NULL);
 #ifdef needed
 	cxio_hal_destroy_rhdl_resource();
 #endif
@@ -1304,11 +1224,12 @@
 		}
 
 		/* incoming SEND with no receive posted failures */
-		if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) &&
+		if (CQE_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) &&
 		    Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
 			ret = -1;
 			goto skip_cqe;
 		}
+		PANIC_IF((*cqe_flushed == 0) && !SW_CQE(*hw_cqe));
 		goto proc_cqe;
 	}
 
@@ -1323,6 +1244,13 @@
 		 * then we complete this with TPT_ERR_MSN and mark the wq in
 		 * error.
 		 */
+
+		if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
+			wq->error = 1;
+			ret = -1;
+			goto skip_cqe;
+		}
+
 		if (__predict_false((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) {
 			wq->error = 1;
 			hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN));
@@ -1367,13 +1295,17 @@
 		wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe);
 		CTR2(KTR_IW_CXGB, "%s completing sq idx %ld", __FUNCTION__,
 		     Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2));
-		*cookie = (wq->sq +
-			   Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id;
+		*cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id;
 		wq->sq_rptr++;
 	} else {
 		CTR2(KTR_IW_CXGB, "%s completing rq idx %ld", __FUNCTION__,
 		     Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
-		*cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
+		*cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id;
+		if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr)
+			cxio_hal_pblpool_free(wq->rdev,
+			    wq->rq[Q_PTR2IDX(wq->rq_rptr,
+			    wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE);
+		PANIC_IF(Q_EMPTY(wq->rq_rptr, wq->rq_wptr));
 		wq->rq_rptr++;
 	}
 
@@ -1404,5 +1336,4 @@
 	}
 	return ret;
 }
-
-
+#endif
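
Note: all three removed RDMA_CQ_SETUP ctl() call sites in iw_cxgb_hal.c collapse into the new cxio_rdma_cq_setup() helper, which programs the CQ context directly under sge.reg_lock. The argument order after the rdev pointer is (id, base_addr, size, ovfl_mode, credits, credit_thres); the mapping, condensed from the hunks above:

    /* disable a CQ */
    cxio_rdma_cq_setup(rdev_p, cqid, 0, 0, 0, 0, 0);

    /* kernel CQ: 65535 credits, credit threshold 1 */
    cxio_rdma_cq_setup(rdev_p, cq->cqid, cq->dma_addr,
        1UL << cq->size_log2, 0, 65535, 1);

    /* ctrl CQ: size 1, ovfl_mode 1 forces redirect to the RspQ */
    cxio_rdma_cq_setup(rdev_p, 0, 0, 1, 1, 0, 0);
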
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h	Mon Jun 11 00:15:24 2012 -0700
@@ -45,7 +45,11 @@
 #define T3_MAX_NUM_PD (1<<15)
 #define T3_MAX_PBL_SIZE 256
 #define T3_MAX_RQ_SIZE 1024
+#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
+#define T3_MAX_CQ_DEPTH 65536
 #define T3_MAX_NUM_STAG (1<<15)
+#define T3_MAX_MR_SIZE 0x100000000ULL
+#define T3_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
 
 #define T3_STAG_UNSET 0xffffffff
 
@@ -55,12 +59,9 @@
 	u32 wptr;
 	u32 rptr;
 	struct mtx lock;	/* for the wtpr, can sleep */
-#ifdef notyet
-	DECLARE_PCI_UNMAP_ADDR(mapping)
-#endif	
 	union t3_wr *workq;	/* the work request queue */
 	bus_addr_t dma_addr;	/* pci bus address of the workq */
-	void /* __iomem */ *doorbell;
+	void *doorbell;
 };
 
 struct cxio_hal_resource {
@@ -85,13 +86,10 @@
 };
 
 struct cxio_rdev {
-	char dev_name[T3_MAX_DEV_NAME_LEN];
-	struct t3cdev *t3cdev_p;
+	struct adapter *adap;
 	struct rdma_info rnic_info;
-	struct adap_ports port_info;
 	struct cxio_hal_resource *rscp;
 	struct cxio_hal_ctrl_qp ctrl_qp;
-	void *ulp;
 	unsigned long qpshift;
 	u32 qpnr;
 	u32 qpmask;
@@ -139,9 +137,8 @@
 void cxio_rdev_close(struct cxio_rdev *rdev);
 int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
 		   enum t3_cq_opcode op, u32 credit);
-int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
 int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
-int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
@@ -149,27 +146,27 @@
 int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq,
 		    struct cxio_ucontext *uctx);
 int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode);
+int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
+		   u32 pbl_addr, u32 pbl_size);
 int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
 			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, __be64 *pbl, u32 *pbl_size,
-			   u32 *pbl_addr);
+			   u8 page_size, u32 pbl_size, u32 pbl_addr);
 int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
 			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, __be64 *pbl, u32 *pbl_size,
-			   u32 *pbl_addr);
+			   u8 page_size, u32 pbl_size, u32 pbl_addr);
 int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
 		   u32 pbl_addr);
 int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
 int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag);
-int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr);
-void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
-void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
+int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr,
+    struct socket *so);
 u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp);
 void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid);
-int cxio_hal_init(void);
+int cxio_hal_init(struct adapter *);
+void cxio_hal_uninit(struct adapter *);
 void cxio_hal_exit(void);
-void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
-void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
+int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
+int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
 void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
 void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
 void cxio_flush_hw_cq(struct t3_cq *cq);
@@ -178,7 +175,7 @@
 
 #define MOD "iw_cxgb: "
 
-#ifdef DEBUG
+#ifdef INVARIANTS
 void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag);
 void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint32_t len, u8 shift);
 void cxio_dump_wqe(union t3_wr *wqe);
@@ -187,60 +184,7 @@
 void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid);
 #endif
 
-
- static unsigned char hiBitSetTab[] = {
-    0, 1, 2, 2, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 5, 5, 5, 5,
-    5, 5, 5, 5, 5, 5, 5, 5,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7
-
-};
-
-
-static __inline
-int ilog2(unsigned long val)
-{
-    unsigned long   tmp;
-
-    tmp = val >> 24;
-    if (tmp) {
-        return hiBitSetTab[tmp] + 23;
-    }
-    tmp = (val >> 16) & 0xff;
-    if (tmp) {
-        return hiBitSetTab[tmp] + 15;
-    }
-    tmp = (val >> 8) & 0xff;
-    if (tmp) {
-        return hiBitSetTab[tmp] + 7;
-
-    }
-    return hiBitSetTab[val & 0xff] - 1;
-} 
-
 #define cxfree(a) free((a), M_DEVBUF);
-#define kmalloc(a, b) malloc((a), M_DEVBUF, (b))
-#define kzalloc(a, b) malloc((a), M_DEVBUF, (b)|M_ZERO)
-
-static __inline __attribute__((const))
-unsigned long roundup_pow_of_two(unsigned long n)
-{
-	return 1UL << flsl(n - 1);
-}
-
-#define PAGE_ALIGN(x) roundup2((x), PAGE_SIZE)
 
 #include <sys/blist.h>
 struct gen_pool {
@@ -259,6 +203,7 @@
 	if (gp == NULL)
 		return (NULL);
 	
+	memset(gp, 0, sizeof(struct gen_pool));
 	gp->gen_list = blist_create(len >> chunk_shift, M_NOWAIT);
 	if (gp->gen_list == NULL) {
 		free(gp, M_DEVBUF);
@@ -323,8 +268,7 @@
 	mtx_unlock(lockp); \
 	__ret; \
 }) 
-extern struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev);
 
-#define KTR_IW_CXGB KTR_SPARE4
+#define KTR_IW_CXGB KTR_SPARE3
 
 #endif
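
Note: iw_cxgb_hal.h sheds its private ilog2()/hiBitSetTab, roundup_pow_of_two(), kmalloc/kzalloc and PAGE_ALIGN definitions, but callers keep using those names; they are presumably supplied by the OFED/Linux-compat headers the .c files now pull in (an assumption worth verifying in any out-of-tree build). Typical surviving usage, as in iwch_create_cq():

    entries = roundup_pow_of_two(entries);
    chp->cq.size_log2 = ilog2(entries);
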
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ib_intfc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ib_intfc.h	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,20 @@
+#ifndef  __IB_INTFC_H__
+#define  __IB_INTFC_H__
+
+#undef prefetch
+#undef WARN_ON
+#undef max_t
+#undef udelay
+#undef le32_to_cpu
+#undef le16_to_cpu
+#undef cpu_to_le32
+#undef swab32
+#undef container_of
+
+#undef LIST_HEAD
+#define LIST_HEAD(name, type)                                           \
+struct name {                                                           \
+        struct type *lh_first;  /* first element */                     \
+}
+
+#endif /* __IB_INTFC_H__ */
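
Note: this new shim exists because the iw_cxgb sources mix FreeBSD headers with OFED/Linux-compat headers whose macro namespaces overlap; the #undefs let the later definitions win. LIST_HEAD is the exception: it is forced back to the two-argument sys/queue.h shape the driver's structs rely on. The collision it papers over, illustrated with a hypothetical include order:

    #include <sys/queue.h>      /* LIST_HEAD(name, type): declares a struct */
    #include <linux/list.h>     /* Linux-style LIST_HEAD(name): one argument */

    LIST_HEAD(mm_head, iwch_mm_entry);  /* needs the queue.h meaning */
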
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c	Mon Jun 11 00:15:24 2012 -0700
@@ -29,11 +29,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -59,9 +61,11 @@
 
 #include <netinet/in.h>
 
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/ib_umem.h>
-#include <contrib/rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
 #include <cxgb_include.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
@@ -72,14 +76,24 @@
 #include <ulp/iw_cxgb/iw_cxgb_resource.h>
 #include <ulp/iw_cxgb/iw_cxgb_user.h>
 
+static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag)
+{
+	u32 mmid;
+
+	mhp->attr.state = 1;
+	mhp->attr.stag = stag;
+	mmid = stag >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __func__, mmid, mhp);
+	return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+}
+
 int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
 					struct iwch_mr *mhp,
-					int shift,
-					__be64 *page_list)
+					int shift)
 {
 	u32 stag;
-	u32 mmid;
-
+	int ret;
 
 	if (cxio_register_phys_mem(&rhp->rdev,
 				   &stag, mhp->attr.pdid,
@@ -87,28 +101,24 @@
 				   mhp->attr.zbva,
 				   mhp->attr.va_fbo,
 				   mhp->attr.len,
-				   shift-12,
-				   page_list,
-				   &mhp->attr.pbl_size, &mhp->attr.pbl_addr))
+				   shift - 12,
+				   mhp->attr.pbl_size, mhp->attr.pbl_addr))
 		return (-ENOMEM);
-	mhp->attr.state = 1;
-	mhp->attr.stag = stag;
-	mmid = stag >> 8;
-	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-	insert_handle(rhp, &rhp->mmidr, mhp, mmid);
-	CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp);
-	return 0;
+
+	ret = iwch_finish_mem_reg(mhp, stag);
+	if (ret)
+		cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+			mhp->attr.pbl_addr);
+	return ret;
 }
 
 int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
 					struct iwch_mr *mhp,
 					int shift,
-					__be64 *page_list,
 					int npages)
 {
 	u32 stag;
-	u32 mmid;
-
+	int ret;
 
 	/* We could support this... */
 	if (npages > mhp->attr.pbl_size)
@@ -121,17 +131,40 @@
 				   mhp->attr.zbva,
 				   mhp->attr.va_fbo,
 				   mhp->attr.len,
-				   shift-12,
-				   page_list,
-				   &mhp->attr.pbl_size, &mhp->attr.pbl_addr))
+				   shift - 12,
+				   mhp->attr.pbl_size, mhp->attr.pbl_addr))
 		return (-ENOMEM);
-	mhp->attr.state = 1;
-	mhp->attr.stag = stag;
-	mmid = stag >> 8;
-	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-	insert_handle(rhp, &rhp->mmidr, mhp, mmid);
-	CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp);
+
+	ret = iwch_finish_mem_reg(mhp, stag);
+	if (ret)
+		cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		    mhp->attr.pbl_addr);
+	return ret;
+}
+
+int iwch_alloc_pbl(struct iwch_mr *mhp, int npages)
+{
+	mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev,
+						    npages << 3);
+
+	if (!mhp->attr.pbl_addr)
+		return -ENOMEM;
+
+	mhp->attr.pbl_size = npages;
+
 	return 0;
+}
+
+void iwch_free_pbl(struct iwch_mr *mhp)
+{
+	cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+}
+
+int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset)
+{
+	return cxio_write_pbl(&mhp->rhp->rdev, pages,
+			      mhp->attr.pbl_addr + (offset << 3), npages);
 }
 
 int build_phys_page_list(struct ib_phys_buf *buffer_list,
@@ -204,3 +237,4 @@
 	return 0;
 
 }
+#endif
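
Note: memory registration in iw_cxgb_mem.c is reworked so the PBL lifecycle is explicit: iwch_alloc_pbl() reserves PBL space, iwch_write_pbl() pushes page lists into it through the new cxio_write_pbl(), and iwch_register_mem()/iwch_finish_mem_reg() only write the TPT entry and publish the stag, deregistering the TPT entry again if the handle insert fails. The flow the provider code below follows, condensed:

    ret = iwch_alloc_pbl(mhp, npages);               /* reserve */
    if (ret)
            goto err;
    ret = iwch_write_pbl(mhp, page_list, npages, 0); /* populate */
    if (ret)
            goto err_pbl;
    ret = iwch_register_mem(rhp, php, mhp, shift);   /* write TPT */
    if (ret)
            goto err_pbl;
    return &mhp->ibmr;
    err_pbl:
            iwch_free_pbl(mhp);
    err:
            cxfree(mhp);
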
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c	Mon Jun 11 00:15:24 2012 -0700
@@ -29,11 +29,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -62,9 +64,12 @@
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/ib_umem.h>
-#include <contrib/rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
+
 
 #include <cxgb_include.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
@@ -180,6 +185,8 @@
 	struct iwch_create_cq_resp uresp;
 	struct iwch_create_cq_req ureq;
 	struct iwch_ucontext *ucontext = NULL;
+	static int warned;
+	size_t resplen;
 
 	CTR3(KTR_IW_CXGB, "%s ib_dev %p entries %d", __FUNCTION__, ibdev, entries);
 	rhp = to_iwch_dev(ibdev);
@@ -214,7 +221,7 @@
 	entries = roundup_pow_of_two(entries);
 	chp->cq.size_log2 = ilog2(entries);
 
-	if (cxio_create_cq(&rhp->rdev, &chp->cq)) {
+	if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) {
 		cxfree(chp);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -222,7 +229,11 @@
 	chp->ibcq.cqe = 1 << chp->cq.size_log2;
 	mtx_init(&chp->lock, "cxgb cq", NULL, MTX_DEF|MTX_DUPOK);
 	chp->refcnt = 1;
-	insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
+	if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) {
+		cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
+		cxfree(chp);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	if (ucontext) {
 		struct iwch_mm_entry *mm;
@@ -238,15 +249,27 @@
 		uresp.key = ucontext->key;
 		ucontext->key += PAGE_SIZE;
 		mtx_unlock(&ucontext->mmap_lock);
-		if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+		mm->key = uresp.key;
+		mm->addr = vtophys(chp->cq.queue);
+		if (udata->outlen < sizeof(uresp)) {
+			if (!warned++)
+				CTR1(KTR_IW_CXGB, "%s Warning - "
+				    "downlevel libcxgb3 (non-fatal).",
+				    __func__);
+			mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
+			    sizeof(struct t3_cqe));
+			resplen = sizeof(struct iwch_create_cq_resp_v0);
+		} else {
+			mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) *
+			    sizeof(struct t3_cqe));
+			uresp.memsize = mm->len;
+			resplen = sizeof(uresp);
+		}
+		if (ib_copy_to_udata(udata, &uresp, resplen)) {
 			cxfree(mm);
 			iwch_destroy_cq(&chp->ibcq);
 			return ERR_PTR(-EFAULT);
 		}
-		mm->key = uresp.key;
-		mm->addr = vtophys(chp->cq.queue);
-		mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
-					     sizeof (struct t3_cqe));
 		insert_mmap(ucontext, mm);
 	}
 	CTR4(KTR_IW_CXGB, "created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx",
@@ -256,72 +279,11 @@
 }
 
 static int
-iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+iwch_resize_cq(struct ib_cq *cq __unused, int cqe __unused,
+    struct ib_udata *udata __unused)
 {
-#ifdef notyet
-	struct iwch_cq *chp = to_iwch_cq(cq);
-	struct t3_cq oldcq, newcq;
-	int ret;
 
-	CTR3(KTR_IW_CXGB, "%s ib_cq %p cqe %d", __FUNCTION__, cq, cqe);
-
-	/* We don't downsize... */
-	if (cqe <= cq->cqe)
-		return 0;
-
-	/* create new t3_cq with new size */
-	cqe = roundup_pow_of_two(cqe+1);
-	newcq.size_log2 = ilog2(cqe);
-
-	/* Dont allow resize to less than the current wce count */
-	if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) {
-		return (-ENOMEM);
-	}
-
-	/* Quiesce all QPs using this CQ */
-	ret = iwch_quiesce_qps(chp);
-	if (ret) {
-		return (ret);
-	}
-
-	ret = cxio_create_cq(&chp->rhp->rdev, &newcq);
-	if (ret) {
-		return (ret);
-	}
-
-	/* copy CQEs */
-	memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) *
-				        sizeof(struct t3_cqe));
-
-	/* old iwch_qp gets new t3_cq but keeps old cqid */
-	oldcq = chp->cq;
-	chp->cq = newcq;
-	chp->cq.cqid = oldcq.cqid;
-
-	/* resize new t3_cq to update the HW context */
-	ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq);
-	if (ret) {
-		chp->cq = oldcq;
-		return ret;
-	}
-	chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1;
-
-	/* destroy old t3_cq */
-	oldcq.cqid = newcq.cqid;
-	ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq);
-	if (ret) {
-		log(LOG_ERR, "%s - cxio_destroy_cq failed %d\n",
-			__FUNCTION__, ret);
-	}
-
-	/* add user hooks here */
-
-	/* resume qps */
-	ret = iwch_resume_qps(chp);
-	return ret;
-#else
 	return (-ENOSYS);
-#endif
 }
 
 static int
@@ -357,67 +319,12 @@
 	return err;
 }
 
-#ifdef notyet
 static int
-iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+iwch_mmap(struct ib_ucontext *context __unused, struct vm_area_struct *vma __unused)
 {
-#ifdef notyet	
-	int len = vma->vm_end - vma->vm_start;
-	u32 key = vma->vm_pgoff << PAGE_SHIFT;
-	struct cxio_rdev *rdev_p;
-	int ret = 0;
-	struct iwch_mm_entry *mm;
-	struct iwch_ucontext *ucontext;
-	u64 addr;
 
-	CTR4(KTR_IW_CXGB, "%s pgoff 0x%lx key 0x%x len %d", __FUNCTION__, vma->vm_pgoff,
-	     key, len);
-
-	if (vma->vm_start & (PAGE_SIZE-1)) {
-	        return (-EINVAL);
-	}
-
-	rdev_p = &(to_iwch_dev(context->device)->rdev);
-	ucontext = to_iwch_ucontext(context);
-
-	mm = remove_mmap(ucontext, key, len);
-	if (!mm)
-		return (-EINVAL);
-	addr = mm->addr;
-	cxfree(mm);
-
-	if ((addr >= rdev_p->rnic_info.udbell_physbase) &&
-	    (addr < (rdev_p->rnic_info.udbell_physbase +
-		       rdev_p->rnic_info.udbell_len))) {
-
-		/*
-		 * Map T3 DB register.
-		 */
-		if (vma->vm_flags & VM_READ) {
-			return (-EPERM);
-		}
-
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-		vma->vm_flags &= ~VM_MAYREAD;
-		ret = io_remap_pfn_range(vma, vma->vm_start,
-					 addr >> PAGE_SHIFT,
-				         len, vma->vm_page_prot);
-	} else {
-
-		/*
-		 * Map WQ or CQ contig dma memory...
-		 */
-		ret = remap_pfn_range(vma, vma->vm_start,
-				      addr >> PAGE_SHIFT,
-				      len, vma->vm_page_prot);
-	}
-
-	return ret;
-#endif
-	return (0);
+	return (-ENOSYS);
 }
-#endif
 
 static int iwch_deallocate_pd(struct ib_pd *pd)
 {
@@ -470,7 +377,7 @@
 
 	CTR2(KTR_IW_CXGB, "%s ib_mr %p", __FUNCTION__, ib_mr);
 	/* There can be no memory windows */
-	if (atomic_load_acq_int(&ib_mr->usecnt))
+	if (atomic_load_acq_int(&ib_mr->usecnt.counter))
 		return (-EINVAL);
 
 	mhp = to_iwch_mr(ib_mr);
@@ -478,6 +385,7 @@
 	mmid = mhp->attr.stag >> 8;
 	cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
 		       mhp->attr.pbl_addr);
+	iwch_free_pbl(mhp);
 	remove_handle(rhp, &rhp->mmidr, mmid);
 	if (mhp->kva)
 		cxfree((void *) (unsigned long) mhp->kva);
@@ -511,6 +419,8 @@
 	if (!mhp)
 		return ERR_PTR(-ENOMEM);
 
+	mhp->rhp = rhp;
+
 	/* First check that we have enough alignment */
 	if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
 		ret = -EINVAL;
@@ -528,7 +438,17 @@
 	if (ret)
 		goto err;
 
-	mhp->rhp = rhp;
+	ret = iwch_alloc_pbl(mhp, npages);
+	if (ret) {
+		cxfree(page_list);
+		goto err_pbl;
+	}
+
+	ret = iwch_write_pbl(mhp, page_list, npages, 0);
+	cxfree(page_list);
+	if (ret)
+		goto err;
+
 	mhp->attr.pdid = php->pdid;
 	mhp->attr.zbva = 0;
 
@@ -538,15 +458,18 @@
 
 	mhp->attr.len = (u32) total_size;
 	mhp->attr.pbl_size = npages;
-	ret = iwch_register_mem(rhp, php, mhp, shift, page_list);
-	cxfree(page_list);
-	if (ret) {
-		goto err;
-	}
+	ret = iwch_register_mem(rhp, php, mhp, shift);
+	if (ret)
+		goto err_pbl;
+
 	return &mhp->ibmr;
+
+err_pbl:
+	iwch_free_pbl(mhp);
+
 err:
 	cxfree(mhp);
-	return ERR_PTR(-ret);
+	return ERR_PTR(ret);
 
 }
 
@@ -570,7 +493,7 @@
 	CTR3(KTR_IW_CXGB, "%s ib_mr %p ib_pd %p", __FUNCTION__, mr, pd);
 
 	/* There can be no memory windows */
-	if (atomic_load_acq_int(&mr->usecnt))
+	if (atomic_load_acq_int(&mr->usecnt.counter))
 		return (-EINVAL);
 
 	mhp = to_iwch_mr(mr);
@@ -596,7 +519,7 @@
 			return ret;
 	}
 
-	ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages);
+	ret = iwch_reregister_mem(rhp, php, &mh, shift, npages);
 	cxfree(page_list);
 	if (ret) {
 		return ret;
@@ -640,7 +563,9 @@
 	if (!mhp)
 		return ERR_PTR(-ENOMEM);
 
-	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc);
+	mhp->rhp = rhp;
+
+	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
 	if (IS_ERR(mhp->umem)) {
 		err = PTR_ERR(mhp->umem);
 		cxfree(mhp);
@@ -650,18 +575,22 @@
 	shift = ffs(mhp->umem->page_size) - 1;
 
 	n = 0;
-	TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry)
+	list_for_each_entry(chunk, &mhp->umem->chunk_list, list)
 		n += chunk->nents;
 
-	pages = kmalloc(n * sizeof(u64), M_NOWAIT);
+	err = iwch_alloc_pbl(mhp, n);
+	if (err)
+		goto err;
+
+	pages = (__be64 *) kmalloc(n * sizeof(u64), M_NOWAIT);
 	if (!pages) {
 		err = -ENOMEM;
-		goto err;
+		goto err_pbl;
 	}
 
 	i = n = 0;
 
-#if 0	
+#ifdef notyet
 	TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry)
 		for (j = 0; j < chunk->nmap; ++j) {
 			len = sg_dma_len(&chunk->page_list[j]) >> shift;
@@ -669,21 +598,36 @@
 				pages[i++] = htobe64(sg_dma_address(
 					&chunk->page_list[j]) +
 					mhp->umem->page_size * k);
+				if (i == PAGE_SIZE / sizeof *pages) {
+					err = iwch_write_pbl(mhp, pages, i, n);
+					if (err)
+						goto pbl_done;
+					n += i;
+					i = 0;
+				}
 			}
 		}
 #endif
-	mhp->rhp = rhp;
+
+	if (i)
+		err = iwch_write_pbl(mhp, pages, i, n);
+#ifdef notyet
+pbl_done:
+#endif
+	cxfree(pages);
+	if (err)
+		goto err_pbl;
+
 	mhp->attr.pdid = php->pdid;
 	mhp->attr.zbva = 0;
 	mhp->attr.perms = iwch_ib_to_tpt_access(acc);
 	mhp->attr.va_fbo = virt;
 	mhp->attr.page_size = shift - 12;
 	mhp->attr.len = (u32) length;
-	mhp->attr.pbl_size = i;
-	err = iwch_register_mem(rhp, php, mhp, shift, pages);
-	cxfree(pages);
+
+	err = iwch_register_mem(rhp, php, mhp, shift);
 	if (err)
-		goto err;
+		goto err_pbl;
 
 	if (udata && !t3a_device(rhp)) {
 		uresp.pbl_addr = (mhp->attr.pbl_addr -
@@ -700,6 +644,9 @@
 
 	return &mhp->ibmr;
 
+err_pbl:
+	iwch_free_pbl(mhp);
+
 err:
 	ib_umem_release(mhp->umem);
 	cxfree(mhp);
@@ -748,7 +695,12 @@
 	mhp->attr.type = TPT_MW;
 	mhp->attr.stag = stag;
 	mmid = (stag) >> 8;
-	insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+	mhp->ibmw.rkey = stag;
+	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+		cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
+		cxfree(mhp);
+		return ERR_PTR(-ENOMEM);
+	}
 	CTR4(KTR_IW_CXGB, "%s mmid 0x%x mhp %p stag 0x%x", __FUNCTION__, mmid, mhp, stag);
 	return &(mhp->ibmw);
 }
@@ -893,7 +845,13 @@
 
 	mtx_init(&qhp->lock, "cxgb qp", NULL, MTX_DEF|MTX_DUPOK);
 	qhp->refcnt = 1;
-	insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid);
+
+	if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) {
+		cxio_destroy_qp(&rhp->rdev, &qhp->wq,
+			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+		cxfree(qhp);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	if (udata) {
 
@@ -1023,12 +981,14 @@
 {
 	struct iwch_dev *dev;
 	struct port_info *pi;
+	struct adapter *sc;
 
 	CTR5(KTR_IW_CXGB, "%s ibdev %p, port %d, index %d, gid %p",
 	       __FUNCTION__, ibdev, port, index, gid);
 	dev = to_iwch_dev(ibdev);
+	sc = dev->rdev.adap;
 	PANIC_IF(port == 0 || port > 2);
-	pi = ((struct port_info *)dev->rdev.port_info.lldevs[port-1]->if_softc);
+	pi = &sc->port[port - 1];
 	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
 	memcpy(&(gid->raw[0]), pi->hw_addr, 6);
 	return 0;
@@ -1037,21 +997,20 @@
 static int iwch_query_device(struct ib_device *ibdev,
 			     struct ib_device_attr *props)
 {
+	struct iwch_dev *dev;
+	struct adapter *sc;
 
-	struct iwch_dev *dev;
 	CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev);
 
 	dev = to_iwch_dev(ibdev);
+	sc = dev->rdev.adap;
 	memset(props, 0, sizeof *props);
-#ifdef notyet	
-	memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->if_addr.ifa_addr, 6);
-#endif	
+	memcpy(&props->sys_image_guid, sc->port[0].hw_addr, 6);
 	props->device_cap_flags = dev->device_cap_flags;
-#ifdef notyet
-	props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor;
-	props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device;
-#endif
-	props->max_mr_size = ~0ull;
+	props->page_size_cap = dev->attr.mem_pgsizes_bitmask;
+	props->vendor_id = pci_get_vendor(sc->dev);
+	props->vendor_part_id = pci_get_device(sc->dev);
+	props->max_mr_size = dev->attr.max_mr_size;
 	props->max_qp = dev->attr.max_qps;
 	props->max_qp_wr = dev->attr.max_wrs;
 	props->max_sge = dev->attr.max_sge_per_wr;
@@ -1071,13 +1030,10 @@
 			   u8 port, struct ib_port_attr *props)
 {
 	CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev);
+	memset(props, 0, sizeof(struct ib_port_attr));
 	props->max_mtu = IB_MTU_4096;
-	props->lid = 0;
-	props->lmc = 0;
-	props->sm_lid = 0;
-	props->sm_sl = 0;
+	props->active_mtu = IB_MTU_2048;
 	props->state = IB_PORT_ACTIVE;
-	props->phys_state = 0;
 	props->port_cap_flags =
 	    IB_PORT_CM_SUP |
 	    IB_PORT_SNMP_TUNNEL_SUP |
@@ -1086,7 +1042,6 @@
 	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
 	props->gid_tbl_len = 1;
 	props->pkey_tbl_len = 1;
-	props->qkey_viol_cntr = 0;
 	props->active_width = 2;
 	props->active_speed = 2;
 	props->max_msg_sz = -1;
@@ -1094,80 +1049,18 @@
 	return 0;
 }
 
-#ifdef notyet
-static ssize_t show_rev(struct class_device *cdev, char *buf)
-{
-	struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
-					    ibdev.class_dev);
-	CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev);
-	return sprintf(buf, "%d\n", dev->rdev.t3cdev_p->type);
-}
-
-static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
-{
-	struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
-					    ibdev.class_dev);
-	struct ethtool_drvinfo info;
-	struct net_device *lldev = dev->rdev.t3cdev_p->lldev;
-
-	CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev);
-	lldev->ethtool_ops->get_drvinfo(lldev, &info);
-	return sprintf(buf, "%s\n", info.fw_version);
-}
-
-static ssize_t show_hca(struct class_device *cdev, char *buf)
-{
-	struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
-					    ibdev.class_dev);
-	struct ethtool_drvinfo info;
-	struct net_device *lldev = dev->rdev.t3cdev_p->lldev;
-
-	CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev);
-	lldev->ethtool_ops->get_drvinfo(lldev, &info);
-	return sprintf(buf, "%s\n", info.driver);
-}
-
-static ssize_t show_board(struct class_device *cdev, char *buf)
-{
-	struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
-					    ibdev.class_dev);
-	CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, dev);
-#ifdef notyet
-	return sprintf(buf, "%x.%x\n", dev->rdev.rnic_info.pdev->vendor,
-		                       dev->rdev.rnic_info.pdev->device);
-#else
-	return sprintf(buf, "%x.%x\n", 0xdead, 0xbeef);	 /* XXX */
-#endif
-}
-
-static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
-static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
-
-static struct class_device_attribute *iwch_class_attributes[] = {
-	&class_device_attr_hw_rev,
-	&class_device_attr_fw_ver,
-	&class_device_attr_hca_type,
-	&class_device_attr_board_id
-};
-#endif
-
 int iwch_register_device(struct iwch_dev *dev)
 {
 	int ret;
-#ifdef notyet	
-	int i;
-#endif
+	struct adapter *sc = dev->rdev.adap;
+
 	CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev);
 	strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
-#ifdef notyet	
-	memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
-#endif	
+	memcpy(&dev->ibdev.node_guid, sc->port[0].hw_addr, 6);
 	dev->device_cap_flags =
-	    (IB_DEVICE_ZERO_STAG |
-	     IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW);
+		(IB_DEVICE_LOCAL_DMA_LKEY |
+		 IB_DEVICE_MEM_WINDOW);
 
 	dev->ibdev.uverbs_cmd_mask =
 	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
@@ -1189,9 +1082,9 @@
 	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
 	dev->ibdev.node_type = RDMA_NODE_RNIC;
 	memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC));
-	dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
+	dev->ibdev.phys_port_cnt = sc->params.nports;
 	dev->ibdev.num_comp_vectors = 1;
-	dev->ibdev.dma_device = dev->rdev.rnic_info.pdev;
+	dev->ibdev.dma_device = dev->rdev.adap->dev;
 	dev->ibdev.query_device = iwch_query_device;
 	dev->ibdev.query_port = iwch_query_port;
 	dev->ibdev.modify_port = iwch_modify_port;
@@ -1199,9 +1092,7 @@
 	dev->ibdev.query_gid = iwch_query_gid;
 	dev->ibdev.alloc_ucontext = iwch_alloc_ucontext;
 	dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext;
-#ifdef notyet	
 	dev->ibdev.mmap = iwch_mmap;
-#endif	
 	dev->ibdev.alloc_pd = iwch_allocate_pd;
 	dev->ibdev.dealloc_pd = iwch_deallocate_pd;
 	dev->ibdev.create_ah = iwch_ah_create;
@@ -1229,11 +1120,13 @@
 	dev->ibdev.req_notify_cq = iwch_arm_cq;
 	dev->ibdev.post_send = iwch_post_send;
 	dev->ibdev.post_recv = iwch_post_receive;
-
+	dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
 
 	dev->ibdev.iwcm =
-	    (struct iw_cm_verbs *) kmalloc(sizeof(struct iw_cm_verbs),
-					   M_NOWAIT);
+	    kmalloc(sizeof(struct iw_cm_verbs), M_NOWAIT);
+	if (!dev->ibdev.iwcm)
+		return (ENOMEM);
+
 	dev->ibdev.iwcm->connect = iwch_connect;
 	dev->ibdev.iwcm->accept = iwch_accept_cr;
 	dev->ibdev.iwcm->reject = iwch_reject_cr;
@@ -1246,35 +1139,19 @@
 	ret = ib_register_device(&dev->ibdev);
 	if (ret)
 		goto bail1;
-#ifdef notyet
-	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) {
-		ret = class_device_create_file(&dev->ibdev.class_dev,
-					       iwch_class_attributes[i]);
-		if (ret) {
-			goto bail2;
-		}
-	}
-#endif	
-	return 0;
-#ifdef notyet	
-bail2:
-#endif	
-	ib_unregister_device(&dev->ibdev);
+
+	return (0);
+
 bail1:
-	return ret;
+	cxfree(dev->ibdev.iwcm);
+	return (ret);
 }
 
 void iwch_unregister_device(struct iwch_dev *dev)
 {
-#ifdef notyet
-	int i;
 
-	CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev);
-
-	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i)
-		class_device_remove_file(&dev->ibdev.class_dev,
-					 iwch_class_attributes[i]);
-#endif	
 	ib_unregister_device(&dev->ibdev);
+	cxfree(dev->ibdev.iwcm);
 	return;
 }
+#endif
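
The memory-registration hunks above all converge on the same unwind shape: allocate the PBL first, program the adapter second, and release in reverse order through paired labels on failure (note the related fix from ERR_PTR(-ret) to ERR_PTR(ret), since ret already carries a negative errno). A minimal sketch of the idiom, with hypothetical step1()/step2()/undo1() standing in for iwch_alloc_pbl(), iwch_register_mem(), and iwch_free_pbl():

    static int
    example_register(void)
    {
        int err;

        err = step1();          /* e.g. iwch_alloc_pbl() */
        if (err)
            goto err;           /* nothing to undo yet */
        err = step2();          /* e.g. iwch_register_mem() */
        if (err)
            goto err_step1;     /* undo only what step1 did */
        return (0);
    err_step1:
        undo1();                /* e.g. iwch_free_pbl() */
    err:
        return (err);
    }
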
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h	Mon Jun 11 00:15:24 2012 -0700
@@ -31,7 +31,7 @@
 #ifndef __IWCH_PROVIDER_H__
 #define __IWCH_PROVIDER_H__
 
-#include <contrib/rdma/ib_verbs.h>
+#include <rdma/ib_verbs.h>
 
 struct iwch_pd {
 	struct ib_pd ibpd;
@@ -116,6 +116,7 @@
 };
 
 struct iwch_mpa_attributes {
+	u8 initiator;
 	u8 recv_marker_enabled;
 	u8 xmit_marker_enabled;	/* iWARP: enable inbound Read Resp. */
 	u8 crc_enabled;
@@ -336,18 +337,17 @@
 int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
 int iwch_register_device(struct iwch_dev *dev);
 void iwch_unregister_device(struct iwch_dev *dev);
-int iwch_quiesce_qps(struct iwch_cq *chp);
-int iwch_resume_qps(struct iwch_cq *chp);
 void stop_read_rep_timer(struct iwch_qp *qhp);
 int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
 					struct iwch_mr *mhp,
-					int shift,
-					__be64 *page_list);
+					int shift);
 int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
 					struct iwch_mr *mhp,
 					int shift,
-					__be64 *page_list,
 					int npages);
+int iwch_alloc_pbl(struct iwch_mr *mhp, int npages);
+void iwch_free_pbl(struct iwch_mr *mhp);
+int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset);
 int build_phys_page_list(struct ib_phys_buf *buffer_list,
 					int num_phys_buf,
 					u64 *iova_start,
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,4 +1,3 @@
-
 /**************************************************************************
 
 Copyright (c) 2007, Chelsio Inc.
@@ -30,11 +29,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -48,6 +49,7 @@
 #include <sys/linker.h>
 #include <sys/firmware.h>
 #include <sys/socket.h>
+#include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
@@ -57,14 +59,26 @@
 #include <sys/proc.h>
 #include <sys/queue.h>
 
+#include <net/route.h>
+#include <netinet/in_systm.h>
 #include <netinet/in.h>
+#include <netinet/toecore.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcpip.h>
 
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/ib_umem.h>
-#include <contrib/rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
 #include <cxgb_include.h>
 #include <ulp/tom/cxgb_l2t.h>
+#include <ulp/tom/cxgb_toepcb.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
 #include <ulp/iw_cxgb/iw_cxgb_hal.h>
 #include <ulp/iw_cxgb/iw_cxgb_provider.h>
@@ -75,7 +89,7 @@
 
 #define NO_SUPPORT -1
 
-static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
 				u8 * flit_cnt)
 {
 	int i;
@@ -83,59 +97,46 @@
 
 	switch (wr->opcode) {
 	case IB_WR_SEND:
-	case IB_WR_SEND_WITH_IMM:
 		if (wr->send_flags & IB_SEND_SOLICITED)
 			wqe->send.rdmaop = T3_SEND_WITH_SE;
 		else
 			wqe->send.rdmaop = T3_SEND;
 		wqe->send.rem_stag = 0;
 		break;
-#if 0				/* Not currently supported */
-	case TYPE_SEND_INVALIDATE:
-	case TYPE_SEND_INVALIDATE_IMMEDIATE:
-		wqe->send.rdmaop = T3_SEND_WITH_INV;
-		wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey);
+	case IB_WR_SEND_WITH_IMM:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
+		else
+			wqe->send.rdmaop = T3_SEND_WITH_INV;
+		wqe->send.rem_stag = 0;
 		break;
-	case TYPE_SEND_SE_INVALIDATE:
-		wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
-		wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey);
-		break;
-#endif
 	default:
-		break;
+		return -EINVAL;
 	}
 	if (wr->num_sge > T3_MAX_SGE)
 		return (-EINVAL);
 	wqe->send.reserved[0] = 0;
 	wqe->send.reserved[1] = 0;
 	wqe->send.reserved[2] = 0;
-	if (wr->opcode == IB_WR_SEND_WITH_IMM) {
-		plen = 4;
-		wqe->send.sgl[0].stag = wr->imm_data;
-		wqe->send.sgl[0].len = 0;
-		wqe->send.num_sgle = 0;
-		*flit_cnt = 5;
-	} else {
-		plen = 0;
-		for (i = 0; i < wr->num_sge; i++) {
-			if ((plen + wr->sg_list[i].length) < plen) {
-				return (-EMSGSIZE);
-			}
-			plen += wr->sg_list[i].length;
-			wqe->send.sgl[i].stag =
-			    htobe32(wr->sg_list[i].lkey);
-			wqe->send.sgl[i].len =
-			    htobe32(wr->sg_list[i].length);
-			wqe->send.sgl[i].to = htobe64(wr->sg_list[i].addr);
+	plen = 0;
+	for (i = 0; i < wr->num_sge; i++) {
+		if ((plen + wr->sg_list[i].length) < plen) {
+			return (-EMSGSIZE);
 		}
-		wqe->send.num_sgle = htobe32(wr->num_sge);
-		*flit_cnt = 4 + ((wr->num_sge) << 1);
+		plen += wr->sg_list[i].length;
+		wqe->send.sgl[i].stag =
+		    htobe32(wr->sg_list[i].lkey);
+		wqe->send.sgl[i].len =
+		    htobe32(wr->sg_list[i].length);
+		wqe->send.sgl[i].to = htobe64(wr->sg_list[i].addr);
 	}
+	wqe->send.num_sgle = htobe32(wr->num_sge);
+	*flit_cnt = 4 + ((wr->num_sge) << 1);
 	wqe->send.plen = htobe32(plen);
 	return 0;
 }
 
-static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
 				 u8 *flit_cnt)
 {
 	int i;
@@ -152,7 +153,7 @@
 
 	if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
 		plen = 4;
-		wqe->write.sgl[0].stag = wr->imm_data;
+		wqe->write.sgl[0].stag = wr->ex.imm_data;
 		wqe->write.sgl[0].len = 0;
 		wqe->write.num_sgle = 0; 
 		*flit_cnt = 6;
@@ -177,7 +178,7 @@
 	return 0;
 }
 
-static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
 				u8 *flit_cnt)
 {
 	if (wr->num_sge > 1)
@@ -195,15 +196,12 @@
 	return 0;
 }
 
-/*
- * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now.
- */
 static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
 			    u32 num_sgle, u32 * pbl_addr, u8 * page_size)
 {
 	int i;
 	struct iwch_mr *mhp;
-	u32 offset;
+	u64 offset;
 	for (i = 0; i < num_sgle; i++) {
 
 		mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8);
@@ -235,8 +233,8 @@
 			return (-EINVAL);
 		}
 		offset = sg_list[i].addr - mhp->attr.va_fbo;
-		offset += ((u32) mhp->attr.va_fbo) %
-		          (1UL << (12 + mhp->attr.page_size));
+		offset += mhp->attr.va_fbo &
+			  ((1UL << (12 + mhp->attr.page_size)) - 1);
 		pbl_addr[i] = ((mhp->attr.pbl_addr -
 			        rhp->rdev.rnic_info.pbl_base) >> 3) +
 			      (offset >> (12 + mhp->attr.page_size));
@@ -245,26 +243,113 @@
 	return 0;
 }
 
-static int iwch_build_rdma_recv(struct iwch_dev *rhp, union t3_wr *wqe,
+static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe,
 				struct ib_recv_wr *wr)
 {
-	int i;
-	if (wr->num_sge > T3_MAX_SGE)
+	int i, err = 0;
+	u32 pbl_addr[T3_MAX_SGE];
+	u8 page_size[T3_MAX_SGE];
+
+	if (wr->num_sge > T3_MAX_SGE)
 		return (-EINVAL);
+
+
+	err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr,
+	    page_size);
+	if (err)
+		return err;
+	wqe->recv.pagesz[0] = page_size[0];
+	wqe->recv.pagesz[1] = page_size[1];
+	wqe->recv.pagesz[2] = page_size[2];
+	wqe->recv.pagesz[3] = page_size[3];
 	wqe->recv.num_sgle = htobe32(wr->num_sge);
+
 	for (i = 0; i < wr->num_sge; i++) {
 		wqe->recv.sgl[i].stag = htobe32(wr->sg_list[i].lkey);
 		wqe->recv.sgl[i].len = htobe32(wr->sg_list[i].length);
-		wqe->recv.sgl[i].to = htobe64(wr->sg_list[i].addr);
+		wqe->recv.sgl[i].to = htobe64(((u32)wr->sg_list[i].addr) &
+				((1UL << (12 + page_size[i])) - 1));
+		/* pbl_addr is the adapter's address in the PBL */
+		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
 	}
 	for (; i < T3_MAX_SGE; i++) {
 		wqe->recv.sgl[i].stag = 0;
 		wqe->recv.sgl[i].len = 0;
 		wqe->recv.sgl[i].to = 0;
+		wqe->recv.pbl_addr[i] = 0;
 	}
+
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+	    qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+	    qhp->wq.rq_size_log2)].pbl_addr = 0;
+
 	return 0;
 }
 
+static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe,
+				struct ib_recv_wr *wr)
+{
+	int i;
+	u32 pbl_addr;
+	u32 pbl_offset;
+
+
+	/*
+	 * The T3 HW requires the PBL in the HW recv descriptor to reference
+	 * a PBL entry.  So we allocate the max needed PBL memory here and pass
+	 * it to the uP in the recv WR.  The uP will build the PBL and set up
+	 * the HW recv descriptor.
+	 */
+	pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE);
+	if (!pbl_addr)
+		return -ENOMEM;
+
+	/*
+	 * Compute the 8B aligned offset.
+	 */
+	pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3;
+
+	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+
+	for (i = 0; i < wr->num_sge; i++) {
+
+		/*
+		 * Use a 128MB page size.  This and an imposed 128MB
+		 * sge length limit allow us to require only a 2-entry HW
+		 * PBL for each SGE.  This restriction is acceptable since
+		 * it is not possible to allocate 128MB of contiguous
+		 * DMA coherent memory!
+		 */
+		if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN)
+			return -EINVAL;
+		wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT;
+
+		/*
+		 * T3 restricts a recv to all zero-stag or all non-zero-stag.
+		 */
+		if (wr->sg_list[i].lkey != 0)
+			return -EINVAL;
+		wqe->recv.sgl[i].stag = 0;
+		wqe->recv.sgl[i].len = htobe32(wr->sg_list[i].length);
+		wqe->recv.sgl[i].to = htobe64(wr->sg_list[i].addr);
+		wqe->recv.pbl_addr[i] = htobe32(pbl_offset);
+		pbl_offset += 2;
+	}
+	for (; i < T3_MAX_SGE; i++) {
+		wqe->recv.pagesz[i] = 0;
+		wqe->recv.sgl[i].stag = 0;
+		wqe->recv.sgl[i].len = 0;
+		wqe->recv.sgl[i].to = 0;
+		wqe->recv.pbl_addr[i] = 0;
+	}
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+	    qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+	    qhp->wq.rq_size_log2)].pbl_addr = pbl_addr;
+	return 0;
+}
+
 int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		      struct ib_send_wr **bad_wr)
 {
@@ -282,18 +367,19 @@
 	mtx_lock(&qhp->lock);
 	if (qhp->attr.state > IWCH_QP_STATE_RTS) {
 		mtx_unlock(&qhp->lock);
-		return (-EINVAL);
+		err = -EINVAL;
+		goto out;
 	}
 	num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
 		  qhp->wq.sq_size_log2);
-	if (num_wrs <= 0) {
+	if (num_wrs == 0) {
 		mtx_unlock(&qhp->lock);
-		return (-ENOMEM);
+		err = -EINVAL;
+		goto out;
 	}
 	while (wr) {
 		if (num_wrs == 0) {
 			err = -ENOMEM;
-			*bad_wr = wr;
 			break;
 		}
 		idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
@@ -311,17 +397,17 @@
 		case IB_WR_SEND:
 		case IB_WR_SEND_WITH_IMM:
 			t3_wr_opcode = T3_WR_SEND;
-			err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
+			err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
 			break;
 		case IB_WR_RDMA_WRITE:
 		case IB_WR_RDMA_WRITE_WITH_IMM:
 			t3_wr_opcode = T3_WR_WRITE;
-			err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
+			err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
 			break;
 		case IB_WR_RDMA_READ:
 			t3_wr_opcode = T3_WR_READ;
 			t3_wr_flags = 0; /* T3 reads are always signaled */
-			err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
+			err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
 			if (err)
 				break;
 			sqp->read_len = wqe->read.local_len;
@@ -333,10 +419,9 @@
 			     wr->opcode);
 			err = -EINVAL;
 		}
-		if (err) {
-			*bad_wr = wr;
+		if (err)
 			break;
-		}
+
 		wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
 		sqp->wr_id = wr->wr_id;
 		sqp->opcode = wr2opcode(t3_wr_opcode);
@@ -358,6 +443,9 @@
 	}
 	mtx_unlock(&qhp->lock);
 	ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+out:
+	if (err)
+		*bad_wr = wr;
 	return err;
 }
 
@@ -374,27 +462,35 @@
 	mtx_lock(&qhp->lock);
 	if (qhp->attr.state > IWCH_QP_STATE_RTS) {
 		mtx_unlock(&qhp->lock);
-		return (-EINVAL);
+		err = -EINVAL;
+		goto out;
 	}
 	num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr,
 			    qhp->wq.rq_size_log2) - 1;
 	if (!wr) {
 		mtx_unlock(&qhp->lock);
-		return (-EINVAL);
+		err = -EINVAL;
+		goto out;
 	}
+
 	while (wr) {
+		if (wr->num_sge > T3_MAX_SGE) {
+			err = -EINVAL;
+			break;
+		}
+
 		idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
 		wqe = (union t3_wr *) (qhp->wq.queue + idx);
-		if (num_wrs)
-			err = iwch_build_rdma_recv(qhp->rhp, wqe, wr);
-		else
+		if (num_wrs) {
+			if (wr->sg_list[0].lkey)
+				err = build_rdma_recv(qhp, wqe, wr);
+			else
+				err = build_zero_stag_recv(qhp, wqe, wr);
+		} else
 			err = -ENOMEM;
-		if (err) {
-			*bad_wr = wr;
+		if (err)
 			break;
-		}
-		qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] =
-			wr->wr_id;
+
 		build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG,
 			       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
 			       0, sizeof(struct t3_receive_wr) >> 3);
@@ -408,6 +504,9 @@
 	}
 	mtx_unlock(&qhp->lock);
 	ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+out:
+	if (err)
+		*bad_wr = wr;
 	return err;
 }
 
@@ -439,7 +538,7 @@
 	}
 	num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
 			    qhp->wq.sq_size_log2);
-	if ((num_wrs) <= 0) {
+	if ((num_wrs) == 0) {
 		mtx_unlock(&qhp->lock);
 		return (-ENOMEM);
 	}
@@ -491,7 +590,7 @@
 	return err;
 }
 
-static inline void build_term_codes(struct respQ_msg_t *rsp_msg,
+static void build_term_codes(struct respQ_msg_t *rsp_msg,
 				    u8 *layer_type, u8 *ecode)
 {
 	int status = TPT_ERR_INTERNAL_ERR;
@@ -631,15 +730,18 @@
 	union t3_wr *wqe;
 	struct terminate_message *term;
 	struct mbuf *m;
+	struct ofld_hdr *oh;
 
-	CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__);
-	m = m_gethdr(MT_DATA, M_NOWAIT);
-	if (!m) {
+	CTR3(KTR_IW_CXGB, "%s: tid %u, %p", __func__, qhp->ep->hwtid, rsp_msg);
+	m = m_gethdr(M_NOWAIT, MT_DATA);
+	if (m == NULL) {
 		log(LOG_ERR, "%s cannot send TERMINATE!\n", __FUNCTION__);
 		return (-ENOMEM);
 	}
-	wqe = mtod(m, union t3_wr *);
-	m->m_len = m->m_pkthdr.len = 40;
+	oh = mtod(m, struct ofld_hdr *);
+	m->m_pkthdr.len = m->m_len = sizeof(*oh) + 40;
+	oh->flags = V_HDR_NDESC(1) | V_HDR_CTRL(CPL_PRIORITY_DATA) | V_HDR_QSET(0);
+	wqe = (void *)(oh + 1);
 	memset(wqe, 0, 40);
 	wqe->send.rdmaop = T3_TERMINATE;
 
@@ -653,22 +755,17 @@
 		V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG));
 	wqe->send.wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(qhp->ep->hwtid));
 
-	m_set_priority(m, CPL_PRIORITY_DATA);
-	m_set_sgl(m, NULL);
-	m_set_sgllen(m, 0);
-	return cxgb_ofld_send(qhp->rhp->rdev.t3cdev_p, m);
+	return t3_offload_tx(qhp->rhp->rdev.adap, m);
 }
 
 /*
  * Assumes qhp lock is held.
  */
-static void __flush_qp(struct iwch_qp *qhp)
+static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp,
+			struct iwch_cq *schp)
 {
-	struct iwch_cq *rchp, *schp;
 	int count;
-
-	rchp = get_chp(qhp->rhp, qhp->attr.rcq);
-	schp = get_chp(qhp->rhp, qhp->attr.scq);
+	int flushed;
 
 	CTR4(KTR_IW_CXGB, "%s qhp %p rchp %p schp %p", __FUNCTION__, qhp, rchp, schp);
 	/* take a ref on the qhp since we must release the lock */
@@ -680,20 +777,22 @@
 	mtx_lock(&qhp->lock);
 	cxio_flush_hw_cq(&rchp->cq);
 	cxio_count_rcqes(&rchp->cq, &qhp->wq, &count);
-	cxio_flush_rq(&qhp->wq, &rchp->cq, count);
+	flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count);
 	mtx_unlock(&qhp->lock);
 	mtx_unlock(&rchp->lock);
- 	(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+	if (flushed)
+ 		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
 
 	/* locking hierarchy: cq lock first, then qp lock. */
 	mtx_lock(&schp->lock);
 	mtx_lock(&qhp->lock);
 	cxio_flush_hw_cq(&schp->cq);
 	cxio_count_scqes(&schp->cq, &qhp->wq, &count);
-	cxio_flush_sq(&qhp->wq, &schp->cq, count);
+	flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count);
 	mtx_unlock(&qhp->lock);
 	mtx_unlock(&schp->lock);
- 	(*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
+	if (flushed)
+ 		(*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
 
 	/* deref */
 	mtx_lock(&qhp->lock);
@@ -703,10 +802,23 @@
 
 static void flush_qp(struct iwch_qp *qhp)
 {
-	if (qhp->ibqp.uobject)
+	struct iwch_cq *rchp, *schp;
+
+	rchp = get_chp(qhp->rhp, qhp->attr.rcq);
+	schp = get_chp(qhp->rhp, qhp->attr.scq);
+
+	if (qhp->ibqp.uobject) {
 		cxio_set_wq_in_error(&qhp->wq);
-	else
-		__flush_qp(qhp);
+		cxio_set_cq_in_error(&rchp->cq);
+		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+		if (schp != rchp) {
+			cxio_set_cq_in_error(&schp->cq);
+			(*schp->ibcq.comp_handler)(&schp->ibcq,
+			    schp->ibcq.cq_context);
+		}
+		return;
+	}
+	__flush_qp(qhp, rchp, schp);
 }
 
 
@@ -715,7 +827,13 @@
  */
 static int rqes_posted(struct iwch_qp *qhp)
 {
-	return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV;
+	union t3_wr *wqe = qhp->wq.queue;
+	u16 count = 0;
+	while ((count + 1) != 0 && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) {
+		count++;
+		wqe++;
+	}
+	return count;
 }
 
 static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
@@ -724,6 +842,10 @@
 {
 	struct t3_rdma_init_attr init_attr;
 	int ret;
+	struct socket *so = qhp->ep->com.so;
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp;
+	struct toepcb *toep;
 
 	init_attr.tid = qhp->ep->hwtid;
 	init_attr.qpid = qhp->wq.qpid;
@@ -737,32 +859,28 @@
 		(qhp->attr.mpa_attr.xmit_marker_enabled << 1) |
 		(qhp->attr.mpa_attr.crc_enabled << 2);
 
-	/*
-	 * XXX - The IWCM doesn't quite handle getting these
-	 * attrs set before going into RTS.  For now, just turn
-	 * them on always...
-	 */
-#if 0
-	init_attr.qpcaps = qhp->attr.enableRdmaRead |
-		(qhp->attr.enableRdmaWrite << 1) |
-		(qhp->attr.enableBind << 2) |
-		(qhp->attr.enable_stag0_fastreg << 3) |
-		(qhp->attr.enable_stag0_fastreg << 4);
-#else
-	init_attr.qpcaps = 0x1f;
-#endif
+	init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE |
+			   uP_RI_QP_RDMA_WRITE_ENABLE |
+			   uP_RI_QP_BIND_ENABLE;
+	if (!qhp->ibqp.uobject)
+		init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE;
 	init_attr.tcp_emss = qhp->ep->emss;
 	init_attr.ord = qhp->attr.max_ord;
 	init_attr.ird = qhp->attr.max_ird;
 	init_attr.qp_dma_addr = qhp->wq.dma_addr;
 	init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
-	init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0;
+	init_attr.rqe_count = rqes_posted(qhp);
+	init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0;
+	init_attr.rtr_type = 0;
+	tp = intotcpcb(inp);
+	toep = tp->t_toe;
+	init_attr.chan = toep->tp_l2t->smt_idx;
 	init_attr.irs = qhp->ep->rcv_seq;
 	CTR5(KTR_IW_CXGB, "%s init_attr.rq_addr 0x%x init_attr.rq_size = %d "
 	     "flags 0x%x qpcaps 0x%x", __FUNCTION__,
 	     init_attr.rq_addr, init_attr.rq_size,
 	     init_attr.flags, init_attr.qpcaps);
-	ret = cxio_rdma_init(&rhp->rdev, &init_attr);
+	ret = cxio_rdma_init(&rhp->rdev, &init_attr, qhp->ep->com.so);
 	CTR2(KTR_IW_CXGB, "%s ret %d", __FUNCTION__, ret);
 	return ret;
 }
@@ -870,8 +988,8 @@
 				abort=0;
 				disconnect = 1;
 				ep = qhp->ep;
+				get_ep(&ep->com);
 			}
-			flush_qp(qhp);
 			break;
 		case IWCH_QP_STATE_TERMINATE:
 			qhp->attr.state = IWCH_QP_STATE_TERMINATE;
@@ -886,6 +1004,7 @@
 				abort=1;
 				disconnect = 1;
 				ep = qhp->ep;
+				get_ep(&ep->com);
 			}
 			goto err;
 			break;
@@ -901,6 +1020,7 @@
 		}
 		switch (attrs->next_state) {
 			case IWCH_QP_STATE_IDLE:
+				flush_qp(qhp);
 				qhp->attr.state = IWCH_QP_STATE_IDLE;
 				qhp->attr.llp_stream_handle = NULL;
 				put_ep(&qhp->ep->com);
@@ -908,7 +1028,6 @@
 				wakeup(qhp);
 				break;
 			case IWCH_QP_STATE_ERROR:
-				disconnect=1;
 				goto err;
 			default:
 				ret = -EINVAL;
@@ -960,81 +1079,29 @@
 out:
 	mtx_unlock(&qhp->lock);
 
-	if (terminate)
+	if (terminate)
 		iwch_post_terminate(qhp, NULL);
+
 
 	/*
 	 * If disconnect is 1, then we need to initiate a disconnect
 	 * on the EP.  This can be a normal close (RTS->CLOSING) or
 	 * an abnormal close (RTS/CLOSING->ERROR).
 	 */
-	if (disconnect)
+	if (disconnect) {
 		iwch_ep_disconnect(ep, abort, M_NOWAIT);
-
+		put_ep(&ep->com);
+	}
+
 	/*
 	 * If free is 1, then we've disassociated the EP from the QP
 	 * and we need to dereference the EP.
 	 */
-	if (free)
+	if (free)
 		put_ep(&ep->com);
+
 
 	CTR2(KTR_IW_CXGB, "%s exit state %d", __FUNCTION__, qhp->attr.state);
 	return ret;
 }
-
-static int quiesce_qp(struct iwch_qp *qhp)
-{
-	mtx_lock(&qhp->lock);
-	iwch_quiesce_tid(qhp->ep);
-	qhp->flags |= QP_QUIESCED;
-	mtx_unlock(&qhp->lock);
-	return 0;
-}
-
-static int resume_qp(struct iwch_qp *qhp)
-{
-	mtx_lock(&qhp->lock);
-	iwch_resume_tid(qhp->ep);
-	qhp->flags &= ~QP_QUIESCED;
-	mtx_lock(&qhp->lock);
-	return 0;
-}
-
-int iwch_quiesce_qps(struct iwch_cq *chp)
-{
-	int i;
-	struct iwch_qp *qhp;
-
-	for (i=0; i < T3_MAX_NUM_QP; i++) {
-		qhp = get_qhp(chp->rhp, i);
-		if (!qhp)
-			continue;
-		if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) {
-			quiesce_qp(qhp);
-			continue;
-		}
-		if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp))
-			quiesce_qp(qhp);
-	}
-	return 0;
-}
-
-int iwch_resume_qps(struct iwch_cq *chp)
-{
-	int i;
-	struct iwch_qp *qhp;
-
-	for (i=0; i < T3_MAX_NUM_QP; i++) {
-		qhp = get_qhp(chp->rhp, i);
-		if (!qhp)
-			continue;
-		if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) {
-			resume_qp(qhp);
-			continue;
-		}
-		if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp))
-			resume_qp(qhp);
-	}
-	return 0;
-}
-
+#endif
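
The queue manipulation above relies on free-running 32-bit read/write pointers and power-of-two ring sizes, with Q_PTR2IDX, Q_FREECNT, and Q_GENBIT doing the index arithmetic (the macros live in iw_cxgb_wr.h). Assuming the usual masked-pointer scheme implied by the Q_EMPTY/Q_FULL definitions in that header, the arithmetic looks like this; the EX_ names are illustrative, not the driver's:

    /* Index into a 2^size_log2-entry ring from a free-running pointer. */
    #define EX_PTR2IDX(ptr, size_log2) ((ptr) & ((1U << (size_log2)) - 1))
    /* Free slots: ring size minus the (wptr - rptr) outstanding entries. */
    #define EX_FREECNT(rptr, wptr, size_log2) \
        ((1U << (size_log2)) - ((wptr) - (rptr)))

Because the pointers are unsigned and only ever incremented, (wptr) - (rptr) counts pending WRs correctly even after the pointers wrap.
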
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c	Mon Jun 11 00:15:24 2012 -0700
@@ -29,11 +29,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/module.h>
 #include <sys/pciio.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
@@ -59,9 +61,11 @@
 
 #include <netinet/in.h>
 
-#include <contrib/rdma/ib_verbs.h>
-#include <contrib/rdma/ib_umem.h>
-#include <contrib/rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/idr.h>
+#include <ulp/iw_cxgb/iw_cxgb_ib_intfc.h>
 
 #include <cxgb_include.h>
 #include <ulp/iw_cxgb/iw_cxgb_wr.h>
@@ -369,3 +373,4 @@
 {
 	gen_pool_destroy(rdev_p->rqt_pool);
 }
+#endif
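
Note the gating pattern repeated at the top and bottom of this file (and every other iw_cxgb source file touched by this patch): pull in the option header, then wrap the entire translation unit so it compiles to nothing on kernels built without the TCP_OFFLOAD option. In outline:

    #include "opt_inet.h"   /* defines TCP_OFFLOAD when the kernel option is set */

    #ifdef TCP_OFFLOAD
    /* ... entire file body ... */
    #endif
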
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h	Mon Jun 11 00:15:24 2012 -0700
@@ -47,10 +47,18 @@
 	uint64_t user_rptr_addr;
 };
 
+struct iwch_create_cq_resp_v0 {
+	__u64 key;
+	__u32 cqid;
+	__u32 size_log2;
+};
+
 struct iwch_create_cq_resp {
 	uint64_t key;
 	uint32_t cqid;
 	uint32_t size_log2;
+	__u32 memsize;
+	__u32 reserved;
 };
 
 struct iwch_create_qp_resp {
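
Keeping the old create-CQ response layout around as iwch_create_cq_resp_v0 while the current struct grows memsize and reserved fields is what lets the provider bump uverbs_abi_ver (the IWCH_UVERBS_ABI_VERSION assignment earlier in this patch) without breaking existing userspace libraries. A sketch of how the kernel side can pick a layout from the size of the userspace response buffer, assuming an ib_copy_to_udata()-style helper and an outlen field as in the OFED struct ib_udata (the real negotiation lives in iwch_create_cq, not shown here):

    /* Sketch: copy out whichever response format the consumer can hold. */
    if (udata->outlen < sizeof(struct iwch_create_cq_resp)) {
        struct iwch_create_cq_resp_v0 v0;

        v0.key = uresp.key;
        v0.cqid = uresp.cqid;
        v0.size_log2 = uresp.size_log2;
        err = ib_copy_to_udata(udata, &v0, sizeof(v0));
    } else
        err = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
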
diff -r 7cec8c20120e sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h	Mon Jun 11 00:15:24 2012 -0700
@@ -32,6 +32,9 @@
 #define __CXIO_WR_H__
 #define T3_MAX_SGE      4
 #define T3_MAX_INLINE	64
+#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3)
+#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024)
+#define T3_STAG0_PAGE_SHIFT 15
 
 #define Q_EMPTY(rptr,wptr) ((rptr)==(wptr))
 #define Q_FULL(rptr,wptr,size_log2)  ( (((wptr)-(rptr))>>(size_log2)) && \
@@ -272,6 +275,22 @@
 	uP_RI_QP_STAG0_ENABLE = 0x10
 } __attribute__ ((packed));
 
+enum rdma_init_rtr_types {
+	RTR_READ = 1,
+	RTR_WRITE = 2,
+	RTR_SEND = 3,
+};
+
+#define S_RTR_TYPE      2
+#define M_RTR_TYPE      0x3
+#define V_RTR_TYPE(x)   ((x) << S_RTR_TYPE)
+#define G_RTR_TYPE(x)   ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE)
+
+#define S_CHAN          4
+#define M_CHAN          0x3
+#define V_CHAN(x)       ((x) << S_CHAN)
+#define G_CHAN(x)       ((((x) >> S_CHAN)) & M_CHAN)
+
 struct t3_rdma_init_attr {
 	u32 tid;
 	u32 qpid;
@@ -287,8 +306,11 @@
 	u32 ird;
 	u64 qp_dma_addr;
 	u32 qp_dma_size;
-	u32 flags;
+	enum rdma_init_rtr_types rtr_type;
+	u16 flags;
+	u16 rqe_count;
 	u32 irs;
+	u32 chan;
 };
 
 struct t3_rdma_init_wr {
@@ -303,13 +325,13 @@
 	u8 mpaattrs;		/* 5 */
 	u8 qpcaps;
 	__be16 ulpdu_size;
-	__be32 flags;		/* bits 31-1 - reservered */
-				/* bit     0 - set if RECV posted */
+	__be16 flags_rtr_type;
+	__be16 rqe_count;
 	__be32 ord;		/* 6 */
 	__be32 ird;
 	__be64 qp_dma_addr;	/* 7 */
 	__be32 qp_dma_size;	/* 8 */
-	u32 irs;
+	__be32 irs;
 };
 
 struct t3_genbit {
@@ -318,7 +340,8 @@
 };
 
 enum rdma_init_wr_flags {
-	RECVS_POSTED = 1,
+	MPA_INITIATOR = (1<<0),
+	PRIV_QP = (1<<1),
 };
 
 union t3_wr {
@@ -531,6 +554,12 @@
 #define CQE_STATUS(x)     (G_CQE_STATUS(be32toh((x).header)))
 #define CQE_OPCODE(x)     (G_CQE_OPCODE(be32toh((x).header)))
 
+#define CQE_SEND_OPCODE(x)( \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND) || \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE) || \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_INV) || \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE_INV))
+
 #define CQE_LEN(x)        (be32toh((x).len))
 
 /* used for RQ completion processing */
@@ -589,21 +618,23 @@
 	uint64_t		wr_id;
 	struct t3_cqe		cqe;
 	uint32_t		sq_wptr;
-	uint32_t		read_len;
+	__be32   		read_len;
 	int			opcode;
 	int			complete;
 	int			signaled;
 };
 
+struct t3_swrq {
+	__u64			wr_id;
+	__u32			pbl_addr;
+};
+
 /*
  * A T3 WQ implements both the SQ and RQ.
  */
 struct t3_wq {
 	union t3_wr *queue;		/* DMA accessable memory */
 	bus_addr_t dma_addr;		/* DMA address for HW */
-#ifdef notyet	
-	DECLARE_PCI_UNMAP_ADDR(mapping)	/* unmap kruft */
-#endif		
 	u32 error;			/* 1 once we go to ERROR */
 	u32 qpid;
 	u32 wptr;			/* idx to next available WR slot */
@@ -613,14 +644,15 @@
 	u32 sq_wptr;			/* sq_wptr - sq_rptr == count of */
 	u32 sq_rptr;			/* pending wrs */
 	u32 sq_size_log2;		/* sq size */
-	u64 *rq;			/* SW RQ (holds consumer wr_ids */
+	struct t3_swrq *rq;		/* SW RQ (holds consumer wr_ids) */
 	u32 rq_wptr;			/* rq_wptr - rq_rptr == count of */
 	u32 rq_rptr;			/* pending wrs */
-	u64 *rq_oldest_wr;		/* oldest wr on the SW RQ */
+	struct t3_swrq *rq_oldest_wr;	/* oldest wr on the SW RQ */
 	u32 rq_size_log2;		/* rq size */
 	u32 rq_addr;			/* rq adapter address */
-	void /* __iomem */ *doorbell;	/* kernel db */
+	void *doorbell;			/* kernel db */
 	u64 udb;			/* user db if any */
+	struct cxio_rdev *rdev;
 };
 
 struct t3_cq {
@@ -629,9 +661,6 @@
 	u32 wptr;
 	u32 size_log2;
 	bus_addr_t dma_addr;
-#ifdef notyet	
-	DECLARE_PCI_UNMAP_ADDR(mapping)
-#endif		
 	struct t3_cqe *queue;
 	struct t3_cqe *sw_queue;
 	u32 sw_rptr;
@@ -641,6 +670,22 @@
 #define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \
 					 CQE_GENBIT(*cqe))
 
+struct t3_cq_status_page {
+	u32 cq_err;
+};
+
+static inline int cxio_cq_in_error(struct t3_cq *cq)
+{
+	return ((struct t3_cq_status_page *)
+	    &cq->queue[1 << cq->size_log2])->cq_err;
+}
+
+static inline void cxio_set_cq_in_error(struct t3_cq *cq)
+{
+	((struct t3_cq_status_page *)
+	    &cq->queue[1 << cq->size_log2])->cq_err = 1;
+}
+
 static inline void cxio_set_wq_in_error(struct t3_wq *wq)
 {
 	wq->queue->flit[13] = 1;
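
The new flags_rtr_type WR field packs three values into 16 bits using the driver's S_/M_/V_/G_ accessor convention defined above: the rdma_init_wr_flags occupy bits 0-1, the RTR type bits 2-3 (S_RTR_TYPE), and the channel bits 4-5 (S_CHAN). A small sketch of packing and unpacking with those macros (the actual combination is done by cxio_rdma_init(), which is outside this hunk):

    /* Pack MPA flags, RTR type, and channel into the 16-bit WR field. */
    static inline uint16_t
    pack_flags_rtr_type(uint16_t flags, unsigned int rtr, unsigned int chan)
    {
        return (flags | V_RTR_TYPE(rtr) | V_CHAN(chan));
    }

    /* ... and recover the type on the other side: G_RTR_TYPE(v) == RTR_SEND */
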
diff -r 7cec8c20120e sys/dev/cxgb/ulp/toecore/cxgb_toedev.h
--- a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-$FreeBSD$
-
-***************************************************************************/
-
-#ifndef _CXGB_TOEDEV_H_
-#define _CXGB_TOEDEV_H_
-#include <netinet/toedev.h>
-
-
-/* offload type ids */
-enum {
-	TOE_ID_CHELSIO_T1 = 1,
-	TOE_ID_CHELSIO_T1C,
-	TOE_ID_CHELSIO_T2,
-	TOE_ID_CHELSIO_T3,
-	TOE_ID_CHELSIO_T3B,
-	TOE_ID_CHELSIO_T3C,
-}
-	;
-
-#endif
diff -r 7cec8c20120e sys/dev/cxgb/ulp/toecore/toedev.c
--- a/sys/dev/cxgb/ulp/toecore/toedev.c	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,420 +0,0 @@
-
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/bus.h>
-#include <sys/module.h>
-#include <sys/queue.h>
-#include <sys/mbuf.h>
-#include <sys/proc.h>
-
-#include <sys/socket.h>
-#include <sys/sockio.h>
-
-#include <net/bpf.h>
-#include <net/ethernet.h>
-#include <net/if.h>
-#include <net/route.h>
-
-
-/*
- * XXX 
- */
-#include <cxgb_include.h>
-#include <ulp/toecore/cxgb_toedev.h>
-
-static struct mtx offload_db_lock;
-static TAILQ_HEAD(, toedev) offload_dev_list;
-static TAILQ_HEAD(, tom_info) offload_module_list;
-
-/*
- * Returns the entry in the given table with the given offload id, or NULL
- * if the id is not found.
- */
-static const struct offload_id *
-id_find(unsigned int id, const struct offload_id *table)
-{
-	for ( ; table->id; ++table)
-		if (table->id == id)
-			return table;
-	return NULL;
-}
-
-/*
- * Returns true if an offload device is presently attached to an offload module.
- */
-static inline int
-is_attached(const struct toedev *dev)
-{
-	return dev->tod_offload_mod != NULL;
-}
-
-/*
- * Try to attach a new offload device to an existing TCP offload module that
- * can handle the device's offload id.  Returns 0 if it succeeds.
- *
- * Must be called with the offload_db_lock held.
- */
-static int
-offload_attach(struct toedev *dev)
-{
-	struct tom_info *t;
-
-	TAILQ_FOREACH(t, &offload_module_list, entry) {
-		const struct offload_id *entry;
-
-		entry = id_find(dev->tod_ttid, t->ti_id_table);
-		if (entry && t->ti_attach(dev, entry) == 0) {
-			dev->tod_offload_mod = t;
-			return 0;
-		}
-	}
-	return (ENOPROTOOPT);
-}
-
-/**
- * register_tom - register a TCP Offload Module (TOM)
- * @t: the offload module to register
- *
- * Register a TCP Offload Module (TOM).
- */
-int
-register_tom(struct tom_info *t)
-{
-	mtx_lock(&offload_db_lock);
-	toedev_registration_count++;
-	TAILQ_INSERT_HEAD(&offload_module_list, t, entry);
-	mtx_unlock(&offload_db_lock);
-	return 0;
-}
-
-/**
- * unregister_tom - unregister a TCP Offload Module (TOM)
- * @t: the offload module to register
- *
- * Unregister a TCP Offload Module (TOM).  Note that this does not affect any
- * TOE devices to which the TOM is already attached.
- */
-int
-unregister_tom(struct tom_info *t)
-{
-	mtx_lock(&offload_db_lock);
-	TAILQ_REMOVE(&offload_module_list, t, entry);
-	mtx_unlock(&offload_db_lock);
-	return 0;
-}
-
-/*
- * Find an offload device by name.  Must be called with offload_db_lock held.
- */
-static struct toedev *
-__find_offload_dev_by_name(const char *name)
-{
-	struct toedev *dev;
-
-	TAILQ_FOREACH(dev, &offload_dev_list, entry) {
-		if (!strncmp(dev->tod_name, name, TOENAMSIZ))
-			return dev;
-	}
-	return NULL;
-}
-
-/*
- * Returns true if an offload device is already registered.
- * Must be called with the offload_db_lock held.
- */
-static int
-is_registered(const struct toedev *dev)
-{
-	struct toedev *d;
-
-	TAILQ_FOREACH(d, &offload_dev_list, entry) {
-		if (d == dev)
-			return 1;
-	}
-	return 0;
-}
-
-/*
- * Finalize the name of an offload device by assigning values to any format
- * strings in its name.
- */
-static int
-assign_name(struct toedev *dev, const char *name, int limit)
-{
-	int i;
-
-	for (i = 0; i < limit; ++i) {
-		char s[TOENAMSIZ];
-
-		if (snprintf(s, sizeof(s), name, i) >= sizeof(s))
-			return -1;                  /* name too long */
-		if (!__find_offload_dev_by_name(s)) {
-			strcpy(dev->tod_name, s);
-			return 0;
-		}
-	}
-	return -1;
-}
-
-/**
- * register_toedev - register a TOE device
- * @dev: the device
- * @name: a name template for the device
- *
- * Register a TOE device and try to attach an appropriate TCP offload module
- * to it.  @name is a template that may contain at most one %d format
- * specifier.
- */
-int
-register_toedev(struct toedev *dev, const char *name)
-{
-	int ret;
-	const char *p;
-
-	/*
-	 * Validate the name template.  Only one %d allowed and name must be
-	 * a valid filename so it can appear in sysfs.
-	 */
-	if (!name || !*name || !strcmp(name, ".") || !strcmp(name, "..") ||
-	    strchr(name, '/'))
-		return EINVAL;
-
-	p = strchr(name, '%');
-	if (p && (p[1] != 'd' || strchr(p + 2, '%')))
-		return EINVAL;
-
-	mtx_lock(&offload_db_lock);
-	if (is_registered(dev)) {  /* device already registered */
-		ret = EEXIST;
-		goto out;
-	}
-
-	if ((ret = assign_name(dev, name, 32)) != 0)
-		goto out;
-
-	dev->tod_offload_mod = NULL;
-	TAILQ_INSERT_TAIL(&offload_dev_list, dev, entry);
-out:
-	mtx_unlock(&offload_db_lock);
-	return ret;
-}
-
-/**
- * unregister_toedev - unregister a TOE device
- * @dev: the device
- *
- * Unregister a TOE device.  The device must not be attached to an offload
- * module.
- */
-int
-unregister_toedev(struct toedev *dev)
-{
-	int ret = 0;
-
-	mtx_lock(&offload_db_lock);
-	if (!is_registered(dev)) {
-		ret = ENODEV;
-		goto out;
-	}
-	if (is_attached(dev)) {
-		ret = EBUSY;
-		goto out;
-	}
-	TAILQ_REMOVE(&offload_dev_list, dev, entry);
-out:
-	mtx_unlock(&offload_db_lock);
-	return ret;
-}
-
-/**
- * activate_offload - activate an offload device
- * @dev: the device
- *
- * Activate an offload device by locating an appropriate registered offload
- * module.  If no module is found the operation fails and may be retried at
- * a later time.
- */
-int
-activate_offload(struct toedev *dev)
-{
-	int ret = 0;
-
-	mtx_lock(&offload_db_lock);
-	if (!is_registered(dev))
-		ret = ENODEV;
-	else if (!is_attached(dev))
-		ret = offload_attach(dev);
-	mtx_unlock(&offload_db_lock);
-	return ret;
-}
-
-/**
- * toe_send - send a packet to a TOE device
- * @dev: the device
- * @m: the packet
- *
- * Sends an mbuf to a TOE driver after dealing with any active network taps.
- */
-int
-toe_send(struct toedev *dev, struct mbuf *m)
-{
-	int r;
-
-	critical_enter(); /* XXX neccessary? */
-	r = dev->tod_send(dev, m);
-	critical_exit();
-	if (r)
-		BPF_MTAP(dev->tod_lldev, m);
-	return r;
-}
-
-/**
- * toe_receive_mbuf - process n received TOE packets
- * @dev: the toe device
- * @m: an array of offload packets
- * @n: the number of offload packets
- *
- * Process an array of ingress offload packets.  Each packet is forwarded
- * to any active network taps and then passed to the toe device's receive
- * method.  We optimize passing packets to the receive method by passing
- * it the whole array at once except when there are active taps.
- */
-int
-toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n)
-{
-	if (__predict_true(!bpf_peers_present(dev->tod_lldev->if_bpf)))
-		return dev->tod_recv(dev, m, n);
-
-	for ( ; n; n--, m++) {
-		m[0]->m_pkthdr.rcvif = dev->tod_lldev;
-		BPF_MTAP(dev->tod_lldev, m[0]);
-		dev->tod_recv(dev, m, 1);
-	}
-	return 0;
-}
-
-static inline int
-ifnet_is_offload(const struct ifnet *ifp)
-{
-	return (ifp->if_flags & IFCAP_TOE);
-}
-
-void
-toe_arp_update(struct rtentry *rt)
-{
-	struct ifnet *ifp = rt->rt_ifp;
-
-	if (ifp && ifnet_is_offload(ifp)) {
-		struct toedev *tdev = TOEDEV(ifp);
-
-		if (tdev && tdev->tod_arp_update)
-			tdev->tod_arp_update(tdev, rt);
-	}
-}
-
-/**
- * offload_get_phys_egress - find the physical egress device
- * @root_dev: the root device anchoring the search
- * @so: the socket used to determine egress port in bonding mode
- * @context: in bonding mode, indicates a connection set up or failover
- *
- * Given a root network device it returns the physical egress device that is a
- * descendant of the root device.  The root device may be either a physical
- * device, in which case it is the device returned, or a virtual device, such
- * as a VLAN or bonding device.  In case of a bonding device the search
- * considers the decisions of the bonding device given its mode to locate the
- * correct egress device.
- */
-struct ifnet *
-offload_get_phys_egress(struct ifnet *root_dev, struct socket *so, int context)
-{
-
-#if 0
-	while (root_dev && ifnet_is_offload(root_dev)) {
-		if (root_dev->tod_priv_flags & IFF_802_1Q_VLAN)
-			root_dev = VLAN_DEV_INFO(root_dev)->real_dev;
-		else if (root_dev->tod_flags & IFF_MASTER)
-			root_dev = toe_bond_get_slave(root_dev, sk, context);
-		else
-			break;
-	}
-#endif
-	return root_dev;
-}
-
-static int
-toecore_load(module_t mod, int cmd, void *arg)
-{
-	int err = 0;
-
-	switch (cmd) {
-	case MOD_LOAD:
-		mtx_init(&offload_db_lock, "toedev lock", NULL, MTX_DEF);
-		TAILQ_INIT(&offload_dev_list);
-		TAILQ_INIT(&offload_module_list);
-		break;
-	case MOD_QUIESCE:
-		break;
-	case MOD_UNLOAD:
-		mtx_lock(&offload_db_lock);
-		if (!TAILQ_EMPTY(&offload_dev_list) ||
-		    !TAILQ_EMPTY(&offload_module_list)) {
-			err = EBUSY;
-			mtx_unlock(&offload_db_lock);
-			break;
-		}
-		mtx_unlock(&offload_db_lock);
-		mtx_destroy(&offload_db_lock);
-		break;
-	case MOD_SHUTDOWN:
-		break;
-	default:
-		err = EOPNOTSUPP;
-		break;
-	}
-
-	return (err);
-}
-
-
-static moduledata_t mod_data= {
-	"toecore",
-	toecore_load,
-	0
-};
-
-MODULE_VERSION(toecore, 1);
-DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,35 +1,35 @@
-/**************************************************************************
-
-Copyright (c) 2007-2008, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
@@ -42,22 +42,17 @@
 #include <sys/sockstate.h>
 #include <sys/sockopt.h>
 #include <sys/socket.h>
+#include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/protosw.h>
 #include <sys/priv.h>
-
-#if __FreeBSD_version < 800044
-#define V_tcp_do_autosndbuf tcp_do_autosndbuf
-#define V_tcp_autosndbuf_max tcp_autosndbuf_max
-#define V_tcp_do_rfc1323 tcp_do_rfc1323
-#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
-#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
-#define V_tcpstat tcpstat
-#endif
+#include <sys/sglist.h>
+#include <sys/taskqueue.h>
 
 #include <net/if.h>
+#include <net/ethernet.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
@@ -65,37 +60,33 @@
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 
-
-#include <cxgb_osdep.h>
-#include <sys/mbufq.h>
-
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
+#define TCPSTATES
 #include <netinet/tcp_fsm.h>
-#include <netinet/tcp_offload.h>
+#include <netinet/toecore.h>
 #include <netinet/tcp_seq.h>
-#include <netinet/tcp_syncache.h>
 #include <netinet/tcp_timer.h>
 #include <net/route.h>
 
-#include <t3cdev.h>
-#include <common/cxgb_firmware_exports.h>
-#include <common/cxgb_t3_cpl.h>
-#include <common/cxgb_tcb.h>
-#include <common/cxgb_ctl_defs.h>
-#include <cxgb_offload.h>
-#include <vm/vm.h>
-#include <vm/pmap.h>
-#include <machine/bus.h>
-#include <sys/mvec.h>
-#include <ulp/toecore/cxgb_toedev.h>
-#include <ulp/tom/cxgb_l2t.h>
-#include <ulp/tom/cxgb_defs.h>
-#include <ulp/tom/cxgb_tom.h>
-#include <ulp/tom/cxgb_t3_ddp.h>
-#include <ulp/tom/cxgb_toepcb.h>
-#include <ulp/tom/cxgb_tcp.h>
-#include <ulp/tom/cxgb_tcp_offload.h>
+#include "cxgb_include.h"
+#include "ulp/tom/cxgb_l2t.h"
+#include "ulp/tom/cxgb_tom.h"
+#include "ulp/tom/cxgb_toepcb.h"
+
+VNET_DECLARE(int, tcp_do_autosndbuf);
+#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
+VNET_DECLARE(int, tcp_autosndbuf_inc);
+#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
+VNET_DECLARE(int, tcp_autosndbuf_max);
+#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
+extern int always_keepalive;
 
 /*
  * For ULP connections HW may add headers, e.g., for digests, that aren't part
@@ -108,29 +99,6 @@
  */
 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
 
-#ifdef notyet
-/*
- * This sk_buff holds a fake header-only TCP segment that we use whenever we
- * need to exploit SW TCP functionality that expects TCP headers, such as
- * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
- * CPUs without locking.
- */
-static struct mbuf *tcphdr_mbuf __read_mostly;
-#endif
-
-/*
- * Size of WRs in bytes.  Note that we assume all devices we are handling have
- * the same WR size.
- */
-static unsigned int wrlen __read_mostly;
-
-/*
- * The number of WRs needed for an skb depends on the number of page fragments
- * in the skb and whether it has any payload in its main body.  This maps the
- * length of the gather list represented by an skb into the # of necessary WRs.
- */
-static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
-
 /*
  * Max receive window supported by HW in bytes.  Only a small part of it can
  * be set through option0, the rest needs to be set through RX_DATA_ACK.
@@ -144,1261 +112,760 @@
 #define MIN_RCV_WND (24 * 1024U)
 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
 
-#define VALIDATE_SEQ 0
-#define VALIDATE_SOCK(so)
-#define DEBUG_WR 0
+static void t3_release_offload_resources(struct toepcb *);
+static void send_reset(struct toepcb *);
 
-#define TCP_TIMEWAIT	1
-#define TCP_CLOSE	2
-#define TCP_DROP	3
+/*
+ * Called after the last CPL for the toepcb has been received.
+ *
+ * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
+ * time this function exits.
+ */
+static int
+toepcb_release(struct toepcb *toep)
+{
+	struct inpcb *inp = toep->tp_inp;
+	struct toedev *tod = toep->tp_tod;
+	struct tom_data *td = t3_tomdata(tod);
+	int rc;
 
-static void t3_send_reset(struct toepcb *toep);
-static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
-static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
-static void handle_syncache_event(int event, void *arg);
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
+	    ("%s: double release?", __func__));
 
-static inline void
-SBAPPEND(struct sockbuf *sb, struct mbuf *n)
-{
-	struct mbuf *m;
+	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);
 
-	m = sb->sb_mb;
-	while (m) {
-		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
-		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
-			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
-		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
-			m->m_next, m->m_nextpkt, m->m_flags));
-		m = m->m_next;
-	}
-	m = n;
-	while (m) {
-		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
-		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
-			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
-		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
-			m->m_next, m->m_nextpkt, m->m_flags));
-		m = m->m_next;
-	}
-	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
-	sbappendstream_locked(sb, n);
-	m = sb->sb_mb;
+	toep->tp_flags |= TP_CPL_DONE;
+	toep->tp_inp = NULL;
 
-	while (m) {
-		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
-			m->m_next, m->m_nextpkt, m->m_flags));
-		m = m->m_next;
-	}
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_REMOVE(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
+
+	if (!(toep->tp_flags & TP_ATTACHED))
+		t3_release_offload_resources(toep);
+
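+	/*
+	 * Drop our hold on the inp.  in_pcbrele_wlocked() frees the inp and
+	 * returns nonzero if ours was the last reference.
+	 */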
+	rc = in_pcbrele_wlocked(inp);
+	if (!rc)
+		INP_WUNLOCK(inp);
+	return (rc);
 }
 
-static inline int
-is_t3a(const struct toedev *dev)
+/*
+ * One-sided detach.  The tcpcb is going away and we need to unhook the toepcb
+ * hanging off it.  If the TOE driver is also done with the toepcb, we'll
+ * release all offload resources.
+ */
+static void
+toepcb_detach(struct inpcb *inp)
 {
-	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
+	struct toepcb *toep;
+	struct tcpcb *tp;
+
+	KASSERT(inp, ("%s: inp is NULL", __func__));
+	INP_WLOCK_ASSERT(inp);
+
+	tp = intotcpcb(inp);
+	toep = tp->t_toe;
+
+	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));
+
+	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
+	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
+	    toep, inp, tp);
+
+	tp->t_toe = NULL;
+	tp->t_flags &= ~TF_TOE;
+	toep->tp_flags &= ~TP_ATTACHED;
+
+	if (toep->tp_flags & TP_CPL_DONE)
+		t3_release_offload_resources(toep);
+}
+
+void
+t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
+{
+
+	toepcb_detach(tp->t_inpcb);
+}
+
+static int
+alloc_atid(struct tid_info *t, void *ctx)
+{
+	int atid = -1;
+
+	mtx_lock(&t->atid_lock);
+	if (t->afree) {
+		union active_open_entry *p = t->afree;
+
+		atid = (p - t->atid_tab) + t->atid_base;
+		t->afree = p->next;
+		p->ctx = ctx;
+		t->atids_in_use++;
+	}
+	mtx_unlock(&t->atid_lock);
+
+	return (atid);
 }
 
 static void
-dump_toepcb(struct toepcb *toep)
+free_atid(struct tid_info *t, int atid)
 {
-	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
-	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
-	    toep->tp_mtu_idx, toep->tp_tid);
+	union active_open_entry *p = atid2entry(t, atid);
 
-	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
-	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 
-	    toep->tp_mss_clamp, toep->tp_flags);
+	mtx_lock(&t->atid_lock);
+	p->next = t->afree;
+	t->afree = p;
+	t->atids_in_use--;
+	mtx_unlock(&t->atid_lock);
 }
 
-#ifndef RTALLOC2_DEFINED
-static struct rtentry *
-rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
+void
+insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
 {
-	struct rtentry *rt = NULL;
-	
-	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
-		RT_UNLOCK(rt);
+	struct tid_info *t = &td->tid_maps;
 
-	return (rt);
-}
-#endif
-
-/*
- * Determine whether to send a CPL message now or defer it.  A message is
- * deferred if the connection is in SYN_SENT since we don't know the TID yet.
- * For connections in other states the message is sent immediately.
- * If through_l2t is set the message is subject to ARP processing, otherwise
- * it is sent directly.
- */
-static inline void
-send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
-{
-	struct tcpcb *tp = toep->tp_tp;
-
-	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
-		inp_wlock(tp->t_inpcb);
-		mbufq_tail(&toep->out_of_order_queue, m);  // defer
-		inp_wunlock(tp->t_inpcb);
-	} else if (through_l2t)
-		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
-	else
-		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
+	t->tid_tab[tid] = ctx;
+	atomic_add_int(&t->tids_in_use, 1);
 }
 
-static inline unsigned int
-mkprio(unsigned int cntrl, const struct toepcb *toep)
+void
+update_tid(struct tom_data *td, void *ctx, unsigned int tid)
 {
-        return (cntrl);
+	struct tid_info *t = &td->tid_maps;
+
+	t->tid_tab[tid] = ctx;
+}
+
+void
+remove_tid(struct tom_data *td, unsigned int tid)
+{
+	struct tid_info *t = &td->tid_maps;
+
+	t->tid_tab[tid] = NULL;
+	atomic_add_int(&t->tids_in_use, -1);
+}
+
+/* Use ctx as a next pointer in the TID release list. */
+void
+queue_tid_release(struct toedev *tod, unsigned int tid)
+{
+	struct tom_data *td = t3_tomdata(tod);
+	void **p = &td->tid_maps.tid_tab[tid];
+	struct adapter *sc = tod->tod_softc;
+
+	mtx_lock(&td->tid_release_lock);
+	*p = td->tid_release_list;
+	td->tid_release_list = p;
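+	/*
+	 * *p now holds the old list head.  Enqueue the task only when the list
+	 * was empty; otherwise a run is already pending.
+	 */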
+	if (!*p)
+		taskqueue_enqueue(sc->tq, &td->tid_release_task);
+	mtx_unlock(&td->tid_release_lock);
 }
 
 /*
- * Populate a TID_RELEASE WR.  The skb must be already propely sized.
+ * Populate a TID_RELEASE WR.
  */
 static inline void
-mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
+mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
 {
-	struct cpl_tid_release *req;
 
-	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	req = mtod(m, struct cpl_tid_release *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+}
+
+void
+release_tid(struct toedev *tod, unsigned int tid, int qset)
+{
+	struct tom_data *td = t3_tomdata(tod);
+	struct adapter *sc = tod->tod_softc;
+	struct mbuf *m;
+	struct cpl_tid_release *cpl;
+#ifdef INVARIANTS
+	struct tid_info *t = &td->tid_maps;
+#endif
+
+	KASSERT(tid < t->ntids,
+	    ("%s: tid=%u, ntids=%d", __func__, tid, t->ntids));
+
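+	/*
+	 * Release the tid right away if we can get an mbuf for the CPL,
+	 * otherwise hand it off to the deferred tid release task.
+	 */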
+	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
+	if (m) {
+		mk_tid_release(cpl, tid);
+		t3_offload_tx(sc, m);
+		remove_tid(td, tid);
+	} else
+		queue_tid_release(tod, tid);
+}
+
+void
+t3_process_tid_release_list(void *data, int pending)
+{
+	struct mbuf *m;
+	struct tom_data *td = data;
+	struct adapter *sc = td->tod.tod_softc;
+
+	mtx_lock(&td->tid_release_lock);
+	while (td->tid_release_list) {
+		void **p = td->tid_release_list;
+		unsigned int tid = p - td->tid_maps.tid_tab;
+		struct cpl_tid_release *cpl;
+
+		td->tid_release_list = (void **)*p;
+		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
+		if (m == NULL)
+			break;	/* XXX: who reschedules the release task? */
+		mtx_unlock(&td->tid_release_lock);
+		mk_tid_release(cpl, tid);
+		t3_offload_tx(sc, m);
+		remove_tid(td, tid);
+		mtx_lock(&td->tid_release_lock);
+	}
+	mtx_unlock(&td->tid_release_lock);
+}
+
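+/*
+ * Send a CPL_CLOSE_CON_REQ message (our FIN) for the connection, unless one
+ * has already been sent.
+ */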
+static void
+close_conn(struct adapter *sc, struct toepcb *toep)
+{
+	struct mbuf *m;
+	struct cpl_close_con_req *req;
+
+	if (toep->tp_flags & TP_FIN_SENT)
+		return;
+
+	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
+	if (m == NULL)
+		CXGB_UNIMPLEMENTED();
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
+	req->rsvd = 0;
+
+	toep->tp_flags |= TP_FIN_SENT;
+	t3_offload_tx(sc, m);
 }
 
 static inline void
-make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
+make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
+    struct mbuf *tail)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
-	struct tx_data_wr *req;
 	struct sockbuf *snd;
-	
+
 	inp_lock_assert(tp->t_inpcb);
 	snd = so_sockbuf_snd(so);
-	
-	req = mtod(m, struct tx_data_wr *);
-	m->m_len = sizeof(*req);
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
-	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
 	/* len includes the length of any HW ULP additions */
 	req->len = htonl(len);
 	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 	/* V_TX_ULP_SUBMODE sets both the mode and submode */
-	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
-	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
-	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
-				   (tail ? 0 : 1))));
+	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
+	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
 	req->sndseq = htonl(tp->snd_nxt);
 	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
-		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 
-				    V_TX_CPU_IDX(toep->tp_qset));
- 
-		/* Sendbuffer is in units of 32KB.
-		 */
+		struct adapter *sc = toep->tp_tod->tod_softc;
+		int cpu_idx = sc->rrss_map[toep->tp_qset];
+
+		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
+		    V_TX_CPU_IDX(cpu_idx));
+
+		/* Sendbuffer is in units of 32KB. */
 		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 
-			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
-		else {
+			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
+		else
 			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
-		}
-		
+
 		toep->tp_flags |= TP_DATASENT;
 	}
 }
 
-#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
+/*
+ * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
+ * TOM_XXX_MOVE to some common header file.
+ */
+/*
+ * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
+ * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
+ * for the second gen bit flit.  This leaves us with 12 flits.
+ *
+ * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
+ * The first desc has a tx_data_wr (which includes the WR header), the rest have
+ * the WR header only.  All descs have the second gen bit flit.
+ *
+ * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
+ * desc has a tx_data_wr (which includes the WR header), the rest have the WR
+ * header only.  All descs have the second gen bit flit.
+ *
+ * flits_to_sgllen: # of SGL entries that fit in the given # of flits.
+ */
+#define IMM_LEN 96
+static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
+static int sgllen_to_descs[TX_MAX_SEGS] = {
+	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/*  0 -  9 */
+	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
+	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
+	4, 4, 4, 4, 4, 4		/* 30 - 35 */
+};
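+/*
+ * These follow from the SGL entry layout: two entries pack into 3 flits, i.e.
+ * 12 bytes per entry, so the 12 SGL flits of the first desc hold 8 entries and
+ * the 14 SGL flits of each subsequent desc hold 9 more.
+ */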
+#if 0
+static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
+	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
+};
+#endif
+#if SGE_NUM_GENBITS != 2
+#error "SGE_NUM_GENBITS really must be 2"
+#endif
 
 int
 t3_push_frames(struct socket *so, int req_completion)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
-	
-	struct mbuf *tail, *m0, *last;
-	struct t3cdev *cdev;
-	struct tom_data *d;
-	int state, bytes, count, total_bytes;
-	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
+	struct mbuf *m0, *sndptr, *m;
+	struct toedev *tod = toep->tp_tod;
+	struct adapter *sc = tod->tod_softc;
+	int bytes, ndesc, total_bytes = 0, mlen;
 	struct sockbuf *snd;
-	
-	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
-		DPRINTF("tcp state=%d\n", tp->t_state);	
-		return (0);
-	}	
-
-	state = so_state_get(so);
-	
-	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
-		DPRINTF("disconnecting\n");
-		
-		return (0);
-	}
+	struct sglist *sgl;
+	struct ofld_hdr *oh;
+	caddr_t dst;
+	struct tx_data_wr *wr;
 
 	inp_lock_assert(tp->t_inpcb);
 
 	snd = so_sockbuf_snd(so);
-	sockbuf_lock(snd);
+	SOCKBUF_LOCK(snd);
 
-	d = TOM_DATA(toep->tp_toedev);
-	cdev = d->cdev;
-
-	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
-
-	total_bytes = 0;
-	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
-	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
-
-	if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
-		KASSERT(tail, ("sbdrop error"));
-		last = tail = tail->m_next;
+	/*
+	 * Autosize the send buffer.
+	 */
+	if (snd->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) {
+		if (snd->sb_cc >= (snd->sb_hiwat / 8 * 7) &&
+		    snd->sb_cc < V_tcp_autosndbuf_max) {
+			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
+			    V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max),
+			    so, curthread))
+				snd->sb_flags &= ~SB_AUTOSIZE;
+		}
 	}
 
-	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
-		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
-		sockbuf_unlock(snd);
+	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
+		sndptr = toep->tp_m_last->m_next;
+	else
+		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 
-		return (0);		
-	}
-			
-	toep->tp_m_last = NULL;
-	while (toep->tp_wr_avail && (tail != NULL)) {
-		count = bytes = 0;
-		segp = segs;
-		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
-			sockbuf_unlock(snd);
-			return (0);
+	/* Nothing to send or no WRs available for sending data */
+	if (toep->tp_wr_avail == 0 || sndptr == NULL)
+		goto out;
+
+	/* Something to send and at least 1 WR available */
+	while (toep->tp_wr_avail && sndptr != NULL) {
+
+		m0 = m_gethdr(M_NOWAIT, MT_DATA);
+		if (m0 == NULL)
+			break;
+		oh = mtod(m0, struct ofld_hdr *);
+		wr = (void *)(oh + 1);
+		dst = (void *)(wr + 1);
+
+		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
+		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
+		    V_HDR_QSET(toep->tp_qset);
+
+		/*
+		 * Try to construct an immediate data WR.  Stuff as much data
+		 * into it as it will hold, one whole mbuf at a time.
+		 */
+		mlen = sndptr->m_len;
+		ndesc = bytes = 0;
+		while (mlen <= IMM_LEN - bytes) {
+			bcopy(sndptr->m_data, dst, mlen);
+			bytes += mlen;
+			dst += mlen;
+
+			if (!(sndptr = sndptr->m_next))
+				break;
+			mlen = sndptr->m_len;
 		}
-		/*
-		 * If the data in tail fits as in-line, then
-		 * make an immediate data wr.
-		 */
-		if (tail->m_len <= IMM_LEN) {
-			count = 1;
-			bytes = tail->m_len;
-			last = tail;
-			tail = tail->m_next;
-			m_set_sgl(m0, NULL);
-			m_set_sgllen(m0, 0);
-			make_tx_data_wr(so, m0, bytes, tail);
-			m_append(m0, bytes, mtod(last, caddr_t));
-			KASSERT(!m0->m_next, ("bad append"));
+
+		if (bytes) {
+
+			/* Was able to fit 'bytes' bytes in an immediate WR */
+
+			ndesc = 1;
+			make_tx_data_wr(so, wr, bytes, sndptr);
+
+			m0->m_len += bytes;
+			m0->m_pkthdr.len = m0->m_len;
+
 		} else {
-			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
-			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
-				bytes += tail->m_len;
-				last = tail;
-				count++;
-				/*
-				 * technically an abuse to be using this for a VA
-				 * but less gross than defining my own structure
-				 * or calling pmap_kextract from here :-|
-				 */
-				segp->ds_addr = (bus_addr_t)tail->m_data;
-				segp->ds_len = tail->m_len;
-				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
-				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
-				segp++;
-				tail = tail->m_next;
+			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);
+
+			/* Need to make an SGL */
+
+			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
+			if (sgl == NULL)
+				break;
+
+			for (m = sndptr; m != NULL; m = m->m_next) {
+				if ((mlen = m->m_len) > 0) {
+					if (sglist_append(sgl, m->m_data, mlen))
+						break;
+				}
+				bytes += mlen;
 			}
-			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
-			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);	
+			sndptr = m;
+			if (bytes == 0) {
+				sglist_free(sgl);
+				break;
+			}
+			ndesc = sgllen_to_descs[sgl->sg_nseg];
+			oh->flags |= F_HDR_SGL;
+			oh->sgl = sgl;
+			make_tx_data_wr(so, wr, bytes, sndptr);
+		}
 
-			m_set_sgl(m0, segs);
-			m_set_sgllen(m0, count);
-			make_tx_data_wr(so, m0, bytes, tail);
-		}
-		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
+		oh->flags |= V_HDR_NDESC(ndesc);
+		oh->plen = bytes;
 
-		if (tail) {
-			snd->sb_sndptr = tail;
+		snd->sb_sndptr = sndptr;
+		snd->sb_sndptroff += bytes;
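+		/*
+		 * Everything was sent.  Point sb_sndptr at the tail mbuf, with
+		 * sb_sndptroff adjusted to match, and note that mbuf in
+		 * tp_m_last; the next call resumes from tp_m_last->m_next.
+		 */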
+		if (sndptr == NULL) {
+			snd->sb_sndptr = snd->sb_mbtail;
+			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
+			toep->tp_m_last = snd->sb_mbtail;
+		} else
 			toep->tp_m_last = NULL;
-		} else 
-			toep->tp_m_last = snd->sb_sndptr = last;
 
+		total_bytes += bytes;
 
-		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
+		toep->tp_wr_avail -= ndesc;
+		toep->tp_wr_unacked += ndesc;
 
-		snd->sb_sndptroff += bytes;
-		total_bytes += bytes;
-		toep->tp_write_seq += bytes;
-		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
-		    " tail=%p sndptr=%p sndptroff=%d",
-		    toep->tp_wr_avail, count, mbuf_wrs[count],
-		    tail, snd->sb_sndptr, snd->sb_sndptroff);	
-		if (tail)
-			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
-			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
-			    total_bytes, toep->tp_m_last, tail->m_data,
-			    tp->snd_una);
-		else
-			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
-			    " tp_m_last=%p snd_una=0x%08x",
-			    total_bytes, toep->tp_m_last, tp->snd_una);
-
-
-#ifdef KTR		
-{
-		int i;
-
-		i = 0;
-		while (i < count && m_get_sgllen(m0)) {
-			if ((count - i) >= 3) {
-				CTR6(KTR_TOM,
-				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
-				    " len=%d pa=0x%zx len=%d",
-				    segs[i].ds_addr, segs[i].ds_len,
-				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
-				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
-				    i += 3;
-			} else if ((count - i) == 2) {
-				CTR4(KTR_TOM, 
-				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
-				    " len=%d",
-				    segs[i].ds_addr, segs[i].ds_len,
-				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
-				    i += 2;
-			} else {
-				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
-				    segs[i].ds_addr, segs[i].ds_len);
-				i++;
-			}
-	
-		}
-}
-#endif		
-                 /*
-		 * remember credits used
-		 */
-		m0->m_pkthdr.csum_data = mbuf_wrs[count];
-		m0->m_pkthdr.len = bytes;
-		toep->tp_wr_avail -= mbuf_wrs[count];
-		toep->tp_wr_unacked += mbuf_wrs[count];
-		
-		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
+		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
 		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
-			struct work_request_hdr *wr = cplhdr(m0);
-
-			wr->wr_hi |= htonl(F_WR_COMPL);
+			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
 			toep->tp_wr_unacked = 0;	
 		}
-		KASSERT((m0->m_pkthdr.csum_data > 0) &&
-		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
-			m0->m_pkthdr.csum_data));
-		m0->m_type = MT_DONTFREE;
+
 		enqueue_wr(toep, m0);
-		DPRINTF("sending offload tx with %d bytes in %d segments\n",
-		    bytes, count);
-		l2t_send(cdev, m0, toep->tp_l2t);
+		l2t_send(sc, m0, toep->tp_l2t);
 	}
-	sockbuf_unlock(snd);
+out:
+	SOCKBUF_UNLOCK(snd);
+
+	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
+		close_conn(sc, toep);
+
 	return (total_bytes);
 }
 
-/*
- * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
- * under any circumstances.  We take the easy way out and always queue the
- * message to the write_queue.  We can optimize the case where the queue is
- * already empty though the optimization is probably not worth it.
- */
-static void
-close_conn(struct socket *so)
-{
-	struct mbuf *m;
-	struct cpl_close_con_req *req;
-	struct tom_data *d;
-	struct inpcb *inp = so_sotoinpcb(so);
-	struct tcpcb *tp;
-	struct toepcb *toep;
-	unsigned int tid; 
-
-
-	inp_wlock(inp);
-	tp = so_sototcpcb(so);
-	toep = tp->t_toe;
-	
-	if (tp->t_state != TCPS_SYN_SENT)
-		t3_push_frames(so, 1);
-	
-	if (toep->tp_flags & TP_FIN_SENT) {
-		inp_wunlock(inp);
-		return;
-	}
-
-	tid = toep->tp_tid;
-	    
-	d = TOM_DATA(toep->tp_toedev);
-	
-	m = m_gethdr_nofail(sizeof(*req));
-	m_set_priority(m, CPL_PRIORITY_DATA);
-	m_set_sgl(m, NULL);
-	m_set_sgllen(m, 0);
-
-	toep->tp_flags |= TP_FIN_SENT;
-	req = mtod(m, struct cpl_close_con_req *);
-	
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
-	req->wr.wr_lo = htonl(V_WR_TID(tid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
-	req->rsvd = 0;
-	inp_wunlock(inp);
-	/*
-	 * XXX - need to defer shutdown while there is still data in the queue
-	 *
-	 */
-	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
-	cxgb_ofld_send(d->cdev, m);
-
-}
-
-/*
- * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
- * and send it along.
- */
-static void
-abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
-{
-	struct cpl_abort_req *req = cplhdr(m);
-
-	req->cmd = CPL_ABORT_NO_RST;
-	cxgb_ofld_send(cdev, m);
-}
-
-/*
- * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
- * permitted to return without sending the message in case we cannot allocate
- * an sk_buff.  Returns the number of credits sent.
- */
-uint32_t
-t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
+static int
+send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct mbuf *m;
 	struct cpl_rx_data_ack *req;
-	struct toepcb *toep = tp->t_toe;
-	struct toedev *tdev = toep->tp_toedev;
-	
-	m = m_gethdr_nofail(sizeof(*req));
+	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 
-	DPRINTF("returning %u credits to HW\n", credits);
-	
-	req = mtod(m, struct cpl_rx_data_ack *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
+	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
+	if (m == NULL)
+		return (0);
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	req->wr.wrh_lo = 0;
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
-	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); 
-	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+	t3_offload_tx(sc, m);
 	return (credits);
 }
 
-/*
- * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
- * This is only used in DDP mode, so we take the opportunity to also set the
- * DACK mode and flush any Rx credits.
- */
 void
-t3_send_rx_modulate(struct toepcb *toep)
+t3_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
-	struct mbuf *m;
-	struct cpl_rx_data_ack *req;
+	struct adapter *sc = tod->tod_softc;
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	struct sockbuf *so_rcv = &so->so_rcv;
+	struct toepcb *toep = tp->t_toe;
+	int must_send;
 
-	m = m_gethdr_nofail(sizeof(*req));
+	INP_WLOCK_ASSERT(inp);
 
-	req = mtod(m, struct cpl_rx_data_ack *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
-	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
-				 V_RX_DACK_MODE(1) |
-				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
-	toep->tp_rcv_wup = toep->tp_copied_seq;
+	SOCKBUF_LOCK(so_rcv);
+	KASSERT(toep->tp_enqueued >= so_rcv->sb_cc,
+	    ("%s: so_rcv->sb_cc > enqueued", __func__));
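+	/* Whatever left the sockbuf was read by the app; credit it back. */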
+	toep->tp_rx_credits += toep->tp_enqueued - so_rcv->sb_cc;
+	toep->tp_enqueued = so_rcv->sb_cc;
+	SOCKBUF_UNLOCK(so_rcv);
+
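+	/*
+	 * For coalescing to work effectively ensure the receive window has at
+	 * least 16KB left.
+	 */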
+	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
+	if (must_send || toep->tp_rx_credits >= 15 * 1024) {
+		int credits;
+
+		credits = send_rx_credits(sc, toep, toep->tp_rx_credits);
+		toep->tp_rx_credits -= credits;
+		tp->rcv_wnd += credits;
+		tp->rcv_adv += credits;
+	}
 }
 
-/*
- * Handle receipt of an urgent pointer.
- */
-static void
-handle_urg_ptr(struct socket *so, uint32_t urg_seq)
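+/*
+ * Handler for RX_URG_NOTIFY CPL messages.  Urgent data is not supported, so
+ * all we do is log the notification and free the mbuf.
+ */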
+static int
+do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-#ifdef URGENT_DATA_SUPPORTED
-	struct tcpcb *tp = so_sototcpcb(so);
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
+	unsigned int tid = GET_TID(hdr);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 
-	urg_seq--;   /* initially points past the urgent data, per BSD */
-
-	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
-		return;                                 /* duplicate pointer */
-	sk_send_sigurg(sk);
-	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
-	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
-		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
-		tp->copied_seq++;
-		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
-			tom_eat_skb(sk, skb, 0);
-	}
-	tp->urg_data = TCP_URG_NOTYET;
-	tp->urg_seq = urg_seq;
-#endif
-}
-
-/*
- * Returns true if a socket cannot accept new Rx data.
- */
-static inline int
-so_no_receive(const struct socket *so)
-{
-	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
-}
-
-/*
- * Process an urgent data notification.
- */
-static void
-rx_urg_notify(struct toepcb *toep, struct mbuf *m)
-{
-	struct cpl_rx_urg_notify *hdr = cplhdr(m);
-	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
-
-	VALIDATE_SOCK(so);
-
-	if (!so_no_receive(so))
-		handle_urg_ptr(so, ntohl(hdr->seq));
+	log(LOG_ERR, "%s: tid %u inp %p\n", __func__, tid, toep->tp_inp);
 
 	m_freem(m);
-}
-
-/*
- * Handler for RX_URG_NOTIFY CPL messages.
- */
-static int
-do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct toepcb *toep = (struct toepcb *)ctx;
-
-	rx_urg_notify(toep, m);
 	return (0);
 }
 
-static __inline int
-is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
-{
-	return (toep->tp_ulp_mode ||
-		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
-		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
-}
-
-/*
- * Set of states for which we should return RX credits.
- */
-#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
-
-/*
- * Called after some received data has been read.  It returns RX credits
- * to the HW for the amount of data processed.
- */
-void
-t3_cleanup_rbuf(struct tcpcb *tp, int copied)
+int
+t3_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
-	struct socket *so;
-	struct toedev *dev;
-	int dack_mode, must_send, read;
-	u32 thres, credits, dack = 0;
-	struct sockbuf *rcv;
-	
-	so = inp_inpcbtosocket(tp->t_inpcb);
-	rcv = so_sockbuf_rcv(so);
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp_inpcbtosocket(inp);
+#if defined(KTR)
+	unsigned int tid = toep->tp_tid;
+#endif
 
-	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
-		(tp->t_state == TCPS_FIN_WAIT_2))) {
-		if (copied) {
-			sockbuf_lock(rcv);
-			toep->tp_copied_seq += copied;
-			sockbuf_unlock(rcv);
-		}
-		
-		return;
-	}
-	
-	inp_lock_assert(tp->t_inpcb); 
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(inp);
 
-	sockbuf_lock(rcv);
-	if (copied)
-		toep->tp_copied_seq += copied;
-	else {
-		read = toep->tp_enqueued_bytes - rcv->sb_cc;
-		toep->tp_copied_seq += read;
-	}
-	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
-	toep->tp_enqueued_bytes = rcv->sb_cc;
-	sockbuf_unlock(rcv);
+	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
+	    toep->tp_flags);
 
-	if (credits > rcv->sb_mbmax) {
-		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
-		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
-	    credits = rcv->sb_mbmax;
-	}
-	
-	    
-	/*
-	 * XXX this won't accurately reflect credit return - we need
-	 * to look at the difference between the amount that has been 
-	 * put in the recv sockbuf and what is there now
-	 */
+	toep->tp_flags |= TP_SEND_FIN;
+	t3_push_frames(so, 1);
 
-	if (__predict_false(!credits))
-		return;
-
-	dev = toep->tp_toedev;
-	thres = TOM_TUNABLE(dev, rx_credit_thres);
-
-	if (__predict_false(thres == 0))
-		return;
-
-	if (is_delack_mode_valid(dev, toep)) {
-		dack_mode = TOM_TUNABLE(dev, delack);
-		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
-			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
-
-			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
-				dack = F_RX_DACK_CHANGE |
-				       V_RX_DACK_MODE(dack_mode);
-		}
-	} else 
-		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
-		
-	/*
-	 * For coalescing to work effectively ensure the receive window has
-	 * at least 16KB left.
-	 */
-	must_send = credits + 16384 >= tp->rcv_wnd;
-
-	if (must_send || credits >= thres)
-		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
-}
-
-static int
-cxgb_toe_disconnect(struct tcpcb *tp)
-{
-	struct socket *so;
-	
-	DPRINTF("cxgb_toe_disconnect\n");
-
-	so = inp_inpcbtosocket(tp->t_inpcb);
-	close_conn(so);
 	return (0);
 }
 
-static int
-cxgb_toe_reset(struct tcpcb *tp)
+int
+t3_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
-	struct toepcb *toep = tp->t_toe;
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
 
-	t3_send_reset(toep);
-
-	/*
-	 * unhook from socket
-	 */
-	tp->t_flags &= ~TF_TOE;
-	toep->tp_tp = NULL;
-	tp->t_toe = NULL;
-	return (0);
-}
-
-static int
-cxgb_toe_send(struct tcpcb *tp)
-{
-	struct socket *so;
-	
-	DPRINTF("cxgb_toe_send\n");
-	dump_toepcb(tp->t_toe);
-
-	so = inp_inpcbtosocket(tp->t_inpcb);
 	t3_push_frames(so, 1);
 	return (0);
 }
 
-static int
-cxgb_toe_rcvd(struct tcpcb *tp)
+/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */
+int
+find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
+	unsigned short *mtus = &sc->params.mtus[0];
+	int i = 0, mss;
 
-	inp_lock_assert(tp->t_inpcb);
+	KASSERT(inc != NULL || pmss > 0,
+	    ("%s: at least one of inc/pmss must be specified", __func__));
 
-	t3_cleanup_rbuf(tp, 0);
-	
-	return (0);
+	mss = inc ? tcp_mssopt(inc) : pmss;
+	if (pmss > 0 && mss > pmss)
+		mss = pmss;
+
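+	/* mss + 40 = MTU, assuming 20 byte IP and TCP headers. */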
+	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
+		++i;
+
+	return (i);
 }
 
-static void
-cxgb_toe_detach(struct tcpcb *tp)
-{
-	struct toepcb *toep;
-
-        /*
-	 * XXX how do we handle teardown in the SYN_SENT state?
-	 *
-	 */
-	inp_lock_assert(tp->t_inpcb);
-	toep = tp->t_toe;
-	toep->tp_tp = NULL;
-
-	/*
-	 * unhook from socket
-	 */
-	tp->t_flags &= ~TF_TOE;
-	tp->t_toe = NULL;
-}
-	
-
-static struct toe_usrreqs cxgb_toe_usrreqs = {
-	.tu_disconnect = cxgb_toe_disconnect,
-	.tu_reset = cxgb_toe_reset,
-	.tu_send = cxgb_toe_send,
-	.tu_rcvd = cxgb_toe_rcvd,
-	.tu_detach = cxgb_toe_detach,
-	.tu_detach = cxgb_toe_detach,
-	.tu_syncache_event = handle_syncache_event,
-};
-
-
-static void
-__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
-			    uint64_t mask, uint64_t val, int no_reply)
-{
-	struct cpl_set_tcb_field *req;
-
-	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
-	    toep->tp_tid, word, mask, val);
-
-	req = mtod(m, struct cpl_set_tcb_field *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
-	req->reply = V_NO_REPLY(no_reply);
-	req->cpu_idx = 0;
-	req->word = htons(word);
-	req->mask = htobe64(mask);
-	req->val = htobe64(val);
-
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	send_or_defer(toep, m, 0);
-}
-
-static void
-t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
+static inline void
+purge_wr_queue(struct toepcb *toep)
 {
 	struct mbuf *m;
-	struct tcpcb *tp = toep->tp_tp;
-	
-	if (toep == NULL)
-		return;
- 
-	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
-		printf("not seting field\n");
-		return;
+	struct ofld_hdr *oh;
+
+	while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) {
+		oh = mtod(m, struct ofld_hdr *);
+		if (oh->flags & F_HDR_SGL)
+			sglist_free(oh->sgl);
+		m_freem(m);
 	}
-	
-	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
-
-	__set_tcb_field(toep, m, word, mask, val, 1);
 }
 
 /*
- * Set one of the t_flags bits in the TCB.
- */
-static void
-set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
-{
-
-	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
-}
-
-/*
- * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
- */
-static void
-t3_set_nagle(struct toepcb *toep)
-{
-	struct tcpcb *tp = toep->tp_tp;
-	
-	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
-}
-
-/*
- * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
- */
-void
-t3_set_keepalive(struct toepcb *toep, int on_off)
-{
-
-	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
-}
-
-void
-t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
-{
-	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
-}
-
-void
-t3_set_dack_mss(struct toepcb *toep, int on_off)
-{
-
-	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
-}
-
-/*
- * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
- */
-static void
-t3_set_tos(struct toepcb *toep)
-{
-	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);	
-	
-	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
-			 V_TCB_TOS(tos));
-}
-
-
-/*
- * In DDP mode, TP fails to schedule a timer to push RX data to the host when
- * DDP is disabled (data is delivered to freelist). [Note that, the peer should
- * set the PSH bit in the last segment, which would trigger delivery.]
- * We work around the issue by setting a DDP buffer in a partial placed state,
- * which guarantees that TP will schedule a timer.
- */
-#define TP_DDP_TIMER_WORKAROUND_MASK\
-    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
-     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
-       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
-#define TP_DDP_TIMER_WORKAROUND_VAL\
-    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
-     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
-      32))
-
-static void
-t3_enable_ddp(struct toepcb *toep, int on)
-{
-	if (on) {
-		
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
-				 V_TF_DDP_OFF(0));
-	} else
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
-				 V_TF_DDP_OFF(1) |
-				 TP_DDP_TIMER_WORKAROUND_MASK,
-				 V_TF_DDP_OFF(1) |
-				 TP_DDP_TIMER_WORKAROUND_VAL);
-
-}
-
-void
-t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
-{
-	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
-			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
-			 tag_color);
-}
-
-void
-t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
-		    unsigned int len)
-{
-	if (buf_idx == 0)
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
-			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
-			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
-			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
-			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
-	else
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
-			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
-			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
-			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
-			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
-}
-
-static int
-t3_set_cong_control(struct socket *so, const char *name)
-{
-#ifdef CONGESTION_CONTROL_SUPPORTED	
-	int cong_algo;
-
-	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
-		if (!strcmp(name, t3_cong_ops[cong_algo].name))
-			break;
-
-	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
-		return -EINVAL;
-#endif
-	return 0;
-}
-
-int
-t3_get_tcb(struct toepcb *toep)
-{
-	struct cpl_get_tcb *req;
-	struct tcpcb *tp = toep->tp_tp;
-	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
-
-	if (!m)
-		return (ENOMEM);
-	
-	inp_lock_assert(tp->t_inpcb);	
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	req = mtod(m, struct cpl_get_tcb *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
-	req->cpuno = htons(toep->tp_qset);
-	req->rsvd = 0;
-	if (tp->t_state == TCPS_SYN_SENT)
-		mbufq_tail(&toep->out_of_order_queue, m);	// defer
-	else
-		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
-	return 0;
-}
-
-static inline void
-so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
-{
-
-	toepcb_hold(toep);
-
-	cxgb_insert_tid(d->cdev, d->client, toep, tid);
-}
-
-/**
- *	find_best_mtu - find the entry in the MTU table closest to an MTU
- *	@d: TOM state
- *	@mtu: the target MTU
- *
- *	Returns the index of the value in the MTU table that is closest to but
- *	does not exceed the target MTU.
- */
-static unsigned int
-find_best_mtu(const struct t3c_data *d, unsigned short mtu)
-{
-	int i = 0;
-
-	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
-		++i;
-	return (i);
-}
-
-static unsigned int
-select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
-{
-	unsigned int idx;
-	
-#ifdef notyet
-	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
-#endif
-	if (tp) {
-		tp->t_maxseg = pmtu - 40;
-		if (tp->t_maxseg < td->mtus[0] - 40)
-			tp->t_maxseg = td->mtus[0] - 40;
-		idx = find_best_mtu(td, tp->t_maxseg + 40);
-
-		tp->t_maxseg = td->mtus[idx] - 40;
-	} else
-		idx = find_best_mtu(td, pmtu);
-	
-	return (idx);
-}
-
-static inline void
-free_atid(struct t3cdev *cdev, unsigned int tid)
-{
-	struct toepcb *toep = cxgb_free_atid(cdev, tid);
-
-	if (toep)
-		toepcb_release(toep);
-}
-
-/*
- * Release resources held by an offload connection (TID, L2T entry, etc.)
+ * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T
+ * entry, etc.)
  */
 static void
 t3_release_offload_resources(struct toepcb *toep)
 {
-	struct tcpcb *tp = toep->tp_tp;
-	struct toedev *tdev = toep->tp_toedev;
-	struct t3cdev *cdev;
-	struct socket *so;
-	unsigned int tid = toep->tp_tid;
-	struct sockbuf *rcv;
-	
-	CTR0(KTR_TOM, "t3_release_offload_resources");
+	struct toedev *tod = toep->tp_tod;
+	struct tom_data *td = t3_tomdata(tod);
 
-	if (!tdev)
-		return;
-
-	cdev = TOEP_T3C_DEV(toep);
-	if (!cdev)
-		return;
-
-	toep->tp_qset = 0;
-	t3_release_ddp_resources(toep);
-
-#ifdef CTRL_SKB_CACHE
-	kfree_skb(CTRL_SKB_CACHE(tp));
-	CTRL_SKB_CACHE(tp) = NULL;
-#endif
-
-	if (toep->tp_wr_avail != toep->tp_wr_max) {
-		purge_wr_queue(toep);
-		reset_wr_list(toep);
+	/*
+	 * The TOM explicitly detaches its toepcb from the system's inp before
+	 * it releases the offload resources.
+	 */
+	if (toep->tp_inp) {
+		panic("%s: inp %p still attached to toepcb %p",
+		    __func__, toep->tp_inp, toep);
 	}
 
+	if (toep->tp_wr_avail != toep->tp_wr_max)
+		purge_wr_queue(toep);
+
 	if (toep->tp_l2t) {
-		l2t_release(L2DATA(cdev), toep->tp_l2t);
+		l2t_release(td->l2t, toep->tp_l2t);
 		toep->tp_l2t = NULL;
 	}
-	toep->tp_tp = NULL;
-	if (tp) {
-		inp_lock_assert(tp->t_inpcb);
-		so = inp_inpcbtosocket(tp->t_inpcb);
-		rcv = so_sockbuf_rcv(so);		
-		/*
-		 * cancel any offloaded reads
-		 *
-		 */
-		sockbuf_lock(rcv);
-		tp->t_toe = NULL;
-		tp->t_flags &= ~TF_TOE;
-		if (toep->tp_ddp_state.user_ddp_pending) {
-			t3_cancel_ubuf(toep, rcv);
-			toep->tp_ddp_state.user_ddp_pending = 0;
-		}
-		so_sorwakeup_locked(so);
-			
-	}
-	
-	if (toep->tp_state == TCPS_SYN_SENT) {
-		free_atid(cdev, tid);
-#ifdef notyet		
-		__skb_queue_purge(&tp->out_of_order_queue);
-#endif		
-	} else {                                          // we have TID
-		cxgb_remove_tid(cdev, toep, tid);
-		toepcb_release(toep);
-	}
-#if 0
-	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
-#endif
-}
 
-static void
-install_offload_ops(struct socket *so)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
+	if (toep->tp_tid >= 0)
+		release_tid(tod, toep->tp_tid, toep->tp_qset);
 
-	KASSERT(tp->t_toe != NULL, ("toepcb not set"));
-	
-	t3_install_socket_ops(so);
-	tp->t_flags |= TF_TOE;
-	tp->t_tu = &cxgb_toe_usrreqs;
-}
-
-/*
- * Determine the receive window scaling factor given a target max
- * receive window.
- */
-static __inline int
-select_rcv_wscale(int space, struct vnet *vnet)
-{
-	int wscale = 0;
-
-	if (space > MAX_RCV_WND)
-		space = MAX_RCV_WND;
-
-	if (V_tcp_do_rfc1323)
-		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
-
-	return (wscale);
+	toepcb_free(toep);
 }
 
 /*
  * Determine the receive window size for a socket.
  */
-static unsigned long
-select_rcv_wnd(struct toedev *dev, struct socket *so)
+unsigned long
+select_rcv_wnd(struct socket *so)
 {
-	struct tom_data *d = TOM_DATA(dev);
-	unsigned int wnd;
-	unsigned int max_rcv_wnd;
-	struct sockbuf *rcv;
+	unsigned long wnd;
 
-	rcv = so_sockbuf_rcv(so);
-	
-	if (V_tcp_do_autorcvbuf)
-		wnd = V_tcp_autorcvbuf_max;
-	else
-		wnd = rcv->sb_hiwat;
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
-	
-	
-	/* XXX
-	 * For receive coalescing to work effectively we need a receive window
-	 * that can accomodate a coalesced segment.
-	 */	
+	wnd = sbspace(&so->so_rcv);
 	if (wnd < MIN_RCV_WND)
-		wnd = MIN_RCV_WND; 
-	
-	/* PR 5138 */
-	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 
-				    (uint32_t)d->rx_page_size * 23 :
-				    MAX_RCV_WND);
-	
-	return min(wnd, max_rcv_wnd);
+		wnd = MIN_RCV_WND;
+
+	return (min(wnd, MAX_RCV_WND));
+}
+
+int
+select_rcv_wscale(void)
+{
+	int wscale = 0;
+	unsigned long space = sb_max;
+
+	if (space > MAX_RCV_WND)
+		space = MAX_RCV_WND;
+
+	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
+		wscale++;
+
+	return (wscale);
+}
+
+/*
+ * Set up the socket for TCP offload.
+ */
+void
+offload_socket(struct socket *so, struct toepcb *toep)
+{
+	struct toedev *tod = toep->tp_tod;
+	struct tom_data *td = t3_tomdata(tod);
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+
+	INP_WLOCK_ASSERT(inp);
+
+	/* Update socket */
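+	/* t3_push_frames() cannot handle sbcompress'ed sockbufs. */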
+	SOCKBUF_LOCK(&so->so_snd);
+	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(&so->so_snd);
+	SOCKBUF_LOCK(&so->so_rcv);
+	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(&so->so_rcv);
+
+	/* Update TCP PCB */
+	tp->tod = toep->tp_tod;
+	tp->t_toe = toep;
+	tp->t_flags |= TF_TOE;
+
+	/* Install an extra hold on inp */
+	toep->tp_inp = inp;
+	toep->tp_flags |= TP_ATTACHED;
+	in_pcbref(inp);
+
+	/* Add the TOE PCB to the active list */
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
+}
+
+/* This is _not_ the normal way to "unoffload" a socket. */
+void
+undo_offload_socket(struct socket *so)
+{
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toepcb *toep = tp->t_toe;
+	struct toedev *tod = toep->tp_tod;
+	struct tom_data *td = t3_tomdata(tod);
+
+	INP_WLOCK_ASSERT(inp);
+
+	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
+	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;
+
+	tp->tod = NULL;
+	tp->t_toe = NULL;
+	tp->t_flags &= ~TF_TOE;
+
+	toep->tp_inp = NULL;
+	toep->tp_flags &= ~TP_ATTACHED;
+	if (in_pcbrele_wlocked(inp))
+		panic("%s: inp freed.", __func__);
+
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_REMOVE(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
- * Assign offload parameters to some socket fields.  This code is used by
- * both active and passive opens.
+ * Socket could be a listening socket, and we may not have a toepcb at all at
+ * this time.
  */
-static inline void
-init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
-    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
+uint32_t
+calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
 {
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
-	struct sockbuf *snd, *rcv;
-	
-#ifdef notyet	
-	SOCK_LOCK_ASSERT(so);
-#endif
-	
-	snd = so_sockbuf_snd(so);
-	rcv = so_sockbuf_rcv(so);
-	
-	log(LOG_INFO, "initializing offload socket\n");
-	/*
-	 * We either need to fix push frames to work with sbcompress
-	 * or we need to add this
-	 */
-	snd->sb_flags |= SB_NOCOALESCE;
-	rcv->sb_flags |= SB_NOCOALESCE;
-	
-	tp->t_toe = toep;
-	toep->tp_tp = tp;
-	toep->tp_toedev = dev;
-	
-	toep->tp_tid = tid;
-	toep->tp_l2t = e;
-	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
-	toep->tp_wr_unacked = 0;
-	toep->tp_delack_mode = 0;
-	
-	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
-	/*
-	 * XXX broken
-	 * 
-	 */
-	tp->rcv_wnd = select_rcv_wnd(dev, so);
+	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
+	    V_MSS_IDX(mtu_idx);
 
-        toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
-		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
-	toep->tp_qset_idx = 0;
-	
-	reset_wr_list(toep);
-	DPRINTF("initialization done\n");
+	if (so != NULL) {
+		struct inpcb *inp = sotoinpcb(so);
+		struct tcpcb *tp = intotcpcb(inp);
+		int keepalive = always_keepalive ||
+		    so_options_get(so) & SO_KEEPALIVE;
+
+		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
+		opt0h |= V_KEEP_ALIVE(keepalive != 0);
+	}
+
+	if (e != NULL)
+		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);
+
+	return (htobe32(opt0h));
 }
 
-/*
- * The next two functions calculate the option 0 value for a socket.
- */
-static inline unsigned int
-calc_opt0h(struct socket *so, int mtu_idx)
+uint32_t
+calc_opt0l(struct socket *so, int rcv_bufsize)
 {
-	struct tcpcb *tp = so_sototcpcb(so);
-	int wscale = select_rcv_wscale(tp->rcv_wnd, so->so_vnet);
-	
-	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
-	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
-	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
+	uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);
+
+	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
+	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));
+
+	if (so != NULL)		/* optional because no one cares about IP TOS */
+		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));
+
+	return (htobe32(opt0l));
 }
 
-static inline unsigned int
-calc_opt0l(struct socket *so, int ulp_mode)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	unsigned int val;
-	
-	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
-	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
-
-	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
-	return (val);
-}
-
-static inline unsigned int
-calc_opt2(const struct socket *so, struct toedev *dev)
-{
-	int flv_valid;
-
-	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
-
-	return (V_FLAVORS_VALID(flv_valid) |
-	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
-}
-
-#if DEBUG_WR > 1
-static int
-count_pending_wrs(const struct toepcb *toep)
-{
-	const struct mbuf *m;
-	int n = 0;
-
-	wr_queue_walk(toep, m)
-		n += m->m_pkthdr.csum_data;
-	return (n);
-}
-#endif
-
-#if 0
-(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
-#endif
-	
-static void
-mk_act_open_req(struct socket *so, struct mbuf *m,
-    unsigned int atid, const struct l2t_entry *e)
-{
-	struct cpl_act_open_req *req;
-	struct inpcb *inp = so_sotoinpcb(so);
-	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
-	struct toepcb *toep = tp->t_toe;
-	struct toedev *tdev = toep->tp_toedev;
-	
-	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
-	
-	req = mtod(m, struct cpl_act_open_req *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
-	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
-#if 0	
-	req->local_port = inp->inp_lport;
-	req->peer_port = inp->inp_fport;
-	memcpy(&req->local_ip, &inp->inp_laddr, 4);
-	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
-#endif	
-	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
-			   V_TX_CHANNEL(e->smt_idx));
-	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
-	req->params = 0;
-	req->opt2 = htonl(calc_opt2(so, tdev));
-}
-
-
 /*
  * Convert an ACT_OPEN_RPL status to an errno.
  */
@@ -1422,61 +889,6 @@
 	}
 }
 
-static void
-fail_act_open(struct toepcb *toep, int errno)
-{
-	struct tcpcb *tp = toep->tp_tp;
-
-	t3_release_offload_resources(toep);
-	if (tp) {
-		inp_wunlock(tp->t_inpcb);		
-		tcp_offload_drop(tp, errno);
-	}
-	
-#ifdef notyet
-	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
-#endif
-}
-
-/*
- * Handle active open failures.
- */
-static void
-active_open_failed(struct toepcb *toep, struct mbuf *m)
-{
-	struct cpl_act_open_rpl *rpl = cplhdr(m);
-	struct inpcb *inp;
-
-	if (toep->tp_tp == NULL)
-		goto done;
-
-	inp = toep->tp_tp->t_inpcb;
-
-/*
- * Don't handle connection retry for now
- */
-#ifdef notyet
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	if (rpl->status == CPL_ERR_CONN_EXIST &&
-	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
-		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
-		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
-			       jiffies + HZ / 2);
-	} else
-#endif
-	{
-		inp_wlock(inp);
-		/*
-		 * drops the inpcb lock
-		 */
-		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
-	}
-	
-	done:
-	m_free(m);
-}
-
 /*
  * Return whether a failed active open has allocated a TID
  */
@@ -1488,1072 +900,350 @@
 }
 
 /*
- * Process an ACT_OPEN_RPL CPL message.
+ * Active open failed.
  */
 static int
-do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct toepcb *toep = (struct toepcb *)ctx;
-	struct cpl_act_open_rpl *rpl = cplhdr(m);
-	
-	if (cdev->type != T3A && act_open_has_tid(rpl->status))
-		cxgb_queue_tid_release(cdev, GET_TID(rpl));
-	
-	active_open_failed(toep, m);
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	struct cpl_act_open_rpl *rpl = mtod(m, void *);
+	unsigned int atid = G_TID(ntohl(rpl->atid));
+	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
+	struct inpcb *inp = toep->tp_inp;
+	struct tcpcb *tp = intotcpcb(inp);
+	int s = rpl->status;
+
+	CTR3(KTR_CXGB, "%s: atid %u, status %u", __func__, atid, s);
+
+	free_atid(&td->tid_maps, atid);
+	toep->tp_tid = -1;
+
+	if (act_open_has_tid(s))
+		queue_tid_release(tod, GET_TID(rpl));
+
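+	/*
+	 * A full TCAM or an existing connection is reported as EAGAIN and
+	 * leaves the tcpcb intact; any other status tears the connection down,
+	 * which is why that path also holds the pcbinfo lock.
+	 */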
+	if (s == CPL_ERR_TCAM_FULL || s == CPL_ERR_CONN_EXIST) {
+		INP_WLOCK(inp);
+		toe_connect_failed(tod, tp, EAGAIN);
+		toepcb_release(toep);	/* unlocks inp */
+	} else {
+		INP_INFO_WLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		toe_connect_failed(tod, tp, act_open_rpl_status_to_errno(s));
+		toepcb_release(toep);	/* unlocks inp */
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	}
+
+	m_freem(m);
 	return (0);
 }
 
 /*
- * Handle an ARP failure for an active open.   XXX purge ofo queue
+ * Send an active open request.
  *
- * XXX badly broken for crossed SYNs as the ATID is no longer valid.
- * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
- * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
- * free the atid.  Hmm.
- */
-#ifdef notyet
-static void
-act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
-{
-	struct toepcb *toep = m_get_toep(m);
-	struct tcpcb *tp = toep->tp_tp;
-	struct inpcb *inp = tp->t_inpcb;
-	struct socket *so;
-	
-	inp_wlock(inp);
-	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
-		/*
-		 * drops the inpcb lock
-		 */
-		fail_act_open(so, EHOSTUNREACH);
-		printf("freeing %p\n", m);
-		
-		m_free(m);
-	} else
-		inp_wunlock(inp);
-}
-#endif
-/*
- * Send an active open request.
+ * State of affairs on entry:
+ * soisconnecting (so_state |= SS_ISCONNECTING)
+ * tcbinfo not locked (this has changed; it used to be WLOCKed)
+ * inp WLOCKed
+ * tp->t_state = TCPS_SYN_SENT
+ * rtalloc1, RT_UNLOCK on rt.
  */
 int
-t3_connect(struct toedev *tdev, struct socket *so,
+t3_connect(struct toedev *tod, struct socket *so,
     struct rtentry *rt, struct sockaddr *nam)
 {
-	struct mbuf *m;
-	struct l2t_entry *e;
-	struct tom_data *d = TOM_DATA(tdev);
-	struct inpcb *inp = so_sotoinpcb(so);
+	struct mbuf *m = NULL;
+	struct l2t_entry *e = NULL;
+	struct tom_data *td = t3_tomdata(tod);
+	struct adapter *sc = tod->tod_softc;
+	struct cpl_act_open_req *cpl;
+	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
-	struct toepcb *toep; /* allocated by init_offload_socket */
-		
-	int atid;
+	struct toepcb *toep;
+	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
+	struct sockaddr *gw;
+	struct ifnet *ifp = rt->rt_ifp;
+	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */
 
-	toep = toepcb_alloc();
+	INP_WLOCK_ASSERT(inp);
+
+	toep = toepcb_alloc(tod);
 	if (toep == NULL)
-		goto out_err;
-	
-	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
-		goto out_err;
-	
-	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
-	if (!e)
-		goto free_tid;
+		goto failed;
 
-	inp_lock_assert(inp);
-	m = m_gethdr(MT_DATA, M_WAITOK);
-	
-#if 0	
-	m->m_toe.mt_toepcb = tp->t_toe;
-	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
-#endif
-	so_lock(so);
-	
-	init_offload_socket(so, tdev, atid, e, rt, toep);
-	
-	install_offload_ops(so);
-	
-	mk_act_open_req(so, m, atid, e);
-	so_unlock(so);
-	
-	soisconnecting(so);
-	toep = tp->t_toe;
-	m_set_toep(m, tp->t_toe);
-	
-	toep->tp_state = TCPS_SYN_SENT;
-	l2t_send(d->cdev, (struct mbuf *)m, e);
+	atid = alloc_atid(&td->tid_maps, toep);
+	if (atid < 0)
+		goto failed;
 
-	if (toep->tp_ulp_mode)
-		t3_enable_ddp(toep, 0);
-	return 	(0);
-	
-free_tid:
-	printf("failing connect - free atid\n");
-	
-	free_atid(d->cdev, atid);
-out_err:
-	printf("return ENOMEM\n");
-       return (ENOMEM);
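+	/* Pick a random qset from this port's range to spread connections. */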
+	qset = pi->first_qset + (arc4random() % pi->nqsets);
+
+	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
+	if (m == NULL)
+		goto failed;
+
+	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
+	e = t3_l2t_get(pi, ifp, gw);
+	if (e == NULL)
+		goto failed;
+
+	toep->tp_l2t = e;
+	toep->tp_tid = atid;	/* used to double check response */
+	toep->tp_qset = qset;
+
+	SOCKBUF_LOCK(&so->so_rcv);
+	/*
+	 * tp_rx_credits holds opt0's rcv_bufsiz (in 1KB units) for now; the
+	 * field assumes its normal meaning once the connection is established.
+	 */
+	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+	SOCKBUF_UNLOCK(&so->so_rcv);
+
+	offload_socket(so, toep);
+
+	/*
+	 * The kernel sets request_r_scale based on sb_max whereas we need to
+	 * take hardware's MAX_RCV_WND into account too.  This is normally a
+	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
+	 */
+	if (tp->t_flags & TF_REQ_SCALE)
+		rscale = tp->request_r_scale = select_rcv_wscale();
+	else
+		rscale = 0;
+	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
+	cpu_idx = sc->rrss_map[qset];
+
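+	/*
+	 * Build the CPL_ACT_OPEN_REQ: the connection's 4-tuple plus the
+	 * encoded parameters, i.e. MTU index and window scale in opt0h, the
+	 * initial receive buffer credits in opt0l, and the target CPU/queue
+	 * in opt2.
+	 */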
+	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
+	cpl->wr.wrh_lo = 0;
+	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); 
+	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
+	    &cpl->peer_port);
+	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
+	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
+	cpl->params = 0;
+	cpl->opt2 = calc_opt2(cpu_idx);
+
+	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
+	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);
+
+	if (l2t_send(sc, m, e) == 0)
+		return (0);
+
+	undo_offload_socket(so);
+
+failed:
+	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
+	    __func__, atid, toep, e, m);
+
+	if (atid >= 0)
+		free_atid(&td->tid_maps, atid);
+
+	if (e)
+		l2t_release(td->l2t, e);
+
+	if (toep)
+		toepcb_free(toep);
+
+	m_freem(m);
+
+	return (ENOMEM);
 }
 
 /*
- * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
- * not send multiple ABORT_REQs for the same connection and also that we do
- * not try to send a message after the connection has closed.  Returns 1 if
- * an ABORT_REQ wasn't generated after all, 0 otherwise.
+ * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
+ * send multiple ABORT_REQs for the same connection and also that we do not try
+ * to send a message after the connection has closed.
  */
 static void
-t3_send_reset(struct toepcb *toep)
+send_reset(struct toepcb *toep)
 {
-	
+
 	struct cpl_abort_req *req;
 	unsigned int tid = toep->tp_tid;
-	int mode = CPL_ABORT_SEND_RST;
-	struct tcpcb *tp = toep->tp_tp;
-	struct toedev *tdev = toep->tp_toedev;
-	struct socket *so = NULL;
+	struct inpcb *inp = toep->tp_inp;
+	struct socket *so = inp->inp_socket;
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toedev *tod = toep->tp_tod;
+	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m;
-	struct sockbuf *snd;
-	
-	if (tp) {
-		inp_lock_assert(tp->t_inpcb);
-		so = inp_inpcbtosocket(tp->t_inpcb);
-	}
-	
-	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
-		tdev == NULL))
+
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(inp);
+
+	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
+	    toep->tp_flags);
+
+	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
 		return;
-	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
 
-	snd = so_sockbuf_snd(so);
-	/* Purge the send queue so we don't send anything after an abort. */
-	if (so)
-		sbflush(snd);
-	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
-		mode |= CPL_ABORT_POST_CLOSE_REQ;
+	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
 
-	m = m_gethdr_nofail(sizeof(*req));
-	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
-	set_arp_failure_handler(m, abort_arp_failure);
+	/* Purge the send queue */
+	sbflush(so_sockbuf_snd(so));
+	purge_wr_queue(toep);
 
-	req = mtod(m, struct cpl_abort_req *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
-	req->wr.wr_lo = htonl(V_WR_TID(tid));
+	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
+	if (m == NULL)
+		CXGB_UNIMPLEMENTED();
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+	req->wr.wrh_lo = htonl(V_WR_TID(tid));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
-	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
+	req->rsvd0 = htonl(tp->snd_nxt);
 	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
-	req->cmd = mode;
-	if (tp && (tp->t_state == TCPS_SYN_SENT))
-		mbufq_tail(&toep->out_of_order_queue, m);	// defer
+	req->cmd = CPL_ABORT_SEND_RST;
+
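+	/*
+	 * In SYN_SENT the connection has no usable hardware tid yet (the
+	 * active open hasn't completed), so the ABORT_REQ is queued here and
+	 * presumably flushed once the reply to the active open arrives.
+	 */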
+	if (tp->t_state == TCPS_SYN_SENT)
+		mbufq_tail(&toep->out_of_order_queue, m); /* defer */
 	else
-		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
-}
-
-static int
-t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
-{
-	struct inpcb *inp;
-	int error, optval;
-	
-	if (sopt->sopt_name == IP_OPTIONS)
-		return (ENOPROTOOPT);
-
-	if (sopt->sopt_name != IP_TOS)
-		return (EOPNOTSUPP);
-	
-	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
-
-	if (error)
-		return (error);
-
-	if (optval > IPTOS_PREC_CRITIC_ECP)
-		return (EINVAL);
-
-	inp = so_sotoinpcb(so);
-	inp_wlock(inp);
-	inp_ip_tos_set(inp, optval);
-#if 0	
-	inp->inp_ip_tos = optval;
-#endif
-	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
-	inp_wunlock(inp);
-
-	return (0);
-}
-
-static int
-t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
-{
-	int err = 0;
-	size_t copied;
-
-	if (sopt->sopt_name != TCP_CONGESTION &&
-	    sopt->sopt_name != TCP_NODELAY)
-		return (EOPNOTSUPP);
-
-	if (sopt->sopt_name == TCP_CONGESTION) {
-		char name[TCP_CA_NAME_MAX];
-		int optlen = sopt->sopt_valsize;
-		struct tcpcb *tp;
-		
-		if (sopt->sopt_dir == SOPT_GET) {
-			KASSERT(0, ("unimplemented"));
-			return (EOPNOTSUPP);
-		}
-
-		if (optlen < 1)
-			return (EINVAL);
-		
-		err = copyinstr(sopt->sopt_val, name, 
-		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
-		if (err)
-			return (err);
-		if (copied < 1)
-			return (EINVAL);
-
-		tp = so_sototcpcb(so);
-		/*
-		 * XXX I need to revisit this
-		 */
-		if ((err = t3_set_cong_control(so, name)) == 0) {
-#ifdef CONGESTION_CONTROL_SUPPORTED
-			tp->t_cong_control = strdup(name, M_CXGB);
-#endif			
-		} else
-			return (err);
-	} else {
-		int optval, oldval;
-		struct inpcb *inp;
-		struct tcpcb *tp;
-
-		if (sopt->sopt_dir == SOPT_GET)
-			return (EOPNOTSUPP);
-	
-		err = sooptcopyin(sopt, &optval, sizeof optval,
-		    sizeof optval);
-
-		if (err)
-			return (err);
-
-		inp = so_sotoinpcb(so);
-		inp_wlock(inp);
-		tp = inp_inpcbtotcpcb(inp);
-
-		oldval = tp->t_flags;
-		if (optval)
-			tp->t_flags |= TF_NODELAY;
-		else
-			tp->t_flags &= ~TF_NODELAY;
-		inp_wunlock(inp);
-
-
-		if (oldval != tp->t_flags && (tp->t_toe != NULL))
-			t3_set_nagle(tp->t_toe);
-
-	}
-
-	return (0);
+		l2t_send(sc, m, toep->tp_l2t);
 }
 
 int
-t3_ctloutput(struct socket *so, struct sockopt *sopt)
+t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
 {
-	int err;
 
-	if (sopt->sopt_level != IPPROTO_TCP) 
-		err =  t3_ip_ctloutput(so, sopt);
-	else
-		err = t3_tcp_ctloutput(so, sopt);
-
-	if (err != EOPNOTSUPP)
-		return (err);
-
-	return (tcp_ctloutput(so, sopt));
-}
-
-/*
- * Returns true if we need to explicitly request RST when we receive new data
- * on an RX-closed connection.
- */
-static inline int
-need_rst_on_excess_rx(const struct toepcb *toep)
-{
-	return (1);
-}
-
-/*
- * Handles Rx data that arrives in a state where the socket isn't accepting
- * new data.
- */
-static void
-handle_excess_rx(struct toepcb *toep, struct mbuf *m)
-{
-	
-	if (need_rst_on_excess_rx(toep) &&
-	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
-		t3_send_reset(toep);
-	m_freem(m); 
-}
-
-/*
- * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
- * by getting the DDP offset from the TCB.
- */
-static void
-tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
-{
-	struct ddp_state *q = &toep->tp_ddp_state;
-	struct ddp_buf_state *bsp;
-	struct cpl_get_tcb_rpl *hdr;
-	unsigned int ddp_offset;
-	struct socket *so;
-	struct tcpcb *tp;
-	struct sockbuf *rcv;	
-	int state;
-	
-	uint64_t t;
-	__be64 *tcb;
-
-	tp = toep->tp_tp;
-	so = inp_inpcbtosocket(tp->t_inpcb);
-
-	inp_lock_assert(tp->t_inpcb);
-	rcv = so_sockbuf_rcv(so);
-	sockbuf_lock(rcv);	
-	
-	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
-	 * We really need a cookie in order to dispatch the RPLs.
-	 */
-	q->get_tcb_count--;
-
-	/* It is a possible that a previous CPL already invalidated UBUF DDP
-	 * and moved the cur_buf idx and hence no further processing of this
-	 * skb is required. However, the app might be sleeping on
-	 * !q->get_tcb_count and we need to wake it up.
-	 */
-	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
-		int state = so_state_get(so);
-
-		m_freem(m);
-		if (__predict_true((state & SS_NOFDREF) == 0))
-			so_sorwakeup_locked(so);
-		else
-			sockbuf_unlock(rcv);
-
-		return;
-	}
-
-	bsp = &q->buf_state[q->cur_buf];
-	hdr = cplhdr(m);
-	tcb = (__be64 *)(hdr + 1);
-	if (q->cur_buf == 0) {
-		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
-		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
-	} else {
-		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
-		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
-	}
-	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
-	m->m_cur_offset = bsp->cur_offset;
-	bsp->cur_offset = ddp_offset;
-	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
-
-	CTR5(KTR_TOM,
-	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
-	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
-	KASSERT(ddp_offset >= m->m_cur_offset,
-	    ("ddp_offset=%u less than cur_offset=%u",
-		ddp_offset, m->m_cur_offset));
-	
-#if 0
-{
-	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
-
-	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
-	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
-
-        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
-        rcv_nxt = t >> S_TCB_RCV_NXT;
-        rcv_nxt &= M_TCB_RCV_NXT;
-
-        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
-        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
-        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
-
-	T3_TRACE2(TIDTB(sk),
-		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
-		  ddp_flags, rcv_nxt - rx_hdr_offset);
-	T3_TRACE4(TB(q),
-		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
-		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
-	T3_TRACE3(TB(q),
-		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
-		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
-	T3_TRACE2(TB(q),
-		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
-		 q->buf_state[0].flags, q->buf_state[1].flags);
-
-}
-#endif
-	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
-		handle_excess_rx(toep, m);
-		return;
-	}
-
-#ifdef T3_TRACE
-	if ((int)m->m_pkthdr.len < 0) {
-		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
-	}
-#endif
-	if (bsp->flags & DDP_BF_NOCOPY) {
-#ifdef T3_TRACE
-		T3_TRACE0(TB(q),
-			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
-
-		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
-			printk("!cancel_ubuf");
-			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
-		}
-#endif
-		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
-		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
-		q->cur_buf ^= 1;
-	} else if (bsp->flags & DDP_BF_NOFLIP) {
-
-		m->m_ddp_flags = 1;    /* always a kernel buffer */
-
-		/* now HW buffer carries a user buffer */
-		bsp->flags &= ~DDP_BF_NOFLIP;
-		bsp->flags |= DDP_BF_NOCOPY;
-
-		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
-		 * any new data in which case we're done. If in addition the
-		 * offset is 0, then there wasn't a completion for the kbuf
-		 * and we need to decrement the posted count.
-		 */
-		if (m->m_pkthdr.len == 0) {
-			if (ddp_offset == 0) {
-				q->kbuf_posted--;
-				bsp->flags |= DDP_BF_NODATA;
-			}
-			sockbuf_unlock(rcv);
-			m_free(m);
-			return;
-		}
-	} else {
-		sockbuf_unlock(rcv);
-
-		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
-		 * but it got here way late and nobody cares anymore.
-		 */
-		m_free(m);
-		return;
-	}
-
-	m->m_ddp_gl = (unsigned char *)bsp->gl;
-	m->m_flags |= M_DDP;
-	m->m_seq = tp->rcv_nxt;
-	tp->rcv_nxt += m->m_pkthdr.len;
-	tp->t_rcvtime = ticks;
-	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
-		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
-	if (m->m_pkthdr.len == 0) {
-		q->user_ddp_pending = 0;
-		m_free(m);
-	} else 
-		SBAPPEND(rcv, m);
-
-	state = so_state_get(so);	
-	if (__predict_true((state & SS_NOFDREF) == 0))
-		so_sorwakeup_locked(so);
-	else
-		sockbuf_unlock(rcv);
-}
-
-/*
- * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
- * in that case they are similar to DDP completions.
- */
-static int
-do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct toepcb *toep = (struct toepcb *)ctx;
-
-	/* OK if socket doesn't exist */
-	if (toep == NULL) {
-		printf("null toep in do_get_tcb_rpl\n");
-		return (CPL_RET_BUF_DONE);
-	}
-
-	inp_wlock(toep->tp_tp->t_inpcb);
-	tcb_rpl_as_ddp_complete(toep, m);
-	inp_wunlock(toep->tp_tp->t_inpcb);
-	
+	send_reset(tp->t_toe);
 	return (0);
 }
 
-static void
-handle_ddp_data(struct toepcb *toep, struct mbuf *m)
-{
-	struct tcpcb *tp = toep->tp_tp;
-	struct socket *so;
-	struct ddp_state *q;
-	struct ddp_buf_state *bsp;
-	struct cpl_rx_data *hdr = cplhdr(m);
-	unsigned int rcv_nxt = ntohl(hdr->seq);
-	struct sockbuf *rcv;	
-	
-	if (tp->rcv_nxt == rcv_nxt)
-		return;
-
-	inp_lock_assert(tp->t_inpcb);
-	so  = inp_inpcbtosocket(tp->t_inpcb);
-	rcv = so_sockbuf_rcv(so);	
-	sockbuf_lock(rcv);	
-
-	q = &toep->tp_ddp_state;
-	bsp = &q->buf_state[q->cur_buf];
-	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
-		rcv_nxt, tp->rcv_nxt));
-	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
-	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
-	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
-	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
-
-#ifdef T3_TRACE
-	if ((int)m->m_pkthdr.len < 0) {
-		t3_ddp_error(so, "handle_ddp_data: neg len");
-	}
-#endif
-	m->m_ddp_gl = (unsigned char *)bsp->gl;
-	m->m_flags |= M_DDP;
-	m->m_cur_offset = bsp->cur_offset;
-	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
-	if (bsp->flags & DDP_BF_NOCOPY)
-		bsp->flags &= ~DDP_BF_NOCOPY;
-
-	m->m_seq = tp->rcv_nxt;
-	tp->rcv_nxt = rcv_nxt;
-	bsp->cur_offset += m->m_pkthdr.len;
-	if (!(bsp->flags & DDP_BF_NOFLIP))
-		q->cur_buf ^= 1;
-	/*
-	 * For now, don't re-enable DDP after a connection fell out of  DDP
-	 * mode.
-	 */
-	q->ubuf_ddp_ready = 0;
-	sockbuf_unlock(rcv);
-}
-
-/*
- * Process new data received for a connection.
- */
-static void
-new_rx_data(struct toepcb *toep, struct mbuf *m)
-{
-	struct cpl_rx_data *hdr = cplhdr(m);
-	struct tcpcb *tp = toep->tp_tp;
-	struct socket *so;
-	struct sockbuf *rcv;	
-	int state;
-	int len = be16toh(hdr->len);
-
-	inp_wlock(tp->t_inpcb);
-
-	so  = inp_inpcbtosocket(tp->t_inpcb);
-	
-	if (__predict_false(so_no_receive(so))) {
-		handle_excess_rx(toep, m);
-		inp_wunlock(tp->t_inpcb);
-		TRACE_EXIT;
-		return;
-	}
-
-	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
-		handle_ddp_data(toep, m);
-	
-	m->m_seq = ntohl(hdr->seq);
-	m->m_ulp_mode = 0;                    /* for iSCSI */
-
-#if VALIDATE_SEQ
-	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
-		log(LOG_ERR,
-		       "%s: TID %u: Bad sequence number %u, expected %u\n",
-		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
-		       tp->rcv_nxt);
-		m_freem(m);
-		inp_wunlock(tp->t_inpcb);
-		return;
-	}
-#endif
-	m_adj(m, sizeof(*hdr));
-
-#ifdef URGENT_DATA_SUPPORTED
-	/*
-	 * We don't handle urgent data yet
-	 */
-	if (__predict_false(hdr->urg))
-		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
-	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
-		     tp->urg_seq - tp->rcv_nxt < skb->len))
-		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
-							 tp->rcv_nxt];
-#endif	
-	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
-		toep->tp_delack_mode = hdr->dack_mode;
-		toep->tp_delack_seq = tp->rcv_nxt;
-	}
-	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
-	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
-	
-	if (len < m->m_pkthdr.len)
-		m->m_pkthdr.len = m->m_len = len;
-
-	tp->rcv_nxt += m->m_pkthdr.len;
-	tp->t_rcvtime = ticks;
-	toep->tp_enqueued_bytes += m->m_pkthdr.len;
-	CTR2(KTR_TOM,
-	    "new_rx_data: seq 0x%x len %u",
-	    m->m_seq, m->m_pkthdr.len);
-	inp_wunlock(tp->t_inpcb);
-	rcv = so_sockbuf_rcv(so);
-	sockbuf_lock(rcv);
-#if 0	
-	if (sb_notify(rcv))
-		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
-#endif
-	SBAPPEND(rcv, m);
-
-#ifdef notyet
-	/*
-	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
-	 *
-	 */
-	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
-
-	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
-		so, rcv->sb_cc, rcv->sb_mbmax));
-#endif
-	
-
-	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
-	    rcv->sb_cc, rcv->sb_mbcnt);
-	
-	state = so_state_get(so);	
-	if (__predict_true((state & SS_NOFDREF) == 0))
-		so_sorwakeup_locked(so);
-	else
-		sockbuf_unlock(rcv);
-}
-
 /*
  * Handler for RX_DATA CPL messages.
  */
 static int
-do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct toepcb *toep = (struct toepcb *)ctx;
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_rx_data *hdr = mtod(m, void *);
+	unsigned int tid = GET_TID(hdr);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
+	struct inpcb *inp = toep->tp_inp;
+	struct tcpcb *tp;
+	struct socket *so;
+	struct sockbuf *so_rcv;	
 
-	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
-	
-	new_rx_data(toep, m);
+	/* Advance over CPL */
+	m_adj(m, sizeof(*hdr));
 
-	return (0);
-}
-
-static void
-new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
-{
-	struct tcpcb *tp;
-	struct ddp_state *q;
-	struct ddp_buf_state *bsp;
-	struct cpl_rx_data_ddp *hdr;
-	struct socket *so;	
-	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
-	int nomoredata = 0;
-	unsigned int delack_mode;
-	struct sockbuf *rcv;
-	
-	tp = toep->tp_tp;	
-	inp_wlock(tp->t_inpcb);
-	so = inp_inpcbtosocket(tp->t_inpcb);
-
-	if (__predict_false(so_no_receive(so))) {
-
-		handle_excess_rx(toep, m);
-		inp_wunlock(tp->t_inpcb);
-		return;
-	}
-	
-	q = &toep->tp_ddp_state;
-	hdr = cplhdr(m);
-	ddp_report = ntohl(hdr->u.ddp_report);
-	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
-	bsp = &q->buf_state[buf_idx];
-
-	CTR4(KTR_TOM,
-	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
-	    "hdr seq 0x%x len %u",
-	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
-	    ntohs(hdr->len));
-	CTR3(KTR_TOM,
-	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
-	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
-	
-	ddp_len = ntohs(hdr->len);
-	rcv_nxt = ntohl(hdr->seq) + ddp_len;
-
-	delack_mode = G_DDP_DACK_MODE(ddp_report);
-	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
-		toep->tp_delack_mode = delack_mode;
-		toep->tp_delack_seq = tp->rcv_nxt;
-	}
-	
-	m->m_seq = tp->rcv_nxt;
-	tp->rcv_nxt = rcv_nxt;
-
-	tp->t_rcvtime = ticks;
-	/*
-	 * Store the length in m->m_len.  We are changing the meaning of
-	 * m->m_len here, we need to be very careful that nothing from now on
-	 * interprets ->len of this packet the usual way.
-	 */
-	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
-	inp_wunlock(tp->t_inpcb);
-	CTR3(KTR_TOM,
-	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
-	    m->m_len, rcv_nxt, m->m_seq);
-	/*
-	 * Figure out where the new data was placed in the buffer and store it
-	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
-	 * account for page pod's pg_offset.
-	 */
-	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
-	m->m_cur_offset = end_offset - m->m_pkthdr.len;
-
-	rcv = so_sockbuf_rcv(so);
-	sockbuf_lock(rcv);	
-
-	m->m_ddp_gl = (unsigned char *)bsp->gl;
-	m->m_flags |= M_DDP;
-	bsp->cur_offset = end_offset;
-	toep->tp_enqueued_bytes += m->m_pkthdr.len;
-
-	/*
-	 * Length is only meaningful for kbuf
-	 */
-	if (!(bsp->flags & DDP_BF_NOCOPY))
-		KASSERT(m->m_len <= bsp->gl->dgl_length,
-		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
-			m->m_len, bsp->gl->dgl_length));
-
-	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
-	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
-        /*
-	 * Bit 0 of flags stores whether the DDP buffer is completed.
-	 * Note that other parts of the code depend on this being in bit 0.
-	 */
-	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
-		panic("spurious ddp completion");
-	} else {
-		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
-		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) 
-			q->cur_buf ^= 1;                     /* flip buffers */
+	/* XXX: revisit.  This comes from the T4 TOM */
+	if (__predict_false(inp == NULL)) {
+		/*
+		 * do_pass_establish failed and must be attempting to abort the
+		 * connection.  Meanwhile, the T4 has sent us data for such a
+		 * connection.
+		 */
+#ifdef notyet
+		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+		    ("%s: inp NULL and tid isn't being aborted", __func__));
+#endif
+		m_freem(m);
+		return (0);
 	}
 
-	if (bsp->flags & DDP_BF_NOCOPY) {
-		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
-		bsp->flags &= ~DDP_BF_NOCOPY;
+	INP_WLOCK(inp);
+	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
+		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
+		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
+		INP_WUNLOCK(inp);
+		m_freem(m);
+		return (0);
 	}
 
-	if (ddp_report & F_DDP_PSH)
-		m->m_ddp_flags |= DDP_BF_PSH;
-	if (nomoredata)
-		m->m_ddp_flags |= DDP_BF_NODATA;
+	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
+		toep->tp_delack_mode = hdr->dack_mode;
 
-#ifdef notyet	
-	skb_reset_transport_header(skb);
-	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
+	tp = intotcpcb(inp);
+
+#ifdef INVARIANTS
+	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
+		log(LOG_ERR,
+		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
+		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
+	}
 #endif
-	SBAPPEND(rcv, m);
-
-	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
-	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
-		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
-		so_sorwakeup_locked(so);
-	else
-		sockbuf_unlock(rcv);
-}
-
-#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
-		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
-		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
-		 F_DDP_INVALID_PPOD)
-
-/*
- * Handler for RX_DATA_DDP CPL messages.
- */
-static int
-do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct toepcb *toep = ctx;
-	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
-
-	VALIDATE_SOCK(so);
-
-	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
-		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
-		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
-		return (CPL_RET_BUF_DONE);
-	}
-#if 0
-	skb->h.th = tcphdr_skb->h.th;
-#endif	
-	new_rx_data_ddp(toep, m);
-	return (0);
-}
-
-static void
-process_ddp_complete(struct toepcb *toep, struct mbuf *m)
-{
-	struct tcpcb *tp = toep->tp_tp;
-	struct socket *so;
-	struct ddp_state *q;
-	struct ddp_buf_state *bsp;
-	struct cpl_rx_ddp_complete *hdr;
-	unsigned int ddp_report, buf_idx, when, delack_mode;
-	int nomoredata = 0;
-	struct sockbuf *rcv;
-	
-	inp_wlock(tp->t_inpcb);
-	so = inp_inpcbtosocket(tp->t_inpcb);
-
-	if (__predict_false(so_no_receive(so))) {
-		struct inpcb *inp = so_sotoinpcb(so);
-
-		handle_excess_rx(toep, m);
-		inp_wunlock(inp);
-		return;
-	}
-	q = &toep->tp_ddp_state; 
-	hdr = cplhdr(m);
-	ddp_report = ntohl(hdr->ddp_report);
-	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
-	m->m_pkthdr.csum_data = tp->rcv_nxt;
-
-	rcv = so_sockbuf_rcv(so);
-	sockbuf_lock(rcv);
-
-	bsp = &q->buf_state[buf_idx];
-	when = bsp->cur_offset;
-	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
-	tp->rcv_nxt += m->m_len;
+	tp->rcv_nxt += m->m_pkthdr.len;
+	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
+	    ("%s: negative window size", __func__));
+	tp->rcv_wnd -= m->m_pkthdr.len;
 	tp->t_rcvtime = ticks;
 
-	delack_mode = G_DDP_DACK_MODE(ddp_report);
-	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
-		toep->tp_delack_mode = delack_mode;
-		toep->tp_delack_seq = tp->rcv_nxt;
+	so = inp->inp_socket;
+	so_rcv = &so->so_rcv;
+	SOCKBUF_LOCK(so_rcv);
+
+	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
+		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
+		    __func__, tid, m->m_pkthdr.len);
+		SOCKBUF_UNLOCK(so_rcv);
+		INP_WUNLOCK(inp);
+
+		INP_INFO_WLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		tp = tcp_drop(tp, ECONNRESET);
+		if (tp)
+			INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+
+		m_freem(m);
+		return (0);
 	}
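+	/*
+	 * The block above replaces the old handle_excess_rx(): data arriving
+	 * after the receive side has been shut down now drops the connection
+	 * with ECONNRESET instead of being queued.
+	 */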
-#ifdef notyet
-	skb_reset_transport_header(skb);
-	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
-#endif
-	inp_wunlock(tp->t_inpcb);
 
-	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
-	CTR5(KTR_TOM,
-		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
-		  "ddp_report 0x%x offset %u, len %u",
-		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
-		   G_DDP_OFFSET(ddp_report), m->m_len);
+	/* receive buffer autosize */
+	if (so_rcv->sb_flags & SB_AUTOSIZE &&
+	    V_tcp_do_autorcvbuf &&
+	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
+	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
+		unsigned int hiwat = so_rcv->sb_hiwat;
+		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
+		    V_tcp_autorcvbuf_max);
 
-	m->m_cur_offset = bsp->cur_offset;
-	bsp->cur_offset += m->m_len;
+		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
+			so_rcv->sb_flags &= ~SB_AUTOSIZE;
+		else
+			toep->tp_rx_credits += newsize - hiwat;
+	}
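+	/*
+	 * This mirrors the auto-sizing policy in the stack's tcp_input().
+	 * E.g. with sb_hiwat 65536 and an increment of 16384 the buffer grows
+	 * to 81920 and the extra 16384 bytes are added to tp_rx_credits,
+	 * presumably to be handed back to the hardware as rx credits.
+	 */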
 
-	if (!(bsp->flags & DDP_BF_NOFLIP)) {
-		q->cur_buf ^= 1;                     /* flip buffers */
-		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
-			nomoredata=1;
-	}
-		
-	CTR4(KTR_TOM,
-		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
-		  "ddp_report %u offset %u",
-		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
-		   G_DDP_OFFSET(ddp_report));
-	
-	m->m_ddp_gl = (unsigned char *)bsp->gl;
-	m->m_flags |= M_DDP;
-	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
-	if (bsp->flags & DDP_BF_NOCOPY)
-		bsp->flags &= ~DDP_BF_NOCOPY;
-	if (nomoredata)
-		m->m_ddp_flags |= DDP_BF_NODATA;
+	toep->tp_enqueued += m->m_pkthdr.len;
+	sbappendstream_locked(so_rcv, m);
+	sorwakeup_locked(so);
+	SOCKBUF_UNLOCK_ASSERT(so_rcv);
 
-	SBAPPEND(rcv, m);
-	if ((so_state_get(so) & SS_NOFDREF) == 0)
-		so_sorwakeup_locked(so);
-	else
-		sockbuf_unlock(rcv);
-}
-
-/*
- * Handler for RX_DDP_COMPLETE CPL messages.
- */
-static int
-do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct toepcb *toep = ctx;
-
-	VALIDATE_SOCK(so);
-#if 0
-	skb->h.th = tcphdr_skb->h.th;
-#endif	
-	process_ddp_complete(toep, m);
+	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
- * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
- * socket state before calling tcp_time_wait to comply with its expectations.
+ * Handler for PEER_CLOSE CPL messages.
  */
-static void
-enter_timewait(struct tcpcb *tp)
+static int
+do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	/*
-	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
-	 * process peer_close because we don't want to carry the peer FIN in
-	 * the socket's receive queue and if we increment rcv_nxt without
-	 * having the FIN in the receive queue we'll confuse facilities such
-	 * as SIOCINQ.
-	 */
-	inp_wlock(tp->t_inpcb);	
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	const struct cpl_peer_close *hdr = mtod(m, void *);
+	unsigned int tid = GET_TID(hdr);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
+	struct inpcb *inp = toep->tp_inp;
+	struct tcpcb *tp;
+	struct socket *so;
+
+	INP_INFO_WLOCK(&V_tcbinfo);
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+
+	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
+	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags, inp);
+
+	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
+		goto done;
+
+	so = inp_inpcbtosocket(inp);
+
+	socantrcvmore(so);
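+	/* Account for the peer's FIN; it consumes one sequence number. */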
 	tp->rcv_nxt++;
 
-	tp->ts_recent_age = 0;	     /* defeat recycling */
-	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
-	inp_wunlock(tp->t_inpcb);
-	tcp_offload_twstart(tp);
-}
-
-/*
- * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
- * function deals with the data that may be reported along with the FIN.
- * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
- * perform normal FIN-related processing.  In the latter case 1 indicates that
- * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
- * skb can be freed.
- */
-static int
-handle_peer_close_data(struct socket *so, struct mbuf *m)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toepcb *toep = tp->t_toe;
-	struct ddp_state *q;
-	struct ddp_buf_state *bsp;
-	struct cpl_peer_close *req = cplhdr(m);
-	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
-	struct sockbuf *rcv;
-	
-	if (tp->rcv_nxt == rcv_nxt)			/* no data */
-		return (0);
-
-	CTR0(KTR_TOM, "handle_peer_close_data");
-	if (__predict_false(so_no_receive(so))) {
-		handle_excess_rx(toep, m);
-
-		/*
-		 * Although we discard the data we want to process the FIN so
-		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
-		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
-		 * may be what will close the connection.  We return 1 because
-		 * handle_excess_rx() already freed the packet.
-		 */
-		return (1);
-	}
-
-	inp_lock_assert(tp->t_inpcb);
-	q = &toep->tp_ddp_state;
-	rcv = so_sockbuf_rcv(so);
-	sockbuf_lock(rcv);
-
-	bsp = &q->buf_state[q->cur_buf];
-	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
-	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
-	m->m_ddp_gl = (unsigned char *)bsp->gl;
-	m->m_flags |= M_DDP;
-	m->m_cur_offset = bsp->cur_offset;
-	m->m_ddp_flags = 
-	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
-	m->m_seq = tp->rcv_nxt;
-	tp->rcv_nxt = rcv_nxt;
-	bsp->cur_offset += m->m_pkthdr.len;
-	if (!(bsp->flags & DDP_BF_NOFLIP))
-		q->cur_buf ^= 1;
-#ifdef notyet	
-	skb_reset_transport_header(skb);
-	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
-#endif	
-	tp->t_rcvtime = ticks;
-	SBAPPEND(rcv, m);
-	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
-		so_sorwakeup_locked(so);
-	else
-		sockbuf_unlock(rcv);
-
-	return (1);
-}
-
-/*
- * Handle a peer FIN.
- */
-static void
-do_peer_fin(struct toepcb *toep, struct mbuf *m)
-{
-	struct socket *so;
-	struct tcpcb *tp = toep->tp_tp;
-	int keep, action;
-	
-	action = keep = 0;	
-	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
-	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
-		printf("abort_pending set\n");
-		
-		goto out;
-	}
-	inp_wlock(tp->t_inpcb);
-	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
-	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
-		keep = handle_peer_close_data(so, m);
-		if (keep < 0) {
-			inp_wunlock(tp->t_inpcb);					
-			return;
-		}
-	}
-	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
-		CTR1(KTR_TOM,
-		    "waking up waiters for cantrcvmore on %p ", so);	
-		socantrcvmore(so);
-
-		/*
-		 * If connection is half-synchronized
-		 * (ie NEEDSYN flag on) then delay ACK,
-		 * so it may be piggybacked when SYN is sent.
-		 * Otherwise, since we received a FIN then no
-		 * more input can be expected, send ACK now.
-		 */
-		if (tp->t_flags & TF_NEEDSYN)
-			tp->t_flags |= TF_DELACK;
-		else
-			tp->t_flags |= TF_ACKNOW;
-		tp->rcv_nxt++;
-	}
-	
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
-	    tp->t_starttime = ticks;
-	/* FALLTHROUGH */ 
+		tp->t_starttime = ticks;
+		/* FALLTHROUGH */ 
 	case TCPS_ESTABLISHED:
 		tp->t_state = TCPS_CLOSE_WAIT;
 		break;
@@ -2561,228 +1251,134 @@
 		tp->t_state = TCPS_CLOSING;
 		break;
 	case TCPS_FIN_WAIT_2:
-		/*
-		 * If we've sent an abort_req we must have sent it too late,
-		 * HW will send us a reply telling us so, and this peer_close
-		 * is really the last message for this connection and needs to
-		 * be treated as an abort_rpl, i.e., transition the connection
-		 * to TCP_CLOSE (note that the host stack does this at the
-		 * time of generating the RST but we must wait for HW).
-		 * Otherwise we enter TIME_WAIT.
-		 */
-		t3_release_offload_resources(toep);
-		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
-			action = TCP_CLOSE;
-		} else {
-			action = TCP_TIMEWAIT;			
-		}
-		break;
+		tcp_twstart(tp);
+		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+
+		INP_WLOCK(inp);
+		toepcb_release(toep);	/* no more CPLs expected */
+
+		m_freem(m);
+		return (0);
 	default:
-		log(LOG_ERR,
-		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
-		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
-	}
-	inp_wunlock(tp->t_inpcb);					
-
-	if (action == TCP_TIMEWAIT) {
-		enter_timewait(tp);
-	} else if (action == TCP_DROP) {
-		tcp_offload_drop(tp, 0);		
-	} else if (action == TCP_CLOSE) {
-		tcp_offload_close(tp);		
+		log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n",
+		    __func__, toep->tp_tid, tp->t_state);
 	}
 
-#ifdef notyet		
-	/* Do not send POLL_HUP for half duplex close. */
-	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
-	    sk->sk_state == TCP_CLOSE)
-		sk_wake_async(so, 1, POLL_HUP);
-	else
-		sk_wake_async(so, 1, POLL_IN);
-#endif
+done:
+	INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
 
-out:
-	if (!keep)
-		m_free(m);
-}
-
-/*
- * Handler for PEER_CLOSE CPL messages.
- */
-static int
-do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct toepcb *toep = (struct toepcb *)ctx;
-
-	VALIDATE_SOCK(so);
-
-	do_peer_fin(toep, m);
-	return (0);
-}
-
-static void
-process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
-{
-	struct cpl_close_con_rpl *rpl = cplhdr(m);
-	struct tcpcb *tp = toep->tp_tp;	
-	struct socket *so;	
-	int action = 0;
-	struct sockbuf *rcv;	
-	
-	inp_wlock(tp->t_inpcb);
-	so = inp_inpcbtosocket(tp->t_inpcb);	
-	
-	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
-
-	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
-		inp_wunlock(tp->t_inpcb);
-		goto out;
-	}
-	
-	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, 
-	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
-
-	switch (tp->t_state) {
-	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
-		t3_release_offload_resources(toep);
-		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
-			action = TCP_CLOSE;
-
-		} else {
-			action = TCP_TIMEWAIT;
-		}
-		break;
-	case TCPS_LAST_ACK:
-		/*
-		 * In this state we don't care about pending abort_rpl.
-		 * If we've sent abort_req it was post-close and was sent too
-		 * late, this close_con_rpl is the actual last message.
-		 */
-		t3_release_offload_resources(toep);
-		action = TCP_CLOSE;
-		break;
-	case TCPS_FIN_WAIT_1:
-		/*
-		 * If we can't receive any more
-		 * data, then closing user can proceed.
-		 * Starting the timer is contrary to the
-		 * specification, but if we don't get a FIN
-		 * we'll hang forever.
-		 *
-		 * XXXjl:
-		 * we should release the tp also, and use a
-		 * compressed state.
-		 */
-		if (so)
-			rcv = so_sockbuf_rcv(so);
-		else
-			break;
-		
-		if (rcv->sb_state & SBS_CANTRCVMORE) {
-			int timeout;
-
-			if (so)
-				soisdisconnected(so);
-			timeout = (tcp_fast_finwait2_recycle) ? 
-			    tcp_finwait2_timeout : tcp_maxidle;
-			tcp_timer_activate(tp, TT_2MSL, timeout);
-		}
-		tp->t_state = TCPS_FIN_WAIT_2;
-		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
-		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
-			action = TCP_DROP;
-		}
-
-		break;
-	default:
-		log(LOG_ERR,
-		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
-		       toep->tp_toedev->tod_name, toep->tp_tid,
-		       tp->t_state);
-	}
-	inp_wunlock(tp->t_inpcb);
-
-
-	if (action == TCP_TIMEWAIT) {
-		enter_timewait(tp);
-	} else if (action == TCP_DROP) {
-		tcp_offload_drop(tp, 0);		
-	} else if (action == TCP_CLOSE) {
-		tcp_offload_close(tp);		
-	}
-out:
 	m_freem(m);
-}
-
-/*
- * Handler for CLOSE_CON_RPL CPL messages.
- */
-static int
-do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
-			    void *ctx)
-{
-	struct toepcb *toep = (struct toepcb *)ctx;
-
-	process_close_con_rpl(toep, m);
 	return (0);
 }
 
 /*
- * Process abort replies.  We only process these messages if we anticipate
- * them as the coordination between SW and HW in this area is somewhat lacking
- * and sometimes we get ABORT_RPLs after we are done with the connection that
- * originated the ABORT_REQ.
+ * Handler for CLOSE_CON_RPL CPL messages: the peer has ACKed our FIN.
  */
-static void
-process_abort_rpl(struct toepcb *toep, struct mbuf *m)
+static int
+do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct tcpcb *tp = toep->tp_tp;
-	struct socket *so;	
-	int needclose = 0;
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	const struct cpl_close_con_rpl *rpl = mtod(m, void *);
+	unsigned int tid = GET_TID(rpl);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
+	struct inpcb *inp = toep->tp_inp;
+	struct tcpcb *tp;
+	struct socket *so;
+
+	INP_INFO_WLOCK(&V_tcbinfo);
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+
+	CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid,
+	    tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags);
+
+	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
+		goto done;
+
+	so = inp_inpcbtosocket(inp);
+	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
+
+	switch (tp->t_state) {
+	case TCPS_CLOSING:
+		tcp_twstart(tp);
+release:
+		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+
+		INP_WLOCK(inp);
+		toepcb_release(toep);	/* no more CPLs expected */
 	
-#ifdef T3_TRACE
-	T3_TRACE1(TIDTB(sk),
-		  "process_abort_rpl: GTS rpl pending %d",
-		  sock_flag(sk, ABORT_RPL_PENDING));
-#endif
-	
-	inp_wlock(tp->t_inpcb);
-	so = inp_inpcbtosocket(tp->t_inpcb);
-	
-	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
-		/*
-		 * XXX panic on tcpdrop
-		 */
-		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
-			toep->tp_flags |= TP_ABORT_RPL_RCVD;
-		else {
-			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
-			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
-			    !is_t3a(toep->tp_toedev)) {
-				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
-					panic("TP_ABORT_REQ_RCVD set");
-				t3_release_offload_resources(toep);
-				needclose = 1;
-			}
-		}
+		m_freem(m);
+		return (0);
+	case TCPS_LAST_ACK:
+		if (tcp_close(tp))
+			INP_WUNLOCK(inp);
+		goto release;
+
+	case TCPS_FIN_WAIT_1:
+		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+			soisdisconnected(so);
+		tp->t_state = TCPS_FIN_WAIT_2;
+		break;
+	default:
+		log(LOG_ERR,
+		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
+		    __func__, toep->tp_tid, tp->t_state);
 	}
-	inp_wunlock(tp->t_inpcb);
 
-	if (needclose)
-		tcp_offload_close(tp);
+done:
+	INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
 
-	m_free(m);
+	m_freem(m);
+	return (0);
+}
+
+static int
+do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct cpl_smt_write_rpl *rpl = mtod(m, void *);
+
+	if (rpl->status != CPL_ERR_NONE) {
+		log(LOG_ERR,
+		    "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
+		    rpl->status, GET_TID(rpl));
+	}
+
+	m_freem(m);
+	return (0);
+}
+
+static int
+do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct cpl_set_tcb_rpl *rpl = mtod(m, void *);
+
+	if (rpl->status != CPL_ERR_NONE) {
+		log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n",
+		    rpl->status, GET_TID(rpl));
+	}
+
+	m_freem(m);
+	return (0);
 }
 
 /*
  * Handle an ABORT_RPL_RSS CPL message.
  */
 static int
-do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
-	struct toepcb *toep;
-	
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
+	unsigned int tid = GET_TID(rpl);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
+	struct inpcb *inp;
+
 	/*
 	 * Ignore replies to post-close aborts indicating that the abort was
 	 * requested too late.  These connections are terminated when we get
@@ -2790,99 +1386,54 @@
 	 * arrives the TID is either no longer used or it has been recycled.
 	 */
 	if (rpl->status == CPL_ERR_ABORT_FAILED) {
-discard:
-		m_free(m);
+		m_freem(m);
 		return (0);
 	}
 
-	toep = (struct toepcb *)ctx;
-	
-        /*
-	 * Sometimes we've already closed the socket, e.g., a post-close
-	 * abort races with ABORT_REQ_RSS, the latter frees the socket
-	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
-	 * but FW turns the ABORT_REQ into a regular one and so we get
-	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
-	 */
-	if (!toep)
-		goto discard;
+	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
+		return (do_abort_rpl_synqe(qs, r, m));
 
-	if (toep->tp_tp == NULL) {
-		log(LOG_NOTICE, "removing tid for abort\n");
-		cxgb_remove_tid(cdev, toep, toep->tp_tid);
-		if (toep->tp_l2t) 
-			l2t_release(L2DATA(cdev), toep->tp_l2t);
+	CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep,
+	    rpl->status);
 
-		toepcb_release(toep);
-		goto discard;
+	inp = toep->tp_inp;
+	INP_WLOCK(inp);
+
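+	/*
+	 * The first ABORT_RPL only records that a reply arrived; the offload
+	 * state is released when another reply shows up with
+	 * TP_ABORT_RPL_RCVD already set (evidently more than one reply can be
+	 * generated for the same abort).
+	 */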
+	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) {
+			toep->tp_flags |= TP_ABORT_RPL_RCVD;
+			INP_WUNLOCK(inp);
+		} else {
+			toep->tp_flags &= ~TP_ABORT_RPL_RCVD;
+			toep->tp_flags &= ~TP_ABORT_RPL_PENDING;
+			toepcb_release(toep);	/* no more CPLs expected */
+		}
 	}
-	
-	log(LOG_NOTICE, "toep=%p\n", toep);
-	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
 
-	toepcb_hold(toep);
-	process_abort_rpl(toep, m);
-	toepcb_release(toep);
+	m_freem(m);
 	return (0);
 }
 
 /*
- * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
- * indicate whether RST should be sent in response.
+ * Convert the status code of an ABORT_REQ into a FreeBSD error code.
  */
 static int
-abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
+abort_status_to_errno(struct tcpcb *tp, int abort_reason)
 {
-	struct tcpcb *tp = so_sototcpcb(so);
-
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
-#if 0		
-		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
-#endif		
 	case CPL_ERR_CONN_RESET:
-		// XXX need to handle SYN_RECV due to crossed SYNs
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
-#if 0		
-		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
-#endif		
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
-static inline void
-set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
-{
-	struct cpl_abort_rpl *rpl = cplhdr(m);
-
-	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
-	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
-	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
-	
-	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
-	rpl->cmd = cmd;
-}
-
-static void
-send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
-{
-	struct mbuf *reply_mbuf;
-	struct cpl_abort_req_rss *req = cplhdr(m);
-
-	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
-	m_set_priority(m, CPL_PRIORITY_DATA);
-	m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
-	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
-	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
-	m_free(m);
-}
-
 /*
  * Returns whether an ABORT_REQ_RSS message is a negative advice.
  */
@@ -2893,850 +1444,177 @@
 	    status == CPL_ERR_PERSIST_NEG_ADVICE;
 }
 
-static void
-send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
+void
+send_abort_rpl(struct toedev *tod, int tid, int qset)
 {
-	struct mbuf  *reply_mbuf;
-	struct cpl_abort_req_rss *req = cplhdr(m);
+	struct mbuf *reply;
+	struct cpl_abort_rpl *rpl;
+	struct adapter *sc = tod->tod_softc;
 
-	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+	reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl);
+	if (reply == NULL)
+		CXGB_UNIMPLEMENTED();
 
-	if (!reply_mbuf) {
-		/* Defer the reply.  Stick rst_status into req->cmd. */
-		req->status = rst_status;
-		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
-		return;
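+	/*
+	 * Build the reply.  CPL_ABORT_NO_RST acknowledges the abort without
+	 * asking the hardware to generate a RST of its own.
+	 */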
+	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+	rpl->wr.wrh_lo = htonl(V_WR_TID(tid));
+	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
+	rpl->cmd = CPL_ABORT_NO_RST;
+
+	t3_offload_tx(sc, reply);
+}
+
+/*
+ * Handle an ABORT_REQ_RSS CPL message.  If we're waiting for an ABORT_RPL we
+ * ignore this request except that we need to reply to it.
+ */
+static int
+do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	const struct cpl_abort_req_rss *req = mtod(m, void *);
+	unsigned int tid = GET_TID(req);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	struct socket *so;
+	int qset = toep->tp_qset;
+
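+	/*
+	 * Negative advice (retransmit or persist-timer trouble, see
+	 * is_neg_adv_abort() above) is informational only; the connection is
+	 * left alone.
+	 */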
+	if (is_neg_adv_abort(req->status)) {
+		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
+		    __func__, req->status, tid, toep->tp_flags);
+		m_freem(m);
+		return (0);
 	}
 
-	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
-	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
-	m_free(m);
+	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
+		return (do_abort_req_synqe(qs, r, m));
+
+	inp = toep->tp_inp;
+	INP_INFO_WLOCK(&V_tcbinfo);	/* for tcp_close */
+	INP_WLOCK(inp);
+
+	tp = intotcpcb(inp);
+	so = inp->inp_socket;
+
+	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
+	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
+	    req->status);
+
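+	/*
+	 * As with abort replies, the first copy of an ABORT_REQ is only
+	 * recorded; the request is acted on when it is seen again with
+	 * TP_ABORT_REQ_RCVD already set.
+	 */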
+	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
+		toep->tp_flags |= TP_ABORT_REQ_RCVD;
+		toep->tp_flags |= TP_ABORT_SHUTDOWN;
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		m_freem(m);
+		return (0);
+	}
+	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
 
 	/*
-	 * XXX need to sync with ARP as for SYN_RECV connections we can send
-	 * these messages while ARP is pending.  For other connection states
-	 * it's not a problem.
+	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
+	 * the T3's reply to our reset instead.
 	 */
-	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
-}
+	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+		toep->tp_flags |= TP_ABORT_RPL_SENT;
+		INP_WUNLOCK(inp);
+	} else {
+		so_error_set(so, abort_status_to_errno(tp, req->status));
+		tp = tcp_close(tp);
+		if (tp == NULL)
+			INP_WLOCK(inp);	/* re-acquire */
+		toepcb_release(toep);	/* no more CPLs expected */
+	}
+	INP_INFO_WUNLOCK(&V_tcbinfo);
 
-#ifdef notyet
-static void
-cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
-{
-	CXGB_UNIMPLEMENTED();
-#ifdef notyet	
-	struct request_sock *req = child->sk_user_data;
-
-	inet_csk_reqsk_queue_removed(parent, req);
-	synq_remove(tcp_sk(child));
-	__reqsk_free(req);
-	child->sk_user_data = NULL;
-#endif
-}
-
-
-/*
- * Performs the actual work to abort a SYN_RECV connection.
- */
-static void
-do_abort_syn_rcv(struct socket *child, struct socket *parent)
-{
-	struct tcpcb *parenttp = so_sototcpcb(parent);
-	struct tcpcb *childtp = so_sototcpcb(child);
-
-	/*
-	 * If the server is still open we clean up the child connection,
-	 * otherwise the server already did the clean up as it was purging
-	 * its SYN queue and the skb was just sitting in its backlog.
-	 */
-	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
-		cleanup_syn_rcv_conn(child, parent);
-		inp_wlock(childtp->t_inpcb);
-		t3_release_offload_resources(childtp->t_toe);
-		inp_wunlock(childtp->t_inpcb);
-		tcp_offload_close(childtp);
-	}
-}
-#endif
-
-/*
- * Handle abort requests for a SYN_RECV connection.  These need extra work
- * because the socket is on its parent's SYN queue.
- */
-static int
-abort_syn_rcv(struct socket *so, struct mbuf *m)
-{
-	CXGB_UNIMPLEMENTED();
-#ifdef notyet	
-	struct socket *parent;
-	struct toedev *tdev = toep->tp_toedev;
-	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
-	struct socket *oreq = so->so_incomp;
-	struct t3c_tid_entry *t3c_stid;
-	struct tid_info *t;
-
-	if (!oreq)
-		return -1;        /* somehow we are not on the SYN queue */
-
-	t = &(T3C_DATA(cdev))->tid_maps;
-	t3c_stid = lookup_stid(t, oreq->ts_recent);
-	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
-
-	so_lock(parent);
-	do_abort_syn_rcv(so, parent);
-	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
-	so_unlock(parent);
-#endif
+	send_abort_rpl(tod, tid, qset);
+	m_freem(m);
 	return (0);
 }
 
-/*
- * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
- * request except that we need to reply to it.
- */
 static void
-process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
+assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
 {
-	int rst_status = CPL_ABORT_NO_RST;
-	const struct cpl_abort_req_rss *req = cplhdr(m);
-	struct tcpcb *tp = toep->tp_tp; 
-	struct socket *so;
-	int needclose = 0;
-	
-	inp_wlock(tp->t_inpcb);
-	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
-	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
-		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
-		m_free(m);
-		goto skip;
+	struct toepcb *toep = tp->t_toe;
+	struct adapter *sc = toep->tp_tod->tod_softc;
+
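+	/*
+	 * MSS: the hardware MTU-table entry less 40 bytes for the fixed IP
+	 * and TCP headers (e.g. a 1500 byte MTU yields a 1460 byte MSS).
+	 */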
+	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;
+
+	if (G_TCPOPT_TSTAMP(tcpopt)) {
+		tp->t_flags |= TF_RCVD_TSTMP;
+		tp->t_flags |= TF_REQ_TSTMP;	/* forcibly set */
+		tp->ts_recent = 0;		/* XXX */
+		tp->ts_recent_age = tcp_ts_getticks();
+		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
 	}
 
-	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
-	/*
-	 * Three cases to consider:
-	 * a) We haven't sent an abort_req; close the connection.
-	 * b) We have sent a post-close abort_req that will get to TP too late
-	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
-	 *    be ignored and the connection should be closed now.
-	 * c) We have sent a regular abort_req that will get to TP too late.
-	 *    That will generate an abort_rpl with status 0, wait for it.
-	 */
-	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
-	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
-		int error;
-		
-		error = abort_status_to_errno(so, req->status,
-		    &rst_status);
-		so_error_set(so, error);
+	if (G_TCPOPT_SACK(tcpopt))
+		tp->t_flags |= TF_SACK_PERMIT;
+	else
+		tp->t_flags &= ~TF_SACK_PERMIT;
 
-		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
-			so_sorwakeup(so);
-		/*
-		 * SYN_RECV needs special processing.  If abort_syn_rcv()
-		 * returns 0 is has taken care of the abort.
-		 */
-		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
-			goto skip;
+	if (G_TCPOPT_WSCALE_OK(tcpopt))
+		tp->t_flags |= TF_RCVD_SCALE;
 
-		t3_release_offload_resources(toep);
-		needclose = 1;
+	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+		tp->rcv_scale = tp->request_r_scale;
+		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
 	}
-	inp_wunlock(tp->t_inpcb);
 
-	if (needclose)
-		tcp_offload_close(tp);
-
-	send_abort_rpl(m, tdev, rst_status);
-	return;
-skip:
-	inp_wunlock(tp->t_inpcb);	
 }
 
 /*
- * Handle an ABORT_REQ_RSS CPL message.
+ * The ISS and IRS are from after the exchange of SYNs and are off by 1.
  */
-static int
-do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+void
+make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
+    uint16_t cpl_tcpopt)
 {
-	const struct cpl_abort_req_rss *req = cplhdr(m);
-	struct toepcb *toep = (struct toepcb *)ctx;
-	
-	if (is_neg_adv_abort(req->status)) {
-		m_free(m);
-		return (0);
-	}
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toepcb *toep = tp->t_toe;
+	long bufsize;
+	uint32_t iss = be32toh(cpl_iss) - 1;	/* true ISS */
+	uint32_t irs = be32toh(cpl_irs) - 1;	/* true IRS */
+	uint16_t tcpopt = be16toh(cpl_tcpopt);
 
-	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
-	
-	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
-		cxgb_remove_tid(cdev, toep, toep->tp_tid);
-		toep->tp_flags |= TP_ABORT_REQ_RCVD;
-		
-		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
-		if (toep->tp_l2t) 
-			l2t_release(L2DATA(cdev), toep->tp_l2t);
+	INP_WLOCK_ASSERT(inp);
 
-		/*
-		 *  Unhook
-		 */
-		toep->tp_tp->t_toe = NULL;
-		toep->tp_tp->t_flags &= ~TF_TOE;
-		toep->tp_tp = NULL;
-		/*
-		 * XXX need to call syncache_chkrst - but we don't
-		 * have a way of doing that yet
-		 */
-		toepcb_release(toep);
-		log(LOG_ERR, "abort for unestablished connection :-(\n");
-		return (0);
-	}
-	if (toep->tp_tp == NULL) {
-		log(LOG_NOTICE, "disconnected toepcb\n");
-		/* should be freed momentarily */
-		return (0);
-	}
+	tp->t_state = TCPS_ESTABLISHED;
+	tp->t_starttime = ticks;
+	TCPSTAT_INC(tcps_connects);
 
+	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
+	    toep->tp_tid, toep, inp);
 
-	toepcb_hold(toep);
-	process_abort_req(toep, m, toep->tp_toedev);
-	toepcb_release(toep);
-	return (0);
-}
-#ifdef notyet
-static void
-pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
-{
-	struct toedev *tdev = TOE_DEV(parent);
-
-	do_abort_syn_rcv(child, parent);
-	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
-		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
-
-		rpl->opt0h = htonl(F_TCAM_BYPASS);
-		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
-		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
-	} else
-		m_free(m);
-}
-#endif
-static void
-handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
-{
-	CXGB_UNIMPLEMENTED();
-	
-#ifdef notyet	
-	struct t3cdev *cdev;
-	struct socket *parent;
-	struct socket *oreq;
-	struct t3c_tid_entry *t3c_stid;
-	struct tid_info *t;
-	struct tcpcb *otp, *tp = so_sototcpcb(so);
-	struct toepcb *toep = tp->t_toe;
-	
-	/*
-	 * If the connection is being aborted due to the parent listening
-	 * socket going away there's nothing to do, the ABORT_REQ will close
-	 * the connection.
-	 */
-	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
-		m_free(m);
-		return;
-	}
-
-	oreq = so->so_incomp;
-	otp = so_sototcpcb(oreq);
-	
-	cdev = T3C_DEV(so);
-	t = &(T3C_DATA(cdev))->tid_maps;
-	t3c_stid = lookup_stid(t, otp->ts_recent);
-	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
-
-	so_lock(parent);
-	pass_open_abort(so, parent, m);
-	so_unlock(parent);
-#endif	
-}
-
-/*
- * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
- * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
- * connection.
- */
-static void
-pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
-{
-
-#ifdef notyet	
-	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
-	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
-#endif
-	handle_pass_open_arp_failure(m_get_socket(m), m);
-}
-
-/*
- * Populate a reject CPL_PASS_ACCEPT_RPL WR.
- */
-static void
-mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
-{
-	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
-	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
-	unsigned int tid = GET_TID(req);
-
-	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
-	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
-	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
-	rpl->opt0h = htonl(F_TCAM_BYPASS);
-	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
-	rpl->opt2 = 0;
-	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
-}
-
-/*
- * Send a deferred reject to an accept request.
- */
-static void
-reject_pass_request(struct toedev *tdev, struct mbuf *m)
-{
-	struct mbuf *reply_mbuf;
-
-	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
-	mk_pass_accept_rpl(reply_mbuf, m);
-	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
-	m_free(m);
-}
-
-static void
-handle_syncache_event(int event, void *arg)
-{
-	struct toepcb *toep = arg;
-
-	switch (event) {
-	case TOE_SC_ENTRY_PRESENT:
-		/*
-		 * entry already exists - free toepcb
-		 * and l2t
-		 */
-		printf("syncache entry present\n");
-		toepcb_release(toep);
-		break;
-	case TOE_SC_DROP:
-		/*
-		 * The syncache has given up on this entry
-		 * either it timed out, or it was evicted
-		 * we need to explicitly release the tid
-		 */
-		printf("syncache entry dropped\n");
-		toepcb_release(toep);		
-		break;
-	default:
-		log(LOG_ERR, "unknown syncache event %d\n", event);
-		break;
-	}
-}
-
-static void
-syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
-{
-	struct in_conninfo inc;
-	struct toeopt toeo;
-	struct tcphdr th;
-	struct inpcb *inp;
-	int mss, wsf, sack, ts;
-	uint32_t rcv_isn = ntohl(req->rcv_isn);
-	
-	bzero(&toeo, sizeof(struct toeopt));
-	inp = so_sotoinpcb(lso);
-	
-	/*
-	 * Fill out information for entering us into the syncache
-	 */
-	bzero(&inc, sizeof(inc));
-	inc.inc_fport = th.th_sport = req->peer_port;
-	inc.inc_lport = th.th_dport = req->local_port;
-	th.th_seq = req->rcv_isn;
-	th.th_flags = TH_SYN;
-
-	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
-
-	inc.inc_len = 0;
-	inc.inc_faddr.s_addr = req->peer_ip;
-	inc.inc_laddr.s_addr = req->local_ip;
-
-	DPRINTF("syncache add of %d:%d %d:%d\n",
-	    ntohl(req->local_ip), ntohs(req->local_port),
-	    ntohl(req->peer_ip), ntohs(req->peer_port));
-	
-	mss = req->tcp_options.mss;
-	wsf = req->tcp_options.wsf;
-	ts = req->tcp_options.tstamp;
-	sack = req->tcp_options.sack;
-	toeo.to_mss = mss;
-	toeo.to_wscale = wsf;
-	toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
-	tcp_offload_syncache_add(&inc, &toeo, &th, inp, &lso, &cxgb_toe_usrreqs,
-	    toep);
-}
-
-
-/*
- * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
- * lock held.  Note that the sock here is a listening socket that is not owned
- * by the TOE.
- */
-static void
-process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
-    struct listen_ctx *lctx)
-{
-	int rt_flags;
-	struct l2t_entry *e;
-	struct iff_mac tim;
-	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
-	struct cpl_pass_accept_rpl *rpl;
-	struct cpl_pass_accept_req *req = cplhdr(m);
-	unsigned int tid = GET_TID(req);
-	struct tom_data *d = TOM_DATA(tdev);
-	struct t3cdev *cdev = d->cdev;
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toepcb *newtoep;
-	struct rtentry *dst;
-	struct sockaddr_in nam;
-	struct t3c_data *td = T3C_DATA(cdev);
-
-	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
-	if (__predict_false(reply_mbuf == NULL)) {
-		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
-			t3_defer_reply(m, tdev, reject_pass_request);
-		else {
-			cxgb_queue_tid_release(cdev, tid);
-			m_free(m);
-		}
-		DPRINTF("failed to get reply_mbuf\n");
-		
-		goto out;
-	}
-
-	if (tp->t_state != TCPS_LISTEN) {
-		DPRINTF("socket not in listen state\n");
-		
-		goto reject;
-	}
-	
-	tim.mac_addr = req->dst_mac;
-	tim.vlan_tag = ntohs(req->vlan_tag);
-	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
-		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
-		goto reject;
-	}
-	
-#ifdef notyet
-	/*
-	 * XXX do route lookup to confirm that we're still listening on this
-	 * address
-	 */
-	if (ip_route_input(skb, req->local_ip, req->peer_ip,
-			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
-		goto reject;
-	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
-		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
-	dst_release(skb->dst);	// done with the input route, release it
-	skb->dst = NULL;
-	
-	if ((rt_flags & RTF_LOCAL) == 0)
-		goto reject;
-#endif
-	/*
-	 * XXX
-	 */
-	rt_flags = RTF_LOCAL;
-	if ((rt_flags & RTF_LOCAL) == 0)
-		goto reject;
-	
-	/*
-	 * Calculate values and add to syncache
-	 */
-
-	newtoep = toepcb_alloc();
-	if (newtoep == NULL)
-		goto reject;
-
-	bzero(&nam, sizeof(struct sockaddr_in));
-	
-	nam.sin_len = sizeof(struct sockaddr_in);
-	nam.sin_family = AF_INET;
-	nam.sin_addr.s_addr = req->peer_ip;
-	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
-
-	if (dst == NULL) {
-		printf("failed to find route\n");
-		goto reject;
-	}
-	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
-	    (struct sockaddr *)&nam);
-	if (e == NULL) {
-		DPRINTF("failed to get l2t\n");
-	}
-	/*
-	 * Point to our listen socket until accept
-	 */
-	newtoep->tp_tp = tp;
-	newtoep->tp_flags = TP_SYN_RCVD;
-	newtoep->tp_tid = tid;
-	newtoep->tp_toedev = tdev;
-	tp->rcv_wnd = select_rcv_wnd(tdev, so);
-	
-	cxgb_insert_tid(cdev, d->client, newtoep, tid);
-	so_lock(so);
-	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
-	so_unlock(so);
-
-	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
-		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
-
-	if (newtoep->tp_ulp_mode) {
-		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
-		
-		if (ddp_mbuf == NULL)
-			newtoep->tp_ulp_mode = 0;
-	}
-	
-	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
-	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
-	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
-	/*
-	 * XXX workaround for lack of syncache drop
-	 */
-	toepcb_hold(newtoep);
-	syncache_add_accept_req(req, so, newtoep);
-	
-	rpl = cplhdr(reply_mbuf);
-	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
-	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	rpl->wr.wr_lo = 0;
-	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
-	rpl->opt2 = htonl(calc_opt2(so, tdev));
-	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
-	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
-
-	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
-	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
-	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
-				  CPL_PASS_OPEN_ACCEPT);
-
-	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
-	
-	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
-		
-	l2t_send(cdev, reply_mbuf, e);
-	m_free(m);
-	if (newtoep->tp_ulp_mode) {	
-		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
-				V_TF_DDP_OFF(1) |
-				TP_DDP_TIMER_WORKAROUND_MASK,
-				V_TF_DDP_OFF(1) |
-		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
-	} else
-		DPRINTF("no DDP\n");
-
-	return;
-reject:
-	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
-		mk_pass_accept_rpl(reply_mbuf, m);
-	else 
-		mk_tid_release(reply_mbuf, newtoep, tid);
-	cxgb_ofld_send(cdev, reply_mbuf);
-	m_free(m);
-out:
-#if 0
-	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
-#else
-	return;
-#endif	
-}      
-
-/*
- * Handle a CPL_PASS_ACCEPT_REQ message.
- */
-static int
-do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
-	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
-	struct tom_data *d = listen_ctx->tom_data;
-
-#if VALIDATE_TID
-	struct cpl_pass_accept_req *req = cplhdr(m);
-	unsigned int tid = GET_TID(req);
-	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
-
-	if (unlikely(!lsk)) {
-		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
-		       cdev->name,
-		       (unsigned long)((union listen_entry *)ctx -
-					t->stid_tab));
-		return CPL_RET_BUF_DONE;
-	}
-	if (unlikely(tid >= t->ntids)) {
-		printk(KERN_ERR "%s: passive open TID %u too large\n",
-		       cdev->name, tid);
-		return CPL_RET_BUF_DONE;
-	}
-	/*
-	 * For T3A the current user of the TID may have closed but its last
-	 * message(s) may have been backlogged so the TID appears to be still
-	 * in use.  Just take the TID away, the connection can close at its
-	 * own leisure.  For T3B this situation is a bug.
-	 */
-	if (!valid_new_tid(t, tid) &&
-	    cdev->type != T3A) {
-		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
-		       cdev->name, tid);
-		return CPL_RET_BUF_DONE;
-	}
-#endif
-
-	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
-	return (0);
-}
-
-/*
- * Called when a connection is established to translate the TCP options
- * reported by HW to FreeBSD's native format.
- */
-static void
-assign_rxopt(struct socket *so, unsigned int opt)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toepcb *toep = tp->t_toe;
-	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
-
-	inp_lock_assert(tp->t_inpcb);
-	
-	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
-	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
-	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
-	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
-	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
-	    (TF_RCVD_SCALE|TF_REQ_SCALE))
-		tp->rcv_scale = tp->request_r_scale;
-}
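
[Editor's note: the removed assign_rxopt translates the TCP option bits the hardware reports at establishment into the stack's t_flags; the 40 subtracted from the negotiated MTU accounts for the fixed 20-byte IP and 20-byte TCP headers. A condensed, standalone restatement of that translation (the flag values here are illustrative, not the kernel's):

#include <stdint.h>

#define TF_RCVD_TSTMP	0x1	/* illustrative values only */
#define TF_SACK_PERMIT	0x2
#define TF_RCVD_SCALE	0x4

/* MSS clamp is the negotiated MTU minus 20-byte IP + 20-byte TCP headers. */
static int
mss_from_mtu(int mtu)
{
	return (mtu - 40);
}

static uint32_t
flags_from_rxopt(int tstamp, int sack, int wscale_ok)
{
	uint32_t flags = 0;

	if (tstamp)
		flags |= TF_RCVD_TSTMP;
	if (sack)
		flags |= TF_SACK_PERMIT;
	if (wscale_ok)
		flags |= TF_RCVD_SCALE;
	return (flags);
}
]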
-
-/*
- * Completes some final bits of initialization for just established connections
- * and changes their state to TCP_ESTABLISHED.
- *
- * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
- */
-static void
-make_established(struct socket *so, u32 snd_isn, unsigned int opt)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toepcb *toep = tp->t_toe;
-	
-	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
-	assign_rxopt(so, opt);
+	tp->irs = irs;
+	tcp_rcvseqinit(tp);
+	tp->rcv_wnd = toep->tp_rx_credits << 10;
+	tp->rcv_adv += tp->rcv_wnd;
+	tp->last_ack_sent = tp->rcv_nxt;
 
 	/*
-	 *XXXXXXXXXXX
-	 * 
+	 * If we were unable to send all rx credits via opt0, save the remainder
+	 * in rx_credits so that they can be handed over with the next credit
+	 * update.
 	 */
-#ifdef notyet
-	so->so_proto->pr_ctloutput = t3_ctloutput;
-#endif
-	
-#if 0	
-	inet_sk(sk)->id = tp->write_seq ^ jiffies;
-#endif	
-	/*
-	 * XXX not clear what rcv_wup maps to
-	 */
-	/*
-	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
-	 * pass through opt0.
-	 */
-	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
-		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
+	SOCKBUF_LOCK(&so->so_rcv);
+	bufsize = select_rcv_wnd(so);
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	toep->tp_rx_credits = bufsize - tp->rcv_wnd;
 
-	dump_toepcb(toep);
+	tp->iss = iss;
+	tcp_sendseqinit(tp);
+	tp->snd_una = iss + 1;
+	tp->snd_nxt = iss + 1;
+	tp->snd_max = iss + 1;
 
-#ifdef notyet
-/*
- * no clean interface for marking ARP up to date
- */
-	dst_confirm(sk->sk_dst_cache);
-#endif
-	tp->t_starttime = ticks;
-	tp->t_state = TCPS_ESTABLISHED;
+	assign_rxopt(tp, tcpopt);
 	soisconnected(so);
 }
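
[Editor's note: the replacement code sizes the initial receive window from the credits that fit in opt0 (units of 1 KB) and banks the rest of the socket buffer in tp_rx_credits for a later RX_DATA_ACK, per the comment above. A minimal standalone model of that arithmetic, with hypothetical names:

#include <assert.h>
#include <stdint.h>

struct credit_model {
	uint32_t rcv_wnd;	/* window the hardware knows about */
	uint32_t rx_credits;	/* remainder to return via RX_DATA_ACK */
};

static void
model_established(struct credit_model *cm, uint32_t opt0_credits_kb,
    uint32_t sockbuf_bytes)
{
	cm->rcv_wnd = opt0_credits_kb << 10;	/* opt0 credits are 1 KB units */
	assert(sockbuf_bytes >= cm->rcv_wnd);
	cm->rx_credits = sockbuf_bytes - cm->rcv_wnd;
}
]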
 
-static int
-syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
-{
-
-	struct in_conninfo inc;
-	struct toeopt toeo;
-	struct tcphdr th;
-	int mss, wsf, sack, ts;
-	struct mbuf *m = NULL;
-	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
-	unsigned int opt;
-	
-#ifdef MAC
-#error	"no MAC support"
-#endif	
-	
-	opt = ntohs(req->tcp_opt);
-	
-	bzero(&toeo, sizeof(struct toeopt));
-	
-	/*
-	 * Fill out information for entering us into the syncache
-	 */
-	bzero(&inc, sizeof(inc));
-	inc.inc_fport = th.th_sport = req->peer_port;
-	inc.inc_lport = th.th_dport = req->local_port;
-	th.th_seq = req->rcv_isn;
-	th.th_flags = TH_ACK;
-	
-	inc.inc_len = 0;
-	inc.inc_faddr.s_addr = req->peer_ip;
-	inc.inc_laddr.s_addr = req->local_ip;
-	
-	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
-	wsf  = G_TCPOPT_WSCALE_OK(opt);
-	ts   = G_TCPOPT_TSTAMP(opt);
-	sack = G_TCPOPT_SACK(opt);
-	
-	toeo.to_mss = mss;
-	toeo.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
-	toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
-
-	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
-	    ntohl(req->local_ip), ntohs(req->local_port),
-	    ntohl(req->peer_ip), ntohs(req->peer_port),
-	    mss, wsf, ts, sack);
-	return tcp_offload_syncache_expand(&inc, &toeo, &th, so, m);
-}
-
-
-/*
- * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
- * if we are in TCP_SYN_RECV due to crossed SYNs
- */
-static int
-do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct cpl_pass_establish *req = cplhdr(m);
-	struct toepcb *toep = (struct toepcb *)ctx;
-	struct tcpcb *tp = toep->tp_tp;
-	struct socket *so, *lso;
-	struct t3c_data *td = T3C_DATA(cdev);
-	struct sockbuf *snd, *rcv;
-	
-	// Complete socket initialization now that we have the SND_ISN
-	
-	struct toedev *tdev;
-
-
-	tdev = toep->tp_toedev;
-
-	inp_wlock(tp->t_inpcb);
-	
-	/*
-	 *
-	 * XXX need to add reference while we're manipulating
-	 */
-	so = lso = inp_inpcbtosocket(tp->t_inpcb);
-
-	inp_wunlock(tp->t_inpcb);
-
-	so_lock(so);
-	LIST_REMOVE(toep, synq_entry);
-	so_unlock(so);
-	
-	if (!syncache_expand_establish_req(req, &so, toep)) {
-		/*
-		 * No entry 
-		 */
-		CXGB_UNIMPLEMENTED();
-	}
-	if (so == NULL) {
-		/*
-		 * Couldn't create the socket
-		 */
-		CXGB_UNIMPLEMENTED();
-	}
-
-	tp = so_sototcpcb(so);
-	inp_wlock(tp->t_inpcb);
-
-	snd = so_sockbuf_snd(so);
-	rcv = so_sockbuf_rcv(so);
-
-	snd->sb_flags |= SB_NOCOALESCE;
-	rcv->sb_flags |= SB_NOCOALESCE;
-
-	toep->tp_tp = tp;
-	toep->tp_flags = 0;
-	tp->t_toe = toep;
-	reset_wr_list(toep);
-	tp->rcv_wnd = select_rcv_wnd(tdev, so);
-	tp->rcv_nxt = toep->tp_copied_seq;
-	install_offload_ops(so);
-	
-	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
-	toep->tp_wr_unacked = 0;
-	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
-	toep->tp_qset_idx = 0;
-	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
-	
-	/*
-	 * XXX Cancel any keep alive timer
-	 */
-	     
-	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
-
-	/*
-	 * XXX workaround for lack of syncache drop
-	 */
-	toepcb_release(toep);
-	inp_wunlock(tp->t_inpcb);
-	
-	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
-	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
-#ifdef notyet
-	/*
-	 * XXX not sure how these checks map to us
-	 */
-	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
-		sk->sk_state_change(sk);
-		sk_wake_async(so, 0, POLL_OUT);
-	}
-	/*
-	 * The state for the new connection is now up to date.
-	 * Next check if we should add the connection to the parent's
-	 * accept queue.  When the parent closes it resets connections
-	 * on its SYN queue, so check if we are being reset.  If so we
-	 * don't need to do anything more, the coming ABORT_RPL will
-	 * destroy this socket.  Otherwise move the connection to the
-	 * accept queue.
-	 *
-	 * Note that we reset the synq before closing the server so if
-	 * we are not being reset the stid is still open.
-	 */
-	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
-		__kfree_skb(skb);
-		goto unlock;
-	}
-#endif
-	m_free(m);
-
-	return (0);
-}
-
 /*
  * Fill in the right TID for CPL messages waiting in the out-of-order queue
  * and send them to the TOE.
@@ -3745,48 +1623,70 @@
 fixup_and_send_ofo(struct toepcb *toep)
 {
 	struct mbuf *m;
-	struct toedev *tdev = toep->tp_toedev;
-	struct tcpcb *tp = toep->tp_tp;
+	struct toedev *tod = toep->tp_tod;
+	struct adapter *sc = tod->tod_softc;
+	struct inpcb *inp = toep->tp_inp;
 	unsigned int tid = toep->tp_tid;
 
-	log(LOG_NOTICE, "fixup_and_send_ofo\n");
-	
-	inp_lock_assert(tp->t_inpcb);
+	inp_lock_assert(inp);
+
 	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
+		struct ofld_hdr *oh = mtod(m, void *);
 		/*
 		 * A variety of messages can be waiting but the fields we'll
 		 * be touching are common to all so any message type will do.
 		 */
-		struct cpl_close_con_req *p = cplhdr(m);
+		struct cpl_close_con_req *p = (void *)(oh + 1);
 
-		p->wr.wr_lo = htonl(V_WR_TID(tid));
+		p->wr.wrh_lo = htonl(V_WR_TID(tid));
 		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
-		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+		t3_offload_tx(sc, m);
 	}
 }
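
[Editor's note: this is the standard defer-until-TID pattern: CPLs built while the connection only had an atid are parked on out_of_order_queue, then stamped with the real TID and transmitted once it arrives. A generic sketch of the pattern under assumed types:

#include <stddef.h>
#include <stdint.h>

struct pending_cpl {
	struct pending_cpl *next;
	uint32_t tid;		/* unknown (0) when the CPL was built */
};

static void
flush_pending(struct pending_cpl **q, uint32_t tid,
    void (*xmit)(struct pending_cpl *))
{
	struct pending_cpl *p;

	while ((p = *q) != NULL) {
		*q = p->next;
		p->tid = tid;	/* patch in the TID we now have */
		xmit(p);
	}
}
]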
 
 /*
- * Updates socket state from an active establish CPL message.  Runs with the
- * socket lock held.
+ * Process a CPL_ACT_ESTABLISH message.
  */
-static void
-socket_act_establish(struct socket *so, struct mbuf *m)
+static int
+do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct cpl_act_establish *req = cplhdr(m);
-	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toepcb *toep = tp->t_toe;
-	
-	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
-		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
-		    toep->tp_tid, tp->t_state);
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_act_establish *req = mtod(m, void *);
+	unsigned int tid = GET_TID(req);
+	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
+	struct inpcb *inp = toep->tp_inp;
+	struct tcpcb *tp;
+	struct socket *so; 
 
-	tp->ts_recent_age = ticks;
-	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
-	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
+	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);
 
-	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
-	
+	free_atid(&td->tid_maps, atid);
+
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+
+	KASSERT(toep->tp_qset == qs->idx,
+	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
+	KASSERT(toep->tp_tid == atid,
+	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));
+
+	toep->tp_tid = tid;
+	insert_tid(td, toep, tid);
+
+	if (inp->inp_flags & INP_DROPPED) {
+		/* socket closed by the kernel before hw told us it connected */
+		send_reset(toep);
+		goto done;
+	}
+
+	KASSERT(tp->t_state == TCPS_SYN_SENT,
+	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));
+
+	so = inp->inp_socket;
+	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);
+
 	/*
 	 * Now that we finally have a TID send any CPL messages that we had to
 	 * defer for lack of a TID.
@@ -3794,80 +1694,9 @@
 	if (mbufq_len(&toep->out_of_order_queue))
 		fixup_and_send_ofo(toep);
 
-	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
-		/*
-		 * XXX does this even make sense?
-		 */
-		so_sorwakeup(so);
-	}
-	m_free(m);
-#ifdef notyet
-/*
- * XXX assume no write requests permitted while socket connection is
- * incomplete
- */
-	/*
-	 * Currently the send queue must be empty at this point because the
-	 * socket layer does not send anything before a connection is
-	 * established.  To be future proof though we handle the possibility
-	 * that there are pending buffers to send (either TX_DATA or
-	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
-	 * buffers according to the just learned write_seq, and then we send
-	 * them on their way.
-	 */
-	fixup_pending_writeq_buffers(sk);
-	if (t3_push_frames(so, 1))
-		sk->sk_write_space(sk);
-#endif
-
-	toep->tp_state = tp->t_state;
-	KMOD_TCPSTAT_INC(tcps_connects);
-				
-}
-
-/*
- * Process a CPL_ACT_ESTABLISH message.
- */
-static int
-do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct cpl_act_establish *req = cplhdr(m);
-	unsigned int tid = GET_TID(req);
-	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
-	struct toepcb *toep = (struct toepcb *)ctx;
-	struct tcpcb *tp = toep->tp_tp;
-	struct socket *so; 
-	struct toedev *tdev;
-	struct tom_data *d;
-	
-	if (tp == NULL) {
-		free_atid(cdev, atid);
-		return (0);
-	}
-	inp_wlock(tp->t_inpcb);
-
-	/*
-	 * XXX
-	 */
-	so = inp_inpcbtosocket(tp->t_inpcb);
-	tdev = toep->tp_toedev; /* blow up here if link was down */
-	d = TOM_DATA(tdev);
-
-	/*
-	 * It's OK if the TID is currently in use, the owning socket may have
-	 * backlogged its last CPL message(s).  Just take it away.
-	 */
-	toep->tp_tid = tid;
-	toep->tp_tp = tp;
-	so_insert_tid(d, toep, tid);
-	free_atid(cdev, atid);
-	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
-
-	socket_act_establish(so, m);
-	inp_wunlock(tp->t_inpcb);
-	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
-	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
-
+done:
+	INP_WUNLOCK(inp);
+	m_freem(m);
 	return (0);
 }
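
[Editor's note: the rewritten handler looks the connection up by its atid, retires the atid, and republishes the toepcb under the hardware-assigned tid before finishing TCP state; if the inpcb was dropped in the meantime it sends a reset instead. A toy model of the table hand-off (hypothetical arrays, locking omitted):

#include <stddef.h>
#include <stdint.h>

#define NIDS	1024

struct conn;
static struct conn *atid_tab[NIDS];	/* embryonic active-open conns */
static struct conn *tid_tab[NIDS];	/* fully established conns */

static struct conn *
promote_atid(uint32_t atid, uint32_t tid)
{
	struct conn *c = atid_tab[atid];

	atid_tab[atid] = NULL;		/* free_atid() */
	tid_tab[tid] = c;		/* insert_tid() */
	return (c);
}
]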
 
@@ -3878,97 +1707,66 @@
 static void
 wr_ack(struct toepcb *toep, struct mbuf *m)
 {
-	struct tcpcb *tp = toep->tp_tp;
-	struct cpl_wr_ack *hdr = cplhdr(m);
+	struct inpcb *inp = toep->tp_inp;
+	struct tcpcb *tp;
+	struct cpl_wr_ack *hdr = mtod(m, void *);
 	struct socket *so;
 	unsigned int credits = ntohs(hdr->credits);
 	u32 snd_una = ntohl(hdr->snd_una);
 	int bytes = 0;
 	struct sockbuf *snd;
-	
-	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
+	struct mbuf *p;
+	struct ofld_hdr *oh;
 
-	inp_wlock(tp->t_inpcb);
-	so = inp_inpcbtosocket(tp->t_inpcb);
+	inp_wlock(inp);
+	tp = intotcpcb(inp);
+	so = inp->inp_socket;
 	toep->tp_wr_avail += credits;
 	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
 		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
 
 	while (credits) {
-		struct mbuf *p = peek_wr(toep);
-		
+		p = peek_wr(toep);
+
 		if (__predict_false(!p)) {
+			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
+			    "tid %u, state %u, wr_avail %u", __func__, credits,
+			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);
+
 			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
 			    "nothing pending, state %u wr_avail=%u\n",
 			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
 			break;
 		}
-		CTR2(KTR_TOM,
-			"wr_ack: p->credits=%d p->bytes=%d",
-		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
-		KASSERT(p->m_pkthdr.csum_data != 0,
-		    ("empty request still on list"));
 
-		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
+		oh = mtod(p, struct ofld_hdr *);
 
-#if DEBUG_WR > 1
-			struct tx_data_wr *w = cplhdr(p);
-			log(LOG_ERR,
-			       "TID %u got %u WR credits, need %u, len %u, "
-			       "main body %u, frags %u, seq # %u, ACK una %u,"
-			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
-			       toep->tp_tid, credits, p->csum, p->len,
-			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
-			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
-			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
-#endif
-			p->m_pkthdr.csum_data -= credits;
-			break;
-		} else {
-			dequeue_wr(toep);
-			credits -= p->m_pkthdr.csum_data;
-			bytes += p->m_pkthdr.len;
-			CTR3(KTR_TOM,
-			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
-			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
-	
-			m_free(p);
-		}
+		KASSERT(credits >= G_HDR_NDESC(oh->flags),
+		    ("%s: partial credits?  %d %d", __func__, credits,
+		    G_HDR_NDESC(oh->flags)));
+
+		dequeue_wr(toep);
+		credits -= G_HDR_NDESC(oh->flags);
+		bytes += oh->plen;
+
+		if (oh->flags & F_HDR_SGL)
+			sglist_free(oh->sgl);
+		m_freem(p);
 	}
 
-#if DEBUG_WR
-	check_wr_invariants(tp);
-#endif
-
-	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
-#if VALIDATE_SEQ
-		struct tom_data *d = TOM_DATA(TOE_DEV(so));
-
-		log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
-		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
-		    toep->tp_tid, tp->snd_una);
-#endif
+	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
 		goto out_free;
-	}
 
 	if (tp->snd_una != snd_una) {
 		tp->snd_una = snd_una;
-		tp->ts_recent_age = ticks;
-#ifdef notyet
-		/*
-		 * Keep ARP entry "minty fresh"
-		 */
-		dst_confirm(sk->sk_dst_cache);
-#endif
+		tp->ts_recent_age = tcp_ts_getticks();
 		if (tp->snd_una == tp->snd_nxt)
 			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
 	}
 
 	snd = so_sockbuf_snd(so);
 	if (bytes) {
-		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
-		snd = so_sockbuf_snd(so);
-		sockbuf_lock(snd);		
+		SOCKBUF_LOCK(snd);
 		sbdrop_locked(snd, bytes);
 		so_sowwakeup_locked(so);
 	}
@@ -3978,142 +1776,25 @@
 
 out_free:
 	inp_wunlock(tp->t_inpcb);
-	m_free(m);
+	m_freem(m);
 }
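
[Editor's note: the new accounting is simpler than what it replaces: every work request records its descriptor count in the ofld_hdr, WRs are only ever acknowledged whole (hence the KASSERT), and the acknowledged payload bytes are dropped from the send buffer in one sbdrop. A standalone model of the loop:

#include <stdint.h>

struct wr_model {
	uint32_t ndesc;		/* descriptors (credits) this WR consumed */
	uint32_t plen;		/* payload bytes it carried */
};

static uint32_t
consume_credits(const struct wr_model *wrs, int nwrs, uint32_t credits,
    uint32_t *drop_bytes)
{
	int i;

	*drop_bytes = 0;
	for (i = 0; i < nwrs && credits >= wrs[i].ndesc; i++) {
		credits -= wrs[i].ndesc;	/* whole WRs only */
		*drop_bytes += wrs[i].plen;	/* sbdrop() this much */
	}
	/* Leftover credits with nothing pending make the driver log an error. */
	return (credits);
}
]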
 
 /*
  * Handler for TX_DATA_ACK CPL messages.
  */
 static int
-do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
+do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct toepcb *toep = (struct toepcb *)ctx;
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_wr_ack *hdr = mtod(m, void *);
+	unsigned int tid = GET_TID(hdr);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 
-	VALIDATE_SOCK(so);
+	/* XXX bad race */
+	if (toep)
+		wr_ack(toep, m);
 
-	wr_ack(toep, m);
-	return 0;
-}
-
-/*
- * Handler for TRACE_PKT CPL messages.  Just sink these packets.
- */
-static int
-do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
-{
-	m_freem(m);
-	return 0;
-}
-
-/*
- * Reset a connection that is on a listener's SYN queue or accept queue,
- * i.e., one that has not had a struct socket associated with it.
- * Must be called from process context.
- *
- * Modeled after code in inet_csk_listen_stop().
- */
-static void
-t3_reset_listen_child(struct socket *child)
-{
-	struct tcpcb *tp = so_sototcpcb(child);
-	
-	t3_send_reset(tp->t_toe);
-}
-
-
-static void
-t3_child_disconnect(struct socket *so, void *arg)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-		
-	if (tp->t_flags & TF_TOE) {
-		inp_wlock(tp->t_inpcb);
-		t3_reset_listen_child(so);
-		inp_wunlock(tp->t_inpcb);
-	}	
-}
-
-/*
- * Disconnect offloaded established but not yet accepted connections sitting
- * on a server's accept_queue.  We just send an ABORT_REQ at this point and
- * finish off the disconnect later as we may need to wait for the ABORT_RPL.
- */
-void
-t3_disconnect_acceptq(struct socket *listen_so)
-{
-
-	so_lock(listen_so);
-	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
-	so_unlock(listen_so);
-}
-
-/*
- * Reset offloaded connections sitting on a server's syn queue.  As above
- * we send ABORT_REQ and finish off when we get ABORT_RPL.
- */
-
-void
-t3_reset_synq(struct listen_ctx *lctx)
-{
-	struct toepcb *toep;
-
-	so_lock(lctx->lso);	
-	while (!LIST_EMPTY(&lctx->synq_head)) {
-		toep = LIST_FIRST(&lctx->synq_head);
-		LIST_REMOVE(toep, synq_entry);
-		toep->tp_tp = NULL;
-		t3_send_reset(toep);
-		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
-		toepcb_release(toep);
-	}
-	so_unlock(lctx->lso); 
-}
-
-
-int
-t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
-		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
-		   unsigned int pg_off, unsigned int color)
-{
-	unsigned int i, j, pidx;
-	struct pagepod *p;
-	struct mbuf *m;
-	struct ulp_mem_io *req;
-	unsigned int tid = toep->tp_tid;
-	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
-	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
-
-	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
-	    gl, nppods, tag, maxoff, pg_off, color);
-	
-	for (i = 0; i < nppods; ++i) {
-		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
-		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-		req = mtod(m, struct ulp_mem_io *);
-		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
-		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
-		req->wr.wr_lo = 0;
-		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
-					   V_ULPTX_CMD(ULP_MEM_WRITE));
-		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
-				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
-
-		p = (struct pagepod *)(req + 1);
-		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
-			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
-			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
-						  V_PPOD_COLOR(color));
-			p->pp_max_offset = htonl(maxoff);
-			p->pp_page_offset = htonl(pg_off);
-			p->pp_rsvd = 0;
-			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
-				p->pp_addr[j] = pidx < gl->dgl_nelem ?
-				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
-		} else
-			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
-		send_or_defer(toep, m, 0);
-		ppod_addr += PPOD_SIZE;
-	}
 	return (0);
 }
 
@@ -4153,10 +1834,7 @@
                      unsigned int word, uint64_t mask, uint64_t val)
 {
 	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
-	
-	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
-	    tid, word, mask, val);
-	
+
 	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
 	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
@@ -4167,294 +1845,19 @@
 	req->val = htobe64(val);
 }
 
-/*
- * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
- */
-static void
-mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
-    unsigned int tid, unsigned int credits)
+void
+t3_init_cpl_io(struct adapter *sc)
 {
-	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
-
-	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
-	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
-	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
-	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
-	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
-				 V_RX_CREDITS(credits));
+	t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
+	t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
+	t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify);
+	t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
+	t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack);
+	t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
+	t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
+	t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
+	t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
+	t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl);
+	t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
 }
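
[Editor's note: registration is now per-adapter and each handler takes the (qset, response descriptor, mbuf) triple directly, rather than going through the old global t3tom table keyed only on opcode. A sketch of the dispatch such a table implies (forward declarations stand in for the real driver types):

#include <stdint.h>

struct sge_qset;
struct rsp_desc;
struct mbuf;

typedef int (*cpl_handler_t)(struct sge_qset *, struct rsp_desc *,
    struct mbuf *);

static cpl_handler_t cpl_handlers[256];	/* indexed by CPL opcode */

static int
dispatch_cpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m,
    uint8_t opcode)
{
	return (cpl_handlers[opcode] ? cpl_handlers[opcode](qs, r, m) : -1);
}
]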
-
-void
-t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
-{
-	unsigned int wrlen;
-	struct mbuf *m;
-	struct work_request_hdr *wr;
-	struct cpl_barrier *lock;
-	struct cpl_set_tcb_field *req;
-	struct cpl_get_tcb *getreq;
-	struct ddp_state *p = &toep->tp_ddp_state;
-
-#if 0
-	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
 #endif
-	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
-		sizeof(*getreq);
-	m = m_gethdr_nofail(wrlen);
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	wr = mtod(m, struct work_request_hdr *);
-	bzero(wr, wrlen);
-	
-	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
-	m->m_pkthdr.len = m->m_len = wrlen;
-
-	lock = (struct cpl_barrier *)(wr + 1);
-	mk_cpl_barrier_ulp(lock);
-
-	req = (struct cpl_set_tcb_field *)(lock + 1);
-
-	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
-
-	/* Hmmm, not sure if this is actually a good thing: reactivating
-	 * the other buffer might be an issue if it has been completed
-	 * already. However, that is unlikely, since the fact that the UBUF
-	 * is not completed indicates that there is no outstanding data.
-	 */
-	if (bufidx == 0)
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
-				     V_TF_DDP_ACTIVE_BUF(1) |
-				     V_TF_DDP_BUF0_VALID(1),
-				     V_TF_DDP_ACTIVE_BUF(1));
-	else
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
-				     V_TF_DDP_ACTIVE_BUF(1) |
-				     V_TF_DDP_BUF1_VALID(1), 0);
-
-	getreq = (struct cpl_get_tcb *)(req + 1);
-	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
-
-	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
-
-	/* Keep track of the number of outstanding CPL_GET_TCB requests
-	 */
-	p->get_tcb_count++;
-	
-#ifdef T3_TRACE
-	T3_TRACE1(TIDTB(so),
-		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
-#endif
-	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
-}
-
-/**
- * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
- * @sk: the socket associated with the buffers
- * @bufidx: index of HW DDP buffer (0 or 1)
- * @tag0: new tag for HW buffer 0
- * @tag1: new tag for HW buffer 1
- * @len: new length for HW buf @bufidx
- *
- * Sends a compound WR to overlay a new DDP buffer on top of an existing
- * buffer by changing the buffer tag and length and setting the valid and
- * active flag accordingly.  The caller must ensure the new buffer is at
- * least as big as the existing one.  Since we typically reprogram both HW
- * buffers, this function sets both tags for convenience.  Read the TCB to
- * determine how much data was written into the buffer before the overlay
- * took place.
- */
-void
-t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
-	 	       unsigned int tag1, unsigned int len)
-{
-	unsigned int wrlen;
-	struct mbuf *m;
-	struct work_request_hdr *wr;
-	struct cpl_get_tcb *getreq;
-	struct cpl_set_tcb_field *req;
-	struct ddp_state *p = &toep->tp_ddp_state;
-
-	CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
-	    bufidx, tag0, tag1, len);
-#if 0
-	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
-#endif	
-	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
-	m = m_gethdr_nofail(wrlen);
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	wr = mtod(m, struct work_request_hdr *);
-	m->m_pkthdr.len = m->m_len = wrlen;
-	bzero(wr, wrlen);
-
-	
-	/* Set the ATOMIC flag to make sure that TP processes the following
-	 * CPLs in an atomic manner and no wire segments can be interleaved.
-	 */
-	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
-	req = (struct cpl_set_tcb_field *)(wr + 1);
-	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
-			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
-			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
-			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
-			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
-	req++;
-	if (bufidx == 0) {
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
-			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
-			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
-		req++;
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
-			    V_TF_DDP_PUSH_DISABLE_0(1) |
-			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
-			    V_TF_DDP_PUSH_DISABLE_0(0) |
-			    V_TF_DDP_BUF0_VALID(1));
-	} else {
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
-			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
-			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
-		req++;
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
-			    V_TF_DDP_PUSH_DISABLE_1(1) |
-			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
-			    V_TF_DDP_PUSH_DISABLE_1(0) |
-			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
-	}
-
-	getreq = (struct cpl_get_tcb *)(req + 1);
-	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
-
-	/* Keep track of the number of outstanding CPL_GET_TCB requests
-	 */
-	p->get_tcb_count++;
-
-#ifdef T3_TRACE
-	T3_TRACE4(TIDTB(sk),
-		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
-		  "len %d",
-		  bufidx, tag0, tag1, len);
-#endif
-	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
-}
-
-/*
- * Sends a compound WR containing all the CPL messages needed to program the
- * two HW DDP buffers, namely optionally setting up the length and offset of
- * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
- */
-void
-t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
-		      unsigned int len1, unsigned int offset1,
-                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
-{
-	unsigned int wrlen;
-	struct mbuf *m;
-	struct work_request_hdr *wr;
-	struct cpl_set_tcb_field *req;
-
-	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
-	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
-	
-#if 0
-	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
-#endif
-	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
-		(len1 ? sizeof(*req) : 0) +
-		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
-	m = m_gethdr_nofail(wrlen);
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	wr = mtod(m, struct work_request_hdr *);
-	bzero(wr, wrlen);
-	
-	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
-	m->m_pkthdr.len = m->m_len = wrlen;
-
-	req = (struct cpl_set_tcb_field *)(wr + 1);
-	if (len0) {                  /* program buffer 0 offset and length */
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
-			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
-			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
-			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
-			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
-		req++;
-	}
-	if (len1) {                  /* program buffer 1 offset and length */
-		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
-			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
-			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
-			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
-			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
-		req++;
-	}
-
-	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
-			     ddp_flags);
-
-	if (modulate) {
-		mk_rx_data_ack_ulp(toep,
-		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
-		    toep->tp_copied_seq - toep->tp_rcv_wup);
-		toep->tp_rcv_wup = toep->tp_copied_seq;
-	}
-
-#ifdef T3_TRACE
-	T3_TRACE5(TIDTB(sk),
-		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
-		  "modulate %d",
-		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
-		  modulate);
-#endif
-
-	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
-}
-
-void
-t3_init_wr_tab(unsigned int wr_len)
-{
-	int i;
-
-	if (mbuf_wrs[1])     /* already initialized */
-		return;
-
-	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
-		int sgl_len = (3 * i) / 2 + (i & 1);
-
-		sgl_len += 3;
-		mbuf_wrs[i] = sgl_len <= wr_len ?
-		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
-	}
-
-	wrlen = wr_len * 8;
-}
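
[Editor's note: the removed table encodes how many work requests an mbuf with i SGL segments needs: the SGL takes (3*i)/2 + (i & 1) flits plus 3 flits of header, and anything that doesn't fit in one WR spills into continuation WRs of wr_len - 1 usable flits. Restated as a function, with a worked example: 10 segments and a 16-flit WR give an 18-flit request, i.e. 2 WRs.

static unsigned int
wrs_for_sgl(unsigned int nsegs, unsigned int wr_len)
{
	unsigned int sgl_len = (3 * nsegs) / 2 + (nsegs & 1) + 3;

	return (sgl_len <= wr_len ? 1 : 1 + (sgl_len - 2) / (wr_len - 1));
}
]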
-
-int
-t3_init_cpl_io(void)
-{
-#ifdef notyet
-	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
-	if (!tcphdr_skb) {
-		log(LOG_ERR,
-		       "Chelsio TCP offload: can't allocate sk_buff\n");
-		return -1;
-	}
-	skb_put(tcphdr_skb, sizeof(struct tcphdr));
-	tcphdr_skb->h.raw = tcphdr_skb->data;
-	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
-#endif
-	
-	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
-	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
-	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
-	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
-	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
-	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
-	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
-	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
-	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
-	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
-	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
-	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
-	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
-	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
-	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
-	return (0);
-}
-
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1034 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007-2008, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/types.h>
-#include <sys/fcntl.h>
-#include <sys/kernel.h>
-#include <sys/limits.h>
-#include <sys/lock.h>
-#include <sys/mbuf.h>
-#include <sys/condvar.h>
-#include <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/smp.h>
-#include <sys/sockstate.h>
-#include <sys/sockopt.h>
-#include <sys/socket.h>
-#include <sys/sockbuf.h>
-#include <sys/syslog.h>
-#include <sys/uio.h>
-#include <sys/file.h>
-
-#include <machine/bus.h>
-#include <machine/cpu.h>
-
-#include <net/if.h>
-#include <net/route.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
-
-#include <cxgb_osdep.h>
-#include <sys/mbufq.h>
-#include <ulp/tom/cxgb_tcp_offload.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_offload.h>
-#include <net/route.h>
-
-#include <t3cdev.h>
-#include <common/cxgb_firmware_exports.h>
-#include <common/cxgb_t3_cpl.h>
-#include <common/cxgb_tcb.h>
-#include <common/cxgb_ctl_defs.h>
-#include <cxgb_offload.h>
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
-#include <vm/vm_extern.h>
-#include <vm/pmap.h>
-
-#include <sys/mvec.h>
-#include <ulp/toecore/cxgb_toedev.h>
-#include <ulp/tom/cxgb_defs.h>
-#include <ulp/tom/cxgb_tom.h>
-#include <ulp/tom/cxgb_t3_ddp.h>
-#include <ulp/tom/cxgb_toepcb.h>
-#include <ulp/tom/cxgb_tcp.h>
-
-
-static int	(*pru_sosend)(struct socket *so, struct sockaddr *addr,
-    struct uio *uio, struct mbuf *top, struct mbuf *control,
-    int flags, struct thread *td);
-
-static int	(*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
-    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
-    int *flagsp);
-
-#define TMP_IOV_MAX 16
-#ifndef PG_FRAME
-#define PG_FRAME	~PAGE_MASK
-#endif
-#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
-
-void
-t3_init_socket_ops(void)
-{
-	struct protosw *prp;
-
-	prp = pffindtype(AF_INET, SOCK_STREAM);
-	pru_sosend = prp->pr_usrreqs->pru_sosend;
-	pru_soreceive = prp->pr_usrreqs->pru_soreceive;
-}
-
-struct cxgb_dma_info {
-	size_t			cdi_mapped;
-	int			cdi_nsegs;
-	bus_dma_segment_t	*cdi_segs;
-	
-};
-
-static void
-cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
-    bus_size_t mapsize, int error)
-{
-	struct cxgb_dma_info *cdi = arg;
-	
-	cdi->cdi_mapped = mapsize;
-	cdi->cdi_nsegs = nsegs;
-	cdi->cdi_segs = segs;
-}
-
-static void
-iov_adj(struct iovec **iov, int *iovcnt, size_t count)
-{
-	struct iovec *iovtmp;
-	int iovcnttmp;
-	caddr_t ptmp;
-	
-	if (count > 0) {
-		iovtmp = *iov;
-		iovcnttmp = *iovcnt;
-		while (count > 0) {
-			if (count < iovtmp->iov_len) {
-				ptmp = iovtmp->iov_base;
-				ptmp += count; 
-				iovtmp->iov_base = ptmp;
-				iovtmp->iov_len -= count;
-				break;
-			} else 
-				count -= iovtmp->iov_len;
-			iovtmp++;
-			iovcnttmp--;
-		} 
-		*iov = iovtmp;
-		*iovcnt = iovcnttmp;
-	} else if (count < 0) {
-		iovtmp = &(*iov)[*iovcnt - 1];
-		iovcnttmp = *iovcnt;
-		while (count < 0) {
-			if (-count < iovtmp->iov_len) {
-				iovtmp->iov_len += count;
-				break;
-			} else
-				count += iovtmp->iov_len;
-			iovtmp--;
-			iovcnttmp--;
-		}
-		*iovcnt = iovcnttmp;
-	}
-}
-
-static void
-cxgb_zero_copy_free(void *cl, void *arg)
-{
-	struct mbuf_vec *mv;
-	struct mbuf *m = (struct mbuf *)cl;
-
-	mv = mtomv(m);
-	/*
-	 * These are physical addresses; don't try to free them.  The pages
-	 * are unheld separately from sbdrop.
-	 */
-	mv->mv_count = 0;
-	m_free_iovec(m, m->m_type);
-}
-
-
-static int
-cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, vm_prot_t prot)
-{
-	struct iovec *iov = uio->uio_iov;
-	int iovcnt = uio->uio_iovcnt;
-	int err, i, count, totcount, maxcount, totbytes, npages, curbytes;
-	uint64_t start, end;
-	vm_page_t *mp;
-	vm_map_t map;
-
-	map = &uio->uio_td->td_proc->p_vmspace->vm_map;
-	totbytes = totcount = 0;
-	maxcount = *held;
-
-	mp = m;
-	for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount);  i++, iov++) {
-		count = maxcount - totcount;
-		    
-		start = (uintptr_t)iov->iov_base;
-		end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len);
-		start &= PG_FRAME;
-		end += PAGE_MASK;
-		end &= PG_FRAME;
-		npages = (end - start) >> PAGE_SHIFT;
-		
-		count = min(count, npages);
-
-		/* The following return value is not used. XXX */
-		err = vm_fault_quick_hold_pages(map,
-		    (vm_offset_t)iov->iov_base, iov->iov_len, prot, mp, count);
-		mp += count;
-		totcount += count;
-		curbytes = iov->iov_len;
-		if (count != npages)
-			curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK);
-		totbytes += curbytes;
-	}
-	uio->uio_resid -= totbytes;
-
-	return (0);
-}
-
-/*
- * Returns whether a connection should enable DDP.  This happens when all of
- * the following conditions are met:
- * - the connection's ULP mode is DDP
- * - DDP is not already enabled
- * - the last receive was above the DDP threshold
- * - receive buffers are in user space
- * - receive side isn't shutdown (handled by caller)
- * - the connection's receive window is big enough so that sizable buffers
- *   can be posted without closing the window in the middle of DDP (checked
- *   when the connection is offloaded)
- */
-static int
-so_should_ddp(const struct toepcb *toep, int last_recv_len)
-{
-
-	DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n",
-	    toep->tp_ulp_mode, last_recv_len,  TOM_TUNABLE(toep->tp_toedev, ddp_thres),
-	    toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN));
-
-	return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) &&
-	       last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
-	       toep->tp_tp->rcv_wnd > 
-	           (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
-}
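
[Editor's note: the conditions enumerated in the comment above collapse into a single predicate in so_should_ddp. A condensed standalone restatement, with hypothetical parameter names in place of the tunables:

#include <stdbool.h>

static bool
should_enable_ddp(bool ulp_mode_is_ddp, bool ddp_already_on,
    int last_recv_len, int ddp_thres, long rcv_wnd, int copy_limit,
    int rsvd_win)
{
	return (ulp_mode_is_ddp && !ddp_already_on &&
	    last_recv_len > ddp_thres &&
	    rcv_wnd > (long)copy_limit + rsvd_win);
}
]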
-
-static inline int
-is_ddp(const struct mbuf *m)
-{
-	return ((m->m_flags & M_DDP) != 0);
-}
-
-static inline int
-is_ddp_psh(const struct mbuf *m)
-{
-        return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0);
-}
-
-static int
-m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
-{
-	int curlen, startlen, resid_init, err = 0;
-	caddr_t buf;
-
-	DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n",
-	    m, offset, len);
-
-	startlen = len;
-	resid_init = uio->uio_resid;
-	while (m && len) {
-		buf = mtod(m, caddr_t);
-		curlen = m->m_len;
-		if (offset && (offset < curlen)) {
-			curlen -= offset;
-			buf += offset;
-			offset = 0;
-		} else if (offset) {
-			offset -= curlen;
-			m = m->m_next;
-			continue;
-		}
-		err = uiomove(buf, min(len, curlen), uio);
-		if (err) {
-			printf("uiomove returned %d\n", err);
-			return (err);
-		}
-		
-		len -= min(len, curlen);
-		m = m->m_next;
-	}
-	DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n",
-	    startlen - len, resid_init, uio->uio_resid);
-	return (err);
-}
-
-/*
- * Copy data from an mbuf to an iovec.  Deals with RX_DATA, which carries the
- * data in the mbuf body, and with RX_DATA_DDP, which places the data in a
- * DDP buffer.
- */
-static inline int
-copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
-{
-	struct iovec *to = uio->uio_iov;
-	int err;
-	
-	if (__predict_true(!is_ddp(m)))                              /* RX_DATA */
-		return m_uiomove(m, offset, len, uio);
-	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
-		to->iov_len -= len;
-		to->iov_base = ((caddr_t)to->iov_base) + len;
-		uio->uio_iov = to;
-		uio->uio_resid -= len;
-		return (0);
-	}
-	err = t3_ddp_copy(m, offset, uio, len);             /* kernel DDP */
-	return (err);
-}
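
[Editor's note: for the user-DDP case the NIC has already written the bytes into the destination buffer, so the "copy" degenerates to advancing the iovec, as the DDP_BF_NOCOPY branch above shows. A self-contained model of that step, using simplified uio/iovec types:

#include <stddef.h>

struct iovec_m { char *base; size_t len; };
struct uio_m { struct iovec_m *iov; size_t resid; };

static void
consume_inplace(struct uio_m *u, size_t len)
{
	/* Data is already in place; just account for it. */
	u->iov->base += len;
	u->iov->len -= len;
	u->resid -= len;
}
]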
-
-static void
-cxgb_wait_dma_completion(struct toepcb *toep)
-{
-	struct rwlock *lock;
-	
-	lock = &toep->tp_tp->t_inpcb->inp_lock;
-	inp_wlock(toep->tp_tp->t_inpcb);
-	cv_wait_unlock(&toep->tp_cv, lock);
-}
-
-static int
-cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
-{
-	int i, seg_count, err, type;
-	struct mbuf *m0;
-	struct cxgb_dma_info cdi;
-	struct mbuf_vec *mv;
-	struct mbuf_iovec *mi;
-	bus_dma_segment_t *segs;
-	
-	err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
-	    cxgb_dma_callback, &cdi, 0);
-
-	if (err)
-		return (err);
-	seg_count = cdi.cdi_nsegs;	
-	if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
-		bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
-		return (ENOMEM);
-	}
-	segs = cdi.cdi_segs;
-	m0->m_type = type;
-	m0->m_flags = (M_EXT|M_NOFREE);
-	m0->m_ext.ext_type = EXT_EXTREF;
-	m0->m_ext.ext_free = cxgb_zero_copy_free;
-#if __FreeBSD_version >= 800016
-	m0->m_ext.ext_arg1 = NULL;	/* XXX: probably wrong /phk */
-	m0->m_ext.ext_arg2 = NULL;
-#else
-	m0->m_ext.ext_args = NULL;
-#endif
-    
-	mv = mtomv(m0);
-	mv->mv_count = seg_count;
-	mv->mv_first = 0;
-	for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
-		mi_collapse_sge(mi, segs);
-
-	*m = m0;
-
-	/*
-	 * This appears to be a no-op at the moment, as busdma is
-	 * all-or-nothing; we need to make sure the tag values are
-	 * large enough.
-	 *
-	 */
-	if (cdi.cdi_mapped < uio->uio_resid) {
-		uio->uio_resid -= cdi.cdi_mapped;
-	} else
-		uio->uio_resid = 0;
-
-	return (0);
-}
-
-static int
-t3_sosend(struct socket *so, struct uio *uio)
-{
-	int rv, count, hold_resid, sent, iovcnt;
-	struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toepcb *toep = tp->t_toe;
-	struct mbuf *m;
-	struct uio uiotmp;
-	struct sockbuf *snd;
-	
-	/*
-	 * Events requiring iteration:
-	 *  - number of pages exceeds max hold pages for process or system
-	 *  - number of pages exceeds maximum sg entries for a single WR
-	 *
-	 * We're limited to holding 128 pages at once - and we're limited to
-	 * 34 SG entries per work request, but each SG entry can be any number 
-	 * of contiguous pages
-	 *
-	 */
-
-	uiotmp = *uio;
-	iovcnt = uio->uio_iovcnt;
-	iov = uio->uio_iov;
-	sent = 0;
-	snd = so_sockbuf_snd(so);
-sendmore:
-	/*
-	 * Make sure we don't exceed the socket buffer
-	 */
-	count = min(toep->tp_page_count, (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
-	rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, VM_PROT_READ);
-	hold_resid = uiotmp.uio_resid;
-	if (rv)
-		return (rv);
-
-	/*
-	 * Bump past sent and shave off the unheld amount
-	 */
-	if (hold_resid  > 0) {
-		iovtmpp = iovtmp;
-		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
-		if (sent)
-			iov_adj(&iovtmpp, &iovcnt, sent);
-		iov_adj(&iovtmpp, &iovcnt, -hold_resid);
-		uiotmp.uio_iov = iovtmpp;
-		uiotmp.uio_iovcnt = iovcnt;
-
-	}
-	uiotmp.uio_resid = uio->uio_resid - hold_resid;
-	
-	/*
-	 * Push off all held pages
-	 *
-	 */
-	while (uiotmp.uio_resid > 0) {
-		rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
-		if (rv) {
-			vm_page_unhold_pages(toep->tp_pages, count);
-			return (rv);
-		}
-		uio->uio_resid -= m->m_pkthdr.len;
-		sent += m->m_pkthdr.len;
-		sbappend(snd, m);
-		t3_push_frames(so, TRUE);
-		iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
-	}
-
-	/*
-	 * Wait for pending I/O to be DMA'd to the card 
-	 * 
-	 */
-	cxgb_wait_dma_completion(toep);
-	vm_page_unhold_pages(toep->tp_pages, count);
-	/*
-	 * If there is more data to send adjust local copy of iov
-	 * If there is more data to send, adjust the local copy of the iov
-	 * to point to the start.
-	if (hold_resid) {
-		iovtmpp = iovtmp;
-		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
-		iov_adj(&iovtmpp, &iovcnt, sent);
-		uiotmp = *uio;
-		uiotmp.uio_iov = iovtmpp;
-		uiotmp.uio_iovcnt = iovcnt;
-		goto sendmore;
-	}
-
-	return (0);
-}
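
[Editor's note: the loop above is bounded by the two limits called out in the comment at the top of t3_sosend: at most 128 pages held at once and 34 SG entries per work request, so a large uio is pushed in passes. A trivial illustration of the resulting pass count (the constant comes from that comment, not from driver headers):

#include <stddef.h>

#define MAX_HOLD_PAGES	128	/* per-pass page-hold limit (see comment) */

static size_t
sosend_passes(size_t total_pages)
{
	return ((total_pages + MAX_HOLD_PAGES - 1) / MAX_HOLD_PAGES);
}
]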
-
-static int
-cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
-    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toedev *tdev; 
-	int zcopy_thres, zcopy_enabled, rv;
-
-	/*
-	 * In order to use DMA direct from userspace the following
-	 * conditions must be met:
-	 *  - the connection is currently offloaded
-	 *  - ddp is enabled
-	 *  - the number of bytes to be transferred exceeds the threshold
-	 *  - the number of bytes currently in flight won't exceed the in-flight
-	 *    threshold XXX TODO
-	 *  - vm_fault_quick_hold_pages succeeds
-	 *  - blocking socket XXX for now
-	 *
-	 */
-	if (tp && tp->t_flags & TF_TOE) {
-		struct toepcb *toep = tp->t_toe;
-		
-		tdev = toep->tp_toedev;
-		zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
-		zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);
-
-		if (uio && (uio->uio_resid > zcopy_thres) &&
-		    (uio->uio_iovcnt < TMP_IOV_MAX) &&  ((so_state_get(so) & SS_NBIO) == 0)
-		    && zcopy_enabled) {
-			rv = t3_sosend(so, uio);
-			if (rv != EAGAIN)
-				return (rv);
-		}
-	}
-	return pru_sosend(so, addr, uio, top, control, flags, td);
-}
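
[Editor's note: the gate in cxgb_sosend falls back to the stock pru_sosend unless every zero-copy precondition listed in its comment holds. Restated as one predicate, with hypothetical names for the tunables and state bits:

#include <stdbool.h>
#include <stddef.h>

static bool
zcopy_ok(bool offloaded, bool zcopy_enabled, size_t resid,
    size_t zcopy_thres, int iovcnt, int iov_max, bool nonblocking)
{
	return (offloaded && zcopy_enabled && resid > zcopy_thres &&
	    iovcnt < iov_max && !nonblocking);
}
]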
-
-/*
- * Following replacement or removal of the first mbuf on the first mbuf chain
- * of a socket buffer, push necessary state changes back into the socket
- * buffer so that other consumers see the values consistently.  'nextrecord'
- * is the callers locally stored value of the original value of
- * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
- * NOTE: 'nextrecord' may be NULL.
- */
-static __inline void
-sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
-{
-	sockbuf_lock_assert(sb);
-	/*
-	 * First, update for the new value of nextrecord.  If necessary, make
-	 * it the first record.
-	 */
-	if (sb->sb_mb != NULL)
-		sb->sb_mb->m_nextpkt = nextrecord;
-	else
-		sb->sb_mb = nextrecord;
-
-        /*
-         * Now update any dependent socket buffer fields to reflect the new
-         * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
-	 * addition of a second clause that takes care of the case where
-	 * sb_mb has been updated, but remains the last record.
-         */
-        if (sb->sb_mb == NULL) {
-                sb->sb_mbtail = NULL;
-                sb->sb_lastrecord = NULL;
-        } else if (sb->sb_mb->m_nextpkt == NULL)
-                sb->sb_lastrecord = sb->sb_mb;
-}
-
-#define IS_NONBLOCKING(so)	(so_state_get(so) & SS_NBIO)
-
-static int
-t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct toepcb *toep = tp->t_toe;
-	struct mbuf *m;
-	uint32_t offset;
-	int err, flags, avail, len, copied, copied_unacked;
-	int target;		/* Read at least this many bytes */
-	int user_ddp_ok;
-	struct ddp_state *p;
-	struct inpcb *inp = so_sotoinpcb(so);
-	int socket_state, socket_error;
-	struct sockbuf *rcv;
-	
-	avail = offset = copied = copied_unacked = 0;
-	flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
-	rcv = so_sockbuf_rcv(so);
-	
-	err = sblock(rcv, SBLOCKWAIT(flags));
-	p = &toep->tp_ddp_state;
-
-	if (err)
-		return (err);
-
-	rcv = so_sockbuf_rcv(so);
-	sockbuf_lock(rcv);
-	if ((tp->t_flags & TF_TOE) == 0) {
-		sockbuf_unlock(rcv);
-		err = EAGAIN;
-		goto done_unlocked;
-	}
-	
-	p->user_ddp_pending = 0;
-restart:
-	if ((tp->t_flags & TF_TOE) == 0) {
-		sockbuf_unlock(rcv);
-		err = EAGAIN;
-		goto done_unlocked;
-	}
-
-	len = uio->uio_resid;
-	m = rcv->sb_mb;
-	target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat;
-	user_ddp_ok = p->ubuf_ddp_ready;
-	p->cancel_ubuf = 0;
-	
-	if (len == 0)
-		goto done;
-	if (m) 
-		goto got_mbuf;
-
-	/* empty receive queue */
-	if (copied >= target && (rcv->sb_mb == NULL) &&
-	    !p->user_ddp_pending)
-		goto done;
-
-	socket_state = so_state_get(so);
-	socket_error = so_error_get(so);
-	rcv = so_sockbuf_rcv(so);
-	
-	if (copied) {
-		if (socket_error || tp->t_state == TCPS_CLOSED || 
-		    (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
-			goto done;
-	} else {
-		if (socket_state & SS_NOFDREF)
-			goto done;
-		if (socket_error) {
-			err = socket_error;
-			socket_error = 0;
-			goto done;
-		}
-		if (rcv->sb_state & SBS_CANTRCVMORE) 
-			goto done;
-		if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
-			goto done;
-		if (tp->t_state == TCPS_CLOSED) {
-			err = ENOTCONN; 
-			goto done;
-		}
-	}
-	if (rcv->sb_mb && !p->user_ddp_pending) {
-		sockbuf_unlock(rcv);
-		inp_wlock(inp);
-		t3_cleanup_rbuf(tp, copied_unacked);
-		inp_wunlock(inp);
-		sockbuf_lock(rcv);
-		copied_unacked = 0;
-		goto restart;
-	}
-	if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && 
-	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
-	    p->ubuf_ddp_ready) {
-		p->user_ddp_pending =
-		    !t3_overlay_ubuf(toep, rcv, uio,
-			IS_NONBLOCKING(so), flags, 1, 1);
-		if (p->user_ddp_pending) {
-			p->kbuf_posted++;
-			user_ddp_ok = 0;
-		}
-	}
-	if (p->kbuf[0] && (p->kbuf_posted == 0)) {
-		t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
-		p->kbuf_posted++;
-	}
-	if (p->user_ddp_pending) {
-		/* One shot at DDP if we already have enough data */
-		if (copied >= target)
-			user_ddp_ok = 0;
-
-		if (rcv->sb_state & SBS_CANTRCVMORE) 
-			goto done;
-		CTR0(KTR_TOM, "ddp pending -- waiting");
-		if ((err = sbwait(rcv)) != 0)
-			goto done;
-		/* await_ddp_completion(sk, flags, &timeo); needed for timers to work */
-	} else if (copied >= target)
-		goto done;
-	else {
-		if (copied_unacked) {
-			int i = 0;
-
-			sockbuf_unlock(rcv);
-			inp_wlock(inp);
-			t3_cleanup_rbuf(tp, copied_unacked);
-			inp_wunlock(inp);
-			copied_unacked = 0;
-			if (mp_ncpus > 1)
-				while (i++ < 200 && rcv->sb_mb == NULL)
-					cpu_spinwait();
-			sockbuf_lock(rcv);
-		}
-		if (rcv->sb_mb)
-			goto restart;
-
-		if (rcv->sb_state & SBS_CANTRCVMORE)
-			goto done;
-
-		CTR0(KTR_TOM, "no buffers -- waiting");
-
-		if ((err = sbwait(rcv)) != 0) 
-			goto done;
-	}
-	goto restart;
-got_mbuf:
-	/*
-	 * Adjust the mbuf seqno if it has already been partially processed by
-	 * soreceive_generic.
-	 */
-	if (m->m_pkthdr.len != m->m_len) {
-		m->m_seq += m->m_pkthdr.len - m->m_len;
-		m->m_pkthdr.len = m->m_len;
-	}
-	    
-	CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u "
-	    "m_seq=0x%08x c_seq=0x%08x c_unack=%u",
-	    (is_ddp(m) ? m->m_ddp_flags : 0), m->m_pkthdr.len, len,
-	    m->m_seq, toep->tp_copied_seq, copied_unacked);
-	KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT),
-	    ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT),
-		m->m_ext.ext_type, m->m_len, m->m_pkthdr.len));
-	KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p"
-		" m_flags=0x%x m->m_len=%d", m->m_next, m->m_nextpkt, m->m_flags, m->m_len));
-	if (m->m_pkthdr.len == 0) {
-		if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
-			panic("empty mbuf and NOCOPY not set\n");
-		CTR0(KTR_TOM, "ddp done notification");
-		p->user_ddp_pending = 0;
-		sbdroprecord_locked(rcv);
-		goto done;
-	}
-
-	KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0,
-	    ("offset will go negative: offset=%d copied_seq=0x%08x copied_unacked=%d m_seq=0x%08x",
-		offset, toep->tp_copied_seq, copied_unacked, m->m_seq));
-	offset = toep->tp_copied_seq + copied_unacked - m->m_seq;
-	
-	if (offset >= m->m_pkthdr.len)
-		panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x "
-		    "seq 0x%x pktlen %d ddp flags 0x%x", offset,
-		    toep->tp_copied_seq + copied_unacked, m->m_seq,
-		    m->m_pkthdr.len, m->m_ddp_flags);
-
-	avail = m->m_pkthdr.len - offset;
-	if (len < avail) {
-		if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) 
-			panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset);
-		avail = len;
-		rcv->sb_flags |= SB_IN_TOE;
-	} else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0)
-		rcv->sb_flags &= ~SB_IN_TOE;
-		
-#ifdef URGENT_DATA_SUPPORTED
-	/*
-	 * Check if the data we are preparing to copy contains urgent
-	 * data.  Either stop short of urgent data or skip it if it's
-	 * first and we are not delivering urgent data inline.
-	 */
-	if (__predict_false(toep->tp_urg_data)) {
-		uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked;
-		
-		if (urg_offset < avail) {
-			if (urg_offset) {
-				/* stop short of the urgent data */
-				avail = urg_offset;
-			} else if ((so_options_get(so) & SO_OOBINLINE) == 0) {
-				/* First byte is urgent, skip */
-				toep->tp_copied_seq++;
-				offset++;
-				avail--;
-				if (!avail)
-					goto skip_copy;
-			}	
-		}	
-	}	
-#endif
-	if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) {
-		user_ddp_ok = 0;
-#ifdef T3_TRACE	
-		T3_TRACE0(TIDTB(so), "t3_sosend: PSH");
-#endif	
-	}
-	
-	if (user_ddp_ok && !p->user_ddp_pending &&
-	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
-	    p->ubuf_ddp_ready) {
-		p->user_ddp_pending = 
-		    !t3_overlay_ubuf(toep, rcv, uio,
-			IS_NONBLOCKING(so), flags, 1, 1);
-		if (p->user_ddp_pending) {
-			p->kbuf_posted++;
-			user_ddp_ok = 0;
-		}
-		DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending);
-	} else
-		DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n",
-		    user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0,
-		    p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted);
-	
-	/*
-	 * If MSG_TRUNC is specified the data is discarded.
-	 * XXX need to check pr_atomic
-	 */
-	KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail,  uio->uio_resid, offset));
-	if (__predict_true(!(flags & MSG_TRUNC))) {
-		int resid = uio->uio_resid;
-		
-		sockbuf_unlock(rcv);
-		if ((err = copy_data(m, offset, avail, uio))) {
-			err = EFAULT;
-			goto done_unlocked;
-		}
-			    
-		sockbuf_lock(rcv);
-		if (avail != (resid - uio->uio_resid))
-			printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n",
-			    avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m));
-
-		if ((tp->t_flags & TF_TOE) == 0) {
-			sockbuf_unlock(rcv);
-			err = EAGAIN;
-			goto done_unlocked;
-		}
-	}
-	
-	copied += avail;
-	copied_unacked += avail;
-	len -= avail;
-	
-#ifdef URGENT_DATA_SUPPORTED
-skip_copy:
-	if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq))
-		tp->urg_data = 0;
-#endif
-	/*
-	 * If the buffer is fully consumed free it.  If it's a DDP
-	 * buffer also handle any events it indicates.
-	 */
-	if (avail + offset >= m->m_pkthdr.len) {
-		unsigned int fl = m->m_ddp_flags;
-		int exitnow, got_psh = 0, nomoredata = 0;
-		int count;
-		struct mbuf *nextrecord;
-
-		if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) {
-			if (is_ddp_psh(m) && p->user_ddp_pending)
-				got_psh = 1;
-			
-			if (fl & DDP_BF_NOCOPY)
-				p->user_ddp_pending = 0;
-			else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) {
-				p->kbuf_posted--;
-				nomoredata = 1;
-			} else {
-				p->kbuf_posted--;
-				p->ubuf_ddp_ready = 1;
-			}
-		}
-
-		nextrecord = m->m_nextpkt;
-		count = m->m_pkthdr.len;
-		while (count > 0) {
-			count -= m->m_len;
-			KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
-			CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d", m->m_len, m->m_pkthdr.len);
-			sbfree(rcv, m);
-			rcv->sb_mb = m_free(m);
-			m = rcv->sb_mb;
-		}
-		sockbuf_pushsync(rcv, nextrecord);
-#if 0
-		sbdrop_locked(rcv, m->m_pkthdr.len);
-#endif		
-		exitnow = got_psh || nomoredata;
-		if  (copied >= target && (rcv->sb_mb == NULL) && exitnow)
-			goto done;
-		if (copied_unacked > (rcv->sb_hiwat >> 2)) {
-			sockbuf_unlock(rcv);
-			inp_wlock(inp);
-			t3_cleanup_rbuf(tp, copied_unacked);
-			inp_wunlock(inp);
-			copied_unacked = 0;
-			sockbuf_lock(rcv);
-		}
-	} 
-	if (len > 0)
-		goto restart;
-
-done:
-	if ((tp->t_flags & TF_TOE) == 0) {
-		sockbuf_unlock(rcv);
-		err = EAGAIN;
-		goto done_unlocked;
-	}
-	/*
-	 * If we can still receive decide what to do in preparation for the
-	 * next receive.  Note that RCV_SHUTDOWN is set if the connection
-	 * transitioned to CLOSE but not if it was in that state to begin with.
-	 */
-	if (__predict_true((so_state_get(so) & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
-		if (p->user_ddp_pending) {
-			user_ddp_ok = 0;
-			t3_cancel_ubuf(toep, rcv);
-			if (rcv->sb_mb) {
-				if (copied < 0)
-					copied = 0;
-				if (len > 0)
-					goto restart;
-			}
-			p->user_ddp_pending = 0;
-		}
-		if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) {
-#ifdef T3_TRACE
-			T3_TRACE0(TIDTB(so),
-			  "chelsio_recvmsg: about to exit, repost kbuf");
-#endif
-
-			t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
-			p->kbuf_posted++;
-		} else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) {
-			CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid);
-			if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev,
-				    ddp_copy_limit), 0, IS_NONBLOCKING(so))) {
-				rcv->sb_flags |= SB_IN_TOE;
-				p->kbuf_posted = 1;
-			}
-			
-		}
-	}
-#ifdef T3_TRACE
-	T3_TRACE5(TIDTB(so),
-		  "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
-		  "kbuf_posted %d user_ddp_pending %u",
-		  copied, len, buffers_freed, p ? p->kbuf_posted : -1, 
-	    p->user_ddp_pending);
-#endif
-	sockbuf_unlock(rcv);
-done_unlocked:	
-	if (copied_unacked && (tp->t_flags & TF_TOE)) {
-		inp_wlock(inp);
-		t3_cleanup_rbuf(tp, copied_unacked);
-		inp_wunlock(inp);
-	}
-	sbunlock(rcv);
-
-	return (err);
-}
-
-static int
-cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
-    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
-{
-	struct toedev *tdev;
-	int rv, zcopy_thres, zcopy_enabled, flags;
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct sockbuf *rcv = so_sockbuf_rcv(so);
-	
-	flags = flagsp ? *flagsp &~ MSG_EOR : 0;
-	
-	/*
-	 * In order to use DMA direct from userspace the following
-	 * conditions must be met:
-	 *  - the connection is currently offloaded
-	 *  - ddp is enabled
-	 *  - the number of bytes to be transferred exceeds the threshold
-	 *  - the number of bytes currently in flight won't exceed the in-flight
-	 *    threshold XXX TODO
-	 *  - vm_fault_quick_hold_pages succeeds
-	 *  - blocking socket XXX for now
-	 *  - iovcnt is 1
-	 *
-	 */
-	if (tp && (tp->t_flags & TF_TOE) && uio && ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0)
-	    && (uio->uio_iovcnt == 1) && (mp0 == NULL) &&
-	    ((rcv->sb_flags & SB_IN_TOE) || (uio->uio_iovcnt == 1))) {
-		struct toepcb *toep = tp->t_toe;
-		
-		tdev =  toep->tp_toedev;
-		zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
-		zcopy_enabled = TOM_TUNABLE(tdev, ddp);
-		if ((rcv->sb_flags & SB_IN_TOE) ||((uio->uio_resid > zcopy_thres) &&
-			(uio->uio_iovcnt == 1) && zcopy_enabled)) {
-			CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x t_flags=0x%x flags=0x%x uio_resid=%d",
-			    rcv->sb_flags, tp->t_flags, flags, uio->uio_resid);
-			rv = t3_soreceive(so, flagsp, uio);
-			if (rv != EAGAIN)
-				return (rv);
-			printf("returned EAGAIN\n");
-		} 
-	} else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) {
-		struct sockbuf *rcv = so_sockbuf_rcv(so);
-		
-		log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n",
-		    flags, uio->uio_iovcnt, rcv->sb_state);
-	}
-	
-	return (pru_soreceive(so, psa, uio, mp0, controlp, flagsp));
-}
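/*
 * [Editor's sketch, not part of the patch.]  The eligibility test above reads
 * more clearly as a single predicate.  Names and stand-in flag values below
 * are illustrative only; the thresholds correspond to TOM_TUNABLE(ddp_thres)
 * and TOM_TUNABLE(ddp).  The real test also requires mp0 == NULL (no
 * mbuf-chain receive), omitted here.
 */
#include <stdbool.h>

#define EX_MSG_OOB	0x1	/* illustrative stand-ins for MSG_* */
#define EX_MSG_PEEK	0x2
#define EX_MSG_DONTWAIT	0x4

struct ex_rx_state {
	bool	offloaded;	/* connection is in the TOE (TF_TOE) */
	bool	ddp_enabled;	/* tunable: DDP allowed at all */
	bool	already_in_toe;	/* SB_IN_TOE already set on the sockbuf */
	int	flags;		/* MSG_* flags for this receive */
	int	iovcnt;		/* number of iovec entries */
	long	resid;		/* bytes the caller wants */
	long	zcopy_thres;	/* tunable: minimum size worth DDP */
};

static bool
ex_use_ddp(const struct ex_rx_state *s)
{
	if (!s->offloaded)
		return (false);
	if (s->flags & (EX_MSG_OOB | EX_MSG_PEEK | EX_MSG_DONTWAIT))
		return (false);		/* blocking, in-order reads only */
	if (s->iovcnt != 1)
		return (false);		/* single-entry iovec only */
	/* Already committed to the DDP path, or large enough to pay off. */
	return (s->already_in_toe ||
	    (s->ddp_enabled && s->resid > s->zcopy_thres));
}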
-
-struct protosw cxgb_protosw;
-struct pr_usrreqs cxgb_tcp_usrreqs;
-
-void
-t3_install_socket_ops(struct socket *so)
-{
-	static int copied = 0;
-	struct pr_usrreqs *pru;
-	struct protosw *psw;
-	
-	if (copied == 0) {
-		psw = so_protosw_get(so);	
-		pru = psw->pr_usrreqs;
-
-		bcopy(psw, &cxgb_protosw, sizeof(*psw));
-		bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru));
-
-		cxgb_protosw.pr_ctloutput = t3_ctloutput;
-		cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs;
-		cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend;
-		cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive;
-	}
-	so_protosw_set(so, &cxgb_protosw);
-	
-#if 0	
-	so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
-	so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
-#endif
-}
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_ddp.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,738 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007-2008, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/types.h>
-#include <sys/fcntl.h>
-#include <sys/kernel.h>
-#include <sys/ktr.h>
-#include <sys/limits.h>
-#include <sys/lock.h>
-#include <sys/mbuf.h>
-#include <sys/condvar.h>
-#include <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/sockstate.h>
-#include <sys/sockopt.h>
-#include <sys/socket.h>
-#include <sys/sockbuf.h>
-#include <sys/syslog.h>
-#include <sys/uio.h>
-
-#include <machine/bus.h>
-
-#include <net/if.h>
-#include <net/route.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
-
-
-#include <cxgb_osdep.h>
-#include <sys/mbufq.h>
-
-#include <ulp/tom/cxgb_tcp_offload.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_offload.h>
-#include <net/route.h>
-
-#include <t3cdev.h>
-#include <common/cxgb_firmware_exports.h>
-#include <common/cxgb_t3_cpl.h>
-#include <common/cxgb_tcb.h>
-#include <common/cxgb_ctl_defs.h>
-#include <cxgb_offload.h>
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
-#include <vm/vm_extern.h>
-#include <vm/pmap.h>
-
-#include <sys/mvec.h>
-#include <ulp/toecore/cxgb_toedev.h>
-#include <ulp/tom/cxgb_defs.h>
-#include <ulp/tom/cxgb_tom.h>
-#include <ulp/tom/cxgb_t3_ddp.h>
-#include <ulp/tom/cxgb_toepcb.h>
-#include <ulp/tom/cxgb_tcp.h>
-
-
-#define MAX_SCHEDULE_TIMEOUT	300
-
-/*
- * Return the # of page pods needed to accommodate a # of pages.
- */
-static inline unsigned int
-pages2ppods(unsigned int pages)
-{
-	return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
-}
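/*
 * [Editor's sketch, not part of the patch.]  The conversion above is ceiling
 * division plus a fixed number of sentinel pods.  PPOD_PAGES = 4 is
 * consistent with the MAX_PPODS comment later in this file (64 pods cover a
 * 1 MB buffer of 4 KB pages); NUM_SENTINEL_PPODS = 0 is assumed for the
 * example only.
 */
#include <stdio.h>

#define EX_PPOD_PAGES		4	/* pages covered per page pod */
#define EX_NUM_SENTINEL_PPODS	0	/* assumed sentinel count */

static unsigned int
ex_pages2ppods(unsigned int pages)
{
	return (pages + EX_PPOD_PAGES - 1) / EX_PPOD_PAGES +
	    EX_NUM_SENTINEL_PPODS;
}

int
main(void)
{
	printf("%u\n", ex_pages2ppods(9));	/* ceil(9/4) + 0 = 3 */
	return (0);
}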
-
-/**
- *	t3_pin_pages - pin a user memory range and prepare it for DDP
- *	@tag: the DMA tag for the mapping
- *	@dmamap: the DMA map for the mapping
- *	@addr: the starting address
- *	@len: the length of the range
- *	@newgl: contains the pages and physical addresses of the pinned range
- *	@gl: an existing gather list, may be %NULL
- *
- *	Pins the pages in the user-space memory range [addr, addr + len) and
- *	maps them for DMA.  Returns a gather list with the pinned pages and
- *	their physical addresses.  If @gl is non-NULL the pages it describes
- *	are compared against the pages for [addr, addr + len); if the existing
- *	gather list already covers the range a new list is not allocated.
- *	Returns 0 on success, or a positive errno.  On success, if a new
- *	gather list was allocated it is returned in @newgl.
- */
-static int
-t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t dmamap, vm_offset_t addr,
-    size_t len, struct ddp_gather_list **newgl,
-    const struct ddp_gather_list *gl)
-{
-	int i = 0, err;
-	size_t pg_off;
-	unsigned int npages;
-	struct ddp_gather_list *p;
-	vm_map_t map;
-	
-	pg_off = addr & PAGE_MASK;
-	npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *),
-	    M_DEVBUF, M_NOWAIT|M_ZERO);
-	if (p == NULL)
-		return (ENOMEM);
-
-	map = &curthread->td_proc->p_vmspace->vm_map;
-	if (vm_fault_quick_hold_pages(map, addr, len, VM_PROT_READ |
-	    VM_PROT_WRITE, p->dgl_pages, npages) < 0) {
-		err = EFAULT;
-		goto free_gl;
-	}
-
-	if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages &&
-	    gl->dgl_length >= len) {
-		for (i = 0; i < npages; i++)
-			if (p->dgl_pages[i] != gl->dgl_pages[i])
-				goto different_gl;
-		err = 0;
-		goto unpin;
-	}
-
-different_gl:
-	p->dgl_length = len;
-	p->dgl_offset = pg_off;
-	p->dgl_nelem = npages;
-#ifdef NEED_BUSDMA
-	p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off,
-				       PAGE_SIZE - pg_off,
-				       PCI_DMA_FROMDEVICE) - pg_off;
-	for (i = 1; i < npages; ++i)
-		p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE,
-					       PCI_DMA_FROMDEVICE);
-#endif	
-	*newgl = p;
-	return (0);
-unpin:
-	vm_page_unhold_pages(p->dgl_pages, npages);
-
-free_gl:
-	
-	free(p, M_DEVBUF);
-	*newgl = NULL;
-	return (err);
-}
-
-static void
-unmap_ddp_gl(const struct ddp_gather_list *gl)
-{
-#ifdef NEED_BUSDMA	
-	int i;
-
-	if (!gl->nelem)
-		return;
-
-	pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset,
-		       PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE);
-	for (i = 1; i < gl->nelem; ++i)
-		pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE,
-			       PCI_DMA_FROMDEVICE);
-
-#endif
-}
-
-static void
-ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty)
-{
-	/*
-	 * XXX mark pages as dirty before unholding 
-	 */
-	vm_page_unhold_pages(gl->dgl_pages, gl->dgl_nelem);
-}
-
-void
-t3_free_ddp_gl(struct ddp_gather_list *gl)
-{
-	unmap_ddp_gl(gl);
-	ddp_gl_free_pages(gl, 0);
-	free(gl, M_DEVBUF);
-}
-
-/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */
-#define MAX_PPODS 64U
-
-/*
- * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in
- * the TCB.  We allocate page pods in multiples of PPOD_CLUSTER_SIZE.  First we
- * try to allocate enough page pods to accommodate the whole buffer, subject to
- * the MAX_PPODS limit.  If that fails we try to allocate PPOD_CLUSTER_SIZE page
- * pods before failing entirely.
- */
-static int
-alloc_buf1_ppods(struct toepcb *toep, struct ddp_state *p,
-			    unsigned long addr, unsigned int len)
-{
-	int err, tag, npages, nppods;
-	struct tom_data *d = TOM_DATA(toep->tp_toedev);
-
-#if 0
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-#endif	
-	npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	nppods = min(pages2ppods(npages), MAX_PPODS);
-	nppods = roundup2(nppods, PPOD_CLUSTER_SIZE);
-	err = t3_alloc_ppods(d, nppods, &tag);
-	if (err && nppods > PPOD_CLUSTER_SIZE) {
-		nppods = PPOD_CLUSTER_SIZE;
-		err = t3_alloc_ppods(d, nppods, &tag);
-	}
-	if (err)
-		return (ENOMEM);
-
-	p->ubuf_nppods = nppods;
-	p->ubuf_tag = tag;
-#if NUM_DDP_KBUF == 1
-	t3_set_ddp_tag(toep, 1, tag << 6);
-#endif
-	return (0);
-}
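/*
 * [Editor's annotation, not part of the patch.]  Worked example of the sizing
 * above, assuming PPOD_PAGES = 4, NUM_SENTINEL_PPODS = 0 and
 * PPOD_CLUSTER_SIZE = 16: a 1 MB buffer (256 pages) needs 64 pods, which is
 * already MAX_PPODS and a multiple of the cluster size, so the first
 * t3_alloc_ppods() call asks for 64; if that fails, the retry asks for a
 * single cluster of 16 before giving up.
 */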
-
-/*
- * Starting offset for the user DDP buffer.  A non-0 value ensures a DDP flush
- * won't block indefinitely if there's nothing to place (which should be rare).
- */
-#define UBUF_OFFSET 1
-
-static __inline unsigned long
-select_ddp_flags(const struct toepcb *toep, int buf_idx,
-                 int nonblock, int rcv_flags)
-{
-	if (buf_idx == 1) {
-		if (__predict_false(rcv_flags & MSG_WAITALL))
-			return V_TF_DDP_PSH_NO_INVALIDATE0(1) |
-			       V_TF_DDP_PSH_NO_INVALIDATE1(1) |
-			       V_TF_DDP_PUSH_DISABLE_1(1);
-		if (nonblock)
-			return V_TF_DDP_BUF1_FLUSH(1);
-
-		return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(toep->tp_toedev,
-							ddp_push_wait));
-	}
-
-	if (__predict_false(rcv_flags & MSG_WAITALL))
-		return V_TF_DDP_PSH_NO_INVALIDATE0(1) |
-		       V_TF_DDP_PSH_NO_INVALIDATE1(1) |
-		       V_TF_DDP_PUSH_DISABLE_0(1);
-	if (nonblock)
-		return V_TF_DDP_BUF0_FLUSH(1);
-
-	return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(toep->tp_toedev, ddp_push_wait));
-}
-
-/*
- * Reposts the kernel DDP buffer after it has become full and been
- * invalidated.  We just need to reset the offset and adjust the DDP flags.
- * Conveniently, we can set the flags and the offset with a single message.
- * Note that this function does not set the buffer length; conveniently, our
- * kernel buffer is of fixed size.  If the length needs to be changed it must
- * be done separately.
- */
-static void
-t3_repost_kbuf(struct toepcb *toep, unsigned int bufidx, int modulate, 
-    int activate, int nonblock)
-{
-	struct ddp_state *p = &toep->tp_ddp_state;
-	unsigned long flags;
-
-#if 0	
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-#endif	
-	p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset;
-	p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0;
-	p->buf_state[bufidx].gl = p->kbuf[bufidx];
-	p->cur_buf = bufidx;
-	p->kbuf_idx = bufidx;
-
-	flags = select_ddp_flags(toep, bufidx, nonblock, 0);
-	if (!bufidx)
-		t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags |
-			 V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) |
-			 V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) |
-		         V_TF_DDP_BUF0_VALID(1),
-		         V_TF_DDP_BUF0_FLUSH(1) |
-			 V_TF_DDP_PSH_NO_INVALIDATE0(1) |
-		         V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) |
-			 V_TF_DDP_BUF0_VALID(1) |
-			 V_TF_DDP_ACTIVE_BUF(activate), modulate);
-	else
-		t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags |
-			 V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) |	
-		         V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | 
-			 V_TF_DDP_BUF1_VALID(1) | 
-			 V_TF_DDP_ACTIVE_BUF(activate),
-		         V_TF_DDP_BUF1_FLUSH(1) | 
-			 V_TF_DDP_PSH_NO_INVALIDATE0(1) |
-		         V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) |
-			 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 
-			 modulate);
-	
-}
-
-/**
- * setup_uio_ppods - setup HW page pods for a user iovec
- * @toep: the offloaded connection
- * @uio: the uio
- * @oft: additional bytes to map before the start of the buffer
- * @length: on success, set to the length of the mapped buffer
- *
- * Pins a user iovec and sets up HW page pods for DDP into it.  We allocate
- * page pods for user buffers on the first call per socket.  Afterwards we
- * limit the buffer length to whatever the existing page pods can accommodate.
- * Returns 0 on success with the mapped length in @length, or a positive
- * error code on failure.
- *
- * The current implementation handles iovecs with only one entry.
- */
-static int
-setup_uio_ppods(struct toepcb *toep, const struct uio *uio, int oft, int *length)
-{
-	int err;
-	unsigned int len;
-	struct ddp_gather_list *gl = NULL;
-	struct ddp_state *p = &toep->tp_ddp_state;
-	struct iovec *iov = uio->uio_iov;
-	vm_offset_t addr = (vm_offset_t)iov->iov_base - oft;
-
-#ifdef notyet	
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-#endif
-	if (__predict_false(p->ubuf_nppods == 0)) {
-		err = alloc_buf1_ppods(toep, p, addr, iov->iov_len + oft);
-		if (err)
-			return (err);
-	}
-
-	len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE;
-	len -= addr & PAGE_MASK;
-	if (len > M_TCB_RX_DDP_BUF0_LEN)
-		len = M_TCB_RX_DDP_BUF0_LEN;
-	len = min(len, toep->tp_tp->rcv_wnd - 32768);
-	len = min(len, iov->iov_len + oft);
-
-	if (len <= p->kbuf[0]->dgl_length) {
-		printf("length too short\n");
-		return (EINVAL);
-	}
-	
-	err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf);
-	if (err)
-		return (err);
-	if (gl) {
-		if (p->ubuf)
-			t3_free_ddp_gl(p->ubuf);
-		p->ubuf = gl;
-		t3_setup_ppods(toep, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len,
-			       gl->dgl_offset, 0);
-	}
-	*length = len;
-	return (0);
-}
-
-/*
- * Cancel a previously posted user DDP buffer, waiting for any outstanding
- * DDP completions and TCB reads to finish before clearing the pending state.
- */
-void
-t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv)
-{
-	struct ddp_state *p = &toep->tp_ddp_state;
-	int ubuf_pending = t3_ddp_ubuf_pending(toep);
-	int err = 0, count = 0;
-	
-	if (p->ubuf == NULL)
-		return;
-	
-	sockbuf_lock_assert(rcv);
-
-	p->cancel_ubuf = 1;
-	while (ubuf_pending && !(rcv->sb_state & SBS_CANTRCVMORE)) {
-		CTR3(KTR_TOM,
-		  "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d",
-		  p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), 
-		  p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY),
-		  p->get_tcb_count);	
-		if (p->get_tcb_count == 0)
-			t3_cancel_ddpbuf(toep, p->cur_buf);
-		else
-			CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p SBS_CANTRCVMORE=%d",
-			    err, p->get_tcb_count, rcv->sb_timeo, rcv,
-			    !!(rcv->sb_state & SBS_CANTRCVMORE));
-		
-		while (p->get_tcb_count && !(rcv->sb_state & SBS_CANTRCVMORE)) {
-			if (count & 0xfffffff)
-				CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p count=%d",
-				    err, p->get_tcb_count, rcv->sb_timeo, rcv, count);
-			count++;
-			err = sbwait(rcv);
-		}
-		ubuf_pending = t3_ddp_ubuf_pending(toep);
-	}
-	p->cancel_ubuf = 0;
-	p->user_ddp_pending = 0;
-
-}
-
-#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE0(1) | \
-	              V_TF_DDP_PSH_NO_INVALIDATE1(1) | \
-		      V_TF_DDP_BUF1_FLUSH(1) | \
-		      V_TF_DDP_BUF0_FLUSH(1) | \
-		      V_TF_DDP_PUSH_DISABLE_1(1) | \
-		      V_TF_DDP_PUSH_DISABLE_0(1) | \
-		      V_TF_DDP_INDICATE_OUT(1))
-
-/*
- * Post a user buffer as an overlay on top of the current kernel buffer.
- */
-int
-t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv,
-    const struct uio *uio, int nonblock, int rcv_flags,
-    int modulate, int post_kbuf)
-{
-	int err, len, ubuf_idx;
-	unsigned long flags;
-	struct ddp_state *p = &toep->tp_ddp_state;
-
-	if (p->kbuf[0] == NULL) {
-		return (EINVAL);
-	}
-	sockbuf_unlock(rcv);
-	err = setup_uio_ppods(toep, uio, 0, &len);
-	sockbuf_lock(rcv);
-	if (err)
-		return (err);
-	
-	if ((rcv->sb_state & SBS_CANTRCVMORE) ||
-	    (toep->tp_tp->t_flags & TF_TOE) == 0) 
-		return (EINVAL);
-		
-	ubuf_idx = p->kbuf_idx;
-	p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP;
-	/* Use existing offset */
-	/* Don't need to update .gl, user buffer isn't copied. */
-	p->cur_buf = ubuf_idx;
-
-	flags = select_ddp_flags(toep, ubuf_idx, nonblock, rcv_flags);
-
-	if (post_kbuf) {
-		struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1];
-		
-		dbs->cur_offset = 0;
-		dbs->flags = 0;
-		dbs->gl = p->kbuf[ubuf_idx ^ 1];
-		p->kbuf_idx ^= 1;
-		flags |= p->kbuf_idx ?
-		    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) :
-		    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0);
-	}
-	
-	if (ubuf_idx == 0) {
-		t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6,
-				  len);
-		t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0,
-				 flags,
-				 OVERLAY_MASK | flags, 1);
-	} else {
-		t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6,
-				  len);
-		t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0,
-				 flags,
-				 OVERLAY_MASK | flags, 1);
-	}
-#ifdef T3_TRACE
-	T3_TRACE5(TIDTB(so),
-		  "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d "
-		  " kbuf_idx %d",
-		   p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx);
-#endif
-	CTR3(KTR_TOM,
-	    "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x",
-	    p->ubuf_tag, flags, OVERLAY_MASK);
-	CTR3(KTR_TOM,
-	    "t3_overlay_ubuf:  ubuf_idx %d kbuf_idx %d post_kbuf %d",
-	    ubuf_idx, p->kbuf_idx, post_kbuf);
-	    
-	return (0);
-}
-
-/*
- * Clean up the DDP state that had to survive until socket close time, such as
- * the DDP buffers.  The buffers are already unmapped at this point, as
- * unmapping needs the PCI device and a socket may close long after the device
- * is removed.
- */
-void
-t3_cleanup_ddp(struct toepcb *toep)
-{
-	struct ddp_state *p = &toep->tp_ddp_state;
-	int idx;
-
-	for (idx = 0; idx < NUM_DDP_KBUF; idx++)
-		if (p->kbuf[idx]) {
-			ddp_gl_free_pages(p->kbuf[idx], 0);
-			free(p->kbuf[idx], M_DEVBUF);
-		}
-	if (p->ubuf) {
-		ddp_gl_free_pages(p->ubuf, 0);
-		free(p->ubuf, M_DEVBUF);
-		p->ubuf = NULL;
-	}
-	toep->tp_ulp_mode = 0;
-}
-
-/*
- * This is a companion to t3_cleanup_ddp() and releases the HW resources
- * associated with a connection's DDP state, such as the page pods.
- * It's called when HW is done with a connection.  The rest of the state
- * remains available until both HW and the app are done with the connection.
- */
-void
-t3_release_ddp_resources(struct toepcb *toep)
-{
-	struct ddp_state *p = &toep->tp_ddp_state;
-	struct tom_data *d = TOM_DATA(toep->tp_toedev);
-	int idx;
-	
-	for (idx = 0; idx < NUM_DDP_KBUF; idx++) {
-		t3_free_ppods(d, p->kbuf_tag[idx], 
-		    p->kbuf_nppods[idx]);
-		unmap_ddp_gl(p->kbuf[idx]);
-	}
-
-	if (p->ubuf_nppods) {
-		t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods);
-		p->ubuf_nppods = 0;
-	}
-	if (p->ubuf)
-		unmap_ddp_gl(p->ubuf);
-	
-}
-
-void
-t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock)
-{
-	struct ddp_state *p = &toep->tp_ddp_state;
-
-	t3_set_ddp_tag(toep, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6);
-	t3_set_ddp_buf(toep, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length);
-	t3_repost_kbuf(toep, p->cur_buf, modulate, 1, nonblock);
-#ifdef T3_TRACE
-	T3_TRACE1(TIDTB(so),
-		  "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf);
-#endif
-	CTR1(KTR_TOM,
-		  "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf);
-}
-
-/*
- * Prepare a socket for DDP.  Must be called when the socket is known to be
- * open.
- */
-int
-t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock)
-{
-	int i, err = ENOMEM;
-	static vm_pindex_t color;
-	unsigned int nppods, kbuf_pages, idx = 0;
-	struct ddp_state *p = &toep->tp_ddp_state;
-	struct tom_data *d = TOM_DATA(toep->tp_toedev);
-
-	
-	if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN)
-		return (EINVAL);
-
-#ifdef notyet	
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-#endif	
-	kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	nppods = pages2ppods(kbuf_pages);
-
-	p->kbuf_noinval = !!waitall;
-	p->kbuf_tag[NUM_DDP_KBUF - 1] = -1;
-	for (idx = 0; idx < NUM_DDP_KBUF; idx++) {
-		p->kbuf[idx] = 
-		    malloc(sizeof (struct ddp_gather_list) + kbuf_pages *
-			sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO);
-		if (p->kbuf[idx] == NULL)
-			goto err;
-		err = t3_alloc_ppods(d, nppods, &p->kbuf_tag[idx]);
-		if (err) {
-			printf("t3_alloc_ppods failed err=%d\n", err);
-			goto err;
-		}
-		
-		p->kbuf_nppods[idx] = nppods;
-		p->kbuf[idx]->dgl_length = kbuf_size;
-		p->kbuf[idx]->dgl_offset = 0;
-		p->kbuf[idx]->dgl_nelem = kbuf_pages;
-
-		for (i = 0; i < kbuf_pages; ++i) {
-			p->kbuf[idx]->dgl_pages[i] = vm_page_alloc(NULL, color,
-			    VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
-			    VM_ALLOC_ZERO);
-			if (p->kbuf[idx]->dgl_pages[i] == NULL) {
-				p->kbuf[idx]->dgl_nelem = i;
-				printf("failed to allocate kbuf pages\n");
-				goto err;
-			}
-		}
-#ifdef NEED_BUSDMA
-		/*
-		 * XXX we'll need this for VT-d or any platform with an iommu :-/
-		 *
-		 */
-		for (i = 0; i < kbuf_pages; ++i)
-			p->kbuf[idx]->phys_addr[i] = 
-			    pci_map_page(p->pdev, p->kbuf[idx]->pages[i],
-					 0, PAGE_SIZE, PCI_DMA_FROMDEVICE);
-#endif
-		t3_setup_ppods(toep, p->kbuf[idx], nppods, p->kbuf_tag[idx], 
-			       p->kbuf[idx]->dgl_length, 0, 0);
-	}
-	cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid);
-
-	t3_set_ddp_tag(toep, 0, p->kbuf_tag[0] << 6);
-	t3_set_ddp_buf(toep, 0, 0, p->kbuf[0]->dgl_length);
-	t3_repost_kbuf(toep, 0, 0, 1, nonblock);
-
-	t3_set_rcv_coalesce_enable(toep, 
-	    TOM_TUNABLE(toep->tp_toedev, ddp_rcvcoalesce));
-	t3_set_dack_mss(toep, TOM_TUNABLE(toep->tp_toedev, delack)>>1);
-	
-#ifdef T3_TRACE
-	T3_TRACE4(TIDTB(so),
-		  "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d",
-		   kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]);
-#endif
-	CTR4(KTR_TOM,
-		  "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d",
-		   kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]);
-	cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid);
-	return (0);
-
-err:
-	t3_release_ddp_resources(toep);
-	t3_cleanup_ddp(toep);
-	return (err);
-}
-
-int
-t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len)
-{
-	int resid_init, err;
-	struct ddp_gather_list *gl = (struct ddp_gather_list *)m->m_ddp_gl;
-	
-	resid_init = uio->uio_resid;
-	
-	if (!gl->dgl_pages)
-		panic("pages not set\n");
-
-	CTR4(KTR_TOM, "t3_ddp_copy: offset=%d dgl_offset=%d cur_offset=%d len=%d",
-	    offset, gl->dgl_offset, m->m_cur_offset, len);
-	offset += gl->dgl_offset + m->m_cur_offset;
-	KASSERT(len <= gl->dgl_length,
-	    ("len=%d > dgl_length=%d in ddp_copy\n", len, gl->dgl_length));
-
-
-	err = uiomove_fromphys(gl->dgl_pages, offset, len, uio);
-	return (err);
-}
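/*
 * [Editor's annotation, not part of the patch.]  The copy above addresses the
 * gather list with a single flat byte offset: the caller's offset within the
 * record, plus the list's offset into its first page (dgl_offset), plus how
 * far this mbuf already sits into the buffer (m_cur_offset).  Worked example
 * with made-up numbers and 4 KB pages:
 *
 *	offset = 100, dgl_offset = 512, m_cur_offset = 4096
 *	flat   = 100 + 512 + 4096 = 4708
 *
 * uiomove_fromphys() then copies from dgl_pages[4708 / 4096] (page 1) at
 * in-page offset 4708 % 4096 (612).
 */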
-
-
-/*
- * Allocate n page pods.  Returns 0 on success with the starting tag in *ptag,
- * or a positive errno on failure.
- */
-int
-t3_alloc_ppods(struct tom_data *td, unsigned int n, int *ptag)
-{
-	unsigned int i, j;
-
-	if (__predict_false(!td->ppod_map)) {
-		printf("ppod_map not set\n");
-		return (EINVAL);
-	}
-
-	mtx_lock(&td->ppod_map_lock);
-	for (i = 0; i < td->nppods; ) {
-		
-		for (j = 0; j < n; ++j)           /* scan ppod_map[i..i+n-1] */
-			if (td->ppod_map[i + j]) {
-				i = i + j + 1;
-				goto next;
-			}
-		memset(&td->ppod_map[i], 1, n);   /* allocate range */
-		mtx_unlock(&td->ppod_map_lock);
-		CTR2(KTR_TOM,
-		    "t3_alloc_ppods: n=%u tag=%u", n, i);
-		*ptag = i;
-		return (0);
-	next: ;
-	}
-	mtx_unlock(&td->ppod_map_lock);
-	return (ENOMEM);	/* no run of n consecutive free page pods */
-}
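/*
 * [Editor's sketch, not part of the patch.]  The allocator above is a
 * first-fit scan over a byte map: find n consecutive zero bytes, set them to
 * 1, and return the starting index as the tag.  A standalone version of the
 * same scan, with an explicit failure return when no free run exists:
 */
#include <errno.h>
#include <string.h>

static int
ex_alloc_run(unsigned char *map, unsigned int nmap, unsigned int n, int *ptag)
{
	unsigned int i, j;

	for (i = 0; i + n <= nmap; ) {
		for (j = 0; j < n; ++j) {
			if (map[i + j]) {
				i = i + j + 1;	/* skip past the busy byte */
				goto next;
			}
		}
		memset(&map[i], 1, n);		/* claim the run */
		*ptag = i;
		return (0);
next:		;
	}
	return (ENOMEM);	/* no run of n consecutive free entries */
}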
-
-void
-t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n)
-{
-	/* No need to take ppod_map_lock here */
-	memset(&td->ppod_map[tag], 0, n);
-}
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_defs.h
--- a/sys/dev/cxgb/ulp/tom/cxgb_defs.h	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,91 +0,0 @@
-
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-$FreeBSD$
-
-***************************************************************************/
-#ifndef CXGB_DEFS_H_
-#define CXGB_DEFS_H_
-
-#define VALIDATE_TID 0
-
-#define TOEPCB(so)  ((struct toepcb *)(sototcpcb((so))->t_toe))
-#define TOE_DEV(so) (TOEPCB((so))->tp_toedev)
-#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket)
-#define sototoep(so) (sototcpcb((so))->t_toe)
-
-#define TRACE_ENTER printf("%s:%s entered\n", __FUNCTION__, __FILE__)
-#define TRACE_EXIT printf("%s:%s:%d exited\n", __FUNCTION__, __FILE__, __LINE__)
-	
-#define	KTR_TOM	KTR_SPARE2
-#define	KTR_TCB	KTR_SPARE3
-
-struct toepcb;
-struct listen_ctx;
-
-void cxgb_log_tcb(struct adapter *sc, unsigned int tid);
-typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m);
-
-void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h);
-void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
-void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
-int t3_push_frames(struct socket *so, int req_completion);
-int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt,
-	struct sockaddr *nam);
-void t3_init_listen_cpl_handlers(void);
-int t3_init_cpl_io(void);
-void t3_init_wr_tab(unsigned int wr_len);
-uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail);
-void t3_send_rx_modulate(struct toepcb *toep);
-void t3_cleanup_rbuf(struct tcpcb *tp, int copied);
-
-void t3_init_socket_ops(void);
-void t3_install_socket_ops(struct socket *so);
-
-
-void t3_disconnect_acceptq(struct socket *listen_so);
-void t3_reset_synq(struct listen_ctx *ctx);
-void t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler);
-
-struct toepcb *toepcb_alloc(void);
-void toepcb_hold(struct toepcb *);
-void toepcb_release(struct toepcb *);
-void toepcb_init(struct toepcb *);
-
-void t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off);
-void t3_set_dack_mss(struct toepcb *toep, int on);
-void t3_set_keepalive(struct toepcb *toep, int on_off);
-void t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag);
-void t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
-		    unsigned int len);
-int t3_get_tcb(struct toepcb *toep);
-
-int t3_ctloutput(struct socket *so, struct sockopt *sopt);
-
-#endif
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_l2t.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_l2t.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,76 +1,61 @@
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
- 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/bus.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#if __FreeBSD_version > 700000
-#include <sys/rwlock.h>
-#endif
-
 #include <sys/socket.h>
 #include <net/if.h>
 #include <net/ethernet.h>
 #include <net/if_vlan_var.h>
-#include <net/if_dl.h>
-#include <net/route.h>
 #include <netinet/in.h>
-#include <netinet/if_ether.h>
+#include <netinet/toecore.h>
 
-#include <cxgb_include.h>
-#include <ulp/tom/cxgb_l2t.h>
+#include "cxgb_include.h"
+#include "ulp/tom/cxgb_tom.h"
+#include "ulp/tom/cxgb_l2t.h"
 
-#define VLAN_NONE 0xfff
-#define SDL(s) ((struct sockaddr_dl *)s) 
-#define RT_ENADDR(sa)  ((u_char *)LLADDR(SDL((sa))))
-#define rt_expire rt_rmx.rmx_expire 
-
-struct llinfo_arp { 
-        struct  callout la_timer; 
-        struct  rtentry *la_rt; 
-        struct  mbuf *la_hold;  /* last packet until resolved/timeout */ 
-        u_short la_preempt;     /* countdown for pre-expiry arps */ 
-        u_short la_asked;       /* # requests sent */ 
-}; 
+#define VLAN_NONE	0xfff
+#define SA(x)		((struct sockaddr *)(x))
+#define SIN(x)		((struct sockaddr_in *)(x))
+#define SINADDR(x)	(SIN(x)->sin_addr.s_addr)
 
 /*
  * Module locking notes:  There is a RW lock protecting the L2 table as a
- * whole plus a spinlock per L2T entry.  Entry lookups and allocations happen
+ * whole plus a mutex per L2T entry.  Entry lookups and allocations happen
  * under the protection of the table lock, individual entry changes happen
- * while holding that entry's spinlock.  The table lock nests outside the
+ * while holding that entry's mutex.  The table lock nests outside the
  * entry locks.  Allocations of new entries take the table lock as writers so
  * no other lookups can happen while allocating new entries.  Entry updates
  * take the table lock as readers so multiple entries can be updated in
@@ -78,72 +63,60 @@
  * and therefore can happen in parallel with entry allocation but no entry
  * can change state or increment its ref count during allocation as both of
  * these perform lookups.
+ *
+ * When acquiring multiple locks, the order is llentry -> L2 table -> L2 entry.
  */
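/*
 * [Editor's annotation, not part of the patch.]  The discipline described
 * above, shown as the shape of a lookup-then-update path (the real code uses
 * rw_rlock/rw_wlock on l2t_data.lock and mtx_lock on l2t_entry.lock):
 *
 *	rw_rlock(&d->lock);		// table as reader: lookups may race
 *	e = find_entry(d, hash, key);
 *	if (e != NULL)
 *		mtx_lock(&e->lock);	// entry lock nests inside table lock
 *	rw_runlock(&d->lock);
 *	if (e != NULL) {
 *		modify_entry(e);	// per-entry state change
 *		mtx_unlock(&e->lock);
 *	}
 *
 * Allocation instead takes the table lock as a writer (rw_wlock) so no lookup
 * can observe a half-initialized entry.
 */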
 
 static inline unsigned int
-vlan_prio(const struct l2t_entry *e)
-{
-	return e->vlan >> 13;
-}
-
-static inline unsigned int
 arp_hash(u32 key, int ifindex, const struct l2t_data *d)
 {
 	return jhash_2words(key, ifindex, 0) & (d->nentries - 1);
 }
 
-static inline void
-neigh_replace(struct l2t_entry *e, struct llentry *neigh)
-{
-	LLE_WLOCK(neigh);
-	LLE_ADDREF(neigh);
-	LLE_WUNLOCK(neigh);
-	
-	if (e->neigh)
-		LLE_FREE(e->neigh);
-	e->neigh = neigh;
-}
-
 /*
- * Set up an L2T entry and send any packets waiting in the arp queue.  The
- * supplied mbuf is used for the CPL_L2T_WRITE_REQ.  Must be called with the
- * entry locked.
+ * Set up an L2T entry and send any packets waiting in the arp queue.  Must be
+ * called with the entry locked.
  */
 static int
-setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m,
-    struct l2t_entry *e)
+setup_l2e_send_pending(struct adapter *sc, struct l2t_entry *e)
 {
+	struct mbuf *m;
 	struct cpl_l2t_write_req *req;
+	struct port_info *pi = &sc->port[e->smt_idx];	/* smt_idx is port_id */
 
-	if (!m) {
-		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
-		    return (ENOMEM);
+	mtx_assert(&e->lock, MA_OWNED);
+
+	m = M_GETHDR_OFLD(pi->first_qset, CPL_PRIORITY_CONTROL, req);
+	if (m == NULL) {
+		log(LOG_ERR, "%s: no mbuf, can't setup L2 entry at index %d\n",
+		    __func__, e->idx);
+		return (ENOMEM);
 	}
-	/*
-	 * XXX MH_ALIGN
-	 */
-	req = mtod(m, struct cpl_l2t_write_req *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
 	req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) |
-			    V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) |
-			    V_L2T_W_PRIO(vlan_prio(e)));
+	    V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) |
+	    V_L2T_W_PRIO(EVL_PRIOFTAG(e->vlan)));
+	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
 
-	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
-	m_set_priority(m, CPL_PRIORITY_CONTROL);
-	cxgb_ofld_send(dev, m);
+	t3_offload_tx(sc, m);
+
+	/*
+	 * XXX: We used pi->first_qset to send the L2T_WRITE_REQ.  If any mbuf
+	 * on the arpq is going out via another queue set associated with the
+	 * port then it has a bad race with the L2T_WRITE_REQ.  Ideally we
+	 * should wait till the reply to the write before draining the arpq.
+	 */
 	while (e->arpq_head) {
 		m = e->arpq_head;
 		e->arpq_head = m->m_next;
 		m->m_next = NULL;
-		cxgb_ofld_send(dev, m);
+		t3_offload_tx(sc, m);
 	}
 	e->arpq_tail = NULL;
-	e->state = L2T_STATE_VALID;
 
-	return 0;
+	return (0);
 }
 
 /*
@@ -153,6 +126,8 @@
 static inline void
 arpq_enqueue(struct l2t_entry *e, struct mbuf *m)
 {
+	mtx_assert(&e->lock, MA_OWNED);
+
 	m->m_next = NULL;
 	if (e->arpq_head)
 		e->arpq_tail->m_next = m;
@@ -161,113 +136,149 @@
 	e->arpq_tail = m;
 }
 
-int
-t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e)
+static void
+resolution_failed_mbuf(struct mbuf *m)
 {
-	struct llentry *lle =  e->neigh;
-	struct sockaddr_in sin;
+	log(LOG_ERR, "%s: leaked mbuf %p, CPL at %p",
+	    __func__, m, mtod(m, void *));
+}
 
-	bzero(&sin, sizeof(struct sockaddr_in));
+static void
+resolution_failed(struct l2t_entry *e)
+{
+	struct mbuf *m;
+
+	mtx_assert(&e->lock, MA_OWNED);
+
+	while (e->arpq_head) {
+		m = e->arpq_head;
+		e->arpq_head = m->m_next;
+		m->m_next = NULL;
+		resolution_failed_mbuf(m);
+	}
+	e->arpq_tail = NULL;
+}
+
+static void
+update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr,
+    uint16_t vtag)
+{
+
+	mtx_assert(&e->lock, MA_OWNED);
+
+	/*
+	 * The entry may be in active use (e->refcount > 0) or not.  We update
+	 * it even when it's not as this simplifies the case where we decide to
+	 * reuse the entry later.
+	 */
+
+	if (lladdr == NULL &&
+	    (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) {
+		/*
+		 * Never got a valid L2 address for this one.  Just mark it as
+		 * failed instead of removing it from the hash (for which we'd
+		 * need to wlock the table).
+		 */
+		e->state = L2T_STATE_FAILED;
+		resolution_failed(e);
+		return;
+
+	} else if (lladdr == NULL) {
+
+		/* Valid or already-stale entry was deleted (or expired) */
+
+		KASSERT(e->state == L2T_STATE_VALID ||
+		    e->state == L2T_STATE_STALE,
+		    ("%s: lladdr NULL, state %d", __func__, e->state));
+
+		e->state = L2T_STATE_STALE;
+
+	} else {
+
+		if (e->state == L2T_STATE_RESOLVING ||
+		    e->state == L2T_STATE_FAILED ||
+		    memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) {
+
+			/* unresolved -> resolved; or dmac changed */
+
+			memcpy(e->dmac, lladdr, ETHER_ADDR_LEN);
+			e->vlan = vtag;
+			setup_l2e_send_pending(sc, e);
+		}
+		e->state = L2T_STATE_VALID;
+	}
+}
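/*
 * [Editor's annotation, not part of the patch.]  update_entry() encodes a
 * small state machine, keyed on whether a link-layer address arrived:
 *
 *	lladdr	prior state		result
 *	------	---------------------	-----------------------------------
 *	NULL	RESOLVING or FAILED	FAILED; queued arpq mbufs dropped
 *	NULL	VALID or STALE		STALE (address deleted or expired)
 *	valid	RESOLVING, FAILED,	VALID; dmac/vtag written to the L2T
 *		or dmac changed		and the arpq drained
 *	valid	otherwise		VALID; nothing rewritten
 */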
+
+static int
+resolve_entry(struct adapter *sc, struct l2t_entry *e)
+{
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	struct sockaddr_in sin = {0};
+	uint8_t dmac[ETHER_ADDR_LEN];
+	uint16_t vtag = EVL_VLID_MASK;
+	int rc;
+
 	sin.sin_family = AF_INET;
 	sin.sin_len = sizeof(struct sockaddr_in);
-	sin.sin_addr.s_addr = e->addr;
+	SINADDR(&sin) = e->addr;
 
-	CTR2(KTR_CXGB, "send slow on rt=%p eaddr=0x%08x\n", rt, e->addr);
+	rc = toe_l2_resolve(tod, e->ifp, SA(&sin), dmac, &vtag);
+	if (rc == EWOULDBLOCK)
+		return (rc);
+
+	mtx_lock(&e->lock);
+	update_entry(sc, e, rc == 0 ? dmac : NULL, vtag);
+	mtx_unlock(&e->lock);
+
+	return (rc);
+}
+
+int
+t3_l2t_send_slow(struct adapter *sc, struct mbuf *m, struct l2t_entry *e)
+{
+
 again:
 	switch (e->state) {
 	case L2T_STATE_STALE:     /* entry is stale, kick off revalidation */
-		arpresolve(rt->rt_ifp, rt, NULL,
-		     (struct sockaddr *)&sin, e->dmac, &lle);
-		mtx_lock(&e->lock);
-		if (e->state == L2T_STATE_STALE)
-			e->state = L2T_STATE_VALID;
-		mtx_unlock(&e->lock);
+
+		if (resolve_entry(sc, e) != EWOULDBLOCK)
+			goto again;	/* entry updated, re-examine state */
+
+		/* Fall through */
+
 	case L2T_STATE_VALID:     /* fast-path, send the packet on */
-		return cxgb_ofld_send(dev, m);
+
+		return (t3_offload_tx(sc, m));
+
 	case L2T_STATE_RESOLVING:
 		mtx_lock(&e->lock);
-		if (e->state != L2T_STATE_RESOLVING) { // ARP already completed
+		if (e->state != L2T_STATE_RESOLVING) {
 			mtx_unlock(&e->lock);
 			goto again;
 		}
 		arpq_enqueue(e, m);
 		mtx_unlock(&e->lock);
-		/*
-		 * Only the first packet added to the arpq should kick off
-		 * resolution.  However, because the m_gethdr below can fail,
-		 * we allow each packet added to the arpq to retry resolution
-		 * as a way of recovering from transient memory exhaustion.
-		 * A better way would be to use a work request to retry L2T
-		 * entries when there's no memory.
-		 */
-		if (arpresolve(rt->rt_ifp, rt, NULL,
-		     (struct sockaddr *)&sin, e->dmac, &lle) == 0) {
-			CTR6(KTR_CXGB, "mac=%x:%x:%x:%x:%x:%x\n",
-			    e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]);
-			
-			if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
-				return (ENOMEM);
 
-			mtx_lock(&e->lock);
-			if (e->arpq_head) 
-				setup_l2e_send_pending(dev, m, e);
-			else
-				m_freem(m);
-			mtx_unlock(&e->lock);
-		}
+		if (resolve_entry(sc, e) == EWOULDBLOCK)
+			break;
+
+		mtx_lock(&e->lock);
+		if (e->state == L2T_STATE_VALID && e->arpq_head)
+			setup_l2e_send_pending(sc, e);
+		if (e->state == L2T_STATE_FAILED)
+			resolution_failed(e);
+		mtx_unlock(&e->lock);
+		break;
+
+	case L2T_STATE_FAILED:
+		resolution_failed_mbuf(m);
+		return (EHOSTUNREACH);
 	}
-	return 0;
+
+	return (0);
 }
 
-void
-t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
-{
-	struct mbuf *m0;
-	struct sockaddr_in sin;
-	sin.sin_family = AF_INET;
-	sin.sin_len = sizeof(struct sockaddr_in);
-	sin.sin_addr.s_addr = e->addr;
-	struct llentry *lle;
-	
-	if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
-		return;
-
-	rt = e->neigh;
-again:
-	switch (e->state) {
-	case L2T_STATE_STALE:     /* entry is stale, kick off revalidation */
-		arpresolve(rt->rt_ifp, rt, NULL,
-		     (struct sockaddr *)&sin, e->dmac, &lle);
-		mtx_lock(&e->lock);
-		if (e->state == L2T_STATE_STALE) {
-			e->state = L2T_STATE_VALID;
-		}
-		mtx_unlock(&e->lock);
-		return;
-	case L2T_STATE_VALID:     /* fast-path, send the packet on */
-		return;
-	case L2T_STATE_RESOLVING:
-		mtx_lock(&e->lock);
-		if (e->state != L2T_STATE_RESOLVING) { // ARP already completed
-			mtx_unlock(&e->lock);
-			goto again;
-		}
-		mtx_unlock(&e->lock);
-		
-		/*
-		 * Only the first packet added to the arpq should kick off
-		 * resolution.  However, because the alloc_skb below can fail,
-		 * we allow each packet added to the arpq to retry resolution
-		 * as a way of recovering from transient memory exhaustion.
-		 * A better way would be to use a work request to retry L2T
-		 * entries when there's no memory.
-		 */
-		arpresolve(rt->rt_ifp, rt, NULL,
-		    (struct sockaddr *)&sin, e->dmac, &lle);
-
-	}
-	return;
-}
 /*
  * Allocate a free L2T entry.  Must be called with l2t_data.lock held.
  */
@@ -276,15 +287,19 @@
 {
 	struct l2t_entry *end, *e, **p;
 
+	rw_assert(&d->lock, RA_WLOCKED);
+
 	if (!atomic_load_acq_int(&d->nfree))
-		return NULL;
+		return (NULL);
 
 	/* there's definitely a free entry */
-	for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e)
+	for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) {
 		if (atomic_load_acq_int(&e->refcnt) == 0)
 			goto found;
+	}
 
-	for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e) ;
+	for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e)
+		continue;
 found:
 	d->rover = e + 1;
 	atomic_add_int(&d->nfree, -1);
@@ -294,90 +309,37 @@
 	 * presently in the hash table.  We need to remove it.
 	 */
 	if (e->state != L2T_STATE_UNUSED) {
-		int hash = arp_hash(e->addr, e->ifindex, d);
+		int hash = arp_hash(e->addr, e->ifp->if_index, d);
 
-		for (p = &d->l2tab[hash].first; *p; p = &(*p)->next)
+		for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) {
 			if (*p == e) {
 				*p = e->next;
 				break;
 			}
+		}
 		e->state = L2T_STATE_UNUSED;
 	}
-	
-	return e;
-}
 
-/*
- * Called when an L2T entry has no more users.  The entry is left in the hash
- * table since it is likely to be reused but we also bump nfree to indicate
- * that the entry can be reallocated for a different neighbor.  We also drop
- * the existing neighbor reference in case the neighbor is going away and is
- * waiting on our reference.
- *
- * Because entries can be reallocated to other neighbors once their ref count
- * drops to 0 we need to take the entry's lock to avoid races with a new
- * incarnation.
- */
-void
-t3_l2e_free(struct l2t_data *d, struct l2t_entry *e)
-{
-	struct llentry *lle;
-
-	mtx_lock(&e->lock);
-	if (atomic_load_acq_int(&e->refcnt) == 0) {  /* hasn't been recycled */
-		lle = e->neigh;
-		e->neigh = NULL;
-	}
-	
-	mtx_unlock(&e->lock);
-	atomic_add_int(&d->nfree, 1);
-	if (lle)
-		LLE_FREE(lle);
-}
-
-
-/*
- * Update an L2T entry that was previously used for the same next hop as neigh.
- * Must be called with softirqs disabled.
- */
-static inline void
-reuse_entry(struct l2t_entry *e, struct llentry *neigh)
-{
-
-	mtx_lock(&e->lock);                /* avoid race with t3_l2t_free */
-	if (neigh != e->neigh)
-		neigh_replace(e, neigh);
-	
-	if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), sizeof(e->dmac)) ||
-	    (neigh->rt_expire > time_uptime))
-		e->state = L2T_STATE_RESOLVING;
-	else if (la->la_hold == NULL)
-		e->state = L2T_STATE_VALID;
-	else
-		e->state = L2T_STATE_STALE;
-	mtx_unlock(&e->lock);
+	return (e);
 }
 
 struct l2t_entry *
-t3_l2t_get(struct t3cdev *dev, struct llentry *neigh, struct ifnet *ifp,
-	struct sockaddr *sa)
+t3_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa)
 {
+	struct tom_data *td = pi->adapter->tom_softc;
 	struct l2t_entry *e;
-	struct l2t_data *d = L2DATA(dev);
-	u32 addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr;
-	int ifidx = ifp->if_index;
-	int hash = arp_hash(addr, ifidx, d);
-	unsigned int smt_idx = ((struct port_info *)ifp->if_softc)->port_id;
+	struct l2t_data *d = td->l2t;
+	uint32_t addr = SINADDR(sa);
+	int hash = arp_hash(addr, ifp->if_index, d);
+	unsigned int smt_idx = pi->port_id;
 
 	rw_wlock(&d->lock);
-	for (e = d->l2tab[hash].first; e; e = e->next)
-		if (e->addr == addr && e->ifindex == ifidx &&
-		    e->smt_idx == smt_idx) {
+	for (e = d->l2tab[hash].first; e; e = e->next) {
+		if (e->addr == addr && e->ifp == ifp && e->smt_idx == smt_idx) {
 			l2t_hold(d, e);
-			if (atomic_load_acq_int(&e->refcnt) == 1)
-				reuse_entry(e, neigh);
 			goto done;
 		}
+	}
 
 	/* Need to allocate a new entry */
 	e = alloc_l2e(d);
@@ -385,116 +347,59 @@
 		mtx_lock(&e->lock);          /* avoid race with t3_l2t_free */
 		e->next = d->l2tab[hash].first;
 		d->l2tab[hash].first = e;
-		rw_wunlock(&d->lock);
-		
+
 		e->state = L2T_STATE_RESOLVING;
 		e->addr = addr;
-		e->ifindex = ifidx;
+		e->ifp = ifp;
 		e->smt_idx = smt_idx;
 		atomic_store_rel_int(&e->refcnt, 1);
-		e->neigh = NULL;
-		
-		
-		neigh_replace(e, neigh);
-#ifdef notyet
-		/* 
-		 * XXX need to add accessor function for vlan tag
-		 */
-		if (neigh->rt_ifp->if_vlantrunk)
-			e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id;
-		else
-#endif			    
-			e->vlan = VLAN_NONE;
+
+		KASSERT(ifp->if_vlantrunk == NULL, ("TOE+VLAN unimplemented."));
+		e->vlan = VLAN_NONE;
+
 		mtx_unlock(&e->lock);
+	}
 
-		return (e);
-	}
-	
 done:
 	rw_wunlock(&d->lock);
-	return e;
-}
 
-/*
- * Called when address resolution fails for an L2T entry to handle packets
- * on the arpq head.  If a packet specifies a failure handler it is invoked,
- * otherwise the packets is sent to the TOE.
- *
- * XXX: maybe we should abandon the latter behavior and just require a failure
- * handler.
- */
-static void
-handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq)
-{
-
-	while (arpq) {
-		struct mbuf *m = arpq;
-#ifdef notyet		
-		struct l2t_mbuf_cb *cb = L2T_MBUF_CB(m);
-#endif
-		arpq = m->m_next;
-		m->m_next = NULL;
-#ifdef notyet		
-		if (cb->arp_failure_handler)
-			cb->arp_failure_handler(dev, m);
-		else
-#endif			
-			cxgb_ofld_send(dev, m);
-	}
-
+	return (e);
 }
 
 void
-t3_l2t_update(struct t3cdev *dev, struct llentry *neigh,
-    uint8_t *enaddr, struct sockaddr *sa)
+t3_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
+    uint8_t *lladdr, uint16_t vtag)
 {
+	struct tom_data *td = t3_tomdata(tod);
+	struct adapter *sc = tod->tod_softc;
 	struct l2t_entry *e;
-	struct mbuf *arpq = NULL;
-	struct l2t_data *d = L2DATA(dev);
-	u32 addr = *(u32 *) &((struct sockaddr_in *)sa)->sin_addr;
-	int hash = arp_hash(addr, ifidx, d);
-	struct llinfo_arp *la;
+	struct l2t_data *d = td->l2t;
+	u32 addr = *(u32 *) &SIN(sa)->sin_addr;
+	int hash = arp_hash(addr, ifp->if_index, d);
 
 	rw_rlock(&d->lock);
 	for (e = d->l2tab[hash].first; e; e = e->next)
-		if (e->addr == addr) {
+		if (e->addr == addr && e->ifp == ifp) {
 			mtx_lock(&e->lock);
 			goto found;
 		}
 	rw_runlock(&d->lock);
-	CTR1(KTR_CXGB, "t3_l2t_update: addr=0x%08x not found", addr);
+
+	/*
+	 * This is of no interest to us.  We've never had an offloaded
+	 * connection to this destination, and we aren't attempting one right
+	 * now.
+	 */
 	return;
 
 found:
-	printf("found 0x%08x\n", addr);
+	rw_runlock(&d->lock);
 
-	rw_runlock(&d->lock);
-	memcpy(e->dmac, enaddr, ETHER_ADDR_LEN);
-	printf("mac=%x:%x:%x:%x:%x:%x\n",
-	    e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]);
-	
-	if (atomic_load_acq_int(&e->refcnt)) {
-		if (neigh != e->neigh)
-			neigh_replace(e, neigh);
-		
-		la = (struct llinfo_arp *)neigh->rt_llinfo; 
-		if (e->state == L2T_STATE_RESOLVING) {
-			
-			if (la->la_asked >= 5 /* arp_maxtries */) {
-				arpq = e->arpq_head;
-				e->arpq_head = e->arpq_tail = NULL;
-			} else
-				setup_l2e_send_pending(dev, NULL, e);
-		} else {
-			e->state = L2T_STATE_VALID;
-			if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), 6))
-				setup_l2e_send_pending(dev, NULL, e);
-		}
-	}
+	KASSERT(e->state != L2T_STATE_UNUSED,
+	    ("%s: unused entry in the hash.", __func__));
+
+	update_entry(sc, e, lladdr, vtag);
 	mtx_unlock(&e->lock);
-
-	if (arpq)
-		handle_failed_resolution(dev, arpq);
 }
 
 struct l2t_data *
@@ -503,9 +408,9 @@
 	struct l2t_data *d;
 	int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry);
 
-	d = cxgb_alloc_mem(size);
+	d = malloc(size, M_CXGB, M_NOWAIT | M_ZERO);
 	if (!d)
-		return NULL;
+		return (NULL);
 
 	d->nentries = l2t_capacity;
 	d->rover = &d->l2tab[1];	/* entry 0 is not used */
@@ -515,10 +420,10 @@
 	for (i = 0; i < l2t_capacity; ++i) {
 		d->l2tab[i].idx = i;
 		d->l2tab[i].state = L2T_STATE_UNUSED;
-		mtx_init(&d->l2tab[i].lock, "L2TAB", NULL, MTX_DEF);
+		mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF);
 		atomic_store_rel_int(&d->l2tab[i].refcnt, 0);
 	}
-	return d;
+	return (d);
 }
 
 void
@@ -530,5 +435,26 @@
 	for (i = 0; i < d->nentries; ++i) 
 		mtx_destroy(&d->l2tab[i].lock);
 
-	cxgb_free_mem(d);
+	free(d, M_CXGB);
 }
+
+static int
+do_l2t_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct cpl_l2t_write_rpl *rpl = mtod(m, void *);
+
+	if (rpl->status != CPL_ERR_NONE)
+		log(LOG_ERR,
+		    "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
+		    rpl->status, GET_TID(rpl));
+
+	m_freem(m);
+	return (0);
+}
+
+void
+t3_init_l2t_cpl_handlers(struct adapter *sc)
+{
+	t3_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl);
+}
+#endif
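do_l2t_write_rpl() is attached via t3_register_cpl_handler(), i.e. an opcode-indexed dispatch table consulted by the SGE response path.  The general shape of such a table, reduced to a userland sketch (illustrative names, not the driver's real types):

    #include <stdio.h>

    #define NUM_OPCODES 256

    typedef int (*cpl_handler_t)(void *msg);

    static cpl_handler_t handlers[NUM_OPCODES];

    static void
    register_handler(unsigned int opcode, cpl_handler_t h)
    {
        if (opcode < NUM_OPCODES)
            handlers[opcode] = h;
    }

    static int
    do_write_rpl(void *msg)
    {
        (void)msg;
        puts("write reply handled");
        return (0);
    }

    int
    main(void)
    {
        register_handler(42, do_write_rpl); /* 42 stands in for a CPL opcode */
        return (handlers[42] != NULL ? handlers[42](NULL) : 1);
    }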
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_l2t.h
--- a/sys/dev/cxgb/ulp/tom/cxgb_l2t.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.h	Mon Jun 11 00:15:24 2012 -0700
@@ -1,6 +1,6 @@
 /**************************************************************************
 
-Copyright (c) 2007-2008, Chelsio Inc.
+Copyright (c) 2007-2009, Chelsio Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,26 +31,19 @@
 #ifndef _CHELSIO_L2T_H
 #define _CHELSIO_L2T_H
 
-#include <ulp/toecore/cxgb_toedev.h>
 #include <sys/lock.h>
-
-#if __FreeBSD_version > 700000
 #include <sys/rwlock.h>
-#else
-#define rwlock mtx
-#define rw_wlock(x) mtx_lock((x))
-#define rw_wunlock(x) mtx_unlock((x))
-#define rw_rlock(x) mtx_lock((x))
-#define rw_runlock(x) mtx_unlock((x))
-#define rw_init(x, str) mtx_init((x), (str), NULL, MTX_DEF)
-#define rw_destroy(x) mtx_destroy((x))
-#endif
 
 enum {
-	L2T_STATE_VALID,      /* entry is up to date */
-	L2T_STATE_STALE,      /* entry may be used but needs revalidation */
-	L2T_STATE_RESOLVING,  /* entry needs address resolution */
-	L2T_STATE_UNUSED      /* entry not in use */
+	L2T_SIZE = 2048
+};
+
+enum {
+	L2T_STATE_VALID,	/* entry is up to date */
+	L2T_STATE_STALE,	/* entry may be used but needs revalidation */
+	L2T_STATE_RESOLVING,	/* entry needs address resolution */
+	L2T_STATE_FAILED,	/* failed to resolve */
+	L2T_STATE_UNUSED	/* entry not in use */
 };
 
 /*
@@ -64,18 +57,17 @@
 struct l2t_entry {
 	uint16_t state;               /* entry state */
 	uint16_t idx;                 /* entry index */
-	uint32_t addr;                /* dest IP address */
-	int ifindex;                  /* neighbor's net_device's ifindex */
+	uint32_t addr;                /* nexthop IP address */
+	struct ifnet *ifp;            /* outgoing interface */
 	uint16_t smt_idx;             /* SMT index */
 	uint16_t vlan;                /* VLAN TCI (id: bits 0-11, prio: 13-15) */
-	struct llentry *neigh;        /* associated neighbour */
 	struct l2t_entry *first;      /* start of hash chain */
 	struct l2t_entry *next;       /* next l2t_entry on chain */
 	struct mbuf *arpq_head;       /* queue of packets awaiting resolution */
 	struct mbuf *arpq_tail;
 	struct mtx lock;
 	volatile uint32_t refcnt;     /* entry reference count */
-	uint8_t dmac[6];              /* neighbour's MAC address */
+	uint8_t dmac[ETHER_ADDR_LEN]; /* nexthop's MAC address */
 };
 
 struct l2t_data {
@@ -86,76 +78,37 @@
 	struct l2t_entry l2tab[0];
 };
 
-typedef void (*arp_failure_handler_func)(struct t3cdev *dev,
-					 struct mbuf *m);
+void t3_l2e_free(struct l2t_data *, struct l2t_entry *e);
+void t3_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
+    uint8_t *lladdr, uint16_t vtag);
+struct l2t_entry *t3_l2t_get(struct port_info *, struct ifnet *,
+    struct sockaddr *);
+int t3_l2t_send_slow(struct adapter *, struct mbuf *, struct l2t_entry *);
+struct l2t_data *t3_init_l2t(unsigned int);
+void t3_free_l2t(struct l2t_data *);
+void t3_init_l2t_cpl_handlers(struct adapter *);
 
-typedef void (*opaque_arp_failure_handler_func)(void *dev,
-					 struct mbuf *m);
-
-/*
- * Callback stored in an skb to handle address resolution failure.
- */
-struct l2t_mbuf_cb {
-	arp_failure_handler_func arp_failure_handler;
-};
-
-/*
- * XXX 
- */
-#define L2T_MBUF_CB(skb) ((struct l2t_mbuf_cb *)(skb)->cb)
-
-
-static __inline void set_arp_failure_handler(struct mbuf *m,
-					   arp_failure_handler_func hnd)
+static inline int
+l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e)
 {
-	m->m_pkthdr.header = (opaque_arp_failure_handler_func)hnd;
-
+	if (__predict_true(e->state == L2T_STATE_VALID))
+		return (t3_offload_tx(sc, m));
+	else
+		return (t3_l2t_send_slow(sc, m, e));
 }
 
-/*
- * Getting to the L2 data from an offload device.
- */
-#define L2DATA(dev) ((dev)->l2opt)
-
-void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e);
-void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa);
-struct l2t_entry *t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh,
-    struct ifnet *ifp, struct sockaddr *sa);
-int t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m,
-		     struct l2t_entry *e);
-void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e);
-struct l2t_data *t3_init_l2t(unsigned int l2t_capacity);
-void t3_free_l2t(struct l2t_data *d);
-
-#ifdef CONFIG_PROC_FS
-int t3_l2t_proc_setup(struct proc_dir_entry *dir, struct l2t_data *d);
-void t3_l2t_proc_free(struct proc_dir_entry *dir);
-#else
-#define l2t_proc_setup(dir, d) 0
-#define l2t_proc_free(dir)
-#endif
-
-int cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m);
-
-static inline int l2t_send(struct t3cdev *dev, struct mbuf *m,
-			   struct l2t_entry *e)
+static inline void
+l2t_release(struct l2t_data *d, struct l2t_entry *e)
 {
-	if (__predict_true(e->state == L2T_STATE_VALID)) {
-		return cxgb_ofld_send(dev, (struct mbuf *)m);
-	}
-	return t3_l2t_send_slow(dev, (struct mbuf *)m, e);
-}
-
-static inline void l2t_release(struct l2t_data *d, struct l2t_entry *e)
-{
-	if (atomic_fetchadd_int(&e->refcnt, -1) == 1)
-		t3_l2e_free(d, e);
-}
-
-static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e)
-{
-	if (atomic_fetchadd_int(&e->refcnt, 1) == 1)  /* 0 -> 1 transition */
+	if (atomic_fetchadd_int(&e->refcnt, -1) == 1) /* 1 -> 0 transition */
 		atomic_add_int(&d->nfree, 1);
 }
 
+static inline void
+l2t_hold(struct l2t_data *d, struct l2t_entry *e)
+{
+	if (atomic_fetchadd_int(&e->refcnt, 1) == 0)  /* 0 -> 1 transition */
+		atomic_add_int(&d->nfree, -1);
+}
+
 #endif
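Note that the reworked l2t_hold()/l2t_release() rely on atomic_fetchadd_int() returning the value held *before* the addition; that is what makes the 0 -> 1 and 1 -> 0 transition tests race-free without taking the entry lock.  The same idea in portable C11, as a sketch only:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int refcnt;                     /* starts at 0 */
    static atomic_int nfree = ATOMIC_VAR_INIT(1); /* one free entry */

    static void
    hold(void)
    {
        /* fetch_add returns the old value: 0 means this was 0 -> 1 */
        if (atomic_fetch_add(&refcnt, 1) == 0)
            atomic_fetch_add(&nfree, -1);
    }

    static void
    rele(void)
    {
        /* old value 1 means this was 1 -> 0: entry is free again */
        if (atomic_fetch_add(&refcnt, -1) == 1)
            atomic_fetch_add(&nfree, 1);
    }

    int
    main(void)
    {
        hold();
        rele();
        printf("nfree = %d\n", atomic_load(&nfree)); /* prints 1 */
        return (0);
    }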
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_listen.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,280 +1,237 @@
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
 #include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/fcntl.h>
-#include <sys/limits.h>
-#include <sys/lock.h>
-#include <sys/mbuf.h>
-#include <sys/mutex.h>
-
-#include <sys/sockopt.h>
-#include <sys/sockstate.h>
-#include <sys/sockbuf.h>
-
+#include <sys/refcount.h>
 #include <sys/socket.h>
-#include <sys/syslog.h>
-
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
 #include <net/if.h>
 #include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
 
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
+#include "cxgb_include.h"
+#include "ulp/tom/cxgb_tom.h"
+#include "ulp/tom/cxgb_l2t.h"
+#include "ulp/tom/cxgb_toepcb.h"
 
+static void t3_send_reset_synqe(struct toedev *, struct synq_entry *);
 
-#include <cxgb_osdep.h>
-#include <sys/mbufq.h>
+static int
+alloc_stid(struct tid_info *t, void *ctx)
+{
+	int stid = -1;
 
-#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcp_fsm.h>
+	mtx_lock(&t->stid_lock);
+	if (t->sfree) {
+		union listen_entry *p = t->sfree;
 
-#include <netinet/tcp_offload.h>
-#include <net/route.h>
+		stid = (p - t->stid_tab) + t->stid_base;
+		t->sfree = p->next;
+		p->ctx = ctx;
+		t->stids_in_use++;
+	}
+	mtx_unlock(&t->stid_lock);
+	return (stid);
+}
 
-#include <t3cdev.h>
-#include <common/cxgb_firmware_exports.h>
-#include <common/cxgb_t3_cpl.h>
-#include <common/cxgb_tcb.h>
-#include <common/cxgb_ctl_defs.h>
-#include <cxgb_offload.h>
-#include <ulp/toecore/cxgb_toedev.h>
-#include <ulp/tom/cxgb_l2t.h>
-#include <ulp/tom/cxgb_defs.h>
-#include <ulp/tom/cxgb_tom.h>
-#include <ulp/tom/cxgb_t3_ddp.h>
-#include <ulp/tom/cxgb_toepcb.h>
+static void
+free_stid(struct tid_info *t, int stid)
+{
+	union listen_entry *p = stid2entry(t, stid);
 
+	mtx_lock(&t->stid_lock);
+	p->next = t->sfree;
+	t->sfree = p;
+	t->stids_in_use--;
+	mtx_unlock(&t->stid_lock);
+}
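alloc_stid()/free_stid() form an intrusive free list: while a slot is free its storage holds the next-free pointer, once allocated the same storage holds the caller's context, and the slot's array offset (plus stid_base) serves as the ID.  A self-contained sketch of that scheme (hypothetical names, no locking):

    #include <stdio.h>

    union slot {
        union slot *next;   /* valid while the slot is free */
        void *ctx;          /* valid while the slot is allocated */
    };

    static union slot tab[8];
    static union slot *sfree;

    static void
    init_slots(void)
    {
        int i;

        for (i = 0; i < 7; i++)
            tab[i].next = &tab[i + 1];
        tab[7].next = NULL;
        sfree = &tab[0];
    }

    static int
    alloc_slot(void *ctx)
    {
        union slot *p = sfree;

        if (p == NULL)
            return (-1);            /* out of IDs */
        sfree = p->next;
        p->ctx = ctx;
        return ((int)(p - tab));    /* array offset doubles as the ID */
    }

    static void
    free_slot(int id)
    {
        tab[id].next = sfree;       /* push back onto the free list */
        sfree = &tab[id];
    }

    int
    main(void)
    {
        int dummy, id;

        init_slots();
        id = alloc_slot(&dummy);
        printf("allocated id %d\n", id);
        free_slot(id);
        return (0);
    }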
 
-static struct listen_info *listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid);
-static int listen_hash_del(struct tom_data *d, struct socket *so);
+static struct listen_ctx *
+alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset)
+{
+	struct listen_ctx *lctx;
 
-/*
- * Process a CPL_CLOSE_LISTSRV_RPL message.  If the status is good we release
- * the STID.
- */
-static int
-do_close_server_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct cpl_close_listserv_rpl *rpl = cplhdr(m);
-	unsigned int stid = GET_TID(rpl);
+	INP_WLOCK_ASSERT(inp);
 
-	if (rpl->status != CPL_ERR_NONE)
-		log(LOG_ERR, "Unexpected CLOSE_LISTSRV_RPL status %u for "
-		       "STID %u\n", rpl->status, stid);
-	else {
-		struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
+	lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO);
+	if (lctx == NULL)
+		return (NULL);
 
-		cxgb_free_stid(cdev, stid);
-		free(listen_ctx, M_CXGB);
+	lctx->stid = alloc_stid(&td->tid_maps, lctx);
+	if (lctx->stid < 0) {
+		free(lctx, M_CXGB);
+		return (NULL);
 	}
 
-	return (CPL_RET_BUF_DONE);
+	lctx->inp = inp;
+	in_pcbref(inp);
+
+	lctx->qset = qset;
+	refcount_init(&lctx->refcnt, 1);
+	TAILQ_INIT(&lctx->synq);
+
+	return (lctx);
+}
+
+/* Don't call this directly, use release_lctx instead */
+static int
+free_lctx(struct tom_data *td, struct listen_ctx *lctx)
+{
+	struct inpcb *inp = lctx->inp;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(lctx->refcnt == 0,
+	    ("%s: refcnt %d", __func__, lctx->refcnt));
+	KASSERT(TAILQ_EMPTY(&lctx->synq),
+	    ("%s: synq not empty.", __func__));
+	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
+
+	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p",
+	    __func__, lctx->stid, lctx, lctx->inp);
+
+	free_stid(&td->tid_maps, lctx->stid);
+	free(lctx, M_CXGB);
+
+	return (in_pcbrele_wlocked(inp));
+}
+
+static void
+hold_lctx(struct listen_ctx *lctx)
+{
+
+	refcount_acquire(&lctx->refcnt);
+}
+
+static inline uint32_t
+listen_hashfn(void *key, u_long mask)
+{
+
+	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
 }
 
 /*
- * Process a CPL_PASS_OPEN_RPL message.  Remove the socket from the listen hash
- * table and free the STID if there was any error, otherwise nothing to do.
+ * Add a listen_ctx entry to the listen hash table.
  */
-static int
-do_pass_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+static void
+listen_hash_add(struct tom_data *td, struct listen_ctx *lctx)
 {
-       	struct cpl_pass_open_rpl *rpl = cplhdr(m);
+	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
 
-	if (rpl->status != CPL_ERR_NONE) {
-		int stid = GET_TID(rpl);
-		struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
-		struct tom_data *d = listen_ctx->tom_data;
-		struct socket *lso = listen_ctx->lso;
-
-#if VALIDATE_TID
-		if (!lso)
-			return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE);
-#endif
-		/*
-		 * Note: It is safe to unconditionally call listen_hash_del()
-		 * at this point without risking unhashing a reincarnation of
-		 * an already closed socket (i.e., there is no listen, close,
-		 * listen, free the sock for the second listen while processing
-		 * a message for the first race) because we are still holding
-		 * a reference on the socket.  It is possible that the unhash
-		 * will fail because the socket is already closed, but we can't
-		 * unhash the wrong socket because it is impossible for the
-		 * socket to which this message refers to have reincarnated.
-		 */
-		listen_hash_del(d, lso);
-		cxgb_free_stid(cdev, stid);
-#ifdef notyet
-		/*
-		 * XXX need to unreference the inpcb
-		 * but we have no way of knowing that other TOMs aren't referencing it 
-		 */
-		sock_put(lso);
-#endif
-		free(listen_ctx, M_CXGB);
-	}
-	return CPL_RET_BUF_DONE;
-}
-
-void
-t3_init_listen_cpl_handlers(void)
-{
-	t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
-	t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
-}
-
-static inline int
-listen_hashfn(const struct socket *so)
-{
-	return ((unsigned long)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1);
+	mtx_lock(&td->lctx_hash_lock);
+	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
+	td->lctx_count++;
+	mtx_unlock(&td->lctx_hash_lock);
 }
 
 /*
- * Create and add a listen_info entry to the listen hash table.  This and the
- * listen hash table functions below cannot be called from softirqs.
+ * Look for the listening socket's context entry in the hash and return it.
  */
-static struct listen_info *
-listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid)
+static struct listen_ctx *
+listen_hash_find(struct tom_data *td, struct inpcb *inp)
 {
-	struct listen_info *p;
+	int bucket = listen_hashfn(inp, td->listen_mask);
+	struct listen_ctx *lctx;
 
-	p = malloc(sizeof(*p), M_CXGB, M_NOWAIT|M_ZERO);
-	if (p) {
-		int bucket = listen_hashfn(so);
+	mtx_lock(&td->lctx_hash_lock);
+	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
+		if (lctx->inp == inp)
+			break;
+	}
+	mtx_unlock(&td->lctx_hash_lock);
 
-		p->so = so;	/* just a key, no need to take a reference */
-		p->stid = stid;
-		mtx_lock(&d->listen_lock);		
-		p->next = d->listen_hash_tab[bucket];
-		d->listen_hash_tab[bucket] = p;
-		mtx_unlock(&d->listen_lock);
-	}
-	return p;
+	return (lctx);
 }
 
 /*
- * Given a pointer to a listening socket return its server TID by consulting
- * the socket->stid map.  Returns -1 if the socket is not in the map.
+ * Removes the listen_ctx structure for inp from the hash and returns it.
  */
-static int
-listen_hash_find(struct tom_data *d, struct socket *so)
+static struct listen_ctx *
+listen_hash_del(struct tom_data *td, struct inpcb *inp)
 {
-	int stid = -1, bucket = listen_hashfn(so);
-	struct listen_info *p;
+	int bucket = listen_hashfn(inp, td->listen_mask);
+	struct listen_ctx *lctx, *l;
 
-	mtx_lock(&d->listen_lock);
-	for (p = d->listen_hash_tab[bucket]; p; p = p->next)
-		if (p->so == so) {
-			stid = p->stid;
+	mtx_lock(&td->lctx_hash_lock);
+	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
+		if (lctx->inp == inp) {
+			LIST_REMOVE(lctx, link);
+			td->lctx_count--;
 			break;
 		}
-	mtx_unlock(&d->listen_lock);
-	return stid;
+	}
+	mtx_unlock(&td->lctx_hash_lock);
+
+	return (lctx);
 }
 
 /*
- * Delete the listen_info structure for a listening socket.  Returns the server
- * TID for the socket if it is present in the socket->stid map, or -1.
+ * Releases a hold on the lctx.  Must be called with the listening socket's inp
+ * locked.  The inp may be freed by this function; a NULL return value
+ * indicates that it was.
  */
-static int
-listen_hash_del(struct tom_data *d, struct socket *so)
+static struct inpcb *
+release_lctx(struct tom_data *td, struct listen_ctx *lctx)
 {
-	int bucket, stid = -1;
-	struct listen_info *p, **prev;
+	struct inpcb *inp = lctx->inp;
+	int inp_freed = 0;
 
-	bucket = listen_hashfn(so);
-	prev  = &d->listen_hash_tab[bucket];
+	INP_WLOCK_ASSERT(inp);
+	if (refcount_release(&lctx->refcnt))
+		inp_freed = free_lctx(td, lctx);
 
-	mtx_lock(&d->listen_lock);
-	for (p = *prev; p; prev = &p->next, p = p->next)
-		if (p->so == so) {
-			stid = p->stid;
-			*prev = p->next;
-			free(p, M_CXGB);
-			break;
-		}
-	mtx_unlock(&d->listen_lock);
-	
-	return (stid);
+	return (inp_freed ? NULL : inp);
 }
 
-/*
- * Start a listening server by sending a passive open request to HW.
- */
-void
-t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
+static int
+create_server(struct adapter *sc, struct listen_ctx *lctx)
 {
-	int stid;
 	struct mbuf *m;
 	struct cpl_pass_open_req *req;
-	struct tom_data *d = TOM_DATA(dev);
-	struct inpcb *inp = so_sotoinpcb(so);
-	struct listen_ctx *ctx;
+	struct inpcb *inp = lctx->inp;
 
-	if (!TOM_TUNABLE(dev, activated))
-		return;
+	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
+	if (m == NULL)
+		return (ENOMEM);
 
-	if (listen_hash_find(d, so) != -1)
-		return;
-	
-	CTR1(KTR_TOM, "start listen on port %u", ntohs(inp->inp_lport));
-	ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT|M_ZERO);
-
-	if (!ctx)
-		return;
-
-	ctx->tom_data = d;
-	ctx->lso = so;
-	ctx->ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) ? ULP_MODE_TCPDDP : 0;
-	LIST_INIT(&ctx->synq_head);
-	
-	stid = cxgb_alloc_stid(d->cdev, d->client, ctx);
-	if (stid < 0)
-		goto free_ctx;
-
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL)
-		goto free_stid;
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	
-	if (!listen_hash_add(d, so, stid))
-		goto free_all;
-
-	req = mtod(m, struct cpl_pass_open_req *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid));
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
 	req->local_port = inp->inp_lport; 
 	memcpy(&req->local_ip, &inp->inp_laddr, 4);
 	req->peer_port = 0;
@@ -284,60 +241,900 @@
 	req->opt0l = htonl(V_RCV_BUFSIZ(16));
 	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
 
-	m_set_priority(m, CPL_PRIORITY_LISTEN); 
-	cxgb_ofld_send(cdev, m);
-	return;
+	t3_offload_tx(sc, m);
 
-free_all:
-	m_free(m);
-free_stid:
-	cxgb_free_stid(cdev, stid);
-#if 0	
-	sock_put(sk);
-#endif	
-free_ctx:
-	free(ctx, M_CXGB);
+	return (0);
+}
+
+static int
+destroy_server(struct adapter *sc, struct listen_ctx *lctx)
+{
+	struct mbuf *m;
+	struct cpl_close_listserv_req *req;
+
+	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
+	if (m == NULL)
+		return (ENOMEM);
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
+	    lctx->stid));
+	req->cpu_idx = 0;
+
+	t3_offload_tx(sc, m);
+
+	return (0);
+}
+
+/*
+ * Process a CPL_CLOSE_LISTSRV_RPL message.  If the status is good we release
+ * the STID.
+ */
+static int
+do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_close_listserv_rpl *rpl = mtod(m, void *);
+	unsigned int stid = GET_TID(rpl);
+	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
+	struct inpcb *inp = lctx->inp;
+
+	CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status);
+
+	if (rpl->status != CPL_ERR_NONE) {
+		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
+		    __func__, rpl->status, stid);
+	} else {
+		INP_WLOCK(inp);
+		KASSERT(listen_hash_del(td, lctx->inp) == NULL,
+		    ("%s: inp %p still in listen hash", __func__, inp));
+		if (release_lctx(td, lctx) != NULL)
+			INP_WUNLOCK(inp);
+	}
+
+	m_freem(m);
+	return (0);
+}
+
+/*
+ * Process a CPL_PASS_OPEN_RPL message.  Remove the lctx from the listen hash
+ * table and free it if there was any error, otherwise nothing to do.
+ */
+static int
+do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_pass_open_rpl *rpl = mtod(m, void *);
+	int stid = GET_TID(rpl);
+	struct listen_ctx *lctx;
+	struct inpcb *inp;
+
+	/*
+	 * We also get these replies when setting up HW filters.  Just throw
+	 * those away.
+	 */
+	if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids)
+		goto done;
+
+	lctx = lookup_stid(&td->tid_maps, stid);
+	inp = lctx->inp;
+
+	INP_WLOCK(inp);
+
+	CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x",
+	    __func__, stid, rpl->status, lctx->flags);
+
+	lctx->flags &= ~LCTX_RPL_PENDING;
+
+	if (rpl->status != CPL_ERR_NONE) {
+		log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n",
+		    __func__, device_get_nameunit(sc->dev), stid, rpl->status);
+	}
+
+#ifdef INVARIANTS
+	/*
+	 * If the inp has been dropped (listening socket closed) then
+	 * listen_stop must have run and taken the inp out of the hash.
+	 */
+	if (inp->inp_flags & INP_DROPPED) {
+		KASSERT(listen_hash_del(td, inp) == NULL,
+		    ("%s: inp %p still in listen hash", __func__, inp));
+	}
+#endif
+
+	if ((inp->inp_flags & INP_DROPPED) && rpl->status != CPL_ERR_NONE) {
+		if (release_lctx(td, lctx) != NULL)
+			INP_WUNLOCK(inp);
+		goto done;
+	}
+
+	/*
+	 * Listening socket stopped listening earlier and now the chip tells us
+	 * it has started the hardware listener.  Stop it; the lctx will be
+	 * released in do_close_server_rpl.
+	 */
+	if (inp->inp_flags & INP_DROPPED) {
+		destroy_server(sc, lctx);
+		INP_WUNLOCK(inp);
+		goto done;
+	}
+
+	/*
+	 * Failed to start hardware listener.  Take inp out of the hash and
+	 * release our reference on it.  An error message has been logged
+	 * already.
+	 */
+	if (rpl->status != CPL_ERR_NONE) {
+		listen_hash_del(td, inp);
+		if (release_lctx(td, lctx) != NULL)
+			INP_WUNLOCK(inp);
+		goto done;
+	}
+
+	/* hardware listener open for business */
+
+	INP_WUNLOCK(inp);
+done:
+	m_freem(m);
+	return (0);
+}
+
+static void
+pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl,
+    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
+{
+	const struct tcp_options *t3opt = &cpl->tcp_options;
+
+	bzero(inc, sizeof(*inc));
+	inc->inc_faddr.s_addr = cpl->peer_ip;
+	inc->inc_laddr.s_addr = cpl->local_ip;
+	inc->inc_fport = cpl->peer_port;
+	inc->inc_lport = cpl->local_port;
+
+	bzero(th, sizeof(*th));
+	th->th_sport = cpl->peer_port;
+	th->th_dport = cpl->local_port;
+	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
+	th->th_flags = TH_SYN;
+
+	bzero(to, sizeof(*to));
+	if (t3opt->mss) {
+		to->to_flags |= TOF_MSS;
+		to->to_mss = be16toh(t3opt->mss);
+	}
+	if (t3opt->wsf) {
+		to->to_flags |= TOF_SCALE;
+		to->to_wscale = t3opt->wsf;
+	}
+	if (t3opt->tstamp)
+		to->to_flags |= TOF_TS;
+	if (t3opt->sack)
+		to->to_flags |= TOF_SACKPERM;
+}
+
+static inline void
+hold_synqe(struct synq_entry *synqe)
+{
+
+	refcount_acquire(&synqe->refcnt);
+}
+
+static inline void
+release_synqe(struct synq_entry *synqe)
+{
+
+	if (refcount_release(&synqe->refcnt))
+		m_freem(synqe->m);
+}
+
+/*
+ * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
+ * store some state temporarily.  There will be enough room in the mbuf's
+ * trailing space as the CPL is not that large.
+ *
+ * XXX: bad hack.
+ */
+static struct synq_entry *
+mbuf_to_synq_entry(struct mbuf *m)
+{
+	int len = roundup(sizeof (struct synq_entry), 8);
+	uint8_t *buf;
+	int buflen;
+
+	if (__predict_false(M_TRAILINGSPACE(m) < len)) {
+		panic("%s: no room for synq_entry (%td, %d)\n", __func__,
+		    M_TRAILINGSPACE(m), len);
+	}
+
+	if (m->m_flags & M_EXT) {
+		buf = m->m_ext.ext_buf;
+		buflen = m->m_ext.ext_size;
+	} else if (m->m_flags & M_PKTHDR) {
+		buf = &m->m_pktdat[0];
+		buflen = MHLEN;
+	} else {
+		buf = &m->m_dat[0];
+		buflen = MLEN;
+	}
+
+	return ((void *)(buf + buflen - len));
+}
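As the XXX above admits, mbuf_to_synq_entry() carves a synq_entry out of the tail of whatever storage backs the mbuf, rounded to 8 bytes.  The carving arithmetic in isolation (userland sketch over a plain byte buffer):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct state {          /* stand-in for struct synq_entry */
        int a;
        void *b;
    };

    /* Place a struct at the 8-byte-rounded tail of an existing buffer. */
    static struct state *
    tail_alloc(uint8_t *buf, size_t buflen)
    {
        size_t len = (sizeof(struct state) + 7) & ~(size_t)7;

        if (buflen < len)
            return (NULL);  /* no room: the driver would panic here */
        return ((struct state *)(void *)(buf + buflen - len));
    }

    int
    main(void)
    {
        static uint8_t buf[256];    /* stands in for the mbuf's data area */
        struct state *s = tail_alloc(buf, sizeof(buf));

        printf("buffer ends at %p, state starts at %p\n",
            (void *)(buf + sizeof(buf)), (void *)s);
        return (s != NULL ? 0 : 1);
    }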
+
+#ifdef KTR
+#define REJECT_PASS_ACCEPT()	do { \
+	reject_reason = __LINE__; \
+	goto reject; \
+} while (0)
+#else
+#define REJECT_PASS_ACCEPT()	do { goto reject; } while (0)
+#endif
+
+/*
+ * The context associated with a tid entry via insert_tid could be a synq_entry
+ * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
+ */
+CTASSERT(offsetof(struct toepcb, tp_flags) == offsetof(struct synq_entry, flags));
+
+/*
+ * Handle a CPL_PASS_ACCEPT_REQ message.
+ */
+static int
+do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	const struct cpl_pass_accept_req *req = mtod(m, void *);
+	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+	unsigned int tid = GET_TID(req);
+	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
+	struct l2t_entry *e = NULL;
+	struct sockaddr_in nam;
+	struct rtentry *rt;
+	struct inpcb *inp;
+	struct socket *so;
+	struct port_info *pi;
+	struct ifnet *ifp;
+	struct in_conninfo inc;
+	struct tcphdr th;
+	struct tcpopt to;
+	struct synq_entry *synqe = NULL;
+	int i;
+#ifdef KTR
+	int reject_reason;
+#endif
+
+	CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
+	    lctx);
+
+	pass_accept_req_to_protohdrs(req, &inc, &th, &to);
+
+	/*
+	 * Don't offload if the interface that received the SYN doesn't have
+	 * IFCAP_TOE enabled.
+	 */
+	pi = NULL;
+	for_each_port(sc, i) {
+		if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN))
+			continue;
+		pi = &sc->port[i];
+		break;
+	}
+	if (pi == NULL)
+		REJECT_PASS_ACCEPT();
+	ifp = pi->ifp;
+	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
+		REJECT_PASS_ACCEPT();
+
+	/*
+	 * Don't offload if the outgoing interface for the route back to the
+	 * peer is not the same as the interface that received the SYN.
+	 */
+	bzero(&nam, sizeof(nam));
+	nam.sin_len = sizeof(nam);
+	nam.sin_family = AF_INET;
+	nam.sin_addr = inc.inc_faddr;
+	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
+	if (rt == NULL)
+		REJECT_PASS_ACCEPT();
+	else {
+		struct sockaddr *nexthop;
+
+		RT_UNLOCK(rt);
+		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
+		    (struct sockaddr *)&nam;
+		if (rt->rt_ifp == ifp)
+			e = t3_l2t_get(pi, rt->rt_ifp, nexthop);
+		RTFREE(rt);
+		if (e == NULL)
+			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
+	}
+
+	INP_INFO_WLOCK(&V_tcbinfo);
+
+	/* Don't offload if the 4-tuple is already in use */
+	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		REJECT_PASS_ACCEPT();
+	}
+
+	inp = lctx->inp;	/* listening socket (not owned by the TOE) */
+	INP_WLOCK(inp);
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+		/*
+		 * The listening socket has closed.  The reply from the TOE to
+		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
+		 * resources tied to this listen context.
+		 */
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		REJECT_PASS_ACCEPT();
+	}
+	so = inp->inp_socket;
+
+	/* Reuse the mbuf that delivered the CPL to us */
+	synqe = mbuf_to_synq_entry(m);
+	synqe->flags = TP_IS_A_SYNQ_ENTRY;
+	synqe->m = m;
+	synqe->lctx = lctx;
+	synqe->tid = tid;
+	synqe->e = e;
+	synqe->opt0h = calc_opt0h(so, 0, 0, e);
+	synqe->qset = pi->first_qset + (arc4random() % pi->nqsets);
+	SOCKBUF_LOCK(&so->so_rcv);
+	synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	refcount_init(&synqe->refcnt, 1);
+	atomic_store_rel_int(&synqe->reply, RPL_OK);
+
+	insert_tid(td, synqe, tid);
+	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
+	hold_synqe(synqe);
+	hold_lctx(lctx);
+
+	/* syncache_add releases both pcbinfo and pcb locks */
+	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
+	INP_UNLOCK_ASSERT(inp);
+	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+
+	/*
+	 * If we replied during syncache_add (reply is RPL_DONE), good.
+	 * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply.
+	 * The mbuf will stick around as long as the entry is in the syncache.
+	 * The kernel is free to retry syncache_respond but we'll ignore it due
+	 * to RPL_DONT.
+	 */
+	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) {
+
+		INP_WLOCK(inp);
+		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+			/* listener closed.  synqe must have been aborted. */
+			KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
+			    ("%s: listener %p closed but synqe %p not aborted",
+			    __func__, inp, synqe));
+
+			CTR5(KTR_CXGB,
+			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
+			    __func__, stid, tid, lctx, synqe);
+			INP_WUNLOCK(inp);
+			release_synqe(synqe);
+			return (__LINE__);
+		}
+
+		KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN),
+		    ("%s: synqe %p aborted, but listener %p not dropped.",
+		    __func__, synqe, inp));
+
+		TAILQ_REMOVE(&lctx->synq, synqe, link);
+		release_synqe(synqe);	/* removed from synq list */
+		inp = release_lctx(td, lctx);
+		if (inp)
+			INP_WUNLOCK(inp);
+
+		release_synqe(synqe);	/* about to exit function */
+		REJECT_PASS_ACCEPT();
+	}
+
+	KASSERT(synqe->reply == RPL_DONE,
+	    ("%s: reply %d", __func__, synqe->reply));
+
+	CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid);
+	release_synqe(synqe);
+	return (0);
+
+reject:
+	CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
+	    reject_reason);
+
+	if (synqe == NULL)
+		m_freem(m);
+	if (e)
+		l2t_release(td->l2t, e);
+	queue_tid_release(tod, tid);
+
+	return (0);
+}
+
+static void
+pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl,
+    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
+{
+	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
+
+	bzero(inc, sizeof(*inc));
+	inc->inc_faddr.s_addr = cpl->peer_ip;
+	inc->inc_laddr.s_addr = cpl->local_ip;
+	inc->inc_fport = cpl->peer_port;
+	inc->inc_lport = cpl->local_port;
+
+	bzero(th, sizeof(*th));
+	th->th_sport = cpl->peer_port;
+	th->th_dport = cpl->local_port;
+	th->th_flags = TH_ACK;
+	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
+	th->th_ack = be32toh(cpl->snd_isn); /* ditto */
+
+	bzero(to, sizeof(*to));
+	if (G_TCPOPT_TSTAMP(tcp_opt))
+		to->to_flags |= TOF_TS;
+}
+
+/*
+ * Process a CPL_PASS_ESTABLISH message.  The T3 has already established a
+ * connection and we need to do the software side setup.
+ */
+static int
+do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_pass_establish *cpl = mtod(m, void *);
+	struct toedev *tod = &td->tod;
+	unsigned int tid = GET_TID(cpl);
+	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
+	struct toepcb *toep;
+	struct socket *so;
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+	struct tcpopt to;
+	struct tcphdr th;
+	struct in_conninfo inc;
+#ifdef KTR
+	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
+#endif
+
+	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
+	    __func__, stid, tid, lctx, inp->inp_flags);
+
+	KASSERT(qs->idx == synqe->qset,
+	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));
+
+	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
+	INP_WLOCK(inp);
+
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+		/*
+		 * The listening socket has closed.  The TOM must have aborted
+		 * all the embryonic connections (including this one) that were
+		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
+		 * for cleaning up.
+		 */
+		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
+		    ("%s: listen socket dropped but tid %u not aborted.",
+		    __func__, tid));
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		m_freem(m);
+		return (0);
+	}
+
+	pass_establish_to_protohdrs(cpl, &inc, &th, &to);
+
+	/* Lie in order to pass the checks in syncache_expand */
+	to.to_tsecr = synqe->ts;
+	th.th_ack = synqe->iss + 1;
+
+	toep = toepcb_alloc(tod);
+	if (toep == NULL) {
+reset:
+		t3_send_reset_synqe(tod, synqe);
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		m_freem(m);
+		return (0);
+	}
+	toep->tp_qset = qs->idx;
+	toep->tp_l2t = synqe->e;
+	toep->tp_tid = tid;
+	toep->tp_rx_credits = synqe->rx_credits;
+
+	synqe->toep = toep;
+	synqe->cpl = cpl;
+
+	so = inp->inp_socket;
+	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
+		toepcb_free(toep);
+		goto reset;
+	}
+
+	/* Remove the synq entry and release its reference on the lctx */
+	TAILQ_REMOVE(&lctx->synq, synqe, link);
+	inp = release_lctx(td, lctx);
+	if (inp)
+		INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	release_synqe(synqe);
+
+	m_freem(m);
+	return (0);
+}
+
+void
+t3_init_listen_cpl_handlers(struct adapter *sc)
+{
+	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
+	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
+	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
+}
+
+/*
+ * Start a listening server by sending a passive open request to HW.
+ *
+ * Can't take the adapter lock here, so access to sc->flags,
+ * sc->open_device_map, sc->offload_map, and if_capenable is race prone.
+ */
+int
+t3_listen_start(struct toedev *tod, struct tcpcb *tp)
+{
+	struct tom_data *td = t3_tomdata(tod);
+	struct adapter *sc = tod->tod_softc;
+	struct port_info *pi;
+	struct inpcb *inp = tp->t_inpcb;
+	struct listen_ctx *lctx;
+	int i;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if ((inp->inp_vflag & INP_IPV4) == 0)
+		return (0);
+
+#ifdef notyet
+	ADAPTER_LOCK(sc);
+	if (IS_BUSY(sc)) {
+		log(LOG_ERR, "%s: listen request ignored, %s is busy",
+		    __func__, device_get_nameunit(sc->dev));
+		goto done;
+	}
+
+	KASSERT(sc->flags & TOM_INIT_DONE,
+	    ("%s: TOM not initialized", __func__));
+#endif
+
+	if ((sc->open_device_map & sc->offload_map) == 0)
+		goto done;	/* no port that's UP with IFCAP_TOE enabled */
+
+	/*
+	 * Find a running port with IFCAP_TOE4.  We'll use the first such port's
+	 * queues to send the passive open and receive the reply to it.
+	 *
+	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
+	 * then reject any attempt to bring down such a port (and maybe reject
+	 * attempts to disable IFCAP_TOE on that port too?).
+	 */
+	for_each_port(sc, i) {
+		if (isset(&sc->open_device_map, i) &&
+		    sc->port[i].ifp->if_capenable & IFCAP_TOE4)
+			break;
+	}
+	KASSERT(i < sc->params.nports,
+	    ("%s: no running port with TOE capability enabled.", __func__));
+	pi = &sc->port[i];
+
+	if (listen_hash_find(td, inp) != NULL)
+		goto done;	/* already setup */
+
+	lctx = alloc_lctx(td, inp, pi->first_qset);
+	if (lctx == NULL) {
+		log(LOG_ERR,
+		    "%s: listen request ignored, %s couldn't allocate lctx\n",
+		    __func__, device_get_nameunit(sc->dev));
+		goto done;
+	}
+	listen_hash_add(td, lctx);
+
+	CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__,
+	    lctx->stid, tcpstates[tp->t_state], lctx, inp);
+
+	if (create_server(sc, lctx) != 0) {
+		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
+		    device_get_nameunit(sc->dev));
+		(void) listen_hash_del(td, inp);
+		inp = release_lctx(td, lctx);
+		/* can't be freed, host stack has a reference */
+		KASSERT(inp != NULL, ("%s: inp freed", __func__));
+		goto done;
+	}
+	lctx->flags |= LCTX_RPL_PENDING;
+done:
+#ifdef notyet
+	ADAPTER_UNLOCK(sc);
+#endif
+	return (0);
 }
 
 /*
  * Stop a listening server by sending a close_listsvr request to HW.
  * The server TID is freed when we get the reply.
  */
+int
+t3_listen_stop(struct toedev *tod, struct tcpcb *tp)
+{
+	struct listen_ctx *lctx;
+	struct adapter *sc = tod->tod_softc;
+	struct tom_data *td = t3_tomdata(tod);
+	struct inpcb *inp = tp->t_inpcb;
+	struct synq_entry *synqe;
+
+	INP_WLOCK_ASSERT(inp);
+
+	lctx = listen_hash_del(td, inp);
+	if (lctx == NULL)
+		return (ENOENT);	/* no hardware listener for this inp */
+
+	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
+	    lctx, lctx->flags);
+
+	/*
+	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
+	 * arrive and clean up when it does.
+	 */
+	if (lctx->flags & LCTX_RPL_PENDING) {
+		KASSERT(TAILQ_EMPTY(&lctx->synq),
+		    ("%s: synq not empty.", __func__));
+		return (EINPROGRESS);
+	}
+
+	/*
+	 * The host stack will abort all the connections on the listening
+	 * socket's so_comp.  It doesn't know about the connections on the synq
+	 * so we need to take care of those.
+	 */
+	TAILQ_FOREACH(synqe, &lctx->synq, link) {
+		KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__));
+		t3_send_reset_synqe(tod, synqe);
+	}
+
+	destroy_server(sc, lctx);
+	return (0);
+}
+
 void
-t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
+t3_syncache_added(struct toedev *tod __unused, void *arg)
 {
+	struct synq_entry *synqe = arg;
+
+	hold_synqe(synqe);
+}
+
+void
+t3_syncache_removed(struct toedev *tod __unused, void *arg)
+{
+	struct synq_entry *synqe = arg;
+
+	release_synqe(synqe);
+}
+
+/* XXX */
+extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+
+int
+t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct synq_entry *synqe = arg;
+	struct l2t_entry *e = synqe->e;
+	struct ip *ip = mtod(m, struct ip *);
+	struct tcphdr *th = (void *)(ip + 1);
+	struct cpl_pass_accept_rpl *rpl;
+	struct mbuf *r;
+	struct listen_ctx *lctx = synqe->lctx;
+	struct tcpopt to;
+	int mtu_idx, cpu_idx;
+
+	/*
+	 * The first time we run it's during the call to syncache_add.  That's
+	 * the only one we care about.
+	 */
+	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0)
+		goto done;	/* reply to the CPL only if it's ok to do so */
+
+	r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl);
+	if (r == NULL)
+		goto done;
+
+	/*
+	 * Use only the provided mbuf (with ip and tcp headers) and what's in
+	 * synqe.  Avoid looking at the listening socket (lctx->inp) here.
+	 *
+	 * XXX: if the incoming SYN had the TCP timestamp option but the kernel
+	 * decides it doesn't want to use TCP timestamps we have no way of
+	 * relaying this info to the chip on a per-tid basis (all we have is a
+	 * global knob).
+	 */
+	bzero(&to, sizeof(to));
+	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
+	    TO_SYN);
+
+	/* stash them for later */
+	synqe->iss = be32toh(th->th_seq);
+	synqe->ts = to.to_tsval;
+
+	mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss);
+	cpu_idx = sc->rrss_map[synqe->qset];
+
+	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	rpl->wr.wrh_lo = 0;
+	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid));
+	rpl->opt2 = calc_opt2(cpu_idx);
+	rpl->rsvd = rpl->opt2;		/* workaround for HW bug */
+	rpl->peer_ip = ip->ip_dst.s_addr;
+	rpl->opt0h = synqe->opt0h |
+	    calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL);
+	rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) |
+	    calc_opt0l(NULL, synqe->rx_credits);
+
+	l2t_send(sc, r, e);
+done:
+	m_freem(m);
+	return (0);
+}
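synqe->reply is a tiny state machine advanced with atomic_cmpset_int(): the first path to move it off RPL_OK (to RPL_DONE here, or RPL_DONT in do_pass_accept_req()) wins, and the loser backs off.  The handoff in portable C11 (illustrative sketch):

    #include <stdatomic.h>
    #include <stdio.h>

    enum { RPL_OK, RPL_DONE, RPL_DONT };    /* names mirror the driver's */

    static atomic_int reply = ATOMIC_VAR_INIT(RPL_OK);

    /* Returns nonzero if we won the transition away from RPL_OK. */
    static int
    try_transition(int to)
    {
        int expected = RPL_OK;

        return (atomic_compare_exchange_strong(&reply, &expected, to));
    }

    int
    main(void)
    {
        /* The first path claims the reply... */
        printf("responder won: %d\n", try_transition(RPL_DONE));
        /* ...any later attempt sees it already claimed and backs off. */
        printf("aborter won:   %d\n", try_transition(RPL_DONT));
        return (0);
    }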
+
+int
+do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	const struct cpl_abort_req_rss *req = mtod(m, void *);
+	unsigned int tid = GET_TID(req);
+	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+
+	KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY,
+	    ("%s: !SYNQ_ENTRY", __func__));
+
+	CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d",
+	    __func__, tid, synqe, synqe->flags, synqe->lctx, req->status);
+
+	INP_WLOCK(inp);
+
+	if (!(synqe->flags & TP_ABORT_REQ_RCVD)) {
+		synqe->flags |= TP_ABORT_REQ_RCVD;
+		synqe->flags |= TP_ABORT_SHUTDOWN;
+		INP_WUNLOCK(inp);
+		m_freem(m);
+		return (0);
+	}
+	synqe->flags &= ~TP_ABORT_REQ_RCVD;
+
+	/*
+	 * If we'd sent a reset on this synqe, we'll ignore this and clean up in
+	 * the T3's reply to our reset instead.
+	 */
+	if (synqe->flags & TP_ABORT_RPL_PENDING) {
+		synqe->flags |= TP_ABORT_RPL_SENT;
+		INP_WUNLOCK(inp);
+	} else {
+		TAILQ_REMOVE(&lctx->synq, synqe, link);
+		inp = release_lctx(td, lctx);
+		if (inp)
+			INP_WUNLOCK(inp);
+		release_tid(tod, tid, qs->idx);
+		l2t_release(td->l2t, synqe->e);
+		release_synqe(synqe);
+	}
+
+	send_abort_rpl(tod, tid, qs->idx);
+	m_freem(m);
+	return (0);
+}
+
+int
+do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
+	unsigned int tid = GET_TID(rpl);
+	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+
+	CTR4(KTR_CXGB, "%s: tid %u, synqe %p, status %d", __func__, tid, synqe,
+	    rpl->status);
+
+	INP_WLOCK(inp);
+
+	if (synqe->flags & TP_ABORT_RPL_PENDING) {
+		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
+			synqe->flags |= TP_ABORT_RPL_RCVD;
+			INP_WUNLOCK(inp);
+		} else {
+			synqe->flags &= ~TP_ABORT_RPL_RCVD;
+			synqe->flags &= ~TP_ABORT_RPL_PENDING;
+
+			TAILQ_REMOVE(&lctx->synq, synqe, link);
+			inp = release_lctx(td, lctx);
+			if (inp)
+				INP_WUNLOCK(inp);
+			release_tid(tod, tid, qs->idx);
+			l2t_release(td->l2t, synqe->e);
+			release_synqe(synqe);
+		}
+	}
+
+	m_freem(m);
+	return (0);
+}
+
+static void
+t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
+{
+	struct cpl_abort_req *req;
+	unsigned int tid = synqe->tid;
+	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m;
-	struct cpl_close_listserv_req *req;
-	struct listen_ctx *lctx;
-	int stid = listen_hash_del(TOM_DATA(dev), so);
-	
-	if (stid < 0)
+#ifdef INVARIANTS
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+#endif
+
+	INP_WLOCK_ASSERT(inp);
+
+	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
+	    synqe->flags);
+
+	if (synqe->flags & TP_ABORT_SHUTDOWN)
 		return;
 
-	lctx = cxgb_get_lctx(cdev, stid);
-	/*
-	 * Do this early so embryonic connections are marked as being aborted
-	 * while the stid is still open.  This ensures pass_establish messages
-	 * that arrive while we are closing the server will be able to locate
-	 * the listening socket.
-	 */
-	t3_reset_synq(lctx);
+	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
 
-	/* Send the close ASAP to stop further passive opens */
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL) {
-		/*
-		 * XXX allocate from lowmem cache
-		 */
-	}
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
+	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
+	if (m == NULL)
+		CXGB_UNIMPLEMENTED();
 
-	req = mtod(m, struct cpl_close_listserv_req *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid));
-	req->cpu_idx = 0;
-	m_set_priority(m, CPL_PRIORITY_LISTEN);
-	cxgb_ofld_send(cdev, m);
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+	req->wr.wrh_lo = htonl(V_WR_TID(tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
+	req->rsvd0 = 0;
+	req->rsvd1 = !(synqe->flags & TP_DATASENT);
+	req->cmd = CPL_ABORT_SEND_RST;
 
-	t3_disconnect_acceptq(so);
+	l2t_send(sc, m, synqe->e);
 }
+
+void
+t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct tom_data *td = sc->tom_softc;
+	struct synq_entry *synqe = arg;
+#ifdef INVARIANTS
+	struct inpcb *inp = sotoinpcb(so);
+#endif
+	struct cpl_pass_establish *cpl = synqe->cpl;
+	struct toepcb *toep = synqe->toep;
+
+	INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
+	INP_WLOCK_ASSERT(inp);
+
+	offload_socket(so, toep);
+	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
+	update_tid(td, toep, synqe->tid);
+}
+#endif
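A recurring idiom in this file is release_lctx() returning NULL once the final reference is gone, so the caller knows whether the object it was holding a lock through is still valid.  Stripped to its essentials (userland sketch, hypothetical types):

    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        int refcnt;
    };

    /* Drop one reference; free on the last and return NULL to say so. */
    static struct obj *
    release(struct obj *o)
    {
        if (--o->refcnt == 0) {
            free(o);
            return (NULL);
        }
        return (o);
    }

    int
    main(void)
    {
        struct obj *o = malloc(sizeof(*o));

        if (o == NULL)
            return (1);
        o->refcnt = 2;
        o = release(o);     /* one reference left, still valid */
        printf("after first release: %s\n", o != NULL ? "alive" : "gone");
        o = release(o);     /* last reference, freed */
        printf("after final release: %s\n", o != NULL ? "alive" : "gone");
        return (0);
    }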
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
--- a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,181 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-$FreeBSD$
-
-***************************************************************************/
-
-#ifndef T3_DDP_H
-#define T3_DDP_H
-
-/* Should be 1 or 2 indicating single or double kernel buffers. */
-#define NUM_DDP_KBUF 2
-
-/* min receive window for a connection to be considered for DDP */
-#define MIN_DDP_RCV_WIN (48 << 10)
-
-/* amount of Rx window not available to DDP to avoid window exhaustion */
-#define DDP_RSVD_WIN (16 << 10)
-
-/* # of sentinel invalid page pods at the end of a group of valid page pods */
-#define NUM_SENTINEL_PPODS 0
-
-/* # of pages a pagepod can hold without needing another pagepod */
-#define PPOD_PAGES 4
-
-/* page pods are allocated in groups of this size (must be power of 2) */
-#define PPOD_CLUSTER_SIZE 16
-
-/* for each TID we reserve this many page pods up front */
-#define RSVD_PPODS_PER_TID 1
-
-struct pagepod {
-	uint32_t pp_vld_tid;
-	uint32_t pp_pgsz_tag_color;
-	uint32_t pp_max_offset;
-	uint32_t pp_page_offset;
-	uint64_t pp_rsvd;
-	uint64_t pp_addr[5];
-};
-
-#define PPOD_SIZE sizeof(struct pagepod)
-
-#define S_PPOD_TID    0
-#define M_PPOD_TID    0xFFFFFF
-#define V_PPOD_TID(x) ((x) << S_PPOD_TID)
-
-#define S_PPOD_VALID    24
-#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID)
-#define F_PPOD_VALID    V_PPOD_VALID(1U)
-
-#define S_PPOD_COLOR    0
-#define M_PPOD_COLOR    0x3F
-#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR)
-
-#define S_PPOD_TAG    6
-#define M_PPOD_TAG    0xFFFFFF
-#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG)
-
-#define S_PPOD_PGSZ    30
-#define M_PPOD_PGSZ    0x3
-#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <machine/bus.h>
-
-/* DDP gather lists can specify an offset only for the first page. */
-struct ddp_gather_list {
-	unsigned int	dgl_length;
-	unsigned int	dgl_offset;
-	unsigned int	dgl_nelem;
-	vm_page_t   	dgl_pages[0];
-};
-
-struct ddp_buf_state {
-	unsigned int cur_offset;     /* offset of latest DDP notification */
-	unsigned int flags;
-	struct ddp_gather_list *gl;
-};
-
-struct ddp_state {
-	struct ddp_buf_state buf_state[2];   /* per buffer state */
-	int cur_buf;
-	unsigned short kbuf_noinval;
-	unsigned short kbuf_idx;        /* which HW buffer is used for kbuf */
-	struct ddp_gather_list *ubuf;
-	int user_ddp_pending;
-	unsigned int ubuf_nppods;       /* # of page pods for buffer 1 */
-	unsigned int ubuf_tag;
-	unsigned int ubuf_ddp_ready;
-	int cancel_ubuf;
-	int get_tcb_count;
-	unsigned int kbuf_posted;
-	unsigned int kbuf_nppods[NUM_DDP_KBUF];
-	unsigned int kbuf_tag[NUM_DDP_KBUF];
-	struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */
-};
-
-/* buf_state flags */
-enum {
-	DDP_BF_NOINVAL = 1 << 0,   /* buffer is set to NO_INVALIDATE */
-	DDP_BF_NOCOPY  = 1 << 1,   /* DDP to final dest, no copy needed */
-	DDP_BF_NOFLIP  = 1 << 2,   /* buffer flips after GET_TCB_RPL */
-	DDP_BF_PSH     = 1 << 3,   /* set in skb->flags if the a DDP was 
-	                              completed with a segment having the
-				      PSH flag set */
-	DDP_BF_NODATA  = 1 << 4,   /* buffer completed before filling */ 
-};
-
-#include <ulp/tom/cxgb_toepcb.h>
-struct sockbuf;
-
-/*
- * Returns 1 if a UBUF DMA buffer might be active.
- */
-static inline int
-t3_ddp_ubuf_pending(struct toepcb *toep)
-{
-	struct ddp_state *p = &toep->tp_ddp_state;
-
-	/* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP,
-	 * but DDP_STATE() is only valid if the connection actually enabled
-	 * DDP.
-	 */
-	if (p->kbuf[0] == NULL)
-		return (0);
-
-	return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) || 
-	       (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY));
-}
-
-int t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
-		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
-		   unsigned int pg_off, unsigned int color);
-int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag);
-void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
-void t3_free_ddp_gl(struct ddp_gather_list *gl);
-int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len);
-//void t3_repost_kbuf(struct socket *so, int modulate, int activate);
-void t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock);
-int t3_post_ubuf(struct toepcb *toep, const struct uio *uio, int nonblock,
-		 int rcv_flags, int modulate, int post_kbuf);
-void t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv);
-int t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv,
-    const struct uio *uio, int nonblock,
-    int rcv_flags, int modulate, int post_kbuf);
-int t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock);
-void t3_cleanup_ddp(struct toepcb *toep);
-void t3_release_ddp_resources(struct toepcb *toep);
-void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx);
-void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0,
-		       unsigned int tag1, unsigned int len);
-void t3_setup_ddpbufs(struct toepcb *, unsigned int len0, unsigned int offset0,
-		      unsigned int len1, unsigned int offset1,
-		      uint64_t ddp_flags, uint64_t flag_mask, int modulate);
-#endif  /* T3_DDP_H */
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_tcp.h
--- a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-
-/*-
- * Copyright (c) 2007, Chelsio Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-#ifndef CXGB_TCP_H_
-#define CXGB_TCP_H_
-#ifdef TCP_USRREQS_OVERLOAD
-struct tcpcb *cxgb_tcp_drop(struct tcpcb *tp, int errno);
-#else
-#define cxgb_tcp_drop	tcp_drop
-#endif
-void cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip);
-struct tcpcb *cxgb_tcp_close(struct tcpcb *tp);
-
-extern struct pr_usrreqs cxgb_tcp_usrreqs;
-#ifdef INET6
-extern struct pr_usrreqs cxgb_tcp6_usrreqs;
-#endif
-
-#include <sys/sysctl.h>
-SYSCTL_DECL(_net_inet_tcp_cxgb);
-#endif  /* CXGB_TCP_H_ */
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-/*-
- * Copyright (c) 2007, Chelsio Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-/*
- * grab bag of accessor routines that will either be moved to netinet
- * or removed
- */
-
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/types.h>
-#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
-#include <sys/mbuf.h>
-#include <sys/sockopt.h>
-#include <sys/sockbuf.h>
-
-#include <sys/socket.h>
-
-#include <net/if.h>
-#include <net/if_types.h>
-#include <net/if_var.h>
-
-#include <netinet/in.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_pcb.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcp_offload.h>
-#include <netinet/tcp_syncache.h>
-#include <netinet/toedev.h>
-
-#include <ulp/tom/cxgb_tcp_offload.h>
-
-
-/*
- * This file contains code as a short-term staging area before it is moved in 
- * to sys/netinet/tcp_offload.c
- */
-
-void
-sockbuf_lock(struct sockbuf *sb)
-{
-
-	SOCKBUF_LOCK(sb);
-}
-
-void
-sockbuf_lock_assert(struct sockbuf *sb)
-{
-
-	SOCKBUF_LOCK_ASSERT(sb);
-}
-
-void
-sockbuf_unlock(struct sockbuf *sb)
-{
-
-	SOCKBUF_UNLOCK(sb);
-}
-
-int
-sockbuf_sbspace(struct sockbuf *sb)
-{
-
-	return (sbspace(sb));
-}
-
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h
--- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-/* $FreeBSD$ */
-
-#ifndef CXGB_TCP_OFFLOAD_H_
-#define CXGB_TCP_OFFLOAD_H_
-
-struct sockbuf;
-
-void sockbuf_lock(struct sockbuf *);
-void sockbuf_lock_assert(struct sockbuf *);
-void sockbuf_unlock(struct sockbuf *);
-int  sockbuf_sbspace(struct sockbuf *);
-
-
-#endif /* CXGB_TCP_OFFLOAD_H_ */
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
--- a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h	Mon Jun 11 00:15:24 2012 -0700
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2007-2008, Chelsio Inc.
+ * Copyright (c) 2007-2009, Chelsio Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,88 +32,63 @@
 #include <sys/condvar.h>
 #include <sys/mbufq.h>
 
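+/* Connection flags kept in toepcb tp_flags. */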
+#define TP_DATASENT         	(1 << 0)
+#define TP_TX_WAIT_IDLE      	(1 << 1)
+#define TP_FIN_SENT          	(1 << 2)
+#define TP_ABORT_RPL_PENDING 	(1 << 3)
+#define TP_ABORT_SHUTDOWN    	(1 << 4)
+#define TP_ABORT_RPL_RCVD    	(1 << 5)
+#define TP_ABORT_REQ_RCVD    	(1 << 6)
+#define TP_ATTACHED	    	(1 << 7)
+#define TP_CPL_DONE		(1 << 8)
+#define TP_IS_A_SYNQ_ENTRY	(1 << 9)
+#define TP_ABORT_RPL_SENT	(1 << 10)
+#define TP_SEND_FIN          	(1 << 11)
+
 struct toepcb {
-	struct toedev 		*tp_toedev;
+	TAILQ_ENTRY(toepcb) link; /* toep_list */
+	int 			tp_flags;
+	struct toedev 		*tp_tod;
 	struct l2t_entry 	*tp_l2t;
-	unsigned int 		tp_tid;
+	int			tp_tid;
 	int 			tp_wr_max;
 	int 			tp_wr_avail;
 	int 			tp_wr_unacked;
 	int 			tp_delack_mode;
-	int 			tp_mtu_idx;
 	int 			tp_ulp_mode;
-	int 			tp_qset_idx;
-	int 			tp_mss_clamp;
 	int 			tp_qset;
-	int 			tp_flags;
-	int 			tp_enqueued_bytes;
-	int 			tp_page_count;
-	int 			tp_state;
+	int 			tp_enqueued;
+	int 			tp_rx_credits;
 
-	tcp_seq 		tp_iss;
-	tcp_seq 		tp_delack_seq;
-	tcp_seq 		tp_rcv_wup;
-	tcp_seq 		tp_copied_seq;
-	uint64_t 		tp_write_seq;
+	struct inpcb 		*tp_inp;
+	struct mbuf		*tp_m_last;
 
-	volatile int 		tp_refcount;
-	vm_page_t 		*tp_pages;
-	
-	struct tcpcb 		*tp_tp;
-	struct mbuf  		*tp_m_last;
-	bus_dma_tag_t		tp_tx_dmat;
-	bus_dma_tag_t		tp_rx_dmat;
-	bus_dmamap_t		tp_dmamap;
-
-	LIST_ENTRY(toepcb) 	synq_entry;
 	struct mbuf_head 	wr_list;
 	struct mbuf_head 	out_of_order_queue;
-	struct ddp_state 	tp_ddp_state;
-	struct cv		tp_cv;
-			   
 };
 
 static inline void
 reset_wr_list(struct toepcb *toep)
 {
-
 	mbufq_init(&toep->wr_list);
 }
 
 static inline void
-purge_wr_queue(struct toepcb *toep)
-{
-	struct mbuf *m;
-	
-	while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) 
-		m_freem(m);
-}
-
-static inline void
 enqueue_wr(struct toepcb *toep, struct mbuf *m)
 {
-
 	mbufq_tail(&toep->wr_list, m);
 }
 
 static inline struct mbuf *
 peek_wr(const struct toepcb *toep)
 {
-
 	return (mbufq_peek(&toep->wr_list));
 }
 
 static inline struct mbuf *
 dequeue_wr(struct toepcb *toep)
 {
-
 	return (mbufq_dequeue(&toep->wr_list));
 }
 
-#define wr_queue_walk(toep, m) \
-	for (m = peek_wr(toep); m; m = m->m_nextpkt)
-
-
-
 #endif
-
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_tom.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,261 +1,106 @@
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
 #include <sys/param.h>
-#include <sys/systm.h>
+#include <sys/types.h>
 #include <sys/kernel.h>
-#include <sys/fcntl.h>
-#include <sys/ktr.h>
-#include <sys/limits.h>
-#include <sys/lock.h>
-#include <sys/eventhandler.h>
-#include <sys/mbuf.h>
+#include <sys/queue.h>
+#include <sys/malloc.h>
 #include <sys/module.h>
-#include <sys/condvar.h>
-#include <sys/mutex.h>
 #include <sys/socket.h>
-#include <sys/sockopt.h>
-#include <sys/sockstate.h>
-#include <sys/sockbuf.h>
-#include <sys/syslog.h>
 #include <sys/taskqueue.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <netinet/toecore.h>
 
-#include <net/if.h>
-#include <net/route.h>
+#ifdef TCP_OFFLOAD
+#include "cxgb_include.h"
+#include "ulp/tom/cxgb_tom.h"
+#include "ulp/tom/cxgb_l2t.h"
+#include "ulp/tom/cxgb_toepcb.h"
 
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
+MALLOC_DEFINE(M_CXGB, "cxgb", "Chelsio T3 Offload services");
 
-#include <cxgb_osdep.h>
-#include <sys/mbufq.h>
+/* Module ops */
+static int t3_tom_mod_load(void);
+static int t3_tom_mod_unload(void);
+static int t3_tom_modevent(module_t, int, void *);
 
-#include <netinet/in_pcb.h>
+/* ULD ops and helpers */
+static int t3_tom_activate(struct adapter *);
+static int t3_tom_deactivate(struct adapter *);
 
-#include <ulp/tom/cxgb_tcp_offload.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcp_offload.h>
-#include <netinet/tcp_fsm.h>
+static int alloc_tid_tabs(struct tid_info *, u_int, u_int, u_int, u_int, u_int);
+static void free_tid_tabs(struct tid_info *);
+static int write_smt_entry(struct adapter *, int);
+static void free_tom_data(struct tom_data *);
 
-#include <cxgb_include.h>
-
-#include <net/if_vlan_var.h>
-#include <net/route.h>
-
-#include <t3cdev.h>
-#include <common/cxgb_firmware_exports.h>
-#include <common/cxgb_tcb.h>
-#include <cxgb_include.h>
-#include <common/cxgb_ctl_defs.h>
-#include <common/cxgb_t3_cpl.h>
-#include <cxgb_offload.h>
-#include <ulp/toecore/cxgb_toedev.h>
-#include <ulp/tom/cxgb_l2t.h>
-#include <ulp/tom/cxgb_tom.h>
-#include <ulp/tom/cxgb_defs.h>
-#include <ulp/tom/cxgb_t3_ddp.h>
-#include <ulp/tom/cxgb_toepcb.h>
-#include <ulp/tom/cxgb_tcp.h>
-
-
-TAILQ_HEAD(, adapter) adapter_list;
-static struct rwlock adapter_list_lock;
-
-static TAILQ_HEAD(, tom_data) cxgb_list;
-static struct mtx cxgb_list_lock;
-static const unsigned int MAX_ATIDS = 64 * 1024;
-static const unsigned int ATID_BASE = 0x100000;
-
-static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry);
-static void cxgb_register_listeners(void);
-static void t3c_tom_add(struct t3cdev *cdev);
-
-/*
- * Handlers for each CPL opcode
- */
-static cxgb_cpl_handler_func tom_cpl_handlers[256];
-
-
-static eventhandler_tag listen_tag;
-
-static struct offload_id t3_toe_id_tab[] = {
-	{ TOE_ID_CHELSIO_T3, 0 },
-	{ TOE_ID_CHELSIO_T3B, 0 },
-	{ TOE_ID_CHELSIO_T3C, 0 },
-	{ 0 }
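+/* ULD hooks; the base cxgb driver calls these to attach/detach TOM per adapter. */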
+static struct uld_info tom_uld_info = {
+	.uld_id = ULD_TOM,
+	.activate = t3_tom_activate,
+	.deactivate = t3_tom_deactivate,
 };
 
-static struct tom_info t3_tom_info = {
-	.ti_attach = t3_toe_attach,
-	.ti_id_table = t3_toe_id_tab,
-	.ti_name = "Chelsio-T3"
-};
-
-struct cxgb_client t3c_tom_client = {
-	.name = "tom_cxgb3",
-	.add = t3c_tom_add,
-	.remove = NULL,
-	.handlers = tom_cpl_handlers,
-	.redirect = NULL
-};
-
-void
-cxgb_log_tcb(struct adapter *sc, unsigned int tid)
-{
-
-	char buf[TCB_SIZE];
-	uint64_t *tcb = (uint64_t *)buf;
-	int i, error;
-	struct mc7 *mem = &sc->cm;
-
-	error = t3_mc7_bd_read(mem, tid*TCB_SIZE/8, TCB_SIZE/8, tcb);
-	if (error)
-		printf("cxgb_tcb_log failed\n");
-
-
-	CTR1(KTR_CXGB, "TCB tid=%u", tid);
-	for (i = 0; i < TCB_SIZE / 32; i++) {
-
-		CTR5(KTR_CXGB, "%1d: %08x %08x %08x %08x",
-		    i, (uint32_t)tcb[1], (uint32_t)(tcb[1] >> 32),
-		    (uint32_t)tcb[0], (uint32_t)(tcb[0] >> 32));
-
-		tcb += 2;
-		CTR4(KTR_CXGB, "   %08x %08x %08x %08x",
-		    (uint32_t)tcb[1], (uint32_t)(tcb[1] >> 32),
-		    (uint32_t)tcb[0], (uint32_t)(tcb[0] >> 32));
-		tcb += 2;
-	}
-}
-
-/*
- * Add an skb to the deferred skb queue for processing from process context.
- */
-void
-t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler)
-{
-	struct tom_data *td = TOM_DATA(dev);
-
-	m_set_handler(m, handler);
-	mtx_lock(&td->deferq.lock);
-	
-	mbufq_tail(&td->deferq, m);
-	if (mbufq_len(&td->deferq) == 1)
-		taskqueue_enqueue(td->tq, &td->deferq_task);
-	mtx_lock(&td->deferq.lock);
-}
-
 struct toepcb *
-toepcb_alloc(void)
+toepcb_alloc(struct toedev *tod)
 {
 	struct toepcb *toep;
-	
-	toep = malloc(sizeof(struct toepcb), M_CXGB, M_NOWAIT|M_ZERO);
-	
+
+	toep = malloc(sizeof(struct toepcb), M_CXGB, M_NOWAIT | M_ZERO);
 	if (toep == NULL)
 		return (NULL);
 
-	toepcb_init(toep);
+	toep->tp_tod = tod;
+	toep->tp_wr_max = toep->tp_wr_avail = 15;
+	toep->tp_wr_unacked = 0;
+	toep->tp_delack_mode = 0;
+
 	return (toep);
 }
 
 void
-toepcb_init(struct toepcb *toep)
+toepcb_free(struct toepcb *toep)
 {
-	toep->tp_refcount = 1;
-	cv_init(&toep->tp_cv, "toep cv");
+	free(toep, M_CXGB);
 }
 
-void
-toepcb_hold(struct toepcb *toep)
-{
-	atomic_add_acq_int(&toep->tp_refcount, 1);
-}
-
-void
-toepcb_release(struct toepcb *toep)
-{
-	if (toep->tp_refcount == 1) {
-		free(toep, M_CXGB);
-		return;
-	}
-	atomic_add_acq_int(&toep->tp_refcount, -1);
-}
-
-
-/*
- * Add a T3 offload device to the list of devices we are managing.
- */
-static void
-t3cdev_add(struct tom_data *t)
-{	
-	mtx_lock(&cxgb_list_lock);
-	TAILQ_INSERT_TAIL(&cxgb_list, t, entry);
-	mtx_unlock(&cxgb_list_lock);
-}
-
-static inline int
-cdev2type(struct t3cdev *cdev)
-{
-	int type = 0;
-
-	switch (cdev->type) {
-	case T3A:
-		type = TOE_ID_CHELSIO_T3;
-		break;
-	case T3B:
-		type = TOE_ID_CHELSIO_T3B;
-		break;
-	case T3C:
-		type = TOE_ID_CHELSIO_T3C;
-		break;
-	}
-	return (type);
-}
-
-/*
- * Allocate and initialize the TID tables.  Returns 0 on success.
- */
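+/* Allocate and initialize the TID, server TID, and active-open TID tables. */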
 static int
-init_tid_tabs(struct tid_info *t, unsigned int ntids,
-			 unsigned int natids, unsigned int nstids,
-			 unsigned int atid_base, unsigned int stid_base)
+alloc_tid_tabs(struct tid_info *t, u_int ntids, u_int natids, u_int nstids,
+    u_int atid_base, u_int stid_base)
 {
 	unsigned long size = ntids * sizeof(*t->tid_tab) +
 	    natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab);
 
-	t->tid_tab = cxgb_alloc_mem(size);
+	t->tid_tab = malloc(size, M_CXGB, M_NOWAIT | M_ZERO);
 	if (!t->tid_tab)
 		return (ENOMEM);
 
@@ -270,8 +115,8 @@
 	t->afree = NULL;
 	t->stids_in_use = t->atids_in_use = 0;
 	t->tids_in_use = 0;
-	mtx_init(&t->stid_lock, "stid", NULL, MTX_DUPOK|MTX_DEF);
-	mtx_init(&t->atid_lock, "atid", NULL, MTX_DUPOK|MTX_DEF);
+	mtx_init(&t->stid_lock, "stid", NULL, MTX_DEF);
+	mtx_init(&t->atid_lock, "atid", NULL, MTX_DEF);
 
 	/*
 	 * Setup the free lists for stid_tab and atid_tab.
@@ -286,1240 +131,266 @@
 			t->atid_tab[natids - 1].next = &t->atid_tab[natids];
 		t->afree = t->atid_tab;
 	}
-	return 0;
-}
-
-static void
-free_tid_maps(struct tid_info *t)
-{
-	mtx_destroy(&t->stid_lock);
-	mtx_destroy(&t->atid_lock);
-	cxgb_free_mem(t->tid_tab);
-}
-
-static inline void
-add_adapter(adapter_t *adap)
-{
-	rw_wlock(&adapter_list_lock);
-	TAILQ_INSERT_TAIL(&adapter_list, adap, adapter_entry);
-	rw_wunlock(&adapter_list_lock);
-}
-
-static inline void
-remove_adapter(adapter_t *adap)
-{
-	rw_wlock(&adapter_list_lock);
-	TAILQ_REMOVE(&adapter_list, adap, adapter_entry);
-	rw_wunlock(&adapter_list_lock);
-}
-
-/*
- * Populate a TID_RELEASE WR.  The mbuf must be already propely sized.
- */
-static inline void
-mk_tid_release(struct mbuf *m, unsigned int tid)
-{
-	struct cpl_tid_release *req;
-
-	m_set_priority(m, CPL_PRIORITY_SETUP);
-	req = mtod(m, struct cpl_tid_release *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
-}
-
-static void
-t3_process_tid_release_list(void *data, int pending)
-{
-	struct mbuf *m;
-	struct t3cdev *tdev = data;
-	struct t3c_data *td = T3C_DATA (tdev);
-
-	mtx_lock(&td->tid_release_lock);
-	while (td->tid_release_list) {
-		struct toe_tid_entry *p = td->tid_release_list;
-
-		td->tid_release_list = (struct toe_tid_entry *)p->ctx;
-		mtx_unlock(&td->tid_release_lock);
-		m = m_get(M_WAIT, MT_DATA);
-		mk_tid_release(m, p - td->tid_maps.tid_tab);
-		cxgb_ofld_send(tdev, m);
-		p->ctx = NULL;
-		mtx_lock(&td->tid_release_lock);
-	}
-	mtx_unlock(&td->tid_release_lock);
-}
-
-int
-cxgb_offload_activate(struct adapter *adapter)
-{
-	struct t3cdev *dev = &adapter->tdev;
-	int natids, err;
-	struct t3c_data *t;
-	struct tid_range stid_range, tid_range;
-	struct mtutab mtutab;
-	unsigned int l2t_capacity;
-
-	t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO);
-	if (!t)
-		return (ENOMEM);
-	dev->adapter = adapter;
-
-	err = (EOPNOTSUPP);
-	if (dev->ctl(dev, GET_TX_MAX_CHUNK, &t->tx_max_chunk) < 0 ||
-	    dev->ctl(dev, GET_MAX_OUTSTANDING_WR, &t->max_wrs) < 0 ||
-	    dev->ctl(dev, GET_L2T_CAPACITY, &l2t_capacity) < 0 ||
-	    dev->ctl(dev, GET_MTUS, &mtutab) < 0 ||
-	    dev->ctl(dev, GET_TID_RANGE, &tid_range) < 0 ||
-	    dev->ctl(dev, GET_STID_RANGE, &stid_range) < 0) {
-		device_printf(adapter->dev, "%s: dev->ctl check failed\n", __FUNCTION__);
-		goto out_free;
-	}
-      
-	err = (ENOMEM);
-	L2DATA(dev) = t3_init_l2t(l2t_capacity);
-	if (!L2DATA(dev)) {
-		device_printf(adapter->dev, "%s: t3_init_l2t failed\n", __FUNCTION__);
-		goto out_free;
-	}
-	natids = min(tid_range.num / 2, MAX_ATIDS);
-	err = init_tid_tabs(&t->tid_maps, tid_range.num, natids,
-			    stid_range.num, ATID_BASE, stid_range.base);
-	if (err) {	
-		device_printf(adapter->dev, "%s: init_tid_tabs failed\n", __FUNCTION__);
-		goto out_free_l2t;
-	}
-	
-	t->mtus = mtutab.mtus;
-	t->nmtus = mtutab.size;
-
-	TASK_INIT(&t->tid_release_task, 0 /* XXX? */, t3_process_tid_release_list, dev);
-	mtx_init(&t->tid_release_lock, "tid release", NULL, MTX_DUPOK|MTX_DEF);
-	t->dev = dev;
-
-	T3C_DATA (dev) = t;
-	dev->recv = process_rx;
-	dev->arp_update = t3_l2t_update;
-	/* Register netevent handler once */
-	if (TAILQ_EMPTY(&adapter_list)) {
-#if defined(CONFIG_CHELSIO_T3_MODULE)
-		if (prepare_arp_with_t3core())
-			log(LOG_ERR, "Unable to set offload capabilities\n");
-#endif
-	}
-	CTR1(KTR_CXGB, "adding adapter %p", adapter); 
-	add_adapter(adapter);
-	device_printf(adapter->dev, "offload started\n");
-	adapter->flags |= CXGB_OFLD_INIT;
-	return (0);
-
-out_free_l2t:
-	t3_free_l2t(L2DATA(dev));
-	L2DATA(dev) = NULL;
-out_free:
-	free(t, M_CXGB);
-	return (err);
-}
-
-void
-cxgb_offload_deactivate(struct adapter *adapter)
-{
-	struct t3cdev *tdev = &adapter->tdev;
-	struct t3c_data *t = T3C_DATA(tdev);
-
-	printf("removing adapter %p\n", adapter);
-	remove_adapter(adapter);
-	if (TAILQ_EMPTY(&adapter_list)) {
-#if defined(CONFIG_CHELSIO_T3_MODULE)
-		restore_arp_sans_t3core();
-#endif
-	}
-	free_tid_maps(&t->tid_maps);
-	T3C_DATA(tdev) = NULL;
-	t3_free_l2t(L2DATA(tdev));
-	L2DATA(tdev) = NULL;
-	mtx_destroy(&t->tid_release_lock);
-	free(t, M_CXGB);
-}
-
-/*
- * Sends an sk_buff to a T3C driver after dealing with any active network taps.
- */
-int
-cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m)
-{
-	int r;
-
-	r = dev->send(dev, m);
-	return r;
-}
-
-static struct ifnet *
-get_iff_from_mac(adapter_t *adapter, const uint8_t *mac, unsigned int vlan)
-{
-	int i;
-
-	for_each_port(adapter, i) {
-#ifdef notyet		
-		const struct vlan_group *grp;
-#endif		
-		const struct port_info *p = &adapter->port[i];
-		struct ifnet *ifp = p->ifp;
-
-		if (!memcmp(p->hw_addr, mac, ETHER_ADDR_LEN)) {
-#ifdef notyet	
-			
-			if (vlan && vlan != EVL_VLID_MASK) {
-				grp = p->vlan_grp;
-				dev = grp ? grp->vlan_devices[vlan] : NULL;
-			} else
-				while (dev->master)
-					dev = dev->master;
-#endif			
-			return (ifp);
-		}
-	}
-	return (NULL);
-}
-
-static inline void
-failover_fixup(adapter_t *adapter, int port)
-{
-	if (adapter->params.rev == 0) {
-		struct ifnet *ifp = adapter->port[port].ifp;
-		struct cmac *mac = &adapter->port[port].mac;
-		if (!(ifp->if_flags & IFF_UP)) {
-			/* Failover triggered by the interface ifdown */
-			t3_write_reg(adapter, A_XGM_TX_CTRL + mac->offset,
-				     F_TXEN);
-			t3_read_reg(adapter, A_XGM_TX_CTRL + mac->offset);
-		} else {
-			/* Failover triggered by the interface link down */
-			t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0);
-			t3_read_reg(adapter, A_XGM_RX_CTRL + mac->offset);
-			t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset,
-				     F_RXEN);
-		}
-	}
-}
-
-static int
-cxgb_ulp_iscsi_ctl(adapter_t *adapter, unsigned int req, void *data)
-{
-	int ret = 0;
-	struct ulp_iscsi_info *uiip = data;
-
-	switch (req) {
-	case ULP_ISCSI_GET_PARAMS:
-		uiip->llimit = t3_read_reg(adapter, A_ULPRX_ISCSI_LLIMIT);
-		uiip->ulimit = t3_read_reg(adapter, A_ULPRX_ISCSI_ULIMIT);
-		uiip->tagmask = t3_read_reg(adapter, A_ULPRX_ISCSI_TAGMASK);
-		/*
-		 * On tx, the iscsi pdu has to be <= tx page size and has to
-		 * fit into the Tx PM FIFO.
-		 */
-		uiip->max_txsz = min(adapter->params.tp.tx_pg_size,
-				     t3_read_reg(adapter, A_PM1_TX_CFG) >> 17);
-		/* on rx, the iscsi pdu has to be < rx page size and the
-		   whole pdu + cpl headers has to fit into one sge buffer */
-		/* also check the max rx data length programmed in TP */
-		uiip->max_rxsz = min(uiip->max_rxsz,
-				     ((t3_read_reg(adapter, A_TP_PARA_REG2))
-					>> S_MAXRXDATA) & M_MAXRXDATA);
-		break;
-	case ULP_ISCSI_SET_PARAMS:
-		t3_write_reg(adapter, A_ULPRX_ISCSI_TAGMASK, uiip->tagmask);
-		break;
-	default:
-		ret = (EOPNOTSUPP);
-	}
-	return ret;
-}
-
-/* Response queue used for RDMA events. */
-#define ASYNC_NOTIF_RSPQ 0
-
-static int
-cxgb_rdma_ctl(adapter_t *adapter, unsigned int req, void *data)
-{
-	int ret = 0;
-
-	switch (req) {
-	case RDMA_GET_PARAMS: {
-		struct rdma_info *req = data;
-
-		req->udbell_physbase = rman_get_start(adapter->udbs_res);
-		req->udbell_len = rman_get_size(adapter->udbs_res);
-		req->tpt_base = t3_read_reg(adapter, A_ULPTX_TPT_LLIMIT);
-		req->tpt_top  = t3_read_reg(adapter, A_ULPTX_TPT_ULIMIT);
-		req->pbl_base = t3_read_reg(adapter, A_ULPTX_PBL_LLIMIT);
-		req->pbl_top  = t3_read_reg(adapter, A_ULPTX_PBL_ULIMIT);
-		req->rqt_base = t3_read_reg(adapter, A_ULPRX_RQ_LLIMIT);
-		req->rqt_top  = t3_read_reg(adapter, A_ULPRX_RQ_ULIMIT);
-		req->kdb_addr =  (void *)((unsigned long)rman_get_virtual(adapter->regs_res) + A_SG_KDOORBELL);		break;
-	}
-	case RDMA_CQ_OP: {
-		struct rdma_cq_op *req = data;
-
-		/* may be called in any context */
-		mtx_lock_spin(&adapter->sge.reg_lock);
-		ret = t3_sge_cqcntxt_op(adapter, req->id, req->op,
-					req->credits);
-		mtx_unlock_spin(&adapter->sge.reg_lock);
-		break;
-	}
-	case RDMA_GET_MEM: {
-		struct ch_mem_range *t = data;
-		struct mc7 *mem;
-
-		if ((t->addr & 7) || (t->len & 7))
-			return (EINVAL);
-		if (t->mem_id == MEM_CM)
-			mem = &adapter->cm;
-		else if (t->mem_id == MEM_PMRX)
-			mem = &adapter->pmrx;
-		else if (t->mem_id == MEM_PMTX)
-			mem = &adapter->pmtx;
-		else
-			return (EINVAL);
-
-		ret = t3_mc7_bd_read(mem, t->addr/8, t->len/8, (u64 *)t->buf);
-		if (ret)
-			return (ret);
-		break;
-	}
-	case RDMA_CQ_SETUP: {
-		struct rdma_cq_setup *req = data;
-
-		mtx_lock_spin(&adapter->sge.reg_lock);
-		ret = t3_sge_init_cqcntxt(adapter, req->id, req->base_addr,
-					  req->size, ASYNC_NOTIF_RSPQ,
-					  req->ovfl_mode, req->credits,
-					  req->credit_thres);
-		mtx_unlock_spin(&adapter->sge.reg_lock);
-		break;
-	}
-	case RDMA_CQ_DISABLE:
-		mtx_lock_spin(&adapter->sge.reg_lock);
-		ret = t3_sge_disable_cqcntxt(adapter, *(unsigned int *)data);
-		mtx_unlock_spin(&adapter->sge.reg_lock);
-		break;
-	case RDMA_CTRL_QP_SETUP: {
-		struct rdma_ctrlqp_setup *req = data;
-
-		mtx_lock_spin(&adapter->sge.reg_lock);
-		ret = t3_sge_init_ecntxt(adapter, FW_RI_SGEEC_START, 0,
-					 SGE_CNTXT_RDMA, ASYNC_NOTIF_RSPQ,
-					 req->base_addr, req->size,
-					 FW_RI_TID_START, 1, 0);
-		mtx_unlock_spin(&adapter->sge.reg_lock);
-		break;
-	}
-	default:
-		ret = EOPNOTSUPP;
-	}
-	return (ret);
-}
-
-static int
-cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data)
-{
-	struct adapter *adapter = tdev2adap(tdev);
-	struct tid_range *tid;
-	struct mtutab *mtup;
-	struct iff_mac *iffmacp;
-	struct ddp_params *ddpp;
-	struct adap_ports *ports;
-	struct ofld_page_info *rx_page_info;
-	struct tp_params *tp = &adapter->params.tp;
-	int port;
-
-	switch (req) {
-	case GET_MAX_OUTSTANDING_WR:
-		*(unsigned int *)data = FW_WR_NUM;
-		break;
-	case GET_WR_LEN:
-		*(unsigned int *)data = WR_FLITS;
-		break;
-	case GET_TX_MAX_CHUNK:
-		*(unsigned int *)data = 1 << 20;  /* 1MB */
-		break;
-	case GET_TID_RANGE:
-		tid = data;
-		tid->num = t3_mc5_size(&adapter->mc5) -
-			adapter->params.mc5.nroutes -
-			adapter->params.mc5.nfilters -
-			adapter->params.mc5.nservers;
-		tid->base = 0;
-		break;
-	case GET_STID_RANGE:
-		tid = data;
-		tid->num = adapter->params.mc5.nservers;
-		tid->base = t3_mc5_size(&adapter->mc5) - tid->num -
-			adapter->params.mc5.nfilters -
-			adapter->params.mc5.nroutes;
-		break;
-	case GET_L2T_CAPACITY:
-		*(unsigned int *)data = 2048;
-		break;
-	case GET_MTUS:
-		mtup = data;
-		mtup->size = NMTUS;
-		mtup->mtus = adapter->params.mtus;
-		break;
-	case GET_IFF_FROM_MAC:
-		iffmacp = data;
-		iffmacp->dev = get_iff_from_mac(adapter, iffmacp->mac_addr,
-					  iffmacp->vlan_tag & EVL_VLID_MASK);
-		break;
-	case GET_DDP_PARAMS:
-		ddpp = data;
-		ddpp->llimit = t3_read_reg(adapter, A_ULPRX_TDDP_LLIMIT);
-		ddpp->ulimit = t3_read_reg(adapter, A_ULPRX_TDDP_ULIMIT);
-		ddpp->tag_mask = t3_read_reg(adapter, A_ULPRX_TDDP_TAGMASK);
-		break;
-	case GET_PORTS:
-		ports = data;
-		ports->nports   = adapter->params.nports;
-		for_each_port(adapter, port)
-			ports->lldevs[port] = adapter->port[port].ifp;
-		break;
-	case FAILOVER:
-		port = *(int *)data;
-		t3_port_failover(adapter, port);
-		failover_fixup(adapter, port);
-		break;
-	case FAILOVER_DONE:
-		port = *(int *)data;
-		t3_failover_done(adapter, port);
-		break;
-	case FAILOVER_CLEAR:
-		t3_failover_clear(adapter);
-		break;
-	case GET_RX_PAGE_INFO:
-		rx_page_info = data;
-		rx_page_info->page_size = tp->rx_pg_size;
-		rx_page_info->num = tp->rx_num_pgs;
-		break;
-	case ULP_ISCSI_GET_PARAMS:
-	case ULP_ISCSI_SET_PARAMS:
-		if (!offload_running(adapter))
-			return (EAGAIN);
-		return cxgb_ulp_iscsi_ctl(adapter, req, data);
-	case RDMA_GET_PARAMS:
-	case RDMA_CQ_OP:
-	case RDMA_CQ_SETUP:
-	case RDMA_CQ_DISABLE:
-	case RDMA_CTRL_QP_SETUP:
-	case RDMA_GET_MEM:
-		if (!offload_running(adapter))
-			return (EAGAIN);
-		return cxgb_rdma_ctl(adapter, req, data);
-	default:
-		return (EOPNOTSUPP);
-	}
-	return 0;
-}
-
-/*
- * Allocate a TOM data structure,
- * initialize its cpl_handlers
- * and register it as a T3C client
- */
-static void
-t3c_tom_add(struct t3cdev *cdev)
-{
-	int i;
-	unsigned int wr_len;
-	struct tom_data *t;
-	struct toedev *tdev;
-	struct adap_ports *port_info;
-
-	t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO);
-	if (t == NULL)
-		return;
-
-	cdev->send = t3_offload_tx;
-	cdev->ctl = cxgb_offload_ctl;
-	
-	if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0)
-		goto out_free_tom;
-
-	port_info = malloc(sizeof(*port_info), M_CXGB, M_NOWAIT|M_ZERO);
-	if (!port_info)
-		goto out_free_tom;
-
-	if (cdev->ctl(cdev, GET_PORTS, port_info) < 0)
-		goto out_free_all;
-
-	t3_init_wr_tab(wr_len);
-	t->cdev = cdev;
-	t->client = &t3c_tom_client;
-
-	/* Register TCP offload device */
-	tdev = &t->tdev;
-	tdev->tod_ttid = cdev2type(cdev);
-	tdev->tod_lldev = cdev->lldev;
-	
-	if (register_toedev(tdev, "toe%d")) {
-		printf("unable to register offload device");
-		goto out_free_all;
-	}
-	TOM_DATA(tdev) = t;
-
-	for (i = 0; i < port_info->nports; i++) {
-		struct ifnet *ifp = port_info->lldevs[i];
-		TOEDEV(ifp) = tdev;
-
-		CTR1(KTR_TOM, "enabling toe on %p", ifp);
-		ifp->if_capabilities |= IFCAP_TOE4;
-		ifp->if_capenable |= IFCAP_TOE4;
-	}
-	t->ports = port_info;
-
-	/* Add device to the list of offload devices */
-	t3cdev_add(t);
-
-	/* Activate TCP offload device */
-	cxgb_offload_activate(TOM_DATA(tdev)->cdev->adapter);
-
-	activate_offload(tdev);
-	cxgb_register_listeners();
-	return;
-
-out_free_all:
-	printf("out_free_all fail\n");
-	free(port_info, M_CXGB);
-out_free_tom:
-	printf("out_free_tom fail\n");
-	free(t, M_CXGB);
-	return;
-}
-
-
-
-static int
-do_act_open_rpl(struct t3cdev *dev, struct mbuf *m)
-{
-	struct cpl_act_open_rpl *rpl = cplhdr(m);
-	unsigned int atid = G_TID(ntohl(rpl->atid));
-	struct toe_tid_entry *toe_tid;
-
-	toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid);
-	if (toe_tid->ctx && toe_tid->client && toe_tid->client->handlers &&
-		toe_tid->client->handlers[CPL_ACT_OPEN_RPL]) {
-		return toe_tid->client->handlers[CPL_ACT_OPEN_RPL] (dev, m,
-			toe_tid->ctx);
-	} else {
-		log(LOG_ERR, "%s: received clientless CPL command 0x%x\n",
-			dev->name, CPL_ACT_OPEN_RPL);
-		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
-	}
-}
-
-static int
-do_stid_rpl(struct t3cdev *dev, struct mbuf *m)
-{
-	union opcode_tid *p = cplhdr(m);
-	unsigned int stid = G_TID(ntohl(p->opcode_tid));
-	struct toe_tid_entry *toe_tid;
-
-	toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid);
-	if (toe_tid->ctx && toe_tid->client->handlers &&
-		toe_tid->client->handlers[p->opcode]) {
-		return toe_tid->client->handlers[p->opcode] (dev, m, toe_tid->ctx);
-	} else {
-		log(LOG_ERR, "%s: received clientless CPL command 0x%x\n",
-			dev->name, p->opcode);
-		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
-	}
-}
-
-static int
-do_hwtid_rpl(struct t3cdev *dev, struct mbuf *m)
-{
-	union opcode_tid *p = cplhdr(m);
-	unsigned int hwtid;
-	struct toe_tid_entry *toe_tid;
-	
-	DPRINTF("do_hwtid_rpl opcode=0x%x\n", p->opcode);
-	hwtid = G_TID(ntohl(p->opcode_tid));
-
-	toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid);
-	if (toe_tid->ctx && toe_tid->client->handlers &&
-		toe_tid->client->handlers[p->opcode]) {
-		return toe_tid->client->handlers[p->opcode]
-						(dev, m, toe_tid->ctx);
-	} else {
-		log(LOG_ERR, "%s: received clientless CPL command 0x%x\n",
-			dev->name, p->opcode);
-		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
-	}
-}
-
-static int
-do_cr(struct t3cdev *dev, struct mbuf *m)
-{
-	struct cpl_pass_accept_req *req = cplhdr(m);
-	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
-	struct toe_tid_entry *toe_tid;
-
-	toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid);
-	if (toe_tid->ctx && toe_tid->client->handlers &&
-		toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ]) {
-		return toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ]
-						(dev, m, toe_tid->ctx);
-	} else {
-		log(LOG_ERR, "%s: received clientless CPL command 0x%x\n",
-			dev->name, CPL_PASS_ACCEPT_REQ);
-		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
-	}
-}
-
-static int
-do_abort_req_rss(struct t3cdev *dev, struct mbuf *m)
-{
-	union opcode_tid *p = cplhdr(m);
-	unsigned int hwtid = G_TID(ntohl(p->opcode_tid));
-	struct toe_tid_entry *toe_tid;
-
-	toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid);
-	if (toe_tid->ctx && toe_tid->client->handlers &&
-		toe_tid->client->handlers[p->opcode]) {
-		return toe_tid->client->handlers[p->opcode]
-						(dev, m, toe_tid->ctx);
-	} else {
-		struct cpl_abort_req_rss *req = cplhdr(m);
-		struct cpl_abort_rpl *rpl;
-		
-		struct mbuf *m = m_get(M_NOWAIT, MT_DATA);
-		if (!m) {
-			log(LOG_NOTICE, "do_abort_req_rss: couldn't get mbuf!\n");
-			goto out;
-		}
-
-		m_set_priority(m, CPL_PRIORITY_DATA);
-		rpl = cplhdr(m);
-		rpl->wr.wr_hi = 
-			htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
-		rpl->wr.wr_lo = htonl(V_WR_TID(GET_TID(req)));
-		OPCODE_TID(rpl) =
-			htonl(MK_OPCODE_TID(CPL_ABORT_RPL, GET_TID(req)));
-		rpl->cmd = req->status;
-		cxgb_ofld_send(dev, m);
- out:
-		return (CPL_RET_BUF_DONE);
-	}
-}
-
-static int
-do_act_establish(struct t3cdev *dev, struct mbuf *m)
-{
-	struct cpl_act_establish *req;
-	unsigned int atid;
-	struct toe_tid_entry *toe_tid;
-
-	req = cplhdr(m);
-	atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
-	toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid);
-	if (toe_tid && toe_tid->ctx && toe_tid->client->handlers &&
-		toe_tid->client->handlers[CPL_ACT_ESTABLISH]) {
-		
-		return toe_tid->client->handlers[CPL_ACT_ESTABLISH]
-						(dev, m, toe_tid->ctx);
-	} else {
-	
-		log(LOG_ERR, "%s: received clientless CPL command 0x%x\n",
-			dev->name, CPL_ACT_ESTABLISH);
-		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
-	}
-}
-
-
-static int
-do_term(struct t3cdev *dev, struct mbuf *m)
-{
-	unsigned int hwtid = ntohl(m_get_priority(m)) >> 8 & 0xfffff;
-	unsigned int opcode = G_OPCODE(ntohl(m->m_pkthdr.csum_data));
-	struct toe_tid_entry *toe_tid;
-
-	toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid);
-	if (toe_tid && toe_tid->ctx && toe_tid->client->handlers &&
-		toe_tid->client->handlers[opcode]) {
-		return toe_tid->client->handlers[opcode](dev, m, toe_tid->ctx);
-	} else {
-		log(LOG_ERR, "%s: received clientless CPL command 0x%x\n",
-			dev->name, opcode);
-		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
-	}
-	return (0);
-}
-
-/*
- * Process a received packet with an unknown/unexpected CPL opcode.
- */
-static int
-do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name,
-	    0xFF & *mtod(m, unsigned int *));
-	return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG);
-}
-
-/*
- * Add a new handler to the CPL dispatch table.  A NULL handler may be supplied
- * to unregister an existing handler.
- */
-void
-t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h)
-{
-	if (opcode < UCHAR_MAX)
-		tom_cpl_handlers[opcode] = h ? h : do_bad_cpl;
-	else
-		log(LOG_ERR, "Chelsio T3 TOM: handler registration for "
-		       "opcode %u failed\n", opcode);
-}
-
-/*
- * Make a preliminary determination if a connection can be offloaded.  It's OK
- * to fail the offload later if we say we can offload here.  For now this
- * always accepts the offload request unless there are IP options.
- */
-static int
-can_offload(struct toedev *dev, struct socket *so)
-{
-	struct tom_data *tomd = TOM_DATA(dev);
-	struct t3cdev *cdev = T3CDEV(dev->tod_lldev);
-	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
-
-	return so_sotoinpcb(so)->inp_depend4.inp4_options == NULL &&
-	    tomd->conf.activated &&
-	    (tomd->conf.max_conn < 0 ||
-	     atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn);
-}
-
-static int
-tom_ctl(struct toedev *dev, unsigned int req, void *data)
-{
-	struct tom_data *t = TOM_DATA(dev);
-	struct t3cdev *cdev = t->cdev;
-
-	if (cdev->ctl)
-		return cdev->ctl(cdev, req, data);
-
-	return (EOPNOTSUPP);
-}
-
-/*
- * Free an active-open TID.
- */
-void *
-cxgb_free_atid(struct t3cdev *tdev, int atid)
-{
-	struct tid_info *t = &(T3C_DATA(tdev))->tid_maps;
-	union active_open_entry *p = atid2entry(t, atid);
-	void *ctx = p->toe_tid.ctx;
-
-	mtx_lock(&t->atid_lock);
-	p->next = t->afree;
-	t->afree = p;
-	t->atids_in_use--;
-	mtx_unlock(&t->atid_lock);
-
-	return ctx;
-}
-
-/*
- * Free a server TID and return it to the free pool.
- */
-void
-cxgb_free_stid(struct t3cdev *tdev, int stid)
-{
-	struct tid_info *t = &(T3C_DATA (tdev))->tid_maps;
-	union listen_entry *p = stid2entry(t, stid);
-
-	mtx_lock(&t->stid_lock);
-	p->next = t->sfree;
-	t->sfree = p;
-	t->stids_in_use--;
-	mtx_unlock(&t->stid_lock);
-}
-
-/*
- * Free a server TID and return it to the free pool.
- */
-void *
-cxgb_get_lctx(struct t3cdev *tdev, int stid)
-{
-	struct tid_info *t = &(T3C_DATA (tdev))->tid_maps;
-	union listen_entry *p = stid2entry(t, stid);
-
-	return (p->toe_tid.ctx);
-}
-
-void
-cxgb_insert_tid(struct t3cdev *tdev, struct cxgb_client *client,
-	void *ctx, unsigned int tid)
-{
-	struct tid_info *t = &(T3C_DATA (tdev))->tid_maps;
-
-	t->tid_tab[tid].client = client;
-	t->tid_tab[tid].ctx = ctx;
-	atomic_add_int(&t->tids_in_use, 1);
-}
-
-/* use ctx as a next pointer in the tid release list */
-void
-cxgb_queue_tid_release(struct t3cdev *tdev, unsigned int tid)
-{
-	struct t3c_data *td = T3C_DATA (tdev);
-	struct toe_tid_entry *p = &td->tid_maps.tid_tab[tid];
-	
-	CTR0(KTR_TOM, "queuing tid release\n");
-	
-	mtx_lock(&td->tid_release_lock);
-	p->ctx = td->tid_release_list;
-	td->tid_release_list = p;
-
-	if (!p->ctx)
-		taskqueue_enqueue(tdev->adapter->tq, &td->tid_release_task);
-
-	mtx_unlock(&td->tid_release_lock);
-}
-
-/*
- * Remove a tid from the TID table.  A client may defer processing its last
- * CPL message if it is locked at the time it arrives, and while the message
- * sits in the client's backlog the TID may be reused for another connection.
- * To handle this we atomically switch the TID association if it still points
- * to the original client context.
- */
-void
-cxgb_remove_tid(struct t3cdev *tdev, void *ctx, unsigned int tid)
-{
-	struct tid_info *t = &(T3C_DATA (tdev))->tid_maps;
-
-	if (tid >= t->ntids)
-		panic("tid=%d >= t->ntids=%d", tid, t->ntids);
-	
-	if (tdev->type == T3A)
-		atomic_cmpset_ptr((uintptr_t *)&t->tid_tab[tid].ctx, (long)NULL, (long)ctx);
-	else {
-		struct mbuf *m;
-
-		m = m_get(M_NOWAIT, MT_DATA);
-		if (__predict_true(m != NULL)) {
-			mk_tid_release(m, tid);
-			CTR1(KTR_CXGB, "releasing tid=%u", tid);
-			
-			cxgb_ofld_send(tdev, m);
-			t->tid_tab[tid].ctx = NULL;
-		} else
-			cxgb_queue_tid_release(tdev, tid);
-	}
-	atomic_add_int(&t->tids_in_use, -1);
-}
-
-int
-cxgb_alloc_atid(struct t3cdev *tdev, struct cxgb_client *client,
-		     void *ctx)
-{
-	int atid = -1;
-	struct tid_info *t = &(T3C_DATA (tdev))->tid_maps;
-
-	mtx_lock(&t->atid_lock);
-	if (t->afree) {
-		union active_open_entry *p = t->afree;
-
-		atid = (p - t->atid_tab) + t->atid_base;
-		t->afree = p->next;
-		p->toe_tid.ctx = ctx;
-		p->toe_tid.client = client;
-		t->atids_in_use++;
-	}
-	mtx_unlock(&t->atid_lock);
-	return atid;
-}
-
-int
-cxgb_alloc_stid(struct t3cdev *tdev, struct cxgb_client *client,
-		     void *ctx)
-{
-	int stid = -1;
-	struct tid_info *t = &(T3C_DATA (tdev))->tid_maps;
-
-	mtx_lock(&t->stid_lock);
-	if (t->sfree) {
-		union listen_entry *p = t->sfree;
-
-		stid = (p - t->stid_tab) + t->stid_base;
-		t->sfree = p->next;
-		p->toe_tid.ctx = ctx;
-		p->toe_tid.client = client;
-		t->stids_in_use++;
-	}
-	mtx_unlock(&t->stid_lock);
-	return stid;
-}
-
-
-static int
-is_offloading(struct ifnet *ifp)
-{
-	struct adapter *adapter;
-	int port;
-
-	rw_rlock(&adapter_list_lock);
-	TAILQ_FOREACH(adapter, &adapter_list, adapter_entry) {
-		for_each_port(adapter, port) {
-			if (ifp == adapter->port[port].ifp) {
-				rw_runlock(&adapter_list_lock);
-				return 1;
-			}
-		}
-	}
-	rw_runlock(&adapter_list_lock);
-	return 0;
-}
-
-
-static void
-cxgb_arp_update_event(void *unused, struct rtentry *rt0,
-    uint8_t *enaddr, struct sockaddr *sa)
-{
-
-	if (!is_offloading(rt0->rt_ifp))
-		return;
-
-	RT_ADDREF(rt0);
-	RT_UNLOCK(rt0);
-	cxgb_neigh_update(rt0, enaddr, sa);
-	RT_LOCK(rt0);
-	RT_REMREF(rt0);
-}
-
-static void
-cxgb_redirect_event(void *unused, int event, struct rtentry *rt0,
-    struct rtentry *rt1, struct sockaddr *sa)
-{
-	/* 
-	 * ignore events on non-offloaded interfaces
-	 */
-	if (!is_offloading(rt0->rt_ifp))
-		return;
-
-	/*
-	 * Cannot redirect to non-offload device.
-	 */
-	if (!is_offloading(rt1->rt_ifp)) {
-		log(LOG_WARNING, "%s: Redirect to non-offload"
-		    "device ignored.\n", __FUNCTION__);
-		return;
-	}
-
-        /*
-	 * avoid LORs by dropping the route lock but keeping a reference
-	 * 
-	 */
-	RT_ADDREF(rt0);
-	RT_UNLOCK(rt0);
-	RT_ADDREF(rt1);
-	RT_UNLOCK(rt1);
-	
-	cxgb_redirect(rt0, rt1, sa);
-	cxgb_neigh_update(rt1, NULL, sa);
-
-	RT_LOCK(rt0);
-	RT_REMREF(rt0);
-	RT_LOCK(rt1);
-	RT_REMREF(rt1);
-}
-
-void
-cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa)
-{
-
-	if (rt->rt_ifp && is_offloading(rt->rt_ifp) && (rt->rt_ifp->if_flags & IFCAP_TOE)) {
-		struct t3cdev *tdev = T3CDEV(rt->rt_ifp);
-
-		PANIC_IF(!tdev);
-		t3_l2t_update(tdev, rt, enaddr, sa);
-	}
-}
-
-static void
-set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e)
-{
-	struct mbuf *m;
-	struct cpl_set_tcb_field *req;
-
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (!m) {
-		log(LOG_ERR, "%s: cannot allocate mbuf!\n", __FUNCTION__);
-		return;
-	}
-	
-	m_set_priority(m, CPL_PRIORITY_CONTROL);
-	req = mtod(m, struct cpl_set_tcb_field *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
-	req->reply = 0;
-	req->cpu_idx = 0;
-	req->word = htons(W_TCB_L2T_IX);
-	req->mask = htobe64(V_TCB_L2T_IX(M_TCB_L2T_IX));
-	req->val = htobe64(V_TCB_L2T_IX(e->idx));
-	tdev->send(tdev, m);
-}
-
-void
-cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa)
-{
-	struct ifnet *olddev, *newdev;
-	struct tid_info *ti;
-	struct t3cdev *tdev;
-	u32 tid;
-	int update_tcb;
-	struct l2t_entry *e;
-	struct toe_tid_entry *te;
-
-	olddev = old->rt_ifp;
-	newdev = new->rt_ifp;
-	if (!is_offloading(olddev))
-		return;
-	if (!is_offloading(newdev)) {
-		log(LOG_WARNING, "%s: Redirect to non-offload"
-		    "device ignored.\n", __FUNCTION__);
-		return;
-	}
-	tdev = T3CDEV(olddev);
-	PANIC_IF(!tdev);
-	if (tdev != T3CDEV(newdev)) {
-		log(LOG_WARNING, "%s: Redirect to different "
-		    "offload device ignored.\n", __FUNCTION__);
-		return;
-	}
-
-	/* Add new L2T entry */
-	e = t3_l2t_get(tdev, new, new->rt_ifp, sa);
-	if (!e) {
-		log(LOG_ERR, "%s: couldn't allocate new l2t entry!\n",
-		       __FUNCTION__);
-		return;
-	}
-
-	/* Walk tid table and notify clients of dst change. */
-	ti = &(T3C_DATA (tdev))->tid_maps;
-	for (tid=0; tid < ti->ntids; tid++) {
-		te = lookup_tid(ti, tid);
-		PANIC_IF(!te);
-		if (te->ctx && te->client && te->client->redirect) {
-			update_tcb = te->client->redirect(te->ctx, old, new,
-							  e);
-			if (update_tcb)  {
-				l2t_hold(L2DATA(tdev), e);
-				set_l2t_ix(tdev, tid, e);
-			}
-		}
-	}
-	l2t_release(L2DATA(tdev), e);
-}
-
-/*
- * Initialize the CPL dispatch table.
- */
-static void
-init_cpl_handlers(void)
-{
-	int i;
-
-	for (i = 0; i < 256; ++i)
-		tom_cpl_handlers[i] = do_bad_cpl;
-
-	t3_init_listen_cpl_handlers();
-}
-
-static int
-t3_toe_attach(struct toedev *dev, const struct offload_id *entry)
-{
-	struct tom_data *t = TOM_DATA(dev);
-	struct t3cdev *cdev = t->cdev;
-	struct ddp_params ddp;
-	struct ofld_page_info rx_page_info;
-	int err;
-	
-	t3_init_tunables(t);
-	mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF);
-	CTR2(KTR_TOM, "t3_toe_attach dev=%p entry=%p", dev, entry);
-
-	dev->tod_can_offload = can_offload;
-	dev->tod_connect = t3_connect;
-	dev->tod_ctl = tom_ctl;
-#if 0	
-	dev->tod_failover = t3_failover;
-#endif
-	err = cdev->ctl(cdev, GET_DDP_PARAMS, &ddp);
-	if (err)
-		return err;
-
-	err = cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info);
-	if (err)
-		return err;
-
-	t->ddp_llimit = ddp.llimit;
-	t->ddp_ulimit = ddp.ulimit;
-	t->pdev = ddp.pdev;
-	t->rx_page_size = rx_page_info.page_size;
-	/* OK if this fails, we just can't do DDP */
-	t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE;
-	t->ppod_map = malloc(t->nppods, M_DEVBUF, M_NOWAIT|M_ZERO);
-
-	mtx_init(&t->ppod_map_lock, "ppod map", NULL, MTX_DEF);
-
-
-	t3_sysctl_register(cdev->adapter, &t->conf);
 	return (0);
 }
 
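+/* Free the TID tables; safe to call on partially initialized state. */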
 static void
-cxgb_toe_listen_start(void *unused, struct tcpcb *tp)
+free_tid_tabs(struct tid_info *t)
 {
-	struct socket *so = inp_inpcbtosocket(tp->t_inpcb);
-	struct tom_data *p;
-	
-	mtx_lock(&cxgb_list_lock);
-	TAILQ_FOREACH(p, &cxgb_list, entry) {
-			t3_listen_start(&p->tdev, so, p->cdev);
+	if (mtx_initialized(&t->stid_lock))
+		mtx_destroy(&t->stid_lock);
+	if (mtx_initialized(&t->atid_lock))
+		mtx_destroy(&t->atid_lock);
+	free(t->tid_tab, M_CXGB);
+}
+
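+/*
+ * Program the SMT entry for the given port with that port's MAC address,
+ * using a CPL_SMT_WRITE_REQ.
+ */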
+static int
+write_smt_entry(struct adapter *sc, int idx)
+{
+	struct port_info *pi = &sc->port[idx];
+	struct cpl_smt_write_req *req;
+	struct mbuf *m;
+
+	m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, req);
+	if (m == NULL) {
+		log(LOG_ERR, "%s: no mbuf, can't write SMT entry for %d\n",
+		    __func__, idx);
+		return (ENOMEM);
 	}
-	mtx_unlock(&cxgb_list_lock);
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx));
+	req->mtu_idx = NMTUS - 1;  /* should be 0 but there's a T3 bug */
+	req->iff = idx;
+	memset(req->src_mac1, 0, sizeof(req->src_mac1));
+	memcpy(req->src_mac0, pi->hw_addr, ETHER_ADDR_LEN);
+
+	t3_offload_tx(sc, m);
+
+	return (0);
 }
 
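+/* Release everything held by the per-adapter TOM softc. */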
 static void
-cxgb_toe_listen_stop(void *unused, struct tcpcb *tp)
+free_tom_data(struct tom_data *td)
 {
-	struct socket *so = inp_inpcbtosocket(tp->t_inpcb);
-	struct tom_data *p;
-	
-	mtx_lock(&cxgb_list_lock);
-	TAILQ_FOREACH(p, &cxgb_list, entry) {
-		if (tp->t_state == TCPS_LISTEN)
-			t3_listen_stop(&p->tdev, so, p->cdev);
+	KASSERT(TAILQ_EMPTY(&td->toep_list),
+	    ("%s: toep_list not empty", __func__));
+
+	if (td->listen_mask != 0)
+		hashdestroy(td->listen_hash, M_CXGB, td->listen_mask);
+
+	if (mtx_initialized(&td->toep_list_lock))
+		mtx_destroy(&td->toep_list_lock);
+	if (mtx_initialized(&td->lctx_hash_lock))
+		mtx_destroy(&td->lctx_hash_lock);
+	if (mtx_initialized(&td->tid_release_lock))
+		mtx_destroy(&td->tid_release_lock);
+	if (td->l2t)
+		t3_free_l2t(td->l2t);
+	free_tid_tabs(&td->tid_maps);
+	free(td, M_CXGB);
+}
+
+/*
+ * Ground control to Major TOM
+ * Commencing countdown, engines on
+ */
+static int
+t3_tom_activate(struct adapter *sc)
+{
+	struct tom_data *td;
+	struct toedev *tod;
+	int i, rc = 0;
+	struct mc5_params *mc5 = &sc->params.mc5;
+	u_int ntids, natids, mtus;
+
+	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */
+
+	/* per-adapter softc for TOM */
+	td = malloc(sizeof(*td), M_CXGB, M_ZERO | M_NOWAIT);
+	if (td == NULL)
+		return (ENOMEM);
+
+	/* List of TOE PCBs and associated lock */
+	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
+	TAILQ_INIT(&td->toep_list);
+
+	/* Listen context */
+	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
+	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGB,
+	    &td->listen_mask, HASH_NOWAIT);
+
+	/* TID release task */
+	TASK_INIT(&td->tid_release_task, 0, t3_process_tid_release_list, td);
+	mtx_init(&td->tid_release_lock, "tid release", NULL, MTX_DEF);
+
+	/* L2 table */
+	td->l2t = t3_init_l2t(L2T_SIZE);
+	if (td->l2t == NULL) {
+		rc = ENOMEM;
+		goto done;
 	}
-	mtx_unlock(&cxgb_list_lock);
+
+	/* TID tables */
+	ntids = t3_mc5_size(&sc->mc5) - mc5->nroutes - mc5->nfilters -
+	    mc5->nservers;
+	natids = min(ntids / 2, 64 * 1024);
+	rc = alloc_tid_tabs(&td->tid_maps, ntids, natids, mc5->nservers,
+	    0x100000 /* ATID_BASE */, ntids);
+	if (rc != 0)
+		goto done;
+
+	/* CPL handlers */
+	t3_init_listen_cpl_handlers(sc);
+	t3_init_l2t_cpl_handlers(sc);
+	t3_init_cpl_io(sc);
+
+	/* toedev ops */
+	tod = &td->tod;
+	init_toedev(tod);
+	tod->tod_softc = sc;
+	tod->tod_connect = t3_connect;
+	tod->tod_listen_start = t3_listen_start;
+	tod->tod_listen_stop = t3_listen_stop;
+	tod->tod_rcvd = t3_rcvd;
+	tod->tod_output = t3_tod_output;
+	tod->tod_send_rst = t3_send_rst;
+	tod->tod_send_fin = t3_send_fin;
+	tod->tod_pcb_detach = t3_pcb_detach;
+	tod->tod_l2_update = t3_l2_update;
+	tod->tod_syncache_added = t3_syncache_added;
+	tod->tod_syncache_removed = t3_syncache_removed;
+	tod->tod_syncache_respond = t3_syncache_respond;
+	tod->tod_offload_socket = t3_offload_socket;
+
+	/* port MTUs */
+	mtus = sc->port[0].ifp->if_mtu;
+	if (sc->params.nports > 1)
+		mtus |= sc->port[1].ifp->if_mtu << 16;
+	t3_write_reg(sc, A_TP_MTU_PORT_TABLE, mtus);
+	t3_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd,
+	    sc->params.rev == 0 ? sc->port[0].ifp->if_mtu : 0xffff);
+
+	/* SMT entry for each port */
+	for_each_port(sc, i) {
+		write_smt_entry(sc, i);
+		TOEDEV(sc->port[i].ifp) = &td->tod;
+	}
+
+	/* Switch TP to offload mode */
+	t3_tp_set_offload_mode(sc, 1);
+
+	sc->tom_softc = td;
+	sc->flags |= TOM_INIT_DONE;
+	register_toedev(tod);
+
+done:
+	if (rc != 0)
+		free_tom_data(td);
+
+	return (rc);
+}
+
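+/*
+ * Deactivation fails with EBUSY if TOE is still enabled on any port, or if
+ * any offloaded connection or listener remains.
+ */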
+static int
+t3_tom_deactivate(struct adapter *sc)
+{
+	int rc = 0;
+	struct tom_data *td = sc->tom_softc;
+
+	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */
+
+	if (td == NULL)
+		return (0);	/* XXX. KASSERT? */
+
+	if (sc->offload_map != 0)
+		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */
+
+	mtx_lock(&td->toep_list_lock);
+	if (!TAILQ_EMPTY(&td->toep_list))
+		rc = EBUSY;
+	mtx_unlock(&td->toep_list_lock);
+
+	mtx_lock(&td->lctx_hash_lock);
+	if (td->lctx_count > 0)
+		rc = EBUSY;
+	mtx_unlock(&td->lctx_hash_lock);
+
+	if (rc == 0) {
+		unregister_toedev(&td->tod);
+		t3_tp_set_offload_mode(sc, 0);
+		free_tom_data(td);
+		sc->tom_softc = NULL;
+		sc->flags &= ~TOM_INIT_DONE;
+	}
+
+	return (rc);
+}
+
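+/* Register the TOM ULD with the base driver. */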
+static int
+t3_tom_mod_load(void)
+{
+	int rc;
+
+	rc = t3_register_uld(&tom_uld_info);
+	if (rc != 0)
+		t3_tom_mod_unload();
+
+	return (rc);
 }
 
 static void
-cxgb_toe_listen_start_handler(struct inpcb *inp, void *arg)
+tom_uninit(struct adapter *sc, void *arg __unused)
 {
-	struct tcpcb *tp = intotcpcb(inp);
-
-	if (tp->t_state == TCPS_LISTEN)
-		cxgb_toe_listen_start(NULL, tp);
-}
-
-static void
-cxgb_register_listeners(void)
-{
-
-	inp_apply_all(cxgb_toe_listen_start_handler, NULL);
+	/* Try to free resources (works only if no port has IFCAP_TOE) */
+	ADAPTER_LOCK(sc);
+	if (sc->flags & TOM_INIT_DONE)
+		t3_deactivate_uld(sc, ULD_TOM);
+	ADAPTER_UNLOCK(sc);
 }
 
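+/* Detach TOM from every adapter, then unregister the ULD. */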
 static int
-t3_tom_init(void)
+t3_tom_mod_unload(void)
 {
-	init_cpl_handlers();
-	if (t3_init_cpl_io() < 0) {
-		log(LOG_ERR,
-		    "Unable to initialize cpl io ops\n");
-		return -1;
-	}
-	t3_init_socket_ops();
+	t3_iterate(tom_uninit, NULL);
 
-	 /* Register with the TOE device layer. */
+	if (t3_unregister_uld(&tom_uld_info) == EBUSY)
+		return (EBUSY);
 
-	if (register_tom(&t3_tom_info) != 0) {
-		log(LOG_ERR,
-		    "Unable to register Chelsio T3 TCP offload module.\n");
-		return -1;
-	}
-
-	rw_init(&adapter_list_lock, "ofld adap list");
-	TAILQ_INIT(&adapter_list);
-	EVENTHANDLER_REGISTER(route_arp_update_event, cxgb_arp_update_event,
-	    NULL, EVENTHANDLER_PRI_ANY);
-	EVENTHANDLER_REGISTER(route_redirect_event, cxgb_redirect_event,
-	    NULL, EVENTHANDLER_PRI_ANY);
-	
-	mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF);
-	listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
-	    cxgb_toe_listen_start, NULL, EVENTHANDLER_PRI_ANY);
-	listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_stop,
-	    cxgb_toe_listen_stop, NULL, EVENTHANDLER_PRI_ANY);
-	TAILQ_INIT(&cxgb_list);
-	
-
-
-	t3_register_cpl_handler(CPL_PASS_OPEN_RPL, do_stid_rpl);
-	t3_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_stid_rpl);
-	t3_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_cr);
-	t3_register_cpl_handler(CPL_PASS_ESTABLISH, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_ABORT_RPL_RSS, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_ABORT_RPL, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_RX_URG_NOTIFY, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_RX_DATA, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_TX_DATA_ACK, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_TX_DMA_ACK, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
-	t3_register_cpl_handler(CPL_PEER_CLOSE, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_CLOSE_CON_RPL, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req_rss);
-	t3_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
-	t3_register_cpl_handler(CPL_RDMA_TERMINATE, do_term);
-	t3_register_cpl_handler(CPL_RDMA_EC_STATUS, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_RX_DATA_DDP, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_GET_TCB_RPL, do_hwtid_rpl);
-	t3_register_cpl_handler(CPL_SET_TCB_RPL, do_hwtid_rpl);
-
-	/* Register to offloading devices */
-	cxgb_register_client(&t3c_tom_client);
-	
 	return (0);
 }
+#endif	/* ifdef TCP_OFFLOAD */
 
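+/* Module event handler; returns EOPNOTSUPP if the kernel lacks TCP_OFFLOAD. */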
 static int
-t3_tom_load(module_t mod, int cmd, void *arg)
+t3_tom_modevent(module_t mod, int cmd, void *arg)
 {
-	int err = 0;
+	int rc = 0;
 
+#ifdef TCP_OFFLOAD
 	switch (cmd) {
 	case MOD_LOAD:
-		t3_tom_init();
+		rc = t3_tom_mod_load();
 		break;
-	case MOD_QUIESCE:
+
+	case MOD_UNLOAD:
+		rc = t3_tom_mod_unload();
 		break;
-	case MOD_UNLOAD:
-		printf("uhm, ... unloading isn't really supported for toe\n");
-		break;
-	case MOD_SHUTDOWN:
-		break;
+
 	default:
-		err = EOPNOTSUPP;
-		break;
+		rc = EINVAL;
 	}
-
-	return (err);
+#else
+	rc = EOPNOTSUPP;
+#endif
+	return (rc);
 }
 
-static moduledata_t mod_data= {
+static moduledata_t t3_tom_moddata = {
 	"t3_tom",
-	t3_tom_load,
+	t3_tom_modevent,
 	0
 };
+
 MODULE_VERSION(t3_tom, 1);
 MODULE_DEPEND(t3_tom, toecore, 1, 1, 1);
-MODULE_DEPEND(t3_tom, if_cxgb, 1, 1, 1);
-DECLARE_MODULE(t3_tom, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
-
+MODULE_DEPEND(t3_tom, cxgbc, 1, 1, 1);
+DECLARE_MODULE(t3_tom, t3_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_tom.h
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h	Mon Jun 11 00:15:24 2012 -0700
@@ -1,7 +1,6 @@
-
 /**************************************************************************
 
-Copyright (c) 2007, Chelsio Inc.
+Copyright (c) 2007, 2009 Chelsio Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,128 +32,248 @@
 #ifndef CXGB_TOM_H_
 #define CXGB_TOM_H_
 #include <sys/protosw.h>
-#include <netinet/toedev.h>
+#include <netinet/toecore.h>
 
-#define LISTEN_INFO_HASH_SIZE 32 
+MALLOC_DECLARE(M_CXGB);
 
-struct listen_info {
-	struct listen_info *next;  /* Link to next entry */
-	struct socket *so;         /* The listening socket */
-	unsigned int stid;         /* The server TID */
-};
+#define	KTR_CXGB	KTR_SPARE3
 
+#define LISTEN_HASH_SIZE 32
 
 /*
- * TOM tunable parameters.  They can be manipulated through sysctl(2) or /proc.
+ * Holds the size, base address, free list start, etc. of the TID, server TID,
+ * and active-open TID tables for an offload device.
+ * The tables themselves are allocated dynamically.
  */
-struct tom_tunables {
-        int max_host_sndbuf;    // max host RAM consumed by a sndbuf
-        int tx_hold_thres;      // push/pull threshold for non-full TX sk_buffs
-        int max_wrs;            // max # of outstanding WRs per connection
-        int rx_credit_thres;    // min # of RX credits needed for RX_DATA_ACK
-        int cong_alg;           // Congestion control algorithm
-        int mss;                // max TX_DATA WR payload size
-        int delack;             // delayed ACK control
-        int max_conn;           // maximum number of offloaded connections
-        int soft_backlog_limit; // whether the listen backlog limit is soft
-        int ddp;                // whether to put new connections in DDP mode
-        int ddp_thres;          // min recvmsg size before activating DDP
-        int ddp_copy_limit;     // capacity of kernel DDP buffer
-        int ddp_push_wait;      // whether blocking DDP waits for PSH flag
-        int ddp_rcvcoalesce;    // whether receive coalescing is enabled
-        int zcopy_sosend_enabled; // < is never zcopied
-        int zcopy_sosend_partial_thres; // < is never zcopied
-        int zcopy_sosend_partial_copy; // bytes copied in partial zcopy
-        int zcopy_sosend_thres;// >= are mostly zcopied
-        int zcopy_sosend_copy; // bytes coped in zcopied
-        int zcopy_sosend_ret_pending_dma;// pot. return while pending DMA
-        int activated;          // TOE engine activation state
+struct tid_info {
+	void **tid_tab;
+	unsigned int ntids;
+	volatile unsigned int tids_in_use;
+
+	union listen_entry *stid_tab;
+	unsigned int nstids;
+	unsigned int stid_base;
+
+	union active_open_entry *atid_tab;
+	unsigned int natids;
+	unsigned int atid_base;
+
+	/*
+	 * The following members are accessed R/W so we put them in their own
+	 * cache lines.  TOM_XXX: actually do what is said here.
+	 *
+	 * XXX We could combine the atid fields above with the lock here since
+	 * atids are used once (unlike other tids).  OTOH the above fields are
+	 * usually in cache due to tid_tab.
+	 */
+	struct mtx atid_lock;
+	union active_open_entry *afree;
+	unsigned int atids_in_use;
+
+	struct mtx stid_lock;
+	union listen_entry *sfree;
+	unsigned int stids_in_use;
 };
 
 struct tom_data {
-        TAILQ_ENTRY(tom_data) entry;
-			      
-        struct t3cdev *cdev;
-        struct pci_dev *pdev;
-        struct toedev tdev;
+        struct toedev tod;
 
-        struct cxgb_client *client;
-        struct tom_tunables conf;
-        struct tom_sysctl_table *sysctl;
+	/*
+	 * toepcb's associated with this TOE device are either on the
+	 * toep list or in the synq of a listening socket in lctx hash.
+	 */
+	struct mtx toep_list_lock;
+	TAILQ_HEAD(, toepcb) toep_list;
+
+	struct l2t_data *l2t;
+	struct tid_info tid_maps;
 
         /*
-         * The next three locks listen_lock, deferq.lock, and tid_release_lock
-         * are used rarely so we let them potentially share a cacheline.
+	 * The next two locks, listen_lock and tid_release_lock, are used rarely
+	 * so we let them potentially share a cacheline.
          */
 
-        struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE];
-        struct mtx listen_lock;
+	LIST_HEAD(, listen_ctx) *listen_hash;
+	u_long listen_mask;
+	int lctx_count;		/* # of lctx in the hash table */
+        struct mtx lctx_hash_lock;
 
-        struct mbuf_head deferq;
-        struct task deferq_task;
-
-        struct socket **tid_release_list;
+        void **tid_release_list;
         struct mtx tid_release_lock;
         struct task tid_release_task;
-
-        volatile int tx_dma_pending;
-	
-        unsigned int ddp_llimit;
-        unsigned int ddp_ulimit;
-
-        unsigned int rx_page_size;
-
-        u8 *ppod_map;
-        unsigned int nppods;
-        struct mtx ppod_map_lock;
-	
-        struct adap_ports *ports;
-	struct taskqueue *tq;
 };
 
+struct synq_entry {
+	TAILQ_ENTRY(synq_entry) link;	/* listen_ctx's synq link */
+	int flags;			/* same as toepcb's tp_flags */
+	int tid;
+	struct mbuf *m;			/* backpointer to containing mbuf */
+	struct listen_ctx *lctx;	/* backpointer to listen ctx */
+	struct cpl_pass_establish *cpl;
+	struct toepcb *toep;
+	struct l2t_entry *e;
+	uint32_t iss;
+	uint32_t ts;
+	uint32_t opt0h;
+	uint32_t qset;
+	int rx_credits;
+	volatile u_int refcnt;
+
+#define RPL_OK		0	/* ok to reply */
+#define RPL_DONE	1	/* replied already */
+#define RPL_DONT	2	/* don't reply */
+	volatile u_int reply;	/* see above. */
+};
+
+#define LCTX_RPL_PENDING	1	/* waiting for CPL_PASS_OPEN_RPL */
 
 struct listen_ctx {
-	struct socket *lso;
-	struct tom_data *tom_data;
-	int ulp_mode;
-	LIST_HEAD(, toepcb) synq_head;
-	
+	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
+	volatile int refcnt;
+	int stid;
+	int flags;
+	struct inpcb *inp;		/* listening socket's inp */
+	int qset;
+	TAILQ_HEAD(, synq_entry) synq;
 };
 
-#define TOM_DATA(dev) (*(struct tom_data **)&(dev)->tod_l4opt)
-#define T3C_DEV(sk) ((TOM_DATA(TOE_DEV(sk)))->cdev)
-#define TOEP_T3C_DEV(toep) (TOM_DATA(toep->tp_toedev)->cdev)
-#define TOM_TUNABLE(dev, param) (TOM_DATA(dev)->conf.param)
+void t3_process_tid_release_list(void *data, int pending);
 
-#define TP_DATASENT         	(1 << 0)
-#define TP_TX_WAIT_IDLE      	(1 << 1)
-#define TP_FIN_SENT          	(1 << 2)
-#define TP_ABORT_RPL_PENDING 	(1 << 3)
-#define TP_ABORT_SHUTDOWN    	(1 << 4)
-#define TP_ABORT_RPL_RCVD    	(1 << 5)
-#define TP_ABORT_REQ_RCVD    	(1 << 6)
-#define TP_CLOSE_CON_REQUESTED	(1 << 7)
-#define TP_SYN_RCVD		(1 << 8)
-#define TP_ESTABLISHED		(1 << 9)
-
-void t3_init_tunables(struct tom_data *t);
-
-void t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p);
-
-static __inline struct mbuf *
-m_gethdr_nofail(int len)
+static inline struct tom_data *
+t3_tomdata(struct toedev *tod)
 {
-	struct mbuf *m;
-	
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL) {
-		panic("implement lowmem cache\n");
-	}
-	
-	KASSERT(len < MHLEN, ("requested header size too large for mbuf"));	
-	m->m_pkthdr.len = m->m_len = len;
-	return (m);
+	return (member2struct(tom_data, tod, tod));
 }
 
+union listen_entry {
+	void *ctx;
+	union listen_entry *next;
+};
 
+union active_open_entry {
+	void *ctx;
+	union active_open_entry *next;
+};
+
+/*
+ * Map an ATID or STID to its entry in the corresponding TID table.
+ */
+static inline union active_open_entry *atid2entry(const struct tid_info *t,
+                                                  unsigned int atid)
+{
+        return (&t->atid_tab[atid - t->atid_base]);
+}
+
+
+static inline union listen_entry *stid2entry(const struct tid_info *t,
+                                             unsigned int stid)
+{
+        return (&t->stid_tab[stid - t->stid_base]);
+}
+
+/*
+ * Find the connection corresponding to a TID.
+ */
+static inline void *lookup_tid(const struct tid_info *t, unsigned int tid)
+{
+	void *p;
+
+	if (tid >= t->ntids)
+		return (NULL);
+
+	p = t->tid_tab[tid];
+	if (p < (void *)t->tid_tab || p >= (void *)&t->atid_tab[t->natids])
+		return (p);
+
+	return (NULL);
+}
+
+/*
+ * Find the connection corresponding to a server TID.
+ */
+static inline void *lookup_stid(const struct tid_info *t, unsigned int tid)
+{
+	void *p;
+
+	if (tid < t->stid_base || tid >= t->stid_base + t->nstids)
+		return (NULL);
+
+	p = stid2entry(t, tid)->ctx;
+	if (p < (void *)t->tid_tab || p >= (void *)&t->atid_tab[t->natids])
+		return (p);
+
+	return (NULL);
+}
+
+/*
+ * Find the connection corresponding to an active-open TID.
+ */
+static inline void *lookup_atid(const struct tid_info *t, unsigned int tid)
+{
+	void *p;
+
+	if (tid < t->atid_base || tid >= t->atid_base + t->natids)
+		return (NULL);
+
+	p = atid2entry(t, tid)->ctx;
+	if (p < (void *)t->tid_tab || p >= (void *)&t->atid_tab[t->natids])
+		return (p);
+
+	return (NULL);
+}
+
+static inline uint32_t
+calc_opt2(int cpu_idx)
+{
+	uint32_t opt2 = F_CPU_INDEX_VALID | V_CPU_INDEX(cpu_idx);
+
+	/* 3 = highspeed CC algorithm */
+	opt2 |= V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(3) |
+	    V_PACING_FLAVOR(1);
+
+	/* coalesce and push bit semantics */
+	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(3);
+
+	return (htobe32(opt2));
+}
+
+/* cxgb_tom.c */
+struct toepcb *toepcb_alloc(struct toedev *);
+void toepcb_free(struct toepcb *);
+
+/* cxgb_cpl_io.c */
+void t3_init_cpl_io(struct adapter *);
+int t3_push_frames(struct socket *, int);
+int t3_connect(struct toedev *, struct socket *, struct rtentry *,
+    struct sockaddr *);
+int t3_tod_output(struct toedev *, struct tcpcb *);
+int t3_send_rst(struct toedev *, struct tcpcb *);
+int t3_send_fin(struct toedev *, struct tcpcb *);
+void insert_tid(struct tom_data *, void *, unsigned int);
+void update_tid(struct tom_data *, void *, unsigned int);
+void remove_tid(struct tom_data *, unsigned int);
+uint32_t calc_opt0h(struct socket *, int, int, struct l2t_entry *);
+uint32_t calc_opt0l(struct socket *, int);
+void queue_tid_release(struct toedev *, unsigned int);
+void offload_socket(struct socket *, struct toepcb *);
+void undo_offload_socket(struct socket *);
+int select_rcv_wscale(void);
+unsigned long select_rcv_wnd(struct socket *);
+int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
+void make_established(struct socket *, uint32_t, uint32_t, uint16_t);
+void t3_rcvd(struct toedev *, struct tcpcb *);
+void t3_pcb_detach(struct toedev *, struct tcpcb *);
+void send_abort_rpl(struct toedev *, int, int);
+void release_tid(struct toedev *, unsigned int, int);
+
+/* cxgb_listen.c */
+void t3_init_listen_cpl_handlers(struct adapter *);
+int t3_listen_start(struct toedev *, struct tcpcb *);
+int t3_listen_stop(struct toedev *, struct tcpcb *);
+void t3_syncache_added(struct toedev *, void *);
+void t3_syncache_removed(struct toedev *, void *);
+int t3_syncache_respond(struct toedev *, void *, struct mbuf *);
+int do_abort_req_synqe(struct sge_qset *, struct rsp_desc *, struct mbuf *);
+int do_abort_rpl_synqe(struct sge_qset *, struct rsp_desc *, struct mbuf *);
+void t3_offload_socket(struct toedev *, void *, struct socket *);
 #endif
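The three lookup helpers above share one contract: NULL for a TID that is out
of range or still on a free list, otherwise the opaque context pointer that
insert_tid()/update_tid() stored.  A hedged sketch of a receive-path consumer;
the handler name and body are placeholders, and only lookup_tid() and the
tom_data/tid_maps members come from this header.

static int
example_rx_handler(struct tom_data *td, unsigned int tid)
{
	struct toepcb *toep;

	/* NULL for out-of-range TIDs and for slots still on the free list */
	toep = lookup_tid(&td->tid_maps, tid);
	if (toep == NULL)
		return (EINVAL);	/* stale or bogus TID from the chip */

	/* ... deliver the CPL payload to toep's connection ... */
	return (0);
}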
diff -r 7cec8c20120e sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,140 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/fcntl.h>
-#include <sys/limits.h>
-#include <sys/lock.h>
-#include <sys/mbuf.h>
-#include <sys/module.h>
-#include <sys/mutex.h>
-
-#include <sys/sockopt.h>
-#include <sys/sockstate.h>
-#include <sys/sockbuf.h>
-#include <sys/socket.h>
-#include <sys/sysctl.h>
-
-#include <sys/syslog.h>
-
-#include <net/if.h>
-#include <net/route.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
-
-#include <cxgb_osdep.h>
-#include <sys/mbufq.h>
-
-#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcp_fsm.h>
-#include <net/route.h>
-
-#include <t3cdev.h>
-#include <common/cxgb_firmware_exports.h>
-#include <common/cxgb_tcb.h>
-#include <common/cxgb_ctl_defs.h>
-#include <common/cxgb_t3_cpl.h>
-#include <cxgb_offload.h>
-#include <cxgb_include.h>
-#include <ulp/toecore/cxgb_toedev.h>
-#include <ulp/tom/cxgb_tom.h>
-#include <ulp/tom/cxgb_defs.h>
-#include <ulp/tom/cxgb_t3_ddp.h>
-
-/* Avoid clutter in the hw.* space, keep all toe tunables within hw.cxgb */
-SYSCTL_DECL(_hw_cxgb);
-static SYSCTL_NODE(_hw_cxgb, OID_AUTO, toe, CTLFLAG_RD, 0, "TOE parameters");
-
-static struct tom_tunables default_tunable_vals = {
-	.max_host_sndbuf = 32 * 1024,
-	.tx_hold_thres = 0,
-	.max_wrs = 15,
-	.rx_credit_thres = 15 * 1024,
-	.cong_alg = -1,
-	.mss = 16384,
-	.delack = 1,
-	.max_conn = -1,
-	.soft_backlog_limit = 0,
-	.ddp = 1,
-	.ddp_thres = 14 * 4096,
-	.ddp_copy_limit = 13 * 4096,
-	.ddp_push_wait = 1,
-	.ddp_rcvcoalesce = 0,
-	.zcopy_sosend_enabled = 0,	
-	.zcopy_sosend_partial_thres = 40960,
-	.zcopy_sosend_partial_copy = 4096 * 3,
-	.zcopy_sosend_thres = 128 * 1024,
-	.zcopy_sosend_copy = 4096 * 2,
-	.zcopy_sosend_ret_pending_dma = 1,
-	.activated = 1,
-};
-
-static int activated = 1;
-TUNABLE_INT("hw.cxgb.toe.activated", &activated);
-SYSCTL_UINT(_hw_cxgb_toe, OID_AUTO, activated, CTLFLAG_RDTUN, &activated, 0,
-    "enable TOE at init time");
-
-static int ddp = 1;
-TUNABLE_INT("hw.cxgb.toe.ddp", &ddp);
-SYSCTL_UINT(_hw_cxgb_toe, OID_AUTO, ddp, CTLFLAG_RDTUN, &ddp, 0, "enable DDP");
-
-void
-t3_init_tunables(struct tom_data *t)
-{
-	t->conf = default_tunable_vals;
-
-	/* Adjust tunables */
-	t->conf.activated = activated;
-	t->conf.ddp = ddp;
-
-	/* Now apply device specific fixups. */
-	t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk;
-	t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs;
-}
-
-void
-t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p)
-{
-	struct sysctl_ctx_list *ctx;
-	struct sysctl_oid_list *children;
-
-	ctx = device_get_sysctl_ctx(sc->dev);
-	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
-	
-}
-
diff -r 7cec8c20120e sys/dev/cxgbe/adapter.h
--- a/sys/dev/cxgbe/adapter.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgbe/adapter.h	Mon Jun 11 00:15:24 2012 -0700
@@ -157,6 +157,7 @@
 	INTR_DIRECT	= (1 << 2),	/* direct interrupts for everything */
 	MASTER_PF	= (1 << 3),
 	ADAP_SYSCTL_CTX	= (1 << 4),
+	TOM_INIT_DONE	= (1 << 5),
 
 	CXGBE_BUSY	= (1 << 9),
 
@@ -199,7 +200,7 @@
 	int first_txq;	/* index of first tx queue */
 	int nrxq;	/* # of rx queues */
 	int first_rxq;	/* index of first rx queue */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	int nofldtxq;		/* # of offload tx queues */
 	int first_ofld_txq;	/* index of first offload tx queue */
 	int nofldrxq;		/* # of offload rx queues */
@@ -213,6 +214,8 @@
 	struct link_config link_cfg;
 	struct port_stats stats;
 
+	eventhandler_tag vlan_c;
+
 	struct callout tick;
 	struct sysctl_ctx_list ctx;	/* from ifconfig up to driver detach */
 
@@ -296,7 +299,7 @@
 enum {
 	EQ_CTRL		= 1,
 	EQ_ETH		= 2,
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	EQ_OFLD		= 3,
 #endif
 
@@ -422,14 +425,36 @@
 
 } __aligned(CACHE_LINE_SIZE);
 
-#ifndef TCP_OFFLOAD_DISABLE
+static inline struct sge_rxq *
+iq_to_rxq(struct sge_iq *iq)
+{
+
+	return (member2struct(sge_rxq, iq, iq));
+}
+
+
+#ifdef TCP_OFFLOAD
 /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */
 struct sge_ofld_rxq {
 	struct sge_iq iq;	/* MUST be first */
 	struct sge_fl fl;	/* MUST follow iq */
 } __aligned(CACHE_LINE_SIZE);
+
+static inline struct sge_ofld_rxq *
+iq_to_ofld_rxq(struct sge_iq *iq)
+{
+
+	return (member2struct(sge_ofld_rxq, iq, iq));
+}
 #endif
 
+struct wrqe {
+	STAILQ_ENTRY(wrqe) link;
+	struct sge_wrq *wrq;
+	int wr_len;
+	uint64_t wr[] __aligned(16);
+};
+
 /*
  * wrq: SGE egress queue that is given prebuilt work requests.  Both the control
  * and offload tx queues are of this type.
@@ -438,8 +463,9 @@
 	struct sge_eq eq;	/* MUST be first */
 
 	struct adapter *adapter;
-	struct mbuf *head;	/* held up due to lack of descriptors */
-	struct mbuf *tail;	/* valid only if head is valid */
+
+	/* List of WRs held up due to lack of tx descriptors */
+	STAILQ_HEAD(, wrqe) wr_list;
 
 	/* stats for common events first */
 
@@ -457,7 +483,7 @@
 
 	int nrxq;	/* total # of Ethernet rx queues */
 	int ntxq;	/* total # of Ethernet tx tx queues */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	int nofldrxq;	/* total # of TOE rx queues */
 	int nofldtxq;	/* total # of TOE tx queues */
 #endif
@@ -469,7 +495,7 @@
 	struct sge_wrq *ctrlq;	/* Control queues */
 	struct sge_txq *txq;	/* NIC tx queues */
 	struct sge_rxq *rxq;	/* NIC rx queues */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	struct sge_wrq *ofld_txq;	/* TOE tx queues */
 	struct sge_ofld_rxq *ofld_rxq;	/* TOE rx queues */
 #endif
@@ -483,6 +509,7 @@
 struct rss_header;
 typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
+typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *);
 
 struct adapter {
 	SLIST_ENTRY(adapter) link;
@@ -519,15 +546,15 @@
 	uint8_t chan_map[NCHAN];
 	uint32_t filter_mode;
 
-#ifndef TCP_OFFLOAD_DISABLE
-	struct uld_softc tom;
+#ifdef TCP_OFFLOAD
+	void *tom_softc;	/* (struct tom_data *) */
 	struct tom_tunables tt;
 #endif
 	struct l2t_data *l2t;	/* L2 table */
 	struct tid_info tids;
 
 	int open_device_map;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	int offload_map;
 #endif
 	int flags;
@@ -554,7 +581,8 @@
 	TAILQ_HEAD(, sge_fl) sfl;
 	struct callout sfl_callout;
 
-	cpl_handler_t cpl_handler[256] __aligned(CACHE_LINE_SIZE);
+	an_handler_t an_handler __aligned(CACHE_LINE_SIZE);
+	cpl_handler_t cpl_handler[256];
 };
 
 #define ADAPTER_LOCK(sc)		mtx_lock(&(sc)->sc_lock)
@@ -609,82 +637,96 @@
 static inline uint32_t
 t4_read_reg(struct adapter *sc, uint32_t reg)
 {
+
 	return bus_space_read_4(sc->bt, sc->bh, reg);
 }
 
 static inline void
 t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val)
 {
+
 	bus_space_write_4(sc->bt, sc->bh, reg, val);
 }
 
 static inline uint64_t
 t4_read_reg64(struct adapter *sc, uint32_t reg)
 {
+
 	return t4_bus_space_read_8(sc->bt, sc->bh, reg);
 }
 
 static inline void
 t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val)
 {
+
 	t4_bus_space_write_8(sc->bt, sc->bh, reg, val);
 }
 
 static inline void
 t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val)
 {
+
 	*val = pci_read_config(sc->dev, reg, 1);
 }
 
 static inline void
 t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val)
 {
+
 	pci_write_config(sc->dev, reg, val, 1);
 }
 
 static inline void
 t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val)
 {
+
 	*val = pci_read_config(sc->dev, reg, 2);
 }
 
 static inline void
 t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val)
 {
+
 	pci_write_config(sc->dev, reg, val, 2);
 }
 
 static inline void
 t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val)
 {
+
 	*val = pci_read_config(sc->dev, reg, 4);
 }
 
 static inline void
 t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val)
 {
+
 	pci_write_config(sc->dev, reg, val, 4);
 }
 
 static inline struct port_info *
 adap2pinfo(struct adapter *sc, int idx)
 {
+
 	return (sc->port[idx]);
 }
 
 static inline void
 t4_os_set_hw_addr(struct adapter *sc, int idx, uint8_t hw_addr[])
 {
+
 	bcopy(hw_addr, sc->port[idx]->hw_addr, ETHER_ADDR_LEN);
 }
 
 static inline bool is_10G_port(const struct port_info *pi)
 {
+
 	return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) != 0);
 }
 
 static inline int tx_resume_threshold(struct sge_eq *eq)
 {
+
 	return (eq->qsize / 4);
 }
 
@@ -698,6 +740,7 @@
 void t4_os_link_changed(struct adapter *, int, int);
 void t4_iterate(void (*)(struct adapter *, void *), void *);
 int t4_register_cpl_handler(struct adapter *, int, cpl_handler_t);
+int t4_register_an_handler(struct adapter *, an_handler_t);
 
 /* t4_sge.c */
 void t4_sge_modload(void);
@@ -714,21 +757,45 @@
 void t4_intr(void *);
 void t4_intr_err(void *);
 void t4_intr_evt(void *);
-int t4_mgmt_tx(struct adapter *, struct mbuf *);
-int t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct mbuf *);
+void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *);
 int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *);
 void t4_update_fl_bufsize(struct ifnet *);
 int can_resume_tx(struct sge_eq *);
 
-static inline int t4_wrq_tx(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m)
+static inline struct wrqe *
+alloc_wrqe(int wr_len, struct sge_wrq *wrq)
 {
-	int rc;
+	int len = offsetof(struct wrqe, wr) + wr_len;
+	struct wrqe *wr;
+
+	wr = malloc(len, M_CXGBE, M_NOWAIT);
+	if (__predict_false(wr == NULL))
+		return (NULL);
+	wr->wr_len = wr_len;
+	wr->wrq = wrq;
+	return (wr);
+}
+
+static inline void *
+wrtod(struct wrqe *wr)
+{
+	return (&wr->wr[0]);
+}
+
+static inline void
+free_wrqe(struct wrqe *wr)
+{
+	free(wr, M_CXGBE);
+}
+
+static inline void
+t4_wrq_tx(struct adapter *sc, struct wrqe *wr)
+{
+	struct sge_wrq *wrq = wr->wrq;
 
 	TXQ_LOCK(wrq);
-	rc = t4_wrq_tx_locked(sc, wrq, m);
+	t4_wrq_tx_locked(sc, wrq, wr);
 	TXQ_UNLOCK(wrq);
-	return (rc);
 }
 
-
 #endif
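alloc_wrqe()/wrtod()/t4_wrq_tx() above replace the old mbuf-based work-request
path with an explicit wrqe object.  A sketch of the intended lifecycle, on the
assumption that the queue owns the wrqe once t4_wrq_tx() has been called;
cpl_l2t_write_req (from t4_msg.h) stands in for any fixed-size payload, and
error handling is abbreviated.

static int
example_send_wr(struct adapter *sc, struct sge_wrq *wrq)
{
	struct wrqe *wr;
	struct cpl_l2t_write_req *req;	/* any prebuilt work request */

	wr = alloc_wrqe(sizeof(*req), wrq);	/* M_NOWAIT under the hood */
	if (wr == NULL)
		return (ENOMEM);
	req = wrtod(wr);	/* pointer to the 16-byte aligned WR body */
	/* ... fill in *req ... */
	t4_wrq_tx(sc, wr);	/* locks the queue and hands the WR over */
	return (0);
}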
diff -r 7cec8c20120e sys/dev/cxgbe/common/t4_hw.c
--- a/sys/dev/cxgbe/common/t4_hw.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgbe/common/t4_hw.c	Mon Jun 11 00:15:24 2012 -0700
@@ -27,6 +27,8 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
 #include "common.h"
 #include "t4_regs.h"
 #include "t4_regs_values.h"
diff -r 7cec8c20120e sys/dev/cxgbe/offload.h
--- a/sys/dev/cxgbe/offload.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgbe/offload.h	Mon Jun 11 00:15:24 2012 -0700
@@ -31,12 +31,6 @@
 #ifndef __T4_OFFLOAD_H__
 #define __T4_OFFLOAD_H__
 
-/* XXX: flagrant misuse of mbuf fields (during tx by TOM) */
-#define MBUF_EQ(m)		(*((void **)(&(m)->m_pkthdr.rcvif)))
-/* These have to work for !M_PKTHDR so we use a field from m_hdr. */
-#define MBUF_TX_CREDITS(m)	((m)->m_hdr.pad[0])
-#define MBUF_DMA_MAPPED(m)	((m)->m_hdr.pad[1])
-
 #define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \
 	(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
 	(w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
@@ -119,7 +113,7 @@
 	struct t4_range ocq;
 };
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 enum {
 	ULD_TOM = 1,
 };
@@ -130,13 +124,8 @@
 	SLIST_ENTRY(uld_info) link;
 	int refcount;
 	int uld_id;
-	int (*attach)(struct adapter *, void **);
-	int (*detach)(void *);
-};
-
-struct uld_softc {
-	struct uld_info *uld;
-	void *softc;
+	int (*activate)(struct adapter *);
+	int (*deactivate)(struct adapter *);
 };
 
 struct tom_tunables {
@@ -148,6 +137,8 @@
 
 int t4_register_uld(struct uld_info *);
 int t4_unregister_uld(struct uld_info *);
+int t4_activate_uld(struct adapter *, int);
+int t4_deactivate_uld(struct adapter *, int);
 #endif
 
 #endif
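The uld_info interface above drops the attach/detach plus per-ULD softc model
in favor of activate/deactivate keyed on the adapter itself.  A sketch of how
an upper-layer driver might plug in; all example_* names are placeholders (the
real consumer is the TOM module).

static int
example_activate(struct adapter *sc)
{
	/* allocate per-adapter state, register CPL handlers, etc. */
	return (0);
}

static int
example_deactivate(struct adapter *sc)
{
	/* quiesce and free whatever example_activate() set up */
	return (0);
}

static struct uld_info example_uld_info = {
	.uld_id = ULD_TOM,
	.activate = example_activate,
	.deactivate = example_deactivate,
};

/*
 * At MOD_LOAD the ULD calls t4_register_uld(&example_uld_info); later,
 * t4_activate_uld(sc, ULD_TOM) runs example_activate(sc) and bumps the
 * refcount, and t4_deactivate_uld(sc, ULD_TOM) undoes both.
 */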
diff -r 7cec8c20120e sys/dev/cxgbe/t4_l2t.c
--- a/sys/dev/cxgbe/t4_l2t.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgbe/t4_l2t.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2011 Chelsio Communications, Inc.
+ * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -38,16 +38,7 @@
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/sbuf.h>
-#include <net/if.h>
-#include <net/if_types.h>
-#include <net/ethernet.h>
-#include <net/if_vlan_var.h>
-#include <net/if_dl.h>
-#include <net/if_llatbl.h>
-#include <net/route.h>
 #include <netinet/in.h>
-#include <netinet/in_var.h>
-#include <netinet/if_ether.h>
 
 #include "common/common.h"
 #include "common/jhash.h"
@@ -72,42 +63,11 @@
  * lifetime of an L2T entry is fully contained in the lifetime of the TOE.
  */
 
-/* identifies sync vs async L2T_WRITE_REQs */
-#define S_SYNC_WR    12
-#define V_SYNC_WR(x) ((x) << S_SYNC_WR)
-#define F_SYNC_WR    V_SYNC_WR(1)
-
-enum {
-	L2T_STATE_VALID,	/* entry is up to date */
-	L2T_STATE_STALE,	/* entry may be used but needs revalidation */
-	L2T_STATE_RESOLVING,	/* entry needs address resolution */
-	L2T_STATE_SYNC_WRITE,	/* synchronous write of entry underway */
-
-	/* when state is one of the below the entry is not hashed */
-	L2T_STATE_SWITCHING,	/* entry is being used by a switching filter */
-	L2T_STATE_UNUSED	/* entry not in use */
-};
-
-struct l2t_data {
-	struct rwlock lock;
-	volatile int nfree;	/* number of free entries */
-	struct l2t_entry *rover;/* starting point for next allocation */
-	struct l2t_entry l2tab[L2T_SIZE];
-};
-
-static int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *,
-    struct mbuf *);
-
-#define VLAN_NONE	0xfff
-#define SA(x)           ((struct sockaddr *)(x))
-#define SIN(x)          ((struct sockaddr_in *)(x))
-#define SINADDR(x)      (SIN(x)->sin_addr.s_addr)
-
 /*
  * Allocate a free L2T entry.  Must be called with l2t_data.lock held.
  */
-static struct l2t_entry *
-alloc_l2e(struct l2t_data *d)
+struct l2t_entry *
+t4_alloc_l2e(struct l2t_data *d)
 {
 	struct l2t_entry *end, *e, **p;
 
@@ -121,7 +81,8 @@
 		if (atomic_load_acq_int(&e->refcnt) == 0)
 			goto found;
 
-	for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) ;
+	for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e)
+		continue;
 found:
 	d->rover = e + 1;
 	atomic_subtract_int(&d->nfree, 1);
@@ -148,19 +109,18 @@
  * Write an L2T entry.  Must be called with the entry locked.
  * The write may be synchronous or asynchronous.
  */
-static int
-write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
+int
+t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
 {
-	struct mbuf *m;
+	struct wrqe *wr;
 	struct cpl_l2t_write_req *req;
 
 	mtx_assert(&e->lock, MA_OWNED);
 
-	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
+	wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq);
+	if (wr == NULL)
 		return (ENOMEM);
-
-	req = mtod(m, struct cpl_l2t_write_req *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
+	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx |
@@ -170,7 +130,7 @@
 	req->vlan = htons(e->vlan);
 	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
 
-	t4_mgmt_tx(sc, m);
+	t4_wrq_tx(sc, wr);
 
 	if (sync && e->state != L2T_STATE_SWITCHING)
 		e->state = L2T_STATE_SYNC_WRITE;
@@ -189,7 +149,7 @@
 	struct l2t_entry *e;
 
 	rw_rlock(&d->lock);
-	e = alloc_l2e(d);
+	e = t4_alloc_l2e(d);
 	if (e) {
 		mtx_lock(&e->lock);          /* avoid race with t4_l2t_free */
 		e->state = L2T_STATE_SWITCHING;
@@ -214,7 +174,7 @@
 	e->lport = port;
 	memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN);
 	mtx_lock(&e->lock);
-	rc = write_l2e(sc, e, 0);
+	rc = t4_write_l2e(sc, e, 0);
 	mtx_unlock(&e->lock);
 	return (rc);
 }
@@ -234,10 +194,13 @@
 	rw_init(&d->lock, "L2T");
 
 	for (i = 0; i < L2T_SIZE; i++) {
-		d->l2tab[i].idx = i;
-		d->l2tab[i].state = L2T_STATE_UNUSED;
-		mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF);
-		atomic_store_rel_int(&d->l2tab[i].refcnt, 0);
+		struct l2t_entry *e = &d->l2tab[i];
+
+		e->idx = i;
+		e->state = L2T_STATE_UNUSED;
+		mtx_init(&e->lock, "L2T_E", NULL, MTX_DEF);
+		STAILQ_INIT(&e->wr_list);
+		atomic_store_rel_int(&e->refcnt, 0);
 	}
 
 	sc->l2t = d;
@@ -259,6 +222,24 @@
 	return (0);
 }
 
+int
+do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(rpl);
+	unsigned int idx = tid & (L2T_SIZE - 1);
+
+	if (__predict_false(rpl->status != CPL_ERR_NONE)) {
+		log(LOG_ERR,
+		    "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
+		    rpl->status, idx);
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
 #ifdef SBUF_DRAIN
 static inline unsigned int
 vlan_prio(const struct l2t_entry *e)
@@ -273,7 +254,7 @@
 	case L2T_STATE_VALID: return 'V';  /* valid, fast-path entry */
 	case L2T_STATE_STALE: return 'S';  /* needs revalidation, but usable */
 	case L2T_STATE_SYNC_WRITE: return 'W';
-	case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R';
+	case L2T_STATE_RESOLVING: return STAILQ_EMPTY(&e->wr_list) ? 'R' : 'A';
 	case L2T_STATE_SWITCHING: return 'X';
 	default: return 'U';
 	}
@@ -311,20 +292,20 @@
 			    "Ethernet address  VLAN/P LP State Users Port");
 			header = 1;
 		}
-		if (e->state == L2T_STATE_SWITCHING || e->v6)
+		if (e->state == L2T_STATE_SWITCHING)
 			ip[0] = 0;
 		else
 			snprintf(ip, sizeof(ip), "%s",
-			    inet_ntoa(*(struct in_addr *)&e->addr[0]));
+			    inet_ntoa(*(struct in_addr *)&e->addr));
 
-		/* XXX: accessing lle probably not safe? */
+		/* XXX: e->ifp may not be around */
 		sbuf_printf(sb, "\n%4u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d"
 			   " %u %2u   %c   %5u %s",
 			   e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2],
 			   e->dmac[3], e->dmac[4], e->dmac[5],
 			   e->vlan & 0xfff, vlan_prio(e), e->lport,
 			   l2e_state(e), atomic_load_acq_int(&e->refcnt),
-			   e->lle ? e->lle->lle_tbl->llt_ifp->if_xname : "");
+			   e->ifp->if_xname);
 skip:
 		mtx_unlock(&e->lock);
 	}
@@ -335,459 +316,3 @@
 	return (rc);
 }
 #endif
-
-#ifndef TCP_OFFLOAD_DISABLE
-static inline void
-l2t_hold(struct l2t_data *d, struct l2t_entry *e)
-{
-	if (atomic_fetchadd_int(&e->refcnt, 1) == 0)  /* 0 -> 1 transition */
-		atomic_subtract_int(&d->nfree, 1);
-}
-
-/*
- * To avoid having to check address families we do not allow v4 and v6
- * neighbors to be on the same hash chain.  We keep v4 entries in the first
- * half of available hash buckets and v6 in the second.
- */
-enum {
-	L2T_SZ_HALF = L2T_SIZE / 2,
-	L2T_HASH_MASK = L2T_SZ_HALF - 1
-};
-
-static inline unsigned int
-arp_hash(const uint32_t *key, int ifindex)
-{
-	return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK;
-}
-
-static inline unsigned int
-ipv6_hash(const uint32_t *key, int ifindex)
-{
-	uint32_t xor = key[0] ^ key[1] ^ key[2] ^ key[3];
-
-	return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK);
-}
-
-static inline unsigned int
-addr_hash(const uint32_t *addr, int addr_len, int ifindex)
-{
-	return addr_len == 4 ? arp_hash(addr, ifindex) :
-			       ipv6_hash(addr, ifindex);
-}
-
-/*
- * Checks if an L2T entry is for the given IP/IPv6 address.  It does not check
- * whether the L2T entry and the address are of the same address family.
- * Callers ensure an address is only checked against L2T entries of the same
- * family, something made trivial by the separation of IP and IPv6 hash chains
- * mentioned above.  Returns 0 if there's a match,
- */
-static inline int
-addreq(const struct l2t_entry *e, const uint32_t *addr)
-{
-	if (e->v6)
-		return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) |
-		       (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]);
-	return e->addr[0] ^ addr[0];
-}
-
-/*
- * Add a packet to an L2T entry's queue of packets awaiting resolution.
- * Must be called with the entry's lock held.
- */
-static inline void
-arpq_enqueue(struct l2t_entry *e, struct mbuf *m)
-{
-	mtx_assert(&e->lock, MA_OWNED);
-
-	KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt not NULL", __func__));
-	if (e->arpq_head)
-		e->arpq_tail->m_nextpkt = m;
-	else
-		e->arpq_head = m;
-	e->arpq_tail = m;
-}
-
-static inline void
-send_pending(struct adapter *sc, struct l2t_entry *e)
-{
-	struct mbuf *m, *next;
-
-	mtx_assert(&e->lock, MA_OWNED);
-
-	for (m = e->arpq_head; m; m = next) {
-		next = m->m_nextpkt;
-		m->m_nextpkt = NULL;
-		t4_wrq_tx(sc, MBUF_EQ(m), m);
-	}
-	e->arpq_head = e->arpq_tail = NULL;
-}
-
-#ifdef INET
-/*
- * Looks up and fills up an l2t_entry's lle.  We grab all the locks that we need
- * ourself, and update e->state at the end if e->lle was successfully filled.
- *
- * The lle passed in comes from arpresolve and is ignored as it does not appear
- * to be of much use.
- */
-static int
-l2t_fill_lle(struct adapter *sc, struct l2t_entry *e, struct llentry *unused)
-{
-        int rc = 0;
-        struct sockaddr_in sin;
-        struct ifnet *ifp = e->ifp;
-        struct llentry *lle;
-
-        bzero(&sin, sizeof(struct sockaddr_in));
-	if (e->v6)
-		panic("%s: IPv6 L2 resolution not supported yet.", __func__);
-
-	sin.sin_family = AF_INET;
-	sin.sin_len = sizeof(struct sockaddr_in);
-	memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in));
-
-        mtx_assert(&e->lock, MA_NOTOWNED);
-        KASSERT(e->addr && ifp, ("%s: bad prep before call", __func__));
-
-        IF_AFDATA_LOCK(ifp);
-        lle = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, SA(&sin));
-        IF_AFDATA_UNLOCK(ifp);
-        if (!LLE_IS_VALID(lle))
-                return (ENOMEM);
-        if (!(lle->la_flags & LLE_VALID)) {
-                rc = EINVAL;
-                goto done;
-        }
-
-        LLE_ADDREF(lle);
-
-        mtx_lock(&e->lock);
-        if (e->state == L2T_STATE_RESOLVING) {
-                KASSERT(e->lle == NULL, ("%s: lle already valid", __func__));
-                e->lle = lle;
-                memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN);
-		write_l2e(sc, e, 1);
-        } else {
-                KASSERT(e->lle == lle, ("%s: lle changed", __func__));
-                LLE_REMREF(lle);
-        }
-        mtx_unlock(&e->lock);
-done:
-        LLE_WUNLOCK(lle);
-        return (rc);
-}
-#endif
-
-int
-t4_l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e)
-{
-#ifndef INET
-	return (EINVAL);
-#else
-	struct llentry *lle = NULL;
-	struct sockaddr_in sin;
-	struct ifnet *ifp = e->ifp;
-
-	if (e->v6)
-		panic("%s: IPv6 L2 resolution not supported yet.", __func__);
-
-        bzero(&sin, sizeof(struct sockaddr_in));
-	sin.sin_family = AF_INET;
-	sin.sin_len = sizeof(struct sockaddr_in);
-	memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in));
-
-again:
-	switch (e->state) {
-	case L2T_STATE_STALE:     /* entry is stale, kick off revalidation */
-		if (arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0)
-			l2t_fill_lle(sc, e, lle);
-
-		/* Fall through */
-
-	case L2T_STATE_VALID:     /* fast-path, send the packet on */
-		return t4_wrq_tx(sc, MBUF_EQ(m), m);
-
-	case L2T_STATE_RESOLVING:
-	case L2T_STATE_SYNC_WRITE:
-		mtx_lock(&e->lock);
-		if (e->state != L2T_STATE_SYNC_WRITE &&
-		    e->state != L2T_STATE_RESOLVING) {
-			/* state changed by the time we got here */
-			mtx_unlock(&e->lock);
-			goto again;
-		}
-		arpq_enqueue(e, m);
-		mtx_unlock(&e->lock);
-
-		if (e->state == L2T_STATE_RESOLVING &&
-		    arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0)
-			l2t_fill_lle(sc, e, lle);
-	}
-
-	return (0);
-#endif
-}
-
-/*
- * Called when an L2T entry has no more users.  The entry is left in the hash
- * table since it is likely to be reused but we also bump nfree to indicate
- * that the entry can be reallocated for a different neighbor.  We also drop
- * the existing neighbor reference in case the neighbor is going away and is
- * waiting on our reference.
- *
- * Because entries can be reallocated to other neighbors once their ref count
- * drops to 0 we need to take the entry's lock to avoid races with a new
- * incarnation.
- */
-static void
-t4_l2e_free(struct l2t_entry *e)
-{
-	struct llentry *lle = NULL;
-	struct l2t_data *d;
-
-	mtx_lock(&e->lock);
-	if (atomic_load_acq_int(&e->refcnt) == 0) {  /* hasn't been recycled */
-		lle = e->lle;
-		e->lle = NULL;
-		/*
-		 * Don't need to worry about the arpq, an L2T entry can't be
-		 * released if any packets are waiting for resolution as we
-		 * need to be able to communicate with the device to close a
-		 * connection.
-		 */
-	}
-	mtx_unlock(&e->lock);
-
-	d = container_of(e, struct l2t_data, l2tab[e->idx]);
-	atomic_add_int(&d->nfree, 1);
-
-	if (lle)
-		LLE_FREE(lle);
-}
-
-void
-t4_l2t_release(struct l2t_entry *e)
-{
-	if (atomic_fetchadd_int(&e->refcnt, -1) == 1)
-		t4_l2e_free(e);
-}
-
-static int
-do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss,
-    struct mbuf *m)
-{
-	struct adapter *sc = iq->adapter;
-	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
-	unsigned int tid = GET_TID(rpl);
-	unsigned int idx = tid & (L2T_SIZE - 1);
-
-	if (__predict_false(rpl->status != CPL_ERR_NONE)) {
-		log(LOG_ERR,
-		    "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
-		    rpl->status, idx);
-		return (EINVAL);
-	}
-
-	if (tid & F_SYNC_WR) {
-		struct l2t_entry *e = &sc->l2t->l2tab[idx];
-
-		mtx_lock(&e->lock);
-		if (e->state != L2T_STATE_SWITCHING) {
-			send_pending(sc, e);
-			e->state = L2T_STATE_VALID;
-		}
-		mtx_unlock(&e->lock);
-	}
-
-	return (0);
-}
-
-/*
- * Reuse an L2T entry that was previously used for the same next hop.
- */
-static void
-reuse_entry(struct l2t_entry *e)
-{
-	struct llentry *lle;
-
-	mtx_lock(&e->lock);                /* avoid race with t4_l2t_free */
-	lle = e->lle;
-	if (lle) {
-		KASSERT(lle->la_flags & LLE_VALID,
-			("%s: invalid lle stored in l2t_entry", __func__));
-
-		if (lle->la_expire >= time_uptime)
-			e->state = L2T_STATE_STALE;
-		else
-			e->state = L2T_STATE_VALID;
-	} else
-		e->state = L2T_STATE_RESOLVING;
-	mtx_unlock(&e->lock);
-}
-
-/*
- * The TOE wants an L2 table entry that it can use to reach the next hop over
- * the specified port.  Produce such an entry - create one if needed.
- *
- * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on
- * top of the real cxgbe interface.
- */
-struct l2t_entry *
-t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa)
-{
-	struct l2t_entry *e;
-	struct l2t_data *d = pi->adapter->l2t;
-	int addr_len;
-	uint32_t *addr;
-	int hash;
-	struct sockaddr_in6 *sin6;
-	unsigned int smt_idx = pi->port_id;
-
-	if (sa->sa_family == AF_INET) {
-		addr = (uint32_t *)&SINADDR(sa);
-		addr_len = sizeof(SINADDR(sa));
-	} else if (sa->sa_family == AF_INET6) {
-		sin6 = (struct sockaddr_in6 *)sa;
-		addr = (uint32_t *)&sin6->sin6_addr.s6_addr;
-		addr_len = sizeof(sin6->sin6_addr.s6_addr);
-	} else
-		return (NULL);
-
-#ifndef VLAN_TAG
-	if (ifp->if_type == IFT_L2VLAN)
-		return (NULL);
-#endif
-
-	hash = addr_hash(addr, addr_len, ifp->if_index);
-
-	rw_wlock(&d->lock);
-	for (e = d->l2tab[hash].first; e; e = e->next) {
-		if (!addreq(e, addr) && e->ifp == ifp && e->smt_idx == smt_idx){
-			l2t_hold(d, e);
-			if (atomic_load_acq_int(&e->refcnt) == 1)
-				reuse_entry(e);
-			goto done;
-		}
-	}
-
-	/* Need to allocate a new entry */
-	e = alloc_l2e(d);
-	if (e) {
-		mtx_lock(&e->lock);          /* avoid race with t4_l2t_free */
-		e->state = L2T_STATE_RESOLVING;
-		memcpy(e->addr, addr, addr_len);
-		e->ifindex = ifp->if_index;
-		e->smt_idx = smt_idx;
-		e->ifp = ifp;
-		e->hash = hash;
-		e->lport = pi->lport;
-		e->v6 = (addr_len == 16);
-		e->lle = NULL;
-		atomic_store_rel_int(&e->refcnt, 1);
-#ifdef VLAN_TAG
-		if (ifp->if_type == IFT_L2VLAN)
-			VLAN_TAG(ifp, &e->vlan);
-		else
-			e->vlan = VLAN_NONE;
-#endif
-		e->next = d->l2tab[hash].first;
-		d->l2tab[hash].first = e;
-		mtx_unlock(&e->lock);
-	}
-done:
-	rw_wunlock(&d->lock);
-	return e;
-}
-
-/*
- * Called when the host's neighbor layer makes a change to some entry that is
- * loaded into the HW L2 table.
- */
-void
-t4_l2t_update(struct adapter *sc, struct llentry *lle)
-{
-	struct l2t_entry *e;
-	struct l2t_data *d = sc->l2t;
-	struct sockaddr *sa = L3_ADDR(lle);
-	struct llentry *old_lle = NULL;
-	uint32_t *addr = (uint32_t *)&SINADDR(sa);
-	struct ifnet *ifp = lle->lle_tbl->llt_ifp;
-	int hash = addr_hash(addr, sizeof(*addr), ifp->if_index);
-
-	KASSERT(d != NULL, ("%s: no L2 table", __func__));
-	LLE_WLOCK_ASSERT(lle);
-	KASSERT(lle->la_flags & LLE_VALID || lle->la_flags & LLE_DELETED,
-	    ("%s: entry neither valid nor deleted.", __func__));
-
-	rw_rlock(&d->lock);
-	for (e = d->l2tab[hash].first; e; e = e->next) {
-		if (!addreq(e, addr) && e->ifp == ifp) {
-			mtx_lock(&e->lock);
-			if (atomic_load_acq_int(&e->refcnt))
-				goto found;
-			e->state = L2T_STATE_STALE;
-			mtx_unlock(&e->lock);
-			break;
-		}
-	}
-	rw_runlock(&d->lock);
-
-	/* The TOE has no interest in this LLE */
-	return;
-
- found:
-	rw_runlock(&d->lock);
-
-        if (atomic_load_acq_int(&e->refcnt)) {
-
-                /* Entry is referenced by at least 1 offloaded connection. */
-
-                /* Handle deletes first */
-                if (lle->la_flags & LLE_DELETED) {
-                        if (lle == e->lle) {
-                                e->lle = NULL;
-                                e->state = L2T_STATE_RESOLVING;
-                                LLE_REMREF(lle);
-                        }
-                        goto done;
-                }
-
-                if (lle != e->lle) {
-                        old_lle = e->lle;
-                        LLE_ADDREF(lle);
-                        e->lle = lle;
-                }
-
-                if (e->state == L2T_STATE_RESOLVING ||
-                    memcmp(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN)) {
-
-                        /* unresolved -> resolved; or dmac changed */
-
-                        memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN);
-			write_l2e(sc, e, 1);
-                } else {
-
-                        /* +ve reinforcement of a valid or stale entry */
-
-                }
-
-                e->state = L2T_STATE_VALID;
-
-        } else {
-                /*
-                 * Entry was used previously but is unreferenced right now.
-                 * e->lle has been released and NULL'd out by t4_l2t_free, or
-                 * l2t_release is about to call t4_l2t_free and do that.
-                 *
-                 * Either way this is of no interest to us.
-                 */
-        }
-
-done:
-        mtx_unlock(&e->lock);
-        if (old_lle)
-                LLE_FREE(old_lle);
-}
-
-#endif
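With the TOE-specific paths moved out, t4_l2t.c keeps only the generic
allocate/write/reply plumbing.  A hedged sketch of a synchronous entry update
using what remains; the caller name and the dmac-update scenario are
illustrative.  t4_write_l2e() asserts the entry lock is held, and with sync=1
the entry stays in L2T_STATE_SYNC_WRITE until the CPL_L2T_WRITE_RPL arrives.

static int
example_update_dmac(struct adapter *sc, struct l2t_entry *e,
    const uint8_t *dmac)
{
	int rc;

	mtx_lock(&e->lock);
	memcpy(e->dmac, dmac, ETHER_ADDR_LEN);
	rc = t4_write_l2e(sc, e, 1);	/* sync write */
	mtx_unlock(&e->lock);
	return (rc);
}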
diff -r 7cec8c20120e sys/dev/cxgbe/t4_l2t.h
--- a/sys/dev/cxgbe/t4_l2t.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgbe/t4_l2t.h	Mon Jun 11 00:15:24 2012 -0700
@@ -30,8 +30,25 @@
 #ifndef __T4_L2T_H
 #define __T4_L2T_H
 
+/* identifies sync vs async L2T_WRITE_REQs */
+#define S_SYNC_WR    12
+#define V_SYNC_WR(x) ((x) << S_SYNC_WR)
+#define F_SYNC_WR    V_SYNC_WR(1)
+
 enum { L2T_SIZE = 4096 };     /* # of L2T entries */
 
+enum {
+	L2T_STATE_VALID,	/* entry is up to date */
+	L2T_STATE_STALE,	/* entry may be used but needs revalidation */
+	L2T_STATE_RESOLVING,	/* entry needs address resolution */
+	L2T_STATE_FAILED,	/* failed to resolve */
+	L2T_STATE_SYNC_WRITE,	/* synchronous write of entry underway */
+
+	/* when state is one of the below the entry is not hashed */
+	L2T_STATE_SWITCHING,	/* entry is being used by a switching filter */
+	L2T_STATE_UNUSED	/* entry not in use */
+};
+
 /*
  * Each L2T entry plays multiple roles.  First of all, it keeps state for the
  * corresponding entry of the HW L2 table and maintains a queue of offload
@@ -43,39 +60,49 @@
 struct l2t_entry {
 	uint16_t state;			/* entry state */
 	uint16_t idx;			/* entry index */
-	uint32_t addr[4];		/* next hop IP or IPv6 address */
+	uint32_t addr;			/* next hop IP address */
 	struct ifnet *ifp;		/* outgoing interface */
 	uint16_t smt_idx;		/* SMT index */
 	uint16_t vlan;			/* VLAN TCI (id: 0-11, prio: 13-15) */
-	int ifindex;			/* interface index */
-	struct llentry *lle;		/* llentry for next hop */
 	struct l2t_entry *first;	/* start of hash chain */
 	struct l2t_entry *next;		/* next l2t_entry on chain */
-	struct mbuf *arpq_head;		/* list of mbufs awaiting resolution */
-	struct mbuf *arpq_tail;
+	STAILQ_HEAD(, wrqe) wr_list;	/* list of WRs awaiting resolution */
 	struct mtx lock;
 	volatile int refcnt;		/* entry reference count */
 	uint16_t hash;			/* hash bucket the entry is on */
-	uint8_t v6;			/* whether entry is for IPv6 */
 	uint8_t lport;			/* associated offload logical port */
 	uint8_t dmac[ETHER_ADDR_LEN];	/* next hop's MAC address */
 };
 
+struct l2t_data {
+	struct rwlock lock;
+	volatile int nfree;	/* number of free entries */
+	struct l2t_entry *rover;/* starting point for next allocation */
+	struct l2t_entry l2tab[L2T_SIZE];
+};
+
+
 int t4_init_l2t(struct adapter *, int);
 int t4_free_l2t(struct l2t_data *);
+struct l2t_entry *t4_alloc_l2e(struct l2t_data *);
 struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *);
 int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t,
     uint8_t, uint8_t *);
-void t4_l2t_release(struct l2t_entry *);
+int t4_write_l2e(struct adapter *, struct l2t_entry *, int);
+int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
+
+static inline void
+t4_l2t_release(struct l2t_entry *e)
+{
+	struct l2t_data *d = container_of(e, struct l2t_data, l2tab[e->idx]);
+
+	if (atomic_fetchadd_int(&e->refcnt, -1) == 1)
+		atomic_add_int(&d->nfree, 1);
+}
+
+
 #ifdef SBUF_DRAIN
 int sysctl_l2t(SYSCTL_HANDLER_ARGS);
 #endif
 
-#ifndef TCP_OFFLOAD_DISABLE
-struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *,
-    struct sockaddr *);
-int t4_l2t_send(struct adapter *, struct mbuf *, struct l2t_entry *);
-void t4_l2t_update(struct adapter *, struct llentry *);
-#endif
-
 #endif  /* __T4_L2T_H */
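t4_l2t_release() above is only the drop side of the reference scheme.  The
matching hold, removed from t4_l2t.c by this patch and presumably carried by
the TOM code instead, looks like the sketch below; it mirrors the deleted
l2t_hold() and the example_ name is a placeholder.

static inline void
example_l2t_hold(struct l2t_data *d, struct l2t_entry *e)
{
	if (atomic_fetchadd_int(&e->refcnt, 1) == 0)	/* 0 -> 1 transition */
		atomic_subtract_int(&d->nfree, 1);
}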
diff -r 7cec8c20120e sys/dev/cxgbe/t4_main.c
--- a/sys/dev/cxgbe/t4_main.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgbe/t4_main.c	Mon Jun 11 00:15:24 2012 -0700
@@ -119,9 +119,13 @@
 
 MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4 Ethernet driver and services");
 
+/*
+ * Correct lock order when you need to acquire multiple locks is t4_list_lock,
+ * then ADAPTER_LOCK, then t4_uld_list_lock.
+ */
 static struct mtx t4_list_lock;
 static SLIST_HEAD(, adapter) t4_list;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 static struct mtx t4_uld_list_lock;
 static SLIST_HEAD(, uld_info) t4_uld_list;
 #endif
@@ -149,7 +153,7 @@
 static int t4_nrxq1g = -1;
 TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g);
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 #define NOFLDTXQ_10G 8
 static int t4_nofldtxq10g = -1;
 TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g);
@@ -237,7 +241,7 @@
 	int nrxq10g;		/* # of NIC rxq's for each 10G port */
 	int ntxq1g;		/* # of NIC txq's for each 1G port */
 	int nrxq1g;		/* # of NIC rxq's for each 1G port */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	int nofldtxq10g;	/* # of TOE txq's for each 10G port */
 	int nofldrxq10g;	/* # of TOE rxq's for each 10G port */
 	int nofldtxq1g;		/* # of TOE txq's for each 1G port */
@@ -297,8 +301,10 @@
     unsigned int);
 static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *);
 static void cxgbe_tick(void *);
+static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t);
 static int cpl_not_handled(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
+static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *);
 static int t4_sysctls(struct adapter *);
 static int cxgbe_sysctls(struct port_info *);
 static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
@@ -342,10 +348,8 @@
     struct mbuf *);
 static int get_sge_context(struct adapter *, struct t4_sge_context *);
 static int read_card_mem(struct adapter *, struct t4_mem_range *);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 static int toe_capability(struct port_info *, int);
-static int activate_uld(struct adapter *, int, struct uld_softc *);
-static int deactivate_uld(struct uld_softc *);
 #endif
 static int t4_mod_event(module_t, int, void *);
 
@@ -368,8 +372,12 @@
 	{0x440a, 4, "Chelsio T404-BT"},
 };
 
-#ifndef TCP_OFFLOAD_DISABLE
-/* This is used in service_iq() to get to the fl associated with an iq. */
+#ifdef TCP_OFFLOAD
+/*
+ * service_iq() has an iq and needs the fl.  Offset of fl from the iq should be
+ * exactly the same for both rxq and ofld_rxq.
+ */
+CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq));
 CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
 #endif
 
@@ -401,7 +409,7 @@
 	int rc = 0, i, n10g, n1g, rqidx, tqidx;
 	struct intrs_and_queues iaq;
 	struct sge *s;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	int ofld_rqidx, ofld_tqidx;
 #endif
 
@@ -436,6 +444,7 @@
 		goto done; /* error message displayed already */
 
 	memset(sc->chan_map, 0xff, sizeof(sc->chan_map));
+	sc->an_handler = an_not_handled;
 	for (i = 0; i < ARRAY_SIZE(sc->cpl_handler); i++)
 		sc->cpl_handler[i] = cpl_not_handled;
 	t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, filter_rpl);
@@ -595,7 +604,7 @@
 	s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */
 	s->niq = s->nrxq + 1;		/* 1 extra for firmware event queue */
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 
 		s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g;
@@ -631,7 +640,7 @@
 	 * tx queues that each port should get.
 	 */
 	rqidx = tqidx = 0;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	ofld_rqidx = ofld_tqidx = 0;
 #endif
 	for_each_port(sc, i) {
@@ -653,7 +662,7 @@
 		rqidx += pi->nrxq;
 		tqidx += pi->ntxq;
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		if (is_offload(sc)) {
 			pi->first_ofld_rxq = ofld_rqidx;
 			pi->first_ofld_txq = ofld_tqidx;
@@ -761,7 +770,7 @@
 	if (sc->l2t)
 		t4_free_l2t(sc->l2t);
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	free(sc->sge.ofld_rxq, M_CXGBE);
 	free(sc->sge.ofld_txq, M_CXGBE);
 #endif
@@ -832,7 +841,7 @@
 	ifp->if_qflush = cxgbe_qflush;
 
 	ifp->if_capabilities = T4_CAP;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (is_offload(pi->adapter))
 		ifp->if_capabilities |= IFCAP_TOE4;
 #endif
@@ -844,9 +853,12 @@
 	    cxgbe_media_status);
 	build_medialist(pi);
 
+	pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp,
+	    EVENTHANDLER_PRI_ANY);
+
 	ether_ifattach(ifp, pi->hw_addr);
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (is_offload(pi->adapter)) {
 		device_printf(dev,
 		    "%d txq, %d rxq (NIC); %d txq, %d rxq (TOE)\n",
@@ -876,6 +888,9 @@
 	SET_BUSY(sc);
 	ADAPTER_UNLOCK(sc);
 
+	if (pi->vlan_c)
+		EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c);
+
 	PORT_LOCK(pi);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	callout_stop(&pi->tick);
@@ -1042,7 +1057,7 @@
 			}
 #endif
 		}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		if (mask & IFCAP_TOE) {
 			int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE;
 
@@ -1292,7 +1307,7 @@
 	iaq->ntxq1g = t4_ntxq1g;
 	iaq->nrxq10g = nrxq10g = t4_nrxq10g;
 	iaq->nrxq1g = nrxq1g = t4_nrxq1g;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	iaq->nofldtxq10g = t4_nofldtxq10g;
 	iaq->nofldtxq1g = t4_nofldtxq1g;
 	iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g;
@@ -1364,7 +1379,7 @@
 					n++;
 				}
 				iaq->nrxq10g = min(n, nrxq10g);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 				iaq->nofldrxq10g = min(n, nofldrxq10g);
 #endif
 			}
@@ -1379,7 +1394,7 @@
 					n++;
 				}
 				iaq->nrxq1g = min(n, nrxq1g);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 				iaq->nofldrxq1g = min(n, nofldrxq1g);
 #endif
 			}
@@ -1392,7 +1407,7 @@
 		 * Least desirable option: one interrupt vector for everything.
 		 */
 		iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		iaq->nofldrxq10g = iaq->nofldrxq1g = 1;
 #endif
 
@@ -2305,7 +2320,7 @@
 	struct irq *irq;
 	struct port_info *pi;
 	struct sge_rxq *rxq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 
@@ -2369,7 +2384,7 @@
 		for_each_port(sc, p) {
 			pi = sc->port[p];
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 			/*
 			 * Skip over the NIC queues if they aren't taking direct
 			 * interrupts.
@@ -2386,7 +2401,7 @@
 				rid++;
 			}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 			/*
 			 * Skip over the offload queues if they aren't taking
 			 * direct interrupts.
@@ -2494,7 +2509,7 @@
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
 #endif
@@ -2507,7 +2522,7 @@
 			quiesce_eq(sc, &txq->eq);
 		}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		for_each_ofld_txq(pi, i, ofld_txq) {
 			quiesce_eq(sc, &ofld_txq->eq);
 		}
@@ -2518,7 +2533,7 @@
 			quiesce_fl(sc, &rxq->fl);
 		}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		for_each_ofld_rxq(pi, i, ofld_rxq) {
 			quiesce_iq(sc, &ofld_rxq->iq);
 			quiesce_fl(sc, &ofld_rxq->fl);
@@ -2892,14 +2907,27 @@
 	PORT_UNLOCK(pi);
 }
 
+static void
+cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid)
+{
+	struct ifnet *vlan;
+
+	if (arg != ifp)
+		return;
+
+	vlan = VLAN_DEVAT(ifp, vid);
+	VLAN_SETCOOKIE(vlan, ifp);
+}
+
 static int
 cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
+
 #ifdef INVARIANTS
-	panic("%s: opcode %02x on iq %p with payload %p",
+	panic("%s: opcode 0x%02x on iq %p with payload %p",
 	    __func__, rss->opcode, iq, m);
 #else
-	log(LOG_ERR, "%s: opcode %02x on iq %p with payload %p",
+	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p",
 	    __func__, rss->opcode, iq, m);
 	m_freem(m);
 #endif
@@ -2922,6 +2950,31 @@
 }
 
 static int
+an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl)
+{
+
+#ifdef INVARIANTS
+	panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
+#else
+	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)",
+	    __func__, iq, ctrl);
+#endif
+	return (EDOOFUS);
+}
+
+int
+t4_register_an_handler(struct adapter *sc, an_handler_t h)
+{
+	uintptr_t *loc, new;
+
+	new = h ? (uintptr_t)h : (uintptr_t)an_not_handled;
+	loc = (uintptr_t *) &sc->an_handler;
+	atomic_store_rel_ptr(loc, new);
+
+	return (0);
+}
+
+static int
 t4_sysctls(struct adapter *sc)
 {
 	struct sysctl_ctx_list *ctx;
@@ -3072,7 +3125,7 @@
 	    sysctl_tx_rate, "A", "Tx rate");
 #endif
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		/*
 		 * dev.t4nex.X.toe.
@@ -3125,7 +3178,7 @@
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD,
 	    &pi->first_txq, 0, "index of first tx queue");
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (is_offload(pi->adapter)) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD,
 		    &pi->nofldrxq, 0,
@@ -4543,7 +4596,7 @@
 		goto done;
 	}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (sc->offload_map) {
 		rc = EBUSY;
 		goto done;
@@ -4734,7 +4787,7 @@
 set_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *f = &sc->tids.ftid_tab[fidx];
-	struct mbuf *m;
+	struct wrqe *wr;
 	struct fw_filter_wr *fwr;
 	unsigned int ftid;
 
@@ -4755,12 +4808,11 @@
 
 	ftid = sc->tids.ftid_base + fidx;
 
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL)
+	wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
+	if (wr == NULL)
 		return (ENOMEM);
 
-	fwr = mtod(m, struct fw_filter_wr *);
-	m->m_len = m->m_pkthdr.len = sizeof(*fwr);
+	fwr = wrtod(wr);
 	bzero(fwr, sizeof (*fwr));
 
 	fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR));
@@ -4830,7 +4882,7 @@
 	f->pending = 1;
 	sc->tids.ftids_in_use++;
 
-	t4_mgmt_tx(sc, m);
+	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
@@ -4838,7 +4890,7 @@
 del_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *f = &sc->tids.ftid_tab[fidx];
-	struct mbuf *m;
+	struct wrqe *wr;
 	struct fw_filter_wr *fwr;
 	unsigned int ftid;
 
@@ -4846,18 +4898,16 @@
 
 	ftid = sc->tids.ftid_base + fidx;
 
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL)
+	wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
+	if (wr == NULL)
 		return (ENOMEM);
-
-	fwr = mtod(m, struct fw_filter_wr *);
-	m->m_len = m->m_pkthdr.len = sizeof(*fwr);
+	fwr = wrtod(wr);
 	bzero(fwr, sizeof (*fwr));
 
 	t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id);
 
 	f->pending = 1;
-	t4_mgmt_tx(sc, m);
+	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
@@ -5215,7 +5265,7 @@
 	return (rc);
 }
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 static int
 toe_capability(struct port_info *pi, int enable)
 {
@@ -5228,13 +5278,28 @@
 		return (ENODEV);
 
 	if (enable) {
+		if (!(sc->flags & FULL_INIT_DONE)) {
+			log(LOG_WARNING,
+			    "You must enable a cxgbe interface first\n");
+			return (EAGAIN);
+		}
+
 		if (isset(&sc->offload_map, pi->port_id))
 			return (0);
 
-		if (sc->offload_map == 0) {
-			rc = activate_uld(sc, ULD_TOM, &sc->tom);
+		if (!(sc->flags & TOM_INIT_DONE)) {
+			rc = t4_activate_uld(sc, ULD_TOM);
+			if (rc == EAGAIN) {
+				log(LOG_WARNING,
+				    "You must kldload t4_tom.ko before trying "
+				    "to enable TOE on a cxgbe interface.\n");
+			}
 			if (rc != 0)
 				return (rc);
+			KASSERT(sc->tom_softc != NULL,
+			    ("%s: TOM activated but softc NULL", __func__));
+			KASSERT(sc->flags & TOM_INIT_DONE,
+			    ("%s: TOM activated but flag not set", __func__));
 		}
 
 		setbit(&sc->offload_map, pi->port_id);
@@ -5242,15 +5307,9 @@
 		if (!isset(&sc->offload_map, pi->port_id))
 			return (0);
 
+		KASSERT(sc->flags & TOM_INIT_DONE,
+		    ("%s: TOM never initialized?", __func__));
 		clrbit(&sc->offload_map, pi->port_id);
-
-		if (sc->offload_map == 0) {
-			rc = deactivate_uld(&sc->tom);
-			if (rc != 0) {
-				setbit(&sc->offload_map, pi->port_id);
-				return (rc);
-			}
-		}
 	}
 
 	return (0);
@@ -5305,8 +5364,8 @@
 	return (rc);
 }
 
-static int
-activate_uld(struct adapter *sc, int id, struct uld_softc *usc)
+int
+t4_activate_uld(struct adapter *sc, int id)
 {
 	int rc = EAGAIN;
 	struct uld_info *ui;
@@ -5315,13 +5374,9 @@
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
-			rc = ui->attach(sc, &usc->softc);
-			if (rc == 0) {
-				KASSERT(usc->softc != NULL,
-				    ("%s: ULD %d has no state", __func__, id));
+			rc = ui->activate(sc);
+			if (rc == 0)
 				ui->refcount++;
-				usc->uld = ui;
-			}
 			goto done;
 		}
 	}
@@ -5331,25 +5386,21 @@
 	return (rc);
 }
 
-static int
-deactivate_uld(struct uld_softc *usc)
+int
+t4_deactivate_uld(struct adapter *sc, int id)
 {
-	int rc;
+	int rc = EINVAL;
+	struct uld_info *ui;
 
 	mtx_lock(&t4_uld_list_lock);
 
-	if (usc->uld == NULL || usc->softc == NULL) {
-		rc = EINVAL;
-		goto done;
-	}
-
-	rc = usc->uld->detach(usc->softc);
-	if (rc == 0) {
-		KASSERT(usc->uld->refcount > 0,
-		    ("%s: ULD has bad refcount", __func__));
-		usc->uld->refcount--;
-		usc->uld = NULL;
-		usc->softc = NULL;
+	SLIST_FOREACH(ui, &t4_uld_list, link) {
+		if (ui->uld_id == id) {
+			rc = ui->deactivate(sc);
+			if (rc == 0)
+				ui->refcount--;
+			goto done;
+		}
 	}
 done:
 	mtx_unlock(&t4_uld_list_lock);
@@ -5379,7 +5430,7 @@
 	if (t4_nrxq1g < 1)
 		t4_nrxq1g = min(nc, NRXQ_1G);
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (t4_nofldtxq10g < 1)
 		t4_nofldtxq10g = min(nc, NOFLDTXQ_10G);
 
@@ -5426,7 +5477,7 @@
 		t4_sge_modload();
 		mtx_init(&t4_list_lock, "T4 adapters", 0, MTX_DEF);
 		SLIST_INIT(&t4_list);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		mtx_init(&t4_uld_list_lock, "T4 ULDs", 0, MTX_DEF);
 		SLIST_INIT(&t4_uld_list);
 #endif
@@ -5434,7 +5485,7 @@
 		break;
 
 	case MOD_UNLOAD:
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		mtx_lock(&t4_uld_list_lock);
 		if (!SLIST_EMPTY(&t4_uld_list)) {
 			rc = EBUSY;
diff -r 7cec8c20120e sys/dev/cxgbe/t4_sge.c
--- a/sys/dev/cxgbe/t4_sge.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/dev/cxgbe/t4_sge.c	Mon Jun 11 00:15:24 2012 -0700
@@ -34,6 +34,7 @@
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
+#include <sys/kdb.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
@@ -51,7 +52,6 @@
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
-#include "t4_l2t.h"
 
 struct fl_buf_info {
 	int size;
@@ -115,14 +115,14 @@
 static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int,
     struct sysctl_oid *);
 static int free_rxq(struct port_info *, struct sge_rxq *);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int,
     struct sysctl_oid *);
 static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *);
@@ -397,7 +397,7 @@
 		if (i == pi->port_id)
 			break;
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		if (sc->flags & INTR_DIRECT)
 			rc += pi->nrxq + pi->nofldrxq;
 		else
@@ -434,7 +434,7 @@
 	if (sc->intr_count == 1)
 		return (&sc->sge.fwq);
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (sc->flags & INTR_DIRECT) {
 		idx %= pi->nrxq + pi->nofldrxq;
 		
@@ -475,19 +475,20 @@
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 	struct sge_wrq *ctrlq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
+	struct sysctl_oid *oid2 = NULL;
 #endif
 	char name[16];
 	struct adapter *sc = pi->adapter;
-	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev), *oid2 = NULL;
+	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
 	    NULL, "rx queues");
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		oid2 = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq",
 		    CTLFLAG_RD, NULL,
@@ -515,7 +516,7 @@
 		init_fl(&rxq->fl, pi->qsize_rxq / 8, pi->ifp->if_mtu, name);
 
 		if (sc->flags & INTR_DIRECT
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		    || (sc->intr_count > 1 && pi->nrxq >= pi->nofldrxq)
 #endif
 		   ) {
@@ -527,7 +528,7 @@
 		}
 	}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(pi, i, ofld_rxq) {
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-iq",
@@ -567,7 +568,7 @@
 		j++;
 	}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(pi, i, ofld_rxq) {
 		if (ofld_rxq->iq.flags & IQ_INTR)
 			continue;
@@ -603,7 +604,7 @@
 		j++;
 	}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
 	for_each_ofld_txq(pi, i, ofld_txq) {
@@ -655,7 +656,7 @@
 	struct adapter *sc = pi->adapter;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
 #endif
@@ -677,7 +678,7 @@
 		free_txq(pi, txq);
 	}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	for_each_ofld_txq(pi, i, ofld_txq) {
 		free_wrq(sc, ofld_txq);
 	}
@@ -693,7 +694,7 @@
 			free_rxq(pi, rxq);
 	}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(pi, i, ofld_rxq) {
 		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
 			free_ofld_rxq(pi, ofld_rxq);
@@ -709,7 +710,7 @@
 			free_rxq(pi, rxq);
 	}
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(pi, i, ofld_rxq) {
 		if (ofld_rxq->iq.flags & IQ_INTR)
 			free_ofld_rxq(pi, ofld_rxq);
@@ -775,7 +776,7 @@
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
-	struct sge_rxq *rxq = (void *)iq;	/* Use iff iq is part of rxq */
+	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
 	struct sge_fl *fl = &rxq->fl;		/* Use iff IQ_HAS_FL */
 	struct adapter *sc = iq->adapter;
 	struct rsp_ctrl *ctrl;
@@ -862,7 +863,8 @@
 				break;
 
 			default:
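+				/*
+				 * Unknown response types are passed to the
+				 * adapter's async notification handler.
+				 */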
-				panic("%s: rsp_type %u", __func__, rsp_type);
+				sc->an_handler(iq, ctrl);
+				break;
 			}
 
 			iq_next(iq);
@@ -1076,42 +1078,33 @@
 	return (0);
 }
 
-int
-t4_mgmt_tx(struct adapter *sc, struct mbuf *m)
-{
-	return t4_wrq_tx(sc, &sc->sge.mgmtq, m);
-}
-
 /*
  * Doesn't fail.  Holds on to work requests it can't send right away.
  */
-int
-t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0)
+void
+t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 	struct sge_eq *eq = &wrq->eq;
 	int can_reclaim;
 	caddr_t dst;
-	struct mbuf *wr, *next;
 
 	TXQ_LOCK_ASSERT_OWNED(wrq);
+#ifdef TCP_OFFLOAD
 	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD ||
 	    (eq->flags & EQ_TYPEMASK) == EQ_CTRL,
 	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
-
-	if (__predict_true(m0 != NULL)) {
-		if (wrq->head)
-			wrq->tail->m_nextpkt = m0;
-		else
-			wrq->head = m0;
-		while (m0->m_nextpkt)
-			m0 = m0->m_nextpkt;
-		wrq->tail = m0;
-	}
+#else
+	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL,
+	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
+#endif
+
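+	/*
+	 * Work requests go out in submission order; whatever can't be written
+	 * to the ring now stays on wr_list and is retried from the tx callout.
+	 */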
+	if (__predict_true(wr != NULL))
+		STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 
 	can_reclaim = reclaimable(eq);
 	if (__predict_false(eq->flags & EQ_STALLED)) {
 		if (can_reclaim < tx_resume_threshold(eq))
-			return (0);
+			return;
 		eq->flags &= ~EQ_STALLED;
 		eq->unstalled++;
 	}
@@ -1120,39 +1113,34 @@
 	if (__predict_false(eq->cidx >= eq->cap))
 		eq->cidx -= eq->cap;
 
-	for (wr = wrq->head; wr; wr = next) {
+	while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) {
 		int ndesc;
-		struct mbuf *m;
-
-		next = wr->m_nextpkt;
-		wr->m_nextpkt = NULL;
-
-		M_ASSERTPKTHDR(wr);
-		KASSERT(wr->m_pkthdr.len > 0 && (wr->m_pkthdr.len & 0x7) == 0,
-		    ("%s: work request len %d.", __func__, wr->m_pkthdr.len));
-
-		if (wr->m_pkthdr.len > SGE_MAX_WR_LEN) {
+
+		if (__predict_false(wr->wr_len < 0 ||
+		    wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) {
+
 #ifdef INVARIANTS
-			panic("%s: oversized work request", __func__);
-#else
-			log(LOG_ERR, "%s: %s work request too long (%d)",
-			    device_get_nameunit(sc->dev), __func__,
-			    wr->m_pkthdr.len);
-			m_freem(wr);
+			panic("%s: work request with length %d", __func__,
+			    wr->wr_len);
+#endif
+#ifdef KDB
+			kdb_backtrace();
+#endif
+			log(LOG_ERR, "%s: %s work request with length %d",
+			    device_get_nameunit(sc->dev), __func__, wr->wr_len);
+			STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
+			free_wrqe(wr);
 			continue;
-#endif
 		}
 
-		ndesc = howmany(wr->m_pkthdr.len, EQ_ESIZE);
+		ndesc = howmany(wr->wr_len, EQ_ESIZE);
 		if (eq->avail < ndesc) {
-			wr->m_nextpkt = next;
 			wrq->no_desc++;
 			break;
 		}
 
 		dst = (void *)&eq->desc[eq->pidx];
-		for (m = wr; m; m = m->m_next)
-			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
+		copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len);
 
 		eq->pidx += ndesc;
 		eq->avail -= ndesc;
@@ -1164,7 +1152,8 @@
 			ring_eq_db(sc, eq);
 
 		wrq->tx_wrs++;
-		m_freem(wr);
+		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
+		free_wrqe(wr);
 
 		if (eq->avail < 8) {
 			can_reclaim = reclaimable(eq);
@@ -1178,20 +1167,11 @@
 	if (eq->pending)
 		ring_eq_db(sc, eq);
 
-	if (wr == NULL)
-		wrq->head = wrq->tail = NULL;
-	else {
-		wrq->head = wr;
-
-		KASSERT(wrq->tail->m_nextpkt == NULL,
-		    ("%s: wrq->tail grew a tail of its own", __func__));
-
+	if (wr != NULL) {
 		eq->flags |= EQ_STALLED;
 		if (callout_pending(&eq->tx_callout) == 0)
 			callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
 	}
-
-	return (0);
 }
 
 /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */
@@ -1792,6 +1772,7 @@
 static int
 free_mgmtq(struct adapter *sc)
 {
+
 	return free_wrq(sc, &sc->sge.mgmtq);
 }
 
@@ -1885,7 +1866,7 @@
 	return (rc);
 }
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 static int
 alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq,
     int intr_idx, int idx, struct sysctl_oid *oid)
@@ -2031,7 +2012,7 @@
 	return (rc);
 }
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 static int
 ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 {
@@ -2103,7 +2084,7 @@
 		rc = eth_eq_alloc(sc, pi, eq);
 		break;
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, pi, eq);
 		break;
@@ -2141,7 +2122,7 @@
 			    eq->cntxt_id);
 			break;
 
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
 		case EQ_OFLD:
 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
@@ -2183,6 +2164,7 @@
 		return (rc);
 
 	wrq->adapter = sc;
+	STAILQ_INIT(&wrq->wr_list);
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
@@ -3179,7 +3161,7 @@
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
-	if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) {
+	if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) {
 		bcopy(from, *to, len);
 		(*to) += len;
 	} else {
diff -r 7cec8c20120e sys/dev/cxgbe/tom/t4_connect.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/tom/t4_connect.c	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,377 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* atid services */
+static int alloc_atid(struct adapter *, void *);
+static void *lookup_atid(struct adapter *, int);
+static void free_atid(struct adapter *, int);
+
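+/*
+ * atids come from an embedded free list: a free aopen_entry overlays its
+ * "next" pointer on the space that holds the caller's context pointer while
+ * the atid is in use, so alloc and free are both O(1) under atid_lock.
+ */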
+static int
+alloc_atid(struct adapter *sc, void *ctx)
+{
+	struct tid_info *t = &sc->tids;
+	int atid = -1;
+
+	mtx_lock(&t->atid_lock);
+	if (t->afree) {
+		union aopen_entry *p = t->afree;
+
+		atid = p - t->atid_tab;
+		t->afree = p->next;
+		p->data = ctx;
+		t->atids_in_use++;
+	}
+	mtx_unlock(&t->atid_lock);
+	return (atid);
+}
+
+static void *
+lookup_atid(struct adapter *sc, int atid)
+{
+	struct tid_info *t = &sc->tids;
+
+	return (t->atid_tab[atid].data);
+}
+
+static void
+free_atid(struct adapter *sc, int atid)
+{
+	struct tid_info *t = &sc->tids;
+	union aopen_entry *p = &t->atid_tab[atid];
+
+	mtx_lock(&t->atid_lock);
+	p->next = t->afree;
+	t->afree = p;
+	t->atids_in_use--;
+	mtx_unlock(&t->atid_lock);
+}
+
+/*
+ * Active open succeeded (the failure case is handled by do_act_open_rpl).
+ */
+static int
+do_act_establish(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_act_establish *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	unsigned int atid = G_TID_TID(ntohl(cpl->tos_atid));
+	struct toepcb *toep = lookup_atid(sc, atid);
+	struct inpcb *inp = toep->inp;
+
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
+
+	CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid);
+	free_atid(sc, atid);
+
+	INP_WLOCK(inp);
+	toep->tid = tid;
+	insert_tid(sc, tid, toep);
+	if (inp->inp_flags & INP_DROPPED) {
+
+		/* socket closed by the kernel before hw told us it connected */
+
+		send_flowc_wr(toep, NULL);
+		send_reset(sc, toep, be32toh(cpl->snd_isn));
+		goto done;
+	}
+
+	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
+done:
+	INP_WUNLOCK(inp);
+	return (0);
+}
+
+static inline int
+act_open_has_tid(unsigned int status)
+{
+
+	return (status != CPL_ERR_TCAM_FULL &&
+	    status != CPL_ERR_TCAM_PARITY &&
+	    status != CPL_ERR_CONN_EXIST &&
+	    status != CPL_ERR_ARP_MISS);
+}
+
+/*
+ * Convert an ACT_OPEN_RPL status to an errno.
+ */
+static inline int
+act_open_rpl_status_to_errno(int status)
+{
+
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+		return (ECONNREFUSED);
+	case CPL_ERR_ARP_MISS:
+		return (EHOSTUNREACH);
+	case CPL_ERR_CONN_TIMEDOUT:
+		return (ETIMEDOUT);
+	case CPL_ERR_TCAM_FULL:
+		return (ENOMEM);
+	case CPL_ERR_CONN_EXIST:
+		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
+		return (EADDRINUSE);
+	default:
+		return (EIO);
+	}
+}
+
+static int
+do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
+	unsigned int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status)));
+	unsigned int status = G_AOPEN_STATUS(be32toh(cpl->atid_status));
+	struct toepcb *toep = lookup_atid(sc, atid);
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toedev *tod = &toep->td->tod;
+
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
+
+	CTR3(KTR_CXGBE, "%s: atid %u, status %u", __func__, atid, status);
+
+	/* Ignore negative advice */
+	if (status == CPL_ERR_RTX_NEG_ADVICE)
+		return (0);
+
+	free_atid(sc, atid);
+	toep->tid = -1;
+
+	if (status && act_open_has_tid(status))
+		release_tid(sc, GET_TID(cpl), toep->ctrlq);
+
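+	/*
+	 * CPL_ERR_TCAM_FULL is reported as EAGAIN, which lets the connection
+	 * be retried without tearing down the tcpcb; every other status is
+	 * fatal and needs the tcbinfo lock for the teardown.
+	 */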
+	if (status == CPL_ERR_TCAM_FULL) {
+		INP_WLOCK(inp);
+		toe_connect_failed(tod, tp, EAGAIN);
+		final_cpl_received(toep);	/* unlocks inp */
+	} else {
+		INP_INFO_WLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		toe_connect_failed(tod, tp, act_open_rpl_status_to_errno(status));
+		final_cpl_received(toep);	/* unlocks inp */
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	}
+
+	return (0);
+}
+
+/*
+ * Options2 for active open.
+ */
+static uint32_t
+calc_opt2a(struct socket *so)
+{
+	struct tcpcb *tp = so_sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	struct port_info *pi = toep->port;
+	struct adapter *sc = pi->adapter;
+	uint32_t opt2 = 0;
+
+	if (tp->t_flags & TF_SACK_PERMIT)
+		opt2 |= F_SACK_EN;
+
+	if (tp->t_flags & TF_REQ_TSTMP)
+		opt2 |= F_TSTAMPS_EN;
+
+	if (tp->t_flags & TF_REQ_SCALE)
+		opt2 |= F_WND_SCALE_EN;
+
+	if (V_tcp_do_ecn)
+		opt2 |= F_CCTRL_ECN;
+
+	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
+	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
+	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
+
+	return (htobe32(opt2));
+}
+
+
+void
+t4_init_connect_cpl_handlers(struct adapter *sc)
+{
+
+	t4_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
+	t4_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
+}
+
+/*
+ * Active open (soconnect).
+ *
+ * State of affairs on entry:
+ * soisconnecting (so_state |= SS_ISCONNECTING)
+ * tcbinfo not locked (this has changed; it used to be WLOCKed)
+ * inp WLOCKed
+ * tp->t_state = TCPS_SYN_SENT
+ * rt obtained via rtalloc1, then RT_UNLOCKed.
+ */
+int
+t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
+    struct sockaddr *nam)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct toepcb *toep = NULL;
+	struct wrqe *wr = NULL;
+	struct cpl_act_open_req *cpl;
+	struct l2t_entry *e = NULL;
+	struct ifnet *rt_ifp = rt->rt_ifp;
+	struct port_info *pi;
+	int atid = -1, mtu_idx, rscale, qid_atid, rc = ENOMEM;
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (nam->sa_family != AF_INET)
+		CXGBE_UNIMPLEMENTED("IPv6 connect");
+
+	if (rt_ifp->if_type == IFT_ETHER)
+		pi = rt_ifp->if_softc;
+	else if (rt_ifp->if_type == IFT_L2VLAN) {
+		struct ifnet *ifp = VLAN_COOKIE(rt_ifp);
+
+		pi = ifp->if_softc;
+	} else if (rt_ifp->if_type == IFT_IEEE8023ADLAG)
+		return (ENOSYS);	/* XXX: implement lagg support */
+	else
+		return (ENOTSUP);
+
+	toep = alloc_toepcb(pi, -1, -1, M_NOWAIT);
+	if (toep == NULL)
+		goto failed;
+
+	atid = alloc_atid(sc, toep);
+	if (atid < 0)
+		goto failed;
+
+	e = t4_l2t_get(pi, rt_ifp,
+	    rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam);
+	if (e == NULL)
+		goto failed;
+
+	wr = alloc_wrqe(sizeof(*cpl), toep->ctrlq);
+	if (wr == NULL)
+		goto failed;
+	cpl = wrtod(wr);
+
+	toep->tid = atid;
+	toep->l2te = e;
+	toep->ulp_mode = ULP_MODE_NONE;
+	SOCKBUF_LOCK(&so->so_rcv);
+	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
+	toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+	SOCKBUF_UNLOCK(&so->so_rcv);
+
+	offload_socket(so, toep);
+
+	/*
+	 * The kernel sets request_r_scale based on sb_max whereas we need to
+	 * take hardware's MAX_RCV_WND into account too.  This is normally a
+	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
+	 */
+	if (tp->t_flags & TF_REQ_SCALE)
+		rscale = tp->request_r_scale = select_rcv_wscale();
+	else
+		rscale = 0;
+	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
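+	/*
+	 * qid_atid packs the absolute id of the offload rx queue (where the
+	 * hardware will deliver the reply CPLs) above the 14-bit atid.
+	 */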
+	qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | atid;
+
+	INIT_TP_WR(cpl, 0);
+	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid));
+	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
+	    &cpl->peer_port);
+	cpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, toep->rx_credits,
+	    toep->ulp_mode);
+	cpl->params = select_ntuple(pi, e, sc->filter_mode);
+	cpl->opt2 = calc_opt2a(so);
+
+	CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__,
+	    toep->tid, tcpstates[tp->t_state], toep, inp);
+
+	rc = t4_l2t_send(sc, wr, e);
+	if (rc == 0) {
+		toepcb_set_flag(toep, TPF_CPL_PENDING);
+		return (0);
+	}
+
+	undo_offload_socket(so);
+failed:
+	CTR5(KTR_CXGBE, "%s: FAILED, atid %d, toep %p, l2te %p, wr %p",
+	    __func__, atid, toep, e, wr);
+
+	if (e)
+		t4_l2t_release(e);
+	if (wr)
+		free_wrqe(wr);
+	if (atid >= 0)
+		free_atid(sc, atid);
+	if (toep)
+		free_toepcb(toep);
+
+	return (rc);
+}
+#endif
diff -r 7cec8c20120e sys/dev/cxgbe/tom/t4_cpl_io.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,1259 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sglist.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+VNET_DECLARE(int, tcp_do_autosndbuf);
+#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
+VNET_DECLARE(int, tcp_autosndbuf_inc);
+#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
+VNET_DECLARE(int, tcp_autosndbuf_max);
+#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
+
+void
+send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
+{
+	struct wrqe *wr;
+	struct fw_flowc_wr *flowc;
+	unsigned int nparams = ftxp ? 8 : 4, flowclen;
+	struct port_info *pi = toep->port;
+	struct adapter *sc = pi->adapter;
+	unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
+	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
+
+	KASSERT(!toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
+
+	CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
+
+	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
+
+	wr = alloc_wrqe(roundup(flowclen, 16), toep->ofld_txq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	flowc = wrtod(wr);
+	memset(flowc, 0, wr->wr_len);
+
+	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+	    V_FW_FLOWC_WR_NPARAMS(nparams));
+	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
+	    V_FW_WR_FLOWID(toep->tid));
+
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+	flowc->mnemval[0].val = htobe32(pfvf);
+	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+	flowc->mnemval[1].val = htobe32(pi->tx_chan);
+	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+	flowc->mnemval[2].val = htobe32(pi->tx_chan);
+	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+	flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id);
+	if (ftxp) {
+		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
+
+		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+		flowc->mnemval[4].val = htobe32(ftxp->snd_nxt);
+		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+		flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt);
+		flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+		flowc->mnemval[6].val = htobe32(sndbuf);
+		flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+		flowc->mnemval[7].val = htobe32(ftxp->mss);
+	}
+
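+	/* Like any other WR, the flowc consumes tx credits (16B each). */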
+	txsd->tx_credits = howmany(flowclen, 16);
+	txsd->plen = 0;
+	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
+	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
+	toep->tx_credits -= txsd->tx_credits;
+	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
+		toep->txsd_pidx = 0;
+	toep->txsd_avail--;
+
+	toepcb_set_flag(toep, TPF_FLOWC_WR_SENT);
+	t4_wrq_tx(sc, wr);
+}
+
+void
+send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
+{
+	struct wrqe *wr;
+	struct cpl_abort_req *req;
+	int tid = toep->tid;
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
+
+	INP_WLOCK_ASSERT(inp);
+
+	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
+	    __func__, toep->tid,
+	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
+	    tcpstates[tp->t_state],
+	    toep->flags, inp->inp_flags,
+	    toepcb_flag(toep, TPF_ABORT_SHUTDOWN) ?
+	    " (abort already in progress)" : "");
+
+	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+		return;	/* abort already in progress */
+
+	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);
+
+	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
+
+	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	req = wrtod(wr);
+
+	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
+	if (inp->inp_flags & INP_DROPPED)
+		req->rsvd0 = htobe32(snd_nxt);
+	else
+		req->rsvd0 = htobe32(tp->snd_nxt);
+	req->rsvd1 = !toepcb_flag(toep, TPF_TX_DATA_SENT);
+	req->cmd = CPL_ABORT_SEND_RST;
+
+	/*
+	 * XXX: What's the correct way to tell that the inp hasn't been detached
+	 * from its socket?  Should I even be flushing the snd buffer here?
+	 */
+	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
+		struct socket *so = inp->inp_socket;
+
+		if (so != NULL)	/* because I'm not sure.  See comment above */
+			sbflush(&so->so_snd);
+	}
+
+	t4_l2t_send(sc, wr, toep->l2te);
+}
+
+/*
+ * Called when a connection is established to translate the TCP options
+ * reported by HW to FreeBSD's native format.
+ */
+static void
+assign_rxopt(struct tcpcb *tp, unsigned int opt)
+{
+	struct toepcb *toep = tp->t_toe;
+	struct adapter *sc = td_adapter(toep->td);
+
+	INP_LOCK_ASSERT(tp->t_inpcb);
+
+	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - 40;
+
+	if (G_TCPOPT_TSTAMP(opt)) {
+		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
+		tp->ts_recent = 0;		/* hmmm */
+		tp->ts_recent_age = tcp_ts_getticks();
+		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
+	}
+
+	if (G_TCPOPT_SACK(opt))
+		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
+	else
+		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
+
+	if (G_TCPOPT_WSCALE_OK(opt))
+		tp->t_flags |= TF_RCVD_SCALE;
+
+	/* Doing window scaling? */
+	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+		tp->rcv_scale = tp->request_r_scale;
+		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
+	}
+}
+
+/*
+ * Completes some final bits of initialization for just established connections
+ * and changes their state to TCPS_ESTABLISHED.
+ *
+ * The ISNs are from after the exchange of SYNs.  i.e., the true ISN + 1.
+ */
+void
+make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
+    uint16_t opt)
+{
+	struct inpcb *inp = toep->inp;
+	struct socket *so = inp->inp_socket;
+	struct tcpcb *tp = intotcpcb(inp);
+	long bufsize;
+	uint32_t iss = be32toh(snd_isn) - 1;	/* true ISS */
+	uint32_t irs = be32toh(rcv_isn) - 1;	/* true IRS */
+	uint16_t tcpopt = be16toh(opt);
+	struct flowc_tx_params ftxp;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(tp->t_state == TCPS_SYN_SENT ||
+	    tp->t_state == TCPS_SYN_RECEIVED,
+	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
+
+	CTR4(KTR_CXGBE, "%s: tid %d, toep %p, inp %p",
+	    __func__, toep->tid, toep, inp);
+
+	tp->t_state = TCPS_ESTABLISHED;
+	tp->t_starttime = ticks;
+	TCPSTAT_INC(tcps_connects);
+
+	tp->irs = irs;
+	tcp_rcvseqinit(tp);
+	tp->rcv_wnd = toep->rx_credits << 10;
+	tp->rcv_adv += tp->rcv_wnd;
+	tp->last_ack_sent = tp->rcv_nxt;
+
+	/*
+	 * If we were unable to send all rx credits via opt0, save the remainder
+	 * in rx_credits so that they can be handed over with the next credit
+	 * update.
+	 */
+	SOCKBUF_LOCK(&so->so_rcv);
+	bufsize = select_rcv_wnd(so);
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	toep->rx_credits = bufsize - tp->rcv_wnd;
+
+	tp->iss = iss;
+	tcp_sendseqinit(tp);
+	tp->snd_una = iss + 1;
+	tp->snd_nxt = iss + 1;
+	tp->snd_max = iss + 1;
+
+	assign_rxopt(tp, tcpopt);
+
+	SOCKBUF_LOCK(&so->so_snd);
+	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
+		bufsize = V_tcp_autosndbuf_max;
+	else
+		bufsize = sbspace(&so->so_snd);
+	SOCKBUF_UNLOCK(&so->so_snd);
+
+	ftxp.snd_nxt = tp->snd_nxt;
+	ftxp.rcv_nxt = tp->rcv_nxt;
+	ftxp.snd_space = bufsize;
+	ftxp.mss = tp->t_maxseg;
+	send_flowc_wr(toep, &ftxp);
+
+	soisconnected(so);
+}
+
+static int
+send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits)
+{
+	struct wrqe *wr;
+	struct cpl_rx_data_ack *req;
+	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
+
+	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
+	if (wr == NULL)
+		return (0);
+	req = wrtod(wr);
+
+	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
+	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
+
+	t4_wrq_tx(sc, wr);
+	return (credits);
+}
+
+void
+t4_rcvd(struct toedev *tod, struct tcpcb *tp)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	struct sockbuf *so_rcv = &so->so_rcv;
+	struct toepcb *toep = tp->t_toe;
+	int must_send;
+
+	INP_WLOCK_ASSERT(inp);
+
+	SOCKBUF_LOCK(so_rcv);
+	KASSERT(toep->enqueued >= so_rcv->sb_cc,
+	    ("%s: so_rcv->sb_cc > enqueued", __func__));
+	toep->rx_credits += toep->enqueued - so_rcv->sb_cc;
+	toep->enqueued = so_rcv->sb_cc;
+	SOCKBUF_UNLOCK(so_rcv);
+
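+	/*
+	 * Return rx credits once at least 15KB have accumulated, or sooner
+	 * if the credits still held back are within 16KB of the entire
+	 * receive window.
+	 */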
+	must_send = toep->rx_credits + 16384 >= tp->rcv_wnd;
+	if (must_send || toep->rx_credits >= 15 * 1024) {
+		int credits;
+
+		credits = send_rx_credits(sc, toep, toep->rx_credits);
+		toep->rx_credits -= credits;
+		tp->rcv_wnd += credits;
+		tp->rcv_adv += credits;
+	}
+}
+
+/*
+ * Close a connection by sending a CPL_CLOSE_CON_REQ message.
+ */
+static int
+close_conn(struct adapter *sc, struct toepcb *toep)
+{
+	struct wrqe *wr;
+	struct cpl_close_con_req *req;
+	unsigned int tid = toep->tid;
+
+	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
+	    toepcb_flag(toep, TPF_FIN_SENT) ? ", IGNORED" : "");
+
+	if (toepcb_flag(toep, TPF_FIN_SENT))
+		return (0);
+
+	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
+
+	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	req = wrtod(wr);
+
+	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
+	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
+	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
+	    V_FW_WR_FLOWID(tid));
+	req->wr.wr_lo = cpu_to_be64(0);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+	req->rsvd = 0;
+
+	toepcb_set_flag(toep, TPF_FIN_SENT);
+	toepcb_clr_flag(toep, TPF_SEND_FIN);
+	t4_l2t_send(sc, wr, toep->l2te);
+
+	return (0);
+}
+
+#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
+#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
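+/* Each tx credit buys 16 bytes of work request space. */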
+
+/* Maximum amount of immediate data we could stuff in a WR */
+static inline int
+max_imm_payload(int tx_credits)
+{
+	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
+
+	KASSERT(tx_credits >= 0 &&
+		tx_credits <= MAX_OFLD_TX_CREDITS,
+		("%s: %d credits", __func__, tx_credits));
+
+	if (tx_credits < MIN_OFLD_TX_CREDITS)
+		return (0);
+
+	if (tx_credits >= (n * EQ_ESIZE) / 16)
+		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
+	else
+		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
+}
+
+/* Maximum number of SGL entries we could stuff in a WR */
+static inline int
+max_dsgl_nsegs(int tx_credits)
+{
+	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
+	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
+
+	KASSERT(tx_credits >= 0 &&
+		tx_credits <= MAX_OFLD_TX_CREDITS,
+		("%s: %d credits", __func__, tx_credits));
+
+	if (tx_credits < MIN_OFLD_TX_CREDITS)
+		return (0);
+
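+	/*
+	 * Each 24B ulptx_sge_pair holds two segments; a 16B remainder at the
+	 * end of the available space can hold one more.
+	 */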
+	nseg += 2 * (sge_pair_credits * 16 / 24);
+	if ((sge_pair_credits * 16) % 24 == 16)
+		nseg++;
+
+	return (nseg);
+}
+
+static inline void
+write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
+    unsigned int plen, uint8_t credits, int more_to_come)
+{
+	struct fw_ofld_tx_data_wr *txwr = dst;
+	int shove = !more_to_come;
+	int compl = 1;
+
+	/*
+	 * We always request completion notifications from the firmware.  The
+	 * only exception is when we know we'll get more data to send shortly
+	 * and that we'll have some tx credits remaining to transmit that data.
+	 */
+	if (more_to_come && toep->tx_credits - credits >= MIN_OFLD_TX_CREDITS)
+		compl = 0;
+
+	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
+	    V_FW_WR_COMPL(compl) | V_FW_WR_IMMDLEN(immdlen));
+	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
+	    V_FW_WR_LEN16(credits));
+	txwr->tunnel_to_proxy =
+	    htobe32(V_FW_OFLD_TX_DATA_WR_ULPMODE(toep->ulp_mode) |
+		V_FW_OFLD_TX_DATA_WR_URGENT(0) |	/* XXX */
+		V_FW_OFLD_TX_DATA_WR_SHOVE(shove));
+	txwr->plen = htobe32(plen);
+}
+
+/*
+ * Generate a DSGL from a starting mbuf.  The total number of segments and the
+ * maximum segments in any one mbuf are provided.
+ */
+static void
+write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
+{
+	struct mbuf *m;
+	struct ulptx_sgl *usgl = dst;
+	int i, j, rc;
+	struct sglist sg;
+	struct sglist_seg segs[n];
+
+	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
+
+	sglist_init(&sg, n, segs);
+	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
+	    V_ULPTX_NSGE(nsegs));
+
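+	/*
+	 * i counts segments after the first (which lives in len0/addr0):
+	 * even/odd values of i select the len[0]/len[1] and addr[0]/addr[1]
+	 * halves of each ulptx_sge_pair.
+	 */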
+	i = -1;
+	for (m = start; m != stop; m = m->m_next) {
+		rc = sglist_append(&sg, mtod(m, void *), m->m_len);
+		if (__predict_false(rc != 0))
+			panic("%s: sglist_append %d", __func__, rc);
+
+		for (j = 0; j < sg.sg_nseg; i++, j++) {
+			if (i < 0) {
+				usgl->len0 = htobe32(segs[j].ss_len);
+				usgl->addr0 = htobe64(segs[j].ss_paddr);
+			} else {
+				usgl->sge[i / 2].len[i & 1] =
+				    htobe32(segs[j].ss_len);
+				usgl->sge[i / 2].addr[i & 1] =
+				    htobe64(segs[j].ss_paddr);
+			}
+#ifdef INVARIANTS
+			nsegs--;
+#endif
+		}
+		sglist_reset(&sg);
+	}
+	if (i & 1)
+		usgl->sge[i / 2].len[1] = htobe32(0);
+	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
+	    __func__, nsegs, start, stop));
+}
+
+/*
+ * Max number of SGL entries an offload tx work request can have.  This is 41
+ * (1 + 40) for a full 512B work request.
+ * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
+ */
+#define OFLD_SGL_LEN (41)
+
+/*
+ * Send data and/or a FIN to the peer.
+ *
+ * The socket's so_snd buffer consists of a stream of data starting with sb_mb
+ * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
+ * was transmitted.
+ */
+static void
+t4_push_frames(struct adapter *sc, struct toepcb *toep)
+{
+	struct mbuf *sndptr, *m, *sb_sndptr;
+	struct fw_ofld_tx_data_wr *txwr;
+	struct wrqe *wr;
+	unsigned int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp = intotcpcb(inp);
+	struct socket *so = inp->inp_socket;
+	struct sockbuf *sb = &so->so_snd;
+	int tx_credits;
+	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
+
+	if (toep->ulp_mode != ULP_MODE_NONE)
+		CXGBE_UNIMPLEMENTED("ulp_mode");
+
+	/*
+	 * This function doesn't resume by itself.  Someone else must clear the
+	 * flag and call this function.
+	 */
+	if (__predict_false(toepcb_flag(toep, TPF_TX_SUSPENDED)))
+		return;
+
+	do {
+		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
+		max_imm = max_imm_payload(tx_credits);
+		max_nsegs = max_dsgl_nsegs(tx_credits);
+
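+		/*
+		 * Gather mbufs from the unsent part of so_snd.  The scan
+		 * stops once the accumulated data is too large for immediate
+		 * payload and would also exceed the DSGL segment limit.
+		 */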
+		SOCKBUF_LOCK(sb);
+		sb_sndptr = sb->sb_sndptr;
+		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
+		plen = 0;
+		nsegs = 0;
+		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
+		for (m = sndptr; m != NULL; m = m->m_next) {
+			int n = sglist_count(mtod(m, void *), m->m_len);
+
+			nsegs += n;
+			plen += m->m_len;
+
+			/* This mbuf sent us _over_ the nsegs limit, back out */
+			if (plen > max_imm && nsegs > max_nsegs) {
+				nsegs -= n;
+				plen -= m->m_len;
+				if (plen == 0) {
+					/* Too few credits */
+					toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+					SOCKBUF_UNLOCK(sb);
+					return;
+				}
+				break;
+			}
+
+			if (max_nsegs_1mbuf < n)
+				max_nsegs_1mbuf = n;
+			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
+
+			/* This mbuf put us right at the max_nsegs limit */
+			if (plen > max_imm && nsegs == max_nsegs) {
+				m = m->m_next;
+				break;
+			}
+		}
+		SOCKBUF_UNLOCK(sb);
+
+		/* nothing to send */
+		if (plen == 0) {
+			KASSERT(m == NULL,
+			    ("%s: nothing to send, but m != NULL", __func__));
+			break;
+		}
+
+		if (__predict_false(toepcb_flag(toep, TPF_FIN_SENT)))
+			panic("%s: excess tx.", __func__);
+
+		if (plen <= max_imm) {
+
+			/* Immediate data tx */
+
+			wr = alloc_wrqe(roundup(sizeof(*txwr) + plen, 16),
+					toep->ofld_txq);
+			if (wr == NULL) {
+				/* XXX: how will we recover from this? */
+				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+				return;
+			}
+			txwr = wrtod(wr);
+			credits = howmany(wr->wr_len, 16);
+			write_tx_wr(txwr, toep, plen, plen, credits,
+			    tp->t_flags & TF_MORETOCOME);
+			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
+		} else {
+			int wr_len;
+
+			/* DSGL tx */
+
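+			/*
+			 * Past the first segment (held in the ulptx_sgl
+			 * itself), each pair of segments takes a 24B
+			 * ulptx_sge_pair and an unpaired final segment
+			 * takes 16B.
+			 */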
+			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
+			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
+			wr = alloc_wrqe(roundup(wr_len, 16), toep->ofld_txq);
+			if (wr == NULL) {
+				/* XXX: how will we recover from this? */
+				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+				return;
+			}
+			txwr = wrtod(wr);
+			credits = howmany(wr_len, 16);
+			write_tx_wr(txwr, toep, 0, plen, credits,
+			    tp->t_flags & TF_MORETOCOME);
+			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
+			    max_nsegs_1mbuf);
+			if (wr_len & 0xf) {
+				uint64_t *pad = (uint64_t *)
+				    ((uintptr_t)txwr + wr_len);
+				*pad = 0;
+			}
+		}
+
+		KASSERT(toep->tx_credits >= credits,
+			("%s: not enough credits", __func__));
+
+		toep->tx_credits -= credits;
+
+		tp->snd_nxt += plen;
+		tp->snd_max += plen;
+
+		SOCKBUF_LOCK(sb);
+		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
+		sb->sb_sndptr = sb_sndptr;
+		SOCKBUF_UNLOCK(sb);
+
+		toepcb_set_flag(toep, TPF_TX_DATA_SENT);
+
+		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
+		txsd->plen = plen;
+		txsd->tx_credits = credits;
+		txsd++;
+		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
+			toep->txsd_pidx = 0;
+			txsd = &toep->txsd[0];
+		}
+		toep->txsd_avail--;
+
+		t4_l2t_send(sc, wr, toep->l2te);
+	} while (m != NULL);
+
+	/* Send a FIN if requested, but only if there's no more data to send */
+	if (m == NULL && toepcb_flag(toep, TPF_SEND_FIN))
+		close_conn(sc, toep);
+}
+
+int
+t4_tod_output(struct toedev *tod, struct tcpcb *tp)
+{
+	struct adapter *sc = tod->tod_softc;
+#ifdef INVARIANTS
+	struct inpcb *inp = tp->t_inpcb;
+#endif
+	struct toepcb *toep = tp->t_toe;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
+	    ("%s: inp %p dropped.", __func__, inp));
+	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+
+	t4_push_frames(sc, toep);
+
+	return (0);
+}
+
+int
+t4_send_fin(struct toedev *tod, struct tcpcb *tp)
+{
+	struct adapter *sc = tod->tod_softc;
+#ifdef INVARIANTS
+	struct inpcb *inp = tp->t_inpcb;
+#endif
+	struct toepcb *toep = tp->t_toe;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
+	    ("%s: inp %p dropped.", __func__, inp));
+	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+
+	toepcb_set_flag(toep, TPF_SEND_FIN);
+	t4_push_frames(sc, toep);
+
+	return (0);
+}
+
+int
+t4_send_rst(struct toedev *tod, struct tcpcb *tp)
+{
+	struct adapter *sc = tod->tod_softc;
+#if defined(INVARIANTS)
+	struct inpcb *inp = tp->t_inpcb;
+#endif
+	struct toepcb *toep = tp->t_toe;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
+	    ("%s: inp %p dropped.", __func__, inp));
+	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+
+	/* hmmmm */
+	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	    ("%s: flowc for tid %u [%s] was never sent",
+	    __func__, toep->tid, tcpstates[tp->t_state]));
+
+	send_reset(sc, toep, 0);
+	return (0);
+}
+
+/*
+ * Peer has sent us a FIN.
+ */
+static int
+do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	struct toepcb *toep = lookup_tid(sc, tid);
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp = NULL;
+	struct socket *so = NULL;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_PEER_CLOSE,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	INP_INFO_WLOCK(&V_tcbinfo);
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+
+	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
+	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
+
+	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+		goto done;
+
+	so = inp->inp_socket;
+
+	socantrcvmore(so);
+	tp->rcv_nxt++;	/* FIN */
+	KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
+	    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
+	    be32toh(cpl->rcv_nxt)));
+
+	switch (tp->t_state) {
+	case TCPS_SYN_RECEIVED:
+		tp->t_starttime = ticks;
+		/* FALLTHROUGH */
+
+	case TCPS_ESTABLISHED:
+		tp->t_state = TCPS_CLOSE_WAIT;
+		break;
+
+	case TCPS_FIN_WAIT_1:
+		tp->t_state = TCPS_CLOSING;
+		break;
+
+	case TCPS_FIN_WAIT_2:
+		tcp_twstart(tp);
+		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+
+		INP_WLOCK(inp);
+		final_cpl_received(toep);
+		return (0);
+
+	default:
+		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
+		    __func__, tid, tp->t_state);
+	}
+done:
+	INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	return (0);
+}
+
+/*
+ * Peer has ACK'd our FIN.
+ */
+static int
+do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	struct toepcb *toep = lookup_tid(sc, tid);
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp = NULL;
+	struct socket *so = NULL;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_CLOSE_CON_RPL,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	INP_INFO_WLOCK(&V_tcbinfo);
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+
+	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
+	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
+
+	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+		goto done;
+
+	so = inp->inp_socket;
+	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
+
+	switch (tp->t_state) {
+	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
+		tcp_twstart(tp);
+release:
+		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+
+		INP_WLOCK(inp);
+		final_cpl_received(toep);	/* no more CPLs expected */
+
+		return (0);
+	case TCPS_LAST_ACK:
+		if (tcp_close(tp))
+			INP_WUNLOCK(inp);
+		goto release;
+
+	case TCPS_FIN_WAIT_1:
+		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+			soisdisconnected(so);
+		tp->t_state = TCPS_FIN_WAIT_2;
+		break;
+
+	default:
+		log(LOG_ERR,
+		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
+		    __func__, tid, tcpstates[tp->t_state]);
+	}
+done:
+	INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	return (0);
+}
+
+void
+send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
+    int rst_status)
+{
+	struct wrqe *wr;
+	struct cpl_abort_rpl *cpl;
+
+	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	cpl = wrtod(wr);
+
+	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
+	cpl->cmd = rst_status;
+
+	t4_wrq_tx(sc, wr);
+}
+
+static int
+abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
+{
+	switch (abort_reason) {
+	case CPL_ERR_BAD_SYN:
+	case CPL_ERR_CONN_RESET:
+		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
+	case CPL_ERR_XMIT_TIMEDOUT:
+	case CPL_ERR_PERSIST_TIMEDOUT:
+	case CPL_ERR_FINWAIT2_TIMEDOUT:
+	case CPL_ERR_KEEPALIVE_TIMEDOUT:
+		return (ETIMEDOUT);
+	default:
+		return (EIO);
+	}
+}
+
+/*
+ * TCP RST from the peer, timeout, or some other such critical error.
+ */
+static int
+do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	struct toepcb *toep = lookup_tid(sc, tid);
+	struct sge_wrq *ofld_txq = toep->ofld_txq;
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	struct socket *so;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_ABORT_REQ_RSS,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+
+	if (toepcb_flag(toep, TPF_SYNQE))
+		return (do_abort_req_synqe(iq, rss, m));
+
+	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	if (cpl->status == CPL_ERR_RTX_NEG_ADVICE ||
+	    cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) {
+		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
+		    __func__, cpl->status, tid, toep->flags);
+		return (0);	/* Ignore negative advice */
+	}
+
+	inp = toep->inp;
+	INP_INFO_WLOCK(&V_tcbinfo);	/* for tcp_close */
+	INP_WLOCK(inp);
+
+	tp = intotcpcb(inp);
+	so = inp->inp_socket;
+
+	CTR6(KTR_CXGBE,
+	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
+	    __func__, tid, tcpstates[tp->t_state], toep->flags, inp->inp_flags,
+	    cpl->status);
+
+	/*
+	 * If we'd initiated an abort earlier, the reply to it is responsible
+	 * for cleaning up resources.  Otherwise we tear everything down right
+	 * here, right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
+	 */
+	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) {
+		INP_WUNLOCK(inp);
+		goto done;
+	}
+	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);
+
+	so_error_set(so, abort_status_to_errno(tp, cpl->status));
+	tp = tcp_close(tp);
+	if (tp == NULL)
+		INP_WLOCK(inp);	/* re-acquire */
+
+	final_cpl_received(toep);
+done:
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
+	return (0);
+}
+
+/*
+ * Reply to the CPL_ABORT_REQ (send_reset)
+ */
+static int
+do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	struct toepcb *toep = lookup_tid(sc, tid);
+	struct inpcb *inp = toep->inp;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_ABORT_RPL_RSS,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+
+	if (toepcb_flag(toep, TPF_SYNQE))
+		return (do_abort_rpl_synqe(iq, rss, m));
+
+	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
+	    __func__, tid, toep, inp, cpl->status);
+
+	KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+	    ("%s: wasn't expecting abort reply", __func__));
+
+	INP_WLOCK(inp);
+	final_cpl_received(toep);
+
+	return (0);
+}
+
+static int
+do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_rx_data *cpl = mtod(m, const void *);
+	unsigned int tid = GET_TID(cpl);
+	struct toepcb *toep = lookup_tid(sc, tid);
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp;
+	struct socket *so;
+	struct sockbuf *so_rcv;
+
+	if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) {
+		/*
+		 * do_pass_establish failed and must be attempting to abort the
+		 * synqe's tid.  Meanwhile, the T4 has sent us data for such a
+		 * connection.
+		 */
+		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+		    ("%s: synqe but tid isn't being aborted.", __func__));
+		m_freem(m);
+		return (0);
+	}
+
+	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	/* strip off CPL header */
+	m_adj(m, sizeof(*cpl));
+
+	INP_WLOCK(inp);
+	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
+		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
+		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
+		INP_WUNLOCK(inp);
+		m_freem(m);
+		return (0);
+	}
+
+	tp = intotcpcb(inp);
+
+#ifdef INVARIANTS
+	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) {
+		log(LOG_ERR,
+		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
+		    __func__, be32toh(cpl->seq), toep->tid, tp->rcv_nxt);
+	}
+#endif
+
+	tp->rcv_nxt += m->m_pkthdr.len;
+	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
+	    ("%s: negative window size", __func__));
+	tp->rcv_wnd -= m->m_pkthdr.len;
+	tp->t_rcvtime = ticks;
+
+	so = inp_inpcbtosocket(inp);
+	so_rcv = &so->so_rcv;
+	SOCKBUF_LOCK(so_rcv);
+
+	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
+		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
+		    __func__, tid, m->m_pkthdr.len);
+		m_freem(m);
+		SOCKBUF_UNLOCK(so_rcv);
+		INP_WUNLOCK(inp);
+
+		INP_INFO_WLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		tp = tcp_drop(tp, ECONNRESET);
+		if (tp)
+			INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+
+		return (0);
+	}
+
+	/* receive buffer autosize */
+	if (so_rcv->sb_flags & SB_AUTOSIZE &&
+	    V_tcp_do_autorcvbuf &&
+	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
+	    m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) {
+		unsigned int hiwat = so_rcv->sb_hiwat;
+		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
+		    V_tcp_autorcvbuf_max);
+
+		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
+			so_rcv->sb_flags &= ~SB_AUTOSIZE;
+		else
+			toep->rx_credits += newsize - hiwat;
+	}
+	toep->enqueued += m->m_pkthdr.len;
+	sbappendstream_locked(so_rcv, m);
+	sorwakeup_locked(so);
+	SOCKBUF_UNLOCK_ASSERT(so_rcv);
+
+	INP_WUNLOCK(inp);
+	return (0);
+}
+
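+/*
+ * Field accessors for CPL_FW4_ACK, in the driver's usual S_/M_/V_/G_ style:
+ * S_ is the bit shift, M_ the mask, V_ builds a field, G_ extracts one.
+ */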
+#define S_CPL_FW4_ACK_OPCODE    24
+#define M_CPL_FW4_ACK_OPCODE    0xff
+#define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE)
+#define G_CPL_FW4_ACK_OPCODE(x) \
+    (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE)
+
+#define S_CPL_FW4_ACK_FLOWID    0
+#define M_CPL_FW4_ACK_FLOWID    0xffffff
+#define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID)
+#define G_CPL_FW4_ACK_FLOWID(x) \
+    (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID)
+
+#define S_CPL_FW4_ACK_CR        24
+#define M_CPL_FW4_ACK_CR        0xff
+#define V_CPL_FW4_ACK_CR(x)     ((x) << S_CPL_FW4_ACK_CR)
+#define G_CPL_FW4_ACK_CR(x)     (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR)
+
+#define S_CPL_FW4_ACK_SEQVAL    0
+#define M_CPL_FW4_ACK_SEQVAL    0x1
+#define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL)
+#define G_CPL_FW4_ACK_SEQVAL(x) \
+    (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL)
+#define F_CPL_FW4_ACK_SEQVAL    V_CPL_FW4_ACK_SEQVAL(1U)
+
+static int
+do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
+	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
+	struct toepcb *toep = lookup_tid(sc, tid);
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	struct socket *so;
+	uint8_t credits = cpl->credits;
+	struct ofld_tx_sdesc *txsd;
+	int plen;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	/*
+	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
+	 * now this comes back carrying the credits for the flowc.
+	 */
+	if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) {
+		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+		    ("%s: credits for a synq entry %p", __func__, toep));
+		return (0);
+	}
+
+	inp = toep->inp;
+
+	KASSERT(opcode == CPL_FW4_ACK,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	INP_WLOCK(inp);
+
+	if (__predict_false(toepcb_flag(toep, TPF_ABORT_SHUTDOWN))) {
+		INP_WUNLOCK(inp);
+		return (0);
+	}
+
+	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
+	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
+
+	tp = intotcpcb(inp);
+
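+	/* Update snd_una if the firmware reported a valid sequence number. */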
+	if (cpl->seq_vld) {
+		tcp_seq snd_una = be32toh(cpl->snd_una);
+
+#ifdef INVARIANTS
+		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
+			log(LOG_ERR,
+			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
+			    __func__, snd_una, toep->tid, tp->snd_una);
+		}
+#endif
+
+		if (tp->snd_una != snd_una) {
+			tp->snd_una = snd_una;
+			tp->ts_recent_age = tcp_ts_getticks();
+		}
+	}
+
+	so = inp->inp_socket;
+	txsd = &toep->txsd[toep->txsd_cidx];
+	plen = 0;
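+	/*
+	 * Reclaim tx credits from acked work requests and total up the
+	 * payload bytes (plen) that can now be dropped from the send buffer.
+	 */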
+	while (credits) {
+		KASSERT(credits >= txsd->tx_credits,
+		    ("%s: too many (or partial) credits", __func__));
+		credits -= txsd->tx_credits;
+		toep->tx_credits += txsd->tx_credits;
+		plen += txsd->plen;
+		txsd++;
+		toep->txsd_avail++;
+		KASSERT(toep->txsd_avail <= toep->txsd_total,
+		    ("%s: txsd avail > total", __func__));
+		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
+			txsd = &toep->txsd[0];
+			toep->txsd_cidx = 0;
+		}
+	}
+
+	if (plen > 0) {
+		struct sockbuf *sb = &so->so_snd;
+
+		SOCKBUF_LOCK(sb);
+		sbdrop_locked(sb, plen);
+		sowwakeup_locked(so);
+		SOCKBUF_UNLOCK_ASSERT(sb);
+	}
+
+	/* XXX */
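+	/*
+	 * Restart tx if it was suspended and enough credits are back, or if
+	 * every credit has been returned (no outstanding work requests).
+	 */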
+	if ((toepcb_flag(toep, TPF_TX_SUSPENDED) &&
+	    toep->tx_credits >= MIN_OFLD_TX_CREDITS) ||
+	    toep->tx_credits == toep->txsd_total *
+	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16)) {
+		toepcb_clr_flag(toep, TPF_TX_SUSPENDED);
+		t4_push_frames(sc, toep);
+	}
+	INP_WUNLOCK(inp);
+
+	return (0);
+}
+
+void
+t4_init_cpl_io_handlers(struct adapter *sc)
+{
+
+	t4_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
+	t4_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
+	t4_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
+	t4_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
+	t4_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
+	t4_register_cpl_handler(sc, CPL_FW4_ACK, do_fw4_ack);
+}
+#endif
diff -r 7cec8c20120e sys/dev/cxgbe/tom/t4_listen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/tom/t4_listen.c	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,1362 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/refcount.h>
+#include <sys/domain.h>
+#include <sys/fnv_hash.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* stid services */
+static int alloc_stid(struct adapter *, void *);
+static void *lookup_stid(struct adapter *, int);
+static void free_stid(struct adapter *, int);
+
+/* lctx services */
+static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
+    struct port_info *);
+static int free_lctx(struct adapter *, struct listen_ctx *);
+static void hold_lctx(struct listen_ctx *);
+static void listen_hash_add(struct adapter *, struct listen_ctx *);
+static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
+static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
+static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
+
+static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *);
+static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
+static void send_reset_synqe(struct toedev *, struct synq_entry *);
+
+/* XXX: won't work for IPv6 */
+static int
+alloc_stid(struct adapter *sc, void *ctx)
+{
+	struct tid_info *t = &sc->tids;
+	int stid = -1;
+
+	mtx_lock(&t->stid_lock);
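+	/* Pop the free list head; stid = entry index + stid_base. */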
+	if (t->sfree) {
+		union serv_entry *p = t->sfree;
+
+		stid = p - t->stid_tab;
+		stid += t->stid_base;
+		t->sfree = p->next;
+		p->data = ctx;
+		t->stids_in_use++;
+	}
+	mtx_unlock(&t->stid_lock);
+	return (stid);
+}
+
+static void *
+lookup_stid(struct adapter *sc, int stid)
+{
+	struct tid_info *t = &sc->tids;
+
+	return (t->stid_tab[stid - t->stid_base].data);
+}
+
+static void
+free_stid(struct adapter *sc, int stid)
+{
+	struct tid_info *t = &sc->tids;
+	union serv_entry *p = &t->stid_tab[stid - t->stid_base];
+
+	mtx_lock(&t->stid_lock);
+	p->next = t->sfree;
+	t->sfree = p;
+	t->stids_in_use--;
+	mtx_unlock(&t->stid_lock);
+}
+
+static struct listen_ctx *
+alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi)
+{
+	struct listen_ctx *lctx;
+
+	INP_WLOCK_ASSERT(inp);
+
+	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
+	if (lctx == NULL)
+		return (NULL);
+
+	lctx->stid = alloc_stid(sc, lctx);
+	if (lctx->stid < 0) {
+		free(lctx, M_CXGBE);
+		return (NULL);
+	}
+
+	lctx->ctrlq = &sc->sge.ctrlq[pi->port_id];
+	lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq];
+	refcount_init(&lctx->refcount, 1);
+	TAILQ_INIT(&lctx->synq);
+
+	lctx->inp = inp;
+	in_pcbref(inp);
+
+	return (lctx);
+}
+
+/* Don't call this directly, use release_lctx instead */
+static int
+free_lctx(struct adapter *sc, struct listen_ctx *lctx)
+{
+	struct inpcb *inp = lctx->inp;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(lctx->refcount == 0,
+	    ("%s: refcount %d", __func__, lctx->refcount));
+	KASSERT(TAILQ_EMPTY(&lctx->synq),
+	    ("%s: synq not empty.", __func__));
+	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
+
+	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
+	    __func__, lctx->stid, lctx, lctx->inp);
+
+	free_stid(sc, lctx->stid);
+	free(lctx, M_CXGBE);
+
+	return (in_pcbrele_wlocked(inp));
+}
+
+static void
+hold_lctx(struct listen_ctx *lctx)
+{
+
+	refcount_acquire(&lctx->refcount);
+}
+
+static inline uint32_t
+listen_hashfn(void *key, u_long mask)
+{
+
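+	/* The inp pointer itself is the key. */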
+	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
+}
+
+/*
+ * Add a listen_ctx entry to the listen hash table.
+ */
+static void
+listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
+{
+	struct tom_data *td = sc->tom_softc;
+	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
+
+	mtx_lock(&td->lctx_hash_lock);
+	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
+	td->lctx_count++;
+	mtx_unlock(&td->lctx_hash_lock);
+}
+
+/*
+ * Look for the listening socket's context entry in the hash and return it.
+ */
+static struct listen_ctx *
+listen_hash_find(struct adapter *sc, struct inpcb *inp)
+{
+	struct tom_data *td = sc->tom_softc;
+	int bucket = listen_hashfn(inp, td->listen_mask);
+	struct listen_ctx *lctx;
+
+	mtx_lock(&td->lctx_hash_lock);
+	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
+		if (lctx->inp == inp)
+			break;
+	}
+	mtx_unlock(&td->lctx_hash_lock);
+
+	return (lctx);
+}
+
+/*
+ * Removes the listen_ctx structure for inp from the hash and returns it.
+ */
+static struct listen_ctx *
+listen_hash_del(struct adapter *sc, struct inpcb *inp)
+{
+	struct tom_data *td = sc->tom_softc;
+	int bucket = listen_hashfn(inp, td->listen_mask);
+	struct listen_ctx *lctx, *l;
+
+	mtx_lock(&td->lctx_hash_lock);
+	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
+		if (lctx->inp == inp) {
+			LIST_REMOVE(lctx, link);
+			td->lctx_count--;
+			break;
+		}
+	}
+	mtx_unlock(&td->lctx_hash_lock);
+
+	return (lctx);
+}
+
+/*
+ * Releases a hold on the lctx.  Must be called with the listening socket's inp
+ * locked.  The inp may be freed by this function, in which case NULL is
+ * returned to indicate that.
+ */
+static struct inpcb *
+release_lctx(struct adapter *sc, struct listen_ctx *lctx)
+{
+	struct inpcb *inp = lctx->inp;
+	int inp_freed = 0;
+
+	INP_WLOCK_ASSERT(inp);
+	if (refcount_release(&lctx->refcount))
+		inp_freed = free_lctx(sc, lctx);
+
+	return (inp_freed ? NULL : inp);
+}
+
+static void
+send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct mbuf *m = synqe->syn;
+	struct ifnet *ifp = m->m_pkthdr.rcvif;
+	struct port_info *pi = ifp->if_softc;
+	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
+	struct wrqe *wr;
+	struct fw_flowc_wr *flowc;
+	struct cpl_abort_req *req;
+	int txqid, rxqid, flowclen;
+	struct sge_wrq *ofld_txq;
+	struct sge_ofld_rxq *ofld_rxq;
+	const int nparams = 4;
+	unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
+
+	INP_WLOCK_ASSERT(synqe->lctx->inp);
+
+	CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s",
+	    __func__, synqe, synqe->tid,
+	    synqe_flag(synqe, TPF_ABORT_SHUTDOWN) ?
+	    " (abort already in progress)" : "");
+	if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN))
+		return;	/* abort already in progress */
+	synqe_set_flag(synqe, TPF_ABORT_SHUTDOWN);
+
+	get_qids_from_mbuf(m, &txqid, &rxqid);
+	ofld_txq = &sc->sge.ofld_txq[txqid];
+	ofld_rxq = &sc->sge.ofld_rxq[rxqid];
+
+	/* The wrqe will have two WRs - a flowc followed by an abort_req */
+	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
+
+	wr = alloc_wrqe(roundup(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	flowc = wrtod(wr);
+	req = (void *)((caddr_t)flowc + roundup(flowclen, EQ_ESIZE));
+
+	/* First the flowc ... */
+	memset(flowc, 0, wr->wr_len);
+	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+	    V_FW_FLOWC_WR_NPARAMS(nparams));
+	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(howmany(flowclen, 16)) |
+	    V_FW_WR_FLOWID(synqe->tid));
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+	flowc->mnemval[0].val = htobe32(pfvf);
+	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+	flowc->mnemval[1].val = htobe32(pi->tx_chan);
+	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+	flowc->mnemval[2].val = htobe32(pi->tx_chan);
+	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
+	synqe_set_flag(synqe, TPF_FLOWC_WR_SENT);
+
+	/* ... then ABORT request */
+	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
+	req->rsvd0 = 0;	/* don't have a snd_nxt */
+	req->rsvd1 = 1;	/* no data sent yet */
+	req->cmd = CPL_ABORT_SEND_RST;
+
+	t4_l2t_send(sc, wr, e);
+}
+
+static int
+create_server(struct adapter *sc, struct listen_ctx *lctx)
+{
+	struct wrqe *wr;
+	struct cpl_pass_open_req *req;
+	struct in_conninfo *inc = &lctx->inp->inp_inc;
+
+	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
+	if (wr == NULL) {
+		log(LOG_ERR, "%s: allocation failure", __func__);
+		return (ENOMEM);
+	}
+	req = wrtod(wr);
+
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
+	req->local_port = inc->inc_lport;
+	req->peer_port = 0;
+	req->local_ip = inc->inc_laddr.s_addr;
+	req->peer_ip = 0;
+	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
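+	/* Have the chip deliver SYNs for this listener to our offload rxq. */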
+	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
+	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
+
+	t4_wrq_tx(sc, wr);
+	return (0);
+}
+
+static int
+destroy_server(struct adapter *sc, struct listen_ctx *lctx)
+{
+	struct wrqe *wr;
+	struct cpl_close_listsvr_req *req;
+
+	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	req = wrtod(wr);
+
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
+	    lctx->stid));
+	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
+	req->rsvd = htobe16(0);
+
+	t4_wrq_tx(sc, wr);
+	return (0);
+}
+
+/*
+ * Start a listening server by sending a passive open request to HW.
+ *
+ * Can't take the adapter lock here, so access to sc->flags,
+ * sc->open_device_map, sc->offload_map, and if_capenable is race prone.
+ */
+int
+t4_listen_start(struct toedev *tod, struct tcpcb *tp)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct port_info *pi;
+	struct inpcb *inp = tp->t_inpcb;
+	struct listen_ctx *lctx;
+	int i;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if ((inp->inp_vflag & INP_IPV4) == 0)
+		return (0);
+
+#if 0
+	ADAPTER_LOCK(sc);
+	if (IS_BUSY(sc)) {
+		log(LOG_ERR, "%s: listen request ignored, %s is busy",
+		    __func__, device_get_nameunit(sc->dev));
+		goto done;
+	}
+
+	KASSERT(sc->flags & TOM_INIT_DONE,
+	    ("%s: TOM not initialized", __func__));
+#endif
+
+	if ((sc->open_device_map & sc->offload_map) == 0)
+		goto done;	/* no port that's UP with IFCAP_TOE enabled */
+
+	/*
+	 * Find a running port with IFCAP_TOE4.  We'll use the first such port's
+	 * queues to send the passive open and receive the reply to it.
+	 *
+	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
+	 * then reject any attempt to bring down such a port (and maybe reject
+	 * attempts to disable IFCAP_TOE on that port too?).
+	 */
+	for_each_port(sc, i) {
+		if (isset(&sc->open_device_map, i) &&
+		    sc->port[i]->ifp->if_capenable & IFCAP_TOE4)
+			break;
+	}
+	KASSERT(i < sc->params.nports,
+	    ("%s: no running port with TOE capability enabled.", __func__));
+	pi = sc->port[i];
+
+	if (listen_hash_find(sc, inp) != NULL)
+		goto done;	/* already setup */
+
+	lctx = alloc_lctx(sc, inp, pi);
+	if (lctx == NULL) {
+		log(LOG_ERR,
+		    "%s: listen request ignored, %s couldn't allocate lctx\n",
+		    __func__, device_get_nameunit(sc->dev));
+		goto done;
+	}
+	listen_hash_add(sc, lctx);
+
+	CTR5(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p", __func__,
+	    lctx->stid, tcpstates[tp->t_state], lctx, inp);
+
+	if (create_server(sc, lctx) != 0) {
+		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
+		    device_get_nameunit(sc->dev));
+		(void) listen_hash_del(sc, inp);
+		inp = release_lctx(sc, lctx);
+		/* can't be freed, host stack has a reference */
+		KASSERT(inp != NULL, ("%s: inp freed", __func__));
+		goto done;
+	}
+	lctx->flags |= LCTX_RPL_PENDING;
+done:
+#if 0
+	ADAPTER_UNLOCK(sc);
+#endif
+	return (0);
+}
+
+int
+t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
+{
+	struct listen_ctx *lctx;
+	struct adapter *sc = tod->tod_softc;
+	struct inpcb *inp = tp->t_inpcb;
+	struct synq_entry *synqe;
+
+	INP_WLOCK_ASSERT(inp);
+
+	lctx = listen_hash_del(sc, inp);
+	if (lctx == NULL)
+		return (ENOENT);	/* no hardware listener for this inp */
+
+	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
+	    lctx, lctx->flags);
+
+	/*
+	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
+	 * arrive and clean up when it does.
+	 */
+	if (lctx->flags & LCTX_RPL_PENDING) {
+		KASSERT(TAILQ_EMPTY(&lctx->synq),
+		    ("%s: synq not empty.", __func__));
+		return (EINPROGRESS);
+	}
+
+	/*
+	 * The host stack will abort all the connections on the listening
+	 * socket's so_comp.  It doesn't know about the connections on the synq,
+	 * so we need to take care of those.
+	 */
+	TAILQ_FOREACH(synqe, &lctx->synq, link)
+		send_reset_synqe(tod, synqe);
+
+	destroy_server(sc, lctx);
+	return (0);
+}
+
+static inline void
+hold_synqe(struct synq_entry *synqe)
+{
+
+	refcount_acquire(&synqe->refcnt);
+}
+
+static inline void
+release_synqe(struct synq_entry *synqe)
+{
+
+	if (refcount_release(&synqe->refcnt)) {
+		int needfree = synqe_flag(synqe, TPF_SYNQE_NEEDFREE);
+
+		m_freem(synqe->syn);
+		if (needfree)
+			free(synqe, M_CXGBE);
+	}
+}
+
+void
+t4_syncache_added(struct toedev *tod __unused, void *arg)
+{
+	struct synq_entry *synqe = arg;
+
+	hold_synqe(synqe);
+}
+
+void
+t4_syncache_removed(struct toedev *tod __unused, void *arg)
+{
+	struct synq_entry *synqe = arg;
+
+	release_synqe(synqe);
+}
+
+/* XXX */
+extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+
+int
+t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct synq_entry *synqe = arg;
+	struct wrqe *wr;
+	struct l2t_entry *e;
+	struct tcpopt to;
+	struct ip *ip = mtod(m, struct ip *);
+	struct tcphdr *th = (void *)(ip + 1);
+
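+	/*
+	 * Only the first respond attempt gets the SYN|ACK wr that was stashed
+	 * in the synqe; any later attempt finds NULL here.
+	 */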
+	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
+	if (wr == NULL)
+		return (EALREADY);
+
+	bzero(&to, sizeof(to));
+	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
+	    TO_SYN);
+
+	/* save these for later */
+	synqe->iss = be32toh(th->th_seq);
+	synqe->ts = to.to_tsval;
+
+	e = &sc->l2t->l2tab[synqe->l2e_idx];
+	t4_l2t_send(sc, wr, e);
+
+	m_freem(m);	/* don't need this any more */
+	return (0);
+}
+
+static int
+do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
+	int stid = GET_TID(cpl);
+	unsigned int status = cpl->status;
+	struct listen_ctx *lctx = lookup_stid(sc, stid);
+	struct inpcb *inp = lctx->inp;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_PASS_OPEN_RPL,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+
+	INP_WLOCK(inp);
+
+	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
+	    __func__, stid, status, lctx->flags);
+
+	lctx->flags &= ~LCTX_RPL_PENDING;
+
+	if (status != CPL_ERR_NONE)
+		log(LOG_ERR, "listener with stid %u failed: %d", stid, status);
+
+#ifdef INVARIANTS
+	/*
+	 * If the inp has been dropped (listening socket closed) then
+	 * listen_stop must have run and taken the inp out of the hash.
+	 */
+	if (inp->inp_flags & INP_DROPPED) {
+		KASSERT(listen_hash_del(sc, inp) == NULL,
+		    ("%s: inp %p still in listen hash", __func__, inp));
+	}
+#endif
+
+	if ((inp->inp_flags & INP_DROPPED) && status != CPL_ERR_NONE) {
+		if (release_lctx(sc, lctx) != NULL)
+			INP_WUNLOCK(inp);
+		return (status);
+	}
+
+	/*
+	 * Listening socket stopped listening earlier and now the chip tells us
+	 * it has started the hardware listener.  Stop it; the lctx will be
+	 * released in do_close_server_rpl.
+	 */
+	if (inp->inp_flags & INP_DROPPED) {
+		destroy_server(sc, lctx);
+		INP_WUNLOCK(inp);
+		return (status);
+	}
+
+	/*
+	 * Failed to start hardware listener.  Take inp out of the hash and
+	 * release our reference on it.  An error message has been logged
+	 * already.
+	 */
+	if (status != CPL_ERR_NONE) {
+		listen_hash_del(sc, inp);
+		if (release_lctx(sc, lctx) != NULL)
+			INP_WUNLOCK(inp);
+		return (status);
+	}
+
+	/* hardware listener open for business */
+
+	INP_WUNLOCK(inp);
+	return (status);
+}
+
+static int
+do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
+	int stid = GET_TID(cpl);
+	unsigned int status = cpl->status;
+	struct listen_ctx *lctx = lookup_stid(sc, stid);
+	struct inpcb *inp = lctx->inp;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+
+	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
+
+	if (status != CPL_ERR_NONE) {
+		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u",
+		    __func__, status, stid);
+		return (status);
+	}
+
+	INP_WLOCK(inp);
+	inp = release_lctx(sc, lctx);
+	if (inp != NULL)
+		INP_WUNLOCK(inp);
+
+	return (status);
+}
+
+static void
+done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
+{
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+	struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc;
+	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
+
+	INP_WLOCK_ASSERT(inp);
+
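+	/*
+	 * Undo what do_pass_accept_req set up: the synq linkage and lctx
+	 * reference, the tid, and the L2 entry.
+	 */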
+	TAILQ_REMOVE(&lctx->synq, synqe, link);
+	inp = release_lctx(sc, lctx);
+	if (inp)
+		INP_WUNLOCK(inp);
+	remove_tid(sc, synqe->tid);
+	release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]);
+	t4_l2t_release(e);
+	release_synqe(synqe);	/* removed from synq list */
+}
+
+int
+do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	struct synq_entry *synqe = lookup_tid(sc, tid);
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+	int txqid;
+	struct sge_wrq *ofld_txq;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_ABORT_REQ_RSS,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
+	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
+
+	if (cpl->status == CPL_ERR_RTX_NEG_ADVICE ||
+	    cpl->status == CPL_ERR_PERSIST_NEG_ADVICE)
+		return (0);	/* Ignore negative advice */
+
+	INP_WLOCK(inp);
+
+	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
+	ofld_txq = &sc->sge.ofld_txq[txqid];
+
+	/*
+	 * If we'd initiated an abort earlier, the reply to it is responsible
+	 * for cleaning up resources.  Otherwise we tear everything down right
+	 * here, right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
+	 */
+	if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) {
+		INP_WUNLOCK(inp);
+		goto done;
+	}
+
+	done_with_synqe(sc, synqe);
+	/* inp lock released by done_with_synqe */
+done:
+	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
+	return (0);
+}
+
+int
+do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	struct synq_entry *synqe = lookup_tid(sc, tid);
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_ABORT_RPL_RSS,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
+
+	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
+	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
+
+	INP_WLOCK(inp);
+	KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
+	    __func__, synqe, synqe->flags));
+
+	done_with_synqe(sc, synqe);
+	/* inp lock released by done_with_synqe */
+
+	return (0);
+}
+
+void
+t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct synq_entry *synqe = arg;
+#ifdef INVARIANTS
+	struct inpcb *inp = sotoinpcb(so);
+#endif
+	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
+	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
+
+	INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(synqe_flag(synqe, TPF_SYNQE),
+	    ("%s: %p not a synq_entry?", __func__, arg));
+
+	offload_socket(so, toep);
+	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
+	toepcb_set_flag(toep, TPF_CPL_PENDING);
+	update_tid(sc, synqe->tid, toep);
+}
+
+static inline void
+save_qids_in_mbuf(struct mbuf *m, struct port_info *pi)
+{
+	uint32_t txqid, rxqid;
+
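+	/*
+	 * Pick this connection's offload queues at random from the port's
+	 * range and pack them into the flowid (txqid in the upper 16 bits).
+	 */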
+	txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
+	rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
+
+	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
+}
+
+static inline void
+get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
+{
+
+	if (txqid)
+		*txqid = m->m_pkthdr.flowid >> 16;
+	if (rxqid)
+		*rxqid = m->m_pkthdr.flowid & 0xffff;
+}
+
+/*
+ * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
+ * store some state temporarily.
+ */
+static struct synq_entry *
+mbuf_to_synqe(struct mbuf *m)
+{
+	int len = roundup(sizeof(struct synq_entry), 8);
+	int tspace = M_TRAILINGSPACE(m);
+	struct synq_entry *synqe = NULL;
+
+	if (tspace < len) {
+		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
+		if (synqe == NULL)
+			return (NULL);
+	} else
+		synqe = (void *)(m->m_data + m->m_len + tspace - sizeof(*synqe));
+
+	synqe->flags = 0;
+	synqe_set_flag(synqe, TPF_SYNQE);
+	if (tspace < len)
+		synqe_set_flag(synqe, TPF_SYNQE_NEEDFREE);
+
+	return (synqe);
+}
+
+static void
+t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
+{
+	bzero(to, sizeof(*to));
+
+	if (t4opt->mss) {
+		to->to_flags |= TOF_MSS;
+		to->to_mss = be16toh(t4opt->mss);
+	}
+
+	if (t4opt->wsf) {
+		to->to_flags |= TOF_SCALE;
+		to->to_wscale = t4opt->wsf;
+	}
+
+	if (t4opt->tstamp)
+		to->to_flags |= TOF_TS;
+
+	if (t4opt->sack)
+		to->to_flags |= TOF_SACKPERM;
+}
+
+/*
+ * Options2 for passive open.
+ */
+static uint32_t
+calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
+    const struct tcp_options *tcpopt, struct tcphdr *th)
+{
+	uint32_t opt2 = 0;
+	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
+
+	if (V_tcp_do_rfc1323) {
+		if (tcpopt->tstamp)
+			opt2 |= F_TSTAMPS_EN;
+		if (tcpopt->sack)
+			opt2 |= F_SACK_EN;
+		if (tcpopt->wsf > 0)
+			opt2 |= F_WND_SCALE_EN;
+	}
+
+	if (V_tcp_do_ecn && (th->th_flags & (TH_ECE | TH_CWR)))
+		opt2 |= F_CCTRL_ECN;
+
+	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
+	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
+	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
+
+	return (htobe32(opt2));
+}
+
+/* XXX: duplication. */
+static inline void
+tcp_fields_to_host(struct tcphdr *th)
+{
+
+	th->th_seq = ntohl(th->th_seq);
+	th->th_ack = ntohl(th->th_ack);
+	th->th_win = ntohs(th->th_win);
+	th->th_urp = ntohs(th->th_urp);
+}
+
+static void
+pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc,
+    struct tcphdr *th)
+{
+	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
+	const struct ether_header *eh;
+	unsigned int hlen = be32toh(cpl->hdr_len);
+	const struct ip *ip;
+	const struct tcphdr *tcp;
+
+	eh = (const void *)(cpl + 1);
+	ip = (const void *)((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
+	tcp = (const void *)((uintptr_t)ip + G_IP_HDR_LEN(hlen));
+
+	if (inc) {
+		bzero(inc, sizeof(*inc));
+		inc->inc_faddr = ip->ip_src;
+		inc->inc_laddr = ip->ip_dst;
+		inc->inc_fport = tcp->th_sport;
+		inc->inc_lport = tcp->th_dport;
+		if (ip->ip_v == 6)
+			inc->inc_flags |= INC_ISIPV6;
+	}
+
+	if (th) {
+		bcopy(tcp, th, sizeof(*th));
+		tcp_fields_to_host(th);		/* just like tcp_input */
+	}
+}
+
+#define REJECT_PASS_ACCEPT()	do { \
+	reject_reason = __LINE__; \
+	goto reject; \
+} while (0)
+
+/*
+ * The context associated with a tid entry via insert_tid could be a synq_entry
+ * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
+ */
+CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
+
+/*
+ * Incoming SYN on a listening socket.
+ *
+ * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
+ * etc.
+ */
+static int
+do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	struct toedev *tod;
+	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
+	struct cpl_pass_accept_rpl *rpl;
+	struct wrqe *wr;
+	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
+	unsigned int tid = GET_TID(cpl);
+	struct listen_ctx *lctx = lookup_stid(sc, stid);
+	struct inpcb *inp;
+	struct socket *so;
+	struct in_conninfo inc;
+	struct tcphdr th;
+	struct tcpopt to;
+	struct port_info *pi;
+	struct ifnet *ifp, *ifp_vlan = NULL;
+	struct l2t_entry *e = NULL;
+	struct rtentry *rt;
+	struct sockaddr_in nam;
+	int rscale, mtu_idx, rx_credits, rxqid;
+	struct synq_entry *synqe = NULL;
+	int reject_reason;
+	uint16_t vid;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+
+	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
+	    lctx);
+
+	pass_accept_req_to_protohdrs(m, &inc, &th);
+	t4opt_to_tcpopt(&cpl->tcpopt, &to);
+
+	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
+	ifp = pi->ifp;
+	m->m_pkthdr.rcvif = ifp;
+	tod = TOEDEV(ifp);
+
+	/*
+	 * Don't offload if the interface that received the SYN doesn't have
+	 * IFCAP_TOE enabled.
+	 */
+	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
+		REJECT_PASS_ACCEPT();
+
+	/* Don't offload IPv6 connections. XXX: add IPv6 support */
+	if (inc.inc_flags & INC_ISIPV6)
+		REJECT_PASS_ACCEPT();
+
+	/*
+	 * Don't offload if the SYN had a VLAN tag and the vid doesn't match
+	 * anything on this interface.
+	 */
+	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
+	if (vid != 0xfff) {
+		ifp_vlan = VLAN_DEVAT(ifp, vid);
+		if (ifp_vlan == NULL)
+			REJECT_PASS_ACCEPT();
+	}
+
+	/*
+	 * Don't offload if the peer requested a TCP option that's not known to
+	 * the silicon.
+	 */
+	if (cpl->tcpopt.unknown)
+		REJECT_PASS_ACCEPT();
+
+	/*
+	 * Don't offload if the outgoing interface for the route back to the
+	 * peer is not the same as the interface that received the SYN.
+	 * XXX: too restrictive.
+	 */
+	nam.sin_len = sizeof(nam);
+	nam.sin_family = AF_INET;
+	nam.sin_addr = inc.inc_faddr;
+	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
+	if (rt == NULL)
+		REJECT_PASS_ACCEPT();
+	else {
+		struct sockaddr *nexthop;
+
+		RT_UNLOCK(rt);
+		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
+		    (struct sockaddr *)&nam;
+		if (rt->rt_ifp == ifp ||
+		    (ifp_vlan != NULL && rt->rt_ifp == ifp_vlan))
+			e = t4_l2t_get(pi, rt->rt_ifp, nexthop);
+		RTFREE(rt);
+		if (e == NULL)
+			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
+	}
+
+	synqe = mbuf_to_synqe(m);
+	if (synqe == NULL)
+		REJECT_PASS_ACCEPT();
+
+	wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]);
+	if (wr == NULL)
+		REJECT_PASS_ACCEPT();
+	rpl = wrtod(wr);
+
+	INP_INFO_WLOCK(&V_tcbinfo);	/* for 4-tuple check, syncache_add */
+
+	/* Don't offload if the 4-tuple is already in use */
+	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		free(wr, M_CXGBE);
+		REJECT_PASS_ACCEPT();
+	}
+
+	inp = lctx->inp;		/* listening socket, not owned by TOE */
+	INP_WLOCK(inp);
+
+	/* Don't offload if the listening socket has closed */
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+		/*
+		 * The listening socket has closed.  The reply from the TOE to
+		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
+		 * resources tied to this listen context.
+		 */
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		free(wr, M_CXGBE);
+		REJECT_PASS_ACCEPT();
+	}
+	so = inp->inp_socket;
+
+	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
+	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
+	SOCKBUF_LOCK(&so->so_rcv);
+	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
+	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+	SOCKBUF_UNLOCK(&so->so_rcv);
+
+	save_qids_in_mbuf(m, pi);
+	get_qids_from_mbuf(m, NULL, &rxqid);
+
+	INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
+	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
+	    ULP_MODE_NONE);
+	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th);
+
+	synqe->tid = tid;
+	synqe->lctx = lctx;
+	synqe->syn = m;
+	m = NULL;
+	/* Initial reference, released just before this function returns. */
+	refcount_init(&synqe->refcnt, 1);
+	synqe->l2e_idx = e->idx;
+	synqe->rcv_bufsize = rx_credits;
+	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);
+
+	insert_tid(sc, tid, synqe);
+	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
+	hold_synqe(synqe);	/* hold for the duration it's in the synq */
+	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */
+
+	/*
+	 * If all goes well t4_syncache_respond will get called during
+	 * syncache_add.  Also note that syncache_add releases both pcbinfo and
+	 * pcb locks.
+	 */
+	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
+	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
+	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+
+	/*
+	 * If we replied during syncache_add (synqe->wr has been consumed),
+	 * good.  Otherwise, set it to 0 so that further syncache_respond
+	 * attempts by the kernel will be ignored.
+	 *
+	 * The extra hold on the synqe makes sure that it is still around even
+	 * if the listener has been dropped, the synqe has been aborted, and
+	 * the reply to the abort has removed and released the synqe from the
+	 * synq list.
+	 */
+	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
+
+		INP_WLOCK(inp);
+		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+			/* listener closed.  synqe must have been aborted. */
+			KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+			    ("%s: listener %p closed but synqe %p not aborted",
+			    __func__, inp, synqe));
+
+			CTR5(KTR_CXGBE,
+			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
+			    __func__, stid, tid, lctx, synqe);
+			INP_WUNLOCK(inp);
+			free(wr, M_CXGBE);
+			release_synqe(synqe);	/* about to exit function */
+			return (__LINE__);
+		}
+
+		/*
+		 * synqe aborted before TOM replied to PASS_ACCEPT_REQ.  But
+		 * that can only happen if the listener was closed and we just
+		 * checked for that.
+		 */
+		KASSERT(!synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+		    ("%s: synqe %p aborted, but listener %p not dropped.",
+		    __func__, synqe, inp));
+
+		/* Yank the synqe out of the lctx synq. */
+		TAILQ_REMOVE(&lctx->synq, synqe, link);
+		release_synqe(synqe);	/* removed from synq list */
+		inp = release_lctx(sc, lctx);
+		if (inp)
+			INP_WUNLOCK(inp);
+
+		/*
+		 * syncache may or may not have a hold on the synqe, which may
+		 * or may not be stashed in the original SYN mbuf passed to us.
+		 * Just copy it over instead of dealing with all possibilities.
+		 */
+		m = m_dup(synqe->syn, M_DONTWAIT);
+		if (m)
+			m->m_pkthdr.rcvif = ifp;
+
+		release_synqe(synqe);	/* about to exit function */
+		free(wr, M_CXGBE);
+		REJECT_PASS_ACCEPT();
+	}
+	release_synqe(synqe);	/* about to exit function */
+	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
+	    __func__, stid, tid, lctx, synqe);
+	return (0);
+reject:
+	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
+	    reject_reason);
+
+	if (e)
+		t4_l2t_release(e);
+	release_tid(sc, tid, lctx->ctrlq);
+
+	if (__predict_true(m != NULL)) {
+		m_adj(m, sizeof(*cpl));
+		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
+		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+		m->m_pkthdr.csum_data = 0xffff;
+		ifp->if_input(ifp, m);
+	}
+
+	return (reject_reason);
+}
+
+static void
+synqe_to_protohdrs(struct synq_entry *synqe,
+    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
+    struct tcphdr *th, struct tcpopt *to)
+{
+	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
+
+	/* start off with the original SYN */
+	pass_accept_req_to_protohdrs(synqe->syn, inc, th);
+
+	/* modify parts to make it look like the ACK to our SYN|ACK */
+	th->th_flags = TH_ACK;
+	th->th_ack = synqe->iss + 1;
+	th->th_seq = be32toh(cpl->rcv_isn);
+	bzero(to, sizeof(*to));
+	if (G_TCPOPT_TSTAMP(tcp_opt)) {
+		to->to_flags |= TOF_TS;
+		to->to_tsecr = synqe->ts;
+	}
+}
+
+static int
+do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	struct port_info *pi;
+	struct ifnet *ifp;
+	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
+#if defined(KTR) || defined(INVARIANTS)
+	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
+#endif
+	unsigned int tid = GET_TID(cpl);
+	struct synq_entry *synqe = lookup_tid(sc, tid);
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+	struct socket *so;
+	struct tcphdr th;
+	struct tcpopt to;
+	struct in_conninfo inc;
+	struct toepcb *toep;
+	u_int txqid, rxqid;
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_PASS_ESTABLISH,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+	KASSERT(synqe_flag(synqe, TPF_SYNQE),
+	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
+
+	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
+	INP_WLOCK(inp);
+
+	CTR6(KTR_CXGBE,
+	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
+	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
+
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+		/*
+		 * The listening socket has closed.  The TOM must have aborted
+		 * all the embryonic connections (including this one) that were
+		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
+		 * for cleaning up.
+		 */
+		KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+		    ("%s: listen socket dropped but tid %u not aborted.",
+		    __func__, tid));
+
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		return (0);
+	}
+
+	ifp = synqe->syn->m_pkthdr.rcvif;
+	pi = ifp->if_softc;
+	KASSERT(pi->adapter == sc,
+	    ("%s: pi %p, sc %p mismatch", __func__, pi, sc));
+
+	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
+	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
+	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
+	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
+
+	toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
+	if (toep == NULL) {
+reset:
+		/* The reply to this abort will perform final cleanup */
+		send_reset_synqe(TOEDEV(ifp), synqe);
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		return (0);
+	}
+	toep->tid = tid;
+	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
+	toep->ulp_mode = ULP_MODE_NONE;
+	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
+	toep->rx_credits = synqe->rcv_bufsize;
+
+	so = inp->inp_socket;
+	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
+
+	/* Come up with something that syncache_expand should be ok with. */
+	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);
+
+	/*
+	 * No more need for anything in the mbuf that carried the
+	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
+	 * there.  XXX: bad form but I don't want to increase the size of synqe.
+	 */
+	m = synqe->syn;
+	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
+	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
+	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
+	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;
+
+	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
+		free_toepcb(toep);
+		goto reset;
+	}
+
+	/* Done with the synqe */
+	TAILQ_REMOVE(&lctx->synq, synqe, link);
+	inp = release_lctx(sc, lctx);
+	if (inp != NULL)
+		INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	release_synqe(synqe);
+
+	return (0);
+}
+
+void
+t4_init_listen_cpl_handlers(struct adapter *sc)
+{
+
+	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
+	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
+	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
+}
+#endif
diff -r 7cec8c20120e sys/dev/cxgbe/tom/t4_tom.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/tom/t4_tom.c	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,755 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#ifdef TCP_OFFLOAD
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* Module ops */
+static int t4_tom_mod_load(void);
+static int t4_tom_mod_unload(void);
+static int t4_tom_modevent(module_t, int, void *);
+
+/* ULD ops and helpers */
+static int t4_tom_activate(struct adapter *);
+static int t4_tom_deactivate(struct adapter *);
+
+static struct uld_info tom_uld_info = {
+	.uld_id = ULD_TOM,
+	.activate = t4_tom_activate,
+	.deactivate = t4_tom_deactivate,
+};
+
+static void queue_tid_release(struct adapter *, int);
+static void release_offload_resources(struct toepcb *);
+static int alloc_tid_tabs(struct tid_info *);
+static void free_tid_tabs(struct tid_info *);
+static void free_tom_data(struct adapter *, struct tom_data *);
+
+struct toepcb *
+alloc_toepcb(struct port_info *pi, int txqid, int rxqid, int flags)
+{
+	struct adapter *sc = pi->adapter;
+	struct toepcb *toep;
+	int tx_credits, txsd_total, len;
+
+	/*
+	 * The firmware counts tx work request credits in units of 16 bytes
+	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
+	 * about tx credits if it wants to abort a connection.
+	 */
+	tx_credits = sc->params.ofldq_wr_cred;
+	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
+
+	/*
+	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
+	 * immediate payload, and firmware counts tx work request credits in
+	 * units of 16 byte.  Calculate the maximum work requests possible.
+	 */
+	txsd_total = tx_credits /
+	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16);
+
+	if (txqid < 0)
+		txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
+	KASSERT(txqid >= pi->first_ofld_txq &&
+	    txqid < pi->first_ofld_txq + pi->nofldtxq,
+	    ("%s: txqid %d for port %p (first %d, n %d)", __func__, txqid, pi,
+		pi->first_ofld_txq, pi->nofldtxq));
+
+	if (rxqid < 0)
+		rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
+	KASSERT(rxqid >= pi->first_ofld_rxq &&
+	    rxqid < pi->first_ofld_rxq + pi->nofldrxq,
+	    ("%s: rxqid %d for port %p (first %d, n %d)", __func__, rxqid, pi,
+		pi->first_ofld_rxq, pi->nofldrxq));
+
+	len = offsetof(struct toepcb, txsd) +
+	    txsd_total * sizeof(struct ofld_tx_sdesc);
+
+	toep = malloc(len, M_CXGBE, M_ZERO | flags);
+	if (toep == NULL)
+		return (NULL);
+
+	toep->td = sc->tom_softc;
+	toep->port = pi;
+	toep->tx_credits = tx_credits;
+	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
+	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
+	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
+	toep->txsd_total = txsd_total;
+	toep->txsd_avail = txsd_total;
+	toep->txsd_pidx = 0;
+	toep->txsd_cidx = 0;
+
+	return (toep);
+}
+
+void
+free_toepcb(struct toepcb *toep)
+{
+
+	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
+	    ("%s: attached to an inpcb", __func__));
+	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
+	    ("%s: CPL pending", __func__));
+
+	free(toep, M_CXGBE);
+}
+
+/*
+ * Set up the socket for TCP offload.
+ */
+void
+offload_socket(struct socket *so, struct toepcb *toep)
+{
+	struct tom_data *td = toep->td;
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct sockbuf *sb;
+
+	INP_WLOCK_ASSERT(inp);
+
+	/* Update socket */
+	sb = &so->so_snd;
+	SOCKBUF_LOCK(sb);
+	sb->sb_flags |= SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(sb);
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+	sb->sb_flags |= SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(sb);
+
+	/* Update TCP PCB */
+	tp->tod = &td->tod;
+	tp->t_toe = toep;
+	tp->t_flags |= TF_TOE;
+
+	/* Install an extra hold on inp */
+	toep->inp = inp;
+	toepcb_set_flag(toep, TPF_ATTACHED);
+	in_pcbref(inp);
+
+	/* Add the TOE PCB to the active list */
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
+}
+
+/* This is _not_ the normal way to "unoffload" a socket. */
+void
+undo_offload_socket(struct socket *so)
+{
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toepcb *toep = tp->t_toe;
+	struct tom_data *td = toep->td;
+	struct sockbuf *sb;
+
+	INP_WLOCK_ASSERT(inp);
+
+	sb = &so->so_snd;
+	SOCKBUF_LOCK(sb);
+	sb->sb_flags &= ~SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(sb);
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+	sb->sb_flags &= ~SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(sb);
+
+	tp->tod = NULL;
+	tp->t_toe = NULL;
+	tp->t_flags &= ~TF_TOE;
+
+	toep->inp = NULL;
+	toepcb_clr_flag(toep, TPF_ATTACHED);
+	if (in_pcbrele_wlocked(inp))
+		panic("%s: inp freed.", __func__);
+
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_REMOVE(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
+}
+
+static void
+release_offload_resources(struct toepcb *toep)
+{
+	struct tom_data *td = toep->td;
+	struct adapter *sc = td_adapter(td);
+	int tid = toep->tid;
+
+	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
+	    ("%s: %p has CPL pending.", __func__, toep));
+	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
+	    ("%s: %p is still attached.", __func__, toep));
+
+	CTR4(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p)",
+	    __func__, toep, tid, toep->l2te);
+
+	if (toep->l2te)
+		t4_l2t_release(toep->l2te);
+
+	if (tid >= 0) {
+		remove_tid(sc, tid);
+		release_tid(sc, tid, toep->ctrlq);
+	}
+
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_REMOVE(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
+
+	free_toepcb(toep);
+}
+
+/*
+ * The kernel is done with the TCP PCB and this is our opportunity to unhook the
+ * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
+ * pending CPL) then it is time to release all resources tied to the toepcb.
+ *
+ * Also gets called when an offloaded active open fails and the TOM wants the
+ * kernel to take the TCP PCB back.
+ */
+static void
+t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
+{
+#if defined(KTR) || defined(INVARIANTS)
+	struct inpcb *inp = tp->t_inpcb;
+#endif
+	struct toepcb *toep = tp->t_toe;
+
+	INP_WLOCK_ASSERT(inp);
+
+	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+	KASSERT(toepcb_flag(toep, TPF_ATTACHED),
+	    ("%s: not attached", __func__));
+
+#ifdef KTR
+	if (tp->t_state == TCPS_SYN_SENT) {
+		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
+		    __func__, toep->tid, toep, toep->flags, inp,
+		    inp->inp_flags);
+	} else {
+		CTR6(KTR_CXGBE,
+		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
+		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
+		    inp->inp_flags);
+	}
+#endif
+
+	tp->t_toe = NULL;
+	tp->t_flags &= ~TF_TOE;
+	toepcb_clr_flag(toep, TPF_ATTACHED);
+
+	if (toepcb_flag(toep, TPF_CPL_PENDING) == 0)
+		release_offload_resources(toep);
+}
+
+/*
+ * The TOE driver will not receive any more CPLs for the tid associated with the
+ * toepcb; release the hold on the inpcb.
+ */
+void
+final_cpl_received(struct toepcb *toep)
+{
+	struct inpcb *inp = toep->inp;
+
+	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING),
+	    ("%s: CPL not pending already?", __func__));
+
+	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
+	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
+
+	toep->inp = NULL;
+	toepcb_clr_flag(toep, TPF_CPL_PENDING);
+
+	if (toepcb_flag(toep, TPF_ATTACHED) == 0)
+		release_offload_resources(toep);
+
+	if (!in_pcbrele_wlocked(inp))
+		INP_WUNLOCK(inp);
+}
+
+void
+insert_tid(struct adapter *sc, int tid, void *ctx)
+{
+	struct tid_info *t = &sc->tids;
+
+	t->tid_tab[tid] = ctx;
+	atomic_add_int(&t->tids_in_use, 1);
+}
+
+void *
+lookup_tid(struct adapter *sc, int tid)
+{
+	struct tid_info *t = &sc->tids;
+
+	return (t->tid_tab[tid]);
+}
+
+void
+update_tid(struct adapter *sc, int tid, void *ctx)
+{
+	struct tid_info *t = &sc->tids;
+
+	t->tid_tab[tid] = ctx;
+}
+
+void
+remove_tid(struct adapter *sc, int tid)
+{
+	struct tid_info *t = &sc->tids;
+
+	t->tid_tab[tid] = NULL;
+	atomic_subtract_int(&t->tids_in_use, 1);
+}
+
+void
+release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
+{
+	struct wrqe *wr;
+	struct cpl_tid_release *req;
+
+	wr = alloc_wrqe(sizeof(*req), ctrlq);
+	if (wr == NULL) {
+		queue_tid_release(sc, tid);	/* defer */
+		return;
+	}
+	req = wrtod(wr);
+
+	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
+
+	t4_wrq_tx(sc, wr);
+}
+
+static void
+queue_tid_release(struct adapter *sc, int tid)
+{
+
+	CXGBE_UNIMPLEMENTED("deferred tid release");
+}
+
+/*
+ * What mtu_idx to use, given a 4-tuple and/or an MSS cap
+ */
+int
+find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
+{
+	unsigned short *mtus = &sc->params.mtus[0];
+	int i = 0, mss;
+
+	KASSERT(inc != NULL || pmss > 0,
+	    ("%s: at least one of inc/pmss must be specified", __func__));
+
+	mss = inc ? tcp_mssopt(inc) : pmss;
+	if (pmss > 0 && mss > pmss)
+		mss = pmss;
+
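+	/* Largest MTU in the table that accommodates mss + 40 (IP + TCP). */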
+	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
+		++i;
+
+	return (i);
+}
+
+/*
+ * Determine the receive window size for a socket.
+ */
+u_long
+select_rcv_wnd(struct socket *so)
+{
+	unsigned long wnd;
+
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+	wnd = sbspace(&so->so_rcv);
+	if (wnd < MIN_RCV_WND)
+		wnd = MIN_RCV_WND;
+
+	return (min(wnd, MAX_RCV_WND));
+}
+
+int
+select_rcv_wscale(void)
+{
+	int wscale = 0;
+	unsigned long space = sb_max;
+
+	if (space > MAX_RCV_WND)
+		space = MAX_RCV_WND;
+
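+	/* Smallest wscale such that TCP_MAXWIN << wscale >= space. */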
+	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
+		wscale++;
+
+	return (wscale);
+}
+
+extern int always_keepalive;
+#define VIID_SMACIDX(v)	(((unsigned int)(v) & 0x7f) << 1)
+
+/*
+ * socket so could be a listening socket too.
+ */
+uint64_t
+calc_opt0(struct socket *so, struct port_info *pi, struct l2t_entry *e,
+    int mtu_idx, int rscale, int rx_credits, int ulp_mode)
+{
+	uint64_t opt0;
+
+	KASSERT(rx_credits <= M_RCV_BUFSIZ,
+	    ("%s: rcv_bufsiz too high", __func__));
+
+	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
+	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);
+
+	if (so != NULL) {
+		struct inpcb *inp = sotoinpcb(so);
+		struct tcpcb *tp = intotcpcb(inp);
+		int keepalive = always_keepalive ||
+		    so_options_get(so) & SO_KEEPALIVE;
+
+		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
+		opt0 |= V_KEEP_ALIVE(keepalive != 0);
+	}
+
+	if (e != NULL)
+		opt0 |= V_L2T_IDX(e->idx);
+
+	if (pi != NULL) {
+		opt0 |= V_SMAC_SEL(VIID_SMACIDX(pi->viid));
+		opt0 |= V_TX_CHAN(pi->tx_chan);
+	}
+
+	return (htobe64(opt0));
+}
+
+#define FILTER_SEL_WIDTH_P_FC (3 + 1)
+#define FILTER_SEL_WIDTH_VIN_P_FC (6 + 7 + FILTER_SEL_WIDTH_P_FC)
+#define FILTER_SEL_WIDTH_TAG_P_FC (3 + FILTER_SEL_WIDTH_VIN_P_FC)
+#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC)
+#define VLAN_NONE 0xfff
+#define FILTER_SEL_VLAN_NONE 0xffff
+
+uint32_t
+select_ntuple(struct port_info *pi, struct l2t_entry *e, uint32_t filter_mode)
+{
+	uint16_t viid = pi->viid;
+	uint32_t ntuple = 0;
+
+	if (filter_mode == HW_TPL_FR_MT_PR_IV_P_FC) {
+		if (e->vlan == VLAN_NONE)
+			ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC;
+		else {
+			ntuple |= e->vlan << FILTER_SEL_WIDTH_P_FC;
+			ntuple |= 1 << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+		}
+		ntuple |= e->lport << S_PORT;
+		ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+	} else if (filter_mode == HW_TPL_FR_MT_PR_OV_P_FC) {
+		ntuple |= G_FW_VIID_VIN(viid) << FILTER_SEL_WIDTH_P_FC;
+		ntuple |= G_FW_VIID_PFN(viid) << FILTER_SEL_WIDTH_VIN_P_FC;
+		ntuple |= G_FW_VIID_VIVLD(viid) << FILTER_SEL_WIDTH_TAG_P_FC;
+		ntuple |= e->lport << S_PORT;
+		ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+	}
+
+	return (htobe32(ntuple));
+}
+
+static int
+alloc_tid_tabs(struct tid_info *t)
+{
+	size_t size;
+	unsigned int i;
+
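+	/* A single allocation carved into the tid, atid, and stid tables. */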
+	size = t->ntids * sizeof(*t->tid_tab) +
+	    t->natids * sizeof(*t->atid_tab) +
+	    t->nstids * sizeof(*t->stid_tab);
+
+	t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT);
+	if (t->tid_tab == NULL)
+		return (ENOMEM);
+
+	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
+	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
+	t->afree = t->atid_tab;
+	t->atids_in_use = 0;
+	for (i = 1; i < t->natids; i++)
+		t->atid_tab[i - 1].next = &t->atid_tab[i];
+	t->atid_tab[t->natids - 1].next = NULL;
+
+	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
+	t->stid_tab = (union serv_entry *)&t->atid_tab[t->natids];
+	t->sfree = t->stid_tab;
+	t->stids_in_use = 0;
+	for (i = 1; i < t->nstids; i++)
+		t->stid_tab[i - 1].next = &t->stid_tab[i];
+	t->stid_tab[t->nstids - 1].next = NULL;
+
+	atomic_store_rel_int(&t->tids_in_use, 0);
+
+	return (0);
+}
+
+static void
+free_tid_tabs(struct tid_info *t)
+{
+	KASSERT(t->tids_in_use == 0,
+	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
+	KASSERT(t->atids_in_use == 0,
+	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
+	KASSERT(t->stids_in_use == 0,
+	    ("%s: %d tids still in use.", __func__, t->stids_in_use));
+
+	free(t->tid_tab, M_CXGBE);
+	t->tid_tab = NULL;
+
+	if (mtx_initialized(&t->atid_lock))
+		mtx_destroy(&t->atid_lock);
+	if (mtx_initialized(&t->stid_lock))
+		mtx_destroy(&t->stid_lock);
+}
+
+static void
+free_tom_data(struct adapter *sc, struct tom_data *td)
+{
+	KASSERT(TAILQ_EMPTY(&td->toep_list),
+	    ("%s: TOE PCB list is not empty.", __func__));
+	KASSERT(td->lctx_count == 0,
+	    ("%s: lctx hash table is not empty.", __func__));
+
+	t4_uninit_l2t_cpl_handlers(sc);
+
+	if (td->listen_mask != 0)
+		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
+
+	if (mtx_initialized(&td->lctx_hash_lock))
+		mtx_destroy(&td->lctx_hash_lock);
+	if (mtx_initialized(&td->toep_list_lock))
+		mtx_destroy(&td->toep_list_lock);
+
+	free_tid_tabs(&sc->tids);
+	free(td, M_CXGBE);
+}
+
+/*
+ * Ground control to Major TOM
+ * Commencing countdown, engines on
+ */
+static int
+t4_tom_activate(struct adapter *sc)
+{
+	struct tom_data *td;
+	struct toedev *tod;
+	int i, rc;
+
+	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */
+
+	/* per-adapter softc for TOM */
+	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
+	if (td == NULL)
+		return (ENOMEM);
+
+	/* List of TOE PCBs and associated lock */
+	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
+	TAILQ_INIT(&td->toep_list);
+
+	/* Listen context */
+	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
+	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
+	    &td->listen_mask, HASH_NOWAIT);
+
+	/* TID tables */
+	rc = alloc_tid_tabs(&sc->tids);
+	if (rc != 0)
+		goto done;
+
+	/* CPL handlers */
+	t4_init_connect_cpl_handlers(sc);
+	t4_init_l2t_cpl_handlers(sc);
+	t4_init_listen_cpl_handlers(sc);
+	t4_init_cpl_io_handlers(sc);
+
+	/* toedev ops */
+	tod = &td->tod;
+	init_toedev(tod);
+	tod->tod_softc = sc;
+	tod->tod_connect = t4_connect;
+	tod->tod_listen_start = t4_listen_start;
+	tod->tod_listen_stop = t4_listen_stop;
+	tod->tod_rcvd = t4_rcvd;
+	tod->tod_output = t4_tod_output;
+	tod->tod_send_rst = t4_send_rst;
+	tod->tod_send_fin = t4_send_fin;
+	tod->tod_pcb_detach = t4_pcb_detach;
+	tod->tod_l2_update = t4_l2_update;
+	tod->tod_syncache_added = t4_syncache_added;
+	tod->tod_syncache_removed = t4_syncache_removed;
+	tod->tod_syncache_respond = t4_syncache_respond;
+	tod->tod_offload_socket = t4_offload_socket;
+
+	for_each_port(sc, i)
+		TOEDEV(sc->port[i]->ifp) = &td->tod;
+
+	sc->tom_softc = td;
+	sc->flags |= TOM_INIT_DONE;
+	register_toedev(sc->tom_softc);
+
+done:
+	if (rc != 0)
+		free_tom_data(sc, td);
+	return (rc);
+}
+
+static int
+t4_tom_deactivate(struct adapter *sc)
+{
+	int rc = 0;
+	struct tom_data *td = sc->tom_softc;
+
+	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */
+
+	if (td == NULL)
+		return (0);	/* XXX. KASSERT? */
+
+	if (sc->offload_map != 0)
+		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */
+
+	mtx_lock(&td->toep_list_lock);
+	if (!TAILQ_EMPTY(&td->toep_list))
+		rc = EBUSY;
+	mtx_unlock(&td->toep_list_lock);
+
+	mtx_lock(&td->lctx_hash_lock);
+	if (td->lctx_count > 0)
+		rc = EBUSY;
+	mtx_unlock(&td->lctx_hash_lock);
+
+	if (rc == 0) {
+		unregister_toedev(sc->tom_softc);
+		free_tom_data(sc, td);
+		sc->tom_softc = NULL;
+		sc->flags &= ~TOM_INIT_DONE;
+	}
+
+	return (rc);
+}
+
+static int
+t4_tom_mod_load(void)
+{
+	int rc;
+
+	rc = t4_register_uld(&tom_uld_info);
+	if (rc != 0)
+		t4_tom_mod_unload();
+
+	return (rc);
+}
+
+static void
+tom_uninit(struct adapter *sc, void *arg __unused)
+{
+	/* Try to free resources (works only if no port has IFCAP_TOE) */
+	ADAPTER_LOCK(sc);
+	if (sc->flags & TOM_INIT_DONE)
+		t4_deactivate_uld(sc, ULD_TOM);
+	ADAPTER_UNLOCK(sc);
+}
+
+static int
+t4_tom_mod_unload(void)
+{
+	t4_iterate(tom_uninit, NULL);
+
+	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
+		return (EBUSY);
+
+	return (0);
+}
+#endif	/* TCP_OFFLOAD */
+
+static int
+t4_tom_modevent(module_t mod, int cmd, void *arg)
+{
+	int rc = 0;
+
+#ifdef TCP_OFFLOAD
+	switch (cmd) {
+	case MOD_LOAD:
+		rc = t4_tom_mod_load();
+		break;
+
+	case MOD_UNLOAD:
+		rc = t4_tom_mod_unload();
+		break;
+
+	default:
+		rc = EINVAL;
+	}
+#else
+	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
+	rc = EOPNOTSUPP;
+#endif
+	return (rc);
+}
+
+static moduledata_t t4_tom_moddata = {
+	"t4_tom",
+	t4_tom_modevent,
+	0
+};
+
+MODULE_VERSION(t4_tom, 1);
+MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
+MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
+DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
diff -r 7cec8c20120e sys/dev/cxgbe/tom/t4_tom.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/tom/t4_tom.h	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,248 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef __T4_TOM_H__
+#define __T4_TOM_H__
+
+#define KTR_CXGBE	KTR_SPARE3
+#define LISTEN_HASH_SIZE 32
+
+/*
+ * Min receive window.  We want it to be large enough to accommodate receive
+ * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
+ */
+#define MIN_RCV_WND (24 * 1024U)
+
+/*
+ * Max receive window supported by HW in bytes.  Only a small part of it can
+ * be set through option0, the rest needs to be set through RX_DATA_ACK.
+ */
+#define MAX_RCV_WND ((1U << 27) - 1)
+
+/* TOE PCB flags */
+enum {
+	TPF_ATTACHED,		/* a tcpcb refers to this toepcb */
+	TPF_FLOWC_WR_SENT,	/* firmware flow context WR sent */
+	TPF_TX_DATA_SENT,	/* some data sent */
+	TPF_TX_SUSPENDED,	/* tx suspended for lack of resources */
+	TPF_SEND_FIN,		/* send FIN after sending all pending data */
+	TPF_FIN_SENT,		/* FIN has been sent */
+	TPF_ABORT_SHUTDOWN,	/* connection abort is in progress */
+	TPF_CPL_PENDING,	/* haven't received the last CPL */
+	TPF_SYNQE,		/* synq_entry, not really a toepcb */
+	TPF_SYNQE_NEEDFREE,	/* synq_entry was allocated externally */
+};
+
+struct ofld_tx_sdesc {
+	uint32_t plen;		/* payload length */
+	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
+};
+
+struct toepcb {
+	TAILQ_ENTRY(toepcb) link; /* toep_list */
+	unsigned int flags;	/* miscellaneous flags */
+	struct tom_data *td;
+	struct inpcb *inp;	/* backpointer to host stack's PCB */
+	struct port_info *port;	/* physical port */
+	struct sge_wrq *ofld_txq;
+	struct sge_ofld_rxq *ofld_rxq;
+	struct sge_wrq *ctrlq;
+	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
+	int tid;		/* Connection identifier */
+	unsigned int tx_credits;/* tx WR credits (in 16 byte units) remaining */
+	unsigned int enqueued;	/* # of bytes added to so_rcv (not yet read) */
+	int rx_credits;		/* rx credits (in bytes) to be returned to hw */
+
+	unsigned int ulp_mode;	/* ULP mode */
+
+	/* Tx software descriptor */
+	uint8_t txsd_total;
+	uint8_t txsd_pidx;
+	uint8_t txsd_cidx;
+	uint8_t txsd_avail;
+	struct ofld_tx_sdesc txsd[];
+};
+
+struct flowc_tx_params {
+	uint32_t snd_nxt;
+	uint32_t rcv_nxt;
+	unsigned int snd_space;
+	unsigned int mss;
+};
+
+static inline int
+toepcb_flag(struct toepcb *toep, int flag)
+{
+
+	return isset(&toep->flags, flag);
+}
+
+static inline void
+toepcb_set_flag(struct toepcb *toep, int flag)
+{
+
+	setbit(&toep->flags, flag);
+}
+
+static inline void
+toepcb_clr_flag(struct toepcb *toep, int flag)
+{
+
+	clrbit(&toep->flags, flag);
+}
+
+/*
+ * Compressed state for embryonic connections for a listener.  Barely fits in
+ * 64B, try not to grow it further.
+ */
+struct synq_entry {
+	TAILQ_ENTRY(synq_entry) link;	/* listen_ctx's synq link */
+	int flags;			/* same TPF_* flags as a toepcb */
+	int tid;
+	struct listen_ctx *lctx;	/* backpointer to listen ctx */
+	struct mbuf *syn;
+	uint32_t iss;
+	uint32_t ts;
+	volatile uintptr_t wr;
+	volatile u_int refcnt;
+	uint16_t l2e_idx;
+	uint16_t rcv_bufsize;
+};
+
+static inline int
+synqe_flag(struct synq_entry *synqe, int flag)
+{
+
+	return isset(&synqe->flags, flag);
+}
+
+static inline void
+synqe_set_flag(struct synq_entry *synqe, int flag)
+{
+
+	setbit(&synqe->flags, flag);
+}
+
+static inline void
+synqe_clr_flag(struct synq_entry *synqe, int flag)
+{
+
+	clrbit(&synqe->flags, flag);
+}
+
+/* listen_ctx flags */
+#define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */
+
+struct listen_ctx {
+	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
+	volatile int refcount;
+	int stid;
+	int flags;
+	struct inpcb *inp;		/* listening socket's inp */
+	struct sge_wrq *ctrlq;
+	struct sge_ofld_rxq *ofld_rxq;
+	TAILQ_HEAD(, synq_entry) synq;
+};
+
+struct tom_data {
+	struct toedev tod;
+
+	/* toepcb's associated with this TOE device */
+	struct mtx toep_list_lock;
+	TAILQ_HEAD(, toepcb) toep_list;
+
+	LIST_HEAD(, listen_ctx) *listen_hash;
+	u_long listen_mask;
+	int lctx_count;		/* # of lctx in the hash table */
+	struct mtx lctx_hash_lock;
+};
+
+static inline struct tom_data *
+tod_td(struct toedev *tod)
+{
+
+	return (member2struct(tom_data, tod, tod));
+}
+
+static inline struct adapter *
+td_adapter(struct tom_data *td)
+{
+
+	return (td->tod.tod_softc);
+}
+
+/* t4_tom.c */
+struct toepcb *alloc_toepcb(struct port_info *, int, int, int);
+void free_toepcb(struct toepcb *);
+void offload_socket(struct socket *, struct toepcb *);
+void undo_offload_socket(struct socket *);
+void final_cpl_received(struct toepcb *);
+void insert_tid(struct adapter *, int, void *);
+void *lookup_tid(struct adapter *, int);
+void update_tid(struct adapter *, int, void *);
+void remove_tid(struct adapter *, int);
+void release_tid(struct adapter *, int, struct sge_wrq *);
+int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
+u_long select_rcv_wnd(struct socket *);
+int select_rcv_wscale(void);
+uint64_t calc_opt0(struct socket *, struct port_info *, struct l2t_entry *,
+    int, int, int, int);
+uint32_t select_ntuple(struct port_info *, struct l2t_entry *, uint32_t);
+
+/* t4_connect.c */
+void t4_init_connect_cpl_handlers(struct adapter *);
+int t4_connect(struct toedev *, struct socket *, struct rtentry *,
+    struct sockaddr *);
+
+/* t4_listen.c */
+void t4_init_listen_cpl_handlers(struct adapter *);
+int t4_listen_start(struct toedev *, struct tcpcb *);
+int t4_listen_stop(struct toedev *, struct tcpcb *);
+void t4_syncache_added(struct toedev *, void *);
+void t4_syncache_removed(struct toedev *, void *);
+int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
+int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
+    struct mbuf *);
+int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
+    struct mbuf *);
+void t4_offload_socket(struct toedev *, void *, struct socket *);
+
+/* t4_cpl_io.c */
+void t4_init_cpl_io_handlers(struct adapter *);
+void send_abort_rpl(struct adapter *, struct sge_wrq *, int, int);
+void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
+void send_reset(struct adapter *, struct toepcb *, uint32_t);
+void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
+void t4_rcvd(struct toedev *, struct tcpcb *);
+int t4_tod_output(struct toedev *, struct tcpcb *);
+int t4_send_fin(struct toedev *, struct tcpcb *);
+int t4_send_rst(struct toedev *, struct tcpcb *);
+
+#endif
diff -r 7cec8c20120e sys/dev/cxgbe/tom/t4_tom_l2t.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/tom/t4_tom_l2t.c	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,405 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sbuf.h>
+#include <sys/syslog.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/jhash.h"
+#include "common/t4_msg.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+#define VLAN_NONE	0xfff
+
+#define SA(x)           ((struct sockaddr *)(x))
+#define SIN(x)          ((struct sockaddr_in *)(x))
+#define SINADDR(x)      (SIN(x)->sin_addr.s_addr)
+
+static inline void
+l2t_hold(struct l2t_data *d, struct l2t_entry *e)
+{
+	if (atomic_fetchadd_int(&e->refcnt, 1) == 0)  /* 0 -> 1 transition */
+		atomic_subtract_int(&d->nfree, 1);
+}
+
+static inline unsigned int
+arp_hash(const uint32_t key, int ifindex)
+{
+	return (jhash_2words(key, ifindex, 0) & (L2T_SIZE - 1));
+}
+
+/*
+ * Add a WR to an L2T entry's queue of work requests awaiting resolution.
+ * Must be called with the entry's lock held.
+ */
+static inline void
+arpq_enqueue(struct l2t_entry *e, struct wrqe *wr)
+{
+	mtx_assert(&e->lock, MA_OWNED);
+
+	STAILQ_INSERT_TAIL(&e->wr_list, wr, link);
+}
+
+static inline void
+send_pending(struct adapter *sc, struct l2t_entry *e)
+{
+	struct wrqe *wr;
+
+	mtx_assert(&e->lock, MA_OWNED);
+
+	while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
+		STAILQ_REMOVE_HEAD(&e->wr_list, link);
+		t4_wrq_tx(sc, wr);
+	}
+}
+
+static void
+resolution_failed_for_wr(struct wrqe *wr)
+{
+	log(LOG_ERR, "%s: leaked work request %p, wr_len %d\n", __func__, wr,
+	    wr->wr_len);
+
+	/* free(wr, M_CXGBE); */
+}
+
+static void
+resolution_failed(struct l2t_entry *e)
+{
+	struct wrqe *wr;
+
+	mtx_assert(&e->lock, MA_OWNED);
+
+	while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
+		STAILQ_REMOVE_HEAD(&e->wr_list, link);
+		resolution_failed_for_wr(wr);
+	}
+}
+
+static void
+update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr,
+    uint16_t vtag)
+{
+
+	mtx_assert(&e->lock, MA_OWNED);
+
+	/*
+	 * The entry may be in active use (e->refcnt > 0) or not.  We update it
+	 * either way, as that simplifies the case where we decide to reuse the
+	 * entry later.
+	 */
+
+	if (lladdr == NULL &&
+	    (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) {
+		/*
+		 * Never got a valid L2 address for this one.  Just mark it as
+		 * failed instead of removing it from the hash (for which we'd
+		 * need to wlock the table).
+		 */
+		e->state = L2T_STATE_FAILED;
+		resolution_failed(e);
+		return;
+
+	} else if (lladdr == NULL) {
+
+		/* Valid or already-stale entry was deleted (or expired) */
+
+		KASSERT(e->state == L2T_STATE_VALID ||
+		    e->state == L2T_STATE_STALE,
+		    ("%s: lladdr NULL, state %d", __func__, e->state));
+
+		e->state = L2T_STATE_STALE;
+
+	} else {
+
+		if (e->state == L2T_STATE_RESOLVING ||
+		    e->state == L2T_STATE_FAILED ||
+		    memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) {
+
+			/* unresolved -> resolved; or dmac changed */
+
+			memcpy(e->dmac, lladdr, ETHER_ADDR_LEN);
+			e->vlan = vtag;
+			t4_write_l2e(sc, e, 1);
+		}
+		e->state = L2T_STATE_VALID;
+	}
+}
+
+static int
+resolve_entry(struct adapter *sc, struct l2t_entry *e)
+{
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	struct sockaddr_in sin = {0};
+	uint8_t dmac[ETHER_ADDR_LEN];
+	uint16_t vtag = VLAN_NONE;
+	int rc;
+
+	sin.sin_family = AF_INET;
+	sin.sin_len = sizeof(struct sockaddr_in);
+	SINADDR(&sin) = e->addr;
+
+	rc = toe_l2_resolve(tod, e->ifp, SA(&sin), dmac, &vtag);
+	if (rc == EWOULDBLOCK)
+		return (rc);
+
+	mtx_lock(&e->lock);
+	update_entry(sc, e, rc == 0 ? dmac : NULL, vtag);
+	mtx_unlock(&e->lock);
+
+	return (rc);
+}
+
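+/*
+ * Send a work request that requires this L2T entry.  If the entry is not yet
+ * resolved the WR is queued on the entry and transmitted once resolution
+ * completes; if resolution fails, so does the WR.
+ */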
+int
+t4_l2t_send_slow(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
+{
+
+again:
+	switch (e->state) {
+	case L2T_STATE_STALE:     /* entry is stale, kick off revalidation */
+
+		if (resolve_entry(sc, e) != EWOULDBLOCK)
+			goto again;	/* entry updated, re-examine state */
+
+		/* Fall through */
+
+	case L2T_STATE_VALID:     /* fast-path, send the packet on */
+
+		t4_wrq_tx(sc, wr);
+		return (0);
+
+	case L2T_STATE_RESOLVING:
+	case L2T_STATE_SYNC_WRITE:
+
+		mtx_lock(&e->lock);
+		if (e->state != L2T_STATE_SYNC_WRITE &&
+		    e->state != L2T_STATE_RESOLVING) {
+			/* state changed by the time we got here */
+			mtx_unlock(&e->lock);
+			goto again;
+		}
+		arpq_enqueue(e, wr);
+		mtx_unlock(&e->lock);
+
+		if (resolve_entry(sc, e) == EWOULDBLOCK)
+			break;
+
+		mtx_lock(&e->lock);
+		if (e->state == L2T_STATE_VALID && !STAILQ_EMPTY(&e->wr_list))
+			send_pending(sc, e);
+		if (e->state == L2T_STATE_FAILED)
+			resolution_failed(e);
+		mtx_unlock(&e->lock);
+		break;
+
+	case L2T_STATE_FAILED:
+		resolution_failed_for_wr(wr);
+		return (EHOSTUNREACH);
+	}
+
+	return (0);
+}
+
+/*
+ * Handler for CPL_L2T_WRITE_RPL messages while the TOM is active.  It calls
+ * the base driver's handler first and then, if this was a synchronous write,
+ * transmits any work requests that were waiting for the entry to be written
+ * to the hardware L2 table.
+ */
+
+static int
+do_l2t_write_rpl2(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(rpl);
+	unsigned int idx = tid & (L2T_SIZE - 1);
+	int rc;
+
+	rc = do_l2t_write_rpl(iq, rss, m);
+	if (rc != 0)
+		return (rc);
+
+	if (tid & F_SYNC_WR) {
+		struct l2t_entry *e = &sc->l2t->l2tab[idx];
+
+		mtx_lock(&e->lock);
+		if (e->state != L2T_STATE_SWITCHING) {
+			send_pending(sc, e);
+			e->state = L2T_STATE_VALID;
+		}
+		mtx_unlock(&e->lock);
+	}
+
+	return (0);
+}
+
+void
+t4_init_l2t_cpl_handlers(struct adapter *sc)
+{
+
+	t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl2);
+}
+
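+/* Restore the base driver's CPL_L2T_WRITE_RPL handler. */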
+void
+t4_uninit_l2t_cpl_handlers(struct adapter *sc)
+{
+
+	t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl);
+}
+
+/*
+ * The TOE wants an L2 table entry that it can use to reach the next hop over
+ * the specified port.  Produce such an entry - create one if needed.
+ *
+ * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on
+ * top of the real cxgbe interface.
+ */
+struct l2t_entry *
+t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa)
+{
+	struct l2t_entry *e;
+	struct l2t_data *d = pi->adapter->l2t;
+	uint32_t addr;
+	int hash;
+	unsigned int smt_idx = pi->port_id;
+
+	if (sa->sa_family != AF_INET)
+		return (NULL);	/* XXX: no IPv6 support right now */
+
+	addr = SINADDR(sa);
+	hash = arp_hash(addr, ifp->if_index);
+
+#ifndef VLAN_TAG
+	if (ifp->if_type == IFT_L2VLAN)
+		return (NULL);
+#endif
+
+	rw_wlock(&d->lock);
+	for (e = d->l2tab[hash].first; e; e = e->next) {
+		if (e->addr == addr && e->ifp == ifp && e->smt_idx == smt_idx) {
+			l2t_hold(d, e);
+			goto done;
+		}
+	}
+
+	/* Need to allocate a new entry */
+	e = t4_alloc_l2e(d);
+	if (e) {
+		mtx_lock(&e->lock);          /* avoid race with t4_l2t_free */
+		e->next = d->l2tab[hash].first;
+		d->l2tab[hash].first = e;
+
+		e->state = L2T_STATE_RESOLVING;
+		e->addr = addr;
+		e->ifp = ifp;
+		e->smt_idx = smt_idx;
+		e->hash = hash;
+		e->lport = pi->lport;
+		atomic_store_rel_int(&e->refcnt, 1);
+#ifdef VLAN_TAG
+		if (ifp->if_type == IFT_L2VLAN)
+			VLAN_TAG(ifp, &e->vlan);
+		else
+			e->vlan = VLAN_NONE;
+#endif
+		mtx_unlock(&e->lock);
+	}
+done:
+	rw_wunlock(&d->lock);
+	return (e);
+}
+
+/*
+ * Called when the host's ARP layer makes a change to some entry that is loaded
+ * into the HW L2 table.
+ */
+void
+t4_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
+    uint8_t *lladdr, uint16_t vtag)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct l2t_entry *e;
+	struct l2t_data *d = sc->l2t;
+	uint32_t addr = SINADDR(sa);
+	int hash = arp_hash(addr, ifp->if_index);
+
+	KASSERT(d != NULL, ("%s: no L2 table", __func__));
+
+	rw_rlock(&d->lock);
+	for (e = d->l2tab[hash].first; e; e = e->next) {
+		if (e->addr == addr && e->ifp == ifp) {
+			mtx_lock(&e->lock);
+			if (atomic_load_acq_int(&e->refcnt))
+				goto found;
+			e->state = L2T_STATE_STALE;
+			mtx_unlock(&e->lock);
+			break;
+		}
+	}
+	rw_runlock(&d->lock);
+
+	/*
+	 * This is of no interest to us.  We've never had an offloaded
+	 * connection to this destination, and we aren't attempting one right
+	 * now.
+	 */
+	return;
+
+found:
+	rw_runlock(&d->lock);
+
+	KASSERT(e->state != L2T_STATE_UNUSED,
+	    ("%s: unused entry in the hash.", __func__));
+
+	update_entry(sc, e, lladdr, vtag);
+	mtx_unlock(&e->lock);
+}
+#endif
diff -r 7cec8c20120e sys/dev/cxgbe/tom/t4_tom_l2t.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/cxgbe/tom/t4_tom_l2t.h	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef __T4_TOM_L2T_H
+#define __T4_TOM_L2T_H
+
+#include "t4_l2t.h"
+
+int t4_l2t_send_slow(struct adapter *, struct wrqe *, struct l2t_entry *);
+struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *,
+    struct sockaddr *);
+void t4_l2_update(struct toedev *, struct ifnet *, struct sockaddr *,
+    uint8_t *, uint16_t);
+void t4_init_l2t_cpl_handlers(struct adapter *);
+void t4_uninit_l2t_cpl_handlers(struct adapter *);
+
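+/*
+ * Fast path for entries that are already resolved; everything else goes
+ * through t4_l2t_send_slow.
+ */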
+static inline int
+t4_l2t_send(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
+{
+	if (__predict_true(e->state == L2T_STATE_VALID)) {
+		t4_wrq_tx(sc, wr);
+		return (0);
+	} else
+		return (t4_l2t_send_slow(sc, wr, e));
+}
+
+#endif  /* __T4_TOM_L2T_H */
diff -r 7cec8c20120e sys/i386/conf/GENERIC
--- a/sys/i386/conf/GENERIC	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/i386/conf/GENERIC	Mon Jun 11 00:15:24 2012 -0700
@@ -30,6 +30,7 @@
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
+options 	TCP_OFFLOAD		# TCP offload
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
diff -r 7cec8c20120e sys/i386/conf/XEN
--- a/sys/i386/conf/XEN	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/i386/conf/XEN	Mon Jun 11 00:15:24 2012 -0700
@@ -7,7 +7,7 @@
 ident		XEN
 
 makeoptions	DEBUG=-g		# Build kernel with gdb(1) debug symbols
-makeoptions	WITHOUT_MODULES="aha ahb amd cxgb dpt drm drm2 hptmv ida malo mps mwl nve sound sym trm xfs"
+makeoptions	WITHOUT_MODULES="aha ahb amd cxgb dpt drm drm2 hptmv ida malo mps mwl nve rdma sound sym trm xfs"
 
 options 	SCHED_ULE		# ULE scheduler
 options 	PREEMPTION		# Enable kernel thread preemption
diff -r 7cec8c20120e sys/modules/Makefile
--- a/sys/modules/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/modules/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -314,6 +314,7 @@
 	${_ti} \
 	tl \
 	tmpfs \
+	${_toecore} \
 	${_tpm} \
 	trm \
 	${_twa} \
@@ -392,6 +393,7 @@
 .if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \
 	defined(ALL_MODULES)
 _carp=	carp
+_toecore=	toecore
 .endif
 
 .if ${MK_INET_SUPPORT} != "no" || defined(ALL_MODULES)
diff -r 7cec8c20120e sys/modules/cxgb/Makefile
--- a/sys/modules/cxgb/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/modules/cxgb/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -1,39 +1,12 @@
 # $FreeBSD$
 SUBDIR= cxgb
-SUBDIR+= ${_toecore}
+SUBDIR+= cxgb_t3fw
 SUBDIR+= ${_tom}
 SUBDIR+= ${_iw_cxgb}
-SUBDIR+= cxgb_t3fw
 
-.if defined(SYSDIR)
-_sysdir = ${SYSDIR}
-.endif
-
-# Based on bsd.kmod.mk but we don't modify SYSDIR in this one.
-.for _dir in ${.CURDIR}/../.. ${.CURDIR}/../../.. ${.CURDIR}/../../../.. \
-    /sys /usr/src/sys
-.if !defined(_sysdir) && exists(${_dir}/kern/) && exists(${_dir}/conf/kmod.mk)
-_sysdir = ${_dir}
-.endif
-.endfor
-.if !defined(_sysdir) || !exists(${_sysdir}/kern/) || \
-    !exists(${_sysdir}/conf/kmod.mk)
-.error "can't find kernel source tree"
-.endif
-
-_toe_header = ${_sysdir}/netinet/toedev.h
-
-.if exists(${_toe_header})
-_toecore = toecore
-#_tom = tom
-.endif
-
-.if ${MACHINE_CPUARCH} == "i386" && exists(${_toe_header})
-_iw_cxgb = iw_cxgb
-.endif
-
-.if ${MACHINE_CPUARCH} == "amd64" && exists(${_toe_header})
-_iw_cxgb = iw_cxgb
+.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
+_tom=		tom
+_iw_cxgb=	iw_cxgb
 .endif
 
 .include <bsd.subdir.mk>
diff -r 7cec8c20120e sys/modules/cxgb/cxgb/Makefile
--- a/sys/modules/cxgb/cxgb/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/modules/cxgb/cxgb/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -8,7 +8,7 @@
 KMOD=	if_cxgb
 SRCS=	cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c 
 SRCS+=	cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c cxgb_aq100x.c
-SRCS+=  cxgb_sge.c cxgb_offload.c cxgb_tn1010.c
+SRCS+=  cxgb_sge.c cxgb_tn1010.c
 SRCS+=	device_if.h bus_if.h pci_if.h
 SRCS+=	opt_inet.h opt_inet6.h opt_zero.h opt_sched.h
 SRCS+=	uipc_mvec.c
@@ -19,6 +19,7 @@
 .if ${MK_INET_SUPPORT} != "no"
 opt_inet.h:
 	@echo "#define INET 1" > ${.TARGET}
+	@echo "#define TCP_OFFLOAD 1" >> ${.TARGET}
 .endif
 
 .if ${MK_INET6_SUPPORT} != "no"
diff -r 7cec8c20120e sys/modules/cxgb/iw_cxgb/Makefile
--- a/sys/modules/cxgb/iw_cxgb/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/modules/cxgb/iw_cxgb/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -1,5 +1,7 @@
 # $FreeBSD$
 
+.include <bsd.own.mk>
+
 CXGB = ${.CURDIR}/../../../dev/cxgb
 .PATH: ${CXGB}/ulp/iw_cxgb
 
@@ -8,8 +10,15 @@
 SRCS+=  iw_cxgb_provider.c iw_cxgb_qp.c iw_cxgb_resource.c
 SRCS+=  iw_cxgb_ev.c iw_cxgb_mem.c iw_cxgb_dbg.c iw_cxgb_cq.c
 SRCS+=  bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h opt_ktr.h
-SRCS+=  opt_inet.h
-CFLAGS+= -g -I${CXGB}
-#CFLAGS+= -DDEBUG
+SRCS+=  opt_inet.h opt_ofed.h vnode_if.h
+CFLAGS+= -I${CXGB} -I${.CURDIR}/../../../ofed/include -DLINUX_TYPES_DEFINED
+
+.if !defined(KERNBUILDDIR)
+.if ${MK_INET_SUPPORT} != "no"
+opt_inet.h:
+	echo "#define INET 1" > ${.TARGET}
+	echo "#define TCP_OFFLOAD 1" >> ${.TARGET}
+.endif
+.endif
 
 .include <bsd.kmod.mk>
diff -r 7cec8c20120e sys/modules/cxgb/toecore/Makefile
--- a/sys/modules/cxgb/toecore/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-# $FreeBSD$
-
-CXGB = ${.CURDIR}/../../../dev/cxgb
-.PATH: ${CXGB}/ulp/toecore
-
-KMOD=	toecore
-SRCS=   toedev.c
-SRCS+=	device_if.h bus_if.h pci_if.h opt_sched.h opt_inet.h
-CFLAGS+= -g -I${CXGB}
-
-.include <bsd.kmod.mk>
diff -r 7cec8c20120e sys/modules/cxgb/tom/Makefile
--- a/sys/modules/cxgb/tom/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/modules/cxgb/tom/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -1,15 +1,25 @@
 # $FreeBSD$
 
+.include <bsd.own.mk>
+
 CXGB = ${.CURDIR}/../../../dev/cxgb
 .PATH: ${CXGB}/ulp/tom
 
-KMOD=	tom
-SRCS=   cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c
-SRCS+=  cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c
+KMOD=	t3_tom
+SRCS=   cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_l2t.c
 SRCS+=	opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h
 SRCS+=	opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h
 SRCS+=	device_if.h bus_if.h pci_if.h
 CFLAGS+= -g -I${CXGB}
 
 #CFLAGS+= -DDEBUG_PRINT -DDEBUG
+
+.if !defined(KERNBUILDDIR)
+.if ${MK_INET_SUPPORT} != "no"
+opt_inet.h:
+	echo "#define INET 1" > ${.TARGET}
+	echo "#define TCP_OFFLOAD 1" >> ${.TARGET}
+.endif
+.endif
+
 .include <bsd.kmod.mk>
diff -r 7cec8c20120e sys/modules/cxgbe/Makefile
--- a/sys/modules/cxgbe/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/modules/cxgbe/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -4,5 +4,10 @@
 
 SUBDIR = if_cxgbe
 SUBDIR+= firmware
+SUBDIR+= ${_tom}
+
+.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
+_tom=		tom
+.endif
 
 .include <bsd.subdir.mk>
diff -r 7cec8c20120e sys/modules/cxgbe/tom/Makefile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/modules/cxgbe/tom/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+CXGBE = ${.CURDIR}/../../../dev/cxgbe
+.PATH: ${CXGBE}/tom
+
+KMOD = t4_tom
+SRCS = t4_tom.c t4_connect.c t4_listen.c t4_cpl_io.c t4_tom_l2t.c
+SRCS+= device_if.h bus_if.h pci_if.h
+SRCS+= opt_inet.h
+
+CFLAGS+= -I${CXGBE}
+
+.include <bsd.kmod.mk>
diff -r 7cec8c20120e sys/modules/rdma/krping/Makefile
--- a/sys/modules/rdma/krping/Makefile	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/modules/rdma/krping/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -6,5 +6,7 @@
 KMOD= krping
 SRCS= krping.c krping_dev.c getopt.c
 SRCS+=  bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h
+SRCS+=  vnode_if.h
+CFLAGS+= -I${.CURDIR}/../../../ofed/include
 
 .include <bsd.kmod.mk>
diff -r 7cec8c20120e sys/modules/toecore/Makefile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/modules/toecore/Makefile	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,9 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../netinet
+
+KMOD=	toecore
+SRCS=	toecore.c
+SRCS+=	opt_ofed.h
+
+.include <bsd.kmod.mk>
diff -r 7cec8c20120e sys/net/if_var.h
--- a/sys/net/if_var.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/net/if_var.h	Mon Jun 11 00:15:24 2012 -0700
@@ -209,6 +209,8 @@
 	void	*if_pspare[8];		/* 1 netmap, 7 TDB */
 };
 
+#define	TOEDEV(ifp)	((ifp)->if_llsoftc)
+
 typedef void if_init_f_t(void *);
 
 /*
diff -r 7cec8c20120e sys/net/if_vlan.c
--- a/sys/net/if_vlan.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/net/if_vlan.c	Mon Jun 11 00:15:24 2012 -0700
@@ -746,8 +746,8 @@
 		vlan_trunk_cap_p = NULL;
 		vlan_trunkdev_p = NULL;
 		vlan_tag_p = NULL;
-		vlan_cookie_p = vlan_cookie;
-		vlan_setcookie_p = vlan_setcookie;
+		vlan_cookie_p = NULL;
+		vlan_setcookie_p = NULL;
 		vlan_devat_p = NULL;
 		VLAN_LOCK_DESTROY();
 		if (bootverbose)
@@ -1503,6 +1503,22 @@
 		ifp->if_capenable &= ~(p->if_capenable & IFCAP_TSO);
 		ifp->if_hwassist &= ~(p->if_hwassist & CSUM_TSO);
 	}
+
+	/*
+	 * If the parent interface can offload TCP connections over VLANs then
+	 * propagate its TOE capability to the VLAN interface.
+	 *
+	 * All TOE drivers in the tree today can deal with VLANs.  If this
+	 * changes then IFCAP_VLAN_TOE should be promoted to a full capability
+	 * with its own bit.
+	 */
+#define IFCAP_VLAN_TOE IFCAP_TOE
+	if (p->if_capabilities & IFCAP_VLAN_TOE)
+		ifp->if_capabilities |= p->if_capabilities & IFCAP_TOE;
+	if (p->if_capenable & IFCAP_VLAN_TOE) {
+		TOEDEV(ifp) = TOEDEV(p);
+		ifp->if_capenable |= p->if_capenable & IFCAP_TOE;
+	}
 }
 
 static void
diff -r 7cec8c20120e sys/netinet/if_ether.c
--- a/sys/netinet/if_ether.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/if_ether.c	Mon Jun 11 00:15:24 2012 -0700
@@ -180,6 +180,17 @@
 		    callout_active(&lle->la_timer)) {
 			callout_stop(&lle->la_timer);
 			LLE_REMREF(lle);
+
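+			/*
+			 * Notify interested parties (e.g. TOE drivers) that
+			 * this lle is going away: it either expired while
+			 * valid or timed out while still unresolved.
+			 */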
+			if (lle->la_flags != LLE_DELETED) {
+				int evt;
+
+				if (lle->la_flags & LLE_VALID)
+					evt = LLENTRY_EXPIRED;
+				else
+					evt = LLENTRY_TIMEDOUT;
+				EVENTHANDLER_INVOKE(lle_event, lle, evt);
+			}
+
 			pkts_dropped = llentry_free(lle);
 			ARPSTAT_ADD(dropped, pkts_dropped);
 			ARPSTAT_INC(timeouts);
@@ -312,7 +323,7 @@
 	}
 retry:
 	IF_AFDATA_RLOCK(ifp);	
-	la = lla_lookup(LLTABLE(ifp), flags, dst);
+	la = *lle = lla_lookup(LLTABLE(ifp), flags, dst);
 	IF_AFDATA_RUNLOCK(ifp);	
 	if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
 	    && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {		
@@ -344,7 +355,6 @@
 			la->la_preempt--;
 		}
 		
-		*lle = la;
 		error = 0;
 		goto done;
 	} 
@@ -727,6 +737,7 @@
 		la->la_flags |= LLE_VALID;
 
 		EVENTHANDLER_INVOKE(arp_update_event, la);
+		EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 		if (!(la->la_flags & LLE_STATIC)) {
 			int canceled;
diff -r 7cec8c20120e sys/netinet/if_ether.h
--- a/sys/netinet/if_ether.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/if_ether.h	Mon Jun 11 00:15:24 2012 -0700
@@ -122,9 +122,19 @@
 void	arp_ifscrub(struct ifnet *, uint32_t);
 
 #include <sys/eventhandler.h>
+/* XXX: can be retired once OFED code is updated to use lle_event */
 typedef void (*llevent_arp_update_fn)(void *, struct llentry *);
 EVENTHANDLER_DECLARE(arp_update_event, llevent_arp_update_fn);
 
+enum {
+	LLENTRY_RESOLVED,
+	LLENTRY_TIMEDOUT,
+	LLENTRY_DELETED,
+	LLENTRY_EXPIRED,
+};
+typedef void (*lle_event_fn)(void *, struct llentry *, int);
+EVENTHANDLER_DECLARE(lle_event, lle_event_fn);
+
 #endif
 
 #endif
diff -r 7cec8c20120e sys/netinet/in.c
--- a/sys/netinet/in.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/in.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1470,6 +1470,7 @@
 			LLE_WLOCK(lle);
 			lle->la_flags = LLE_DELETED;
 			EVENTHANDLER_INVOKE(arp_update_event, lle);
+			EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 			LLE_WUNLOCK(lle);
 #ifdef DIAGNOSTIC
 			log(LOG_INFO, "ifaddr cache = %p  is deleted\n", lle);	
diff -r 7cec8c20120e sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_input.c	Mon Jun 11 00:15:24 2012 -0700
@@ -105,6 +105,9 @@
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -958,6 +961,14 @@
 		goto dropwithreset;
 	}
 
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE) {
+		tcp_offload_input(tp, m);
+		m = NULL;	/* consumed by the TOE driver */
+		goto dropunlock;
+	}
+#endif
+
 	/*
 	 * We've identified a valid inpcb, but it could be that we need an
 	 * inpcbinfo write lock but don't hold it.  In this case, attempt to
@@ -1320,7 +1331,7 @@
 			    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		tcp_dooptions(&to, optp, optlen, TO_SYN);
-		syncache_add(&inc, &to, th, inp, &so, m);
+		syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
 		/*
 		 * Entry added to syncache and mbuf consumed.
 		 * Everything already unlocked by syncache_add().
diff -r 7cec8c20120e sys/netinet/tcp_offload.c
--- a/sys/netinet/tcp_offload.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_offload.c	Mon Jun 11 00:15:24 2012 -0700
@@ -1,145 +1,176 @@
 /*-
- * Copyright (c) 2007, Chelsio Inc.
+ * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
  *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/types.h>
-#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
-
+#include <sys/sockopt.h>
 #include <net/if.h>
-#include <net/if_types.h>
-#include <net/if_var.h>
 #include <net/route.h>
-#include <net/vnet.h>
-
 #include <netinet/in.h>
-#include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_offload.h>
-#include <netinet/toedev.h>
+#define	TCPOUTFLAGS
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
 
-uint32_t toedev_registration_count;
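+/* Maintained by toecore as TOE drivers register and unregister. */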
+int registered_toedevs;
 
+/*
+ * Provide an opportunity for a TOE driver to offload.
+ */
 int
 tcp_offload_connect(struct socket *so, struct sockaddr *nam)
 {
 	struct ifnet *ifp;
-	struct toedev *tdev;
+	struct toedev *tod;
 	struct rtentry *rt;
-	int error;
+	int error = EOPNOTSUPP;
 
-	if (toedev_registration_count == 0)
-		return (EINVAL);
-	
-	/*
-	 * Look up the route used for the connection to 
-	 * determine if it uses an interface capable of
-	 * offloading the connection.
-	 */
-	rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/);
-	if (rt) 
+	INP_WLOCK_ASSERT(sotoinpcb(so));
+	KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6,
+	    ("%s: called with sa_family %d", __func__, nam->sa_family));
+
+	if (registered_toedevs == 0)
+		return (error);
+
+	rt = rtalloc1(nam, 0, 0);
+	if (rt)
 		RT_UNLOCK(rt);
-	else 
+	else
 		return (EHOSTUNREACH);
 
 	ifp = rt->rt_ifp;
-	if ((ifp->if_capenable & IFCAP_TOE) == 0) {
-		error = EINVAL;
-		goto fail;
-	}
-	
-	tdev = TOEDEV(ifp);
-	if (tdev == NULL) {
-		error = EPERM;
-		goto fail;
-	}
-	
-	if (tdev->tod_can_offload(tdev, so) == 0) {
-		error = EPERM;
-		goto fail;
-	}
-	
-	return (tdev->tod_connect(tdev, so, rt, nam));
-fail:
+
+	if (nam->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4))
+		goto done;
+	if (nam->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6))
+		goto done;
+
+	tod = TOEDEV(ifp);
+	if (tod != NULL)
+		error = tod->tod_connect(tod, so, rt, nam);
+done:
 	RTFREE(rt);
 	return (error);
 }
 
+void
+tcp_offload_listen_start(struct tcpcb *tp)
+{
 
-/*
- * This file contains code as a short-term staging area before it is moved in 
- * to sys/netinet/tcp_offload.c
- */
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
+}
 
 void
-tcp_offload_twstart(struct tcpcb *tp)
+tcp_offload_listen_stop(struct tcpcb *tp)
 {
 
-	INP_INFO_WLOCK(&V_tcbinfo);
-	INP_WLOCK(tp->t_inpcb);
-	tcp_twstart(tp);
-	INP_INFO_WUNLOCK(&V_tcbinfo);
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
 }
 
-struct tcpcb *
-tcp_offload_close(struct tcpcb *tp)
+void
+tcp_offload_input(struct tcpcb *tp, struct mbuf *m)
 {
+	struct toedev *tod = tp->tod;
 
-	INP_INFO_WLOCK(&V_tcbinfo);
-	INP_WLOCK(tp->t_inpcb);
-	tp = tcp_close(tp);
-	INP_INFO_WUNLOCK(&V_tcbinfo);
-	if (tp)
-		INP_WUNLOCK(tp->t_inpcb);
+	KASSERT(tod != NULL, ("%s: tp->tod is NULL", __func__));
+	INP_WLOCK_ASSERT(tp->t_inpcb);
 
-	return (tp);
+	tod->tod_input(tod, tp, m);
 }
 
-struct tcpcb *
-tcp_offload_drop(struct tcpcb *tp, int error)
+int
+tcp_offload_output(struct tcpcb *tp)
 {
+	struct toedev *tod = tp->tod;
+	int error, flags;
 
-	INP_INFO_WLOCK(&V_tcbinfo);
-	INP_WLOCK(tp->t_inpcb);
-	tp = tcp_drop(tp, error);
-	INP_INFO_WUNLOCK(&V_tcbinfo);
-	if (tp)
-		INP_WUNLOCK(tp->t_inpcb);
+	KASSERT(tod != NULL, ("%s: tp->tod is NULL", __func__));
+	INP_WLOCK_ASSERT(tp->t_inpcb);
 
-	return (tp);
+	flags = tcp_outflags[tp->t_state];
+
+	if (flags & TH_RST) {
+		/* XXX: avoid repeated calls like we do for FIN */
+		error = tod->tod_send_rst(tod, tp);
+	} else if ((flags & TH_FIN || tp->t_flags & TF_NEEDFIN) &&
+	    (tp->t_flags & TF_SENTFIN) == 0) {
+		error = tod->tod_send_fin(tod, tp);
+		if (error == 0)
+			tp->t_flags |= TF_SENTFIN;
+	} else
+		error = tod->tod_output(tod, tp);
+
+	return (error);
 }
 
+void
+tcp_offload_rcvd(struct tcpcb *tp)
+{
+	struct toedev *tod = tp->tod;
+
+	KASSERT(tod != NULL, ("%s: tp->tod is NULL", __func__));
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tod->tod_rcvd(tod, tp);
+}
+
+void
+tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name)
+{
+	struct toedev *tod = tp->tod;
+
+	KASSERT(tod != NULL, ("%s: tp->tod is NULL", __func__));
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tod->tod_ctloutput(tod, tp, sopt_dir, sopt_name);
+}
+
+void
+tcp_offload_detach(struct tcpcb *tp)
+{
+	struct toedev *tod = tp->tod;
+
+	KASSERT(tod != NULL, ("%s: tp->tod is NULL", __func__));
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tod->tod_pcb_detach(tod, tp);
+}
diff -r 7cec8c20120e sys/netinet/tcp_offload.h
--- a/sys/netinet/tcp_offload.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_offload.h	Mon Jun 11 00:15:24 2012 -0700
@@ -1,354 +1,48 @@
 /*-
- * Copyright (c) 2007, Chelsio Inc.
+ * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
  *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  *
  * $FreeBSD$
+ *
  */
 
 #ifndef _NETINET_TCP_OFFLOAD_H_
-#define	_NETINET_TCP_OFFLOAD_H_
+#define _NETINET_TCP_OFFLOAD_H_
 
 #ifndef _KERNEL
 #error "no user-serviceable parts inside"
 #endif
 
-/*
- * A driver publishes that it provides offload services
- * by setting IFCAP_TOE in the ifnet. The offload connect
- * will bypass any further work if the interface that a
- * connection would use does not support TCP offload.
- *
- * The TOE API assumes that the tcp offload engine can offload the 
- * the entire connection from set up to teardown, with some provision 
- * being made to allowing the software stack to handle time wait. If
- * the device does not meet these criteria, it is the driver's responsibility
- * to overload the functions that it needs to in tcp_usrreqs and make
- * its own calls to tcp_output if it needs to do so.
- *
- * There is currently no provision for the device advertising the congestion
- * control algorithms it supports as there is currently no API for querying 
- * an operating system for the protocols that it has loaded. This is a desirable
- * future extension.
- *
- *
- *
- * It is assumed that individuals deploying TOE will want connections
- * to be offloaded without software changes so all connections on an
- * interface providing TOE are offloaded unless the SO_NO_OFFLOAD 
- * flag is set on the socket.
- *
- *
- * The toe_usrreqs structure constitutes the TOE driver's 
- * interface to the TCP stack for functionality that doesn't
- * interact directly with userspace. If one wants to provide
- * (optional) functionality to do zero-copy to/from
- * userspace one still needs to override soreceive/sosend 
- * with functions that fault in and pin the user buffers.
- *
- * + tu_send
- *   - tells the driver that new data may have been added to the 
- *     socket's send buffer - the driver should not fail if the
- *     buffer is in fact unchanged
- *   - the driver is responsible for providing credits (bytes in the send window)
- *     back to the socket by calling sbdrop() as segments are acknowledged.
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- * + tu_rcvd
- *   - returns credits to the driver and triggers window updates
- *     to the peer (a credit as used here is a byte in the peer's receive window)
- *   - the driver is expected to determine how many bytes have been 
- *     consumed and credit that back to the card so that it can grow
- *     the window again by maintaining its own state between invocations.
- *   - In principle this could be used to shrink the window as well as
- *     grow the window, although it is not used for that now.
- *   - this function needs to correctly handle being called any number of
- *     times without any bytes being consumed from the receive buffer.
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- * + tu_disconnect
- *   - tells the driver to send FIN to peer
- *   - driver is expected to send the remaining data and then do a clean half close
- *   - disconnect implies at least half-close so only send, reset, and detach
- *     are legal
- *   - the driver is expected to handle transition through the shutdown
- *     state machine and allow the stack to support SO_LINGER.
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- * + tu_reset
- *   - closes the connection and sends a RST to peer
- *   - driver is expectd to trigger an RST and detach the toepcb
- *   - no further calls are legal after reset
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- *   The following fields in the tcpcb are expected to be referenced by the driver:
- *	+ iss
- *	+ rcv_nxt
- *	+ rcv_wnd
- *	+ snd_isn
- *	+ snd_max
- *	+ snd_nxt
- *	+ snd_una
- *	+ t_flags
- *	+ t_inpcb
- *	+ t_maxseg
- *	+ t_toe
- *
- *   The following fields in the inpcb are expected to be referenced by the driver:
- *	+ inp_lport
- *	+ inp_fport
- *	+ inp_laddr
- *	+ inp_fport
- *	+ inp_socket
- *	+ inp_ip_tos
- *
- *   The following fields in the socket are expected to be referenced by the
- *   driver:
- *	+ so_comp
- *	+ so_error
- *	+ so_linger
- *	+ so_options
- *	+ so_rcv
- *	+ so_snd
- *	+ so_state
- *	+ so_timeo
- *
- *   These functions all return 0 on success and can return the following errors
- *   as appropriate:
- *	+ EPERM:
- *	+ ENOBUFS: memory allocation failed
- *	+ EMSGSIZE: MTU changed during the call
- *	+ EHOSTDOWN:
- *	+ EHOSTUNREACH:
- *	+ ENETDOWN:
- *	* ENETUNREACH: the peer is no longer reachable
- *
- * + tu_detach
- *   - tells driver that the socket is going away so disconnect
- *     the toepcb and free appropriate resources
- *   - allows the driver to cleanly handle the case of connection state
- *     outliving the socket
- *   - no further calls are legal after detach
- *   - the driver is expected to provide its own synchronization between
- *     detach and receiving new data.
- * 
- * + tu_syncache_event
- *   - even if it is not actually needed, the driver is expected to
- *     call syncache_add for the initial SYN and then syncache_expand
- *     for the SYN,ACK
- *   - tells driver that a connection either has not been added or has 
- *     been dropped from the syncache
- *   - the driver is expected to maintain state that lives outside the 
- *     software stack so the syncache needs to be able to notify the
- *     toe driver that the software stack is not going to create a connection
- *     for a received SYN
- *   - The driver is responsible for any synchronization required between
- *     the syncache dropping an entry and the driver processing the SYN,ACK.
- * 
- */
-struct toe_usrreqs {
-	int (*tu_send)(struct tcpcb *tp);
-	int (*tu_rcvd)(struct tcpcb *tp);
-	int (*tu_disconnect)(struct tcpcb *tp);
-	int (*tu_reset)(struct tcpcb *tp);
-	void (*tu_detach)(struct tcpcb *tp);
-	void (*tu_syncache_event)(int event, void *toep);
-};
+extern int registered_toedevs;
 
-/*
- * Proxy for struct tcpopt between TOE drivers and TCP functions.
- */
-struct toeopt {
-	u_int64_t	to_flags;	/* see tcpopt in tcp_var.h */
-	u_int16_t	to_mss;		/* maximum segment size */
-	u_int8_t	to_wscale;	/* window scaling */
+int  tcp_offload_connect(struct socket *, struct sockaddr *);
+void tcp_offload_listen_start(struct tcpcb *);
+void tcp_offload_listen_stop(struct tcpcb *);
+void tcp_offload_input(struct tcpcb *, struct mbuf *);
+int  tcp_offload_output(struct tcpcb *);
+void tcp_offload_rcvd(struct tcpcb *);
+void tcp_offload_ctloutput(struct tcpcb *, int, int);
+void tcp_offload_detach(struct tcpcb *);
 
-	u_int8_t	_pad1;		/* explicit pad for 64bit alignment */
-	u_int32_t	_pad2;		/* explicit pad for 64bit alignment */
-	u_int64_t	_pad3[4];	/* TBD */
-};
-
-#define	TOE_SC_ENTRY_PRESENT		1	/* 4-tuple already present */
-#define	TOE_SC_DROP			2	/* connection was timed out */
-
-/*
- * Because listen is a one-to-many relationship (a socket can be listening 
- * on all interfaces on a machine some of which may be using different TCP
- * offload devices), listen uses a publish/subscribe mechanism. The TCP
- * offload driver registers a listen notification function with the stack.
- * When a listen socket is created all TCP offload devices are notified
- * so that they can do the appropriate set up to offload connections on the
- * port to which the socket is bound. When the listen socket is closed,
- * the offload devices are notified so that they will stop listening on that
- * port and free any associated resources as well as sending RSTs on any
- * connections in the SYN_RCVD state.
- *
- */
-
-typedef	void	(*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
-typedef	void	(*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
-
-EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
-EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
-
-/*
- * Check if the socket can be offloaded by the following steps:
- * - determine the egress interface
- * - check the interface for TOE capability and TOE is enabled
- * - check if the device has resources to offload the connection
- */
-int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);
-
-/*
- * The tcp_output_* routines are wrappers around the toe_usrreqs calls
- * which trigger packet transmission. In the non-offloaded case they
- * translate to tcp_output. The tcp_offload_* routines notify TOE
- * of specific events. In the non-offloaded case they are no-ops.
- *
- * Listen is a special case because it is a 1 to many relationship
- * and there can be more than one offload driver in the system.
- */
-
-/*
- * Connection is offloaded
- */
-#define	tp_offload(tp)		((tp)->t_flags & TF_TOE)
-
-/*
- * hackish way of allowing this file to also be included by TOE
- * which needs to be kept ignorant of socket implementation details
- */
-#ifdef _SYS_SOCKETVAR_H_
-/*
- * The socket has not been marked as "do not offload"
- */
-#define	SO_OFFLOADABLE(so)	((so->so_options & SO_NO_OFFLOAD) == 0)
-
-static __inline int
-tcp_output_connect(struct socket *so, struct sockaddr *nam)
-{
-	struct tcpcb *tp = sototcpcb(so);
-	int error;
-
-	/*
-	 * If offload has been disabled for this socket or the 
-	 * connection cannot be offloaded just call tcp_output
-	 * to start the TCP state machine.
-	 */
-#ifndef TCP_OFFLOAD_DISABLE	
-	if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0)
-#endif		
-		error = tcp_output(tp);
-	return (error);
-}
-
-static __inline int
-tcp_output_send(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_send(tp));
 #endif
-	return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_rcvd(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_rcvd(tp));
-#endif
-	return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_disconnect(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_disconnect(tp));
-#endif
-	return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_reset(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_reset(tp));
-#endif
-	return (tcp_output(tp));
-}
-
-static __inline void
-tcp_offload_detach(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		tp->t_tu->tu_detach(tp);
-#endif	
-}
-
-static __inline void
-tcp_offload_listen_open(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
-		EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
-#endif	
-}
-
-static __inline void
-tcp_offload_listen_close(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
-#endif	
-}
-#undef SO_OFFLOADABLE
-#endif /* _SYS_SOCKETVAR_H_ */
-#undef tp_offload
-
-void tcp_offload_twstart(struct tcpcb *tp);
-struct tcpcb *tcp_offload_close(struct tcpcb *tp);
-struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error);
-
-#endif /* _NETINET_TCP_OFFLOAD_H_ */
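
The pared-down tcp_offload.h above leaves only plain function declarations; the old inline wrappers and the toe_usrreqs vector are gone. A rough sketch of the dispatch shape these declarations imply, assuming the tcp_offload.c implementations simply forward through the tp->tod field added to struct tcpcb later in this patch (the KASSERT text here is illustrative, not taken from the patch):

void
tcp_offload_rcvd(struct tcpcb *tp)
{
	struct toedev *tod = tp->tod;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	KASSERT(tod != NULL, ("%s: tp %p has no toedev", __func__, tp));

	tod->tod_rcvd(tod, tp);
}
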
diff -r 7cec8c20120e sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_output.c	Mon Jun 11 00:15:24 2012 -0700
@@ -75,6 +75,9 @@
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -191,6 +194,11 @@
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE)
+		return (tcp_offload_output(tp));
+#endif
+
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
diff -r 7cec8c20120e sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_subr.c	Mon Jun 11 00:15:24 2012 -0700
@@ -85,7 +85,6 @@
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
-#include <netinet/tcp_offload.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
@@ -96,6 +95,9 @@
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -824,7 +826,7 @@
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tp->t_state = TCPS_CLOSED;
-		(void) tcp_output_reset(tp);
+		(void) tcp_output(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
@@ -924,8 +926,12 @@
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
+
+#ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
-	tcp_offload_detach(tp);
+	if (tp->t_flags & TF_TOE)
+		tcp_offload_detach(tp);
+#endif
 		
 	tcp_free_sackholes(tp);
 
@@ -954,9 +960,10 @@
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
-	/* Notify any offload devices of listener close */
+#ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
-		tcp_offload_listen_close(tp);
+		tcp_offload_listen_stop(tp);
+#endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
@@ -1695,7 +1702,7 @@
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp->t_flags);
-	tcp_output_send(tp);
+	tcp_output(tp);
 	return (inp);
 }
 
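
tcp_discardcb() above now detaches TF_TOE connections explicitly. On the driver side, the matching tod_pcb_detach hook has to drop the driver's claim on the tcpcb; a sketch consistent with the KASSERT in toe_connect_failed() later in this patch, with struct mytoe_pcb and mytoe_pcb_free() invented for illustration:

static void
mytoe_pcb_detach(struct toedev *tod, struct tcpcb *tp)
{
	struct mytoe_pcb *toep = tp->t_toe;

	/* The stack owns the tcpcb again once TF_TOE is clear. */
	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	mytoe_pcb_free(tod->tod_softc, toep);
}
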
diff -r 7cec8c20120e sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_syncache.c	Mon Jun 11 00:15:24 2012 -0700
@@ -81,10 +81,12 @@
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
-#include <netinet/tcp_offload.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
+#ifdef TCP_OFFLOAD
+#include <netinet/toecore.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -110,10 +112,8 @@
     &VNET_NAME(tcp_syncookiesonly), 0,
     "Use only TCP SYN cookies");
 
-#ifdef TCP_OFFLOAD_DISABLE
-#define TOEPCB_ISSET(sc) (0)
-#else
-#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL)
+#ifdef TCP_OFFLOAD
+#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL)
 #endif
 
 static void	 syncache_drop(struct syncache *, struct syncache_head *);
@@ -332,6 +332,14 @@
 	TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length++;
 
+#ifdef TCP_OFFLOAD
+	if (ADDED_BY_TOE(sc)) {
+		struct toedev *tod = sc->sc_tod;
+
+		tod->tod_syncache_added(tod, sc->sc_todctx);
+	}
+#endif
+
 	/* Reinitialize the bucket row's timer. */
 	if (sch->sch_length == 1)
 		sch->sch_nextc = ticks + INT_MAX;
@@ -356,10 +364,14 @@
 	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length--;
 
-#ifndef TCP_OFFLOAD_DISABLE
-	if (sc->sc_tu)
-		sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb);
-#endif		    
+#ifdef TCP_OFFLOAD
+	if (ADDED_BY_TOE(sc)) {
+		struct toedev *tod = sc->sc_tod;
+
+		tod->tod_syncache_removed(tod, sc->sc_todctx);
+	}
+#endif
+
 	syncache_free(sc);
 	V_tcp_syncache.cache_count--;
 }
@@ -846,6 +858,18 @@
 	if (sc->sc_rxmits > 1)
 		tp->snd_cwnd = tp->t_maxseg;
 
+#ifdef TCP_OFFLOAD
+	/*
+	 * Allow a TOE driver to install its hooks.  Note that we hold the
+	 * pcbinfo lock too and that prevents tcp_usr_accept from accepting a
+	 * new connection before the TOE driver has done its thing.
+	 */
+	if (ADDED_BY_TOE(sc)) {
+		struct toedev *tod = sc->sc_tod;
+
+		tod->tod_offload_socket(tod, sc->sc_todctx, so);
+	}
+#endif
 	/*
 	 * Copy and activate timers.
 	 */
@@ -926,6 +950,13 @@
 		/* Pull out the entry to unlock the bucket row. */
 		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 		sch->sch_length--;
+#ifdef TCP_OFFLOAD
+		if (ADDED_BY_TOE(sc)) {
+			struct toedev *tod = sc->sc_tod;
+
+			tod->tod_syncache_removed(tod, sc->sc_todctx);
+		}
+#endif
 		V_tcp_syncache.cache_count--;
 		SCH_UNLOCK(sch);
 	}
@@ -934,7 +965,7 @@
 	 * Segment validation:
 	 * ACK must match our initial sequence number + 1 (the SYN|ACK).
 	 */
-	if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) {
+	if (th->th_ack != sc->sc_iss + 1) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
@@ -945,9 +976,8 @@
 	 * The SEQ must fall in the window starting at the received
 	 * initial receive sequence number + 1 (the SYN).
 	 */
-	if ((SEQ_LEQ(th->th_seq, sc->sc_irs) ||
-	    SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) &&
-	    !TOEPCB_ISSET(sc)) {
+	if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
+	    SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
@@ -964,8 +994,7 @@
 	 * If timestamps were negotiated the reflected timestamp
 	 * must be equal to what we actually sent in the SYN|ACK.
 	 */
-	if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts &&
-	    !TOEPCB_ISSET(sc)) {
+	if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, "
 			    "segment rejected\n",
@@ -993,25 +1022,6 @@
 	return (0);
 }
 
-int
-tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
-    struct tcphdr *th, struct socket **lsop, struct mbuf *m)
-{
-	struct tcpopt to;
-	int rc;
-
-	bzero(&to, sizeof(struct tcpopt));
-	to.to_mss = toeo->to_mss;
-	to.to_wscale = toeo->to_wscale;
-	to.to_flags = toeo->to_flags;
-	
-	INP_INFO_WLOCK(&V_tcbinfo);
-	rc = syncache_expand(inc, &to, th, lsop, m);
-	INP_INFO_WUNLOCK(&V_tcbinfo);
-
-	return (rc);
-}
-
 /*
  * Given a LISTEN socket and an inbound SYN request, add
  * this to the syn cache, and send back a segment:
@@ -1025,10 +1035,10 @@
  * consume all available buffer space if it were ACKed.  By not ACKing
  * the data, we avoid this DoS scenario.
  */
-static void
-_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
-    struct inpcb *inp, struct socket **lsop, struct mbuf *m,
-    struct toe_usrreqs *tu, void *toepcb)
+void
+syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+    struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod,
+    void *todctx)
 {
 	struct tcpcb *tp;
 	struct socket *so;
@@ -1114,11 +1124,6 @@
 	sc = syncache_lookup(inc, &sch);	/* returns locked entry */
 	SCH_LOCK_ASSERT(sch);
 	if (sc != NULL) {
-#ifndef TCP_OFFLOAD_DISABLE
-		if (sc->sc_tu)
-			sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT,
-			    sc->sc_toepcb);
-#endif		    
 		TCPSTAT_INC(tcps_sc_dupsyn);
 		if (ipopts) {
 			/*
@@ -1151,7 +1156,7 @@
 			    s, __func__);
 			free(s, M_TCPLOG);
 		}
-		if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) {
+		if (syncache_respond(sc) == 0) {
 			sc->sc_rxmits = 0;
 			syncache_timeout(sc, sch, 1);
 			TCPSTAT_INC(tcps_sndacks);
@@ -1202,9 +1207,9 @@
 		sc->sc_ip_tos = ip_tos;
 		sc->sc_ip_ttl = ip_ttl;
 	}
-#ifndef TCP_OFFLOAD_DISABLE	
-	sc->sc_tu = tu;
-	sc->sc_toepcb = toepcb;
+#ifdef TCP_OFFLOAD
+	sc->sc_tod = tod;
+	sc->sc_todctx = todctx;
 #endif
 	sc->sc_irs = th->th_seq;
 	sc->sc_iss = arc4random();
@@ -1299,7 +1304,7 @@
 	/*
 	 * Do a standard 3-way handshake.
 	 */
-	if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) {
+	if (syncache_respond(sc) == 0) {
 		if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
 			syncache_free(sc);
 		else if (sc != &scs)
@@ -1491,37 +1496,21 @@
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(tlen + optlen - hlen + IPPROTO_TCP));
+#ifdef TCP_OFFLOAD
+		if (ADDED_BY_TOE(sc)) {
+			struct toedev *tod = sc->sc_tod;
+
+			error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
+
+			return (error);
+		}
+#endif
 		error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
 	}
 #endif
 	return (error);
 }
 
-void
-syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
-    struct inpcb *inp, struct socket **lsop, struct mbuf *m)
-{
-	_syncache_add(inc, to, th, inp, lsop, m, NULL, NULL);
-}
-
-void
-tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo,
-    struct tcphdr *th, struct inpcb *inp, struct socket **lsop,
-    struct toe_usrreqs *tu, void *toepcb)
-{
-	struct tcpopt to;
-
-	bzero(&to, sizeof(struct tcpopt));
-	to.to_mss = toeo->to_mss;
-	to.to_wscale = toeo->to_wscale;
-	to.to_flags = toeo->to_flags;
-
-	INP_INFO_WLOCK(&V_tcbinfo);
-	INP_WLOCK(inp);
-
-	_syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb);
-}
-
 /*
  * The purpose of SYN cookies is to avoid keeping track of all SYN's we
  * receive and to be able to handle SYN floods from bogus source addresses
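
With the sc_tod/sc_todctx plumbing above, the syncache now calls back into the TOE driver at three points: entry added, entry removed, and SYN|ACK transmission. A sketch of the last hook for a hypothetical driver; struct mytoe_softc and mytoe_send_synack() are invented, and ctx is whatever the driver passed to syncache_add():

static int
mytoe_syncache_respond(struct toedev *tod, void *ctx, struct mbuf *m)
{
	struct mytoe_softc *sc = tod->tod_softc;

	/* The driver consumes the kernel-built SYN|ACK mbuf. */
	return (mytoe_send_synack(sc, ctx, m));
}
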
diff -r 7cec8c20120e sys/netinet/tcp_syncache.h
--- a/sys/netinet/tcp_syncache.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_syncache.h	Mon Jun 11 00:15:24 2012 -0700
@@ -34,8 +34,6 @@
 #define _NETINET_TCP_SYNCACHE_H_
 #ifdef _KERNEL
 
-struct toeopt;
-
 void	 syncache_init(void);
 #ifdef VIMAGE
 void	syncache_destroy(void);
@@ -43,14 +41,9 @@
 void	 syncache_unreach(struct in_conninfo *, struct tcphdr *);
 int	 syncache_expand(struct in_conninfo *, struct tcpopt *,
 	     struct tcphdr *, struct socket **, struct mbuf *);
-int	 tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
-             struct tcphdr *th, struct socket **lsop, struct mbuf *m);
 void	 syncache_add(struct in_conninfo *, struct tcpopt *,
-	     struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *);
-void	 tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *,
-             struct tcphdr *, struct inpcb *, struct socket **,
-             struct toe_usrreqs *tu, void *toepcb);
-
+	     struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *,
+	     void *, void *);
 void	 syncache_chkrst(struct in_conninfo *, struct tcphdr *);
 void	 syncache_badack(struct in_conninfo *);
 int	 syncache_pcbcount(void);
@@ -75,10 +68,10 @@
 	u_int8_t	sc_requested_s_scale:4,
 			sc_requested_r_scale:4;
 	u_int16_t	sc_flags;
-#ifndef TCP_OFFLOAD_DISABLE
-	struct toe_usrreqs *sc_tu;		/* TOE operations */
-	void		*sc_toepcb;		/* TOE protocol block */
-#endif			
+#if defined(TCP_OFFLOAD) || !defined(TCP_OFFLOAD_DISABLE)
+	struct toedev	*sc_tod;		/* entry added by this TOE */
+	void		*sc_todctx;		/* TOE driver context */
+#endif
 	struct label	*sc_label;		/* MAC label reference */
 	struct ucred	*sc_cred;		/* cred cache for jail checks */
 
diff -r 7cec8c20120e sys/netinet/tcp_timer.c
--- a/sys/netinet/tcp_timer.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_timer.c	Mon Jun 11 00:15:24 2012 -0700
@@ -602,6 +602,11 @@
 	struct inpcb *inp = tp->t_inpcb;
 	int cpu = INP_CPU(inp);
 
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE)
+		return;
+#endif
+
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
diff -r 7cec8c20120e sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_usrreq.c	Mon Jun 11 00:15:24 2012 -0700
@@ -87,7 +87,9 @@
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
+#endif
 
 /*
  * TCP protocol interface to socket abstraction.
@@ -367,7 +369,9 @@
 	if (error == 0) {
 		tp->t_state = TCPS_LISTEN;
 		solisten_proto(so, backlog);
-		tcp_offload_listen_open(tp);
+#ifdef TCP_OFFLOAD
+		tcp_offload_listen_start(tp);
+#endif
 	}
 	SOCK_UNLOCK(so);
 
@@ -409,6 +413,9 @@
 	if (error == 0) {
 		tp->t_state = TCPS_LISTEN;
 		solisten_proto(so, backlog);
+#ifdef TCP_OFFLOAD
+		tcp_offload_listen_start(tp);
+#endif
 	}
 	SOCK_UNLOCK(so);
 
@@ -459,7 +466,13 @@
 	TCPDEBUG1();
 	if ((error = tcp_connect(tp, nam, td)) != 0)
 		goto out;
-	error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+	if (registered_toedevs > 0 &&
+	    (error = tcp_offload_connect(so, nam)) == 0)
+		goto out;
+#endif
+	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+	error = tcp_output(tp);
 out:
 	TCPDEBUG2(PRU_CONNECT);
 	INP_WUNLOCK(inp);
@@ -519,7 +532,12 @@
 			goto out;
 		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
 			goto out;
-		error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+		if (registered_toedevs > 0 &&
+		    (error = tcp_offload_connect(so, nam)) == 0)
+			goto out;
+#endif
+		error = tcp_output(tp);
 		goto out;
 	}
 #endif
@@ -530,7 +548,13 @@
 		goto out;
 	if ((error = tcp6_connect(tp, nam, td)) != 0)
 		goto out;
-	error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+	if (registered_toedevs > 0 &&
+	    (error = tcp_offload_connect(so, nam)) == 0)
+		goto out;
+#endif
+	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+	error = tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_CONNECT);
@@ -709,7 +733,7 @@
 	socantsendmore(so);
 	tcp_usrclosed(tp);
 	if (!(inp->inp_flags & INP_DROPPED))
-		error = tcp_output_disconnect(tp);
+		error = tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_SHUTDOWN);
@@ -739,7 +763,11 @@
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
-	tcp_output_rcvd(tp);
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE)
+		tcp_offload_rcvd(tp);
+#endif
+	tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_RCVD);
@@ -835,7 +863,7 @@
 		if (!(inp->inp_flags & INP_DROPPED)) {
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags |= TF_MORETOCOME;
-			error = tcp_output_send(tp);
+			error = tcp_output(tp);
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags &= ~TF_MORETOCOME;
 		}
@@ -884,7 +912,7 @@
 		}
 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
 		tp->t_flags |= TF_FORCEDATA;
-		error = tcp_output_send(tp);
+		error = tcp_output(tp);
 		tp->t_flags &= ~TF_FORCEDATA;
 	}
 out:
@@ -1119,7 +1147,6 @@
 	soisconnecting(so);
 	TCPSTAT_INC(tcps_connattempt);
 	tp->t_state = TCPS_SYN_SENT;
-	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	tp->iss = tcp_new_isn(tp);
 	tcp_sendseqinit(tp);
 
@@ -1192,7 +1219,6 @@
 	soisconnecting(so);
 	TCPSTAT_INC(tcps_connattempt);
 	tp->t_state = TCPS_SYN_SENT;
-	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	tp->iss = tcp_new_isn(tp);
 	tcp_sendseqinit(tp);
 
@@ -1323,9 +1349,9 @@
 				tp->t_flags |= TF_SIGNATURE;
 			else
 				tp->t_flags &= ~TF_SIGNATURE;
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 #endif /* TCP_SIGNATURE */
+
 		case TCP_NODELAY:
 		case TCP_NOOPT:
 			INP_WUNLOCK(inp);
@@ -1351,6 +1377,13 @@
 				tp->t_flags |= opt;
 			else
 				tp->t_flags &= ~opt;
+unlock_and_done:
+#ifdef TCP_OFFLOAD
+			if (tp->t_flags & TF_TOE) {
+				tcp_offload_ctloutput(tp, sopt->sopt_dir,
+				    sopt->sopt_name);
+			}
+#endif
 			INP_WUNLOCK(inp);
 			break;
 
@@ -1369,8 +1402,7 @@
 				if (TCPS_HAVEESTABLISHED(tp->t_state))
 					error = tcp_output(tp);
 			}
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		case TCP_MAXSEG:
 			INP_WUNLOCK(inp);
@@ -1385,8 +1417,7 @@
 				tp->t_maxseg = optval;
 			else
 				error = EINVAL;
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		case TCP_INFO:
 			INP_WUNLOCK(inp);
@@ -1438,8 +1469,7 @@
 				}
 			}
 			CC_LIST_RUNLOCK();
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
@@ -1491,8 +1521,7 @@
 					    TP_KEEPINIT(tp));
 				break;
 			}
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		default:
 			INP_WUNLOCK(inp);
@@ -1635,7 +1664,7 @@
 		sbflush(&so->so_rcv);
 		tcp_usrclosed(tp);
 		if (!(inp->inp_flags & INP_DROPPED))
-			tcp_output_disconnect(tp);
+			tcp_output(tp);
 	}
 }
 
@@ -1658,7 +1687,9 @@
 
 	switch (tp->t_state) {
 	case TCPS_LISTEN:
-		tcp_offload_listen_close(tp);
+#ifdef TCP_OFFLOAD
+		tcp_offload_listen_stop(tp);
+#endif
 		/* FALLTHROUGH */
 	case TCPS_CLOSED:
 		tp->t_state = TCPS_CLOSED;
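
The unlock_and_done label above lets TF_TOE connections mirror socket-option changes into the hardware through tcp_offload_ctloutput(). A sketch of the driver's side; mytoe_set_nagle() is invented for illustration:

static void
mytoe_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
{
	if (dir == SOPT_SET && name == TCP_NODELAY) {
		/* TF_NODELAY set means Nagle is off. */
		mytoe_set_nagle(tod->tod_softc, tp->t_toe,
		    (tp->t_flags & TF_NODELAY) == 0);
	}
}
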
diff -r 7cec8c20120e sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/netinet/tcp_var.h	Mon Jun 11 00:15:24 2012 -0700
@@ -194,7 +194,7 @@
 	int	t_rttlow;		/* smallest observed RTT */
 	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
 	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
-	struct toe_usrreqs *t_tu;	/* offload operations vector */
+	struct toedev	*tod;		/* toedev handling this connection */
 	int	t_sndrexmitpack;	/* retransmit packets sent */
 	int	t_rcvoopack;		/* out-of-order packets received */
 	void	*t_toe;			/* TOE pcb pointer */
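
The t_tu operations vector gives way to a direct handle on the owning toedev; together with the existing t_toe pointer and the TF_TOE flag this is all the per-connection state the stack tracks for an offloaded connection. A sketch of how a driver claims a connection, with struct mytoe_pcb invented:

static void
mytoe_claim(struct toedev *tod, struct tcpcb *tp, struct mytoe_pcb *toep)
{
	INP_WLOCK_ASSERT(tp->t_inpcb);

	tp->tod = tod;		/* the toedev handling this connection */
	tp->t_toe = toep;	/* driver-private state */
	tp->t_flags |= TF_TOE;	/* reroutes tcp_output() and friends */
}
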
diff -r 7cec8c20120e sys/netinet/toecore.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/netinet/toecore.c	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,560 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/types.h>
+#include <sys/sockopt.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#include <net/if_llatbl.h>
+#include <net/route.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/in_pcb.h>
+#define TCPSTATES
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_syncache.h>
+#include <netinet/tcp_offload.h>
+#include <netinet/toecore.h>
+
+static struct mtx toedev_lock;
+static TAILQ_HEAD(, toedev) toedev_list;
+static eventhandler_tag listen_start_eh, listen_stop_eh;
+static eventhandler_tag lle_event_eh, route_redirect_eh;
+
+static int
+toedev_connect(struct toedev *tod __unused, struct socket *so __unused,
+    struct rtentry *rt __unused, struct sockaddr *nam __unused)
+{
+
+	return (ENOTSUP);
+}
+
+static int
+toedev_listen_start(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return (ENOTSUP);
+}
+
+static int
+toedev_listen_stop(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return (ENOTSUP);
+}
+
+static void
+toedev_input(struct toedev *tod __unused, struct tcpcb *tp __unused,
+    struct mbuf *m)
+{
+
+	m_freem(m);
+	return;
+}
+
+static void
+toedev_rcvd(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return;
+}
+
+static int
+toedev_output(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return (ENOTSUP);
+}
+
+static void
+toedev_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return;
+}
+
+static void
+toedev_l2_update(struct toedev *tod __unused, struct ifnet *ifp __unused,
+    struct sockaddr *sa __unused, uint8_t *lladdr __unused,
+    uint16_t vtag __unused)
+{
+
+	return;
+}
+
+static void
+toedev_route_redirect(struct toedev *tod __unused, struct ifnet *ifp __unused,
+    struct rtentry *rt0 __unused, struct rtentry *rt1 __unused)
+{
+
+	return;
+}
+
+static void
+toedev_syncache_added(struct toedev *tod __unused, void *ctx __unused)
+{
+
+	return;
+}
+
+static void
+toedev_syncache_removed(struct toedev *tod __unused, void *ctx __unused)
+{
+
+	return;
+}
+
+static int
+toedev_syncache_respond(struct toedev *tod __unused, void *ctx __unused,
+    struct mbuf *m)
+{
+
+	m_freem(m);
+	return (0);
+}
+
+static void
+toedev_offload_socket(struct toedev *tod __unused, void *ctx __unused,
+    struct socket *so __unused)
+{
+
+	return;
+}
+
+static void
+toedev_ctloutput(struct toedev *tod __unused, struct tcpcb *tp __unused,
+    int sopt_dir __unused, int sopt_name __unused)
+{
+
+	return;
+}
+
+/*
+ * Inform one or more TOE devices about a listening socket.
+ */
+static void
+toe_listen_start(struct inpcb *inp, void *arg)
+{
+	struct toedev *t = arg, *tod;
+	struct tcpcb *tp;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(inp->inp_pcbinfo == &V_tcbinfo,
+	    ("%s: inp is not a TCP inp", __func__));
+
+	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
+		return;
+
+	tp = intotcpcb(inp);
+	if (tp->t_state != TCPS_LISTEN)
+		return;
+
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH(tod, &toedev_list, link) {
+		if (t == NULL || t == tod)
+			tod->tod_listen_start(tod, tp);
+	}
+	mtx_unlock(&toedev_lock);
+}
+
+static void
+toe_listen_start_event(void *arg __unused, struct tcpcb *tp)
+{
+	struct inpcb *inp = tp->t_inpcb;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(tp->t_state == TCPS_LISTEN,
+	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
+
+	toe_listen_start(inp, NULL);
+}
+
+static void
+toe_listen_stop_event(void *arg __unused, struct tcpcb *tp)
+{
+	struct toedev *tod;
+#ifdef INVARIANTS
+	struct inpcb *inp = tp->t_inpcb;
+#endif
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(tp->t_state == TCPS_LISTEN,
+	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
+
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH(tod, &toedev_list, link)
+	    tod->tod_listen_stop(tod, tp);
+	mtx_unlock(&toedev_lock);
+}
+
+/*
+ * Fill up a freshly allocated toedev struct with reasonable defaults.
+ */
+void
+init_toedev(struct toedev *tod)
+{
+
+	tod->tod_softc = NULL;
+
+	/*
+	 * Provide no-op defaults so that the kernel can call any toedev
+	 * function without having to check whether the TOE driver supplied one
+	 * or not.
+	 */
+	tod->tod_connect = toedev_connect;
+	tod->tod_listen_start = toedev_listen_start;
+	tod->tod_listen_stop = toedev_listen_stop;
+	tod->tod_input = toedev_input;
+	tod->tod_rcvd = toedev_rcvd;
+	tod->tod_output = toedev_output;
+	tod->tod_send_rst = toedev_output;
+	tod->tod_send_fin = toedev_output;
+	tod->tod_pcb_detach = toedev_pcb_detach;
+	tod->tod_l2_update = toedev_l2_update;
+	tod->tod_route_redirect = toedev_route_redirect;
+	tod->tod_syncache_added = toedev_syncache_added;
+	tod->tod_syncache_removed = toedev_syncache_removed;
+	tod->tod_syncache_respond = toedev_syncache_respond;
+	tod->tod_offload_socket = toedev_offload_socket;
+	tod->tod_ctloutput = toedev_ctloutput;
+}
+
+/*
+ * Register an active TOE device with the system.  This allows it to receive
+ * notifications from the kernel.
+ */
+int
+register_toedev(struct toedev *tod)
+{
+	struct toedev *t;
+
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH(t, &toedev_list, link) {
+		if (t == tod) {
+			mtx_unlock(&toedev_lock);
+			return (EEXIST);
+		}
+	}
+
+	TAILQ_INSERT_TAIL(&toedev_list, tod, link);
+	registered_toedevs++;
+	mtx_unlock(&toedev_lock);
+
+	inp_apply_all(toe_listen_start, tod);
+
+	return (0);
+}
+
+/*
+ * Remove the TOE device from the global list of active TOE devices.  It is the
+ * caller's responsibility to ensure that the TOE device is quiesced prior to
+ * this call.
+ */
+int
+unregister_toedev(struct toedev *tod)
+{
+	struct toedev *t, *t2;
+	int rc = ENODEV;
+
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH_SAFE(t, &toedev_list, link, t2) {
+		if (t == tod) {
+			TAILQ_REMOVE(&toedev_list, tod, link);
+			registered_toedevs--;
+			rc = 0;
+			break;
+		}
+	}
+	KASSERT(registered_toedevs >= 0,
+	    ("%s: registered_toedevs < 0", __func__));
+	mtx_unlock(&toedev_lock);
+	return (rc);
+}
+
+void
+toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+    struct inpcb *inp, void *tod, void *todctx)
+{
+	struct socket *lso = inp->inp_socket;
+
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(inp);
+
+	syncache_add(inc, to, th, inp, &lso, NULL, tod, todctx);
+}
+
+int
+toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
+    struct tcphdr *th, struct socket **lsop)
+{
+
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+	return (syncache_expand(inc, to, th, lsop, NULL));
+}
+
+/*
+ * General purpose check to see if a 4-tuple is in use by the kernel.  If a TCP
+ * header (presumably for an incoming SYN) is also provided, an existing 4-tuple
+ * in TIME_WAIT may be assassinated, freeing it up for re-use.
+ *
+ * Note that the TCP header must have been run through tcp_fields_to_host() or
+ * equivalent.
+ */
+int
+toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
+{
+	struct inpcb *inp;
+
+	if (inc->inc_flags & INC_ISIPV6)
+		return (ENOSYS);	/* XXX: implement */
+
+	inp = in_pcblookup(&V_tcbinfo, inc->inc_faddr, inc->inc_fport,
+	    inc->inc_laddr, inc->inc_lport, INPLOOKUP_WLOCKPCB, ifp);
+	if (inp != NULL) {
+		INP_WLOCK_ASSERT(inp);
+
+		if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) {
+
+			INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* for twcheck */
+			if (!tcp_twcheck(inp, NULL, th, NULL, 0))
+				return (EADDRINUSE);
+		} else {
+			INP_WUNLOCK(inp);
+			return (EADDRINUSE);
+		}
+	}
+
+	return (0);
+}
+
+static void
+toe_lle_event(void *arg __unused, struct llentry *lle, int evt)
+{
+	struct toedev *tod;
+	struct ifnet *ifp;
+	uint8_t *lladdr;
+	uint16_t vtag = 0xfff;
+	struct sockaddr *sa;
+
+	LLE_WLOCK_ASSERT(lle);
+
+	ifp = lle->lle_tbl->llt_ifp;
+	sa = L3_ADDR(lle);
+
+	KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6,
+	    ("%s: lle_event but sa !INET && !INET6", __func__));
+
+	/*
+	 * Not interested if the interface's TOE capability is not enabled.
+	 */
+	if ((sa->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) ||
+	    (sa->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6)))
+		return;
+
+	tod = TOEDEV(ifp);
+	if (tod == NULL)
+		return;
+
+	if (evt != LLENTRY_RESOLVED) {
+
+		/*
+		 * LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED all mean
+		 * this entry is going to be deleted.
+		 */
+
+		lladdr = NULL;
+	} else {
+
+		KASSERT(lle->la_flags & LLE_VALID,
+		    ("%s: %p resolved but not valid?", __func__, lle));
+
+		lladdr = (uint8_t *)&lle->ll_addr;
+#ifdef VLAN_TAG
+		VLAN_TAG(ifp, &vtag);
+#endif
+	}
+
+	tod->tod_l2_update(tod, ifp, sa, lladdr, vtag);
+}
+
+/*
+ * XXX: implement.
+ */
+static void
+toe_route_redirect_event(void *arg __unused, struct rtentry *rt0,
+    struct rtentry *rt1, struct sockaddr *sa)
+{
+	return;
+}
+
+/*
+ * Returns 0 or EWOULDBLOCK on success (any other value is an error).  0 means
+ * lladdr and vtag are valid on return, EWOULDBLOCK means the TOE driver's
+ * tod_l2_update will be called later, when the entry is resolved or times out.
+ */
+int
+toe_l2_resolve(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
+    uint8_t *lladdr, uint16_t *vtag)
+{
+	struct llentry *lle;
+	int rc;
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		rc = arpresolve(ifp, NULL, NULL, sa, lladdr, &lle);
+		break;
+	case AF_INET6:
+		rc = nd6_storelladdr(ifp, NULL, sa, lladdr, &lle);
+		break;
+	default:
+		return (EPROTONOSUPPORT);
+	}
+
+	if (rc == 0) {
+#ifdef VLAN_TAG
+		if (VLAN_TAG(ifp, vtag) != 0)
+#endif
+			*vtag = 0xfff;
+	}
+
+	return (rc);
+}
+
+void
+toe_connect_failed(struct toedev *tod, struct tcpcb *tp, int err)
+{
+	struct inpcb *inp = tp->t_inpcb;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(tp->t_flags & TF_TOE,
+	    ("%s: tp %p not offloaded.", __func__, tp));
+
+	if (!(inp->inp_flags & INP_DROPPED)) {
+		if (err == EAGAIN) {
+
+			/*
+			 * Temporary failure during offload, take this PCB back.
+			 * Detach from the TOE driver and do the rest of what
+			 * TCP's pru_connect would have done if the connection
+			 * wasn't offloaded.
+			 */
+
+			tod->tod_pcb_detach(tod, tp);
+			KASSERT(!(tp->t_flags & TF_TOE),
+			    ("%s: tp %p still offloaded.", __func__, tp));
+			tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+			(void) tcp_output(tp);
+		} else {
+
+			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+			tp = tcp_drop(tp, err);
+			if (tp == NULL)
+				INP_WLOCK(inp);	/* re-acquire */
+		}
+	}
+	INP_WLOCK_ASSERT(inp);
+}
+
+static int
+toecore_load(void)
+{
+
+	mtx_init(&toedev_lock, "toedev lock", NULL, MTX_DEF);
+	TAILQ_INIT(&toedev_list);
+
+	listen_start_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
+	    toe_listen_start_event, NULL, EVENTHANDLER_PRI_ANY);
+	listen_stop_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_stop,
+	    toe_listen_stop_event, NULL, EVENTHANDLER_PRI_ANY);
+	lle_event_eh = EVENTHANDLER_REGISTER(lle_event, toe_lle_event, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	route_redirect_eh = EVENTHANDLER_REGISTER(route_redirect_event,
+	    toe_route_redirect_event, NULL, EVENTHANDLER_PRI_ANY);
+
+	return (0);
+}
+
+static int
+toecore_unload(void)
+{
+
+	mtx_lock(&toedev_lock);
+	if (!TAILQ_EMPTY(&toedev_list)) {
+		mtx_unlock(&toedev_lock);
+		return (EBUSY);
+	}
+
+	EVENTHANDLER_DEREGISTER(tcp_offload_listen_start, listen_start_eh);
+	EVENTHANDLER_DEREGISTER(tcp_offload_listen_stop, listen_stop_eh);
+	EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
+	EVENTHANDLER_DEREGISTER(route_redirect_event, route_redirect_eh);
+
+	mtx_unlock(&toedev_lock);
+	mtx_destroy(&toedev_lock);
+
+	return (0);
+}
+
+static int
+toecore_mod_handler(module_t mod, int cmd, void *arg)
+{
+
+	if (cmd == MOD_LOAD)
+		return (toecore_load());
+
+	if (cmd == MOD_UNLOAD)
+		return (toecore_unload());
+
+	return (0);
+}
+
+static moduledata_t mod_data = {
+	"toecore",
+	toecore_mod_handler,
+	0
+};
+
+MODULE_VERSION(toecore, 1);
+DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
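
Of the helpers above, toe_l2_resolve() has the least obvious contract: 0 means lladdr and vtag are usable immediately, EWOULDBLOCK means the answer arrives later through tod_l2_update. A sketch of a driver consuming it during active open; mytoe_program_l2e() is invented:

static int
mytoe_resolve_l2(struct toedev *tod, struct ifnet *ifp, struct sockaddr *nam)
{
	uint8_t lladdr[ETHER_ADDR_LEN];
	uint16_t vtag;
	int rc;

	rc = toe_l2_resolve(tod, ifp, nam, lladdr, &vtag);
	if (rc == 0) {
		/* vtag == 0xfff means no VLAN tag. */
		return (mytoe_program_l2e(tod->tod_softc, lladdr, vtag));
	}
	if (rc == EWOULDBLOCK)
		return (0);	/* tod_l2_update will finish the job */
	return (rc);
}
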
diff -r 7cec8c20120e sys/netinet/toecore.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/netinet/toecore.h	Mon Jun 11 00:15:24 2012 -0700
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _NETINET_TOE_H_
+#define _NETINET_TOE_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+struct tcpopt;
+struct tcphdr;
+struct in_conninfo;
+
+/* Get the toedev associated with an ifnet. */
+#define	TOEDEV(ifp)	((ifp)->if_llsoftc)
+
+struct toedev {
+	TAILQ_ENTRY(toedev) link;	/* glue for toedev_list */
+	void *tod_softc;		/* TOE driver private data */
+
+	/* Active open. */
+	int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *,
+	    struct sockaddr *);
+
+	/* Passive open. */
+	int (*tod_listen_start)(struct toedev *, struct tcpcb *);
+	int (*tod_listen_stop)(struct toedev *, struct tcpcb *);
+
+	/* Frame received by kernel for an offloaded connection */
+	void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *);
+
+	/* Some data read */
+	void (*tod_rcvd)(struct toedev *, struct tcpcb *);
+
+	/* Output data, if any is waiting to be sent out. */
+	int (*tod_output)(struct toedev *, struct tcpcb *);
+
+	/* Immediate teardown, send RST to peer */
+	int (*tod_send_rst)(struct toedev *, struct tcpcb *);
+
+	/* Orderly disconnect, send FIN to the peer */
+	int (*tod_send_fin)(struct toedev *, struct tcpcb *);
+
+	/* Kernel is done with the TCP PCB */
+	void (*tod_pcb_detach)(struct toedev *, struct tcpcb *);
+
+	/* Information about an L2 entry is now available. */
+	void (*tod_l2_update)(struct toedev *, struct ifnet *,
+	    struct sockaddr *, uint8_t *, uint16_t);
+
+	/* XXX.  Route has been redirected. */
+	void (*tod_route_redirect)(struct toedev *, struct ifnet *,
+	    struct rtentry *, struct rtentry *);
+
+	/* Syncache interaction. */
+	void (*tod_syncache_added)(struct toedev *, void *);
+	void (*tod_syncache_removed)(struct toedev *, void *);
+	int (*tod_syncache_respond)(struct toedev *, void *, struct mbuf *);
+	void (*tod_offload_socket)(struct toedev *, void *, struct socket *);
+
+	/* TCP socket option */
+	void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int);
+};
+
+#include <sys/eventhandler.h>
+typedef	void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
+typedef	void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
+EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
+EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
+
+void init_toedev(struct toedev *);
+int  register_toedev(struct toedev *);
+int  unregister_toedev(struct toedev *);
+
+/*
+ * General interface for looking up L2 information for an IP or IPv6 address.
+ * If an answer is not available right away then the TOE driver's tod_l2_update
+ * will be called later.
+ */
+int toe_l2_resolve(struct toedev *, struct ifnet *, struct sockaddr *,
+    uint8_t *, uint16_t *);
+
+void toe_connect_failed(struct toedev *, struct tcpcb *, int);
+
+void toe_syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *,
+    struct inpcb *, void *, void *);
+int  toe_syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *,
+    struct socket **);
+
+int toe_4tuple_check(struct in_conninfo *, struct tcphdr *, struct ifnet *);
+#endif /* _NETINET_TOE_H_ */
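
Taken together with init_toedev()'s no-op defaults, registration reduces to filling in a struct toedev, pointing if_llsoftc at it, and calling register_toedev(), which also replays existing listening sockets through tod_listen_start. A sketch under those assumptions; all mytoe_* names are invented:

static int
mytoe_attach(struct mytoe_softc *sc, struct ifnet *ifp)
{
	struct toedev *tod = &sc->tod;

	init_toedev(tod);		/* installs safe no-op defaults */
	tod->tod_softc = sc;
	tod->tod_connect = mytoe_connect;
	tod->tod_listen_start = mytoe_listen_start;
	tod->tod_listen_stop = mytoe_listen_stop;
	tod->tod_pcb_detach = mytoe_pcb_detach;
	/* Override the remaining hooks as the hardware allows. */

	ifp->if_llsoftc = tod;		/* so that TOEDEV(ifp) finds it */
	return (register_toedev(tod));
}
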
diff -r 7cec8c20120e sys/netinet/toedev.h
--- a/sys/netinet/toedev.h	Sun Jun 10 23:57:43 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,162 +0,0 @@
-/*-
- * Copyright (c) 2007, Chelsio Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _NETINET_TOEDEV_H_
-#define	_NETINET_TOEDEV_H_
-
-#ifndef _KERNEL
-#error "no user-serviceable parts inside"
-#endif
-
-extern uint32_t toedev_registration_count;
-
-/* Parameter values for offload_get_phys_egress(). */
-enum {
-	TOE_OPEN,
-	TOE_FAILOVER,
-};
-
-/* Parameter values for toe_failover(). */
-enum {
-	TOE_ACTIVE_SLAVE,
-	TOE_LINK_DOWN,
-	TOE_LINK_UP,
-	TOE_RELEASE,
-	TOE_RELEASE_ALL,
-};
-
-#define	TOENAMSIZ	16
-
-/* Get the toedev associated with a ifnet. */
-#define	TOEDEV(ifp)	((ifp)->if_llsoftc)
-
-struct offload_id {
-	unsigned int	id;
-	unsigned long	data;
-};
-
-struct ifnet;
-struct rt_entry;
-struct tom_info;
-struct sysctl_oid;
-struct socket;
-struct mbuf;
-
-struct toedev {
-	TAILQ_ENTRY(toedev) entry;  
-	char 		tod_name[TOENAMSIZ];	/* TOE device name */
-	unsigned int 	tod_ttid;		/* TOE type id */
-	unsigned long 	tod_flags;		/* device flags */
-	unsigned int	tod_mtu;		/* max TX offloaded data */
-	unsigned int	tod_nconn;		/* max # of offloaded
-						 * connections
-						 */
-	struct ifnet 	*tod_lldev;   		/* first interface */
-	const struct tom_info *tod_offload_mod; /* TCP offload module */
-
-	/*
-	 * This TOE device is capable of offloading the connection for socket so
-	 */
-	int	(*tod_can_offload)(struct toedev *dev, struct socket *so);
-
-	/*
-	 * Establish a connection to nam using the TOE device dev
-	 */
-	int	(*tod_connect)(struct toedev *dev, struct socket *so,
-	        struct rtentry *rt, struct sockaddr *nam);
-	/*
-	 * Send an mbuf down to the toe device 
-	 */
-	int	(*tod_send)(struct toedev *dev, struct mbuf *m);
-	/*
-	 * Receive an array of mbufs from the TOE device dev 
-	 */
-	int	(*tod_recv)(struct toedev *dev, struct mbuf **m, int n);
-	/*
-	 * Device specific ioctl interface
-	 */
-	int	(*tod_ctl)(struct toedev *dev, unsigned int req, void *data);
-	/*
-	 * Update L2 entry in toedev 
-	 */
-	void	(*tod_arp_update)(struct toedev *dev, struct rtentry *neigh);
-	/*
-	 * Failover from one toe device to another
-	 */
-	void	(*tod_failover)(struct toedev *dev, struct ifnet *bond_ifp,
-			 struct ifnet *ndev, int event);
-	void	*tod_priv;			/* driver private data */
-	void 	*tod_l2opt;			/* optional layer 2 data */
-	void	*tod_l3opt; 			/* optional layer 3 data */
-	void 	*tod_l4opt;			/* optional layer 4 data */
-	void 	*tod_ulp;			/* upper lever protocol */
-};
-
-struct tom_info {
-	TAILQ_ENTRY(tom_info)	entry;
-	int		(*ti_attach)(struct toedev *dev,
-	                             const struct offload_id *entry);
-	int		(*ti_detach)(struct toedev *dev);
-	const char	*ti_name;
-	const struct offload_id	*ti_id_table;
-};
-
-static __inline void
-init_offload_dev(struct toedev *dev)
-{
-}
-
-int	register_tom(struct tom_info *t);
-int	unregister_tom(struct tom_info *t);
-int	register_toedev(struct toedev *dev, const char *name);
-int	unregister_toedev(struct toedev *dev);
-int	activate_offload(struct toedev *dev);
-int	toe_send(struct toedev *dev, struct mbuf *m);
-void	toe_arp_update(struct rtentry *rt);
-struct ifnet	*offload_get_phys_egress(struct ifnet *ifp,
-        struct socket *so, int context);
-int 	toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n);
-
-static __inline void
-toe_neigh_update(struct ifnet *ifp)
-{
-}
-
-static __inline void
-toe_failover(struct ifnet *bond_ifp, struct ifnet *fail_ifp, int event)
-{
-}
-
-static __inline int
-toe_enslave(struct ifnet *bond_ifp, struct ifnet *slave_ifp)
-{
-	return (0);
-}
-
-#endif /* _NETINET_TOEDEV_H_ */
diff -r 7cec8c20120e sys/ofed/drivers/infiniband/core/cma.c
--- a/sys/ofed/drivers/infiniband/core/cma.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/ofed/drivers/infiniband/core/cma.c	Mon Jun 11 00:15:24 2012 -0700
@@ -59,10 +59,10 @@
 module_param_named(tavor_quirk, tavor_quirk, int, 0644);
 MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0");
 
-int unify_tcp_port_space = 0;
+int unify_tcp_port_space = 1;
 module_param(unify_tcp_port_space, int, 0644);
 MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
-		 "space allocation (default=0)");
+		 "space allocation (default=1)");
 
 #define CMA_CM_RESPONSE_TIMEOUT 20
 #define CMA_MAX_CM_RETRIES 15
@@ -1478,6 +1478,7 @@
 	struct sockaddr_in *sin;
 
 	id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device,
+					    id_priv->sock,
 					    iw_conn_req_handler,
 					    id_priv);
 	if (IS_ERR(id_priv->cm_id.iw))
@@ -2055,7 +2056,16 @@
 				((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
 		}
 	}
-	return rdma_bind_addr(id, src_addr);
+	if (!cma_any_addr(src_addr))
+		return rdma_bind_addr(id, src_addr);
+	else {
+		struct sockaddr_in addr_in;
+
+		memset(&addr_in, 0, sizeof addr_in);
+		addr_in.sin_family = dst_addr->sa_family;
+		addr_in.sin_len = sizeof addr_in;
+		return rdma_bind_addr(id, (struct sockaddr *) &addr_in);
+	}
 }
 
 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
@@ -2247,6 +2257,7 @@
 		sock_release(sock);
 		return ret;
 	}
+
 	size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
 	ret = sock_getname(sock,
 			(struct sockaddr *) &id_priv->id.route.addr.src_addr,
@@ -2255,6 +2266,7 @@
 		sock_release(sock);
 		return ret;
 	}
+
 	id_priv->sock = sock;
 	return 0;
 }
@@ -2604,7 +2616,8 @@
 	int ret;
 	struct iw_cm_conn_param iw_param;
 
-	cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+	cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock,
+				cma_iw_handler, id_priv);
 	if (IS_ERR(cm_id)) {
 		ret = PTR_ERR(cm_id);
 		goto out;
diff -r 7cec8c20120e sys/ofed/drivers/infiniband/core/iwcm.c
--- a/sys/ofed/drivers/infiniband/core/iwcm.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/ofed/drivers/infiniband/core/iwcm.c	Mon Jun 11 00:15:24 2012 -0700
@@ -189,6 +189,7 @@
 static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
 
 struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 struct socket *so,
 				 iw_cm_handler cm_handler,
 				 void *context)
 {
@@ -205,6 +206,7 @@
 	cm_id_priv->id.event_handler = cm_event_handler;
 	cm_id_priv->id.add_ref = add_ref;
 	cm_id_priv->id.rem_ref = rem_ref;
+	cm_id_priv->id.so = so;
 	spin_lock_init(&cm_id_priv->lock);
 	atomic_set(&cm_id_priv->refcount, 1);
 	init_waitqueue_head(&cm_id_priv->connect_wait);
@@ -629,6 +631,7 @@
 	spin_unlock_irqrestore(&listen_id_priv->lock, flags);
 
 	cm_id = iw_create_cm_id(listen_id_priv->id.device,
+				iw_event->so,
 				listen_id_priv->id.cm_handler,
 				listen_id_priv->id.context);
 	/* If the cm_id could not be created, ignore the request */
diff -r 7cec8c20120e sys/ofed/include/linux/net.h
--- a/sys/ofed/include/linux/net.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/ofed/include/linux/net.h	Mon Jun 11 00:15:24 2012 -0700
@@ -48,12 +48,12 @@
 	int error;
 
 	nam = NULL;
-	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
-		return (-ENOTCONN);
+	if (peer) {
+		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
+			return (-ENOTCONN);
 
-	if (peer)
 		error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, nam);
-	else
+	} else
 		error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, nam);
 	if (error)
 		return (-error);
diff -r 7cec8c20120e sys/ofed/include/rdma/iw_cm.h
--- a/sys/ofed/include/rdma/iw_cm.h	Sun Jun 10 23:57:43 2012 -0700
+++ b/sys/ofed/include/rdma/iw_cm.h	Mon Jun 11 00:15:24 2012 -0700
@@ -63,6 +63,7 @@
 	void *private_data;
 	u8 private_data_len;
 	void *provider_data;
+	struct socket *so;
 };
 
 /**
@@ -98,6 +99,7 @@
 	/* Used by provider to add and remove refs on IW cm_id */
 	void (*add_ref)(struct iw_cm_id *);
 	void (*rem_ref)(struct iw_cm_id *);
+	struct socket           *so;
 };
 
 struct iw_cm_conn_param {
@@ -139,7 +141,7 @@
  *   returned IW CM identifier.
  * @context: User specified context associated with the id.
  */
-struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so,
 				 iw_cm_handler cm_handler, void *context);
 
 /**
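
The new struct socket * parameter threads the TCP socket through the iWARP CM so that providers, and the TOE machinery underneath them, can reach the connection state for an endpoint. A sketch of a caller under the new signature; the wrapper and my_event_handler are invented:

static struct iw_cm_id *
my_create_ep(struct ib_device *device, struct socket *so, void *ctx)
{
	struct iw_cm_id *cm_id;

	/* The socket is stored in cm_id->so for the provider's use. */
	cm_id = iw_create_cm_id(device, so, my_event_handler, ctx);

	return (cm_id);	/* PTR_ERR-encoded on failure, as before */
}
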
diff -r 7cec8c20120e usr.bin/netstat/inet.c
--- a/usr.bin/netstat/inet.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/usr.bin/netstat/inet.c	Mon Jun 11 00:15:24 2012 -0700
@@ -461,7 +461,10 @@
 #endif
 		vchar = ((inp->inp_vflag & INP_IPV4) != 0) ?
 		    "4 " : "  ";
-		printf("%-3.3s%-2.2s ", name, vchar);
+		if (istcp && (tp->t_flags & TF_TOE) != 0)
+			printf("%-3.3s%-2.2s ", "toe", vchar);
+		else
+			printf("%-3.3s%-2.2s ", name, vchar);
 		if (Lflag) {
 			char buf1[15];
 
diff -r 7cec8c20120e usr.bin/sockstat/sockstat.c
--- a/usr.bin/sockstat/sockstat.c	Sun Jun 10 23:57:43 2012 -0700
+++ b/usr.bin/sockstat/sockstat.c	Mon Jun 11 00:15:24 2012 -0700
@@ -325,6 +325,7 @@
 			}
 			inp = &xtp->xt_inp;
 			so = &xtp->xt_socket;
+			protoname = xtp->xt_tp.t_flags & TF_TOE ? "toe" : "tcp";
 			break;
 		case IPPROTO_UDP:
 		case IPPROTO_DIVERT: