Index: sys/dev/e1000/if_em.c =================================================================== --- sys/dev/e1000/if_em.c (revision 211398) +++ sys/dev/e1000/if_em.c (working copy) @@ -237,9 +237,10 @@ static int em_fixup_rx(struct rx_ring *); #endif static void em_receive_checksum(struct e1000_rx_desc *, struct mbuf *); -static void em_transmit_checksum_setup(struct tx_ring *, struct mbuf *, - u32 *, u32 *); -static bool em_tso_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); +static void em_transmit_checksum_setup(struct tx_ring *, struct mbuf *, int, + struct ip *, u32 *, u32 *); +static void em_tso_setup(struct tx_ring *, struct mbuf *, int, struct ip *, + struct tcphdr *, u32 *, u32 *); static void em_set_promisc(struct adapter *); static void em_disable_promisc(struct adapter *); static void em_set_multi(struct adapter *); @@ -1729,13 +1730,20 @@ struct em_buffer *tx_buffer, *tx_buffer_mapped; struct e1000_tx_desc *ctxd = NULL; struct mbuf *m_head; + struct ether_header *eh; + struct ip *ip; + struct tcphdr *tp; u32 txd_upper, txd_lower, txd_used, txd_saved; + int ip_off, poff; int nsegs, i, j, first, last = 0; int error, do_tso, tso_desc = 0; m_head = *m_headp; txd_upper = txd_lower = txd_used = txd_saved = 0; do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TSO) != 0); + ip_off = poff = 0; + ip = NULL; + tp = NULL; /* ** When doing checksum offload, it is critical to @@ -1751,15 +1759,101 @@ } /* - * TSO workaround: - * If an mbuf is only header we need - * to pull 4 bytes of data into it. + * XXX + * Intel recommends entire IP/TCP header length reside in a single + * buffer. If multiple descriptors are used to describe the IP and + * TCP header, each descriptor should describe one or more + * complete headers; descriptors referencing only parts of headers + * are not supported. If all layer headers are not coalesced into + * a single buffer, each buffer should not cross a 4KB boundary, + * or be larger than the maximum read request size. 
+ * Controller also requires modifying IP/TCP header to make TSO work + * so we first get a writable mbuf chain then coalesce ethernet/ + * IP/TCP header into a single buffer to meet the requirement of + * controller. This also simplifies IP/TCP/UDP checksum offloading + * which also has similar restrictions. */ - if (do_tso && (m_head->m_len <= M_TSO_LEN)) { - m_head = m_pullup(m_head, M_TSO_LEN + 4); + if (do_tso || m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) { + if (do_tso || (m_head->m_next != NULL && + m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD)) { + if (M_WRITABLE(*m_headp) == 0) { + m_head = m_dup(*m_headp, M_DONTWAIT); + m_freem(*m_headp); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + *m_headp = m_head; + } + } + /* + * XXX + * Assume IPv4, we don't have TSO/checksum offload support + * for IPv6 yet. + */ + ip_off = sizeof(struct ether_header); + m_head = m_pullup(m_head, ip_off); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + eh = mtod(m_head, struct ether_header *); + if (eh->ether_type == htons(ETHERTYPE_VLAN)) { + ip_off = sizeof(struct ether_vlan_header); + m_head = m_pullup(m_head, ip_off); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + } + m_head = m_pullup(m_head, ip_off + sizeof(struct ip)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + ip = (struct ip *)(mtod(m_head, char *) + ip_off); + poff = ip_off + (ip->ip_hl << 2); + m_head = m_pullup(m_head, poff + sizeof(struct tcphdr)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + if (do_tso) { + tp = (struct tcphdr *)(mtod(m_head, char *) + poff); + /* + * TSO workaround: + * pull 4 more bytes of data into it. 
+ */ + m_head = m_pullup(m_head, poff + (tp->th_off << 2) + 4); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + ip->ip_len = 0; + ip->ip_sum = 0; + /* + * The pseudo TCP checksum does not include TCP payload + * length so driver should recompute the checksum here to + * what hardware expects to see. This adheres to + * Microsoft's Large Send specification. + */ + tp->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); + } else if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { + tp = (struct tcphdr *)(mtod(m_head, char *) + poff); + m_head = m_pullup(m_head, poff + (tp->th_off << 2)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { + m_head = m_pullup(m_head, poff + sizeof(struct udphdr)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + } *m_headp = m_head; - if (m_head == NULL) - return (ENOBUFS); } /* @@ -1836,15 +1930,14 @@ /* Do hardware assists */ #if __FreeBSD_version >= 700000 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { - error = em_tso_setup(txr, m_head, &txd_upper, &txd_lower); - if (error != TRUE) - return (ENXIO); /* something foobar */ + em_tso_setup(txr, m_head, ip_off, ip, tp, &txd_upper, + &txd_lower); /* we need to make a final sentinel transmit desc */ tso_desc = TRUE; } else #endif if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) - em_transmit_checksum_setup(txr, m_head, + em_transmit_checksum_setup(txr, m_head, ip_off, ip, &txd_upper, &txd_lower); i = txr->next_avail_desc; @@ -3055,6 +3148,13 @@ /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; + /* Clear checksum offload context. 
+ */ + txr->last_hw_offload = 0; + txr->last_hw_ipcss = 0; + txr->last_hw_ipcso = 0; + txr->last_hw_tucss = 0; + txr->last_hw_tucso = 0; + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); EM_TX_UNLOCK(txr); @@ -3251,144 +3351,168 @@ /********************************************************************* * * The offload context needs to be set when we transfer the first - * packet of a particular protocol (TCP/UDP). This routine has been - * enhanced to deal with inserted VLAN headers, and IPV6 (not complete) + * packet of a particular protocol (TCP/UDP). * * Added back the old method of keeping the current context type * and not setting if unnecessary, as this is reported to be a * big performance win. -jfv + * + * The reason comes from how ethernet controllers + * support four pipelined requests from the Tx data DMA. The + * four requests can belong to the same packet or to consecutive + * packets. However all requests for a packet are issued before a + * request is issued for a subsequent packet and if a request + * for the next packet requires context change, the request for + * the next packet is not issued until the previous request + * completed. This means setting up a new context effectively + * disables pipelined Tx data DMA which in turn greatly slows + * down performance to send small sized frames. We should avoid + * setting up a new context as much as we can to keep + * controller pipelining Tx data DMA. In addition, some + * controllers like 631xESB/632xESB/82571EB/82572EI that support + * multiple Tx queues may require new context for each queue + * because driver may not know checksum context configured in + * other Tx queue. The hardware tracks only the last context + * descriptor that was written. 
+ * This is severe limitation of current Intel controller + * implementation and it would be better to disable checksum + * offloading in upper stack when multiple Tx queues are used. **********************************************************************/ static void -em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, - u32 *txd_upper, u32 *txd_lower) +em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, + struct ip *ip, u32 *txd_upper, u32 *txd_lower) { struct adapter *adapter = txr->adapter; struct e1000_context_desc *TXD = NULL; struct em_buffer *tx_buffer; - struct ether_vlan_header *eh; - struct ip *ip = NULL; - struct ip6_hdr *ip6; - int cur, ehdrlen; - u32 cmd, hdr_len, ip_hlen; - u16 etype; - u8 ipproto; + int cur, hdr_len; + u32 cmd; + u16 offload; + u8 ipcso, ipcss, tucso, tucss; - - cmd = hdr_len = ipproto = 0; - *txd_upper = *txd_lower = 0; + ipcss = 0; + ipcso = 0; + tucss = 0; + tucso = 0; + offload = 0; + cmd = 0; + hdr_len = ip_off + (ip->ip_hl << 2); cur = txr->next_avail_desc; - /* - * Determine where frame payload starts. - * Jump over vlan headers if already present, - * helpful for QinQ too. - */ - eh = mtod(mp, struct ether_vlan_header *); - if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { - etype = ntohs(eh->evl_proto); - ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; - } else { - etype = ntohs(eh->evl_encap_proto); - ehdrlen = ETHER_HDR_LEN; + if (adapter->num_queues > 1) { + /* + * We're using multiple Tx queues, invalidate old + * checksum context to build new one as we don't + * know checksum context configured in other Tx + * queue. + */ + txr->last_hw_ipcss = 0; + txr->last_hw_ipcso = 0; + txr->last_hw_tucss = 0; + txr->last_hw_tucso = 0; } - /* - * We only support TCP/UDP for IPv4 and IPv6 for the moment. - * TODO: Support SCTP too when it hits the tree. 
- */ - switch (etype) { - case ETHERTYPE_IP: - ip = (struct ip *)(mp->m_data + ehdrlen); - ip_hlen = ip->ip_hl << 2; - - /* Setup of IP header checksum. */ - if (mp->m_pkthdr.csum_flags & CSUM_IP) { - /* - * Start offset for header checksum calculation. - * End offset for header checksum calculation. - * Offset of place to put the checksum. - */ - TXD = (struct e1000_context_desc *) - &txr->tx_base[cur]; - TXD->lower_setup.ip_fields.ipcss = ehdrlen; - TXD->lower_setup.ip_fields.ipcse = - htole16(ehdrlen + ip_hlen); - TXD->lower_setup.ip_fields.ipcso = - ehdrlen + offsetof(struct ip, ip_sum); - cmd |= E1000_TXD_CMD_IP; - *txd_upper |= E1000_TXD_POPTS_IXSM << 8; - } - - hdr_len = ehdrlen + ip_hlen; - ipproto = ip->ip_p; - break; - - case ETHERTYPE_IPV6: - ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); - ip_hlen = sizeof(struct ip6_hdr); /* XXX: No header stacking. */ - - /* IPv6 doesn't have a header checksum. */ - - hdr_len = ehdrlen + ip_hlen; - ipproto = ip6->ip6_nxt; - break; - - default: - return; + /* Setup of IP header checksum. */ + if (mp->m_pkthdr.csum_flags & CSUM_IP) { + *txd_upper |= E1000_TXD_POPTS_IXSM << 8; + offload |= CSUM_IP; + ipcss = ip_off; + ipcso = ip_off + offsetof(struct ip, ip_sum); + /* + * Start offset for header checksum calculation. + * End offset for header checksum calculation. + * Offset of place to put the checksum. + */ + TXD = (struct e1000_context_desc *)&txr->tx_base[cur]; + TXD->lower_setup.ip_fields.ipcss = ipcss; + TXD->lower_setup.ip_fields.ipcse = htole16(hdr_len); + TXD->lower_setup.ip_fields.ipcso = ipcso; + cmd |= E1000_TXD_CMD_IP; } - switch (ipproto) { - case IPPROTO_TCP: - if (mp->m_pkthdr.csum_flags & CSUM_TCP) { - *txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; - *txd_upper |= E1000_TXD_POPTS_TXSM << 8; - /* no need for context if already set */ - if (txr->last_hw_offload == CSUM_TCP) - return; - txr->last_hw_offload = CSUM_TCP; - /* - * Start offset for payload checksum calculation. 
- * End offset for payload checksum calculation. - * Offset of place to put the checksum. - */ - TXD = (struct e1000_context_desc *) - &txr->tx_base[cur]; - TXD->upper_setup.tcp_fields.tucss = hdr_len; - TXD->upper_setup.tcp_fields.tucse = htole16(0); - TXD->upper_setup.tcp_fields.tucso = - hdr_len + offsetof(struct tcphdr, th_sum); - cmd |= E1000_TXD_CMD_TCP; - } - break; - case IPPROTO_UDP: - { - if (mp->m_pkthdr.csum_flags & CSUM_UDP) { - *txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; - *txd_upper |= E1000_TXD_POPTS_TXSM << 8; - /* no need for context if already set */ - if (txr->last_hw_offload == CSUM_UDP) - return; - txr->last_hw_offload = CSUM_UDP; - /* - * Start offset for header checksum calculation. - * End offset for header checksum calculation. - * Offset of place to put the checksum. - */ - TXD = (struct e1000_context_desc *) - &txr->tx_base[cur]; - TXD->upper_setup.tcp_fields.tucss = hdr_len; - TXD->upper_setup.tcp_fields.tucse = htole16(0); - TXD->upper_setup.tcp_fields.tucso = - hdr_len + offsetof(struct udphdr, uh_sum); - } - /* Fall Thru */ - } - default: - break; - } + if (mp->m_pkthdr.csum_flags & CSUM_TCP) { + *txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; + *txd_upper |= E1000_TXD_POPTS_TXSM << 8; + offload |= CSUM_TCP; + tucss = hdr_len; + tucso = hdr_len + offsetof(struct tcphdr, th_sum); + /* + * Setting up new checksum offload context for every frames + * takes a lot of processing time for hardware. This also + * reduces performance a lot for small sized frames so avoid + * it if driver can use previously configured checksum + * offload context. 
+ */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } + } + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; + /* + * Start offset for payload checksum calculation. + * End offset for payload checksum calculation. + * Offset of place to put the checksum. + */ + TXD = (struct e1000_context_desc *)&txr->tx_base[cur]; + TXD->upper_setup.tcp_fields.tucss = hdr_len; + TXD->upper_setup.tcp_fields.tucse = htole16(0); + TXD->upper_setup.tcp_fields.tucso = tucso; + cmd |= E1000_TXD_CMD_TCP; + } else if (mp->m_pkthdr.csum_flags & CSUM_UDP) { + *txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; + *txd_upper |= E1000_TXD_POPTS_TXSM << 8; + tucss = hdr_len; + tucso = hdr_len + offsetof(struct udphdr, uh_sum); + /* + * Setting up new checksum offload context for every frames + * takes a lot of processing time for hardware. This also + * reduces performance a lot for small sized frames so avoid + * it if driver can use previously configured checksum + * offload context. + */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } + } + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; + /* + * Start offset for header checksum calculation. + * End offset for header checksum calculation. + * Offset of place to put the checksum. 
+ */ + TXD = (struct e1000_context_desc *)&txr->tx_base[cur]; + TXD->upper_setup.tcp_fields.tucss = tucss; + TXD->upper_setup.tcp_fields.tucse = htole16(0); + TXD->upper_setup.tcp_fields.tucso = tucso; + } + + if (offload & CSUM_IP) { + txr->last_hw_ipcss = ipcss; + txr->last_hw_ipcso = ipcso; + } - if (TXD == NULL) - return; TXD->tcp_seg_setup.data = htole32(0); TXD->cmd_and_length = htole32(adapter->txd_cmd | E1000_TXD_CMD_DEXT | cmd); @@ -3409,124 +3533,53 @@ * Setup work for hardware segmentation offload (TSO) * **********************************************************************/ -static bool -em_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *txd_upper, - u32 *txd_lower) +static void +em_tso_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, + struct ip *ip, struct tcphdr *tp, u32 *txd_upper, u32 *txd_lower) { struct adapter *adapter = txr->adapter; struct e1000_context_desc *TXD; struct em_buffer *tx_buffer; - struct ether_vlan_header *eh; - struct ip *ip; - struct ip6_hdr *ip6; - struct tcphdr *th; - int cur, ehdrlen, hdr_len, ip_hlen, isip6; - u16 etype; + int cur, hdr_len; /* - * This function could/should be extended to support IP/IPv6 - * fragmentation as well. But as they say, one step at a time. + * XXX + * In theory we can use the same TSO context if and only if + * frame is the same type(IP/TCP) and the same MSS. However + * checking whether a frame has the same IP/TCP structure is a + * hard thing so just ignore that and always reestablish a + * new TSO context. */ - - /* - * Determine where frame payload starts. - * Jump over vlan headers if already present, - * helpful for QinQ too. - */ - eh = mtod(mp, struct ether_vlan_header *); - if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { - etype = ntohs(eh->evl_proto); - ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; - } else { - etype = ntohs(eh->evl_encap_proto); - ehdrlen = ETHER_HDR_LEN; - } - - /* Ensure we have at least the IP+TCP header in the first mbuf. 
*/ - if (mp->m_len < ehdrlen + sizeof(struct ip) + sizeof(struct tcphdr)) - return FALSE; /* -1 */ - - /* - * We only support TCP for IPv4 and IPv6 (notyet) for the moment. - * TODO: Support SCTP too when it hits the tree. - */ - switch (etype) { - case ETHERTYPE_IP: - isip6 = 0; - ip = (struct ip *)(mp->m_data + ehdrlen); - if (ip->ip_p != IPPROTO_TCP) - return FALSE; /* 0 */ - ip->ip_len = 0; - ip->ip_sum = 0; - ip_hlen = ip->ip_hl << 2; - if (mp->m_len < ehdrlen + ip_hlen + sizeof(struct tcphdr)) - return FALSE; /* -1 */ - th = (struct tcphdr *)((caddr_t)ip + ip_hlen); -#if 1 - th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, htons(IPPROTO_TCP)); -#else - th->th_sum = mp->m_pkthdr.csum_data; -#endif - break; - case ETHERTYPE_IPV6: - isip6 = 1; - return FALSE; /* Not supported yet. */ - ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); - if (ip6->ip6_nxt != IPPROTO_TCP) - return FALSE; /* 0 */ - ip6->ip6_plen = 0; - ip_hlen = sizeof(struct ip6_hdr); /* XXX: no header stacking. */ - if (mp->m_len < ehdrlen + ip_hlen + sizeof(struct tcphdr)) - return FALSE; /* -1 */ - th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); -#if 0 - th->th_sum = in6_pseudo(ip6->ip6_src, ip->ip6_dst, - htons(IPPROTO_TCP)); /* XXX: function notyet. */ -#else - th->th_sum = mp->m_pkthdr.csum_data; -#endif - break; - default: - return FALSE; - } - hdr_len = ehdrlen + ip_hlen + (th->th_off << 2); - + hdr_len = ip_off + (ip->ip_hl << 2) + (tp->th_off << 2); *txd_lower = (E1000_TXD_CMD_DEXT | /* Extended descr type */ E1000_TXD_DTYP_D | /* Data descr type */ E1000_TXD_CMD_TSE); /* Do TSE on this packet */ /* IP and/or TCP header checksum calculation and insertion. */ - *txd_upper = ((isip6 ? 0 : E1000_TXD_POPTS_IXSM) | - E1000_TXD_POPTS_TXSM) << 8; + *txd_upper = (E1000_TXD_POPTS_IXSM | E1000_TXD_POPTS_TXSM) << 8; cur = txr->next_avail_desc; tx_buffer = &txr->tx_buffers[cur]; TXD = (struct e1000_context_desc *) &txr->tx_base[cur]; - /* IPv6 doesn't have a header checksum. 
*/ - if (!isip6) { - /* - * Start offset for header checksum calculation. - * End offset for header checksum calculation. - * Offset of place put the checksum. - */ - TXD->lower_setup.ip_fields.ipcss = ehdrlen; - TXD->lower_setup.ip_fields.ipcse = - htole16(ehdrlen + ip_hlen - 1); - TXD->lower_setup.ip_fields.ipcso = - ehdrlen + offsetof(struct ip, ip_sum); - } /* + * Start offset for header checksum calculation. + * End offset for header checksum calculation. + * Offset of place put the checksum. + */ + TXD->lower_setup.ip_fields.ipcss = ip_off; + TXD->lower_setup.ip_fields.ipcse = + htole16(ip_off + (ip->ip_hl << 2) - 1); + TXD->lower_setup.ip_fields.ipcso = ip_off + offsetof(struct ip, ip_sum); + /* * Start offset for payload checksum calculation. * End offset for payload checksum calculation. * Offset of place to put the checksum. */ - TXD->upper_setup.tcp_fields.tucss = - ehdrlen + ip_hlen; + TXD->upper_setup.tcp_fields.tucss = ip_off + (ip->ip_hl << 2); TXD->upper_setup.tcp_fields.tucse = 0; TXD->upper_setup.tcp_fields.tucso = - ehdrlen + ip_hlen + offsetof(struct tcphdr, th_sum); + ip_off + (ip->ip_hl << 2) + offsetof(struct tcphdr, th_sum); /* * Payload size per packet w/o any headers. * Length of all headers up to payload. @@ -3537,7 +3590,7 @@ TXD->cmd_and_length = htole32(adapter->txd_cmd | E1000_TXD_CMD_DEXT | /* Extended descr */ E1000_TXD_CMD_TSE | /* TSE context */ - (isip6 ? 
0 : E1000_TXD_CMD_IP) | + E1000_TXD_CMD_IP | /* Do IP csum */ E1000_TXD_CMD_TCP | /* Do TCP checksum */ (mp->m_pkthdr.len - (hdr_len))); /* Total len */ @@ -3550,8 +3603,6 @@ txr->tx_avail--; txr->next_avail_desc = cur; txr->tx_tso = TRUE; - - return TRUE; } Index: sys/dev/e1000/if_em.h =================================================================== --- sys/dev/e1000/if_em.h (revision 211398) +++ sys/dev/e1000/if_em.h (working copy) @@ -284,6 +284,10 @@ volatile u16 tx_avail; u32 tx_tso; /* last tx was tso */ u16 last_hw_offload; + uint8_t last_hw_ipcso; + uint8_t last_hw_ipcss; + uint8_t last_hw_tucso; + uint8_t last_hw_tucss; #if __FreeBSD_version >= 800000 struct buf_ring *br; #endif