Index: net/if.h =================================================================== RCS file: /home/ncvs/src/sys/net/if.h,v retrieving revision 1.103 diff -u -p -r1.103 if.h --- net/if.h 19 Jun 2006 22:20:44 -0000 1.103 +++ net/if.h 3 Sep 2006 19:01:01 -0000 @@ -180,7 +180,12 @@ struct if_data { #define IF_Mbps(x) (IF_Kbps((x) * 1000)) /* megabits/sec. */ #define IF_Gbps(x) (IF_Mbps((x) * 1000)) /* gigabits/sec. */ -/* Capabilities that interfaces can advertise. */ +/* + * Capabilities that interfaces can advertise. + * if_capabilities + * if_capenabled + * ifi_hwassist in mbuf CSUM_ flag form, controlled by above + */ #define IFCAP_RXCSUM 0x0001 /* can offload checksum on RX */ #define IFCAP_TXCSUM 0x0002 /* can offload checksum on TX */ #define IFCAP_NETCONS 0x0004 /* can be a network console */ @@ -189,6 +194,7 @@ struct if_data { #define IFCAP_JUMBO_MTU 0x0020 /* 9000 byte MTU supported */ #define IFCAP_POLLING 0x0040 /* driver supports polling */ #define IFCAP_VLAN_HWCSUM 0x0080 /* can do IFCAP_HWCSUM on VLANs */ +#define IFCAP_TSO 0x0100 /* can do TCP Segmentation Offload */ #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) Index: netinet/ip_output.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/ip_output.c,v retrieving revision 1.259 diff -u -p -r1.259 ip_output.c --- netinet/ip_output.c 17 Aug 2006 00:37:03 -0000 1.259 +++ netinet/ip_output.c 3 Sep 2006 19:01:01 -0000 @@ -486,17 +495,24 @@ passout: /* * If small enough for interface, or the interface will take - * care of the fragmentation for us, can just send directly. + * care of the fragmentation for us, we can just send directly. 
*/ - if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && - ((ip->ip_off & IP_DF) == 0))) { + if (ip->ip_len <= ifp->if_mtu || + (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || + ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m, hlen); - /* Record statistics for this interface address. */ + /* + * Record statistics for this interface address. + * XXX: With CSUM_TSO the packet count will be + * incorrect because the packetization of a large + * number of bytes is done in the network card + * instead of the stack. + */ if (!(flags & IP_FORWARDING) && ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; Index: netinet/tcp_input.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.304 diff -u -p -r1.304 tcp_input.c --- netinet/tcp_input.c 11 Aug 2006 21:15:23 -0000 1.304 +++ netinet/tcp_input.c 3 Sep 2006 19:01:02 -0000 @@ -2857,6 +2867,7 @@ tcp_mss(tp, offer) struct socket *so; struct hc_metrics_lite metrics; int origoffer = offer; + int mtuflags = 0; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? @@ -2869,12 +2880,12 @@ tcp_mss(tp, offer) /* initialize */ #ifdef INET6 if (isipv6) { - maxmtu = tcp_maxmtu6(&inp->inp_inc); + maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; } else #endif { - maxmtu = tcp_maxmtu(&inp->inp_inc); + maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } so = inp->inp_socket; @@ -3081,6 +3092,10 @@ tcp_mss(tp, offer) tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; + + /* Check the interface for TSO capabilities. 
*/ + if (mtuflags & CSUM_TSO) + tp->t_flags |= TF_TSO; } /* @@ -3103,14 +3118,14 @@ tcp_mssopt(inc) #ifdef INET6 if (isipv6) { mss = tcp_v6mssdflt; - maxmtu = tcp_maxmtu6(inc); + maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = tcp_mssdflt; - maxmtu = tcp_maxmtu(inc); + maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } Index: netinet/tcp_output.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v retrieving revision 1.115 diff -u -p -r1.115 tcp_output.c --- netinet/tcp_output.c 23 Feb 2006 21:14:34 -0000 1.115 +++ netinet/tcp_output.c 3 Sep 2006 19:01:02 -0000 @@ -102,8 +102,12 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, loca &ss_fltsz_local, 1, "Slow start flight size for local networks"); int tcp_do_newreno = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, - 0, "Enable NewReno Algorithms"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, + &tcp_do_newreno, 0, "Enable NewReno Algorithms"); + +int tcp_do_tso = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, + &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); /* * Tcp output routine: figure out what should be sent and send it. @@ -127,6 +131,7 @@ tcp_output(struct tcpcb *tp) int i, sack_rxmit; int sack_bytes_rxmt; struct sackhole *p; + int tso = 0; #if 0 int maxburst = TCP_MAXBURST; #endif @@ -376,12 +381,33 @@ after_sack_rexmit: /* * len will be >= 0 after this point. Truncate to the maximum - * segment length and ensure that FIN is removed if the length - * no longer contains the last data byte. + * segment length or enable TCP Segmentation Offloading (if supported + * by hardware) and ensure that FIN is removed if the length no longer + * contains the last data byte. 
+ * + * TSO may only be used if we are in a pure bulk sending state. The + * presence of TCP-MD5, SACK retransmits, SACK advertisements and + * IP options prevent using TSO. With TSO the TCP header is the same + * (except for the sequence number) for all generated packets. This + * makes it impossible to transmit any options which vary per generated + * segment or packet. + * + * The length of TSO bursts is limited by TCP_MAXWIN. That limit and + * removal of FIN (if not already caught here) are handled later after + * the exact length of the TCP options is known. */ if (len > tp->t_maxseg) { - len = tp->t_maxseg; - sendalot = 1; + if ((tp->t_flags & TF_TSO) && tcp_do_tso && + ((tp->t_flags & TF_SIGNATURE) == 0) && + tp->rcv_numsacks == 0 && sack_rxmit == 0 && + tp->t_inpcb->inp_options == NULL && + tp->t_inpcb->in6p_options == NULL) { + tso = 1; + } else { + len = tp->t_maxseg; + sendalot = 1; + tso = 0; + } } if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) @@ -397,7 +423,7 @@ after_sack_rexmit: * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * - * - We have a full segment + * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) @@ -406,7 +432,7 @@ after_sack_rexmit: * - we need to retransmit */ if (len) { - if (len == tp->t_maxseg) + if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote @@ -702,14 +728,27 @@ send: * bump the packet length beyond the t_maxopd length. * Clear the FIN bit because we cut off the tail of * the segment. + * + * When doing TSO limit a burst to TCP_MAXWIN and set the + * flag to continue sending and prevent the last segment + * from being fractional thus making them all equal sized. 
*/ if (len + optlen + ipoptlen > tp->t_maxopd) { - /* - * If there is still more to send, don't close the connection. - */ flags &= ~TH_FIN; - len = tp->t_maxopd - optlen - ipoptlen; - sendalot = 1; + if (tso) { + if (len > TCP_MAXWIN) { + len = TCP_MAXWIN - TCP_MAXWIN % + (tp->t_maxopd - optlen); + sendalot = 1; + } else { + len = min(len, TCP_MAXWIN); + if (tp->t_flags & TF_NEEDFIN) + sendalot = 1; + } + } else { + len = tp->t_maxopd - optlen - ipoptlen; + sendalot = 1; + } } /*#ifdef DIAGNOSTIC*/ @@ -926,14 +965,20 @@ send: */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 - if (isipv6) + if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen + len); - else + + /* Enable TSO and specify the size of the segments. */ + if (tso) { + m->m_pkthdr.csum_flags = CSUM_TSO; + m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; + } + } else #endif /* INET6 */ { m->m_pkthdr.csum_flags = CSUM_TCP; @@ -941,6 +986,12 @@ send: th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); + /* Enable TSO and specify the size of the segments. */ + if (tso) { + m->m_pkthdr.csum_flags = CSUM_TSO; + m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; + } + /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); @@ -1119,11 +1170,22 @@ out: } if (error == EMSGSIZE) { /* - * ip_output() will have already fixed the route - * for us. tcp_mtudisc() will, as its last action, - * initiate retransmission, so it is important to - * not do so here. + * For some reason the interface we used initially + * to send segments changed to another or lowered + * its MTU. 
+ * + * tcp_mtudisc() will find out the new MTU and as + * its last action, initiate retransmission, so it + * is important to not do so here. + * + * If TSO was active we either got an interface + * without TSO capabilities or TSO was turned off. + * Disable it for this connection too and + * immediately retry with MSS sized segments generated + * by this function. */ + if (tso) + tp->t_flags &= ~TF_TSO; tcp_mtudisc(tp->t_inpcb, 0); return 0; } @@ -1132,6 +1194,7 @@ out: tp->t_softerror = error; return (0); } + /* if (error == EACCES) */ return (error); } tcpstat.tcps_sndtotal++; Index: netinet/tcp_subr.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v retrieving revision 1.255 diff -u -p -r1.255 tcp_subr.c --- netinet/tcp_subr.c 26 Aug 2006 17:53:19 -0000 1.255 +++ netinet/tcp_subr.c 3 Sep 2006 19:01:02 -0000 @@ -1230,7 +1230,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s * or route MTU. tcp_mtudisc() * will do right thing by itself. */ - if (mtu <= tcp_maxmtu(&inc)) + if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); } @@ -1509,9 +1509,9 @@ tcp_mtudisc(struct inpcb *inp, int errno maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ romtu = #ifdef INET6 - isipv6 ? tcp_maxmtu6(&inp->inp_inc) : + isipv6 ? tcp_maxmtu6(&inp->inp_inc, NULL) : #endif /* INET6 */ - tcp_maxmtu(&inp->inp_inc); + tcp_maxmtu(&inp->inp_inc, NULL); if (!maxmtu) maxmtu = romtu; else @@ -1588,7 +1588,7 @@ tcp_mtudisc(struct inpcb *inp, int errno * to get the interface MTU. */ u_long -tcp_maxmtu(struct in_conninfo *inc) +tcp_maxmtu(struct in_conninfo *inc, int *flags) { struct route sro; struct sockaddr_in *dst; @@ -1611,6 +1611,12 @@ tcp_maxmtu(struct in_conninfo *inc) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + + /* Report additional interface capabilities or information. 
*/ + if (flags != NULL) { + if (ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } RTFREE(sro.ro_rt); } return (maxmtu); @@ -1618,7 +1624,7 @@ tcp_maxmtu(struct in_conninfo *inc) #ifdef INET6 u_long -tcp_maxmtu6(struct in_conninfo *inc) +tcp_maxmtu6(struct in_conninfo *inc, int *flags) { struct route_in6 sro6; struct ifnet *ifp; @@ -1640,6 +1646,12 @@ tcp_maxmtu6(struct in_conninfo *inc) else maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + + /* Report additional interface capabilities or information. */ + if (flags != NULL) { + if (ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } RTFREE(sro6.ro_rt); } Index: netinet/tcp_var.h =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v retrieving revision 1.133 diff -u -p -r1.133 tcp_var.h --- netinet/tcp_var.h 26 Jun 2006 15:35:25 -0000 1.133 +++ netinet/tcp_var.h 3 Sep 2006 19:01:02 -0000 @@ -114,6 +114,7 @@ struct tcpcb { #define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x800000 /* force out a byte */ +#define TF_TSO 0x1000000 /* TSO enabled on this connection */ tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -506,8 +508,8 @@ void tcp_init(void); void tcp_fini(void *); void tcp_reass_init(void); void tcp_input(struct mbuf *, int); -u_long tcp_maxmtu(struct in_conninfo *); -u_long tcp_maxmtu6(struct in_conninfo *); +u_long tcp_maxmtu(struct in_conninfo *, int *); +u_long tcp_maxmtu6(struct in_conninfo *, int *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * Index: netinet6/icmp6.c =================================================================== RCS file: /home/ncvs/src/sys/netinet6/icmp6.c,v retrieving revision 1.71 diff -u -p -r1.71 icmp6.c --- netinet6/icmp6.c 4 Aug 2006 21:27:38 -0000 1.71 +++ netinet6/icmp6.c 3 Sep 2006 
19:01:02 -0000 @@ -1132,7 +1132,7 @@ icmp6_mtudisc_update(ip6cp, validated) if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; - if (mtu < tcp_maxmtu6(&inc)) { + if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); icmp6stat.icp6s_pmtuchg++; } Index: sys/mbuf.h =================================================================== RCS file: /home/ncvs/src/sys/sys/mbuf.h,v retrieving revision 1.192 diff -u -p -r1.192 mbuf.h --- sys/mbuf.h 24 Jul 2006 01:49:57 -0000 1.192 +++ sys/mbuf.h 3 Sep 2006 19:01:03 -0000 @@ -110,6 +110,7 @@ struct pkthdr { /* variables for hardware checksum */ int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ + int tso_segsz; /* TSO segment size */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ }; @@ -215,6 +216,7 @@ struct mbuf { #define CSUM_UDP 0x0004 /* will csum UDP */ #define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ #define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ +#define CSUM_TSO 0x0020 /* will do TSO */ #define CSUM_IP_CHECKED 0x0100 /* did csum IP */ #define CSUM_IP_VALID 0x0200 /* ... the csum is valid */