--- tcp_output.c.orig	Tue Jan  7 04:17:06 2003
+++ tcp_output.c	Tue Jan  7 04:17:15 2003
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995, 1998
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,10 +30,14 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
+ * TCP Selective Acknowledgment (SACK) support:
+ *	Copyright (c) 2002. Hiten Pandya. All rights reserved.
+ *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.73 2002/10/16 19:16:33 dillon Exp $
  */
 
+#include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
@@ -105,6 +109,109 @@
 int     tcp_do_newreno = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
         0, "Enable NewReno Algorithms");
+
+#ifdef TCP_SACK
+extern int tcprexmtthresh;
+
+#if defined(TCP_SACK_DEBUG)
+/*
+ * Print the sender's SACK hole list (TCP_SACK_DEBUG builds only).
+ */
+void
+tcp_sack_print_holes(struct tcpcb *tp)
+{
+	struct sackhole *p = tp->t_snd_holes;
+	if (p == NULL)
+		return;
+
+	printf("(TCP_SACK) Hole report: start - end dups rxmit\n");
+	while (p != NULL) {
+		printf("%x--%x d %d r %x\n", p->sh_start, p->sh_end,
+		    p->sh_dups, p->sh_rxmit);
+		p = p->next;
+	}
+	printf("\n");
+}
+#endif /* TCP_SACK_DEBUG */
+
+/*
+ * Tcp output routine, specific to SACK.  It returns a pointer to the
+ * first struct sackhole that is due for retransmission (enough dups,
+ * sh_rxmit below sh_end and not below snd_una); NULL if there is none
+ * or SACK is disabled on this connection.
+ */
+struct sackhole *
+tcp_sack_output(struct tcpcb *tp)
+{
+	struct sackhole *p;
+
+	if (tp->t_sack_disable)
+		return NULL;
+
+	p = tp->t_snd_holes;
+	while (p) {
+#ifndef TCP_FACK
+		if (p->sh_dups >= tcprexmtthresh &&
+		    SEQ_LT(p->sh_rxmit, p->sh_end)) {
+#else
+		/*
+		 * In FACK, if p->sh_dups is less than tcprexmtthresh, but
+		 * t_snd_fack advances more than tcprexmtthresh * tp->t_maxseg,
+		 * tcp_input() will try fast retransmit (NewReno).  This
+		 * forces output.
+		 */
+		if ((p->sh_dups >= tcprexmtthresh ||
+		    tp->t_dupacks == tcprexmtthresh) &&
+		    SEQ_LT(p->sh_rxmit, p->sh_end)) {
+#endif /* TCP_FACK */
+			if (SEQ_LT(p->sh_rxmit, tp->snd_una)) { /* old hole */
+				p = p->next;
+				continue;
+			}
+#if defined(TCP_SACK_DEBUG)
+			if (p != NULL)
+				tcp_sack_print_holes(tp);
+#endif
+			return p;
+		}
+		p = p->next;
+	}
+	return NULL;
+}
+
+/*
+ * The SACK list may be rebuilt after a timeout.  This SACK
+ * information should be used to avoid retransmission of SACK'ed
+ * data.  This routine traverses the SACK list to see if 'snd_nxt'
+ * should be moved forward.
+ */
+void
+tcp_sack_adjust(struct tcpcb *tp)
+{
+	int i;
+
+	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
+		return; /* We're already beyond any SACK'ed blocks */
+
+	/*
+	 * Two cases where we want to advance snd_nxt:
+	 * 1) it lies between end of one hole and beg. of another.
+	 * 2) it lies between end of last hole and rcv_lastsack.
+	 */
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		if (SEQ_LT(tp->snd_nxt, tp->t_sackblks[i].blk_start))
+			break;
+		if (SEQ_LEQ(tp->t_sackblks[i].blk_end, tp->snd_nxt))
+			continue;
+		if (tp->t_sackblks[i].blk_start == 0 &&
+		    tp->t_sackblks[i].blk_end == 0)
+			continue;
+		/* snd_nxt must be in middle of block of SACK'ed data */
+		tp->snd_nxt = tp->t_sackblks[i].blk_end;
+		break;
+	}
+}
+#endif /* TCP_SACK */
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
@@ -129,10 +236,15 @@
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
-	
+
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 
+#ifdef TCP_SACK
+	int i, sack_rxmit = 0;
+	struct sackhole *sackp = NULL;
+#endif
+
 #ifndef INET6
 	mtx_assert(&tp->t_inpcb->inp_mtx, MA_OWNED);
 #endif
@@ -173,6 +285,17 @@
 	}
 again:
 	sendalot = 0;
+#ifdef TCP_SACK
+	/*
+	 * If we've recently taken a timeout, snd_max will be greater
+	 * than snd_nxt.  There may be SACK information that allows us to
+	 * avoid resending already delivered data.  Adjust snd_nxt
+	 * accordingly.
+	 */
+	if (!tp->t_sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+		tcp_sack_adjust(tp);
+#endif
+
 	off = tp->snd_nxt - tp->snd_una;
 	win = min(tp->snd_wnd, tp->snd_cwnd);
 	win = min(win, tp->snd_bwnd);
@@ -193,6 +316,39 @@
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
+
+#ifdef TCP_SACK
+	/*
+	 * Send any SACK generated retransmissions.  If we're explicitly
+	 * trying to send out new data (when sendalot = 1), bypass this
+	 * routine.
+	 *
+	 * If we retransmit in fast recovery mode, decrement snd_cwnd,
+	 * since we're replacing a (future) new transmission with a
+	 * retransmission now, and we previously incremented snd_cwnd
+	 * in tcp_input().
+	 */
+	if (!tp->t_sack_disable && !sendalot) {
+		if (tp->t_dupacks >= tcprexmtthresh &&
+		    (sackp = tcp_sack_output(tp))) {
+			off = sackp->sh_rxmit - tp->snd_una;
+			sack_rxmit = 1;
+			/* len is not computed below for a SACK rexmit. */
+			len = min(tp->t_maxseg,
+			    sackp->sh_end - sackp->sh_rxmit);
+
+#if 0
+			/* Coalesce holes into a single rexmit. */
+#endif
+#ifndef TCP_FACK
+			/* in FACK, hold snd_cwnd constant during recovery */
+			if (SEQ_LT(tp->snd_una, tp->snd_recover))
+				tp->snd_cwnd -= tp->t_maxseg;
+#endif
+		}
+	}
+#endif /* TCP_SACK */
+
 	if (tp->t_force) {
 		if (win == 0) {
 			/*
@@ -232,7 +388,25 @@
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
 	 */
+#ifdef TCP_SACK
+	if (!sack_rxmit) {
+#endif
 	len = (long)ulmin(so->so_snd.sb_cc, win) - off;
+
+#if defined(TCP_FACK)
+	/*
+	 * If we're in fast recovery (SEQ_GT(tp->snd_recover,
+	 * tp->snd_una)), and amount of outstanding data (t_snd_awnd)
+	 * is >= snd_cwnd, then do not send data (like zero window
+	 * conditions)
+	 */
+	if (!tp->t_sack_disable && len && SEQ_GT(tp->snd_recover, tp->snd_una)
+	    && (tp->t_snd_awnd >= tp->snd_cwnd))
+		len = 0;
+#endif /* TCP_FACK */
+#ifdef TCP_SACK
+	}
+#endif
 
 	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
 		taop = &tao_noncached;
@@ -279,11 +453,14 @@
 		len = 0;
 		if (win == 0) {
 			callout_stop(tp->tt_rexmt);
-			tp->t_rxtshift = 0;
+/*			tp->t_rxtshift = 0; */
+#ifdef TCP_SACK
+			if (!sack_rxmit)
+#endif
 			tp->snd_nxt = tp->snd_una;
-			if (!callout_active(tp->tt_persist))
-				tcp_setpersist(tp);
+/*			if (!callout_active(tp->tt_persist))
+				tcp_setpersist(tp); */
 		}
 	}
 
 	/*
@@ -334,6 +511,10 @@
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
+#ifdef TCP_SACK
+		if (sack_rxmit)
+			goto send;
+#endif
 	}
 
 	/*
@@ -376,6 +557,22 @@
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
+#ifdef TCP_SACK
+	/*
+	 * In SACK, it is possible for tcp_output to fail to send a
+	 * segment after the retransmission timer has been turned off.
+	 * Make sure that the retransmission timer is set.
+	 */
+	if (SEQ_GT(tp->snd_max, tp->snd_una) &&
+	    !callout_active(tp->tt_rexmt) &&
+	    !callout_active(tp->tt_persist)) {
+		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+		    tcp_timer_rexmt, tp);
+
+		/* No reason to send, just return. */
+		return 0;
+	}
+#endif /* TCP_SACK */
 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
@@ -437,6 +634,21 @@
 			(void)memcpy(opt + 2, &mss, sizeof(mss));
 			optlen = TCPOLEN_MAXSEG;
 
+#ifdef TCP_SACK
+			/*
+			 * If this is the first SYN of connection (not a
+			 * SYN ACK), include SACK_PERMIT_HDR option.  If
+			 * this is a SYN ACK, include SACK_PERMIT_HDR
+			 * option if peer has already done so.
+			 */
+			if (!tp->t_sack_disable && ((flags & TH_ACK) == 0 ||
+			    (tp->t_flags & TF_SACK_PERMIT))) {
+				*((u_int32_t *) (opt + optlen)) =
+				    htonl(TCPOPT_SACK_PERMIT_HDR);
+				optlen += 4;
+			}
+#endif
+
 			if ((tp->t_flags & TF_REQ_SCALE) &&
 			    ((flags & TH_ACK) == 0 ||
 			    (tp->t_flags & TF_RCVD_SCALE))) {
@@ -468,6 +680,34 @@
 		optlen += TCPOLEN_TSTAMP_APPA;
 	}
 
+#ifdef TCP_SACK
+	/*
+	 * Send SACKs if necessary.  This should be the _LAST_ option to be
+	 * processed.  Only TCP_MAX_SACK are permitted for send.  No more
+	 * than three SACKs are sent.
+	 */
+	if (!tp->t_sack_disable && tp->t_state == TCPS_ESTABLISHED &&
+	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+	    tp->rcv_numsacks) {
+		u_int32_t *lp = (u_int32_t *) (opt + optlen);
+		u_int32_t *olp = lp++;
+		int count = 0;	/* actual number of SACKs inserted. */
+		int maxsack = (TCP_MAXOLEN - (optlen + 4))/TCPOLEN_SACK;
+
+		maxsack = min(maxsack, TCP_MAX_SACK);
+		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+			struct sackblock sack = tp->t_sackblks[i];
+			if (sack.blk_start == 0 && sack.blk_end == 0)
+				continue;
+			*lp++ = htonl(sack.blk_start);
+			*lp++ = htonl(sack.blk_end);
+			count++;
+		}
+		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK * count + 2));
+		optlen += TCPOLEN_SACK * count + 4; /* including NOPS. */
+	}
+#endif /* TCP_SACK */
+
 	/*
 	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
 	 * options are allowed (!TF_NOOPT) and it's not a RST.
@@ -713,6 +953,23 @@
 		th->th_seq = htonl(tp->snd_nxt);
 	else
 		th->th_seq = htonl(tp->snd_max);
+#ifdef TCP_SACK
+	if (sack_rxmit) {
+		/*
+		 * If sendalot was turned on (due to option stuffing), turn
+		 * it off.  Properly set the th_seq field.  Advance the
+		 * ret'x pointer by len.
+		 */
+		if (sendalot)
+			sendalot = 0;
+		th->th_seq = htonl(sackp->sh_rxmit);
+		sackp->sh_rxmit += len;
+#if defined(TCP_FACK)
+		tp->t_retran_data += len;
+#endif
+	}
+#endif
+
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
@@ -801,6 +1058,15 @@
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
+
+#ifdef TCP_SACK
+		if (!tp->t_sack_disable) {
+			if (sack_rxmit && (sackp->sh_rxmit != tp->snd_nxt)) {
+				goto timer;
+			}
+		}
+#endif
+
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
@@ -823,6 +1089,19 @@
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
+#ifdef TCP_SACK
+timer:
+		if (!tp->t_sack_disable && sack_rxmit &&
+		    !callout_active(tp->tt_rexmt) &&
+		    tp->snd_nxt != tp->snd_una) {
+			if (callout_active(tp->tt_persist)) {
+				callout_stop(tp->tt_persist);
+				tp->t_rxtshift = 0;
+			}
+			callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+			    tcp_timer_rexmt, tp);
+		}
+#endif
 		if (!callout_active(tp->tt_rexmt) &&
 		    tp->snd_nxt != tp->snd_una) {
 			if (callout_active(tp->tt_persist)) {
@@ -916,6 +1195,15 @@
 	error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
 	    (so->so_options & SO_DONTROUTE), 0, tp->t_inpcb);
     }
+
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	/*
+	 * Update t_snd_awnd to reflect the new data that was sent.
+	 */
+	tp->t_snd_awnd = tcp_seq_subtract(tp->snd_max, tp->t_snd_fack) +
+	    tp->t_retran_data;
+#endif
+
 	if (error) {
 
 		/*