Index: tcp.h
===================================================================
RCS file: /home/hiten/ncvs/src/sys/netinet/tcp.h,v
retrieving revision 1.18
diff -u -r1.18 tcp.h
--- tcp.h	2 Oct 2002 04:22:34 -0000	1.18
+++ tcp.h	23 Dec 2002 16:52:30 -0000
@@ -89,6 +89,7 @@
 #define	TCPOPT_SACK_PERMITTED	4		/* Experimental */
 #define	   TCPOLEN_SACK_PERMITTED	2
 #define	TCPOPT_SACK		5		/* Experimental */
+#define	   TCPOLEN_SACK		8		/* 2 * sizeof(tcp_seq) */
 #define	TCPOPT_TIMESTAMP	8
 #define	   TCPOLEN_TIMESTAMP		10
 #define	   TCPOLEN_TSTAMP_APPA		(TCPOLEN_TIMESTAMP+2) /* appendix A */
@@ -103,6 +104,22 @@
 #define	TCPOPT_CC_HDR(ccopt)		\
     (TCPOPT_NOP<<24|TCPOPT_NOP<<16|(ccopt)<<8|TCPOLEN_CC)
 
+#ifdef TCP_SACK
+/*
+ * TCP SACK option definitions.
+ */
+#define	TCPOPT_SACK_PERMIT_HDR \
+    (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8| \
+    TCPOLEN_SACK_PERMITTED)
+
+#define	TCPOPT_SACK_HDR	(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+
+/* Maximum number of SACK blocks stored on the sender side. */
+#define	MAX_SACK_BLKS	6
+/* Maximum number of SACK blocks sent in any single segment. */
+#define	TCP_MAX_SACK	3
+#endif /* TCP_SACK */
+
 /*
  * Default maximum segment size for TCP.
  * With an IP MSS of 576, this is 536,
@@ -124,6 +141,7 @@
 #define	TCP_MAX_WINSHIFT	14	/* maximum window shift */
 
+/* XXX: should be made local to the TCP SACK code. */
 #define	TCP_MAXBURST		4	/* maximum segments in a burst */
 
 #define	TCP_MAXHLEN	(0xf<<2)	/* max length of header in bytes */
 
@@ -139,6 +157,10 @@
 #define	TCP_MAXSEG	0x02	/* set maximum segment size */
 #define	TCP_NOPUSH	0x04	/* don't push last block of write */
 #define	TCP_NOOPT	0x08	/* don't use TCP options */
+#endif
+
+#ifdef TCP_SACK
+#define	TCP_NOSACK	0x300	/* disable SACK (if enabled by default) */
 #endif
 
 #endif /* !_NETINET_TCP_H_ */
Index: tcp_input.c
===================================================================
RCS file: /home/hiten/ncvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.185
diff -u -r1.185 tcp_input.c
--- tcp_input.c	20 Dec 2002 11:16:52 -0000	1.185
+++ tcp_input.c	23 Dec 2002 20:36:08 -0000
@@ -99,7 +99,7 @@
 
 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
 
-static const int tcprexmtthresh = 3;
+const int tcprexmtthresh = 3;
 tcp_cc	tcp_ccgen;
 
 struct tcpstat tcpstat;
@@ -925,6 +925,13 @@
 		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
 	}
+
+#ifdef TCP_SACK
+	if (!tp->t_sack_disable) {
+		tp->rcv_laststart = th->th_seq;	/* last received segment */
+		tp->rcv_lastend = th->th_seq + tlen;
+	}
+#endif
 
 	/*
 	 * Header prediction: check for the two common cases
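The TCPOPT_SACK_PERMIT_HDR and TCPOPT_SACK_HDR constants added to tcp.h above pack two NOP options (kind 1) in front of the SACK-permitted option (kind 4, length 2) and the SACK option (kind 5, length 8*n+2) so the option block stays 32-bit aligned, per RFC 2018.  The standalone program below is an illustration only, not part of the diff; it shows the on-the-wire byte layout tcp_output() produces when it stores one of these constants through htonl():

/* Illustration only -- not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define	TCPOPT_NOP		1
#define	TCPOPT_SACK_PERMITTED	4
#define	TCPOLEN_SACK_PERMITTED	2
#define	TCPOPT_SACK_PERMIT_HDR \
    (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8| \
    TCPOLEN_SACK_PERMITTED)

int
main(void)
{
	unsigned char opt[4];
	unsigned int hdr = htonl(TCPOPT_SACK_PERMIT_HDR);

	memcpy(opt, &hdr, sizeof(hdr));
	/* Prints "01 01 04 02": NOP, NOP, SACK-permitted, length 2. */
	printf("%02x %02x %02x %02x\n", opt[0], opt[1], opt[2], opt[3]);
	return (0);
}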
Index: tcp_output.c
===================================================================
RCS file: /home/hiten/ncvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.73
diff -u -r1.73 tcp_output.c
--- tcp_output.c	16 Oct 2002 19:16:33 -0000	1.73
+++ tcp_output.c	23 Dec 2002 20:36:08 -0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995, 1998
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,6 +30,9 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
+ * TCP Selective Acknowledgment (SACK) support:
+ * Copyright (c) 2002 Hiten Pandya.  All rights reserved.
+ *
  * @(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  * $FreeBSD$
  */
@@ -105,6 +108,108 @@
 int tcp_do_newreno = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
 	0, "Enable NewReno Algorithms");
+
+#ifdef TCP_SACK
+extern const int tcprexmtthresh;
+
+#if defined(TCP_SACK_DEBUG)
+/*
+ * Print the list of holes currently seen by the sender.
+ */
+void
+sack_print_holes(struct tcpcb *tp)
+{
+	struct sackhole *p = tp->t_snd_holes;
+
+	if (p == NULL)
+		return;
+
+	printf("(TCP_SACK) hole report: start - end dups rxmit\n");
+	while (p != NULL) {
+		printf("%x--%x d %d r %x\n", p->sh_start, p->sh_end,
+		    p->sh_dups, p->sh_rxmit);
+		p = p->next;
+	}
+	printf("\n");
+}
+#endif
+
+/*
+ * TCP output routine, specific to SACK.  Returns a pointer to a
+ * struct sackhole if there is a pending retransmission, NULL
+ * otherwise.
+ */
+struct sackhole *
+tcp_sack_output(struct tcpcb *tp)
+{
+	struct sackhole *p;
+
+	if (tp->t_sack_disable)
+		return (NULL);
+
+	p = tp->t_snd_holes;
+	while (p) {
+#ifndef TCP_FACK
+		if (p->sh_dups >= tcprexmtthresh &&
+		    SEQ_LT(p->sh_rxmit, p->sh_end)) {
+#else
+		/*
+		 * In FACK, if p->sh_dups is less than tcprexmtthresh, but
+		 * t_snd_fack advances by more than tcprexmtthresh *
+		 * tp->t_maxseg, tcp_input() will try a fast retransmit
+		 * (NewReno).  This forces output.
+		 */
+		if ((p->sh_dups >= tcprexmtthresh ||
+		    tp->t_dupacks == tcprexmtthresh) &&
+		    SEQ_LT(p->sh_rxmit, p->sh_end)) {
+#endif /* TCP_FACK */
+			if (SEQ_LT(p->sh_rxmit, tp->snd_una)) {
+				/* Stale hole; it has already been ACKed. */
+				p = p->next;
+				continue;
+			}
+#ifdef TCP_SACK_DEBUG
+			sack_print_holes(tp);
+#endif
+			return (p);
+		}
+		p = p->next;
+	}
+	return (NULL);
+}
+
+/*
+ * After a timeout the SACK list may be rebuilt.  This SACK information
+ * should be used to avoid retransmitting SACK'ed data.  This routine
+ * traverses the SACK list to see if snd_nxt should be moved forward.
+ */
+void
+tcp_sack_adjust(struct tcpcb *tp)
+{
+	int i;
+
+	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
+		return;		/* already beyond any SACK'ed blocks */
+
+	/*
+	 * Two cases in which we want to advance snd_nxt:
+	 * 1) it lies between the end of one hole and the start of another;
+	 * 2) it lies between the end of the last hole and rcv_lastsack.
+	 */
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		if (SEQ_LT(tp->snd_nxt, tp->sackblks[i].blk_start))
+			break;
+		if (SEQ_LEQ(tp->sackblks[i].blk_end, tp->snd_nxt))
+			continue;
+		if (tp->sackblks[i].blk_start == 0 &&
+		    tp->sackblks[i].blk_end == 0)
+			continue;
+		/* snd_nxt must be in the middle of a block of SACK'ed data. */
+		tp->snd_nxt = tp->sackblks[i].blk_end;
+		break;
+	}
+}
+#endif /* TCP_SACK */
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
@@ -130,6 +235,11 @@
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
+
+#ifdef TCP_SACK
+	int i, sack_rxmit = 0;
+	struct sackhole *sackp = NULL;
+#endif
 
@@ -173,6 +283,17 @@
 	}
 again:
 	sendalot = 0;
+
+#ifdef TCP_SACK
+	sack_rxmit = 0;
+	sackp = NULL;
+
+	/*
+	 * If we've recently taken a retransmit timeout, snd_max will be
+	 * greater than snd_nxt.  There may be SACK information that allows
+	 * us to avoid resending already delivered data.  Adjust snd_nxt
+	 * accordingly.
+	 */
+	if (!tp->t_sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+		tcp_sack_adjust(tp);
+#endif
+
 	off = tp->snd_nxt - tp->snd_una;
 	win = min(tp->snd_wnd, tp->snd_cwnd);
 	win = min(win, tp->snd_bwnd);
@@ -193,6 +314,35 @@
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
+
+#ifdef TCP_SACK
+	/*
+	 * Send any SACK-generated retransmissions.  If we are explicitly
+	 * trying to send new data (sendalot is set), bypass this check.
+	 *
+	 * If we retransmit in fast recovery mode, decrement snd_cwnd,
+	 * since we are replacing a (future) new transmission with a
+	 * retransmission now, and snd_cwnd was previously incremented
+	 * in tcp_input().
+	 */
+	if (!tp->t_sack_disable && !sendalot) {
+		if ((sackp = tcp_sack_output(tp)) != NULL) {
+			off = sackp->sh_rxmit - tp->snd_una;
+			sack_rxmit = 1;
+
+#if 0
+			/* Coalesce holes into a single retransmission. */
+#endif
+#ifndef TCP_FACK
+			/* In FACK, hold snd_cwnd constant during recovery. */
+			if (SEQ_LT(tp->snd_una, tp->snd_last))
+				tp->snd_cwnd -= tp->t_maxseg;
+#endif
+		}
+	}
+#endif /* TCP_SACK */
+
 	if (tp->t_force) {
 		if (win == 0) {
 			/*
@@ -232,7 +382,25 @@
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
	 */
+#ifdef TCP_SACK
+	if (!sack_rxmit) {
+#endif
 	len = (long)ulmin(so->so_snd.sb_cc, win) - off;
+
+#if defined(TCP_FACK)
+	/*
+	 * If we are in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una))
+	 * and the amount of outstanding data (t_snd_awnd) is >= snd_cwnd,
+	 * do not send data (as in zero-window conditions).
+	 */
+	if (!tp->t_sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) &&
+	    tp->t_snd_awnd >= tp->snd_cwnd)
+		len = 0;
+#endif /* TCP_FACK */
+#ifdef TCP_SACK
+	} else {
+		/* A SACK retransmission is bounded by the hole being filled. */
+		len = (long)ulmin(sackp->sh_end - sackp->sh_rxmit, win);
+	}
+#endif
 
 	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
 		taop = &tao_noncached;
@@ -279,10 +447,11 @@
 			len = 0;
 			if (win == 0) {
 				callout_stop(tp->tt_rexmt);
 				tp->t_rxtshift = 0;
-				tp->snd_nxt = tp->snd_una;
+				if (!sack_rxmit)
+					tp->snd_nxt = tp->snd_una;
 				if (!callout_active(tp->tt_persist))
 					tcp_setpersist(tp);
 			}
 		}
@@ -334,6 +503,10 @@
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
+#ifdef TCP_SACK
+		if (sack_rxmit)
+			goto send;
+#endif
 	}
 
 	/*
@@ -376,6 +549,22 @@
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
+#ifdef TCP_SACK
+	/*
+	 * With SACK, tcp_output() may fail to send a segment after the
+	 * retransmission timer has been turned off.  Make sure that the
+	 * retransmission timer is set.
+	 */
+	if (!tp->t_sack_disable && SEQ_GT(tp->snd_max, tp->snd_una) &&
+	    !callout_active(tp->tt_rexmt) &&
+	    !callout_active(tp->tt_persist)) {
+		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+		    tcp_timer_rexmt, tp);
+
+		/* No reason to send; just return. */
+		return (0);
+	}
+#endif /* TCP_SACK */
 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
@@ -437,6 +626,21 @@
 			(void)memcpy(opt + 2, &mss, sizeof(mss));
 			optlen = TCPOLEN_MAXSEG;
 
+#ifdef TCP_SACK
+			/*
+			 * If this is the first SYN of the connection (not a
+			 * SYN ACK), include the SACK_PERMIT_HDR option.  If
+			 * this is a SYN ACK, include it only if the peer has
+			 * already done so.
+			 */
+			if (!tp->t_sack_disable && ((flags & TH_ACK) == 0 ||
+			    (tp->t_flags & TF_SACK_PERMIT))) {
+				*((u_int32_t *)(opt + optlen)) =
+				    htonl(TCPOPT_SACK_PERMIT_HDR);
+				optlen += 4;
+			}
+#endif
+
 			if ((tp->t_flags & TF_REQ_SCALE) &&
 			    ((flags & TH_ACK) == 0 ||
 			    (tp->t_flags & TF_RCVD_SCALE))) {
@@ -468,6 +672,34 @@
 		optlen += TCPOLEN_TSTAMP_APPA;
 	}
 
+#ifdef TCP_SACK
+	/*
+	 * Send SACKs if necessary.  This should be the _last_ option
+	 * processed.  At most TCP_MAX_SACK (three) blocks are sent in
+	 * any one segment.
+	 */
+	if (!tp->t_sack_disable && tp->t_state == TCPS_ESTABLISHED &&
+	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+	    tp->rcv_numsacks) {
+		u_int32_t *lp = (u_int32_t *)(opt + optlen);
+		u_int32_t *olp = lp++;
+		int count = 0;	/* actual number of SACK blocks inserted */
+		int maxsack = (TCP_MAXOLEN - (optlen + 4)) / TCPOLEN_SACK;
+
+		maxsack = min(maxsack, TCP_MAX_SACK);
+		for (i = 0; i < tp->rcv_numsacks && count < maxsack; i++) {
+			struct sackblk sack = tp->sackblks[i];
+
+			if (sack.blk_start == 0 && sack.blk_end == 0)
+				continue;
+			*lp++ = htonl(sack.blk_start);
+			*lp++ = htonl(sack.blk_end);
+			count++;
+		}
+		*olp = htonl(TCPOPT_SACK_HDR | (TCPOLEN_SACK * count + 2));
+		optlen += TCPOLEN_SACK * count + 4;	/* including NOPs */
+	}
+#endif /* TCP_SACK */
+
 	/*
 	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
 	 * options are allowed (!TF_NOOPT) and it's not a RST.
@@ -713,6 +945,23 @@
 		th->th_seq = htonl(tp->snd_nxt);
 	else
 		th->th_seq = htonl(tp->snd_max);
+#ifdef TCP_SACK
+	if (sack_rxmit) {
+		/*
+		 * If sendalot was turned on (due to option stuffing), turn
+		 * it off.  Properly set the th_seq field.  Advance the
+		 * retransmit pointer by len.
+		 */
+		if (sendalot)
+			sendalot = 0;
+		th->th_seq = htonl(sackp->sh_rxmit);
+		sackp->sh_rxmit += len;
+#if defined(TCP_FACK)
+		tp->t_retran_data += len;
+#endif
+	}
+#endif
+
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
@@ -801,6 +1050,15 @@
 			tp->t_flags |= TF_SENTFIN;
 		}
 	}
+
+#ifdef TCP_SACK
+	if (!tp->t_sack_disable && sack_rxmit &&
+	    sackp->sh_rxmit != tp->snd_nxt)
+		goto timer;
+#endif
+
 	tp->snd_nxt += len;
 	if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 		tp->snd_max = tp->snd_nxt;
@@ -823,6 +1081,19 @@
 	 * Initialize shift counter which is used for backoff
 	 * of retransmit time.
 	 */
+#ifdef TCP_SACK
+timer:
+	if (!tp->t_sack_disable && sack_rxmit &&
+	    !callout_active(tp->tt_rexmt) &&
+	    tp->snd_nxt != tp->snd_una) {
+		if (callout_active(tp->tt_persist)) {
+			callout_stop(tp->tt_persist);
+			tp->t_rxtshift = 0;
+		}
+		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+		    tcp_timer_rexmt, tp);
+	}
+#endif
 	if (!callout_active(tp->tt_rexmt) && tp->snd_nxt != tp->snd_una) {
 		if (callout_active(tp->tt_persist)) {
@@ -916,6 +1188,15 @@
 		error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
 		    (so->so_options & SO_DONTROUTE), 0, tp->t_inpcb);
 	}
+
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	/*
+	 * Update t_snd_awnd to reflect the new data that was sent.
+	 */
+	tp->t_snd_awnd = tcp_seq_subtract(tp->snd_max, tp->t_snd_fack) +
+	    tp->t_retran_data;
+#endif
+
 	if (error) {
 		/*
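tcp_sack_adjust() and tcp_sack_output() above rely on the SEQ_LT()/SEQ_GEQ() comparison macros from <netinet/tcp_seq.h>, and the FACK branch calls tcp_seq_subtract(), whose definition is not included in this diff (only its declaration appears in the tcp_var.h hunks below).  Both depend on 32-bit sequence numbers wrapping modulo 2^32.  The standalone snippet below is an illustration only; the tcp_seq_subtract() shown here is a hypothetical sketch of the signed modular difference the FACK code appears to expect (mirroring the OpenBSD helper of the same name), not the author's implementation:

/* Illustration only -- not part of the patch. */
#include <stdio.h>

typedef unsigned int tcp_seq;

/* Same definitions as <netinet/tcp_seq.h>. */
#define	SEQ_LT(a, b)	((int)((a)-(b)) < 0)
#define	SEQ_GEQ(a, b)	((int)((a)-(b)) >= 0)

/*
 * Hypothetical sketch of tcp_seq_subtract(): a signed difference in
 * 32-bit sequence space.
 */
static long
tcp_seq_subtract(unsigned long a, unsigned long b)
{
	return ((long)(int)((tcp_seq)a - (tcp_seq)b));
}

int
main(void)
{
	tcp_seq before_wrap = 0xfffffff0U;	/* just below the wrap point */
	tcp_seq after_wrap = 0x00000010U;	/* just past the wrap point */

	/* Both print 1: after_wrap is "later" despite being numerically smaller. */
	printf("%d %d\n", SEQ_LT(before_wrap, after_wrap),
	    SEQ_GEQ(after_wrap, before_wrap));
	/* Prints 32: the distance across the wrap. */
	printf("%ld\n", tcp_seq_subtract(after_wrap, before_wrap));
	return (0);
}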
extentions"); +#endif + static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); @@ -560,6 +566,9 @@ #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ +#ifdef TCP_SACK + struct sackhole *sack_p, *sack_q; +#endif it = (struct inp_tp *)inp; tp = &it->tcb; @@ -571,6 +580,13 @@ #endif /* INET6 */ tcp_mssdflt; +#ifdef TCP_SACK + /* + * Enable Selective Ack. depending on what + * sysctl `net.inet.tcp.sack' says. + */ + tp->t_sack_disable = tcp_do_sack ? 0 : 1; +#endif /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); callout_init(tp->tt_persist = &it->inp_tp_persist, 0); @@ -770,6 +786,16 @@ m_freem(q->tqe_m); FREE(q, M_TSEGQ); } +#ifdef TCP_SACK + /* XXXSACK Free SACK holes. */ + sack_q = sack_q = tp->t_snd_holes; + while (sack_p != 0) { + sack_q = sack_p->next; + FREE(sack_p, M_PCB); + sack_p = sack_q; + } +#endif + inp->inp_ppcb = NULL; soisdisconnected(so); #ifdef INET6 @@ -797,7 +823,7 @@ * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially - * usefull. + * useful. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { Index: tcp_timer.c =================================================================== RCS file: /home/hiten/ncvs/src/sys/netinet/tcp_timer.c,v retrieving revision 1.54 diff -u -r1.54 tcp_timer.c --- tcp_timer.c 5 Sep 2002 15:33:30 -0000 1.54 +++ tcp_timer.c 23 Dec 2002 20:36:08 -0000 @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -206,6 +207,9 @@ ostate = tp->t_state; #endif +#ifdef TCP_SACK + struct sackhole *p, *q; +#endif s = splnet(); INP_INFO_WLOCK(&tcbinfo); inp = tp->t_inpcb; @@ -217,6 +221,22 @@ return; } callout_deactivate(tp->tt_2msl); + +#ifdef TCP_SACK + q = p = tp->t_snd_holes; + while (p != 0) { + q = p->next; + free(p, M_PCB); + P = q; + } + + tp->t_snd_holes = 0; +#if defined(TCP_FACK) + tp->t_snd_fack = tp->snd_una; + tp->t_retran_data = 0; + tp->t_snd_awnd = 0; +#endif +#endif /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle @@ -474,6 +494,9 @@ tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; +#ifdef TCP_SACK + tp->snd_last = tp->snd_una; +#endif /* * Note: We overload snd_recover to function also as the * snd_last variable described in RFC 2582 Index: tcp_usrreq.c =================================================================== RCS file: /home/hiten/ncvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.84 diff -u -r1.84 tcp_usrreq.c --- tcp_usrreq.c 24 Oct 2002 02:02:34 -0000 1.84 +++ tcp_usrreq.c 23 Dec 2002 20:36:08 -0000 @@ -898,6 +898,16 @@ tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); +#ifdef TCP_SACK + tp->snd_last = tp->snd_una; + +#if defined(TCP_FACK) + tp->t_snd_fack = tp->snd_una; + tp->t_retran_data = 0; + tp->t_snd_awnd = 0; +#endif +#endif + /* * Generate a CC value for this connection and * check whether CC or CCnew should be used. 
@@ -1121,6 +1130,11 @@
 		case TCP_NOPUSH:
 			optval = tp->t_flags & TF_NOPUSH;
 			break;
+#ifdef TCP_SACK
+		case TCP_NOSACK:
+			optval = tp->t_sack_disable;
+			break;
+#endif
 		default:
 			error = ENOPROTOOPT;
 			break;
Index: tcp_var.h
===================================================================
RCS file: /home/hiten/ncvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.83
diff -u -r1.83 tcp_var.h
--- tcp_var.h	17 Aug 2002 18:26:02 -0000	1.83
+++ tcp_var.h	23 Dec 2002 16:39:54 -0000
@@ -46,6 +46,30 @@
 extern int	tcp_do_rfc1323;
 extern int	tcp_do_rfc1644;
 
+#ifdef TCP_SACK
+/*
+ * TODO:
+ * - Use the sys/queue.h macros instead of in-house list manipulation.
+ * - Once the SACK code has settled, move the most frequently used
+ *   fields to the top of the respective structures.
+ */
+typedef struct sackblk {
+	tcp_seq	blk_start;	/* start sequence number of SACK block */
+	tcp_seq	blk_end;	/* end sequence number of SACK block */
+} sackblk_t;
+
+typedef struct sackhole {
+	struct sackhole *next;	/* next hole in the (sorted) list */
+	int	sh_dups;	/* number of duplicate ACKs for this hole */
+	tcp_seq	sh_start;	/* start sequence number of this hole */
+	tcp_seq	sh_end;		/* end sequence number of this hole */
+	tcp_seq	sh_rxmit;	/* next sequence number to retransmit
+				 * from this hole */
+} sackhole_t;
+#endif
+
 /* TCP segment queue entry */
 struct tseg_qent {
 	LIST_ENTRY(tseg_qent) tqe_q;
@@ -121,6 +145,13 @@
 	tcp_seq	rcv_adv;		/* advertised window */
 	u_long	rcv_wnd;		/* receive window */
 	tcp_seq	rcv_up;			/* receive urgent pointer */
+#ifdef TCP_SACK
+	tcp_seq	rcv_laststart;		/* start of last received segment */
+	tcp_seq	rcv_lastend;		/* end of last received segment */
+	tcp_seq	rcv_lastsack;		/* last seq. number (+1) SACK'd
+					 * by the receiver */
+	int	rcv_numsacks;		/* # of distinct SACK blocks present */
+	sackblk_t sackblks[MAX_SACK_BLKS]; /* seq. numbers of SACK blocks */
+#endif
 
 	u_long	snd_wnd;		/* send window */
 	u_long	snd_cwnd;		/* congestion-controlled window */
@@ -175,6 +206,27 @@
 	u_long	snd_cwnd_prev;		/* cwnd prior to retransmit */
 	u_long	snd_ssthresh_prev;	/* ssthresh prior to retransmit */
 	u_long	t_badrxtwin;		/* window for retransmit recovery */
+#ifdef TCP_SACK
+	/*
+	 * XXX: these fields are simply appended at the end; they have not
+	 * been placed with cache-line efficiency in mind.
+	 */
+	int	t_sack_disable;		/* disable SACK for this connection */
+	int	t_snd_numholes;		/* number of holes seen by sender */
+	sackhole_t *t_snd_holes;	/* linked list of holes (sorted) */
+	tcp_seq	snd_last;		/* start of the last cwnd reduction;
+					 * the SACK analogue of NewReno's
+					 * snd_recover (RFC 2582).  XXX the
+					 * two could probably be merged. */
+#if defined(TCP_FACK)
+	tcp_seq	t_snd_fack;		/* FACK congestion control */
+	u_long	t_snd_awnd;		/* snd_nxt - snd_fack +
+					 * retransmitted data */
+	int	t_retran_data;		/* amount of outstanding
+					 * retransmitted data */
+#endif
+#endif /* TCP_SACK */
 };
 
 /*
@@ -409,6 +461,7 @@
 #define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
 #define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
-#define	TCPCTL_MAXID		14
+#define	TCPCTL_DO_SACK		14	/* use Selective Acknowledgements */
+#define	TCPCTL_MAXID		15
 
 #define TCPCTL_NAMES { \
 	{ 0, 0 }, \
@@ -425,6 +478,7 @@
 	{ "pcblist", CTLTYPE_STRUCT }, \
 	{ "delacktime", CTLTYPE_INT }, \
 	{ "v6mssdflt", CTLTYPE_INT }, \
+	{ "sack", CTLTYPE_INT }, \
 }
 
 
@@ -439,6 +493,9 @@
 extern	int tcp_mssdflt;	/* XXX */
 extern	int tcp_delack_enabled;
 extern	int tcp_do_newreno;
+#ifdef TCP_SACK
+extern	int tcp_do_sack;	/* net.inet.tcp.sack sysctl */
+#endif
 extern	int path_mtu_discovery;
 extern	int ss_fltsz;
 extern	int ss_fltsz_local;
@@ -480,6 +537,21 @@
 	 tcp_timers(struct tcpcb *, int);
 void	 tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int);
 void	 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
+
+#ifdef TCP_SACK
+int	 tcp_sack_option(struct tcpcb *, struct tcpiphdr *, u_char *, int);
+void	 tcp_update_sack_list(struct tcpcb *tp);
+void	 tcp_del_sackholes(struct tcpcb *, struct tcpiphdr *);
+void	 tcp_clean_sackreport(struct tcpcb *tp);
+void	 tcp_sack_adjust(struct tcpcb *tp);
+sackhole_t *tcp_sack_output(struct tcpcb *tp);
+int	 tcp_sack_partialack(struct tcpcb *, struct tcpiphdr *);
+#ifdef TCP_SACK_DEBUG
+void	 sack_print_holes(struct tcpcb *tp);
+#endif
+u_long	 tcp_seq_subtract(u_long, u_long);
+#endif /* TCP_SACK */
+
 void	 syncache_init(void);
 void	 syncache_unreach(struct in_conninfo *, struct tcphdr *);
 int	 syncache_expand(struct in_conninfo *, struct tcphdr *,
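With the patch applied and a kernel built with options TCP_SACK, the new knobs are visible from userland: net.inet.tcp.sack selects the system-wide default, and the TCP_NOSACK socket option (only the getsockopt side is added by this diff) reports whether SACK is disabled on a particular connection.  A minimal sketch, assuming the patched headers and kernel:

/* Illustration only -- assumes the patch is applied and options TCP_SACK. */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <unistd.h>

#ifndef TCP_NOSACK			/* value introduced by the patched tcp.h */
#define	TCP_NOSACK	0x300
#endif

int
main(void)
{
	int dosack, nosack, s;
	size_t len = sizeof(dosack);
	socklen_t optlen = sizeof(nosack);

	/* Global default, from the new net.inet.tcp.sack sysctl. */
	if (sysctlbyname("net.inet.tcp.sack", &dosack, &len, NULL, 0) == 0)
		printf("net.inet.tcp.sack = %d\n", dosack);

	/* Per-connection state, from the new TCP_NOSACK option. */
	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s != -1 &&
	    getsockopt(s, IPPROTO_TCP, TCP_NOSACK, &nosack, &optlen) == 0)
		printf("TCP_NOSACK = %d\n", nosack);
	if (s != -1)
		close(s);
	return (0);
}

Note that the patched tcp.h only defines TCP_NOSACK under the kernel-only TCP_SACK option, so userland code currently has to provide the value itself; exposing the define unconditionally, like TCP_NOOPT and friends, might be worth considering.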