Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c	(revision 221219)
+++ sys/netinet/tcp_input.c	(working copy)
@@ -161,6 +161,11 @@
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
 
+VNET_DEFINE(int, tcp_do_rfc5682) = 0;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc5682, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_rfc5682), 0,
+    "Enable RFC 5682 (Forward RTO-Recovery: F-RTO)");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
 
 VNET_DEFINE(int, tcp_do_ecn) = 0;
@@ -260,6 +265,254 @@
 	}
 }
 
+/*
+ * F-RTO (RFC 5682) step 2 helper: try to send up to two previously unsent
+ * segments.  snd_nxt and snd_cwnd are overridden only for the duration of
+ * the tcp_output() call and are restored before returning.
+ *
+ * Returns 0 once a transmission has been attempted, or -1 when so_snd.sb_cc
+ * or snd_wnd is zero, in which case the caller falls back to conventional
+ * RTO recovery.  The th argument is currently unused.
+ *
+ * NOTE(review): sb_cc presumably also counts sent-but-unacknowledged bytes,
+ * so the sb_cc == 0 test may never detect "no new data" here -- confirm.
+ */
+static int
+tcp_frto_send2mss(struct tcpcb *tp, struct tcphdr *th)
+{
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	u_long oldcwnd;
+	tcp_seq onxt;
+
+	INP_WLOCK_ASSERT(inp);
+
+	/*
+	 * If the TCP sender does not have any new data to send, OR the
+	 * advertised window prohibits new transmissions, the recommended
+	 * action is to skip step 3 of this algorithm and continue with
+	 * slow-start retransmissions, following the conventional RTO recovery
+	 * algorithm. However, alternative ways of handling the window-limited
+	 * cases that could result in better performance are discussed in
+	 * Appendix A.
+	 */
+	if (so->so_snd.sb_cc == 0 || tp->snd_wnd == 0) {
+		/* skip step 3 OR do alternative ways. */
+		return (-1);
+	}
+	oldcwnd = tp->snd_cwnd;
+	onxt = tp->snd_nxt;
+	/*
+	 * transmit up to two new (previously unsent) segments and enter
+	 * step 3 of this algorithm. If the TCP sender does not have enough
+	 * unsent data, it can send only one segment.
+	 */
+	tp->snd_nxt = tp->snd_max;
+	tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + 2 * tp->t_maxseg;
+	/*
+	 * XXXWG in addition, the TCP sender MAY override the Nagle algorithm.
+	 */
+	tp->t_flags |= TF_ACKNOW;
+	(void)tcp_output(tp);
+	tp->snd_nxt = onxt;
+	tp->snd_cwnd = oldcwnd;
+	return (0);
+}
+
+/*
+ * Advance the F-RTO (RFC 5682) state machine kept in tp->t_frtoflags for one
+ * incoming ACK; type is expected to be CC_ACK or CC_DUPACK.  The
+ * SACK-Enhanced variant runs when TF_SACK_PERMIT is set, the Basic variant
+ * otherwise.  Step 2 and step 3 are evaluated on successive ACKs.  The
+ * outcome is either TCP_FRTO_SPUR_TO() (timeout declared spurious) or
+ * TCP_FRTO_FAIL() (revert to conventional RTO recovery).
+ */
+static void inline
+tcp_frto_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
+{
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	/* SACK-Enhanced F-RTO */
+
+	if (tp->t_flags & TF_SACK_PERMIT) {
+		if ((tp->t_frtoflags & FRTO_INSTEP1) != 0 &&
+		    (tp->t_frtoflags & FRTO_CANGOSTEP2) != 0) {
+			/* SACK-Enhanced F-RTO - step 2 */
+			if (type == CC_DUPACK) {
+				/*
+				 * 2) If duplicate ACKs arrive before the
+				 * cumulative acknowledgment for retransmitted
+				 * data, adjust the scoreboard according to
+				 * the incoming SACK information. Stay in
+				 * step 2 and wait for the next new
+				 * acknowledgment.
+				 */
+				return;
+			}
+			KASSERT(type == CC_ACK,
+			    ("%s: expected ACK (but %d)", __func__, type));
+
+			/*
+			 * 2) When a new acknowledgment arrives, set variable
+			 * "RecoveryPoint" to indicate the highest sequence
+			 * number transmitted so far.
+			 */
+			tp->t_frtoflags |= FRTO_INSTEP2;
+			tp->snd_recover = tp->snd_max;
+
+			/*
+			 * 2a) If the Cumulative Acknowledgement field covers
+			 * "RecoveryPoint" but not more than
+			 * "RecoveryPoint", revert to the conventional RTO
+			 * recovery and set the congestion window to no
+			 * more than 2 * MSS, like a regular TCP would do.
+			 * Do not enter step 3 of this algorithm.
+			 */
+			if (th->th_ack == tp->snd_recover) {
+				tp->snd_cwnd = 2 * tp->t_maxseg;
+				TCP_FRTO_FAIL(tp);
+				return;
+			/*
+			 * 2b) If the Cumulative Acknowledgment field does not
+			 * cover "RecoveryPoint" but is larger than
+			 * SND.UNA.
+			 */
+			} else if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+			    SEQ_GT(th->th_ack, tp->snd_una) &&
+			    !tcp_frto_send2mss(tp, th)) {
+				/*
+				 * Step 2 is complete.  Clear the step 2 gate
+				 * and return so that step 3 is evaluated on
+				 * the next ACK rather than on this one.
+				 */
+				tp->t_frtoflags &= ~FRTO_CANGOSTEP2;
+				tp->t_frtoflags |= FRTO_CANGOSTEP3;
+				tp->t_frtofack = tp->snd_fack;
+				return;
+			} else {
+				tp->snd_cwnd = 2 * tp->t_maxseg;
+				TCP_FRTO_FAIL(tp);
+			}
+		}
+		if ((tp->t_frtoflags & FRTO_INSTEP2) != 0 &&
+		    (tp->t_frtoflags & FRTO_CANGOSTEP3) != 0) {
+			struct sackhole *q = TAILQ_FIRST(&tp->snd_holes);
+			/* the Cumulative Acknowledgment */
+			tcp_seq cack = SEQ_MAX(th->th_ack, tp->snd_una);
+
+			/*
+			 * 3a) If the Cumulative Acknowledgment field or
+			 * the SACK information covers more than
+			 * "RecoveryPoint".
+			 */
+			if (SEQ_GEQ(cack, tp->snd_recover) ||
+			    (q != NULL && SEQ_GT(q->start, tp->snd_recover)))
+				goto revert_frto;
+			/*
+			 * 3a) take this branch also when the
+			 * acknowledgment is a duplicate ACK and it does not
+			 * acknowledge any new, previously unacknowledged
+			 * data below "RecoveryPoint" in the SACK information.
+			 *
+			 * XXXWG FIXME At this implementation there is a
+			 * limitation that it doesn't traverse all SACK
+			 * information below "RecoveryPoint". So needs to
+			 * implement a simple way to check whether SACK
+			 * information is updated or not after receiving
+			 * this ACK.
+			 */
+			if (type == CC_DUPACK &&
+			    SEQ_LEQ(tp->snd_fack, tp->snd_recover) &&
+			    SEQ_LEQ(tp->snd_fack, tp->t_frtofack))
+				goto revert_frto;
+			/*
+			 * 3b) If the Cumulative Acknowledgment field or a
+			 * SACK information in the ACK does not cover more than
+			 * "RecoveryPoint" AND it acknowledges data that was
+			 * not acknowledged earlier, declare the timeout
+			 * spurious.
+			 */
+			if (type == CC_ACK &&
+			    (SEQ_LEQ(cack, tp->snd_recover) ||
+			    (q != NULL &&
+			    SEQ_LT(q->start, tp->snd_recover)))) {
+				/* SPUR_TO: the timeout is spurious */
+				TCP_FRTO_SPUR_TO(tp, th);
+				tp->snd_recover = tp->snd_una;
+			} else
+				goto revert_frto;
+		}
+		return;
+	}
+
+	/*
+	 * Basic F-RTO algorithm
+	 */
+
+	if ((tp->t_frtoflags & FRTO_INSTEP1) != 0 &&
+	    (tp->t_frtoflags & FRTO_CANGOSTEP2) != 0) {
+		/* Basic F-RTO - step 2 */
+		tp->t_frtoflags |= FRTO_INSTEP2;
+		tp->snd_recover = tp->snd_max;
+
+		if (type == CC_DUPACK ||
+		    /*
+		     * The acknowledgment field covers "recover" but not
+		     * more than "recover"
+		     */
+		    th->th_ack == tp->snd_recover ||
+		    /*
+		     * acknowledgement does not acknowledge all of
+		     * the data that was retransmitted in step 1.
+		     */
+		    SEQ_LEQ(th->th_ack, tp->snd_una))
+			TCP_FRTO_FAIL(tp);
+		else if (type == CC_ACK &&
+		    /*
+		     * Acknowledgement field does not cover
+		     * "recover"
+		     */
+		    SEQ_LT(th->th_ack, tp->snd_recover) &&
+		    !tcp_frto_send2mss(tp, th)) {
+			/*
+			 * Step 2 is complete.  Clear the step 2 gate and
+			 * return so that step 3 is evaluated on the next
+			 * ACK rather than on this one.
+			 */
+			tp->t_frtoflags &= ~FRTO_CANGOSTEP2;
+			tp->t_frtoflags |= FRTO_CANGOSTEP3;
+			return;
+		} else
+			TCP_FRTO_FAIL(tp);
+	}
+	if ((tp->t_frtoflags & FRTO_INSTEP2) != 0 &&
+	    (tp->t_frtoflags & FRTO_CANGOSTEP3) != 0) {
+		/* Basic F-RTO - step 3 */
+		if (type != CC_ACK) {
+			/*
+			 * set the congestion window to no more
+			 * than 3 * MSS (where MSS indicates Maximum
+			 * Segment Size), and continue with the
+			 * slow-start algorithm retransmitting
+			 * unacknowledged segments. The congestion
+			 * window can be set to 3 * MSS, because two
+			 * round-trip times have elapsed since the RTO,
+			 * and a conventional TCP sender would have
+			 * increased cwnd to 3 during the same time.
+			 */
+			goto revert_frto;
+		}
+		/* SPUR_TO: the timeout is spurious */
+		TCP_FRTO_SPUR_TO(tp, th);
+		tp->snd_recover = tp->snd_una;
+	}
+	return;
+revert_frto:
+	tp->snd_cwnd = 3 * tp->t_maxseg;
+	TCP_FRTO_FAIL(tp);
+}
+
 /*
  * CC wrapper hook functions
  */
@@ -268,6 +521,9 @@
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
+	if (V_tcp_do_rfc5682)
+		tcp_frto_ack_received(tp, th, type);
+
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd))
 		tp->ccv->flags |= CCF_CWND_LIMITED;
@@ -2601,7 +2857,16 @@
 			EXIT_RECOVERY(tp->t_flags);
 		}
 		tp->snd_una = th->th_ack;
-		if (tp->t_flags & TF_SACK_PERMIT) {
+		/*
+		 * XXXWG: according to RFC5682 SACK-Enhanced F-RTO step1, it
+		 * mentioned that it doesn't process F-RTO algorithm if the
+		 * "RecoveryPoint" is larger than or equal to SND.UNA. However
+		 * the below routine always updates "RecoveryPoint" to be same
+		 * with SND.UNA whenever each ACK is received so it means that
+		 * it never could go in SACK-Enhanced F-RTO algorithm if it's
+		 * enabled.
+		 */
+		if ((tp->t_flags & TF_SACK_PERMIT) && V_tcp_do_rfc5682 == 0) {
 			if (SEQ_GT(tp->snd_una, tp->snd_recover))
 				tp->snd_recover = tp->snd_una;
 		}
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c	(revision 221219)
+++ sys/netinet/tcp_timer.c	(working copy)
@@ -60,6 +60,7 @@
 #endif
 #include
 #include
+#include
 #include
 #include
 #include
@@ -558,8 +559,68 @@
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
+	if (V_tcp_do_rfc5682) {
+		/*
+		 * If the retransmission timer expires again during
+		 * the execution of the F-RTO algorithm, the TCP sender MUST
+		 * re-start the algorithm processing from step 1.
+		 */
+		tp->t_frtoflags = FRTO_INSTEP1;
+
+		/*
+		 * If the sender implements some loss recovery algorithm
+		 * other than Reno or NewReno [FHH04], the F-RTO algorithm
+		 * SHOULD NOT be entered when earlier fast recovery is underway.
+		 */
+		if (CC_ALGO(tp) != &newreno_cc_algo &&
+		    IN_RECOVERY(tp->t_flags))
+			goto cant_rfc5682;
+		/*
+		 * SACK-Enhanced F-RTO
+		 * This algorithm SHOULD NOT be applied if the TCP sender is
+		 * already in loss recovery when a retransmission timeout
+		 * occurs.
+		 */
+		if ((tp->t_flags & TF_SACK_PERMIT) != 0 &&
+		    IN_RECOVERY(tp->t_flags))
+			goto cant_rfc5682;
+		/*
+		 * Basic F-RTO algorithm - step 1: if the TCP sender is already
+		 * in RTO recovery AND "recover" is larger than or equal to
+		 * SND.UNA (the oldest unacknowledged sequence number [Pos81]),
+		 * do not enter step 2 of this algorithm. Instead, store the
+		 * highest sequence number transmitted so far in variable
+		 * "recover".
+		 *
+		 * NOTE(review): this hunk does not show where t_rxtshift is
+		 * incremented; if that happens before this point, the
+		 * "> 0" test below is always true -- confirm.
+		 */
+		if ((tp->t_flags & TF_SACK_PERMIT) == 0 &&
+		    tp->t_rxtshift > 0 &&
+		    SEQ_GEQ(tp->snd_recover, tp->snd_una)) {
+			/*
+			 * continue with slow-start retransmissions following
+			 * the conventional RTO recovery algorithm.
+			 */
+			goto cant_rfc5682;
+
+		/*
+		 * SACK-Enhanced F-RTO - step 1: If "RecoveryPoint" is larger
+		 * than or equal to SND.UNA, do not enter step 2 of this
+		 * algorithm.
+		 */
+		} else if ((tp->t_flags & TF_SACK_PERMIT) != 0 &&
+		    SEQ_GEQ(tp->snd_recover, tp->snd_una))
+			goto cant_rfc5682;
+		else
+			tp->t_frtoflags |= FRTO_CANGOSTEP2;
+	} else {
+cant_rfc5682:
+		tp->snd_recover = tp->snd_max;
+	}
 	tp->snd_nxt = tp->snd_una;
-	tp->snd_recover = tp->snd_max;
+
 	/*
 	 * Force a segment to be sent.
 	 */
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h	(revision 221219)
+++ sys/netinet/tcp_var.h	(working copy)
@@ -202,6 +202,12 @@
 	struct cc_algo	*cc_algo;	/* congestion control algorithm */
 	struct cc_var	*ccv;		/* congestion control specific vars */
 	struct osd	*osd;		/* storage for Khelp module data */
+	u_int	t_frtoflags;		/* F-RTO (RFC 5682) step flags */
+#define	FRTO_INSTEP1	0x01
+#define	FRTO_CANGOSTEP2	0x02
+#define	FRTO_INSTEP2	0x04
+#define	FRTO_CANGOSTEP3	0x08
+	tcp_seq	t_frtofack;		/* snd_fack saved at F-RTO step 2 */
 
 	int	t_ispare;		/* explicit pad for 64bit alignment */
 	void	*t_pspare2[4];		/* 4 TBD */
@@ -493,6 +499,10 @@
 	u_long	tcps_sig_err_sigopt;	/* No signature expected by socket */
 	u_long	tcps_sig_err_nosigopt;	/* No signature provided by segment */
 
+	/* F-RTO related stats */
+	u_long	tcps_frto_spurto;	/* F-RTO spurious timeouts */
+	u_long	tcps_frto_fail;		/* F-RTO failed */
+
 	u_long	_pad[12];		/* 6 UTO, 6 TBD */
 };
 
@@ -526,6 +536,23 @@
 	int		tso;
 	tcp_seq		curack;
 };
+
+/*
+ * helper macros for RFC5682.
+ */
+/* declare the timeout spurious: signal CC_RTO_ERR if TF_PREVVALID is set. */
+#define	TCP_FRTO_SPUR_TO(tp, th) do {				\
+	if ((tp)->t_flags & TF_PREVVALID)			\
+		cc_cong_signal((tp), (th), CC_RTO_ERR);		\
+	TCPSTAT_INC(tcps_frto_spurto);				\
+	(tp)->t_frtoflags = 0;					\
+} while (0)
+/* revert to the conventional RTO recovery. */
+#define	TCP_FRTO_FAIL(tp) do {					\
+	TCPSTAT_INC(tcps_frto_fail);				\
+	(tp)->t_frtoflags = 0;					\
+} while (0)
+
 #endif
 
 /*
@@ -610,6 +637,7 @@
 VNET_DECLARE(int, ss_fltsz_local);
 VNET_DECLARE(int, tcp_do_rfc3465);
 VNET_DECLARE(int, tcp_abc_l_var);
+VNET_DECLARE(int, tcp_do_rfc5682);
 #define	V_tcb			VNET(tcb)
 #define	V_tcbinfo		VNET(tcbinfo)
 #define	V_tcpstat		VNET(tcpstat)
@@ -622,6 +650,7 @@
 #define	V_ss_fltsz_local	VNET(ss_fltsz_local)
 #define	V_tcp_do_rfc3465	VNET(tcp_do_rfc3465)
 #define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
+#define	V_tcp_do_rfc5682	VNET(tcp_do_rfc5682)
 
 VNET_DECLARE(int, tcp_do_sack);			/* SACK enabled/disabled */
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);	/* RST on sock alloc failure */
Index: usr.bin/netstat/inet.c
===================================================================
--- usr.bin/netstat/inet.c	(revision 221219)
+++ usr.bin/netstat/inet.c	(working copy)
@@ -714,6 +714,9 @@
 	p(tcps_ecn_ect1, "\t%lu packet%s with ECN ECT(1) bit set\n");
 	p(tcps_ecn_shs, "\t%lu successful ECN handshake%s\n");
 	p(tcps_ecn_rcwnd, "\t%lu time%s ECN reduced the congestion window\n");
+
+	p(tcps_frto_spurto, "\t%lu time%s F-RTO success\n");
+	p(tcps_frto_fail, "\t%lu time%s F-RTO failure\n");
 #undef p
 #undef p1a
 #undef p2