Property changes on: sys ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/sys:r215166,215377,215391-215393,215395,216101,216103,216105,216107,216749,216760,217748,218167 Index: sys/conf/files =================================================================== --- sys/conf/files (revision 219081) +++ sys/conf/files (working copy) @@ -2560,6 +2560,8 @@ netinet/ip_options.c optional inet netinet/ip_output.c optional inet netinet/raw_ip.c optional inet +netinet/cc/cc.c optional inet +netinet/cc/cc_newreno.c optional inet netinet/sctp_asconf.c optional inet sctp netinet/sctp_auth.c optional inet sctp netinet/sctp_bsd_addr.c optional inet sctp Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c (revision 219081) +++ sys/netinet/tcp_input.c (working copy) @@ -1,7 +1,21 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. + * Copyright (c) 2007-2008,2010 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. * + * Portions of this software were developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart, James Healy and + * David Hayes, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -61,6 +75,7 @@ #define TCPSTATES /* for logging */ +#include #include #include #include @@ -75,7 +90,6 @@ #include #include #include -#include #include #include #include @@ -96,7 +110,7 @@ #include -static const int tcprexmtthresh = 3; +const int tcprexmtthresh = 3; VNET_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, @@ -132,19 +146,16 @@ "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; -#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; -#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; -#define V_tcp_abc_l_var VNET(tcp_abc_l_var) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); @@ -203,8 +214,10 @@ struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); -static void inline - tcp_congestion_exp(struct tcpcb *); +static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, + uint16_t type); +static void inline cc_conn_init(struct tcpcb *tp); +static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); /* * Kernel 
module interface for updating tcpstat. The argument is an index @@ -220,22 +233,192 @@ (*((u_long *)&V_tcpstat + statnum))++; } +/* + * CC wrapper hook functions + */ static void inline -tcp_congestion_exp(struct tcpcb *tp) +cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { - u_int win; - - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; - ENTER_FASTRECOVERY(tp); - tp->snd_recover = tp->snd_max; - if (tp->t_flags & TF_ECN_PERMIT) - tp->t_flags |= TF_ECN_SND_CWR; + INP_WLOCK_ASSERT(tp->t_inpcb); + + tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); + if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd)) + tp->ccv->flags |= CCF_CWND_LIMITED; + else + tp->ccv->flags &= ~CCF_CWND_LIMITED; + + if (type == CC_ACK) { + if (tp->snd_cwnd > tp->snd_ssthresh) { + tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, + V_tcp_abc_l_var * tp->t_maxseg); + if (tp->t_bytes_acked >= tp->snd_cwnd) { + tp->t_bytes_acked -= tp->snd_cwnd; + tp->ccv->flags |= CCF_ABC_SENTAWND; + } + } else { + tp->ccv->flags &= ~CCF_ABC_SENTAWND; + tp->t_bytes_acked = 0; + } + } + + if (CC_ALGO(tp)->ack_received != NULL) { + /* XXXLAS: Find a way to live without this */ + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->ack_received(tp->ccv, type); + } } +static void inline +cc_conn_init(struct tcpcb *tp) +{ + struct hc_metrics_lite metrics; + struct inpcb *inp = tp->t_inpcb; + int rtt; +#ifdef INET6 + int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; +#endif + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tcp_hc_get(&inp->inp_inc, &metrics); + + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + tp->t_srtt = rtt; + tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; + TCPSTAT_INC(tcps_usedrtt); + if (metrics.rmx_rttvar) { + tp->t_rttvar = metrics.rmx_rttvar; + TCPSTAT_INC(tcps_usedrttvar); + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + if (metrics.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); + TCPSTAT_INC(tcps_usedssthresh); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. + * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not make cwnd bigger than remote receive window or our own + * send socket buffer. Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so next connection doesn't + * overloads the path again. + * + * XXXAO: Initializing the CWND from the hostcache is broken + * and in its current form not RFC conformant. It is disabled + * until fixed or removed entirely. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. + * We currently check only in syncache_socket for that. 
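/*
 * Illustrative aside, not part of the patch: the RFC 3465 Appropriate Byte
 * Counting bookkeeping that cc_ack_received() performs above, reduced to a
 * standalone function so the accounting is easy to follow. The struct and
 * names are invented for the example; assumes <sys/types.h> for u_long.
 */
struct abc_state {
	u_long	bytes_acked;		/* mirrors tp->t_bytes_acked */
	int	sentawnd;		/* mirrors the CCF_ABC_SENTAWND flag */
};

static void
abc_account(struct abc_state *st, u_long bytes_this_ack, u_long cwnd,
    u_long ssthresh, u_long maxseg, u_long abc_l_var)
{
	u_long cap = abc_l_var * maxseg;

	if (cwnd > ssthresh) {
		/* Congestion avoidance: tally ACKed bytes, capped per ACK. */
		st->bytes_acked += bytes_this_ack < cap ? bytes_this_ack : cap;
		if (st->bytes_acked >= cwnd) {
			/* A full cwnd worth of data has now been ACKed. */
			st->bytes_acked -= cwnd;
			st->sentawnd = 1;
		}
	} else {
		/* Slow start: restart the per-window byte count. */
		st->sentawnd = 0;
		st->bytes_acked = 0;
	}
}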
+ */ +/* #define TCP_METRICS_CWND */ +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif + if (V_tcp_do_rfc3390) + tp->snd_cwnd = min(4 * tp->t_maxseg, + max(2 * tp->t_maxseg, 4380)); +#ifdef INET6 + else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && in_localaddr(inp->inp_faddr))) +#else + else if (in_localaddr(inp->inp_faddr)) +#endif + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz; + + if (CC_ALGO(tp)->conn_init != NULL) + CC_ALGO(tp)->conn_init(tp->ccv); +} + +void inline +cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + switch(type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(tp->t_flags)) { + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; + } + break; + case CC_ECN: + if (!IN_CONGRECOVERY(tp->t_flags)) { + TCPSTAT_INC(tcps_ecn_rcwnd); + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; + } + break; + case CC_RTO: + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; + EXIT_RECOVERY(tp->t_flags); + tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg) * tp->t_maxseg; + tp->snd_cwnd = tp->t_maxseg; + break; + case CC_RTO_ERR: + TCPSTAT_INC(tcps_sndrexmitbad); + /* RTO was unnecessary, so reset everything. */ + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_recover = tp->snd_recover_prev; + if (tp->t_flags & TF_WASFRECOVERY) + ENTER_FASTRECOVERY(tp->t_flags); + if (tp->t_flags & TF_WASCRECOVERY) + ENTER_CONGRECOVERY(tp->t_flags); + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + break; + } + + if (CC_ALGO(tp)->cong_signal != NULL) { + if (th != NULL) + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->cong_signal(tp->ccv, type); + } +} + +static void inline +cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* XXXLAS: KASSERT that we're in recovery? */ + + if (CC_ALGO(tp)->post_recovery != NULL) { + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->post_recovery(tp->ccv); + } + /* XXXLAS: EXIT_RECOVERY ? */ + tp->t_bytes_acked = 0; +} + /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ @@ -1157,14 +1340,9 @@ TCPSTAT_INC(tcps_ecn_ect1); break; } - /* - * Congestion experienced. - * Ignore if we are already trying to recover. - */ - if ((thflags & TH_ECE) && - SEQ_LEQ(th->th_ack, tp->snd_recover)) { - TCPSTAT_INC(tcps_ecn_rcwnd); - tcp_congestion_exp(tp); + /* Congestion experienced. */ + if (thflags & TH_ECE) { + cc_cong_signal(tp, th, CC_ECN); } } @@ -1259,15 +1437,9 @@ if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && - tp->snd_cwnd >= tp->snd_wnd && - ((!V_tcp_do_newreno && - !(tp->t_flags & TF_SACK_PERMIT) && - tp->t_dupacks < tcprexmtthresh) || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && - (to.to_flags & TOF_SACK) == 0 && - TAILQ_EMPTY(&tp->snd_holes)))) { + !IN_RECOVERY(tp->t_flags) && + (to.to_flags & TOF_SACK) == 0 && + TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. 
*/ @@ -1287,15 +1459,7 @@ */ if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) { - TCPSTAT_INC(tcps_sndrexmitbad); - tp->snd_cwnd = tp->snd_cwnd_prev; - tp->snd_ssthresh = - tp->snd_ssthresh_prev; - tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) - ENTER_FASTRECOVERY(tp); - tp->snd_nxt = tp->snd_max; - tp->t_badrxtwin = 0; + cc_cong_signal(tp, th, CC_RTO_ERR); } /* @@ -1322,13 +1486,22 @@ ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); - acked = th->th_ack - tp->snd_una; + acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; + + /* + * Let the congestion control algorithm update + * congestion control related information. This + * typically means increasing the congestion + * window. + */ + cc_ack_received(tp, th, CC_ACK); + tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative @@ -1588,6 +1761,7 @@ thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } } else { @@ -1991,6 +2165,7 @@ tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } /* @@ -2059,11 +2234,10 @@ th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp))) { + IN_FASTRECOVERY(tp->t_flags)) { + cc_ack_received(tp, th, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && - IN_FASTRECOVERY(tp)) { + IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* @@ -2094,19 +2268,20 @@ * recovery. */ if (tp->t_flags & TF_SACK_PERMIT) { - if (IN_FASTRECOVERY(tp)) { + if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } - } else if (V_tcp_do_newreno || - V_tcp_do_ecn) { + } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } - tcp_congestion_exp(tp); + /* Congestion signal before ack. */ + cc_cong_signal(tp, th, CC_NDUPACK); + cc_ack_received(tp, th, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { @@ -2130,6 +2305,7 @@ tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { + cc_ack_received(tp, th, CC_DUPACK); u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; @@ -2171,37 +2347,14 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { - if (IN_FASTRECOVERY(tp)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { - if (tp->t_flags & TF_SACK_PERMIT) - tcp_sack_partialack(tp, th); - else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. - * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. 
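/*
 * Illustrative aside: the recovery-exit deflation being removed here
 * reappears below as newreno_post_recovery(). As arithmetic, with invented
 * names ("pipe" is the data still outstanding when recovery ends):
 */
static u_long
deflated_cwnd(u_long pipe /* snd_max - th_ack */, u_long ssthresh,
    u_long maxseg)
{
	/*
	 * If less than ssthresh remains in flight, resume from pipe plus
	 * one segment rather than bursting a full ssthresh at line rate.
	 */
	return (pipe < ssthresh ? pipe + maxseg : ssthresh);
}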
- */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - } - } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + if (IN_FASTRECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->t_flags & TF_SACK_PERMIT) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + } else + cc_post_recovery(tp, th); } tp->t_dupacks = 0; /* @@ -2232,7 +2385,7 @@ ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); - acked = th->th_ack - tp->snd_una; + acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); @@ -2243,16 +2396,8 @@ * original cwnd and ssthresh, and proceed to transmit where * we left off. */ - if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) { - TCPSTAT_INC(tcps_sndrexmitbad); - tp->snd_cwnd = tp->snd_cwnd_prev; - tp->snd_ssthresh = tp->snd_ssthresh_prev; - tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) - ENTER_FASTRECOVERY(tp); - tp->snd_nxt = tp->snd_max; - tp->t_badrxtwin = 0; /* XXX probably not required */ - } + if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) + cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed @@ -2300,61 +2445,12 @@ goto step6; /* - * When new data is acked, open the congestion window. - * Method depends on which congestion control state we're - * in (slow start or cong avoid) and if ABC (RFC 3465) is - * enabled. - * - * slow start: cwnd <= ssthresh - * cong avoid: cwnd > ssthresh - * - * slow start and ABC (RFC 3465): - * Grow cwnd exponentially by the amount of data - * ACKed capping the max increment per ACK to - * (abc_l_var * maxseg) bytes. - * - * slow start without ABC (RFC 2581): - * Grow cwnd exponentially by maxseg per ACK. - * - * cong avoid and ABC (RFC 3465): - * Grow cwnd linearly by maxseg per RTT for each - * cwnd worth of ACKed data. - * - * cong avoid without ABC (RFC 2581): - * Grow cwnd linearly by approximately maxseg per RTT using - * maxseg^2 / cwnd per ACK as the increment. - * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to - * avoid capping cwnd. + * Let the congestion control algorithm update congestion + * control related information. This typically means increasing + * the congestion window. */ - if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || - !IN_FASTRECOVERY(tp)) { - u_int cw = tp->snd_cwnd; - u_int incr = tp->t_maxseg; - /* In congestion avoidance? */ - if (cw > tp->snd_ssthresh) { - if (V_tcp_do_rfc3465) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked >= tp->snd_cwnd) - tp->t_bytes_acked -= cw; - else - incr = 0; - } - else - incr = max((incr * incr / cw), 1); - /* - * In slow-start with ABC enabled and no RTO in sight? - * (Must not use abc_l_var > 1 if slow starting after an - * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt == - * snd_max check is sufficient to handle this). - */ - } else if (V_tcp_do_rfc3465 && - tp->snd_nxt == tp->snd_max) - incr = min(acked, - V_tcp_abc_l_var * tp->t_maxseg); - /* ABC is on by default, so (incr == 0) frequently. 
*/ - if (incr > 0) - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); - } + cc_ack_received(tp, th, CC_ACK); + SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; @@ -2368,16 +2464,14 @@ /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. */ - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && + if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp) && + /* XXXLAS: Can this be moved up into cc_post_recovery? */ + if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { - EXIT_FASTRECOVERY(tp); - tp->t_bytes_acked = 0; + EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { @@ -3242,24 +3336,19 @@ void tcp_mss(struct tcpcb *tp, int offer) { - int rtt, mss; + int mss; u_long bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; int mtuflags = 0; -#ifdef INET6 - int isipv6; -#endif + KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); tcp_mss_update(tp, offer, &metrics, &mtuflags); mss = tp->t_maxseg; inp = tp->t_inpcb; -#ifdef INET6 - isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; -#endif /* * If there's a pipesize, change the socket buffer to that size, @@ -3299,78 +3388,7 @@ (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); - /* - * While we're here, check the others too. - */ - if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { - tp->t_srtt = rtt; - tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; - TCPSTAT_INC(tcps_usedrtt); - if (metrics.rmx_rttvar) { - tp->t_rttvar = metrics.rmx_rttvar; - TCPSTAT_INC(tcps_usedrttvar); - } else { - /* default variation is +- 1 rtt */ - tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; - } - TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); - } - if (metrics.rmx_ssthresh) { - /* - * There's some sort of gateway or interface - * buffer limit on the path. Use this to set - * the slow start threshhold, but set the - * threshold to no less than 2*mss. - */ - tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); - TCPSTAT_INC(tcps_usedssthresh); - } - if (metrics.rmx_bandwidth) - tp->snd_bandwidth = metrics.rmx_bandwidth; - /* - * Set the slow-start flight size depending on whether this - * is a local network or not. - * - * Extend this so we cache the cwnd too and retrieve it here. - * Make cwnd even bigger than RFC3390 suggests but only if we - * have previous experience with the remote host. Be careful - * not make cwnd bigger than remote receive window or our own - * send socket buffer. Maybe put some additional upper bound - * on the retrieved cwnd. Should do incremental updates to - * hostcache when cwnd collapses so next connection doesn't - * overloads the path again. - * - * XXXAO: Initializing the CWND from the hostcache is broken - * and in its current form not RFC conformant. It is disabled - * until fixed or removed entirely. - * - * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. - * We currently check only in syncache_socket for that. 
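/*
 * Illustrative aside: the RFC 3390 initial window that cc_conn_init() now
 * computes, and that is being removed from tcp_mss() here, with worked
 * numbers:
 *
 *   mss = 1460: min(5840, max(2920, 4380)) = 4380 bytes (3 segments)
 *   mss =  536: min(2144, max(1072, 4380)) = 2144 bytes (4 segments)
 */
static u_long
rfc3390_initial_cwnd(u_long mss)
{
	u_long hi = 2 * mss > 4380 ? 2 * mss : 4380;

	return (4 * mss < hi ? 4 * mss : hi);
}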
- */ -/* #define TCP_METRICS_CWND */ -#ifdef TCP_METRICS_CWND - if (metrics.rmx_cwnd) - tp->snd_cwnd = max(mss, - min(metrics.rmx_cwnd / 2, - min(tp->snd_wnd, so->so_snd.sb_hiwat))); - else -#endif - if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); -#ifdef INET6 - else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || - (!isipv6 && in_localaddr(inp->inp_faddr))) -#else - else if (in_localaddr(inp->inp_faddr)) -#endif - tp->snd_cwnd = mss * V_ss_fltsz_local; - else - tp->snd_cwnd = mss * V_ss_fltsz; - /* Check the interface for TSO capabilities. */ if (mtuflags & CSUM_TSO) tp->t_flags |= TF_TSO; @@ -3433,7 +3451,7 @@ * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ - tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; @@ -3443,8 +3461,8 @@ * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ - if (tp->snd_cwnd > th->th_ack - tp->snd_una) - tp->snd_cwnd -= th->th_ack - tp->snd_una; + if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) + tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c (revision 219081) +++ sys/netinet/tcp_subr.c (working copy) @@ -62,6 +62,7 @@ #include #include +#include #include #include #include @@ -80,7 +81,6 @@ #include #endif #include -#include #include #include #include @@ -284,6 +284,7 @@ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; + struct cc_var ccv; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); @@ -699,6 +700,26 @@ if (tm == NULL) return (NULL); tp = &tm->tcb; + + /* Initialise cc_var struct for this tcpcb. */ + tp->ccv = &tm->ccv; + tp->ccv->type = IPPROTO_TCP; + tp->ccv->ccvc.tcp = tp; + + /* + * Use the current system default CC algorithm. + */ + CC_LIST_RLOCK(); + KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); + CC_ALGO(tp) = CC_DEFAULT(); + CC_LIST_RUNLOCK(); + + if (CC_ALGO(tp)->cb_init != NULL) + if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { + uma_zfree(V_tcpcb_zone, tm); + return (NULL); + } + #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif @@ -748,6 +769,69 @@ } /* + * Switch the congestion control algorithm back to NewReno for any active + * control blocks using an algorithm which is about to go away. + * This ensures the CC framework can allow the unload to proceed without leaving + * any dangling pointers which would trigger a panic. + * Returning non-zero would inform the CC framework that something went wrong + * and it would be unsafe to allow the unload to proceed. However, there is no + * way for this to occur with this implementation so we always return zero. + */ +int +tcp_ccalgounload(struct cc_algo *unload_algo) +{ + struct cc_algo *tmpalgo; + struct inpcb *inp; + struct tcpcb *tp; + VNET_ITERATOR_DECL(vnet_iter); + + /* + * Check all active control blocks across all network stacks and change + * any that are using "unload_algo" back to NewReno. If "unload_algo" + * requires cleanup code to be run, call it. 
+ */ + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + INP_INFO_RLOCK(&V_tcbinfo); + /* + * New connections already part way through being initialised + * with the CC algo we're removing will not race with this code + * because the INP_INFO_WLOCK is held during initialisation. We + * therefore don't enter the loop below until the connection + * list has stabilised. + */ + LIST_FOREACH(inp, &V_tcb, inp_list) { + INP_WLOCK(inp); + /* Important to skip tcptw structs. */ + if (!(inp->inp_flags & INP_TIMEWAIT) && + (tp = intotcpcb(inp)) != NULL) { + /* + * By holding INP_WLOCK here, we are assured + * that the connection is not currently + * executing inside the CC module's functions + * i.e. it is safe to make the switch back to + * NewReno. + */ + if (CC_ALGO(tp) == unload_algo) { + tmpalgo = CC_ALGO(tp); + /* NewReno does not require any init. */ + CC_ALGO(tp) = &newreno_cc_algo; + if (tmpalgo->cb_destroy != NULL) + tmpalgo->cb_destroy(tp->ccv); + } + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + + return (0); +} + +/* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. @@ -857,6 +941,12 @@ tcp_offload_detach(tp); tcp_free_sackholes(tp); + + /* Allow the CC algorithm to clean up after itself. */ + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(tp->ccv); + + CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); @@ -1621,7 +1711,7 @@ tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) - EXIT_FASTRECOVERY(tp); + EXIT_FASTRECOVERY(tp->t_flags); tcp_output_send(tp); return (inp); } Index: sys/netinet/tcp_timer.c =================================================================== --- sys/netinet/tcp_timer.c (revision 219081) +++ sys/netinet/tcp_timer.c (working copy) @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,6 @@ #include #endif #include -#include #include #include #include @@ -497,10 +497,14 @@ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; - if (IN_FASTRECOVERY(tp)) - tp->t_flags |= TF_WASFRECOVERY; + if (IN_FASTRECOVERY(tp->t_flags)) + tp->t_flags |= TF_WASFRECOVERY; else - tp->t_flags &= ~TF_WASFRECOVERY; + tp->t_flags &= ~TF_WASFRECOVERY; + if (IN_CONGRECOVERY(tp->t_flags)) + tp->t_flags |= TF_WASCRECOVERY; + else + tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); } TCPSTAT_INC(tcps_rexmttimeo); @@ -544,40 +548,9 @@ * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; - /* - * Close the congestion window down to one segment - * (we'll open it by one segment for each ack we get). - * Since we probably have a window's worth of unacked - * data accumulated, this "slow start" keeps us from - * dumping all that data as back-to-back packets (which - * might overwhelm an intermediate gateway). - * - * There are two phases to the opening: Initially we - * open by one mss on each ack. This makes the window - * size increase exponentially with time. If the - * window is larger than the path can handle, this - * exponential growth results in dropped packet(s) - * almost immediately. To get more time between - * drops but still "push" the network to take advantage - * of improving conditions, we switch from exponential - * to linear window opening at some threshhold size. 
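/*
 * Illustrative aside: the cb_destroy calls in tcp_ccalgounload() and
 * tcp_discardcb() above pair with a module's cb_init. A stateful algorithm
 * would implement the pair roughly as below; foo_cc_state and M_FOOCC are
 * invented, and the sketch assumes <sys/param.h>, <sys/malloc.h> and
 * <netinet/cc.h>.
 */
static MALLOC_DEFINE(M_FOOCC, "foocc", "foo CC per-connection state");

struct foo_cc_state {
	u_long	prev_cwnd;	/* whatever the algorithm needs to remember */
};

static int
foo_cb_init(struct cc_var *ccv)
{
	ccv->cc_data = malloc(sizeof(struct foo_cc_state), M_FOOCC,
	    M_NOWAIT | M_ZERO);
	/* A return > 0 makes callers fall back to newreno. */
	return (ccv->cc_data == NULL ? ENOMEM : 0);
}

static void
foo_cb_destroy(struct cc_var *ccv)
{
	if (ccv->cc_data != NULL)
		free(ccv->cc_data, M_FOOCC);
}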
- * For a threshhold, we use half the current window - * size, truncated to a multiple of the mss. - * - * (the minimum cwnd that will give us exponential - * growth is 2 mss. We don't allow the threshhold - * to go below this.) - */ - { - u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_cwnd = tp->t_maxseg; - tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_dupacks = 0; - } - EXIT_FASTRECOVERY(tp); - tp->t_bytes_acked = 0; + + cc_cong_signal(tp, NULL, CC_RTO); + (void) tcp_output(tp); out: Index: sys/netinet/tcp_sack.c =================================================================== --- sys/netinet/tcp_sack.c (revision 219081) +++ sys/netinet/tcp_sack.c (working copy) @@ -576,7 +576,7 @@ tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. */ - if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2) + if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) > 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h (revision 219081) +++ sys/netinet/tcp_var.h (working copy) @@ -197,7 +197,11 @@ int t_bytes_acked; /* # bytes acked during current RTT */ int t_ispare; /* explicit pad for 64bit alignment */ - void *t_pspare2[6]; /* 2 CC / 4 TBD */ + + struct cc_algo *cc_algo; /* congestion control algorithm */ + struct cc_var *ccv; /* congestion control specific vars */ + + void *t_pspare2[4]; /* 4 TBD */ uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) */ }; @@ -234,11 +237,23 @@ #define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */ #define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */ #define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */ +#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ +#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ -#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) -#define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY -#define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY +#define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) +#define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY +#define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY +#define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) +#define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY +#define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY + +#define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) +#define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) +#define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) + +#define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) + /* * Flags for the t_oobflags field. 
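/*
 * Illustrative aside: the reworked recovery macros take the flags word
 * itself rather than the tcpcb, so fast and congestion recovery are
 * independent bits that can also be set and cleared together. A minimal
 * demonstration, assuming the definitions above are in scope:
 */
static void
recovery_macros_demo(void)
{
	int t_flags = 0;

	ENTER_CONGRECOVERY(t_flags);	/* e.g. after a CC_ECN signal */
	/* IN_RECOVERY(t_flags) is now true; IN_FASTRECOVERY(t_flags) is not. */
	ENTER_RECOVERY(t_flags);	/* CC_NDUPACK path: sets both bits */
	EXIT_RECOVERY(t_flags);		/* leaves both forms of recovery */
	/* IN_RECOVERY(t_flags) is false again. */
}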
*/ @@ -557,10 +572,11 @@ VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_rfc3390); -VNET_DECLARE(int, tcp_do_newreno); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, ss_fltsz); VNET_DECLARE(int, ss_fltsz_local); +VNET_DECLARE(int, tcp_do_rfc3465); +VNET_DECLARE(int, tcp_abc_l_var); #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcpstat VNET(tcpstat) @@ -568,10 +584,11 @@ #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) -#define V_tcp_do_newreno VNET(tcp_do_newreno) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_ss_fltsz VNET(ss_fltsz) #define V_ss_fltsz_local VNET(ss_fltsz_local) +#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) +#define V_tcp_abc_l_var VNET(tcp_abc_l_var) VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */ VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */ @@ -584,6 +601,7 @@ #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) int tcp_addoptions(struct tcpopt *, u_char *); +int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); @@ -674,6 +692,8 @@ int tcp_newreno(struct tcpcb *, struct tcphdr *); u_long tcp_seq_subtract(u_long, u_long ); +void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); + #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c (revision 219081) +++ sys/netinet/tcp_output.c (working copy) @@ -53,6 +53,7 @@ #include #include +#include #include #include #include @@ -64,7 +65,6 @@ #include #include #endif -#include #define TCPOUTFLAGS #include #include @@ -102,11 +102,6 @@ CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1, "Slow start flight size for local networks"); -VNET_DEFINE(int, tcp_do_newreno) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - &VNET_NAME(tcp_do_newreno), 0, - "Enable NewReno Algorithms"); - VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, @@ -131,8 +126,21 @@ &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); +static void inline cc_after_idle(struct tcpcb *tp); /* + * CC wrapper hook functions + */ +static void inline +cc_after_idle(struct tcpcb *tp) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->after_idle != NULL) + CC_ALGO(tp)->after_idle(tp->ccv); +} + +/* * Tcp output routine: figure out what should be sent and send it. */ int @@ -140,7 +148,7 @@ { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; - int off, flags, error, rw; + int off, flags, error; struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; @@ -174,37 +182,8 @@ * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); - if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) { - /* - * If we've been idle for more than one retransmit - * timeout the old congestion window is no longer - * current and we have to reduce it to the restart - * window before we can transmit again. - * - * The restart window is the initial window or the last - * CWND, whichever is smaller. - * - * This is done to prevent us from flooding the path with - * a full CWND at wirespeed, overloading router and switch - * buffers along the way. 
- * - * See RFC5681 Section 4.1. "Restarting Idle Connections". - */ - if (V_tcp_do_rfc3390) - rw = min(4 * tp->t_maxseg, - max(2 * tp->t_maxseg, 4380)); -#ifdef INET6 - else if ((isipv6 ? in6_localaddr(&tp->t_inpcb->in6p_faddr) : - in_localaddr(tp->t_inpcb->inp_faddr))) -#else - else if (in_localaddr(tp->t_inpcb->inp_faddr)) -#endif - rw = V_ss_fltsz_local * tp->t_maxseg; - else - rw = V_ss_fltsz * tp->t_maxseg; - - tp->snd_cwnd = min(rw, tp->snd_cwnd); - } + if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) + cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { @@ -242,7 +221,7 @@ sack_bytes_rxmt = 0; len = 0; p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && + if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; @@ -1302,7 +1281,7 @@ * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ - if (sendalot && (!V_tcp_do_newreno || --maxburst)) + if (sendalot && --maxburst) goto again; #endif if (sendalot) Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c (revision 219081) +++ sys/netinet/tcp_usrreq.c (working copy) @@ -62,6 +62,7 @@ #include #include +#include #include #include #ifdef INET6 @@ -77,7 +78,6 @@ #include #include #endif -#include #include #include #include @@ -1253,6 +1253,8 @@ struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; + char buf[TCP_CA_NAME_MAX]; + struct cc_algo *algo; error = 0; inp = sotoinpcb(so); @@ -1363,6 +1365,54 @@ error = EINVAL; break; + case TCP_CONGESTION: + INP_WUNLOCK(inp); + bzero(buf, sizeof(buf)); + error = sooptcopyin(sopt, &buf, sizeof(buf), 1); + if (error) + break; + INP_WLOCK_RECHECK(inp); + /* + * Return EINVAL if we can't find the requested cc algo. + */ + error = EINVAL; + CC_LIST_RLOCK(); + STAILQ_FOREACH(algo, &cc_list, entries) { + if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) + == 0) { + /* We've found the requested algo. */ + error = 0; + /* + * We hold a write lock over the tcb + * so it's safe to do these things + * without ordering concerns. + */ + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(tp->ccv); + CC_ALGO(tp) = algo; + /* + * If something goes pear shaped + * initialising the new algo, + * fall back to newreno (which + * does not require initialisation). + */ + if (algo->cb_init != NULL) + if (algo->cb_init(tp->ccv) > 0) { + CC_ALGO(tp) = &newreno_cc_algo; + /* + * The only reason init + * should fail is + * because of malloc. + */ + error = ENOMEM; + } + break; /* Break the STAILQ_FOREACH. */ + } + } + CC_LIST_RUNLOCK(); + INP_WUNLOCK(inp); + break; + default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -1406,6 +1456,12 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; + case TCP_CONGESTION: + bzero(buf, sizeof(buf)); + strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -1715,6 +1771,10 @@ db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_CONGRECOVERY) { + db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); + comma = 1; + } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? 
", " : ""); comma = 1; Property changes on: sys/contrib/pf ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/sys/contrib/pf:r215166,215377,215391-215393,215395,216101,216103,216105,216107,216749,216760,217748,218167 Property changes on: sys/contrib/dev/acpica ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/sys/contrib/dev/acpica:r215166,215377,215391-215393,215395,216101,216103,216105,216107,216749,216760,217748,218167 Property changes on: sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/sys/cddl/contrib/opensolaris:r215166,215377,215391-215393,215395,216101,216103,216105,216107,216749,216760,217748,218167 Property changes on: sys/amd64/include/xen ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/sys/amd64/include/xen:r215166,215377,215391-215393,215395,216101,216103,216105,216107,216749,216760,217748,218167 Index: sys/sys/param.h =================================================================== --- sys/sys/param.h (revision 219081) +++ sys/sys/param.h (working copy) @@ -58,7 +58,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 802500 /* Master, propagated to newvers */ +#define __FreeBSD_version 802501 /* Master, propagated to newvers */ #ifdef _KERNEL #define P_OSREL_SIGSEGV 700004 Index: sys/netinet/cc/cc_module.h =================================================================== --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ sys/netinet/cc/cc_module.h 2011-02-27 23:08:18.875006897 +1100 @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2009-2010 Lawrence Stewart + * All rights reserved. + * + * This software was developed by Lawrence Stewart while studying at the Centre + * for Advanced Internet Architectures, Swinburne University, made possible in + * part by a grant from the Cisco University Research Program Fund at Community + * Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: stable/8/sys/netinet/cc/cc_module.h 215166 2010-11-12 06:41:55Z lstewart $ + */ + +/* + * This software was first released in 2009 by Lawrence Stewart as part of the + * NewTCP research project at Swinburne University's Centre for Advanced + * Internet Architectures, Melbourne, Australia, which was made possible in part + * by a grant from the Cisco University Research Program Fund at Community + * Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#ifndef _NETINET_CC_MODULE_H_ +#define _NETINET_CC_MODULE_H_ + +/* + * Allows a CC algorithm to manipulate a commonly named CC variable regardless + * of the transport protocol and associated C struct. + * XXXLAS: Out of action until the work to support SCTP is done. + * +#define CCV(ccv, what) \ +(*( \ + (ccv)->type == IPPROTO_TCP ? &(ccv)->ccvc.tcp->what : \ + &(ccv)->ccvc.sctp->what \ +)) + */ +#define CCV(ccv, what) (ccv)->ccvc.tcp->what + +#define DECLARE_CC_MODULE(ccname, ccalgo) \ + static moduledata_t cc_##ccname = { \ + .name = #ccname, \ + .evhand = cc_modevent, \ + .priv = ccalgo \ + }; \ + DECLARE_MODULE(ccname, cc_##ccname, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY) + +int cc_modevent(module_t mod, int type, void *data); + +#endif /* _NETINET_CC_MODULE_H_ */ Index: sys/netinet/cc/cc_newreno.c =================================================================== --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ sys/netinet/cc/cc_newreno.c 2011-02-27 23:31:00.832577970 +1100 @@ -0,0 +1,236 @@ +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. + * Copyright (c) 2007-2008,2010 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart, James Healy and + * David Hayes, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University's + * Centre for Advanced Internet Architectures, Melbourne, Australia, which was + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#include +__FBSDID("$FreeBSD: stable/8/sys/netinet/cc/cc_newreno.c 215166 2010-11-12 06:41:55Z lstewart $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +static void newreno_ack_received(struct cc_var *ccv, uint16_t type); +static void newreno_after_idle(struct cc_var *ccv); +static void newreno_cong_signal(struct cc_var *ccv, uint32_t type); +static void newreno_post_recovery(struct cc_var *ccv); + +struct cc_algo newreno_cc_algo = { + .name = "newreno", + .ack_received = newreno_ack_received, + .after_idle = newreno_after_idle, + .cong_signal = newreno_cong_signal, + .post_recovery = newreno_post_recovery, +}; + +static void +newreno_ack_received(struct cc_var *ccv, uint16_t type) +{ + if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && + (ccv->flags & CCF_CWND_LIMITED)) { + u_int cw = CCV(ccv, snd_cwnd); + u_int incr = CCV(ccv, t_maxseg); + + /* + * Regular in-order ACK, open the congestion window. + * Method depends on which congestion control state we're + * in (slow start or cong avoid) and if ABC (RFC 3465) is + * enabled. + * + * slow start: cwnd <= ssthresh + * cong avoid: cwnd > ssthresh + * + * slow start and ABC (RFC 3465): + * Grow cwnd exponentially by the amount of data + * ACKed capping the max increment per ACK to + * (abc_l_var * maxseg) bytes. + * + * slow start without ABC (RFC 5681): + * Grow cwnd exponentially by maxseg per ACK. + * + * cong avoid and ABC (RFC 3465): + * Grow cwnd linearly by maxseg per RTT for each + * cwnd worth of ACKed data. + * + * cong avoid without ABC (RFC 5681): + * Grow cwnd linearly by approximately maxseg per RTT using + * maxseg^2 / cwnd per ACK as the increment. + * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to + * avoid capping cwnd. + */ + if (cw > CCV(ccv, snd_ssthresh)) { + if (V_tcp_do_rfc3465) { + if (ccv->flags & CCF_ABC_SENTAWND) + ccv->flags &= ~CCF_ABC_SENTAWND; + else + incr = 0; + } else + incr = max((incr * incr / cw), 1); + } else if (V_tcp_do_rfc3465) { + /* + * In slow-start with ABC enabled and no RTO in sight? + * (Must not use abc_l_var > 1 if slow starting after + * an RTO. On RTO, snd_nxt = snd_una, so the + * snd_nxt == snd_max check is sufficient to + * handle this). + * + * XXXLAS: Find a way to signal SS after RTO that + * doesn't rely on tcpcb vars. 
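/*
 * Illustrative aside, with worked numbers for the growth rules above
 * (maxseg = 1460, abc_l_var = 2, ABC enabled, one ACK per segment):
 *
 *   slow start, no RTO pending: an ACK covering 2920 bytes grows cwnd by
 *     min(2920, 2 * 1460) = 2920 bytes;
 *   slow start after an RTO (snd_nxt != snd_max): growth is capped at one
 *     maxseg per ACK;
 *   congestion avoidance with ABC: incr stays 0 until a full cwnd of data
 *     has been ACKed (CCF_ABC_SENTAWND), then one maxseg is added, i.e.
 *     roughly one segment per RTT;
 *   congestion avoidance without ABC, cwnd = 14600: each ACK adds
 *     max(1460 * 1460 / 14600, 1) = 146 bytes, so ten ACKs per RTT add
 *     about one segment per RTT.
 */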
+ */ + if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) + incr = min(ccv->bytes_this_ack, + V_tcp_abc_l_var * CCV(ccv, t_maxseg)); + else + incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + } + /* ABC is on by default, so incr equals 0 frequently. */ + if (incr > 0) + CCV(ccv, snd_cwnd) = min(cw + incr, + TCP_MAXWIN << CCV(ccv, snd_scale)); + } +} + +static void +newreno_after_idle(struct cc_var *ccv) +{ + int rw; + + /* + * If we've been idle for more than one retransmit timeout the old + * congestion window is no longer current and we have to reduce it to + * the restart window before we can transmit again. + * + * The restart window is the initial window or the last CWND, whichever + * is smaller. + * + * This is done to prevent us from flooding the path with a full CWND at + * wirespeed, overloading router and switch buffers along the way. + * + * See RFC5681 Section 4.1. "Restarting Idle Connections". + */ + if (V_tcp_do_rfc3390) + rw = min(4 * CCV(ccv, t_maxseg), + max(2 * CCV(ccv, t_maxseg), 4380)); + else + rw = CCV(ccv, t_maxseg) * 2; + + CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); +} + +/* + * Perform any necessary tasks before we enter congestion recovery. + */ +static void +newreno_cong_signal(struct cc_var *ccv, uint32_t type) +{ + u_int win; + + /* Catch algos which mistakenly leak private signal types. */ + KASSERT((type & CC_SIGPRIVMASK) == 0, + ("%s: congestion signal type 0x%08x is private\n", __func__, type)); + + win = max(CCV(ccv, snd_cwnd) / 2 / CCV(ccv, t_maxseg), 2) * + CCV(ccv, t_maxseg); + + switch (type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { + if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) + CCV(ccv, snd_ssthresh) = win; + ENTER_RECOVERY(CCV(ccv, t_flags)); + } + break; + case CC_ECN: + if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { + CCV(ccv, snd_ssthresh) = win; + CCV(ccv, snd_cwnd) = win; + ENTER_CONGRECOVERY(CCV(ccv, t_flags)); + } + break; + } +} + +/* + * Perform any necessary tasks before we exit congestion recovery. + */ +static void +newreno_post_recovery(struct cc_var *ccv) +{ + if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { + /* + * Fast recovery will conclude after returning from this + * function. Window inflation should have left us with + * approximately snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do it via the + * slow start mechanism. + * + * XXXLAS: Find a way to do this without needing curack + */ + if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh), + CCV(ccv, snd_max))) + CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) - + ccv->curack + CCV(ccv, t_maxseg); + else + CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); + } +} + + +DECLARE_CC_MODULE(newreno, &newreno_cc_algo); Index: sys/netinet/cc/cc.c =================================================================== --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ sys/netinet/cc/cc.c 2011-02-27 23:28:42.166216918 +1100 @@ -0,0 +1,325 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. 
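/*
 * Illustrative aside: the "win" computation in newreno_cong_signal() above
 * rounds the halved window down to a whole number of segments, floored at
 * two, matching the 2 * mss lower bound used elsewhere in the stack:
 *
 *   cwnd = 14600, maxseg = 1460: max(14600 / 2 / 1460, 2) * 1460 = 7300
 *   cwnd =  1460, maxseg = 1460: max(1460 / 2 / 1460, 2) * 1460 = 2920
 *
 * On CC_NDUPACK only ssthresh is set (the dupack logic in tcp_input.c
 * manages cwnd); on CC_ECN both ssthresh and cwnd are set to win.
 */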
+ * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University's + * Centre for Advanced Internet Architectures, Melbourne, Australia, which was + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#include +__FBSDID("$FreeBSD: stable/8/sys/netinet/cc/cc.c 215166 2010-11-12 06:41:55Z lstewart $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +/* + * List of available cc algorithms on the current system. First element + * is used as the system default CC algorithm. + */ +struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); + +/* Protects the cc_list TAILQ. */ +struct rwlock cc_list_lock; + +VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo; + +/* + * Sysctl handler to show and change the default CC algorithm. + */ +static int +cc_default_algo(SYSCTL_HANDLER_ARGS) +{ + char default_cc[TCP_CA_NAME_MAX]; + struct cc_algo *funcs; + int err, found; + + err = found = 0; + + if (req->newptr == NULL) { + /* Just print the current default. */ + CC_LIST_RLOCK(); + strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX); + CC_LIST_RUNLOCK(); + err = sysctl_handle_string(oidp, default_cc, 1, req); + } else { + /* Find algo with specified name and set it to default. */ + CC_LIST_RLOCK(); + STAILQ_FOREACH(funcs, &cc_list, entries) { + if (strncmp((char *)req->newptr, funcs->name, + TCP_CA_NAME_MAX) == 0) { + found = 1; + V_default_cc_ptr = funcs; + } + } + CC_LIST_RUNLOCK(); + + if (!found) + err = ESRCH; + } + + return (err); +} + +/* + * Sysctl handler to display the list of available CC algorithms. 
+ */ +static int +cc_list_available(SYSCTL_HANDLER_ARGS) +{ + struct cc_algo *algo; + struct sbuf *s; + int err, first, nalgos; + + err = nalgos = 0; + first = 1; + + CC_LIST_RLOCK(); + STAILQ_FOREACH(algo, &cc_list, entries) { + nalgos++; + } + CC_LIST_RUNLOCK(); + + s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); + + if (s == NULL) + return (ENOMEM); + + /* + * It is theoretically possible for the CC list to have grown in size + * since the call to sbuf_new() and therefore for the sbuf to be too + * small. If this were to happen (incredibly unlikely), the sbuf will + * reach an overflow condition, sbuf_printf() will return an error and + * the sysctl will fail gracefully. + */ + CC_LIST_RLOCK(); + STAILQ_FOREACH(algo, &cc_list, entries) { + err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); + if (err) { + /* Sbuf overflow condition. */ + err = EOVERFLOW; + break; + } + first = 0; + } + CC_LIST_RUNLOCK(); + + if (!err) { + sbuf_finish(s); + err = sysctl_handle_string(oidp, sbuf_data(s), 1, req); + } + + sbuf_delete(s); + return (err); +} + +/* + * Reset the default CC algo to NewReno for any netstack which is using the algo + * that is about to go away as its default. + */ +static void +cc_checkreset_default(struct cc_algo *remove_cc) +{ + VNET_ITERATOR_DECL(vnet_iter); + + CC_LIST_LOCK_ASSERT(); + + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + if (strncmp(CC_DEFAULT()->name, remove_cc->name, + TCP_CA_NAME_MAX) == 0) + V_default_cc_ptr = &newreno_cc_algo; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * Initialise CC subsystem on system boot. + */ +static void +cc_init(void) +{ + CC_LIST_LOCK_INIT(); + STAILQ_INIT(&cc_list); +} + +/* + * Returns non-zero on success, 0 on failure. + */ +int +cc_deregister_algo(struct cc_algo *remove_cc) +{ + struct cc_algo *funcs, *tmpfuncs; + int err; + + err = ENOENT; + + /* Never allow newreno to be deregistered. */ + if (&newreno_cc_algo == remove_cc) + return (EPERM); + + /* Remove algo from cc_list so that new connections can't use it. */ + CC_LIST_WLOCK(); + STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { + if (funcs == remove_cc) { + cc_checkreset_default(remove_cc); + STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); + err = 0; + break; + } + } + CC_LIST_WUNLOCK(); + + if (!err) + /* + * XXXLAS: + * - We may need to handle non-zero return values in future. + * - If we add CC framework support for protocols other than + * TCP, we may want a more generic way to handle this step. + */ + tcp_ccalgounload(remove_cc); + + return (err); +} + +/* + * Returns 0 on success, non-zero on failure. + */ +int +cc_register_algo(struct cc_algo *add_cc) +{ + struct cc_algo *funcs; + int err; + + err = 0; + + /* + * Iterate over list of registered CC algorithms and make sure + * we're not trying to add a duplicate. + */ + CC_LIST_WLOCK(); + STAILQ_FOREACH(funcs, &cc_list, entries) { + if (funcs == add_cc || strncmp(funcs->name, add_cc->name, + TCP_CA_NAME_MAX) == 0) + err = EEXIST; + } + + if (!err) + STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); + + CC_LIST_WUNLOCK(); + + return (err); +} + +/* + * Handles kld related events. Returns 0 on success, non-zero on failure. 
+ */ +int +cc_modevent(module_t mod, int event_type, void *data) +{ + struct cc_algo *algo; + int err; + + err = 0; + algo = (struct cc_algo *)data; + + switch(event_type) { + case MOD_LOAD: + if (algo->mod_init != NULL) + err = algo->mod_init(); + if (!err) + err = cc_register_algo(algo); + break; + + case MOD_QUIESCE: + case MOD_SHUTDOWN: + case MOD_UNLOAD: + err = cc_deregister_algo(algo); + if (!err && algo->mod_destroy != NULL) + algo->mod_destroy(); + if (err == ENOENT) + err = 0; + break; + + default: + err = EINVAL; + break; + } + + return (err); +} + +SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); + +/* Declare sysctl tree and populate it. */ +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, + "congestion control related settings"); + +SYSCTL_VNET_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, + NULL, 0, cc_default_algo, "A", "default congestion control algorithm"); + +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, cc_list_available, "A", + "list available congestion control algorithms"); Index: sys/netinet/cc.h =================================================================== --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ sys/netinet/cc.h 2011-02-27 23:31:01.008506663 +1100 @@ -0,0 +1,166 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
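/*
 * Illustrative aside: exercising the two sysctls declared above from
 * userland. A minimal sketch; error handling trimmed.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[1024];
	size_t len = sizeof(buf);

	if (sysctlbyname("net.inet.tcp.cc.available", buf, &len, NULL, 0) == 0)
		printf("available: %s\n", buf);
	/* Writing a registered name switches the per-vnet default. */
	if (sysctlbyname("net.inet.tcp.cc.algorithm", NULL, NULL, "newreno",
	    strlen("newreno") + 1) != 0)
		perror("algorithm");	/* ESRCH if the algo isn't registered */
	return (0);
}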
Index: sys/netinet/cc.h
===================================================================
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ sys/netinet/cc.h	2011-02-27 23:31:01.008506663 +1100
@@ -0,0 +1,166 @@
+/*-
+ * Copyright (c) 2007-2008
+ *	Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/8/sys/netinet/cc.h 215166 2010-11-12 06:41:55Z lstewart $
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University's
+ * Centre for Advanced Internet Architectures, Melbourne, Australia, which was
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley. More details are available at:
+ *   http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+/* XXX: TCP_CA_NAME_MAX define lives in tcp.h for compat reasons. */
+#include <netinet/tcp.h>
+
+/* Global CC vars. */
+extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
+extern const int tcprexmtthresh;
+extern struct cc_algo newreno_cc_algo;
+
+/* Per-netstack bits. */
+VNET_DECLARE(struct cc_algo *, default_cc_ptr);
+#define V_default_cc_ptr VNET(default_cc_ptr)
+
+/* Define the new net.inet.tcp.cc sysctl tree. */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/* CC housekeeping functions. */
+int	cc_register_algo(struct cc_algo *add_cc);
+int	cc_deregister_algo(struct cc_algo *remove_cc);
+
+/*
+ * Wrapper around transport structs that contain same-named congestion
+ * control variables. Allows algos to be shared amongst multiple CC-aware
+ * transports.
+ */
+struct cc_var {
+	void		*cc_data;	/* Per-connection private CC algorithm data. */
+	int		bytes_this_ack;	/* # bytes acked by the current ACK. */
+	tcp_seq		curack;		/* Most recent ACK. */
+	uint32_t	flags;		/* Flags for cc_var (see below). */
+	int		type;		/* Indicates which ptr is valid in ccvc. */
+	union ccv_container {
+		struct tcpcb		*tcp;
+		struct sctp_nets	*sctp;
+	} ccvc;
+};
+
+/* cc_var flags. */
+#define CCF_ABC_SENTAWND	0x0001	/* ABC counted cwnd worth of bytes? */
+#define CCF_CWND_LIMITED	0x0002	/* Are we currently cwnd limited? */
+
+/* ACK types passed to the ack_received() hook. */
+#define CC_ACK		0x0001	/* Regular in sequence ACK. */
+#define CC_DUPACK	0x0002	/* Duplicate ACK. */
+#define CC_PARTIALACK	0x0004	/* Not yet. */
+#define CC_SACK		0x0008	/* Not yet. */
+
+/*
+ * Congestion signal types passed to the cong_signal() hook. The highest order
+ * 8 bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their
+ * own congestion signal types.
+ */
+#define CC_ECN		0x00000001	/* ECN marked packet received. */
+#define CC_RTO		0x00000002	/* RTO fired. */
+#define CC_RTO_ERR	0x00000004	/* RTO fired in error. */
+#define CC_NDUPACK	0x00000008	/* Threshold of dupacks reached. */
+
+#define CC_SIGPRIVMASK	0xFF000000	/* Mask to check if sig is private. */
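
[Editor's illustration, not part of this patch.] A hypothetical algo could claim a signal from the reserved high-order byte and filter it in its cong_signal() hook (declared in struct cc_algo below); CC_FOO_DELAY and foo_cong_signal() are invented names:

	/* Hypothetical algo-private congestion signal. */
	#define CC_FOO_DELAY	0x01000000	/* Delay threshold crossed. */

	static void
	foo_cong_signal(struct cc_var *ccv, uint32_t type)
	{
		if (type & CC_SIGPRIVMASK) {
			/* One of our own signals, e.g. CC_FOO_DELAY. */
			return;
		}
		switch (type) {
		case CC_NDUPACK:
		case CC_ECN:
			/* Shared loss/ECN response would go here. */
			break;
		}
	}
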
+
+/*
+ * Structure to hold data and function pointers that together represent a
+ * congestion control algorithm.
+ */
+struct cc_algo {
+	char	name[TCP_CA_NAME_MAX];
+
+	/* Init global module state on kldload. */
+	int	(*mod_init)(void);
+
+	/* Cleanup global module state on kldunload. */
+	int	(*mod_destroy)(void);
+
+	/* Init CC state for a new control block. */
+	int	(*cb_init)(struct cc_var *ccv);
+
+	/* Cleanup CC state for a terminating control block. */
+	void	(*cb_destroy)(struct cc_var *ccv);
+
+	/* Init variables for a newly established connection. */
+	void	(*conn_init)(struct cc_var *ccv);
+
+	/* Called on receipt of an ack. */
+	void	(*ack_received)(struct cc_var *ccv, uint16_t type);
+
+	/* Called on detection of a congestion signal. */
+	void	(*cong_signal)(struct cc_var *ccv, uint32_t type);
+
+	/* Called after exiting congestion recovery. */
+	void	(*post_recovery)(struct cc_var *ccv);
+
+	/* Called when data transfer resumes after an idle period. */
+	void	(*after_idle)(struct cc_var *ccv);
+
+	STAILQ_ENTRY (cc_algo) entries;
+};
+
+/* Macro to obtain the CC algo's struct ptr. */
+#define CC_ALGO(tp)	((tp)->cc_algo)
+
+/* Macro to obtain the CC algo's data ptr. */
+#define CC_DATA(tp)	((tp)->ccv->cc_data)
+
+/* Macro to obtain the system default CC algo's struct ptr. */
+#define CC_DEFAULT()	V_default_cc_ptr
+
+extern struct rwlock cc_list_lock;
+#define CC_LIST_LOCK_INIT()	rw_init(&cc_list_lock, "cc_list")
+#define CC_LIST_LOCK_DESTROY()	rw_destroy(&cc_list_lock)
+#define CC_LIST_RLOCK()		rw_rlock(&cc_list_lock)
+#define CC_LIST_RUNLOCK()	rw_runlock(&cc_list_lock)
+#define CC_LIST_WLOCK()		rw_wlock(&cc_list_lock)
+#define CC_LIST_WUNLOCK()	rw_wunlock(&cc_list_lock)
+#define CC_LIST_LOCK_ASSERT()	rw_assert(&cc_list_lock, RA_LOCKED)
+
+#endif /* _NETINET_CC_H_ */
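
[Editor's illustration, not part of this patch.] Putting the pieces together, a loadable algo module only needs to fill in struct cc_algo and hand cc_modevent to the module system. The sketch below is hypothetical: the "example" algo is invented, unused hooks are left NULL (the wrapper functions in tcp_input.c check for NULL before calling), and it assumes cc_modevent and the tcpcb fields are visible via the usual headers:

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/module.h>
	#include <sys/socket.h>

	#include <net/vnet.h>

	#include <netinet/in.h>
	#include <netinet/in_pcb.h>
	#include <netinet/tcp_var.h>
	#include <netinet/cc.h>

	static void
	example_ack_received(struct cc_var *ccv, uint16_t type)
	{
		struct tcpcb *tp = ccv->ccvc.tcp;

		/* Naive policy: one MSS of cwnd growth per in-sequence ACK. */
		if (type == CC_ACK)
			tp->snd_cwnd += tp->t_maxseg;
	}

	static struct cc_algo example_cc_algo = {
		.name = "example",
		.ack_received = example_ack_received,
	};

	static moduledata_t example_cc_mod = {
		.name = "example",
		.evhand = cc_modevent,	/* Registers/deregisters the algo. */
		.priv = &example_cc_algo
	};

	DECLARE_MODULE(example, example_cc_mod, SI_SUB_PROTO_IFATTACHDOMAIN,
	    SI_ORDER_ANY);

Once such a module is loaded, the algo would appear in net.inet.tcp.cc.available and could be made the per-vnet default through net.inet.tcp.cc.algorithm.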