Index: tcp_input.c =================================================================== --- tcp_input.c (revision 197703) +++ tcp_input.c (working copy) @@ -1196,6 +1196,19 @@ (thflags & TH_SYN) ? TO_SYN : 0); /* + * If the TCP user timeout option is present, record it but + * do nothing because it's an optional option. + * We will process on the following cases: + * 1. We need do retransmission. + * 2. Users request a UTO value. + */ + if (to.to_flags & TOF_UTO) { + tp->uto_flags |= TCPUTO_RCVD; + tp->rcv_uto = to.to_uto; + TCPSTAT_INC(tcps_rcvuto); + } + + /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection @@ -2247,6 +2260,14 @@ } process_ACK: + /* + * If received an ACK for a previously sent TCP UTO option, + * stop including the TCP UTO option on output packets. + */ + if (tp->uto_flags & TCPUTO_SENDING) + if (SEQ_GEQ(th->th_ack, tp->uto_carrier)) + tp->uto_flags &= ~TCPUTO_SENDING; + INP_INFO_LOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, ("tcp_input: process_ACK ti_locked %d", ti_locked)); @@ -2953,6 +2974,14 @@ to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; + case TCPOPT_UTO: + if (optlen != TCPOLEN_UTO) + continue; + to->to_flags |= TOF_UTO; + bcopy((char *)cp + 2, + (char *)&to->to_uto, sizeof(to->to_uto)); + to->to_uto = ntohs(to->to_uto); + break; default: continue; } Index: tcp_subr.c =================================================================== --- tcp_subr.c (revision 197703) +++ tcp_subr.c (working copy) @@ -342,6 +342,7 @@ V_tcp_autorcvbuf_max = 256*1024; V_tcp_do_rfc3465 = 1; V_tcp_abc_l_var = 2; + V_tcp_uto_enable = 0; V_tcp_mssdflt = TCP_MSS; #ifdef INET6 @@ -425,6 +426,8 @@ tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; + V_tcp_uto_min = TCPTV_UTO_MIN; + V_tcp_uto_max = TCPTV_UTO_MAX; #ifdef 
TCP_SORECEIVE_STREAM TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream); @@ -787,7 +790,18 @@ tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; + + tp->uto_flags = TCPUTO_CHANGEABLE; + if (V_tcp_uto_enable) + tp->uto_flags |= TCPUTO_ENABLE; /* + * According to RFC 5482, t_uto_adv is UTO option advertised to the + * remote TCP peer. It defaults to the default system-wide USER + * TIMEOUT. + */ + tp->t_uto_adv = TCPTV_UTO_DEFAULT; + + /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. Index: tcp_timer.c =================================================================== --- tcp_timer.c (revision 197703) +++ tcp_timer.c (working copy) @@ -110,7 +110,21 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); +VNET_DEFINE(int, tcp_uto_min); +SYSCTL_VNET_PROC(_net_inet_tcp, OID_AUTO, uto_min, CTLTYPE_INT|CTLFLAG_RW, + &VNET_NAME(tcp_uto_min), 0, sysctl_msec_to_ticks, "I", + "Minimum User Timeout"); +VNET_DEFINE(int, tcp_uto_max); +SYSCTL_VNET_PROC(_net_inet_tcp, OID_AUTO, uto_max, CTLTYPE_INT|CTLFLAG_RW, + &VNET_NAME(tcp_uto_max), 0, sysctl_msec_to_ticks, "I", + "Maximum User Timeout"); + +VNET_DEFINE(int, tcp_uto_enable); +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, uto_always, CTLTYPE_INT|CTLFLAG_RW, + &VNET_NAME(tcp_uto_enable), 0, + "Enable TCP UTO (RFC5482) on every socket"); + static int tcp_keepcnt = TCPTV_KEEPCNT; /* max idle probes */ int tcp_maxpersistidle; @@ -473,12 +487,20 @@ } callout_deactivate(&tp->t_timers->tt_rexmt); tcp_free_sackholes(tp); + + if (tp->t_rxtshift == 0) { + TCPT_RESOLVE_UTO(tp); + tp->t_uto_left = tp->t_uto_impl / hz; + tp->t_uto_left -= tcp_backoff[0]; + } /* * Retransmission timer went off.
Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + if ((++tp->t_rxtshift > TCP_MAXRXTSHIFT && + (tp->uto_flags & TCPUTO_IMPL) == 0) || + (tp->t_uto_left == 0 && tp->uto_flags & TCPUTO_IMPL)) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); tp = tcp_drop(tp, tp->t_softerror ? @@ -510,9 +532,27 @@ if (tp->t_state == TCPS_SYN_SENT) rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; else - rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; - TCPT_RANGESET(tp->t_rxtcur, rexmt, - tp->t_rttmin, TCPTV_REXMTMAX); + if ((tp->uto_flags & TCPUTO_IMPL) == 0) { + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, + TCPTV_REXMTMAX); + } else { + int rxtshift, interval; + rxtshift = min(TCP_MAXRXTSHIFT, tp->t_rxtshift); + interval = min(TCP_REXMTMAX, tcp_backoff[rxtshift]); + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + if (tp->t_uto_left < interval) { + tp->t_rxtcur = (tp->t_rxtcur * tp->t_uto_left) + / interval; + /* Prevent t_rxtcur from reaching zero */ + TCPT_RANGESET(tp->t_rxtcur, tp->t_rxtcur, + tp->t_rttmin, TCPTV_REXMTMAX); + } + tp->t_uto_left -= min(tp->t_uto_left, interval); + } + /* * Disable rfc1323 if we havn't got any response to * our third SYN to work-around some broken terminal servers Index: tcp_timer.h =================================================================== --- tcp_timer.h (revision 197703) +++ tcp_timer.h (working copy) @@ -91,6 +91,10 @@ #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ +#define TCPTV_UTO_MIN ( 120*hz) /* min user timeout */ +#define TCPTV_UTO_MAX (1020*hz) /* max user timeout */ +#define TCPTV_UTO_DEFAULT ( 511*hz) /* default user timeout */ + /* * Minimum retransmit timer is 3 ticks, for algorithmic stability. 
* TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with @@ -113,8 +117,9 @@ */ #define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ #define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ -#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ - +#define TCP_REXMTMAX 64 /* max allowable REXMT value + in seconds */ +#define TCPTV_REXMTMAX ( TCP_REXMTMAX*hz ) /* max allowable REXMT value */ #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ #define TCP_LINGERTIME 120 /* linger at most 2 minutes */ @@ -170,7 +175,6 @@ extern int tcp_finwait2_timeout; extern int tcp_fast_finwait2_recycle; - void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); struct tcptw * Index: tcp_var.h =================================================================== --- tcp_var.h (revision 197703) +++ tcp_var.h (working copy) @@ -200,9 +200,23 @@ void *t_toe; /* TOE pcb pointer */ int t_bytes_acked; /* # bytes acked during current RTT */ - int t_ispare; /* explicit pad for 64bit alignment */ + /* TCP User Timeout variables (RFC 5482) */ + uint16_t rcv_uto; /* received user timeout */ + uint16_t snd_uto; /* send user timeout */ + uint8_t uto_flags; + uint8_t _utopad; + uint16_t _utopad2; + u_int t_uto_adv; /* user timeout sent to remote peer + (ticks) */ + u_int t_uto_impl; /* implemented user timeout (ticks) */ + u_int t_uto_left; /* remained user timeout value + (seconds) */ + tcp_seq uto_carrier; /* max sequence number that carry user + timeout */ + void *t_pspare2[6]; /* 2 CC / 4 TBD */ - uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) */ + uint32_t _pad2; + uint64_t _pad[10]; /* 7 TBD (1-2 CC/RTT?) */ }; /* @@ -245,6 +259,35 @@ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 +/* + * Flags for the uto_flags field. 
+ */ +#define TCPUTO_ENABLE 0x01 /* enable tcp user timeout */ +#define TCPUTO_CHANGEABLE 0x02 /* user timeout can be changed by other side */ +#define TCPUTO_IMPL 0x04 /* implement user timeout */ +#define TCPUTO_RCVD 0x08 /* other side has requested user timeout */ +#define TCPUTO_NEED 0x10 /* user timeout needs to be sent */ +#define TCPUTO_SENDING 0x20 /* user timeout is in the process of sending */ + +/* + * Resolve the implemented user timeout (ticks): if UTO is enabled, a peer value was received and the value is still changeable, decode rcv_uto (low bit set = minutes, otherwise seconds), never go below the advertised value or the per-vnet minimum, cap at the per-vnet maximum, then mark the value implemented. NOTE(review): RFC 5482 carries the granularity bit in the high-order bit; confirm the encoding. +*/ +#define TCPT_RESOLVE_UTO(tp) do { \ + if ((tp)->uto_flags & TCPUTO_ENABLE && \ + (tp)->uto_flags & TCPUTO_RCVD && \ + (tp)->uto_flags & TCPUTO_CHANGEABLE) { \ + (tp)->t_uto_impl = (tp)->rcv_uto >> 1; \ + if ((tp)->rcv_uto & 1) \ + (tp)->t_uto_impl *= 60; \ + (tp)->t_uto_impl *= hz; \ + (tp)->t_uto_impl = min(V_tcp_uto_max, \ + max((tp)->t_uto_adv, \ + max((tp)->t_uto_impl, V_tcp_uto_min))); \ + (tp)->uto_flags &= ~TCPUTO_RCVD; \ + (tp)->uto_flags |= TCPUTO_IMPL; \ + } \ +} while(0) + #ifdef TCP_SIGNATURE /* * Defines which are needed by the xform_tcp module and tcp_[in|out]put @@ -276,7 +319,8 @@ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ -#define TOF_MAXOPT 0x0100 +#define TOF_UTO 0x0100 /* user timeout (RFC5482) */ +#define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ @@ -284,6 +328,7 @@ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ + u_int16_t to_uto; /* UTO option (RFC5482) */ }; /* @@ -470,7 +515,9 @@ u_long tcps_ecn_shs; /* ECN successful handshakes */ u_long tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ - u_long _pad[12]; /* 6 UTO, 6 TBD */ + u_long tcps_snduto; /* packets sent with TCP UTO opt */ + u_long tcps_rcvuto; /* packets received with TCP UTO opt */ + u_long _pad[10]; /* 10 TBD */ };
#ifdef _KERNEL @@ -625,6 +672,9 @@ VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */ VNET_DECLARE(int, tcp_do_ecn); /* TCP ECN enabled/disabled */ VNET_DECLARE(int, tcp_ecn_maxretries); +VNET_DECLARE(int, tcp_uto_min); +VNET_DECLARE(int, tcp_uto_max); +VNET_DECLARE(int, tcp_uto_enable); #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) @@ -633,6 +683,9 @@ #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) +#define V_tcp_uto_min VNET(tcp_uto_min) +#define V_tcp_uto_max VNET(tcp_uto_max) +#define V_tcp_uto_enable VNET(tcp_uto_enable) int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * Index: tcp_output.c =================================================================== --- tcp_output.c (revision 197703) +++ tcp_output.c (working copy) @@ -694,9 +694,40 @@ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ + /* + * We set the UTO option in the TCP header in two cases: the + * segment carries a SYN (with or without ACK), or it is a normal (non-forced) data segment. + */ + if (flags & TH_SYN || + (len && (tp->t_flags & TF_FORCEDATA) == 0)) + if (tp->uto_flags & (TCPUTO_NEED | TCPUTO_SENDING)) { + to.to_uto = tp->snd_uto; + to.to_flags |= TOF_UTO; + } /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); + + /* + * According to RFC 5482: + * "In addition to exchanging UTO options in the SYN segments, + * a connection that has enabled UTO options SHOULD include a + * UTO option in the first packet that does not have the SYN + * flag set. This helps to minimize the amount of state + * information TCP must keep for connections in + * non-synchronized states." + * So even though the UTO option is set in the SYN segment, + * we shall retransmit it.
+ */ + if (tp->uto_flags & (TCPUTO_NEED | TCPUTO_SENDING) && + (to.to_flags & TOF_UTO) == 0) { + if ((flags & TH_SYN) == 0) { + tp->uto_flags &= ~(TCPUTO_NEED | TCPUTO_SENDING); + tp->uto_flags |= TCPUTO_SENDING; + tp->uto_carrier = tp->snd_nxt + len; + } + TCPSTAT_INC(tcps_snduto); + } } #ifdef INET6 @@ -1323,6 +1354,11 @@ * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). + * + * TCP option UTO (user timeout, defined in RFC 5482), is an optional option + * that consumes 4 bytes. We attach the UTO option only when there is enough + * free space in the TCP header. + * Although UTO is optional, we should try our best to transmit it. */ int tcp_addoptions(struct tcpopt *to, u_char *optp) @@ -1437,6 +1473,19 @@ TCPSTAT_INC(tcps_sack_send_blocks); break; } + case TOF_UTO: + { + if (TCP_MAXOLEN - optlen < TCPOLEN_UTO) + continue; + *optp++ = TCPOPT_UTO; + *optp++ = TCPOLEN_UTO; + optlen += TCPOLEN_UTO; + to->to_uto = htons(to->to_uto); + bcopy((u_char *)&to->to_uto, optp, sizeof(to->to_uto)); + optp += sizeof(to->to_uto); + to->to_flags &= ~TOF_UTO; + break; + } default: panic("%s: unknown TCP option type", __func__); break; Index: tcp.h =================================================================== --- tcp.h (revision 197703) +++ tcp.h (working copy) @@ -96,6 +96,8 @@ #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 +#define TCPOPT_UTO 28 +#define TCPOLEN_UTO 4 /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ @@ -150,6 +152,14 @@ #define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */ #define TCP_INFO 0x20 /* retrieve tcp_info structure */ #define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */ +#define TCP_UTO 0x80 /* set tcp user timeout */ +struct 
tcputo { + int uto; + int flags; +}; +#define TCP_UTO_STORE 0x01 +#define TCP_UTO_ENABLE 0x02 +#define TCP_UTO_CHANGE 0x04 #define TCP_CA_NAME_MAX 16 /* max congestion control name length */ @@ -158,6 +168,7 @@ #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 +#define TCPI_OPT_UTO 0x20 /* * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits @@ -217,9 +228,10 @@ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ - + u_int32_t tcpi_uto; /* tcp user timeout value */ + /* Padding to grow without breaking ABI. */ - u_int32_t __tcpi_pad[29]; /* Padding. */ + u_int32_t __tcpi_pad[28]; /* Padding. */ }; #endif Index: tcp_syncache.c =================================================================== --- tcp_syncache.c (revision 197703) +++ tcp_syncache.c (working copy) @@ -774,6 +774,10 @@ #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; + if (sc->sc_flags & SCF_UTO) { + tp->uto_flags |= TCPUTO_RCVD; + tp->rcv_uto = sc->sc_peer_uto; + } } if (sc->sc_flags & SCF_ECN) @@ -1212,6 +1216,11 @@ sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; + if (to->to_flags & TOF_UTO) { + sc->sc_peer_uto = to->to_uto; + sc->sc_flags |= SCF_UTO; + TCPSTAT_INC(tcps_rcvuto); + } if (V_tcp_syncookies) { syncookie_generate(sch, sc, &flowtmp); Index: tcp_syncache.h =================================================================== --- tcp_syncache.h (revision 197703) +++ tcp_syncache.h (working copy) @@ -74,6 +74,7 @@ u_int8_t sc_ip_tos; /* IPv4 TOS */ u_int8_t sc_requested_s_scale:4, sc_requested_r_scale:4; + u_int16_t sc_peer_uto; /* peer's user timeout */ u_int16_t sc_flags; #ifndef TCP_OFFLOAD_DISABLE struct toe_usrreqs *sc_tu; /* TOE operations */ @@ -94,6 +95,7 @@ #define SCF_SIGNATURE 0x20 /* send MD5 digests */ #define SCF_SACK 0x80 /* send SACK option */ #define 
SCF_ECN 0x100 /* send ECN setup packet */ +#define SCF_UTO 0x200 /* UTO option received */ #define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ #define SYNCOOKIE_LIFETIME 16 /* seconds */ Index: tcp_usrreq.c =================================================================== --- tcp_usrreq.c (revision 197703) +++ tcp_usrreq.c (working copy) @@ -1192,6 +1192,11 @@ ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; + } + TCPT_RESOLVE_UTO(tp); + if (tp->uto_flags & TCPUTO_IMPL) { + ti->tcpi_options |= TCPI_OPT_UTO; + ti->tcpi_uto = tp->t_uto_impl / hz; } ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; @@ -1236,6 +1241,7 @@ struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; + struct tcputo tu; error = 0; inp = sotoinpcb(so); @@ -1345,6 +1351,53 @@ error = EINVAL; break; + case TCP_UTO: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &tu, sizeof tu, + sizeof tu); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (tu.flags & ~(TCP_UTO_ENABLE | TCP_UTO_STORE | + TCP_UTO_CHANGE)) { + INP_WUNLOCK(inp); /* do not leave the switch with the inpcb lock held */ + return (EINVAL); + } + if (tu.flags & TCP_UTO_ENABLE) + tp->uto_flags |= TCPUTO_ENABLE; + if (tu.flags & TCP_UTO_STORE) { + if (tu.uto > 0 && tu.uto <= 0x7FFF * 60) { + tp->uto_flags |= TCPUTO_NEED; /* arm transmission only for a validated value */ + if (tu.uto > 0x7FFF) { + tp->snd_uto = tu.uto / 60; + tp->snd_uto <<= 1; + tp->snd_uto |= 1; + } else { + tp->snd_uto = tu.uto; + tp->snd_uto <<= 1; + } + if (tp->uto_flags & TCPUTO_ENABLE && + tp->uto_flags & TCPUTO_NEED) { + uint32_t maxtime; + + maxtime = max(tu.uto*hz, + V_tcp_uto_min); + tp->t_uto_impl = + min(V_tcp_uto_max, maxtime); + tp->t_uto_adv = tp->t_uto_impl; + tp->uto_flags &= + ~TCPUTO_CHANGEABLE; + tp->uto_flags |= TCPUTO_IMPL; + } + } else + error = EINVAL; + } + if (tu.flags & TCP_UTO_CHANGE) + tp->uto_flags |= TCPUTO_CHANGEABLE; + INP_WUNLOCK(inp); + break; + default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -1388,6 +1441,20 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof
ti); break; + case TCP_UTO: + tu.flags = tu.uto = 0; /* whole struct is copied out: never expose uninitialized stack */ + if (tp->uto_flags & TCPUTO_ENABLE) + tu.flags |= TCP_UTO_ENABLE; + if (tp->uto_flags & TCPUTO_CHANGEABLE) + tu.flags |= TCP_UTO_CHANGE; + TCPT_RESOLVE_UTO(tp); + if (tp->uto_flags & TCPUTO_IMPL) { + tu.flags |= TCP_UTO_STORE; + tu.uto = tp->t_uto_impl / hz; + } + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &tu, sizeof tu); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT;