Index: kern/uipc_socket.c =================================================================== RCS file: /home/ncvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.286 diff -u -p -r1.286 uipc_socket.c --- kern/uipc_socket.c 22 Nov 2006 23:54:29 -0000 1.286 +++ kern/uipc_socket.c 12 Dec 2006 21:55:52 -0000 @@ -367,6 +367,10 @@ socreate(dom, aso, type, proto, cred, td knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd), NULL, NULL, NULL); so->so_count = 1; + /* + * Auto-sizing of socket buffers is managed by the protocols and + * the appropriate flags must be set in the pru_attach function. + */ error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); if (error) { KASSERT(so->so_count == 1, ("socreate: so_count %d", @@ -441,6 +445,8 @@ sonewconn(head, connstatus) so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; so->so_state |= connstatus; ACCEPT_LOCK(); if (connstatus) { @@ -2107,6 +2316,8 @@ sosetopt(so, sopt) error = ENOBUFS; goto bad; } + (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : + &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; break; /* Index: netinet/tcp_input.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.311 diff -u -p -r1.311 tcp_input.c --- netinet/tcp_input.c 12 Dec 2006 12:17:56 -0000 1.311 +++ netinet/tcp_input.c 12 Dec 2006 21:56:04 -0000 @@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); +int tcp_do_autorcvbuf = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, + &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); + +int tcp_autorcvbuf_inc = 16*1024; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, + &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer"); + +int tcp_autorcvbuf_max = 256*1024; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, + &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); + struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; @@ -1295,6 +1313,8 @@ after_listen: } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { + int newsize = 0; /* automatic sockbuf scaling */ + KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; @@ -1321,18 +1341,82 @@ after_listen: tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ - /* #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - * Add data to socket buffer. - */ + /* + * Automatic sizing of receive socket buffer. Often the send + * buffer size is not optimally adjusted to the actual network + * conditions at hand (delay bandwidth product). Setting the + * buffer size too small limits throughput on links with high + * bandwidth and high delay (eg. trans-continental/oceanic links). + * + * On the receive side the socket buffer memory is only rarely + * used to any significant extent. This allows us to be much + * more aggressive in scaling the receive socket buffer. For + * the case that the buffer space is actually used to a large + * extent and we run out of kernel memory we can simply drop + * the new segments; TCP on the sender will just retransmit it + * later. Setting the buffer size too big may only consume too + * much kernel memory if the application doesn't read() from + * the socket or packet loss or reordering makes use of the + * reassembly queue. + * + * The criteria to step up the receive buffer one notch are: + * 1. the number of bytes received during the time it takes + * one timestamp to be reflected back to us (the RTT); + * 2. received bytes per RTT is within seven eighth of the + * current socket buffer size; + * 3. receive buffer size has not hit maximal automatic size; + * 4. Profit! + * + * This algorithm does one step per RTT at most and only if + * we receive a bulk stream w/o packet losses or reorderings. + * Shrinking the buffer during idle times is not necessary as + * it doesn't consume any memory when idle. + */ + if (tcp_do_autorcvbuf && + to.to_tsecr && + (so->so_rcv.sb_flags & SB_AUTOSIZE)) { + if (to.to_tsecr > tp->rfbuf_ts && + to.to_tsecr - tp->rfbuf_ts < hz) { + if (tp->rfbuf_cnt > + (so->so_rcv.sb_hiwat / 8 * 7) && + so->so_rcv.sb_hiwat < + tcp_autorcvbuf_max) { + newsize = + min(so->so_rcv.sb_hiwat + + tcp_autorcvbuf_inc, + tcp_autorcvbuf_max); +#if 1 + log(LOG_DEBUG, "%s: hiwat %i, ref_ts %i, ts %i, " + "count %i, new %i, max %i\n", + __func__, so->so_rcv.sb_hiwat, tp->rfbuf_ts, + to.to_tsecr, tp->rfbuf_cnt, newsize, (int)sb_max); +#endif + } + /* Start over with next RTT. */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + } else + tp->rfbuf_cnt += tlen; /* add up */ + } + + /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { + /* + * Set new socket buffer size. + * Give up when limit is reached. + */ + if (newsize) + if (!sbreserve_locked(&so->so_rcv, + newsize, so, curthread)) + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } @@ -1361,6 +1445,10 @@ after_listen: tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } + /* Reset receive buffer auto scaling when not in bulk receive mode. */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + switch (tp->t_state) { /* Index: netinet/tcp_output.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v retrieving revision 1.121 diff -u -p -r1.121 tcp_output.c --- netinet/tcp_output.c 22 Oct 2006 11:52:16 -0000 1.121 +++ netinet/tcp_output.c 12 Dec 2006 21:56:04 -0000 @@ -48,6 +48,7 @@ #include #include #include +#include #include @@ -103,13 +104,26 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, loca &ss_fltsz_local, 1, "Slow start flight size for local networks"); int tcp_do_newreno = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, - 0, "Enable NewReno Algorithms"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, + &tcp_do_newreno, 0, "Enable NewReno Algorithms"); int tcp_do_tso = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); +int tcp_do_autosndbuf = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, + &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); + +int tcp_autosndbuf_inc = 8*1024; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, + &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); + +int tcp_autosndbuf_max = 256*1024; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, + &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); + + /* * Tcp output routine: figure out what should be sent and send it. */ @@ -380,11 +394,58 @@ after_sack_rexmit: } } + /* len will be >= 0 after this point. */ + KASSERT(len >= 0, ("%s: len < 0", __func__)); + /* - * len will be >= 0 after this point. Truncate to the maximum - * segment length or enable TCP Segmentation Offloading (if supported - * by hardware) and ensure that FIN is removed if the length no longer - * contains the last data byte. + * Automatic sizing of send socket buffer. Often the send buffer + * size is not optimally adjusted to the actual network conditions + * at hand (delay bandwidth product). Setting the buffer size too + * small limits throughput on links with high bandwidth and high + * delay (eg. trans-continental/oceanic links). Setting the + * buffer size too big consumes too much real kernel memory, + * especially with many connections on busy servers. + * + * The criteria to step up the send buffer one notch are: + * 1. receive window of remote host is larger than send buffer + * (with a fudge factor of 5/4th); + * 2. send buffer is filled to 7/8th with data (so we actually + * have data to make use of it); + * 3. send buffer fill has not hit maximal automatic size; + * 4. our send window (slow start and cogestion controlled) is + * larger than sent but unacknowledged data in send buffer. + * + * The remote host receive window scaling factor may limit the + * growing of the send buffer before it reaches its allowed + * maximum. + * + * Optional: Shrink send buffer during idle periods together + * with congestion window. Requires another timer. Has to + * wait for upcoming tcp timer rewrite. + */ + if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { + if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && + so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && + so->so_snd.sb_cc < tcp_autosndbuf_max && + sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { +#if 1 + log(LOG_DEBUG, "%s: inc sockbuf, old %i, new %i, " + "sb_cc %i, snd_wnd %i, sendwnd %i\n", + __func__, so->so_snd.sb_hiwat, + so->so_snd.sb_hiwat + tcp_autosndbuf_inc, + so->so_snd.sb_cc, (int)tp->snd_wnd, (int)sendwin); +#endif + if (!sbreserve_locked(&so->so_snd, + min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, + tcp_autosndbuf_max), so, curthread)) + so->so_snd.sb_flags &= ~SB_AUTOSIZE; + } + } + + /* + * Truncate to the maximum segment length or enable TCP Segmentation + * Offloading (if supported by hardware) and ensure that FIN is removed + * if the length no longer contains the last data byte. * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and @@ -606,6 +705,10 @@ send: optlen += TCPOLEN_TSTAMP_APPA; } + /* Set receive buffer autosizing timestamp. */ + if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) + tp->rfbuf_ts = ticks; + #ifdef TCP_SIGNATURE #ifdef INET6 if (!isipv6) Index: netinet/tcp_syncache.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_syncache.c,v retrieving revision 1.102 diff -u -p -r1.102 tcp_syncache.c --- netinet/tcp_syncache.c 22 Oct 2006 11:52:17 -0000 1.102 +++ netinet/tcp_syncache.c 12 Dec 2006 21:56:04 -0000 @@ -983,9 +1011,15 @@ syncache_add(struct in_conninfo *inc, st if (to->to_flags & TOF_SCALE) { int wscale = 0; - /* Compute proper scaling value from buffer space */ + /* + * Compute proper scaling value from buffer space: + * Place window scale into sweet spot where it is + * about one fourth of the MSS. This allows us to + * scale the receive buffer over a wide range while + * not losing any efficiency or fine granularity. + */ while (wscale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << wscale) < sb_hiwat) + (0x1 << wscale) < tcp_minmss) /* XXX */ wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_requested_s_scale; Index: netinet/tcp_usrreq.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.142 diff -u -p -r1.142 tcp_usrreq.c --- netinet/tcp_usrreq.c 22 Nov 2006 17:16:54 -0000 1.142 +++ netinet/tcp_usrreq.c 12 Dec 2006 21:56:05 -0000 @@ -1131,9 +1134,13 @@ tcp_connect(tp, nam, td) inp->inp_laddr = laddr; in_pcbrehash(inp); - /* Compute window scaling to request. */ + /* + * Compute window scaling to request: + * Scale to fit into sweet spot. See tcp_syncache.c. + * XXX: This should be moved to tcp_output(). + */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + (0x1 << tp->request_r_scale) < tcp_minmss) /* XXX */ tp->request_r_scale++; soisconnecting(so); @@ -1441,6 +1448,8 @@ tcp_attach(so) if (error) return (error); } + so->so_rcv.sb_flags |= SB_AUTOSIZE; + so->so_snd.sb_flags |= SB_AUTOSIZE; INP_INFO_WLOCK(&tcbinfo); error = in_pcballoc(so, &tcbinfo); if (error) { Index: netinet/tcp_var.h =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v retrieving revision 1.137 diff -u -p -r1.137 tcp_var.h --- netinet/tcp_var.h 13 Sep 2006 13:08:27 -0000 1.137 +++ netinet/tcp_var.h 12 Dec 2006 21:56:05 -0000 @@ -202,6 +202,8 @@ struct tcpcb { episode starts at this seq number */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ + u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ + int rfbuf_cnt; /* recv buffer autoscaling byte count */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) Index: sys/socketvar.h =================================================================== RCS file: /home/ncvs/src/sys/sys/socketvar.h,v retrieving revision 1.154 diff -u -p -r1.154 socketvar.h --- sys/socketvar.h 1 Aug 2006 10:30:26 -0000 1.154 +++ sys/socketvar.h 16 Nov 2006 22:41:18 -0000 @@ -128,6 +128,7 @@ struct socket { #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_AIO 0x80 /* AIO operations queued */ #define SB_KNOTE 0x100 /* kernel note attached */ +#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ void (*so_upcall)(struct socket *, void *, int); void *so_upcallarg;