/*
 * TCP reassembly queue accounting: replace segment-count limits with
 * mbuf/byte budgets (diff between revisions 2d08de78744a and 97f3a97b1f1a).
 *
 * NOTE(review): this copy of the diff has lost its original line breaks and
 * the header names after each "#include", so treat it as a record of the
 * change; it will presumably not apply with patch(1) as-is.
 *
 * sys/netinet/tcp_var.h
 *   - struct tcpcb: t_segq changes type from struct tsegqe_head to the new
 *     struct tsegq, and the separate t_segqlen counter is removed.
 *   - struct tsegq holds the queue head pointer (lh_first) together with the
 *     current usage (tsegq_mbufs, tsegq_bytes) and the limits
 *     (tsegq_maxmbufs, tsegq_maxbytes).  The code keeps applying
 *     LIST_FIRST() and LIST_REMOVE() to it, which presumably depends on the
 *     head member being named lh_first -- confirm against sys/queue.h.
 *   - V_tcp_reass_qsize is replaced by V_tcp_reass_curmbufs and
 *     V_tcp_reass_curbytes, and tcp_reass_flush() is declared.
 *
 * sys/netinet/tcp_reass.c
 *   - Sysctls under net.inet.tcp.reass: maxsegments, cursegments and maxqlen
 *     are removed; curmbufs, curbytes, maxmbufs and maxbytes are added, all
 *     CTLFLAG_RD.  tcp_reass_init() now fetches the tunable
 *     "net.inet.tcp.reass.maxmbufs" instead of
 *     "net.inet.tcp.reass.maxsegments".
 *     NOTE(review): the removed maxsegments sysctl was CTLFLAG_RDTUN while
 *     the new maxmbufs sysctl is plain CTLFLAG_RD -- confirm this is wanted.
 *   - Global limits: maxmbufs = nmbclusters / 16 and
 *     maxbytes = maxmbufs * COMMON_MSS (1448).  Both are set in
 *     tcp_reass_init() and set again in tcp_reass_zone_change(); maxmbufs is
 *     also the cap passed to uma_zone_set_max() for the tseg_qent zone.
 *   - ADJ_REASS_COUNTERS(tsegq, bytesdiff, mbufsdiff) adds signed deltas to
 *     one queue's tsegq_bytes/tsegq_mbufs and, through atomic_add_int(), to
 *     V_tcp_reass_curbytes/V_tcp_reass_curmbufs.
 *   - tcp_reass_flush() unlinks every queue entry, frees its mbuf(s) and the
 *     entry itself, applies the accumulated negative deltas with a single
 *     ADJ_REASS_COUNTERS call, and KASSERTs that both per-queue counters
 *     are 0 afterwards.
 *   - tcp_reass() accumulates bytesdiff/mbufsdiff while queueing, trimming,
 *     dequeueing and delivering segments, and applies them once on each of
 *     the three exits of the "present data" section.  Despite its name, the
 *     "mbufs" counter moves by one per queue entry (struct tseg_qent).
 *   - The overflow counter is now bumped with atomic_add_int().
 *   - NOTE(review): in the overflow test the per-queue byte limit is checked
 *     with ">=" while the other three limits are checked with ">"; confirm
 *     that the asymmetry is intentional.
 *
 * sys/netinet/tcp_input.c and sys/netinet/tcp_usrreq.c
 *   - Per-connection limits are derived from a buffer size and a segment
 *     size as
 *         tsegq_maxmbufs = ((size / seg) << 3) / 7    (8/7 of size / seg)
 *         tsegq_maxbytes = tsegq_maxmbufs * seg
 *     using (newsize, t_maxseg) after a successful sbreserve_locked() of the
 *     receive buffer, (bufsize, mss) in the second tcp_input.c hunk, and
 *     (so_rcv.sb_hiwat, t_maxseg) just before t_state is set to TCPS_CLOSED
 *     in tcp_usrreq.c.
 *   - The DDB printout in tcp_usrreq.c no longer prints t_segqlen.
 *
 * sys/netinet/tcp_subr.c
 *   - The two open-coded loops that emptied a reassembly queue (one in
 *     tcp_discardcb(), one in the loop that walks the tcpbs) are replaced by
 *     calls to tcp_reass_flush().
 */
diff -r 2d08de78744a -r 97f3a97b1f1a sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c Sun Jul 18 20:23:10 2010 +0000 +++ b/sys/netinet/tcp_input.c Mon Jul 26 15:06:17 2010 +1000 @@ -1468,10 +1468,24 @@ * Set new socket buffer size. * Give up when limit is reached. */ - if (newsize) + if (newsize) { if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + else { + /* + * Scale reassembly queue to 8/7 + * the socket buffer size to + * allow a little wiggle room. + */ + tp->t_segq.tsegq_maxmbufs = + ((newsize / tp->t_maxseg) << + 3) / 7; + tp->t_segq.tsegq_maxbytes = + tp->t_segq.tsegq_maxmbufs * + tp->t_maxseg; + } + } m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } @@ -3302,6 +3316,9 @@ /* * While we're here, check the others too. */ + tp->t_segq.tsegq_maxmbufs = ((bufsize / mss) << 3) / 7; + tp->t_segq.tsegq_maxbytes = tp->t_segq.tsegq_maxmbufs * mss; + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; diff -r 2d08de78744a -r 97f3a97b1f1a sys/netinet/tcp_reass.c --- a/sys/netinet/tcp_reass.c Sun Jul 18 20:23:10 2010 +0000 +++ b/sys/netinet/tcp_reass.c Mon Jul 26 15:06:17 2010 +1000 @@ -74,39 +74,57 @@ #include #endif /* TCPDEBUG */ +#include + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); -static VNET_DEFINE(int, tcp_reass_maxseg) = 0; -#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg) -SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, - &VNET_NAME(tcp_reass_maxseg), 0, - "Global maximum number of TCP Segments in Reassembly Queue"); +VNET_DEFINE(int, tcp_reass_curmbufs) = 0; +SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, curmbufs, CTLFLAG_RD, + &VNET_NAME(tcp_reass_curmbufs), 0, + "Global number of mbufs currently held in TCP reassembly queues"); -VNET_DEFINE(int, tcp_reass_qsize) = 0; -SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, 
CTLFLAG_RD, - &VNET_NAME(tcp_reass_qsize), 0, - "Global number of TCP Segments currently in Reassembly Queue"); +VNET_DEFINE(int, tcp_reass_curbytes) = 0; +SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, curbytes, CTLFLAG_RD, + &VNET_NAME(tcp_reass_curbytes), 0, + "Global number of bytes currently held in TCP reassembly queues"); -static VNET_DEFINE(int, tcp_reass_maxqlen) = 48; -#define V_tcp_reass_maxqlen VNET(tcp_reass_maxqlen) -SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, - &VNET_NAME(tcp_reass_maxqlen), 0, - "Maximum number of TCP Segments per individual Reassembly Queue"); +static VNET_DEFINE(int, tcp_reass_maxmbufs) = 0; +#define V_tcp_reass_maxmbufs VNET(tcp_reass_maxmbufs) +SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxmbufs, CTLFLAG_RD, + &VNET_NAME(tcp_reass_maxmbufs), 0, + "Global maximum number of mbufs permitted across TCP reassembly queues"); + +static VNET_DEFINE(int, tcp_reass_maxbytes) = 0; +#define V_tcp_reass_maxbytes VNET(tcp_reass_maxbytes) +SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxbytes, CTLFLAG_RD, + &VNET_NAME(tcp_reass_maxbytes), 0, + "Global maximum number of bytes permitted across TCP reassembly queues"); static VNET_DEFINE(int, tcp_reass_overflows) = 0; #define V_tcp_reass_overflows VNET(tcp_reass_overflows) SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, &VNET_NAME(tcp_reass_overflows), 0, - "Global number of TCP Segment Reassembly Queue Overflows"); + "Global number of overflows across TCP reassembly queues"); + +/* Most common MSS for bulk transfers. */ +#define COMMON_MSS 1448 + +#define ADJ_REASS_COUNTERS(tsegq, bytesdiff, mbufsdiff) do { \ + (tsegq)->tsegq_bytes += (bytesdiff); \ + (tsegq)->tsegq_mbufs += (mbufsdiff); \ + atomic_add_int(&V_tcp_reass_curbytes, (bytesdiff)); \ + atomic_add_int(&V_tcp_reass_curmbufs, (mbufsdiff)); \ +} while(0) /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { - - V_tcp_reass_maxseg = nmbclusters / 16; - 
uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); + /* XXXLAS: Better justify these magic numbers somehow. */ + V_tcp_reass_maxmbufs = nmbclusters / 16; + V_tcp_reass_maxbytes = V_tcp_reass_maxmbufs * COMMON_MSS; + uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxmbufs); } VNET_DEFINE(uma_zone_t, tcp_reass_zone); @@ -114,13 +132,12 @@ void tcp_reass_init(void) { - - V_tcp_reass_maxseg = nmbclusters / 16; - TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", - &V_tcp_reass_maxseg); + V_tcp_reass_maxmbufs = nmbclusters / 16; + TUNABLE_INT_FETCH("net.inet.tcp.reass.maxmbufs", &V_tcp_reass_maxmbufs); + V_tcp_reass_maxbytes = V_tcp_reass_maxmbufs * COMMON_MSS; V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); + uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxmbufs); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -134,18 +151,50 @@ } #endif +void +tcp_reass_flush(struct tsegq *t_segq) +{ + struct tseg_qent *qe; + int bytesdiff, mbufsdiff; + + bytesdiff = mbufsdiff = 0; + + while ((qe = LIST_FIRST(t_segq)) != NULL) { + LIST_REMOVE(qe, tqe_q); + bytesdiff -= qe->tqe_len; + mbufsdiff--; + m_freem(qe->tqe_m); + uma_zfree(V_tcp_reass_zone, qe); + } + + ADJ_REASS_COUNTERS(t_segq, bytesdiff, mbufsdiff); + + KASSERT((t_segq->tsegq_bytes == 0), + ("Reassembly queue byte count is %d instead of 0 after flush!", + t_segq->tsegq_bytes)); + KASSERT((t_segq->tsegq_mbufs == 0), + ("Reassembly queue mbuf count is %d instead of 0 after flush!", + t_segq->tsegq_mbufs)); +} + int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { struct tseg_qent *q; - struct tseg_qent *p = NULL; + struct tseg_qent *p; struct tseg_qent *nq; - struct tseg_qent *te = NULL; - struct socket *so = tp->t_inpcb->inp_socket; - int flags; + struct tseg_qent *te; + struct tsegq *t_segq; + struct 
socket *so; + int flags, bytesdiff, mbufsdiff; INP_WLOCK_ASSERT(tp->t_inpcb); + p = te = NULL; + so = tp->t_inpcb->inp_socket; + t_segq = &tp->t_segq; + bytesdiff = mbufsdiff = 0; + /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). @@ -166,9 +215,11 @@ * process the missing segment. */ if (th->th_seq != tp->rcv_nxt && - (V_tcp_reass_qsize + 1 >= V_tcp_reass_maxseg || - tp->t_segqlen >= V_tcp_reass_maxqlen)) { - V_tcp_reass_overflows++; + (V_tcp_reass_curmbufs + 1 > V_tcp_reass_maxmbufs || + V_tcp_reass_curbytes + *tlenp > V_tcp_reass_maxbytes || + t_segq->tsegq_bytes + *tlenp >= t_segq->tsegq_maxbytes || + t_segq->tsegq_mbufs + 1 > t_segq->tsegq_maxmbufs)) { + atomic_add_int(&V_tcp_reass_overflows, 1); TCPSTAT_INC(tcps_rcvmemdrop); m_freem(m); *tlenp = 0; @@ -186,8 +237,6 @@ *tlenp = 0; return (0); } - tp->t_segqlen++; - V_tcp_reass_qsize++; /* * Find a segment which begins after this one does. @@ -213,8 +262,6 @@ TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp); m_freem(m); uma_zfree(V_tcp_reass_zone, te); - tp->t_segqlen--; - V_tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. @@ -231,6 +278,9 @@ TCPSTAT_INC(tcps_rcvoopack); TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); + bytesdiff = *tlenp; + mbufsdiff++; + /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. @@ -243,15 +293,16 @@ q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); + bytesdiff -= i; break; } + mbufsdiff--; + bytesdiff -= q->tqe_len; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(V_tcp_reass_zone, q); - tp->t_segqlen--; - V_tcp_reass_qsize--; q = nq; } @@ -271,15 +322,21 @@ * Present data to user, advancing rcv_nxt through * completed sequence space. 
*/ - if (!TCPS_HAVEESTABLISHED(tp->t_state)) + if (!TCPS_HAVEESTABLISHED(tp->t_state)) { + ADJ_REASS_COUNTERS(t_segq, bytesdiff, mbufsdiff); return (0); + } q = LIST_FIRST(&tp->t_segq); - if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) { + ADJ_REASS_COUNTERS(t_segq, bytesdiff, mbufsdiff); return (0); + } SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; + bytesdiff -= q->tqe_len; + mbufsdiff--; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) @@ -287,10 +344,9 @@ else sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(V_tcp_reass_zone, q); - tp->t_segqlen--; - V_tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ADJ_REASS_COUNTERS(t_segq, bytesdiff, mbufsdiff); ND6_HINT(tp); sorwakeup_locked(so); return (flags); diff -r 2d08de78744a -r 97f3a97b1f1a sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c Sun Jul 18 20:23:10 2010 +0000 +++ b/sys/netinet/tcp_subr.c Mon Jul 26 15:06:17 2010 +1000 @@ -106,6 +106,7 @@ #include #endif /*IPSEC*/ +#include #include #include @@ -768,7 +769,6 @@ void tcp_discardcb(struct tcpcb *tp) { - struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 @@ -857,13 +857,8 @@ } /* free the reassembly queue, if any */ - while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { - LIST_REMOVE(q, tqe_q); - m_freem(q->tqe_m); - uma_zfree(V_tcp_reass_zone, q); - tp->t_segqlen--; - V_tcp_reass_qsize--; - } + tcp_reass_flush(&tp->t_segq); + /* Disconnect offload device, if any. 
*/ tcp_offload_detach(tp); @@ -921,7 +916,6 @@ CURVNET_SET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; - struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, @@ -937,14 +931,7 @@ continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { - while ((te = LIST_FIRST(&tcpb->t_segq)) - != NULL) { - LIST_REMOVE(te, tqe_q); - m_freem(te->tqe_m); - uma_zfree(V_tcp_reass_zone, te); - tcpb->t_segqlen--; - V_tcp_reass_qsize--; - } + tcp_reass_flush(&tcpb->t_segq); tcp_clean_sackreport(tcpb); } INP_WUNLOCK(inpb); diff -r 2d08de78744a -r 97f3a97b1f1a sys/netinet/tcp_usrreq.c --- a/sys/netinet/tcp_usrreq.c Sun Jul 18 20:23:10 2010 +0000 +++ b/sys/netinet/tcp_usrreq.c Mon Jul 26 15:06:17 2010 +1000 @@ -1460,6 +1460,11 @@ INP_INFO_WUNLOCK(&V_tcbinfo); return (ENOBUFS); } + + tp->t_segq.tsegq_maxmbufs = ((so->so_rcv.sb_hiwat / tp->t_maxseg) << 3) + / 7; + tp->t_segq.tsegq_maxbytes = tp->t_segq.tsegq_maxmbufs * tp->t_maxseg; + tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); @@ -1757,8 +1762,8 @@ indent += 2; db_print_indent(indent); - db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", - LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); + db_printf("t_segq first: %p t_dupacks: %d\n", + LIST_FIRST(&tp->t_segq), tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", diff -r 2d08de78744a -r 97f3a97b1f1a sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h Sun Jul 18 20:23:10 2010 +0000 +++ b/sys/netinet/tcp_var.h Mon Jul 26 15:06:17 2010 +1000 @@ -44,10 +44,14 @@ VNET_DECLARE(int, tcp_do_rfc1323); #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) -VNET_DECLARE(int, tcp_reass_qsize); VNET_DECLARE(struct uma_zone *, tcp_reass_zone); -#define V_tcp_reass_qsize VNET(tcp_reass_qsize) #define V_tcp_reass_zone VNET(tcp_reass_zone) + +VNET_DECLARE(int, tcp_reass_curmbufs); +#define V_tcp_reass_curmbufs VNET(tcp_reass_curmbufs) + +VNET_DECLARE(int, tcp_reass_curbytes); 
+#define V_tcp_reass_curbytes VNET(tcp_reass_curbytes) #endif /* _KERNEL */ /* TCP segment queue entry */ @@ -57,7 +61,14 @@ struct tcphdr *tqe_th; /* a pointer to tcp header */ struct mbuf *tqe_m; /* mbuf contains packet */ }; -LIST_HEAD(tsegqe_head, tseg_qent); + +struct tsegq { + int tsegq_mbufs; + int tsegq_bytes; + int tsegq_maxbytes; + int tsegq_maxmbufs; + struct tseg_qent *lh_first; +}; struct sackblk { tcp_seq start; /* start seq no. of sack block */ @@ -103,9 +114,8 @@ * Organized for 16 byte cacheline efficiency. */ struct tcpcb { - struct tsegqe_head t_segq; /* segment reassembly queue */ + struct tsegq t_segq; /* segment reassembly queue */ void *t_pspare[2]; /* new reassembly queue */ - int t_segqlen; /* segment reassembly queue length */ int t_dupacks; /* consecutive dup acks recd */ struct tcp_timer *t_timers; /* All the TCP timers in one struct */ @@ -616,6 +626,7 @@ #ifdef VIMAGE void tcp_reass_destroy(void); #endif +void tcp_reass_flush(struct tsegq *t_segq); void tcp_input(struct mbuf *, int); u_long tcp_maxmtu(struct in_conninfo *, int *); u_long tcp_maxmtu6(struct in_conninfo *, int *);