--- - Add some helper hook points to the TCP stack. The hooks allow Khelp --- modules to access inbound/outbound events and associated data for --- established TCP connections. The hooks only run if at least one hook --- function is registered for the hook point, ensuring the impact on the --- stack is effectively zero when no TCP Khelp modules are loaded. struct --- tcp_hhook_data is passed as contextual data to any registered Khelp module --- hook functions. --- --- - Add a new sackhint to track the most recent and highest sacked --- sequence number. This will be used by the incoming Enhanced RTT Khelp --- module. --- --- - Add an OSD (Object Specific Data) pointer to struct tcpcb to allow Khelp --- modules to associate per-connection data with the tcpcb. --- --- In collaboration with: David Hayes and --- Grenville Armitage --- Sponsored by: FreeBSD Foundation --- Reviewed by: bz, others along the way --- MFC after: 3 months (maybe) --- diff -r 8e66402f5dd1 sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c Wed Dec 22 00:59:20 2010 +1100 +++ b/sys/netinet/tcp_input.c Tue Dec 28 01:37:52 2010 +1100 @@ -47,20 +47,21 @@ __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include +#include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include @@ -211,36 +212,56 @@ static void tcp_do_segment(struct mbuf static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type); static void inline cc_conn_init(struct tcpcb *tp); static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); +static void inline hhook_run_tcp_est_in(struct tcpcb *tp, + struct tcphdr *th, struct tcpopt *to); /* * Kernel module interface for updating tcpstat. The argument is an index * into tcpstat treated as an array of u_long. While this encodes the * general layout of tcpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_tcpstat_inc(int statnum) { (*((u_long *)&V_tcpstat + statnum))++; } /* + * Wrapper for the TCP established input helper hook. + */ +static void inline +hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) +{ + struct tcp_hhook_data hhook_data; + + if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { + hhook_data.tp = tp; + hhook_data.th = th; + hhook_data.to = to; + + hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, + tp->osd); + } +} + +/* * CC wrapper hook functions */ static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd)) tp->ccv->flags |= CCF_CWND_LIMITED; @@ -1479,20 +1500,24 @@ tcp_do_segment(struct mbuf *m, struct tc ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); + + /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ + hhook_run_tcp_est_in(tp, th, &to); + TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This @@ -2192,20 +2217,24 @@ tcp_do_segment(struct mbuf *m, struct tc case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); + + /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ + hhook_run_tcp_est_in(tp, th, &to); + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet diff -r 8e66402f5dd1 sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c Wed Dec 22 00:59:20 2010 +1100 +++ b/sys/netinet/tcp_output.c Tue Dec 28 01:37:52 2010 +1100 @@ -33,20 +33,21 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include +#include #include #include #include #include #include #include #include #include #include @@ -119,23 +120,47 @@ VNET_DEFINE(int, tcp_autosndbuf_inc) = 8 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 256*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); +static void inline hhook_run_tcp_est_out(struct tcpcb *tp, + struct tcphdr *th, struct tcpopt *to, + long len, int tso); static void inline cc_after_idle(struct tcpcb *tp); /* + * Wrapper for the TCP established ouput helper hook. + */ +static void inline +hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, + struct tcpopt *to, long len, int tso) +{ + struct tcp_hhook_data hhook_data; + + if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { + hhook_data.tp = tp; + hhook_data.th = th; + hhook_data.to = to; + hhook_data.len = len; + hhook_data.tso = tso; + + hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, + tp->osd); + } +} + +/* * CC wrapper hook functions */ static void inline cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); } @@ -1127,20 +1152,24 @@ timer: if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } + /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ + hhook_run_tcp_est_out(tp, th, &to, len, tso); + + #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { diff -r 8e66402f5dd1 sys/netinet/tcp_sack.c --- a/sys/netinet/tcp_sack.c Wed Dec 22 00:59:20 2010 +1100 +++ b/sys/netinet/tcp_sack.c Tue Dec 28 01:37:52 2010 +1100 @@ -418,20 +418,21 @@ tcp_sack_doack(struct tcpcb *tp, struct * one pass in order to reduce the number of compares especially when * the bandwidth-delay product is large. * * Note: Typically, in the first RTT of SACK recovery, the highest * three or four SACK blocks with the same ack number are received. * In the second RTT, if retransmitted data segments are not lost, * the highest three or four SACK blocks with ack number advancing * are received. */ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ + tp->sackhint.last_sack_ack = sblkp->end; if (SEQ_LT(tp->snd_fack, sblkp->start)) { /* * The highest SACK block is beyond fack. Append new SACK * hole at the tail. If the second or later highest SACK * blocks are also beyond the current fack, they will be * inserted by way of hole splitting in the while-loop below. */ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); if (temp != NULL) { tp->snd_fack = sblkp->end; diff -r 8e66402f5dd1 sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c Wed Dec 22 00:59:20 2010 +1100 +++ b/sys/netinet/tcp_subr.c Tue Dec 28 01:37:52 2010 +1100 @@ -34,21 +34,23 @@ __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include +#include #include +#include #include #include #include #include #ifdef INET6 #include #endif #include #include #include @@ -207,20 +209,22 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, #ifdef TCP_SORECEIVE_STREAM static int tcp_soreceive_stream = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #endif VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) +VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); + static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_isn_tick(void *); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize @@ -232,20 +236,21 @@ static char * tcp_log_addr(struct in_con /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; + struct osd osd; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) @@ -271,20 +276,27 @@ tcp_inpcb_init(void *mem, int size, int INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } void tcp_init(void) { int hashsize; + if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, + &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) + printf("%s: WARNING: unable to register helper hook\n", __func__); + if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, + &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) + printf("%s: WARNING: unable to register helper hook\n", __func__); + hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); /* @@ -654,20 +666,26 @@ tcp_newtcpcb(struct inpcb *inp) KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { uma_zfree(V_tcpcb_zone, tm); return (NULL); } + tp->osd = &tm->osd; + if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { + uma_zfree(V_tcpcb_zone, tm); + return (NULL); + } + #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; @@ -887,20 +905,22 @@ tcp_discardcb(struct tcpcb *tp) tcp_reass_flush(tp); /* Disconnect offload device, if any. */ tcp_offload_detach(tp); tcp_free_sackholes(tp); /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); + khelp_destroy_osd(tp->osd); + CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ diff -r 8e66402f5dd1 sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h Wed Dec 22 00:59:20 2010 +1100 +++ b/sys/netinet/tcp_var.h Tue Dec 28 01:37:52 2010 +1100 @@ -63,20 +63,21 @@ struct sackblk { struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; + tcp_seq last_sack_ack; /* Most recent/largest sacked ack seq */ int ispare; /* explicit pad for 64bit alignment */ uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */ }; struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; @@ -193,20 +194,21 @@ struct tcpcb { int t_rttlow; /* smallest observerved RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toe_usrreqs *t_tu; /* offload operations vector */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ int t_bytes_acked; /* # bytes acked during current RTT */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; + struct osd *osd; int t_ispare; /* explicit pad for 64bit alignment */ void *t_pspare2[4]; /* 4 TBD */ uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) */ }; /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x000001 /* ack peer immediately */ @@ -493,20 +495,36 @@ struct tcpstat { */ #define TCPSTAT_ADD(name, val) V_tcpstat.name += (val) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(u_long)) + +/* + * TCP specific helper hook point identifiers. + */ +#define HHOOK_TCP_EST_IN 0 +#define HHOOK_TCP_EST_OUT 1 +#define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT + +struct tcp_hhook_data { + struct tcpcb *tp; + struct tcphdr *th; + struct tcpopt *to; + long len; + int tso; + tcp_seq curack; +}; #endif /* * TCB structure exported to user-land via sysctl(3). * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcp_timer { int tt_rexmt; /* retransmit timer */ @@ -600,20 +618,23 @@ VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */ VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */ #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) VNET_DECLARE(int, tcp_do_ecn); /* TCP ECN enabled/disabled */ VNET_DECLARE(int, tcp_ecn_maxretries); #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) +VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); +#define V_tcp_hhh VNET(tcp_hhh) + int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); #if 0 int tcp_twrecycleable(struct tcptw *tw); #endif void tcp_twclose(struct tcptw *_tw, int _reuse);