diff -r 12a803881681 sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c Fri Dec 17 10:37:39 2010 +1100 +++ b/sys/netinet/tcp_input.c Fri Dec 17 10:38:20 2010 +1100 @@ -47,20 +47,21 @@ __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include +#include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include @@ -1252,21 +1253,21 @@ drop: static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { int thflags, acked, ourfinisacked, needoutput = 0; int rstreason, todrop, win; u_long tiwin; struct tcpopt to; - + struct tcp_hhook_data hhook_data; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; @@ -1479,20 +1480,29 @@ tcp_do_segment(struct mbuf *m, struct tc ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); + + /* Run HHOOK_TCP_ESTABLISHED_IN helper hook functions. */ + hhook_data.new_sacked_bytes = 0; + hhook_data.tp = tp; + hhook_data.th = th; + hhook_data.to = &to; + HHOOKS_RUN_IF(V_tcp_hhh[HHOOK_TCP_EST_IN], + &hhook_data, tp->osd); + TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This @@ -2188,24 +2198,35 @@ tcp_do_segment(struct mbuf *m, struct tc case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } + hhook_data.new_sacked_bytes = 0; if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || - !TAILQ_EMPTY(&tp->snd_holes))) + !TAILQ_EMPTY(&tp->snd_holes))) { tcp_sack_doack(tp, &to, th->th_ack); + /* XXXDH: should only be one if a productive SACK */ + hhook_data.new_sacked_bytes = 1; + } + + hhook_data.tp = tp; + hhook_data.th = th; + hhook_data.to = &to; + HHOOKS_RUN_IF(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, + tp->osd); + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet diff -r 12a803881681 sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c Fri Dec 17 10:37:39 2010 +1100 +++ b/sys/netinet/tcp_output.c Fri Dec 17 10:38:20 2010 +1100 @@ -33,20 +33,21 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include +#include #include #include #include #include #include #include #include #include #include @@ -156,20 +157,21 @@ tcp_output(struct tcpcb *tp) u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC unsigned ipsec_optlen = 0; #endif int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso; struct tcpopt to; + struct tcp_hhook_data hhook_data; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif @@ -1127,20 +1129,28 @@ timer: if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } + hhook_data.th = th; + hhook_data.tp = tp; + hhook_data.to = &to; + hhook_data.len = len; + hhook_data.tso = tso; + HHOOKS_RUN_IF(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, tp->osd); + + #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { diff -r 12a803881681 sys/netinet/tcp_sack.c --- a/sys/netinet/tcp_sack.c Fri Dec 17 10:37:39 2010 +1100 +++ b/sys/netinet/tcp_sack.c Fri Dec 17 10:38:20 2010 +1100 @@ -418,20 +418,21 @@ tcp_sack_doack(struct tcpcb *tp, struct * one pass in order to reduce the number of compares especially when * the bandwidth-delay product is large. * * Note: Typically, in the first RTT of SACK recovery, the highest * three or four SACK blocks with the same ack number are received. * In the second RTT, if retransmitted data segments are not lost, * the highest three or four SACK blocks with ack number advancing * are received. */ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ + tp->sackhint.last_sack_ack = sblkp->end; if (SEQ_LT(tp->snd_fack, sblkp->start)) { /* * The highest SACK block is beyond fack. Append new SACK * hole at the tail. If the second or later highest SACK * blocks are also beyond the current fack, they will be * inserted by way of hole splitting in the while-loop below. */ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); if (temp != NULL) { tp->snd_fack = sblkp->end; diff -r 12a803881681 sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c Fri Dec 17 10:37:39 2010 +1100 +++ b/sys/netinet/tcp_subr.c Fri Dec 17 10:38:20 2010 +1100 @@ -34,21 +34,23 @@ __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include +#include #include +#include #include #include #include #include #ifdef INET6 #include #endif #include #include #include @@ -207,20 +209,22 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, #ifdef TCP_SORECEIVE_STREAM static int tcp_soreceive_stream = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #endif VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) +VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); + static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_isn_tick(void *); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize @@ -232,20 +236,21 @@ static char * tcp_log_addr(struct in_con /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; + struct osd osd; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) @@ -271,20 +276,27 @@ tcp_inpcb_init(void *mem, int size, int INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } void tcp_init(void) { int hashsize; + if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, + &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) + printf("%s: WARNING: unable to register helper hook\n", __func__); + if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, + &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) + printf("%s: WARNING: unable to register helper hook\n", __func__); + hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); /* @@ -654,20 +666,26 @@ tcp_newtcpcb(struct inpcb *inp) KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { uma_zfree(V_tcpcb_zone, tm); return (NULL); } + tp->osd = &tm->osd; + if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { + uma_zfree(V_tcpcb_zone, tm); + return (NULL); + } + #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; @@ -887,20 +905,22 @@ tcp_discardcb(struct tcpcb *tp) tcp_reass_flush(tp); /* Disconnect offload device, if any. */ tcp_offload_detach(tp); tcp_free_sackholes(tp); /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); + khelp_destroy_osd(tp->osd); + CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ diff -r 12a803881681 sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h Fri Dec 17 10:37:39 2010 +1100 +++ b/sys/netinet/tcp_var.h Fri Dec 17 10:38:20 2010 +1100 @@ -63,21 +63,21 @@ struct sackblk { struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; - + tcp_seq last_sack_ack; /* Last sack block acked with current pkt - used for enhanced RTT calculations*/ int ispare; /* explicit pad for 64bit alignment */ uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */ }; struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ @@ -193,20 +193,21 @@ struct tcpcb { int t_rttlow; /* smallest observerved RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toe_usrreqs *t_tu; /* offload operations vector */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ int t_bytes_acked; /* # bytes acked during current RTT */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; + struct osd *osd; int t_ispare; /* explicit pad for 64bit alignment */ void *t_pspare2[4]; /* 4 TBD */ uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) */ }; /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x000001 /* ack peer immediately */ @@ -493,20 +494,37 @@ struct tcpstat { */ #define TCPSTAT_ADD(name, val) V_tcpstat.name += (val) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(u_long)) + +/* + * TCP specific helper hook point identifiers. + */ +#define HHOOK_TCP_EST_IN 0 +#define HHOOK_TCP_EST_OUT 1 +#define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT + +struct tcp_hhook_data { + struct tcpcb *tp; + struct tcphdr *th; + struct tcpopt *to; + long len; + int tso; + tcp_seq curack; + int new_sacked_bytes; +}; #endif /* * TCB structure exported to user-land via sysctl(3). * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcp_timer { int tt_rexmt; /* retransmit timer */ @@ -600,20 +618,23 @@ VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */ VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */ #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) VNET_DECLARE(int, tcp_do_ecn); /* TCP ECN enabled/disabled */ VNET_DECLARE(int, tcp_ecn_maxretries); #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) +VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); +#define V_tcp_hhh VNET(tcp_hhh) + int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); #if 0 int tcp_twrecycleable(struct tcptw *tw); #endif void tcp_twclose(struct tcptw *_tw, int _reuse);