diff -r baaf79fa6928 sys/modules/khelp/Makefile --- a/sys/modules/khelp/Makefile Mon Dec 13 22:24:43 2010 +1100 +++ b/sys/modules/khelp/Makefile Mon Dec 13 22:24:59 2010 +1100 @@ -1,5 +1,5 @@ # $FreeBSD$ -SUBDIR= +SUBDIR= h_ertt .include diff -r baaf79fa6928 sys/modules/khelp/h_ertt/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/modules/khelp/h_ertt/Makefile Mon Dec 13 22:24:59 2010 +1100 @@ -0,0 +1,9 @@ +# $FreeBSD$ + +.include + +.PATH: ${.CURDIR}/../../../netinet +KMOD= h_ertt +SRCS= h_ertt.c + +.include diff -r baaf79fa6928 sys/netinet/h_ertt.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/h_ertt.c Mon Dec 13 22:24:59 2010 +1100 @@ -0,0 +1,450 @@ +/* + * (c) 2009-2010 + * Swinburne University of Technology, Melbourne, Australia + * Copyright (c) 2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by David Hayes and Lawrence Stewart. + * + * Development is part of the CAIA NEWTCP project, + * http://caia.swin.edu.au/urp/newtcp/ + * + * This project has been made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. Testing and + * development was further assisted by a grant from the FreeBSD Foundation. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* + * The h_ertt module provides enhanced rtt timing for use with the modular + * congestion control algorithms. It works within the kernel helper (khelp) + * framework. The key information provided is (see h_ertt.h): + * rtt - most recent round trip time measurement + * maxrtt - longest rtt seen + * minrtt - shortest rtt seen + * markedpkt_rtt - rtt of a marked packet (packet sent after + * e_t->flags &= ~ERTT_NEW_MEASUREMENT has reset the flag) + * bytes_tx_in_marked_rtt - bytes transmitted in a marked rtt measurement + * marked_snd_cwnd - cwnd for the marked rtt measurement + * + * The marked packet measurements are for use by congestion control algorithms + * wanting VEGAS style measurements. 
+ */ + +uma_zone_t txseginfo_zone; /* UMA zone that txseginfo entries are allocated from; created in ertt_mod_init() */ + +#define DLYACK_SMOOTH 5 /* smoothing factor for delayed ack guess; + * e_t->dlyack_rx is never raised above this */ +#define MAX_TS_ERR 10 /* maximum number of time stamp errors + * allowed in a session; time stamps are + * ignored once this many have been seen */ + +static void ertt_packet_measurement_hook(int hhook_type, int hhook_id, + void *udata, void *ctx_data, void *hdata, struct osd *hosd); +static void ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, + void *udata, void *ctx_data, void *hdata, struct osd *hosd); +static int ertt_mod_init(void); +static int ertt_mod_destroy(void); +static int ertt_uma_ctor(void *mem, int size, void *arg, int flags); +static void ertt_uma_dtor(void *mem, int size, void *arg); + +/* + * Structure contains the information about the sent segment, for comparison + * with the corresponding ack. One entry is queued on the ertt's txsegi_q by + * ertt_add_tx_segment_info_hook for each data-carrying transmission and is + * released by ertt_packet_measurement_hook or ertt_uma_dtor. + */ +struct txseginfo { + TAILQ_ENTRY (txseginfo) txsegi_lnk; /* linkage on struct ertt's txsegi_q */ + /* segment sequence number (host byte order) */ + tcp_seq seq; + long len; /* bytes covered by this entry; reduced as a TSO entry is partially acked */ + /* time stamp indicating when the packet was sent */ + u_int32_t tx_ts; + /* Last received receiver ts (if the tcp option is used), else 0. */ + u_int32_t rx_ts; + u_int flags; /* flags for operation (TXSI_* below) */ +}; + +/* txseginfo flags */ +#define TXSI_TSO 0x01 /* TSO was used for this entry */ +#define TXSI_RTT_MEASURE_START 0x02 /* a rate measure starts here based on + * this txsi's rtt */ +#define TXSI_RX_MEASURE_END 0x04 /* measure the received rate until this + * txsi (not referenced by the code + * visible in this patch) */ + +struct helper ertt_helper = { /* khelp registration data handed to KHELP_DECLARE_MOD_UMA */ + .mod_init = ertt_mod_init, + .mod_destroy = ertt_mod_destroy, + .h_flags = HELPER_NEEDS_OSD, + .h_classes = HELPER_CLASS_TCP, +}; + +/* Define the helper hook info required by ERTT. 
*/ +struct hookinfo ertt_hooks[] = { + { + .hook_type = HHOOK_TYPE_TCP, + .hook_id = HHOOK_TCP_EST_IN, + .hook_udata = NULL, + .hook_func = &ertt_packet_measurement_hook, + }, + { + .hook_type = HHOOK_TYPE_TCP, + .hook_id = HHOOK_TCP_EST_OUT, + .hook_udata = NULL, + .hook_func = &ertt_add_tx_segment_info_hook, + }, +}; + + +#define MULTI_ACK 1 + +/* + * This fuction measures the RTT of a particular segment/ack pair, or the next + * closest if this will yield and inaccurate result due to delayed + * acknowledgements + */ +static void inline +marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp, + struct tcphdr *th, u_int32_t *pmeasurenext, int multiack) +{ + /* if we can't measure this one properly due to delayed acking adjust + * byte counters and flag to measure next txsi. Note that since the + * marked packet's tx and rx bytes are measured we need to subtract the + * tx, and not add the rx. Then pretend the next txsi was marked */ + if (multiack && e_t->dlyack_rx && !*pmeasurenext){ + *pmeasurenext = txsi->tx_ts; + } else { + if (*pmeasurenext) + e_t->markedpkt_rtt = ticks - *pmeasurenext + 1; + + else + e_t->markedpkt_rtt = ticks - txsi->tx_ts + 1; + e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt; + e_t->marked_snd_cwnd = tp->snd_cwnd; + + /* set flags */ + /* Not measuring - indicates to add_tx_segment_info a new + * measurment needs to be started */ + e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS; + /* indicates to the CC that a new marked RTT measurement has + * been taken */ + e_t->flags |= ERTT_NEW_MEASUREMENT; + + if (tp->t_flags & TF_TSO) { + /* temporarily disable TSO to aid in a new measurment */ + tp->t_flags &= ~TF_TSO; + /* note that I've done it so I can renable it later */ + e_t->flags |= ERTT_TSO_DISABLED; + } + } +} + + + +/* + * Ertt_packet_measurements uses a small amount of state kept on each packet + * sent to match incoming acknowledgements. This enables more accurate and + * secure round trip time measurements. 
The resulting measurement is used for + * congestion control algorithms which require a more accurate time. + * Ertt_packet_measurements is called via the helper hook in tcp_input.c + */ +static void +ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata, + void *ctx_data, void *hdata, struct osd *hosd) +{ + struct tcpcb *tp = ((struct tcp_hhook_data *)ctx_data)->tp; /* ctx_data is read as a struct tcp_hhook_data */ + struct tcphdr *th = ((struct tcp_hhook_data *)ctx_data)->th; + struct tcpopt *to = ((struct tcp_hhook_data *)ctx_data)->to; + int new_sacked_bytes = ((struct tcp_hhook_data *)ctx_data)->new_sacked_bytes; + struct ertt *e_t = (struct ertt *)hdata; /* hdata is this helper's struct ertt data block */ + + struct txseginfo *txsi; + u_int32_t rts = 0; /* echoed time stamp (to_tsecr) of this ack, 0 when time stamps are not consulted */ + u_int32_t measurenext = 0; /* non-zero: tx_ts of a marked txsi whose measurement marked_packet_rtt() deferred */ + tcp_seq ack; /* sequence number being acknowledged: th_ack or the sack hint */ + int multiack = 0; /* set when acked exceeds t_maxseg or the current txsi's length */ + + INP_WLOCK_ASSERT(tp->t_inpcb); + int acked = th->th_ack - tp->snd_una; /* bytes newly acknowledged by th_ack */ + + /* Packet has provided new acknowledgements */ + if (acked > 0 || new_sacked_bytes) { + if (acked == 0 && new_sacked_bytes) { + /* no delayed acks at the moment, use packets being + * acknowledged with sack instead of th_ack */ + ack = tp->sackhint.last_sack_ack; + } else + ack = th->th_ack; + + txsi = TAILQ_FIRST(&e_t->txsegi_q); + while (txsi != NULL) { + rts = 0; + + /* acking more than this txsi: finish or defer any + * marked measurement, free the entry and move on to + * the new head of the queue */ + if (SEQ_GT(ack, txsi->seq + txsi->len)) { + if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, MULTI_ACK); + TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); + uma_zfree(txseginfo_zone, txsi); + txsi = TAILQ_FIRST(&e_t->txsegi_q); + continue; + } + /* Guess if delayed acks are being used by the receiver: + * dlyack_rx is stepped up (to at most DLYACK_SMOOTH) + * when the ack covers more than t_maxseg or more than + * this txsi, and stepped down (to at least 0) when it + * equals t_maxseg or this txsi's length */ + if (!new_sacked_bytes) { + if (acked > tp->t_maxseg) { + e_t->dlyack_rx += (e_t->dlyack_rx < DLYACK_SMOOTH)? 
1 : 0; + + multiack = 1; + } else if (acked > txsi->len) { + multiack = 1; + e_t->dlyack_rx += (e_t->dlyack_rx < DLYACK_SMOOTH)? 1 : 0; + } else if (acked == tp->t_maxseg || acked == txsi->len) + e_t->dlyack_rx -= (e_t->dlyack_rx > 0)? 
1 : 0; + + /* otherwise leave dlyack_rx the way it was */ + } + /* Time stamps are only used to help identify packets */ + if (e_t->timestamp_errors < MAX_TS_ERR && + (to->to_flags & TOF_TS) != 0 && to->to_tsecr){ + /* + * Note: All packets sent with the offload will + * have the same time stamp. If we are sending + * on a fast interface, and the t_maxseg is much + * smaller than one tick, this will be fine. The + * time stamp would be the same whether we were + * using tso or not. However, if the interface + * is slow, this will cause problems with the + * calculations. If the interface is slow, there + * is no reason to be using tso, and it should + * be turned off. + */ + /* + * If there are too many time stamp errors, time + * stamps won't be trusted (see the MAX_TS_ERR + * test guarding this block) + */ + rts = to->to_tsecr; + /* echoed time stamp is from before this packet was + * sent: stop, the ack is not for this txsi */ + if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts)) + /* When delayed acking is used, the + * reflected time stamp is of the first + * packet, and thus may be before + * txsi->tx_ts, so the test is skipped + * while dlyack_rx is non-zero */ + break; + if (TSTMP_GT(rts, txsi->tx_ts)) { + /* if reflected time stamp is later than + * tx_ts, then this txsi is old */ + if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, 0); + TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); + uma_zfree(txseginfo_zone, txsi); + txsi = TAILQ_FIRST(&e_t->txsegi_q); + continue; + } + if (rts == txsi->tx_ts && TSTMP_LT(to->to_tsval, txsi->rx_ts)) { + /* rx before sent!!! 
something wrong + * with rx timestamping; count it so that + * time stamps stop being used once + * MAX_TS_ERR errors have been seen */ + e_t->timestamp_errors++; + } + } + /* old txsi that may have had the same seq numbers (rtx) + * should have been removed if time stamps are being + * used */ + if (SEQ_LEQ(ack, txsi->seq)) + break; /* before first packet in txsi */ + + /* + * only ack > txsi->seq and ack <= txsi->seq+txsi->len + * past this point + */ + if (!e_t->dlyack_rx || multiack || new_sacked_bytes){ + e_t->rtt = ticks - txsi->tx_ts + 1;/* new measurement */ + + /* a stored value of 0 means "no sample yet" */ + if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0) + e_t->minrtt = e_t->rtt; + + if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0) + e_t->maxrtt = e_t->rtt; + } + if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, 0); + + if (txsi->flags & TXSI_TSO) { + txsi->len -= acked; + if (txsi->len > 0) { + /* this presumes ack for first bytes in + * txsi, this may not be true but it + * shouldn't cause problems for the + * timing */ + txsi->seq += acked; + /* reset measure flag */ + txsi->flags &= ~TXSI_RTT_MEASURE_START; + break; /* still more data to be acked + * with this tso transmission */ + } + } + TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); + uma_zfree(txseginfo_zone, txsi); + break; + } /* end while */ + if (measurenext) + /* need to do a tx rate measurement, won't be the best + * if I'm doing it here. + * NOTE: txsi may be NULL or already freed at this + * point; marked_packet_rtt() does not dereference it + * when measurenext is non-zero and multiack is 0. */ + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, 0); + } +} + +/* + * Add information about a transmitted segment to a list. 
+ * This is called via the helper hook in tcp_output.c + */ +static void +ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata, + void *ctx_data, void *hdata, struct osd *hosd) +{ + struct tcpcb *tp = ((struct tcp_hhook_data *)ctx_data)->tp; + struct tcphdr *th = ((struct tcp_hhook_data *)ctx_data)->th; + struct tcpopt *to = ((struct tcp_hhook_data *)ctx_data)->to; + long len = ((struct tcp_hhook_data *)ctx_data)->len; + int tso = ((struct tcp_hhook_data *)ctx_data)->tso; + + + INP_WLOCK_ASSERT(tp->t_inpcb); + + + if (len > 0) { + struct txseginfo *txsi; + + txsi = (struct txseginfo *)uma_zalloc(txseginfo_zone, M_NOWAIT); + if (txsi != NULL) { + struct ertt *e_t = (struct ertt *)hdata; + + txsi->flags = 0;/* needs to be initialised */ + txsi->seq = ntohl(th->th_seq); + txsi->len = len; + if (tso) + txsi->flags |= TXSI_TSO; + else if (e_t->flags & ERTT_TSO_DISABLED){ + tp->t_flags |= TF_TSO; + e_t->flags &= ~ERTT_TSO_DISABLED; + } + if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS){ + e_t->bytes_tx_in_rtt += len; + } else { + txsi->flags |= TXSI_RTT_MEASURE_START; + e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS; + e_t->bytes_tx_in_rtt = len; + } + if (((tp->t_flags & TF_NOOPT) == 0) && (to->to_flags & TOF_TS)) { + txsi->tx_ts = ntohl(to->to_tsval) - tp->ts_offset; + txsi->rx_ts = ntohl(to->to_tsecr); + } else { + txsi->tx_ts = ticks; + txsi->rx_ts = 0; /* no received time stamp */ + } + TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk); + } + } +} + +static int +ertt_mod_init(void) +{ + + txseginfo_zone = uma_zcreate("txseginfo", sizeof(struct txseginfo), + NULL, NULL, NULL, NULL, 0, 0); + + return (0); +} + +static int +ertt_mod_destroy(void) +{ + + uma_zdestroy(txseginfo_zone); + + return (0); +} + +static int +ertt_uma_ctor(void *mem, int size, void *arg, int flags) +{ + struct ertt *e_t = (struct ertt *)mem; + + TAILQ_INIT(&e_t->txsegi_q); + e_t->timestamp_errors = 0; + e_t->minrtt = 0; + e_t->maxrtt = 0; + e_t->rtt = 0; + e_t->flags = 0; + 
e_t->dlyack_rx = 0; + e_t->bytes_tx_in_rtt = 0; + e_t->markedpkt_rtt = 0; + + return (0); +} + +static void +ertt_uma_dtor(void *mem, int size, void *arg) +{ + struct ertt *e_t = (struct ertt *)mem; + struct txseginfo *txsi, *n_txsi; + + txsi = TAILQ_FIRST(&e_t->txsegi_q); + while (txsi != NULL) { + n_txsi = TAILQ_NEXT(txsi, txsegi_lnk); + uma_zfree(txseginfo_zone, txsi); + txsi = n_txsi; + } +} + +KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 2, sizeof(struct ertt), + ertt_uma_ctor, ertt_uma_dtor); diff -r baaf79fa6928 sys/netinet/h_ertt.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/h_ertt.h Mon Dec 13 22:24:59 2010 +1100 @@ -0,0 +1,73 @@ +/*- + * Copyright (c) 2009-2010 + * Swinburne University of Technology, Melbourne, Australia + * Copyright (c) 2010 Lawrence Stewart + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by David Hayes and Lawrence Stewart. + * + * Development is part of the CAIA NEWTCP project, + * http://caia.swin.edu.au/urp/newtcp/ + * + * This project has been made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. Testing and + * development was further assisted by a grant from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_H_ERTT_ +#define _NETINET_H_ERTT_ + +/* Structure contains the information about the + sent segment, for comparison with the corresponding ack */ +struct txseginfo; + +/* Structure used as the ertt data block. 
*/ +struct ertt { + /* information about transmitted segments to aid in + * RTT calculation for delay/rate based CC; entries are + * struct txseginfo, oldest first */ + TAILQ_HEAD(txseginfo_head, txseginfo) txsegi_q; + int rtt; /* per packet measured round trip time, in ticks */ + int maxrtt; /* maximum seen rtt (0 until the first sample) */ + int minrtt; /* minimum seen rtt (0 until the first sample) */ + int dlyack_rx; /* guess if the receiver is using + * delayed acknowledgements: 0 means not + * suspected, larger means more confident */ + int timestamp_errors; /* for keeping track of inconsistencies + * in packet timestamps */ + int markedpkt_rtt; /* rtt for a marked packet, in ticks */ + long bytes_tx_in_rtt; /* bytes tx so far in marked rtt */ + long bytes_tx_in_marked_rtt; /* final version of above, latched + * when the marked rtt completes */ + u_long marked_snd_cwnd; /* cwnd for marked rtt */ + int flags; /* ERTT_* flags defined below */ +}; + +/* flags */ +#define ERTT_NEW_MEASUREMENT 0x01 /* new marked measurement available; + * h_ertt.c only ever sets this, so the + * consumer presumably clears it - TODO + * confirm */ +#define ERTT_MEASUREMENT_IN_PROGRESS 0x02 /* measuring marked RTT */ +#define ERTT_TSO_DISABLED 0x04 /* indicates TSO has been + * temporarily disabled */ + +#endif /* _NETINET_H_ERTT_ */