diff -r d69f537670da UPDATING --- a/UPDATING Fri Nov 12 00:44:18 2010 +0000 +++ b/UPDATING Fri Nov 12 12:31:55 2010 +1100 @@ -15,20 +15,27 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 9. system primitives, and encourage loud failure through extra sanity checking and fail stop semantics. They also substantially impact system performance. If you want to do performance measurement, benchmarking, and optimization, you'll want to turn them off. This includes various WITNESS- related kernel options, INVARIANTS, malloc debugging flags in userland, and various verbose features in the kernel. Many developers choose to disable these features on build machines to maximize performance. (To disable malloc debugging, run ln -s aj /etc/malloc.conf.) +20101111: + The TCP stack has received a significant update to add support for + modularised congestion control and generally improve the clarity of + congestion control decisions. Bump __FreeBSD_version to 900025. User + space tools that rely on the size of struct tcpcb in tcp_var.h (e.g. + sockstat) need to be recompiled. + 20101002: The man(1) utility has been replaced by a new version that no longer uses /etc/manpath.config. Please consult man.conf(5) for how to migrate local entries to the new format. 20100928: The copyright strings printed by login(1) and sshd(8) at the time of a new connection have been removed to follow other operating systems and upstream sshd. diff -r d69f537670da sys/conf/files --- a/sys/conf/files Fri Nov 12 00:44:18 2010 +0000 +++ b/sys/conf/files Fri Nov 12 12:31:55 2010 +1100 @@ -2591,20 +2591,22 @@ netinet/ipfw/ip_fw_pfil.c optional inet netinet/ipfw/ip_fw_sockopt.c optional inet ipfirewall netinet/ipfw/ip_fw_table.c optional inet ipfirewall netinet/ipfw/ip_fw_nat.c optional inet ipfirewall_nat netinet/ip_icmp.c optional inet netinet/ip_input.c optional inet netinet/ip_ipsec.c optional inet ipsec netinet/ip_mroute.c optional mrouting inet | mrouting inet6 netinet/ip_options.c optional inet netinet/ip_output.c optional inet netinet/raw_ip.c optional inet +netinet/cc/cc.c optional inet +netinet/cc/cc_newreno.c optional inet netinet/sctp_asconf.c optional inet sctp netinet/sctp_auth.c optional inet sctp netinet/sctp_bsd_addr.c optional inet sctp netinet/sctp_cc_functions.c optional inet sctp netinet/sctp_crc32.c optional inet sctp netinet/sctp_indata.c optional inet sctp netinet/sctp_input.c optional inet sctp netinet/sctp_output.c optional inet sctp netinet/sctp_pcb.c optional inet sctp netinet/sctp_peeloff.c optional inet sctp diff -r d69f537670da sys/netinet/cc.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/cc.h Fri Nov 12 12:31:55 2010 +1100 @@ -0,0 +1,161 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University's + * Centre for Advanced Internet Architectures, Melbourne, Australia, which was + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#ifndef _NETINET_CC_H_ +#define _NETINET_CC_H_ + +/* XXX: TCP_CA_NAME_MAX define lives in tcp.h for compat reasons. */ +#include + +/* Global CC vars. */ +extern STAILQ_HEAD(cc_head, cc_algo) cc_list; +extern const int tcprexmtthresh; +extern struct cc_algo newreno_cc_algo; + +/* Define the new net.inet.tcp.cc sysctl tree. */ +SYSCTL_DECL(_net_inet_tcp_cc); + +/* CC housekeeping functions. */ +void cc_init(void); +int cc_register_algo(struct cc_algo *add_cc); +int cc_deregister_algo(struct cc_algo *remove_cc); + +/* + * Wrapper around transport structs that contain same-named congestion + * control variables. Allows algos to be shared amongst multiple CC aware + * transprots. + */ +struct cc_var { + void *cc_data; /* Per-connection private CC algorithm data. */ + int bytes_this_ack; /* # bytes acked by the current ACK. */ + tcp_seq curack; /* Most recent ACK. */ + uint32_t flags; /* Flags for cc_var (see below) */ + int type; /* Indicates which ptr is valid in ccvc. */ + union ccv_container { + struct tcpcb *tcp; + struct sctp_nets *sctp; + } ccvc; +}; + +/* cc_var flags. */ +#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */ +#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */ + +/* ACK types passed to the ack_received() hook. */ +#define CC_ACK 0x0001 /* Regular in sequence ACK. */ +#define CC_DUPACK 0x0002 /* Duplicate ACK. */ +#define CC_PARTIALACK 0x0004 /* Not yet. */ +#define CC_SACK 0x0008 /* Not yet. */ + +/* + * Congestion signal types passed to the cong_signal() hook. The highest order 8 + * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own + * congestion signal types. + */ +#define CC_ECN 0x000001/* ECN marked packet received. */ +#define CC_RTO 0x000002/* RTO fired. 
*/ +#define CC_RTO_ERR 0x000004/* RTO fired in error. */ +#define CC_NDUPACK 0x000008/* Threshold of dupack's reached. */ + +/* + * Structure to hold data and function pointers that together represent a + * congestion control algorithm. + */ +struct cc_algo { + char name[TCP_CA_NAME_MAX]; + + /* Init global module state on kldload. */ + int (*mod_init)(void); + + /* Cleanup global module state on kldunload. */ + int (*mod_destroy)(void); + + /* Init CC state for a new control block. */ + int (*cb_init)(struct cc_var *ccv); + + /* Cleanup CC state for a terminating control block. */ + void (*cb_destroy)(struct cc_var *ccv); + + /* Init variables for a newly established connection. */ + void (*conn_init)(struct cc_var *ccv); + + /* Called on receipt of an ack. */ + void (*ack_received)(struct cc_var *ccv, uint16_t type); + + /* Called on detection of a congestion signal. */ + void (*cong_signal)(struct cc_var *ccv, uint32_t type); + + /* Called after exiting congestion recovery. */ + void (*post_recovery)(struct cc_var *ccv); + + /* Called when data transfer resumes after an idle period. */ + void (*after_idle)(struct cc_var *ccv); + + STAILQ_ENTRY (cc_algo) entries; +}; + +/* Macro to obtain the CC algo's struct ptr. */ +#define CC_ALGO(tp) ((tp)->cc_algo) + +/* Macro to obtain the CC algo's data ptr. */ +#define CC_DATA(tp) ((tp)->ccv->cc_data) + +/* Macro to obtain the system default CC algo's struct ptr. */ +#define CC_DEFAULT() STAILQ_FIRST(&cc_list) + +extern struct rwlock cc_list_lock; +#define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list") +#define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock) +#define CC_LIST_RLOCK() rw_rlock(&cc_list_lock) +#define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock) +#define CC_LIST_WLOCK() rw_wlock(&cc_list_lock) +#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock) +#define CC_LIST_WLOCK_ASSERT() rw_assert(&cc_list_lock, RA_WLOCKED) + +#endif /* _NETINET_CC_H_ */ diff -r d69f537670da sys/netinet/cc/cc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/cc/cc.c Fri Nov 12 12:31:55 2010 +1100 @@ -0,0 +1,340 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University's + * Centre for Advanced Internet Architectures, Melbourne, Australia, which was + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +/* + * List of available cc algorithms on the current system. First element + * is used as the system default CC algorithm. + */ +struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); + +/* Protects the cc_list TAILQ. */ +struct rwlock cc_list_lock; + +/* + * Set the default CC algorithm to new_default. The default is identified + * by being the first element in the cc_list TAILQ. + */ +static void +cc_set_default(struct cc_algo *new_default) +{ + CC_LIST_WLOCK_ASSERT(); + + /* + * Make the requested system default CC algorithm the first element in + * the list if it isn't already. + */ + if (new_default != CC_DEFAULT()) { + STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries); + STAILQ_INSERT_HEAD(&cc_list, new_default, entries); + } +} + +/* + * Sysctl handler to show and change the default CC algorithm. + */ +static int +cc_default_algo(SYSCTL_HANDLER_ARGS) +{ + struct cc_algo *funcs; + int err, found; + + err = found = 0; + + if (req->newptr == NULL) { + char default_cc[TCP_CA_NAME_MAX]; + + /* Just print the current default. */ + CC_LIST_RLOCK(); + strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX); + CC_LIST_RUNLOCK(); + err = sysctl_handle_string(oidp, default_cc, 1, req); + } else { + /* Find algo with specified name and set it to default. */ + CC_LIST_WLOCK(); + STAILQ_FOREACH(funcs, &cc_list, entries) { + if (strncmp((char *)req->newptr, funcs->name, + TCP_CA_NAME_MAX) == 0) { + found = 1; + cc_set_default(funcs); + } + } + CC_LIST_WUNLOCK(); + + if (!found) + err = ESRCH; + } + + return (err); +} + +/* + * Sysctl handler to display the list of available CC algorithms. + */ +static int +cc_list_available(SYSCTL_HANDLER_ARGS) +{ + struct cc_algo *algo; + struct sbuf *s; + int err, first; + + err = 0; + first = 1; + s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND); + + if (s == NULL) + return (ENOMEM); + + CC_LIST_RLOCK(); + STAILQ_FOREACH(algo, &cc_list, entries) { + err = sbuf_printf(s, first ? 
"%s" : ", %s", algo->name); + if (err) + break; + first = 0; + } + CC_LIST_RUNLOCK(); + + if (!err) { + sbuf_finish(s); + err = sysctl_handle_string(oidp, sbuf_data(s), 1, req); + } + + sbuf_delete(s); + return (err); +} + +/* + * Initialise CC subsystem on system boot. + */ +void +cc_init() +{ + CC_LIST_LOCK_INIT(); + STAILQ_INIT(&cc_list); +} + +/* + * Returns non-zero on success, 0 on failure. + */ +int +cc_deregister_algo(struct cc_algo *remove_cc) +{ + struct cc_algo *funcs, *tmpfuncs; + struct tcpcb *tp; + struct inpcb *inp; + int err; + + err = ENOENT; + + /* Never allow newreno to be deregistered. */ + if (&newreno_cc_algo == remove_cc) + return (EPERM); + + /* Remove algo from cc_list so that new connections can't use it. */ + CC_LIST_WLOCK(); + STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { + if (funcs == remove_cc) { + /* + * If we're removing the current system default, + * reset the default to newreno. + */ + if (strncmp(CC_DEFAULT()->name, remove_cc->name, + TCP_CA_NAME_MAX) == 0) + cc_set_default(&newreno_cc_algo); + + STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); + err = 0; + break; + } + } + CC_LIST_WUNLOCK(); + + if (!err) { + /* + * Check all active control blocks and change any that are + * using this algorithm back to newreno. If the algorithm that + * was in use requires cleanup code to be run, call it. + * + * New connections already part way through being initialised + * with the CC algo we're removing will not race with this code + * because the INP_INFO_WLOCK is held during initialisation. + * We therefore don't enter the loop below until the connection + * list has stabilised. + */ + INP_INFO_RLOCK(&V_tcbinfo); + LIST_FOREACH(inp, &V_tcb, inp_list) { + INP_WLOCK(inp); + /* Important to skip tcptw structs. */ + if (!(inp->inp_flags & INP_TIMEWAIT) && + (tp = intotcpcb(inp)) != NULL) { + /* + * By holding INP_WLOCK here, we are + * assured that the connection is not + * currently executing inside the CC + * module's functions i.e. it is safe to + * make the switch back to newreno. + */ + if (CC_ALGO(tp) == remove_cc) { + tmpfuncs = CC_ALGO(tp); + /* Newreno does not require any init. */ + CC_ALGO(tp) = &newreno_cc_algo; + if (tmpfuncs->cb_destroy != NULL) + tmpfuncs->cb_destroy(tp->ccv); + } + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_tcbinfo); + } + + return (err); +} + +/* + * Returns 0 on success, non-zero on failure. + */ +int +cc_register_algo(struct cc_algo *add_cc) +{ + struct cc_algo *funcs; + int err; + + err = 0; + + /* + * Iterate over list of registered CC algorithms and make sure + * we're not trying to add a duplicate. + */ + CC_LIST_WLOCK(); + STAILQ_FOREACH(funcs, &cc_list, entries) { + if (funcs == add_cc || strncmp(funcs->name, add_cc->name, + TCP_CA_NAME_MAX) == 0) + err = EEXIST; + } + + if (!err) + STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); + + CC_LIST_WUNLOCK(); + + return (err); +} + +/* + * Handles kld related events. Returns 0 on success, non-zero on failure. 
+ */ +int +cc_modevent(module_t mod, int event_type, void *data) +{ + struct cc_algo *algo; + int err; + + err = 0; + algo = (struct cc_algo *)data; + + switch(event_type) { + case MOD_LOAD: + if (algo->mod_init != NULL) + err = algo->mod_init(); + if (!err) + err = cc_register_algo(algo); + break; + + case MOD_QUIESCE: + case MOD_SHUTDOWN: + case MOD_UNLOAD: + err = cc_deregister_algo(algo); + if (!err && algo->mod_destroy != NULL) + algo->mod_destroy(); + if (err == ENOENT) + err = 0; + break; + + default: + err = EINVAL; + break; + } + + return (err); +} + +/* Declare sysctl tree and populate it. */ +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, + "congestion control related settings"); + +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, + NULL, 0, cc_default_algo, "A", "default congestion control algorithm"); + +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, cc_list_available, "A", + "list available congestion control algorithms"); diff -r d69f537670da sys/netinet/cc/cc_module.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/cc/cc_module.h Fri Nov 12 12:31:55 2010 +1100 @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2009-2010 Lawrence Stewart + * All rights reserved. + * + * This software was developed by Lawrence Stewart while studying at the Centre + * for Advanced Internet Architectures, Swinburne University, made possible in + * part by a grant from the Cisco University Research Program Fund at Community + * Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This software was first released in 2009 by Lawrence Stewart as part of the + * NewTCP research project at Swinburne University's Centre for Advanced + * Internet Architectures, Melbourne, Australia, which was made possible in part + * by a grant from the Cisco University Research Program Fund at Community + * Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#ifndef _NETINET_CC_MODULE_H_ +#define _NETINET_CC_MODULE_H_ + +/* + * Allows a CC algorithm to manipulate a commonly named CC variable regardless + * of the transport protocol and associated C struct. 
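
[Annotation] The net.inet.tcp.cc sysctl tree registered at the end of cc.c above (handlers cc_default_algo and cc_list_available) exposes the default algorithm as a read/write string and the set of registered algorithms as a read-only list. A minimal userland sketch of how those knobs might be driven through sysctlbyname(3); buffer sizes and error handling are illustrative only:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char avail[256], dflt[64];
	size_t len;

	/* List the algorithms currently registered in cc_list. */
	len = sizeof(avail);
	if (sysctlbyname("net.inet.tcp.cc.available", avail, &len,
	    NULL, 0) == 0)
		printf("available: %s\n", avail);

	/* Read the current default (the head of cc_list). */
	len = sizeof(dflt);
	if (sysctlbyname("net.inet.tcp.cc.algorithm", dflt, &len,
	    NULL, 0) == 0)
		printf("default: %s\n", dflt);

	/* Make newreno the system default for new connections. */
	if (sysctlbyname("net.inet.tcp.cc.algorithm", NULL, NULL,
	    "newreno", strlen("newreno") + 1) < 0)
		perror("set default");

	return (0);
}
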
+ * XXXLAS: Out of action until the work to support SCTP is done. + * +#define CCV(ccv, what) \ +(*( \ + (ccv)->type == IPPROTO_TCP ? &(ccv)->ccvc.tcp->what : \ + &(ccv)->ccvc.sctp->what \ +)) + */ +#define CCV(ccv, what) (ccv)->ccvc.tcp->what + +#define DECLARE_CC_MODULE(ccname, ccalgo) \ + static moduledata_t cc_##ccname = { \ + .name = #ccname, \ + .evhand = cc_modevent, \ + .priv = ccalgo \ + }; \ + DECLARE_MODULE(ccname, cc_##ccname, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY) + +int cc_modevent(module_t mod, int type, void *data); + +#endif /* _NETINET_CC_MODULE_H_ */ diff -r d69f537670da sys/netinet/cc/cc_newreno.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/cc/cc_newreno.c Fri Nov 12 12:31:55 2010 +1100 @@ -0,0 +1,231 @@ +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. + * Copyright (c) 2007-2008,2010 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart, James Healy and + * David Hayes, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University's + * Centre for Advanced Internet Architectures, Melbourne, Australia, which was + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. 
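
[Annotation] With struct cc_algo, the CCV() accessor and DECLARE_CC_MODULE() now in place, a third-party algorithm reduces to filling in the hook vector. A hypothetical skeleton follows; the "sketch" name, its private congestion-signal bit and the hook bodies are invented for illustration and are not part of this patch. Unused hooks may be left NULL because the cc_* wrappers in tcp_input.c/tcp_output.c NULL-check every hook before calling it.

/* Include list abbreviated; a real module would mirror cc_newreno.c. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/cc.h>
#include <netinet/cc/cc_module.h>

/* Algorithm-private congestion signal, taken from the reserved high-order bits. */
#define	CC_SKETCH_PRIVATE	0x01000000

static void
sketch_ack_received(struct cc_var *ccv, uint16_t type)
{
	/*
	 * Only grow cwnd for regular in-sequence ACKs while we are
	 * cwnd limited.  A real algorithm would also clamp the result.
	 */
	if (type == CC_ACK && (ccv->flags & CCF_CWND_LIMITED))
		CCV(ccv, snd_cwnd) += CCV(ccv, t_maxseg);
}

static void
sketch_cong_signal(struct cc_var *ccv, uint32_t type)
{
	switch (type) {
	case CC_NDUPACK:
	case CC_ECN:
		CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_cwnd) / 2,
		    2 * CCV(ccv, t_maxseg));
		break;
	case CC_SKETCH_PRIVATE:
		/* React to whatever private event the algorithm defines. */
		break;
	}
}

static struct cc_algo sketch_cc_algo = {
	.name = "sketch",
	.ack_received = sketch_ack_received,
	.cong_signal = sketch_cong_signal,
};

DECLARE_CC_MODULE(sketch, &sketch_cc_algo);
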
More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +void newreno_ack_received(struct cc_var *ccv, uint16_t type); +void newreno_cong_signal(struct cc_var *ccv, uint32_t type); +void newreno_post_recovery(struct cc_var *ccv); +void newreno_after_idle(struct cc_var *ccv); + +struct cc_algo newreno_cc_algo = { + .name = "newreno", + .ack_received = newreno_ack_received, + .cong_signal = newreno_cong_signal, + .post_recovery = newreno_post_recovery, + .after_idle = newreno_after_idle +}; + +/* + * Increase cwnd on receipt of a successful ACK: + * if cwnd <= ssthresh, increases by 1 MSS per ACK + * if cwnd > ssthresh, increase by ~1 MSS per RTT + */ +void +newreno_ack_received(struct cc_var *ccv, uint16_t type) +{ + if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && + (ccv->flags & CCF_CWND_LIMITED)) { + u_int cw = CCV(ccv, snd_cwnd); + u_int incr = CCV(ccv, t_maxseg); + + /* + * Regular in-order ACK, open the congestion window. + * Method depends on which congestion control state we're + * in (slow start or cong avoid) and if ABC (RFC 3465) is + * enabled. + * + * slow start: cwnd <= ssthresh + * cong avoid: cwnd > ssthresh + * + * slow start and ABC (RFC 3465): + * Grow cwnd exponentially by the amount of data + * ACKed capping the max increment per ACK to + * (abc_l_var * maxseg) bytes. + * + * slow start without ABC (RFC 5681): + * Grow cwnd exponentially by maxseg per ACK. + * + * cong avoid and ABC (RFC 3465): + * Grow cwnd linearly by maxseg per RTT for each + * cwnd worth of ACKed data. + * + * cong avoid without ABC (RFC 5681): + * Grow cwnd linearly by approximately maxseg per RTT using + * maxseg^2 / cwnd per ACK as the increment. + * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to + * avoid capping cwnd. + */ + if (cw > CCV(ccv, snd_ssthresh)) { + if (V_tcp_do_rfc3465) { + if (ccv->flags & CCF_ABC_SENTAWND) + ccv->flags &= ~CCF_ABC_SENTAWND; + else + incr = 0; + } else + incr = max((incr * incr / cw), 1); + } else if (V_tcp_do_rfc3465) { + /* + * In slow-start with ABC enabled and no RTO in sight? + * (Must not use abc_l_var > 1 if slow starting after + * an RTO. On RTO, snd_nxt = snd_una, so the + * snd_nxt == snd_max check is sufficient to + * handle this). + * + * XXXLAS: Find a way to signal SS after RTO that + * doesn't rely on tcpcb vars. + */ + if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) + incr = min(ccv->bytes_this_ack, + V_tcp_abc_l_var * CCV(ccv, t_maxseg)); + else + incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + } + /* ABC is on by default, so incr equals 0 frequently. */ + if (incr > 0) + CCV(ccv, snd_cwnd) = min(cw + incr, + TCP_MAXWIN << CCV(ccv, snd_scale)); + } +} + +/* + * manage congestion signals + */ +void +newreno_cong_signal(struct cc_var *ccv, uint32_t type) +{ + u_int win; + + win = max(CCV(ccv, snd_cwnd) / 2 / CCV(ccv, t_maxseg), 2) * + CCV(ccv, t_maxseg); + + switch (type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { + if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) + CCV(ccv, snd_ssthresh) = win; + ENTER_RECOVERY(CCV(ccv, t_flags)); + } + break; + case CC_ECN: + if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { + CCV(ccv, snd_ssthresh) = win; + CCV(ccv, snd_cwnd) = win; + ENTER_CONGRECOVERY(CCV(ccv, t_flags)); + } + break; + } +} + +/* + * decrease the cwnd in response to packet loss or a transmit timeout. 
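
[Annotation] The ssthresh computed at the top of newreno_cong_signal() above deserves a worked example: dividing by t_maxseg before multiplying rounds the halved window down to a whole number of segments, with a floor of two segments. A throwaway helper (the name is invented; max() is the kernel's) makes the arithmetic concrete:

static u_int
halved_ssthresh(u_int cwnd, u_int maxseg)
{
	/* Same expression as newreno_cong_signal(): round down, floor at 2 MSS. */
	return (max(cwnd / 2 / maxseg, 2) * maxseg);
}
/*
 * halved_ssthresh(14600, 1460) == 7300   (5 whole segments)
 * halved_ssthresh(4000, 1460)  == 2920   (the two-segment floor applies)
 */
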
+ * th can be null, in which case cwnd will be set according to reno instead + * of new reno. + */ +void +newreno_post_recovery(struct cc_var *ccv) +{ + if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { + /* + * Fast recovery will conclude after returning from this + * function. Window inflation should have left us with + * approximately snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do it via the + * slow start mechanism. + * + * XXXLAS: Find a way to do this without needing curack + */ + if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh), + CCV(ccv, snd_max))) + CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) - + ccv->curack + CCV(ccv, t_maxseg); + else + CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); + } +} + +/* + * if a connection has been idle for a while and more data is ready to be sent, + * reset cwnd + */ +void +newreno_after_idle(struct cc_var *ccv) +{ + /* + * We have been idle for "a while" and no acks are expected to clock out + * any data we send -- slow start to get ack "clock" running again. + */ + if (V_tcp_do_rfc3390) + CCV(ccv, snd_cwnd) = min(4 * CCV(ccv, t_maxseg), + max(2 * CCV(ccv, t_maxseg), 4380)); + else + CCV(ccv, snd_cwnd) = CCV(ccv, t_maxseg) * 2; +} + + +DECLARE_CC_MODULE(newreno, &newreno_cc_algo); diff -r d69f537670da sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c Fri Nov 12 00:44:18 2010 +0000 +++ b/sys/netinet/tcp_input.c Fri Nov 12 12:31:55 2010 +1100 @@ -1,13 +1,27 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. + * Copyright (c) 2007-2008,2010 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart, James Healy and + * David Hayes, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors @@ -54,56 +68,56 @@ __FBSDID("$FreeBSD$"); #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #define TCPSTATES /* for logging */ +#include #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #include -static const int tcprexmtthresh = 3; +const int tcprexmtthresh = 3; VNET_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &VNET_NAME(tcpstat), tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); @@ -125,33 +139,30 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; -#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; -#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; -#define V_tcp_abc_l_var VNET(tcp_abc_l_var) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 0; SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); @@ -196,51 +207,221 @@ VNET_DEFINE(struct inpcbinfo, tcbinfo); static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); -static void inline - tcp_congestion_exp(struct tcpcb *); +static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, + uint16_t type); +static void inline cc_conn_init(struct tcpcb *tp); +static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); /* * Kernel module interface for updating tcpstat. The argument is an index * into tcpstat treated as an array of u_long. While this encodes the * general layout of tcpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. 
*/ void kmod_tcpstat_inc(int statnum) { (*((u_long *)&V_tcpstat + statnum))++; } +/* + * CC wrapper hook functions + */ static void inline -tcp_congestion_exp(struct tcpcb *tp) +cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { - u_int win; - - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; - ENTER_FASTRECOVERY(tp); - tp->snd_recover = tp->snd_max; - if (tp->t_flags & TF_ECN_PERMIT) - tp->t_flags |= TF_ECN_SND_CWR; + INP_WLOCK_ASSERT(tp->t_inpcb); + + tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); + if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd)) + tp->ccv->flags |= CCF_CWND_LIMITED; + else + tp->ccv->flags &= ~CCF_CWND_LIMITED; + + if (type == CC_ACK) { + if (tp->snd_cwnd > tp->snd_ssthresh) { + tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, + V_tcp_abc_l_var * tp->t_maxseg); + if (tp->t_bytes_acked >= tp->snd_cwnd) { + tp->t_bytes_acked -= tp->snd_cwnd; + tp->ccv->flags |= CCF_ABC_SENTAWND; + } + } else { + tp->ccv->flags &= ~CCF_ABC_SENTAWND; + tp->t_bytes_acked = 0; + } + } + + if (CC_ALGO(tp)->ack_received != NULL) { + /* XXXLAS: Find a way to live without this */ + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->ack_received(tp->ccv, type); + } +} + +static void inline +cc_conn_init(struct tcpcb *tp) +{ + struct hc_metrics_lite metrics; + struct inpcb *inp = tp->t_inpcb; + int rtt; +#ifdef INET6 + int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; +#endif + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tcp_hc_get(&inp->inp_inc, &metrics); + + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + tp->t_srtt = rtt; + tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; + TCPSTAT_INC(tcps_usedrtt); + if (metrics.rmx_rttvar) { + tp->t_rttvar = metrics.rmx_rttvar; + TCPSTAT_INC(tcps_usedrttvar); + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + if (metrics.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); + TCPSTAT_INC(tcps_usedssthresh); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. + * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not make cwnd bigger than remote receive window or our own + * send socket buffer. Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so next connection doesn't + * overloads the path again. + * + * XXXAO: Initializing the CWND from the hostcache is broken + * and in its current form not RFC conformant. It is disabled + * until fixed or removed entirely. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. + * We currently check only in syncache_socket for that. 
+ */ +/* #define TCP_METRICS_CWND */ +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif + if (V_tcp_do_rfc3390) + tp->snd_cwnd = min(4 * tp->t_maxseg, + max(2 * tp->t_maxseg, 4380)); +#ifdef INET6 + else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && in_localaddr(inp->inp_faddr))) +#else + else if (in_localaddr(inp->inp_faddr)) +#endif + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz; + + if (CC_ALGO(tp)->conn_init != NULL) + CC_ALGO(tp)->conn_init(tp->ccv); +} + +void inline +cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + switch(type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(tp->t_flags)) { + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; + } + break; + case CC_ECN: + if (!IN_CONGRECOVERY(tp->t_flags)) { + TCPSTAT_INC(tcps_ecn_rcwnd); + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; + } + break; + case CC_RTO: + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; + EXIT_RECOVERY(tp->t_flags); + tp->snd_cwnd = tp->t_maxseg; + break; + case CC_RTO_ERR: + TCPSTAT_INC(tcps_sndrexmitbad); + /* RTO was unnecessary, so reset everything. */ + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_recover = tp->snd_recover_prev; + if (tp->t_flags & TF_WASFRECOVERY) + ENTER_FASTRECOVERY(tp->t_flags); + if (tp->t_flags & TF_WASCRECOVERY) + ENTER_CONGRECOVERY(tp->t_flags); + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + break; + } + + if (CC_ALGO(tp)->cong_signal != NULL) { + if (th != NULL) + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->cong_signal(tp->ccv, type); + } +} + +static void inline +cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* XXXLAS: KASSERT that we're in recovery? */ + + if (CC_ALGO(tp)->post_recovery != NULL) { + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->post_recovery(tp->ccv); + } + /* XXXLAS: EXIT_RECOVERY ? */ + tp->t_bytes_acked = 0; } /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) @@ -1150,28 +1331,23 @@ tcp_do_segment(struct mbuf *m, struct tc tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } - /* - * Congestion experienced. - * Ignore if we are already trying to recover. - */ - if ((thflags & TH_ECE) && - SEQ_LEQ(th->th_ack, tp->snd_recover)) { - TCPSTAT_INC(tcps_ecn_rcwnd); - tcp_congestion_exp(tp); + /* Congestion experienced. */ + if (thflags & TH_ECE) { + cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? 
TO_SYN : 0); @@ -1252,57 +1428,43 @@ tcp_do_segment(struct mbuf *m, struct tc */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && - tp->snd_cwnd >= tp->snd_wnd && - ((!V_tcp_do_newreno && - !(tp->t_flags & TF_SACK_PERMIT) && - tp->t_dupacks < tcprexmtthresh) || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && - (to.to_flags & TOF_SACK) == 0 && - TAILQ_EMPTY(&tp->snd_holes)))) { + !IN_RECOVERY(tp->t_flags) && + (to.to_flags & TOF_SACK) == 0 && + TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ if (ti_locked == TI_RLOCKED) INP_INFO_RUNLOCK(&V_tcbinfo); else if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); else panic("%s: ti_locked %d on pure ACK", __func__, ti_locked); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) { - TCPSTAT_INC(tcps_sndrexmitbad); - tp->snd_cwnd = tp->snd_cwnd_prev; - tp->snd_ssthresh = - tp->snd_ssthresh_prev; - tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) - ENTER_FASTRECOVERY(tp); - tp->snd_nxt = tp->snd_max; - tp->t_badrxtwin = 0; + cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ @@ -1314,27 +1476,36 @@ tcp_do_segment(struct mbuf *m, struct tc tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - acked = th->th_ack - tp->snd_una; + acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; + + /* + * Let the congestion control algorithm update + * congestion control related information. This + * typically means increasing the congestion + * window. + */ + cc_ack_received(tp, th, CC_ACK); + tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* Some progress has been made. */ @@ -1580,20 +1751,21 @@ tcp_do_segment(struct mbuf *m, struct tc * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. 
* Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED @@ -1983,20 +2155,21 @@ tcp_do_segment(struct mbuf *m, struct tc * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; @@ -2051,25 +2224,24 @@ tcp_do_segment(struct mbuf *m, struct tc * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp))) { + IN_FASTRECOVERY(tp->t_flags)) { + cc_ack_received(tp, th, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && - IN_FASTRECOVERY(tp)) { + IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; @@ -2086,33 +2258,34 @@ tcp_do_segment(struct mbuf *m, struct tc tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->t_flags & TF_SACK_PERMIT) { - if (IN_FASTRECOVERY(tp)) { + if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } - } else if (V_tcp_do_newreno || - V_tcp_do_ecn) { + } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } - tcp_congestion_exp(tp); + /* Congestion signal before ack. */ + cc_cong_signal(tp, th, CC_NDUPACK); + cc_ack_received(tp, th, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } @@ -2122,20 +2295,21 @@ tcp_do_segment(struct mbuf *m, struct tc KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { + cc_ack_received(tp, th, CC_DUPACK); u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; @@ -2163,51 +2337,28 @@ tcp_do_segment(struct mbuf *m, struct tc break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { - if (IN_FASTRECOVERY(tp)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { - if (tp->t_flags & TF_SACK_PERMIT) - tcp_sack_partialack(tp, th); - else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. 
- * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. - */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - } - } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + if (IN_FASTRECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->t_flags & TF_SACK_PERMIT) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + } else + cc_post_recovery(tp, th); } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully @@ -2224,41 +2375,33 @@ tcp_do_segment(struct mbuf *m, struct tc /* Send window already scaled. */ } } process_ACK: INP_INFO_LOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); - acked = th->th_ack - tp->snd_una; + acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ - if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) { - TCPSTAT_INC(tcps_sndrexmitbad); - tp->snd_cwnd = tp->snd_cwnd_prev; - tp->snd_ssthresh = tp->snd_ssthresh_prev; - tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) - ENTER_FASTRECOVERY(tp); - tp->snd_nxt = tp->snd_max; - tp->t_badrxtwin = 0; /* XXX probably not required */ - } + if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) + cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * @@ -2291,98 +2434,47 @@ process_ACK: tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* - * When new data is acked, open the congestion window. - * Method depends on which congestion control state we're - * in (slow start or cong avoid) and if ABC (RFC 3465) is - * enabled. - * - * slow start: cwnd <= ssthresh - * cong avoid: cwnd > ssthresh - * - * slow start and ABC (RFC 3465): - * Grow cwnd exponentially by the amount of data - * ACKed capping the max increment per ACK to - * (abc_l_var * maxseg) bytes. - * - * slow start without ABC (RFC 2581): - * Grow cwnd exponentially by maxseg per ACK. - * - * cong avoid and ABC (RFC 3465): - * Grow cwnd linearly by maxseg per RTT for each - * cwnd worth of ACKed data. - * - * cong avoid without ABC (RFC 2581): - * Grow cwnd linearly by approximately maxseg per RTT using - * maxseg^2 / cwnd per ACK as the increment. - * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to - * avoid capping cwnd. 
+ * Let the congestion control algorithm update congestion + * control related information. This typically means increasing + * the congestion window. */ - if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || - !IN_FASTRECOVERY(tp)) { - u_int cw = tp->snd_cwnd; - u_int incr = tp->t_maxseg; - /* In congestion avoidance? */ - if (cw > tp->snd_ssthresh) { - if (V_tcp_do_rfc3465) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked >= tp->snd_cwnd) - tp->t_bytes_acked -= cw; - else - incr = 0; - } - else - incr = max((incr * incr / cw), 1); - /* - * In slow-start with ABC enabled and no RTO in sight? - * (Must not use abc_l_var > 1 if slow starting after an - * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt == - * snd_max check is sufficient to handle this). - */ - } else if (V_tcp_do_rfc3465 && - tp->snd_nxt == tp->snd_max) - incr = min(acked, - V_tcp_abc_l_var * tp->t_maxseg); - /* ABC is on by default, so (incr == 0) frequently. */ - if (incr > 0) - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); - } + cc_ack_received(tp, th, CC_ACK); + SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. */ - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && + if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp) && + /* XXXLAS: Can this be moved up into cc_post_recovery? */ + if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { - EXIT_FASTRECOVERY(tp); - tp->t_bytes_acked = 0; + EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { @@ -3233,38 +3325,33 @@ tcp_mss_update(struct tcpcb *tp, int off #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { - int rtt, mss; + int mss; u_long bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; int mtuflags = 0; -#ifdef INET6 - int isipv6; -#endif + KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); tcp_mss_update(tp, offer, &metrics, &mtuflags); mss = tp->t_maxseg; inp = tp->t_inpcb; -#ifdef INET6 - isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; -#endif /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); @@ -3290,85 +3377,20 @@ tcp_mss(struct tcpcb *tp, int offer) else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); - /* - * While we're here, check the others too. 
- */ - if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { - tp->t_srtt = rtt; - tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; - TCPSTAT_INC(tcps_usedrtt); - if (metrics.rmx_rttvar) { - tp->t_rttvar = metrics.rmx_rttvar; - TCPSTAT_INC(tcps_usedrttvar); - } else { - /* default variation is +- 1 rtt */ - tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; - } - TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); - } - if (metrics.rmx_ssthresh) { - /* - * There's some sort of gateway or interface - * buffer limit on the path. Use this to set - * the slow start threshhold, but set the - * threshold to no less than 2*mss. - */ - tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); - TCPSTAT_INC(tcps_usedssthresh); - } - - /* - * Set the slow-start flight size depending on whether this - * is a local network or not. - * - * Extend this so we cache the cwnd too and retrieve it here. - * Make cwnd even bigger than RFC3390 suggests but only if we - * have previous experience with the remote host. Be careful - * not make cwnd bigger than remote receive window or our own - * send socket buffer. Maybe put some additional upper bound - * on the retrieved cwnd. Should do incremental updates to - * hostcache when cwnd collapses so next connection doesn't - * overloads the path again. - * - * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. - * We currently check only in syncache_socket for that. - */ -#define TCP_METRICS_CWND -#ifdef TCP_METRICS_CWND - if (metrics.rmx_cwnd) - tp->snd_cwnd = max(mss, - min(metrics.rmx_cwnd / 2, - min(tp->snd_wnd, so->so_snd.sb_hiwat))); - else -#endif - if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); -#ifdef INET6 - else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || - (!isipv6 && in_localaddr(inp->inp_faddr))) -#else - else if (in_localaddr(inp->inp_faddr)) -#endif - tp->snd_cwnd = mss * V_ss_fltsz_local; - else - tp->snd_cwnd = mss * V_ss_fltsz; /* Check the interface for TSO capabilities. */ if (mtuflags & CSUM_TSO) tp->t_flags |= TF_TSO; } /* * Determine the MSS option to send on an outgoing SYN. */ int @@ -3418,26 +3440,26 @@ tcp_newreno_partial_ack(struct tcpcb *tp INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ - tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. 
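
[Annotation] Several hunks in tcp_input.c, tcp_sack.c and tcp_newreno_partial_ack() replace open-coded "th->th_ack - tp->snd_una" arithmetic with BYTES_THIS_ACK(tp, th). The macro itself is added to tcp_var.h and is not shown in this diff; judging from the call sites it substitutes for, it is presumably just the acked-byte count relative to snd_una, along the lines of:

/*
 * Presumed definition (lives in netinet/tcp_var.h, outside this diff):
 * number of bytes acknowledged by this segment.
 */
#define	BYTES_THIS_ACK(tp, th)	((th)->th_ack - (tp)->snd_una)
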
*/ - if (tp->snd_cwnd > th->th_ack - tp->snd_una) - tp->snd_cwnd -= th->th_ack - tp->snd_una; + if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) + tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } diff -r d69f537670da sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c Fri Nov 12 00:44:18 2010 +0000 +++ b/sys/netinet/tcp_output.c Fri Nov 12 12:31:55 2010 +1100 @@ -46,32 +46,32 @@ __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif -#include #define TCPOUTFLAGS #include #include #include #include #include #ifdef TCPDEBUG #include #endif @@ -95,25 +95,20 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, VNET_DEFINE(int, ss_fltsz) = 1; SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, &VNET_NAME(ss_fltsz), 1, "Slow start flight size"); VNET_DEFINE(int, ss_fltsz_local) = 4; SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1, "Slow start flight size for local networks"); -VNET_DEFINE(int, tcp_do_newreno) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - &VNET_NAME(tcp_do_newreno), 0, - "Enable NewReno Algorithms"); - VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, @@ -124,20 +119,33 @@ VNET_DEFINE(int, tcp_autosndbuf_inc) = 8 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 256*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); +static void inline cc_after_idle(struct tcpcb *tp); + +/* + * CC wrapper hook functions + */ +static void inline +cc_after_idle(struct tcpcb *tp) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->after_idle != NULL) + CC_ALGO(tp)->after_idle(tp->ccv); +} /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error, rw; @@ -234,21 +242,21 @@ again: * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && + if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond @@ -1308,21 +1316,21 @@ out: tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. 
diff -r d69f537670da sys/netinet/tcp_sack.c
--- a/sys/netinet/tcp_sack.c	Fri Nov 12 00:44:18 2010 +0000
+++ b/sys/netinet/tcp_sack.c	Fri Nov 12 12:31:55 2010 +1100
@@ -569,21 +569,21 @@ tcp_free_sackholes(struct tcpcb *tp)
  */
 void
 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
 {
 	int num_segs = 1;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	/* Send one or 2 segments based on how much new data was acked. */
-	if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2)
+	if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) > 2)
 		num_segs = 2;
 	tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
 	    (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 }
 
 #if 0
diff -r d69f537670da sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c	Fri Nov 12 00:44:18 2010 +0000
+++ b/sys/netinet/tcp_subr.c	Fri Nov 12 12:31:55 2010 +1100
@@ -55,39 +55,39 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #ifdef INET6
 #include
 #endif
 #include
 #ifdef INET6
 #include
 #endif
 #include
 #include
 #ifdef INET6
 #include
 #include
 #include
 #endif
 #include
-#include
 #include
 #include
 #include
 #include
 #include
 #include
 #ifdef INET6
 #include
 #endif
 #include
@@ -231,20 +231,21 @@ static char * tcp_log_addr(struct in_con
 /*
  * XXX
  * Callouts should be moved into struct tcp directly. They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  */
 struct tcpcb_mem {
 	struct tcpcb tcb;
 	struct tcp_timer tt;
+	struct cc_var ccv;
 };
 
 static VNET_DEFINE(uma_zone_t, tcpcb_zone);
 #define V_tcpcb_zone VNET(tcpcb_zone)
 
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 
 struct callout isn_callout;
 static struct mtx isn_mtx;
 
 #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
@@ -270,20 +271,22 @@ tcp_inpcb_init(void *mem, int size, int
 	INP_LOCK_INIT(inp, "inp", "tcpinp");
 	return (0);
 }
 
 void
 tcp_init(void)
 {
 	int hashsize;
 
+	cc_init();
+
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
 	if (!powerof2(hashsize)) {
 		printf("WARNING: TCB hash size not a power of 2\n");
 		hashsize = 512; /* safe default */
 	}
 	in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
 	    "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE);
 	/*
@@ -633,20 +636,40 @@ tcp_newtcpcb(struct inpcb *inp)
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	tp = &tm->tcb;
+
+	/* Initialise cc_var struct for this tcpcb. */
+	tp->ccv = &tm->ccv;
+	tp->ccv->type = IPPROTO_TCP;
+	tp->ccv->ccvc.tcp = tp;
+
+	/*
+	 * Use the current system default CC algorithm.
+	 */
+	CC_LIST_RLOCK();
+	KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
+	CC_ALGO(tp) = CC_DEFAULT();
+	CC_LIST_RUNLOCK();
+
+	if (CC_ALGO(tp)->cb_init != NULL)
+		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+			uma_zfree(V_tcpcb_zone, tm);
+			return (NULL);
+		}
+
 #ifdef VIMAGE
 	tp->t_vnet = inp->inp_vnet;
 #endif
 	tp->t_timers = &tm->tt;
 	/* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
 	tp->t_maxseg = tp->t_maxopd =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
@@ -798,20 +821,26 @@ tcp_discardcb(struct tcpcb *tp)
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
 	/* Disconnect offload device, if any. */
 	tcp_offload_detach(tp);
 
 	tcp_free_sackholes(tp);
+
+	/* Allow the CC algorithm to clean up after itself. */
+	if (CC_ALGO(tp)->cb_destroy != NULL)
+		CC_ALGO(tp)->cb_destroy(tp->ccv);
+
+	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
 	uma_zfree(V_tcpcb_zone, tp);
 }
 
 /*
  * Attempt to close a TCP control block, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  */
 struct tcpcb *
@@ -1565,21 +1594,21 @@ tcp_mtudisc(struct inpcb *inp, int errno
 	if (so->so_snd.sb_hiwat < tp->t_maxseg)
 		tp->t_maxseg = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	TCPSTAT_INC(tcps_mturesent);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
-		EXIT_FASTRECOVERY(tp);
+		EXIT_FASTRECOVERY(tp->t_flags);
 	tcp_output_send(tp);
 	return (inp);
 }
 
 /*
  * Look-up the routing entry to the peer of this inpcb. If no route
  * is found and it cannot be allocated, then return 0. This routine
  * is called by TCP routines that access the rmx structure and by
  * tcp_mss_update to get the peer/interface MTU.
  */
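The tcp_subr.c hunks above establish the per-connection lifecycle: tcp_newtcpcb() attaches the system default algorithm and lets it allocate private state through cb_init(), with a non-zero return aborting tcpcb creation, and tcp_discardcb() later hands that state back through cb_destroy(). The standalone model below mirrors that error handling; the conn_stub type and example_* names are illustrative, with the single void pointer standing in for the private-data hook an algorithm gets via its cc_var.

    #include <stdio.h>
    #include <stdlib.h>

    struct conn_stub { void *cc_data; };  /* stand-in for per-connection CC state */

    static int
    example_cb_init(struct conn_stub *c)
    {
        /* Allocate per-connection state; failure must be reported to the caller. */
        c->cc_data = calloc(1, 64);
        return (c->cc_data == NULL);  /* non-zero == failure, as in the patch */
    }

    static void
    example_cb_destroy(struct conn_stub *c)
    {
        free(c->cc_data);
        c->cc_data = NULL;
    }

    int
    main(void)
    {
        struct conn_stub c = { NULL };

        if (example_cb_init(&c) > 0) {
            /* Mirrors tcp_newtcpcb(): bail out and free the control block. */
            fprintf(stderr, "cb_init failed, connection setup aborted\n");
            return (1);
        }
        /* ... connection lives here ... */
        example_cb_destroy(&c);  /* mirrors tcp_discardcb() */
        return (0);
    }

Embedding the cc_var in tcpcb_mem means no extra allocation is needed for connections whose algorithm keeps no private state at all.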
diff -r d69f537670da sys/netinet/tcp_timer.c
--- a/sys/netinet/tcp_timer.c	Fri Nov 12 00:44:18 2010 +0000
+++ b/sys/netinet/tcp_timer.c	Fri Nov 12 12:31:55 2010 +1100
@@ -44,28 +44,28 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #ifdef INET6
 #include
 #endif
 #include
-#include
 #include
 #include
 #include
 #include
 #ifdef TCPDEBUG
 #include
 #endif
 
 int	tcp_keepinit;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
@@ -508,24 +508,28 @@ tcp_timer_rexmt(void * xtp)
 		 * be recovered if this turns out to be a "bad" retransmit.
 		 * A retransmit is considered "bad" if an ACK for this
 		 * segment is received within RTT/2 interval; the assumption
 		 * here is that the ACK was already in flight. See
 		 * "On Estimating End-to-End Network Path Properties" by
 		 * Allman and Paxson for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
-		if (IN_FASTRECOVERY(tp))
-			tp->t_flags |= TF_WASFRECOVERY;
+		if (IN_FASTRECOVERY(tp->t_flags))
+			tp->t_flags |= TF_WASFRECOVERY;
 		else
-			tp->t_flags &= ~TF_WASFRECOVERY;
+			tp->t_flags &= ~TF_WASFRECOVERY;
+		if (IN_CONGRECOVERY(tp->t_flags))
+			tp->t_flags |= TF_WASCRECOVERY;
+		else
+			tp->t_flags &= ~TF_WASCRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 	}
 	TCPSTAT_INC(tcps_rexmttimeo);
 	if (tp->t_state == TCPS_SYN_SENT)
 		rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
 	/*
@@ -555,54 +559,23 @@ tcp_timer_rexmt(void * xtp)
 	tp->snd_nxt = tp->snd_una;
 	tp->snd_recover = tp->snd_max;
 	/*
 	 * Force a segment to be sent.
 	 */
 	tp->t_flags |= TF_ACKNOW;
 	/*
 	 * If timing a segment in this window, stop the timer.
 	 */
 	tp->t_rtttime = 0;
-	/*
-	 * Close the congestion window down to one segment
-	 * (we'll open it by one segment for each ack we get).
-	 * Since we probably have a window's worth of unacked
-	 * data accumulated, this "slow start" keeps us from
-	 * dumping all that data as back-to-back packets (which
-	 * might overwhelm an intermediate gateway).
-	 *
-	 * There are two phases to the opening: Initially we
-	 * open by one mss on each ack. This makes the window
-	 * size increase exponentially with time. If the
-	 * window is larger than the path can handle, this
-	 * exponential growth results in dropped packet(s)
-	 * almost immediately. To get more time between
-	 * drops but still "push" the network to take advantage
-	 * of improving conditions, we switch from exponential
-	 * to linear window opening at some threshhold size.
-	 * For a threshhold, we use half the current window
-	 * size, truncated to a multiple of the mss.
-	 *
-	 * (the minimum cwnd that will give us exponential
-	 * growth is 2 mss. We don't allow the threshhold
-	 * to go below this.)
-	 */
-	{
-		u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
-		if (win < 2)
-			win = 2;
-		tp->snd_cwnd = tp->t_maxseg;
-		tp->snd_ssthresh = win * tp->t_maxseg;
-		tp->t_dupacks = 0;
-	}
-	EXIT_FASTRECOVERY(tp);
-	tp->t_bytes_acked = 0;
+
+	cc_cong_signal(tp, 0, CC_RTO);
+	(void) tcp_output(tp);
 out:
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 		    PRU_SLOWTIMO);
 #endif
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
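The block deleted above is the classic RTO response: ssthresh drops to half the flight size (but never below two segments) and cwnd collapses to one segment. With this patch that policy moves behind cc_cong_signal() and the CC_RTO signal, so an algorithm's RTO handling is expected to make an equivalent decision. The standalone sketch below reproduces the arithmetic of the removed code; the function name and the sample values are illustrative only.

    #include <stdio.h>

    /* RTO response as the deleted tcp_timer.c block computed it. */
    static void
    rto_response(unsigned long snd_wnd, unsigned long snd_cwnd,
        unsigned long t_maxseg, unsigned long *cwnd, unsigned long *ssthresh)
    {
        unsigned long win;

        win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2 / t_maxseg;
        if (win < 2)
            win = 2;
        *cwnd = t_maxseg;            /* restart from one segment */
        *ssthresh = win * t_maxseg;  /* slow-start up to half the old flight */
    }

    int
    main(void)
    {
        unsigned long cwnd, ssthresh;

        rto_response(65535, 29200, 1460, &cwnd, &ssthresh);
        printf("after RTO: cwnd=%lu ssthresh=%lu\n", cwnd, ssthresh);
        return (0);
    }

Routing the signal through cc_cong_signal() also lets an algorithm distinguish CC_RTO from the other congestion signal types declared in cc.h rather than hard-coding one response in the timer code.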
diff -r d69f537670da sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c	Fri Nov 12 00:44:18 2010 +0000
+++ b/sys/netinet/tcp_usrreq.c	Fri Nov 12 12:31:55 2010 +1100
@@ -55,36 +55,36 @@ __FBSDID("$FreeBSD$");
 #include
 #ifdef DDB
 #include
 #endif
 #include
 #include
 #include
+#include
 #include
 #include
 #ifdef INET6
 #include
 #endif
 #include
 #ifdef INET6
 #include
 #endif
 #include
 #include
 #ifdef INET6
 #include
 #include
 #endif
-#include
 #include
 #include
 #include
 #include
 #include
 #ifdef TCPDEBUG
 #include
 #endif
 #include
@@ -1235,20 +1235,22 @@ tcp_fill_info(struct tcpcb *tp, struct t
 	tp = intotcpcb(inp);					\
 } while(0)
 
 int
 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error, opt, optval;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcp_info ti;
+	char buf[TCP_CA_NAME_MAX];
+	struct cc_algo *algo;
 
 	error = 0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 	INP_WLOCK(inp);
 	if (sopt->sopt_level != IPPROTO_TCP) {
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO) {
 			INP_WUNLOCK(inp);
 			error = ip6_ctloutput(so, sopt);
@@ -1344,20 +1346,68 @@ tcp_ctloutput(struct socket *so, struct
 		else
 			error = EINVAL;
 		INP_WUNLOCK(inp);
 		break;
 
 	case TCP_INFO:
 		INP_WUNLOCK(inp);
 		error = EINVAL;
 		break;
 
+	case TCP_CONGESTION:
+		INP_WUNLOCK(inp);
+		bzero(buf, sizeof(buf));
+		error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
+		if (error)
+			break;
+		INP_WLOCK_RECHECK(inp);
+		/*
+		 * Return EINVAL if we can't find the requested cc algo.
+		 */
+		error = EINVAL;
+		CC_LIST_RLOCK();
+		STAILQ_FOREACH(algo, &cc_list, entries) {
+			if (strncmp(buf, algo->name, TCP_CA_NAME_MAX)
+			    == 0) {
+				/* We've found the requested algo. */
+				error = 0;
+				/*
+				 * We hold a write lock over the tcb
+				 * so it's safe to do these things
+				 * without ordering concerns.
+				 */
+				if (CC_ALGO(tp)->cb_destroy != NULL)
+					CC_ALGO(tp)->cb_destroy(tp->ccv);
+				CC_ALGO(tp) = algo;
+				/*
+				 * If something goes pear shaped
+				 * initialising the new algo,
+				 * fall back to newreno (which
+				 * does not require initialisation).
+				 */
+				if (algo->cb_init != NULL)
+					if (algo->cb_init(tp->ccv) > 0) {
+						CC_ALGO(tp) = &newreno_cc_algo;
+						/*
+						 * The only reason init
+						 * should fail is
+						 * because of malloc.
+						 */
+						error = ENOMEM;
+					}
+				break; /* Break the STAILQ_FOREACH. */
+			}
+		}
+		CC_LIST_RUNLOCK();
+		INP_WUNLOCK(inp);
+		break;
+
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 	break;
 
 	case SOPT_GET:
 		tp = intotcpcb(inp);
 		switch (sopt->sopt_name) {
@@ -1387,20 +1437,26 @@ tcp_ctloutput(struct socket *so, struct
 	case TCP_NOPUSH:
 		optval = tp->t_flags & TF_NOPUSH;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof optval);
 		break;
 
 	case TCP_INFO:
 		tcp_fill_info(tp, &ti);
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &ti, sizeof ti);
 		break;
 
+	case TCP_CONGESTION:
+		bzero(buf, sizeof(buf));
+		strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
+		INP_WUNLOCK(inp);
+		error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX);
+		break;
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 	break;
 	}
 	return (error);
 }
 #undef INP_WLOCK_RECHECK
@@ -1700,20 +1756,24 @@ db_print_tflags(u_int t_flags)
 		comma = 1;
 	}
 	if (t_flags & TF_RXWIN0SENT) {
 		db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_FASTRECOVERY) {
 		db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
 		comma = 1;
 	}
+	if (t_flags & TF_CONGRECOVERY) {
+		db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
+		comma = 1;
+	}
 	if (t_flags & TF_WASFRECOVERY) {
 		db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_SIGNATURE) {
 		db_printf("%sTF_SIGNATURE", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_FORCEDATA) {
 		db_printf("%sTF_FORCEDATA", comma ? ", " : "");
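The tcp_ctloutput() changes above give userland per-connection control: setting an unknown algorithm name fails with EINVAL, and a failed cb_init() falls back to newreno with ENOMEM. A minimal userland sketch of both directions of the option is below; it assumes TCP_CONGESTION and TCP_CA_NAME_MAX are exported through <netinet/tcp.h> as this patch series intends, and the algorithm name used is just an example.

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        char name[TCP_CA_NAME_MAX];
        socklen_t len = sizeof(name);
        int s;

        if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1)
            return (1);

        /* Query the algorithm currently attached to this connection. */
        if (getsockopt(s, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
            printf("current cc: %s\n", name);

        /* Request a different algorithm by name; EINVAL if it is not loaded. */
        strlcpy(name, "newreno", sizeof(name));
        if (setsockopt(s, IPPROTO_TCP, TCP_CONGESTION, name, sizeof(name)) == -1)
            perror("setsockopt(TCP_CONGESTION)");

        return (0);
    }

The option name and semantics deliberately follow the equivalent Linux socket option, which should ease porting of applications that already select congestion control per connection.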
diff -r d69f537670da sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h	Fri Nov 12 00:44:18 2010 +0000
+++ b/sys/netinet/tcp_var.h	Fri Nov 12 12:31:55 2010 +1100
@@ -188,23 +188,25 @@ struct tcpcb {
 	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
 	tcp_seq	sack_newdata;		/* New data xmitted in this recovery episode starts at this seq number */
 	struct sackhint	sackhint;	/* SACK scoreboard hint */
 	int	t_rttlow;		/* smallest observerved RTT */
 	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
 	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
 	struct toe_usrreqs *t_tu;	/* offload operations vector */
 	void	*t_toe;			/* TOE pcb pointer */
 	int	t_bytes_acked;		/* # bytes acked during current RTT */
+	struct cc_algo	*cc_algo;	/* congestion control algorithm */
+	struct cc_var	*ccv;
 	int	t_ispare;		/* explicit pad for 64bit alignment */
-	void	*t_pspare2[6];		/* 2 CC / 4 TBD */
+	void	*t_pspare2[4];		/* 4 TBD */
 	uint64_t _pad[12];		/* 7 UTO, 5 TBD (1-2 CC/RTT?) */
 };
 
 /*
  * Flags and utility macros for the t_flags field.
  */
 #define	TF_ACKNOW	0x000001	/* ack peer immediately */
 #define	TF_DELACK	0x000002	/* ack, but try to delay it */
 #define	TF_NODELAY	0x000004	/* don't delay packets to coalesce */
 #define	TF_NOOPT	0x000008	/* don't use tcp options */
@@ -223,24 +225,36 @@ struct tcpcb {
 #define	TF_RXWIN0SENT	0x080000	/* sent a receiver win 0 in response */
 #define	TF_FASTRECOVERY	0x100000	/* in NewReno Fast Recovery */
 #define	TF_WASFRECOVERY	0x200000	/* was in NewReno Fast Recovery */
 #define	TF_SIGNATURE	0x400000	/* require MD5 digests (RFC2385) */
 #define	TF_FORCEDATA	0x800000	/* force out a byte */
 #define	TF_TSO		0x1000000	/* TSO enabled on this connection */
 #define	TF_TOE		0x2000000	/* this connection is offloaded */
 #define	TF_ECN_PERMIT	0x4000000	/* connection ECN-ready */
 #define	TF_ECN_SND_CWR	0x8000000	/* ECN CWR in queue */
 #define	TF_ECN_SND_ECE	0x10000000	/* ECN ECE in queue */
+#define	TF_CONGRECOVERY	0x20000000	/* congestion recovery mode */
+#define	TF_WASCRECOVERY	0x40000000	/* was in congestion recovery */
 
-#define	IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)
-#define	ENTER_FASTRECOVERY(tp)	tp->t_flags |= TF_FASTRECOVERY
-#define	EXIT_FASTRECOVERY(tp)	tp->t_flags &= ~TF_FASTRECOVERY
+#define	IN_FASTRECOVERY(t_flags)	(t_flags & TF_FASTRECOVERY)
+#define	ENTER_FASTRECOVERY(t_flags)	t_flags |= TF_FASTRECOVERY
+#define	EXIT_FASTRECOVERY(t_flags)	t_flags &= ~TF_FASTRECOVERY
+
+#define	IN_CONGRECOVERY(t_flags)	(t_flags & TF_CONGRECOVERY)
+#define	ENTER_CONGRECOVERY(t_flags)	t_flags |= TF_CONGRECOVERY
+#define	EXIT_CONGRECOVERY(t_flags)	t_flags &= ~TF_CONGRECOVERY
+
+#define	IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY))
+#define	ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY)
+#define	EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY)
+
+#define	BYTES_THIS_ACK(tp, th)	(th->th_ack - tp->snd_una)
 
 /*
  * Flags for the t_oobflags field.
  */
 #define	TCPOOB_HAVEDATA	0x01
 #define	TCPOOB_HADDATA	0x02
 
 #ifdef TCP_SIGNATURE
 /*
  * Defines which are needed by the xform_tcp module and tcp_[in|out]put
@@ -555,35 +569,37 @@ MALLOC_DECLARE(M_TCPLOG);
 #endif
 
 VNET_DECLARE(struct inpcbhead, tcb);		/* queue of active tcpcb's */
 VNET_DECLARE(struct inpcbinfo, tcbinfo);
 VNET_DECLARE(struct tcpstat, tcpstat);		/* tcp statistics */
 extern	int tcp_log_in_vain;
 VNET_DECLARE(int, tcp_mssdflt);	/* XXX */
 VNET_DECLARE(int, tcp_minmss);
 VNET_DECLARE(int, tcp_delack_enabled);
 VNET_DECLARE(int, tcp_do_rfc3390);
-VNET_DECLARE(int, tcp_do_newreno);
 VNET_DECLARE(int, path_mtu_discovery);
 VNET_DECLARE(int, ss_fltsz);
 VNET_DECLARE(int, ss_fltsz_local);
+VNET_DECLARE(int, tcp_do_rfc3465);
+VNET_DECLARE(int, tcp_abc_l_var);
 #define	V_tcb			VNET(tcb)
 #define	V_tcbinfo		VNET(tcbinfo)
 #define	V_tcpstat		VNET(tcpstat)
 #define	V_tcp_mssdflt		VNET(tcp_mssdflt)
 #define	V_tcp_minmss		VNET(tcp_minmss)
 #define	V_tcp_delack_enabled	VNET(tcp_delack_enabled)
 #define	V_tcp_do_rfc3390	VNET(tcp_do_rfc3390)
-#define	V_tcp_do_newreno	VNET(tcp_do_newreno)
 #define	V_path_mtu_discovery	VNET(path_mtu_discovery)
 #define	V_ss_fltsz		VNET(ss_fltsz)
 #define	V_ss_fltsz_local	VNET(ss_fltsz_local)
+#define	V_tcp_do_rfc3465	VNET(tcp_do_rfc3465)
+#define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
 
 VNET_DECLARE(int, tcp_do_sack);			/* SACK enabled/disabled */
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);	/* RST on sock alloc failure */
 #define	V_tcp_do_sack		VNET(tcp_do_sack)
 #define	V_tcp_sc_rst_sock_fail	VNET(tcp_sc_rst_sock_fail)
 
 VNET_DECLARE(int, tcp_do_ecn);			/* TCP ECN enabled/disabled */
 VNET_DECLARE(int, tcp_ecn_maxretries);
 #define	V_tcp_do_ecn		VNET(tcp_do_ecn)
 #define	V_tcp_ecn_maxretries	VNET(tcp_ecn_maxretries)
@@ -671,13 +687,15 @@ tcp_seq tcp_new_isn(struct tcpcb *);
 void	 tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
 void	 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
 void	 tcp_clean_sackreport(struct tcpcb *tp);
 void	 tcp_sack_adjust(struct tcpcb *tp);
 struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
 void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
 void	 tcp_free_sackholes(struct tcpcb *tp);
 int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 u_long	 tcp_seq_subtract(u_long, u_long );
+void	 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+
 #endif /* _KERNEL */
 #endif /* _NETINET_TCP_VAR_H_ */
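The recovery macros above now take a flags word rather than a tcpcb pointer, and fast recovery is distinguished from the broader congestion recovery state, with IN_RECOVERY() and friends covering both bits at once. Because they only touch an integer, the new definitions can be exercised stand-alone; the snippet below copies a subset of them verbatim from the hunk above, with t_flags as a plain local variable purely for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define TF_FASTRECOVERY 0x100000
    #define TF_CONGRECOVERY 0x20000000

    #define IN_FASTRECOVERY(t_flags)  (t_flags & TF_FASTRECOVERY)
    #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY
    #define IN_CONGRECOVERY(t_flags)  (t_flags & TF_CONGRECOVERY)
    #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY))
    #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY)

    int
    main(void)
    {
        uint32_t t_flags = 0;

        ENTER_RECOVERY(t_flags);   /* sets both bits */
        printf("fast=%d cong=%d\n",
            IN_FASTRECOVERY(t_flags) != 0, IN_CONGRECOVERY(t_flags) != 0);
        EXIT_FASTRECOVERY(t_flags);  /* leave fast recovery only */
        printf("still in recovery: %d\n", IN_RECOVERY(t_flags) != 0);
        return (0);
    }

Passing the flags word also lets the same macros operate on other transports or on saved copies of the flags, which is part of what makes the cc_var wrapper transport-agnostic.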
diff -r d69f537670da sys/sys/param.h
--- a/sys/sys/param.h	Fri Nov 12 00:44:18 2010 +0000
+++ b/sys/sys/param.h	Fri Nov 12 12:31:55 2010 +1100
@@ -51,21 +51,21 @@
  * Currently this lives here:
  *
  *	doc/en_US.ISO8859-1/books/porters-handbook/book.sgml
  *
  * scheme is:  Rxx
  *	'R' is in the range 0 to 4 if this is a release branch or
  *	x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is
  *	in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 900024	/* Master, propagated to newvers */
+#define __FreeBSD_version 900025	/* Master, propagated to newvers */
 
 #ifndef LOCORE
 #include
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see )
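For out-of-tree experimentation, the natural consumer of this framework is a loadable congestion control module that fills in a struct cc_algo and registers it on cc_list so the TCP_CONGESTION lookup above can find it by name. A rough skeleton is sketched below; the "ccdemo" name is hypothetical, the registration calls assume the cc_register_algo()/cc_deregister_algo() housekeeping functions declared in cc.h, and the module boilerplate (headers, SYSINIT ordering) is only meant to show the shape of such a module, not to build as-is against an unpatched tree.

    /* Hypothetical example module: "ccdemo". */
    #include <sys/param.h>
    #include <sys/errno.h>
    #include <sys/kernel.h>
    #include <sys/module.h>
    #include <sys/socket.h>
    #include <sys/socketvar.h>

    #include <netinet/cc.h>

    static void
    ccdemo_after_idle(struct cc_var *ccv)
    {
        /* Algorithm-specific restart policy would go here. */
    }

    struct cc_algo ccdemo_cc_algo = {
        .name = "ccdemo",
        .after_idle = ccdemo_after_idle,
        /* cb_init/cb_destroy may stay NULL if no private state is needed. */
    };

    static int
    ccdemo_modevent(module_t mod, int type, void *unused)
    {
        switch (type) {
        case MOD_LOAD:
            return (cc_register_algo(&ccdemo_cc_algo));
        case MOD_UNLOAD:
            return (cc_deregister_algo(&ccdemo_cc_algo));
        default:
            return (EOPNOTSUPP);
        }
    }

    static moduledata_t ccdemo_mod = {
        "ccdemo",
        ccdemo_modevent,
        NULL
    };

    /* Registration point is an assumption; in-tree modules may use a helper macro. */
    DECLARE_MODULE(ccdemo, ccdemo_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);

Once loaded, such a module shows up in the cc_list walked by tcp_ctloutput(), so "ccdemo" becomes selectable per connection via setsockopt(TCP_CONGESTION).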