/*
 * Copyright (c) 2001
 *	Bosko Milekic.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by members and contributors
 *      of The FreeBSD Project (http://www.FreeBSD.org/)
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "opt_param.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/protosw.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

/* XXX: Ack, where do we get this from for real? */
#define NCPU    2

/*
 * The mbuf allocator is heavily based on Alfred Perlstein's
 * (alfred@FreeBSD.org) "memcache" allocator, which is itself based
 * on concepts from several per-CPU memory allocators.  The differences
 * between this allocator and memcache are, among other things:
 *
 * (i)   We don't need to do as many things as memcache does, so we
 *       shouldn't waste resources worrying about it (e.g. we don't want
 *       to ever free back to mb_map).
 *
 * (ii)  We want to leave room for future optimizations which may allow us
 *       to inline a portion of "the easy allocation," provided that the
 *       generated code is small enough.
 *
 * (iii) We block on a condition variable in the worst-case scenario, and
 *       attempt to "steal" objects from other lists.  Before we do either,
 *       we also drain protocols, a task that is very mbuf-system-specific.
 *
 * The mbuf allocator keeps all objects that it allocates in mb_buckets.
 * The buckets keep a page worth of objects (an object can be an mbuf, an
 * mbuf cluster, or an external object reference counter) and facilitate
 * moving larger sets of contiguous objects (objects from the same page)
 * from the per-CPU lists to the main list for the given object.  The
 * buckets also have an added advantage in that after several moves from a
 * per-CPU list to the main list and back to the per-CPU list, contiguous
 * objects are kept together, thus trying to put the TLB cache to good use.
 *
 * The buckets are kept on singly-linked lists called "containers."  A
 * container is protected by a mutex lock in order to ensure consistency.
 * The mutex lock itself is allocated separately and attached to the
 * container at boot time, thus allowing for certain containers to share
 * the same mutex lock.  Per-CPU containers for mbufs, clusters, and
 * counters all share the same per-CPU lock whereas the "general system"
 * containers (i.e. the "main lists") for these objects share one global
 * lock.
 *
 * When, during allocation, the per-CPU container, the main container for
 * the given object, and the space reserved in mb_map for that object are
 * all depleted, then, depending on whether the allocation was done with
 * M_TRYWAIT, we may be allowed to block for a maximum of mbuf_wait ticks.
 * The blocking is implemented with a condition variable found only in the
 * main (general) container for the given object.  During exhaustion, all
 * freeing is done to the general list so that any blockers can pick up
 * whatever comes in first.
 */
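/*
 * A rough picture of the layout described above (illustrative only, not
 * taken from the original comments): each mb_lstmngr below is one of
 * mb_list_mbuf, mb_list_clust or mb_list_cnt.
 *
 *   mb_lstmngr
 *     |-- gen_list          (struct mb_gen_list, one per object type,
 *     |     |                global lock, starvation condition variable)
 *     |     `-- SLIST of mb_bucket, each covering one page of objects
 *     `-- cnt_lst[0..NCPU-1] (struct mb_pcpu_list, per-CPU lock)
 *           `-- SLIST of mb_bucket, each covering one page of objects
 *
 * A bucket's free[] array holds pointers into its page; num_free counts
 * how many of those pointers are currently valid.
 */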
struct mb_bucket {
        SLIST_ENTRY(mb_bucket)  buck_list;
        int                     bckt_owner;
        int                     num_free;
        void                    *free[1];
};

struct mb_container {
        SLIST_HEAD(, mb_bucket) buck_head;
        struct mtx              *mtx_lock;
        int                     num_owner;
        u_int                   starved;
};

struct mb_gen_list {
        struct mb_container     mb_cont;
        struct cv               m_starved;
};

struct mb_pcpu_list {
        struct mb_container     mb_cont;
        u_int                   obj_count;
};

/*
 * Parameters used to scale the size of mb_map and its submaps.
 * These are tunable at boot time.
 */
int     nmbufs;
int     nmbclusters;
int     nmbcnt;

#ifndef NMBCLUSTERS
#define NMBCLUSTERS     (512 + MAXUSERS * 16)
#endif
#ifndef NMBUFS
#define NMBUFS          (NMBCLUSTERS * 4)
#endif
#ifndef NMBCNTS
#define NMBCNTS         (NMBCLUSTERS + nsfbufs)
#endif
TUNABLE_INT_DECL("kern.ipc.nmbufs", NMBUFS, nmbufs);
TUNABLE_INT_DECL("kern.ipc.nmbclusters", NMBCLUSTERS, nmbclusters);
TUNABLE_INT_DECL("kern.ipc.nmbcnt", NMBCNTS, nmbcnt);

/*
 * The freelist structures and mutex locks.  The number statically declared
 * here depends on the number of CPUs.
 *
 * We set things up so that all the objects (mbufs, clusters, ref counters)
 * share the same mutex lock.  It has been established that we do not benefit
 * from different locks for different objects, so we use the same lock,
 * regardless of object type.
 */
struct mb_lstmngr {
        struct mb_gen_list      *gen_list;
        struct mb_pcpu_list     *cnt_lst[NCPU];
        vm_map_t                map;
        vm_offset_t             map_base;
        vm_offset_t             map_top;
        struct mb_bucket        **b_table;
        int                     map_full;
        u_int                   obj_size;
        u_int                   *wm_high;
};
struct mb_lstmngr       mb_list_mbuf, mb_list_clust, mb_list_cnt;
struct mtx              mbuf_gen, mbuf_pcpu[NCPU];

#define MB_GET_PCPU_LIST(mb_lst)        (mb_lst)->cnt_lst[PCPU_GET(cpuid)]
#define MB_GET_PCPU_LIST_NUM(mb_lst, num) (mb_lst)->cnt_lst[(num)]
#define MB_GET_GEN_LIST(mb_lst)         (mb_lst)->gen_list
#define MB_LOCK_CONT(mb_cnt)            mtx_lock((mb_cnt)->mb_cont.mtx_lock)
#define MB_UNLOCK_CONT(mb_cnt)          mtx_unlock((mb_cnt)->mb_cont.mtx_lock)
#define MB_BUCKET_INDX(mb_obj, mb_lst)  \
    (int)(((char *)(mb_obj) - (char *)(mb_lst)->map_base) / PAGE_SIZE)

/*
 * Ownership of buckets/containers is managed through integers.  The PCPU
 * lists range from 0 to NCPU-1.  We need a free numerical id for the general
 * list (usually NCPU).  We also need a non-conflicting free bit to indicate
 * that the bucket is free and removed from a container, while not losing
 * the bucket's originating container id.  We use the highest bit
 * for the free marker.
 */
#define MB_GENLIST_OWNER        (NCPU)
#define MB_BUCKET_FREE          (1 << (sizeof(int) * 8 - 1))
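/*
 * For illustration (not part of the original comments): assuming 32-bit
 * ints, MB_BUCKET_FREE is the high bit, 0x80000000.  A bucket that was
 * owned by CPU 1 and then emptied carries bckt_owner == (1 | MB_BUCKET_FREE);
 * masking with ~MB_BUCKET_FREE recovers the originating container id, so
 * the bucket can be handed back to CPU 1's list when an object is freed
 * into it.  Similarly, MB_BUCKET_INDX() turns an object's address into the
 * index of the page (and therefore the bucket) it belongs to within the
 * object's submap; e.g. an object PAGE_SIZE * 3 bytes past map_base lives
 * in the bucket pointed to by b_table[3].
 */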
/*
 * sysctl(8) exported objects
 */
SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
    "Maximum number of mbuf clusters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
    "Maximum number of mbufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
    "Maximum number of ext_buf counters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
    "Sleep time of mbuf subsystem wait allocations during exhaustion");
SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
    "Upper limit of number of mbufs allowed on each PCPU list");
SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
    "Upper limit of number of mbuf clusters allowed on each PCPU list");
SYSCTL_UINT(_kern_ipc, OID_AUTO, cnt_limit, CTLFLAG_RW, &cnt_limit, 0,
    "Upper limit of number of m_ext counters allowed on each PCPU list");

/*
 * Prototypes of local (internal) routines.
 */
void             *mb_alloc_wait(struct mb_lstmngr *);
static void       mb_init(void *);
struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, struct mb_pcpu_list *);
void              mb_reclaim(void);

/*
 * Initial allocation numbers.  Each parameter represents the number of
 * buckets of each object that will be placed initially in each PCPU
 * container for said object.
 */
#define NMB_MBUF_INIT   2
#define NMB_CLUST_INIT  2
#define NMB_CNT_INIT    (NMBCLUSTERS * sizeof(struct mext_refcnt) / PAGE_SIZE)

/*
 * Initialize the mbuf subsystem.
 *
 * We sub-divide the mb_map into several submaps; this way, we don't have
 * to worry about artificially limiting the number of mbuf or mbuf cluster
 * allocations, due to fear of one type of allocation "stealing" address
 * space initially reserved for another.
 *
 * Set up both the general containers and all the PCPU containers.  Populate
 * the PCPU containers with initial numbers.
 */
MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mb_init, NULL)

static void
mb_init(void *dummy)
{
        struct mb_pcpu_list *pcpu_cnt;
        vm_offset_t maxaddr, mb_map_base;
        vm_size_t mb_map_size;
        int i, j;

        /*
         * Set up the mb_map, allocate requested VM space.
         */
        mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * MCLBYTES +
            nmbcnt * sizeof(struct mext_refcnt));
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_map = kmem_suballoc(kmem_map, &mb_map_base, &maxaddr, mb_map_size);
        /* XXX XXX XXX: mb_map->system_map = 1; */

        /*
         * Set up all the submaps, for each type of object that we deal
         * with in this allocator.
         */
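        /*
         * Illustrative sizing only (example values, not from the original
         * source): with PAGE_SIZE = 4096, MSIZE = 256 and MCLBYTES = 2048,
         * nmbufs = 1024 and nmbclusters = 256 would reserve 256 KB of
         * address space for the mbuf submap and 512 KB for the cluster
         * submap; each submap's b_table then holds one mb_bucket pointer
         * per reserved page.
         */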
        mb_map_size = (vm_size_t)(nmbufs * MSIZE);
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_list_mbuf.b_table = malloc((unsigned long)mb_map_size / PAGE_SIZE *
            sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
        if (mb_list_mbuf.b_table == NULL)
                goto bad;
        mb_list_mbuf.map = kmem_suballoc(mb_map, &(mb_list_mbuf.map_base),
            &(mb_list_mbuf.map_top), mb_map_size);
        mb_list_mbuf.map_full = 0;
        mb_list_mbuf.obj_size = MSIZE;
        mb_list_mbuf.wm_high = &mbuf_limit;

        mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_list_clust.b_table = malloc((unsigned long)mb_map_size / PAGE_SIZE *
            sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
        if (mb_list_clust.b_table == NULL)
                goto bad;
        mb_list_clust.map = kmem_suballoc(mb_map, &(mb_list_clust.map_base),
            &(mb_list_clust.map_top), mb_map_size);
        mb_list_clust.map_full = 0;
        mb_list_clust.obj_size = MCLBYTES;
        mb_list_clust.wm_high = &clust_limit;

        mb_map_size = (vm_size_t)(nmbcnt * sizeof(struct mext_refcnt));
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_list_cnt.b_table = malloc((unsigned long)mb_map_size / PAGE_SIZE *
            sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
        if (mb_list_cnt.b_table == NULL)
                goto bad;
        mb_list_cnt.map = kmem_suballoc(mb_map, &(mb_list_cnt.map_base),
            &(mb_list_cnt.map_top), mb_map_size);
        mb_list_cnt.map_full = 0;
        mb_list_cnt.obj_size = sizeof(struct mext_refcnt);
        mb_list_cnt.wm_high = &cnt_limit;

        /* XXX XXX XXX: mbuf_map->system_map = clust_map->system_map =
            refcnt_map->system_map = 1; */

        /*
         * Allocate required general (global) containers for each object
         * type.
         */
        mb_list_mbuf.gen_list = malloc(sizeof(struct mb_gen_list), M_MBUF,
            M_NOWAIT);
        mb_list_clust.gen_list = malloc(sizeof(struct mb_gen_list), M_MBUF,
            M_NOWAIT);
        mb_list_cnt.gen_list = malloc(sizeof(struct mb_gen_list), M_MBUF,
            M_NOWAIT);
        if ((mb_list_mbuf.gen_list == NULL) ||
            (mb_list_clust.gen_list == NULL) ||
            (mb_list_cnt.gen_list == NULL))
                goto bad;

        /*
         * Initialize condition variables and general container mutex locks.
         */
        mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", 0);
        cv_init(&(mb_list_mbuf.gen_list->m_starved), "mbuf pool starved");
        cv_init(&(mb_list_clust.gen_list->m_starved), "mcluster pool starved");
        cv_init(&(mb_list_cnt.gen_list->m_starved), "mext cntr pool starved");
        mb_list_mbuf.gen_list->mb_cont.mtx_lock =
            mb_list_clust.gen_list->mb_cont.mtx_lock =
            mb_list_cnt.gen_list->mb_cont.mtx_lock = &mbuf_gen;

        /*
         * Set up the general containers for each object.
         */
        mb_list_mbuf.gen_list->mb_cont.num_owner =
            mb_list_clust.gen_list->mb_cont.num_owner =
            mb_list_cnt.gen_list->mb_cont.num_owner = MB_GENLIST_OWNER;
        mb_list_mbuf.gen_list->mb_cont.starved =
            mb_list_clust.gen_list->mb_cont.starved =
            mb_list_cnt.gen_list->mb_cont.starved = 0;
        SLIST_INIT(&(mb_list_mbuf.gen_list->mb_cont.buck_head));
        SLIST_INIT(&(mb_list_clust.gen_list->mb_cont.buck_head));
        SLIST_INIT(&(mb_list_cnt.gen_list->mb_cont.buck_head));

        /*
         * Allocate and initialize PCPU containers.
         */
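        /*
         * Illustrative numbers only (assumed example values, not from the
         * original source): with PAGE_SIZE = 4096 and MSIZE = 256, a bucket
         * holds 16 mbufs, so NMB_MBUF_INIT = 2 pre-populates each PCPU mbuf
         * list with 32 mbufs; with MCLBYTES = 2048, NMB_CLUST_INIT = 2
         * pre-populates each PCPU cluster list with 4 clusters.
         */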
        for (i = 0; i < NCPU; i++) {
                mb_list_mbuf.cnt_lst[i] = malloc(sizeof(struct mb_pcpu_list),
                    M_MBUF, M_NOWAIT);
                mb_list_clust.cnt_lst[i] = malloc(sizeof(struct mb_pcpu_list),
                    M_MBUF, M_NOWAIT);
                mb_list_cnt.cnt_lst[i] = malloc(sizeof(struct mb_pcpu_list),
                    M_MBUF, M_NOWAIT);
                if ((mb_list_mbuf.cnt_lst[i] == NULL) ||
                    (mb_list_clust.cnt_lst[i] == NULL) ||
                    (mb_list_cnt.cnt_lst[i] == NULL))
                        goto bad;

                mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", 0);
                mb_list_mbuf.cnt_lst[i]->mb_cont.mtx_lock =
                    mb_list_clust.cnt_lst[i]->mb_cont.mtx_lock =
                    mb_list_cnt.cnt_lst[i]->mb_cont.mtx_lock = &mbuf_pcpu[i];

                mb_list_mbuf.cnt_lst[i]->mb_cont.num_owner =
                    mb_list_clust.cnt_lst[i]->mb_cont.num_owner =
                    mb_list_cnt.cnt_lst[i]->mb_cont.num_owner = i;
                mb_list_mbuf.cnt_lst[i]->mb_cont.starved =
                    mb_list_clust.cnt_lst[i]->mb_cont.starved =
                    mb_list_cnt.cnt_lst[i]->mb_cont.starved = 0;
                mb_list_mbuf.cnt_lst[i]->obj_count =
                    mb_list_clust.cnt_lst[i]->obj_count =
                    mb_list_cnt.cnt_lst[i]->obj_count = 0;

                SLIST_INIT(&(mb_list_mbuf.cnt_lst[i]->mb_cont.buck_head));
                SLIST_INIT(&(mb_list_clust.cnt_lst[i]->mb_cont.buck_head));
                SLIST_INIT(&(mb_list_cnt.cnt_lst[i]->mb_cont.buck_head));

                /*
                 * Perform initial allocations.
                 */
                pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
                MB_LOCK_CONT(pcpu_cnt);
                for (j = 0; j < NMB_MBUF_INIT; j++) {
                        if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
                            == NULL)
                                goto bad;
                }
                MB_UNLOCK_CONT(pcpu_cnt);

                pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
                MB_LOCK_CONT(pcpu_cnt);
                for (j = 0; j < NMB_CLUST_INIT; j++) {
                        if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
                            == NULL)
                                goto bad;
                }
                MB_UNLOCK_CONT(pcpu_cnt);

                pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_cnt, i);
                MB_LOCK_CONT(pcpu_cnt);
                for (j = 0; j < NMB_CNT_INIT; j++) {
                        if (mb_pop_cont(&mb_list_cnt, M_DONTWAIT, pcpu_cnt)
                            == NULL)
                                goto bad;
                }
                MB_UNLOCK_CONT(pcpu_cnt);
        }

        return;
bad:
        panic("mb_init(): failed to initialize mbuf subsystem!");
}

/*
 * Populate a given mbuf PCPU container with a bucket full of fresh new
 * buffers.  Return a pointer to the new bucket (already in the container if
 * successful), or return NULL on failure.
 *
 * LOCKING NOTES:
 * PCPU container lock must be held when this is called.
 * The lock is dropped here so that we can cleanly call the underlying VM
 * code.  If we fail, we return with no locks held.  If we succeed (i.e.
 * return non-NULL), we return with the PCPU lock held, ready for allocation
 * from the returned bucket.
 */
struct mb_bucket *
mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
{
        struct mb_bucket *bucket;
        caddr_t p;
        int i;

        MB_UNLOCK_CONT(cnt_lst);
        /*
         * If our object's (finite) map is starved now (i.e. no more address
         * space), bail out now.
         */
        if (mb_list->map_full)
                return (NULL);

        bucket = malloc(sizeof(struct mb_bucket) +
            PAGE_SIZE / mb_list->obj_size * sizeof(void *), M_MBUF,
            how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
        if (bucket == NULL)
                return (NULL);

        p = (caddr_t)kmem_malloc(mb_list->map, PAGE_SIZE,
            how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
        if (p == NULL) {
                free(bucket, M_MBUF);
                return (NULL);
        }

        bucket->num_free = 0;
        mb_list->b_table[MB_BUCKET_INDX(p, mb_list)] = bucket;
        for (i = 0; i < (PAGE_SIZE / mb_list->obj_size); i++) {
                bucket->free[i] = p;
                bucket->num_free++;
                p += mb_list->obj_size;
        }

        MB_LOCK_CONT(cnt_lst);
        bucket->bckt_owner = cnt_lst->mb_cont.num_owner;
        SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.buck_head), bucket, buck_list);
        cnt_lst->obj_count += bucket->num_free;

        return (bucket);
}
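/*
 * A worked example of the bucket sizing above (example values only, not
 * from the original source): the trailing free[1] member of struct
 * mb_bucket is over-allocated so that a bucket for mbuf clusters, with
 * PAGE_SIZE = 4096 and MCLBYTES = 2048, carries 2 free-object slots, while
 * a bucket for 256-byte mbufs carries 16.  The backing page itself comes
 * from kmem_malloc() on the object's own submap.
 */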
/*
 * Allocate an object residing in a submap of mb_map.
 * The general case is very easy.  Complications only arise if our PCPU
 * container is empty.  Things get worse if the PCPU container is empty,
 * the general container is empty, and we've run out of address space
 * in our map; then we try to block if we're willing to (M_TRYWAIT).
 */
void *
mb_alloc(struct mb_lstmngr *mb_list, int how)
{
        struct mb_pcpu_list *cnt_lst;
        struct mb_bucket *bucket;
        void *m;

        m = NULL;
        cnt_lst = MB_GET_PCPU_LIST(mb_list);
        MB_LOCK_CONT(cnt_lst);

        if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.buck_head))) != NULL) {
                /*
                 * This is the easy allocation case.  We just grab an object
                 * from a bucket in the PCPU container.  At worst, we
                 * have just emptied the bucket and so we remove it
                 * from the container.
                 */
                bucket->num_free--;
                m = bucket->free[(bucket->num_free)];
                cnt_lst->obj_count--;
                if (bucket->num_free == 0) {
                        /*
                         * The bucket is now empty, so mark it so after it
                         * has been removed from the PCPU list on which it
                         * sits.
                         */
                        SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.buck_head),
                            buck_list);
                        SLIST_NEXT(bucket, buck_list) = NULL;
                        bucket->bckt_owner |= MB_BUCKET_FREE;
                }
                MB_UNLOCK_CONT(cnt_lst);
        } else {
                struct mb_gen_list *gen_list;

                /*
                 * This is the less common, more difficult case.  We must
                 * first verify if the general list has anything for us
                 * and if that also fails, we must allocate a page from
                 * the map and create a new bucket to place in our PCPU
                 * container (already locked).  If the map is starved then
                 * we're really in for trouble, as we have to wait on
                 * the general container's condition variable.
                 */
                gen_list = MB_GET_GEN_LIST(mb_list);
                MB_LOCK_CONT(gen_list);

                if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.buck_head)))
                    != NULL) {
                        /*
                         * Give ownership of the bucket to our CPU's
                         * container, but only actually put the bucket
                         * in the container if it doesn't become free
                         * upon removing an mbuf from it.
                         */
                        SLIST_REMOVE_HEAD(&(gen_list->mb_cont.buck_head),
                            buck_list);
                        bucket->bckt_owner = cnt_lst->mb_cont.num_owner;
                        bucket->num_free--;
                        m = bucket->free[(bucket->num_free)];
                        if (bucket->num_free == 0) {
                                SLIST_NEXT(bucket, buck_list) = NULL;
                                bucket->bckt_owner |= MB_BUCKET_FREE;
                        } else {
                                SLIST_INSERT_HEAD(
                                    &(cnt_lst->mb_cont.buck_head),
                                    bucket, buck_list);
                                cnt_lst->obj_count += bucket->num_free;
                        }
                        MB_UNLOCK_CONT(gen_list);
                        MB_UNLOCK_CONT(cnt_lst);
                } else {
                        /*
                         * We'll have to allocate a new page.
                         */
                        MB_UNLOCK_CONT(gen_list);
                        bucket = mb_pop_cont(mb_list, how, cnt_lst);
                        if (bucket != NULL) {
                                bucket->num_free--;
                                m = bucket->free[(bucket->num_free)];
                                cnt_lst->obj_count--;
                                MB_UNLOCK_CONT(cnt_lst);
                        } else if (how == M_TRYWAIT)
                                /*
                                 * Absolute worst-case scenario.  We block
                                 * if we're willing to, but only after
                                 * trying to steal from other lists.
                                 */
                                m = mb_alloc_wait(mb_list);
                }
        }

        return (m);
}

/*
 * This is the worst-case scenario called only if we're allocating with
 * M_TRYWAIT.  We first drain all the protocols, then try to find an mbuf
 * by looking in every PCPU container.  If we're still unsuccessful, we
 * try the general container one last time and possibly block on our
 * starved cv.
 */
void *
mb_alloc_wait(struct mb_lstmngr *mb_list)
{
        struct mb_pcpu_list *cnt_lst;
        struct mb_gen_list *gen_list;
        struct mb_bucket *bucket;
        void *m;
        int i, cv_ret;

        /*
         * Try to reclaim mbuf-related objects (mbufs, clusters).
         */
        mb_reclaim();

        /*
         * Cycle all the PCPU containers.  Increment starved counts if found
         * empty.
         */
        for (i = 0; i < NCPU; i++) {
                cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
                MB_LOCK_CONT(cnt_lst);

                /*
                 * If container is non-empty, steal a single object from it.
                 * If empty, increment starved count.
                 */
                if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.buck_head)))
                    != NULL) {
                        bucket->num_free--;
                        m = bucket->free[(bucket->num_free)];
                        cnt_lst->obj_count--;
                        if (bucket->num_free == 0) {
                                SLIST_REMOVE_HEAD(
                                    &(cnt_lst->mb_cont.buck_head), buck_list);
                                SLIST_NEXT(bucket, buck_list) = NULL;
                                bucket->bckt_owner |= MB_BUCKET_FREE;
                        }
                        MB_UNLOCK_CONT(cnt_lst);
                        return (m);
                } else
                        cnt_lst->mb_cont.starved++;

                MB_UNLOCK_CONT(cnt_lst);
        }

        /*
         * We're still here, so that means it's time to get the general
         * container lock, check it one more time (now that mb_reclaim()
         * has been called) and if we still get nothing, block on the cv.
         */
        gen_list = MB_GET_GEN_LIST(mb_list);
        MB_LOCK_CONT(gen_list);
        if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.buck_head))) != NULL) {
                bucket->num_free--;
                m = bucket->free[(bucket->num_free)];
                if (bucket->num_free == 0) {
                        SLIST_REMOVE_HEAD(&(gen_list->mb_cont.buck_head),
                            buck_list);
                        SLIST_NEXT(bucket, buck_list) = NULL;
                        bucket->bckt_owner |= MB_BUCKET_FREE;
                }
                MB_UNLOCK_CONT(gen_list);
                return (m);
        }

        gen_list->mb_cont.starved++;
        cv_ret = cv_timedwait(&(gen_list->m_starved),
            gen_list->mb_cont.mtx_lock, mbuf_wait);
        gen_list->mb_cont.starved--;

        if ((cv_ret == 0) &&
            ((bucket = SLIST_FIRST(&(gen_list->mb_cont.buck_head)))
            != NULL)) {
                bucket->num_free--;
                m = bucket->free[(bucket->num_free)];
                if (bucket->num_free == 0) {
                        SLIST_REMOVE_HEAD(&(gen_list->mb_cont.buck_head),
                            buck_list);
                        SLIST_NEXT(bucket, buck_list) = NULL;
                        bucket->bckt_owner |= MB_BUCKET_FREE;
                }
        } else
                m = NULL;

        MB_UNLOCK_CONT(gen_list);

        return (m);
}

/*
 * Free an object to its rightful container.
 * In the general case, this operation is very easy.  Complications arise
 * primarily if:
 *	(a) We've hit the high limit on number of free objects allowed in
 *	    our PCPU container.
 *	(b) We're in a critical situation where our container has been
 *	    marked 'starved' and we need to issue wakeups on the starved
 *	    condition variable.
 *	(c) Minor (odd) cases: our bucket has migrated while we were
 *	    waiting for the lock; our bucket is in the general container;
 *	    our bucket is empty.
 */
void
mb_free(struct mb_lstmngr *mb_list, void *m)
{
        struct mb_pcpu_list *cnt_lst;
        struct mb_gen_list *gen_list;
        struct mb_bucket *bucket;
        u_int owner;

        bucket = mb_list->b_table[MB_BUCKET_INDX(m, mb_list)];

        /*
         * If the bucket has migrated to another container by the time we
         * have locked its present one, drop the lock and grab the new
         * owner's lock instead.
         */
retry_lock:
        owner = bucket->bckt_owner & ~MB_BUCKET_FREE;
        switch (owner) {
        case MB_GENLIST_OWNER:
                gen_list = MB_GET_GEN_LIST(mb_list);
                MB_LOCK_CONT(gen_list);
                if (owner != (bucket->bckt_owner & ~MB_BUCKET_FREE)) {
                        MB_UNLOCK_CONT(gen_list);
                        goto retry_lock;
                }

                /*
                 * If we're intended for the general container, this is
                 * real easy: no migrating required.  The only `bogon'
                 * is that we're now contending with all the threads
                 * dealing with the general list, but this is expected.
                 */
                bucket->free[(bucket->num_free)] = m;
                bucket->num_free++;
                if (gen_list->mb_cont.starved > 0)
                        cv_signal(&(gen_list->m_starved));
                MB_UNLOCK_CONT(gen_list);
                break;

        default:
                cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
                MB_LOCK_CONT(cnt_lst);
                if (owner != (bucket->bckt_owner & ~MB_BUCKET_FREE)) {
                        MB_UNLOCK_CONT(cnt_lst);
                        goto retry_lock;
                }

                bucket->free[(bucket->num_free)] = m;
                bucket->num_free++;
                cnt_lst->obj_count++;

                if (cnt_lst->mb_cont.starved > 0) {
                        /*
                         * This is a tough case.  It means that we've been
                         * flagged at least once to indicate that we're
                         * empty, and that the system is in a critical
                         * situation, so we ought to migrate at least one
                         * bucket over to the general container.
                         * There may or may not be a thread blocking on
                         * the starved condition variable, but chances
                         * are that one will eventually come up soon, so
                         * it's better to migrate now than never.
                         */
                        gen_list = MB_GET_GEN_LIST(mb_list);
                        MB_LOCK_CONT(gen_list);
                        SLIST_INSERT_HEAD(&(gen_list->mb_cont.buck_head),
                            bucket, buck_list);
                        bucket->bckt_owner = MB_GENLIST_OWNER;
                        cnt_lst->obj_count--;

                        /*
                         * Determine whether or not to keep transferring
                         * buckets to the general list, or whether we've
                         * transferred enough already.
                         * Although we may flag another bucket for migration
                         * to the general container, the thread that was
                         * blocked on the cv may already have been woken up
                         * and be long gone by the time we do.  In that case,
                         * the worst consequence is that we end up migrating
                         * one bucket too many, which is really not a big
                         * deal, especially if we're close to a critical
                         * situation.
                         */
                        if (gen_list->mb_cont.starved > 0) {
                                cnt_lst->mb_cont.starved--;
                                cv_signal(&(gen_list->m_starved));
                        } else
                                cnt_lst->mb_cont.starved = 0;

                        MB_UNLOCK_CONT(gen_list);
                        MB_UNLOCK_CONT(cnt_lst);
                        break;
                }

                if (cnt_lst->obj_count > *(mb_list->wm_high)) {
                        /*
                         * We've hit the high limit of allowed numbers of
                         * mbufs on this PCPU list.  We must now migrate a
                         * bucket over to the general container.
                         */
                        gen_list = MB_GET_GEN_LIST(mb_list);
                        MB_LOCK_CONT(gen_list);
                        if ((bucket->bckt_owner & MB_BUCKET_FREE) == 0) {
                                bucket =
                                    SLIST_FIRST(&(cnt_lst->mb_cont.buck_head));
                                SLIST_REMOVE_HEAD(
                                    &(cnt_lst->mb_cont.buck_head), buck_list);
                        }
                        SLIST_INSERT_HEAD(&(gen_list->mb_cont.buck_head),
                            bucket, buck_list);
                        bucket->bckt_owner = MB_GENLIST_OWNER;
                        cnt_lst->obj_count -= bucket->num_free;

                        MB_UNLOCK_CONT(gen_list);
                        MB_UNLOCK_CONT(cnt_lst);
                        break;
                }

                if (bucket->bckt_owner & MB_BUCKET_FREE) {
                        SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.buck_head),
                            bucket, buck_list);
                        bucket->bckt_owner = cnt_lst->mb_cont.num_owner;
                }

                MB_UNLOCK_CONT(cnt_lst);
                break;
        }

        return;
}

/*
 * Drain protocols in hopes of freeing up some resources.
 *
 * LOCKING NOTES:
 * No locks should be held when this is called.  The drain routines have to
 * presently acquire some locks, which raises the possibility of a lock-order
 * violation if we are holding any mutex that one of those drain routines
 * acquires in the reverse order.
 */
void
mb_reclaim(void)
{
        struct domain *dp;
        struct protosw *pr;

#ifdef WITNESS
        KASSERT(witness_list(curproc) == 0,
            ("mb_reclaim() called with locks held"));
#endif

        for (dp = domains; dp; dp = dp->dom_next)
                for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                        if (pr->pr_drain)
                                (*pr->pr_drain)();
}
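/*
 * Illustrative sketch only (not part of this file, and not compiled): how a
 * consumer of this allocator might obtain and release one mbuf and one
 * cluster through the mb_alloc()/mb_free() interface defined above.  The
 * wrapper name and the header/cluster hookup details are hypothetical; the
 * real consumers would be the m_get()/m_clget()-style code layered on top
 * of this allocator.
 */
#if 0
static struct mbuf *
example_get_pkt(int how)
{
        struct mbuf *m;
        caddr_t cl;

        /* Grab an mbuf from the caller's PCPU list (or the slower paths). */
        m = (struct mbuf *)mb_alloc(&mb_list_mbuf, how);
        if (m == NULL)
                return (NULL);

        /* Back it with a cluster; on failure, return the mbuf. */
        cl = (caddr_t)mb_alloc(&mb_list_clust, how);
        if (cl == NULL) {
                mb_free(&mb_list_mbuf, m);
                return (NULL);
        }

        /* ... initialize the mbuf header and attach the cluster here ... */
        return (m);
}
#endif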