/*
 * Copyright (c) 2001
 *	Bosko Milekic.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by members and contributors
 *      of The FreeBSD Project (http://www.FreeBSD.org/)
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "opt_param.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/protosw.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

/* XXX: Ack, where do we get this from for real? */
#define NCPU    2

/*
 * The mbuf allocator is heavily based on Alfred Perlstein's
 * (alfred@FreeBSD.org) "memcache" allocator, which is itself based
 * on concepts from several per-CPU memory allocators.  The differences
 * between this allocator and memcache are, among other things:
 *
 * (i)   We don't need to do as many things as memcache does, so we
 *       shouldn't waste resources worrying about it (e.g. we don't want
 *       to ever free back to mb_map).
 *
 * (ii)  We want to leave room for future optimizations which may allow us
 *       to inline a portion of "the easy allocation," provided that the
 *       generated code is small enough.
 *
 * (iii) We block on a condition variable in the worst-case scenario, and
 *       attempt to "steal" objects from other lists.  Before we do either,
 *       we also drain protocols, a task that is very mbuf-system-specific.
 *
 * The mbuf allocator keeps all objects that it allocates in mb_buckets.
 * The buckets keep a page worth of objects (an object can be an mbuf, an
 * mbuf cluster, or an external object reference counter) and facilitate
 * moving larger sets of contiguous objects (objects from the same page)
 * from the per-CPU lists to the main list for the given object.  The
 * buckets also have an added advantage in that after several moves from a
 * per-CPU list to the main list and back to the per-CPU list, contiguous
 * objects are kept together, thus trying to put the TLB cache to good use.
 *
 * The buckets are kept on singly-linked lists called "containers."  A
 * container is protected by a mutex lock in order to ensure consistency.
 * The mutex lock itself is allocated separately and attached to the
 * container at boot time, thus allowing for certain containers to share
 * the same mutex lock.  Per-CPU containers for mbufs, clusters, and
 * counters all share the same per-CPU lock whereas the "general system"
 * containers (i.e. the "main lists") for these objects share one global
 * lock.
 *
 * When, during allocation, the per-CPU container, the main container for
 * the given object, and the space reserved in mb_map for that object are
 * all depleted, then, depending on whether the allocation was done with
 * M_TRYWAIT, we may be allowed to block for a maximum of mbuf_wait ticks.
 * The blocking is implemented with a condition variable found only in the
 * main (general) container for the given object.  During exhaustion, all
 * freeing is done to the general list so that any blockers can pick up
 * whatever comes in first.
 */
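/*
 * A rough picture of the layout described above (illustrative only, not
 * taken from the original comments): each mb_lstmngr below is one of
 * mb_list_mbuf, mb_list_clust or mb_list_cnt.
 *
 *   mb_lstmngr
 *     |-- gen_list          (struct mb_gen_list, one per object type,
 *     |     |                global lock, starvation condition variable)
 *     |     `-- SLIST of mb_bucket, each covering one page of objects
 *     `-- cnt_lst[0..NCPU-1] (struct mb_pcpu_list, per-CPU lock)
 *           `-- SLIST of mb_bucket, each covering one page of objects
 *
 * A bucket's free[] array holds pointers into its page; num_free counts
 * how many of those pointers are currently valid.
 */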
struct mb_bucket {
        SLIST_ENTRY(mb_bucket)  buck_list;
        int                     bckt_owner;
        int                     num_free;
        void                    *free[1];
};

struct mb_container {
        SLIST_HEAD(, mb_bucket) buck_head;
        struct mtx              *mtx_lock;
        int                     num_owner;
        u_int                   starved;
};

struct mb_gen_list {
        struct mb_container     mb_cont;
        struct cv               m_starved;
};

struct mb_pcpu_list {
        struct mb_container     mb_cont;
        u_int                   obj_count;
};

/*
 * Parameters used to scale the size of mb_map and its submaps.
 * These are tunable at boot time.
 */
int     nmbufs;
int     nmbclusters;
int     nmbcnt;

#ifndef NMBCLUSTERS
#define NMBCLUSTERS     (512 + MAXUSERS * 16)
#endif
#ifndef NMBUFS
#define NMBUFS          (NMBCLUSTERS * 4)
#endif
#ifndef NMBCNTS
#define NMBCNTS         (NMBCLUSTERS + nsfbufs)
#endif
TUNABLE_INT_DECL("kern.ipc.nmbufs", NMBUFS, nmbufs);
TUNABLE_INT_DECL("kern.ipc.nmbclusters", NMBCLUSTERS, nmbclusters);
TUNABLE_INT_DECL("kern.ipc.nmbcnt", NMBCNTS, nmbcnt);

/*
 * The freelist structures and mutex locks.  The number statically declared
 * here depends on the number of CPUs.
 *
 * We set things up so that all the objects (mbufs, clusters, ref counters)
 * share the same mutex lock.  It has been established that we do not benefit
 * from different locks for different objects, so we use the same lock,
 * regardless of object type.
 */
struct mb_lstmngr {
        struct mb_gen_list      *gen_list;
        struct mb_pcpu_list     *cnt_lst[NCPU];
        vm_map_t                map;
        vm_offset_t             map_base;
        vm_offset_t             map_top;
        struct mb_bucket        **b_table;
        int                     map_full;
        u_int                   obj_size;
        u_int                   *wm_high;
};
struct mb_lstmngr       mb_list_mbuf, mb_list_clust, mb_list_cnt;
struct mtx              mbuf_gen, mbuf_pcpu[NCPU];

#define MB_GET_PCPU_LIST(mb_lst)        (mb_lst)->cnt_lst[PCPU_GET(cpuid)]
#define MB_GET_PCPU_LIST_NUM(mb_lst, num) (mb_lst)->cnt_lst[(num)]
#define MB_GET_GEN_LIST(mb_lst)         (mb_lst)->gen_list
#define MB_LOCK_CONT(mb_cnt)            mtx_lock((mb_cnt)->mb_cont.mtx_lock)
#define MB_UNLOCK_CONT(mb_cnt)          mtx_unlock((mb_cnt)->mb_cont.mtx_lock)
#define MB_BUCKET_INDX(mb_obj, mb_lst)  \
    (int)(((char *)(mb_obj) - (char *)(mb_lst)->map_base) / PAGE_SIZE)

/*
 * Ownership of buckets/containers is managed through integers.  The PCPU
 * lists range from 0 to NCPU-1.  We need a free numerical id for the general
 * list (usually NCPU).  We also need a non-conflicting free bit to indicate
 * that the bucket is free and removed from a container, while not losing
 * the bucket's originating container id.  We use the highest bit
 * for the free marker.
 */
#define MB_GENLIST_OWNER        (NCPU)
#define MB_BUCKET_FREE          (1 << (sizeof(int) * 8 - 1))
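/*
 * For illustration (not part of the original comments): assuming 32-bit
 * ints, MB_BUCKET_FREE is the high bit, 0x80000000.  A bucket that was
 * owned by CPU 1 and then emptied carries bckt_owner == (1 | MB_BUCKET_FREE);
 * masking with ~MB_BUCKET_FREE recovers the originating container id, so
 * the bucket can be handed back to CPU 1's list when an object is freed
 * into it.  Similarly, MB_BUCKET_INDX() turns an object's address into the
 * index of the page (and therefore the bucket) it belongs to within the
 * object's submap; e.g. an object PAGE_SIZE * 3 bytes past map_base lives
 * in the bucket pointed to by b_table[3].
 */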
/*
 * sysctl(8) exported objects
 */
SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
    "Maximum number of mbuf clusters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
    "Maximum number of mbufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
    "Maximum number of ext_buf counters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
    "Sleep time of mbuf subsystem wait allocations during exhaustion");
SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
    "Upper limit of number of mbufs allowed on each PCPU list");
SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
    "Upper limit of number of mbuf clusters allowed on each PCPU list");
SYSCTL_UINT(_kern_ipc, OID_AUTO, cnt_limit, CTLFLAG_RW, &cnt_limit, 0,
    "Upper limit of number of m_ext counters allowed on each PCPU list");

/*
 * Prototypes of local (internal) routines.
 */
void             *mb_alloc_wait(struct mb_lstmngr *);
static void       mb_init(void *);
struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, struct mb_pcpu_list *);
void              mb_reclaim(void);

/*
 * Initial allocation numbers.  Each parameter represents the number of
 * buckets of each object that will be placed initially in each PCPU
 * container for said object.
 */
#define NMB_MBUF_INIT   2
#define NMB_CLUST_INIT  2
#define NMB_CNT_INIT    (NMBCLUSTERS * sizeof(struct mext_refcnt) / PAGE_SIZE)

/*
 * Initialize the mbuf subsystem.
 *
 * We sub-divide the mb_map into several submaps; this way, we don't have
 * to worry about artificially limiting the number of mbuf or mbuf cluster
 * allocations, due to fear of one type of allocation "stealing" address
 * space initially reserved for another.
 *
 * Set up both the general containers and all the PCPU containers.  Populate
 * the PCPU containers with initial numbers.
 */
MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mb_init, NULL)

static void
mb_init(void *dummy)
{
        struct mb_pcpu_list *pcpu_cnt;
        vm_offset_t maxaddr, mb_map_base;
        vm_size_t mb_map_size;
        int i, j;

        /*
         * Set up the mb_map, allocate requested VM space.
         */
        mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * MCLBYTES +
            nmbcnt * sizeof(struct mext_refcnt));
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_map = kmem_suballoc(kmem_map, &mb_map_base, &maxaddr, mb_map_size);
        /* XXX XXX XXX: mb_map->system_map = 1; */

        /*
         * Set up all the submaps, for each type of object that we deal
         * with in this allocator.
         */
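        /*
         * Illustrative sizing only (example values, not from the original
         * source): with PAGE_SIZE = 4096, MSIZE = 256 and MCLBYTES = 2048,
         * nmbufs = 1024 and nmbclusters = 256 would reserve 256 KB of
         * address space for the mbuf submap and 512 KB for the cluster
         * submap; each submap's b_table then holds one mb_bucket pointer
         * per reserved page.
         */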
        mb_map_size = (vm_size_t)(nmbufs * MSIZE);
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_list_mbuf.b_table = malloc((unsigned long)mb_map_size / PAGE_SIZE *
            sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
        if (mb_list_mbuf.b_table == NULL)
                goto bad;
        mb_list_mbuf.map = kmem_suballoc(mb_map, &(mb_list_mbuf.map_base),
            &(mb_list_mbuf.map_top), mb_map_size);
        mb_list_mbuf.map_full = 0;
        mb_list_mbuf.obj_size = MSIZE;
        mb_list_mbuf.wm_high = &mbuf_limit;

        mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_list_clust.b_table = malloc((unsigned long)mb_map_size / PAGE_SIZE *
            sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
        if (mb_list_clust.b_table == NULL)
                goto bad;
        mb_list_clust.map = kmem_suballoc(mb_map, &(mb_list_clust.map_base),
            &(mb_list_clust.map_top), mb_map_size);
        mb_list_clust.map_full = 0;
        mb_list_clust.obj_size = MCLBYTES;
        mb_list_clust.wm_high = &clust_limit;

        mb_map_size = (vm_size_t)(nmbcnt * sizeof(struct mext_refcnt));
        mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
        mb_list_cnt.b_table = malloc((unsigned long)mb_map_size / PAGE_SIZE *
            sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
        if (mb_list_cnt.b_table == NULL)
                goto bad;
        mb_list_cnt.map = kmem_suballoc(mb_map, &(mb_list_cnt.map_base),
            &(mb_list_cnt.map_top), mb_map_size);
        mb_list_cnt.map_full = 0;
        mb_list_cnt.obj_size = sizeof(struct mext_refcnt);
        mb_list_cnt.wm_high = &cnt_limit;

        /* XXX XXX XXX: mbuf_map->system_map = clust_map->system_map =
            refcnt_map->system_map = 1; */

        /*
         * Allocate required general (global) containers for each object
         * type.
         */
        mb_list_mbuf.gen_list = malloc(sizeof(struct mb_gen_list), M_MBUF,
            M_NOWAIT);
        mb_list_clust.gen_list = malloc(sizeof(struct mb_gen_list), M_MBUF,
            M_NOWAIT);
        mb_list_cnt.gen_list = malloc(sizeof(struct mb_gen_list), M_MBUF,
            M_NOWAIT);
        if ((mb_list_mbuf.gen_list == NULL) ||
            (mb_list_clust.gen_list == NULL) ||
            (mb_list_cnt.gen_list == NULL))
                goto bad;

        /*
         * Initialize condition variables and general container mutex locks.
         */
        mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", 0);
        cv_init(&(mb_list_mbuf.gen_list->m_starved), "mbuf pool starved");
        cv_init(&(mb_list_clust.gen_list->m_starved), "mcluster pool starved");
        cv_init(&(mb_list_cnt.gen_list->m_starved), "mext cntr pool starved");
        mb_list_mbuf.gen_list->mb_cont.mtx_lock =
            mb_list_clust.gen_list->mb_cont.mtx_lock =
            mb_list_cnt.gen_list->mb_cont.mtx_lock = &mbuf_gen;

        /*
         * Set up the general containers for each object.
         */
        mb_list_mbuf.gen_list->mb_cont.num_owner =
            mb_list_clust.gen_list->mb_cont.num_owner =
            mb_list_cnt.gen_list->mb_cont.num_owner = MB_GENLIST_OWNER;
        mb_list_mbuf.gen_list->mb_cont.starved =
            mb_list_clust.gen_list->mb_cont.starved =
            mb_list_cnt.gen_list->mb_cont.starved = 0;
        SLIST_INIT(&(mb_list_mbuf.gen_list->mb_cont.buck_head));
        SLIST_INIT(&(mb_list_clust.gen_list->mb_cont.buck_head));
        SLIST_INIT(&(mb_list_cnt.gen_list->mb_cont.buck_head));

        /*
         * Allocate and initialize PCPU containers.
         */
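        /*
         * Illustrative numbers only (assumed example values, not from the
         * original source): with PAGE_SIZE = 4096 and MSIZE = 256, a bucket
         * holds 16 mbufs, so NMB_MBUF_INIT = 2 pre-populates each PCPU mbuf
         * list with 32 mbufs; with MCLBYTES = 2048, NMB_CLUST_INIT = 2
         * pre-populates each PCPU cluster list with 4 clusters.
         */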
        for (i = 0; i < NCPU; i++) {
                mb_list_mbuf.cnt_lst[i] = malloc(sizeof(struct mb_pcpu_list),
                    M_MBUF, M_NOWAIT);
                mb_list_clust.cnt_lst[i] = malloc(sizeof(struct mb_pcpu_list),
                    M_MBUF, M_NOWAIT);
                mb_list_cnt.cnt_lst[i] = malloc(sizeof(struct mb_pcpu_list),
                    M_MBUF, M_NOWAIT);
                if ((mb_list_mbuf.cnt_lst[i] == NULL) ||
                    (mb_list_clust.cnt_lst[i] == NULL) ||
                    (mb_list_cnt.cnt_lst[i] == NULL))
                        goto bad;

                mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", 0);
                mb_list_mbuf.cnt_lst[i]->mb_cont.mtx_lock =
                    mb_list_clust.cnt_lst[i]->mb_cont.mtx_lock =
                    mb_list_cnt.cnt_lst[i]->mb_cont.mtx_lock = &mbuf_pcpu[i];

                mb_list_mbuf.cnt_lst[i]->mb_cont.num_owner =
                    mb_list_clust.cnt_lst[i]->mb_cont.num_owner =
                    mb_list_cnt.cnt_lst[i]->mb_cont.num_owner = i;
                mb_list_mbuf.cnt_lst[i]->mb_cont.starved =
                    mb_list_clust.cnt_lst[i]->mb_cont.starved =
                    mb_list_cnt.cnt_lst[i]->mb_cont.starved = 0;
                mb_list_mbuf.cnt_lst[i]->obj_count =
                    mb_list_clust.cnt_lst[i]->obj_count =
                    mb_list_cnt.cnt_lst[i]->obj_count = 0;

                SLIST_INIT(&(mb_list_mbuf.cnt_lst[i]->mb_cont.buck_head));
                SLIST_INIT(&(mb_list_clust.cnt_lst[i]->mb_cont.buck_head));
                SLIST_INIT(&(mb_list_cnt.cnt_lst[i]->mb_cont.buck_head));

                /*
                 * Perform initial allocations.
                 */
                pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
                MB_LOCK_CONT(pcpu_cnt);
                for (j = 0; j < NMB_MBUF_INIT; j++) {
                        if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
                            == NULL)
                                goto bad;
                }
                MB_UNLOCK_CONT(pcpu_cnt);

                pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
                MB_LOCK_CONT(pcpu_cnt);
                for (j = 0; j < NMB_CLUST_INIT; j++) {
                        if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
                            == NULL)
                                goto bad;
                }
                MB_UNLOCK_CONT(pcpu_cnt);

                pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_cnt, i);
                MB_LOCK_CONT(pcpu_cnt);
                for (j = 0; j < NMB_CNT_INIT; j++) {
                        if (mb_pop_cont(&mb_list_cnt, M_DONTWAIT, pcpu_cnt)
                            == NULL)
                                goto bad;
                }
                MB_UNLOCK_CONT(pcpu_cnt);
        }

        return;
bad:
        panic("mb_init(): failed to initialize mbuf subsystem!");
}

/*
 * Populate a given mbuf PCPU container with a bucket full of fresh new
 * buffers.  Return a pointer to the new bucket (already in the container if
 * successful), or return NULL on failure.
 *
 * LOCKING NOTES:
 * PCPU container lock must be held when this is called.
 * The lock is dropped here so that we can cleanly call the underlying VM
 * code.  If we fail, we return with no locks held.  If we succeed (i.e.
 * return non-NULL), we return with the PCPU lock held, ready for allocation
 * from the returned bucket.
 */
struct mb_bucket *
mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
{
        struct mb_bucket *bucket;
        caddr_t p;
        int i;

        MB_UNLOCK_CONT(cnt_lst);
        /*
         * If our object's (finite) map is starved now (i.e. no more address
         * space), bail out now.
         */
        if (mb_list->map_full)
                return (NULL);

        bucket = malloc(sizeof(struct mb_bucket) +
            PAGE_SIZE / mb_list->obj_size * sizeof(void *), M_MBUF,
            how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
        if (bucket == NULL)
                return (NULL);

        p = (caddr_t)kmem_malloc(mb_list->map, PAGE_SIZE,
            how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
        if (p == NULL) {
                free(bucket, M_MBUF);
                return (NULL);
        }

        bucket->num_free = 0;
        mb_list->b_table[MB_BUCKET_INDX(p, mb_list)] = bucket;
        for (i = 0; i < (PAGE_SIZE / mb_list->obj_size); i++) {
                bucket->free[i] = p;
                bucket->num_free++;
                p += mb_list->obj_size;
        }

        MB_LOCK_CONT(cnt_lst);
        bucket->bckt_owner = cnt_lst->mb_cont.num_owner;
        SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.buck_head), bucket, buck_list);
        cnt_lst->obj_count += bucket->num_free;

        return (bucket);
}
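/*
 * A worked example of the bucket sizing above (example values only, not
 * from the original source): the trailing free[1] member of struct
 * mb_bucket is over-allocated so that a bucket for mbuf clusters, with
 * PAGE_SIZE = 4096 and MCLBYTES = 2048, carries 2 free-object slots, while
 * a bucket for 256-byte mbufs carries 16.  The backing page itself comes
 * from kmem_malloc() on the object's own submap.
 */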
/*
 * Allocate an object residing in a submap of mb_map.
 * The general case is very easy.  Complications only arise if our PCPU
 * container is empty.  Things get worse if the PCPU container is empty,
 * the general container is empty, and we've run out of address space
 * in our map; then we try to block if we're willing to (M_TRYWAIT).
 */
void *
mb_alloc(struct mb_lstmngr *mb_list, int how)
{
        struct mb_pcpu_list *cnt_lst;
        struct mb_bucket *bucket;
        void *m;

        m = NULL;
        cnt_lst = MB_GET_PCPU_LIST(mb_list);
        MB_LOCK_CONT(cnt_lst);

        if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.buck_head))) != NULL) {
                /*
                 * This is the easy allocation case.  We just grab an object
                 * from a bucket in the PCPU container.  At worst, we
                 * have just emptied the bucket and so we remove it
                 * from the container.
                 */
                bucket->num_free--;
                m = bucket->free[(bucket->num_free)];
                cnt_lst->obj_count--;
                if (bucket->num_free == 0) {
                        /*
                         * The bucket is now empty, so mark it so after it
                         * has been removed from the PCPU list on which it
                         * sits.
                         */
                        SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.buck_head),
                            buck_list);
                        SLIST_NEXT(bucket, buck_list) = NULL;
                        bucket->bckt_owner |= MB_BUCKET_FREE;
                }
                MB_UNLOCK_CONT(cnt_lst);
        } else {
                struct mb_gen_list *gen_list;

                /*
                 * This is the less common, more difficult case.  We must
                 * first verify if the general list has anything for us
                 * and if that also fails, we must allocate a page from
                 * the map and create a new bucket to place in our PCPU
                 * container (already locked).  If the map is starved then
                 * we're really in for trouble, as we have to wait on
                 * the general container's condition variable.
                 */
                gen_list = MB_GET_GEN_LIST(mb_list);
                MB_LOCK_CONT(gen_list);

                if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.buck_head)))
                    != NULL) {
                        /*
                         * Give ownership of the bucket to our CPU's
                         * container, but only actually put the bucket
                         * in the container if it doesn't become free
                         * upon removing an mbuf from it.
                         */
                        SLIST_REMOVE_HEAD(&(gen_list->mb_cont.buck_head),
                            buck_list);
                        bucket->bckt_owner = cnt_lst->mb_cont.num_owner;
                        bucket->num_free--;
                        m = bucket->free[(bucket->num_free)];
                        if (bucket->num_free == 0) {
                                SLIST_NEXT(bucket, buck_list) = NULL;
                                bucket->bckt_owner |= MB_BUCKET_FREE;
                        } else {
                                SLIST_INSERT_HEAD(
                                    &(cnt_lst->mb_cont.buck_head),
                                    bucket, buck_list);
                                cnt_lst->obj_count += bucket->num_free;
                        }
                        MB_UNLOCK_CONT(gen_list);
                        MB_UNLOCK_CONT(cnt_lst);
                } else {
                        /*
                         * We'll have to allocate a new page.
                         */
                        MB_UNLOCK_CONT(gen_list);
                        bucket = mb_pop_cont(mb_list, how, cnt_lst);
                        if (bucket != NULL) {
                                bucket->num_free--;
                                m = bucket->free[(bucket->num_free)];
                                cnt_lst->obj_count--;
                                MB_UNLOCK_CONT(cnt_lst);
                        } else if (how == M_TRYWAIT)
                                /*
                                 * Absolute worst-case scenario.  We block
                                 * if we're willing to, but only after
                                 * trying to steal from other lists.
                                 */
                                m = mb_alloc_wait(mb_list);
                }
        }

        return (m);
}

/*
 * This is the worst-case scenario called only if we're allocating with
 * M_TRYWAIT.  We first drain all the protocols, then try to find an mbuf
 * by looking in every PCPU container.  If we're still unsuccessful, we
 * try the general container one last time and possibly block on our
 * starved cv.
 */
void *
mb_alloc_wait(struct mb_lstmngr *mb_list)
{
        struct mb_pcpu_list *cnt_lst;
        struct mb_gen_list *gen_list;
        struct mb_bucket *bucket;
        void *m;
        int i, cv_ret;

        /*
         * Try to reclaim mbuf-related objects (mbufs, clusters).
         */
        mb_reclaim();

        /*
         * Cycle all the PCPU containers.  Increment starved counts if found
         * empty.
         */
        for (i = 0; i < NCPU; i++) {
                cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
                MB_LOCK_CONT(cnt_lst);

                /*
                 * If container is non-empty, steal a single object from it.
                 * If empty, increment starved count.
                 */
                if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.buck_head)))
                    != NULL) {
                        bucket->num_free--;
                        m = bucket->free[(bucket->num_free)];
                        cnt_lst->obj_count--;
                        if (bucket->num_free == 0) {
                                SLIST_REMOVE_HEAD(
                                    &(cnt_lst->mb_cont.buck_head), buck_list);
                                SLIST_NEXT(bucket, buck_list) = NULL;
                                bucket->bckt_owner |= MB_BUCKET_FREE;
                        }
                        MB_UNLOCK_CONT(cnt_lst);
                        return (m);
                } else
                        cnt_lst->mb_cont.starved++;

                MB_UNLOCK_CONT(cnt_lst);
        }

        /*
         * We're still here, so that means it's time to get the general
         * container lock, check it one more time (now that mb_reclaim()
         * has been called) and if we still get nothing, block on the cv.
         */
        gen_list = MB_GET_GEN_LIST(mb_list);
        MB_LOCK_CONT(gen_list);
        if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.buck_head))) != NULL) {
                bucket->num_free--;
                m = bucket->free[(bucket->num_free)];
                if (bucket->num_free == 0) {
                        SLIST_REMOVE_HEAD(&(gen_list->mb_cont.buck_head),
                            buck_list);
                        SLIST_NEXT(bucket, buck_list) = NULL;
                        bucket->bckt_owner |= MB_BUCKET_FREE;
                }
                MB_UNLOCK_CONT(gen_list);
                return (m);
        }

        gen_list->mb_cont.starved++;
        cv_ret = cv_timedwait(&(gen_list->m_starved),
            gen_list->mb_cont.mtx_lock, mbuf_wait);
        gen_list->mb_cont.starved--;

        if ((cv_ret == 0) &&
            ((bucket = SLIST_FIRST(&(gen_list->mb_cont.buck_head)))
            != NULL)) {
                bucket->num_free--;
                m = bucket->free[(bucket->num_free)];
                if (bucket->num_free == 0) {
                        SLIST_REMOVE_HEAD(&(gen_list->mb_cont.buck_head),
                            buck_list);
                        SLIST_NEXT(bucket, buck_list) = NULL;
                        bucket->bckt_owner |= MB_BUCKET_FREE;
                }
        } else
                m = NULL;

        MB_UNLOCK_CONT(gen_list);

        return (m);
}

/*
 * Free an object to its rightful container.
 * In the general case, this operation is very easy.  Complications arise
 * primarily if:
 *	(a) We've hit the high limit on number of free objects allowed in
 *	    our PCPU container.
 *	(b) We're in a critical situation where our container has been
 *	    marked 'starved' and we need to issue wakeups on the starved
 *	    condition variable.
 *	(c) Minor (odd) cases: our bucket has migrated while we were
 *	    waiting for the lock; our bucket is in the general container;
 *	    our bucket is empty.
 */
void
mb_free(struct mb_lstmngr *mb_list, void *m)
{
        struct mb_pcpu_list *cnt_lst;
        struct mb_gen_list *gen_list;
        struct mb_bucket *bucket;
        u_int owner;

        bucket = mb_list->b_table[MB_BUCKET_INDX(m, mb_list)];

        /*
         * If the bucket has migrated to another container by the time we
         * have locked its present one, drop the lock and grab the new
         * owner's lock instead.
         */
retry_lock:
        owner = bucket->bckt_owner & ~MB_BUCKET_FREE;
        switch (owner) {
        case MB_GENLIST_OWNER:
                gen_list = MB_GET_GEN_LIST(mb_list);
                MB_LOCK_CONT(gen_list);
                if (owner != (bucket->bckt_owner & ~MB_BUCKET_FREE)) {
                        MB_UNLOCK_CONT(gen_list);
                        goto retry_lock;
                }

                /*
                 * If we're intended for the general container, this is
                 * real easy: no migrating required.  The only `bogon'
                 * is that we're now contending with all the threads
                 * dealing with the general list, but this is expected.
                 */
                bucket->free[(bucket->num_free)] = m;
                bucket->num_free++;
                if (gen_list->mb_cont.starved > 0)
                        cv_signal(&(gen_list->m_starved));
                MB_UNLOCK_CONT(gen_list);
                break;

        default:
                cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
                MB_LOCK_CONT(cnt_lst);
                if (owner != (bucket->bckt_owner & ~MB_BUCKET_FREE)) {
                        MB_UNLOCK_CONT(cnt_lst);
                        goto retry_lock;
                }

                bucket->free[(bucket->num_free)] = m;
                bucket->num_free++;
                cnt_lst->obj_count++;

                if (cnt_lst->mb_cont.starved > 0) {
                        /*
                         * This is a tough case.  It means that we've been
                         * flagged at least once to indicate that we're
                         * empty, and that the system is in a critical
                         * situation, so we ought to migrate at least one
                         * bucket over to the general container.
                         * There may or may not be a thread blocking on
                         * the starved condition variable, but chances
                         * are that one will eventually come up soon, so
                         * it's better to migrate now than never.
                         */
                        gen_list = MB_GET_GEN_LIST(mb_list);
                        MB_LOCK_CONT(gen_list);
                        SLIST_INSERT_HEAD(&(gen_list->mb_cont.buck_head),
                            bucket, buck_list);
                        bucket->bckt_owner = MB_GENLIST_OWNER;
                        cnt_lst->obj_count--;

                        /*
                         * Determine whether or not to keep transferring
                         * buckets to the general list, or whether we've
                         * transferred enough already.
                         * Although we may flag another bucket for migration
                         * to the general container, the thread that was
                         * blocked on the cv may already have been woken up
                         * and be long gone by the time we do.  In that case,
                         * the worst consequence is that we end up migrating
                         * one bucket too many, which is really not a big
                         * deal, especially if we're close to a critical
                         * situation.
                         */
                        if (gen_list->mb_cont.starved > 0) {
                                cnt_lst->mb_cont.starved--;
                                cv_signal(&(gen_list->m_starved));
                        } else
                                cnt_lst->mb_cont.starved = 0;

                        MB_UNLOCK_CONT(gen_list);
                        MB_UNLOCK_CONT(cnt_lst);
                        break;
                }

                if (cnt_lst->obj_count > *(mb_list->wm_high)) {
                        /*
                         * We've hit the high limit of allowed numbers of
                         * mbufs on this PCPU list.  We must now migrate a
                         * bucket over to the general container.
                         */
                        gen_list = MB_GET_GEN_LIST(mb_list);
                        MB_LOCK_CONT(gen_list);
                        if ((bucket->bckt_owner & MB_BUCKET_FREE) == 0) {
                                bucket =
                                    SLIST_FIRST(&(cnt_lst->mb_cont.buck_head));
                                SLIST_REMOVE_HEAD(
                                    &(cnt_lst->mb_cont.buck_head), buck_list);
                        }
                        SLIST_INSERT_HEAD(&(gen_list->mb_cont.buck_head),
                            bucket, buck_list);
                        bucket->bckt_owner = MB_GENLIST_OWNER;
                        cnt_lst->obj_count -= bucket->num_free;

                        MB_UNLOCK_CONT(gen_list);
                        MB_UNLOCK_CONT(cnt_lst);
                        break;
                }

                if (bucket->bckt_owner & MB_BUCKET_FREE) {
                        SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.buck_head),
                            bucket, buck_list);
                        bucket->bckt_owner = cnt_lst->mb_cont.num_owner;
                }

                MB_UNLOCK_CONT(cnt_lst);
                break;
        }

        return;
}

/*
 * Drain protocols in hopes of freeing up some resources.
 *
 * LOCKING NOTES:
 * No locks should be held when this is called.  The drain routines have to
 * presently acquire some locks, which raises the possibility of a lock-order
 * violation if we are holding any mutex that one of those drain routines
 * acquires in the reverse order.
 */
void
mb_reclaim(void)
{
        struct domain *dp;
        struct protosw *pr;

#ifdef WITNESS
        KASSERT(witness_list(curproc) == 0,
            ("mb_reclaim() called with locks held"));
#endif

        for (dp = domains; dp; dp = dp->dom_next)
                for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                        if (pr->pr_drain)
                                (*pr->pr_drain)();
}
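/*
 * Illustrative sketch only (not part of this file, and not compiled): how a
 * consumer of this allocator might obtain and release one mbuf and one
 * cluster through the mb_alloc()/mb_free() interface defined above.  The
 * wrapper name and the header/cluster hookup details are hypothetical; the
 * real consumers would be the m_get()/m_clget()-style code layered on top
 * of this allocator.
 */
#if 0
static struct mbuf *
example_get_pkt(int how)
{
        struct mbuf *m;
        caddr_t cl;

        /* Grab an mbuf from the caller's PCPU list (or the slower paths). */
        m = (struct mbuf *)mb_alloc(&mb_list_mbuf, how);
        if (m == NULL)
                return (NULL);

        /* Back it with a cluster; on failure, return the mbuf. */
        cl = (caddr_t)mb_alloc(&mb_list_clust, how);
        if (cl == NULL) {
                mb_free(&mb_list_mbuf, m);
                return (NULL);
        }

        /* ... initialize the mbuf header and attach the cluster here ... */
        return (m);
}
#endif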