diff -ruN vendor_sys/./alpha/alpha/vm_machdep.c mbuma/src/sys/./alpha/alpha/vm_machdep.c --- vendor_sys/./alpha/alpha/vm_machdep.c Sun Mar 7 14:51:45 2004 +++ mbuma/src/sys/./alpha/alpha/vm_machdep.c Sat Mar 6 15:17:31 2004 @@ -107,6 +107,24 @@ SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) /* + * NSFBUFS-related variables and associated sysctls + */ +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +int nsfbufs; +int nsfbufspeak; +int nsfbufsused; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, + "Number of sendfile(2) sf_bufs at peak usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, + "Number of sendfile(2) sf_bufs in use"); + +/* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ @@ -393,6 +411,9 @@ { struct sf_buf *sf_bufs; int i; + + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); SLIST_INIT(&sf_freelist.sf_head); diff -ruN vendor_sys/./amd64/amd64/vm_machdep.c mbuma/src/sys/./amd64/amd64/vm_machdep.c --- vendor_sys/./amd64/amd64/vm_machdep.c Sun Mar 7 14:52:11 2004 +++ mbuma/src/sys/./amd64/amd64/vm_machdep.c Sat Mar 6 15:17:31 2004 @@ -90,6 +90,24 @@ SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) /* + * NSFBUFS-related variables and associated sysctls + */ +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +int nsfbufs; +int nsfbufspeak; +int nsfbufsused; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, + "Number of sendfile(2) sf_bufs at peak usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, + "Number of sendfile(2) sf_bufs in use"); + +/* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ @@ -446,6 +464,9 @@ { struct sf_buf *sf_bufs; int i; + + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); SLIST_INIT(&sf_freelist.sf_head); diff -ruN vendor_sys/./boot/common/help.common mbuma/src/sys/./boot/common/help.common --- vendor_sys/./boot/common/help.common Sun Mar 7 14:52:14 2004 +++ mbuma/src/sys/./boot/common/help.common Wed Mar 3 23:41:39 2004 @@ -233,12 +233,6 @@ Various kernel tunable parameters can be overridden by specifying new values in the environment. - set kern.ipc.nmbclusters= NMBCLUSTERS - - Set the number of mbuf clusters to be allocated. The value - cannot be set below the default determined when the kernel - was compiled. - set kern.ipc.nsfbufs= NSFBUFS Set the number of sendfile buffers to be allocated. This diff -ruN vendor_sys/./boot/common/loader.8 mbuma/src/sys/./boot/common/loader.8 --- vendor_sys/./boot/common/loader.8 Sun Mar 7 14:52:15 2004 +++ mbuma/src/sys/./boot/common/loader.8 Wed Mar 3 23:41:39 2004 @@ -439,12 +439,6 @@ tunable. When set, this tunable replaces the value declared in the kernel compile-time configuration file. -.It Va kern.ipc.nmbclusters -Set the number of mbuf clusters to be allocated. 
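The vm_machdep.c hunks above (and the matching i386 and ia64 hunks later in the patch) repeat one pattern: the NSFBUFS default moves into each architecture's vm_machdep.c, the live counters are exported read-only through kern.ipc sysctls, and sf_buf_init() fetches the kern.ipc.nsfbufs loader tunable before sizing its pool. The following is a minimal sketch of that tunable-plus-sysctl idiom, not part of the patch; the buffer name, variable names and the init function are invented for illustration.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#ifndef NEXAMPLEBUFS
#define NEXAMPLEBUFS (512 + maxusers * 16)    /* compiled-in default, like NSFBUFS */
#endif

static int nexamplebufs;

SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, nexamplebufs, CTLFLAG_RDTUN, &nexamplebufs, 0,
    "Maximum number of example buffers available");

static void
examplebuf_init(void *arg)
{
        /*
         * Start from the compiled-in default, then let a value set in
         * loader.conf (kern.ipc.nexamplebufs="...") override it.  The
         * CTLFLAG_RDTUN sysctl above makes the final value visible at
         * run time but not writable.
         */
        nexamplebufs = NEXAMPLEBUFS;
        TUNABLE_INT_FETCH("kern.ipc.nexamplebufs", &nexamplebufs);
}
SYSINIT(examplebuf, SI_SUB_MBUF, SI_ORDER_ANY, examplebuf_init, NULL)

Because the SYSINIT runs at SI_SUB_MBUF time, well after the tunable environment has been set up, the fetched value can safely be used to size the pool, which is exactly how the sf_buf_init() hunks above use nsfbufs.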
-The value cannot be set below the default -determined when the kernel was compiled. -Modifies -.Va NMBCLUSTERS . .It Va kern.ipc.nsfbufs Set the number of .Xr sendfile 2 @@ -467,8 +461,7 @@ Only mess around with this parameter if you need to greatly extend the KVM reservation for other resources such as the -buffer cache or -.Va NMBCLUSTERS . +buffer cache. Modifies .Va VM_SWZONE_SIZE_MAX . .It Va kern.maxbcache @@ -480,8 +473,7 @@ KVM in large-memory machine configurations. Only mess around with this parameter if you need to greatly extend the KVM reservation for other resources -such as the swap zone or -.Va NMBCLUSTERS . +such as the swap zone. Note that the NBUF parameter will override this limit. Modifies diff -ruN vendor_sys/./boot/forth/loader.conf mbuma/src/sys/./boot/forth/loader.conf --- vendor_sys/./boot/forth/loader.conf Sun Mar 7 14:52:18 2004 +++ mbuma/src/sys/./boot/forth/loader.conf Mon Mar 1 17:16:08 2004 @@ -94,7 +94,6 @@ #kern.cam.scsi_delay="2000" # Delay (in ms) before probing SCSI #kern.ipc.maxsockets="" # Set the maximum number of sockets avaliable #kern.ipc.nmbclusters="" # Set the number of mbuf clusters -#kern.ipc.nmbufs="" # Set the maximum number of mbufs #kern.ipc.nsfbufs="" # Set the number of sendfile(2) bufs #net.inet.tcp.tcbhashsize="" # Set the value of TCBHASHSIZE #vfs.root.mountfrom="" # Specify root partition in a way the diff -ruN vendor_sys/./conf/NOTES mbuma/src/sys/./conf/NOTES --- vendor_sys/./conf/NOTES Sun Mar 7 14:52:41 2004 +++ mbuma/src/sys/./conf/NOTES Wed Mar 3 23:41:41 2004 @@ -1595,8 +1595,8 @@ # the D-Link DFE-550TX. # ti: Support for PCI gigabit ethernet NICs based on the Alteon Networks # Tigon 1 and Tigon 2 chipsets. This includes the Alteon AceNIC, the -# 3Com 3c985, the Netgear GA620 and various others. Note that you will -# probably want to bump up NMBCLUSTERS a lot to use this driver. +# 3Com 3c985, the Netgear GA620 and various others. Note that you +# may want to tune up nmbclusters when you use this driver. # tl: Support for the Texas Instruments TNETE100 series 'ThunderLAN' # cards and integrated ethernet controllers. 
This includes several # Compaq Netelligent 10/100 cards and the built-in ethernet controllers @@ -2361,8 +2361,6 @@ options MSGTQL=41 # Max number of messages in system options NBUF=512 # Number of buffer headers - -options NMBCLUSTERS=1024 # Number of mbuf clusters options SCSI_NCR_DEBUG options SCSI_NCR_MAX_SYNC=10000 diff -ruN vendor_sys/./conf/files mbuma/src/sys/./conf/files --- vendor_sys/./conf/files Sun Mar 7 14:52:43 2004 +++ mbuma/src/sys/./conf/files Fri Mar 5 22:17:47 2004 @@ -1145,7 +1145,6 @@ kern/subr_kobj.c standard kern/subr_log.c standard kern/subr_mbpool.c optional libmbpool -kern/subr_mbuf.c standard kern/subr_mchain.c optional libmchain kern/subr_module.c standard kern/subr_msgbuf.c standard @@ -1693,5 +1692,6 @@ vm/vm_pager.c standard vm/vm_unix.c standard vm/uma_core.c standard +vm/uma_mbuf.c standard vm/uma_dbg.c standard vm/vnode_pager.c standard diff -ruN vendor_sys/./conf/options mbuma/src/sys/./conf/options --- vendor_sys/./conf/options Sun Mar 7 14:52:47 2004 +++ mbuma/src/sys/./conf/options Wed Mar 3 23:41:41 2004 @@ -239,7 +239,6 @@ HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h -NMBCLUSTERS opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h diff -ruN vendor_sys/./dev/en/midway.c mbuma/src/sys/./dev/en/midway.c --- vendor_sys/./dev/en/midway.c Sun Mar 7 14:54:49 2004 +++ mbuma/src/sys/./dev/en/midway.c Mon Mar 1 17:16:10 2004 @@ -424,11 +424,7 @@ * * This is called each time when a map is allocated * from the pool and about to be returned to the user. Here we actually - * allocate the map if there isn't one. The problem is that we may fail - * to allocate the DMA map yet have no means to signal this error. Therefor - * when allocating a map, the call must check that there is a map. An - * additional problem is, that i386 maps will be NULL, yet are ok and must - * be freed so let's use a flag to signal allocation. + * allocate the map if there isn't one. * * Caveat: we have no way to know that we are called from an interrupt context * here. We rely on the fact, that bus_dmamap_create uses M_NOWAIT in all @@ -436,7 +432,7 @@ * * LOCK: any, not needed */ -static void +static int en_map_ctor(void *mem, int size, void *arg) { struct en_softc *sc = arg; @@ -448,13 +444,17 @@ if (!(map->flags & ENMAP_ALLOC)) { err = bus_dmamap_create(sc->txtag, 0, &map->map); - if (err != 0) + map->flags &= ~ENMAP_LOADED; + if (err != 0) { if_printf(&sc->ifatm.ifnet, "cannot create DMA map %d\n", err); - else - map->flags |= ENMAP_ALLOC; + map->flags &= ~ENMAP_LOADED; /* Make sure. */ + uma_zfree(sc->map_zone, map); + return 0; + } + map->flags |= ENMAP_ALLOC; } - map->flags &= ~ENMAP_LOADED; + return 1; } /* @@ -1040,7 +1040,7 @@ * locks. 
*/ map = uma_zalloc_arg(sc->map_zone, sc, M_NOWAIT); - if (map == NULL || !(map->flags & ENMAP_ALLOC)) { + if (map == NULL) { /* drop that packet */ EN_COUNT(sc->stats.txnomap); if (map != NULL) @@ -2329,7 +2329,7 @@ if (m != NULL) { /* M_NOWAIT - called from interrupt context */ map = uma_zalloc_arg(sc->map_zone, sc, M_NOWAIT); - if (map == NULL || !(map->flags & ENMAP_ALLOC)) { + if (map == NULL) { rx.post_skip += mlen; m_freem(m); DBG(sc, SERV, ("rx%td: out of maps", diff -ruN vendor_sys/./dev/raidframe/rf_freebsdkintf.c mbuma/src/sys/./dev/raidframe/rf_freebsdkintf.c --- vendor_sys/./dev/raidframe/rf_freebsdkintf.c Sun Mar 7 14:55:55 2004 +++ mbuma/src/sys/./dev/raidframe/rf_freebsdkintf.c Fri Feb 27 17:23:40 2004 @@ -1270,7 +1270,7 @@ /* XXX Should check return code here */ bioq_init(&sc->bio_queue); sc->sc_cbufpool = uma_zcreate("raidpl", sizeof(struct raidbuf), NULL, - NULL, NULL, NULL, 0, 0); + NULL, NULL, NULL, 0, 0); /* XXX There may be a weird interaction here between this, and * protectedSectors, as used in RAIDframe. */ diff -ruN vendor_sys/./geom/geom_io.c mbuma/src/sys/./geom/geom_io.c --- vendor_sys/./geom/geom_io.c Sun Mar 7 14:57:02 2004 +++ mbuma/src/sys/./geom/geom_io.c Mon Feb 16 19:22:33 2004 @@ -154,9 +154,7 @@ g_bioq_init(&g_bio_run_up); g_bioq_init(&g_bio_run_task); biozone = uma_zcreate("g_bio", sizeof (struct bio), - NULL, NULL, - NULL, NULL, - 0, 0); + NULL, NULL, NULL, NULL, 0, 0); } int diff -ruN vendor_sys/./i386/i386/vm_machdep.c mbuma/src/sys/./i386/i386/vm_machdep.c --- vendor_sys/./i386/i386/vm_machdep.c Sun Mar 7 14:57:16 2004 +++ mbuma/src/sys/./i386/i386/vm_machdep.c Sat Mar 6 15:17:31 2004 @@ -107,6 +107,24 @@ LIST_HEAD(sf_head, sf_buf); /* + * NSFBUFS-related variables and associated sysctls + */ +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +int nsfbufs; +int nsfbufspeak; +int nsfbufsused; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, + "Number of sendfile(2) sf_bufs at peak usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, + "Number of sendfile(2) sf_bufs in use"); + +/* * A hash table of active sendfile(2) buffers */ static struct sf_head *sf_buf_active; @@ -590,6 +608,9 @@ struct sf_buf *sf_bufs; vm_offset_t sf_base; int i; + + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask); TAILQ_INIT(&sf_buf_freelist); diff -ruN vendor_sys/./ia64/ia64/pmap.c mbuma/src/sys/./ia64/ia64/pmap.c --- vendor_sys/./ia64/ia64/pmap.c Sun Mar 7 14:57:34 2004 +++ mbuma/src/sys/./ia64/ia64/pmap.c Sun Mar 7 14:00:16 2004 @@ -531,11 +531,13 @@ if (initial_pvs > MAXPV) initial_pvs = MAXPV; pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); uma_prealloc(pvzone, initial_pvs); ptezone = uma_zcreate("PT ENTRY", sizeof (struct ia64_lpte), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); uma_prealloc(ptezone, initial_pvs); /* diff -ruN vendor_sys/./ia64/ia64/vm_machdep.c mbuma/src/sys/./ia64/ia64/vm_machdep.c --- vendor_sys/./ia64/ia64/vm_machdep.c Sun Mar 7 14:57:34 2004 +++ mbuma/src/sys/./ia64/ia64/vm_machdep.c Sat Mar 6 
15:17:31 2004 @@ -104,6 +104,24 @@ SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) /* + * NSFBUFS-related variables and associated sysctls + */ +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +int nsfbufs; +int nsfbufspeak; +int nsfbufsused; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, + "Number of sendfile(2) sf_bufs at peak usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, + "Number of sendfile(2) sf_bufs in use"); + +/* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ @@ -332,6 +350,9 @@ { struct sf_buf *sf_bufs; int i; + + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); SLIST_INIT(&sf_freelist.sf_head); diff -ruN vendor_sys/./kern/kern_malloc.c mbuma/src/sys/./kern/kern_malloc.c --- vendor_sys/./kern/kern_malloc.c Sun Mar 7 14:57:40 2004 +++ mbuma/src/sys/./kern/kern_malloc.c Mon Mar 1 20:21:03 2004 @@ -414,7 +414,6 @@ void *dummy; { u_int8_t indx; - u_long npg; u_long mem_size; int i; @@ -466,17 +465,8 @@ */ init_param3(vm_kmem_size / PAGE_SIZE); - /* - * In mbuf_init(), we set up submaps for mbufs and clusters, in which - * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES), - * respectively. Mathematically, this means that what we do here may - * amount to slightly more address space than we need for the submaps, - * but it never hurts to have an extra page in kmem_map. - */ - npg = (nmbufs*MSIZE + nmbclusters*MCLBYTES + vm_kmem_size) / PAGE_SIZE; - kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, - (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE)); + (vm_offset_t *)&kmemlimit, (vm_size_t)vm_kmem_size); kmem_map->system_map = 1; uma_startup2(); diff -ruN vendor_sys/./kern/kern_proc.c mbuma/src/sys/./kern/kern_proc.c --- vendor_sys/./kern/kern_proc.c Sun Mar 7 14:57:40 2004 +++ mbuma/src/sys/./kern/kern_proc.c Fri Feb 27 18:46:35 2004 @@ -78,7 +78,7 @@ static void orphanpg(struct pgrp *pg); static void pgadjustjobc(struct pgrp *pgrp, int entering); static void pgdelete(struct pgrp *); -static void proc_ctor(void *mem, int size, void *arg); +static int proc_ctor(void *mem, int size, void *arg); static void proc_dtor(void *mem, int size, void *arg); static void proc_init(void *mem, int size); static void proc_fini(void *mem, int size); @@ -132,12 +132,13 @@ /* * Prepare a proc for use. */ -static void +static int proc_ctor(void *mem, int size, void *arg) { struct proc *p; p = (struct proc *)mem; + return 1; } /* diff -ruN vendor_sys/./kern/kern_thread.c mbuma/src/sys/./kern/kern_thread.c --- vendor_sys/./kern/kern_thread.c Sun Mar 7 14:57:42 2004 +++ mbuma/src/sys/./kern/kern_thread.c Wed Mar 3 00:07:26 2004 @@ -135,7 +135,7 @@ /* * Prepare a thread for use. */ -static void +static int thread_ctor(void *mem, int size, void *arg) { struct thread *td; @@ -144,6 +144,7 @@ td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_critnest = 1; + return 1; } /* diff -ruN vendor_sys/./kern/subr_mbuf.c mbuma/src/sys/./kern/subr_mbuf.c --- vendor_sys/./kern/subr_mbuf.c Sun Mar 7 14:57:43 2004 +++ mbuma/src/sys/./kern/subr_mbuf.c Wed Dec 31 19:00:00 1969 @@ -1,1548 +0,0 @@ -/*- - * Copyright (c) 2001, 2002, 2003 - * Bosko Milekic . All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD: src/sys/kern/subr_mbuf.c,v 1.58 2003/12/27 07:52:47 silby Exp $"); - -#include "opt_mac.h" -#include "opt_param.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * mb_alloc: network buffer allocator - * - * XXX: currently, the "low watermark" sysctl is marked read-only as its - * effects are not completely implemented. To be fixed soon. - */ - -/* - * Maximum number of PCPU containers. If you know what you're doing you could - * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your - * system during compilation, and thus prevent kernel structure bloat. - * - * SMP and non-SMP kernels clearly have a different number of possible CPUs, - * but because we cannot assume a dense array of CPUs, we always allocate - * and traverse PCPU containers up to NCPU amount and merely check for - * CPU availability. - */ -#ifdef MBALLOC_NCPU -#define NCPU MBALLOC_NCPU -#else -#define NCPU MAXCPU -#endif - -/*- - * The mbuf allocator is based on Alfred Perlstein's - * "memcache" proof-of-concept allocator which was itself based on - * several well-known SMP-friendly allocators. - * - * The mb_alloc mbuf allocator is a special when compared to other - * general-purpose allocators. Some things to take note of: - * - * Mbufs and mbuf clusters are two different objects. Sometimes we - * will allocate a single mbuf, other times a single cluster, - * other times both. Further, we may sometimes wish to allocate a - * whole chain of mbufs with clusters. This allocator will perform - * the common case of each scenario in one function call (this - * includes constructing or destructing the object) while only - * locking/unlocking the cache once, if it can get away with it. - * The caches consist of pure mbufs and pure clusters; that is - * there are no 'zones' containing mbufs with already pre-hooked - * clusters. 
Since we can allocate both objects atomically anyway, - * we don't bother fragmenting our caches for any particular 'scenarios.' - * - * We allocate from seperate sub-maps of kmem_map, thus imposing - * an ultimate upper-limit on the number of allocatable clusters - * and mbufs and also, since the clusters all come from a - * virtually contiguous region, we can keep reference counters - * for them and "allocate" them purely by indexing into a - * dense refcount vector. - * - * We call out to protocol drain routines (which can be hooked - * into us) when we're low on space. - * - * The mbuf allocator keeps all objects that it allocates in mb_buckets. - * The buckets keep a number of objects (an object can be an mbuf or an - * mbuf cluster) and facilitate moving larger sets of contiguous objects - * from the per-CPU caches to the global cache. The buckets also have - * the added advantage that objects, when migrated from cache to cache, - * are migrated in chunks that keep contiguous objects together, - * minimizing TLB pollution. - * - * The buckets are kept on singly-linked lists called "containers." A container - * is protected by a mutex in order to ensure consistency. The mutex - * itself is allocated separately and attached to the container at boot time, - * thus allowing for certain containers to share the same lock. Per-CPU - * containers for mbufs and mbuf clusters all share the same per-CPU - * lock whereas the global cache containers for these objects share one - * global lock. - */ -struct mb_bucket { - SLIST_ENTRY(mb_bucket) mb_blist; - int mb_owner; - int mb_numfree; - void *mb_free[0]; -}; - -struct mb_container { - SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead; - struct mtx *mc_lock; - int mc_numowner; - u_int mc_starved; - long *mc_types; - u_long *mc_objcount; - u_long *mc_numbucks; -}; - -struct mb_gen_list { - struct mb_container mb_cont; - struct cv mgl_mstarved; -}; - -struct mb_pcpu_list { - struct mb_container mb_cont; -}; - -/* - * Boot-time configurable object counts that will determine the maximum - * number of permitted objects in the mbuf and mcluster cases. In the - * ext counter (nmbcnt) case, it's just an indicator serving to scale - * kmem_map size properly - in other words, we may be allowed to allocate - * more than nmbcnt counters, whereas we will never be allowed to allocate - * more than nmbufs mbufs or nmbclusters mclusters. - * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be - * allocatable by the sfbuf allocator (found in uipc_syscalls.c) - */ -#ifndef NMBCLUSTERS -#define NMBCLUSTERS (1024 + maxusers * 64) -#endif -#ifndef NMBUFS -#define NMBUFS (nmbclusters * 2) -#endif -#ifndef NSFBUFS -#define NSFBUFS (512 + maxusers * 16) -#endif -#ifndef NMBCNTS -#define NMBCNTS (nmbclusters + nsfbufs) -#endif -int nmbufs; -int nmbclusters; -int nmbcnt; -int nsfbufs; -int nsfbufspeak; -int nsfbufsused; - -/* - * Sizes of objects per bucket. There are this size's worth of mbufs - * or clusters in each bucket. Please keep these a power-of-2. - */ -#define MBUF_BUCK_SZ (PAGE_SIZE * 2) -#define CLUST_BUCK_SZ (PAGE_SIZE * 4) - -/* - * Perform sanity checks of tunables declared above. - */ -static void -tunable_mbinit(void *dummy) -{ - - /* - * This has to be done before VM init. 
- */ - nmbclusters = NMBCLUSTERS; - TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); - nmbufs = NMBUFS; - TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); - nsfbufs = NSFBUFS; - TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); - nmbcnt = NMBCNTS; - TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt); - /* Sanity checks */ - if (nmbufs < nmbclusters * 2) - nmbufs = nmbclusters * 2; - if (nmbcnt < nmbclusters + nsfbufs) - nmbcnt = nmbclusters + nsfbufs; -} -SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); - -/* - * The freelist structures and mutex locks. The number statically declared - * here depends on the number of CPUs. - * - * We set up in such a way that all the objects (mbufs, clusters) - * share the same mutex lock. It has been established that we do not benefit - * from different locks for different objects, so we use the same lock, - * regardless of object type. This also allows us to do optimised - * multi-object allocations without dropping the lock in between. - */ -struct mb_lstmngr { - struct mb_gen_list *ml_genlist; - struct mb_pcpu_list *ml_cntlst[NCPU]; - struct mb_bucket **ml_btable; - vm_map_t ml_map; - vm_offset_t ml_mapbase; - vm_offset_t ml_maptop; - int ml_mapfull; - u_int ml_objsize; - u_int ml_objbucks; - u_int *ml_wmhigh; - u_int *ml_wmlow; -}; -static struct mb_lstmngr mb_list_mbuf, mb_list_clust; -static struct mtx mbuf_gen, mbuf_pcpu[NCPU]; -static u_int *cl_refcntmap; - -/* - * Local macros for internal allocator structure manipulations. - */ -#ifdef SMP -#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)] -#else -#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0] -#endif - -#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist - -#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock) - -#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock) - -#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \ - (mb_lst)->ml_cntlst[(num)] - -#define MB_BUCKET_INDX(mb_obj, mb_lst) \ - (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \ - ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize)) - -#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \ -{ \ - struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \ - \ - (mb_bckt)->mb_numfree--; \ - (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \ - (*((mb_lst)->mb_cont.mc_objcount))--; \ - if ((mb_bckt)->mb_numfree == 0) { \ - SLIST_REMOVE_HEAD(_mchd, mb_blist); \ - SLIST_NEXT((mb_bckt), mb_blist) = NULL; \ - (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \ - } \ -} - -#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \ - (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \ - (mb_bckt)->mb_numfree++; \ - (*((mb_lst)->mb_cont.mc_objcount))++; - -#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \ - if ((mb_type) != MT_NOTMBUF) \ - (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num) - -#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \ - if ((mb_type) != MT_NOTMBUF) \ - (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num) - -/* - * Ownership of buckets/containers is represented by integers. The PCPU - * lists range from 0 to NCPU-1. We need a free numerical id for the general - * list (we use NCPU). We also need a non-conflicting free bit to indicate - * that the bucket is free and removed from a container, while not losing - * the bucket's originating container id. We use the highest bit - * for the free marker. 
- */ -#define MB_GENLIST_OWNER (NCPU) -#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1)) - -/* Statistics structures for allocator (per-CPU and general). */ -static struct mbpstat mb_statpcpu[NCPU + 1]; -struct mbstat mbstat; - -/* Sleep time for wait code (in ticks). */ -static int mbuf_wait = 64; - -static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */ -static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */ -static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */ -static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */ - -/* - * Objects exported by sysctl(8). - */ -SYSCTL_DECL(_kern_ipc); -SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RDTUN, &nmbclusters, 0, - "Maximum number of mbuf clusters available"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RDTUN, &nmbufs, 0, - "Maximum number of mbufs available"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RDTUN, &nmbcnt, 0, - "Number used to scale kmem_map to ensure sufficient space for counters"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, - "Maximum number of sendfile(2) sf_bufs available"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, - "Number of sendfile(2) sf_bufs at peak usage"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, - "Number of sendfile(2) sf_bufs in use"); -SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, - "Sleep time of mbuf subsystem wait allocations during exhaustion"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0, - "Upper limit of number of mbufs allowed in each cache"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RD, &mbuf_lowm, 0, - "Lower limit of number of mbufs allowed in each cache"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0, - "Upper limit of number of mbuf clusters allowed in each cache"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RD, &clust_lowm, 0, - "Lower limit of number of mbuf clusters allowed in each cache"); -SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, - "Mbuf general information and statistics"); -SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu, - sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics"); - -/* - * Prototypes of local allocator routines. - */ -static void *mb_alloc_wait(struct mb_lstmngr *, short); -static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, - struct mb_pcpu_list *); -static void mb_reclaim(void); -static void mbuf_init(void *); - -/* - * Initial allocation numbers. Each parameter represents the number of buckets - * of each object that will be placed initially in each PCPU container for - * said object. - */ -#define NMB_MBUF_INIT 2 -#define NMB_CLUST_INIT 8 - -/* - * Internal flags that allow for cache locks to remain "persistent" across - * allocation and free calls. They may be used in combination. - */ -#define MBP_PERSIST 0x1 /* Return with lock still held. */ -#define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */ - -/* - * Initialize the mbuf subsystem. - * - * We sub-divide the kmem_map into several submaps; this way, we don't have - * to worry about artificially limiting the number of mbuf or mbuf cluster - * allocations, due to fear of one type of allocation "stealing" address - * space initially reserved for another. - * - * Set up both the general containers and all the PCPU containers. 
Populate - * the PCPU containers with initial numbers. - */ -MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures"); -SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) -static void -mbuf_init(void *dummy) -{ - struct mb_pcpu_list *pcpu_cnt; - vm_size_t mb_map_size; - int i, j; - - /* - * Set up all the submaps, for each type of object that we deal - * with in this allocator. - */ - mb_map_size = (vm_size_t)(nmbufs * MSIZE); - mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ); - mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / - MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); - if (mb_list_mbuf.ml_btable == NULL) - goto bad; - mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase), - &(mb_list_mbuf.ml_maptop), mb_map_size); - mb_list_mbuf.ml_map->system_map = 1; - mb_list_mbuf.ml_mapfull = 0; - mb_list_mbuf.ml_objsize = MSIZE; - mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / mb_list_mbuf.ml_objsize; - mb_list_mbuf.ml_wmhigh = &mbuf_hiwm; - mb_list_mbuf.ml_wmlow = &mbuf_lowm; - - mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES); - mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ); - mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / - CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); - if (mb_list_clust.ml_btable == NULL) - goto bad; - mb_list_clust.ml_map = kmem_suballoc(kmem_map, - &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop), - mb_map_size); - mb_list_clust.ml_map->system_map = 1; - mb_list_clust.ml_mapfull = 0; - mb_list_clust.ml_objsize = MCLBYTES; - mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / mb_list_clust.ml_objsize; - mb_list_clust.ml_wmhigh = &clust_hiwm; - mb_list_clust.ml_wmlow = &clust_lowm; - - /* - * Allocate required general (global) containers for each object type. - */ - mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, - M_NOWAIT); - mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, - M_NOWAIT); - if ((mb_list_mbuf.ml_genlist == NULL) || - (mb_list_clust.ml_genlist == NULL)) - goto bad; - - /* - * Initialize condition variables and general container mutex locks. - */ - mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, MTX_DEF); - cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved"); - cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved), - "mcluster pool starved"); - mb_list_mbuf.ml_genlist->mb_cont.mc_lock = - mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen; - - /* - * Set up the general containers for each object. - */ - mb_list_mbuf.ml_genlist->mb_cont.mc_numowner = - mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER; - mb_list_mbuf.ml_genlist->mb_cont.mc_starved = - mb_list_clust.ml_genlist->mb_cont.mc_starved = 0; - mb_list_mbuf.ml_genlist->mb_cont.mc_objcount = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree); - mb_list_clust.ml_genlist->mb_cont.mc_objcount = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree); - mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks); - mb_list_clust.ml_genlist->mb_cont.mc_numbucks = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks); - mb_list_mbuf.ml_genlist->mb_cont.mc_types = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]); - mb_list_clust.ml_genlist->mb_cont.mc_types = NULL; - SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead)); - SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead)); - - /* - * Allocate all the required counters for clusters. This makes - * cluster allocations/deallocations much faster. 
- */ - cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT); - if (cl_refcntmap == NULL) - goto bad; - - /* - * Initialize general mbuf statistics. - */ - mbstat.m_msize = mb_list_mbuf.ml_objsize; - mbstat.m_mclbytes = mb_list_clust.ml_objsize; - mbstat.m_minclsize = MINCLSIZE; - mbstat.m_mlen = MLEN; - mbstat.m_mhlen = MHLEN; - mbstat.m_numtypes = MT_NTYPES; - mbstat.m_mbperbuck = mb_list_mbuf.ml_objbucks; - mbstat.m_clperbuck = mb_list_clust.ml_objbucks; - - /* - * Allocate and initialize PCPU containers. - */ - for (i = 0; i < NCPU; i++) { - if (CPU_ABSENT(i)) { - mb_statpcpu[i].mb_active = 0; - continue; - } - - mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), - M_MBUF, M_NOWAIT); - mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), - M_MBUF, M_NOWAIT); - if ((mb_list_mbuf.ml_cntlst[i] == NULL) || - (mb_list_clust.ml_cntlst[i] == NULL)) - goto bad; - - mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, MTX_DEF); - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock = - mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i]; - - mb_statpcpu[i].mb_active = 1; - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner = - mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i; - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved = - mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0; - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount = - &(mb_statpcpu[i].mb_mbfree); - mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount = - &(mb_statpcpu[i].mb_clfree); - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks = - &(mb_statpcpu[i].mb_mbbucks); - mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks = - &(mb_statpcpu[i].mb_clbucks); - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types = - &(mb_statpcpu[i].mb_mbtypes[0]); - mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL; - - SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead)); - SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead)); - - /* - * Perform initial allocations. - */ - pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i); - MB_LOCK_CONT(pcpu_cnt); - for (j = 0; j < NMB_MBUF_INIT; j++) { - if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt) - == NULL) - goto bad; - } - MB_UNLOCK_CONT(pcpu_cnt); - - pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i); - MB_LOCK_CONT(pcpu_cnt); - for (j = 0; j < NMB_CLUST_INIT; j++) { - if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt) - == NULL) - goto bad; - } - MB_UNLOCK_CONT(pcpu_cnt); - } - - return; -bad: - panic("mbuf_init(): failed to initialize mbuf subsystem!"); -} - -/* - * Populate a given mbuf PCPU container with a bucket full of fresh new - * buffers. Return a pointer to the new bucket (already in the container if - * successful), or return NULL on failure. - * - * LOCKING NOTES: - * PCPU container lock must be held when this is called. - * The lock is dropped here so that we can cleanly call the underlying VM - * code. If we fail, we return with no locks held. If we succeed (i.e., return - * non-NULL), we return with the PCPU lock held, ready for allocation from - * the returned bucket. - */ -static struct mb_bucket * -mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst) -{ - struct mb_bucket *bucket; - caddr_t p; - int i; - - MB_UNLOCK_CONT(cnt_lst); - /* - * If our object's (finite) map is starved now (i.e., no more address - * space), bail out now. 
- */ - if (mb_list->ml_mapfull) - return (NULL); - - bucket = malloc(sizeof(struct mb_bucket) + - mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how)); - if (bucket == NULL) - return (NULL); - - p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize * - mb_list->ml_objbucks, MBTOM(how)); - if (p == NULL) { - free(bucket, M_MBUF); - if (how == M_TRYWAIT) - mb_list->ml_mapfull = 1; - return (NULL); - } - - bucket->mb_numfree = 0; - mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket; - for (i = 0; i < mb_list->ml_objbucks; i++) { - bucket->mb_free[i] = p; - bucket->mb_numfree++; - p += mb_list->ml_objsize; - } - - MB_LOCK_CONT(cnt_lst); - bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; - SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); - (*(cnt_lst->mb_cont.mc_numbucks))++; - *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; - - return (bucket); -} - -/* - * Allocate a network buffer. - * The general case is very easy. Complications only arise if our PCPU - * container is empty. Things get worse if the PCPU container is empty, - * the general container is empty, and we've run out of address space - * in our map; then we try to block if we're willing to (M_TRYWAIT). - */ -static -void * -mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist, - int *pers_list) -{ - static int last_report; - struct mb_pcpu_list *cnt_lst; - struct mb_bucket *bucket; - void *m; - -#ifdef INVARIANTS - int flags; - - flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT); - if (flags != M_DONTWAIT && flags != M_TRYWAIT) { - static struct timeval lasterr; - static int curerr; - if (ppsratecheck(&lasterr, &curerr, 1)) { - printf("Bad mbuf alloc flags: %x\n", flags); - backtrace(); - how = M_TRYWAIT; - } - } -#endif - - m = NULL; - if ((persist & MBP_PERSISTENT) != 0) { - /* - * If we're a "persistent" call, then the per-CPU #(pers_list) - * cache lock is already held, and we just need to refer to - * the correct cache descriptor. - */ - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); - } else { - cnt_lst = MB_GET_PCPU_LIST(mb_list); - MB_LOCK_CONT(cnt_lst); - } - - if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { - /* - * This is the easy allocation case. We just grab an object - * from a bucket in the PCPU container. At worst, we - * have just emptied the bucket and so we remove it - * from the container. - */ - MB_GET_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_INC(cnt_lst, type, 1); - - /* If asked to persist, do not drop the lock. */ - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = cnt_lst->mb_cont.mc_numowner; - } else { - struct mb_gen_list *gen_list; - - /* - * This is the less-common more difficult case. We must - * first verify if the general list has anything for us - * and if that also fails, we must allocate a page from - * the map and create a new bucket to place in our PCPU - * container (already locked). If the map is starved then - * we're really in for trouble, as we have to wait on - * the general container's condition variable. - */ - gen_list = MB_GET_GEN_LIST(mb_list); - MB_LOCK_CONT(gen_list); - - if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) - != NULL) { - /* - * Give ownership of the bucket to our CPU's - * container, but only actually put the bucket - * in the container if it doesn't become free - * upon removing an mbuf from it. 
- */ - SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead), - mb_blist); - bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; - (*(gen_list->mb_cont.mc_numbucks))--; - (*(cnt_lst->mb_cont.mc_numbucks))++; - *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree; - bucket->mb_numfree--; - m = bucket->mb_free[(bucket->mb_numfree)]; - if (bucket->mb_numfree == 0) { - SLIST_NEXT(bucket, mb_blist) = NULL; - bucket->mb_owner |= MB_BUCKET_FREE; - } else { - SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), - bucket, mb_blist); - *(cnt_lst->mb_cont.mc_objcount) += - bucket->mb_numfree; - } - MB_UNLOCK_CONT(gen_list); - MB_MBTYPES_INC(cnt_lst, type, 1); - - /* If asked to persist, do not drop the lock. */ - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = cnt_lst->mb_cont.mc_numowner; - } else { - /* - * We'll have to allocate a new page. - */ - MB_UNLOCK_CONT(gen_list); - bucket = mb_pop_cont(mb_list, how, cnt_lst); - if (bucket != NULL) { - MB_GET_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_INC(cnt_lst, type, 1); - - /* If asked to persist, do not drop the lock. */ - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list=cnt_lst->mb_cont.mc_numowner; - } else { - if (how == M_TRYWAIT) { - /* - * Absolute worst-case scenario. - * We block if we're willing to, but - * only after trying to steal from - * other lists. - */ - m = mb_alloc_wait(mb_list, type); - } else { - /* XXX: No consistency. */ - mbstat.m_drops++; - - if (ticks < last_report || - (ticks - last_report) >= hz) { - last_report = ticks; - printf( -"All mbufs or mbuf clusters exhausted, please see tuning(7).\n"); - } - - } - if (m != NULL && (persist & MBP_PERSIST) != 0) { - cnt_lst = MB_GET_PCPU_LIST(mb_list); - MB_LOCK_CONT(cnt_lst); - *pers_list=cnt_lst->mb_cont.mc_numowner; - } - } - } - } - - return (m); -} - -/* - * This is the worst-case scenario called only if we're allocating with - * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf - * by looking in every PCPU container. If we're still unsuccesful, we - * try the general container one last time and possibly block on our - * starved cv. - */ -static void * -mb_alloc_wait(struct mb_lstmngr *mb_list, short type) -{ - struct mb_pcpu_list *cnt_lst; - struct mb_gen_list *gen_list; - struct mb_bucket *bucket; - void *m; - int i, cv_ret; - - /* - * Try to reclaim mbuf-related objects (mbufs, clusters). - */ - mb_reclaim(); - - /* - * Cycle all the PCPU containers. Increment starved counts if found - * empty. - */ - for (i = 0; i < NCPU; i++) { - if (CPU_ABSENT(i)) - continue; - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i); - MB_LOCK_CONT(cnt_lst); - - /* - * If container is non-empty, get a single object from it. - * If empty, increment starved count. - */ - if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != - NULL) { - MB_GET_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_INC(cnt_lst, type, 1); - MB_UNLOCK_CONT(cnt_lst); - mbstat.m_wait++; /* XXX: No consistency. */ - return (m); - } else - cnt_lst->mb_cont.mc_starved++; - - MB_UNLOCK_CONT(cnt_lst); - } - - /* - * We're still here, so that means it's time to get the general - * container lock, check it one more time (now that mb_reclaim() - * has been called) and if we still get nothing, block on the cv. 
- */ - gen_list = MB_GET_GEN_LIST(mb_list); - MB_LOCK_CONT(gen_list); - if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { - MB_GET_OBJECT(m, bucket, gen_list); - MB_MBTYPES_INC(gen_list, type, 1); - MB_UNLOCK_CONT(gen_list); - mbstat.m_wait++; /* XXX: No consistency. */ - return (m); - } - - gen_list->mb_cont.mc_starved++; - cv_ret = cv_timedwait(&(gen_list->mgl_mstarved), - gen_list->mb_cont.mc_lock, mbuf_wait); - gen_list->mb_cont.mc_starved--; - - if ((cv_ret == 0) && - ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) { - MB_GET_OBJECT(m, bucket, gen_list); - MB_MBTYPES_INC(gen_list, type, 1); - mbstat.m_wait++; /* XXX: No consistency. */ - } else { - mbstat.m_drops++; /* XXX: No consistency. */ - m = NULL; - } - - MB_UNLOCK_CONT(gen_list); - - return (m); -} - -/*- - * Free an object to its rightful container. - * In the very general case, this operation is really very easy. - * Complications arise primarily if: - * (a) We've hit the high limit on number of free objects allowed in - * our PCPU container. - * (b) We're in a critical situation where our container has been - * marked 'starved' and we need to issue wakeups on the starved - * condition variable. - * (c) Minor (odd) cases: our bucket has migrated while we were - * waiting for the lock; our bucket is in the general container; - * our bucket is empty. - */ -static -void -mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist, - int *pers_list) -{ - struct mb_pcpu_list *cnt_lst; - struct mb_gen_list *gen_list; - struct mb_bucket *bucket; - u_int owner; - - bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)]; - - /* - * Make sure that if after we lock the bucket's present container the - * bucket has migrated, that we drop the lock and get the new one. - */ -retry_lock: - owner = bucket->mb_owner & ~MB_BUCKET_FREE; - switch (owner) { - case MB_GENLIST_OWNER: - gen_list = MB_GET_GEN_LIST(mb_list); - if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { - if (*pers_list != MB_GENLIST_OWNER) { - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, - *pers_list); - MB_UNLOCK_CONT(cnt_lst); - MB_LOCK_CONT(gen_list); - } - } else { - MB_LOCK_CONT(gen_list); - } - if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { - MB_UNLOCK_CONT(gen_list); - *pers_list = -1; - goto retry_lock; - } - - /* - * If we're intended for the general container, this is - * real easy: no migrating required. The only `bogon' - * is that we're now contending with all the threads - * dealing with the general list, but this is expected. 
- */ - MB_PUT_OBJECT(m, bucket, gen_list); - MB_MBTYPES_DEC(gen_list, type, 1); - if (bucket->mb_owner & MB_BUCKET_FREE) { - SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), - bucket, mb_blist); - bucket->mb_owner = MB_GENLIST_OWNER; - } - if (gen_list->mb_cont.mc_starved > 0) - cv_signal(&(gen_list->mgl_mstarved)); - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(gen_list); - else - *pers_list = MB_GENLIST_OWNER; - break; - - default: - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner); - if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { - if (*pers_list == MB_GENLIST_OWNER) { - gen_list = MB_GET_GEN_LIST(mb_list); - MB_UNLOCK_CONT(gen_list); - MB_LOCK_CONT(cnt_lst); - } else { - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, - *pers_list); - owner = *pers_list; - } - } else { - MB_LOCK_CONT(cnt_lst); - } - if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { - MB_UNLOCK_CONT(cnt_lst); - *pers_list = -1; - goto retry_lock; - } - - MB_PUT_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_DEC(cnt_lst, type, 1); - if ((*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) || - (cnt_lst->mb_cont.mc_starved > 0)) { - /* - * We've hit the high limit of allowed numbers of mbufs - * on this PCPU list or we've been flagged that we need - * to transfer a bucket over to the general cache. - * We must now migrate a bucket over to the general - * container. - */ - gen_list = MB_GET_GEN_LIST(mb_list); - MB_LOCK_CONT(gen_list); - if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) { - bucket = - SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead)); - SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead), - mb_blist); - } - SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), - bucket, mb_blist); - bucket->mb_owner = MB_GENLIST_OWNER; - *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree; - *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree; - (*(cnt_lst->mb_cont.mc_numbucks))--; - (*(gen_list->mb_cont.mc_numbucks))++; - - /* - * While we're at it, transfer some of the mbtypes - * "count load" onto the general list's mbtypes - * array, seeing as how we're moving the bucket - * there now, meaning that the freeing of objects - * there will now decrement the _general list's_ - * mbtypes counters, and no longer our PCPU list's - * mbtypes counters. We do this for the type presently - * being freed in an effort to keep the mbtypes - * counters approximately balanced across all lists. - */ - MB_MBTYPES_DEC(cnt_lst, type, - mb_list->ml_objbucks - bucket->mb_numfree); - MB_MBTYPES_INC(gen_list, type, - mb_list->ml_objbucks - bucket->mb_numfree); - - if (cnt_lst->mb_cont.mc_starved > 0) { - /* - * Determine whether or not to keep - * transferring buckets to the general list - * or whether we've transferred enough already. - * The thread that is blocked may end up waking - * up in the meantime, but transferring an - * extra bucket in a constrained situation - * is not so bad, as we're likely to need - * it soon anyway. 
- */ - if (gen_list->mb_cont.mc_starved > 0) { - cnt_lst->mb_cont.mc_starved--; - cv_signal(&(gen_list->mgl_mstarved)); - } else - cnt_lst->mb_cont.mc_starved = 0; - } - MB_UNLOCK_CONT(gen_list); - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = owner; - break; - } - - if (bucket->mb_owner & MB_BUCKET_FREE) { - SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), - bucket, mb_blist); - bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; - } - - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = owner; - break; - } -} - -/* - * Drain protocols in hopes to free up some resources. - * - * LOCKING NOTES: - * No locks should be held when this is called. The drain routines have to - * presently acquire some locks which raises the possibility of lock order - * violation if we're holding any mutex if that mutex is acquired in reverse - * order relative to one of the locks in the drain routines. - */ -static void -mb_reclaim(void) -{ - struct domain *dp; - struct protosw *pr; - - WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, - "mb_reclaim()"); - - mbstat.m_drain++; /* XXX: No consistency. */ - - for (dp = domains; dp != NULL; dp = dp->dom_next) - for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) - if (pr->pr_drain != NULL) - (*pr->pr_drain)(); -} - -/****************************************************************************** - * Internal setup macros. - */ - -#define _mb_setup(m, type) do { \ - (m)->m_type = (type); \ - (m)->m_next = NULL; \ - (m)->m_nextpkt = NULL; \ - (m)->m_data = (m)->m_dat; \ - (m)->m_flags = 0; \ -} while (0) - -#define _mbhdr_setup(m, type) do { \ - (m)->m_type = (type); \ - (m)->m_next = NULL; \ - (m)->m_nextpkt = NULL; \ - (m)->m_data = (m)->m_pktdat; \ - (m)->m_flags = M_PKTHDR; \ - (m)->m_pkthdr.rcvif = NULL; \ - (m)->m_pkthdr.csum_flags = 0; \ - SLIST_INIT(&(m)->m_pkthdr.tags); \ -} while (0) - -#define _mcl_setup(m) do { \ - (m)->m_data = (m)->m_ext.ext_buf; \ - (m)->m_flags |= M_EXT; \ - (m)->m_ext.ext_free = NULL; \ - (m)->m_ext.ext_args = NULL; \ - (m)->m_ext.ext_size = MCLBYTES; \ - (m)->m_ext.ext_type = EXT_CLUSTER; \ -} while (0) - -#define _mext_init_ref(m, ref) do { \ - (m)->m_ext.ref_cnt = ((ref) == NULL) ? \ - malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \ - if ((m)->m_ext.ref_cnt != NULL) { \ - *((m)->m_ext.ref_cnt) = 0; \ - MEXT_ADD_REF((m)); \ - } \ -} while (0) - -#define cl2ref(cl) \ - (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT) - -#define _mext_dealloc_ref(m) \ - if ((m)->m_ext.ext_type != EXT_EXTREF) \ - free((m)->m_ext.ref_cnt, M_MBUF) - -/****************************************************************************** - * Internal routines. - * - * Because mb_alloc() and mb_free() are inlines (to keep the common - * cases down to a maximum of one function call), below are a few - * routines used only internally for the sole purpose of making certain - * functions smaller. - * - * - _mext_free(): frees associated storage when the ref. count is - * exactly one and we're freeing. - * - * - _mgetm_internal(): common "persistent-lock" routine that allocates - * an mbuf and a cluster in one shot, but where the lock is already - * held coming in (which is what makes it different from the exported - * m_getcl()). The lock is dropped when done. This is used by m_getm() - * and, therefore, is very m_getm()-specific. 
- */ -static struct mbuf *_mgetm_internal(int, short, short, int); - -void -_mext_free(struct mbuf *mb) -{ - - if (mb->m_ext.ext_type == EXT_CLUSTER) { - mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, - 0, NULL); - } else { - (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); - _mext_dealloc_ref(mb); - } -} - -static struct mbuf * -_mgetm_internal(int how, short type, short persist, int cchnum) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum); - if (mb == NULL) - return NULL; - _mb_setup(mb, type); - - if ((persist & MBP_PERSIST) != 0) { - mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, - how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); - if (mb->m_ext.ext_buf == NULL) { - (void)m_free(mb); - mb = NULL; - } - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); - } - return (mb); -} - -/****************************************************************************** - * Exported buffer allocation and de-allocation routines. - */ - -/* - * Allocate and return a single (normal) mbuf. NULL is returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_get(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) - _mb_setup(mb, type); - return (mb); -} - -/* - * Allocate a given length worth of mbufs and/or clusters (whatever fits - * best) and return a pointer to the top of the allocated chain. If an - * existing mbuf chain is provided, then we will append the new chain - * to the existing one but still return the top of the newly allocated - * chain. NULL is returned on failure, in which case the [optional] - * provided chain is left untouched, and any memory already allocated - * is freed. - * - * Arguments: - * - m: existing chain to which to append new chain (optional). - * - len: total length of data to append, either in mbufs or clusters - * (we allocate whatever combination yields the best fit). - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_getm(struct mbuf *m, int len, int how, short type) -{ - struct mbuf *mb, *top, *cur, *mtail; - int num, rem, cchnum; - short persist; - int i; - - KASSERT(len >= 0, ("m_getm(): len is < 0")); - - /* If m != NULL, we will append to the end of that chain. */ - if (m != NULL) - for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); - else - mtail = NULL; - - /* - * In the best-case scenario (which should be the common case - * unless we're in a starvation situation), we will be able to - * go through the allocation of all the desired mbufs and clusters - * here without dropping our per-CPU cache lock in between. - */ - num = len / MCLBYTES; - rem = len % MCLBYTES; - persist = 0; - cchnum = -1; - top = cur = NULL; - for (i = 0; i < num; i++) { - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, - MBP_PERSIST | persist, &cchnum); - if (mb == NULL) - goto failed; - _mb_setup(mb, type); - mb->m_len = 0; - - persist = (i != (num - 1) || rem > 0) ? 
MBP_PERSIST : 0; - mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, - how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum); - if (mb->m_ext.ext_buf == NULL) { - (void)m_free(mb); - goto failed; - } - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); - persist = MBP_PERSISTENT; - - if (cur == NULL) - top = cur = mb; - else - cur = (cur->m_next = mb); - } - if (rem > 0) { - if (cchnum >= 0) { - persist = MBP_PERSISTENT; - persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0; - mb = _mgetm_internal(how, type, persist, cchnum); - if (mb == NULL) - goto failed; - } else if (rem > MINCLSIZE) { - mb = m_getcl(how, type, 0); - } else { - mb = m_get(how, type); - } - if (mb != NULL) { - mb->m_len = 0; - if (cur == NULL) - top = mb; - else - cur->m_next = mb; - } else - goto failed; - } - - if (mtail != NULL) - mtail->m_next = top; - return top; -failed: - if (top != NULL) - m_freem(top); - return NULL; -} - -/* - * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_gethdr(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) { - _mbhdr_setup(mb, type); -#ifdef MAC - if (mac_init_mbuf(mb, MBTOM(how)) != 0) { - m_free(mb); - return (NULL); - } -#endif - } - return (mb); -} - -/* - * Allocate and return a single (normal) pre-zero'd mbuf. NULL is - * returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_get_clrd(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) { - _mb_setup(mb, type); - bzero(mtod(mb, caddr_t), MLEN); - } - return (mb); -} - -/* - * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is - * returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_gethdr_clrd(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) { - _mbhdr_setup(mb, type); -#ifdef MAC - if (mac_init_mbuf(mb, MBTOM(how)) != 0) { - m_free(mb); - return (NULL); - } -#endif - bzero(mtod(mb, caddr_t), MHLEN); - } - return (mb); -} - -/* - * Free a single mbuf and any associated storage that it may have attached - * to it. The associated storage may not be immediately freed if its - * reference count is above 1. Returns the next mbuf in the chain following - * the mbuf being freed. - * - * Arguments: - * - mb: the mbuf to free. 
- */ -struct mbuf * -m_free(struct mbuf *mb) -{ - struct mbuf *nb; - int cchnum; - short persist = 0; - -#ifdef INVARIANTS - if (mb->m_flags & M_FREELIST) - panic("m_free detected a mbuf double-free"); - mb->m_flags |= M_FREELIST; -#endif - if ((mb->m_flags & M_PKTHDR) != 0) - m_tag_delete_chain(mb, NULL); - nb = mb->m_next; - if ((mb->m_flags & M_EXT) != 0) { - MEXT_REM_REF(mb); - if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) { - if (mb->m_ext.ext_type == EXT_CLUSTER) { - mb_free(&mb_list_clust, - (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, - MBP_PERSIST, &cchnum); - persist = MBP_PERSISTENT; - } else { - (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, - mb->m_ext.ext_args); - _mext_dealloc_ref(mb); - persist = 0; - } - } - } - mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum); - return (nb); -} - -/* - * Free an entire chain of mbufs and associated external buffers, if - * applicable. Right now, we only optimize a little so that the cache - * lock may be held across a single mbuf+cluster free. Hopefully, - * we'll eventually be holding the lock across more than merely two - * consecutive frees but right now this is hard to implement because of - * things like _mext_dealloc_ref (may do a free()) and atomic ops in the - * loop. - * - * - mb: the mbuf chain to free. - */ -void -m_freem(struct mbuf *mb) -{ - - while (mb != NULL) - mb = m_free(mb); -} - -/* - * Fetch an mbuf with a cluster attached to it. If one of the - * allocations fails, the entire allocation fails. This routine is - * the preferred way of fetching both the mbuf and cluster together, - * as it avoids having to unlock/relock between allocations. Returns - * NULL on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - * - flags: any flags to pass to the mbuf being allocated; if this includes - * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf. - */ -struct mbuf * -m_getcl(int how, short type, int flags) -{ - struct mbuf *mb; - int cchnum; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, - MBP_PERSIST, &cchnum); - if (mb == NULL) - return NULL; - mb->m_type = type; - mb->m_next = NULL; - mb->m_flags = flags; - if ((flags & M_PKTHDR) != 0) { - mb->m_nextpkt = NULL; - mb->m_pkthdr.rcvif = NULL; - mb->m_pkthdr.csum_flags = 0; - SLIST_INIT(&mb->m_pkthdr.tags); - } - - mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, - MT_NOTMBUF, MBP_PERSISTENT, &cchnum); - if (mb->m_ext.ext_buf == NULL) { - (void)m_free(mb); - mb = NULL; - } else { - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); - } -#ifdef MAC - if (flags & M_PKTHDR) { - if (mac_init_mbuf(mb, MBTOM(how)) != 0) { - m_free(mb); - return (NULL); - } - } -#endif - return (mb); -} - -/* - * Fetch a single mbuf cluster and attach it to an existing mbuf. If - * successfull, configures the provided mbuf to have mbuf->m_ext.ext_buf - * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags. - * The M_EXT bit is not set on failure. - * - * Arguments: - * - mb: the existing mbuf to which to attach the allocated cluster. - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. 
- */ -void -m_clget(struct mbuf *mb, int how) -{ - - mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF, - 0, NULL); - if (mb->m_ext.ext_buf != NULL) { - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); - } -} - -/* - * Configure a provided mbuf to refer to the provided external storage - * buffer and setup a reference count for said buffer. If the setting - * up of the reference count fails, the M_EXT bit will not be set. If - * successfull, the M_EXT bit is set in the mbuf's flags. - * - * Arguments: - * - mb: the existing mbuf to which to attach the provided buffer. - * - buf: the address of the provided external storage buffer. - * - size: the size of the provided buffer. - * - freef: a pointer to a routine that is responsible for freeing the - * provided external storage buffer. - * - args: a pointer to an argument structure (of any type) to be passed - * to the provided freef routine (may be NULL). - * - flags: any other flags to be passed to the provided mbuf. - * - type: the type that the external storage buffer should be labeled with. - */ -void -m_extadd(struct mbuf *mb, caddr_t buf, u_int size, - void (*freef)(void *, void *), void *args, int flags, int type) -{ - u_int *ref_cnt = NULL; - - if (type == EXT_CLUSTER) - ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]; - else if (type == EXT_EXTREF) - ref_cnt = mb->m_ext.ref_cnt; - _mext_init_ref(mb, ref_cnt); - if (mb->m_ext.ref_cnt != NULL) { - mb->m_flags |= (M_EXT | flags); - mb->m_ext.ext_buf = buf; - mb->m_data = mb->m_ext.ext_buf; - mb->m_ext.ext_size = size; - mb->m_ext.ext_free = freef; - mb->m_ext.ext_args = args; - mb->m_ext.ext_type = type; - } -} - -/* - * Change type of provided mbuf. This is a relatively expensive operation - * (due to the cost of statistics manipulations) and should be avoided, where - * possible. - * - * Arguments: - * - mb: the provided mbuf for which the type needs to be changed. - * - new_type: the new type to change the mbuf to. - */ -void -m_chtype(struct mbuf *mb, short new_type) -{ - struct mb_gen_list *gen_list; - - gen_list = MB_GET_GEN_LIST(&mb_list_mbuf); - MB_LOCK_CONT(gen_list); - MB_MBTYPES_DEC(gen_list, mb->m_type, 1); - MB_MBTYPES_INC(gen_list, new_type, 1); - MB_UNLOCK_CONT(gen_list); - mb->m_type = new_type; -} diff -ruN vendor_sys/./kern/sys_pipe.c mbuma/src/sys/./kern/sys_pipe.c --- vendor_sys/./kern/sys_pipe.c Sun Mar 7 14:57:44 2004 +++ mbuma/src/sys/./kern/sys_pipe.c Fri Feb 27 20:19:16 2004 @@ -191,7 +191,7 @@ #endif static int pipespace(struct pipe *cpipe, int size); -static void pipe_zone_ctor(void *mem, int size, void *arg); +static int pipe_zone_ctor(void *mem, int size, void *arg); static void pipe_zone_dtor(void *mem, int size, void *arg); static void pipe_zone_init(void *mem, int size); static void pipe_zone_fini(void *mem, int size); @@ -210,7 +210,7 @@ KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); } -static void +static int pipe_zone_ctor(void *mem, int size, void *arg) { struct pipepair *pp; @@ -256,6 +256,7 @@ pp->pp_label = NULL; atomic_add_int(&amountpipes, 2); + return 1; } static void diff -ruN vendor_sys/./kern/uipc_mbuf.c mbuma/src/sys/./kern/uipc_mbuf.c --- vendor_sys/./kern/uipc_mbuf.c Sun Mar 7 14:57:45 2004 +++ mbuma/src/sys/./kern/uipc_mbuf.c Fri Mar 5 22:17:47 2004 @@ -90,6 +90,157 @@ #endif /* + * Malloc-type for external ext_buf ref counts. 
+ */ +MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts"); + +/* + * Allocate a given length worth of mbufs and/or clusters (whatever fits + * best) and return a pointer to the top of the allocated chain. If an + * existing mbuf chain is provided, then we will append the new chain + * to the existing one but still return the top of the newly allocated + * chain. + */ +struct mbuf * +m_getm(struct mbuf *m, int len, int how, short type) +{ + struct mbuf *mb, *top, *cur, *mtail; + int num, rem; + int i; + + KASSERT(len >= 0, ("m_getm(): len is < 0")); + + /* If m != NULL, we will append to the end of that chain. */ + if (m != NULL) + for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); + else + mtail = NULL; + + /* + * Calculate how many mbufs+clusters ("packets") we need and how much + * leftover there is after that and allocate the first mbuf+cluster + * if required. + */ + num = len / MCLBYTES; + rem = len % MCLBYTES; + top = cur = NULL; + if (num > 0) { + if ((top = cur = m_getcl(how, type, 0)) == NULL) + goto failed; + } + num--; + top->m_len = 0; + + for (i = 0; i < num; i++) { + mb = m_getcl(how, type, 0); + if (mb == NULL) + goto failed; + mb->m_len = 0; + cur = (cur->m_next = mb); + } + if (rem > 0) { + mb = (rem > MINCLSIZE) ? + m_getcl(how, type, 0) : m_get(how, type); + if (mb == NULL) + goto failed; + mb->m_len = 0; + if (cur == NULL) + top = mb; + else + cur->m_next = mb; + } + + if (mtail != NULL) + mtail->m_next = top; + return top; +failed: + if (top != NULL) + m_freem(top); + return NULL; +} + +/* + * Free an entire chain of mbufs and associated external buffers, if + * applicable. + */ +void +m_freem(struct mbuf *mb) +{ + + while (mb != NULL) + mb = m_free(mb); +} + +/*- + * Configure a provided mbuf to refer to the provided external storage + * buffer and setup a reference count for said buffer. If the setting + * up of the reference count fails, the M_EXT bit will not be set. If + * successfull, the M_EXT bit is set in the mbuf's flags. + * + * Arguments: + * mb The existing mbuf to which to attach the provided buffer. + * buf The address of the provided external storage buffer. + * size The size of the provided buffer. + * freef A pointer to a routine that is responsible for freeing the + * provided external storage buffer. + * args A pointer to an argument structure (of any type) to be passed + * to the provided freef routine (may be NULL). + * flags Any other flags to be passed to the provided mbuf. + * type The type that the external storage buffer should be + * labeled with. + * + * Returns: + * Nothing. + */ +void +m_extadd(struct mbuf *mb, caddr_t buf, u_int size, + void (*freef)(void *, void *), void *args, int flags, int type) +{ + u_int *ref_cnt = NULL; + + if (type == EXT_CLUSTER) + ref_cnt = (u_int *)find_refcnt(zone_clust, + mb->m_ext.ext_buf); + else if (type == EXT_EXTREF) + ref_cnt = mb->m_ext.ref_cnt; + mb->m_ext.ref_cnt = (ref_cnt == NULL) ? + malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt; + if (mb->m_ext.ref_cnt != NULL) { + *(mb->m_ext.ref_cnt) = 0; + MEXT_ADD_REF(mb); + mb->m_flags |= (M_EXT | flags); + mb->m_ext.ext_buf = buf; + mb->m_data = mb->m_ext.ext_buf; + mb->m_ext.ext_size = size; + mb->m_ext.ext_free = freef; + mb->m_ext.ext_args = args; + mb->m_ext.ext_type = type; + } +} + +/* + * Non-directly-exported function to clean up after mbufs with M_EXT + * storage attached to them if the reference count hits 0. 
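+ *
+ * [Editor's note: illustrative sketch, not part of the patch.  The new
+ * inline m_free() in sys/mbuf.h is the expected caller; it dispatches
+ * roughly as follows:
+ *
+ *	if (m->m_flags & M_EXT)
+ *		mb_free_ext(m);
+ *	else
+ *		uma_zfree(zone_mbuf, m);
+ *
+ * so mb_free_ext() only ever sees mbufs that carry external storage.]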
+ */ +void +mb_free_ext(struct mbuf *m) +{ + + MEXT_REM_REF(m); + if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) { + if (m->m_ext.ext_type == EXT_CLUSTER) { + uma_kfree(keg_packet, m); + return; + } + (*(m->m_ext.ext_free))(m->m_ext.ext_buf, + m->m_ext.ext_args); + if (m->m_ext.ext_type != EXT_EXTREF) + free(m->m_ext.ref_cnt, M_MBUF); + } + uma_zfree(zone_mbuf, m); +} + +/* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ diff -ruN vendor_sys/./kern/uipc_mbuf2.c mbuma/src/sys/./kern/uipc_mbuf2.c --- vendor_sys/./kern/uipc_mbuf2.c Sun Mar 7 14:57:45 2004 +++ mbuma/src/sys/./kern/uipc_mbuf2.c Mon Feb 16 19:22:44 2004 @@ -234,14 +234,10 @@ * now, we need to do the hard way. don't m_copy as there's no room * on both end. */ - MGET(o, M_DONTWAIT, m->m_type); - if (o && len > MLEN) { - MCLGET(o, M_DONTWAIT); - if ((o->m_flags & M_EXT) == 0) { - m_free(o); - o = NULL; - } - } + if (len > MLEN) + o = m_getcl(M_DONTWAIT, m->m_type, 0); + else + o = m_get(M_DONTWAIT, m->m_type); if (!o) { m_freem(m); return NULL; /* ENOBUFS */ @@ -278,29 +274,27 @@ m_dup1(struct mbuf *m, int off, int len, int wait) { struct mbuf *n; - int l; int copyhdr; if (len > MCLBYTES) return NULL; - if (off == 0 && (m->m_flags & M_PKTHDR) != 0) { + if (off == 0 && (m->m_flags & M_PKTHDR) != 0) copyhdr = 1; - MGETHDR(n, wait, m->m_type); - l = MHLEN; - } else { + else copyhdr = 0; - MGET(n, wait, m->m_type); - l = MLEN; - } - if (n && len > l) { - MCLGET(n, wait); - if ((n->m_flags & M_EXT) == 0) { - m_free(n); - n = NULL; - } + if (len >= MINCLSIZE) { + if (copyhdr == 1) + n = m_getcl(wait, m->m_type, M_PKTHDR); + else + n = m_getcl(wait, m->m_type, 0); + } else { + if (copyhdr == 1) + n = m_gethdr(wait, m->m_type); + else + n = m_get(wait, m->m_type); } if (!n) - return NULL; + return NULL; /* ENOBUFS */ if (copyhdr && !m_dup_pkthdr(n, m, wait)) { m_free(n); diff -ruN vendor_sys/./net/rtsock.c mbuma/src/sys/./net/rtsock.c --- vendor_sys/./net/rtsock.c Sun Mar 7 14:58:02 2004 +++ mbuma/src/sys/./net/rtsock.c Mon Feb 16 18:49:07 2004 @@ -637,16 +637,13 @@ } if (len > MCLBYTES) panic("rt_msg1"); - m = m_gethdr(M_DONTWAIT, MT_DATA); - if (m && len > MHLEN) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - m = NULL; - } - } - if (m == 0) - return (m); + if (len > MHLEN) + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + else + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return NULL; /* ENOBUFS */ + m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = 0; rtm = mtod(m, struct rt_msghdr *); diff -ruN vendor_sys/./opencrypto/crypto.c mbuma/src/sys/./opencrypto/crypto.c --- vendor_sys/./opencrypto/crypto.c Sun Mar 7 14:58:38 2004 +++ mbuma/src/sys/./opencrypto/crypto.c Fri Mar 5 20:47:47 2004 @@ -127,10 +127,10 @@ mtx_init(&crypto_ret_q_mtx, "crypto", "crypto return queues", MTX_DEF); cryptop_zone = uma_zcreate("cryptop", sizeof (struct cryptop), - 0, 0, 0, 0, + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cryptodesc_zone = uma_zcreate("cryptodesc", sizeof (struct cryptodesc), - 0, 0, 0, 0, + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); if (cryptodesc_zone == NULL || cryptop_zone == NULL) { printf("crypto_init: cannot setup crypto zones\n"); diff -ruN vendor_sys/./powerpc/powerpc/vm_machdep.c mbuma/src/sys/./powerpc/powerpc/vm_machdep.c --- vendor_sys/./powerpc/powerpc/vm_machdep.c Sun Mar 7 14:58:55 2004 +++ mbuma/src/sys/./powerpc/powerpc/vm_machdep.c Sat Mar 6 15:17:31 2004 @@ -107,6 +107,24 @@ SYSINIT(sock_sf, SI_SUB_MBUF, 
SI_ORDER_ANY, sf_buf_init, NULL) /* + * NSFBUFS-related variables and associated sysctls + */ +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +int nsfbufs; +int nsfbufspeak; +int nsfbufsused; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, + "Number of sendfile(2) sf_bufs at peak usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, + "Number of sendfile(2) sf_bufs in use"); + +/* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ @@ -244,6 +262,9 @@ struct sf_buf *sf_bufs; vm_offset_t sf_base; int i; + + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); SLIST_INIT(&sf_freelist.sf_head); diff -ruN vendor_sys/./sparc64/sparc64/vm_machdep.c mbuma/src/sys/./sparc64/sparc64/vm_machdep.c --- vendor_sys/./sparc64/sparc64/vm_machdep.c Sun Mar 7 14:59:02 2004 +++ mbuma/src/sys/./sparc64/sparc64/vm_machdep.c Sat Mar 6 15:17:32 2004 @@ -91,6 +91,24 @@ SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) /* + * NSFBUFS-related variables and associated sysctls + */ +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +int nsfbufs; +int nsfbufspeak; +int nsfbufsused; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, + "Number of sendfile(2) sf_bufs at peak usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, + "Number of sendfile(2) sf_bufs in use"); + +/* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ @@ -371,6 +389,9 @@ struct sf_buf *sf_bufs; vm_offset_t sf_base; int i; + + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); SLIST_INIT(&sf_freelist.sf_head); diff -ruN vendor_sys/./sys/mbuf.h mbuma/src/sys/./sys/mbuf.h --- vendor_sys/./sys/mbuf.h Sun Mar 7 14:59:05 2004 +++ mbuma/src/sys/./sys/mbuf.h Fri Mar 5 22:17:47 2004 @@ -31,13 +31,17 @@ * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 - * $FreeBSD: src/sys/sys/mbuf.h,v 1.139 2004/02/26 03:53:54 mlaier Exp $ + * $FreeBSD: src/sys/sys/mbuf.h,v 1.137 2004/02/18 00:04:52 mlaier Exp $ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ +/* XXX: These includes suck. Sorry! */ +#include #include +#include +#include /* * Mbufs are of a single size, MSIZE (sys/param.h), which @@ -61,6 +65,16 @@ */ #define mtod(m, t) ((t)((m)->m_data)) #define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1))) + +/* + * Argument structure passed to UMA routines during mbuf and packet + * allocations. + */ +struct mb_args { + int flags; /* Flags for mbuf being allocated */ + int how; /* How to allocate: M_WAITOK or M_DONTWAIT */ + short type; /* Type of mbuf being allocated */ +}; #endif /* _KERNEL */ /* @@ -227,24 +241,7 @@ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ /* - * Mbuf and cluster allocation statistics PCPU structure. 
- */ -struct mbpstat { - u_long mb_mbfree; - u_long mb_mbbucks; - u_long mb_clfree; - u_long mb_clbucks; - long mb_mbtypes[MT_NTYPES]; - short mb_active; -}; - -/* * General mbuf allocator statistics structure. - * XXX: Modifications of these are not protected by any mutex locks nor by - * any atomic() manipulations. As a result, we may occasionally lose - * a count or two. Luckily, not all of these fields are modified at all - * and remain static, and those that are manipulated are only manipulated - * in failure situations, which do not occur (hopefully) very often. */ struct mbstat { u_long m_drops; /* times failed to allocate */ @@ -257,8 +254,6 @@ u_long m_minclsize; /* min length of data to allocate a cluster */ u_long m_mlen; /* length of data in an mbuf */ u_long m_mhlen; /* length of data in a header mbuf */ - u_int m_mbperbuck; /* number of mbufs per "bucket" */ - u_int m_clperbuck; /* number of clusters per "bucket" */ /* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array: */ short m_numtypes; /* XXX: Sendfile stats should eventually move to their own struct */ @@ -272,11 +267,13 @@ * M_DONTWAIT means "don't block if nothing is available" whereas * M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is * available." + * + * XXX: This is a mess. */ -#define M_DONTWAIT 0x4 /* don't conflict with M_NOWAIT */ -#define M_TRYWAIT 0x8 /* or M_WAITOK */ -#define M_WAIT M_TRYWAIT /* XXX: deprecated */ -#define MBTOM(how) ((how) & M_TRYWAIT ? M_WAITOK : M_NOWAIT) +#define MBTOM(how) (how) +#define M_DONTWAIT M_NOWAIT +#define M_TRYWAIT M_WAITOK +#define M_WAIT M_WAITOK #ifdef _KERNEL /*- @@ -300,12 +297,99 @@ #define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1) /* + * Network buffer allocation API + * + * The rest of it is defined in kern/subr_mbuf.c + */ + +extern uma_zone_t zone_mbuf; +extern uma_zone_t zone_clust; +extern uma_keg_t keg_packet; + +/* XXX */ +#define m_get_clrd m_get +#define m_gethdr_clrd m_gethdr +/* XXX */ +static __inline struct mbuf *m_get(int how, short type); +static __inline struct mbuf *m_gethdr(int how, short type); +static __inline struct mbuf *m_getcl(int how, short type, int flags); +static __inline struct mbuf *m_free(struct mbuf *m); +static __inline void m_clget(struct mbuf *m, int how); +static __inline void m_chtype(struct mbuf *m, short new_type); +void mb_free_ext(struct mbuf *); + +static __inline +struct mbuf * +m_get(int how, short type) +{ + struct mb_args args; + + args.flags = 0; + args.how = how; + args.type = type; + return (uma_zalloc_arg(zone_mbuf, &args, how)); +} + +static __inline +struct mbuf * +m_gethdr(int how, short type) +{ + struct mb_args args; + + args.flags = M_PKTHDR; + args.how = how; + args.type = type; + return (uma_zalloc_arg(zone_mbuf, &args, how)); +} + +static __inline +struct mbuf * +m_getcl(int how, short type, int flags) +{ + struct mb_args args; + + args.flags = flags; + args.how = how; + args.type = type; + return (uma_kalloc_arg(keg_packet, &args, how)); +} + +static __inline +struct mbuf * +m_free(struct mbuf *m) +{ + struct mbuf *n = m->m_next; + +#ifdef INVARIANTS + m->m_flags |= M_FREELIST; +#endif + if (m->m_flags & M_EXT) + mb_free_ext(m); + else + uma_zfree(zone_mbuf, m); + return n; +} + +static __inline +void +m_clget(struct mbuf *m, int how) +{ + uma_zalloc_arg(zone_clust, m, how); +} + +static __inline +void +m_chtype(struct mbuf *m, short new_type) +{ + m->m_type = new_type; +} + +/* * mbuf, cluster, and external object allocation macros * (for compatibility purposes). 
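 *
 * [Editor's note: illustrative sketch, not part of the patch.  Code that
 * used to pair the macros to get a packet header mbuf with a cluster,
 * e.g.
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		MCLGET(m, M_DONTWAIT);
 *		if ((m->m_flags & M_EXT) == 0) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 *
 * can instead use the single keg-backed allocator:
 *
 *	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *
 * which returns either a fully constructed mbuf+cluster or NULL, as in
 * the rtsock.c and uipc_mbuf2.c conversions elsewhere in this patch.]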
*/ /* NB: M_COPY_PKTHDR is deprecated. Use M_MOVE_PKTHDR or m_dup_pktdr. */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) -#define m_getclr(how, type) m_get_clrd((how), (type)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) @@ -313,23 +397,6 @@ m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type)) /* - * MEXTFREE(m): disassociate (and possibly free) an external object from (m). - * - * If the atomic_cmpset_int() returns 0, then we effectively do nothing - * in terms of "cleaning up" (freeing the ext buf and ref. counter) as - * this means that either there are still references, or another thread - * is taking care of the clean-up. - */ -#define MEXTFREE(m) do { \ - struct mbuf *_mb = (m); \ - \ - MEXT_REM_REF(_mb); \ - if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \ - _mext_free(_mb); \ - _mb->m_flags &= ~M_EXT; \ -} while (0) - -/* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this * can be both the local data payload, or an external buffer area, * depending on whether M_EXT is set). @@ -433,18 +500,13 @@ extern int max_protohdr; /* Largest protocol header */ extern struct mbstat mbstat; /* General mbuf stats/infos */ extern int nmbclusters; /* Maximum number of clusters */ -extern int nmbcnt; /* Scale kmem_map for counter space */ -extern int nmbufs; /* Maximum number of mbufs */ struct uio; -void _mext_free(struct mbuf *); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, unsigned int), void *); void m_cat(struct mbuf *, struct mbuf *); -void m_chtype(struct mbuf *, short); -void m_clget(struct mbuf *, int); void m_extadd(struct mbuf *, caddr_t, u_int, void (*)(void *, void *), void *, int, int); void m_copyback(struct mbuf *, int, int, caddr_t); @@ -459,13 +521,7 @@ int m_dup_pkthdr(struct mbuf *, struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); -struct mbuf *m_free(struct mbuf *); void m_freem(struct mbuf *); -struct mbuf *m_get(int, short); -struct mbuf *m_get_clrd(int, short); -struct mbuf *m_getcl(int, short, int); -struct mbuf *m_gethdr(int, short); -struct mbuf *m_gethdr_clrd(int, short); struct mbuf *m_getm(struct mbuf *, int, int, short); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); diff -ruN vendor_sys/./vm/device_pager.c mbuma/src/sys/./vm/device_pager.c --- vendor_sys/./vm/device_pager.c Sun Mar 7 14:59:11 2004 +++ mbuma/src/sys/./vm/device_pager.c Mon Feb 16 19:23:09 2004 @@ -96,7 +96,7 @@ mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF); fakepg_zone = uma_zcreate("DP fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, - UMA_ZONE_NOFREE|UMA_ZONE_VM); + UMA_ZONE_NOFREE|UMA_ZONE_VM); } /* diff -ruN vendor_sys/./vm/uma.h mbuma/src/sys/./vm/uma.h --- vendor_sys/./vm/uma.h Sun Mar 7 14:59:12 2004 +++ mbuma/src/sys/./vm/uma.h Mon Mar 1 19:27:37 2004 @@ -47,8 +47,12 @@ /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; +struct uma_keg; +/* Opaque type used as a handle to the keg */ +typedef struct uma_keg * uma_keg_t; + /* - * Item constructor + * Zone item constructor * * Arguments: * item A pointer to the memory which has been allocated. @@ -56,16 +60,20 @@ * size The size of the allocated item * * Returns: - * Nothing + * 1 on success, 0 on failure. 
If we fail in the ctor, UMA + * will forcibly return a failed allocation (NULL), but it + * will _not_ destruct the object nor will it free it. This + * is up to the caller (because there may be more than one + * object to free). * * Discussion: * The constructor is called just before the memory is returned * to the user. It may block if necessary. */ -typedef void (*uma_ctor)(void *mem, int size, void *arg); +typedef int (*uma_ctor)(void *mem, int size, void *arg); /* - * Item destructor + * Zone item destructor * * Arguments: * item A pointer to the memory which has been allocated. @@ -82,6 +90,85 @@ */ typedef void (*uma_dtor)(void *mem, int size, void *arg); +/* + * Keg item constructor + * + * Arguments: + * item A pointer to the memory which has been allocated + * arg The arg field passed to uma_kalloc_arg + * + * Returns: + * 1 on success, 0 on failure. If we fail in the ctor, UMA + * will forcibly return a failed allocation, but not destruct + * the object nor free it. The behavior of this ctor is + * essentially the same as the zone one defined above. + */ +typedef int (*uma_kctor)(void *mem, void *arg); + +/* + * Keg item destructor + * + * Arguments: + * item A pointer to the memory which has been allocated + * arg Argument passed through uma_kfree_arg + * + * Returns: + * Nothing. + * + * Discussion: + * Same as above for zone destructor, except doesn't take + * a 'size' argument. Called for every keg item free. + */ +typedef void (*uma_kdtor)(void *mem, void *arg); + +/* + * Keg item back-end allocator + * + * Arguments: + * keg The keg which we populate + * arg Argument passed through uma_kalloc_arg + * count The number of items to allocate at most in total and fill + * the keg bucket with + * cpu The cpu id of the keg bucket which we populate + * flags Flags passed down from uma_kalloc_arg + * + * Returns: + * A pointer to one allocated item if successful, otherwise NULL. + * + * Discussion: + * The backend allocator is called by the uma_keg code when a keg + * is empty. The uma_keg code will decide what the count is so + * the allocator should be able to deal with count > 1. The cpu id + * provided designates which bucket to fill and the cpu lock should NOT + * be held when this is called. The item being allocated is returned + * constructed, the items cached in the bucket may need construction + * at allocation time. + */ +typedef void *(*uma_kballoc)(uma_keg_t keg, void *arg, + int count, int cpu, int flags); + +/* + * Keg item back-end free + * + * Arguments: + * keg The keg which we free from + * mem Pointer to one of the items being freed + * arg Argument passed through uma_kfree_arg + * count The number of items to free at most in total from the keg bucket + * cpu The cpu id of the keg bucket which we free from + * flags Flags passed through uma_kfree_arg + * + * Returns: + * Nothing. + * + * Discussion: + * Will free the item being returned and, if applicable, up to count + * other items from the specified cpu id's keg bucket. The cpu lock + * should NOT be held when this is called. + */ +typedef void (*uma_kbfree)(uma_keg_t keg, void *mem, void *arg, + int count, int cpu); + /* * Item initializer * @@ -135,7 +222,7 @@ */ -/* Function proto types */ +/* Function prototypes */ /* * Create a new uma zone @@ -149,6 +236,9 @@ * init An initializer that sets up the initial state of the memory. * fini A discard function that undoes initialization done by init. * ctor/dtor/init/fini may all be null, see notes above. 
+ * drain A drain function that will be called from zone_drain which + * in turn is called from the pageout daemon that can perform + * user-specified drainage prior to draining the zone * align A bitmask that corisponds to the requested alignment * eg 4 would be 0x3 * flags A set of parameters that control the behavior of the zone @@ -185,6 +275,11 @@ * Use a hash table instead of caching * information in the vm_page. */ +#define UMA_ZONE_REFCNT 0x0200 /* + * Allocate a reference count for + * each item in the zone and store + * the counts in the slab's freelist. + */ /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ @@ -194,6 +289,77 @@ #define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */ #define UMA_ALIGN_CACHE (16 - 1) /* Cache line size align */ +/* Definitions for uma_kcreate flags */ +#define UMA_KFLAG_SOMETHING 0x0001 /* XXX: No flags yet? */ + +/* + * Destroys an empty uma keg. If the keg is not empty uma complains loudly. + * + * Arguments: + * keg The keg we want to destroy. + * + */ + +void uma_kdestroy(uma_keg_t keg); + +/* + * Allocates an item out of a keg. + * + * Arguments: + * keg The keg we are allocating from + * arg This data is passed to the ctor and the backend alloc functions + * flags See sys/malloc.h for available flags, passed down to backend alloc + * + * Returns: + * A non null pointer to an initialized element from the keg is + * guaranteed if the wait flag is M_WAITOK, otherwise a null pointer may be + * returned if the keg is empty and the backend alloc failed. + */ + +void *uma_kalloc_arg(uma_keg_t keg, void *arg, int flags); + +/* + * Allocates an item out of a keg without supplying an argument + * + * This is just a wrapper for uma_kalloc_arg for convenience. + * + */ +static __inline void *uma_kalloc(uma_keg_t keg, int flags); + +static __inline void * +uma_kalloc(uma_keg_t keg, int flags) +{ + return uma_kalloc_arg(keg, NULL, flags); +} + +/* + * Frees an item to a keg. + * + * Arguments: + * keg The keg the item was originally allocated out of + * item The memory to be freed + * arg Argument passed to the destructor and backend free functions + * + * Returns: + * Nothing. + */ + +void uma_kfree_arg(uma_keg_t keg, void *item, void *arg); + +/* + * Frees an item back to a keg without supplying an argument + * + * This is just a wrapper for uma_kfree_arg for convenience. + * + */ +static __inline void uma_kfree(uma_keg_t keg, void *item); + +static __inline void +uma_kfree(uma_keg_t keg, void *item) +{ + uma_kfree_arg(keg, item, NULL); +} + /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * @@ -263,6 +429,16 @@ } /* + * Return a keg item to the specified keg's pcpu bucket. See uma_int.h + */ +int uma_kput(uma_keg_t keg, int cpu, void *item); + +/* + * Grab keg item from specified keg's pcpu bucket. See uma_int.h + */ +void *uma_kget(uma_keg_t keg, int cpu); + +/* * XXX The rest of the prototypes in this header are h0h0 magic for the VM. * If you think you need to use it for a normal zone you're probably incorrect. */ @@ -295,7 +471,27 @@ */ typedef void (*uma_free)(void *item, int size, u_int8_t pflag); - +/* + * Create a new uma keg + * + * Arguments: + * name The text name of the keg for debugging and stats, this memory + * should not be freed until the keg has been deallocated. 
+ * ctor The constructor that is called when the object is allocated + * dtor The destructor that is called when the object is freed + * alloc The backend allocation routine called when the keg is empty + * free The backend free routine called when the keg is full + * flags A set of parameters that control the behavior of the keg + * count The desired per-cpu bucket item count, must be smaller + * than UMA_BUCKET_SIZE (see uma_int.h) + * + * Returns: + * A pointer to a structure which is intended to be opaque to users of + * the interface. + */ +uma_keg_t uma_kcreate(char *name, uma_ctor ctor, uma_dtor dtor, + uma_kballoc alloc, uma_kbfree free, + u_int16_t flags, u_int16_t count); /* * Sets up the uma allocator. (Called by vm_mem_init) @@ -430,5 +626,19 @@ */ void uma_prealloc(uma_zone_t zone, int itemcnt); +/* + * Used to lookup the reference counter allocated for an item + * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones, + * reference counters are allocated for items and stored in + * the underlying slab header. + * + * Arguments: + * zone The UMA_ZONE_REFCNT zone to which the item belongs. + * item The address of the item for which we want a refcnt. + * + * Returns: + * A pointer to a u_int32_t reference counter. + */ +u_int32_t *find_refcnt(uma_zone_t zone, void *item); #endif diff -ruN vendor_sys/./vm/uma_core.c mbuma/src/sys/./vm/uma_core.c --- vendor_sys/./vm/uma_core.c Sun Mar 7 14:59:12 2004 +++ mbuma/src/sys/./vm/uma_core.c Sun Mar 7 14:00:18 2004 @@ -93,6 +93,7 @@ /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; +static uma_zone_t slabrefzone; /* * The initial hash tables come out of this zone so they can be allocated @@ -103,6 +104,11 @@ static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); /* + * Zone that uma_keg structs come from. + */ +static uma_zone_t kegzone; + +/* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; @@ -114,7 +120,7 @@ static struct mtx uma_mtx; /* These are the pcpu cache locks */ -static struct mtx uma_pcpu_mtx[MAXCPU]; +struct mtx uma_pcpu_mtx[MAXCPU]; /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = @@ -169,6 +175,20 @@ uint8_t bucket_size[BUCKET_ZONES]; +/* + * This structure is passed as the keg ctor arg so that kegs can be constructed + * on-the-fly via the UMA ctor functionality. + */ +struct uma_kctor_args { + char *name; + uma_ctor ctor; + uma_dtor dtor; + uma_kballoc alloc; + uma_kbfree free; + u_int16_t flags; + u_int16_t count; +}; + /* Prototypes.. */ static void *obj_alloc(uma_zone_t, int, u_int8_t *, int); @@ -179,7 +199,9 @@ static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); -static void zone_ctor(void *, int, void *); +static int keg_ctor(void *, int, void *); +static void keg_dtor(void *, int, void *); +static int zone_ctor(void *, int, void *); static void zone_dtor(void *, int, void *); static void zero_init(void *, int); static void zone_small_init(uma_zone_t zone); @@ -212,9 +234,31 @@ SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); /* - * This routine checks to see whether or not it's safe to enable buckets. + * Here we lookup the reference count corresponding to the specified + * item (which should have come from a UMA_ZONE_REFCNT zone) and return + * it. The reference count is found in the item's underlying slab header. 
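+ *
+ * [Editor's note: illustrative sketch, not part of the patch.  The mbuf
+ * cluster constructor (mb_ctor_clust) is the typical consumer; it wires
+ * an mbuf's external reference count to the slab-resident counter:
+ *
+ *	m->m_ext.ref_cnt = (u_int *)find_refcnt(zone_clust,
+ *	    m->m_ext.ext_buf);
+ *	*(m->m_ext.ref_cnt) = 1;
+ *
+ * so no separate counter allocation is needed for clusters.]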
*/ +u_int32_t * +find_refcnt(uma_zone_t zone, void *item) +{ + uma_slabrefcnt_t slab; + u_int32_t *refcnt; + u_int8_t *mem; + u_int8_t idx; + + KASSERT((zone->uz_flags & UMA_ZONE_REFCNT) != 0, + ("find_refcnt(): specified zone is not UMA_ZONE_REFCNT")); + mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK)); + slab = (uma_slabrefcnt_t)hash_sfind(&zone->uz_hash, mem); + idx = ((unsigned long)item - (unsigned long)slab->us_data) + / zone->uz_rsize; + refcnt = &(slab->us_freelist[idx].us_refcnt); + return refcnt; +} +/* + * This routine checks to see whether or not it's safe to enable buckets. + */ static void bucket_enable(void) { @@ -515,7 +559,6 @@ * Returns: * Nothing */ - static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { @@ -693,7 +736,7 @@ mem = slab->us_data; if (zone->uz_flags & UMA_ZONE_OFFPAGE) - uma_zfree_internal(slabzone, slab, NULL, 0); + uma_zfree_internal(zone->uz_slabzone, slab, NULL, 0); if (zone->uz_flags & UMA_ZONE_MALLOC) { vm_object_t obj; @@ -729,6 +772,7 @@ slab_zalloc(uma_zone_t zone, int wait) { uma_slab_t slab; /* Starting slab */ + uma_slabrefcnt_t slabref; u_int8_t *mem; u_int8_t flags; int i; @@ -741,7 +785,7 @@ ZONE_UNLOCK(zone); if (zone->uz_flags & UMA_ZONE_OFFPAGE) { - slab = uma_zalloc_internal(slabzone, NULL, wait); + slab = uma_zalloc_internal(zone->uz_slabzone, NULL, wait); if (slab == NULL) { ZONE_LOCK(zone); return NULL; @@ -781,7 +825,13 @@ slab->us_firstfree = 0; slab->us_flags = flags; for (i = 0; i < zone->uz_ipers; i++) - slab->us_freelist[i] = i+1; + slab->us_freelist[i].us_item = i+1; + + if (zone->uz_flags & UMA_ZONE_REFCNT) { + slabref = (uma_slabrefcnt_t)slab; + for (i = 0; i < zone->uz_ipers; i++) + slabref->us_freelist[i].us_refcnt = 0; + } if (zone->uz_init) for (i = 0; i < zone->uz_ipers; i++) @@ -1044,14 +1094,76 @@ } /* + * Keg header ctor. This initializes all fields, etc. + * + * Arguments/Returns follow uma_ctor specifications + * udata Actually uma_kctor_args + */ + +static int +keg_ctor(void *mem, int size, void *udata) +{ + struct uma_kctor_args *arg = udata; + uma_keg_t keg = mem; + int cpu; + + bzero(keg, size); + keg->uk_name = arg->name; + keg->uk_ctor = arg->ctor; + keg->uk_dtor = arg->dtor; + keg->uk_alloc = arg->alloc; + keg->uk_free = arg->free; + keg->uk_flags = arg->flags; + if (arg->count <= BUCKET_MAX) + keg->uk_count = arg->count; + else + keg->uk_count = BUCKET_MAX; + + /* + * Kegs have per cpu _buckets_ (and not caches). We initialize + * them here. + */ + for (cpu = 0; cpu <= mp_maxid; cpu++) + keg->uk_cpu[cpu] = NULL; + for (cpu = 0; cpu <= mp_maxid; cpu++) { + keg->uk_cpu[cpu] = bucket_alloc(keg->uk_count, M_NOWAIT); + if (keg->uk_cpu[cpu] == NULL) { + keg_dtor(NULL, 0, keg); + return 0; + } + } + + return 1; +} + +/* + * Keg header dtor. This frees all data, etc. + * + * Arguments/Returns follow uma_dtor specifications + * udata unused + */ + +static void +keg_dtor(void *arg, int size, void *udata) +{ + uma_keg_t keg = udata; + int cpu; + + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (keg->uk_cpu[cpu] != NULL) + bucket_free(keg->uk_cpu[cpu]); + } +} + +/* * Zone header ctor. This initializes all fields, locks, etc. And inserts * the zone onto the global zone list. 
* * Arguments/Returns follow uma_ctor specifications - * udata Actually uma_zcreat_args + * udata Actually uma_zctor_args */ -static void +static int zone_ctor(void *mem, int size, void *udata) { struct uma_zctor_args *arg = udata; @@ -1071,6 +1183,12 @@ zone->uz_flags = arg->flags; zone->uz_allocf = page_alloc; zone->uz_freef = page_free; + zone->uz_slabzone = NULL; + + if (arg->flags & UMA_ZONE_REFCNT) { + zone->uz_flags |= UMA_ZONE_OFFPAGE; + zone->uz_slabzone = slabrefzone; + } if (arg->flags & UMA_ZONE_ZINIT) zone->uz_init = zero_init; @@ -1078,6 +1196,9 @@ if (arg->flags & UMA_ZONE_VM) zone->uz_flags |= UMA_ZFLAG_CACHEONLY; + if (arg->flags & UMA_ZONE_OFFPAGE) + zone->uz_flags |= UMA_ZONE_HASH; + /* * XXX: * The +1 byte added to uz_size is to account for the byte of @@ -1089,6 +1210,10 @@ zone_large_init(zone); else zone_small_init(zone); + + if (zone->uz_flags & UMA_ZONE_OFFPAGE) + zone->uz_slabzone = slabzone; + /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. @@ -1151,12 +1276,13 @@ * caches. If we're internal, bail out here. */ if (zone->uz_flags & UMA_ZFLAG_INTERNAL) - return; + return 1; if (zone->uz_ipers <= BUCKET_MAX) zone->uz_count = zone->uz_ipers; else zone->uz_count = BUCKET_MAX; + return 1; } /* @@ -1276,6 +1402,20 @@ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); + /* + * We also create a zone for the bigger slabs with reference + * counts in them, to accomodate UMA_ZONE_REFCNT zones. + */ + slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt); + slabsize /= UMA_MAX_WASTE; + slabsize++; /* In case there it's rounded */ + slabsize += 4 * slabsize; /* uma_slab_refcnts are larger */ + slabsize += sizeof(struct uma_slab_refcnt); + slabrefzone = uma_zcreate("UMA Slabs with refcnts", + slabsize, + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); + hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, @@ -1283,6 +1423,10 @@ bucket_init(); + kegzone = uma_zcreate("UMA Kegs", sizeof(struct uma_keg) + + sizeof(uma_bucket_t) * mp_maxid, keg_ctor, keg_dtor, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); + #ifdef UMA_MD_SMALL_ALLOC booted = 1; #endif @@ -1322,10 +1466,90 @@ } /* See uma.h */ +uma_keg_t +uma_kcreate(char *name, uma_ctor ctor, uma_dtor dtor, uma_kballoc alloc, + uma_kbfree free, u_int16_t flags, u_int16_t count) +{ + struct uma_kctor_args args; + + args.name = name; + args.ctor = ctor; + args.dtor = dtor; + args.alloc = alloc; + args.free = free; + args.flags = flags; + args.count = count; + + return (uma_zalloc_internal(kegzone, &args, M_WAITOK)); +} + +/* See uma.h */ +void +uma_kdestroy(uma_keg_t keg) +{ + + uma_zfree_internal(kegzone, keg, NULL, 0); +} + +/* See uma.h */ +void * +uma_kalloc_arg(uma_keg_t keg, void *arg, int flags) +{ + void *item; + int cpu; + + cpu = PCPU_GET(cpuid); + item = uma_kget_internal(keg, cpu); + if (item == NULL) { + /* + * While it is possible to tune the bucket size at keg creation, + * we allocate in units of half the maximum number of items per + * bucket (uk_count), and we free in the same number of units. + * This is to hopefully minimize calls into the underlying zones. 
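+	 *
+	 * [Editor's note: illustrative arithmetic, not part of the patch.
+	 * The packet keg is created with a per-cpu count of 16, so a miss
+	 * here asks the backend for up to 16/2 + 1 == 9 packets: one is
+	 * handed back to the caller and the rest refill this cpu's
+	 * bucket.]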
+ */ + item = keg->uk_alloc(keg, arg, keg->uk_count/2+1, cpu, flags); + return item; + } + + if (keg->uk_ctor) + if (!keg->uk_ctor(item, 0, arg)) + return NULL; + return item; +} + +/* See uma.h */ +void +uma_kfree_arg(uma_keg_t keg, void *item, void *arg) +{ + int cpu; + + if (keg->uk_dtor) + keg->uk_dtor(item, 0, arg); + + cpu = PCPU_GET(cpuid); + if (!uma_kput_internal(keg, cpu, item)) + /* Free half the maximum bucket count back to the zones. */ + keg->uk_free(keg, item, arg, keg->uk_count/2+1, cpu); +} + +/* See uma.h */ +int +uma_kput(uma_keg_t keg, int cpu, void *item) +{ + return uma_kput_internal(keg, cpu, item); +} + +/* See uma.h */ +void * +uma_kget(uma_keg_t keg, int cpu) +{ + return uma_kget_internal(keg, cpu); +} + +/* See uma.h */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int16_t flags) - { struct uma_zctor_args args; @@ -1413,7 +1637,8 @@ #endif CPU_UNLOCK(cpu); if (zone->uz_ctor) - zone->uz_ctor(item, zone->uz_size, udata); + if (!zone->uz_ctor(item, zone->uz_size, udata)) + return NULL; if (flags & M_ZERO) bzero(item, zone->uz_size); return (item); @@ -1568,7 +1793,7 @@ u_int8_t freei; freei = slab->us_firstfree; - slab->us_firstfree = slab->us_freelist[freei]; + slab->us_firstfree = slab->us_freelist[freei].us_item; item = slab->us_data + (zone->uz_rsize * freei); slab->us_freecount--; @@ -1689,7 +1914,8 @@ ZONE_UNLOCK(zone); if (zone->uz_ctor != NULL) - zone->uz_ctor(item, zone->uz_size, udata); + if (!zone->uz_ctor(item, zone->uz_size, udata)) + return NULL; if (flags & M_ZERO) bzero(item, zone->uz_size); @@ -1900,7 +2126,7 @@ uma_dbg_free(zone, slab, item); #endif - slab->us_freelist[freei] = slab->us_firstfree; + slab->us_freelist[freei].us_item = slab->us_firstfree; slab->us_firstfree = freei; slab->us_freecount++; @@ -2020,6 +2246,7 @@ * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ + zone_drain(slabrefzone); zone_drain(slabzone); bucket_zone_drain(); } diff -ruN vendor_sys/./vm/uma_dbg.c mbuma/src/sys/./vm/uma_dbg.c --- vendor_sys/./vm/uma_dbg.c Sun Mar 7 14:59:12 2004 +++ mbuma/src/sys/./vm/uma_dbg.c Sun Feb 29 19:41:28 2004 @@ -56,7 +56,7 @@ * Complies with standard ctor arg/return * */ -void +int trash_ctor(void *mem, int size, void *arg) { int cnt; @@ -68,6 +68,7 @@ if (*p != uma_junk) panic("Memory modified after free %p(%d) val=%x @ %p\n", mem, size, *p, p); + return 1; } /* @@ -118,7 +119,7 @@ * Complies with standard ctor arg/return * */ -void +int mtrash_ctor(void *mem, int size, void *arg) { struct malloc_type **ksp; @@ -137,6 +138,7 @@ panic("Most recently used by %s\n", (*ksp == NULL)? "none" : (*ksp)->ks_shortdesc); } + return 1; } /* @@ -227,7 +229,7 @@ freei = ((unsigned long)item - (unsigned long)slab->us_data) / zone->uz_rsize; - slab->us_freelist[freei] = 255; + slab->us_freelist[freei].us_item = 255; return; } @@ -264,9 +266,9 @@ (freei * zone->uz_rsize) + slab->us_data); } - if (slab->us_freelist[freei] != 255) { + if (slab->us_freelist[freei].us_item != 255) { printf("Slab at %p, freei %d = %d.\n", - slab, freei, slab->us_freelist[freei]); + slab, freei, slab->us_freelist[freei].us_item); panic("Duplicate free of item %p from zone %p(%s)\n", item, zone, zone->uz_name); } @@ -276,5 +278,5 @@ * Until then the count of valid slabs will make sure we don't * accidentally follow this and assume it's a valid index. 
*/ - slab->us_freelist[freei] = 0; + slab->us_freelist[freei].us_item = 0; } diff -ruN vendor_sys/./vm/uma_dbg.h mbuma/src/sys/./vm/uma_dbg.h --- vendor_sys/./vm/uma_dbg.h Sun Mar 7 14:59:12 2004 +++ mbuma/src/sys/./vm/uma_dbg.h Mon Feb 16 18:50:21 2004 @@ -37,13 +37,13 @@ #ifndef VM_UMA_DBG_H #define VM_UMA_DBG_H -void trash_ctor(void *mem, int size, void *arg); +int trash_ctor(void *mem, int size, void *arg); void trash_dtor(void *mem, int size, void *arg); void trash_init(void *mem, int size); void trash_fini(void *mem, int size); /* For use only by malloc */ -void mtrash_ctor(void *mem, int size, void *arg); +int mtrash_ctor(void *mem, int size, void *arg); void mtrash_dtor(void *mem, int size, void *arg); void mtrash_init(void *mem, int size); void mtrash_fini(void *mem, int size); diff -ruN vendor_sys/./vm/uma_int.h mbuma/src/sys/./vm/uma_int.h --- vendor_sys/./vm/uma_int.h Sun Mar 7 14:59:12 2004 +++ mbuma/src/sys/./vm/uma_int.h Sun Feb 29 19:41:28 2004 @@ -41,6 +41,20 @@ * and rsize is the result of that. The zone also stores information for * managing a hash of page addresses that maps pages to uma_slab_t structures * for pages that don't have embedded uma_slab_t's. + * + * Kegs are optional higher-level structures used in cases where we need + * to allocate the same type of object but in different forms (sort of like + * instances of a polymorphic object). In such a scenario, an object "A" may + * sometimes need to be allocated, initialized, and constructed solely in + * the form of "object A," whereas at other times, it may need to be + * initialized and constructed in an entirely different way, possibly with + * an attached auxillary object "B". Thus, object A can appear as "A" or + * as "A+B". Here we typically create a zone for "A", a zone for "B" and a + * keg to cache "A+B" instances. The lower-level zones ("A" and "B") remain + * logically separate and the keg itself is backed by those zones. A good + * example of such a scenario is the mbuf code, where mbufs can appear as + * just mbufs, clusters just as clusters, but where mbufs may also appear + * as mbufs with clusters attached to them. * * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may * be allocated off the page from a special slab zone. The free list within a @@ -137,7 +151,7 @@ /* Page management structure */ /* Sorry for the union, but space efficiency is important */ -struct uma_slab { +struct uma_slab_head { uma_zone_t us_zone; /* Zone we live in */ union { LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */ @@ -148,13 +162,39 @@ u_int8_t us_flags; /* Page flags see uma.h */ u_int8_t us_freecount; /* How many are free? */ u_int8_t us_firstfree; /* First free item index */ - u_int8_t us_freelist[1]; /* Free List (actually larger) */ }; -#define us_link us_type._us_link -#define us_size us_type._us_size +/* The standard slab structure */ +struct uma_slab { + struct uma_slab_head us_head; /* slab header data */ + struct { + u_int8_t us_item; + } us_freelist[1]; /* actual number bigger */ +}; + +/* + * The slab structure for UMA_ZONE_REFCNT zones for whose items we + * maintain reference counters in the slab for. 
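+ *
+ * [Editor's note: illustrative sketch, not part of the patch.  Given an
+ * item from such a zone, find_refcnt() in uma_core.c reaches the counter
+ * roughly as:
+ *
+ *	mem  = (u_int8_t *)((unsigned long)item & ~UMA_SLAB_MASK);
+ *	slab = (uma_slabrefcnt_t)hash_sfind(&zone->uz_hash, mem);
+ *	idx  = ((unsigned long)item - (unsigned long)slab->us_data)
+ *	    / zone->uz_rsize;
+ *	refcnt = &slab->us_freelist[idx].us_refcnt;
+ *
+ * i.e., the counter sits beside the item's freelist entry in this
+ * structure.]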
+ */ +struct uma_slab_refcnt { + struct uma_slab_head us_head; /* slab header data */ + struct { + u_int8_t us_item; + u_int32_t us_refcnt; + } us_freelist[1]; /* actual number bigger */ +}; + +#define us_zone us_head.us_zone +#define us_link us_head.us_type._us_link +#define us_size us_head.us_type._us_size +#define us_hlink us_head.us_hlink +#define us_data us_head.us_data +#define us_flags us_head.us_flags +#define us_freecount us_head.us_freecount +#define us_firstfree us_head.us_firstfree typedef struct uma_slab * uma_slab_t; +typedef struct uma_slab_refcnt * uma_slabrefcnt_t; /* Hash table for freed address -> slab translation */ @@ -188,6 +228,25 @@ typedef struct uma_cache * uma_cache_t; /* + * Keg management structure + */ +struct uma_keg { + char *uk_name; /* Text name of the keg */ + uma_ctor uk_ctor; /* Constructor for each allocation */ + uma_dtor uk_dtor; /* Destructor for each free */ + uma_kballoc uk_alloc; /* Lower-level backend allocator */ + uma_kbfree uk_free; /* Lower-level backend free */ + u_int16_t uk_count; /* Max number allowed in pcpu buckets */ + u_int16_t uk_flags; /* Flags for keg */ + + /* + * Must be last item as the number of pointers required is + * allocated by the init code at runtime according to mp_maxid. + */ + uma_bucket_t uk_cpu[1]; /* Per-cpu buckets */ +}; + +/* * Zone management structure * * TODO: Optimize for cache line size @@ -212,6 +271,7 @@ LIST_HEAD(,uma_bucket) uz_free_bucket; /* Buckets for frees */ u_int32_t uz_size; /* Requested size of each item */ u_int32_t uz_rsize; /* Real size of each item */ + uma_zone_t uz_slabzone; /* The slabzone we use (for OFFPAGE) */ struct uma_hash uz_hash; u_int16_t uz_pgoff; /* Offset to uma_slab struct */ @@ -230,7 +290,7 @@ u_int32_t uz_maxpages; /* Maximum number of pages to alloc */ int uz_recurse; /* Allocation recursion count */ uint16_t uz_fills; /* Outstanding bucket fills */ - uint16_t uz_count; /* Highest value ub_ptr can have */ + uint16_t uz_count; /* Highest value ub_cnt can have */ /* * This HAS to be the last item because we adjust the zone size * based on NCPU and then allocate the space for the zones. @@ -247,6 +307,10 @@ #define UMA_ZFLAG_CACHEONLY 0x8000 /* Don't ask VM for buckets. */ /* Internal prototypes */ +static __inline int uma_kput_internal(uma_keg_t keg, int cpu, void *item) + __always_inline; +static __inline void * uma_kget_internal(uma_keg_t keg, int cpu) + __always_inline; static __inline uma_slab_t hash_sfind(struct uma_hash *hash, u_int8_t *data); void *uma_large_malloc(int size, int wait); void uma_large_free(uma_slab_t slab); @@ -267,6 +331,11 @@ #define ZONE_LOCK(z) mtx_lock(&(z)->uz_lock) #define ZONE_UNLOCK(z) mtx_unlock(&(z)->uz_lock) +/* + * Per-cpu lock framework. + */ +extern struct mtx uma_pcpu_mtx[MAXCPU]; + #define CPU_LOCK_INIT(cpu) \ mtx_init(&uma_pcpu_mtx[(cpu)], "UMA pcpu", "UMA pcpu", \ MTX_DEF | MTX_DUPOK) @@ -276,6 +345,76 @@ #define CPU_UNLOCK(cpu) \ mtx_unlock(&uma_pcpu_mtx[(cpu)]) + +/* + * Puts a single item into the specified keg, but does _not_ call any + * destructor nor does it call any backend routines. Used to merely + * store a single item into a keg from the backend allocation routines + * (uk_alloc) with proper locking. 
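+ *
+ * [Editor's note: illustrative sketch, not part of the patch.  A backend
+ * allocator such as mb_alloc_pack() fills the bucket in a loop shaped
+ * roughly like this (alloc_one() is a stand-in for its uma_zalloc_arg()
+ * calls):
+ *
+ *	while (count > 1) {
+ *		item = alloc_one(arg, flags);
+ *		if (item == NULL)
+ *			break;
+ *		if (!uma_kput(keg, cpu, item))
+ *			return (item);
+ *	}
+ *
+ * i.e., items go into the bucket until uma_kput() reports it full, at
+ * which point the last item is handed straight to the caller.]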
+ * + * Arguments: + * keg The keg to which to store the item into + * cpu The cpu id specifying to which pcpu bucket in the keg to + * store in + * item Pointer to the item which we are to store + * + * Returns: + * 1 if successful or 0 on failure (i.e., keg is full) + */ +static __inline int +uma_kput_internal(uma_keg_t keg, int cpu, void *item) +{ + uma_bucket_t bucket; + int ret; + + CPU_LOCK(cpu); + bucket = keg->uk_cpu[cpu]; + if (bucket->ub_cnt < keg->uk_count) { + KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, + ("uma_kput_internal(): Populating non-free bucket index")); + bucket->ub_bucket[bucket->ub_cnt] = item; + bucket->ub_cnt++; + ret = 1; + } else + ret = 0; + CPU_UNLOCK(cpu); + return ret; +} + +/* + * Takes a single item from the specified keg's pcpu bucket. This does not + * call any constructor nor any backend allocation routine should the + * allocation fail. + * + * Arguments: + * keg The keg from which to grab the item from + * cpu The cpu id of the pcpu bucket in the keg from which we take + * + * Returns: + * A pointer to the grabbed item on success, NULL on + * failure (i.e., empty keg) + */ +static __inline void * +uma_kget_internal(uma_keg_t keg, int cpu) +{ + uma_bucket_t bucket; + void *item; + + item = NULL; + CPU_LOCK(cpu); + bucket = keg->uk_cpu[cpu]; + if (bucket->ub_cnt > 0) { + bucket->ub_cnt--; + item = bucket->ub_bucket[bucket->ub_cnt]; +#ifdef INVARIANTS + bucket->ub_bucket[bucket->ub_cnt] = NULL; +#endif + KASSERT(item != NULL, + ("uma_kget_internal(): Bucket pointer mangled!")); + } + CPU_UNLOCK(cpu); + return item; +} /* * Find a slab within a hash table. This is used for OFFPAGE zones to lookup diff -ruN vendor_sys/./vm/uma_mbuf.c mbuma/src/sys/./vm/uma_mbuf.c --- vendor_sys/./vm/uma_mbuf.c Wed Dec 31 19:00:00 1969 +++ mbuma/src/sys/./vm/uma_mbuf.c Sat Mar 6 16:10:27 2004 @@ -0,0 +1,438 @@ +/*- + * Copyright (c) 2004 + * Bosko Milekic . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_mac.h" +#include "opt_param.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * This is the FreeBSD network buffer allocator. 
It sets up several + * UMA zones for various allocations and also overlays a UMA keg + * over the standard zones. It uses keg optimizations in UMA thus + * attempting to minimize latency associated with allocating and + * freeing certain combinations (e.g., mbuf+cluster) of network + * buffers. + * + * If most users of UMA (i.e., those who define zones along + * with constructors, destructors, and init and fini routines) are + * considered "higher-layer" and UMA itself (uma_core) is considered + * the "UMA lower-layer," then the mbuf allocation framework which + * defines both zones and intermediate uma_kegs is in between. + */ + +int nmbclusters; +struct mbstat mbstat; + +static void +tunable_mbinit(void *dummy) +{ + + /* This has to be done before VM init. */ + nmbclusters = 1024 + maxusers * 64; + TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); +} +SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0, + "Maximum number of mbuf clusters allowed"); +SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, + "Mbuf general information and statistics"); + +/* + * Zones, kegs, and maps from which we allocate. + */ +uma_zone_t zone_mbuf; +uma_zone_t zone_clust; +uma_keg_t keg_packet; + +/* + * Local prototypes. + */ +static int mb_ctor_mbuf(void *, int, void *); +static int mb_ctor_clust(void *, int, void *); +static int mb_ctor_pack(void *, int, void *); +static void *mb_alloc_pack(uma_keg_t, void *, int, int, int); +static void mb_drainkegs(void *); +static void mb_dtor_mbuf(void *, int, void *); +static void mb_free_pack(uma_keg_t, void *, void *, int, int); +static void mb_reclaim(void *); +static void mbuf_init(void *); + +/* + * Initialize the network buffer allocation subsystem. + * + * Returns nothing, will panic on failure. + */ +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) +static void +mbuf_init(void *dummy) +{ + + /* + * Configure UMA zones for mbufs, clusters, and possibly other + * network buffer types. + */ + zone_mbuf = uma_zcreate("mbuf zone", MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, + NULL, NULL, UMA_ALIGN_PTR, 0); + zone_clust = uma_zcreate("mbuf cluster zone", MCLBYTES, mb_ctor_clust, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT); + if ((zone_mbuf == NULL) || (zone_clust == NULL)) + goto bad; + if (nmbclusters > 0) + uma_zone_set_max(zone_clust, nmbclusters); + + /* + * Configure overlayed hot-caches. These are called 'kegs' because + * they can be stacked on top of zones. + */ + keg_packet = uma_kcreate("packet keg", mb_ctor_pack, NULL, + mb_alloc_pack, mb_free_pack, 0, 16); + if (keg_packet == NULL) + goto bad; + + EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); + EVENTHANDLER_REGISTER(vm_lowmem, mb_drainkegs, NULL, EVENTHANDLER_PRI_ANY); + + /* + * [Re]set counters and local statistics knobs. + */ + mbstat.m_drain = 0; + + return; +bad: + panic("mbuf_init(): failed to initialize mbuf subsystem!"); +} + +/* + * Constructor for mbuf zone. + * + * The 'arg' pointer points to a mb_args structure which + * contains call-specific information required to support the + * mbuf allocation API. 
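+ *
+ * [Editor's note: illustrative sketch, not part of the patch.  The inline
+ * allocators in sys/mbuf.h build the mb_args before calling into UMA;
+ * m_gethdr(how, type), for instance, amounts to:
+ *
+ *	struct mb_args args;
+ *
+ *	args.flags = M_PKTHDR;
+ *	args.how = how;
+ *	args.type = type;
+ *	return (uma_zalloc_arg(zone_mbuf, &args, how));
+ *
+ * and that structure is what arrives here as 'arg'.]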
+/*
+ * Constructor for the mbuf zone.
+ *
+ * The 'arg' pointer points to a mb_args structure which
+ * contains call-specific information required to support the
+ * mbuf allocation API.
+ */
+static int
+mb_ctor_mbuf(void *mem, int size, void *arg)
+{
+        struct mbuf *m;
+        struct mb_args *args;
+        int flags;
+        int how;
+        short type;
+
+        m = (struct mbuf *)mem;
+        args = (struct mb_args *)arg;
+        flags = args->flags;
+        how = args->how;
+        type = args->type;
+
+        m->m_type = type;
+        m->m_next = NULL;
+        m->m_nextpkt = NULL;
+        if (flags & M_PKTHDR) {
+                m->m_data = m->m_pktdat;
+                m->m_flags = M_PKTHDR;
+                m->m_pkthdr.rcvif = NULL;
+                m->m_pkthdr.csum_flags = 0;
+                SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+                /* If the label init fails, fail the alloc */
+                if (mac_init_mbuf(m, how) != 0) {
+                        m_free(m);
+                        return 0;
+                }
+#endif
+        } else {
+                m->m_data = m->m_dat;
+                m->m_flags = 0;
+        }
+        return 1;
+}
+
+/*
+ * The mbuf zone destructor.
+ */
+static void
+mb_dtor_mbuf(void *mem, int size, void *arg)
+{
+        struct mbuf *m;
+
+        m = (struct mbuf *)mem;
+        if ((m->m_flags & M_PKTHDR) != 0)
+                m_tag_delete_chain(m, NULL);
+}
+
+/*
+ * The cluster zone constructor.
+ *
+ * Here the 'arg' pointer points to the mbuf for which we
+ * are configuring cluster storage.
+ */
+static int
+mb_ctor_clust(void *mem, int size, void *arg)
+{
+        struct mbuf *m;
+
+        m = (struct mbuf *)arg;
+        m->m_ext.ext_buf = (caddr_t)mem;
+        m->m_data = m->m_ext.ext_buf;
+        m->m_flags |= M_EXT;
+        m->m_ext.ext_free = NULL;
+        m->m_ext.ext_args = NULL;
+        m->m_ext.ext_size = MCLBYTES;
+        m->m_ext.ext_type = EXT_CLUSTER;
+        m->m_ext.ref_cnt = (u_int *)find_refcnt(zone_clust,
+            m->m_ext.ext_buf);
+        *(m->m_ext.ref_cnt) = 1;
+        return 1;
+}
+
+/*
+ * Backend allocation for the overlaid "packet" keg.
+ *
+ * The packet keg is backed by the mbuf and cluster zones.  When
+ * the keg is empty, this routine is called to allocate up to
+ * a maximum of 'count' packets and fill the 'cpu' bucket.
+ * The 'cpu' bucket to be filled should not be locked when this
+ * is called.
+ *
+ * Note that the 'flags' argument is ignored here; the only
+ * allocation flags we use come from the caller-supplied mb_args
+ * structure pointed to by 'arg'.
+ *
+ * We do not call the packet constructor here, as the underlying
+ * objects (mbuf and cluster) are properly constructed via their
+ * zones' constructors.
+ */
+static void *
+mb_alloc_pack(uma_keg_t keg, void *arg,
+    int count, int cpu, int flags)
+{
+        struct mbuf *m;
+
+        /*
+         * We populate our 'cpu' bucket with up to a maximum
+         * of 'count - 1' packets.
+         */
+        while (count > 1) {
+                m = NULL;
+                m = uma_zalloc_arg(zone_mbuf, arg, flags);
+                if (m == NULL)
+                        goto check_avail;
+
+                m->m_ext.ext_buf = uma_zalloc_arg(zone_clust, m, flags);
+                if (m->m_ext.ext_buf == NULL) {
+                        uma_zfree_arg(zone_mbuf, m, NULL);
+                        m = NULL;
+                        goto check_avail;
+                }
+
+                if (!uma_kput(keg, cpu, m))
+                        return m;
+                count--;
+        }
+
+        m = NULL;
+        m = uma_zalloc_arg(zone_mbuf, arg, flags);
+        if (m == NULL)
+                goto check_avail;
+        m->m_ext.ext_buf = uma_zalloc_arg(zone_clust, m, flags);
+        if (m->m_ext.ext_buf == NULL) {
+                uma_zfree_arg(zone_mbuf, m, NULL);
+                m = NULL;
+                goto check_avail;
+        }
+        return m;
+
+check_avail:
+        /*
+         * If we end up grabbing a packet from the keg, we need
+         * to call the packet constructor before returning it.
+         * Packets built from the zones above were never put on
+         * the keg and are already constructed, which is why the
+         * packet constructor is not called in that path.
+         */
+        m = uma_kget(keg, cpu);
+        if (m != NULL)
+                if (!mb_ctor_pack(m, 0, arg))
+                        return NULL;
+        return m;
+}
+
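The cluster and packet constructors above pin the zone-managed reference count returned by find_refcnt() at 1.  As a sketch of how a free path could consume that count, consider the following; it is illustrative only, not the patch's m_free()/m_freem() code, and real code would have to manipulate *ref_cnt with atomic operations rather than a plain decrement.

static void
example_pkt_free(struct mbuf *m)
{

	if ((m->m_flags & M_EXT) != 0 && m->m_ext.ext_type == EXT_CLUSTER) {
		/* Drop our reference; the last one frees the cluster. */
		if (--(*(m->m_ext.ref_cnt)) == 0)
			uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
	}
	/* The mbuf itself always goes back to its zone. */
	uma_zfree_arg(zone_mbuf, m, NULL);
}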
+/*
+ * Backend free for the overlaid "packet" keg.
+ *
+ * Frees a maximum of 'count' packets from the specified
+ * 'cpu' bucket back to the underlying zones.  The 'cpu'
+ * bucket should not be locked when this is called.
+ *
+ * We do not call the packet destructor here, as the underlying
+ * objects (mbuf and cluster) are properly destroyed via their
+ * zones' destructors.
+ */
+static void
+mb_free_pack(uma_keg_t keg, void *mem, void *arg,
+    int count, int cpu)
+{
+        struct mbuf *m;
+
+        m = (struct mbuf *)mem;
+        uma_zfree_arg(zone_clust, m->m_ext.ext_buf, arg);
+        uma_zfree_arg(zone_mbuf, m, arg);
+        if (count <= 1)
+                return;
+
+        /* Free more, if requested. */
+        count--;
+        while (count > 0) {
+                m = uma_kget(keg, cpu);
+                if (m == NULL)
+                        break;
+                uma_zfree_arg(zone_clust, m->m_ext.ext_buf, arg);
+                uma_zfree_arg(zone_mbuf, m, arg);
+                count--;
+        }
+}
+
+/*
+ * The "packet" keg constructor.
+ */
+static int
+mb_ctor_pack(void *mem, int size, void *arg)
+{
+        struct mbuf *m;
+        struct mb_args *args;
+        int flags;
+        int how;
+        short type;
+
+        m = (struct mbuf *)mem;
+        args = (struct mb_args *)arg;
+        flags = args->flags;
+        how = args->how;
+        type = args->type;
+
+        m->m_type = type;
+        m->m_next = NULL;
+        m->m_data = m->m_ext.ext_buf;
+        m->m_flags = flags|M_EXT;
+        m->m_ext.ext_free = NULL;
+        m->m_ext.ext_args = NULL;
+        m->m_ext.ext_size = MCLBYTES;
+        m->m_ext.ext_type = EXT_CLUSTER;
+        m->m_ext.ref_cnt = (u_int *)find_refcnt(zone_clust,
+            m->m_ext.ext_buf);
+        *(m->m_ext.ref_cnt) = 1;
+
+        if (flags & M_PKTHDR) {
+                m->m_nextpkt = NULL;
+                m->m_pkthdr.rcvif = NULL;
+                m->m_pkthdr.csum_flags = 0;
+                SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+                /* If the label init fails, fail the alloc */
+                if (mac_init_mbuf(m, how) != 0) {
+                        m_free(m);
+                        return 0;
+                }
+#endif
+        }
+        return 1;
+}
+
+/*
+ * This is the protocol drain routine.
+ *
+ * No locks should be held when this is called.  The drain routines
+ * presently have to acquire some locks, which raises the possibility of
+ * a lock order reversal.
+ */
+static void
+mb_reclaim(void *junk)
+{
+        struct domain *dp;
+        struct protosw *pr;
+
+        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
+            "mb_reclaim()");
+
+        mbstat.m_drain++;
+        for (dp = domains; dp != NULL; dp = dp->dom_next)
+                for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+                        if (pr->pr_drain != NULL)
+                                (*pr->pr_drain)();
+}
+
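mb_reclaim() above relies on each protocol exporting a pr_drain hook.  Purely as an illustration of that contract (example_pr_drain and example_frag_queue_flush are made-up names, not part of the patch), such a hook typically just releases whatever buffers the protocol is hoarding so they flow back into the zones:

static void
example_pr_drain(void)
{

	/*
	 * Invoked from mb_reclaim() via (*pr->pr_drain)(); must not sleep.
	 * Flushing cached fragments or reassembly queues returns their
	 * mbufs and clusters to zone_mbuf and zone_clust.
	 */
	example_frag_queue_flush();
}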
+/*
+ * This is the packet keg drain routine.
+ *
+ * It drains the packet keg back to its underlying zones.  Following this,
+ * the zones themselves will likely be drained via uma_reclaim().  That is
+ * meant to occur _after_ the protocols have been drained.
+ */
+static void
+mb_drainkegs(void *junk)
+{
+        struct mbuf *m;
+        int i;
+
+        /*
+         * This isn't ideal, as we use the keg and zone UMA APIs to grab
+         * objects one at a time from the keg_packet keg and free each
+         * individually, with the respective keg and zone locks dropped
+         * in between.  However, if the load is low and we're reclaiming
+         * after a spike, this should be fine.  If the load is high, then
+         * in between frees there may be keg refills, which may cause
+         * endless looping here.  XXX REVISIT THIS
+         */
+        for (i = 0; i <= mp_maxid; i++) {
+                if (CPU_ABSENT(i))
+                        continue;
+                while ((m = uma_kget(keg_packet, i)) != NULL) {
+                        uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
+                        uma_zfree_arg(zone_mbuf, m, NULL);
+                }
+        }
+}
diff -ruN vendor_sys/./vm/vm_kern.c mbuma/src/sys/./vm/vm_kern.c
--- vendor_sys/./vm/vm_kern.c	Sun Mar 7 14:59:12 2004
+++ mbuma/src/sys/./vm/vm_kern.c	Wed Mar 3 23:41:42 2004
@@ -326,16 +326,6 @@
 	vm_map_lock(map);
 	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
 		vm_map_unlock(map);
-		if (map != kmem_map) {
-			static int last_report; /* when we did it (in ticks) */
-			if (ticks < last_report ||
-			    (ticks - last_report) >= hz) {
-				last_report = ticks;
-				printf("Out of mbuf address space!\n");
-				printf("Consider increasing NMBCLUSTERS\n");
-			}
-			return (0);
-		}
 		if ((flags & M_NOWAIT) == 0)
 			panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
 				(long)size, (long)map->size);
diff -ruN vendor_sys/./vm/vm_map.c mbuma/src/sys/./vm/vm_map.c
--- vendor_sys/./vm/vm_map.c	Sun Mar 7 14:59:13 2004
+++ mbuma/src/sys/./vm/vm_map.c	Mon Feb 16 19:23:10 2004
@@ -164,11 +164,11 @@
 #endif
 	    vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_prealloc(mapzone, MAX_KMAP);
-	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), 
+	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
 	uma_prealloc(kmapentzone, MAX_KMAPENT);
-	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), 
+	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_prealloc(mapentzone, MAX_MAPENT);
 }
diff -ruN vendor_sys/./vm/vm_object.c mbuma/src/sys/./vm/vm_object.c
--- vendor_sys/./vm/vm_object.c	Sun Mar 7 14:59:13 2004
+++ mbuma/src/sys/./vm/vm_object.c	Fri Feb 27 17:24:05 2004
@@ -253,7 +253,8 @@
 #else
 	    NULL,
 #endif
-	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
+	    vm_object_zinit, NULL, UMA_ALIGN_PTR,
+	    UMA_ZONE_VM|UMA_ZONE_NOFREE);
 	uma_prealloc(obj_zone, VM_OBJECTS_INIT);
 }
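The vm_map.c and vm_object.c hunks above only re-wrap existing calls, but they show the create-then-prealloc idiom used for boot-critical VM zones.  Here is a generic sketch of that idiom; foo_zone, struct foo, and FOO_PREALLOC are placeholders for illustration, not names from the patch.

static uma_zone_t foo_zone;

static void
foo_zone_init(void)
{

	/* A VM-internal zone whose backing memory is never given back. */
	foo_zone = uma_zcreate("FOO", sizeof(struct foo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	/* Prime it so early allocations succeed before the VM is fully up. */
	uma_prealloc(foo_zone, FOO_PREALLOC);
}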