commit 56d01269eb87dd389c946b39fd3977711cd69616
Author: Mark Johnston
Date:   Wed Mar 2 19:37:36 2016 -0800

    Maintain a working set estimate for UMA zone bucket caches

    Use it to get an idea of how many buckets are in active use by the
    zone, and modify uma_reclaim() to avoid purging buckets that are
    likely to be used in the near future.

diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 262f8b0..450c0f8 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -158,7 +158,6 @@ static int booted = 0;
  * outside of the allocation fast path.
  */
 static struct callout uma_callout;
-#define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
 
 /*
  * This structure is passed as the zone ctor arg so that I don't have to create
@@ -239,7 +238,10 @@ static void zone_dtor(void *, int, void *);
 static int zero_init(void *, int, int);
 static void keg_small_init(uma_keg_t keg);
 static void keg_large_init(uma_keg_t keg);
+static void zone_add_bucket(uma_zone_t zone, uma_bucket_t bucket, bool ws);
+static void zone_remove_bucket(uma_zone_t zone, uma_bucket_t bucket, bool ws);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
+static void zone_prune(uma_zone_t zone);
 static void zone_timeout(uma_zone_t zone);
 static int hash_alloc(struct uma_hash *);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
@@ -463,8 +465,35 @@ uma_timeout(void *unused)
 	bucket_enable();
 	zone_foreach(zone_timeout);
 
-	/* Reschedule this event */
-	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
+	/* Reschedule this event. Avoid aliasing uma_reclaim() calls. */
+	callout_reset(&uma_callout, (vm_pageout_lowmem_period + 1) * hz,
+	    uma_timeout, NULL);
+}
+
+static inline void
+zone_add_bucket(uma_zone_t zone, uma_bucket_t bucket, bool ws)
+{
+
+	ZONE_LOCK_ASSERT(zone);
+	if (ws)
+		zone->uz_bktallocs -= min(bucket->ub_cnt, zone->uz_bktallocs);
+	zone->uz_bktcount += bucket->ub_cnt;
+	LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
+}
+
+static inline void
+zone_remove_bucket(uma_zone_t zone, uma_bucket_t bucket, bool ws)
+{
+
+	ZONE_LOCK_ASSERT(zone);
+	MPASS(zone->uz_bktcount >= bucket->ub_cnt);
+	if (ws) {
+		zone->uz_bktallocs += bucket->ub_cnt;
+		if (zone->uz_bktset < zone->uz_bktallocs)
+			zone->uz_bktset = zone->uz_bktallocs;
+	}
+	zone->uz_bktcount -= bucket->ub_cnt;
+	LIST_REMOVE(bucket, ub_link);
 }
 
 /*
@@ -519,8 +548,21 @@ keg_timeout(uma_keg_t keg)
 static void
 zone_timeout(uma_zone_t zone)
 {
+	const int div = 12;
+	int weight;
 
 	zone_foreach_keg(zone, &keg_timeout);
+
+	/*
+	 * Update the average size of the bucket cache working set. With the
+	 * default lowmem period, this gives us a running average over the last
+	 * ~120s. A weight is used to allow the working set estimate to grow
+	 * more quickly than it decays.
+	 */
+	weight = zone->uz_bktset > zone->uz_bktsetavg ? 2 : 1;
+	zone->uz_bktsetavg = (weight * zone->uz_bktset +
+	    (div - weight) * zone->uz_bktsetavg) / div;
+	zone->uz_bktallocs = zone->uz_bktset = 0;
 }
 
 /*
@@ -726,16 +768,14 @@ cache_drain_safe_cpu(uma_zone_t zone)
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket) {
 		if (cache->uc_allocbucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
-			    cache->uc_allocbucket, ub_link);
+			zone_add_bucket(zone, cache->uc_allocbucket, false);
 		else
 			b1 = cache->uc_allocbucket;
 		cache->uc_allocbucket = NULL;
 	}
 	if (cache->uc_freebucket) {
 		if (cache->uc_freebucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
-			    cache->uc_freebucket, ub_link);
+			zone_add_bucket(zone, cache->uc_freebucket, false);
 		else
 			b2 = cache->uc_freebucket;
 		cache->uc_freebucket = NULL;
@@ -784,24 +824,48 @@ cache_drain_safe(uma_zone_t zone)
 }
 
 /*
- * Drain the cached buckets from a zone. Expects a locked zone on entry.
+ * Drain some or all of the cached buckets from a zone. Expects a locked zone
+ * on entry.
  */
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
-	uma_bucket_t bucket;
+	LIST_HEAD(, uma_bucket) bh;
+	uma_bucket_t bucket, tmp;
+	int64_t skip;
+
+	LIST_INIT(&bh);
+
+	/*
+	 * If we're only attempting to trim excess buckets, stop once we've
+	 * brought the bucket list close to the working set average. Otherwise
+	 * drain the entire queue, keeping two buckets per CPU.
+	 */
+	if ((zone->uz_flags & UMA_ZFLAG_PRUNING) != 0)
+		skip = zone->uz_bktsetavg;
+	else
+		skip = 0;
 
 	/*
-	 * Drain the bucket queues and free the buckets, we just keep two per
-	 * cpu (alloc/free).
+	 * Start draining towards the end of list to preserve the most recently
+	 * used buckets. Move excess buckets to a temporary list so that we can
+	 * drain them without holding the zone lock.
 	 */
-	while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
+	LIST_FOREACH_SAFE(bucket, &zone->uz_buckets, ub_link, tmp) {
+		skip -= bucket->ub_cnt;
+		if (skip > 0)
+			continue;
+		zone_remove_bucket(zone, bucket, false);
+		LIST_INSERT_HEAD(&bh, bucket, ub_link);
+	}
+
+	ZONE_UNLOCK(zone);
+	while ((bucket = LIST_FIRST(&bh)) != NULL) {
 		LIST_REMOVE(bucket, ub_link);
-		ZONE_UNLOCK(zone);
 		bucket_drain(zone, bucket);
 		bucket_free(zone, bucket, NULL);
-		ZONE_LOCK(zone);
 	}
+	ZONE_LOCK(zone);
 
 	/*
 	 * Shrink further bucket sizes. Price of single zone lock collision
@@ -931,6 +995,15 @@ zone_drain(uma_zone_t zone)
 	zone_drain_wait(zone, M_NOWAIT);
 }
 
+static void
+zone_prune(uma_zone_t zone)
+{
+
+	zone->uz_flags |= UMA_ZFLAG_PRUNING;
+	zone_drain_wait(zone, M_NOWAIT);
+	zone->uz_flags &= ~UMA_ZFLAG_PRUNING;
+}
+
 /*
  * Allocate a new slab for a keg. This does not insert the slab onto a list.
  *
@@ -1549,6 +1622,10 @@ zone_ctor(void *mem, int size, void *udata, int flags)
 	zone->uz_frees = 0;
 	zone->uz_fails = 0;
 	zone->uz_sleeps = 0;
+	zone->uz_bktcount = 0;
+	zone->uz_bktallocs = 0;
+	zone->uz_bktset = 0;
+	zone->uz_bktsetavg = 0;
 	zone->uz_count = 0;
 	zone->uz_count_min = 0;
 	zone->uz_flags = 0;
@@ -1855,7 +1932,8 @@ uma_startup3(void)
 	printf("Starting callout.\n");
 #endif
 	callout_init(&uma_callout, 1);
-	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
+	callout_reset(&uma_callout, (vm_pageout_lowmem_period + 1) * hz,
+	    uma_timeout, NULL);
 #ifdef UMA_DEBUG
 	printf("UMA startup3 complete.\n");
 #endif
@@ -2239,7 +2317,7 @@ zalloc_start:
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 
-		LIST_REMOVE(bucket, ub_link);
+		zone_remove_bucket(zone, bucket, true);
 		cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
@@ -2269,12 +2347,17 @@ zalloc_start:
 		/*
 		 * See if we lost the race or were migrated. Cache the
 		 * initialized bucket to make this less likely or claim
-		 * the memory directly.
+		 * the memory directly. Record this as part of the bucket
+		 * cache's working set only if we didn't lose the race to
+		 * prevent double-counting.
 		 */
-		if (cache->uc_allocbucket == NULL)
+		if (cache->uc_allocbucket == NULL) {
 			cache->uc_allocbucket = bucket;
-		else
-			LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
+			zone->uz_bktallocs += bucket->ub_cnt;
+			if (zone->uz_bktset < zone->uz_bktallocs)
+				zone->uz_bktset = zone->uz_bktallocs;
+		} else
+			zone_add_bucket(zone, bucket, false);
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
@@ -2759,7 +2842,7 @@ zfree_start:
 			/* ub_cnt is pointing to the last free item */
 			KASSERT(bucket->ub_cnt != 0,
 			    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
-			LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
+			zone_add_bucket(zone, bucket, true);
 		}
 
 		/* We are no longer associated with this CPU. */
@@ -3171,8 +3254,9 @@ uma_reclaim_locked(bool kmem_danger)
 #endif
 	sx_assert(&uma_drain_lock, SA_XLOCKED);
 	bucket_enable();
-	zone_foreach(zone_drain);
+	zone_foreach(zone_prune);
 	if (vm_page_count_min() || kmem_danger) {
+		/* Go all-out if we're in the danger zone. */
 		cache_drain_safe(NULL);
 		zone_foreach(zone_drain);
 	}
@@ -3410,7 +3494,6 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
 	struct uma_percpu_stat ups;
-	uma_bucket_t bucket;
 	struct sbuf sbuf;
 	uma_cache_t cache;
 	uma_klink_t kl;
@@ -3466,8 +3549,7 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 
-			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
-				uth.uth_zone_free += bucket->ub_cnt;
+			uth.uth_zone_free += z->uz_bktcount;
 			uth.uth_allocs = z->uz_allocs;
 			uth.uth_frees = z->uz_frees;
 			uth.uth_fails = z->uz_fails;
@@ -3634,7 +3716,6 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
 	uint64_t allocs, frees, sleeps;
-	uma_bucket_t bucket;
 	uma_keg_t kz;
 	uma_zone_t z;
 	int cachefree;
@@ -3654,8 +3735,7 @@ DB_SHOW_COMMAND(uma, db_show_uma)
 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z)))
 				cachefree += kz->uk_free;
-			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
-				cachefree += bucket->ub_cnt;
+			cachefree += z->uz_bktcount;
 			db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
 			    z->uz_name, (uintmax_t)kz->uk_size,
 			    (intmax_t)(allocs - frees), cachefree,
@@ -3669,7 +3749,6 @@ DB_SHOW_COMMAND(uma, db_show_uma)
 DB_SHOW_COMMAND(umacache, db_show_umacache)
 {
 	uint64_t allocs, frees;
-	uma_bucket_t bucket;
 	uma_zone_t z;
 	int cachefree;
 
@@ -3677,8 +3756,7 @@ DB_SHOW_COMMAND(umacache, db_show_umacache)
 	    "Requests", "Bucket");
 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
-		LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
-			cachefree += bucket->ub_cnt;
+		cachefree += z->uz_bktcount;
 		db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
 		    z->uz_name, (uintmax_t)z->uz_size,
 		    (intmax_t)(allocs - frees), cachefree,
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index c4235ce..126f605 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -268,10 +268,11 @@ typedef struct uma_klink *uma_klink_t;
 struct uma_zone {
 	struct mtx_padalign	uz_lock;	/* Lock for the zone */
 	struct mtx_padalign	*uz_lockptr;
-	const char	*uz_name;	/* Text name of the zone */
+	LIST_HEAD(, uma_bucket)	uz_buckets;	/* full buckets */
 
 	LIST_ENTRY(uma_zone)	uz_link;	/* List of all zones in keg */
-	LIST_HEAD(,uma_bucket)	uz_buckets;	/* full buckets */
+
+	const char	*uz_name;	/* Text name of the zone */
 
 	LIST_HEAD(,uma_klink)	uz_kegs;	/* List of kegs. */
 	struct uma_klink	uz_klink;	/* klink for first keg. */
@@ -292,7 +293,11 @@ struct uma_zone {
 	volatile u_long	uz_fails;	/* Total number of alloc failures */
 	volatile u_long	uz_frees;	/* Total number of frees */
 	uint64_t	uz_sleeps;	/* Total number of alloc sleeps */
-	uint16_t	uz_count;	/* Amount of items in full bucket */
+	uint64_t	uz_bktcount;	/* Bucket cache size */
+	int64_t		uz_bktallocs;	/* Items alloced from bucket cache */
+	int64_t		uz_bktset;	/* Bucket cache WS this period */
+	int64_t		uz_bktsetavg;	/* Bucket cache working set average */
+	uint16_t	uz_count;	/* Amount of items in a full bucket */
 	uint16_t	uz_count_min;	/* Minimal amount of items there */
 
 	/* The next two fields are used to print a rate-limited warnings. */
@@ -311,7 +316,8 @@ struct uma_zone {
 /*
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
  */
-#define	UMA_ZFLAG_MULTI		0x04000000	/* Multiple kegs in the zone. */
+#define	UMA_ZFLAG_MULTI		0x02000000	/* Multiple kegs in the zone. */
+#define	UMA_ZFLAG_PRUNING	0x04000000	/* Draining excess items only */
 #define	UMA_ZFLAG_DRAINING	0x08000000	/* Running zone_drain. */
 #define	UMA_ZFLAG_BUCKET	0x10000000	/* Bucket zone. */
 #define	UMA_ZFLAG_INTERNAL	0x20000000	/* No offpage no PCPU. */
@@ -367,6 +373,7 @@ void uma_large_free(uma_slab_t slab);
 #define	ZONE_LOCK(z)	mtx_lock((z)->uz_lockptr)
 #define	ZONE_TRYLOCK(z)	mtx_trylock((z)->uz_lockptr)
 #define	ZONE_UNLOCK(z)	mtx_unlock((z)->uz_lockptr)
+#define	ZONE_LOCK_ASSERT(z)	mtx_assert((z)->uz_lockptr, MA_OWNED)
 #define	ZONE_LOCK_FINI(z)	mtx_destroy(&(z)->uz_lock)
 
 /*
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index b7f6887..36fc774 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -159,6 +159,7 @@ SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
 int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
 int vm_pageout_wakeup_thresh;
+int vm_pageout_lowmem_period = 10;
 static int vm_pageout_oom_seq = 12;
 
 #if !defined(NO_SWAPPING)
@@ -172,7 +173,6 @@ static int vm_max_launder = 32;
 static int vm_pageout_update_period;
 static int defer_swap_pageouts;
 static int disable_swap_pageouts;
-static int lowmem_period = 10;
 static time_t lowmem_uptime;
 
 #if defined(NO_SWAPPING)
@@ -200,7 +200,8 @@ SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW,
     &vm_pageout_update_period, 0,
     "Maximum active LRU update period");
 
-SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
+SYSCTL_INT(_vm, OID_AUTO, lowmem_period,
+    CTLFLAG_RW, &vm_pageout_lowmem_period, 0,
     "Low memory callback period");
 
 #if defined(NO_SWAPPING)
@@ -891,7 +892,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	 * some. We rate limit to avoid thrashing.
 	 */
 	if (vmd == &vm_dom[0] && pass > 0 &&
-	    (time_uptime - lowmem_uptime) >= lowmem_period) {
+	    (time_uptime - lowmem_uptime) >= vm_pageout_lowmem_period) {
 		/*
 		 * Decrease registered cache sizes.
 		 */
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index 2d0b961..d0706b3 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -75,6 +75,7 @@ extern int vm_page_max_wired;
 extern int vm_pages_needed;	/* should be some "event" structure */
 extern int vm_pageout_deficit;
 extern int vm_pageout_page_count;
+extern int vm_pageout_lowmem_period;
 
 /*
  * Swap out requests
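
The averaging step added to zone_timeout() above is an exponentially weighted moving average with an asymmetric weight, so the estimate tracks growth in the working set quickly but decays slowly once a zone goes idle. The standalone userland sketch below is an illustration only, not code from the patch: update_avg() mirrors the patch's arithmetic, while the simulation harness and its numbers (a 1000-item working set, DIV of 12) are hypothetical.

/*
 * Illustration only (not part of the patch): the asymmetric weighted
 * average used by zone_timeout().  DIV mirrors the patch's "div"; the
 * workload numbers below are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define	DIV	12

static int64_t
update_avg(int64_t set, int64_t avg)
{
	int weight;

	/* Grow quickly (weight 2) when the working set rises, decay slowly. */
	weight = set > avg ? 2 : 1;
	return ((weight * set + (DIV - weight) * avg) / DIV);
}

int
main(void)
{
	int64_t avg = 0;
	int i;

	/* Ten busy periods with a working set of 1000 items... */
	for (i = 0; i < 10; i++) {
		avg = update_avg(1000, avg);
		printf("period %2d (busy): avg = %jd\n", i, (intmax_t)avg);
	}
	/* ...followed by twenty idle periods. */
	for (; i < 30; i++) {
		avg = update_avg(0, avg);
		printf("period %2d (idle): avg = %jd\n", i, (intmax_t)avg);
	}
	return (0);
}

With the patch's div of 12 and an (lowmem period + 1) = 11 second callout interval, a weight of 1 corresponds to a time constant of roughly two minutes, which appears to be where the "~120s" figure in the added comment comes from.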
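The pruning pass in bucket_cache_drain() keeps buckets from the head of uz_buckets (the most recently inserted, hence most recently used) until roughly uz_bktsetavg items have been preserved, and frees the rest. The sketch below shows only that skip arithmetic; the bucket counts and the 300-item average are hypothetical, and the real code walks a LIST of uma_bucket structures under the zone lock rather than an array.

/*
 * Illustration only (not part of the patch): the "skip" logic that
 * bucket_cache_drain() applies when UMA_ZFLAG_PRUNING is set.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical bucket item counts, head (MRU) to tail (LRU). */
	int buckets[] = { 128, 128, 64, 128, 32, 128, 128 };
	int64_t skip = 300;	/* stands in for zone->uz_bktsetavg */
	size_t i;

	for (i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++) {
		skip -= buckets[i];
		if (skip > 0) {
			printf("bucket %zu (%3d items): kept\n", i, buckets[i]);
			continue;
		}
		printf("bucket %zu (%3d items): drained\n", i, buckets[i]);
	}
	return (0);
}

Trimming from the tail rather than the head is what lets uma_reclaim() cut a zone back to its recent demand instead of emptying the cache outright; the full drain path (skip = 0) still discards everything.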