From 414b7cdb23a79bd07ed2b625c562224eb5bc45b2 Mon Sep 17 00:00:00 2001
From: Mark Johnston
Date: Mon, 14 Dec 2015 19:15:14 -0800
Subject: [PATCH 2/5] Add PQ_STASIS.

This is a paging queue used to store dirty inactive anon pages that
cannot be laundered.  Currently this occurs when no swap devices are
configured, but the queue could also be used to handle other pager
errors.

This change adds swapon and swapoff EVENTHANDLERs, serialized by the
swapconf lock.  The laundry thread's handlers for these events increment
and decrement a count of configured swap devices.  If the number of swap
devices is 0, an error of VM_PAGER_FAIL for a putpages call on a dirty
page belonging to an object of type DEFAULT or SWAP results in the page
entering the stasis queue.  If the number of configured swap devices
transitions from 0 to 1, all pages in the stasis queue are placed at the
end of the laundry queue.

There is no limit on the number of pages in stasis, and pagedaemon
targets are not scaled according to that number.  The queue's purpose is
to ensure that the pagedaemon will not waste CPU time examining pages
that it cannot reclaim.

This change modifies top(1), systat(1), and vmstat(8) to display the
number of pages in stasis.
---
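Notes (commentary below the fold; not part of the commit message): the
sketch that follows shows how another kernel component might consume the
swapon/swapoff events introduced by this patch.  It is a minimal,
hypothetical example (the "example_" names are invented for
illustration), but the handler signatures and the EVENTHANDLER_REGISTER()
usage mirror the registration added to vm_pageout_laundry_worker() below.
As noted above, invocations of the two handlers are serialized by the
swapconf lock.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>

/* Track the number of configured swap devices, like vm_swapdev_cnt. */
static int example_swapdev_cnt;
static eventhandler_tag example_swapon_tag, example_swapoff_tag;

static void
example_swapon(void *arg __unused, struct swdevt *sp __unused)
{

	example_swapdev_cnt++;
}

static void
example_swapoff(void *arg __unused, struct swdevt *sp __unused)
{

	example_swapdev_cnt--;
}

static void
example_swap_events_init(void *arg __unused)
{

	example_swapon_tag = EVENTHANDLER_REGISTER(swapon,
	    example_swapon, NULL, EVENTHANDLER_PRI_ANY);
	example_swapoff_tag = EVENTHANDLER_REGISTER(swapoff,
	    example_swapoff, NULL, EVENTHANDLER_PRI_ANY);
}
SYSINIT(example_swap_events, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    example_swap_events_init, NULL);

Once the vm_meter.c hunk below is applied, the size of the stasis queue
is also visible from userland as the vm.stats.vm.v_stasis_count sysctl.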
 sys/sys/eventhandler.h  |  9 +++--
 sys/sys/vmmeter.h       |  2 +-
 sys/vm/swap_pager.c     |  2 ++
 sys/vm/vm_meter.c       |  1 +
 sys/vm/vm_page.c        | 31 ++++++++++++++++-
 sys/vm/vm_page.h        |  6 ++--
 sys/vm/vm_pageout.c     | 93 +++++++++++++++++++++++++++++++++++++++++--------
 usr.bin/systat/vmstat.c |  8 +++--
 usr.bin/top/machine.c   | 18 +++++-----
 usr.bin/vmstat/vmstat.c |  1 +
 10 files changed, 139 insertions(+), 32 deletions(-)

diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h
index d82ece7..4df3038 100644
--- a/sys/sys/eventhandler.h
+++ b/sys/sys/eventhandler.h
@@ -211,7 +211,6 @@ EVENTHANDLER_DECLARE(vfs_unmounted, vfs_unmounted_notify_fn);
  */
 struct proc;
 struct image_params;
-
 typedef void (*exitlist_fn)(void *, struct proc *);
 typedef void (*forklist_fn)(void *, struct proc *, struct proc *, int);
 typedef void (*execlist_fn)(void *, struct proc *, struct image_params *);
@@ -234,7 +233,6 @@ typedef void (*app_coredump_start_fn)(void *, struct thread *, char *name);
 typedef void (*app_coredump_progress_fn)(void *, struct thread *td, int byte_count);
 typedef void (*app_coredump_finish_fn)(void *, struct thread *td);
 typedef void (*app_coredump_error_fn)(void *, struct thread *td, char *msg, ...);
-
 EVENTHANDLER_DECLARE(app_coredump_start, app_coredump_start_fn);
 EVENTHANDLER_DECLARE(app_coredump_progress, app_coredump_progress_fn);
 EVENTHANDLER_DECLARE(app_coredump_finish, app_coredump_finish_fn);
@@ -270,4 +268,11 @@ typedef void (*unregister_framebuffer_fn)(void *, struct fb_info *);
 EVENTHANDLER_DECLARE(register_framebuffer, register_framebuffer_fn);
 EVENTHANDLER_DECLARE(unregister_framebuffer, unregister_framebuffer_fn);
 
+/* Swap pager device events */
+struct swdevt;
+typedef void (*swapon_fn)(void *, struct swdevt *);
+typedef void (*swapoff_fn)(void *, struct swdevt *);
+EVENTHANDLER_DECLARE(swapon, swapon_fn);
+EVENTHANDLER_DECLARE(swapoff, swapoff_fn);
+
 #endif /* _SYS_EVENTHANDLER_H_ */
diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h
index f00d29e..d0ef0f8 100644
--- a/sys/sys/vmmeter.h
+++ b/sys/sys/vmmeter.h
@@ -97,6 +97,7 @@ struct vmmeter {
 	u_int v_inactive_target; /* (c) pages desired inactive */
 	u_int v_inactive_count;	/* (q) pages inactive */
 	u_int v_laundry_count;	/* (q) pages dirty */
+	u_int v_stasis_count;	/* (q) pages dirty and non-reclaimable */
 	u_int v_cache_count;	/* (f) pages on cache queue */
 	u_int v_pageout_free_min;   /* (c) min pages reserved for kernel */
 	u_int v_interrupt_free_min; /* (c) reserved pages for int code */
@@ -112,7 +113,6 @@ struct vmmeter {
 	u_int v_vforkpages;	/* (p) VM pages affected by vfork() */
 	u_int v_rforkpages;	/* (p) VM pages affected by rfork() */
 	u_int v_kthreadpages;	/* (p) VM pages affected by fork() by kernel */
-	u_int v_spare[1];
 };
 
 #ifdef _KERNEL
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 33c376f..03f3905 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -2138,6 +2138,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks,
 	swapon_check_swzone(swap_total / PAGE_SIZE);
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
+	EVENTHANDLER_INVOKE(swapon, sp);
 }
 
 /*
@@ -2240,6 +2241,7 @@ swapoff_one(struct swdevt *sp, struct ucred *cred)
 	}
 	swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
 	mtx_unlock(&sw_dev_mtx);
+	EVENTHANDLER_INVOKE(swapoff, sp);
 
 	/*
 	 * Page in the contents of the device and close it.
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index ee55dc8..bb462d3 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -304,6 +304,7 @@ VM_STATS_VM(v_active_count, "Active pages");
 VM_STATS_VM(v_inactive_target, "Desired inactive pages");
 VM_STATS_VM(v_inactive_count, "Inactive pages");
 VM_STATS_VM(v_laundry_count, "Dirty pages");
+VM_STATS_VM(v_stasis_count, "Pages in stasis");
 VM_STATS_VM(v_cache_count, "Pages on cache queue");
 VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
 VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 464dc21..aaffee5 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -394,6 +394,11 @@ vm_page_domain_init(struct vm_domain *vmd)
 	    "vm laundry pagequeue";
 	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
 	    &vm_cnt.v_laundry_count;
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_STASIS].pq_name) =
+	    "vm stasis pagequeue";
+	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_STASIS].pq_vcnt) =
+	    &vm_cnt.v_stasis_count;
+
 	vmd->vmd_page_count = 0;
 	vmd->vmd_free_count = 0;
 	vmd->vmd_segs = 0;
@@ -3202,6 +3207,30 @@ vm_page_launder(vm_page_t m)
 }
 
 /*
+ * vm_page_enter_stasis
+ *
+ * Put a page in stasis.  Such pages are swap-backed and cannot be
+ * reclaimed because zero swap devices are configured, or all available
+ * devices are full.  If the page is already in stasis, it will be
+ * requeued.
+ */
+void
+vm_page_enter_stasis(vm_page_t m)
+{
+
+	KASSERT(m->wire_count == 0, ("page %p is wired", m));
+	KASSERT(m->dirty != 0, ("page %p is clean", m));
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m));
+	KASSERT(m->object->type == OBJT_DEFAULT || m->object->type == OBJT_SWAP,
+	    ("invalid object type %d", m->object->type));
+
+	vm_page_assert_locked(m);
+	if (m->queue != PQ_NONE)
+		vm_page_dequeue(m);
+	vm_page_enqueue(PQ_STASIS, m);
+}
+
+/*
  * vm_page_try_to_free()
  *
  * Attempt to free the page.  If we cannot free it, we do nothing.
@@ -3265,7 +3294,7 @@ vm_page_advise(vm_page_t m, int advice)
 		vm_page_dirty(m);
 
 	/*
-	 * Place clean pages at the head of the inactive queue rather than the
+	 * Place clean pages near the head of the inactive queue rather than the
 	 * tail, thus defeating the queue's LRU operation and ensuring that the
 	 * page will be reused quickly.
 	 */
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 5c90cf8..2b06b01 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -207,7 +207,8 @@ struct vm_page {
 #define	PQ_INACTIVE	0
 #define	PQ_ACTIVE	1
 #define	PQ_LAUNDRY	2
-#define	PQ_COUNT	3
+#define	PQ_STASIS	3
+#define	PQ_COUNT	4
 
 TAILQ_HEAD(pglist, vm_page);
 SLIST_HEAD(spglist, vm_page);
@@ -460,6 +461,7 @@ void vm_page_deactivate (vm_page_t);
 void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
+void vm_page_enter_stasis(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
@@ -715,7 +717,7 @@ static inline bool
 vm_page_in_laundry(vm_page_t m)
 {
 
-	return (m->queue == PQ_LAUNDRY);
+	return (m->queue == PQ_LAUNDRY || m->queue == PQ_STASIS);
 }
 
 #endif /* _KERNEL */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 3d62921..aff87fa 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -185,6 +185,7 @@ static int vm_swap_idle_enabled = 0;
 static int vm_swap_enabled = 1;
 static int vm_swap_idle_enabled = 0;
 #endif
+static int vm_swapdev_cnt = 0;
 
 static int vm_panic_on_oom = 0;
 
@@ -241,6 +242,8 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
 static void vm_pageout_launder(struct vm_domain *vmd);
 static void vm_pageout_laundry_worker(void *arg);
+static void vm_pageout_swapon(void *arg, struct swdevt *sp __unused);
+static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused);
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
@@ -570,12 +573,21 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
-			 * If page couldn't be paged out, then reactivate the
-			 * page so it doesn't clog the XXX list.  (We
-			 * will try paging out it again later).
+			 * If no swap devices exist, place the page in stasis.
+			 * It'll go back into the laundry if a swap device is
+			 * configured at some point in the future.  For other
+			 * types of errors, reactivate the page to avoid
+			 * clogging the laundry queue.  This ensures that the
+			 * pagedaemon will continue aggressively scanning the
+			 * active queue for other pages to reclaim.
 			 */
 			vm_page_lock(mt);
-			vm_page_activate(mt);	// XXX
+			if (pageout_status[i] == VM_PAGER_FAIL &&
+			    mt->dirty != 0 && (object->type == OBJT_DEFAULT ||
+			    object->type == OBJT_SWAP) && vm_swapdev_cnt == 0)
+				vm_page_enter_stasis(mt);
+			else
+				vm_page_activate(mt);
 			vm_page_unlock(mt);
 			if (eio != NULL && i >= mreq && i - mreq < runlen)
 				*eio = TRUE;
@@ -885,7 +897,8 @@ vm_pageout_launder(struct vm_domain *vmd)
 	vm_page_t m, next;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
-	int act_delta, error, launder, maxscan, numpagedout, vnodes_skipped;
+	int act_delta, error, launder, maxscan, numpagedout;
+	int vnodes_skipped;
 	boolean_t pageout_ok, queue_locked;
 
 	/*
@@ -908,15 +921,18 @@ vm_pageout_launder(struct vm_domain *vmd)
 	/*
 	 * Scan the laundry queue for pages eligible to be laundered.  We stop
 	 * once the target number of dirty pages have been laundered, or once
-	 * we've reached the end of the queue.  A single iteration of this loop
-	 * may cause more than one page to be laundered because of clustering.
+	 * we've reached the ends of the queues.  A single iteration of this
+	 * loop may cause more than one page to be laundered because of
+	 * clustering.
 	 *
-	 * maxscan ensures that we don't re-examine requeued pages.  Any
-	 * additional pages written as part of a cluster are subtracted from
-	 * maxscan since they must be taken from the laundry queue.
+	 * During a queue scan, maxscan ensures that we don't re-examine
+	 * requeued pages.  Any additional pages written as part of
+	 * a cluster are subtracted from maxscan since they are removed from
+	 * the queue before I/O is initiated.
 	 */
 	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 	maxscan = pq->pq_cnt;
+
 	vm_pagequeue_lock(pq);
 	queue_locked = TRUE;
 	for (m = TAILQ_FIRST(&pq->pq_pl);
@@ -930,9 +946,11 @@ vm_pageout_launder(struct vm_domain *vmd)
 		if ((m->flags & PG_MARKER) != 0)
 			continue;
 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
-		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
+		    ("PG_FICTITIOUS page %p cannot be in queue %d", m,
+		    m->queue));
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
-		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
+		    ("VPO_UNMANAGED page %p cannot be in queue %d", m,
+		    m->queue));
 		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
 			vm_page_unlock(m);
 			continue;
@@ -947,9 +965,8 @@ vm_pageout_launder(struct vm_domain *vmd)
 		}
 
 		/*
-		 * We unlock the laundry queue, invalidating the
-		 * 'next' pointer.  Use our marker to remember our
-		 * place.
+		 * We unlock the queue, invalidating the 'next' pointer.  Use
+		 * our marker to remember our place.
 		 */
 		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
 		    plinks.q);
@@ -1081,6 +1098,14 @@ vm_pageout_laundry_worker(void *arg)
 	vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
 
 	/*
+	 * Calls to these handlers are serialized by the swapconf lock.
+	 */
+	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain,
+	    EVENTHANDLER_PRI_ANY);
+	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain,
+	    EVENTHANDLER_PRI_ANY);
+
+	/*
 	 * The pageout laundry worker is never done, so loop forever.
 	 */
 	for (;;) {
@@ -1764,6 +1789,44 @@ vm_pageout_worker(void *arg)
 	}
 }
 
+static void
+vm_pageout_swapon(void *arg, struct swdevt *sp __unused)
+{
+	struct vm_domain *vmd;
+	struct vm_pagequeue *pq;
+	vm_page_t m, next;
+
+	vmd = arg;
+	if (vm_swapdev_cnt++ == 0) {
+		pq = &vmd->vmd_pagequeues[PQ_STASIS];
+		/*
+		 * We now have a swap device, so migrate pages back to the
+		 * laundry queue.  Locking rules make this somewhat awkward, but
+		 * this is a rare operation.
+		 */
+		vm_pagequeue_lock(pq);
+		while ((m = TAILQ_FIRST(&pq->pq_pl)) != NULL) {
+			if (!vm_pageout_page_lock(m, &next)) {
+				vm_page_unlock(m);
+				continue;
+			}
+			vm_page_dequeue_locked(m);
+			vm_pagequeue_unlock(pq);
+			vm_page_launder(m);
+			vm_page_unlock(m);
+			vm_pagequeue_lock(pq);
+		}
+		vm_pagequeue_unlock(pq);
+	}
+}
+
+static void
+vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
+{
+
+	vm_swapdev_cnt--;
+}
+
 /*
  * vm_pageout_init initialises basic pageout daemon settings.
  */
diff --git a/usr.bin/systat/vmstat.c b/usr.bin/systat/vmstat.c
index 382405d..cdad416 100644
--- a/usr.bin/systat/vmstat.c
+++ b/usr.bin/systat/vmstat.c
@@ -107,7 +107,8 @@ static struct Info {
 	u_int v_wire_count;	/* number of pages wired down */
 	u_int v_active_count;	/* number of pages active */
 	u_int v_inactive_count;	/* number of pages inactive */
-	u_int v_laundry_count;	/* number of pages in laundry queue */
+	u_int v_laundry_count;	/* number of pages in laundry */
+	u_int v_stasis_count;	/* number of pages in stasis */
 	u_long v_kmem_map_size;	/* Current kmem allocation size */
 	struct vmtotal Total;
 	struct nchstats nchstats;
@@ -520,9 +521,10 @@ showkre(void)
 	putint(pgtokb(s.v_active_count), VMSTATROW + 13, VMSTATCOL, 8);
 	putint(pgtokb(s.v_inactive_count), VMSTATROW + 14, VMSTATCOL, 8);
 	putint(pgtokb(s.v_laundry_count), VMSTATROW + 15, VMSTATCOL, 8);
-	putint(pgtokb(s.v_free_count), VMSTATROW + 16, VMSTATCOL, 8);
+	putint(pgtokb(s.v_stasis_count), VMSTATROW + 16, VMSTATCOL, 8);
+	putint(pgtokb(s.v_free_count), VMSTATROW + 17, VMSTATCOL, 8);
 	if (LINES - 1 > VMSTATROW + 17)
-		putint(s.bufspace / 1024, VMSTATROW + 17, VMSTATCOL, 8);
+		putint(s.bufspace / 1024, VMSTATROW + 18, VMSTATCOL, 8);
 	PUTRATE(v_vnodein, PAGEROW + 2, PAGECOL + 6, 5);
 	PUTRATE(v_vnodeout, PAGEROW + 2, PAGECOL + 12, 5);
 	PUTRATE(v_swapin, PAGEROW + 2, PAGECOL + 19, 5);
diff --git a/usr.bin/top/machine.c b/usr.bin/top/machine.c
index a46f80c..d30cb02 100644
--- a/usr.bin/top/machine.c
+++ b/usr.bin/top/machine.c
@@ -174,10 +174,10 @@ char *cpustatenames[] = {
 
 /* these are for detailing the memory statistics */
 
-int memory_stats[7];
+int memory_stats[8];
 char *memorynames[] = {
-	"K Active, ", "K Inact, ", "K Laundry, ", "K Wired, ", "K Buf, ",
-	"K Free", NULL
+	"K Active, ", "K Inact, ", "K Laundry, ", "K Stasis, ", "K Wired, ",
+	"K Buf, ", "K Free", NULL
 };
 
 int arc_stats[7];
@@ -491,8 +491,9 @@ get_system_info(struct system_info *si)
 	GETSYSCTL("vm.stats.vm.v_active_count", memory_stats[0]);
 	GETSYSCTL("vm.stats.vm.v_inactive_count", memory_stats[1]);
 	GETSYSCTL("vm.stats.vm.v_laundry_count", memory_stats[2]);
-	GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[3]);
-	GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[5]);
+	GETSYSCTL("vm.stats.vm.v_stasis_count", memory_stats[3]);
+	GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[4]);
+	GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[6]);
 	GETSYSCTL("vm.stats.vm.v_swappgsin", nspgsin);
 	GETSYSCTL("vm.stats.vm.v_swappgsout", nspgsout);
 	/* convert memory stats to Kbytes */
@@ -500,9 +501,10 @@ get_system_info(struct system_info *si)
 	memory_stats[1] = pagetok(memory_stats[1]);
 	memory_stats[2] = pagetok(memory_stats[2]);
 	memory_stats[3] = pagetok(memory_stats[3]);
-	memory_stats[4] = bufspace / 1024;
-	memory_stats[5] = pagetok(memory_stats[5]);
-	memory_stats[6] = -1;
+	memory_stats[4] = pagetok(memory_stats[4]);
+	memory_stats[5] = bufspace / 1024;
+	memory_stats[6] = pagetok(memory_stats[6]);
+	memory_stats[7] = -1;
 
 	/* first interval */
 	if (swappgsin < 0) {
diff --git a/usr.bin/vmstat/vmstat.c b/usr.bin/vmstat/vmstat.c
index 6d3e0a8..e14cdbe 100644
--- a/usr.bin/vmstat/vmstat.c
+++ b/usr.bin/vmstat/vmstat.c
@@ -581,6 +581,7 @@ fill_vmmeter(struct vmmeter *vmmp)
 	GET_VM_STATS(vm, v_inactive_target);
 	GET_VM_STATS(vm, v_inactive_count);
 	GET_VM_STATS(vm, v_laundry_count);
+	GET_VM_STATS(vm, v_stasis_count);
 	GET_VM_STATS(vm, v_pageout_free_min);
 	GET_VM_STATS(vm, v_interrupt_free_min);
 	/*GET_VM_STATS(vm, v_free_severe);*/
-- 
2.8.1