From 10fb73fd59560ab18c711f8e78a6485b903c0404 Mon Sep 17 00:00:00 2001
From: Mark Johnston
Date: Mon, 14 Dec 2015 19:15:14 -0800
Subject: [PATCH 2/6] Add PQ_STASIS.

This is a paging queue used to store dirty inactive anon pages that
cannot be laundered, for example because the system has no swap devices
configured or all existing swap devices are full.

If at least one swap device is configured, vm_pageout_launder() will
attempt to launder pages from PQ_STASIS before PQ_LAUNDRY. If no swap
devices are configured, pages from the PQ_STASIS queue are ignored.

Pages in PQ_STASIS are not taken into account in the active queue scan
target - they are effectively treated the same as wired pages.

This change adds swapon and swapoff EVENTHANDLERs, protected by the
swapconf lock. They're primarily used to keep track of the number of
configured swap devices.
---
 sys/sys/eventhandler.h  |   9 +++-
 sys/sys/vmmeter.h       |   2 +-
 sys/vm/swap_pager.c     |   2 +
 sys/vm/vm_meter.c       |   1 +
 sys/vm/vm_page.c        |  31 +++++++++++-
 sys/vm/vm_page.h        |   6 ++-
 sys/vm/vm_pageout.c     | 132 ++++++++++++++++++++++++++++++++++++++++--------
 usr.bin/systat/vmstat.c |   8 +--
 usr.bin/top/machine.c   |  18 ++++---
 usr.bin/vmstat/vmstat.c |   1 +
 10 files changed, 172 insertions(+), 38 deletions(-)

diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h
index d82ece7..4df3038 100644
--- a/sys/sys/eventhandler.h
+++ b/sys/sys/eventhandler.h
@@ -211,7 +211,6 @@ EVENTHANDLER_DECLARE(vfs_unmounted, vfs_unmounted_notify_fn);
  */
 struct proc;
 struct image_params;
-
 typedef void (*exitlist_fn)(void *, struct proc *);
 typedef void (*forklist_fn)(void *, struct proc *, struct proc *, int);
 typedef void (*execlist_fn)(void *, struct proc *, struct image_params *);
@@ -234,7 +233,6 @@ typedef void (*app_coredump_start_fn)(void *, struct thread *, char *name);
 typedef void (*app_coredump_progress_fn)(void *, struct thread *td, int byte_count);
 typedef void (*app_coredump_finish_fn)(void *, struct thread *td);
 typedef void (*app_coredump_error_fn)(void *, struct thread *td, char *msg, ...);
-
 EVENTHANDLER_DECLARE(app_coredump_start, app_coredump_start_fn);
 EVENTHANDLER_DECLARE(app_coredump_progress, app_coredump_progress_fn);
 EVENTHANDLER_DECLARE(app_coredump_finish, app_coredump_finish_fn);
@@ -270,4 +268,11 @@ typedef void (*unregister_framebuffer_fn)(void *, struct fb_info *);
 EVENTHANDLER_DECLARE(register_framebuffer, register_framebuffer_fn);
 EVENTHANDLER_DECLARE(unregister_framebuffer, unregister_framebuffer_fn);
 
+/* Swap pager device events */
+struct swdevt;
+typedef void (*swapon_fn)(void *, struct swdevt *);
+typedef void (*swapoff_fn)(void *, struct swdevt *);
+EVENTHANDLER_DECLARE(swapon, swapon_fn);
+EVENTHANDLER_DECLARE(swapoff, swapoff_fn);
+
 #endif /* _SYS_EVENTHANDLER_H_ */
diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h
index 8166d2e..bccbb2d 100644
--- a/sys/sys/vmmeter.h
+++ b/sys/sys/vmmeter.h
@@ -97,6 +97,7 @@ struct vmmeter {
 	u_int v_inactive_target; /* (c) pages desired inactive */
 	u_int v_inactive_count;	/* (q) pages inactive */
 	u_int v_laundry_count;	/* (q) pages dirty */
+	u_int v_stasis_count;	/* (q) pages dirty and non-reclaimable */
 	u_int v_cache_count;	/* (f) pages on cache queue */
 	u_int v_pageout_free_min;   /* (c) min pages reserved for kernel */
 	u_int v_interrupt_free_min; /* (c) reserved pages for int code */
@@ -112,7 +113,6 @@ struct vmmeter {
 	u_int v_vforkpages;	/* (p) VM pages affected by vfork() */
 	u_int v_rforkpages;	/* (p) VM pages affected by rfork() */
 	u_int v_kthreadpages;	/* (p) VM pages affected by fork() by kernel */
-	u_int v_spare[1];
 };
 
 #ifdef _KERNEL
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index f68e166..30fbc2a 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -2138,6 +2138,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks,
 	swapon_check_swzone(swap_total / PAGE_SIZE);
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
+	EVENTHANDLER_INVOKE(swapon, sp);
 }
 
 /*
@@ -2240,6 +2241,7 @@ swapoff_one(struct swdevt *sp, struct ucred *cred)
 	}
 	swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
 	mtx_unlock(&sw_dev_mtx);
+	EVENTHANDLER_INVOKE(swapoff, sp);
 
 	/*
 	 * Page in the contents of the device and close it.
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index ee55dc8..bb462d3 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -304,6 +304,7 @@ VM_STATS_VM(v_active_count, "Active pages");
 VM_STATS_VM(v_inactive_target, "Desired inactive pages");
 VM_STATS_VM(v_inactive_count, "Inactive pages");
 VM_STATS_VM(v_laundry_count, "Dirty pages");
+VM_STATS_VM(v_stasis_count, "Pages in stasis");
 VM_STATS_VM(v_cache_count, "Pages on cache queue");
 VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
 VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 54270f6..5d922c2 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -394,6 +394,11 @@ vm_page_domain_init(struct vm_domain *vmd)
 	    "vm laundry pagequeue";
 	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
 	    &vm_cnt.v_laundry_count;
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_STASIS].pq_name) =
+	    "vm stasis pagequeue";
+	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_STASIS].pq_vcnt) =
+	    &vm_cnt.v_stasis_count;
+
 	vmd->vmd_page_count = 0;
 	vmd->vmd_free_count = 0;
 	vmd->vmd_segs = 0;
@@ -3201,6 +3206,30 @@ vm_page_launder(vm_page_t m)
 }
 
 /*
+ * vm_page_enter_stasis
+ *
+ * Put a page in stasis. Such pages are swap-backed and cannot be
+ * reclaimed because zero swap devices are configured, or all available
+ * devices are full. If the page is already in stasis, it will be
+ * requeued.
+ */
+void
+vm_page_enter_stasis(vm_page_t m)
+{
+
+	KASSERT(m->wire_count == 0, ("page %p is wired", m));
+	KASSERT(m->dirty != 0, ("page %p is clean", m));
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m));
+	KASSERT(m->object->type == OBJT_DEFAULT || m->object->type == OBJT_SWAP,
+	    ("invalid object type %d", m->object->type));
+
+	vm_page_assert_locked(m);
+	if (m->queue != PQ_NONE)
+		vm_page_dequeue(m);
+	vm_page_enqueue(PQ_STASIS, m);
+}
+
+/*
  * vm_page_try_to_free()
  *
  * Attempt to free the page. If we cannot free it, we do nothing.
@@ -3264,7 +3293,7 @@ vm_page_advise(vm_page_t m, int advice)
 		vm_page_dirty(m);
 
 	/*
-	 * Place clean pages at the head of the inactive queue rather than the
+	 * Place clean pages near the head of the inactive queue rather than the
 	 * tail, thus defeating the queue's LRU operation and ensuring that the
 	 * page will be reused quickly.
 	 */
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index c41104c..84f1401 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -207,7 +207,8 @@ struct vm_page {
 #define	PQ_INACTIVE	0
 #define	PQ_ACTIVE	1
 #define	PQ_LAUNDRY	2
-#define	PQ_COUNT	3
+#define	PQ_STASIS	3
+#define	PQ_COUNT	4
 
 TAILQ_HEAD(pglist, vm_page);
 SLIST_HEAD(spglist, vm_page);
@@ -460,6 +461,7 @@ void vm_page_deactivate (vm_page_t);
 void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
+void vm_page_enter_stasis(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
@@ -715,7 +717,7 @@ static inline bool
 vm_page_in_laundry(vm_page_t m)
 {
 
-	return (m->queue == PQ_LAUNDRY);
+	return (m->queue == PQ_LAUNDRY || m->queue == PQ_STASIS);
 }
 
 #endif /* _KERNEL */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index f54c302..6896c1c 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -185,6 +185,7 @@ static int vm_swap_idle_enabled = 0;
 static int vm_swap_enabled = 1;
 static int vm_swap_idle_enabled = 0;
 #endif
+static int vm_swapdev_cnt = 0;
 
 static int vm_panic_on_oom = 0;
 
@@ -241,6 +242,8 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
 static void vm_pageout_launder(struct vm_domain *vmd);
 static void vm_pageout_laundry_worker(void *arg);
+static void vm_pageout_swapon(void *arg, struct swdevt *sp __unused);
+static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused);
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
@@ -567,12 +570,23 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
-			 * If page couldn't be paged out, then reactivate the
-			 * page so it doesn't clog the XXX list. (We
-			 * will try paging out it again later).
+			 * If we've run out of swap space or there are no swap
+			 * devices to begin with, place the page in stasis. If
+			 * swap devices are available, we will periodically
+			 * re-attempt a pageout.
+			 *
+			 * For other types of errors, reactivate the page to
+			 * avoid clogging the laundry queue. This ensures that
+			 * the pagedaemon will continue aggressively scanning
+			 * the active queue if necessary.
 			 */
 			vm_page_lock(mt);
-			vm_page_activate(mt);	// XXX
+			if (pageout_status[i] == VM_PAGER_FAIL &&
+			    mt->dirty != 0 && (object->type == OBJT_DEFAULT ||
+			    object->type == OBJT_SWAP))
+				vm_page_enter_stasis(mt);
+			else
+				vm_page_activate(mt);
 			vm_page_unlock(mt);
 			if (eio != NULL && i >= mreq && i - mreq < runlen)
 				*eio = TRUE;
@@ -882,7 +896,8 @@ vm_pageout_launder(struct vm_domain *vmd)
 	vm_page_t m, next;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
-	int act_delta, error, launder, maxscan, numpagedout, vnodes_skipped;
+	int act_delta, error, launder, maxscan, numpagedout, pass;
+	int vnodes_skipped;
 	boolean_t pageout_ok, queue_locked;
 
 	/*
@@ -903,17 +918,37 @@ vm_pageout_launder(struct vm_domain *vmd)
 	vnodes_skipped = 0;
 
 	/*
-	 * Scan the laundry queue for pages eligible to be laundered. We stop
-	 * once the target number of dirty pages have been laundered, or once
-	 * we've reached the end of the queue. A single iteration of this loop
-	 * may cause more than one page to be laundered because of clustering.
+	 * Scan the laundry and stasis queues for pages eligible to be
+	 * laundered. We stop once the target number of dirty pages have been
+	 * laundered, or once we've reached the ends of the queues. A single
+	 * iteration of this loop may cause more than one page to be laundered
+	 * because of clustering.
+	 *
+	 * Pages in stasis are only examined when at least one swap device is
+	 * available. We don't attempt to scan the entire stasis queue; it is
+	 * likely that many if not all of its pages cannot be paged out. For
+	 * now we only attempt to examine the target number of pages before
+	 * falling back to the laundry queue.
 	 *
-	 * maxscan ensures that we don't re-examine requeued pages. Any
-	 * additional pages written as part of a cluster are subtracted from
-	 * maxscan since they must be taken from the laundry queue.
+	 * During the laundry queue scan, maxscan ensures that we don't
+	 * re-examine requeued pages. Any additional pages written as part of
+	 * a cluster are subtracted from maxscan since they must be taken from
+	 * the laundry queue.
 	 */
-	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
-	maxscan = pq->pq_cnt;
+	pass = 0;
+scan:
+	if (pass == 0) {
+		if (vm_swapdev_cnt == 0) {
+			pass++;
+			goto scan;
+		}
+		pq = &vmd->vmd_pagequeues[PQ_STASIS];
+		maxscan = launder;
+	} else {
+		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+		maxscan = pq->pq_cnt;
+	}
+
 	vm_pagequeue_lock(pq);
 	queue_locked = TRUE;
 	for (m = TAILQ_FIRST(&pq->pq_pl);
@@ -927,9 +962,11 @@ vm_pageout_launder(struct vm_domain *vmd)
 		if ((m->flags & PG_MARKER) != 0)
 			continue;
 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
-		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
+		    ("PG_FICTITIOUS page %p cannot be in queue %d", m,
+		    m->queue));
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
-		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
+		    ("VPO_UNMANAGED page %p cannot be in queue %d", m,
+		    m->queue));
 		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
 			vm_page_unlock(m);
 			continue;
 		}
@@ -944,9 +981,8 @@ vm_pageout_launder(struct vm_domain *vmd)
 		}
 
 		/*
-		 * We unlock the laundry queue, invalidating the
-		 * 'next' pointer. Use our marker to remember our
-		 * place.
+		 * We unlock the queue, invalidating the 'next' pointer. Use
+		 * our marker to remember our place.
 		 */
 		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
 		    plinks.q);
@@ -1036,8 +1072,7 @@ requeue_page:
 		if (error == 0) {
 			launder -= numpagedout;
 			maxscan -= numpagedout - 1;
-		}
-		else if (error == EDEADLK) {
+		} else if (error == EDEADLK) {
 			pageout_lock_miss++;
 			vnodes_skipped++;
 		}
@@ -1057,6 +1092,15 @@ relock_queue:
 	vm_pagequeue_unlock(pq);
 
 	/*
+	 * If we didn't meet our target with the stasis queue, try again using
+	 * the laundry queue.
+	 */
+	if (pass == 0 && launder > 0) {
+		pass++;
+		goto scan;
+	}
+
+	/*
 	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
 	 * and we didn't launder enough pages.
 	 */
@@ -1079,6 +1123,14 @@ vm_pageout_laundry_worker(void *arg)
 	vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
 
 	/*
+	 * Calls to these handlers are serialized by the swapconf lock.
+	 */
+	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain,
+	    EVENTHANDLER_PRI_ANY);
+	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain,
+	    EVENTHANDLER_PRI_ANY);
+
+	/*
 	 * The pageout laundry worker is never done, so loop forever.
 	 */
 	for (;;) {
@@ -1762,6 +1814,44 @@ vm_pageout_worker(void *arg)
 	}
 }
 
+static void
+vm_pageout_swapon(void *arg, struct swdevt *sp __unused)
+{
+	struct vm_domain *vmd;
+	struct vm_pagequeue *pq;
+	vm_page_t m, next;
+
+	vmd = arg;
+	if (vm_swapdev_cnt++ == 0) {
+		pq = &vmd->vmd_pagequeues[PQ_STASIS];
+		/*
+		 * We now have a swap device, so migrate pages back to the
+		 * laundry queue. Locking rules make this somewhat awkward, but
+		 * this is a rare operation.
+		 */
+		vm_pagequeue_lock(pq);
+		while ((m = TAILQ_FIRST(&pq->pq_pl)) != NULL) {
+			if (!vm_pageout_page_lock(m, &next)) {
+				vm_page_unlock(m);
+				continue;
+			}
+			vm_page_dequeue_locked(m);
+			vm_pagequeue_unlock(pq);
+			vm_page_launder(m);
+			vm_page_unlock(m);
+			vm_pagequeue_lock(pq);
+		}
+		vm_pagequeue_unlock(pq);
+	}
+}
+
+static void
+vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
+{
+
+	vm_swapdev_cnt--;
+}
+
 /*
  * vm_pageout_init initialises basic pageout daemon settings.
  */
diff --git a/usr.bin/systat/vmstat.c b/usr.bin/systat/vmstat.c
index 382405d..cdad416 100644
--- a/usr.bin/systat/vmstat.c
+++ b/usr.bin/systat/vmstat.c
@@ -107,7 +107,8 @@ static struct Info {
 	u_int v_wire_count;	/* number of pages wired down */
 	u_int v_active_count;	/* number of pages active */
 	u_int v_inactive_count;	/* number of pages inactive */
-	u_int v_laundry_count;	/* number of pages in laundry queue */
+	u_int v_laundry_count;	/* number of pages in laundry */
+	u_int v_stasis_count;	/* number of pages in stasis */
 	u_long v_kmem_map_size;	/* Current kmem allocation size */
 	struct vmtotal Total;
 	struct nchstats nchstats;
@@ -520,9 +521,10 @@ showkre(void)
 	putint(pgtokb(s.v_active_count), VMSTATROW + 13, VMSTATCOL, 8);
 	putint(pgtokb(s.v_inactive_count), VMSTATROW + 14, VMSTATCOL, 8);
 	putint(pgtokb(s.v_laundry_count), VMSTATROW + 15, VMSTATCOL, 8);
-	putint(pgtokb(s.v_free_count), VMSTATROW + 16, VMSTATCOL, 8);
+	putint(pgtokb(s.v_stasis_count), VMSTATROW + 16, VMSTATCOL, 8);
+	putint(pgtokb(s.v_free_count), VMSTATROW + 17, VMSTATCOL, 8);
 	if (LINES - 1 > VMSTATROW + 17)
-		putint(s.bufspace / 1024, VMSTATROW + 17, VMSTATCOL, 8);
+		putint(s.bufspace / 1024, VMSTATROW + 18, VMSTATCOL, 8);
 	PUTRATE(v_vnodein, PAGEROW + 2, PAGECOL + 6, 5);
 	PUTRATE(v_vnodeout, PAGEROW + 2, PAGECOL + 12, 5);
 	PUTRATE(v_swapin, PAGEROW + 2, PAGECOL + 19, 5);
diff --git a/usr.bin/top/machine.c b/usr.bin/top/machine.c
index 51bef52..42e3794 100644
--- a/usr.bin/top/machine.c
+++ b/usr.bin/top/machine.c
@@ -174,10 +174,10 @@ char *cpustatenames[] = {
 
 /* these are for detailing the memory statistics */
 
-int memory_stats[7];
+int memory_stats[8];
 char *memorynames[] = {
-	"K Active, ", "K Inact, ", "K Laundry, ", "K Wired, ", "K Buf, ",
-	"K Free", NULL
+	"K Active, ", "K Inact, ", "K Laundry, ", "K Stasis, ", "K Wired, ",
+	"K Buf, ", "K Free", NULL
 };
 
 int arc_stats[7];
@@ -491,8 +491,9 @@ get_system_info(struct system_info *si)
 	GETSYSCTL("vm.stats.vm.v_active_count", memory_stats[0]);
 	GETSYSCTL("vm.stats.vm.v_inactive_count", memory_stats[1]);
 	GETSYSCTL("vm.stats.vm.v_laundry_count", memory_stats[2]);
-	GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[3]);
-	GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[5]);
+	GETSYSCTL("vm.stats.vm.v_stasis_count", memory_stats[3]);
+	GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[4]);
+	GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[6]);
 	GETSYSCTL("vm.stats.vm.v_swappgsin", nspgsin);
 	GETSYSCTL("vm.stats.vm.v_swappgsout", nspgsout);
 	/* convert memory stats to Kbytes */
@@ -500,9 +501,10 @@ get_system_info(struct system_info *si)
 	memory_stats[1] = pagetok(memory_stats[1]);
 	memory_stats[2] = pagetok(memory_stats[2]);
 	memory_stats[3] = pagetok(memory_stats[3]);
-	memory_stats[4] = bufspace / 1024;
-	memory_stats[5] = pagetok(memory_stats[5]);
-	memory_stats[6] = -1;
+	memory_stats[4] = pagetok(memory_stats[4]);
+	memory_stats[5] = bufspace / 1024;
+	memory_stats[6] = pagetok(memory_stats[6]);
+	memory_stats[7] = -1;
 
 	/* first interval */
 	if (swappgsin < 0) {
diff --git a/usr.bin/vmstat/vmstat.c b/usr.bin/vmstat/vmstat.c
index 952dd2f..0d8fd76 100644
--- a/usr.bin/vmstat/vmstat.c
+++ b/usr.bin/vmstat/vmstat.c
@@ -581,6 +581,7 @@ fill_vmmeter(struct vmmeter *vmmp)
 	GET_VM_STATS(vm, v_inactive_target);
 	GET_VM_STATS(vm, v_inactive_count);
 	GET_VM_STATS(vm, v_laundry_count);
+	GET_VM_STATS(vm, v_stasis_count);
 	GET_VM_STATS(vm, v_pageout_free_min);
 	GET_VM_STATS(vm, v_interrupt_free_min);
 	/*GET_VM_STATS(vm, v_free_severe);*/
-- 
2.7.2
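
A note on consuming the new events: other kernel code can hook swapon/swapoff
the same way vm_pageout_laundry_worker() does above. The sketch below is
illustrative only and not part of the patch; the "example" names and the
SYSINIT placement are assumptions, while the event names, handler signatures,
and EVENTHANDLER_REGISTER usage come from the declarations added in
sys/sys/eventhandler.h.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>

/*
 * Illustrative sketch: track the number of configured swap devices from a
 * hypothetical consumer.  Handler invocations are serialized by the
 * swapconf lock, so plain increments and decrements suffice.
 */
static int		example_swapdev_cnt;
static eventhandler_tag	example_swapon_tag, example_swapoff_tag;

static void
example_swapon(void *arg __unused, struct swdevt *sp __unused)
{

	example_swapdev_cnt++;
}

static void
example_swapoff(void *arg __unused, struct swdevt *sp __unused)
{

	example_swapdev_cnt--;
}

static void
example_swap_events_init(void *arg __unused)
{

	/* Keep the tags so the handlers could later be deregistered. */
	example_swapon_tag = EVENTHANDLER_REGISTER(swapon, example_swapon,
	    NULL, EVENTHANDLER_PRI_ANY);
	example_swapoff_tag = EVENTHANDLER_REGISTER(swapoff, example_swapoff,
	    NULL, EVENTHANDLER_PRI_ANY);
}
SYSINIT(example_swap_events, SI_SUB_VM_CONF, SI_ORDER_ANY,
    example_swap_events_init, NULL);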