From b47950f4551463a1c08908dd2a3fbae03beab3a3 Mon Sep 17 00:00:00 2001
From: Mark Johnston
Date: Sun, 21 Feb 2016 15:52:17 -0800
Subject: [PATCH 4/6] Add a basic laundering policy.

This policy stems from the notion that there are two reasons to launder
pages:

1. Shortfall, in which the inactive and free queues are depleted and the
   system _must_ launder dirty pages in order to reclaim memory.

2. Fairness: the system should periodically launder dirty pages to ensure
   that applications cannot excessively influence the system's memory
   reclamation behaviour.

Note that this does not imply that clean and dirty pages must be treated
equally: page laundering is an expensive operation.  However, the relative
costs of reclaiming a clean vs. a dirty page should be bounded in some
well-defined way, and in particular it should not be possible to force the
system to reclaim only clean pages indefinitely.  Under memory pressure the
system should eventually launder some dirty pages, even when clean inactive
pages are plentiful.  Thus, laundering targets are chosen based on the
current state of the paging queues.

In shortfall, the laundry thread attempts to meet the shortfall within
0.5s, the pagedaemon sleep period.  Because laundering is then the sole
source of clean pages, no attempt is made to limit the laundering rate:
the laundry thread goes all-out.

If the system is not in shortfall, the laundry thread may elect to launder
some dirty pages in an attempt to satisfy the fairness policy.  This is
referred to as background laundering.  Several conditions must be met for
background laundering to occur:

a) The laundry queue must contain a significant fraction of the system's
   inactive memory: if the number of dirty pages is minuscule, nothing is
   gained by laundering them.  Moreover, write clustering works better if
   the number of dirty pages is allowed to grow to some threshold before
   any laundering is performed.  The ratio of clean to dirty pages serves
   as the threshold here, controlled by bkgrd_launder_ratio.  By default,
   dirty pages must constitute at least 1% of inactive memory for
   background laundering to occur.

b) The number of free pages must be low.  If there is plentiful free
   memory, there is no reason to launder pages.  The number of free pages
   must be smaller than bkgrd_launder_thresh for background laundering to
   occur.  By default, this is the maximum of half the free page target
   and 3/2 of the pagedaemon wakeup threshold.  The idea is to start
   laundering before the pagedaemon wakes up.

c) The pagedaemon thread(s) must be active.  If the number of free pages
   is low but the system is not under memory pressure, we should not
   continue background laundering indefinitely.  We use vm_cnt.v_pdwakeups
   as a proxy for pagedaemon activity: when a background laundering run
   begins, the pdwakeups value is recorded; a second run cannot begin
   until pdwakeups has been incremented at least once.

When the conditions for background laundering are met, the laundry thread
determines the target number of pages and begins laundering.  It attempts
to meet the target within one second unless the corresponding laundering
rate would exceed bkgrd_launder_max (32768 pages per second by default).
The target is 0.5 * L * FT / I, where L is the laundry queue length, I is
the inactive queue length, and FT is the pagedaemon's free page target.
In particular, the number of pages laundered is proportional to the ratio
of dirty to clean inactive pages.
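For illustration only (not part of the patch), the background target and
pacing computation described above can be sketched as a small user-space
C program.  The constants, sample counter values and the standalone main()
are hypothetical, the value of VM_LAUNDER_INTERVAL is an assumption, and
plain 64-bit arithmetic stands in for the fixed-point shift used in the
kernel code below:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the tunables and constants named above. */
#define BKGRD_LAUNDER_MAX       32768   /* pages laundered per second (default) */
#define VM_LAUNDER_INTERVAL     10      /* laundry thread wakeups per second (assumed) */

/* Background target: 0.5 * L * FT / I, capped at 0.5 * FT. */
static uint64_t
bkgrd_target(uint64_t nlaundry, uint64_t ninact, uint64_t free_target)
{
        uint64_t target;

        if (nlaundry >= ninact)
                target = free_target;
        else
                target = nlaundry * free_target / ninact;
        return (target / 2);
}

int
main(void)
{
        /* Sample counters: 4GB of inactive pages (4KB each), ~5% dirty. */
        uint64_t ninact = 1048576, nlaundry = 52428, free_target = 131072;
        uint64_t target, tcycle;

        target = bkgrd_target(nlaundry, ninact, free_target);

        /*
         * Launder the target within one second (VM_LAUNDER_INTERVAL
         * wakeups), stretching the run if that would exceed the rate cap.
         */
        tcycle = VM_LAUNDER_INTERVAL;
        if (target > BKGRD_LAUNDER_MAX)
                tcycle = target * VM_LAUNDER_INTERVAL / BKGRD_LAUNDER_MAX;

        printf("target %" PRIu64 " pages over %" PRIu64 " wakeups "
            "(%" PRIu64 " pages per wakeup)\n",
            target, tcycle, target / tcycle);
        return (0);
}

With the sample values (about 5% of inactive memory dirty), the run targets
3276 pages and completes within one second; only when the target exceeds
bkgrd_launder_max would the run be stretched over more wakeups.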
---
 sys/vm/vm_pageout.c | 145 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 122 insertions(+), 23 deletions(-)

diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 4f0b5c1..9854569 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -233,6 +233,21 @@ SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW,
     &act_scan_laundry_weight, 0,
     "weight given to clean vs. dirty pages in active queue scans");
 
+static u_int bkgrd_launder_ratio = 100;
+SYSCTL_UINT(_vm, OID_AUTO, bkgrd_launder_ratio,
+    CTLFLAG_RW, &bkgrd_launder_ratio, 0,
+    "ratio of inactive to laundry pages to trigger background laundering");
+
+static u_int bkgrd_launder_max = 32768;
+SYSCTL_UINT(_vm, OID_AUTO, bkgrd_launder_max,
+    CTLFLAG_RW, &bkgrd_launder_max, 0,
+    "maximum background laundering rate, in pages per second");
+
+static u_int bkgrd_launder_thresh;
+SYSCTL_UINT(_vm, OID_AUTO, bkgrd_launder_thresh,
+    CTLFLAG_RW, &bkgrd_launder_thresh, 0,
+    "free page threshold below which background laundering may be started");
+
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
@@ -241,7 +256,8 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW,
     &vm_page_max_wired, 0, "System-wide limit to wired page count");
 
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static void vm_pageout_launder(struct vm_domain *vmd, struct vm_oom_state *oom);
+static int vm_pageout_launder(struct vm_domain *vmd, struct vm_oom_state *oom,
+    int launder);
 static void vm_pageout_laundry_worker(void *arg);
 static void vm_pageout_swapon(void *arg, struct swdevt *sp __unused);
 static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused);
@@ -889,33 +905,23 @@ unlock_mp:
 }
 
 /*
- * XXX
+ * Attempt to launder the specified number of pages.
+ *
+ * Returns the number of pages successfully laundered.
  */
-static void
-vm_pageout_launder(struct vm_domain *vmd, struct vm_oom_state *oom)
+static int
+vm_pageout_launder(struct vm_domain *vmd, struct vm_oom_state *oom, int launder)
 {
         vm_page_t m, next;
         struct vm_pagequeue *pq;
         vm_object_t object;
-        int act_delta, error, launder, maxscan, numpagedout, pass;
-        int starting_target, vnodes_skipped;
+        int act_delta, error, maxscan, numpagedout, pass, starting_target;
+        int vnodes_skipped;
         boolean_t pageout_ok, queue_locked;
 
-        /*
-         * Compute the number of pages we want to move from the laundry queue to
-         * the inactive queue.  If there is no shortage of clean, inactive
-         * pages, we allow laundering to proceed at a trickle to ensure that
-         * dirty pages will eventually be reused.  Otherwise, the inactive queue
-         * target is scaled by the ratio of the sleep intervals of the laundry
-         * queue and inactive queue worker threads.
-         */
-        launder = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count +
-            vm_paging_target() + vm_pageout_deficit;
-        if (launder < 0)
-                launder = 1;
-        else
-                launder /= VM_LAUNDER_RATE;
+        DTRACE_PROBE1(vm__launder, "int", launder);
+        starting_target = launder;
 
         vnodes_skipped = 0;
 
         /*
@@ -1112,26 +1118,41 @@ relock_queue:
         * If we failed to launder any pages, vote for OOM.
         */
        vm_pageout_mightbe_oom(oom, launder, starting_target);
+
+       return (starting_target - launder);
 }
 
 /*
- * XXX
+ * Perform the work of the laundry thread: periodically wake up and determine
+ * whether any pages need to be laundered.  If so, determine the number of pages
+ * that need to be laundered, and launder them.
  */
 static void
 vm_pageout_laundry_worker(void *arg)
 {
        struct vm_oom_state oom;
        struct vm_domain *domain;
-       int domidx;
+       uint64_t ninact, nlaundry;
+       int cycle, tcycle, domidx, gen, launder, laundered;
+       int shortfall, prev_shortfall, target;
 
        domidx = (uintptr_t)arg;
        domain = &vm_dom[domidx];
        KASSERT(domain->vmd_segs != 0, ("domain without segments"));
        vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
 
+       cycle = tcycle = 0;
+       gen = -1;
+       shortfall = prev_shortfall = 0;
+       target = 0;
+
        oom.oom_seq = 0;
        oom.oom_voted = FALSE;
 
+       if (bkgrd_launder_thresh == 0)
+               bkgrd_launder_thresh = max(vm_cnt.v_free_target / 2,
+                   3 * vm_pageout_wakeup_thresh / 2);
+
        /*
         * Calls to these handlers are serialized by the swapconf lock.
         */
@@ -1144,9 +1165,87 @@ vm_pageout_laundry_worker(void *arg)
         * The pageout laundry worker is never done, so loop forever.
         */
        for (;;) {
+               KASSERT(target >= 0, ("negative target %d", target));
+               launder = 0;
+
+               /*
+                * First determine whether we're in shortfall.  If so, there's
+                * an impending need for clean pages.  We attempt to launder the
+                * target within one pagedaemon sleep period.
+                */
+               shortfall = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count +
+                   vm_paging_target() + vm_pageout_deficit;
+               if (shortfall > 0) {
+                       /*
+                        * If the shortfall has grown since the last cycle or
+                        * we're still in shortfall despite a previous
+                        * laundering run, start a new run.
+                        */
+                       if (shortfall > prev_shortfall || cycle == tcycle) {
+                               target = shortfall;
+                               cycle = 0;
+                               tcycle = VM_LAUNDER_RATE;
+                       }
+                       prev_shortfall = shortfall;
+                       launder = target / (tcycle - (cycle % tcycle));
+                       goto launder;
+               } else {
+                       if (prev_shortfall > 0)
+                               /* We're out of shortfall; the target is met. */
+                               target = 0;
+                       shortfall = prev_shortfall = 0;
+               }
+
+               /*
+                * There's no immediate need to launder any pages; see if we
+                * meet the conditions to perform background laundering:
+                *
+                * 1. we haven't yet reached the target of the current
+                *    background laundering run, or
+                * 2. the ratio of dirty to clean inactive pages exceeds the
+                *    background laundering threshold and the free page count is
+                *    low.
+                *
+                * We don't start a new background laundering run unless the
+                * pagedaemon has been woken up at least once since the previous
+                * run.
+                */
+               if (target > 0 && cycle != tcycle) {
+                       /* Continue an ongoing background run. */
+                       launder = target / (tcycle - (cycle % tcycle));
+                       goto launder;
+               }
+
+               ninact = vm_cnt.v_inactive_count;
+               nlaundry = vm_cnt.v_laundry_count;
+               if (ninact > 0 &&
+                   vm_cnt.v_pdwakeups != gen &&
+                   vm_cnt.v_free_count < bkgrd_launder_thresh &&
+                   nlaundry * bkgrd_launder_ratio >= ninact) {
+                       cycle = 0;
+                       tcycle = VM_LAUNDER_INTERVAL;
+                       gen = vm_cnt.v_pdwakeups;
+                       if (nlaundry >= ninact)
+                               target = vm_cnt.v_free_target;
+                       else
+                               target = (nlaundry * vm_cnt.v_free_target << 16) /
+                                   ninact >> 16;
+                       target /= 2;
+                       if (target > bkgrd_launder_max)
+                               tcycle = target * VM_LAUNDER_INTERVAL /
+                                   bkgrd_launder_max;
+                       launder = target / (tcycle - (cycle % tcycle));
+               }
+
+launder:
+               if (launder > 0) {
+                       laundered = vm_pageout_launder(domain, &oom, launder);
+                       target -= min(laundered, target);
+               }
+
                tsleep(&vm_cnt.v_laundry_count, PVM, "laundr",
                    hz / VM_LAUNDER_INTERVAL);
-               vm_pageout_launder(domain, &oom);
+               cycle++;
        }
 }
-- 
2.7.2
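A note on the pacing expression shared by shortfall and background runs,
launder = target / (tcycle - (cycle % tcycle)): it spreads whatever remains
of a run's target evenly over the wakeups left in the run.  Below is a
minimal user-space simulation (not part of the patch; the values are
hypothetical and it assumes every requested page is successfully laundered):

#include <stdio.h>

int
main(void)
{
        int target = 1000;      /* hypothetical run target, in pages */
        int tcycle = 4;         /* wakeups allotted to the run */
        int cycle, launder;

        for (cycle = 0; cycle < tcycle && target > 0; cycle++) {
                /* Same pacing expression as the laundry thread's loop. */
                launder = target / (tcycle - (cycle % tcycle));
                /* Assume the full request is laundered this wakeup. */
                target -= launder;
                printf("cycle %d: launder %d pages, %d left\n",
                    cycle, launder, target);
        }
        return (0);
}

With target = 1000 and tcycle = 4 this requests 250 pages per wakeup; if a
wakeup launders fewer pages than requested, the remainder is folded into the
requests made during the run's remaining wakeups.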