sys/vm/vm_page.c | 108 +++++++++------- sys/vm/vm_page.h | 33 ++++- sys/vm/vm_pageout.c | 349 +++++++++++++++++++++++++++++++++++--------------- sys/vm/vm_phys.c | 46 ++++++- sys/vm/vm_phys.h | 11 ++ sys/x86/acpica/srat.c | 4 + 7 files changed, 394 insertions(+), 158 deletions(-) diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 2dde31f..a9e67dc 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -64,8 +64,7 @@ * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a - * page queue (vm_pagequeues[]), regardless of other locks or the - * busy state of a page. + * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. @@ -124,20 +123,7 @@ __FBSDID("$FreeBSD$"); * page structure. */ -struct vm_pagequeue vm_pagequeues[PQ_COUNT] = { - [PQ_INACTIVE] = { - .pq_pl = TAILQ_HEAD_INITIALIZER( - vm_pagequeues[PQ_INACTIVE].pq_pl), - .pq_cnt = &cnt.v_inactive_count, - .pq_name = "vm inactive pagequeue" - }, - [PQ_ACTIVE] = { - .pq_pl = TAILQ_HEAD_INITIALIZER( - vm_pagequeues[PQ_ACTIVE].pq_pl), - .pq_cnt = &cnt.v_active_count, - .pq_name = "vm active pagequeue" - } -}; +struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; @@ -256,6 +242,33 @@ vm_page_blacklist_lookup(char *list, vm_paddr_t pa) return (0); } +static void +vm_page_domain_init(struct vm_domain *vmd) +{ + struct vm_pagequeue *pq; + int i; + + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = + "vm inactive pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = + &cnt.v_inactive_count; + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = + "vm active pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = + &cnt.v_active_count; + vmd->vmd_fullintervalcount = 0; + vmd->vmd_page_count = 0; + vmd->vmd_free_count = 0; + vmd->vmd_segs = 0; + vmd->vmd_oom = FALSE; + for (i = 0; i < PQ_COUNT; i++) { + pq = &vmd->vmd_pagequeues[i]; + TAILQ_INIT(&pq->pq_pl); + mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", + MTX_DEF | MTX_DUPOK); + } +} + /* * vm_page_startup: * @@ -319,8 +332,8 @@ vm_page_startup(vm_offset_t vaddr) mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); - for (i = 0; i < PQ_COUNT; i++) - vm_pagequeue_init_lock(&vm_pagequeues[i]); + for (i = 0; i < vm_ndomains; i++) + vm_page_domain_init(&vm_dom[i]); /* * Allocate memory for use when boot strapping the kernel memory @@ -640,7 +653,7 @@ vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) } m->phys_addr = paddr; m->queue = PQ_NONE; - /* Fictitious pages don't use "segind". */ + m->segind = 0; /* Pretend we're in the first segment. */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". 
*/ m->oflags = VPO_BUSY | VPO_UNMANAGED; @@ -1054,7 +1067,7 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, ("vm_page_cache_free: page %p has inconsistent flags", m)); cnt.v_cache_count--; - cnt.v_free_count++; + vm_phys_freecnt_adj(m, 1); } empty = vm_radix_is_empty(&object->cache); mtx_unlock(&vm_page_queue_free_mtx); @@ -1310,7 +1323,7 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) ("vm_page_alloc: page %p is not free", m)); KASSERT(m->valid == 0, ("vm_page_alloc: free page %p is valid", m)); - cnt.v_free_count--; + vm_phys_freecnt_adj(m, -1); } /* @@ -1568,7 +1581,7 @@ vm_page_alloc_init(vm_page_t m) ("vm_page_alloc_init: page %p is not free", m)); KASSERT(m->valid == 0, ("vm_page_alloc_init: free page %p is valid", m)); - cnt.v_free_count--; + vm_phys_freecnt_adj(m, -1); if ((m->flags & PG_ZERO) != 0) vm_page_zero_count--; } @@ -1710,6 +1723,13 @@ vm_waitpfault(void) "pfault", 0); } +struct vm_pagequeue * +vm_page_pageq(vm_page_t m) +{ + + return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); +} + /* * vm_page_dequeue: * @@ -1725,11 +1745,11 @@ vm_page_dequeue(vm_page_t m) vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_dequeue: page %p is not queued", m)); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pageq(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, pageq); - (*pq->pq_cnt)--; + vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } @@ -1746,11 +1766,11 @@ vm_page_dequeue_locked(vm_page_t m) struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pageq(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, pageq); - (*pq->pq_cnt)--; + vm_pagequeue_cnt_dec(pq); } /* @@ -1766,11 +1786,11 @@ vm_page_enqueue(int queue, vm_page_t m) struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); - pq = &vm_pagequeues[queue]; + pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); - ++*pq->pq_cnt; + vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } @@ -1789,7 +1809,7 @@ vm_page_requeue(vm_page_t m) vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pageq(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, pageq); TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); @@ -1810,7 +1830,7 @@ vm_page_requeue_locked(vm_page_t m) KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pageq(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, pageq); TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); @@ -1947,7 +1967,7 @@ vm_page_free_toq(vm_page_t m) */ mtx_lock(&vm_page_queue_free_mtx); m->flags |= PG_FREE; - cnt.v_free_count++; + vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else @@ -2080,14 +2100,14 @@ _vm_page_deactivate(vm_page_t m, int athead) if (queue != PQ_NONE) vm_page_dequeue(m); m->flags &= ~PG_WINATCFLS; - pq = &vm_pagequeues[PQ_INACTIVE]; + pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; vm_pagequeue_lock(pq); m->queue = PQ_INACTIVE; if (athead) TAILQ_INSERT_HEAD(&pq->pq_pl, m, pageq); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); - cnt.v_inactive_count++; + vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } @@ -2889,18 +2909,16 @@ DB_SHOW_COMMAND(page, 
vm_page_print_page_info) DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { - - db_printf("PQ_FREE:"); - db_printf(" %d", cnt.v_free_count); - db_printf("\n"); - - db_printf("PQ_CACHE:"); - db_printf(" %d", cnt.v_cache_count); - db_printf("\n"); - - db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", - *vm_pagequeues[PQ_ACTIVE].pq_cnt, - *vm_pagequeues[PQ_INACTIVE].pq_cnt); + int dom; + + for (dom = 0; dom < vm_ndomains; dom++) { + db_printf("DOMAIN: %d ", dom); + db_printf("PQ_FREE: %d ", cnt.v_free_count); + db_printf("PQ_CACHE: %d ", cnt.v_cache_count); + db_printf("PQ_ACTIVE: %d PQ_INACTIVE: %d\n", + vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, + vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt); + } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 4fe5d7e..057dd24 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -181,18 +181,40 @@ TAILQ_HEAD(pglist, vm_page); struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; - int *const pq_cnt; - const char *const pq_name; + int pq_cnt; + int * const pq_vcnt; + const char * const pq_name; } __aligned(CACHE_LINE_SIZE); -extern struct vm_pagequeue vm_pagequeues[PQ_COUNT]; + +struct vm_domain { + struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; + int vmd_fullintervalcount; + u_int vmd_page_count; + u_int vmd_free_count; + long vmd_segs; /* bitmask of the segments */ + boolean_t vmd_oom; +}; + +extern struct vm_domain vm_dom[MAXMEMDOM]; #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) -#define vm_pagequeue_init_lock(pq) mtx_init(&(pq)->pq_mutex, \ - (pq)->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) +static __inline void +vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) +{ + +#ifdef notyet + vm_pagequeue_assert_locked(pq); +#endif + pq->pq_cnt += addend; + atomic_add_int(pq->pq_vcnt, addend); +} +#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) +#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) + extern struct mtx_padalign vm_page_queue_free_mtx; extern struct mtx_padalign pa_lock[]; @@ -393,6 +415,7 @@ boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); +struct vm_pagequeue *vm_page_pageq(vm_page_t m); vm_page_t vm_page_prev(vm_page_t m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 6d6e626..3f0e529 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -103,6 +104,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -114,7 +116,8 @@ __FBSDID("$FreeBSD$"); /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static int vm_pageout_clean(vm_page_t); -static void vm_pageout_scan(int pass); +static void vm_pageout_scan(struct vm_domain *vmd, int pass); +static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass); struct proc *pageproc; @@ -220,14 +223,15 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); -static boolean_t vm_pageout_launder(int, 
int, vm_paddr_t, vm_paddr_t); +static boolean_t vm_pageout_launder(struct vm_domain *vmd, int, int, + vm_paddr_t, vm_paddr_t); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); -static void vm_pageout_page_stats(void); +static void vm_pageout_page_stats(struct vm_domain *vmd); /* * Initialize a dummy page for marking the caller's place in the specified @@ -236,12 +240,13 @@ static void vm_pageout_page_stats(void); * count to one as safety precautions. */ static void -vm_pageout_init_marker(vm_page_t marker, u_short queue) +vm_pageout_init_marker(vm_page_t marker, int segind, u_short queue) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->oflags = VPO_BUSY; + marker->segind = segind; marker->queue = queue; marker->hold_count = 1; } @@ -270,8 +275,8 @@ vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) vm_object_t object; queue = m->queue; - vm_pageout_init_marker(&marker, queue); - pq = &vm_pagequeues[queue]; + vm_pageout_init_marker(&marker, m->segind, queue); + pq = vm_page_pageq(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq); @@ -312,8 +317,8 @@ vm_pageout_page_lock(vm_page_t m, vm_page_t *next) return (TRUE); queue = m->queue; - vm_pageout_init_marker(&marker, queue); - pq = &vm_pagequeues[queue]; + vm_pageout_init_marker(&marker, m->segind, queue); + pq = vm_page_pageq(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq); vm_pagequeue_unlock(pq); @@ -571,7 +576,8 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, } static boolean_t -vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high) +vm_pageout_launder(struct vm_domain *vmd, int queue, int tries, + vm_paddr_t low, vm_paddr_t high) { struct mount *mp; struct vm_pagequeue *pq; @@ -580,7 +586,7 @@ vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high) vm_paddr_t pa; vm_page_t m, m_tmp, next; - pq = &vm_pagequeues[queue]; + pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); TAILQ_FOREACH_SAFE(m, &pq->pq_pl, pageq, next) { KASSERT(m->queue == queue, @@ -665,7 +671,8 @@ vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high) void vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) { - int actl, actmax, inactl, inactmax; + int actl, actmax, inactl, inactmax, dom, initial_dom; + static int start_dom = 0; if (tries > 0) { /* @@ -681,19 +688,55 @@ vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) */ uma_reclaim(); } + + /* + * Make the next scan start on the next domain. + */ + initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains; + inactl = 0; inactmax = cnt.v_inactive_count; actl = 0; actmax = tries < 2 ? 0 : cnt.v_active_count; + dom = initial_dom; + + /* + * Scan domains in round-robin order, first inactive queues, + * then active. Since a domain usually owns a large physically + * contiguous chunk of memory, it makes sense to completely + * exhaust one domain before switching to the next, while growing + * the pool of contiguous physical pages. + * + * Do not even start laundering a domain which cannot contain + * the specified address range, as indicated by the segments + * constituting the domain. 
+ */ again: - if (inactl < inactmax && vm_pageout_launder(PQ_INACTIVE, tries, low, - high)) { - inactl++; - goto again; + if (inactl < inactmax) { + if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, + low, high) && + vm_pageout_launder(&vm_dom[dom], PQ_INACTIVE, tries, + low, high)) { + inactl++; + goto again; + } + if (++dom == vm_ndomains) + dom = 0; + if (dom != initial_dom) + goto again; } - if (actl < actmax && vm_pageout_launder(PQ_ACTIVE, tries, low, high)) { - actl++; - goto again; + if (actl < actmax) { + if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, + low, high) && + vm_pageout_launder(&vm_dom[dom], PQ_ACTIVE, tries, + low, high)) { + actl++; + goto again; + } + if (++dom == vm_ndomains) + dom = 0; + if (dom != initial_dom) + goto again; } } @@ -867,7 +910,7 @@ vm_pageout_map_deactivate_pages(map, desired) * vm_pageout_scan does the dirty work for the pageout daemon. */ static void -vm_pageout_scan(int pass) +vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; struct vm_page marker; @@ -880,7 +923,8 @@ vm_pageout_scan(int pass) int maxlaunder; boolean_t queues_locked; - vm_pageout_init_marker(&marker, PQ_INACTIVE); + KASSERT(vmd->vmd_segs != 0, ("domain without segments")); + vm_pageout_init_marker(&marker, ffsl(vmd->vmd_segs) - 1, PQ_INACTIVE); /* * Decrease registered cache sizes. @@ -894,7 +938,7 @@ vm_pageout_scan(int pass) /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the - * number of pages from cnt.v_inactive_count that should be + * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit); @@ -920,8 +964,6 @@ vm_pageout_scan(int pass) if (pass) maxlaunder = 10000; - maxscan = cnt.v_inactive_count; - /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or @@ -929,7 +971,8 @@ vm_pageout_scan(int pass) * is not used to form decisions for the inactive queue, only for the * active queue. */ - pq = &vm_pagequeues[PQ_INACTIVE]; + pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; + maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queues_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); @@ -1044,7 +1087,7 @@ vm_pageout_scan(int pass) /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted - * from cnt.v_inactive_count. See the + * from the inactive count. See the * calculation of the page_shortage for the * loop over the active queue below. */ @@ -1268,7 +1311,7 @@ relock_queues: * active queue to the inactive queue. */ page_shortage = vm_paging_target() + - cnt.v_inactive_target - cnt.v_inactive_count; + cnt.v_inactive_target - cnt.v_inactive_count; page_shortage += addl_page_shortage; /* @@ -1276,8 +1319,8 @@ relock_queues: * track the per-page activity counter and use it to locate * deactivation candidates. */ - pcount = cnt.v_active_count; - pq = &vm_pagequeues[PQ_ACTIVE]; + pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; + pcount = pq->pq_cnt; vm_pagequeue_lock(pq); m = TAILQ_FIRST(&pq->pq_pl); while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { @@ -1382,9 +1425,10 @@ relock_queues: vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* - * Idle process swapout -- run once per second. + * Idle process swapout -- run once per second, only from the + * pageout thread for the domain zero. 
*/ - if (vm_swap_idle_enabled) { + if (vm_swap_idle_enabled && vmd == &vm_dom[0]) { static long lsec; if (time_second != lsec) { vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } } @@ -1414,12 +1458,54 @@ relock_queues: * chance to flush out dirty vnode-backed pages and to allow * active pages to be moved to the inactive queue and reclaimed. */ - if (pass != 0 && - ((swap_pager_avail < 64 && vm_page_count_min()) || - (swap_pager_full && vm_paging_target() > 0))) - vm_pageout_oom(VM_OOM_MEM); + vm_pageout_mightbe_oom(vmd, pass); } +static int vm_pageout_oom_vote; + +/* + * The pagedaemon threads randomly select one to perform the + * OOM. Trying to kill processes before all pagedaemons + * have failed to reach the free page target is premature. + */ +static void +vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass) +{ + int old_vote; + + if (pass == 0 || !((swap_pager_avail < 64 && vm_page_count_min()) || + (swap_pager_full && vm_paging_target() > 0))) { + if (vmd->vmd_oom) { + vmd->vmd_oom = FALSE; + atomic_subtract_int(&vm_pageout_oom_vote, 1); + } + return; + } + + if (vmd->vmd_oom) + return; + + vmd->vmd_oom = TRUE; + old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); + if (old_vote != vm_ndomains - 1) + return; + + /* + * The current pagedaemon thread is the last in the quorum to + * start OOM. Initiate the selection and signaling of the + * victim. + */ + vm_pageout_oom(VM_OOM_MEM); + + /* + * After one round of OOM terror, recall our vote. On the + * next pass, the current pagedaemon will vote again if the low + * memory condition is still there, due to vmd_oom being + * false. + */ + vmd->vmd_oom = FALSE; + atomic_subtract_int(&vm_pageout_oom_vote, 1); +} void vm_pageout_oom(int shortage) @@ -1522,14 +1608,13 @@ vm_pageout_oom(int shortage) * helps the situation where paging just starts to occur. */ static void -vm_pageout_page_stats(void) +vm_pageout_page_stats(struct vm_domain *vmd) { struct vm_pagequeue *pq; vm_object_t object; vm_page_t m, next; int pcount, tpcount; /* Number of pages to check */ - static int fullintervalcount = 0; - int page_shortage; + int actcount, page_shortage; page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) - @@ -1538,25 +1623,30 @@ vm_pageout_page_stats(void) if (page_shortage <= 0) return; - pcount = cnt.v_active_count; - fullintervalcount += vm_pageout_stats_interval; - if (fullintervalcount < vm_pageout_full_stats_interval) { - vm_pageout_stats++; - tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count / - cnt.v_page_count; + pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; + + /* + * pcount limits the depth of the queue scan. In particular, + * for the full scan, it prevents the iteration from looking + * into the requeued pages. The limit is not exact since the + * page queue lock is dropped during the iteration. 
+ */ + pcount = pq->pq_cnt; + vmd->vmd_fullintervalcount += vm_pageout_stats_interval; + if (vmd->vmd_fullintervalcount < vm_pageout_full_stats_interval) { + atomic_add_int(&vm_pageout_stats, 1); + tpcount = (int64_t)vm_pageout_stats_max * pcount / + vmd->vmd_page_count; if (pcount > tpcount) pcount = tpcount; } else { - vm_pageout_full_stats++; - fullintervalcount = 0; + atomic_add_int(&vm_pageout_full_stats, 1); + vmd->vmd_fullintervalcount = 0; } - pq = &vm_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); m = TAILQ_FIRST(&pq->pq_pl); - while ((m != NULL) && (pcount-- > 0)) { - int actcount; - + while (m != NULL && pcount-- > 0) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_page_stats: page %p isn't active", m)); @@ -1581,11 +1671,11 @@ vm_pageout_page_stats(void) } /* - * Don't deactivate pages that are busy. + * Don't deactivate pages that are busy or held. */ - if ((m->busy != 0) || - (m->oflags & VPO_BUSY) || - (m->hold_count != 0)) { + if (m->busy != 0 || + (m->oflags & VPO_BUSY) != 0 || + m->hold_count != 0) { vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); vm_page_requeue_locked(m); @@ -1600,7 +1690,7 @@ vm_pageout_page_stats(void) } actcount += pmap_ts_referenced(m); - if (actcount) { + if (actcount != 0) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; @@ -1632,13 +1722,103 @@ vm_pageout_page_stats(void) vm_pagequeue_unlock(pq); } +static void +vm_pageout_worker(void *arg) +{ + struct vm_domain *domain; + struct pcpu *pc; + int cpu, error, ndomain, pass; + + pass = 0; + ndomain = (uintptr_t)arg; + domain = &vm_dom[ndomain]; + + /* + * XXXKIB The bind is rather arbitrary. With some minor + * complications, we could assign the cpuset consisting of all + * CPUs in the same domain. In fact, it even does not matter + * if the CPU we bind to is in the affinity domain of this + * page queue, we only need to establish the fair distribution + * of pagedaemon threads among CPUs. + * + * XXXKIB It would be useful to allocate vm_pages for the + * domain from the domain, and put pcpu area into the page + * owned by the domain. + */ + if (mem_affinity != NULL) { + CPU_FOREACH(cpu) { + pc = pcpu_find(cpu); + if (pc->pc_domain == ndomain) { + thread_lock(curthread); + sched_bind(curthread, cpu); + thread_unlock(curthread); + break; + } + } + } + + /* + * The pageout daemon worker is never done, so loop forever. + */ + while (TRUE) { + /* + * If we have enough free memory, wakeup waiters. Do + * not clear vm_pages_needed until we reach our target, + * otherwise we may be woken up over and over again and + * waste a lot of cpu. + */ + mtx_lock(&vm_page_queue_free_mtx); + if (vm_pages_needed && !vm_page_count_min()) { + if (!vm_paging_needed()) + vm_pages_needed = 0; + wakeup(&cnt.v_free_count); + } + if (vm_pages_needed) { + /* + * Still not done, take a second pass without waiting + * (unlimited dirty cleaning), otherwise sleep a bit + * and try again. + */ + ++pass; + if (pass > 1) + msleep(&vm_pages_needed, + &vm_page_queue_free_mtx, PVM, "psleep", + hz / 2); + } else { + /* + * Good enough, sleep & handle stats. Prime the pass + * for the next run. 
*/ + if (pass > 1) + pass = 1; + else + pass = 0; + error = msleep(&vm_pages_needed, + &vm_page_queue_free_mtx, PVM, "psleep", + vm_pageout_stats_interval * hz); + if (error && !vm_pages_needed) { + mtx_unlock(&vm_page_queue_free_mtx); + pass = 0; + vm_pageout_page_stats(domain); + continue; + } + } + if (vm_pages_needed) + cnt.v_pdwakeups++; + mtx_unlock(&vm_page_queue_free_mtx); + vm_pageout_scan(domain, pass); + } +} + /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { - int error, pass; +#if MAXMEMDOM > 1 + int error, i; +#endif /* * Initialize some paging parameters. @@ -1708,58 +1888,17 @@ vm_pageout(void) vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; swap_pager_swap_init(); - pass = 0; - /* - * The pageout daemon is never done, so loop forever. - */ - while (TRUE) { - /* - * If we have enough free memory, wakeup waiters. Do - * not clear vm_pages_needed until we reach our target, - * otherwise we may be woken up over and over again and - * waste a lot of cpu. - */ - mtx_lock(&vm_page_queue_free_mtx); - if (vm_pages_needed && !vm_page_count_min()) { - if (!vm_paging_needed()) - vm_pages_needed = 0; - wakeup(&cnt.v_free_count); - } - if (vm_pages_needed) { - /* - * Still not done, take a second pass without waiting - * (unlimited dirty cleaning), otherwise sleep a bit - * and try again. - */ - ++pass; - if (pass > 1) - msleep(&vm_pages_needed, - &vm_page_queue_free_mtx, PVM, "psleep", - hz / 2); - } else { - /* - * Good enough, sleep & handle stats. Prime the pass - * for the next run. - */ - if (pass > 1) - pass = 1; - else - pass = 0; - error = msleep(&vm_pages_needed, - &vm_page_queue_free_mtx, PVM, "psleep", - vm_pageout_stats_interval * hz); - if (error && !vm_pages_needed) { - mtx_unlock(&vm_page_queue_free_mtx); - pass = 0; - vm_pageout_page_stats(); - continue; - } +#if MAXMEMDOM > 1 + for (i = 1; i < vm_ndomains; i++) { + error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, + curproc, NULL, 0, 0, "dom%d", i); + if (error != 0) { + panic("starting pageout for domain %d, error %d\n", + i, error); } - if (vm_pages_needed) - cnt.v_pdwakeups++; - mtx_unlock(&vm_page_queue_free_mtx); - vm_pageout_scan(pass); } +#endif + vm_pageout_worker((void *)(uintptr_t)0); } /* diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index b871d79..6639092 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -65,6 +65,9 @@ __FBSDID("$FreeBSD$"); #include #include +_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, + "Too many physsegs."); + struct vm_freelist { struct pglist pl; int lcnt; @@ -140,6 +143,21 @@ vm_rr_selectdomain(void) #endif } +boolean_t +vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high) +{ + struct vm_phys_seg *s; + int idx; + + while ((idx = ffsl(mask) - 1) >= 0) { /* ffsl() counts bits from 1 */ + mask &= ~(1UL << idx); + s = &vm_phys_segs[idx]; + if (low < s->end && high > s->start) + return (TRUE); + } + return (FALSE); +} + /* * Outputs the state of the physical memory allocator, specifically, * the amount of physical memory in each free list. 
@@ -378,12 +396,16 @@ void vm_phys_add_page(vm_paddr_t pa) { vm_page_t m; + struct vm_domain *vmd; cnt.v_page_count++; m = vm_phys_paddr_to_vm_page(pa); m->phys_addr = pa; m->queue = PQ_NONE; m->segind = vm_phys_paddr_to_segind(pa); + vmd = vm_phys_domain(m); + vmd->vmd_page_count++; + vmd->vmd_segs |= 1UL << m->segind; m->flags = PG_FREE; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_add_page: page %p has unexpected order %d", @@ -391,7 +413,7 @@ vm_phys_add_page(vm_paddr_t pa) m->pool = VM_FREEPOOL_DEFAULT; pmap_page_init(m); mtx_lock(&vm_page_queue_free_mtx); - cnt.v_free_count++; + vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m, 0); mtx_unlock(&vm_page_queue_free_mtx); } @@ -633,6 +655,24 @@ vm_phys_paddr_to_segind(vm_paddr_t pa) } /* + * vm_phys_domain: + * + * Return the memory domain the page belongs to. + */ +struct vm_domain * +vm_phys_domain(vm_page_t m) +{ + int domn, segind; + + /* XXXKIB try to assert that the page is managed */ + segind = m->segind; + KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m)); + domn = vm_phys_segs[segind].domain; + KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m)); + return (&vm_dom[domn]); +} + +/* * Free a contiguous, power of two-sized set of physical pages. * * The free page queues must be locked. @@ -814,12 +854,12 @@ vm_phys_zero_pages_idle(void) for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) { if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) { vm_phys_unfree_page(m_tmp); - cnt.v_free_count--; + vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); pmap_zero_page_idle(m_tmp); m_tmp->flags |= PG_ZERO; mtx_lock(&vm_page_queue_free_mtx); - cnt.v_free_count++; + vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m_tmp, 0); vm_page_zero_count++; cnt_prezero++; diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h index 9812816..7d01c08 100644 --- a/sys/vm/vm_phys.h +++ b/sys/vm/vm_phys.h @@ -58,6 +58,8 @@ vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); vm_page_t vm_phys_alloc_freelist_pages(int flind, int pool, int order); vm_page_t vm_phys_alloc_pages(int pool, int order); +struct vm_domain *vm_phys_domain(vm_page_t m); +boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high); int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr); void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end); @@ -70,5 +72,14 @@ void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); boolean_t vm_phys_zero_pages_idle(void); +static __inline void +vm_phys_freecnt_adj(vm_page_t m, int adj) +{ + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + cnt.v_free_count += adj; + vm_phys_domain(m)->vmd_free_count += adj; +} + #endif /* _KERNEL */ #endif /* !_VM_PHYS_H_ */ diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c index 7ea715e..8b5082c 100644 --- a/sys/x86/acpica/srat.c +++ b/sys/x86/acpica/srat.c @@ -31,10 +31,14 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include +#include #include #include #include +#include #include #include
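The sketches below are userspace models of the mechanisms this patch introduces; they are illustrative only, every identifier in them is invented for the example, and none of them is kernel code. The first one models the two-step lookup behind vm_phys_domain() and vm_page_pageq(): a page records the physical segment it lives in (segind), each segment records its domain, and the per-domain queue array is then indexed with the page's queue id.

#include <stdio.h>

#define NQUEUES 2                       /* rough analogue of PQ_COUNT */

struct page {
        int segind;                     /* physical segment the page is in */
        int queue;                      /* which paging queue it sits on */
};

struct domain {
        int queue_len[NQUEUES];         /* stand-in for vmd_pagequeues[] */
};

/* Per-segment domain numbers, normally derived from the SRAT table. */
static const int seg_domain[] = { 0, 1, 1 };
static struct domain doms[2];

/* Models vm_phys_domain(): page -> segment -> domain. */
static struct domain *
page_domain(const struct page *p)
{
        return (&doms[seg_domain[p->segind]]);
}

/* Models vm_page_pageq(): the queue (here just its length) for a page. */
static int *
page_queue_len(const struct page *p)
{
        return (&page_domain(p)->queue_len[p->queue]);
}

int
main(void)
{
        struct page p = { .segind = 2, .queue = 1 };

        (*page_queue_len(&p))++;        /* "enqueue" the page */
        printf("domain 1, queue 1 length: %d\n", doms[1].queue_len[1]);
        return (0);
}

Keeping the domain reachable from the page this way is what lets vm_page_dequeue() and friends find the right per-domain queue lock without any global table of queues.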
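The counter changes keep two views of the same quantity: pq_cnt and vmd_free_count are per-queue or per-domain and protected by the corresponding lock, while the legacy global vmmeter fields that the rest of the kernel still reads are updated through pq_vcnt with an atomic add. A minimal model of vm_pagequeue_cnt_add(), with invented names (queue_cnt_add, global_cnt) and assuming the caller already holds the queue lock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int global_cnt;           /* models cnt.v_inactive_count */

struct queue {
        pthread_mutex_t lock;           /* models pq_mutex */
        int cnt;                        /* models pq_cnt, lock-protected */
};

/*
 * Models vm_pagequeue_cnt_add(): the caller holds the queue lock, so the
 * per-queue counter is a plain add, while the global counter shared with
 * every other queue is adjusted atomically.
 */
static void
queue_cnt_add(struct queue *q, int addend)
{
        q->cnt += addend;
        atomic_fetch_add(&global_cnt, addend);
}

int
main(void)
{
        struct queue q = { .lock = PTHREAD_MUTEX_INITIALIZER };

        pthread_mutex_lock(&q.lock);
        queue_cnt_add(&q, 1);           /* a page was enqueued */
        queue_cnt_add(&q, -1);          /* and dequeued again */
        pthread_mutex_unlock(&q.lock);

        printf("queue %d, global %d\n", q.cnt, atomic_load(&global_cnt));
        return (0);
}

Only the shared global needs the atomic; the per-queue field is already serialized by the queue lock, which is the same reasoning behind the "#ifdef notyet" lock assertion in vm_pagequeue_cnt_add().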
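vm_pageout_init_marker() now takes a segind so that a marker page resolves to the same domain queue as the pages around it. The point of markers is to let a scan drop the queue lock and later resume where it left off even if neighbours were removed in the meantime; the stand-alone sketch below shows that pattern with the <sys/queue.h> TAILQ macros (struct entry, is_marker and the removal in the middle are all made up for the demonstration).

#include <sys/queue.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
        TAILQ_ENTRY(entry) link;
        int id;
        bool is_marker;                 /* models PG_MARKER */
};

TAILQ_HEAD(entryq, entry);

int
main(void)
{
        struct entryq q = TAILQ_HEAD_INITIALIZER(q);
        struct entry pages[4], marker = { .id = -1, .is_marker = true };
        struct entry *e, *next;
        int i;

        for (i = 0; i < 4; i++) {
                pages[i].id = i;
                pages[i].is_marker = false;
                TAILQ_INSERT_TAIL(&q, &pages[i], link);
        }

        /*
         * Scan the queue.  While "the lock is dropped" at entry 1 the
         * marker holds our place; another thread may unlink entry 2,
         * yet the scan resumes correctly from the marker's successor.
         */
        e = TAILQ_FIRST(&q);
        while (e != NULL) {
                if (e->is_marker) {     /* skip markers of other scans */
                        e = TAILQ_NEXT(e, link);
                        continue;
                }
                printf("visiting %d\n", e->id);
                if (e->id == 1) {
                        TAILQ_INSERT_AFTER(&q, e, &marker, link);
                        /* lock dropped here; entry 2 goes away */
                        TAILQ_REMOVE(&q, &pages[2], link);
                        /* lock reacquired; pick up after the marker */
                        next = TAILQ_NEXT(&marker, link);
                        TAILQ_REMOVE(&q, &marker, link);
                        e = next;
                        continue;
                }
                e = TAILQ_NEXT(e, link);
        }
        return (0);
}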
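vm_pageout_grow_cache() rotates its starting domain with an atomic fetch-and-add on a static counter, reduced modulo vm_ndomains, so successive calls spread their laundering across domains instead of always draining the same one first. A hypothetical userspace rendering (next_start_domain() and NDOMAINS are invented names):

#include <stdatomic.h>
#include <stdio.h>

#define NDOMAINS 3                      /* pretend vm_ndomains == 3 */

/* Models the function-local "static int start_dom" in the patch. */
static atomic_uint start_dom;

/*
 * Return the domain the next scan starts from and advance the shared
 * counter; concurrent callers still get a round-robin spread without
 * taking any lock.
 */
static int
next_start_domain(void)
{
        return (atomic_fetch_add(&start_dom, 1) % NDOMAINS);
}

int
main(void)
{
        int i;

        for (i = 0; i < 6; i++)
                printf("scan starts at domain %d\n", next_start_domain());
        return (0);
}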
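The skip test used while rotating, vm_phys_domain_intersects(), walks the domain's segment bitmask with ffsl() and compares each segment against the requested [low, high) range. The model below uses __builtin_ffsl() as a stand-in and invented segment data; the detail worth keeping in mind is that the find-first-set primitive numbers bits from 1, so its result has to be decremented before it is used as a segment index.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical physical segments; the kernel keeps these in vm_phys_segs[]. */
struct seg {
        uint64_t start;
        uint64_t end;
};

static const struct seg segs[] = {
        { 0x00000000, 0x40000000 },     /* segment 0, domain 0 */
        { 0x40000000, 0x80000000 },     /* segment 1, domain 1 */
        { 0x80000000, 0xc0000000 },     /* segment 2, domain 1 */
};

/*
 * True if any segment whose bit is set in 'mask' overlaps [low, high).
 * __builtin_ffsl() returns a 1-based bit number (0 when no bit is set),
 * hence the "- 1" before the value is used as an index.
 */
static bool
domain_intersects(long mask, uint64_t low, uint64_t high)
{
        int idx;

        while ((idx = __builtin_ffsl(mask) - 1) >= 0) {
                mask &= ~(1L << idx);
                if (low < segs[idx].end && high > segs[idx].start)
                        return (true);
        }
        return (false);
}

int
main(void)
{
        /* In this made-up layout domain 1 owns segments 1 and 2. */
        long dom1 = (1L << 1) | (1L << 2);

        printf("%d\n", domain_intersects(dom1, 0x10000000, 0x20000000));
        printf("%d\n", domain_intersects(dom1, 0x50000000, 0x60000000));
        return (0);
}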
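vm_pageout_mightbe_oom() turns the old single-threaded OOM trigger into a vote: each per-domain pagedaemon that cannot reach its target adds one vote, and only the thread whose vote completes the quorum of vm_ndomains calls vm_pageout_oom(). A compile-and-run pthreads model of that hand-off, with invented names (mightbe_oom, trigger_oom, NDOM), built with cc -pthread:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NDOM 4                          /* pretend vm_ndomains == 4 */

static atomic_int oom_vote;             /* models vm_pageout_oom_vote */

static void
trigger_oom(int dom)
{
        /* Stands in for vm_pageout_oom(VM_OOM_MEM). */
        printf("domain %d completed the quorum, running OOM\n", dom);
}

/*
 * Called by a per-domain thread that failed to reach its free target.
 * atomic_fetch_add() returns the value before the increment, so exactly
 * one caller observes NDOM - 1; it performs the OOM and then recalls
 * its vote so a later shortage starts a fresh round.
 */
static void
mightbe_oom(int dom, bool *voted)
{
        if (*voted)
                return;
        *voted = true;
        if (atomic_fetch_add(&oom_vote, 1) != NDOM - 1)
                return;
        trigger_oom(dom);
        *voted = false;
        atomic_fetch_sub(&oom_vote, 1);
}

static void *
worker(void *arg)
{
        bool voted = false;

        mightbe_oom((int)(intptr_t)arg, &voted);
        return (NULL);
}

int
main(void)
{
        pthread_t tid[NDOM];
        int i;

        for (i = 0; i < NDOM; i++)
                pthread_create(&tid[i], NULL, worker, (void *)(intptr_t)i);
        for (i = 0; i < NDOM; i++)
                pthread_join(tid[i], NULL);
        return (0);
}

Requiring a full quorum is what the patch comment means by not killing processes prematurely: one domain falling short is not yet a system-wide emergency while the other pagedaemons can still reclaim.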
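Finally, each vm_pageout_worker() binds itself to a CPU that belongs to its domain by walking the per-CPU data and calling sched_bind(). The sketch below mirrors that selection in userspace, using the Linux pthread affinity extension (pthread_attr_setaffinity_np, sched_getcpu) purely as a stand-in for sched_bind(); the cpu_domain[] table is made up and the program assumes at least NCPU online CPUs.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>

#define NCPU 2
#define NDOM 2

/* Made-up CPU-to-domain table; the kernel consults pc_domain instead. */
static const int cpu_domain[NCPU] = { 0, 1 };

static void *
worker(void *arg)
{
        printf("worker for domain %d runs on CPU %d\n",
            (int)(intptr_t)arg, sched_getcpu());
        return (NULL);
}

/*
 * Start one worker per domain, pinned to the first CPU that belongs to
 * that domain, mirroring the CPU_FOREACH()/sched_bind() loop in
 * vm_pageout_worker().
 */
int
main(void)
{
        pthread_t tid[NDOM];
        pthread_attr_t attr;
        cpu_set_t set;
        int cpu, dom;

        for (dom = 0; dom < NDOM; dom++) {
                pthread_attr_init(&attr);
                for (cpu = 0; cpu < NCPU; cpu++) {
                        if (cpu_domain[cpu] != dom)
                                continue;
                        CPU_ZERO(&set);
                        CPU_SET(cpu, &set);
                        pthread_attr_setaffinity_np(&attr, sizeof(set), &set);
                        break;
                }
                pthread_create(&tid[dom], &attr, worker,
                    (void *)(intptr_t)dom);
                pthread_attr_destroy(&attr);
        }
        for (dom = 0; dom < NDOM; dom++)
                pthread_join(tid[dom], NULL);
        return (0);
}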