diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 938d92b0a988..d6805831fd27 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -5955,7 +5955,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((om->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) @@ -7140,7 +7140,7 @@ pmap_remove_pages(pmap_t pmap) pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) - if ((mt->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(mt) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } @@ -7158,7 +7158,7 @@ pmap_remove_pages(pmap_t pmap) pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - if ((m->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(m) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index e0abe3d6516d..dd3ddf0068ce 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -415,7 +415,8 @@ extern int pmap_pcid_enabled; extern int invpcid_works; #define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) -#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) +#define pmap_page_is_write_mapped(m) \ + ((vm_page_aflags(m) & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) struct thread; diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index 98b67799767d..ff42c997518c 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -3331,7 +3331,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); - if ((om->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) @@ -4391,7 +4391,7 @@ pmap_remove_pages(pmap_t pmap) pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) - if ((mt->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(mt) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } @@ -4413,7 +4413,7 @@ pmap_remove_pages(pmap_t pmap) TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - if ((m->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(m) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 3c38749e1b82..d54a2d1d14a1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1761,12 +1761,10 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, bcopy((char *)db->db_data + bufoff, va, PAGESIZE); zfs_unmap_page(sf); vm_page_valid(m); - vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); - vm_page_unlock(m); vm_page_sunbusy(m); } *rbehind = i; @@ -1884,12
+1882,10 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, } zfs_unmap_page(sf); vm_page_valid(m); - vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); - vm_page_unlock(m); vm_page_sunbusy(m); } *rahead = i; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index ebcc0ad92e0f..77ceda5b359d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -545,9 +545,7 @@ mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) zfs_vmobject_wlock(obj); if (error == 0) { vm_page_valid(pp); - vm_page_lock(pp); vm_page_activate(pp); - vm_page_unlock(pp); } vm_page_sunbusy(pp); if (error != 0 && vm_page_wired(pp) == 0 && diff --git a/sys/dev/virtio/balloon/virtio_balloon.c b/sys/dev/virtio/balloon/virtio_balloon.c index 060d6d68afc7..32b9b41b8d94 100644 --- a/sys/dev/virtio/balloon/virtio_balloon.c +++ b/sys/dev/virtio/balloon/virtio_balloon.c @@ -332,8 +332,6 @@ vtballoon_inflate(struct vtballoon_softc *sc, int npages) sc->vtballoon_page_frames[i] = VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; - KASSERT(m->queue == PQ_NONE, - ("%s: allocated page %p on queue", __func__, m)); TAILQ_INSERT_TAIL(&sc->vtballoon_pages, m, plinks.q); } diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c index b26a3906af15..435411258a20 100644 --- a/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -1425,9 +1425,7 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) * current operation is not regarded * as an access. */ - vm_page_lock(m); vm_page_launder(m); - vm_page_unlock(m); } else { vm_page_free(m); if (ignerr) diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 3ad1321d78af..cc3f698c19f9 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -3783,7 +3783,7 @@ __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, ("pmap_enter: no PV entry for %#x", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((om->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index 34ea1d301b51..d1ce6c57ff09 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -2158,7 +2158,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pv = pmap_pvh_remove(&om->md, pmap, va); if (!pte_test(&newpte, PTE_MANAGED)) free_pv_entry(pmap, pv); - if ((om->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } @@ -3224,7 +3224,7 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) * determine if the address is MINCORE_REFERENCED.
*/ m = PHYS_TO_VM_PAGE(pa); - if ((m->aflags & PGA_REFERENCED) != 0) + if ((vm_page_aflags(m) & PGA_REFERENCED) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c index 96534d080d45..567ba2c814e2 100644 --- a/sys/powerpc/aim/mmu_oea.c +++ b/sys/powerpc/aim/mmu_oea.c @@ -1906,7 +1906,8 @@ moea_remove_all(mmu_t mmu, vm_page_t m) moea_pvo_remove(pvo, -1); PMAP_UNLOCK(pmap); } - if ((m->aflags & PGA_WRITEABLE) && moea_query_bit(m, PTE_CHG)) { + if ((vm_page_aflags(m) & PGA_WRITEABLE) != 0 && + moea_query_bit(m, PTE_CHG)) { moea_attr_clear(m, PTE_CHG); vm_page_dirty(m); } diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 8ccc875bda45..36872cba6ac3 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -1482,7 +1482,7 @@ moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ - if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) && + if (pmap != kernel_pmap && (vm_page_aflags(m) & PGA_EXECUTABLE) == 0 && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); @@ -2243,7 +2243,8 @@ moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; - if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && + if (pm != kernel_pmap && pg != NULL && + (vm_page_aflags(pg) & PGA_EXECUTABLE) == 0 && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); @@ -2457,7 +2458,8 @@ moea64_remove_all(mmu_t mmu, vm_page_t m) } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); - KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable")); + KASSERT((vm_page_aflags(m) & PGA_WRITEABLE) == 0, + ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index d85aaa680d08..01a27c68a447 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -2832,7 +2832,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, ("pmap_enter: no PV entry for %#lx", va)); if ((new_l3 & PTE_SW_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((om->aflags & PGA_WRITEABLE) != 0 && + if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) @@ -3586,7 +3586,7 @@ pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[Ln_ENTRIES]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list) && - (mt->aflags & PGA_WRITEABLE) != 0) + (vm_page_aflags(mt) & PGA_WRITEABLE) != 0) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); @@ -3604,7 +3604,7 @@ pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && - (m->aflags & PGA_WRITEABLE) != 0) { + (vm_page_aflags(m) & PGA_WRITEABLE) != 0) { pvh = pa_to_pvh(m->phys_addr); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); @@ -4138,7 +4138,7 @@ pmap_clear_modify(vm_page_t m) * If the object containing the page is locked
and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((m->aflags & PGA_WRITEABLE) == 0) + if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c index 520476a4b331..f037e9f0828f 100644 --- a/sys/vm/sg_pager.c +++ b/sys/vm/sg_pager.c @@ -194,9 +194,7 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, VM_OBJECT_WLOCK(object); TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q); vm_page_replace_checked(page, object, offset, m[0]); - vm_page_lock(m[0]); vm_page_free(m[0]); - vm_page_unlock(m[0]); m[0] = page; vm_page_valid(page); diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index ec49bf2e171d..22ebf3cd0a8d 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1563,9 +1563,7 @@ swp_pager_async_iodone(struct buf *bp) * then finish the I/O. */ MPASS(m->dirty == VM_PAGE_BITS_ALL); - vm_page_lock(m); vm_page_activate(m); - vm_page_unlock(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { @@ -1600,9 +1598,7 @@ swp_pager_async_iodone(struct buf *bp) ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); - vm_page_lock(m); vm_page_deactivate_noreuse(m); - vm_page_unlock(m); vm_page_sunbusy(m); } } @@ -1649,12 +1645,6 @@ swp_pager_force_dirty(vm_page_t m) { vm_page_dirty(m); -#ifdef INVARIANTS - vm_page_lock(m); - if (!vm_page_wired(m) && m->queue == PQ_NONE) - panic("page %p is neither wired nor queued", m); - vm_page_unlock(m); -#endif vm_page_xunbusy(m); swap_pager_unswapped(m); } @@ -1664,9 +1654,7 @@ swp_pager_force_launder(vm_page_t m) { vm_page_dirty(m); - vm_page_lock(m); vm_page_launder(m); - vm_page_unlock(m); vm_page_xunbusy(m); swap_pager_unswapped(m); } diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 1fd2899c766e..9b083c7fb1dc 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -155,9 +155,7 @@ release_page(struct faultstate *fs) { vm_page_xunbusy(fs->m); - vm_page_lock(fs->m); vm_page_deactivate(fs->m); - vm_page_unlock(fs->m); fs->m = NULL; } @@ -384,9 +382,7 @@ vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); - vm_page_lock(m); vm_page_deactivate(m); - vm_page_unlock(m); vm_page_xunbusy(m); } } @@ -1390,9 +1386,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, if ((fault_flags & VM_FAULT_WIRE) != 0) { vm_page_wire(fs.m); } else { - vm_page_lock(fs.m); vm_page_activate(fs.m); - vm_page_unlock(fs.m); } if (m_hold != NULL) { *m_hold = fs.m; @@ -1494,10 +1488,8 @@ vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) * pages that triggered page faults are in the * active queue. */ - vm_page_lock(m); if (!vm_page_inactive(m)) vm_page_deactivate(m); - vm_page_unlock(m); } } } @@ -1871,9 +1863,7 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, ("dst_m %p is not wired", dst_m)); } } else { - vm_page_lock(dst_m); vm_page_activate(dst_m); - vm_page_unlock(dst_m); } vm_page_xunbusy(dst_m); } diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 25cd370cf210..d8d80135a6a3 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -928,9 +928,9 @@ kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) * and set PGA_REFERENCED before the call to * pmap_is_referenced(). 
*/ - if ((m->aflags & PGA_REFERENCED) != 0 || + if ((vm_page_aflags(m) & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || - (m->aflags & PGA_REFERENCED) != 0) + (vm_page_aflags(m) & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 4023877d4565..6cafe824a52a 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -779,7 +779,7 @@ vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ - if ((flags & OBJPC_NOSYNC) != 0 && (p->aflags & PGA_NOSYNC) != 0) { + if ((flags & OBJPC_NOSYNC) != 0 && (p->astate.flags & PGA_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { @@ -2383,10 +2383,12 @@ sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. + * + * XXX missing laundry */ - if (m->queue == PQ_ACTIVE) + if (vm_page_active(m)) kvo->kvo_active++; - else if (m->queue == PQ_INACTIVE) + else if (vm_page_inactive(m)) kvo->kvo_inactive++; } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index b4ad47a8c0cb..f55f22a6442b 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -134,6 +134,11 @@ static int vm_pageproc_waiters; static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD, 0, "VM page statistics"); +static counter_u64_t pqstate_commit_aborts = EARLY_COUNTER; +SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, commit_aborts, CTLFLAG_RD, + &pqstate_commit_aborts, + "Failed page queue state updates"); + static counter_u64_t queue_ops = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, CTLFLAG_RD, &queue_ops, @@ -148,6 +153,7 @@ static void counter_startup(void) { + pqstate_commit_aborts = counter_u64_alloc(M_WAITOK); queue_ops = counter_u64_alloc(M_WAITOK); queue_nops = counter_u64_alloc(M_WAITOK); } @@ -179,16 +185,17 @@ static void vm_page_alloc_check(vm_page_t m); static void _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, const char *wmesg, bool nonshared, bool locked); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); -static void vm_page_dequeue_complete(vm_page_t m); static void vm_page_enqueue(vm_page_t m, uint8_t queue); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); -static void vm_page_mvqueue(vm_page_t m, uint8_t queue); +static void vm_page_mvqueue(vm_page_t m, const uint8_t queue, + const uint16_t nflag); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); +static bool vm_page_release_toq(vm_page_t m, uint8_t queue, bool noreuse); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, @@ -433,10 +440,10 @@ vm_page_init_marker(vm_page_t marker, int queue, uint8_t aflags) { bzero(marker, sizeof(*marker)); - marker->flags = PG_MARKER; - marker->aflags = aflags; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; - marker->queue = queue; + marker->astate.flags = aflags; + marker->astate.queue = queue; + marker->flags = PG_MARKER; } static void @@ -506,9 +513,10 @@ vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) m->object = NULL; m->ref_count = 0; 
m->busy_lock = VPB_UNBUSIED; - m->flags = m->aflags = 0; + m->flags = 0; m->phys_addr = pa; - m->queue = PQ_NONE; + m->astate.flags = 0; + m->astate.queue = PQ_NONE; m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; @@ -1217,7 +1225,7 @@ vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) goto memattr; } m->phys_addr = paddr; - m->queue = PQ_NONE; + m->astate.queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ @@ -1306,12 +1314,10 @@ vm_page_readahead_finish(vm_page_t m) * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ - vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); - vm_page_unlock(m); vm_page_xunbusy(m); } @@ -1672,7 +1678,7 @@ vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) mnew->pindex = pindex; atomic_set_int(&mnew->ref_count, VPRC_OBJREF); mold = vm_radix_replace(&object->rtree, mnew); - KASSERT(mold->queue == PQ_NONE, + KASSERT(mold->astate.queue == PQ_NONE, ("vm_page_replace: old page %p is on a paging queue", mold)); /* Keep the resident page list in sorted order. */ @@ -1948,7 +1954,7 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; - m->aflags = 0; + m->astate.flags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; @@ -1964,7 +1970,7 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, vm_wire_add(1); m->ref_count = 1; } - m->act_count = 0; + m->astate.act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { @@ -2158,12 +2164,12 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { - m->aflags = 0; + m->astate.flags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; - m->act_count = 0; + m->astate.act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { @@ -2206,9 +2212,10 @@ vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); - KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(m->astate.queue == PQ_NONE && + (m->astate.flags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", - m, m->queue, (m->aflags & PGA_QUEUE_STATE_MASK))); + m, m->astate.queue, (m->astate.flags & PGA_QUEUE_STATE_MASK))); KASSERT(m->ref_count == 0, ("page %p has references", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); @@ -2282,7 +2289,7 @@ vm_page_alloc_freelist_domain(int domain, int freelist, int req) /* * Initialize the page. Only the PG_ZERO flag is inherited. */ - m->aflags = 0; + m->astate.flags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; @@ -2461,8 +2468,7 @@ vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && - vm_page_queue(m) != PQ_NONE && !vm_page_busied(m) && - !vm_page_wired(m)) { + !vm_page_busied(m) && !vm_page_wired(m)) { /* * The page is allocated but eligible for * relocation. 
Extend the current run by one @@ -2610,8 +2616,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; - else if (vm_page_queue(m) != PQ_NONE && - vm_page_tryxbusy(m) != 0) { + else if (vm_page_tryxbusy(m)) { if (vm_page_wired(m)) { vm_page_xunbusy(m); error = EBUSY; @@ -2685,7 +2690,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ - m_new->aflags = m->aflags & + m_new->astate.flags = m->astate.flags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); @@ -3151,66 +3156,151 @@ vm_waitpfault(struct domainset *dset, int timo) mtx_unlock(&vm_domainset_lock); } -static struct vm_pagequeue * -vm_page_pagequeue(vm_page_t m) +bool +vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { + vm_page_t next; + struct vm_pagequeue *pq; + int mask; + bool queued; - uint8_t queue; + if (new._bits == old->_bits) + /* A no-op is trivially successful. */ + return (true); - if ((queue = atomic_load_8(&m->queue)) == PQ_NONE) - return (NULL); - return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); + if (old->queue != PQ_NONE && old->queue != new.queue) { + new.flags &= ~PGA_ENQUEUED; + + /* + * The physical queue state might change at any point before the + * page queue lock is acquired, so we must verify that the lock + * is correct before proceeding. + */ + pq = _vm_page_pagequeue(m, old->queue); + vm_pagequeue_lock(pq); + if (__predict_false(m->astate.queue != old->queue)) { + vm_pagequeue_unlock(pq); + *old = vm_page_astate_load(m); + return (false); + } + + /* + * Once the queue index of the page changes, there is nothing + * synchronizing with further updates to the page's physical + * queue state. Therefore we must remove the page from the + * queue now in anticipation of a successful commit, and be + * prepared to roll back. + */ + queued = (m->astate.flags & PGA_ENQUEUED) != 0; + if (__predict_true(queued)) { + next = TAILQ_NEXT(m, plinks.q); + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); + } + if (__predict_false(!vm_page_astate_fcmpset(m, old, new))) { + if (queued) { + if (next == NULL) + TAILQ_INSERT_TAIL(&pq->pq_pl, m, + plinks.q); + else + TAILQ_INSERT_BEFORE(next, m, plinks.q); + } + vm_pagequeue_unlock(pq); + counter_u64_add(pqstate_commit_aborts, 1); + return (false); + } + if ((old->flags & PGA_ENQUEUED) != 0) + vm_pagequeue_cnt_dec(pq); + vm_pagequeue_unlock(pq); + } else if (__predict_false(!vm_page_astate_fcmpset(m, old, new))) { + counter_u64_add(pqstate_commit_aborts, 1); + return (false); + } + + /* + * If any queue operation flags were set, create a request for deferred + * processing of those flags. 
+ */ + if (new.queue != PQ_NONE) { + mask = new.flags & PGA_QUEUE_OP_MASK; + if (mask != 0 && (old->flags & mask) != mask) + vm_page_pqbatch_submit(m, new.queue); + } + + return (true); } static inline void -vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m) +vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) { + vm_page_t next; struct vm_domain *vmd; - uint8_t qflags; + vm_page_astate_t old, new; CRITICAL_ASSERT(curthread); vm_pagequeue_assert_locked(pq); + old = vm_page_astate_load(m); +retry: + if (__predict_false(old.queue != queue)) + return; + KASSERT(pq == _vm_page_pagequeue(m, queue), + ("page %p does not belong to queue %p", m, pq)); + KASSERT(old.queue != PQ_NONE || (old.flags & PGA_QUEUE_STATE_MASK) == 0, + ("page %p has unexpected queue state", m)); + /* - * The page daemon is allowed to set m->queue = PQ_NONE without - * the page queue lock held. In this case it is about to free the page, - * which must not have any queue state. + * Update the page's queue state before modifying the page queues + * themselves, to avoid having to roll back updates when a queue state + * update fails and requires a retry. */ - qflags = atomic_load_8(&m->aflags); - KASSERT(pq == vm_page_pagequeue(m) || - (qflags & PGA_QUEUE_STATE_MASK) == 0, - ("page %p doesn't belong to queue %p but has aflags %#x", - m, pq, qflags)); - - if ((qflags & PGA_DEQUEUE) != 0) { - if (__predict_true((qflags & PGA_ENQUEUED) != 0)) - vm_pagequeue_remove(pq, m); - vm_page_dequeue_complete(m); + new = old; + if ((old.flags & PGA_DEQUEUE) != 0) { + new.queue = PQ_NONE; + new.flags &= ~PGA_QUEUE_STATE_MASK; + if (__predict_true((old.flags & PGA_ENQUEUED) != 0)) { + next = TAILQ_NEXT(m, plinks.q); + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); + } + if (__predict_false(!vm_page_astate_fcmpset(m, &old, new))) { + if ((old.flags & PGA_ENQUEUED) != 0) { + if (next == NULL) + TAILQ_INSERT_TAIL(&pq->pq_pl, m, + plinks.q); + else + TAILQ_INSERT_BEFORE(next, m, plinks.q); + } + counter_u64_add(pqstate_commit_aborts, 1); + goto retry; + } + if ((old.flags & PGA_ENQUEUED) != 0) + vm_pagequeue_cnt_dec(pq); counter_u64_add(queue_ops, 1); - } else if ((qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { - if ((qflags & PGA_ENQUEUED) != 0) + } else if ((old.flags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { + new.flags |= PGA_ENQUEUED; + new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); + if (__predict_false(!vm_page_astate_fcmpset(m, &old, new))) { + counter_u64_add(pqstate_commit_aborts, 1); + goto retry; + } + + if ((old.flags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - else { + else vm_pagequeue_cnt_inc(pq); - vm_page_aflag_set(m, PGA_ENQUEUED); - } /* - * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. - * In particular, if both flags are set in close succession, - * only PGA_REQUEUE_HEAD will be applied, even if it was set - * first. + * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In + * particular, if both flags are set in close succession, only + * PGA_REQUEUE_HEAD will be applied, even if it was set first. 
*/ - if ((qflags & PGA_REQUEUE_HEAD) != 0) { - KASSERT(m->queue == PQ_INACTIVE, + if ((old.flags & PGA_REQUEUE_HEAD) != 0) { + KASSERT(old.queue == PQ_INACTIVE, ("head enqueue not supported for page %p", m)); vmd = vm_pagequeue_domain(m); TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); - } else + } else { TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - - vm_page_aflag_clear(m, qflags & (PGA_REQUEUE | - PGA_REQUEUE_HEAD)); + } counter_u64_add(queue_ops, 1); } else { counter_u64_add(queue_nops, 1); @@ -3221,15 +3311,10 @@ static void vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, uint8_t queue) { - vm_page_t m; int i; - for (i = 0; i < bq->bq_cnt; i++) { - m = bq->bq_pa[i]; - if (__predict_false(m->queue != queue)) - continue; - vm_pqbatch_process_page(pq, m); - } + for (i = 0; i < bq->bq_cnt; i++) + vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); vm_batchqueue_init(bq); } @@ -3237,8 +3322,6 @@ vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, * vm_page_pqbatch_submit: [ internal use only ] * * Enqueue a page in the specified page queue's batched work queue. - * The caller must have encoded the requested operation in the page - * structure's aflags field. */ void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) @@ -3249,8 +3332,6 @@ vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m)); - KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL, - ("missing synchronization for page %p", m)); KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); domain = vm_phys_domain(m); @@ -3267,21 +3348,7 @@ vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); vm_pqbatch_process(pq, bq, queue); - - /* - * The page may have been logically dequeued before we acquired the - * page queue lock. In this case, since we either hold the page lock - * or the page is being freed, a different thread cannot be concurrently - * enqueuing the page. - */ - if (__predict_true(m->queue == queue)) - vm_pqbatch_process_page(pq, m); - else { - KASSERT(m->queue == PQ_NONE, - ("invalid queue transition for page %p", m)); - KASSERT((m->aflags & PGA_ENQUEUED) == 0, - ("page %p is enqueued with invalid queue index", m)); - } + vm_pqbatch_process_page(pq, m, queue); vm_pagequeue_unlock(pq); critical_exit(); } @@ -3325,78 +3392,27 @@ vm_page_pqbatch_drain(void) thread_unlock(td); } -/* - * Complete the logical removal of a page from a page queue. We must be - * careful to synchronize with the page daemon, which may be concurrently - * examining the page with only the page lock held. The page must not be - * in a state where it appears to be logically enqueued. - */ +/* XXX comment */ static void -vm_page_dequeue_complete(vm_page_t m) +vm_page_dequeue_free(vm_page_t m) { + vm_page_astate_t old, new; - m->queue = PQ_NONE; - atomic_thread_fence_rel(); - vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK); -} - -/* - * vm_page_dequeue_deferred: [ internal use only ] - * - * Request removal of the given page from its current page - * queue. Physical removal from the queue may be deferred - * indefinitely. - * - * The page must be locked. - */ -void -vm_page_dequeue_deferred(vm_page_t m) -{ - uint8_t queue; - - vm_page_assert_locked(m); - - if ((queue = vm_page_queue(m)) == PQ_NONE) - return; - - /* - * Set PGA_DEQUEUE if it is not already set to handle a concurrent call - * to vm_page_dequeue_deferred_free(). 
In particular, avoid modifying - * the page's queue state once vm_page_dequeue_deferred_free() has been - * called. In the event of a race, two batch queue entries for the page - * will be created, but the second will have no effect. - */ - if (vm_page_pqstate_cmpset(m, queue, queue, PGA_DEQUEUE, PGA_DEQUEUE)) - vm_page_pqbatch_submit(m, queue); -} - -/* - * A variant of vm_page_dequeue_deferred() that does not assert the page - * lock and is only to be called from vm_page_free_prep(). Because the - * page is being freed, we can assume that nothing other than the page - * daemon is scheduling queue operations on this page, so we get for - * free the mutual exclusion that is otherwise provided by the page lock. - * To handle races, the page daemon must take care to atomically check - * for PGA_DEQUEUE when updating queue state. - */ -static void -vm_page_dequeue_deferred_free(vm_page_t m) -{ - uint8_t queue; - - KASSERT(m->ref_count == 0, ("page %p has references", m)); - - for (;;) { - if ((m->aflags & PGA_DEQUEUE) != 0) - return; - atomic_thread_fence_acq(); - if ((queue = atomic_load_8(&m->queue)) == PQ_NONE) - return; - if (vm_page_pqstate_cmpset(m, queue, queue, PGA_DEQUEUE, - PGA_DEQUEUE)) { - vm_page_pqbatch_submit(m, queue); + for (old = vm_page_astate_load(m);;) { + if (old.queue == PQ_NONE) { + KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, + ("page %p has unexpected queue state flags %#x", + m, old.flags)); + break; + } + if ((old.flags & PGA_DEQUEUE) != 0) { + vm_page_pqbatch_submit(m, old.queue); break; } + new = old; + new.flags |= PGA_DEQUEUE; + if (vm_page_pqstate_commit(m, &old, new)) + break; } } @@ -3404,57 +3420,26 @@ vm_page_dequeue_deferred_free(vm_page_t m) * vm_page_dequeue: * * Remove the page from whichever page queue it's in, if any. - * The page must either be locked or unallocated. This constraint - * ensures that the queue state of the page will remain consistent - * after this function returns. + * XXX */ void vm_page_dequeue(vm_page_t m) { - struct vm_pagequeue *pq, *pq1; - uint8_t aflags; - - KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL, - ("page %p is allocated and unlocked", m)); + vm_page_astate_t old, new; - for (pq = vm_page_pagequeue(m);; pq = pq1) { - if (pq == NULL) { - /* - * A thread may be concurrently executing - * vm_page_dequeue_complete(). Ensure that all queue - * state is cleared before we return. - */ - aflags = atomic_load_8(&m->aflags); - if ((aflags & PGA_QUEUE_STATE_MASK) == 0) - return; - KASSERT((aflags & PGA_DEQUEUE) != 0, + for (old = vm_page_astate_load(m);;) { + if (old.queue == PQ_NONE) { + KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue state flags %#x", - m, aflags)); - - /* - * Busy wait until the thread updating queue state is - * finished. Such a thread must be executing in a - * critical section. 
- */ - cpu_spinwait(); - pq1 = vm_page_pagequeue(m); - continue; + m, old.flags)); + break; } - vm_pagequeue_lock(pq); - if ((pq1 = vm_page_pagequeue(m)) == pq) + new = old; + new.queue = PQ_NONE; + new.flags &= ~PGA_QUEUE_STATE_MASK; + if (vm_page_pqstate_commit(m, &old, new)) break; - vm_pagequeue_unlock(pq); } - KASSERT(pq == vm_page_pagequeue(m), - ("%s: page %p migrated directly between queues", __func__, m)); - KASSERT((m->aflags & PGA_DEQUEUE) != 0 || - mtx_owned(vm_page_lockptr(m)), - ("%s: queued unlocked page %p", __func__, m)); - - if ((m->aflags & PGA_ENQUEUED) != 0) - vm_pagequeue_remove(pq, m); - vm_page_dequeue_complete(m); - vm_pagequeue_unlock(pq); } /* @@ -3466,102 +3451,16 @@ vm_page_enqueue(vm_page_t m, uint8_t queue) { vm_page_assert_locked(m); - KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(m->astate.queue == PQ_NONE && + (m->astate.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p is already enqueued", __func__, m)); - m->queue = queue; - if ((m->aflags & PGA_REQUEUE) == 0) + m->astate.queue = queue; + if ((m->astate.flags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_page_pqbatch_submit(m, queue); } -/* - * vm_page_requeue: [ internal use only ] - * - * Schedule a requeue of the given page. - * - * The page must be locked. - */ -void -vm_page_requeue(vm_page_t m) -{ - - vm_page_assert_locked(m); - KASSERT(vm_page_queue(m) != PQ_NONE, - ("%s: page %p is not logically enqueued", __func__, m)); - - if ((m->aflags & PGA_REQUEUE) == 0) - vm_page_aflag_set(m, PGA_REQUEUE); - vm_page_pqbatch_submit(m, atomic_load_8(&m->queue)); -} - -/* - * vm_page_swapqueue: [ internal use only ] - * - * Move the page from one queue to another, or to the tail of its - * current queue, in the face of a possible concurrent call to - * vm_page_dequeue_deferred_free(). - */ -void -vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq) -{ - struct vm_pagequeue *pq; - vm_page_t next; - bool queued; - - KASSERT(oldq < PQ_COUNT && newq < PQ_COUNT && oldq != newq, - ("vm_page_swapqueue: invalid queues (%d, %d)", oldq, newq)); - vm_page_assert_locked(m); - - pq = &vm_pagequeue_domain(m)->vmd_pagequeues[oldq]; - vm_pagequeue_lock(pq); - - /* - * The physical queue state might change at any point before the page - * queue lock is acquired, so we must verify that we hold the correct - * lock before proceeding. - */ - if (__predict_false(m->queue != oldq)) { - vm_pagequeue_unlock(pq); - return; - } - - /* - * Once the queue index of the page changes, there is nothing - * synchronizing with further updates to the physical queue state. - * Therefore we must remove the page from the queue now in anticipation - * of a successful commit, and be prepared to roll back. - */ - if (__predict_true((m->aflags & PGA_ENQUEUED) != 0)) { - next = TAILQ_NEXT(m, plinks.q); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - vm_page_aflag_clear(m, PGA_ENQUEUED); - queued = true; - } else { - queued = false; - } - - /* - * Atomically update the queue field and set PGA_REQUEUE while - * ensuring that PGA_DEQUEUE has not been set. 
- */ - if (__predict_false(!vm_page_pqstate_cmpset(m, oldq, newq, PGA_DEQUEUE, - PGA_REQUEUE))) { - if (queued) { - vm_page_aflag_set(m, PGA_ENQUEUED); - if (next != NULL) - TAILQ_INSERT_BEFORE(next, m, plinks.q); - else - TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - } - vm_pagequeue_unlock(pq); - return; - } - vm_pagequeue_cnt_dec(pq); - vm_pagequeue_unlock(pq); - vm_page_pqbatch_submit(m, newq); -} - /* * vm_page_free_prep: * @@ -3595,10 +3494,10 @@ vm_page_free_prep(vm_page_t m) if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_prep: freeing mapped page %p", m)); - KASSERT((m->aflags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, + KASSERT((m->astate.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, ("vm_page_free_prep: mapping flags set in page %p", m)); } else { - KASSERT(m->queue == PQ_NONE, + KASSERT(m->astate.queue == PQ_NONE, ("vm_page_free_prep: unmanaged page %p is queued", m)); } VM_CNT_INC(v_tfree); @@ -3631,7 +3530,7 @@ vm_page_free_prep(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->ref_count == 1, ("fictitious page %p is referenced", m)); - KASSERT(m->queue == PQ_NONE, + KASSERT(m->astate.queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } @@ -3642,7 +3541,7 @@ vm_page_free_prep(vm_page_t m) * dequeue. */ if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_dequeue_deferred_free(m); + vm_page_dequeue_free(m); m->valid = 0; vm_page_undirty(m); @@ -3749,6 +3648,8 @@ vm_page_wire(vm_page_t m) old = atomic_fetchadd_int(&m->ref_count, 1); KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, ("vm_page_wire: counter overflow for page %p", m)); + if ((m->oflags & VPO_UNMANAGED) == 0) + vm_page_aflag_set(m, PGA_DEQUEUE); if (VPRC_WIRE_COUNT(old) == 0) vm_wire_add(1); } @@ -3770,11 +3671,45 @@ vm_page_wire_mapped(vm_page_t m) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); + if ((m->oflags & VPO_UNMANAGED) == 0) + vm_page_aflag_set(m, PGA_DEQUEUE); if (VPRC_WIRE_COUNT(old) == 0) vm_wire_add(1); return (true); } +/* XXX comment */ +static void +vm_page_unwire_managed(vm_page_t m, uint8_t queue, bool noreuse) +{ + u_int old; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("vm_page_unwire_managed: page %p is unmanaged", m)); + + /* + * Update LRU state before releasing the wiring reference. + * Use a release store when updating the reference count to + * synchronize with vm_page_free_prep(). + */ + old = m->ref_count; + do { + KASSERT(VPRC_WIRE_COUNT(old) > 0, + ("vm_page_unwire: wire count underflow for page %p", m)); + if (VPRC_WIRE_COUNT(old) == 1 && + !vm_page_release_toq(m, queue, noreuse)) { + old = atomic_load_int(&m->ref_count); + continue; + } + } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); + + if (VPRC_WIRE_COUNT(old) == 1) { + vm_wire_sub(1); + if (old == 1) + vm_page_free(m); + } +} + /* * Release one wiring of the specified page, potentially allowing it to be * paged out. @@ -3789,8 +3724,6 @@ vm_page_wire_mapped(vm_page_t m) void vm_page_unwire(vm_page_t m, uint8_t queue) { - u_int old; - bool locked; KASSERT(queue < PQ_COUNT, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); @@ -3798,42 +3731,8 @@ vm_page_unwire(vm_page_t m, uint8_t queue) if ((m->oflags & VPO_UNMANAGED) != 0) { if (vm_page_unwire_noq(m) && m->ref_count == 0) vm_page_free(m); - return; - } - - /* - * Update LRU state before releasing the wiring reference. - * We only need to do this once since we hold the page lock. 
- * Use a release store when updating the reference count to - * synchronize with vm_page_free_prep(). - */ - old = m->ref_count; - locked = false; - do { - KASSERT(VPRC_WIRE_COUNT(old) > 0, - ("vm_page_unwire: wire count underflow for page %p", m)); - if (!locked && VPRC_WIRE_COUNT(old) == 1) { - vm_page_lock(m); - locked = true; - if (queue == PQ_ACTIVE && vm_page_queue(m) == PQ_ACTIVE) - vm_page_reference(m); - else - vm_page_mvqueue(m, queue); - } - } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); - - /* - * Release the lock only after the wiring is released, to ensure that - * the page daemon does not encounter and dequeue the page while it is - * still wired. - */ - if (locked) - vm_page_unlock(m); - - if (VPRC_WIRE_COUNT(old) == 1) { - vm_wire_sub(1); - if (old == 1) - vm_page_free(m); + } else { + vm_page_unwire_managed(m, queue, false); } } @@ -3870,25 +3769,45 @@ vm_page_unwire_noq(vm_page_t m) * before releasing the page lock, otherwise the page daemon may immediately * dequeue the page. * + * In many cases this function's parameters are known at compile-time, so + * it is inlined into its callers so as to allow constant folding to remove + * branches. + * * A managed page must be locked. */ static __always_inline void -vm_page_mvqueue(vm_page_t m, const uint8_t nqueue) +vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) { + vm_page_astate_t old, new; - vm_page_assert_locked(m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_mvqueue: page %p is unmanaged", m)); - - if (vm_page_queue(m) != nqueue) { - vm_page_dequeue(m); - vm_page_enqueue(m, nqueue); - } else if (nqueue != PQ_ACTIVE) { - vm_page_requeue(m); + KASSERT(m->ref_count > 0, + ("vm_page_mvqueue: page %p is missing refs", m)); + KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, + ("vm_page_mvqueue: unexpected queue state flag")); + KASSERT(nflag != PGA_REQUEUE_HEAD || nqueue == PQ_INACTIVE, + ("vm_page_mvqueue: wrong queue %d for PGA_REQUEUE_HEAD", nqueue)); + + for (old = vm_page_astate_load(m);;) { + if ((old.flags & PGA_DEQUEUE) != 0) + break; + new = old; + if (nqueue == PQ_ACTIVE) + new.act_count = max(old.act_count, ACT_INIT); + + if (old.queue == nqueue) { + if (nqueue != PQ_ACTIVE) + new.flags |= nflag; + if (new._bits == old._bits) + break; + } else { + new.flags |= nflag; + new.queue = nqueue; + } + if (vm_page_pqstate_commit(m, &old, new)) + break; } - - if (nqueue == PQ_ACTIVE && m->act_count < ACT_INIT) - m->act_count = ACT_INIT; } /* @@ -3898,9 +3817,9 @@ void vm_page_activate(vm_page_t m) { - if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) + if ((m->oflags & VPO_UNMANAGED) != 0) return; - vm_page_mvqueue(m, PQ_ACTIVE); + vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); } /* @@ -3911,30 +3830,9 @@ void vm_page_deactivate(vm_page_t m) { - if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) + if ((m->oflags & VPO_UNMANAGED) != 0) return; - vm_page_mvqueue(m, PQ_INACTIVE); -} - -/* - * Move the specified page close to the head of the inactive queue, - * bypassing LRU. A marker page is used to maintain FIFO ordering. - * As with regular enqueues, we use a per-CPU batch queue to reduce - * contention on the page queue lock. 
- */ -static void -_vm_page_deactivate_noreuse(vm_page_t m) -{ - - vm_page_assert_locked(m); - - if (!vm_page_inactive(m)) { - vm_page_dequeue(m); - m->queue = PQ_INACTIVE; - } - if ((m->aflags & PGA_REQUEUE_HEAD) == 0) - vm_page_aflag_set(m, PGA_REQUEUE_HEAD); - vm_page_pqbatch_submit(m, PQ_INACTIVE); + vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); } void @@ -3944,8 +3842,9 @@ vm_page_deactivate_noreuse(vm_page_t m) KASSERT(m->object != NULL, ("vm_page_deactivate_noreuse: page %p has no object", m)); - if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_wired(m)) - _vm_page_deactivate_noreuse(m); + if ((m->oflags & VPO_UNMANAGED) != 0) + return; + vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); } /* @@ -3957,7 +3856,7 @@ vm_page_launder(vm_page_t m) if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; - vm_page_mvqueue(m, PQ_LAUNDRY); + vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); } /* @@ -3975,11 +3874,17 @@ vm_page_unswappable(vm_page_t m) vm_page_enqueue(m, PQ_UNSWAPPABLE); } -static void -vm_page_release_toq(vm_page_t m, int flags) +/* XXX comment */ +static bool +vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse) { + vm_page_astate_t old, new; + uint16_t nflag; - vm_page_assert_locked(m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("vm_page_release_toq: page %p is unmanaged", m)); + KASSERT(m->ref_count > 0, + ("vm_page_release_toq: page %p is missing refs", m)); /* * Use a check of the valid bits to determine whether we should @@ -3991,12 +3896,29 @@ vm_page_release_toq(vm_page_t m, int flags) * If we were asked to not cache the page, place it near the head of the * inactive queue so that is reclaimed sooner. */ - if ((flags & (VPR_TRYFREE | VPR_NOREUSE)) != 0 || m->valid == 0) - _vm_page_deactivate_noreuse(m); - else if (vm_page_active(m)) - vm_page_reference(m); - else - vm_page_mvqueue(m, PQ_INACTIVE); + nflag = (noreuse || m->valid == 0) ? PGA_REQUEUE_HEAD : PGA_REQUEUE; + + /* XXX explain */ + vm_page_aflag_clear(m, PGA_DEQUEUE); + + for (old = vm_page_astate_load(m);;) { + new = old; + if ((new.flags & PGA_DEQUEUE) != 0) + return (false); + if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE) { + new.flags |= PGA_REFERENCED; + } else { + if (nqueue == PQ_ACTIVE) + new.act_count = max(old.act_count, ACT_INIT); + else + new.flags |= nflag; + new.queue = nqueue; + } + + if (vm_page_pqstate_commit(m, &old, new)) + break; + } + return (true); } /* @@ -4006,8 +3928,6 @@ void vm_page_release(vm_page_t m, int flags) { vm_object_t object; - u_int old; - bool locked; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release: page %p is unmanaged", m)); @@ -4033,36 +3953,7 @@ vm_page_release(vm_page_t m, int flags) } } - /* - * Update LRU state before releasing the wiring reference. - * Use a release store when updating the reference count to - * synchronize with vm_page_free_prep(). - */ - old = m->ref_count; - locked = false; - do { - KASSERT(VPRC_WIRE_COUNT(old) > 0, - ("vm_page_unwire: wire count underflow for page %p", m)); - if (!locked && VPRC_WIRE_COUNT(old) == 1) { - vm_page_lock(m); - locked = true; - vm_page_release_toq(m, flags); - } - } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); - - /* - * Release the lock only after the wiring is released, to ensure that - * the page daemon does not encounter and dequeue the page while it is - * still wired. 
- */ - if (locked) - vm_page_unlock(m); - - if (VPRC_WIRE_COUNT(old) == 1) { - vm_wire_sub(1); - if (old == 1) - vm_page_free(m); - } + vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); } /* See vm_page_release(). */ @@ -4080,9 +3971,7 @@ vm_page_release_locked(vm_page_t m, int flags) m->dirty == 0 && !vm_page_busied(m)) { vm_page_free(m); } else { - vm_page_lock(m); - vm_page_release_toq(m, flags); - vm_page_unlock(m); + (void)vm_page_release_toq(m, PQ_INACTIVE, flags != 0); } } } @@ -4995,6 +4884,22 @@ vm_page_object_busy_assert(vm_page_t m) VM_OBJECT_ASSERT_BUSY(m->object); } +void +vm_page_pagequeue_lock_assert(vm_page_t m, uint8_t queue) +{ + + if ((m->flags & PG_MARKER) != 0) + return; + + /* + * The page's page queue index may only change while the + * current queue's lock is held. + */ + KASSERT(queue != PQ_NONE, + ("page %p does not belong to a queue", m)); + vm_pagequeue_assert_locked(_vm_page_pagequeue(m, queue)); +} + void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { @@ -5074,7 +4979,7 @@ DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref %u\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, - m->queue, m->ref_count, m->aflags, m->oflags, - m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); + m->astate.queue, m->ref_count, m->astate.flags, m->oflags, + m->flags, m->astate.act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 0218064fa4a7..7eb34b7c35bb 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -215,6 +215,15 @@ typedef uint32_t vm_page_bits_t; typedef uint64_t vm_page_bits_t; #endif +typedef union { + struct { + uint16_t flags; /* atomic flags (A) */ + uint8_t queue; /* page queue index (Q) */ + uint8_t act_count; /* activation count (A) */ + }; + uint32_t _bits; +} vm_page_astate_t; + struct vm_page { union { TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */ @@ -234,15 +243,13 @@ struct vm_page { struct md_page md; /* machine dependent stuff */ u_int ref_count; /* page references (A) */ volatile u_int busy_lock; /* busy owners lock */ - uint16_t flags; /* page PG_* flags (P) */ + vm_page_astate_t astate; /* atomically updated state */ + uint8_t flags; /* page PG_* flags (P) */ uint8_t order; /* index of the buddy queue (F) */ uint8_t pool; /* vm_phys freepool index (F) */ - uint8_t aflags; /* atomic flags (A) */ - uint8_t oflags; /* page VPO_* flags (O) */ - uint8_t queue; /* page queue index (Q) */ int8_t psind; /* pagesizes[] index (O) */ int8_t segind; /* vm_phys segment index (C) */ - u_char act_count; /* page usage count (P) */ + uint8_t oflags; /* page VPO_* flags (O) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ vm_page_bits_t valid; /* valid DEV_BSIZE chunk map (O,B) */ @@ -423,8 +430,8 @@ extern struct mtx_padalign pa_lock[]; #define PGA_REQUEUE_HEAD 0x40 /* page requeue should bypass LRU */ #define PGA_NOSYNC 0x80 /* do not collect for syncer */ -#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE | \ - PGA_REQUEUE_HEAD) +#define PGA_QUEUE_OP_MASK (PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) +#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_QUEUE_OP_MASK) /* * Page flags. If changed at any other time than page allocation or @@ -434,11 +441,11 @@ extern struct mtx_padalign pa_lock[]; * allocated from a per-CPU cache. 
It is cleared the next time that the * page is allocated from the physical memory allocator. */ -#define PG_PCPU_CACHE 0x0001 /* was allocated from per-CPU caches */ -#define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ -#define PG_ZERO 0x0008 /* page is zeroed */ -#define PG_MARKER 0x0010 /* special queue marker page */ -#define PG_NODUMP 0x0080 /* don't include this page in a dump */ +#define PG_PCPU_CACHE 0x01 /* was allocated from per-CPU caches */ +#define PG_FICTITIOUS 0x04 /* physical page doesn't exist */ +#define PG_ZERO 0x08 /* page is zeroed */ +#define PG_MARKER 0x10 /* special queue marker page */ +#define PG_NODUMP 0x80 /* don't include this page in a dump */ /* * Misc constants. @@ -597,7 +604,6 @@ int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, void vm_page_deactivate(vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); -void vm_page_dequeue_deferred(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); bool vm_page_free_prep(vm_page_t m); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); @@ -609,6 +615,8 @@ vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); void vm_page_pqbatch_drain(void); void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue); +bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, + vm_page_astate_t new); vm_page_t vm_page_prev(vm_page_t m); bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m); void vm_page_putfake(vm_page_t m); @@ -716,64 +724,52 @@ void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #ifdef INVARIANTS void vm_page_object_busy_assert(vm_page_t m); #define VM_PAGE_OBJECT_BUSY_ASSERT(m) vm_page_object_busy_assert(m) +void vm_page_pagequeue_lock_assert(vm_page_t m, uint8_t queue); +#define VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, q) vm_page_pagequeue_lock_assert(m, q) void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits); #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \ vm_page_assert_pga_writeable(m, bits) #else #define VM_PAGE_OBJECT_BUSY_ASSERT(m) (void)0 +#define VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, q) (void)0 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0 #endif /* - * We want to use atomic updates for the aflags field, which is 8 bits wide. - * However, not all architectures support atomic operations on 8-bit + * We want to use atomic updates for the aflags field, which is 16 bits wide. + * However, not all architectures support atomic operations on 16-bit * destinations. In order that we can easily use a 32-bit operation, we * require that the aflags field be 32-bit aligned. */ -_Static_assert(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0, +_Static_assert(offsetof(struct vm_page, astate.flags) % sizeof(uint32_t) == 0, "aflags field is not 32-bit aligned"); +#define VM_PAGE_AFLAG_SHIFT __offsetof(vm_page_astate_t, flags) + /* - * We want to be able to update the aflags and queue fields atomically in - * the same operation. + * Return the atomic flag set for the page. 
*/ -_Static_assert(offsetof(struct vm_page, aflags) / sizeof(uint32_t) == - offsetof(struct vm_page, queue) / sizeof(uint32_t), - "aflags and queue fields do not belong to the same 32-bit word"); -_Static_assert(offsetof(struct vm_page, queue) % sizeof(uint32_t) == 2, - "queue field is at an unexpected offset"); -_Static_assert(sizeof(((struct vm_page *)NULL)->queue) == 1, - "queue field has an unexpected size"); - -#if BYTE_ORDER == LITTLE_ENDIAN -#define VM_PAGE_AFLAG_SHIFT 0 -#define VM_PAGE_QUEUE_SHIFT 16 -#else -#define VM_PAGE_AFLAG_SHIFT 24 -#define VM_PAGE_QUEUE_SHIFT 8 -#endif -#define VM_PAGE_QUEUE_MASK (0xff << VM_PAGE_QUEUE_SHIFT) +static inline int +vm_page_aflags(vm_page_t m) +{ + + return (m->astate.flags); +} /* * Clear the given bits in the specified page. */ static inline void -vm_page_aflag_clear(vm_page_t m, uint8_t bits) +vm_page_aflag_clear(vm_page_t m, uint16_t bits) { uint32_t *addr, val; - /* - * The PGA_REFERENCED flag can only be cleared if the page is locked. - */ - if ((bits & PGA_REFERENCED) != 0) - vm_page_assert_locked(m); - /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ - addr = (void *)&m->aflags; + addr = (void *)&m->astate; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_clear_32(addr, val); } @@ -782,7 +778,7 @@ vm_page_aflag_clear(vm_page_t m, uint8_t bits) * Set the given bits in the specified page. */ static inline void -vm_page_aflag_set(vm_page_t m, uint8_t bits) +vm_page_aflag_set(vm_page_t m, uint16_t bits) { uint32_t *addr, val; @@ -793,42 +789,43 @@ vm_page_aflag_set(vm_page_t m, uint8_t bits) * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ - addr = (void *)&m->aflags; + addr = (void *)&m->astate; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_set_32(addr, val); } -/* - * Atomically update the queue state of the page. The operation fails if - * any of the queue flags in "fflags" are set or if the "queue" field of - * the page does not match the expected value; if the operation is - * successful, the flags in "nflags" are set and all other queue state - * flags are cleared. 
- */ +static inline vm_page_astate_t +vm_page_astate_load(vm_page_t m) +{ + vm_page_astate_t astate; + + astate._bits = atomic_load_32(&m->astate); + return (astate); +} + static inline bool -vm_page_pqstate_cmpset(vm_page_t m, uint32_t oldq, uint32_t newq, - uint32_t fflags, uint32_t nflags) +vm_page_astate_fcmpset(vm_page_t m, vm_page_astate_t *old, + vm_page_astate_t new) { - uint32_t *addr, nval, oval, qsmask; - - fflags <<= VM_PAGE_AFLAG_SHIFT; - nflags <<= VM_PAGE_AFLAG_SHIFT; - newq <<= VM_PAGE_QUEUE_SHIFT; - oldq <<= VM_PAGE_QUEUE_SHIFT; - qsmask = ((PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) << - VM_PAGE_AFLAG_SHIFT) | VM_PAGE_QUEUE_MASK; - - addr = (void *)&m->aflags; - oval = atomic_load_32(addr); - do { - if ((oval & fflags) != 0) - return (false); - if ((oval & VM_PAGE_QUEUE_MASK) != oldq) - return (false); - nval = (oval & ~qsmask) | nflags | newq; - } while (!atomic_fcmpset_32(addr, &oval, nval)); - - return (true); + int ret; + + KASSERT(new.queue == PQ_INACTIVE || (new.flags & PGA_REQUEUE_HEAD) == 0, + ("vm_page_astate_fcmpset: unexpected head requeue for page %p", + m)); + KASSERT((new.flags & PGA_ENQUEUED) == 0 || new.queue != PQ_NONE, + ("vm_page_astate_fcmpset: setting PGA_ENQUEUED without a queue")); + KASSERT(new._bits != old->_bits, + ("vm_page_astate_fcmpset: bits are not changing")); + + ret = atomic_fcmpset_32(&m->astate._bits, &old->_bits, new._bits); + if (ret != 0) { + if (old->queue != PQ_NONE && old->queue != new.queue) + VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, old->queue); + KASSERT((new.flags & PGA_ENQUEUED) == 0 || old->queue == new.queue, + ("vm_page_astate_fcmpset: PGA_ENQUEUED set after queue change for page %p", m)); + } + + return (ret != 0); } /* @@ -884,19 +881,17 @@ vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, /* * vm_page_queue: * - * Return the index of the queue containing m. This index is guaranteed - * not to change while the page lock is held. + * Return the index of the queue containing m. 
*/ static inline uint8_t vm_page_queue(vm_page_t m) { + vm_page_astate_t as; - vm_page_assert_locked(m); - - if ((m->aflags & PGA_DEQUEUE) != 0) + as = vm_page_astate_load(m); + if ((as.flags & PGA_DEQUEUE) != 0) return (PQ_NONE); - atomic_thread_fence_acq(); - return (m->queue); + return (as.queue); } static inline bool diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 7e638821cd34..b31fbd789b5f 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -218,7 +218,7 @@ vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq, { vm_pagequeue_assert_locked(pq); - KASSERT((marker->aflags & PGA_ENQUEUED) == 0, + KASSERT((vm_page_aflags(marker) & PGA_ENQUEUED) == 0, ("marker %p already enqueued", marker)); if (after == NULL) @@ -242,7 +242,7 @@ vm_pageout_end_scan(struct scan_state *ss) pq = ss->pq; vm_pagequeue_assert_locked(pq); - KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0, + KASSERT((vm_page_aflags(ss->marker) & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q); @@ -271,7 +271,7 @@ vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) marker = ss->marker; pq = ss->pq; - KASSERT((marker->aflags & PGA_ENQUEUED) != 0, + KASSERT((vm_page_aflags(marker) & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); vm_pagequeue_lock(pq); @@ -280,7 +280,7 @@ vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) m = n, ss->scanned++) { n = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) == 0) { - KASSERT((m->aflags & PGA_ENQUEUED) != 0, + KASSERT((vm_page_aflags(m) & PGA_ENQUEUED) != 0, ("page %p not enqueued", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in page queue", m)); @@ -376,14 +376,11 @@ vm_pageout_cluster(vm_page_t m) vm_page_xunbusy(p); break; } - vm_page_lock(p); if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { - vm_page_unlock(p); vm_page_xunbusy(p); ib = 0; break; } - vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; @@ -405,17 +402,11 @@ vm_pageout_cluster(vm_page_t m) break; } vm_page_test_dirty(p); - if (p->dirty == 0) { - vm_page_xunbusy(p); - break; - } - vm_page_lock(p); - if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { - vm_page_unlock(p); + if (p->dirty == 0 || !vm_page_in_laundry(p) || + !vm_page_try_remove_write(p)) { vm_page_xunbusy(p); break; } - vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; @@ -472,7 +463,7 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, KASSERT(vm_page_all_valid(mc[i]), ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); - KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0, + KASSERT((vm_page_aflags(mc[i]) & PGA_WRITEABLE) == 0, ("vm_pageout_flush: writeable page %p", mc[i])); vm_page_busy_downgrade(mc[i]); } @@ -591,7 +582,6 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) vm_pindex_t pindex; int error, lockmode; - vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; @@ -611,7 +601,6 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) * of time. 
*/ if (object->type == OBJT_VNODE) { - vm_page_unlock(m); vm_page_xunbusy(m); vp = object->handle; if (vp->v_type == VREG && @@ -642,7 +631,6 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) error = ENOENT; goto unlock_all; } - vm_page_lock(m); /* * While the object and page were unlocked, the page @@ -679,7 +667,6 @@ vm_pageout_clean(vm_page_t m, int *numpagedout) error = EBUSY; goto unlock_all; } - vm_page_unlock(m); /* * If a page is dirty, then it is either being washed @@ -715,14 +702,13 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { struct scan_state ss; struct vm_pagequeue *pq; - struct mtx *mtx; vm_object_t object; vm_page_t m, marker; - int act_delta, error, numpagedout, queue, starting_target; + vm_page_astate_t old, new; + int act_delta, error, numpagedout, queue, refs, starting_target; int vnodes_skipped; bool pageout_ok; - mtx = NULL; object = NULL; starting_target = launder; vnodes_skipped = 0; @@ -750,78 +736,46 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) if (__predict_false((m->flags & PG_MARKER) != 0)) continue; - vm_page_change_lock(m, &mtx); - -recheck: /* - * The page may have been disassociated from the queue - * or even freed while locks were dropped. We thus must be - * careful whenever modifying page state. Once the object lock - * has been acquired, we have a stable reference to the page. + * Perform some quick and racy checks of the page's queue state. + * Bail if things are not as we expect. */ - if (vm_page_queue(m) != queue) + old = vm_page_astate_load(m); + if (old.queue != PQ_LAUNDRY || (old.flags & PGA_ENQUEUED) == 0) continue; - - /* - * A requeue was requested, so this page gets a second - * chance. - */ - if ((m->aflags & PGA_REQUEUE) != 0) { + if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { vm_page_pqbatch_submit(m, queue); continue; } - /* - * Wired pages may not be freed. Complete their removal - * from the queue now to avoid needless revisits during - * future scans. This check is racy and must be reverified once - * we hold the object lock and have verified that the page - * is not busy. - */ - if (vm_page_wired(m)) { - vm_page_dequeue_deferred(m); - continue; - } - if (object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); - - /* - * A page's object pointer may be set to NULL before - * the object lock is acquired. - */ object = (vm_object_t)atomic_load_ptr(&m->object); - if (object != NULL && !VM_OBJECT_TRYWLOCK(object)) { - mtx_unlock(mtx); - /* Depends on type-stability. */ - VM_OBJECT_WLOCK(object); - mtx_lock(mtx); - goto recheck; + if (object == NULL) + continue; + VM_OBJECT_WLOCK(object); + if (m->object != object) { + VM_OBJECT_WUNLOCK(object); + object = NULL; + continue; } } - if (__predict_false(m->object == NULL)) - /* - * The page has been removed from its object. - */ - continue; - KASSERT(m->object == object, ("page %p does not belong to %p", - m, object)); if (vm_page_tryxbusy(m) == 0) continue; /* - * Re-check for wirings now that we hold the object lock and - * have verified that the page is unbusied. If the page is - * mapped, it may still be wired by pmap lookups. The call to + * Check for wirings now that we hold the object lock and have + * verified that the page is unbusied. If the page is mapped, + * it may still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase. 
*/ if (__predict_false(vm_page_wired(m))) { vm_page_xunbusy(m); - vm_page_dequeue_deferred(m); + vm_page_pqbatch_submit(m, queue); continue; } @@ -841,48 +795,69 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) * that a reference from a concurrently destroyed mapping is * observed here and now. */ - if (object->ref_count != 0) - act_delta = pmap_ts_referenced(m); - else { - KASSERT(!pmap_page_is_mapped(m), - ("page %p is mapped", m)); - act_delta = 0; - } - if ((m->aflags & PGA_REFERENCED) != 0) { - vm_page_aflag_clear(m, PGA_REFERENCED); - act_delta++; - } - if (act_delta != 0) { - if (object->ref_count != 0) { - vm_page_xunbusy(m); - VM_CNT_INC(v_reactivated); - vm_page_activate(m); + refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; - /* - * Increase the activation count if the page - * was referenced while in the laundry queue. - * This makes it less likely that the page will - * be returned prematurely to the inactive - * queue. - */ - m->act_count += act_delta + ACT_ADVANCE; + for (old = vm_page_astate_load(m);;) { + if (old.queue != queue || + (old.flags & PGA_ENQUEUED) == 0) { + vm_page_xunbusy(m); + goto next_page; + } - /* - * If this was a background laundering, count - * activated pages towards our target. The - * purpose of background laundering is to ensure - * that pages are eventually cycled through the - * laundry queue, and an activation is a valid - * way out. - */ - if (!in_shortfall) - launder--; - continue; - } else if ((object->flags & OBJ_DEAD) == 0) { + if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { vm_page_xunbusy(m); - vm_page_requeue(m); - continue; + vm_page_pqbatch_submit(m, queue); + goto next_page; + } + + new = old; + act_delta = refs; + if ((old.flags & PGA_REFERENCED) != 0) { + new.flags &= ~PGA_REFERENCED; + act_delta++; } + if (act_delta != 0) { + if (object->ref_count != 0) { + /* + * Increase the activation count if the + * page was referenced while in the + * laundry queue. This makes it less + * likely that the page will be returned + * prematurely to the inactive queue. + */ + new.act_count += ACT_ADVANCE + + act_delta; + if (new.act_count > ACT_MAX) + new.act_count = ACT_MAX; + + new.flags |= PGA_REQUEUE; + new.queue = PQ_ACTIVE; + if (!vm_page_pqstate_commit(m, &old, + new)) + continue; + + vm_page_xunbusy(m); + VM_CNT_INC(v_reactivated); + + /* + * If this was a background laundering, + * count activated pages towards our + * target. The purpose of background + * laundering is to ensure that pages + * are eventually cycled through the + * laundry queue, and an activation is a + * valid way out. 
+ */ + if (!in_shortfall) + launder--; + goto next_page; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_page_xunbusy(m); + vm_page_launder(m); + goto next_page; + } + } + break; } /* @@ -896,7 +871,7 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) { vm_page_xunbusy(m); - vm_page_dequeue_deferred(m); + vm_page_pqbatch_submit(m, queue); continue; } } @@ -921,7 +896,7 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) pageout_ok = true; if (!pageout_ok) { vm_page_xunbusy(m); - vm_page_requeue(m); + vm_page_launder(m); continue; } @@ -946,14 +921,9 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) pageout_lock_miss++; vnodes_skipped++; } - mtx = NULL; object = NULL; - } else - vm_page_xunbusy(m); - } - if (mtx != NULL) { - mtx_unlock(mtx); - mtx = NULL; + } +next_page:; } if (object != NULL) { VM_OBJECT_WUNLOCK(object); @@ -1191,12 +1161,13 @@ static void vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) { struct scan_state ss; - struct mtx *mtx; vm_object_t object; vm_page_t m, marker; + vm_page_astate_t old, new; struct vm_pagequeue *pq; long min_scan; - int act_delta, max_scan, scan_tick; + int act_delta, max_scan, ps_delta, refs, scan_tick; + uint8_t nqueue; marker = &vmd->vmd_markers[PQ_ACTIVE]; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; @@ -1230,7 +1201,6 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) * and scanning resumes. */ max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan; - mtx = NULL; act_scan: vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan); while ((m = vm_pageout_next(&ss, false)) != NULL) { @@ -1249,29 +1219,6 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) if (__predict_false((m->flags & PG_MARKER) != 0)) continue; - vm_page_change_lock(m, &mtx); - - /* - * The page may have been disassociated from the queue - * or even freed while locks were dropped. We thus must be - * careful whenever modifying page state. Once the object lock - * has been acquired, we have a stable reference to the page. - */ - if (vm_page_queue(m) != PQ_ACTIVE) - continue; - - /* - * Wired pages are dequeued lazily. - */ - if (vm_page_wired(m)) { - vm_page_dequeue_deferred(m); - continue; - } - - /* - * A page's object pointer may be set to NULL before - * the object lock is acquired. - */ object = (vm_object_t)atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* @@ -1286,80 +1233,104 @@ vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) * that a reference from a concurrently destroyed mapping is * observed here and now. * - * Perform an unsynchronized object ref count check. While - * the page lock ensures that the page is not reallocated to - * another object, in particular, one with unmanaged mappings - * that cannot support pmap_ts_referenced(), two races are, + * Perform an unsynchronized object ref count check. While the + * page lock ensures that the page is not reallocated to another + * object, in particular, one with unmanaged mappings that + * cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: + * * 1) The count was transitioning to zero, but we saw a non- - * zero value. pmap_ts_referenced() will return zero - * because the page is not mapped. - * 2) The count was transitioning to one, but we saw zero. - * This race delays the detection of a new reference. 
At - * worst, we will deactivate and reactivate the page. + * zero value. pmap_ts_referenced() will return zero because + * the page is not mapped. + * 2) The count was transitioning to one, but we saw zero. This + * race delays the detection of a new reference. At worst, + * we will deactivate and reactivate the page. */ - if (object->ref_count != 0) - act_delta = pmap_ts_referenced(m); - else - act_delta = 0; - if ((m->aflags & PGA_REFERENCED) != 0) { - vm_page_aflag_clear(m, PGA_REFERENCED); - act_delta++; - } + refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; - /* - * Advance or decay the act_count based on recent usage. - */ - if (act_delta != 0) { - m->act_count += ACT_ADVANCE + act_delta; - if (m->act_count > ACT_MAX) - m->act_count = ACT_MAX; - } else - m->act_count -= min(m->act_count, ACT_DECLINE); + for (old = vm_page_astate_load(m);;) { + if (old.queue != PQ_ACTIVE || + (old.flags & PGA_ENQUEUED) == 0) + /* + * Something has moved the page out of the + * active queue. Don't touch it. + */ + break; + if ((old.flags & PGA_DEQUEUE) != 0) { + vm_page_pqbatch_submit(m, PQ_ACTIVE); + break; + } + + new = old; + act_delta = refs; + if ((old.flags & PGA_REFERENCED) != 0) { + new.flags &= ~PGA_REFERENCED; + act_delta++; + } - if (m->act_count == 0) { /* - * When not short for inactive pages, let dirty pages go - * through the inactive queue before moving to the - * laundry queues. This gives them some extra time to - * be reactivated, potentially avoiding an expensive - * pageout. However, during a page shortage, the - * inactive queue is necessarily small, and so dirty - * pages would only spend a trivial amount of time in - * the inactive queue. Therefore, we might as well - * place them directly in the laundry queue to reduce - * queuing overhead. + * Advance or decay the act_count based on recent usage. */ - if (page_shortage <= 0) { - vm_page_swapqueue(m, PQ_ACTIVE, PQ_INACTIVE); + if (act_delta != 0) { + new.act_count += ACT_ADVANCE + act_delta; + if (new.act_count > ACT_MAX) + new.act_count = ACT_MAX; + } else { + new.act_count -= min(new.act_count, ACT_DECLINE); + } + + if (new.act_count > 0) { + /* + * Adjust the activation count and keep the page + * in the active queue. The count might be left + * unchanged if it is saturated. + */ + if (new.act_count == old.act_count || + vm_page_astate_fcmpset(m, &old, new)) + break; } else { /* + * When not short for inactive pages, let dirty + * pages go through the inactive queue before + * moving to the laundry queues. This gives + * them some extra time to be reactivated, + * potentially avoiding an expensive pageout. + * However, during a page shortage, the inactive + * queue is necessarily small, and so dirty + * pages would only spend a trivial amount of + * time in the inactive queue. Therefore, we + * might as well place them directly in the + * laundry queue to reduce queuing overhead. + * * Calling vm_page_test_dirty() here would * require acquisition of the object's write * lock. However, during a page shortage, - * directing dirty pages into the laundry - * queue is only an optimization and not a + * directing dirty pages into the laundry queue + * is only an optimization and not a * requirement. Therefore, we simply rely on - * the opportunistic updates to the page's - * dirty field by the pmap. + * the opportunistic updates to the page's dirty + * field by the pmap. 
*/ - if (m->dirty == 0) { - vm_page_swapqueue(m, PQ_ACTIVE, - PQ_INACTIVE); - page_shortage -= - act_scan_laundry_weight; + if (page_shortage <= 0) { + nqueue = PQ_INACTIVE; + ps_delta = 0; + } else if (m->dirty == 0) { + nqueue = PQ_INACTIVE; + ps_delta = act_scan_laundry_weight; } else { - vm_page_swapqueue(m, PQ_ACTIVE, - PQ_LAUNDRY); - page_shortage--; + nqueue = PQ_LAUNDRY; + ps_delta = 1; + } + + new.flags |= PGA_REQUEUE; + new.queue = nqueue; + if (vm_page_pqstate_commit(m, &old, new)) { + page_shortage -= ps_delta; + break; } } } } - if (mtx != NULL) { - mtx_unlock(mtx); - mtx = NULL; - } vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q); @@ -1371,20 +1342,30 @@ static int vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m) { struct vm_domain *vmd; + vm_page_astate_t old, new; - if (m->queue != PQ_INACTIVE || (m->aflags & PGA_ENQUEUED) != 0) - return (0); - vm_page_aflag_set(m, PGA_ENQUEUED); - if ((m->aflags & PGA_REQUEUE_HEAD) != 0) { - vmd = vm_pagequeue_domain(m); - TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); - vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD); - } else if ((m->aflags & PGA_REQUEUE) != 0) { - TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q); - vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD); - } else - TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q); - return (1); + for (old = vm_page_astate_load(m);;) { + if (old.queue != PQ_INACTIVE || + (old.flags & (PGA_DEQUEUE | PGA_ENQUEUED)) != 0) + break; + + new = old; + new.flags |= PGA_ENQUEUED; + new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); + if (!vm_page_astate_fcmpset(m, &old, new)) + continue; + + if ((old.flags & PGA_REQUEUE_HEAD) != 0) { + vmd = vm_pagequeue_domain(m); + TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); + } else if ((old.flags & PGA_REQUEUE) != 0) { + TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q); + } else { + TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q); + } + return (1); + } + return (0); } /* @@ -1427,11 +1408,11 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, { struct scan_state ss; struct vm_batchqueue rq; - struct mtx *mtx; vm_page_t m, marker; + vm_page_astate_t old, new; struct vm_pagequeue *pq; vm_object_t object; - int act_delta, addl_page_shortage, deficit, page_shortage; + int act_delta, addl_page_shortage, deficit, page_shortage, refs; int starting_page_shortage; /* @@ -1451,7 +1432,6 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit); starting_page_shortage = page_shortage = shortage + deficit; - mtx = NULL; object = NULL; vm_batchqueue_init(&rq); @@ -1469,65 +1449,31 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, KASSERT((m->flags & PG_MARKER) == 0, ("marker page %p was dequeued", m)); - vm_page_change_lock(m, &mtx); - -recheck: /* - * The page may have been disassociated from the queue - * or even freed while locks were dropped. We thus must be - * careful whenever modifying page state. Once the object lock - * has been acquired, we have a stable reference to the page. + * Perform some quick and racy checks of the page's queue state. + * Bail if things are not as we expect. 
*/ - if (vm_page_queue(m) != PQ_INACTIVE) { - addl_page_shortage++; + old = vm_page_astate_load(m); + if (old.queue != PQ_INACTIVE || (old.flags & PGA_ENQUEUED) != 0) continue; - } - - /* - * The page was re-enqueued after the page queue lock was - * dropped, or a requeue was requested. This page gets a second - * chance. - */ - if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE | - PGA_REQUEUE_HEAD)) != 0) - goto reinsert; - - /* - * Wired pages may not be freed. Complete their removal - * from the queue now to avoid needless revisits during - * future scans. This check is racy and must be reverified once - * we hold the object lock and have verified that the page - * is not busy. - */ - if (vm_page_wired(m)) { - vm_page_dequeue_deferred(m); + if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { + vm_page_pqbatch_submit(m, PQ_INACTIVE); continue; } if (object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); - - /* - * A page's object pointer may be set to NULL before - * the object lock is acquired. - */ object = (vm_object_t)atomic_load_ptr(&m->object); - if (object != NULL && !VM_OBJECT_TRYWLOCK(object)) { - mtx_unlock(mtx); - /* Depends on type-stability. */ - VM_OBJECT_WLOCK(object); - mtx_lock(mtx); - goto recheck; + if (object == NULL) + continue; + VM_OBJECT_WLOCK(object); + if (m->object != object) { + VM_OBJECT_WUNLOCK(object); + object = NULL; + goto reinsert; } } - if (__predict_false(m->object == NULL)) - /* - * The page has been removed from its object. - */ - continue; - KASSERT(m->object == object, ("page %p does not belong to %p", - m, object)); if (vm_page_tryxbusy(m) == 0) { /* @@ -1543,16 +1489,16 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, } /* - * Re-check for wirings now that we hold the object lock and - * have verified that the page is unbusied. If the page is - * mapped, it may still be wired by pmap lookups. The call to + * Check for wirings now that we hold the object lock and have + * verified that the page is unbusied. If the page is mapped, + * it may still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase. */ if (__predict_false(vm_page_wired(m))) { vm_page_xunbusy(m); - vm_page_dequeue_deferred(m); + vm_page_pqbatch_submit(m, PQ_INACTIVE); continue; } @@ -1572,37 +1518,57 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, * that a reference from a concurrently destroyed mapping is * observed here and now. */ - if (object->ref_count != 0) - act_delta = pmap_ts_referenced(m); - else { - KASSERT(!pmap_page_is_mapped(m), - ("page %p is mapped", m)); - act_delta = 0; - } - if ((m->aflags & PGA_REFERENCED) != 0) { - vm_page_aflag_clear(m, PGA_REFERENCED); - act_delta++; - } - if (act_delta != 0) { - if (object->ref_count != 0) { + refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; + + for (old = vm_page_astate_load(m);;) { + if (old.queue != PQ_INACTIVE || + (old.flags & PGA_ENQUEUED) != 0) { vm_page_xunbusy(m); - VM_CNT_INC(v_reactivated); - vm_page_activate(m); + goto next_page; + } - /* - * Increase the activation count if the page - * was referenced while in the inactive queue. - * This makes it less likely that the page will - * be returned prematurely to the inactive - * queue. 
- */ - m->act_count += act_delta + ACT_ADVANCE; - continue; - } else if ((object->flags & OBJ_DEAD) == 0) { + if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { vm_page_xunbusy(m); - vm_page_aflag_set(m, PGA_REQUEUE); - goto reinsert; + vm_page_pqbatch_submit(m, PQ_INACTIVE); + goto next_page; } + + new = old; + act_delta = refs; + if ((old.flags & PGA_REFERENCED) != 0) { + new.flags &= ~PGA_REFERENCED; + act_delta++; + } + if (act_delta != 0) { + if (object->ref_count != 0) { + /* + * Increase the activation count if the + * page was referenced while in the + * inactive queue. This makes it less + * likely that the page will be returned + * prematurely to the inactive queue. + */ + new.act_count += ACT_ADVANCE + + act_delta; + if (new.act_count > ACT_MAX) + new.act_count = ACT_MAX; + + new.flags |= PGA_REQUEUE; + new.queue = PQ_ACTIVE; + if (!vm_page_pqstate_commit(m, &old, + new)) + continue; + + vm_page_xunbusy(m); + VM_CNT_INC(v_reactivated); + goto next_page; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_page_xunbusy(m); + vm_page_aflag_set(m, PGA_REQUEUE); + goto reinsert; + } + } + break; } /* @@ -1616,7 +1582,7 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) { vm_page_xunbusy(m); - vm_page_dequeue_deferred(m); + vm_page_pqbatch_submit(m, PQ_INACTIVE); continue; } } @@ -1630,15 +1596,23 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, */ if (m->dirty == 0) { free_page: + /* XXX comment */ + old = vm_page_astate_load(m); + if (old.queue != PQ_INACTIVE || + (old.flags & PGA_QUEUE_STATE_MASK) != 0) { + vm_page_xunbusy(m); + vm_page_pqbatch_submit(m, PQ_INACTIVE); + /* XXX why not continue */ + goto next_page; + } + /* * Because we dequeued the page and have already * checked for concurrent dequeue and enqueue * requests, we can safely disassociate the page * from the inactive queue. */ - KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0, - ("page %p has queue state", m)); - m->queue = PQ_NONE; + m->astate.queue = PQ_NONE; vm_page_free(m); page_shortage--; continue; @@ -1646,12 +1620,11 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, vm_page_xunbusy(m); if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); +next_page: continue; reinsert: vm_pageout_reinsert_inactive(&ss, &rq, m); } - if (mtx != NULL) - mtx_unlock(mtx); if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_pageout_reinsert_inactive(&ss, &rq, NULL); diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index ba5e77ce6c8d..b3e244755a05 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -202,6 +202,8 @@ static inline void vm_pagequeue_remove(struct vm_pagequeue *pq, vm_page_t m) { + vm_pagequeue_assert_locked(pq); + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } @@ -249,6 +251,22 @@ vm_pagequeue_domain(vm_page_t m) return (VM_DOMAIN(vm_phys_domain(m))); } +static inline struct vm_pagequeue * +_vm_page_pagequeue(vm_page_t m, uint8_t queue) +{ + + if (queue == PQ_NONE) + return (NULL); + return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); +} + +static inline struct vm_pagequeue * +vm_page_pagequeue(vm_page_t m) +{ + + return (_vm_page_pagequeue(m, atomic_load_8(&m->astate.queue))); +} + /* * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. 
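/*
 * Illustrative sketch (not part of the patch): a user-space model of the
 * load/modify/fcmpset pattern that vm_page_astate_fcmpset() and the scan
 * loops above rely on.  Queue index, activity count, and atomic flag bits
 * are packed into one 32-bit word and updated with a compare-and-swap
 * retry loop.  All names here (demo_astate, demo_page, demo_activate) are
 * hypothetical; this uses C11 <stdatomic.h> rather than the kernel's
 * atomic(9) routines, and the field layout assumes little-endian.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PQ_NONE		0xff
#define DEMO_PQ_ACTIVE		1
#define DEMO_PGA_REQUEUE	0x0004

typedef union {
	struct {
		uint16_t flags;		/* "aflags": atomic flag bits */
		uint8_t queue;		/* page queue index */
		uint8_t act_count;	/* activity counter */
	};
	uint32_t bits;			/* the whole state as one word */
} demo_astate;

typedef struct {
	_Atomic uint32_t astate;	/* all per-page queue state */
} demo_page;

/* Request a move to the active queue unless the page is already there. */
static int
demo_activate(demo_page *p)
{
	demo_astate old, new;

	old.bits = atomic_load(&p->astate);
	do {
		if (old.queue == DEMO_PQ_ACTIVE)
			return (0);	/* already active; nothing to do */
		new = old;
		new.queue = DEMO_PQ_ACTIVE;
		new.flags |= DEMO_PGA_REQUEUE;
		/* On failure the CAS reloads "old" and we recompute "new". */
	} while (!atomic_compare_exchange_weak(&p->astate, &old.bits,
	    new.bits));
	return (1);
}

int
main(void)
{
	demo_page p;
	demo_astate init;

	init.flags = 0;
	init.queue = DEMO_PQ_NONE;
	init.act_count = 0;
	atomic_init(&p.astate, init.bits);

	printf("first call moved the page: %d\n", demo_activate(&p));
	printf("second call was a no-op:   %d\n", !demo_activate(&p));
	return (0);
}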
diff --git a/sys/vm/vm_swapout.c b/sys/vm/vm_swapout.c
index 28a6a4d91afe..5eaccb9c3c8c 100644
--- a/sys/vm/vm_swapout.c
+++ b/sys/vm/vm_swapout.c
@@ -108,8 +109,9 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
-#include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -170,6 +171,58 @@ static void swapout_procs(int action);
 static void vm_req_vmdaemon(int req);
 static void vm_thread_swapout(struct thread *td);
 
+static void
+vm_swapout_object_deactivate_page(vm_page_t m, int remove_mode)
+{
+	vm_page_astate_t old, new;
+	int act_delta, refs;
+
+	refs = pmap_ts_referenced(m);
+
+	for (old = vm_page_astate_load(m);;) {
+		if ((old.flags & PGA_DEQUEUE) != 0)
+			break;
+		new = old;
+
+		act_delta = refs;
+		if ((old.flags & PGA_REFERENCED) != 0) {
+			new.flags &= ~PGA_REFERENCED;
+			act_delta++;
+		}
+
+		if (old.queue != PQ_ACTIVE && act_delta != 0) {
+			if (new.act_count == ACT_MAX)
+				break;
+			new.act_count += act_delta;
+			new.flags |= PGA_REQUEUE;
+			new.queue = PQ_ACTIVE;
+			if (vm_page_pqstate_commit(m, &old, new))
+				break;
+		} else if (old.queue == PQ_ACTIVE) {
+			if (act_delta == 0) {
+				new.act_count -= min(new.act_count,
+				    ACT_DECLINE);
+				if (!remove_mode && new.act_count == 0) {
+					(void)vm_page_try_remove_all(m);
+
+					new.flags |= PGA_REQUEUE;
+					new.queue = PQ_INACTIVE;
+				}
+				if (vm_page_pqstate_commit(m, &old, new))
+					break;
+			} else {
+				if (new.act_count < ACT_MAX - ACT_ADVANCE)
+					new.act_count += ACT_ADVANCE;
+				if (vm_page_astate_fcmpset(m, &old, new))
+					break;
+			}
+		} else {
+			(void)vm_page_try_remove_all(m);
+			break;
+		}
+	}
+}
+
 /*
  * vm_swapout_object_deactivate_pages
  *
@@ -184,7 +237,7 @@ vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
 {
 	vm_object_t backing_object, object;
 	vm_page_t p;
-	int act_delta, remove_mode;
+	int remove_mode;
 
 	VM_OBJECT_ASSERT_LOCKED(first_object);
 	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
@@ -222,37 +275,8 @@ vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
 			vm_page_xunbusy(p);
 			continue;
 		}
-		act_delta = pmap_ts_referenced(p);
-		vm_page_lock(p);
-		if ((p->aflags & PGA_REFERENCED) != 0) {
-			if (act_delta == 0)
-				act_delta = 1;
-			vm_page_aflag_clear(p, PGA_REFERENCED);
-		}
-		if (!vm_page_active(p) && act_delta != 0) {
-			vm_page_activate(p);
-			p->act_count += act_delta;
-		} else if (vm_page_active(p)) {
-			/*
-			 * The page daemon does not requeue pages
-			 * after modifying their activation count.
-			 */
-			if (act_delta == 0) {
-				p->act_count -= min(p->act_count,
-				    ACT_DECLINE);
-				if (!remove_mode && p->act_count == 0) {
-					(void)vm_page_try_remove_all(p);
-					vm_page_deactivate(p);
-				}
-			} else {
-				vm_page_activate(p);
-				if (p->act_count < ACT_MAX -
-				    ACT_ADVANCE)
-					p->act_count += ACT_ADVANCE;
-			}
-		} else if (vm_page_inactive(p))
-			(void)vm_page_try_remove_all(p);
-		vm_page_unlock(p);
+
+		vm_swapout_object_deactivate_page(p, remove_mode);
 		vm_page_xunbusy(p);
 	}
 	if ((backing_object = object->backing_object) == NULL)
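/*
 * Illustrative sketch (not part of the patch): the active-page aging policy
 * that vm_pageout_scan_active() and vm_swapout_object_deactivate_page()
 * apply above, reduced to a pure user-space function so the requeue
 * decision is easy to follow.  The names (demo_age_page, DEMO_*) are
 * hypothetical; the ACT_* values mirror the kernel defaults but are
 * restated here only for the example.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_ACT_DECLINE	1
#define DEMO_ACT_ADVANCE	3
#define DEMO_ACT_MAX		64

enum demo_queue { DEMO_PQ_ACTIVE, DEMO_PQ_INACTIVE, DEMO_PQ_LAUNDRY };

/*
 * Age one active page.  "refs" is the number of reference bits harvested
 * from the pmap, plus one if the soft PGA_REFERENCED flag was set.
 * Returns the queue the page should end up in and updates *act_count.
 */
static enum demo_queue
demo_age_page(int refs, unsigned *act_count, bool dirty, int page_shortage)
{
	if (refs != 0) {
		/* Referenced: advance the activity count, stay active. */
		*act_count += DEMO_ACT_ADVANCE + refs;
		if (*act_count > DEMO_ACT_MAX)
			*act_count = DEMO_ACT_MAX;
		return (DEMO_PQ_ACTIVE);
	}
	/* Unreferenced: decay, and demote once the count reaches zero. */
	*act_count -= (*act_count < DEMO_ACT_DECLINE) ?
	    *act_count : DEMO_ACT_DECLINE;
	if (*act_count > 0)
		return (DEMO_PQ_ACTIVE);
	/*
	 * With no shortage, even dirty pages take the inactive path and get
	 * a second chance; under a shortage, dirty pages go straight to the
	 * laundry so queuing overhead is not wasted on a short inactive queue.
	 */
	if (page_shortage > 0 && dirty)
		return (DEMO_PQ_LAUNDRY);
	return (DEMO_PQ_INACTIVE);
}

int
main(void)
{
	unsigned ac = 1;

	printf("%d\n", demo_age_page(0, &ac, true, 0));   /* -> inactive */
	printf("%d\n", demo_age_page(0, &ac, true, 10));  /* -> laundry */
	printf("%d\n", demo_age_page(2, &ac, false, 0));  /* -> active */
	return (0);
}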