Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c	(revision 253884)
+++ sys/amd64/amd64/pmap.c	(working copy)
@@ -317,6 +317,12 @@
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
     "Are large page mappings enabled?");
 
+#ifdef INVARIANTS
+static int emulate_ad_bits = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, emulate_ad_bits, CTLFLAG_RDTUN,
+    &emulate_ad_bits, 0, "Are accessed/dirty bits emulated?");
+#endif
+
 #define	PAT_INDEX_SIZE	8
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
@@ -344,6 +350,8 @@
 pt_entry_t *CMAP1 = 0;
 caddr_t CADDR1 = 0;
 
+static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
+
 /*
  * Crashdump maps.
  */
@@ -749,6 +757,15 @@
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
 
+#ifdef INVARIANTS
+	/*
+	 * Do software emulation of accessed and dirty bits for x86 pmaps.
+	 */
+	TUNABLE_INT_FETCH("vm.pmap.emulate_ad_bits", &emulate_ad_bits);
+	if (emulate_ad_bits)
+		pmap_flags |= PMAP_EMULATE_AD_BITS;
+#endif
+
 	/*
 	 * Create an initial set of page tables to run the kernel in.
 	 */
@@ -773,7 +790,7 @@
 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
-	kernel_pmap->pm_flags = PMAP_PDE_SUPERPAGE;
+	kernel_pmap->pm_flags = pmap_flags;
 
 	/*
 	 * Initialize the global pv list lock.
@@ -1089,6 +1106,13 @@
 }
 
 static __inline boolean_t
+pmap_emulate_ad_bits(pmap_t pmap)
+{
+
+	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
+}
+
+static __inline boolean_t
 pmap_ps_enabled(pmap_t pmap)
 {
@@ -1445,7 +1469,7 @@
 		invlpg(addr);
 		break;
 	case PT_EPT:
-		pmap->eptgen++;
+		pmap->pm_eptgen++;
 		break;
 	default:
 		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
@@ -1977,7 +2001,7 @@
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
-	pmap->pm_flags = PMAP_PDE_SUPERPAGE;
+	pmap->pm_flags = pmap_flags;
 }
 
 /*
@@ -2031,6 +2055,7 @@
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = flags;
+	pmap->pm_eptgen = 0;
 
 	return (1);
 }
@@ -2039,7 +2064,7 @@
 pmap_pinit(pmap_t pmap)
 {
 
-	return (pmap_pinit_type(pmap, PT_X86, PMAP_PDE_SUPERPAGE));
+	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
 }
 
 /*
@@ -2473,7 +2498,7 @@
 	vm_page_t free, m, m_pc;
 	uint64_t inuse;
 	int bit, field, freed;
-	
+
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
@@ -2527,8 +2552,11 @@
 			if ((tpte & PG_G) != 0)
 				pmap_invalidate_page(pmap, va);
 			m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
-			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((tpte & PG_RO) == 0,
+				    ("readonly modified PTE %#lx", tpte));
 				vm_page_dirty(m);
+			}
 			if ((tpte & PG_A) != 0)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
@@ -3187,8 +3215,11 @@
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
-			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
+			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpde & PG_RO) == 0,
+				    ("readonly modified PDE %#lx", oldpde));
 				vm_page_dirty(m);
+			}
 			if (oldpde & PG_A)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
@@ -3235,8 +3266,11 @@
 	pmap_resident_count_dec(pmap, 1);
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
-		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			KASSERT((oldpte & PG_RO) == 0,
+			    ("readonly modified PTE %#lx", oldpte));
 			vm_page_dirty(m);
+		}
 		if (oldpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
@@ -3480,8 +3514,11 @@
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
-		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			KASSERT((tpte & PG_RO) == 0,
+			    ("readonly modified PTE %#lx", tpte));
 			vm_page_dirty(m);
+		}
 		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
@@ -3518,11 +3555,17 @@
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++)
-			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
+			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpde & PG_RO) == 0,
+				    ("readonly modified PDE %#lx", oldpde));
 				vm_page_dirty(m);
+			}
 	}
-	if ((prot & VM_PROT_WRITE) == 0)
+	if ((prot & VM_PROT_WRITE) == 0) {
 		newpde &= ~(PG_RW | PG_M);
+		if (pmap_emulate_ad_bits(pmap))
+			newpde |= PG_RO;
+	}
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpde |= pg_nx;
 	if (newpde != oldpde) {
@@ -3652,10 +3695,15 @@
 			if ((prot & VM_PROT_WRITE) == 0) {
 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 				    (PG_MANAGED | PG_M | PG_RW)) {
+					KASSERT((pbits & PG_RO) == 0,
+					    ("readonly modified PTE %#lx",
+					    pbits));
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				pbits &= ~(PG_RW | PG_M);
+				if (pmap_emulate_ad_bits(pmap))
+					pbits |= PG_RO;
 			}
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
@@ -3716,6 +3764,8 @@
 		return;
 	}
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
+		KASSERT(!pmap_emulate_ad_bits(pmap),
+		    ("invalid RW/M bits for dirty bit emulation %#lx", newpde));
 		/*
 		 * When PG_M is already clear, PG_RW can be cleared without
 		 * a TLB invalidation.
@@ -3741,6 +3791,9 @@
 			return;
 		}
 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
+			KASSERT(!pmap_emulate_ad_bits(pmap),
+			    ("invalid RW/M bits for dirty bit "
+			    "emulation %#lx", oldpte));
 			/*
 			 * When PG_M is already clear, PG_RW can be cleared
 			 * without a TLB invalidation.
@@ -3799,6 +3852,14 @@
 		    " in pmap %p", va, pmap);
 }
 
+static __inline boolean_t
+pmap_writeable_mapping(pmap_t pmap, pt_entry_t pte)
+{
+
+	return ((pte & PG_RW) != 0 ||
+	    (pmap_emulate_ad_bits(pmap) && (pte & PG_RO) == 0));
+}
+
 /*
  * Insert the given physical page (p) at
  * the specified virtual address (v) in the
@@ -3855,6 +3916,38 @@
 		newpte |= PG_G;
 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
 
+	if (pmap_emulate_ad_bits(pmap)) {
+		/*
+		 * Set modified bit gratuitously for writeable mappings if
+		 * the page is unmanaged. We do not want to take a fault
+		 * to do the dirty bit accounting for these mappings.
+		 */
+		if ((m->oflags & VPO_UNMANAGED) != 0) {
+			if ((newpte & PG_RW) != 0)
+				newpte |= PG_M;
+		}
+
+		/*
+		 * Dirty bit emulation enforces the following PG_RW behavior:
+		 * - if PG_RW = 1 then PG_M = 1
+		 * - if PG_RW = 0 then PG_M = 0
+		 *
+		 * If PG_RW = 0 then there are two possibilities:
+		 * - the mapping is permanently readonly (PG_RO = 1)
+		 * - the mapping is temporarily readonly for dirty bit emulation
+		 */
+		if ((newpte & PG_RW) == 0)
+			newpte |= PG_RO;
+		else if ((newpte & PG_M) == 0)
+			newpte &= ~PG_RW;
+
+		if (((newpte & (PG_M | PG_RW)) != (PG_M | PG_RW)) &&
+		    ((newpte & (PG_M | PG_RW)) != 0)) {
+			panic("pmap_enter: invalid rw/modified bits for "
+			    "dirty bit emulation %#lx", newpte);
+		}
+	}
+
 	mpte = NULL;
 	lock = NULL;
@@ -3921,7 +4014,7 @@
 		 */
 		if ((origpte & PG_MANAGED) != 0) {
 			newpte |= PG_MANAGED;
-			if ((newpte & PG_RW) != 0)
+			if (pmap_writeable_mapping(pmap, newpte))
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 		}
 		if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
@@ -3946,7 +4039,7 @@
 		pv->pv_va = va;
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
-		if ((newpte & PG_RW) != 0)
+		if (pmap_writeable_mapping(pmap, newpte))
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
@@ -3961,8 +4054,12 @@
 			if ((origpte & PG_MANAGED) != 0) {
 				om = PHYS_TO_VM_PAGE(opa);
 				if ((origpte & (PG_M |
-				    PG_RW)) == (PG_M | PG_RW))
+				    PG_RW)) == (PG_M | PG_RW)) {
+					KASSERT((origpte & PG_RO) == 0,
+					    ("readonly modified PTE %#lx",
+					    origpte));
 					vm_page_dirty(om);
+				}
 				if ((origpte & PG_A) != 0)
 					vm_page_aflag_set(om, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
@@ -3975,8 +4072,11 @@
 		}
 	} else if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 	    (PG_M | PG_RW)) {
-		if ((origpte & PG_MANAGED) != 0)
+		if ((origpte & PG_MANAGED) != 0) {
+			KASSERT((origpte & PG_RO) == 0,
+			    ("readonly modified PTE %#lx", origpte));
 			vm_page_dirty(m);
+		}
 
 		/*
 		 * Although the PTE may still have PG_RW set, TLB
@@ -4027,6 +4127,15 @@
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	/*
+	 * Software emulation of the accessed bit requires that if PG_V is set
+	 * then PG_A is also set. Therefore we defer setting up the mapping
+	 * until the process actually tries to access it.
+	 */
+	if (pmap_emulate_ad_bits(pmap))
+		return (FALSE);
+
 	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
@@ -4170,6 +4279,14 @@
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
+	 * Software emulation of the accessed bit requires that if PG_V is set
+	 * then PG_A is also set. Therefore we defer setting up the mapping
+	 * until the process actually tries to access it.
+	 */
+	if (pmap_emulate_ad_bits(pmap))
+		return (NULL);
+
+	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
@@ -4409,12 +4526,14 @@
 		goto out;
 	}
 	pte = pmap_pde_to_pte(pde, va);
-	if (wired && (*pte & PG_W) == 0) {
-		pmap->pm_stats.wired_count++;
-		atomic_set_long(pte, PG_W);
-	} else if (!wired && (*pte & PG_W) != 0) {
-		pmap->pm_stats.wired_count--;
-		atomic_clear_long(pte, PG_W);
+	if ((*pte & PG_V) != 0) {
+		if (wired && (*pte & PG_W) == 0) {
+			pmap->pm_stats.wired_count++;
+			atomic_set_long(pte, PG_W);
+		} else if (!wired && (*pte & PG_W) != 0) {
+			pmap->pm_stats.wired_count--;
+			atomic_clear_long(pte, PG_W);
+		}
 	}
 out:
 	if (pv_lists_locked)
@@ -4444,6 +4563,9 @@
 	if (dst_addr != src_addr)
 		return;
 
+	if (pmap_emulate_ad_bits(dst_pmap))
+		return;
+
 	lock = NULL;
 	rw_rlock(&pvh_global_lock);
 	if (dst_pmap < src_pmap) {
@@ -4868,6 +4990,9 @@
 		 * Update the vm_page_t clean/reference bits.
 		 */
 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			KASSERT((tpte & PG_RO) == 0,
+			    ("readonly modified PTE %#lx",
+			    tpte));
 			if ((tpte & PG_PS) != 0) {
 				for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 					vm_page_dirty(mt);
@@ -4992,7 +5117,7 @@
 /*
  *	pmap_is_prefaultable:
  *
- *	Return whether or not the specified virtual address is elgible
+ *	Return whether or not the specified virtual address is eligible
  *	for prefault.
  */
 boolean_t
@@ -5071,7 +5196,7 @@
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *pde;
-	pt_entry_t oldpte, *pte, PG_M;
+	pt_entry_t oldpte, newpte, *pte, PG_M;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
@@ -5111,12 +5236,17 @@
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 retry:
 		oldpte = *pte;
-		if (oldpte & PG_RW) {
-			if (!atomic_cmpset_long(pte, oldpte, oldpte &
-			    ~(PG_RW | PG_M)))
+		newpte = oldpte & ~(PG_RW | PG_M);
+		if (pmap_emulate_ad_bits(pmap))
+			newpte |= PG_RO;
+		if (newpte != oldpte) {
+			if (!atomic_cmpset_long(pte, oldpte, newpte))
 				goto retry;
-			if ((oldpte & PG_M) != 0)
+			if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpte & PG_RO) == 0,
+				    ("readonly modified PTE %#lx", oldpte));
 				vm_page_dirty(m);
+			}
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
@@ -5147,6 +5277,7 @@
 	pt_entry_t *pte, PG_A;
 	vm_offset_t va;
 	int rtval = 0;
+	vm_page_t free = NULL;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
@@ -5200,8 +5331,13 @@
 			    " found a 2mpage in page %p's pv list", m));
 			pte = pmap_pde_to_pte(pde, pv->pv_va);
 			if ((*pte & PG_A) != 0) {
-				atomic_clear_long(pte, PG_A);
-				pmap_invalidate_page(pmap, pv->pv_va);
+				if (pmap_emulate_ad_bits(pmap)) {
+					pmap_remove_page(pmap, pv->pv_va,
+					    pde, &free);
+				} else {
+					atomic_clear_long(pte, PG_A);
+					pmap_invalidate_page(pmap, pv->pv_va);
+				}
 				rtval++;
 				if (rtval > 4)
 					pvn = NULL;
@@ -5211,6 +5347,7 @@
 	}
 out:
 	rw_wunlock(&pvh_global_lock);
+	pmap_free_zero_pages(free);
 	return (rtval);
 }
@@ -5226,6 +5363,7 @@
 	pd_entry_t oldpde, *pde;
 	pt_entry_t oldpte, *pte, PG_M;
 	vm_offset_t va;
+	long clear_bits;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
@@ -5263,14 +5401,18 @@
 				    PG_PS_FRAME);
 				pte = pmap_pde_to_pte(pde, va);
 				oldpte = *pte;
-				if ((oldpte & PG_V) != 0) {
-					while (!atomic_cmpset_long(pte,
-					    oldpte,
-					    oldpte & ~(PG_M | PG_RW)))
-						oldpte = *pte;
-					vm_page_dirty(m);
-					pmap_invalidate_page(pmap, va);
-				}
+
+				if ((oldpte & (PG_RO | PG_RW | PG_M)) !=
+				    (PG_RW | PG_M))
+					panic("inconsistent pte %#lx "
+					    "after demotion from pde "
+					    "%#lx", oldpte, oldpde);
+
+				while (!atomic_cmpset_long(pte, oldpte,
+				    oldpte & ~(PG_M | PG_RW)))
+					oldpte = *pte;
+				vm_page_dirty(m);
+				pmap_invalidate_page(pmap, va);
 			}
 		}
 	}
@@ -5285,8 +5427,22 @@
 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
-		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
-			atomic_clear_long(pte, PG_M);
+		oldpte = *pte;
+		if (pmap_emulate_ad_bits(pmap)) {
+			if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpte & PG_RO) == 0,
+				    ("modified readonly pte %#lx", oldpte));
+			} else {
+				KASSERT((oldpte & (PG_M | PG_RW)) == 0,
+				    ("invalid RW/M bits for dirty bit "
+				    "emulation %#lx", oldpte));
+			}
+		}
+		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			clear_bits = PG_M;
+			if (pmap_emulate_ad_bits(pmap))
+				clear_bits |= PG_RW;
+			atomic_clear_long(pte, clear_bits);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
@@ -5308,6 +5464,7 @@
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte, PG_A;
 	vm_offset_t va;
+	vm_page_t free = NULL;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
@@ -5325,6 +5482,10 @@
 		if ((oldpde & PG_A) != 0) {
 			if (pmap_demote_pde(pmap, pde, va)) {
 				/*
+				 * XXX should we remove the page only if
+				 * mapping is not wired?
+				 */
+				/*
 				 * Remove the mapping to a single page so
 				 * that a subsequent access may repromote.
 				 * Since the underlying page table page is
@@ -5348,12 +5509,17 @@
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		if (*pte & PG_A) {
-			atomic_clear_long(pte, PG_A);
-			pmap_invalidate_page(pmap, pv->pv_va);
+			if (pmap_emulate_ad_bits(pmap)) {
+				pmap_remove_page(pmap, pv->pv_va, pde, &free);
+			} else {
+				atomic_clear_long(pte, PG_A);
+				pmap_invalidate_page(pmap, pv->pv_va);
+			}
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(&pvh_global_lock);
+	pmap_free_zero_pages(free);
 }
 
 /*
@@ -5921,6 +6087,66 @@
 	*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 }
 
+static unsigned long num_dirty_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
+    &num_dirty_emulations, 0, NULL);
+int
+pmap_emulate_dirty(pmap_t pmap, vm_offset_t va)
+{
+	int rv = -1;
+	struct rwlock *lock;
+	vm_page_t m, mpte;
+	pd_entry_t *pde;
+	pt_entry_t *pte, PG_A, PG_M;
+
+	if (!pmap_emulate_ad_bits(pmap))
+		return (-1);
+
+	PG_A = pmap_accessed_bit(pmap);
+	PG_M = pmap_modified_bit(pmap);
+
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
+	PMAP_LOCK(pmap);
+
+	/*
+	 * Dirty bit emulation is done in the fast path if 'va' is
+	 * already mapped as a regular page and is writeable.
+	 */
+	pde = pmap_pde(pmap, va);
+	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
+		pte = pmap_pde_to_pte(pde, va);
+		if ((*pte & (PG_V | PG_RO)) == PG_V) {
+			KASSERT((*pte & PG_A) != 0,
+			    ("pmap_emulate_dirty: accessed and valid bits "
+			    "mismatch %#lx", *pte));
+			atomic_set_long(pte, PG_M | PG_RW);
+			atomic_add_long(&num_dirty_emulations, 1);
+			rv = 0;		/* success */
+
+			/* try to promote the mapping */
+			if (va < VM_MAXUSER_ADDRESS)
+				mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+			else
+				mpte = NULL;
+
+			m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+
+			if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
+			    pg_ps_enabled && pmap_ps_enabled(pmap) &&
+			    (m->flags & PG_FICTITIOUS) == 0 &&
+			    vm_reserv_level_iffullpop(m) == 0)
+				pmap_promote_pde(pmap, pde, va, &lock);
+		}
+	}
+
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
+	PMAP_UNLOCK(pmap);
+	return (rv);
+}
+
 void
 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
 {
Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c	(revision 253884)
+++ sys/amd64/amd64/trap.c	(working copy)
@@ -656,6 +656,25 @@
 	struct proc *p = td->td_proc;
 	vm_offset_t eva = frame->tf_addr;
 
+	/*
+	 * PGEX_I is defined only if the execute disable bit capability is
+	 * supported and enabled.
+	 */
+	if (frame->tf_err & PGEX_W)
+		ftype = VM_PROT_WRITE;
+	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
+		ftype = VM_PROT_EXECUTE;
+	else
+		ftype = VM_PROT_READ;
+
+	va = trunc_page(eva);
+#ifdef INVARIANTS
+	if ((frame->tf_err & PGEX_P) != 0 && ftype == VM_PROT_WRITE) {
+		if (pmap_emulate_dirty(PCPU_GET(curpmap), va) == 0)
+			return (0);
+	}
+#endif
+
 	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
 		/*
 		 * Due to both processor errata and lazy TLB invalidation when
@@ -700,7 +719,6 @@
 			return (-1);
 		}
 	}
-	va = trunc_page(eva);
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
@@ -733,17 +751,6 @@
 		}
 	}
 
-	/*
-	 * PGEX_I is defined only if the execute disable bit capability is
-	 * supported and enabled.
-	 */
-	if (frame->tf_err & PGEX_W)
-		ftype = VM_PROT_WRITE;
-	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
-		ftype = VM_PROT_EXECUTE;
-	else
-		ftype = VM_PROT_READ;
-
 	if (map != kernel_map) {
 		/*
 		 * Keep swapout from messing with us during this
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h	(revision 253884)
+++ sys/amd64/include/pmap.h	(working copy)
@@ -79,6 +79,12 @@
 #define	PG_PROT		(PG_RW|PG_U)	/* all protection bits . */
 #define	PG_N		(PG_NC_PWT|PG_NC_PCD)	/* Non-cacheable */
 
+/*
+ * "readonly" pseudo-flag used in pmap entries that require software emulation
+ * of accessed/dirty bits.
+ */
+#define	PG_RO		(1ul << 52)
+
 /* Page level cache control fields used to determine the PAT type */
 #define	PG_PDE_CACHE	(PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
 #define	PG_PTE_CACHE	(PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
@@ -88,7 +94,7 @@
  * (PTE) page mappings have identical settings for the following fields:
  */
 #define	PG_PTE_PROMOTE	(PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
-	    PG_M | PG_A | PG_U | PG_RW | PG_V)
+	    PG_M | PG_A | PG_U | PG_RW | PG_V | PG_RO)
 
 /*
  * Page Protection Exception bits
@@ -264,6 +270,7 @@
 
 /* flags */
 #define	PMAP_PDE_SUPERPAGE	(1 << 0)	/* supports 2MB superpages */
+#define	PMAP_EMULATE_AD_BITS	(1 << 1)	/* needs A/D bits emulation */
 
 typedef struct pmap *pmap_t;
@@ -283,6 +290,7 @@
 #define	PMAP_UNLOCK(pmap)	mtx_unlock(&(pmap)->pm_mtx)
 
 int	pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
+int	pmap_emulate_dirty(pmap_t pmap, vm_offset_t va);
 #endif
 
 /*
Index: sys/amd64/vmm/intel/ept.c
===================================================================
--- sys/amd64/vmm/intel/ept.c	(revision 253884)
+++ sys/amd64/vmm/intel/ept.c	(working copy)
@@ -30,6 +30,7 @@
 __FBSDID("$FreeBSD$");
 
 #include
+#include
 #include
 #include
 #include
@@ -69,6 +70,7 @@
 int
 ept_init(void)
 {
+	int use_hw_ad_bits;
 	uint64_t cap;
 
 	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
@@ -91,8 +93,12 @@
 	if (EPT_PDE_SUPERPAGE(cap))
 		ept_pmap_flags |= PMAP_PDE_SUPERPAGE;	/* 2MB superpage */
 
-	if (AD_BITS_SUPPORTED(cap))
+	use_hw_ad_bits = 1;
+	TUNABLE_INT_FETCH("vmx.ept.use_hw_ad_bits", &use_hw_ad_bits);
+	if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
 		ept_enable_ad_bits = 1;
+	else
+		ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
 
 	return (0);
 }
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c	(revision 253884)
+++ sys/amd64/vmm/vmm.c	(working copy)
@@ -60,6 +60,7 @@
 #include
 #include
 
+#include "vmm_ktr.h"
 #include "vmm_host.h"
 #include "vmm_mem.h"
 #include "vmm_util.h"
@@ -736,9 +737,8 @@
 static int
 vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
 {
-	int rv;
+	int rv, ftype, prot;
 	struct vm_map *map;
-	vm_prot_t ftype;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
@@ -745,14 +745,33 @@
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
-	map = &vm->vmspace->vm_map;
 	ftype = vme->u.paging.fault_type;
+	KASSERT(ftype == VM_PROT_WRITE ||
+	    ftype == VM_PROT_EXECUTE ||
+	    ftype == VM_PROT_READ,
+	    ("vm_handle_paging: invalid fault_type %d", ftype));
+	/*
+	 * If the mapping exists then the write fault may be intentional
+	 * for doing dirty bit emulation.
+	 */
+	prot = vme->u.paging.protection;
+	if ((prot & VM_PROT_READ) != 0 && ftype == VM_PROT_WRITE) {
+		rv = pmap_emulate_dirty(vmspace_pmap(vm->vmspace),
+		    vme->u.paging.gpa);
+		if (rv == 0)
+			goto done;
+	}
+
+	map = &vm->vmspace->vm_map;
 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
+	VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
+	    rv, vme->u.paging.gpa, ftype);
+
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
-
+done:
 	/* restart execution at the faulting instruction */
 	vme->inst_length = 0;
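
A note for reviewers, not part of the patch: the PG_RW/PG_M/PG_RO invariant documented in the pmap_enter() comment above is what the new "readonly modified PTE" and "invalid RW/M bits" assertions enforce, and it can be sanity-checked in isolation. The sketch below is a minimal standalone C illustration of that state check; the bit values mirror the amd64 PTE layout plus the PG_RO pseudo-flag introduced by the patch, and the helper name ad_emulation_pte_ok() is hypothetical.

#include <stdbool.h>
#include <stdint.h>

#define	PG_RW	(1ul << 1)	/* writeable */
#define	PG_M	(1ul << 6)	/* dirty */
#define	PG_RO	(1ul << 52)	/* pseudo-flag introduced by the patch */

/*
 * Returns true if a PTE satisfies the dirty bit emulation invariant:
 * PG_RW and PG_M are set or clear together, and a dirty/writeable PTE
 * never carries the permanent readonly pseudo-flag.
 */
static bool
ad_emulation_pte_ok(uint64_t pte)
{
	uint64_t rwm = pte & (PG_RW | PG_M);

	if (rwm != 0 && rwm != (PG_RW | PG_M))
		return (false);
	if (rwm == (PG_RW | PG_M) && (pte & PG_RO) != 0)
		return (false);
	return (true);
}

For example, ad_emulation_pte_ok(PG_RW | PG_M) and ad_emulation_pte_ok(PG_RO) both return true, while ad_emulation_pte_ok(PG_RW) and ad_emulation_pte_ok(PG_RW | PG_M | PG_RO) return false, corresponding to the "invalid RW/M bits" and "readonly modified PTE" assertion failures respectively.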