Index: lib/libvmmapi/vmmapi.c =================================================================== --- lib/libvmmapi/vmmapi.c (revision 256063) +++ lib/libvmmapi/vmmapi.c (working copy) @@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm) } int -vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len) +vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired) { int error; struct vm_memory_segment seg; @@ -133,6 +134,8 @@ int seg.gpa = gpa; error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); *ret_len = seg.len; + if (wired != NULL) + *wired = seg.wired; return (error); } @@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu) done: return (error); } + +int +vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) +{ + int error, i; + struct vm_gpa_pte gpapte; + + bzero(&gpapte, sizeof(gpapte)); + gpapte.gpa = gpa; + + error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); + + if (error == 0) { + *num = gpapte.ptenum; + for (i = 0; i < gpapte.ptenum; i++) + pte[i] = gpapte.pte[i]; + } + + return (error); +} Index: lib/libvmmapi/vmmapi.h =================================================================== --- lib/libvmmapi/vmmapi.h (revision 256063) +++ lib/libvmmapi/vmmapi.h (working copy) @@ -45,9 +45,11 @@ enum vm_mmap_style { int vm_create(const char *name); struct vmctx *vm_open(const char *name); void vm_destroy(struct vmctx *ctx); -int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len); +int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, Index: sys/amd64/amd64/machdep.c =================================================================== --- sys/amd64/amd64/machdep.c (revision 256063) +++ sys/amd64/amd64/machdep.c (working copy) @@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first) /* * map page into kernel: valid, read/write,non-cacheable */ - *pte = pa | PG_V | PG_RW | PG_N; + *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c (revision 256063) +++ sys/amd64/amd64/pmap.c (working copy) @@ -76,6 +76,8 @@ * SUCH DAMAGE. 
*/ +#define AMD64_NPT_AWARE + #include __FBSDID("$FreeBSD$"); @@ -143,6 +145,120 @@ __FBSDID("$FreeBSD$"); #include #endif +static __inline boolean_t +pmap_emulate_ad_bits(pmap_t pmap) +{ + + return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); +} + +static __inline pt_entry_t +pmap_valid_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_V; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_EMUL_V; + else + mask = EPT_PG_READ; + break; + default: + panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_rw_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_RW; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_EMUL_RW; + else + mask = EPT_PG_WRITE; + break; + default: + panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_global_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_G; + break; + case PT_EPT: + mask = 0; + break; + default: + panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_accessed_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_A; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_READ; + else + mask = EPT_PG_A; + break; + default: + panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_modified_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_M; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_WRITE; + else + mask = EPT_PG_M; + break; + default: + panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline @@ -247,6 +363,8 @@ static struct md_page *pv_table; pt_entry_t *CMAP1 = 0; caddr_t CADDR1 = 0; +static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ + static struct unrhdr pcid_unr; static struct mtx pcid_mtx; int pmap_pcid_enabled = 1; @@ -306,12 +424,12 @@ static void pmap_fill_ptp(pt_entry_t *firstpte, pt static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); -static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); +static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); -static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); +static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, @@ -323,7 +441,7 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t p vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); -static void pmap_update_pde_invalidate(vm_offset_t va, 
pd_entry_t newpde); +static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); @@ -416,7 +534,9 @@ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; + pt_entry_t PG_V; + PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); @@ -438,7 +558,9 @@ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; + pt_entry_t PG_V; + PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); @@ -460,7 +582,9 @@ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; + pt_entry_t PG_V; + PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); @@ -490,6 +614,8 @@ vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); + return (PTmap + ((va >> PAGE_SHIFT) & mask)); } @@ -498,6 +624,8 @@ vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); + return (PDmap + ((va >> PDRSHIFT) & mask)); } @@ -601,22 +729,24 @@ create_pagetables(vm_paddr_t *firstaddr) /* XXX not fully used, underneath 2M pages */ pt_p = (pt_entry_t *)KPTphys; for (i = 0; ptoa(i) < *firstaddr; i++) - pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G; + pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; /* Now map the page tables at their location within PTmap */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) - pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; + pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) - pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G; + pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | + X86_PG_G; /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) - pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U; + pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V | + PG_U; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If @@ -630,36 +760,36 @@ create_pagetables(vm_paddr_t *firstaddr) for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G | - PG_M | PG_A; + pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + X86_PG_M | X86_PG_A; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. 
*/ - pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G | - PG_M | PG_A; + pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + X86_PG_M | X86_PG_A; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); - pdp_p[i] |= PG_RW | PG_V | PG_U; + pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; - p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U; + p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); - p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U; + p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); - p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U; + p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U; } } @@ -705,6 +835,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ CPU_ZERO(&kernel_pmap->pm_save); TAILQ_INIT(&kernel_pmap->pm_pvchunk); + kernel_pmap->pm_flags = pmap_flags; /* * Initialize the global pv list lock. @@ -948,35 +1079,131 @@ SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, C * Low level helper routines..... ***************************************************/ +static pt_entry_t +pmap_swap_pat(pmap_t pmap, pt_entry_t entry) +{ + int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; + + switch (pmap->pm_type) { + case PT_X86: + /* Verify that both PAT bits are not set at the same time */ + KASSERT((entry & x86_pat_bits) != x86_pat_bits, + ("Invalid PAT bits in entry %#lx", entry)); + + /* Swap the PAT bits if one of them is set */ + if ((entry & x86_pat_bits) != 0) + entry ^= x86_pat_bits; + break; + case PT_EPT: + /* + * Nothing to do - the memory attributes are represented + * the same way for regular pages and superpages. + */ + break; + default: + panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); + } + + return (entry); +} + /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int -pmap_cache_bits(int mode, boolean_t is_pde) +pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); - /* The PAT bit is different for PTE's and PDE's. */ - pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; + switch (pmap->pm_type) { + case PT_X86: + /* The PAT bit is different for PTE's and PDE's. */ + pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; - /* Map the caching mode to a PAT index. */ - pat_idx = pat_index[mode]; + /* Map the caching mode to a PAT index. */ + pat_idx = pat_index[mode]; - /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ - cache_bits = 0; - if (pat_idx & 0x4) - cache_bits |= pat_flag; - if (pat_idx & 0x2) - cache_bits |= PG_NC_PCD; - if (pat_idx & 0x1) - cache_bits |= PG_NC_PWT; + /* Map the 3-bit index value into the PAT, PCD, and PWT bits. 
*/ + cache_bits = 0; + if (pat_idx & 0x4) + cache_bits |= pat_flag; + if (pat_idx & 0x2) + cache_bits |= PG_NC_PCD; + if (pat_idx & 0x1) + cache_bits |= PG_NC_PWT; + break; + + case PT_EPT: + cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); + break; + + default: + panic("unsupported pmap type %d", pmap->pm_type); + } + return (cache_bits); } +static int +pmap_cache_mask(pmap_t pmap, boolean_t is_pde) +{ + int mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; + break; + case PT_EPT: + mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); + break; + default: + panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline boolean_t +pmap_ps_enabled(pmap_t pmap) +{ + + return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); +} + +static void +pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) +{ + + switch (pmap->pm_type) { + case PT_X86: + break; + case PT_EPT: + /* + * XXX + * This is a little bogus since the generation number is + * supposed to be bumped up when a region of the address + * space is invalidated in the page tables. + * + * In this case the old PDE entry is valid but yet we want + * to make sure that any mappings using the old entry are + * invalidated in the TLB. + * + * The reason this works as expected is because we rendezvous + * "all" host cpus and force any vcpu context to exit as a + * side-effect. + */ + atomic_add_acq_long(&pmap->pm_eptgen, 1); + break; + default: + panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); + } + pde_store(pde, newpde); +} + /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the @@ -985,9 +1212,18 @@ static int * The calling thread must be pinned to a processor. */ static void -pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) +pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { + pt_entry_t PG_G; + if (pmap->pm_type == PT_EPT) + return; + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); + + PG_G = pmap_global_bit(pmap); + if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); @@ -1048,12 +1284,61 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t * immutable. The kernel page table is always active on every * processor. */ + +/* + * Interrupt the cpus that are executing in the guest context. + * This will force the vcpu to exit and the cached EPT mappings + * will be invalidated by the host before the next vmresume. + */ +static __inline void +pmap_invalidate_ept(pmap_t pmap) +{ + + sched_pin(); + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("pmap_invalidate_ept: absurd pm_active")); + + /* + * The TLB mappings associated with a vcpu context are not + * flushed each time a different vcpu is chosen to execute. + * + * This is in contrast with a process's vtop mappings that + * are flushed from the TLB on each context switch. + * + * Therefore we need to do more than just a TLB shootdown on + * the active cpus in 'pmap->pm_active'. To do this we keep + * track of the number of invalidations performed on this pmap. + * + * Each vcpu keeps a cache of this counter and compares it + * just before a vmresume. If the counter is out-of-date an + * invept will be done to flush stale mappings from the TLB. 
+ */ + atomic_add_acq_long(&pmap->pm_eptgen, 1); + + /* + * Force the vcpu to exit and trap back into the hypervisor. + * + * XXX this is not optimal because IPI_AST builds a trapframe + * whereas all we need is an 'eoi' followed by 'iret'. + */ + ipi_selected(pmap->pm_active, IPI_AST); + sched_unpin(); +} + void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t other_cpus; u_int cpuid; + if (pmap->pm_type == PT_EPT) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); + sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { if (!pmap_pcid_enabled) { @@ -1124,6 +1409,14 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva vm_offset_t addr; u_int cpuid; + if (pmap->pm_type == PT_EPT) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); + sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { if (!pmap_pcid_enabled) { @@ -1175,6 +1468,14 @@ pmap_invalidate_all(pmap_t pmap) uint64_t cr3; u_int cpuid; + if (pmap->pm_type == PT_EPT) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); + sched_pin(); cpuid = PCPU_GET(cpuid); if (pmap == kernel_pmap || @@ -1243,6 +1544,7 @@ pmap_invalidate_cache(void) struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ + pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; @@ -1255,7 +1557,7 @@ pmap_update_pde_action(void *arg) struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) - pde_store(act->pde, act->newpde); + pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void @@ -1264,7 +1566,7 @@ pmap_update_pde_teardown(void *arg) struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) - pmap_update_pde_invalidate(act->va, act->newpde); + pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* @@ -1286,7 +1588,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_en cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (pmap == kernel_pmap) + if (pmap == kernel_pmap || pmap->pm_type == PT_EPT) active = all_cpus; else { active = pmap->pm_active; @@ -1296,6 +1598,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_en act.store = cpuid; act.invalidate = active; act.va = va; + act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); @@ -1303,9 +1606,9 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_en smp_no_rendevous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { - pde_store(pde, newpde); + pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) - pmap_update_pde_invalidate(va, newpde); + pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } @@ -1318,8 +1621,17 @@ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invlpg(va); + switch (pmap->pm_type) { + case PT_X86: + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + invlpg(va); + break; + case PT_EPT: + pmap->pm_eptgen++; + break; + default: + panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type); + } } PMAP_INLINE void @@ -1327,17 +1639,35 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva { vm_offset_t addr; - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - for (addr = 
sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); + switch (pmap->pm_type) { + case PT_X86: + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + break; + case PT_EPT: + pmap->pm_eptgen++; + break; + default: + panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type); + } } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invltlb(); + switch (pmap->pm_type) { + case PT_X86: + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + invltlb(); + break; + case PT_EPT: + pmap->pm_eptgen++; + break; + default: + panic("pmap_invalidate_all: unknown type %d", pmap->pm_type); + } } PMAP_INLINE void @@ -1351,9 +1681,9 @@ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { - pde_store(pde, newpde); + pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - pmap_update_pde_invalidate(va, newpde); + pmap_update_pde_invalidate(pmap, va, newpde); else CPU_ZERO(&pmap->pm_save); } @@ -1455,10 +1785,11 @@ pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; + PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { @@ -1493,12 +1824,14 @@ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; - pt_entry_t pte; + pt_entry_t pte, PG_RW, PG_V; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; + PG_RW = pmap_rw_bit(pmap); + PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, va); @@ -1571,16 +1904,18 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa) pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | PG_G); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; + int cache_bits; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0)); + cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); } /* @@ -1629,20 +1964,22 @@ pmap_qenter(vm_offset_t sva, vm_page_t *ma, int co { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; + int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; - pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); - if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { + cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); + pa = VM_PAGE_TO_PHYS(m) | cache_bits; + if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; - pte_store(pte, pa | PG_G | PG_RW | PG_V); + pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); } pte++; } - if (__predict_false((oldpte & PG_V) != 0)) + if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } @@ -1841,6 +2178,7 @@ pmap_pinit0(pmap_t pmap) TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1; + pmap->pm_flags = pmap_flags; } /* @@ -1848,9 +2186,10 @@ pmap_pinit0(pmap_t pmap) * such as one in a vmspace structure. 
*/ int -pmap_pinit(pmap_t pmap) +pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg; + vm_paddr_t pml4phys; int i; /* @@ -1860,41 +2199,61 @@ int VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; - pmap->pm_cr3 = VM_PAGE_TO_PHYS(pml4pg); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pmap->pm_cr3); + pml4phys = VM_PAGE_TO_PHYS(pml4pg); + pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); + pmap->pm_pcid = -1; + pmap->pm_cr3 = ~0; /* initialize to an invalid value */ if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); - /* Wire in kernel global address entries. */ - for (i = 0; i < NKPML4E; i++) { - pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) | - PG_RW | PG_V | PG_U; + /* + * Do not install the host kernel mappings in the nested page + * tables. These mappings are meaningless in the guest physical + * address space. + */ + if ((pmap->pm_type = pm_type) == PT_X86) { + pmap->pm_cr3 = pml4phys; + + /* Wire in kernel global address entries. */ + for (i = 0; i < NKPML4E; i++) { + pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | + X86_PG_RW | X86_PG_V | PG_U; + } + for (i = 0; i < ndmpdpphys; i++) { + pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | + X86_PG_RW | X86_PG_V | PG_U; + } + + /* install self-referential address mapping entry(s) */ + pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; + + if (pmap_pcid_enabled) { + pmap->pm_pcid = alloc_unr(&pcid_unr); + if (pmap->pm_pcid != -1) + pmap->pm_cr3 |= pmap->pm_pcid; + } } - for (i = 0; i < ndmpdpphys; i++) { - pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) | - PG_RW | PG_V | PG_U; - } - /* install self-referential address mapping entry(s) */ - pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; - pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); - if (pmap_pcid_enabled) { - pmap->pm_pcid = alloc_unr(&pcid_unr); - if (pmap->pm_pcid != -1) - pmap->pm_cr3 |= pmap->pm_pcid; - } else { - pmap->pm_pcid = -1; - } + pmap->pm_flags = flags; + pmap->pm_eptgen = 0; CPU_ZERO(&pmap->pm_save); return (1); } +int +pmap_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); +} + /* * This routine is called if the desired page table page does not exist. * @@ -1910,9 +2269,15 @@ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; + pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + /* * Allocate a page table page. 
*/ @@ -2040,9 +2405,11 @@ static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; - pdp_entry_t *pdpe; + pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; + PG_V = pmap_valid_bit(pmap); + retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { @@ -2064,9 +2431,11 @@ static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; - pd_entry_t *pd; + pd_entry_t *pd, PG_V; vm_page_t m; + PG_V = pmap_valid_bit(pmap); + /* * Calculate pagetable page index */ @@ -2140,7 +2509,7 @@ pmap_release(pmap_t pmap) pmap_invalidate_all(pmap); } - m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; @@ -2211,7 +2580,7 @@ pmap_growkernel(vm_offset_t addr) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); - if ((*pdpe & PG_V) == 0) { + if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | @@ -2221,12 +2590,12 @@ pmap_growkernel(vm_offset_t addr) if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); - *pdpe = (pdp_entry_t) - (paddr | PG_V | PG_RW | PG_A | PG_M); + *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | + X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); - if ((*pde & PG_V) != 0) { + if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; @@ -2243,7 +2612,7 @@ pmap_growkernel(vm_offset_t addr) if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); - newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); + newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; @@ -2323,13 +2692,14 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; + pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; - + rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); @@ -2359,6 +2729,10 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock mtx_lock(&pv_chunks_mutex); continue; } + PG_G = pmap_global_bit(pmap); + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); } /* @@ -2892,10 +3266,19 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pd { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; + pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; + int PG_PTE_CACHE; + PG_G = pmap_global_bit(pmap); + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_PTE_CACHE = pmap_cache_mask(pmap, 0); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), @@ -2944,8 +3327,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pd KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; - if ((newpte & 
PG_PDE_PAT) != 0) - newpte ^= PG_PDE_PAT | PG_PTE_PAT; + newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is new, initialize it. @@ -3016,6 +3398,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pd vm_paddr_t mptepa; vm_page_t mpte; + KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_lookup_pt_page(pmap, va); if (mpte == NULL) @@ -3023,7 +3406,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pd pmap_remove_pt_page(pmap, mpte); mptepa = VM_PAGE_TO_PHYS(mpte); - newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; + newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * Initialize the page table page. @@ -3055,7 +3438,13 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_o pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; + pt_entry_t PG_G, PG_A, PG_M, PG_RW; + PG_G = pmap_global_bit(pmap); + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); @@ -3111,9 +3500,13 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_o pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; - pt_entry_t oldpte; + pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) @@ -3145,8 +3538,9 @@ pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_e struct spglist *free) { struct rwlock *lock; - pt_entry_t *pte; + pt_entry_t *pte, PG_V; + PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; @@ -3174,10 +3568,13 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offse pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_G, PG_V; struct spglist free; int anyvalid; + PG_G = pmap_global_bit(pmap); + PG_V = pmap_valid_bit(pmap); + /* * Perform an unsynchronized read. This is, however, safe. 
*/ @@ -3326,7 +3723,7 @@ pmap_remove_all(vm_page_t m) struct md_page *pvh; pv_entry_t pv; pmap_t pmap; - pt_entry_t *pte, tpte; + pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; @@ -3350,6 +3747,9 @@ small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" @@ -3388,7 +3788,12 @@ pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_ vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; + pt_entry_t PG_G, PG_M, PG_RW; + PG_G = pmap_global_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); @@ -3428,7 +3833,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offs pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged, pv_lists_locked; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { @@ -3440,6 +3845,10 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offs (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; + PG_G = pmap_global_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); pv_lists_locked = FALSE; resume: anychanged = FALSE; @@ -3568,9 +3977,18 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_ { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; + pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; vm_offset_t oldpteva; vm_page_t mpte; + int PG_PTE_CACHE; + PG_A = pmap_accessed_bit(pmap); + PG_G = pmap_global_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + PG_PTE_CACHE = pmap_cache_mask(pmap, 0); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* @@ -3662,8 +4080,7 @@ setpte: /* * Propagate the PAT index to its proper position. */ - if ((newpde & PG_PTE_PAT) != 0) - newpde ^= PG_PDE_PAT | PG_PTE_PAT; + newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. @@ -3696,12 +4113,18 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t { struct rwlock *lock; pd_entry_t *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; + PG_A = pmap_accessed_bit(pmap); + PG_G = pmap_global_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, @@ -3728,8 +4151,18 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; - newpte |= pmap_cache_bits(m->md.pat_mode, 0); + newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); + /* + * Set modified bit gratuitously for writeable mappings if + * the page is unmanaged. We do not want to take a fault + * to do the dirty bit accounting for these mappings. + */ + if ((m->oflags & VPO_UNMANAGED) != 0) { + if ((newpte & PG_RW) != 0) + newpte |= PG_M; + } + mpte = NULL; lock = NULL; @@ -3877,7 +4310,8 @@ unchanged: * populated, then attempt promotion. 
*/ if ((mpte == NULL || mpte->wire_count == NPTEPG) && - pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && + pmap_ps_enabled(pmap) && + (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); @@ -3898,11 +4332,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_pag struct rwlock **lockp) { pd_entry_t *pde, newpde; + pt_entry_t PG_V; vm_page_t mpde; struct spglist free; + PG_V = pmap_valid_bit(pmap); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); @@ -3918,7 +4355,7 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_pag " in pmap %p", va, pmap); return (FALSE); } - newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | + newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; @@ -3992,7 +4429,8 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && - pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && + pmap_ps_enabled(pmap) && + vm_reserv_level_iffullpop(m) == 0 && pmap_enter_pde(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else @@ -4035,12 +4473,13 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t v vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; - pt_entry_t *pte; + pt_entry_t *pte, PG_V; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); + PG_V = pmap_valid_bit(pmap); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -4120,7 +4559,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t v */ pmap_resident_count_inc(pmap, 1); - pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); + pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; @@ -4159,14 +4598,22 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; + pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { + if (!pmap_ps_enabled(pmap)) + return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); @@ -4204,8 +4651,8 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, * will not affect the termination of this loop. 
*/ PMAP_LOCK(pmap); - for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + - size; pa += NBPDR) { + for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); + pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* @@ -4307,10 +4754,25 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_off vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; + pt_entry_t PG_A, PG_M, PG_V; if (dst_addr != src_addr) return; + if (dst_pmap->pm_type != src_pmap->pm_type) + return; + + /* + * EPT page table entries that require emulation of A/D bits are + * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although + * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit + * (aka EPT_PG_EXECUTE) could still be set. Since some EPT + * implementations flag an EPT misconfiguration for exec-only + * mappings we skip this function entirely for emulated pmaps. + */ + if (pmap_emulate_ad_bits(dst_pmap)) + return; + lock = NULL; rw_rlock(&pvh_global_lock); if (dst_pmap < src_pmap) { @@ -4320,6 +4782,11 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_off PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } + + PG_A = pmap_accessed_bit(dst_pmap); + PG_M = pmap_modified_bit(dst_pmap); + PG_V = pmap_valid_bit(dst_pmap); + for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpde, dstmpte, srcmpte; @@ -4673,6 +5140,7 @@ pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; + pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; @@ -4689,7 +5157,12 @@ pmap_remove_pages(pmap_t pmap) printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } + lock = NULL; + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); @@ -4830,12 +5303,13 @@ pmap_remove_pages(pmap_t pmap) } static boolean_t -pmap_page_test_mappings(vm_page_t m, pt_entry_t mask) +pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; - pt_entry_t *pte; + pt_entry_t *pte, mask; + pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; @@ -4858,6 +5332,17 @@ restart: } } pte = pmap_pte(pmap, pv->pv_va); + mask = 0; + if (modified) { + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + mask |= PG_RW | PG_M; + } + if (accessed) { + PG_A = pmap_accessed_bit(pmap); + PG_V = pmap_valid_bit(pmap); + mask |= PG_V | PG_A; + } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) @@ -4880,6 +5365,17 @@ restart: } } pte = pmap_pde(pmap, pv->pv_va); + mask = 0; + if (modified) { + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + mask |= PG_RW | PG_M; + } + if (accessed) { + PG_A = pmap_accessed_bit(pmap); + PG_V = pmap_valid_bit(pmap); + mask |= PG_V | PG_A; + } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) @@ -4913,22 +5409,23 @@ pmap_is_modified(vm_page_t m) VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); - return (pmap_page_test_mappings(m, PG_M | PG_RW)); + return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * - * Return whether or not the specified virtual address is elgible + * Return whether or not the specified virtual address is eligible * for prefault. 
*/ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_V; boolean_t rv; + PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); @@ -4952,7 +5449,7 @@ pmap_is_referenced(vm_page_t m) KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); - return (pmap_page_test_mappings(m, PG_A | PG_V)); + return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* @@ -4966,7 +5463,7 @@ pmap_remove_write(vm_page_t m) struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; - pt_entry_t oldpte, *pte; + pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; @@ -5001,6 +5498,7 @@ retry_pv_loop: goto retry_pv_loop; } } + PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) @@ -5026,6 +5524,8 @@ small_mappings: goto retry_pv_loop; } } + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", @@ -5048,6 +5548,33 @@ retry: rw_runlock(&pvh_global_lock); } +static __inline boolean_t +safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) +{ + + if (!pmap_emulate_ad_bits(pmap)) + return (TRUE); + + KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); + + /* + * RWX = 010 or 110 will cause an unconditional EPT misconfiguration + * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared + * if the EPT_PG_WRITE bit is set. + */ + if ((pte & EPT_PG_WRITE) != 0) + return (FALSE); + + /* + * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. + */ + if ((pte & EPT_PG_EXECUTE) == 0 || + ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) + return (TRUE); + else + return (FALSE); +} + #define PMAP_TS_REFERENCED_MAX 5 /* @@ -5069,13 +5596,17 @@ pmap_ts_referenced(vm_page_t m) pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; - pd_entry_t *pde; - pt_entry_t *pte; + pd_entry_t oldpde, *pde; + pt_entry_t *pte, PG_A; + vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; + struct spglist free; + boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); + SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); @@ -5089,6 +5620,8 @@ retry: goto small_mappings; pv = pvf; do { + if (pvf == NULL) + pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; @@ -5100,7 +5633,10 @@ retry: goto retry; } } + PG_A = pmap_accessed_bit(pmap); + va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); + oldpde = *pde; if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB @@ -5123,15 +5659,50 @@ retry: if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { - atomic_clear_long(pde, PG_A); - pmap_invalidate_page(pmap, pv->pv_va); + if (safe_to_clear_referenced(pmap, oldpde)) { + atomic_clear_long(pde, PG_A); + pmap_invalidate_page(pmap, pv->pv_va); + demoted = FALSE; + } else if (pmap_demote_pde_locked(pmap, pde, + pv->pv_va, &lock)) { + /* + * Remove the mapping to a single page + * so that a subsequent access may + * repromote. Since the underlying + * page table page is fully populated, + * this removal never frees a page + * table page. 
+ */ + demoted = TRUE; + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_PS_FRAME); + pte = pmap_pde_to_pte(pde, va); + pmap_remove_pte(pmap, pte, va, *pde, + NULL, &lock); + pmap_invalidate_page(pmap, va); + } else + demoted = TRUE; + + if (demoted) { + /* + * The superpage mapping was removed + * entirely and therefore 'pv' is no + * longer valid. + */ + if (pvf == pv) + pvf = NULL; + pv = NULL; + } cleared++; + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ - if (TAILQ_NEXT(pv, pv_next) != NULL) { + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; @@ -5144,6 +5715,8 @@ small_mappings: goto out; pv = pvf; do { + if (pvf == NULL) + pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; @@ -5156,19 +5729,40 @@ small_mappings: goto retry; } } + PG_A = pmap_accessed_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & PG_A) != 0) { - atomic_clear_long(pte, PG_A); - pmap_invalidate_page(pmap, pv->pv_va); - cleared++; + if (safe_to_clear_referenced(pmap, *pte)) { + atomic_clear_long(pte, PG_A); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + } else if ((*pte & PG_W) == 0) { + /* + * Wired pages cannot be paged out so + * doing accessed bit emulation for + * them is wasted effort. We do the + * hard work for unwired pages only. + */ + pmap_remove_pte(pmap, pte, pv->pv_va, + *pde, &free, &lock); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + if (pvf == pv) + pvf = NULL; + pv = NULL; + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + } else + not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ - if (TAILQ_NEXT(pv, pv_next) != NULL) { + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; @@ -5178,6 +5772,7 @@ small_mappings: out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); + pmap_free_zero_pages(&free); return (cleared + not_cleared); } @@ -5193,13 +5788,29 @@ pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offse pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va_next; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; + + /* + * A/D bit emulation requires an alternate code path when clearing + * the modified and accessed bits below. Since this function is + * advisory in nature we skip it entirely for pmaps that require + * A/D bit emulation. 
+ */ + if (pmap_emulate_ad_bits(pmap)) + return; + + PG_A = pmap_accessed_bit(pmap); + PG_G = pmap_global_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + pv_lists_locked = FALSE; resume: anychanged = FALSE; @@ -5313,7 +5924,7 @@ pmap_clear_modify(vm_page_t m) pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; - pt_entry_t oldpte, *pte; + pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; @@ -5350,6 +5961,9 @@ restart: goto restart; } } + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; @@ -5392,6 +6006,8 @@ small_mappings: goto restart; } } + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); @@ -5412,7 +6028,7 @@ small_mappings: /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void -pmap_pte_attr(pt_entry_t *pte, int cache_bits) +pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; @@ -5422,14 +6038,14 @@ static __inline void */ do { opte = *(u_int *)pte; - npte = opte & ~PG_PTE_CACHE; + npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void -pmap_pde_attr(pd_entry_t *pde, int cache_bits) +pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; @@ -5439,7 +6055,7 @@ static __inline void */ do { opde = *(u_int *)pde; - npde = opde & ~PG_PDE_CACHE; + npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } @@ -5514,9 +6130,15 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, v { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; + pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t mpdepa; vm_page_t mpde; + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), @@ -5633,8 +6255,8 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t if (base < DMAP_MIN_ADDRESS) return (EINVAL); - cache_bits_pde = pmap_cache_bits(mode, 1); - cache_bits_pte = pmap_cache_bits(mode, 0); + cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); + cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* @@ -5651,7 +6273,7 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ - if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) { + if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } @@ -5678,7 +6300,7 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. 
*/ - if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { + if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } @@ -5711,8 +6333,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { - if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) { - pmap_pde_attr(pdpe, cache_bits_pde); + if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { + pmap_pde_attr(pdpe, cache_bits_pde, + X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { @@ -5739,8 +6362,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { - if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { - pmap_pde_attr(pde, cache_bits_pde); + if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { + pmap_pde_attr(pde, cache_bits_pde, + X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { @@ -5765,8 +6389,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); - if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { - pmap_pte_attr(pte, cache_bits_pte); + if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { + pmap_pte_attr(pte, cache_bits_pte, + X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { @@ -5831,7 +6456,7 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, b changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); - if ((*pdpe & PG_V) == 0) + if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) @@ -5840,7 +6465,7 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, b } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); - if ((*pde & PG_V) == 0) + if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) @@ -5861,10 +6486,15 @@ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; - pt_entry_t pte; + pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); @@ -5958,6 +6588,154 @@ pmap_align_superpage(vm_object_t object, vm_ooffse *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } +#ifdef INVARIANTS +static unsigned long num_dirty_emulations; +SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, + &num_dirty_emulations, 0, NULL); + +static unsigned long num_accessed_emulations; +SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, + &num_accessed_emulations, 0, NULL); + +static unsigned long num_superpage_accessed_emulations; +SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, + &num_superpage_accessed_emulations, 0, NULL); + +static unsigned long ad_emulation_superpage_promotions; +SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, + &ad_emulation_superpage_promotions, 0, NULL); +#endif /* INVARIANTS */ + +int +pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) +{ + int rv; + struct rwlock *lock; + vm_page_t m, mpte; + pd_entry_t *pde; + pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; + boolean_t pv_lists_locked; + + KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, + 
("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); + + if (!pmap_emulate_ad_bits(pmap)) + return (-1); + + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + + rv = -1; + lock = NULL; + pv_lists_locked = FALSE; +retry: + PMAP_LOCK(pmap); + + pde = pmap_pde(pmap, va); + if (pde == NULL || (*pde & PG_V) == 0) + goto done; + + if ((*pde & PG_PS) != 0) { + if (ftype == VM_PROT_READ) { +#ifdef INVARIANTS + atomic_add_long(&num_superpage_accessed_emulations, 1); +#endif + *pde |= PG_A; + rv = 0; + } + goto done; + } + + pte = pmap_pde_to_pte(pde, va); + if ((*pte & PG_V) == 0) + goto done; + + if (ftype == VM_PROT_WRITE) { + if ((*pte & PG_RW) == 0) + goto done; + *pte |= PG_M; + } + *pte |= PG_A; + + /* try to promote the mapping */ + if (va < VM_MAXUSER_ADDRESS) + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + else + mpte = NULL; + + m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); + + if ((mpte == NULL || mpte->wire_count == NPTEPG) && + pmap_ps_enabled(pmap) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0) { + if (!pv_lists_locked) { + pv_lists_locked = TRUE; + if (!rw_try_rlock(&pvh_global_lock)) { + PMAP_UNLOCK(pmap); + rw_rlock(&pvh_global_lock); + goto retry; + } + } + pmap_promote_pde(pmap, pde, va, &lock); +#ifdef INVARIANTS + atomic_add_long(&ad_emulation_superpage_promotions, 1); +#endif + } +#ifdef INVARIANTS + if (ftype == VM_PROT_WRITE) + atomic_add_long(&num_dirty_emulations, 1); + else + atomic_add_long(&num_accessed_emulations, 1); +#endif + rv = 0; /* success */ +done: + if (lock != NULL) + rw_wunlock(lock); + if (pv_lists_locked) + rw_runlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + return (rv); +} + +void +pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) +{ + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pde; + pt_entry_t *pte, PG_V; + int idx; + + idx = 0; + PG_V = pmap_valid_bit(pmap); + PMAP_LOCK(pmap); + + pml4 = pmap_pml4e(pmap, va); + ptr[idx++] = *pml4; + if ((*pml4 & PG_V) == 0) + goto done; + + pdp = pmap_pml4e_to_pdpe(pml4, va); + ptr[idx++] = *pdp; + if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) + goto done; + + pde = pmap_pdpe_to_pde(pdp, va); + ptr[idx++] = *pde; + if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) + goto done; + + pte = pmap_pde_to_pte(pde, va); + ptr[idx++] = *pte; + +done: + PMAP_UNLOCK(pmap); + *num = idx; +} + #include "opt_ddb.h" #ifdef DDB #include @@ -5968,7 +6746,7 @@ DB_SHOW_COMMAND(pte, pmap_print_pte) pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_V; vm_offset_t va; if (have_addr) { @@ -5978,6 +6756,7 @@ DB_SHOW_COMMAND(pte, pmap_print_pte) db_printf("show pte addr\n"); return; } + PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); db_printf("VA %#016lx pml4e %#016lx", va, *pml4); if ((*pml4 & PG_V) == 0) { Index: sys/amd64/amd64/trap.c =================================================================== --- sys/amd64/amd64/trap.c (revision 256063) +++ sys/amd64/amd64/trap.c (working copy) @@ -734,6 +734,14 @@ trap_pfault(frame, usermode) } /* + * If the trap was caused by errant bits in the PTE then panic. + */ + if (frame->tf_err & PGEX_RSV) { + trap_fatal(frame, eva); + return (-1); + } + + /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. 
*/ @@ -822,10 +830,11 @@ trap_fatal(frame, eva) #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); - printf("fault code = %s %s %s, %s\n", + printf("fault code = %s %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", + code & PGEX_RSV ? " rsv" : "", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", Index: sys/amd64/include/pmap.h =================================================================== --- sys/amd64/include/pmap.h (revision 256063) +++ sys/amd64/include/pmap.h (working copy) @@ -50,41 +50,74 @@ * of the fields not present here and there, depending on a lot of things. */ /* ---- Intel Nomenclature ---- */ -#define PG_V 0x001 /* P Valid */ -#define PG_RW 0x002 /* R/W Read/Write */ -#define PG_U 0x004 /* U/S User/Supervisor */ -#define PG_NC_PWT 0x008 /* PWT Write through */ -#define PG_NC_PCD 0x010 /* PCD Cache disable */ -#define PG_A 0x020 /* A Accessed */ -#define PG_M 0x040 /* D Dirty */ -#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ -#define PG_PTE_PAT 0x080 /* PAT PAT index */ -#define PG_G 0x100 /* G Global */ -#define PG_AVAIL1 0x200 /* / Available for system */ -#define PG_AVAIL2 0x400 /* < programmers use */ -#define PG_AVAIL3 0x800 /* \ */ -#define PG_PDE_PAT 0x1000 /* PAT PAT index */ -#define PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_V 0x001 /* P Valid */ +#define X86_PG_RW 0x002 /* R/W Read/Write */ +#define X86_PG_U 0x004 /* U/S User/Supervisor */ +#define X86_PG_NC_PWT 0x008 /* PWT Write through */ +#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ +#define X86_PG_A 0x020 /* A Accessed */ +#define X86_PG_M 0x040 /* D Dirty */ +#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ +#define X86_PG_G 0x100 /* G Global */ +#define X86_PG_AVAIL1 0x200 /* / Available for system */ +#define X86_PG_AVAIL2 0x400 /* < programmers use */ +#define X86_PG_AVAIL3 0x800 /* \ */ +#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_AVAIL(x) (1ul << (x)) +/* Page level cache control fields used to determine the PAT type */ +#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +/* + * Intel extended page table (EPT) bit definitions. + */ +#define EPT_PG_READ 0x001 /* R Read */ +#define EPT_PG_WRITE 0x002 /* W Write */ +#define EPT_PG_EXECUTE 0x004 /* X Execute */ +#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ +#define EPT_PG_PS 0x080 /* PS Page size */ +#define EPT_PG_A 0x100 /* A Accessed */ +#define EPT_PG_M 0x200 /* D Dirty */ +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ + +/* + * Define the PG_xx macros in terms of the bits on x86 PTEs. 
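+ * Code that is not NPT-aware keeps using these names unchanged; code
+ * compiled with AMD64_NPT_AWARE has a subset of them undefined again
+ * below and looks the bit positions up at runtime instead.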
+ */ +#define PG_V X86_PG_V +#define PG_RW X86_PG_RW +#define PG_U X86_PG_U +#define PG_NC_PWT X86_PG_NC_PWT +#define PG_NC_PCD X86_PG_NC_PCD +#define PG_A X86_PG_A +#define PG_M X86_PG_M +#define PG_PS X86_PG_PS +#define PG_PTE_PAT X86_PG_PTE_PAT +#define PG_G X86_PG_G +#define PG_AVAIL1 X86_PG_AVAIL1 +#define PG_AVAIL2 X86_PG_AVAIL2 +#define PG_AVAIL3 X86_PG_AVAIL3 +#define PG_PDE_PAT X86_PG_PDE_PAT +#define PG_NX X86_PG_NX +#define PG_PDE_CACHE X86_PG_PDE_CACHE +#define PG_PTE_CACHE X86_PG_PTE_CACHE + /* Our various interpretations of the above */ -#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */ -#define PG_MANAGED PG_AVAIL2 +#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ +#define PG_MANAGED X86_PG_AVAIL2 +#define EPT_PG_EMUL_V X86_PG_AVAIL(52) +#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) #define PG_FRAME (0x000ffffffffff000ul) #define PG_PS_FRAME (0x000fffffffe00000ul) -#define PG_PROT (PG_RW|PG_U) /* all protection bits . */ -#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ -/* Page level cache control fields used to determine the PAT type */ -#define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD) -#define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD) - /* * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB * (PTE) page mappings have identical settings for the following fields: */ -#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \ - PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V) +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ + PG_M | PG_A | PG_U | PG_RW | PG_V) /* * Page Protection Exception bits @@ -96,6 +129,28 @@ #define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ #define PGEX_I 0x10 /* during an instruction fetch */ +/* + * undef the PG_xx macros that define bits in the regular x86 PTEs that + * have a different position in nested PTEs. This is done when compiling + * code that needs to be aware of the differences between regular x86 and + * nested PTEs. + * + * The appropriate bitmask will be calculated at runtime based on the pmap + * type. + */ +#ifdef AMD64_NPT_AWARE +#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ +#undef PG_G +#undef PG_A +#undef PG_M +#undef PG_PDE_PAT +#undef PG_PDE_CACHE +#undef PG_PTE_PAT +#undef PG_PTE_CACHE +#undef PG_RW +#undef PG_V +#endif + /* * Pte related macros. This is complicated by having to deal with * the sign extension of the 48th bit. 
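(A minimal sketch of the run-time pattern that the AMD64_NPT_AWARE undefs above force on NPT-aware code; 'pmap_pte_is_dirty' is a made-up helper, while pmap_valid_bit() and pmap_modified_bit() are the accessors used by pmap_mincore() and pmap_emulate_accessed_dirty() elsewhere in this patch:)

static boolean_t
pmap_pte_is_dirty(pmap_t pmap, pt_entry_t pte)
{
	pt_entry_t PG_M, PG_V;

	/* Resolve the bit positions for this pmap's paging format. */
	PG_V = pmap_valid_bit(pmap);
	PG_M = pmap_modified_bit(pmap);

	/* X86_PG_V/X86_PG_M for PT_X86, the EPT equivalents for PT_EPT. */
	return ((pte & (PG_V | PG_M)) == (PG_V | PG_M));
}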
@@ -256,6 +311,11 @@ struct pmap { int pm_flags; }; +/* flags */ +#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */ + typedef struct pmap *pmap_t; #ifdef _KERNEL @@ -272,6 +332,9 @@ extern struct pmap kernel_pmap_store; #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) #define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) + +int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); #endif /* @@ -330,7 +393,7 @@ void pmap_invalidate_all(pmap_t); void pmap_invalidate_cache(void); void pmap_invalidate_cache_pages(vm_page_t *pages, int count); void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); - +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); #endif /* _KERNEL */ #endif /* !LOCORE */ Index: sys/amd64/include/vmm.h =================================================================== --- sys/amd64/include/vmm.h (revision 256063) +++ sys/amd64/include/vmm.h (working copy) @@ -39,19 +39,18 @@ struct seg_desc; struct vm_exit; struct vm_run; struct vlapic; +struct vmspace; +struct vm_object; +struct pmap; enum x2apic_state; typedef int (*vmm_init_func_t)(void); typedef int (*vmm_cleanup_func_t)(void); -typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ -typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap); typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, - vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, - boolean_t superpages_ok); -typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, @@ -65,6 +64,8 @@ typedef int (*vmi_inject_event_t)(void *vmi, int v uint32_t code, int code_valid); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ @@ -73,8 +74,6 @@ struct vmm_ops { vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; - vmi_mmap_set_func_t vmmmap_set; - vmi_mmap_get_func_t vmmmap_get; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; @@ -82,6 +81,8 @@ struct vmm_ops { vmi_inject_event_t vminject; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; }; extern struct vmm_ops vmm_ops_intel; @@ -93,9 +94,14 @@ const char *vm_name(struct vm *vm); int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); -vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, + void **cookie); 
+void vm_gpa_release(void *cookie); int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg); +int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, + vm_offset_t *offset, struct vm_object **object); +boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, @@ -130,8 +136,9 @@ void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, + VCPU_FROZEN, VCPU_RUNNING, - VCPU_CANNOT_RUN, + VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state); @@ -145,7 +152,9 @@ vcpu_is_running(struct vm *vm, int vcpu, int *host void *vcpu_stats(struct vm *vm, int vcpu); void vm_interrupt_hostcpu(struct vm *vm, int vcpu); - +struct vmspace *vm_get_vmspace(struct vm *vm); +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); #endif /* KERNEL */ #include @@ -247,6 +256,7 @@ enum vm_exitcode { VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_MAX }; @@ -266,8 +276,15 @@ struct vm_exit { } inout; struct { uint64_t gpa; + int fault_type; + int protection; + } paging; + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cr3; struct vie vie; - } paging; + } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. Index: sys/amd64/include/vmm_dev.h =================================================================== --- sys/amd64/include/vmm_dev.h (revision 256063) +++ sys/amd64/include/vmm_dev.h (working copy) @@ -36,7 +36,8 @@ int vmmdev_cleanup(void); struct vm_memory_segment { vm_paddr_t gpa; /* in */ - size_t len; /* in */ + size_t len; + int wired; }; struct vm_register { @@ -135,6 +136,12 @@ struct vm_x2apic { enum x2apic_state state; }; +struct vm_gpa_pte { + uint64_t gpa; /* in */ + uint64_t pte[4]; /* out */ + int ptenum; +}; + enum { /* general routines */ IOCNUM_ABIVERS = 0, @@ -145,6 +152,7 @@ enum { /* memory apis */ IOCNUM_MAP_MEMORY = 10, IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_GET_GPA_PMAP = 12, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, @@ -215,4 +223,6 @@ enum { _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_X2APIC_STATE \ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_GPA_PMAP \ + _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #endif Index: sys/amd64/include/vmm_instruction_emul.h =================================================================== --- sys/amd64/include/vmm_instruction_emul.h (revision 256063) +++ sys/amd64/include/vmm_instruction_emul.h (working copy) @@ -102,11 +102,15 @@ int vmm_emulate_instruction(void *vm, int cpuid, u #ifdef _KERNEL /* * APIs to fetch and decode the instruction from nested page fault handler. + * + * 'vie' must be initialized before calling 'vmm_fetch_instruction()' */ int vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length, uint64_t cr3, struct vie *vie); +void vie_init(struct vie *vie); + /* * Decode the instruction fetched into 'vie' so it can be emulated. 
* Index: sys/amd64/vmm/amd/amdv.c =================================================================== --- sys/amd64/vmm/amd/amdv.c (revision 256063) +++ sys/amd64/vmm/amd/amdv.c (working copy) @@ -54,7 +54,7 @@ amdv_cleanup(void) } static void * -amdv_vminit(struct vm *vm) +amdv_vminit(struct vm *vm, struct pmap *pmap) { printf("amdv_vminit: not implemented\n"); @@ -62,7 +62,7 @@ static void * } static int -amdv_vmrun(void *arg, int vcpu, register_t rip) +amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap) { printf("amdv_vmrun: not implemented\n"); @@ -78,23 +78,6 @@ amdv_vmcleanup(void *arg) } static int -amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t spok) -{ - - printf("amdv_vmmmap_set: not implemented\n"); - return (EINVAL); -} - -static vm_paddr_t -amdv_vmmmap_get(void *arg, vm_paddr_t gpa) -{ - - printf("amdv_vmmmap_get: not implemented\n"); - return (EINVAL); -} - -static int amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) { @@ -151,21 +134,37 @@ amdv_setcap(void *arg, int vcpu, int type, int val return (EINVAL); } +static struct vmspace * +amdv_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + + printf("amdv_vmspace_alloc: not implemented\n"); + return (NULL); +} + +static void +amdv_vmspace_free(struct vmspace *vmspace) +{ + + printf("amdv_vmspace_free: not implemented\n"); + return; +} + struct vmm_ops vmm_ops_amd = { amdv_init, amdv_cleanup, amdv_vminit, amdv_vmrun, amdv_vmcleanup, - amdv_vmmmap_set, - amdv_vmmmap_get, amdv_getreg, amdv_setreg, amdv_getdesc, amdv_setdesc, amdv_inject_event, amdv_getcap, - amdv_setcap + amdv_setcap, + amdv_vmspace_alloc, + amdv_vmspace_free, }; static int Index: sys/amd64/vmm/intel/ept.c =================================================================== --- sys/amd64/vmm/intel/ept.c (revision 256063) +++ sys/amd64/vmm/intel/ept.c (working copy) @@ -29,32 +29,31 @@ #include __FBSDID("$FreeBSD$"); +#include +#include #include -#include #include -#include #include +#include #include #include +#include -#include -#include -#include -#include +#include -#include #include "vmx_cpufunc.h" #include "vmx_msr.h" -#include "vmx.h" #include "ept.h" +#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) #define EPT_PWL4(cap) ((cap) & (1UL << 6)) #define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) #define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ #define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) +#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) #define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) -#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) #define INVVPID_ALL_TYPES_MASK 0xF0000000000UL #define INVVPID_ALL_TYPES_SUPPORTED(cap) \ @@ -64,28 +63,22 @@ __FBSDID("$FreeBSD$"); #define INVEPT_ALL_TYPES_SUPPORTED(cap) \ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) -#define EPT_PG_RD (1 << 0) -#define EPT_PG_WR (1 << 1) -#define EPT_PG_EX (1 << 2) -#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) -#define EPT_PG_IGNORE_PAT (1 << 6) -#define EPT_PG_SUPERPAGE (1 << 7) +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPT_ENABLE_AD_BITS (1 << 6) -#define EPT_ADDR_MASK ((uint64_t)-1 << 12) +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL); -MALLOC_DECLARE(M_VMX); +static int ept_enable_ad_bits; -static uint64_t page_sizes_mask; +static int ept_pmap_flags; +SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, + 
&ept_pmap_flags, 0, NULL); -/* - * Set this to 1 to have the EPT tables respect the guest PAT settings - */ -static int ept_pat_passthru; - int ept_init(void) { - int page_shift; + int use_hw_ad_bits, use_superpages, use_exec_only; uint64_t cap; cap = rdmsr(MSR_VMX_EPT_VPID_CAP); @@ -105,17 +98,22 @@ ept_init(void) !INVEPT_ALL_TYPES_SUPPORTED(cap)) return (EINVAL); - /* Set bits in 'page_sizes_mask' for each valid page size */ - page_shift = PAGE_SHIFT; - page_sizes_mask = 1UL << page_shift; /* 4KB page */ + use_superpages = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); + if (use_superpages && EPT_PDE_SUPERPAGE(cap)) + ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */ - page_shift += 9; - if (EPT_PDE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + use_hw_ad_bits = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits); + if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap)) + ept_enable_ad_bits = 1; + else + ept_pmap_flags |= PMAP_EMULATE_AD_BITS; - page_shift += 9; - if (EPT_PDPTE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + use_exec_only = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only); + if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap)) + ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY; return (0); } @@ -154,247 +152,53 @@ ept_dump(uint64_t *ptp, int nlevels) } #endif -static size_t -ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, vm_prot_t prot, boolean_t spok) -{ - int spshift, ptpshift, ptpindex, nlevels; - - /* - * Compute the size of the mapping that we can accomodate. - * - * This is based on three factors: - * - super page sizes supported by the processor - * - alignment of the region starting at 'gpa' and 'hpa' - * - length of the region 'len' - */ - spshift = PAGE_SHIFT; - if (spok) - spshift += (EPT_PWLEVELS - 1) * 9; - while (spshift >= PAGE_SHIFT) { - uint64_t spsize = 1UL << spshift; - if ((page_sizes_mask & spsize) != 0 && - (gpa & (spsize - 1)) == 0 && - (hpa & (spsize - 1)) == 0 && - length >= spsize) { - break; - } - spshift -= 9; - } - - if (spshift < PAGE_SHIFT) { - panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " - "length 0x%016lx, page_sizes_mask 0x%016lx", - gpa, hpa, length, page_sizes_mask); - } - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - /* We have reached the leaf mapping */ - if (spshift >= ptpshift) - break; - - /* - * We are working on a non-leaf page table page. - * - * Create the next level page table page if necessary and point - * to it from the current page table. 
- */ - if (ptp[ptpindex] == 0) { - void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); - ptp[ptpindex] = vtophys(nlp); - ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; - } - - /* Work our way down to the next level page table page */ - ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); - } - - if ((gpa & ((1UL << ptpshift) - 1)) != 0) { - panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " - "mismatch\n", gpa, ptpshift); - } - - if (prot != VM_PROT_NONE) { - /* Do the mapping */ - ptp[ptpindex] = hpa; - - /* Apply the access controls */ - if (prot & VM_PROT_READ) - ptp[ptpindex] |= EPT_PG_RD; - if (prot & VM_PROT_WRITE) - ptp[ptpindex] |= EPT_PG_WR; - if (prot & VM_PROT_EXECUTE) - ptp[ptpindex] |= EPT_PG_EX; - - /* - * By default the PAT type is ignored - this appears to - * be how other hypervisors handle EPT. Allow this to be - * overridden. - */ - ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); - if (!ept_pat_passthru) - ptp[ptpindex] |= EPT_PG_IGNORE_PAT; - - if (nlevels > 0) - ptp[ptpindex] |= EPT_PG_SUPERPAGE; - } else { - /* Remove the mapping */ - ptp[ptpindex] = 0; - } - - return (1UL << ptpshift); -} - -static vm_paddr_t -ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) -{ - int nlevels, ptpshift, ptpindex; - uint64_t ptpval, hpabase, pgmask; - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - ptpval = ptp[ptpindex]; - - /* Cannot make progress beyond this point */ - if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) - break; - - if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { - pgmask = (1UL << ptpshift) - 1; - hpabase = ptpval & ~pgmask; - return (hpabase | (gpa & pgmask)); - } - - /* Work our way down to the next level page table page */ - ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); - } - - return ((vm_paddr_t)-1); -} - static void -ept_free_pt_entry(pt_entry_t pte) +invept_single_context(void *arg) { - if (pte == 0) - return; + struct invept_desc desc = *(struct invept_desc *)arg; - /* sanity check */ - if ((pte & EPT_PG_SUPERPAGE) != 0) - panic("ept_free_pt_entry: pte cannot have superpage bit"); - - return; + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); } -static void -ept_free_pd_entry(pd_entry_t pde) +void +ept_invalidate_mappings(u_long eptp) { - pt_entry_t *pt; - int i; + struct invept_desc invept_desc = { 0 }; - if (pde == 0) - return; + invept_desc.eptp = eptp; - if ((pde & EPT_PG_SUPERPAGE) == 0) { - pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); - for (i = 0; i < NPTEPG; i++) - ept_free_pt_entry(pt[i]); - free(pt, M_VMX); /* free the page table page */ - } + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); } -static void -ept_free_pdp_entry(pdp_entry_t pdpe) +static int +ept_pinit(pmap_t pmap) { - pd_entry_t *pd; - int i; - if (pdpe == 0) - return; - - if ((pdpe & EPT_PG_SUPERPAGE) == 0) { - pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); - for (i = 0; i < NPDEPG; i++) - ept_free_pd_entry(pd[i]); - free(pd, M_VMX); /* free the page directory page */ - } + return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags)); } -static void -ept_free_pml4_entry(pml4_entry_t pml4e) +struct vmspace * +ept_vmspace_alloc(vm_offset_t min, vm_offset_t max) { - pdp_entry_t *pdp; - int i; - if (pml4e == 0) - return; - - if ((pml4e & EPT_PG_SUPERPAGE) == 0) { - pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); - for (i = 0; i < NPDPEPG; i++) - ept_free_pdp_entry(pdp[i]); - free(pdp, M_VMX); /* free the page directory ptr page 
*/ - } + return (vmspace_alloc(min, max, ept_pinit)); } void -ept_vmcleanup(struct vmx *vmx) +ept_vmspace_free(struct vmspace *vmspace) { - int i; - for (i = 0; i < NPML4EPG; i++) - ept_free_pml4_entry(vmx->pml4ept[i]); + vmspace_free(vmspace); } -int -ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, - vm_memattr_t attr, int prot, boolean_t spok) +uint64_t +eptp(uint64_t pml4) { - size_t n; - struct vmx *vmx = arg; + uint64_t eptp_val; - while (len > 0) { - n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, - prot, spok); - len -= n; - gpa += n; - hpa += n; - } + eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK; + if (ept_enable_ad_bits) + eptp_val |= EPT_ENABLE_AD_BITS; - return (0); + return (eptp_val); } - -vm_paddr_t -ept_vmmmap_get(void *arg, vm_paddr_t gpa) -{ - vm_paddr_t hpa; - struct vmx *vmx; - - vmx = arg; - hpa = ept_lookup_mapping(vmx->pml4ept, gpa); - return (hpa); -} - -static void -invept_single_context(void *arg) -{ - struct invept_desc desc = *(struct invept_desc *)arg; - - invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); -} - -void -ept_invalidate_mappings(u_long pml4ept) -{ - struct invept_desc invept_desc = { 0 }; - - invept_desc.eptp = EPTP(pml4ept); - - smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); -} Index: sys/amd64/vmm/intel/ept.h =================================================================== --- sys/amd64/vmm/intel/ept.h (revision 256063) +++ sys/amd64/vmm/intel/ept.h (working copy) @@ -31,13 +31,9 @@ struct vmx; -#define EPT_PWLEVELS 4 /* page walk levels */ -#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) - int ept_init(void); -int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); -vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); -void ept_invalidate_mappings(u_long ept_pml4); -void ept_vmcleanup(struct vmx *vmx); +void ept_invalidate_mappings(u_long eptp); +struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); +void ept_vmspace_free(struct vmspace *vmspace); +uint64_t eptp(uint64_t pml4); #endif Index: sys/amd64/vmm/intel/vmcs.c =================================================================== --- sys/amd64/vmm/intel/vmcs.c (revision 256063) +++ sys/amd64/vmm/intel/vmcs.c (working copy) @@ -318,14 +318,14 @@ done: int vmcs_set_defaults(struct vmcs *vmcs, - u_long host_rip, u_long host_rsp, u_long ept_pml4, + u_long host_rip, u_long host_rsp, uint64_t eptp, uint32_t pinbased_ctls, uint32_t procbased_ctls, uint32_t procbased_ctls2, uint32_t exit_ctls, uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; - uint64_t eptp, pat, fsbase, idtrbase; + uint64_t pat, fsbase, idtrbase; uint32_t exc_bitmap; codesel = vmm_get_host_codesel(); @@ -432,7 +432,6 @@ vmcs_set_defaults(struct vmcs *vmcs, goto done; /* eptp */ - eptp = EPTP(ept_pml4); if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) goto done; Index: sys/amd64/vmm/intel/vmcs.h =================================================================== --- sys/amd64/vmm/intel/vmcs.h (revision 256063) +++ sys/amd64/vmm/intel/vmcs.h (working copy) @@ -47,7 +47,7 @@ struct msr_entry { int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, - u_long ept_pml4, + uint64_t eptp, uint32_t pinbased_ctls, uint32_t procbased_ctls, uint32_t procbased_ctls2, uint32_t exit_ctls, uint32_t entry_ctls, 
u_long msr_bitmap, @@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) #define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) #endif /* _KERNEL */ @@ -314,6 +316,12 @@ uint64_t vmcs_read(uint32_t encoding); #define VMCS_INTERRUPTION_INFO_NMI (2 << 8) /* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1 << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) + +/* * VMCS Guest interruptibility field */ #define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) @@ -332,6 +340,9 @@ uint64_t vmcs_read(uint32_t encoding); #define EPT_VIOLATION_DATA_READ (1UL << 0) #define EPT_VIOLATION_DATA_WRITE (1UL << 1) #define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) #define EPT_VIOLATION_GLA_VALID (1UL << 7) #define EPT_VIOLATION_XLAT_VALID (1UL << 8) Index: sys/amd64/vmm/intel/vmx.c =================================================================== --- sys/amd64/vmm/intel/vmx.c (revision 256063) +++ sys/amd64/vmm/intel/vmx.c (working copy) @@ -49,8 +49,6 @@ __FBSDID("$FreeBSD$"); #include #include -#include - #include #include "vmm_host.h" #include "vmm_lapic.h" @@ -167,9 +165,6 @@ static int cap_pause_exit; static int cap_unrestricted_guest; static int cap_monitor_trap; -/* statistics */ -static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored"); - static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, @@ -740,7 +735,7 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * -vmx_vminit(struct vm *vm) +vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error, guest_msr_count; @@ -753,6 +748,8 @@ static void * } vmx->vm = vm; + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + /* * Clean up EPTP-tagged guest physical and combined mappings * @@ -762,7 +759,7 @@ static void * * * Combined mappings for this EP4TA are also invalidated for all VPIDs. 
*/ - ept_invalidate_mappings(vtophys(vmx->pml4ept)); + ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); @@ -818,7 +815,7 @@ static void * error = vmcs_set_defaults(&vmx->vmcs[i], (u_long)vmx_longjmp, (u_long)&vmx->ctx[i], - vtophys(vmx->pml4ept), + vmx->eptp, pinbased_ctls, procbased_ctls, procbased_ctls2, @@ -856,6 +853,9 @@ static void * error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); + + vmx->ctx[i].pmap = pmap; + vmx->ctx[i].eptp = vmx->eptp; } return (vmx); @@ -1281,21 +1281,49 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, u } static int -vmx_ept_fault(struct vm *vm, int cpu, - uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length, - uint64_t cr3, uint64_t ept_qual, struct vie *vie) +ept_fault_type(uint64_t ept_qual) { - int read, write, error; + int fault_type; - /* EPT violation on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = VM_PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = VM_PROT_EXECUTE; + else + fault_type= VM_PROT_READ; + + return (fault_type); +} + +static int +ept_protection(uint64_t ept_qual) +{ + int prot = 0; + + if (ept_qual & EPT_VIOLATION_GPA_READABLE) + prot |= VM_PROT_READ; + if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE) + prot |= VM_PROT_WRITE; + if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE) + prot |= VM_PROT_EXECUTE; + + return (prot); +} + +static boolean_t +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) - return (UNHANDLED); + return (FALSE); - /* EPT violation must be a read fault or a write fault */ + /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) - return (UNHANDLED); + return (FALSE); /* * The EPT violation must have been caused by accessing a @@ -1304,26 +1332,10 @@ static int */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { - return (UNHANDLED); + return (FALSE); } - /* Fetch, decode and emulate the faulting instruction */ - if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0) - return (UNHANDLED); - - if (vmm_decode_instruction(vm, cpu, gla, vie) != 0) - return (UNHANDLED); - - /* - * Check if this is a local apic access - */ - if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) - return (UNHANDLED); - - error = vmm_emulate_instruction(vm, cpu, gpa, vie, - lapic_mmio_read, lapic_mmio_write, 0); - - return (error ? UNHANDLED : HANDLED); + return (TRUE); } static int @@ -1332,18 +1344,47 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct int error, handled; struct vmcs *vmcs; struct vmxctx *vmxctx; - uint32_t eax, ecx, edx; - uint64_t qual, gla, gpa, cr3, intr_info; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason; + uint64_t qual, gpa; handled = 0; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); - switch (vmexit->u.vmx.exit_reason) { + /* + * VM exits that could be triggered during event injection on the + * previous VM entry need to be handled specially by re-injecting + * the event. 
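+ * For example, an EPT fault taken while the processor is delivering an
+ * interrupt through the guest IDT arrives with that interrupt recorded
+ * in the IDT-vectoring information fields; it is copied back into the
+ * VM-entry interruption fields below so the event is not lost.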
+ * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. + */ + switch (reason) { + case EXIT_REASON_EPT_FAULT: + case EXIT_REASON_EPT_MISCONFIG: + case EXIT_REASON_APIC: + case EXIT_REASON_TASK_SWITCH: + case EXIT_REASON_EXCEPTION: + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info); + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err); + } + vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + } + default: + break; + } + + switch (reason) { case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); handled = vmx_emulate_cr_access(vmx, vcpu, qual); @@ -1374,19 +1415,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); - /* - * If there is an event waiting to be injected then there is - * no need to 'hlt'. - */ - error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info); - if (error) - panic("vmx_exit_process: vmread(intrinfo) %d", error); - - if (intr_info & VMCS_INTERRUPTION_INFO_VALID) { - handled = 1; - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1); - } else - vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->exitcode = VM_EXITCODE_HLT; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); @@ -1440,15 +1469,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct break; case EXIT_REASON_EPT_FAULT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1); - gla = vmcs_gla(); + /* + * If 'gpa' lies within the address space allocated to + * memory then this must be a nested page fault otherwise + * this must be an instruction that accesses MMIO space. + */ gpa = vmcs_gpa(); - cr3 = vmcs_guest_cr3(); - handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa, - vmexit->rip, vmexit->inst_length, - cr3, qual, &vmexit->u.paging.vie); - if (!handled) { + if (vm_mem_allocated(vmx->vm, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmexit->u.paging.protection = ept_protection(qual); + } else if (ept_emulation_fault(qual)) { + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = vmcs_gla(); + vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); } break; default: @@ -1470,14 +1506,6 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit_update_rip(vmexit); vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; - - /* - * Special case for spinning up an AP - exit to userspace to - * give the controlling process a chance to intercept and - * spin up a thread for the AP. 
- */ - if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) - handled = 0; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* @@ -1497,7 +1525,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct } static int -vmx_run(void *arg, int vcpu, register_t rip) +vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap) { int error, vie, rc, handled, astpending; uint32_t exit_reason; @@ -1505,7 +1533,7 @@ static int struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; - + vmx = arg; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; @@ -1514,6 +1542,11 @@ static int astpending = 0; vmexit = vm_exitinfo(vmx->vm, vcpu); + KASSERT(vmxctx->pmap == pmap, + ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); + KASSERT(vmxctx->eptp == vmx->eptp, + ("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp)); + /* * XXX Can we avoid doing this every time we do a vm run? */ @@ -1576,6 +1609,9 @@ static int vmxctx->launch_error, vie); #endif goto err_exit; + case VMX_RETURN_INVEPT: + panic("vm %s:%d invept error %d", + vm_name(vmx->vm), vcpu, vmxctx->launch_error); default: panic("vmx_setjmp returned %d", rc); } @@ -1654,7 +1690,6 @@ vmx_vmcleanup(void *arg) if (error != 0) panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); - ept_vmcleanup(vmx); free(vmx, M_VMX); return; @@ -2000,13 +2035,13 @@ struct vmm_ops vmm_ops_intel = { vmx_vminit, vmx_run, vmx_vmcleanup, - ept_vmmmap_set, - ept_vmmmap_get, vmx_getreg, vmx_setreg, vmx_getdesc, vmx_setdesc, vmx_inject, vmx_getcap, - vmx_setcap + vmx_setcap, + ept_vmspace_alloc, + ept_vmspace_free, }; Index: sys/amd64/vmm/intel/vmx.h =================================================================== --- sys/amd64/vmm/intel/vmx.h (revision 256063) +++ sys/amd64/vmm/intel/vmx.h (working copy) @@ -31,6 +31,8 @@ #include "vmcs.h" +struct pmap; + #define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ struct vmxctx { @@ -68,6 +70,15 @@ struct vmxctx { int launched; /* vmcs launch state */ int launch_error; + + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ + + /* + * The 'eptp' and the 'pmap' do not change during the lifetime of + * the VM so it is safe to keep a copy in each vcpu's vmxctx. 
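+ * The copies are also what the assembly entry path relies on: vmx_support.S
+ * reads them through the VMXCTX_PMAP and VMXCTX_EPTP offsets on every VM
+ * entry to update 'pm_active' and to check 'pm_eptgen'.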
+ */ + vm_paddr_t eptp; + struct pmap *pmap; }; struct vmxcap { @@ -82,16 +93,15 @@ struct vmxstate { /* virtual machine softc */ struct vmx { - pml4_entry_t pml4ept[NPML4EPG]; struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ char msr_bitmap[PAGE_SIZE]; struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; struct vmxctx ctx[VM_MAXCPU]; struct vmxcap cap[VM_MAXCPU]; struct vmxstate state[VM_MAXCPU]; + uint64_t eptp; struct vm *vm; }; -CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); @@ -101,6 +111,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) = #define VMX_RETURN_VMRESUME 2 #define VMX_RETURN_VMLAUNCH 3 #define VMX_RETURN_AST 4 +#define VMX_RETURN_INVEPT 5 /* * vmx_setjmp() returns: * - 0 when it returns directly @@ -108,6 +119,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) = * - 2 when it returns from vmx_resume (which would only be in the error case) * - 3 when it returns from vmx_launch (which would only be in the error case) * - 4 when it returns from vmx_resume or vmx_launch because of AST pending + * - 5 when it returns from vmx_launch/vmx_resume because of invept error */ int vmx_setjmp(struct vmxctx *ctx); void vmx_longjmp(void); /* returns via vmx_setjmp */ Index: sys/amd64/vmm/intel/vmx_genassym.c =================================================================== --- sys/amd64/vmm/intel/vmx_genassym.c (revision 256063) +++ sys/amd64/vmm/intel/vmx_genassym.c (working copy) @@ -72,7 +72,11 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, hos ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip)); ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error)); +ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen)); +ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap)); +ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp)); + ASSYM(VM_SUCCESS, VM_SUCCESS); ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); @@ -82,8 +86,13 @@ ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); ASSYM(VMX_RETURN_AST, VMX_RETURN_AST); +ASSYM(VMX_RETURN_INVEPT, VMX_RETURN_INVEPT); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); +ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); + +ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); +ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen)); Index: sys/amd64/vmm/intel/vmx_support.S =================================================================== --- sys/amd64/vmm/intel/vmx_support.S (revision 256063) +++ sys/amd64/vmm/intel/vmx_support.S (working copy) @@ -30,6 +30,12 @@ #include "vmx_assym.s" +#ifdef SMP +#define LK lock ; +#else +#define LK +#endif + /* * Disable interrupts before updating %rsp in VMX_CHECK_AST or * VMX_GUEST_RESTORE. @@ -86,16 +92,74 @@ movq VMXCTX_GUEST_R15(%rdi),%r15; \ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ -#define VM_INSTRUCTION_ERROR(reg) \ +/* + * Check for an error after executing a VMX instruction. + * 'errreg' will be zero on success and non-zero otherwise. + * 'ctxreg' points to the 'struct vmxctx' associated with the vcpu. 
+ */ +#define VM_INSTRUCTION_ERROR(errreg, ctxreg) \ jnc 1f; \ - movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + movl $VM_FAIL_INVALID,errreg; /* CF is set */ \ jmp 3f; \ 1: jnz 2f; \ - movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + movl $VM_FAIL_VALID,errreg; /* ZF is set */ \ jmp 3f; \ -2: movl $VM_SUCCESS,reg; \ -3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) +2: movl $VM_SUCCESS,errreg; \ +3: movl errreg,VMXCTX_LAUNCH_ERROR(ctxreg) +/* + * set or clear the appropriate bit in 'pm_active' + * %rdi = vmxctx + * %rax, %r11 = scratch registers + */ +#define VMX_SET_PM_ACTIVE \ + movq VMXCTX_PMAP(%rdi), %r11; \ + movl PCPU(CPUID), %eax; \ + LK btsl %eax, PM_ACTIVE(%r11) + +#define VMX_CLEAR_PM_ACTIVE \ + movq VMXCTX_PMAP(%rdi), %r11; \ + movl PCPU(CPUID), %eax; \ + LK btrl %eax, PM_ACTIVE(%r11) + +/* + * If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' + * then we must invalidate all mappings associated with this eptp. + * + * %rdi = vmxctx + * %rax, %rbx, %r11 = scratch registers + */ +#define VMX_CHECK_EPTGEN \ + movl PCPU(CPUID), %ebx; \ + movq VMXCTX_PMAP(%rdi), %r11; \ + movq PM_EPTGEN(%r11), %rax; \ + cmpq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \ + je 9f; \ + \ + /* Refresh 'vmxctx->eptgen[curcpu]' */ \ + movq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \ + \ + /* Setup the invept descriptor at the top of tmpstk */ \ + mov %rdi, %r11; \ + addq $VMXCTX_TMPSTKTOP, %r11; \ + movq VMXCTX_EPTP(%rdi), %rax; \ + movq %rax, -16(%r11); \ + movq $0x0, -8(%r11); \ + mov $0x1, %eax; /* Single context invalidate */ \ + invept -16(%r11), %rax; \ + \ + /* Check for invept error */ \ + VM_INSTRUCTION_ERROR(%eax, %rdi); \ + testl %eax, %eax; \ + jz 9f; \ + \ + /* Return via vmx_setjmp with retval of VMX_RETURN_INVEPT */ \ + movq $VMX_RETURN_INVEPT, %rsi; \ + movq %rdi,%rsp; \ + addq $VMXCTX_TMPSTKTOP, %rsp; \ + callq vmx_return; \ +9: ; + .text /* * int vmx_setjmp(ctxp) @@ -129,6 +193,9 @@ END(vmx_setjmp) * Return to vmm context through vmx_setjmp() with a value of 'retval'. */ ENTRY(vmx_return) + /* The pmap is no longer active on the host cpu */ + VMX_CLEAR_PM_ACTIVE + /* Restore host context. */ movq VMXCTX_HOST_R15(%rdi),%r15 movq VMXCTX_HOST_R14(%rdi),%r14 @@ -193,6 +260,10 @@ ENTRY(vmx_resume) VMX_CHECK_AST + VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */ + + VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */ + /* * Restore guest state that is not automatically loaded from the vmcs. */ @@ -203,7 +274,7 @@ ENTRY(vmx_resume) /* * Capture the reason why vmresume failed. */ - VM_INSTRUCTION_ERROR(%eax) + VM_INSTRUCTION_ERROR(%eax, %rsp) /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ movq %rsp,%rdi @@ -225,6 +296,10 @@ ENTRY(vmx_launch) VMX_CHECK_AST + VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */ + + VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */ + /* * Restore guest state that is not automatically loaded from the vmcs. */ @@ -235,7 +310,7 @@ ENTRY(vmx_launch) /* * Capture the reason why vmlaunch failed. 
*/ - VM_INSTRUCTION_ERROR(%eax) + VM_INSTRUCTION_ERROR(%eax, %rsp) /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ movq %rsp,%rdi Index: sys/amd64/vmm/io/ppt.c =================================================================== --- sys/amd64/vmm/io/ppt.c (revision 256063) +++ sys/amd64/vmm/io/ppt.c (working copy) @@ -282,6 +282,43 @@ ppt_teardown_msix(struct pptdev *ppt) } int +ppt_num_devices(struct vm *vm) +{ + int i, num; + + num = 0; + for (i = 0; i < num_pptdevs; i++) { + if (pptdevs[i].vm == vm) + num++; + } + return (num); +} + +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + int i, n; + struct pptdev *ppt; + struct vm_memory_segment *seg; + + for (n = 0; n < num_pptdevs; n++) { + ppt = &pptdevs[n]; + if (ppt->vm != vm) + continue; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) + return (TRUE); + } + } + + return (FALSE); +} + +int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; @@ -336,7 +373,7 @@ ppt_unassign_all(struct vm *vm) bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); - ppt_unassign_device(vm, bus, slot, func); + vm_unassign_pptdev(vm, bus, slot, func); } } @@ -591,10 +628,3 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, i return (0); } - -int -ppt_num_devices(void) -{ - - return (num_pptdevs); -} Index: sys/amd64/vmm/io/ppt.h =================================================================== --- sys/amd64/vmm/io/ppt.h (revision 256063) +++ sys/amd64/vmm/io/ppt.h (working copy) @@ -29,14 +29,20 @@ #ifndef _IO_PPT_H_ #define _IO_PPT_H_ -int ppt_assign_device(struct vm *vm, int bus, int slot, int func); -int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); int ppt_unassign_all(struct vm *vm); int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, int destcpu, int vector, int numvec); int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, - int idx, uint32_t msg, uint32_t vector_control, uint64_t addr); -int ppt_num_devices(void); + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr); +int ppt_num_devices(struct vm *vm); +boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. 
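+ * The wrappers wire the guest memory and program the iommu before the
+ * first device is attached, and undo both when the last one is detached;
+ * calling these directly would skip that setup.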
+ */ +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); #endif Index: sys/amd64/vmm/vmm.c =================================================================== --- sys/amd64/vmm/vmm.c (revision 256063) +++ sys/amd64/vmm/vmm.c (working copy) @@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include #include +#include +#include #include +#include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" @@ -84,15 +94,23 @@ struct vcpu { #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) +struct mem_seg { + vm_paddr_t gpa; + size_t len; + boolean_t wired; + vm_object_t object; +}; #define VM_MAX_MEMORY_SEGMENTS 2 struct vm { void *cookie; /* processor-specific data */ void *iommu; /* iommu-specific data */ + struct vmspace *vmspace; /* guest's address space */ struct vcpu vcpu[VM_MAXCPU]; int num_mem_segs; - struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; char name[VM_MAX_NAMELEN]; /* @@ -109,16 +127,14 @@ static struct vmm_ops *ops; #define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) -#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) -#define VMRUN(vmi, vcpu, rip) \ - (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) +#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) +#define VMRUN(vmi, vcpu, rip, pmap) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) -#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ - (ops != NULL ? \ - (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ - ENXIO) -#define VMMMAP_GET(vmi, gpa) \ - (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) +#define VMSPACE_ALLOC(min, max) \ + (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) +#define VMSPACE_FREE(vmspace) \ + (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? 
(*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ @@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg) switch (what) { case MOD_LOAD: vmmdev_init(); - if (ppt_num_devices() > 0) - iommu_init(); + iommu_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; @@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm) { int i; struct vm *vm; - vm_paddr_t maxaddr; + struct vmspace *vmspace; const int BSP = 0; @@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); + vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); + if (vmspace == NULL) + return (ENOMEM); + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); - vm->cookie = VMINIT(vm); + vm->cookie = VMINIT(vm, vmspace_pmap(vmspace)); for (i = 0; i < VM_MAXCPU; i++) { vcpu_init(vm, i); guest_msrs_init(vm, i); } - maxaddr = vmm_mem_maxaddr(); - vm->iommu = iommu_create_domain(maxaddr); vm_activate_cpu(vm, BSP); + vm->vmspace = vmspace; *retvm = vm; return (0); } static void -vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +vm_free_mem_seg(struct vm *vm, struct mem_seg *seg) { - size_t len; - vm_paddr_t hpa; - void *host_domain; - host_domain = iommu_host_domain(); + if (seg->object != NULL) + vmm_mem_free(vm->vmspace, seg->gpa, seg->len); - len = 0; - while (len < seg->len) { - hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); - if (hpa == (vm_paddr_t)-1) { - panic("vm_free_mem_segs: cannot free hpa " - "associated with gpa 0x%016lx", seg->gpa + len); - } - - /* - * Remove the 'gpa' to 'hpa' mapping in VMs domain. - * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. - */ - iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); - iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); - - vmm_mem_free(hpa, PAGE_SIZE); - - len += PAGE_SIZE; - } - - /* - * Invalidate cached translations associated with 'vm->iommu' since - * we have now moved some pages from it. 
- */ - iommu_invalidate_tlb(vm->iommu); - - bzero(seg, sizeof(struct vm_memory_segment)); + bzero(seg, sizeof(*seg)); } void @@ -341,6 +331,9 @@ vm_destroy(struct vm *vm) ppt_unassign_all(vm); + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + for (i = 0; i < vm->num_mem_segs; i++) vm_free_mem_seg(vm, &vm->mem_segs[i]); @@ -349,7 +342,7 @@ vm_destroy(struct vm *vm) for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(&vm->vcpu[i]); - iommu_destroy_domain(vm->iommu); + VMSPACE_FREE(vm->vmspace); VMCLEANUP(vm->cookie); @@ -365,63 +358,59 @@ vm_name(struct vm *vm) int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ + vm_object_t obj; - return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_RW, spok)); + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, - VM_PROT_NONE, spok)); + vmm_mmio_free(vm->vmspace, gpa, len); + return (0); } -/* - * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise - */ -static boolean_t -vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +boolean_t +vm_mem_allocated(struct vm *vm, vm_paddr_t gpa) { int i; vm_paddr_t gpabase, gpalimit; - if (gpa & PAGE_MASK) - panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); - for (i = 0; i < vm->num_mem_segs; i++) { gpabase = vm->mem_segs[i].gpa; gpalimit = gpabase + vm->mem_segs[i].len; if (gpa >= gpabase && gpa < gpalimit) - return (FALSE); + return (TRUE); /* 'gpa' is regular memory */ } - return (TRUE); + if (ppt_is_mmio(vm, gpa)) + return (TRUE); /* 'gpa' is pci passthru mmio */ + + return (FALSE); } int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { - int error, available, allocated; - struct vm_memory_segment *seg; - vm_paddr_t g, hpa; - void *host_domain; + int available, allocated; + struct mem_seg *seg; + vm_object_t object; + vm_paddr_t g; - const boolean_t spok = TRUE; /* superpage mappings are ok */ - if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) return (EINVAL); available = allocated = 0; g = gpa; while (g < gpa + len) { - if (vm_gpa_available(vm, g)) + if (vm_mem_allocated(vm, g)) + allocated++; + else available++; - else - allocated++; g += PAGE_SIZE; } @@ -443,63 +432,205 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t le if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); - host_domain = iommu_host_domain(); - seg = &vm->mem_segs[vm->num_mem_segs]; - error = 0; + if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) + return (ENOMEM); + seg->gpa = gpa; - seg->len = 0; - while (seg->len < len) { - hpa = vmm_mem_alloc(PAGE_SIZE); - if (hpa == 0) { - error = ENOMEM; - break; - } + seg->len = len; + seg->object = object; + seg->wired = FALSE; - error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, - VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); - if (error) + vm->num_mem_segs++; + + return (0); +} + +static void +vm_gpa_unwire(struct vm *vm) +{ + int i, rv; + struct mem_seg *seg; + + for (i = 0; i < vm->num_mem_segs; i++) { + seg = &vm->mem_segs[i]; + if (!seg->wired) + continue; + + rv = vm_map_unwire(&vm->vmspace->vm_map, + seg->gpa, seg->gpa + seg->len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " + "%#lx/%ld could not be unwired: %d", + 
vm_name(vm), seg->gpa, seg->len, rv)); + + seg->wired = FALSE; + } +} + +static int +vm_gpa_wire(struct vm *vm) +{ + int i, rv; + struct mem_seg *seg; + + for (i = 0; i < vm->num_mem_segs; i++) { + seg = &vm->mem_segs[i]; + if (seg->wired) + continue; + + /* XXX rlimits? */ + rv = vm_map_wire(&vm->vmspace->vm_map, + seg->gpa, seg->gpa + seg->len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (rv != KERN_SUCCESS) break; + seg->wired = TRUE; + } + + if (i < vm->num_mem_segs) { /* - * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. - * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. + * Undo the wiring before returning an error. */ - iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); - iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); + vm_gpa_unwire(vm); + return (EAGAIN); + } - seg->len += PAGE_SIZE; + return (0); +} + +static void +vm_iommu_modify(struct vm *vm, boolean_t map) +{ + int i, sz; + vm_paddr_t gpa, hpa; + struct mem_seg *seg; + void *vp, *cookie, *host_domain; + + sz = PAGE_SIZE; + host_domain = iommu_host_domain(); + + for (i = 0; i < vm->num_mem_segs; i++) { + seg = &vm->mem_segs[i]; + KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", + vm_name(vm), seg->gpa, seg->len)); + + gpa = seg->gpa; + while (gpa < seg->gpa + seg->len) { + vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, + &cookie); + KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", + vm_name(vm), gpa)); + + vm_gpa_release(cookie); + + hpa = DMAP_TO_PHYS((uintptr_t)vp); + if (map) { + iommu_create_mapping(vm->iommu, gpa, hpa, sz); + iommu_remove_mapping(host_domain, hpa, sz); + } else { + iommu_remove_mapping(vm->iommu, gpa, sz); + iommu_create_mapping(host_domain, hpa, hpa, sz); + } + + gpa += PAGE_SIZE; + } } - if (error) { - vm_free_mem_seg(vm, seg); + /* + * Invalidate the cached translations associated with the domain + * from which pages were removed. + */ + if (map) + iommu_invalidate_tlb(host_domain); + else + iommu_invalidate_tlb(vm->iommu); +} + +#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) +#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) + +int +vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + + error = ppt_unassign_device(vm, bus, slot, func); + if (error) return (error); + + if (ppt_num_devices(vm) == 0) { + vm_iommu_unmap(vm); + vm_gpa_unwire(vm); } + return (0); +} +int +vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + vm_paddr_t maxaddr; + /* - * Invalidate cached translations associated with 'host_domain' since - * we have now moved some pages from it. + * Virtual machines with pci passthru devices get special treatment: + * - the guest physical memory is wired + * - the iommu is programmed to do the 'gpa' to 'hpa' translation + * + * We need to do this before the first pci passthru device is attached. 
*/ - iommu_invalidate_tlb(host_domain); + if (ppt_num_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vmm_mem_maxaddr(); + vm->iommu = iommu_create_domain(maxaddr); - vm->num_mem_segs++; + error = vm_gpa_wire(vm); + if (error) + return (error); - return (0); + vm_iommu_map(vm); + } + + error = ppt_assign_device(vm, bus, slot, func); + return (error); } -vm_paddr_t -vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +void * +vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) { - vm_paddr_t nextpage; + int count, pageoff; + vm_page_t m; - nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); - if (len > nextpage - gpa) - panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - return (VMMMAP_GET(vm->cookie, gpa)); + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } } +void +vm_gpa_release(void *cookie) +{ + vm_page_t m = cookie; + + vm_page_lock(m); + vm_page_unhold(m); + vm_page_unlock(m); +} + int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg) @@ -508,7 +639,9 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabas for (i = 0; i < vm->num_mem_segs; i++) { if (gpabase == vm->mem_segs[i].gpa) { - *seg = vm->mem_segs[i]; + seg->gpa = vm->mem_segs[i].gpa; + seg->len = vm->mem_segs[i].len; + seg->wired = vm->mem_segs[i].wired; return (0); } } @@ -516,6 +649,33 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabas } int +vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, + vm_offset_t *offset, struct vm_object **object) +{ + int i; + size_t seg_len; + vm_paddr_t seg_gpa; + vm_object_t seg_obj; + + for (i = 0; i < vm->num_mem_segs; i++) { + if ((seg_obj = vm->mem_segs[i].object) == NULL) + continue; + + seg_gpa = vm->mem_segs[i].gpa; + seg_len = vm->mem_segs[i].len; + + if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { + *offset = gpa - seg_gpa; + *object = seg_obj; + vm_object_reference(seg_obj); + return (0); + } + } + + return (EINVAL); +} + +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { @@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu) static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error == 0) + vcpu->state = newstate; + else + error = EBUSY; + + return (error); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0) + 
panic("Error %d setting state to %d", error, newstate); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. + */ +static int +vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu) +{ + struct vcpu *vcpu; + int sleepticks, t; + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + + /* + * Figure out the number of host ticks until the next apic + * timer interrupt in the guest. + */ + sleepticks = lapic_timer_tick(vm, vcpuid); + + /* + * If the guest local apic timer is disabled then sleep for + * a long time but not forever. + */ + if (sleepticks < 0) + sleepticks = hz; + + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. + * + * These interrupts could have happened any time after we + * returned from VMRUN() and before we grabbed the vcpu lock. + */ + if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) { + if (sleepticks <= 0) + panic("invalid sleepticks %d", sleepticks); + t = ticks; + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + vcpu_unlock(vcpu); + + return (0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu) +{ + int rv, ftype; + struct vm_map *map; + struct vcpu *vcpu; + struct vm_exit *vme; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == VM_PROT_READ || + ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + vme->u.paging.gpa, ftype); + if (rv == 0) + goto done; + } + + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); + + VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d", + rv, vme->u.paging.gpa, ftype); + + if (rv != KERN_SUCCESS) + return (EFAULT); +done: + /* restart execution at the faulting instruction */ + vme->inst_length = 0; + + return (0); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + int error, inst_length; + uint64_t rip, gla, gpa, cr3; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + rip = vme->rip; + inst_length = vme->inst_length; + + gla = vme->u.inst_emul.gla; + gpa = vme->u.inst_emul.gpa; + cr3 = vme->u.inst_emul.cr3; + vie = &vme->u.inst_emul.vie; + + vie_init(vie); + + /* Fetch, decode and emulate the faulting instruction */ + if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0) + return (EFAULT); + + if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0) + return (EFAULT); + + /* return to userland unless this is a local apic access */ + if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) { + *retu = TRUE; + return (0); + } + + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, + lapic_mmio_read, lapic_mmio_write, 0); + + /* return to userland to spin up the AP */ + if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP) + *retu = TRUE; + + return (error); +} + int vm_run(struct vm *vm, struct vm_run *vmrun) { - int error, vcpuid, sleepticks, t; + int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval, rip; struct vm_exit *vme; + boolean_t retu; + pmap_t pmap; vcpuid = vmrun->cpuid; if (vcpuid < 0 || 
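vm_handle_inst_emul() keeps only local APIC accesses in the kernel and bounces every other emulated access to userland through 'retu'. A sketch of that address check on its own, assuming DEFAULT_APIC_BASE from the vmm headers; not a literal excerpt from the patch:

/*
 * Sketch: MMIO faults inside the local APIC page are emulated in the
 * kernel (lapic_mmio_read/write); everything else is device emulation
 * that belongs to bhyve in userland.
 */
static boolean_t
inst_emul_in_kernel(uint64_t gpa)
{

	return (gpa >= DEFAULT_APIC_BASE &&
	    gpa < DEFAULT_APIC_BASE + PAGE_SIZE);
}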
vcpuid >= VM_MAXCPU) return (EINVAL); + pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; - vme = &vmrun->vm_exit; + vme = &vcpu->exitinfo; rip = vmrun->rip; restart: critical_enter(); + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("vm_run: absurd pm_active")); + tscval = rdtsc(); pcb = PCPU_GET(curpcb); @@ -661,62 +1010,44 @@ restart: restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); vcpu->hostcpu = curcpu; - error = VMRUN(vm->cookie, vcpuid, rip); + error = VMRUN(vm->cookie, vcpuid, rip, pmap); vcpu->hostcpu = NOCPU; + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); - /* copy the exit information */ - bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit)); - critical_exit(); - /* - * Oblige the guest's desire to 'hlt' by sleeping until the vcpu - * is ready to run. - */ - if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) { - vcpu_lock(vcpu); - - /* - * Figure out the number of host ticks until the next apic - * timer interrupt in the guest. - */ - sleepticks = lapic_timer_tick(vm, vcpuid); - - /* - * If the guest local apic timer is disabled then sleep for - * a long time but not forever. - */ - if (sleepticks < 0) - sleepticks = hz; - - /* - * Do a final check for pending NMI or interrupts before - * really putting this thread to sleep. - * - * These interrupts could have happened any time after we - * returned from VMRUN() and before we grabbed the vcpu lock. - */ - if (!vm_nmi_pending(vm, vcpuid) && - lapic_pending_intr(vm, vcpuid) < 0) { - if (sleepticks <= 0) - panic("invalid sleepticks %d", sleepticks); - t = ticks; - msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks); - vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + if (error == 0) { + retu = FALSE; + switch (vme->exitcode) { + case VM_EXITCODE_HLT: + error = vm_handle_hlt(vm, vcpuid, &retu); + break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + default: + retu = TRUE; /* handled in userland */ + break; } + } - vcpu_unlock(vcpu); - + if (error == 0 && retu == FALSE) { rip = vme->rip + vme->inst_length; goto restart; } + /* copy the exit information */ + bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } @@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm) } int -vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state) +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; struct vcpu *vcpu; @@ -880,20 +1211,7 @@ int vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); - - /* - * The following state transitions are allowed: - * IDLE -> RUNNING -> IDLE - * IDLE -> CANNOT_RUN -> IDLE - */ - if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) || - (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) { - error = 0; - vcpu->state = state; - } else { - error = EBUSY; - } - + error = vcpu_set_state_locked(vcpu, newstate); vcpu_unlock(vcpu); return (error); @@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid) vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; if (hostcpu == NOCPU) { - /* - * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then - * the host thread must be sleeping waiting for an event to - * kick the vcpu out of 'hlt'. - * - * XXX this is racy because the condition exists right before - * and after calling VMRUN() in vm_run(). 
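With HLT and PAGING exits, plus instruction emulation of local APIC accesses, now retired inside vm_run(), the userland loop only sees exits it must handle itself. A sketch of such a loop, assuming the libvmmapi vm_run() signature of this revision and a hypothetical handle_exit() dispatcher standing in for bhyve's handler table:

#include <sys/types.h>
#include <stdint.h>
#include <machine/vmm.h>
#include <vmmapi.h>

/* Hypothetical dispatcher; returns non-zero to stop the vcpu. */
static int handle_exit(struct vmctx *ctx, int vcpu, struct vm_exit *vme);

static void
run_vcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
{
	struct vm_exit vmexit;

	while (vm_run(ctx, vcpu, rip, &vmexit) == 0) {
		if (handle_exit(ctx, vcpu, &vmexit) != 0)
			break;
		/* Resume at the instruction following the exiting one. */
		rip = vmexit.rip + vmexit.inst_length;
	}
}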
The wakeup() is - * benign in this case. - */ - if (vcpu->state == VCPU_RUNNING) + if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } else { if (vcpu->state != VCPU_RUNNING) @@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid) } vcpu_unlock(vcpu); } + +struct vmspace * +vm_get_vmspace(struct vm *vm) +{ + + return (vm->vmspace); +} Index: sys/amd64/vmm/vmm_dev.c =================================================================== --- sys/amd64/vmm/vmm_dev.c (revision 256063) +++ sys/amd64/vmm/vmm_dev.c (working copy) @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -95,8 +96,9 @@ vmmdev_lookup2(struct cdev *cdev) static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { - int error, off, c; - vm_paddr_t hpa, gpa; + int error, off, c, prot; + vm_paddr_t gpa; + void *hpa, *cookie; struct vmmdev_softc *sc; static char zerobuf[PAGE_SIZE]; @@ -107,6 +109,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int if (sc == NULL) error = ENXIO; + prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; @@ -120,14 +123,16 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ - hpa = vm_gpa2hpa(sc->vm, gpa, c); - if (hpa == (vm_paddr_t)-1) { + hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); + if (hpa == NULL) { if (uio->uio_rw == UIO_READ) error = uiomove(zerobuf, c, uio); else error = EFAULT; - } else - error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); + } else { + error = uiomove(hpa, c, uio); + vm_gpa_release(cookie); + } } mtx_unlock(&vmmdev_mtx); @@ -139,7 +144,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_ struct thread *td) { int error, vcpu, state_changed; - enum vcpu_state new_state; struct vmmdev_softc *sc; struct vm_memory_segment *seg; struct vm_register *vmreg; @@ -156,6 +160,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_ struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; + struct vm_gpa_pte *gpapte; sc = vmmdev_lookup2(cdev); if (sc == NULL) @@ -189,12 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_ goto done; } - if (cmd == VM_RUN) - new_state = VCPU_RUNNING; - else - new_state = VCPU_CANNOT_RUN; - - error = vcpu_set_state(sc->vm, vcpu, new_state); + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); if (error) goto done; @@ -211,7 +211,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_ */ error = 0; for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { - error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN); + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); if (error) break; } @@ -271,13 +271,13 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_ break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; - error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, - pptdev->func); + error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; - error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, - pptdev->func); + error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); break; case VM_INJECT_EVENT: vmevent = (struct vm_event *)data; @@ -348,6 +348,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_ error = vm_get_x2apic_state(sc->vm, x2apic->cpuid, &x2apic->state); break; + case VM_GET_GPA_PMAP: + gpapte = (struct vm_gpa_pte 
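The VM_BIND_PPTDEV ioctl now lands in vm_assign_pptdev(), so binding the first device is what wires guest memory and programs the IOMMU, and it can fail with EAGAIN if the wiring fails. A sketch of issuing the ioctl directly, assuming the /dev/vmm/<name> device node and struct vm_pptdev from machine/vmm_dev.h; the helper is hypothetical:

#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

static int
bind_pptdev(const char *vmname, int bus, int slot, int func)
{
	struct vm_pptdev pptdev;
	char path[64];
	int error, fd;

	snprintf(path, sizeof(path), "/dev/vmm/%s", vmname);
	fd = open(path, O_RDWR);
	if (fd < 0)
		return (-1);

	pptdev.bus = bus;
	pptdev.slot = slot;
	pptdev.func = func;
	/* May now fail with EAGAIN if the guest memory cannot be wired. */
	error = ioctl(fd, VM_BIND_PPTDEV, &pptdev);
	close(fd);
	return (error);
}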
*)data; + pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), + gpapte->gpa, gpapte->pte, &gpapte->ptenum); + error = 0; + break; default: error = ENOTTY; break; @@ -361,25 +367,25 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_ } done: + /* Make sure that no handler returns a bogus value like ERESTART */ + KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int -vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, - int nprot, vm_memattr_t *memattr) +vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, + vm_size_t size, struct vm_object **object, int nprot) { int error; struct vmmdev_softc *sc; - error = -1; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); - if (sc != NULL && (nprot & PROT_EXEC) == 0) { - *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); - if (*paddr != (vm_paddr_t)-1) - error = 0; - } + if (sc != NULL && (nprot & PROT_EXEC) == 0) + error = vm_get_memobj(sc->vm, *offset, size, offset, object); + else + error = EINVAL; mtx_unlock(&vmmdev_mtx); @@ -446,7 +452,7 @@ static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, - .d_mmap = vmmdev_mmap, + .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; Index: sys/amd64/vmm/vmm_instruction_emul.c =================================================================== --- sys/amd64/vmm/vmm_instruction_emul.c (revision 256063) +++ sys/amd64/vmm/vmm_instruction_emul.c (working copy) @@ -465,7 +465,7 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint } #ifdef _KERNEL -static void +void vie_init(struct vie *vie) { @@ -479,9 +479,9 @@ static int gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, uint64_t *gpa, uint64_t *gpaend) { - vm_paddr_t hpa; int nlevels, ptpshift, ptpindex; uint64_t *ptpbase, pte, pgsize; + void *cookie; /* * XXX assumes 64-bit guest with 4 page walk levels @@ -491,18 +491,19 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpp /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; - hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE); - if (hpa == -1) + ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ, + &cookie); + if (ptpbase == NULL) goto error; - ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa); - ptpshift = PAGE_SHIFT + nlevels * 9; ptpindex = (gla >> ptpshift) & 0x1FF; pgsize = 1UL << ptpshift; pte = ptpbase[ptpindex]; + vm_gpa_release(cookie); + if ((pte & PG_V) == 0) goto error; @@ -530,18 +531,18 @@ int vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length, uint64_t cr3, struct vie *vie) { - int n, err; - uint64_t hpa, gpa, gpaend, off; + int n, err, prot; + uint64_t gpa, gpaend, off; + void *hpa, *cookie; /* * XXX cache previously fetched instructions using 'rip' as the tag */ + prot = VM_PROT_READ | VM_PROT_EXECUTE; if (inst_length > VIE_INST_SIZE) panic("vmm_fetch_instruction: invalid length %d", inst_length); - vie_init(vie); - /* Copy the instruction into 'vie' */ while (vie->num_valid < inst_length) { err = gla2gpa(vm, rip, cr3, &gpa, &gpaend); @@ -551,12 +552,13 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, ui off = gpa & PAGE_MASK; n = min(inst_length - vie->num_valid, PAGE_SIZE - off); - hpa = vm_gpa2hpa(vm, gpa, n); - if (hpa == -1) + if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL) break; - bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n); + bcopy(hpa, &vie->inst[vie->num_valid], n); + vm_gpa_release(cookie); + rip += n; 
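vmm_fetch_instruction() and vmmdev_rw() both walk guest memory a page at a time through vm_gpa_hold()/vm_gpa_release(). A sketch of that pattern as a generic copy-in helper, assuming kernel context (MIN() from sys/param.h); the helper is illustrative, not part of the patch:

/*
 * Sketch: copy 'len' bytes from guest physical address 'gpa' into a
 * kernel buffer, holding and releasing one guest page at a time.
 */
static int
vm_copyin_gpa(struct vm *vm, vm_paddr_t gpa, void *dst, size_t len)
{
	void *hva, *cookie;
	size_t n, off;

	while (len > 0) {
		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);
		hva = vm_gpa_hold(vm, gpa, n, VM_PROT_READ, &cookie);
		if (hva == NULL)
			return (EFAULT);
		bcopy(hva, dst, n);
		vm_gpa_release(cookie);
		gpa += n;
		dst = (char *)dst + n;
		len -= n;
	}
	return (0);
}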
vie->num_valid += n; } Index: sys/amd64/vmm/vmm_mem.c =================================================================== --- sys/amd64/vmm/vmm_mem.c (revision 256063) +++ sys/amd64/vmm/vmm_mem.c (working copy) @@ -30,40 +30,24 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include #include #include -#include -#include +#include +#include +#include #include +#include #include +#include +#include #include -#include +#include #include -#include -#include -#include -#include -#include "vmm_util.h" #include "vmm_mem.h" -SYSCTL_DECL(_hw_vmm); - -static u_long pages_allocated; -SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD, - &pages_allocated, 0, "4KB pages allocated"); - -static void -update_pages_allocated(int howmany) -{ - pages_allocated += howmany; /* XXX locking? */ -} - int vmm_mem_init(void) { @@ -71,60 +55,95 @@ vmm_mem_init(void) return (0); } -vm_paddr_t -vmm_mem_alloc(size_t size) +vm_object_t +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) { - int flags; - vm_page_t m; - vm_paddr_t pa; + int error; + vm_object_t obj; + struct sglist *sg; - if (size != PAGE_SIZE) - panic("vmm_mem_alloc: invalid allocation size %lu", size); + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); - flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | - VM_ALLOC_ZERO; - - while (1) { + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj != NULL) { /* - * XXX need policy to determine when to back off the allocation + * VT-x ignores the MTRR settings when figuring out the + * memory type for translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by + * this object to be mapped as uncacheable. */ - m = vm_page_alloc(NULL, 0, flags); - if (m == NULL) - VM_WAIT; - else - break; + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) { + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", + error); + } + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } } - pa = VM_PAGE_TO_PHYS(m); - - if ((m->flags & PG_ZERO) == 0) - pagezero((void *)PHYS_TO_DMAP(pa)); - m->valid = VM_PAGE_BITS_ALL; + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. + * + * If the object could not be allocated then we end up freeing the + * sglist. 
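vmm_mmio_alloc() is what lets a passthru BAR appear in the guest: an sglist-backed OBJT_SG object, forced uncacheable, mapped at the requested gpa. A sketch of the intended caller side, assuming the ppt code maps BARs this way (the function names and error value are illustrative):

/*
 * Sketch: expose a passthru device BAR living at host physical address
 * 'hpa' to the guest at 'gpa', and the matching teardown.
 */
static int
ppt_map_bar(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{

	if (vmm_mmio_alloc(vm_get_vmspace(vm), gpa, len, hpa) == NULL)
		return (ENOSPC);
	return (0);
}

static void
ppt_unmap_bar(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm_get_vmspace(vm), gpa, len);
}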
+ */ + sglist_free(sg); - update_pages_allocated(1); - - return (pa); + return (obj); } void -vmm_mem_free(vm_paddr_t base, size_t length) +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) { - vm_page_t m; - if (base & PAGE_MASK) { - panic("vmm_mem_free: base 0x%0lx must be aligned on a " - "0x%0x boundary\n", base, PAGE_SIZE); + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_object_t +vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + int error; + vm_object_t obj; + + if (gpa & PAGE_MASK) + panic("vmm_mem_alloc: invalid gpa %#lx", gpa); + + if (len == 0 || (len & PAGE_MASK) != 0) + panic("vmm_mem_alloc: invalid allocation size %lu", len); + + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj != NULL) { + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } } - if (length != PAGE_SIZE) - panic("vmm_mem_free: invalid length %lu", length); + return (obj); +} - m = PHYS_TO_VM_PAGE(base); - m->wire_count--; - vm_page_free(m); - atomic_subtract_int(&cnt.v_wire_count, 1); +void +vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ - update_pages_allocated(-1); + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); } vm_paddr_t Index: sys/amd64/vmm/vmm_mem.h =================================================================== --- sys/amd64/vmm/vmm_mem.h (revision 256063) +++ sys/amd64/vmm/vmm_mem.h (working copy) @@ -29,9 +29,15 @@ #ifndef _VMM_MEM_H_ #define _VMM_MEM_H_ +struct vmspace; +struct vm_object; + int vmm_mem_init(void); -vm_paddr_t vmm_mem_alloc(size_t size); -void vmm_mem_free(vm_paddr_t start, size_t size); +struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size); +struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); vm_paddr_t vmm_mem_maxaddr(void); #endif Index: usr.sbin/bhyve/bhyverun.c =================================================================== --- usr.sbin/bhyve/bhyverun.c (revision 256063) +++ usr.sbin/bhyve/bhyverun.c (working copy) @@ -101,7 +101,7 @@ struct bhyvestats { uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; - uint64_t vmexit_paging; + uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; int io_reset; @@ -385,13 +385,13 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vm } static int -vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err; - stats.vmexit_paging++; + stats.vmexit_inst_emul++; - err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa, - &vmexit->u.paging.vie); + err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, + &vmexit->u.inst_emul.vie); if (err) { if (err == EINVAL) { @@ -400,7 +400,7 @@ static int vmexit->rip); } else if (err == ESRCH) { fprintf(stderr, "Unhandled memory access to 0x%lx\n", - vmexit->u.paging.gpa); + vmexit->u.inst_emul.gpa); } return (VMEXIT_ABORT); @@ -416,7 +416,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, - [VM_EXITCODE_PAGING] = vmexit_paging, + [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, }; Index: 
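Back in vmm_mem.c, guest RAM follows the same object-backed scheme: vmm_mem_alloc() creates a pageable default object and maps it into the guest vmspace, replacing the old page-at-a-time wired allocator. A sketch of the segment-creation step built on it (the wrapper is illustrative; the in-tree caller is the segment setup path in vmm.c, not shown in this hunk):

/*
 * Sketch: back a guest memory segment [gpa, gpa + len) with a pageable
 * VM object mapped into the guest's vmspace.
 */
static int
alloc_guest_seg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	vm_object_t obj;

	obj = vmm_mem_alloc(vm_get_vmspace(vm), gpa, len);
	return (obj != NULL ? 0 : ENOMEM);
}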
usr.sbin/bhyve/pci_emul.c =================================================================== --- usr.sbin/bhyve/pci_emul.c (revision 256063) +++ usr.sbin/bhyve/pci_emul.c (working copy) @@ -1048,7 +1048,7 @@ init_pci(struct vmctx *ctx) * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ - error = vm_get_memory_seg(ctx, 0, &lowmem); + error = vm_get_memory_seg(ctx, 0, &lowmem, NULL); assert(error == 0); memset(&memp, 0, sizeof(struct mem_range)); Index: usr.sbin/bhyve/rtc.c =================================================================== --- usr.sbin/bhyve/rtc.c (revision 256063) +++ usr.sbin/bhyve/rtc.c (working copy) @@ -341,14 +341,14 @@ rtc_init(struct vmctx *ctx) * 0x34/0x35 - 64KB chunks above 16MB, below 4GB * 0x5b/0x5c/0x5d - 64KB chunks above 4GB */ - err = vm_get_memory_seg(ctx, 0, &lomem); + err = vm_get_memory_seg(ctx, 0, &lomem, NULL); assert(err == 0); lomem = (lomem - m_16MB) / m_64KB; rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem; rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8; - if (vm_get_memory_seg(ctx, m_4GB, &himem) == 0) { + if (vm_get_memory_seg(ctx, m_4GB, &himem, NULL) == 0) { himem /= m_64KB; rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem; rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8; Index: usr.sbin/bhyvectl/bhyvectl.c =================================================================== --- usr.sbin/bhyvectl/bhyvectl.c (revision 256063) +++ usr.sbin/bhyvectl/bhyvectl.c (working copy) @@ -188,12 +188,13 @@ usage(void) " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" - " [--get-highmem]\n", + " [--get-highmem]\n" + " [--get-gpa-pmap]\n", progname); exit(1); } -static int get_stats, getcap, setcap, capval; +static int get_stats, getcap, setcap, capval, get_gpa_pmap; static const char *capname; static int create, destroy, get_lowmem, get_highmem; static uint64_t memsize; @@ -377,18 +378,20 @@ enum { SET_CAP, CAPNAME, UNASSIGN_PPTDEV, + GET_GPA_PMAP, }; int main(int argc, char *argv[]) { char *vmname; - int error, ch, vcpu; - vm_paddr_t gpa; + int error, ch, vcpu, ptenum; + vm_paddr_t gpa, gpa_pmap; size_t len; struct vm_exit vmexit; - uint64_t ctl, eptp, bm, addr, u64; + uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte; struct vmctx *ctx; + int wired; uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; @@ -427,6 +430,7 @@ main(int argc, char *argv[]) { "capname", REQ_ARG, 0, CAPNAME }, { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, { "setcap", REQ_ARG, 0, SET_CAP }, + { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, { "getcap", NO_ARG, &getcap, 1 }, { "get-stats", NO_ARG, &get_stats, 1 }, { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, @@ -666,6 +670,10 @@ main(int argc, char *argv[]) capval = strtoul(optarg, NULL, 0); setcap = 1; break; + case GET_GPA_PMAP: + gpa_pmap = strtoul(optarg, NULL, 0); + get_gpa_pmap = 1; + break; case CAPNAME: capname = optarg; break; @@ -819,16 +827,18 @@ main(int argc, char *argv[]) if (!error && (get_lowmem || get_all)) { gpa = 0; - error = vm_get_memory_seg(ctx, gpa, &len); + error = vm_get_memory_seg(ctx, gpa, &len, &wired); if (error == 0) - printf("lowmem\t\t0x%016lx/%ld\n", gpa, len); + printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? 
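The extra 'wired' out-parameter of vm_get_memory_seg() is optional: callers that don't care pass NULL, as pci_emul.c and rtc.c now do, while bhyvectl reports it. A sketch of a caller that queries the low memory segment, assuming a vmctx obtained from vm_open():

#include <sys/types.h>
#include <stdio.h>
#include <machine/vmm.h>
#include <vmmapi.h>

static void
show_lowmem(struct vmctx *ctx)
{
	size_t len;
	int wired;

	/* Pass NULL instead of &wired if the wiring state is not needed. */
	if (vm_get_memory_seg(ctx, 0, &len, &wired) == 0)
		printf("lowmem 0x0/%zu%s\n", len, wired ? " wired" : "");
}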
" wired" : ""); } if (!error && (get_highmem || get_all)) { gpa = 4 * GB; - error = vm_get_memory_seg(ctx, gpa, &len); + error = vm_get_memory_seg(ctx, gpa, &len, &wired); if (error == 0) - printf("highmem\t\t0x%016lx/%ld\n", gpa, len); + printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? " wired" : ""); } if (!error && (get_efer || get_all)) { @@ -1457,6 +1467,17 @@ main(int argc, char *argv[]) printf("Capability \"%s\" is not available\n", capname); } + if (!error && get_gpa_pmap) { + error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); + if (error == 0) { + printf("gpa %#lx:", gpa_pmap); + pte = &pteval[0]; + while (ptenum-- > 0) + printf(" %#lx", *pte++); + printf("\n"); + } + } + if (!error && (getcap || get_all)) { int captype, val, getcaptype; Index: usr.sbin/bhyveload/bhyveload.c =================================================================== --- usr.sbin/bhyveload/bhyveload.c (revision 256063) +++ usr.sbin/bhyveload/bhyveload.c (working copy) @@ -492,8 +492,8 @@ static void cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem) { - vm_get_memory_seg(ctx, 0, ret_lowmem); - vm_get_memory_seg(ctx, 4 * GB, ret_highmem); + vm_get_memory_seg(ctx, 0, ret_lowmem, NULL); + vm_get_memory_seg(ctx, 4 * GB, ret_highmem, NULL); } static const char *