commit 54f7d9a4f907bbe598956fa4aa57b042bade1ad6
Author: Stacey Son
Date:   Tue May 12 18:42:17 2015 -0500

    New PMAP implementation for MIPS64.

    - Add referenced bit emulation. A software valid bit (PTE_SV) is added
      and the hardware valid bit in the TLB entry (PTE_VR) is effectively
      repurposed as a referenced bit. On a TLB exception the software valid
      bit is checked and, if it is set, the hardware valid/referenced bit is
      set.

    - Add support for automatic promotion of 4KB page mappings to 2MB page
      mappings (a.k.a. "superpages"). Automatic promotion can be enabled by
      setting the tunable "vm.pmap.pg_ps_enabled" to a non-zero value. By
      default, automatic promotion is disabled. (Further testing is needed
      before it is enabled by default.)

      On MIPS64 a 2MB superpage is actually an even and odd pair of 1MB
      pages that act as a single 2MB superpage mapping. This allows the VM
      layer to treat it as a single 2MB superpage without having to deal
      with the way the MIPS TLB pairs pages. The 4KB pages are still mapped
      in the TLB as before (i.e. even and odd contiguous pages in virtual
      memory share a single TLB entry but may map non-contiguous physical
      memory).

    - Add PV chunk and list locking so that many of the pmap_* functions
      are no longer serialized by the pvh global lock, just like in the
      AMD64 implementation. Many of the other optimizations from the AMD64
      pmap implementation are included as well.

    - Add "Machine Check" exception handling and recovery. If conflicting
      TLB entries are added and the CPU generates a "Machine Check"
      exception, the trap handler detects this and recovers by flushing the
      TLB and resetting the status register. The MCHECK is also reported.

    - Use one large (16KB) page for the kernel thread stack. To prevent a
      kernel thread stack overrun, a larger thread stack is used. The
      larger stack is created by using a larger page size (16KB instead of
      4KB).

    The new pmap implementation for MIPS64 is enabled by adding
    "options MIPS64_NEW_PMAP" to the kernel config file. The larger kernel
    thread stack is enabled by adding "options KSTACK_LARGE_PAGE".

diff --git a/sys/conf/files.mips b/sys/conf/files.mips
index 3677de4..cedf4f9 100644
--- a/sys/conf/files.mips
+++ b/sys/conf/files.mips
@@ -31,7 +31,8 @@ mips/mips/mp_machdep.c optional smp
 mips/mips/mpboot.S optional smp
 mips/mips/nexus.c standard
 mips/mips/pm_machdep.c standard
-mips/mips/pmap.c standard
+mips/mips/pmap.c optional !mips64_new_pmap
+mips/mips/pmap_mips64.c optional mips64_new_pmap
 mips/mips/ptrace_machdep.c standard
 mips/mips/sc_machdep.c standard
 mips/mips/stack_machdep.c optional ddb | stack
diff --git a/sys/conf/options.mips b/sys/conf/options.mips
index 93d4ed3..cf87195 100644
--- a/sys/conf/options.mips
+++ b/sys/conf/options.mips
@@ -91,11 +91,24 @@ OCTEON_BOARD_CAPK_0100ND opt_cvmx.h
 BERI_LARGE_TLB opt_global.h

 #
+# Use the new pmap for MIPS64 that includes reference bit emulation
+# and superpage support.
+#
+MIPS64_NEW_PMAP opt_global.h
+
+#
 # Options that control the NetFPGA-10G Embedded CPU Ethernet Core.
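(Aside on the tunable named in the commit message: pmap_mips64.c later in this patch creates "vm.pmap.pg_ps_enabled" as a read-only tunable (CTLFLAG_RDTUN), so it is set from loader.conf and can only be read at runtime. A minimal userland sketch for checking it is shown below; this is illustrative only and not part of the patch.)

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int enabled;
		size_t len = sizeof(enabled);

		/* Read the superpage-promotion tunable added by this change. */
		if (sysctlbyname("vm.pmap.pg_ps_enabled", &enabled, &len,
		    NULL, 0) == -1) {
			perror("vm.pmap.pg_ps_enabled");
			return (1);
		}
		printf("superpage promotion is %s\n",
		    enabled ? "enabled" : "disabled");
		return (0);
	}
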
# NF10BMAC_64BIT opt_netfpga.h # +# Options for hardware with PageMask register support +# + +# Use one large page (currently 16K) for the kernel thread stack +KSTACK_LARGE_PAGE opt_global.h + +# # Options that control the Atheros SoC peripherals # ARGE_DEBUG opt_arge.h diff --git a/sys/mips/include/cpuinfo.h b/sys/mips/include/cpuinfo.h index baf3039..deeb93b 100644 --- a/sys/mips/include/cpuinfo.h +++ b/sys/mips/include/cpuinfo.h @@ -54,6 +54,7 @@ struct mips_cpuinfo { u_int8_t cpu_rev; u_int8_t cpu_impl; u_int8_t tlb_type; + u_int32_t tlb_pgmask; u_int16_t tlb_nentries; u_int8_t icache_virtual; boolean_t cache_coherent_dma; diff --git a/sys/mips/include/param.h b/sys/mips/include/param.h index 90f3e6f..a6a2b0c 100644 --- a/sys/mips/include/param.h +++ b/sys/mips/include/param.h @@ -161,17 +161,38 @@ #define MAXDUMPPGS 1 /* xxx: why is this only one? */ +#ifdef KSTACK_LARGE_PAGE +/* + * For a large kernel stack page the KSTACK_SIZE needs to be a page size + * supported by the hardware (e.g. 16K). + */ +#define KSTACK_SIZE (1 << 14) /* Single 16K page */ +#define KSTACK_PAGE_SIZE KSTACK_SIZE +#define KSTACK_PAGE_MASK (KSTACK_PAGE_SIZE - 1) +#define KSTACK_PAGES (KSTACK_SIZE / PAGE_SIZE) +#define KSTACK_TLBMASK_MASK ((KSTACK_PAGE_MASK >> (TLBMASK_SHIFT - 1)) \ + << TLBMASK_SHIFT) +#define KSTACK_GUARD_PAGES ((KSTACK_PAGE_SIZE * 2) / PAGE_SIZE) + +#else /* ! KSTACK_LARGE_PAGE */ + /* * The kernel stack needs to be aligned on a (PAGE_SIZE * 2) boundary. */ #define KSTACK_PAGES 2 /* kernel stack */ +#define KSTACK_SIZE (KSTACK_PAGES * PAGE_SIZE) +#define KSTACK_PAGE_SIZE PAGE_SIZE +#define KSTACK_PAGE_MASK (PAGE_SIZE - 1) #define KSTACK_GUARD_PAGES 2 /* pages of kstack guard; 0 disables */ +#endif /* ! KSTACK_LARGE_PAGE */ /* * Mach derived conversion macros */ #define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK) #define trunc_page(x) ((x) & ~PAGE_MASK) +#define round_2mpage(x) (((x) + PDRMASK) & ~PDRMASK) +#define trunc_2mpage(x) ((x) & ~PDRMASK) #define atop(x) ((x) >> PAGE_SHIFT) #define ptoa(x) ((x) << PAGE_SHIFT) diff --git a/sys/mips/include/pmap.h b/sys/mips/include/pmap.h index 15b07d3..8a72465 100644 --- a/sys/mips/include/pmap.h +++ b/sys/mips/include/pmap.h @@ -62,6 +62,10 @@ #include #include +#ifdef MIPS64_NEW_PMAP +#include +#endif /* MIPS64_NEW_PMAP */ + /* * Pmap stuff */ @@ -69,8 +73,13 @@ struct pv_entry; struct pv_chunk; struct md_page { - int pv_flags; TAILQ_HEAD(, pv_entry) pv_list; + vm_memattr_t pv_memattr; +#ifdef MIPS64_NEW_PMAP + int pv_gen; +#else /* ! MIPS64_NEW_PMAP */ + int pv_flags; +#endif /* ! 
MIPS64_NEW_PMAP */ }; #define PV_TABLE_REF 0x02 /* referenced */ @@ -80,6 +89,7 @@ struct md_page { #define ASIDGEN_MASK ((1 << ASIDGEN_BITS) - 1) struct pmap { + struct mtx pm_mtx; pd_entry_t *pm_segtab; /* KVA of segment table */ TAILQ_HEAD(, pv_chunk) pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ @@ -88,7 +98,9 @@ struct pmap { u_int32_t gen:ASIDGEN_BITS; /* its generation number */ } pm_asid[MAXSMPCPU]; struct pmap_statistics pm_stats; /* pmap statistics */ - struct mtx pm_mtx; +#ifdef MIPS64_NEW_PMAP + struct vm_radix pm_root; /* spare page table pages */ +#endif /* MIPS64_NEW_PMAP */ }; typedef struct pmap *pmap_t; @@ -120,7 +132,7 @@ extern struct pmap kernel_pmap_store; */ typedef struct pv_entry { vm_offset_t pv_va; /* virtual address for mapping */ - TAILQ_ENTRY(pv_entry) pv_list; + TAILQ_ENTRY(pv_entry) pv_next; } *pv_entry_t; /* @@ -162,13 +174,13 @@ extern vm_offset_t virtual_end; extern vm_paddr_t dump_avail[PHYS_AVAIL_ENTRIES + 2]; -#define pmap_page_get_memattr(m) VM_MEMATTR_DEFAULT -#define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) +#define pmap_page_get_memattr(m) ((m)->md.pv_memattr) #define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) -#define pmap_page_set_memattr(m, ma) (void)0 void pmap_bootstrap(void); void *pmap_mapdev(vm_paddr_t, vm_size_t); +boolean_t pmap_page_is_mapped(vm_page_t m); +void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); void pmap_unmapdev(vm_offset_t, vm_size_t); vm_offset_t pmap_steal_memory(vm_size_t size); void pmap_kenter(vm_offset_t va, vm_paddr_t pa); @@ -178,6 +190,7 @@ void *pmap_kenter_temporary(vm_paddr_t pa, int i); void pmap_kenter_temporary_free(vm_paddr_t pa); void pmap_flush_pvcache(vm_page_t m); int pmap_emulate_modified(pmap_t pmap, vm_offset_t va); +int pmap_emulate_referenced(pmap_t pmap, vm_offset_t va); void pmap_grow_direct_page_cache(void); #endif /* _KERNEL */ diff --git a/sys/mips/include/pte.h b/sys/mips/include/pte.h index 2f2f995..5041983 100644 --- a/sys/mips/include/pte.h +++ b/sys/mips/include/pte.h @@ -29,6 +29,10 @@ #ifndef _MACHINE_PTE_H_ #define _MACHINE_PTE_H_ +#if !defined(_KERNEL) +#include +#endif + #ifndef _LOCORE #if defined(__mips_n64) || defined(__mips_n32) /* PHYSADDR_64_BIT */ typedef uint64_t pt_entry_t; @@ -72,22 +76,25 @@ typedef pt_entry_t *pd_entry_t; * NOTE: This means that for 32-bit use of CP0, we aren't able to set the top * bit of PFN to a non-zero value, as software is using it! This physical * memory size limit may not be sufficiently enforced elsewhere. + * + * XXXRW: On CHERI, bits 63 and 62 are used for additional permissions that + * prevent loading and storing of capabilities, so we have reduced the 55-bit + * shift to 53 bits. */ #if defined(__mips_n64) || defined(__mips_n32) /* PHYSADDR_64_BIT */ -#define TLBLO_SWBITS_SHIFT (55) -#define TLBLO_SWBITS_CLEAR_SHIFT (9) -#define TLBLO_PFN_MASK 0x3FFFFFFC0ULL +#define TLBLO_SWBITS_SHIFT (53) /* XXXRW: Was 55. */ +#define TLBLO_REF_BIT_SHIFT (61) +#define TLBLO_SWBITS_CLEAR_SHIFT (11) /* XXXSS: Was 9. 
*/ +#define TLBLO_PFN_MASK 0xFFFFFFC0ULL +#define TLB_1M_SUPERPAGE_SHIFT (PDRSHIFT) +#define TLBLO_SWBITS_MASK ((pt_entry_t)0x7F << TLBLO_SWBITS_SHIFT) #else #define TLBLO_SWBITS_SHIFT (29) #define TLBLO_SWBITS_CLEAR_SHIFT (3) #define TLBLO_PFN_MASK (0x1FFFFFC0) +#define TLBLO_SWBITS_MASK ((pt_entry_t)0x7 << TLBLO_SWBITS_SHIFT) #endif #define TLBLO_PFN_SHIFT (6) -#define TLBLO_SWBITS_MASK ((pt_entry_t)0x7 << TLBLO_SWBITS_SHIFT) -#define TLBLO_PA_TO_PFN(pa) ((((pa) >> TLB_PAGE_SHIFT) << TLBLO_PFN_SHIFT) & TLBLO_PFN_MASK) -#define TLBLO_PFN_TO_PA(pfn) ((vm_paddr_t)((pfn) >> TLBLO_PFN_SHIFT) << TLB_PAGE_SHIFT) -#define TLBLO_PTE_TO_PFN(pte) ((pte) & TLBLO_PFN_MASK) -#define TLBLO_PTE_TO_PA(pte) (TLBLO_PFN_TO_PA(TLBLO_PTE_TO_PFN((pte)))) /* * XXX This comment is not correct for anything more modern than R4K. @@ -121,12 +128,29 @@ typedef pt_entry_t *pd_entry_t; #endif /* defined(__mips_n64) */ /* + * PTE Hardware Bits (EntryLo0-1 register fields) + * + * Lower bits of a 32 bit PTE: + * + * 28 --------------- 6 5 - 3 2 1 0 + * -------------------------------------- + * | PFN | C | D | VR| G | + * -------------------------------------- + * + * Lower bits of a 64 bit PTE: + * + * 52 -------------------- 34 33 ------------------- 6 5 - 3 2 1 0 + * ---------------------------------------------------------------------- + * | Reserved (Zero) | PFN | C | D | VR| G | + * ---------------------------------------------------------------------- + * * TLB flags managed in hardware: + * PFN: Page Frame Number. * C: Cache attribute. * D: Dirty bit. This means a page is writable. It is not * set at first, and a write is trapped, and the dirty * bit is set. See also PTE_RO. - * V: Valid bit. Obvious, isn't it? + * VR: Valid/Reference bit. See also PTE_SV. * G: Global bit. This means that this mapping is present * in EVERY address space, and to ignore the ASID when * it is matched. @@ -135,11 +159,52 @@ typedef pt_entry_t *pd_entry_t; #define PTE_C_UNCACHED (PTE_C(MIPS_CCA_UNCACHED)) #define PTE_C_CACHE (PTE_C(MIPS_CCA_CACHED)) #define PTE_D 0x04 -#define PTE_V 0x02 +#define PTE_VR 0x02 #define PTE_G 0x01 +#ifdef CPU_CHERI +/* + * CHERI EntryLo extensions that limit storing loading and storing tagged + * values. + */ +#define PTE_SC (0x1ULL << 63) +#define PTE_LC (0x1ULL << 62) +#endif + /* + * PTE Software Bits + * + * Upper bits of a 32 bit PTE: + * + * 31 30 29 + * -------------- + * | MN | W | RO | + * -------------- + * + * Upper bits of a 64 bit PTE: + * + * 63-62 61-60 59 58 -- 56 55 54 53 + * --------------------------------------------- + * | RG | | SV | PG SZ IDX | MN | W | RO | + * --------------------------------------------- + * * VM flags managed in software: + * RG: Region. (Reserved. Currently not used.) + * SV: Soft Valid bit. + * PG SZ IDX: Page Size Index (0-7). + * Index Page Mask (Binary) HW Page Size + * ----- ------------------- ------------ + * 0 0000 0000 0000 0000 4K + * 1 0000 0000 0000 0011 16K + * 2 0000 0000 0000 1111 64K + * 3 0000 0000 0011 1111 256K + * 4 0000 0000 1111 1111 1M + * 5 0000 0011 1111 1111 4M + * 6 0000 1111 1111 1111 16M + * (MIPS 3:) + * 7 0011 1111 1111 1111 64M + * 8 1111 1111 1111 1111 256M (Not currently supported) + * * RO: Read only. Never set PTE_D on this page, and don't * listen to requests to write to it. * W: Wired. ??? 
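(Aside: the page-size index table in the comment above maps directly onto the CP0 PageMask encoding used later in this header, via TLBMASK_IDX_TO_MASK() in the next hunk and the MIPS3_PGMASK_* values at the end of the file. The sketch below is an illustration only, not part of the patch, and assumes TLBMASK_SHIFT is 13, which is consistent with the MIPS3_PGMASK_* constants.)

	#include <stdio.h>

	#define TLBMASK_SHIFT	13	/* assumed; matches MIPS3_PGMASK_* values */

	int
	main(void)
	{
		unsigned idx;

		for (idx = 0; idx <= 7; idx++) {
			/* Same formula as TLBMASK_IDX_TO_MASK() in the next hunk. */
			unsigned long pgmask =
			    ((1UL << (idx << 1)) - 1) << TLBMASK_SHIFT;
			/* Each index step quadruples the size: 4K, 16K, 64K, ... */
			unsigned long pgsize = 4096UL << (idx << 1);

			printf("idx %u: PageMask 0x%08lx -> %luK page\n",
			    idx, pgmask, pgsize >> 10);
		}
		return (0);
	}
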
@@ -151,13 +216,291 @@ typedef pt_entry_t *pd_entry_t; #define PTE_RO ((pt_entry_t)0x01 << TLBLO_SWBITS_SHIFT) #define PTE_W ((pt_entry_t)0x02 << TLBLO_SWBITS_SHIFT) #define PTE_MANAGED ((pt_entry_t)0x04 << TLBLO_SWBITS_SHIFT) +#if defined(__mips_n64) || defined(__mips_n32) /* PHYSADDR_64_BIT */ +#define PTE_PS_16K ((pt_entry_t)0x08 << TLBLO_SWBITS_SHIFT) +#define PTE_PS_64K ((pt_entry_t)0x10 << TLBLO_SWBITS_SHIFT) +#define PTE_PS_256K ((pt_entry_t)0x18 << TLBLO_SWBITS_SHIFT) +#define PTE_PS_1M ((pt_entry_t)0x20 << TLBLO_SWBITS_SHIFT) +#define PTE_PS_4M ((pt_entry_t)0x28 << TLBLO_SWBITS_SHIFT) +#define PTE_PS_16M ((pt_entry_t)0x30 << TLBLO_SWBITS_SHIFT) +#define PTE_PS_64M ((pt_entry_t)0x38 << TLBLO_SWBITS_SHIFT) +#define PTE_PS_IDX_MASK ((pt_entry_t)0x38 << TLBLO_SWBITS_SHIFT) +#define PTE_PSIDX_NBITS_TO_LEFT 5 +#define PTE_PSIDX_NBITS_TO_RIGHT 56 +#define PTE_PFN_NBITS_TO_LEFT 11 +#define PTE_PFN_NBITS_TO_RIGHT 6 +#define PTE_HWFLAGS_NBITS_TO_LEFT 58 +#define SW_VALID 0x40 +#define PTE_SV ((pt_entry_t)SW_VALID << TLBLO_SWBITS_SHIFT) +#else +#define PTE_PS_IDX_MASK 0 +#define PTE_SV 0 +#endif + +/* + * Promotion to a 4MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PROMOTE_MASK (PTE_G | PTE_VALID | PTE_D | PTE_C_UNCACHED | \ + PTE_C_CACHE | PTE_RO | PTE_W | PTE_MANAGED | \ + PTE_REF) + +#ifdef MIPS64_NEW_PMAP +#define TLBLO_PTE_TO_IDX(pte) (((pte) & PTE_PS_IDX_MASK) >> 56) +#define TLBMASK_IDX_TO_MASK(idx) (((1 << ((idx) << 1)) - 1) << TLBMASK_SHIFT) +#define TLBLO_PTE_TO_MASK(pte) TLBMASK_IDX_TO_MASK(TLBLO_PTE_TO_IDX(pte)) +#define TLBMASK_4K_PAGE TLBMASK_IDX_TO_MASK(0) +#define TLBMASK_16K_PAGE TLBMASK_IDX_TO_MASK(1) +#define TLBMASK_64K_PAGE TLBMASK_IDX_TO_MASK(2) +#define TLBMASK_256K_PAGE TLBMASK_IDX_TO_MASK(3) +#define TLBMASK_1M_PAGE TLBMASK_IDX_TO_MASK(4) +#define TLBMASK_4M_PAGE TLBMASK_IDX_TO_MASK(5) +#define TLBMASK_16M_PAGE TLBMASK_IDX_TO_MASK(6) +#define TLBMASK_64M_PAGE TLBMASK_IDX_TO_MASK(7) +#else /* ! MIPS64_NEW_PMAP */ +#define TLBLO_PTE_TO_IDX(pte) 0 +#define TLBLO_PTE_TO_MASK(pte) 0 +#endif /* ! MIPS64_NEW_PMAP */ /* * PTE management functions for bits defined above. 
*/ -#define pte_clear(pte, bit) (*(pte) &= ~(bit)) -#define pte_set(pte, bit) (*(pte) |= (bit)) -#define pte_test(pte, bit) ((*(pte) & (bit)) == (bit)) +#ifndef _LOCORE +static __inline void +pte_clear(pt_entry_t *pte, pt_entry_t bit) +{ + + *pte &= (~bit); +} + +static __inline void +pte_set(pt_entry_t *pte, pt_entry_t bit) +{ + + *pte |= bit; +} + +static __inline int +pte_test(pt_entry_t *pte, pt_entry_t bit) +{ + + return ((*pte & bit) == bit); +} + +static __inline void +pde_clear(pd_entry_t *pde, pt_entry_t bit) +{ + + *(pt_entry_t *)pde &= (~bit); +} + +static __inline void +pde_set(pd_entry_t *pde, pt_entry_t bit) +{ + + *(pt_entry_t *)pde |= bit; +} + +static __inline int +pde_test(pd_entry_t *pde, pt_entry_t bit) +{ + + return ((*(pt_entry_t *)pde & bit) == bit); +} + +static __inline pt_entry_t +TLBLO_PA_TO_PFN(vm_paddr_t pa) +{ + + return (((pa >> TLB_PAGE_SHIFT) << TLBLO_PFN_SHIFT) & TLBLO_PFN_MASK); +} + +static __inline vm_paddr_t +TLBLO_PFN_TO_PA(pt_entry_t pfn) +{ + + return ((vm_paddr_t)(pfn >> TLBLO_PFN_SHIFT) << TLB_PAGE_SHIFT); +} + +static __inline pt_entry_t +TLBLO_PTE_TO_PFN(pt_entry_t pte) +{ + + return (pte & TLBLO_PFN_MASK); +} + +#ifdef MIPS64_NEW_PMAP + +#define PTE_REF PTE_VR +#define PTE_VALID PTE_SV + +#define pte_is_ref(pte) pte_test((pte), PTE_REF) +#define pte_ref_clear(pte) pte_clear((pte), PTE_REF) +#define pte_ref_set(pte) pte_set((pte), PTE_REF) +#define pte_ref_atomic_clear(pte) atomic_clear_long((pte), PTE_REF) +#define pte_ref_atomic_set(pte) atomic_set_long((pte), PTE_REF) + +#else /* ! MIPS64_NEW_PMAP */ + +#define PTE_REF 0 +#define PTE_VALID PTE_VR + +#define pte_is_ref(pte) 0 +#define pte_ref_clear(pte) +#define pte_ref_set(pte) +#define pte_ref_atomic_clear(pte) +#define pte_ref_atomic_set(pte, bit) + +#endif /* ! MIPS64_NEW_PMAP */ + +#define pte_is_valid(pte) pte_test((pte), PTE_VALID) + +#if defined(__mips_n64) || defined(__mips_n32) /* PHYSADDR_64_BIT */ + +#define pte_atomic_clear(pte, bit) atomic_clear_64((pte), bit) +#define pte_atomic_set(pte, bit) atomic_set_64((pte), bit) +#define pte_load_store(ptep, pte) atomic_readandset_64(ptep, pte) +#define pde_load_store(pdep, pde) (pd_entry_t)atomic_readandset_64(\ + (pt_entry_t *)pdep, pde) + +#define pte_atomic_store(ptep, pte) atomic_store_rel_64(ptep, pte) +#define pte_store(ptep, pte) do { \ + *(u_long *)(ptep) = (u_long)(pte); \ +} while (0) +#define pde_store(pdep, pde) pte_store(pdep, pde) + + +#else /* ! PHYSADDR_64_BIT */ + +#define pte_atomic_clear(pte, bit) atomic_clear_32((pte), bit) +#define pte_atomic_set(pte, bit) atomic_set_32((pte), bit) +#define pte_load_store(ptep, pte) atomic_readandset_32(ptep, pte) +#define pde_load_store(pdep, pde) (pd_entry_t)atomic_readandset_32(\ + (pt_entry_t *)pdep, pde) + +#define pte_atomic_store(ptep, pte) atomic_store_rel_32(ptep, pte) +#define pte_store(ptep, pte) do { \ + *(u_int *)(ptep) = (u_int)(pte); \ +} while (0) +#define pde_store(pdep, pde) pte_store(pdep, pde) + +#endif /* ! PHYSADDR_64_BIT */ + +#endif /* ! _LOCORE */ + +#if defined(__mips_n64) || defined(__mips_n32) /* PHYSADDR_64_BIT */ + +#ifndef _LOCORE +/* + * Check to see if a PDE is actually a superpage (PageSize > 4K) PTE. + * + * On __mips_n64 the kernel uses the virtual memory address range from + * VM_MIN_KERNEL_ADDRESS (0xc000000000000000) to VM_MAX_KERNEL_ADDRESS + * (0xc000008000000000). Therefore, a valid virtual address in the PDE + * (a pointer to a page table) will have bits 61 to 40 set to zero. 
A + * superpage will have one of the superpage size bits (bits 58 to 56) + * set. + */ + +/* Is the PDE a superpage of any size? */ +static __inline int +pde_is_superpage(pd_entry_t *pde) +{ + + return (((pt_entry_t)*pde & PTE_PS_IDX_MASK) != 0); +} + +/* Is the PTE a superpage of any size? */ +static __inline int +pte_is_superpage(pt_entry_t *pte) +{ + + return ((*pte & PTE_PS_IDX_MASK) != 0); +} + +/* Is the PDE an 1MB superpage? */ +static __inline int +pde_is_1m_superpage(pd_entry_t *pde) +{ + + return (((pt_entry_t)*pde & PTE_PS_1M) == PTE_PS_1M); +} + +/* Is the PTE an 1MB superpage? */ +static __inline int +pte_is_1m_superpage(pt_entry_t *pte) +{ + + return ((*pte & PTE_PS_1M) == PTE_PS_1M); +} + +/* Physical Address to Superpage Physical Frame Number. */ +static __inline pt_entry_t +TLBLO_PA_TO_SPFN(vm_paddr_t pa) +{ + + return (((pa >> TLB_1M_SUPERPAGE_SHIFT) << TLBLO_PFN_SHIFT) & + TLBLO_PFN_MASK); +} + +/* Superpage Physical Frame Number to Physical Address. */ +static __inline vm_paddr_t +TLBLO_SPFN_TO_PA(pt_entry_t spfn) +{ + + return ((vm_paddr_t)(spfn >> TLBLO_PFN_SHIFT) << + TLB_1M_SUPERPAGE_SHIFT); +} + +/* Superpage Page Table Entry to Physical Address. */ +static __inline vm_paddr_t +TLBLO_SPTE_TO_PA(pt_entry_t pte) +{ + return (TLBLO_SPFN_TO_PA(TLBLO_PTE_TO_PFN(pte))); +} + +static __inline vm_paddr_t +TLBLO_SPDE_TO_PA(pd_entry_t pde) +{ + return (TLBLO_SPFN_TO_PA(TLBLO_PTE_TO_PFN((pt_entry_t)pde))); +} + + +/* An 4KB Page Table Entry to Physical Address. */ +static __inline vm_paddr_t +TLBLO_PTE_TO_PA(pt_entry_t pte) +{ + + return (TLBLO_PFN_TO_PA(TLBLO_PTE_TO_PFN(pte))); +} + +static __inline vm_paddr_t +TLBLO_PDE_TO_PA(pd_entry_t pde) +{ + + return (TLBLO_PFN_TO_PA(TLBLO_PTE_TO_PFN((pt_entry_t)pde))); +} +#endif /* ! _LOCORE */ + +#else /* ! PHYSADDR_64_BIT */ + +#define pte_is_referenced(pte) 0 +#define pte_reference_reset(pte) +#define pte_reference_page(pte) +#define pde_is_superpage(pde) 0 +#define pte_is_superpage(pde) 0 +#define pde_is_1m_superpage(pte) 0 +#define pte_is_1m_superpage(pte) 0 + +#ifndef _LOCORE +static __inline vm_paddr_t +TLBLO_PTE_TO_PA(pt_entry_t pte) +{ + + return (TLBLO_PFN_TO_PA(TLBLO_PTE_TO_PFN(pte))); +} +#endif /* ! _LOCORE */ +#endif /* ! PHYSADDR_64_BIT */ /* Assembly support for PTE access*/ #ifdef LOCORE @@ -167,16 +510,104 @@ typedef pt_entry_t *pd_entry_t; #define PTEMASK 0xff8 #define PTESIZE 8 #define PTE_L ld +#define PTE_S sd #define PTE_MTC0 dmtc0 -#define CLEAR_PTE_SWBITS(pr) -#else +#define CLEAR_PTE_SWBITS(r) + +#ifdef MIPS64_NEW_PMAP + +/* Superpage and referenced bit emulation ASM macros. */ + +/* + * GET_SUPERPAGE_IDX(r) + * + * Get the superpage index from the PTE by shifting it left by + * PTE_PSIDX_NBITS_TO_LEFT (clearing the upper softbits) and then back to the + * right by (PTE_PSIDX_NBITS_TO_RIGHT + PTE_PSIDX_NBITS_TO_RIGHT) clearing + * all the lower bits in the process. + */ +#define GET_SUPERPAGE_IDX(r) \ + dsll r, (PTE_PSIDX_NBITS_TO_LEFT); \ + dsrl32 r, (PTE_PSIDX_NBITS_TO_RIGHT + PTE_PSIDX_NBITS_TO_LEFT - 32) + +/* + * GET_HW_TLB_FLAGS(r) + * + * Get the lower hardware TLB flags but shifting left then right. + */ +#define GET_HW_TLB_FLAGS(r) \ + dsll32 r, (PTE_HWFLAGS_NBITS_TO_LEFT - 32); \ + dsrl32 r, (PTE_HWFLAGS_NBITS_TO_LEFT - 32) + +/* + * GET_ODD_1M_PFN_FROM_EVEN(r) + * + * Get the odd 1M PFN (TLB lo1) from the even 1M PTE. First, mask out the PFN + * from the even PTE. Then add 1M worth of pages to it (256). Finally, shift it + * back to its position in the PTE. 
+ */ +#define GET_ODD_1M_PFN_FROM_EVEN(r) \ + dsll r, (PTE_PFN_NBITS_TO_LEFT); \ + dsrl r, (PTE_PFN_NBITS_TO_LEFT + PTE_PFN_NBITS_TO_RIGHT); \ + daddiu r, r, (1024 * 1024 / PAGE_SIZE); \ + dsll r, (PTE_PFN_NBITS_TO_RIGHT) + +/* + * IF_VALID_SET_REFBIT(r0, r1, offset, unique) + * + * If a PDE is valid then set the referenced bit (PTE_VR). The first version + * does it atomically. + */ +#define ATOMIC_REFBIT_UPDATE +#ifdef ATOMIC_REFBIT_UPDATE + +#define IF_VALID_SET_REFBIT(r0, r1, offset, unique) \ +try_again ## unique ## : ; \ + dsrl32 r0, (TLBLO_SWBITS_SHIFT - 32); \ + andi r0, r0, SW_VALID; \ + beqz r0, not_valid ## unique ; \ + PTE_L r0, offset ## (r1) ; \ + lld r0, offset ## (r1) ; \ + ori r0, r0, PTE_VR ; \ + scd r0, offset ## (r1) ; \ + beqz r0, try_again ## unique ; \ + PTE_L r0, offset ## (r1) ; \ +not_valid ## unique ## : + +#else /* ! ATOMIC_REFBIT_UPDATE */ + +#define IF_VALID_SET_REFBIT(r0, r1, offset, unique) \ +try_again ## unique ## : ; \ + dsrl32 r0, (TLBLO_SWBITS_SHIFT - 32) ; \ + andi r0, r0, SW_VALID ; \ + beqz r0, not_valid ## unique ; \ + PTE_L r0, offset ## (r1) ; \ + ori r0, r0, PTE_VR ; \ + PTE_S r0, offset ## (r1) ; \ +not_valid ## unique ## : +#endif /* ! ATOMIC_REFBIT_UPDATE */ + +#else /* ! MIPS64_NEW_PMAP */ + +#define GET_SUPERPAGE_IDX(r) +#define GET_HW_TLB_FLAGS(r) +#define IF_VALID_SET_REFBIT(r0, r1, offset, unique) + +#endif /* ! MIPS64_NEW_PMAP */ + +#else /* ! LOCORE */ #define PTESHIFT 2 #define PTE2MASK 0xff8 /* for the 2-page lo0/lo1 */ #define PTEMASK 0xffc #define PTESIZE 4 #define PTE_L lw +#define PTE_S sw #define PTE_MTC0 mtc0 #define CLEAR_PTE_SWBITS(r) LONG_SLL r, TLBLO_SWBITS_CLEAR_SHIFT; LONG_SRL r, TLBLO_SWBITS_CLEAR_SHIFT /* remove swbits */ + +#define IS_PTE_VALID(r0, r1, offset, label) +#define SET_REF_BIT(r0, r1, offset) + #endif /* defined(__mips_n64) || defined(__mips_n32) */ #if defined(__mips_n64) @@ -188,4 +619,17 @@ typedef pt_entry_t *pd_entry_t; #endif #endif /* LOCORE */ + +/* PageMask Register (CP0 Register 5, Select 0) Values */ +#define MIPS3_PGMASK_MASKX 0x00001800 +#define MIPS3_PGMASK_4K 0x00000000 +#define MIPS3_PGMASK_16K 0x00006000 +#define MIPS3_PGMASK_64K 0x0001e000 +#define MIPS3_PGMASK_256K 0x0007e000 +#define MIPS3_PGMASK_1M 0x001fe000 +#define MIPS3_PGMASK_4M 0x007fe000 +#define MIPS3_PGMASK_16M 0x01ffe000 +#define MIPS3_PGMASK_64M 0x07ffe000 +#define MIPS3_PGMASK_256M 0x1fffe000 + #endif /* !_MACHINE_PTE_H_ */ diff --git a/sys/mips/include/vmparam.h b/sys/mips/include/vmparam.h index 89caf27..3b74849 100644 --- a/sys/mips/include/vmparam.h +++ b/sys/mips/include/vmparam.h @@ -100,15 +100,47 @@ #define FREEBSD32_USRSTACK (((vm_offset_t)0x80000000) - PAGE_SIZE) #endif +#ifdef MIPS64_NEW_PMAP /* - * Disable superpage reservations. (not sure if this is right - * I copied it from ARM) + * Enable superpage reservations: 1 level. + * + * VM_NRESERVLEVEL specifies a number of promotion levels enabled. + * Currently mips64 only supports one size or level (VM_LEVEL_0_ORDER) of + * superpages (2MB) + */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 1 +#endif + +/* + * Level 0 reservations consist of 512 (2^9) pages (2MB). + */ +#ifndef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER 9 +#endif + +/* + * The largest allocation size is 4MB. + */ +#define VM_NFREEORDER 11 + +#else /* ! MIPS64_NEW_PMAP */ + +/* + * Disable superpage reservations. */ #ifndef VM_NRESERVLEVEL #define VM_NRESERVLEVEL 0 #endif /* + * The largest allocation size is 1MB. + */ +#define VM_NFREEORDER 9 + +#endif /* ! 
MIPS64_NEW_PMAP */ + +/* * How many physical pages per kmem arena virtual page. */ #ifndef VM_KMEM_SIZE_SCALE @@ -178,11 +210,6 @@ #define VM_LOWMEM_BOUNDARY ((vm_paddr_t)0x20000000) #endif -/* - * The largest allocation size is 1MB. - */ -#define VM_NFREEORDER 9 - #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ #ifndef __mips_n64 diff --git a/sys/mips/mips/cpu.c b/sys/mips/mips/cpu.c index 469ab81..bb1c0f0 100644 --- a/sys/mips/mips/cpu.c +++ b/sys/mips/mips/cpu.c @@ -214,6 +214,25 @@ mips_get_identity(struct mips_cpuinfo *cpuinfo) cpuinfo->l2.dc_size = cpuinfo->l2.dc_linesize * cpuinfo->l2.dc_nsets * cpuinfo->l2.dc_nways; #endif + + /* + * Probe PageMask register to see what sizes of pages are supported + * by writing all one's and then reading it back. + */ + mips_wr_pagemask(~0); + cpuinfo->tlb_pgmask = mips_rd_pagemask(); + mips_wr_pagemask(MIPS3_PGMASK_4K); + +#ifdef KSTACK_LARGE_PAGE + if ((cpuinfo->tlb_pgmask & MIPS3_PGMASK_16K) == 0) + panic("%s: 16K sized pages are not supported by this CPU.", + __func__); +#endif /* KSTACK_LARGE_PAGE */ +#ifdef MIPS64_NEW_PMAP + if ((cpuinfo->tlb_pgmask & MIPS3_PGMASK_1M) == 0) + panic("%s: 1M sized pages are not supported by this CPU.", + __func__); +#endif /* MIPS64_NEW_PMAP */ } void @@ -289,9 +308,34 @@ cpu_identify(void) } else if (cpuinfo.tlb_type == MIPS_MMU_FIXED) { printf("Fixed mapping"); } - printf(", %d entries\n", cpuinfo.tlb_nentries); + printf(", %d entries ", cpuinfo.tlb_nentries); + if (cpuinfo.tlb_pgmask) { + printf("("); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_MASKX) + printf("1K "); + printf("4K "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_16K) + printf("16K "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_64K) + printf("64K "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_256K) + printf("256K "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_1M) + printf("1M "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_4M) + printf("4M "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_16M) + printf("16M "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_64M) + printf("64M "); + if (cpuinfo.tlb_pgmask & MIPS3_PGMASK_256M) + printf("256M "); + printf("pg sizes)"); + } + printf("\n"); } + printf(" L1 i-cache: "); if (cpuinfo.l1.ic_linesize == 0) { printf("disabled"); diff --git a/sys/mips/mips/exception.S b/sys/mips/mips/exception.S index 6eddd19..c46fbd5 100644 --- a/sys/mips/mips/exception.S +++ b/sys/mips/mips/exception.S @@ -92,6 +92,28 @@ dtrace_invop_calltrap_addr: */ #define INTRCNT_COUNT 256 +/* + * General MIPS CPU state for exceptions: + * + * EPC Register will point to the instruction that caused fault, unless the + * faulting instruction was in a branch delay slot. In that case, it will + * point to the branch before the branch delay slot instruction. + * + * The cause register will contain what caused the exception and some state + * about the interrupt. + * + * The status register contains information about the status of the CPU such + * as: Kernel/User mode bit, interrupt enable bit. + * + * The BadVaddr register contains the virtual address that cause the last + * exception. + * + * The Context register contains the lower 22 bits of the VPN (starting at + * bit 4) that cause the last exception except bit0 and bit1 are zero. The + * upper bits (bits 23 to 31 for MIPS32 and bits 23 to 63) are set under + * kernel control (i.e. point to the page table). The Context/XContext + * registers are not currently used by FreeBSD. 
+ */ /* *---------------------------------------------------------------------------- @@ -117,13 +139,27 @@ VECTOR_END(MipsTLBMiss) /* *---------------------------------------------------------------------------- * - * MipsDoTLBMiss -- + * MipsDoTLBMiss -- (UTLB miss) * - * This is the real TLB Miss Handler code. + * This is the real TLB Miss Handler code. A miss was generated when the + * access is to kuseg and there was not matching mapping loaded into the TLB. * 'segbase' points to the base of the segment table for user processes. * + * The CPU does the following for an UTLB miss: + * - Sets the EPC register. + * - Sets the Cause register. + * - Sets the Status register. Shifts K/U and IE bits over one and clears + * the current Kernel/User and Interrupt Enable bits. So the processor + * is in kernel mode with the interupts turned off. + * - Sets BadVaddr register. + * - Sets the Context/XContext register(s). + * - Sets the TLB EntryHi register to contain VPN of the faulting address. + * * Don't check for invalid pte's here. We load them as well and * let the processor trap to load the correct value after service. + * + * XXX This really needs to be changed to a linear page table and use the + * Context and XContext registers. That is really what it was designed for. *---------------------------------------------------------------------------- */ .set push @@ -144,25 +180,89 @@ MipsDoTLBMiss: #ifdef __mips_n64 PTR_SRL k0, PDRSHIFT - PTRSHIFT # k0=VPN andi k0, k0, PDEPTRMASK # k0=pde offset - PTR_ADDU k1, k0, k1 # k1=pde entry address - PTR_L k1, 0(k1) # k1=pde entry - MFC0 k0, MIPS_COP_0_BAD_VADDR # k0=bad address (again) + PTR_ADDU k0, k0, k1 # k1=pde entry address + PTR_L k1, 0(k0) # k1=pde entry beq k1, zero, 2f # ==0 -- no page table + nop + +#ifdef MIPS64_NEW_PMAP + # Check for superpage + GET_SUPERPAGE_IDX(k1) # k1=superpage index from PTE + beq k1, zero, not_superpage # ==0 -- not superpage + PTR_L k1, 0(k0) # k1=pde entry (delay slot) + + # Set the referenced bit in the PDE if valid. + # + # XXX Setting the referenced bit here saves a fault later but it + # may not be safe to do so. Therefore, just take the fault to set + # the reference bit. +# IF_VALID_SET_REFBIT(k1, k0, 0, 1) + + # The PDE is actually a superpage PTE. Store it in the TLB lo0 reg. + CLEAR_PTE_SWBITS(k1) + PTE_MTC0 k1, MIPS_COP_0_TLB_LO0 # lo0 is loaded + COP0_SYNC + + # Compute the PFN for the TLB lo1 register from k1(=PTE for TLB lo0). + GET_ODD_1M_PFN_FROM_EVEN(k1) # k1=Odd PFN in PTE postion + + # Get hard TLB flag bits. + PTR_L k0, 0(k0) # k0=pde entry (again) + GET_HW_TLB_FLAGS(k0) # k0=hw TLB flag bits + or k1, k1, k0 # k1=PTE=PFN | hwflg bits + # Load it into the TLB Lo1 register. + #CLEAR_PTE_SWBITS(k1) # No SW bits to clear + PTE_MTC0 k1, MIPS_COP_0_TLB_LO1 # lo1 is loaded + COP0_SYNC + + # Load the TLB PageMask for 1M pages. + dli k0, TLBMASK_1M_PAGE # PageMask for 1M Page + PTE_MTC0 k0, MIPS_COP_0_TLB_PG_MASK # PageMask is loaded + COP0_SYNC + + tlbwr # write to tlb + HAZARD_DELAY + PTE_MTC0 zero, MIPS_COP_0_TLB_PG_MASK # zero out PageMask reg + COP0_SYNC + eret # return from exception + +not_superpage: +#endif /* MIPS64_NEW_PMAP */ #endif + MFC0 k0, MIPS_COP_0_BAD_VADDR # k0=bad address (again) PTR_SRL k0, PAGE_SHIFT - PTESHIFT #0b: k0=VPN (aka va>>10) andi k0, k0, PTE2MASK #0c: k0=page tab offset PTR_ADDU k1, k1, k0 #0d: k1=pte address - PTE_L k0, 0(k1) #0e: k0=lo0 pte - PTE_L k1, PTESIZE(k1) #0f: k1=lo0 pte + + PTE_L k0, 0(k1) # k0=lo0 pte + + # Set the referenced bit in the PDE if valid. 
+ # + # XXX Setting the referenced bit here saves a fault later but it + # may not be safe to do so. Therefore, just take the fault to set + # the reference bit. +# IF_VALID_SET_REFBIT(k0, k1, 0, 2) + CLEAR_PTE_SWBITS(k0) PTE_MTC0 k0, MIPS_COP_0_TLB_LO0 #12: lo0 is loaded COP0_SYNC - CLEAR_PTE_SWBITS(k1) - PTE_MTC0 k1, MIPS_COP_0_TLB_LO1 #15: lo1 is loaded + + PTE_L k0, PTESIZE(k1) # k0=lo1 pte + + # Set the referenced bit in the PDE if valid. + # + # XXX Setting the referenced bit here saves a fault later but it + # may not be safe to do so. Therefore, just take the fault to set + # the reference bit. +# IF_VALID_SET_REFBIT(k0, k1, 0, 3) + + CLEAR_PTE_SWBITS(k0) + PTE_MTC0 k0, MIPS_COP_0_TLB_LO1 #15: lo1 is loaded COP0_SYNC tlbwr #1a: write to tlb HAZARD_DELAY eret #1f: retUrn from exception + 1: j MipsTLBMissException #20: kernel exception nop #21: branch delay slot 2: j SlowFault #22: no page table present @@ -266,7 +366,7 @@ SlowFault: #endif /* - * Save CPU and CP0 register state. + * Save CPU and CP0 register state when taking an exception in kernel mode. * * This is straightforward except for saving the exception program * counter. The ddb backtrace code looks for the first instruction @@ -407,6 +507,7 @@ NESTED_NOPROFILE(MipsKernGenException, KERN_EXC_FRAME_SIZE, ra) SAVE_REG(a1, SR, sp) RESTORE_CPU # v0 contains the return address. sync + eret .set at END(MipsKernGenException) @@ -877,12 +978,51 @@ LEAF_NOPROFILE(MipsTLBInvalidException) PTR_SRL k0, PDRSHIFT - PTRSHIFT # k0=pde offset (almost) beq k1, zero, MipsKernGenException # ==0 -- no pde tab andi k0, k0, PDEPTRMASK # k0=pde offset - PTR_ADDU k1, k0, k1 # k1=pde entry address - PTR_L k1, 0(k1) # k1=pde entry + PTR_ADDU k0, k0, k1 # k0=pde entry address + PTR_L k1, 0(k0) # k1=pde entry /* Validate pde table pointer. */ beqz k1, 3f nop + +#ifdef MIPS64_NEW_PMAP + # Check for superpage + GET_SUPERPAGE_IDX(k1) # k1=superpage index from PTE + beq k1, zero, not_spg # ==0 -- not superpage + PTR_L k1, 0(k0) # k1=pde entry (delay slot) + + /* Validate page table entry. */ + andi k1, PTE_VR + beqz k1, 3f + nop + + # The PDE is actually a superpage PTE. Store it in the TLB lo0 reg. + CLEAR_PTE_SWBITS(k1) + PTE_MTC0 k1, MIPS_COP_0_TLB_LO0 # lo0 is loaded + COP0_SYNC + + # Compute the PFN for the TLB lo1 register from k1(=PTE for TLB lo0). + GET_ODD_1M_PFN_FROM_EVEN(k1) # k1=Odd PFN in PTE postion + + # Get hard TLB flag bits. + PTR_L k0, 0(k0) # k0=pde entry (again) + GET_HW_TLB_FLAGS(k0) # k0=hw TLB flag bits + or k1, k1, k0 # k1=PTE=PFN | hwflg bits + # Load it into the TLB Lo1 register. + # CLEAR_PTE_SWBITS(k1) # No SW bits to clear + PTE_MTC0 k1, MIPS_COP_0_TLB_LO1 # lo1 is loaded + COP0_SYNC + + # Load the TLB PageMask for 1M pages. + dli k0, TLBMASK_1M_PAGE # PageMask for 1M Page + PTE_MTC0 k0, MIPS_COP_0_TLB_PG_MASK # PageMask is loaded + COP0_SYNC + + b tlb_insert_entry + nop + +not_spg: +#endif /* MIPS64_NEW_PMAP */ #endif MFC0 k0, MIPS_COP_0_BAD_VADDR # k0=bad address (again) PTR_SRL k0, PAGE_SHIFT - PTESHIFT # k0=VPN @@ -891,7 +1031,7 @@ LEAF_NOPROFILE(MipsTLBInvalidException) PTE_L k0, 0(k1) # k0=this PTE /* Validate page table entry. 
*/ - andi k0, PTE_V + andi k0, PTE_VR beqz k0, 3f nop @@ -901,12 +1041,15 @@ LEAF_NOPROFILE(MipsTLBInvalidException) nop PTE_L k0, 0(k1) - PTE_L k1, PTESIZE(k1) + CLEAR_PTE_SWBITS(k0) PTE_MTC0 k0, MIPS_COP_0_TLB_LO0 COP0_SYNC - CLEAR_PTE_SWBITS(k1) - PTE_MTC0 k1, MIPS_COP_0_TLB_LO1 + + PTE_L k0, PTESIZE(k1) + + CLEAR_PTE_SWBITS(k0) + PTE_MTC0 k0, MIPS_COP_0_TLB_LO1 COP0_SYNC b tlb_insert_entry @@ -914,12 +1057,15 @@ LEAF_NOPROFILE(MipsTLBInvalidException) odd_page: PTE_L k0, -PTESIZE(k1) - PTE_L k1, 0(k1) + CLEAR_PTE_SWBITS(k0) PTE_MTC0 k0, MIPS_COP_0_TLB_LO0 COP0_SYNC - CLEAR_PTE_SWBITS(k1) - PTE_MTC0 k1, MIPS_COP_0_TLB_LO1 + + PTE_L k0, 0(k1) + + CLEAR_PTE_SWBITS(k0) + PTE_MTC0 k0, MIPS_COP_0_TLB_LO1 COP0_SYNC tlb_insert_entry: @@ -929,11 +1075,15 @@ tlb_insert_entry: bltz k0, tlb_insert_random nop tlbwi + PTE_MTC0 zero, MIPS_COP_0_TLB_PG_MASK + COP0_SYNC eret ssnop tlb_insert_random: tlbwr + PTE_MTC0 zero, MIPS_COP_0_TLB_PG_MASK + COP0_SYNC eret ssnop @@ -1047,21 +1197,67 @@ LEAF_NOPROFILE(MipsTLBMissException) #ifdef __mips_n64 PTR_SRL k0, PDRSHIFT - PTRSHIFT # k0=VPN andi k0, k0, PDEPTRMASK # k0=pde offset - PTR_ADDU k1, k0, k1 # k1=pde entry address - PTR_L k1, 0(k1) # k1=pde entry + PTR_ADDU k0, k0, k1 # k1=pde entry address + PTR_L k1, 0(k0) # k1=pde entry + +#ifdef MIPS64_NEW_PMAP + # Check for superpage + GET_SUPERPAGE_IDX(k1) # k1=superpage index from PTE + beq k1, zero, not_kspg # ==0 -- not superpage + PTR_L k1, 0(k0) # k1=pde entry (delay slot) + + # XXX Reference bit emulation + + # The PDE is actually a superpage PTE. Store it in the TLB lo0 reg. + CLEAR_PTE_SWBITS(k1) + PTE_MTC0 k1, MIPS_COP_0_TLB_LO0 # lo0 is loaded + COP0_SYNC + + # Compute the PFN for the TLB lo1 register from k1(=PTE for TLB lo0). + GET_ODD_1M_PFN_FROM_EVEN(k1) # k1=Odd PFN in PTE postion + + # Get hard TLB flag bits. + PTR_L k0, 0(k0) # k0=pde entry (again) + GET_HW_TLB_FLAGS(k0) # k0=hw TLB flag bits + or k1, k1, k0 # k1=PTE=PFN | hwflg bits + # Load it into the TLB Lo1 register. + #CLEAR_PTE_SWBITS(k1) # No SW Bits to clear + PTE_MTC0 k1, MIPS_COP_0_TLB_LO1 # lo1 is loaded + COP0_SYNC + + # Load the TLB PageMask for 1M pages. 
+ dli k0, TLBMASK_1M_PAGE # PageMask for 1M Page + PTE_MTC0 k0, MIPS_COP_0_TLB_PG_MASK # PageMask is loaded + COP0_SYNC + + tlbwr # write to tlb + HAZARD_DELAY + PTE_MTC0 zero, MIPS_COP_0_TLB_PG_MASK # zero out PageMask reg + COP0_SYNC + eret # return from exception + +not_kspg: +#endif /* MIPS64_NEW_PMAP */ + MFC0 k0, MIPS_COP_0_BAD_VADDR # k0=bad address (again) beq k1, zero, MipsKernGenException # ==0 -- no page table #endif PTR_SRL k0, PAGE_SHIFT - PTESHIFT # k0=VPN andi k0, k0, PTE2MASK # k0=page tab offset PTR_ADDU k1, k1, k0 # k1=pte address + PTE_L k0, 0(k1) # k0=lo0 pte - PTE_L k1, PTESIZE(k1) # k1=lo1 pte + + # XXX Reference bit emulation + CLEAR_PTE_SWBITS(k0) PTE_MTC0 k0, MIPS_COP_0_TLB_LO0 # lo0 is loaded COP0_SYNC - CLEAR_PTE_SWBITS(k1) - PTE_MTC0 k1, MIPS_COP_0_TLB_LO1 # lo1 is loaded + + PTE_L k0, PTESIZE(k1) # k0=lo1 pte + + CLEAR_PTE_SWBITS(k0) + PTE_MTC0 k0, MIPS_COP_0_TLB_LO1 # lo1 is loaded COP0_SYNC tlbwr # write to tlb HAZARD_DELAY diff --git a/sys/mips/mips/genassym.c b/sys/mips/mips/genassym.c index 3c0c1cc..07eb01f 100644 --- a/sys/mips/mips/genassym.c +++ b/sys/mips/mips/genassym.c @@ -99,6 +99,9 @@ ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MDTD_COP2USED, MDTD_COP2USED); +#ifdef KSTACK_LARGE_PAGE +ASSYM(KSTACK_TLBMASK_MASK, KSTACK_TLBMASK_MASK); +#endif ASSYM(MIPS_KSEG0_START, MIPS_KSEG0_START); ASSYM(MIPS_KSEG1_START, MIPS_KSEG1_START); diff --git a/sys/mips/mips/machdep.c b/sys/mips/mips/machdep.c index 5d25d88..f36e497 100644 --- a/sys/mips/mips/machdep.c +++ b/sys/mips/mips/machdep.c @@ -283,9 +283,9 @@ mips_proc0_init(void) #endif proc_linkup0(&proc0, &thread0); - KASSERT((kstack0 & PAGE_MASK) == 0, - ("kstack0 is not aligned on a page boundary: 0x%0lx", - (long)kstack0)); + KASSERT((kstack0 & ((KSTACK_PAGE_SIZE * 2) - 1)) == 0, + ("kstack0 is not aligned on a page (0x%0lx) boundary: 0x%0lx", + (long)(KSTACK_PAGE_SIZE * 2), (long)kstack0)); thread0.td_kstack = kstack0; thread0.td_kstack_pages = KSTACK_PAGES; /* @@ -459,7 +459,7 @@ mips_pcpu_tlb_init(struct pcpu *pcpu) * We use a wired tlb index to do this one-time mapping. 
*/ pa = vtophys(pcpu); - pte = PTE_D | PTE_V | PTE_G | PTE_C_CACHE; + pte = PTE_D | PTE_VALID | PTE_REF | PTE_G | PTE_C_CACHE; tlb_insert_wired(PCPU_TLB_ENTRY, (vm_offset_t)pcpup, TLBLO_PA_TO_PFN(pa) | pte, TLBLO_PA_TO_PFN(pa + PAGE_SIZE) | pte); diff --git a/sys/mips/mips/minidump_machdep.c b/sys/mips/mips/minidump_machdep.c index 2122e00..d35f597 100644 --- a/sys/mips/mips/minidump_machdep.c +++ b/sys/mips/mips/minidump_machdep.c @@ -180,7 +180,7 @@ minidumpsys(struct dumperinfo *di) pte = pmap_pte(kernel_pmap, va); KASSERT(pte != NULL, ("pte for %jx is NULL", (uintmax_t)va)); for (i = 0; i < NPTEPG; i++) { - if (pte_test(&pte[i], PTE_V)) { + if (pte_is_valid(&pte[i])) { pa = TLBLO_PTE_TO_PA(pte[i]); if (is_dumpable(pa)) dump_add_page(pa); diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index 2ca8ed5..6d2c8ae 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -231,7 +231,7 @@ pmap_lmem_map1(vm_paddr_t phys) sysm = &sysmap_lmem[cpu]; sysm->saved_intr = intr; va = sysm->base; - npte = TLBLO_PA_TO_PFN(phys) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; + npte = TLBLO_PA_TO_PFN(phys) | PTE_C_CACHE | PTE_D | PTE_VALID | PTE_G; pte = pmap_pte(kernel_pmap, va); *pte = npte; sysm->valid1 = 1; @@ -253,10 +253,10 @@ pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) sysm->saved_intr = intr; va1 = sysm->base; va2 = sysm->base + PAGE_SIZE; - npte = TLBLO_PA_TO_PFN(phys1) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; + npte = TLBLO_PA_TO_PFN(phys1) | PTE_C_CACHE | PTE_D | PTE_VALID | PTE_G; pte = pmap_pte(kernel_pmap, va1); *pte = npte; - npte = TLBLO_PA_TO_PFN(phys2) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; + npte = TLBLO_PA_TO_PFN(phys2) | PTE_C_CACHE | PTE_D | PTE_VALID | PTE_G; pte = pmap_pte(kernel_pmap, va2); *pte = npte; sysm->valid1 = 1; @@ -546,9 +546,15 @@ again: msgbufinit(msgbufp, msgbufsize); /* - * Steal thread0 kstack. + * Steal thread0 kstack. This must be aligned to + * (KSTACK_PAGE_SIZE * 2) so it can mapped to a single TLB entry. + * + * XXX There should be a better way of getting aligned memory + * with pmap_steal_memory(). 
*/ - kstack0 = pmap_steal_memory(KSTACK_PAGES << PAGE_SHIFT); + kstack0 = pmap_steal_memory((KSTACK_PAGES + KSTACK_GUARD_PAGES) \ + << PAGE_SHIFT); + kstack0 = roundup2(kstack0, (KSTACK_PAGE_SIZE * 2)); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; @@ -595,6 +601,7 @@ pmap_page_init(vm_page_t m) TAILQ_INIT(&m->md.pv_list); m->md.pv_flags = 0; + m->md.pv_memattr = VM_MEMATTR_DEFAULT; } /* @@ -791,7 +798,7 @@ retry: ptep = pmap_pte(pmap, va); if (ptep != NULL) { pte = *ptep; - if (pte_test(&pte, PTE_V) && (!pte_test(&pte, PTE_RO) || + if (pte_test(&pte, PTE_VALID) && (!pte_test(&pte, PTE_RO) || (prot & VM_PROT_WRITE) == 0)) { pte_pa = TLBLO_PTE_TO_PA(pte); if (vm_page_pa_tryrelock(pmap, pte_pa, &pa)) @@ -824,9 +831,9 @@ pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int attr) pte = pmap_pte(kernel_pmap, va); opte = *pte; - npte = TLBLO_PA_TO_PFN(pa) | attr | PTE_D | PTE_V | PTE_G; + npte = TLBLO_PA_TO_PFN(pa) | attr | PTE_D | PTE_VALID | PTE_G; *pte = npte; - if (pte_test(&opte, PTE_V) && opte != npte) + if (pte_test(&opte, PTE_VALID) && opte != npte) pmap_update_page(kernel_pmap, va, npte); } @@ -1427,7 +1434,7 @@ pmap_pv_reclaim(pmap_t locked_pmap) if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pc->pc_map[field] |= 1UL << bit; @@ -1605,9 +1612,9 @@ pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { - TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } @@ -1649,7 +1656,7 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); @@ -1722,7 +1729,7 @@ pmap_remove_page(struct pmap *pmap, vm_offset_t va) /* * If there is no pte for this address, just skip it! 
*/ - if (!pte_test(ptq, PTE_V)) + if (!pte_test(ptq, PTE_VALID)) return; (void)pmap_remove_pte(pmap, ptq, va, *pde); @@ -1788,7 +1795,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { - if (!pte_test(pte, PTE_V)) { + if (!pte_test(pte, PTE_VALID)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; @@ -1846,7 +1853,7 @@ pmap_remove_all(vm_page_t m) * If it's last mapping writeback all caches from * the page being destroyed */ - if (TAILQ_NEXT(pv, pv_list) == NULL) + if (TAILQ_NEXT(pv, pv_next) == NULL) mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); pmap->pm_stats.resident_count--; @@ -1875,7 +1882,7 @@ pmap_remove_all(vm_page_t m) } pmap_invalidate_page(pmap, pv->pv_va); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); pmap_unuse_pt(pmap, pv->pv_va, *pde); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); @@ -1937,7 +1944,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pbits = *pte; - if (!pte_test(&pbits, PTE_V) || pte_test(&pbits, + if (!pte_test(&pbits, PTE_VALID) || pte_test(&pbits, PTE_RO)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); @@ -2049,7 +2056,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, /* * Mapping has not changed, must be protection or wiring change. */ - if (pte_test(&origpte, PTE_V) && opa == pa) { + if (pte_test(&origpte, PTE_VALID) && opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there @@ -2113,7 +2120,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); newpte |= PTE_MANAGED; if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); @@ -2138,7 +2145,7 @@ validate: */ if (origpte != newpte) { *pte = newpte; - if (pte_test(&origpte, PTE_V)) { + if (pte_test(&origpte, PTE_VALID)) { if (pte_test(&origpte, PTE_MANAGED) && opa != pa) { if (om->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(om, PGA_REFERENCED); @@ -2246,7 +2253,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, } pte = pmap_pte(pmap, va); - if (pte_test(pte, PTE_V)) { + if (pte_test(pte, PTE_VALID)) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; @@ -2276,7 +2283,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, /* * Now validate mapping with RO protection */ - *pte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_V; + *pte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_VALID; if ((m->oflags & VPO_UNMANAGED) == 0) *pte |= PTE_MANAGED; @@ -2334,7 +2341,7 @@ pmap_kenter_temporary(vm_paddr_t pa, int i) cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; /* Since this is for the debugger, no locks or any other fun */ - npte = TLBLO_PA_TO_PFN(pa) | PTE_C_CACHE | PTE_D | PTE_V | + npte = TLBLO_PA_TO_PFN(pa) | PTE_C_CACHE | PTE_D | PTE_VALID | PTE_G; pte = pmap_pte(kernel_pmap, sysm->base); *pte = npte; @@ -2462,7 +2469,7 @@ pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { - if (!pte_test(pte, PTE_V)) + if (!pte_test(pte, PTE_VALID)) continue; if (!pte_test(pte, PTE_W)) panic("pmap_unwire: pte 
%#jx is missing PG_W", @@ -2656,7 +2663,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; @@ -2670,6 +2677,16 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) } /* + * Returns TRUE if the given page is mapped. + */ +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + + return (!TAILQ_EMPTY(&(m)->md.pv_list)); +} + +/* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but @@ -2709,7 +2726,7 @@ pmap_remove_pages(pmap_t pmap) KASSERT(pde != NULL && *pde != 0, ("pmap_remove_pages: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); - if (!pte_test(pte, PTE_V)) + if (!pte_test(pte, PTE_VALID)) panic("pmap_remove_pages: bad pte"); tpte = *pte; @@ -2739,7 +2756,7 @@ pmap_remove_pages(pmap_t pmap) pv_entry_count--; pc->pc_map[field] |= bitmask; pmap->pm_stats.resident_count--; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, *pde); @@ -2770,7 +2787,7 @@ pmap_testbit(vm_page_t m, int bit) return (rv); rw_assert(&pvh_global_lock, RA_WLOCKED); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); @@ -2800,7 +2817,7 @@ pmap_page_wired_mappings(vm_page_t m) if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); @@ -2834,11 +2851,11 @@ pmap_remove_write(vm_page_t m) if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); - KASSERT(pte != NULL && pte_test(pte, PTE_V), + KASSERT(pte != NULL && pte_test(pte, PTE_VALID), ("page on pv_list has no pte")); pbits = *pte; if (pte_test(&pbits, PTE_D)) { @@ -2977,7 +2994,7 @@ pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { - if (!pte_test(pte, PTE_MANAGED | PTE_V)) { + if (!pte_test(pte, PTE_MANAGED | PTE_VALID)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; @@ -3043,7 +3060,7 @@ pmap_clear_modify(vm_page_t m) if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); @@ -3131,6 +3148,31 @@ pmap_unmapdev(vm_offset_t va, vm_size_t size) } /* + * Sets the memory attribute for the specified page. + */ +void +pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) +{ + + /* + * Set the memattr field so the appropriate bits are set in the + * PTE as mappings are created. + */ + m->md.pv_memattr = ma; + + /* + * It is assumed that this function is only called before any mappings + * are established. 
If this is not the case then this function will + * need to walk the pv_list and make each of the existing mappings + * uncacheable, sync the cache (with mips_icache_sync_all() and + * mips_dcache_wbinv_all()) and most likely invalidate TLB entries for + * any of the current mappings it modifies. + */ + if (TAILQ_FIRST(&m->md.pv_list) != NULL) + panic("Can't change memattr on page with existing mappings"); +} + +/* * perform the pmap work for mincore */ int @@ -3145,7 +3187,7 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) retry: ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; - if (!pte_test(&pte, PTE_V)) { + if (!pte_test(&pte, PTE_VALID)) { val = 0; goto out; } @@ -3282,7 +3324,7 @@ DB_SHOW_COMMAND(ptable, ddb_pid_dump) #endif for (k = 0; k < NPTEPG; k++) { pte = pde[k]; - if (pte == 0 || !pte_test(&pte, PTE_V)) + if (pte == 0 || !pte_test(&pte, PTE_VALID)) continue; pa = TLBLO_PTE_TO_PA(pte); va = ((u_long)i << SEGSHIFT) | (j << PDRSHIFT) | (k << PAGE_SHIFT); @@ -3318,7 +3360,7 @@ pads(pmap_t pm) va >= VM_MAXUSER_ADDRESS) continue; ptep = pmap_pte(pm, va); - if (pte_test(ptep, PTE_V)) + if (pte_test(ptep, PTE_VALID)) printf("%x:%x ", va, *(int *)ptep); } @@ -3333,7 +3375,7 @@ pmap_pvdump(vm_offset_t pa) printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); for (pv = TAILQ_FIRST(&m->md.pv_list); pv; - pv = TAILQ_NEXT(pv, pv_list)) { + pv = TAILQ_NEXT(pv, pv_next)) { printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } @@ -3380,15 +3422,15 @@ init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot) pt_entry_t rw; if (!(prot & VM_PROT_WRITE)) - rw = PTE_V | PTE_RO; + rw = PTE_VALID | PTE_RO; else if ((m->oflags & VPO_UNMANAGED) == 0) { if ((access & VM_PROT_WRITE) != 0) - rw = PTE_V | PTE_D; + rw = PTE_VALID | PTE_D; else - rw = PTE_V; + rw = PTE_VALID; } else /* Needn't emulate a modified bit for unmanaged pages. */ - rw = PTE_V | PTE_D; + rw = PTE_VALID | PTE_D; return (rw); } @@ -3411,13 +3453,13 @@ pmap_emulate_modified(pmap_t pmap, vm_offset_t va) panic("pmap_emulate_modified: can't find PTE"); #ifdef SMP /* It is possible that some other CPU changed m-bit */ - if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D)) { + if (!pte_test(pte, PTE_VALID) || pte_test(pte, PTE_D)) { tlb_update(pmap, va, *pte); PMAP_UNLOCK(pmap); return (0); } #else - if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D)) + if (!pte_test(pte, PTE_VALID) || pte_test(pte, PTE_D)) panic("pmap_emulate_modified: invalid pte"); #endif if (pte_test(pte, PTE_RO)) { @@ -3433,6 +3475,18 @@ pmap_emulate_modified(pmap_t pmap, vm_offset_t va) } /* + * pmap_emulate_referenced + * + * Reference bit emulation is not suppored in this pmap implementation. + */ +int +pmap_emulate_referenced(pmap_t pmap, vm_offset_t va) +{ + + return (1); +} + +/* * Routine: pmap_kextract * Function: * Extract the physical page address associated @@ -3509,7 +3563,7 @@ pmap_flush_pvcache(vm_page_t m) if (m != NULL) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; - pv = TAILQ_NEXT(pv, pv_list)) { + pv = TAILQ_NEXT(pv, pv_next)) { mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); } } diff --git a/sys/mips/mips/pmap_mips64.c b/sys/mips/mips/pmap_mips64.c new file mode 100644 index 0000000..b7ebb65 --- /dev/null +++ b/sys/mips/mips/pmap_mips64.c @@ -0,0 +1,5524 @@ +/* + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. 
+ * Copyright (c) 2003 Peter Wemm + * All rights reserved. + * Copyright (c) 2005-2010 Alan L. Cox + * All rights reserved. + * Copyright (c) 2015 Stacey D. Son + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + * from: src/sys/i386/i386/pmap.c,v 1.250.2.8 2000/11/21 00:09:14 ps + * JNPR: pmap.c,v 1.11.2.1 2007/08/16 11:51:06 girish + */ + +/* + * Manages physical address maps. + * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. + * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_pmap.h" +#include "opt_vm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef SMP +#include +#else +#include +#endif +#include +#include + +#ifdef DDB +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#undef PMAP_DEBUG + +#if !defined(DIAGNOSTIC) +#define PMAP_INLINE __inline +#else +#define PMAP_INLINE +#endif + +// #define PV_STATS +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +#define pa_index(pa) ((pa) >> PDRSHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) + +#define NPV_LIST_LOCKS MAXCPU + +#define PHYS_TO_PV_LIST_LOCK(pa) \ + (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) + +#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ + struct rwlock **_lockp = (lockp); \ + struct rwlock *_new_lock; \ + \ + _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ + if (_new_lock != *_lockp) { \ + if (*_lockp != NULL) \ + rw_wunlock(*_lockp); \ + *_lockp = _new_lock; \ + rw_wlock(*_lockp); \ + } \ +} while (0) + +#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) + +#define RELEASE_PV_LIST_LOCK(lockp) do { \ + struct rwlock **_lockp = (lockp); \ + \ + if (*_lockp != NULL) { \ + rw_wunlock(*_lockp); \ + *_lockp = NULL; \ + } \ +} while (0) + +#define VM_PAGE_TO_PV_LIST_LOCK(m) \ + PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) + +/* + * Get PDEs and PTEs for user/kernel address space + */ +#define pmap_seg_index(v) (((v) >> SEGSHIFT) & (NPDEPG - 1)) +#define pmap_pde_index(v) (((v) >> PDRSHIFT) & (NPDEPG - 1)) +#define pmap_pte_index(v) (((v) >> PAGE_SHIFT) & (NPTEPG - 1)) +#define pmap_pde_pindex(v) ((v) >> PDRSHIFT) + +#define NUPDE (NPDEPG * NPDEPG) +#define NUSERPGTBLS (NUPDE + NPDEPG) + +#define is_kernel_pmap(x) ((x) == kernel_pmap) + +struct pmap kernel_pmap_store; +pd_entry_t *kernel_segmap; + +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ + +static int nkpt; +unsigned pmap_max_asid; /* max ASID supported by the system */ + +static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); +static int pg_sp_enabled = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pg_sp_enabled, 0, "Are large page mappings enabled?"); + +#define PMAP_ASID_RESERVED 0 + +vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; + +static void pmap_asid_alloc(pmap_t pmap); + +static struct rwlock_padalign pvh_global_lock; + +/* + * Data for the pv entry allocation mechanism + */ +static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); +static struct mtx pv_chunks_mutex; +static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; +static struct md_page *pv_table; + +static void free_pv_chunk(struct pv_chunk *pc); +static void free_pv_entry(pmap_t pmap, pv_entry_t pv); +static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); +static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, + vm_offset_t va); +static vm_page_t pmap_alloc_direct_page(unsigned int index, int req); +static vm_page_t 
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); +static void reserve_pv_entries(pmap_t pmap, int needed, + struct rwlock **lockp); +static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, + vm_offset_t va, struct rwlock **lockp); +static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, struct rwlock **lockp); +static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); +static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static __inline vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); +static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, + struct rwlock **lockp); +static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp); +static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, + pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); +static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, + struct spglist *free); +static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, + vm_page_t m, struct rwlock **lockp); +static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte); +static void pmap_invalidate_all(pmap_t pmap); +static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va); +static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct spglist *free); + +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, + struct rwlock **lockp); +static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); +static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot); + +static void pmap_invalidate_page_action(void *arg); +static void pmap_invalidate_range_action(void *arg); +static void pmap_update_page_action(void *arg); + +/* + * Page table entry lookup routines. + */ + +/* Return a segment entry for given pmap & virtual address. */ +static __inline pd_entry_t * +pmap_segmap(pmap_t pmap, vm_offset_t va) +{ + + return (&pmap->pm_segtab[pmap_seg_index(va)]); +} + +/* Return a page directory entry for given segment table & virtual address. */ +static __inline pd_entry_t * +pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) +{ + pd_entry_t *pde; + + pde = (pd_entry_t *)*pdpe; + return (&pde[pmap_pde_index(va)]); +} + +/* Return a page directory entry for given pmap & virtual address. */ +static __inline pd_entry_t * +pmap_pde(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t *pdpe; + + pdpe = pmap_segmap(pmap, va); + if (*pdpe == NULL) + return (NULL); + + return (pmap_pdpe_to_pde(pdpe, va)); +} + +/* Return a page table entry for given page directory & virtual address. 
*/ +static __inline pt_entry_t * +pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) +{ + pt_entry_t *pte; + + pte = (pt_entry_t *)*pde; + return (&pte[pmap_pte_index(va)]); +} + +/* Return a page table entry for given pmap & virtual address. */ +pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (pde == NULL || *pde == NULL) + return (NULL); + if (pde_is_superpage(pde)) { + return ((pt_entry_t *)pde); + } else + return (pmap_pde_to_pte(pde, va)); +} + +static __inline void +pmap_resident_count_inc(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pmap->pm_stats.resident_count += count; +} + +static __inline void +pmap_resident_count_dec(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(pmap->pm_stats.resident_count >= count, + ("pmap %p resident count underflow %ld %d", pmap, + pmap->pm_stats.resident_count, count)); + pmap->pm_stats.resident_count -= count; +} + +/* + * Allocate some wired memory before the virtual memory system is + * bootstrapped. + */ +vm_offset_t +pmap_steal_memory(vm_size_t size) +{ + vm_paddr_t bank_size, pa; + vm_offset_t va; + + size = round_page(size); + bank_size = phys_avail[1] - phys_avail[0]; + while (size > bank_size) { + int i; + + for (i = 0; phys_avail[i + 2]; i += 2) { + phys_avail[i] = phys_avail[i + 2]; + phys_avail[i + 1] = phys_avail[i + 3]; + } + phys_avail[i] = 0; + phys_avail[i + 1] = 0; + if (!phys_avail[0]) + panic("pmap_steal_memory: out of memory"); + bank_size = phys_avail[1] - phys_avail[0]; + } + + pa = phys_avail[0]; + phys_avail[0] += size; + va = MIPS_PHYS_TO_DIRECT(pa); + bzero((caddr_t)va, size); + return (va); +} + +/* + * Bootstrap the system enough to run with virtual memory. This + * assumes that the phys_avail array has been initialized. + */ +static void +pmap_create_kernel_pagetable(void) +{ + int i, j; + vm_offset_t ptaddr; + pt_entry_t *pte; + pd_entry_t *pde; + vm_offset_t pdaddr; + int npt, npde; + + /* + * Allocate segment table for the kernel + */ + kernel_segmap = (pd_entry_t *)pmap_steal_memory(PAGE_SIZE); + + /* + * Allocate second level page tables for the kernel + */ + npde = howmany(NKPT, NPDEPG); + pdaddr = pmap_steal_memory(PAGE_SIZE * npde); + nkpt = NKPT; + ptaddr = pmap_steal_memory(PAGE_SIZE * nkpt); + + /* + * The R[4-7]?00 stores only one copy of the Global bit in the + * translation lookaside buffer for each 2 page entry. Thus invalid + * entries must have the Global bit set so when Entry LO and Entry HI + * G bits are anded together they will produce a global bit to store + * in the tlb. + */ + for (i = 0, pte = (pt_entry_t *)ptaddr; i < (nkpt * NPTEPG); i++, pte++) + *pte = PTE_G; + + for (i = 0, npt = nkpt; npt > 0; i++) { + kernel_segmap[i] = (pd_entry_t)(pdaddr + i * PAGE_SIZE); + pde = (pd_entry_t *)kernel_segmap[i]; + + for (j = 0; j < NPDEPG && npt > 0; j++, npt--) + pde[j] = (pd_entry_t)(ptaddr + (i * NPDEPG + j) * + PAGE_SIZE); + } + + PMAP_LOCK_INIT(kernel_pmap); + kernel_pmap->pm_segtab = kernel_segmap; + CPU_FILL(&kernel_pmap->pm_active); + TAILQ_INIT(&kernel_pmap->pm_pvchunk); + kernel_pmap->pm_asid[0].asid = PMAP_ASID_RESERVED; + kernel_pmap->pm_asid[0].gen = 0; + kernel_vm_end += nkpt * NPTEPG * PAGE_SIZE; +} + +void +pmap_bootstrap(void) +{ + int i; + + /* Sort. */ +again: + for (i = 0; phys_avail[i + 1] != 0; i += 2) { + /* + * Keep the memory aligned on page boundary. 
+ */ + phys_avail[i] = round_page(phys_avail[i]); + phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); + + if (i < 2) + continue; + if (phys_avail[i - 2] > phys_avail[i]) { + vm_paddr_t ptemp[2]; + + ptemp[0] = phys_avail[i + 0]; + ptemp[1] = phys_avail[i + 1]; + + phys_avail[i + 0] = phys_avail[i - 2]; + phys_avail[i + 1] = phys_avail[i - 1]; + + phys_avail[i - 2] = ptemp[0]; + phys_avail[i - 1] = ptemp[1]; + goto again; + } + } + + /* + * Copy the phys_avail[] array before we start stealing memory from it. + */ + for (i = 0; phys_avail[i + 1] != 0; i += 2) { + physmem_desc[i] = phys_avail[i]; + physmem_desc[i + 1] = phys_avail[i + 1]; + } + + Maxmem = atop(phys_avail[i - 1]); + + if (bootverbose) { + printf("Physical memory chunk(s):\n"); + for (i = 0; phys_avail[i + 1] != 0; i += 2) { + vm_paddr_t size; + + size = phys_avail[i + 1] - phys_avail[i]; + printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n", + (uintmax_t) phys_avail[i], + (uintmax_t) phys_avail[i + 1] - 1, + (uintmax_t) size, (uintmax_t) size / PAGE_SIZE); + } + printf("Maxmem is 0x%0jx\n", ptoa((uintmax_t)Maxmem)); + } + /* + * Steal the message buffer from the beginning of memory. + */ + msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize); + msgbufinit(msgbufp, msgbufsize); + + /* + * Steal thread0 kstack. This must be aligned to + * (KSTACK_PAGE_SIZE * 2) so it can be mapped to a single TLB entry. + * + */ + kstack0 = pmap_steal_memory((KSTACK_PAGES + KSTACK_GUARD_PAGES) << + PAGE_SHIFT); + kstack0 = roundup2(kstack0, (KSTACK_PAGE_SIZE * 2)); + + virtual_avail = VM_MIN_KERNEL_ADDRESS; + virtual_end = VM_MAX_KERNEL_ADDRESS; + +#ifdef SMP + /* + * Steal some virtual address space to map the pcpu area. + */ + virtual_avail = roundup2(virtual_avail, PAGE_SIZE * 2); + pcpup = (struct pcpu *)virtual_avail; + virtual_avail += PAGE_SIZE * 2; + + /* + * Initialize the wired TLB entry mapping the pcpu region for + * the BSP at 'pcpup'. Up until this point we were operating + * with the 'pcpup' for the BSP pointing to a virtual address + * in KSEG0 so there was no need for a TLB mapping. + */ + mips_pcpu_tlb_init(PCPU_ADDR(0)); + + if (bootverbose) + printf("pcpu is available at virtual address %p.\n", pcpup); +#endif + + pmap_create_kernel_pagetable(); + pmap_max_asid = VMNUM_PIDS; + mips_wr_entryhi(0); + mips_wr_pagemask(0); + + /* + * Initialize the global pv list lock. + */ + rw_init(&pvh_global_lock, "pmap pv global"); +} + +/* + * Initialize a vm_page's machine-dependent fields. + */ +void +pmap_page_init(vm_page_t m) +{ + + TAILQ_INIT(&m->md.pv_list); + m->md.pv_memattr = VM_MEMATTR_DEFAULT; +} + +/* + * Initialize the pmap module. + * Called by vm_init, to initialize any structures that the pmap + * system needs to map virtual memory. + */ +void +pmap_init(void) +{ + int i; + vm_size_t s; + int pv_npg; + + /* + * Initialize the pv chunk list mutex. + */ + mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); + + /* + * Initialize the pool of pv list locks. + */ + for (i = 0; i < NPV_LIST_LOCKS; i++) + rw_init(&pv_list_locks[i], "pv list"); + + /* + * Calculate the size of the pv head table for superpages. + */ + for (i = 0; phys_avail[i + 1]; i += 2); + pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR; + + /* + * Allocate memory for the pv head table for superpages. 
+ */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, + M_WAITOK | M_ZERO); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); +} + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, + "2MB page mapping counters"); + +static u_long pmap_pde_demotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_pde_demotions, 0, "2MB page demotions"); + +static u_long pmap_pde_mappings; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_pde_mappings, 0, "2MB page mappings"); + +static u_long pmap_pde_p_failures; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_pde_p_failures, 0, "2MB page promotion failures"); + +static u_long pmap_pde_promotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_pde_promotions, 0, "2MB page promotions"); + +/*************************************************** + * Low level helper routines..... + ***************************************************/ + +#ifdef SMP +static __inline void +pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) +{ + int cpuid, cpu, self; + cpuset_t active_cpus; + + sched_pin(); + if (is_kernel_pmap(pmap)) { + smp_rendezvous(NULL, fn, NULL, arg); + goto out; + } + /* Force ASID update on inactive CPUs */ + CPU_FOREACH(cpu) { + if (!CPU_ISSET(cpu, &pmap->pm_active)) + pmap->pm_asid[cpu].gen = 0; + } + cpuid = PCPU_GET(cpuid); + /* + * XXX: barrier/locking for active? + * + * Take a snapshot of active here, any further changes are ignored. + * tlb update/invalidate should be harmless on inactive CPUs + */ + active_cpus = pmap->pm_active; + self = CPU_ISSET(cpuid, &active_cpus); + CPU_CLR(cpuid, &active_cpus); + /* Optimize for the case where this cpu is the only active one */ + if (CPU_EMPTY(&active_cpus)) { + if (self) + fn(arg); + } else { + if (self) + CPU_SET(cpuid, &active_cpus); + smp_rendezvous_cpus(active_cpus, NULL, fn, NULL, arg); + } +out: + sched_unpin(); +} +#else /* !SMP */ +static __inline void +pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) +{ + int cpuid; + + if (is_kernel_pmap(pmap)) { + fn(arg); + return; + } + cpuid = PCPU_GET(cpuid); + if (!CPU_ISSET(cpuid, &pmap->pm_active)) + pmap->pm_asid[cpuid].gen = 0; + else + fn(arg); +} +#endif /* SMP */ + +static void +pmap_invalidate_all(pmap_t pmap) +{ + + pmap_call_on_active_cpus(pmap, + (void (*)(void *))tlb_invalidate_all_user, pmap); +} + +struct pmap_invalidate_page_arg { + pmap_t pmap; + vm_offset_t va; +}; + +static void +pmap_invalidate_page_action(void *arg) +{ + struct pmap_invalidate_page_arg *p = arg; + + tlb_invalidate_address(p->pmap, p->va); +} + +static void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + struct pmap_invalidate_page_arg arg; + + arg.pmap = pmap; + arg.va = va; + pmap_call_on_active_cpus(pmap, pmap_invalidate_page_action, &arg); +} + +struct pmap_invalidate_range_arg { + pmap_t pmap; + vm_offset_t sva; + vm_offset_t eva; +}; + +static void +pmap_invalidate_range_action(void *arg) +{ + struct pmap_invalidate_range_arg *p = arg; + + tlb_invalidate_range(p->pmap, p->sva, p->eva); +} + +static void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + struct pmap_invalidate_range_arg arg; + + arg.pmap = pmap; + arg.sva = sva; + arg.eva = eva; + pmap_call_on_active_cpus(pmap, pmap_invalidate_range_action, &arg); +} + +struct pmap_update_page_arg { + pmap_t pmap; + vm_offset_t va; + pt_entry_t pte; +}; + 
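The invalidation wrappers above, and pmap_update_page() just below, all follow the same shootdown pattern: the caller packs its arguments into a small structure and hands an action callback to pmap_call_on_active_cpus(), which runs the callback locally and, under SMP, rendezvouses with every other CPU on which the pmap is active; CPUs where the pmap is inactive only have their ASID generation reset so they pick up fresh translations on their next activation. The sketch below is illustrative only and not part of this patch; pmap_example_make_readonly() is a hypothetical caller showing how a PTE change is paired with one of these wrappers.

/*
 * Hypothetical example (not in the patch): change a PTE under the pmap
 * lock, then push the new entry to every CPU that may still hold the
 * stale one in its TLB.
 */
static void
pmap_example_make_readonly(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte, newpte;

	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va);
	if (pte != NULL && pte_is_valid(pte)) {
		/* Update the page table entry first... */
		newpte = *pte | PTE_RO;
		pte_store(pte, newpte);
		/* ...then update the TLBs of all CPUs using this pmap. */
		pmap_update_page(pmap, va, newpte);
	}
	PMAP_UNLOCK(pmap);
}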
+static void +pmap_update_page_action(void *arg) +{ + struct pmap_update_page_arg *p = arg; + + tlb_update(p->pmap, p->va, p->pte); +} + +static void +pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte) +{ + struct pmap_update_page_arg arg; + + arg.pmap = pmap; + arg.va = va; + arg.pte = pte; + pmap_call_on_active_cpus(pmap, pmap_update_page_action, &arg); +} + +static void +pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pt_entry_t newpde) +{ + + if (!pte_is_1m_superpage(&newpde)) { + /* Demotion: flush a specific 2mb page mapping. */ + tlb_invalidate_range(pmap, (va & ~PDRMASK), + (va & ~PDRMASK) + NBPDR); + } else if (!pte_test(&newpde, PTE_G)) { + /* + * Promotion: flush every 4KB page mapping from the TLB + * because there are too many to flush individually. + */ + tlb_invalidate_all_user(pmap); + } else { + /* + * Promotion: flush every 4KB page mapping from the TLB, + * including any global (PTE_G) mappings. + */ + tlb_invalidate_all(); + } +} + +struct pmap_update_pde_arg { + pmap_t pmap; + vm_offset_t va; + pd_entry_t *pde; + pt_entry_t newpde; +}; + +static void +pmap_update_pde_action(void *act) +{ + struct pmap_update_pde_arg *arg = act; + + pmap_update_pde_invalidate(arg->pmap, arg->va, arg->newpde); +} + +static void +pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pt_entry_t newpde) +{ + + pde_store(pde, newpde); +} + + +/* + * Change the page size for the specified virtual address in a way that + * prevents any possibility of the TLB ever having two entries that map the + * same virtual address using different page sizes. + */ +static void +pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pt_entry_t newpde) +{ + struct pmap_update_pde_arg arg; + + arg.pmap = pmap; + arg.va = va; + arg.pde = pde; + arg.newpde = newpde; + + pmap_update_pde_store(pmap, pde, newpde); + pmap_call_on_active_cpus(pmap, pmap_update_pde_action, &arg); +} + +/* --- */ + +/* + * Routine: pmap_extract + * Function: + * Extract the physical page address associated + * with the given map/virtual_address pair. + */ +vm_paddr_t +pmap_extract(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t *pde; + pt_entry_t *pte; + vm_offset_t pa; + + pa = 0; + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if (pde_is_1m_superpage(pde)) { + pa = TLBLO_PDE_TO_PA(*pde) | (va & PDRMASK); + } else { + pte = pmap_pde_to_pte(pde, va); + if (pte) + pa = TLBLO_PTE_TO_PA(*pte) | (va & PAGE_MASK); + } + PMAP_UNLOCK(pmap); + return (pa); +} + +/* + * Routine: pmap_extract_and_hold + * Function: + * Atomically extract and hold the physical page + * with the given pmap and virtual address pair + * if that mapping permits the given protection. 
+ */ +vm_page_t +pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pd_entry_t *pdep; + pt_entry_t pte, *ptep; + vm_paddr_t pa, pte_pa; + vm_page_t m; + vm_paddr_t pde_pa; + + pa = 0; + m = NULL; + PMAP_LOCK(pmap); +retry: + pdep = pmap_pde(pmap, va); + if (pdep != NULL && *pdep != NULL) { + if (pde_is_1m_superpage(pdep)) { + if (!pde_test(pdep, PTE_RO) || + (prot & VM_PROT_WRITE) == 0) { + pde_pa = TLBLO_PDE_TO_PA(*pdep) | + (va & PDRMASK); + if (vm_page_pa_tryrelock(pmap, pde_pa, &pa)) + goto retry; + m = PHYS_TO_VM_PAGE(pde_pa); + vm_page_hold(m); + } + } else { + ptep = pmap_pde_to_pte(pdep, va); + if (ptep != NULL) { + pte = *ptep; + if (pte_is_valid(&pte) && + (!pte_test(&pte, PTE_RO) || + (prot & VM_PROT_WRITE) == 0)) { + pte_pa = TLBLO_PTE_TO_PA(pte); + if (vm_page_pa_tryrelock(pmap, pte_pa, + &pa)) + goto retry; + m = PHYS_TO_VM_PAGE(pte_pa); + vm_page_hold(m); + } + } + } + } + PA_UNLOCK_COND(pa); + PMAP_UNLOCK(pmap); + return (m); +} + +/*- + * Routine: pmap_kextract + * Function: + * Extract the physical page address associated + * with the given virtual address. + */ +vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + int mapped; + + /* + * First, the direct-mapped regions. + */ + if (va >= MIPS_XKPHYS_START && va < MIPS_XKPHYS_END) + return (MIPS_XKPHYS_TO_PHYS(va)); + + if (va >= MIPS_KSEG0_START && va < MIPS_KSEG0_END) + return (MIPS_KSEG0_TO_PHYS(va)); + + if (va >= MIPS_KSEG1_START && va < MIPS_KSEG1_END) + return (MIPS_KSEG1_TO_PHYS(va)); + + /* + * User virtual addresses. + */ + if (va < VM_MAXUSER_ADDRESS) { + pd_entry_t *pdep; + pt_entry_t *ptep; + + if (curproc && curproc->p_vmspace) { + pdep = pmap_pde(&curproc->p_vmspace->vm_pmap, va); + if (pdep == NULL || *pdep == NULL) + return (0); + if (pde_is_1m_superpage(pdep)) { + ptep = (pt_entry_t *)pdep; + return (TLBLO_PTE_TO_PA(*ptep) | + (va & PDRMASK)); + } + ptep = pmap_pde_to_pte(pdep, va); + if (ptep) { + return (TLBLO_PTE_TO_PA(*ptep) | + (va & PAGE_MASK)); + } + return (0); + } + } + + /* + * Should be kernel virtual here, otherwise fail + */ + mapped = (va >= MIPS_KSEG2_START && va < MIPS_KSEG2_END); + mapped = mapped || (va >= MIPS_XKSEG_START && va < MIPS_XKSEG_END); + /* + * Kernel virtual. + */ + + if (mapped) { + pd_entry_t *pdep; + pt_entry_t *ptep; + + /* Is the kernel pmap initialized? */ + if (!CPU_EMPTY(&kernel_pmap->pm_active)) { + /* It's inside the virtual address range */ + pdep = pmap_pde(kernel_pmap, va); + if (pdep == NULL || *pdep == NULL) + return (0); + if (pde_is_1m_superpage(pdep)) { + ptep = (pt_entry_t *)pdep; + return (TLBLO_PTE_TO_PA(*ptep) | + (va & PDRMASK)); + } + ptep = pmap_pde_to_pte(pdep, va); + if (ptep) { + return (TLBLO_PTE_TO_PA(*ptep) | + (va & PAGE_MASK)); + } + } + return (0); + } + + panic("%s for unknown address space %p.", __func__, (void *)va); +} + +/*************************************************** + * Low level mapping routines..... 
+ ***************************************************/ + +/*- + * add a wired page to the kva + */ +void +pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int attr) +{ + pt_entry_t *pte; + pt_entry_t opte, npte; + +#ifdef PMAP_DEBUG + printf("pmap_kenter: va: %p -> pa: %p\n", (void *)va, (void *)pa); +#endif + + pte = pmap_pte(kernel_pmap, va); + opte = *pte; + npte = TLBLO_PA_TO_PFN(pa) | attr | PTE_D | PTE_REF | PTE_VALID | PTE_G; + pte_store(pte, npte); + if (pte_is_valid(&opte) && opte != npte) + pmap_update_page(kernel_pmap, va, npte); +} + +void +pmap_kenter(vm_offset_t va, vm_paddr_t pa) +{ + + KASSERT(is_cacheable_mem(pa), + ("pmap_kenter: memory at 0x%lx is not cacheable", (u_long)pa)); + + pmap_kenter_attr(va, pa, PTE_C_CACHE); +} + +/*- + * remove a page from the kernel pagetables + */ + /* PMAP_INLINE */ void +pmap_kremove(vm_offset_t va) +{ + pt_entry_t *pte; + + /* + * Write back all caches from the page being destroyed + */ + mips_dcache_wbinv_range_index(va, PAGE_SIZE); + + pte = pmap_pte(kernel_pmap, va); + pte_store(pte, PTE_G); + pmap_invalidate_page(kernel_pmap, va); +} + +/* + * Used to map a range of physical addresses into kernel + * virtual address space. + * + * The value passed in '*virt' is a suggested virtual address for + * the mapping. Architectures which can support a direct-mapped + * physical to virtual region can return the appropriate address + * within that region, leaving '*virt' unchanged. Other + * architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped + * region. + * + * Use XKPHYS for 64 bit. + */ +vm_offset_t +pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) +{ + + return (MIPS_PHYS_TO_DIRECT(start)); +} + +/*- + * Add a list of wired pages to the kva + * this routine is only used for temporary + * kernel mappings that do not need to have + * page modification or references recorded. + * Note that old mappings are simply written + * over. The page *must* be wired. + */ +void +pmap_qenter(vm_offset_t va, vm_page_t *m, int count) +{ + int i; + vm_offset_t origva = va; + + for (i = 0; i < count; i++) { + pmap_flush_pvcache(m[i]); + pmap_kenter(va, VM_PAGE_TO_PHYS(m[i])); + va += PAGE_SIZE; + } + + mips_dcache_wbinv_range_index(origva, PAGE_SIZE*count); +} + +/*- + * This routine jerks page mappings from the + * kernel -- it is meant only for temporary mappings. + */ +void +pmap_qremove(vm_offset_t va, int count) +{ + pt_entry_t *pte; + vm_offset_t origva; + + if (count < 1) + return; + mips_dcache_wbinv_range_index(va, PAGE_SIZE * count); + origva = va; + do { + pte = pmap_pte(kernel_pmap, va); + pte_store(pte, PTE_G); + va += PAGE_SIZE; + } while (--count > 0); + pmap_invalidate_range(kernel_pmap, origva, va); +} + +/*************************************************** + * Page table page management routines..... + ***************************************************/ +static __inline void +pmap_free_zero_pages(struct spglist *free) +{ + vm_page_t m; + + while ((m = SLIST_FIRST(free)) != NULL) { + SLIST_REMOVE_HEAD(free, plinks.s.ss); + /* Preserve the page's PG_ZERO setting. */ + vm_page_free_toq(m); + } +} + +/*- + * Schedule the specified unused page table page to be freed. Specifically + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. 
+ */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, + boolean_t set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + SLIST_INSERT_HEAD(free, m, plinks.s.ss); +} + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. + */ +static __inline int +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_insert(&pmap->pm_root, mpte)); +} + +/* + * Looks for a page table page mapping the specified virtual address in the + * specified pmap's collection of idle page table pages. Returns NULL if there + * is no page table page corresponding to the specified virtual address. + */ +static __inline vm_page_t +pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); +} + +/* + * Removes the specified page table page from the specified pmap's collection + * of idle page table pages. The specified page table page must be a member of + * the pmap's collection. + */ +static __inline void +pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + vm_radix_remove(&pmap->pm_root, mpte->pindex); +} + +/* + * Decrements a page table page's wire count, which is used to record the + * number of valid page table entries within the page. If the wire count + * drops to zero, then the page table page is unmapped. Returns TRUE if the + * page table page was unmapped and FALSE otherwise. + */ +static PMAP_INLINE boolean_t +pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + --m->wire_count; + if (m->wire_count == 0) { + _pmap_unwire_ptp(pmap, va, m, free); + return (TRUE); + } else + return (FALSE); +} + +static void +_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + pd_entry_t *pde, *pdp; + vm_page_t pdpg; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* + * unmap the page table page + */ + if (m->pindex < NUPDE) { + pde = pmap_pde(pmap, va); + *pde = 0; + pmap_resident_count_dec(pmap, 1); + + /* + * Recursively decrement next level pagetable refcount + */ + pdp = (pd_entry_t *)*pmap_segmap(pmap, va); + pdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pdp)); + pmap_unwire_ptp(pmap, va, pdpg, free); + } else { + pde = pmap_segmap(pmap, va); + *pde = 0; + pmap_resident_count_dec(pmap, 1); + } + + /* + * If the page is finally unwired, simply free it. + * This is a release store so that the ordinary store unmapping + * the page table page is globally performed before TLB shoot- + * down is begun. + */ + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + + /* + * Put page on a list so that it is released after + * *ALL* TLB shootdown is done. + */ + pmap_add_delayed_free_list(m, free, TRUE); +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. 
+ */ +static int +pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, + struct spglist *free) +{ + vm_page_t mpte; + + if (va >= VM_MAXUSER_ADDRESS) + return (0); + KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); + mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(ptepde)); + return (pmap_unwire_ptp(pmap, va, mpte, free)); +} + +void +pmap_pinit0(pmap_t pmap) +{ + int i; + + PMAP_LOCK_INIT(pmap); + pmap->pm_segtab = kernel_segmap; + CPU_ZERO(&pmap->pm_active); + for (i = 0; i < MAXCPU; i++) { + pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; + pmap->pm_asid[i].gen = 0; + } + PCPU_SET(curpmap, pmap); + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); +} + +void +pmap_grow_direct_page_cache() +{ + + vm_pageout_grow_cache(3, 0, MIPS_XKPHYS_LARGEST_PHYS); +} + +static vm_page_t +pmap_alloc_direct_page(unsigned int index, int req) +{ + vm_page_t m; + + m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + + m->pindex = index; + return (m); +} + +/*- + * Initialize a preallocated and zeroed pmap structure, + * such as one in a vmspace structure. + */ +int +pmap_pinit(pmap_t pmap) +{ + vm_offset_t ptdva; + vm_page_t ptdpg; + int i; + + /* + * allocate the page directory page + */ + while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, VM_ALLOC_NOOBJ | + VM_ALLOC_NORMAL)) == NULL) { + pmap_grow_direct_page_cache(); + } + + ptdva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(ptdpg)); + pmap->pm_segtab = (pd_entry_t *)ptdva; + CPU_ZERO(&pmap->pm_active); + for (i = 0; i < MAXCPU; i++) { + pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; + pmap->pm_asid[i].gen = 0; + } + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + + return (1); +} + +/* + * This routine is called if the desired page table page does not exist. + */ +static vm_page_t +_pmap_allocpte(pmap_t pmap, unsigned ptepindex, struct rwlock **lockp) +{ + vm_offset_t pageva; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Find or fabricate a new pagetable page + */ + if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if (lockp != NULL) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_UNLOCK(pmap); + rw_runlock(&pvh_global_lock); + VM_WAIT; + rw_rlock(&pvh_global_lock); + PMAP_LOCK(pmap); + } + + /* + * Indicate the need to retry. While waiting, the page + * table page may have been allocated. + */ + return (NULL); + } + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + + /* + * Map the pagetable page into the process address space, if it + * isn't already there. 
+ */ + pageva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); + + if (ptepindex >= NUPDE) { + pmap->pm_segtab[ptepindex - NUPDE] = (pd_entry_t)pageva; + } else { + pd_entry_t *pdep, *pde; + int segindex = ptepindex >> (SEGSHIFT - PDRSHIFT); + int pdeindex = ptepindex & (NPDEPG - 1); + vm_page_t pg; + + pdep = &pmap->pm_segtab[segindex]; + if (*pdep == NULL) { + /* Have to allocate a new pd, recurse */ + if (_pmap_allocpte(pmap, NUPDE + segindex, + lockp) == NULL) { + /* alloc failed, release current */ + --m->wire_count; + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + vm_page_free_zero(m); + return (NULL); + } + } else { + /* Add reference to the pd page */ + pg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdep)); + pg->wire_count++; + } + /* Next level entry */ + pde = (pd_entry_t *)*pdep; + pde[pdeindex] = (pd_entry_t)pageva; + } + + pmap_resident_count_inc(pmap, 1); + + return (m); +} + +static vm_page_t +pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + unsigned ptepindex; + pd_entry_t *pd; + vm_page_t m; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_pde_pindex(va); +retry: + /* + * Get the page directory entry + */ + pd = pmap_pde(pmap, va); + + /* + * This supports switching from a 2MB page to a + * normal 4K page. + */ + if (pd != NULL && (pde_is_1m_superpage(pd) && + pte_is_valid((pt_entry_t *)pd))) { + if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { + /* + * Invalidation of the 2MB page mapping may have caused + * the deallocation of the underlying PD page. + */ + pd = NULL; + } + } + + /* + * If the page table page is mapped, we just increment the hold + * count, and activate it. + */ + if (pd != NULL && *pd != NULL) { + m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS((pt_entry_t)*pd)); + m->wire_count++; + } else { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. + */ + m = _pmap_allocpte(pmap, ptepindex, lockp); + if (m == NULL && lockp != NULL) + goto retry; + } + return (m); +} + +/*************************************************** + * Pmap allocation/deallocation routines. + ***************************************************/ + +/*- + * Release any resources held by the given physical map. + * Called when a pmap initialized by pmap_pinit is being released. + * Should only be called if the map contains no valid mappings. + */ +void +pmap_release(pmap_t pmap) +{ + vm_offset_t ptdva; + vm_page_t ptdpg; + + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); + + /* + * Invalidate any left TLB entries, to allow the reuse + * of the asid. 
+ */ + pmap_invalidate_all(pmap); + + ptdva = (vm_offset_t)pmap->pm_segtab; + ptdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(ptdva)); + + ptdpg->wire_count--; + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + vm_page_free_zero(ptdpg); +} + +/*- + * grow the number of kernel page table entries, if needed + */ +void +pmap_growkernel(vm_offset_t addr) +{ + vm_page_t nkpg; + pd_entry_t *pde, *pdpe; + pt_entry_t *pte; + int i; + + mtx_assert(&kernel_map->system_mtx, MA_OWNED); + addr = roundup2(addr, NBSEG); + if (addr - 1 >= kernel_map->max_offset) + addr = kernel_map->max_offset; + while (kernel_vm_end < addr) { + pdpe = pmap_segmap(kernel_pmap, kernel_vm_end); + if (*pdpe == 0) { + /* new intermediate page table entry */ + nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_INTERRUPT | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (nkpg == NULL) + panic("%s: no memory to grow kernel", __func__); + *pdpe = (pd_entry_t)MIPS_PHYS_TO_DIRECT( + VM_PAGE_TO_PHYS(nkpg)); + continue; /* try again */ + } + pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); + if (*pde != 0) { + kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + continue; + } + + /* + * This index is bogus, but out of the way + */ + nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_INTERRUPT | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (!nkpg) + panic("pmap_growkernel: no memory to grow kernel"); + nkpt++; + *pde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); + + /* + * The R[4-7]?00 stores only one copy of the Global bit in + * the translation lookaside buffer for each 2 page entry. + * Thus invalid entries must have the Global bit set so when + * Entry LO and Entry HI G bits are anded together they will + * produce a global bit to store in the tlb. + */ + pte = (pt_entry_t *)*pde; + for (i = 0; i < NPTEPG; i++) + pte[i] = PTE_G; + + kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + } +} + +/*************************************************** + * page management routines. 
+ ***************************************************/ + +CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); +CTASSERT(_NPCM == 3); +CTASSERT(_NPCPV == 168); + +static __inline struct pv_chunk * +pv_to_chunk(pv_entry_t pv) +{ + + return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); +} + +#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) + +#define PC_FREE0 0xfffffffffffffffful +#define PC_FREE1 0xfffffffffffffffful +#define PC_FREE2 0x000000fffffffffful + +static const u_long pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; + +#ifdef PV_STATS +static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, + "Current number of pv entry chunks"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, + "Current number of pv entry chunks allocated"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, + "Current number of pv entry chunks frees"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, + "Number of times tried to get a chunk page but failed."); + +static long pv_entry_count, pv_entry_frees, pv_entry_allocs; +static int pv_entry_spare; + +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, + "Current number of pv entry frees"); +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, + "Current number of pv entry allocs"); +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, + "Current number of pv entries"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, + "Current number of spare pv entries"); +#endif + +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. + * + * Returns NULL if PV entries were reclaimed from the specified pmap. + * + * We do not, however, unmap 2mpages because subsequent access will + * allocate per-page pv entries until repromotion occurs, thereby + * exacerbating the shortage of free pv entries. + */ +static vm_page_t +reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) +{ + struct pch new_tail; + struct pv_chunk *pc; + pd_entry_t *pde; + pmap_t pmap; + pt_entry_t *pte, oldpte; + pv_entry_t pv; + vm_offset_t va; + vm_page_t m, m_pc; + struct spglist free; + uint64_t inuse; + int bit, field, freed, idx; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); + pmap = NULL; + m_pc = NULL; + SLIST_INIT(&free); + TAILQ_INIT(&new_tail); + mtx_lock(&pv_chunks_mutex); + while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) { + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + if (pmap != pc->pc_pmap) { + if (pmap != NULL) { + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + } + pmap = pc->pc_pmap; + /* Avoid deadlock and lock recursion. */ + if (pmap > locked_pmap) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_LOCK(pmap); + } else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { + pmap = NULL; + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + mtx_lock(&pv_chunks_mutex); + continue; + } + } + + /* + * Destroy every non-wired, 4 KB page mapping in the chunk. 
+ */ + freed = 0; + for (field = 0; field < _NPCM; field++) { + for (inuse = ~pc->pc_map[field] & pc_freemask[field]; + inuse != 0; inuse &= ~(1UL << bit)) { + bit = ffsl(inuse) - 1; + idx = field * sizeof(inuse) * NBBY + bit; + pv = &pc->pc_pventry[idx]; + va = pv->pv_va; + pde = pmap_pde(pmap, va); + KASSERT(pde != NULL && *pde != 0, + ("%s: pde", __func__)); + if (pde_is_1m_superpage(pde)) + continue; + pte = pmap_pde_to_pte(pde, va); + oldpte = *pte; + if (pte_test(&oldpte, PTE_W)) + continue; + if (is_kernel_pmap(pmap)) + *pte = PTE_G; + else + *pte = 0; + if (pte_test(&oldpte, PTE_G)) + pmap_invalidate_page(pmap, va); + m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(oldpte)); + if (pte_test(&oldpte, PTE_D)) + vm_page_dirty(m); + if (pte_is_ref(&oldpte)) + vm_page_aflag_set(m, PGA_REFERENCED); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + struct md_page *pvh = + pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) { + vm_page_aflag_clear(m, + PGA_WRITEABLE); + } + } + pc->pc_map[field] |= 1UL << bit; + pmap_unuse_pt(pmap, va, *pde, &free); + freed++; + } + } + if (freed == 0) { + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + mtx_lock(&pv_chunks_mutex); + continue; + } + /* Every freed mapping is for a 4 KB page. */ + pmap_resident_count_dec(pmap, freed); + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && + pc->pc_map[2] == PC_FREE2) { + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* Entire chunk is free; return it. */ + m_pc = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS( + (vm_offset_t)pc)); + dump_drop_page(m_pc->phys_addr); + mtx_lock(&pv_chunks_mutex); + break; + } + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + mtx_lock(&pv_chunks_mutex); + /* One freed pv entry in locked_pmap is sufficient. */ + if (pmap == locked_pmap) + break; + } + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); + if (pmap != NULL) { + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + } + if (m_pc == NULL && !SLIST_EMPTY(&free)) { + m_pc = SLIST_FIRST(&free); + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + /* Recycle a freed page table page. */ + m_pc->wire_count = 1; + atomic_add_int(&vm_cnt.v_wire_count, 1); + } + pmap_free_zero_pages(&free); + return (m_pc); +} + +/* + * free the pv_entry back to the free list + */ +static void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + struct pv_chunk *pc; + int bit, field, idx; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_frees, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, 1)); + PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / (sizeof(u_long) * NBBY); + bit = idx % (sizeof(u_long) * NBBY); + pc->pc_map[field] |= 1ul << bit; + if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || + pc->pc_map[2] != PC_FREE2) { + /* 98% of the time, pc is already at the head of the list. 
*/ + if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + } + return; + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); +} + +static void +free_pv_chunk(struct pv_chunk *pc) +{ + vm_page_t m; + + mtx_lock(&pv_chunks_mutex); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* entire chunk is free, return it */ + m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS((vm_offset_t)pc)); + dump_drop_page(m->phys_addr); + vm_page_unwire(m, PQ_INACTIVE); + vm_page_free(m); +} + +/* + * Returns a new PV entry, allocating a new PV chunk from the system when + * needed. If this PV chunk allocation fails and a PV list lock pointer was + * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is + * returned. + * + * The given PV list lock may be released. + */ +static pv_entry_t +get_pv_entry(pmap_t pmap, struct rwlock **lockp) +{ + int bit, field, idx; + pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); +retry: + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = ffsl(pc->pc_map[field]) - 1; + break; + } + } + if (field < _NPCM) { + idx = field * sizeof(pc->pc_map[field]) * NBBY + bit; + pv = &pc->pc_pventry[idx]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && + pc->pc_map[2] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, + pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); + return (pv); + } + } + /* No free items, allocate another chunk */ + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + if (lockp == NULL) { + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + dump_add_page(m->phys_addr); + pc = (struct pv_chunk *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ + pc->pc_map[1] = PC_FREE1; + pc->pc_map[2] = PC_FREE2; + mtx_lock(&pv_chunks_mutex); + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); + return (pv); +} + +/* + * Returns the number of one bits within the given PV chunk map element. + */ +static inline int +popcount_pc_map_elem(uint64_t elem) +{ + int count; + + /* + * This simple method of counting the one bits performs well because + * the given element typically contains more zero bits than one bits. + */ + count = 0; + for (; elem != 0; elem &= elem - 1) + count++; + return (count); +} + +/* + * Ensure that the number of spare PV entries in the specified pmap meets or + * exceeds the given count, "needed". + * + * The given PV list lock may be released. 
+ */ +static void +reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +{ + struct pch new_tail; + struct pv_chunk *pc; + int avail, free; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(lockp != NULL, ("%s: lockp is NULL", __func__)); + + /* + * Newly allocated PV chunks must be stored in a private list until + * the required number of PV chunks have been allocated. Otherwise, + * reclaim_pv_chunk() could recycle one of these chunks. In + * contrast, these chunks must be added to the pmap upon allocation. + */ + TAILQ_INIT(&new_tail); +retry: + avail = 0; + TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { + free = popcount_pc_map_elem(pc->pc_map[0]); + free += popcount_pc_map_elem(pc->pc_map[1]); + free += popcount_pc_map_elem(pc->pc_map[2]); + if (free == 0) + break; + avail += free; + if (avail >= needed) + break; + } + for (; avail < needed; avail += _NPCPV) { + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + dump_add_page(m->phys_addr); + pc = (struct pv_chunk *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0; + pc->pc_map[1] = PC_FREE1; + pc->pc_map[2] = PC_FREE2; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); + } + if (!TAILQ_EMPTY(&new_tail)) { + mtx_lock(&pv_chunks_mutex); + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); + } +} + +/* + * First find and then remove the pv entry for the specified pmap and virtual + * address from the specified pv list. Returns the pv entry if found and NULL + * otherwise. This operation can be performed on pv lists for either 4KB or + * 2MB page mappings. + */ +static pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + rw_assert(&pvh_global_lock, RA_LOCKED); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + break; + } + } + return (pv); +} + +/* + * After demotion from a 2MB page mapping to 512 4KB page mappings, + * destroy the pv entry for the 2MB page mapping and reinstantiate the pv + * entries for each of the 4KB page mappings. + */ +static void +pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + struct pv_chunk *pc; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + int bit, field; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((pa & PDRMASK) == 0, + ("%s: pa is not 2mpage aligned", __func__)); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the 2mpage's pv entry for this mapping to the first + * page's pv list. Once this transfer begins, the pv list lock + * must not be released until the last pv entry is reinstantiated. + */ + pvh = pa_to_pvh(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("%s: pv not found", __func__)); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + /* Instantiate the remaining NPTEPG - 1 pv entries. 
*/ + PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); + va_last = va + NBPDR - PAGE_SIZE; + for (;;) { + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || + pc->pc_map[2] != 0, ("%s: missing spare", __func__)); + for (field = 0; field < _NPCM; field++) { + while (pc->pc_map[field]) { + bit = ffsl(pc->pc_map[field]) - 1; + pc->pc_map[field] &= ~(1ul << bit); + pv = &pc->pc_pventry[field * 64 + bit]; + va += PAGE_SIZE; + pv->pv_va = va; + m++; + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("%s: page %p is not managed", __func__, m)); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if (va == va_last) + goto out; + } + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } +out: + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); +} + +/* + * After promotion from 512 4KB page mappings to a single 2MB page mapping, + * replace the many pv entries for the 4KB page mappings by a single pv entry + * for the 2MB page mapping. + */ +static void +pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_LOCKED); + KASSERT((pa & PDRMASK) == 0, + ("%s: pa is not 2mpage aligned", __func__)); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the first page's pv entry for this mapping to the 2mpage's + * pv list. Aside from avoiding the cost of a call to get_pv_entry(), + * a transfer avoids the possibility that get_pv_entry() calls + * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the + * mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("%s: pv not found", __func__)); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + /* Free the remaining NPTEPG - 1 pv entries. */ + va_last = va + NBPDR - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} + +/* + * First find and then destroy the pv entry for the specified pmap and virtual + * address. This operation can be performed on pv lists for either 4KB or 2MB + * page mappings. + */ +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found, pa %lx va %lx", + (u_long)VM_PAGE_TO_PHYS(__containerof(pvh, struct vm_page, md)), + (u_long)va)); + free_pv_entry(pmap, pv); +} + +/* + * Conditionally create the pv entry for a 4KB page mapping if the required + * memory can be allocated without resorting to reclamation. 
+ */ +static boolean_t +pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct rwlock **lockp) +{ + pv_entry_t pv; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((pv = get_pv_entry(pmap, NULL)) != NULL) { + pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + return (TRUE); + } else + return (FALSE); +} + +/* + * Conditionally create the PV entry for a 2MB page mapping if the required + * memory can be allocated without resorting to reclamation. + */ +static boolean_t +pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, NULL)) != NULL) { + pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + return (TRUE); + } else + return (FALSE); +} + + +/* + * Fills a page table page with mappings to consecutive physical pages. + */ +static void +pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) +{ + pt_entry_t *pte; + + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { + *pte = newpte; + newpte += (PAGE_SIZE >> TLBLO_PFN_SHIFT); + } +} + +/* + * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page + * mapping is invalidated. + */ +static boolean_t +pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + struct rwlock *lock; + boolean_t rv; + + lock = NULL; + rv = pmap_demote_pde_locked(pmap, pde, va, &lock); + if (lock != NULL) + rw_wunlock(lock); + return (rv); +} + +static boolean_t +pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, + struct rwlock **lockp) +{ + pd_entry_t newpde, oldpde; + pt_entry_t oldpte, *firstpte, newpte; + vm_paddr_t mptepa; + vm_page_t mpte; + struct spglist free; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpde = *pde; + oldpte = (pt_entry_t)oldpde; + KASSERT(pte_is_1m_superpage(&oldpte) && pte_is_valid(&oldpte), + ("%s: oldpde is not superpage and/or valid.", __func__)); + if (pte_is_ref(&oldpte) && (mpte = pmap_lookup_pt_page(pmap, va)) != + NULL) + pmap_remove_pt_page(pmap, mpte); + else { + KASSERT(!pte_test(&oldpte, PTE_W), + ("%s: page table page for a wired mapping is missing", + __func__)); + /* + * Invalidate the 2MB page mapping and return "failure" if the + * mapping was never accessed or the allocation of the new + * page table page fails. If the 2MB page mapping belongs to + * the direct map region of the kernel's address space, then + * the page allocation request specifies the highest possible + * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is + * normal. Page table pages are preallocated for every other + * part of the kernel address space, so the direct map region + * is the only part of the kernel address space that must be + * handled here. + */ + if (!pte_is_ref(&oldpte) || (mpte = vm_page_alloc(NULL, + pmap_pde_pindex(va), (va >= VM_MIN_KERNEL_ADDRESS && va < + VM_MAX_ADDRESS ? 
VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + SLIST_INIT(&free); + pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, + lockp); + pmap_invalidate_range(pmap, + (vm_offset_t)(va & ~PDRMASK), + (vm_offset_t)(va & ~PDRMASK) + NBPDR); + pmap_free_zero_pages(&free); + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return (FALSE); + } + if (va < VM_MAXUSER_ADDRESS) + pmap_resident_count_inc(pmap, 1); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + newpde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(mptepa); + firstpte = newpde; + KASSERT(pte_is_ref(&oldpte), + ("%s: oldpte is not referenced", __func__)); + KASSERT(!pte_test(&oldpte, PTE_RO) && pte_test(&oldpte, PTE_D), + ("%s: oldpte is missing PTE_D", __func__)); + newpte = oldpte & ~PTE_PS_IDX_MASK; + + /* + * If the page table page is new, initialize it. + */ + if (mpte->wire_count == 1) { + mpte->wire_count = NPTEPG; + pmap_fill_ptp(firstpte, newpte); + } + KASSERT(TLBLO_PTE_TO_PA(*firstpte) == TLBLO_PTE_TO_PA(newpte), + ("%s: firstpte and newpte map different physical addresses", + __func__)); + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + if ((*firstpte & PG_PROMOTE_MASK) != (newpte & PG_PROMOTE_MASK)) + pmap_fill_ptp(firstpte, newpte); + + + /* + * The spare PV entries must be reserved prior to demoting the + * mapping, that is, prior to changing the PDE. Otherwise, the state + * of the PDE and the PV lists will be inconsistent, which can result + * in reclaim_pv_chunk() attempting to remove a PV entry from the + * wrong PV list and pmap_pv_demote_pde() failing to find the expected + * PV entry for the 2MB page mapping that is being demoted. + */ + if (pde_test(&oldpde, PTE_MANAGED)) + reserve_pv_entries(pmap, NPTEPG - 1, lockp); + + /* + * Demote the mapping. This pmap is locked. The old PDE has + * PTE_REF set. If the old PDE has PTE_RO clear, it also has + * PTE_D set. Thus, there is no danger of a race with another + * processor changing the setting of PTE_REF and/or PTE_D between + * the read above and the store below. + */ + pmap_update_pde(pmap, va, pde, (pt_entry_t)newpde); + + /* + * Invalidate a stale recursive mapping of the page table page. + */ + if (va >= VM_MAXUSER_ADDRESS) + pmap_invalidate_page(pmap, (vm_offset_t)pmap_pte(pmap, va)); + + /* + * Demote the PV entry. + */ + if (pde_test(&oldpde, PTE_MANAGED)) { + pmap_pv_demote_pde(pmap, va, TLBLO_PDE_TO_PA(oldpde), lockp); + } + atomic_add_long(&pmap_pde_demotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, + pmap); + + return (TRUE); +} + +/* + * pmap_remove_kernel_pde: Remove a kernel superpage mapping. + */ +static void +pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + /* XXX Not doing kernel superpages yet. */ + panic("pmap_remove_kernel_pde: kernel superpage"); +} + +/* + * pmap_remove_pde: do the things to unmap a superpage in a process + */ +static int +pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pd_entry_t oldpde; + vm_offset_t eva, va; + vm_page_t m, mpte; + + /* + * Write back all cache lines from the superpage being unmapped. 
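+ * The whole NBPDR range is written back, i.e. both 1MB halves of the
+ * even/odd TLB pair that make up the 2MB mapping, before the PDE is
+ * cleared below.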
+ */ + mips_dcache_wbinv_range_index((sva & ~PDRMASK), NBPDR); + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("pmap_remove_pde: sva is not 2mpage aligned")); + + if (is_kernel_pmap(pmap)) + oldpde = pde_load_store(pdq, PTE_G); + else + oldpde = pde_load_store(pdq, 0); + if (pde_test(&oldpde, PTE_W)) + pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; + + if (pde_test(&oldpde, PTE_G)) + pmap_invalidate_page(kernel_pmap, sva); + + pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); + if (pde_test(&oldpde, PTE_MANAGED)) { + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, TLBLO_PDE_TO_PA(oldpde)); + pvh = pa_to_pvh(TLBLO_PDE_TO_PA(oldpde)); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(TLBLO_PDE_TO_PA(oldpde)); + va < eva; va += PAGE_SIZE, m++) { + if (pde_test(&oldpde, PTE_D) && !pde_test(&oldpde, PTE_RO)) + vm_page_dirty(m); + if (pde_test(&oldpde, PTE_REF)) + vm_page_aflag_set(m, PGA_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + pmap_remove_kernel_pde(pmap, pdq, sva); + } else { + mpte = pmap_lookup_pt_page(pmap, sva); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pde: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + } + } + return (pmap_unuse_pt(pmap, sva, *pmap_segmap(pmap, sva), free)); +} + +/* + * pmap_remove_pte: do the things to unmap a page in a process + */ +static int +pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, + pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pt_entry_t oldpte; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Write back all cache lines from the page being unmapped. + */ + mips_dcache_wbinv_range_index(va, PAGE_SIZE); + + oldpte = *ptq; + if (is_kernel_pmap(pmap)) + *ptq = PTE_G; + else + *ptq = 0; + + if (pte_test(&oldpte, PTE_W)) + pmap->pm_stats.wired_count -= 1; + + pmap_resident_count_dec(pmap, 1); + if (pte_test(&oldpte, PTE_MANAGED)) { + m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(oldpte)); + if (pte_test(&oldpte, PTE_D)) { + KASSERT(!pte_test(&oldpte, PTE_RO), + ("%s: modified page not writable: va: %p, pte: %#jx", + __func__, (void *)va, (uintmax_t)oldpte)); + vm_page_dirty(m); + } + if (pte_is_ref(&oldpte)) + vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + return (pmap_unuse_pt(pmap, va, ptepde, free)); +} + +/* + * Remove a single page from a process address space + */ +static void +pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free) +{ + struct rwlock *lock; + pd_entry_t *pde; + pt_entry_t *pte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pde = pmap_pde(pmap, va); + if (pde == NULL || *pde == 0) + return; + pte = pmap_pde_to_pte(pde, va); + + /* + * If there is no pte for this address, just skip it! 
+ */ + if (!pte_is_valid(pte)) + return; + + lock = NULL; + (void)pmap_remove_pte(pmap, pte, va, *pde, free, &lock); + if (lock != NULL) + rw_wunlock(lock); + pmap_invalidate_page(pmap, va); +} + +/* + * Remove the given range of addresses from the specified map. + * + * It is assumed that the start and end are properly + * rounded to the page size. + */ +void +pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + struct rwlock *lock; + vm_offset_t va, va_next; + pd_entry_t ptpaddr, *pde, *pdpe; + pt_entry_t *pte; + struct spglist free; + int anyvalid; + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = 0; + SLIST_INIT(&free); + + rw_rlock(&pvh_global_lock); + PMAP_LOCK(pmap); + + /* + * special handling of removing one page. a very common operation + * and easy to short circuit some code. + */ + if ((sva + PAGE_SIZE) == eva) { + pde = pmap_pde(pmap, sva); + if (!pde_is_1m_superpage(pde)) { + pmap_remove_page(pmap, sva, &free); + goto out; + } + } + + lock = NULL; + for (; sva < eva; sva = va_next) { + if (pmap->pm_stats.resident_count == 0) + break; + + pdpe = pmap_segmap(pmap, sva); + if (*pdpe == 0) { + va_next = (sva + NBSEG) & ~SEGMASK; + if (va_next < sva) + va_next = eva; + continue; + } + /* + * Calculate index for next page table. + */ + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; + + pde = pmap_pdpe_to_pde(pdpe, sva); + ptpaddr = *pde; + + /* + * Weed out invalid mappings. + */ + if (ptpaddr == NULL) + continue; + /* + * Check for superpage. + */ + if (pde_is_1m_superpage(&ptpaddr)) { + /* + * Are we removing the entire superpage? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == va_next && eva >= va_next) { + /* + * The TLB entry for a PTE_G mapping is + * invalidated by pmap_remove_pde(). + */ + if (!pde_test(&ptpaddr, PTE_G)) + anyvalid = TRUE; + pmap_remove_pde(pmap, pde, sva, &free, &lock); + continue; + } else if (!pmap_demote_pde_locked(pmap, pde, sva, + &lock)) { + /* The superpage mapping was destroyed. */ + continue; + } else { + ptpaddr = *pde; + } + } + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being removed. + */ + if (va_next > eva) + va_next = eva; + + va = va_next; + for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + if (!pte_is_valid(pte)) { + if (va != va_next) { + pmap_invalidate_range(pmap, va, sva); + va = va_next; + } + continue; + } + if (!pte_test(pte, PTE_G)) + anyvalid = TRUE; + if (va == va_next) + va = sva; + if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, + &lock)) { + sva += PAGE_SIZE; + break; + } + } + if (va != va_next) + pmap_invalidate_range(pmap, va, sva); + } + if (lock != NULL) + rw_wunlock(lock); +out: + if (anyvalid) + pmap_invalidate_all(pmap); + rw_runlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + pmap_free_zero_pages(&free); +} + +/* + * Routine: pmap_remove_all + * Function: + * Removes this physical page from + * all physical maps in which it resides. + * Reflects back modify bits to the pager. + * + * Notes: + * Original versions of this routine were very + * inefficient because they iteratively called + * pmap_remove (slow...) 
+ */ +void +pmap_remove_all(vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv; + pmap_t pmap; + pt_entry_t *pte, tpte; + pd_entry_t *pde; + vm_offset_t va; + struct spglist free; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_all: page %p is not managed", m)); + SLIST_INIT(&free); + rw_wlock(&pvh_global_lock); + if ((m->flags & PG_FICTITIOUS) != 0) + goto small_mappings; + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + va = pv->pv_va; + pde = pmap_pde(pmap, va); + (void)pmap_demote_pde(pmap, pde, va); + PMAP_UNLOCK(pmap); + } +small_mappings: + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + + /* + * If it's last mapping writeback all caches from + * the page being destroyed + */ + if (TAILQ_NEXT(pv, pv_next) == NULL) + mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); + + pmap_resident_count_dec(pmap, 1); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT(pde != NULL && *pde != 0, ("pmap_remove_all: pde")); + KASSERT(!pde_is_superpage(pde), ("pmap_remove_all: found" + " a superpage in page %p 's pv list", m)); + pte = pmap_pde_to_pte(pde, pv->pv_va); + + tpte = *pte; + pte_store(pte, is_kernel_pmap(pmap) ? PTE_G : 0); + + if (pte_test(&tpte, PTE_W)) + pmap->pm_stats.wired_count--; + + /* + * Update the vm_page_t dirty and reference bits. + */ + if (pte_is_ref(&tpte)) + vm_page_aflag_set(m, PGA_REFERENCED); + if (pte_test(&tpte, PTE_D)) { + KASSERT(!pte_test(&tpte, PTE_RO), + ("%s: modified page not writable: va: %p, pte: %#jx" + , __func__, (void *)pv->pv_va, (uintmax_t)tpte)); + vm_page_dirty(m); + } + pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); + pmap_invalidate_page(pmap, pv->pv_va); + + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + free_pv_entry(pmap, pv); + PMAP_UNLOCK(pmap); + } + + vm_page_aflag_clear(m, PGA_WRITEABLE); + rw_wunlock(&pvh_global_lock); + pmap_free_zero_pages(&free); +} + +/* + * pmap_protect_pde: do the things to protect a superpage in a process + */ +static boolean_t +pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) +{ + pd_entry_t newpde, oldpde; + vm_offset_t eva, va; + vm_page_t m; + boolean_t anychanged; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("%s: sva is not 2mpage aligned", __func__)); + anychanged = FALSE; +retry: + oldpde = newpde = *pde; + if (pde_test(&oldpde, PTE_MANAGED)) { + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(TLBLO_PDE_TO_PA(oldpde)); + va < eva; va += PAGE_SIZE, m++) + if (pde_test(&oldpde, PTE_D) && + !pde_test(&oldpde, PTE_RO)) + vm_page_dirty(m); + } + if ((prot & VM_PROT_WRITE) == 0) { + pde_set(&newpde, PTE_RO); + pde_clear(&newpde, PTE_D); + } + if (newpde != oldpde) { + if (!atomic_cmpset_long((pt_entry_t *)pde, (pt_entry_t)oldpde, + (pt_entry_t)newpde)) + goto retry; + if (pde_test(&oldpde, PTE_G)) + pmap_invalidate_page(pmap, sva); + else + anychanged = TRUE; + } + return (anychanged); +} + +/*- + * Set the physical protection on the + * specified range of this map as requested. 
+ */ +void +pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) +{ + vm_offset_t va, va_next; + pd_entry_t *pde, *pdpe; + pt_entry_t *pte; + boolean_t anychanged, pv_lists_locked; + + if ((prot & VM_PROT_READ) == VM_PROT_NONE) { + pmap_remove(pmap, sva, eva); + return; + } + if (prot & VM_PROT_WRITE) + return; + + pv_lists_locked = FALSE; +resume: + anychanged = FALSE; + + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + pdpe = pmap_segmap(pmap, sva); + if (*pdpe == 0) { + va_next = (sva + NBSEG) & ~SEGMASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; + + pde = pmap_pdpe_to_pde(pdpe, sva); + + /* + * Weed out invalid mappings. + */ + if (*pde == NULL) + continue; + + /* + * Check for superpage. + */ + if (pde_is_1m_superpage(pde)) { + /* + * Are we protecting the entire superpage? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == va_next && eva >= va_next) { + /* + * The TLB entry for a PG_G mapping is + * invalidated by pmap_protect_pde(). + */ + if (pmap_protect_pde(pmap, pde, sva, prot)) + anychanged = TRUE; + continue; + } else { + if (!pv_lists_locked) { + pv_lists_locked = TRUE; + if (!rw_try_rlock(&pvh_global_lock)) { + if (anychanged) + pmap_invalidate_all( + pmap); + PMAP_UNLOCK(pmap); + rw_rlock(&pvh_global_lock); + goto resume; + } + } + if (!pmap_demote_pde(pmap, pde, sva)) { + /* + * The superpage mapping was destroyed. + */ + continue; + } + } + } + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being write protected. + */ + if (va_next > eva) + va_next = eva; + + va = va_next; + for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + pt_entry_t pbits; + vm_page_t m; + vm_paddr_t pa; + + pbits = *pte; + if (!pte_is_valid(&pbits) || pte_test(&pbits, PTE_RO)) { + if (va != va_next) { + pmap_invalidate_range(pmap, va, sva); + va = va_next; + } + continue; + } + pte_set(&pbits, PTE_RO); + if (pte_test(&pbits, PTE_D)) { + pte_clear(&pbits, PTE_D); + if (pte_test(&pbits, PTE_MANAGED)) { + pa = TLBLO_PTE_TO_PA(pbits); + m = PHYS_TO_VM_PAGE(pa); + vm_page_dirty(m); + } + if (va == va_next) + va = sva; + } else { + /* + * Unless PTE_D is set, any TLB entries + * mapping "sva" don't allow write access, so + * they needn't be invalidated. + */ + if (va != va_next) { + pmap_invalidate_range(pmap, va, sva); + va = va_next; + } + } + *pte = pbits; + } + if (va != va_next) + pmap_invalidate_range(pmap, va, sva); + } + if (anychanged) + pmap_invalidate_all(pmap); + if (pv_lists_locked) { + rw_runlock(&pvh_global_lock); + } + PMAP_UNLOCK(pmap); +} + +/*- + * Tries to promote the 512, contiguous 4KB page mappings that are within a + * single page table page to a single 2MB page mapping. For promotion to + * occur, two conditions must be met: (1) the 4KB page mappings must map + * aligned, contiguous physical memory and (2) the 4KB page mappings must have + * identical characteristics. + * + * On MIPS64 promotions are actually done in pairs of two 1MB superpages + * because of the hardware architecture (two physical pages are in a single + * TLB entry) even though VM layer is under the impression that the superpage + * size is actually 2MB. 
+ */ +static void +pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, + struct rwlock **lockp) +{ + pt_entry_t newpde; + pt_entry_t *firstpte, oldpte, pa, *pte; + vm_offset_t oldpteva; + vm_page_t mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* + * Examine the first PTE in the specified PTP. Abort if this PTE is + * either invalid, unused, or does not map the first 4KB physical page + * within a 2MB page. + */ + firstpte = pmap_pte(pmap, trunc_2mpage(va)); +setpde: + newpde = *firstpte; + if (is_kernel_pmap(pmap)) { + /* XXX not doing kernel pmap yet */ + atomic_add_long(&pmap_pde_p_failures, 1); + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return; + } + if (!pte_is_valid(&newpde) || !pte_is_ref(&newpde)) { + atomic_add_long(&pmap_pde_p_failures, 1); + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return; + } + if (!pte_test(&newpde, PTE_D) && !pte_test(&newpde, PTE_RO)) { + /* + * When PTE_D is already clear, PTE_RO can be set without + * a TLB invalidation. + */ + if (!atomic_cmpset_long((u_long *)firstpte, newpde, newpde | + PTE_RO)) + goto setpde; + newpde |= PTE_RO; + } + + /* + * Examine each of the other PTEs in the specified PTP. Abort if this + * PTE maps an unexpected 4KB physical page or does not have identical + * characteristics to the first PTE. + */ + pa = TLBLO_PTE_TO_PA(newpde) + NBPDR - PAGE_SIZE; + for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { +setpte: + oldpte = *pte; + if (TLBLO_PTE_TO_PA(oldpte) != pa) { + atomic_add_long(&pmap_pde_p_failures, 1); + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return; + } + if (!pte_test(&oldpte, PTE_D) && !pte_test(&oldpte, PTE_RO)) { + if (!atomic_cmpset_long(pte, oldpte, oldpte | PTE_RO)) + goto setpte; + oldpte |= PTE_RO; + oldpteva = (va & ~PDRMASK) | + (TLBLO_PTE_TO_PA(oldpte) & PDRMASK); + CTR3(KTR_PMAP, "%s: protect for va %#lx in pmap %p", + __func__, oldpteva, pmap); + } + if ((oldpte & PG_PROMOTE_MASK) != (newpde & PG_PROMOTE_MASK)) { + atomic_add_long(&pmap_pde_p_failures, 1); + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return; + } + pa -= PAGE_SIZE; + } + + /* + * Save the page table page in its current state until the PDE + * mapping the superpage is demoted by pmap_demote_pde() or + * destroyed by pmap_remove_pde(). + */ + mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pde)); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("%s: page table page is out of range", __func__)); + KASSERT(mpte->pindex == pmap_pde_pindex(va), + ("%s: page table page's pindex is wrong", __func__)); + if (pmap_insert_pt_page(pmap, mpte)) { + atomic_add_long(&pmap_pde_p_failures, 1); + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return; + } + + /* + * Promote the pv entries. + */ + if (pte_test(&newpde, PTE_MANAGED)) + pmap_pv_promote_pde(pmap, va, TLBLO_PTE_TO_PA(newpde), lockp); + + /* + * Map the superpage. + */ + pmap_update_pde(pmap, va, pde, newpde | PTE_PS_1M); + + atomic_add_long(&pmap_pde_promotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, + pmap); +} + +/* + * Insert the given physical page (p) at + * the specified virtual address (v) in the + * target physical map with the protection requested. + * + * If specified, the page will be wired down, meaning + * that the related pte can not be reclaimed. 
+ * + * NB: This is the only routine which MAY NOT lazy-evaluate + * or lose information. That is, this routine must actually + * insert this page into the given map NOW. + */ +int +pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + u_int flags, int8_t psind __unused) +{ + struct rwlock *lock; + vm_paddr_t pa, opa; + pd_entry_t *pde; + pt_entry_t *pte; + pt_entry_t origpte, newpte; + pv_entry_t pv; + vm_page_t mpte, om; + boolean_t nosleep; + + va = trunc_page(va); + KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || + va >= kmi.clean_eva, + ("pmap_enter: managed mapping within the clean submap")); + if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) + VM_OBJECT_ASSERT_LOCKED(m->object); + + mpte = NULL; + + lock = NULL; + rw_rlock(&pvh_global_lock); + PMAP_LOCK(pmap); + + /* + * In the case that a page table page is not resident, we are + * creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. + */ + nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; + mpte = pmap_allocpte(pmap, va, nosleep ? NULL : &lock); + if (mpte == NULL) { + KASSERT(nosleep != 0, + ("pmap_allocpte failed with sleep allowed")); + if (lock != NULL) + rw_wunlock(lock); + rw_runlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + return (KERN_RESOURCE_SHORTAGE); + } + } + pde = pmap_pde(pmap, va); + if (pde_is_1m_superpage(pde)) { + panic("%s: attempted pmap_enter on superpage", __func__); + } + pte = pmap_pde_to_pte(pde, va); + + /* + * Page Directory table entry not valid, we need a new PT page + */ + if (pte == NULL) { + panic("pmap_enter: invalid page directory, pdir=%p, va=%p", + (void *)pmap->pm_segtab, (void *)va); + } + + pa = VM_PAGE_TO_PHYS(m); + om = NULL; + origpte = *pte; + opa = TLBLO_PTE_TO_PA(origpte); + + newpte = TLBLO_PA_TO_PFN(pa) | init_pte_prot(m, flags, prot); + /* + * pmap_enter() is called during a fault or simulated fault so + * set the reference bit now to avoid a fault. + */ + pte_ref_set(&newpte); + if ((flags & PMAP_ENTER_WIRED) != 0) + newpte |= PTE_W; + if (is_kernel_pmap(pmap)) + newpte |= PTE_G; + if (is_cacheable_mem(pa)) { + if (m->md.pv_memattr == VM_MEMATTR_UNCACHEABLE) + newpte |= PTE_C_UNCACHED; + else + newpte |= PTE_C_CACHE; + } else + newpte |= PTE_C_UNCACHED; +#ifdef CPU_CHERI + if ((flags & PMAP_ENTER_NOLOADTAGS) != 0) + newpte |= PTE_LC; + if ((flags & PMAP_ENTER_NOSTORETAGS) != 0) + newpte |= PTE_SC; +#endif + + /* + * Set modified bit gratuitously for writeable mappings if + * the page is unmanaged. We do not want to take a fault + * to do the dirty bit emulation for these mappings. + */ + if ((m->oflags & VPO_UNMANAGED) != 0) { + if (!pte_test(&newpte, PTE_RO)) + newpte |= PTE_D; + } + + /* + * Mapping has not changed, must be protection or wiring change. + */ + if (pte_is_valid(&origpte) && opa == pa) { + /* + * Wiring change, just update stats. We don't worry about + * wiring PT pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is + * wired, the PT page will be also. 
+ */ + if (pte_test(&newpte, PTE_W) && !pte_test(&origpte, PTE_W)) + pmap->pm_stats.wired_count++; + else if (!pte_test(&newpte, PTE_W) && pte_test(&origpte, + PTE_W)) + pmap->pm_stats.wired_count--; + + KASSERT(!pte_test(&origpte, PTE_D | PTE_RO), + ("%s: modified page not writable: va: %p, pte: %#jx", + __func__, (void *)va, (uintmax_t)origpte)); + + /* + * Remove the extra PT page reference + */ + if (mpte != NULL) { + mpte->wire_count--; + KASSERT(mpte->wire_count > 0, + ("pmap_enter: missing reference to page table page," + " va: 0x%lx", va)); + } + if (pte_test(&origpte, PTE_MANAGED)) { + om = m; + newpte |= PTE_MANAGED; + if (!pte_test(&newpte, PTE_RO)) + vm_page_aflag_set(m, PGA_WRITEABLE); + } + goto validate; + } + + pv = NULL; + + /* + * Mapping has changed, invalidate old range and fall through to + * handle validating new mapping. + */ + if (opa) { + if (pte_test(&origpte, PTE_W)) + pmap->pm_stats.wired_count--; + + if (pte_test(&origpte, PTE_MANAGED)) { + om = PHYS_TO_VM_PAGE(opa); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om); + pv = pmap_pvh_remove(&om->md, pmap, va); + } + if (mpte != NULL) { + mpte->wire_count--; + KASSERT(mpte->wire_count > 0, + ("pmap_enter: missing reference to page table page," + " va: %p", (void *)va)); + } + } else + pmap_resident_count_inc(pmap, 1); + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->oflags & VPO_UNMANAGED) == 0) { + newpte |= PTE_MANAGED; + /* Insert Entry */ + if (pv == NULL) + pv = get_pv_entry(pmap, &lock); + pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if (!pte_test(&newpte, PTE_RO)) + vm_page_aflag_set(m, PGA_WRITEABLE); + } else if (pv != NULL) + free_pv_entry(pmap, pv); + + + /* + * Increment counters + */ + if (pte_test(&newpte, PTE_W)) + pmap->pm_stats.wired_count++; + +validate: +#ifdef PMAP_DEBUG + printf("pmap_enter: va: %p -> pa: %p\n", (void *)va, (void *)pa); +#endif + /* + * if the mapping or permission bits are different, we need to + * update the pte. + */ + if ((origpte & ~ (PTE_D|PTE_REF)) != newpte) { + newpte |= PTE_VR; + if ((flags & VM_PROT_WRITE) != 0) + newpte |= PTE_D; + if (pte_is_valid(&origpte)) { + boolean_t invlva = FALSE; + + origpte = pte_load_store(pte, newpte); + if (pte_is_ref(&origpte)) { + if (pte_test(&origpte, PTE_MANAGED)) + vm_page_aflag_set(om, PGA_REFERENCED); + if (opa != pa) + invlva = TRUE; + } + if (pte_test(&origpte, PTE_D) && + !pte_test(&origpte, PTE_RO)) { + if (pte_test(&origpte, PTE_MANAGED)) + vm_page_dirty(om); + if ((prot & VM_PROT_WRITE) == 0) + invlva = TRUE; + } + if (pte_test(&origpte, PTE_MANAGED) && + TAILQ_EMPTY(&om->md.pv_list) && + ((om->flags & PG_FICTITIOUS) != 0 || + TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) + vm_page_aflag_clear(om, PGA_WRITEABLE); + if (invlva) + pmap_invalidate_page(pmap, va); + } else + pte_store(pte, newpte); + } + + /* + * If both the page table page and the reservation are fully + * populated, then attempt promotion. + */ + if ((mpte == NULL || mpte->wire_count == NPTEPG) && + (m->flags & PG_FICTITIOUS) == 0 && + pg_sp_enabled && vm_reserv_level_iffullpop(m) == 0) + pmap_promote_pde(pmap, pde, va, &lock); + + /* + * Sync I & D caches for executable pages. Do this only if the + * target pmap belongs to the current process. Otherwise, an + * unresolvable TLB miss may occur. 
+ */ + if (!is_kernel_pmap(pmap) && (pmap == &curproc->p_vmspace->vm_pmap) && + (prot & VM_PROT_EXECUTE)) { + mips_icache_sync_range(va, PAGE_SIZE); + mips_dcache_wbinv_range(va, PAGE_SIZE); + } + if (lock != NULL) + rw_wunlock(lock); + rw_runlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + return (KERN_SUCCESS); +} + +/* + * this code makes some *MAJOR* assumptions: + * 1. Current pmap & pmap exists. + * 2. Not wired. + * 3. Read access. + * 4. No page table pages. + * but is *MUCH* faster than pmap_enter... + */ +void +pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) +{ + struct rwlock *lock; + + lock = NULL; + rw_rlock(&pvh_global_lock); + PMAP_LOCK(pmap); + (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); + if (lock != NULL) + rw_wunlock(lock); + rw_runlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); +} + +static vm_page_t +pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) +{ + pt_entry_t *pte, newpte; + vm_paddr_t pa; + struct spglist free; + + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || + (m->oflags & VPO_UNMANAGED) != 0, + ("%s: managed mapping within the clean submap", __func__)); + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * In the case that a page table page is not resident, we are + * creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + pd_entry_t *pde; + unsigned ptepindex; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_pde_pindex(va); + if (mpte && (mpte->pindex == ptepindex)) { + mpte->wire_count++; + } else { + /* + * Get the page directory entry + */ + pde = pmap_pde(pmap, va); + + /* + * If the page table page is mapped, we just + * increment the hold count, and activate it. + */ + if (pde && *pde != 0) { + if (pde_is_1m_superpage(pde)) + return (NULL); + mpte = PHYS_TO_VM_PAGE( + MIPS_DIRECT_TO_PHYS(*pde)); + mpte->wire_count++; + } else { + /* + * Pass NULL instead of the PV list lock + * pointer, because we don't intend to sleep. + */ + mpte = _pmap_allocpte(pmap, ptepindex, NULL); + if (mpte == NULL) + return (mpte); + } + } + } else { + mpte = NULL; + } + + pte = pmap_pte(pmap, va); + if (pte_is_valid(pte)) { + if (mpte != NULL) { + mpte->wire_count--; + mpte = NULL; + } + return (mpte); + } + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->oflags & VPO_UNMANAGED) == 0 && + !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { + if (mpte != NULL) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, mpte, &free)) { + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(&free); + } + + mpte = NULL; + } + return (mpte); + } + + /* + * Increment counters + */ + pmap_resident_count_inc(pmap, 1); + + pa = VM_PAGE_TO_PHYS(m); + + /* + * Now validate mapping with RO protection + */ + newpte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_VALID; + if ((m->oflags & VPO_UNMANAGED) == 0) + newpte |= PTE_MANAGED; + + if (is_cacheable_mem(pa)) { + if (m->md.pv_memattr == VM_MEMATTR_UNCACHEABLE) + newpte |= PTE_C_UNCACHED; + else + newpte |= PTE_C_CACHE; + } else + newpte |= PTE_C_UNCACHED; + + sched_pin(); + if (is_kernel_pmap(pmap)) { + newpte |= PTE_G; + pte_ref_set(&newpte); + pte_store(pte, newpte); + } else { + pte_store(pte, newpte); + /* + * Sync I & D caches. Do this only if the target pmap + * belongs to the current process. Otherwise, an + * unresolvable TLB miss may occur. 
+ */ + if (pmap == &curproc->p_vmspace->vm_pmap) { + va &= ~PAGE_MASK; + mips_icache_sync_range(va, PAGE_SIZE); + mips_dcache_wbinv_range(va, PAGE_SIZE); + } + } + sched_unpin(); + + return (mpte); +} + +/* + * Make a temporary mapping for a physical address. This is only intended + * to be used for panic dumps. + * + * Use XKPHYS for 64 bit. + */ +void * +pmap_kenter_temporary(vm_paddr_t pa, int i) +{ + + if (i != 0) + printf("%s: ERROR!!! More than one page of virtual address " + "mapping not supported\n", __func__); + + return ((void *)MIPS_PHYS_TO_DIRECT(pa)); +} + +void +pmap_kenter_temporary_free(vm_paddr_t pa) +{ + + /* nothing to do for mips64 */ + return; +} + +static vm_page_t +pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t pdpindex, ptepindex; + pd_entry_t *pdpe; + vm_page_t mpte = NULL; + + if (va < VM_MAXUSER_ADDRESS) { +retry: + pdpe = pmap_segmap(pmap, va); + if (pdpe != NULL && (*pdpe != NULL)) { + /* Add a reference to the pd page. */ + mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdpe)); + mpte->wire_count++; + } else { + /* Allocate a pd page. */ + + /* Calculate pagetable page index. */ + ptepindex = pmap_pde_pindex(va); + pdpindex = ptepindex >> NPDEPGSHIFT; /* XXX */ + mpte = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + if (mpte == NULL && lockp != NULL) + goto retry; + } + } + return (mpte); +} + +/* + * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE + * otherwise. Fails if (1) a page table page cannot be allocated without + * blocking, (2) a mapping already exists at the specified virtual address, or + * (3) a pv entry cannot be allocated without reclaiming another pv entry. + */ +static boolean_t +pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + struct rwlock **lockp) +{ + pd_entry_t *pde; + pt_entry_t newpde; + vm_page_t mpde; + struct spglist free; + vm_paddr_t pa; + + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + if (is_kernel_pmap(pmap)) { + /* Not doing the kernel pmap for now */ + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p: kernel map", + __func__, va, pmap); + return (FALSE); + } + if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return (FALSE); + } + /* pde = pmap_pde(pmap, va); */ + pde = (pd_entry_t *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(mpde)); + pde = &pde[pmap_pde_index(va)]; + if (pde == NULL) { + KASSERT(mpde->wire_count > 1, + ("%s: mpde's wire count is too low", __func__)); + mpde->wire_count--; + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, + va, pmap); + return (FALSE); + } + pa = VM_PAGE_TO_PHYS(m); + newpde = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_VALID | PTE_PS_1M; + if (is_cacheable_mem(pa)) { + if (m->md.pv_memattr == VM_MEMATTR_UNCACHEABLE) + newpde |= PTE_C_UNCACHED; + else + newpde |= PTE_C_CACHE; + } else + newpde |= PTE_C_UNCACHED; + if ((m->oflags & VPO_UNMANAGED) == 0) { + newpde |= PTE_MANAGED; + + /* + * Abort this mapping if its PV entry could not be created. + */ + if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), + lockp)) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, mpde, &free)) { + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(&free); + } + CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", + __func__, va, pmap); + return (FALSE); + } + } + + /* + * Increment counters. + */ + pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); + + /* + * Map the superpage. 
+ */ + sched_pin(); + pde_store(pde, newpde); + + /* + * Sync I & D caches for executable pages. Do this only if the + * target pmap belongs to the current process. Otherwise, an + * unresolvable TLB miss may occur. + */ + if (!is_kernel_pmap(pmap) && (pmap == &curproc->p_vmspace->vm_pmap) && + (prot & VM_PROT_EXECUTE)) { + va &= ~PDRMASK; + mips_icache_sync_range(va, NBPDR); + mips_dcache_wbinv_range(va, NBPDR); + + } + sched_unpin(); + + atomic_add_long(&pmap_pde_mappings, 1); + CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, + pmap); + return (TRUE); +} + +/* + * Maps a sequence of resident pages belonging to the same object. + * The sequence begins with the given page m_start. This page is + * mapped at the given virtual address start. Each subsequent page is + * mapped at a virtual address that is offset from start by the same + * amount as the page is offset from m_start within the object. The + * last page in the sequence is the page with the largest offset from + * m_start that can be mapped at a virtual address less than the given + * virtual address end. Not every virtual page between start and end + * is mapped; only those for which a resident page exists with the + * corresponding offset from m_start are mapped. + */ +void +pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, + vm_page_t m_start, vm_prot_t prot) +{ + struct rwlock *lock; + vm_offset_t va; + vm_page_t m, mpte; + vm_pindex_t diff, psize; + + VM_OBJECT_ASSERT_LOCKED(m_start->object); + + psize = atop(end - start); + mpte = NULL; + m = m_start; + lock = NULL; + rw_rlock(&pvh_global_lock); + PMAP_LOCK(pmap); + while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { + va = start + ptoa(diff); + if ((va & PDRMASK) == 0 && va + NBPDR <= end && + m->psind == 1 && pg_sp_enabled && + pmap_enter_pde(pmap, va, m, prot, &lock)) + m = &m[NBPDR / PAGE_SIZE - 1]; + else + mpte = pmap_enter_quick_locked(pmap, va, m, prot, + mpte, &lock); + m = TAILQ_NEXT(m, listq); + } + if (lock != NULL) + rw_wunlock(lock); + rw_runlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); +} + +/* + * pmap_object_init_pt preloads the ptes for a given object + * into the specified pmap. This eliminates the blast of soft + * faults on process startup and immediately after a mmap(). + * + * This code maps large physical mmap regions into the + * processor address space. Note that some shortcuts + * are taken, but the code works. + */ +void +pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, + vm_pindex_t pindex, vm_size_t size) +{ + pd_entry_t *pde; + vm_paddr_t pa, ptepa; + vm_page_t p, pdpg; + vm_memattr_t memattr; + + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, + ("pmap_object_init_pt: non-device object")); + + if (is_kernel_pmap(pmap)) { + /* Not doing the kernel pmap for now. */ + return; + } + + if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { + if (!pg_sp_enabled) + return; + if (!vm_object_populate(object, pindex, pindex + atop(size))) + return; + p = vm_page_lookup(object, pindex); + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("%s: invalid page %p", __func__, p)); + memattr = p->md.pv_memattr; + + /* + * Abort the mapping if the first page is not physically + * aligned to a 2MB page boundary. + */ + ptepa = VM_PAGE_TO_PHYS(p); + if (ptepa & (NBPDR - 1)) + return; + + /* + * Skip the first page. Abort the mapping if the rest of + * the pages are not physically contiguous or have differing + * memory attributes. 
+ */ + p = TAILQ_NEXT(p, listq); + for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; + pa += PAGE_SIZE) { + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("%s: invalid page %p", __func__, p)); + if (pa != VM_PAGE_TO_PHYS(p) || + memattr != p->md.pv_memattr) + return; + p = TAILQ_NEXT(p, listq); + } + + /* + * Map using 2MB pages. "ptepa" is 2M aligned and "size" + * is a multiple of 2M. + */ + PMAP_LOCK(pmap); + for (pa = ptepa; pa < ptepa + size; pa += NBPDR) { + pdpg = pmap_allocpde(pmap, addr, NULL); + if (pdpg == NULL) { + /* + * The creation of mappings below is only an + * optimization. If a page directory page + * cannot be allocated without blocking, + * continue on to the next mapping rather than + * blocking. + */ + addr += NBPDR; + continue; + } + pde = (pd_entry_t *)MIPS_PHYS_TO_DIRECT( + VM_PAGE_TO_PHYS(pdpg)); + pde = &pde[pmap_pde_index(addr)]; + if (!pte_is_valid((pt_entry_t *)pde)) { + pt_entry_t newpte = TLBLO_PA_TO_PFN(pa) | + PTE_PS_1M | PTE_D | PTE_REF | PTE_VALID; + + if (is_cacheable_mem(pa)) { + if (pdpg->md.pv_memattr == + VM_MEMATTR_UNCACHEABLE) + newpte |= PTE_C_UNCACHED; + else + newpte |= PTE_C_CACHE; + } else + newpte |= PTE_C_UNCACHED; + + pde_store(pde, newpte); + pmap_resident_count_inc(pmap, NBPDR/PAGE_SIZE); + atomic_add_long(&pmap_pde_mappings, 1); + } else { + /* Continue on if the PDE is already valid. */ + pdpg->wire_count--; + KASSERT(pdpg->wire_count > 0, + ("%s: missing reference to page directory " + "page, va: 0x%lx", __func__, addr)); + } + addr += NBPDR; + } + PMAP_UNLOCK(pmap); + } +} + +/* + * Clear the wired attribute from the mappings for the specified range of + * addresses in the given pmap. Every valid mapping within that range + * must have the wired attribute set. In contrast, invalid mappings + * cannot have the wired attribute set, so they are ignored. + * + * The wired attribute of the page table entry is not a hardware feature, + * so there is no need to invalidate any TLB entries. + */ +void +pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + pd_entry_t *pde, *pdpe; + pt_entry_t *pte; + vm_offset_t va_next; + boolean_t pv_lists_locked; + + pv_lists_locked = FALSE; +resume: + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + pdpe = pmap_segmap(pmap, sva); + if (*pdpe == NULL) { + va_next = (sva + NBSEG) & ~SEGMASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; + pde = pmap_pdpe_to_pde(pdpe, sva); + if (*pde == NULL) + continue; + if (pde_is_1m_superpage(pde)) { + if (!pde_test(pde, PTE_W)) + panic("pmap_unwire: pde %#jx is missing PTE_W", + (uintmax_t)*pde); + /* + * Are we unwiring the entire superpage? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == va_next && eva >= va_next) { + atomic_clear_long((pt_entry_t *)pde, PTE_W); + pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; + continue; + } else { + if (!pv_lists_locked) { + pv_lists_locked = TRUE; + if (!rw_try_rlock(&pvh_global_lock)) { + PMAP_UNLOCK(pmap); + rw_rlock(&pvh_global_lock); + /* Repeat sva. */ + goto resume; + } + } + if (!pmap_demote_pde(pmap, pde, sva)) + panic("pmap_unwire: demotion failed"); + } + } + if (va_next > eva) + va_next = eva; + for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + if (!pte_is_valid(pte)) + continue; + if (!pte_test(pte, PTE_W)) + panic("pmap_unwire: pte %#jx is missing PG_W", + (uintmax_t)*pte); + /* + * PTE_W must be cleared atomically. 
Although the pmap + * lock synchronizes access to PTE_W, another processor + * could be setting PTE_D and/or PTE_REF concurrently. + */ + pte_atomic_clear(pte, PTE_W); + pmap->pm_stats.wired_count--; + } + } + if (pv_lists_locked) { + rw_runlock(&pvh_global_lock); + } + PMAP_UNLOCK(pmap); +} + +/* + * Copy the range specified by src_addr/len + * from the source map to the range dst_addr/len + * in the destination map. + * + * This routine is only advisory and need not do anything. + */ + +void +pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, + vm_size_t len, vm_offset_t src_addr) +{ +#if 0 +/* + * XXX This doesn't help with fork() performance and + * adds more overhead. Maybe the reference bit emulation + * is causing fault-like overhead anyway? + */ + + struct rwlock *lock; + struct spglist free; + vm_offset_t addr, end_addr, va_next; + + if (dst_addr != src_addr) + return; + + if (PCPU_GET(curpmap) != src_pmap) + return; + + end_addr = src_addr + len; + + lock = NULL; + rw_rlock(&pvh_global_lock); + /* Lock the pmaps in the same order to avoid deadlock. */ + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + + for (addr = src_addr; addr < end_addr; addr = va_next) { + pt_entry_t *src_pte, *dst_pte; + vm_page_t dstmpde, dstmpte, srcmpte; + pd_entry_t *src_pdpe, *src_pde, *dst_pde; + pt_entry_t srcpte; + vm_paddr_t srcpaddr; + vm_page_t m; + + + src_pdpe = pmap_segmap(src_pmap, addr); + if (src_pdpe == NULL || *src_pdpe == 0) { + va_next = (addr + NBSEG) & ~SEGMASK; + /* + * If the next va is out of the copy range then set + * it to end_addr in order to copy all mappings until + * given limit. + */ + if (va_next < addr) + va_next = end_addr; + continue; + } + + va_next = (addr + NBPDR) & ~PDRMASK; + if (va_next < addr) + va_next = end_addr; + + src_pde = pmap_pdpe_to_pde(src_pdpe, addr); + if (src_pde == NULL || *src_pde == 0) + continue; + srcpte = (pt_entry_t)*src_pde; + + if (pte_is_1m_superpage(&srcpte)) { + /* Copy superpage pde. */ + if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) + continue; + dstmpde = pmap_allocpde(dst_pmap, addr, NULL); + if (dstmpde == NULL) + break; + + /* dst_pde = pmap_pde(dst_pmap, addr); */ + dst_pde = (pd_entry_t *)MIPS_PHYS_TO_DIRECT( + VM_PAGE_TO_PHYS(dstmpde)); + dst_pde = &dst_pde[pmap_pde_index(addr)]; + + if (*dst_pde == 0 && + (!pte_test(&srcpte, PTE_MANAGED) || + pmap_pv_insert_pde(dst_pmap, addr, + TLBLO_PTE_TO_PA(srcpte), &lock))) { + *dst_pde = (pd_entry_t)(srcpte & ~PTE_W); + pmap_resident_count_inc(dst_pmap, NBPDR / + PAGE_SIZE); + } else + dstmpde->wire_count--; + continue; + } + + srcpaddr = MIPS_DIRECT_TO_PHYS(*src_pde); + srcmpte = PHYS_TO_VM_PAGE(srcpaddr); + KASSERT(srcmpte->wire_count > 0, + ("pmap_copy: source page table page is unused")); + + /* + * Limit our scan to either the end of the vaddr represented + * by the source page table page, or to the end of the range + * being copied. + */ + if (va_next > end_addr) + va_next = end_addr; + + /* + * Walk the source page table entries and copy the managed + * entries. 
+ */ + + /* src_pte = pmap_pde_to_pte(src_pde, addr); */ + src_pte = (pt_entry_t *)MIPS_PHYS_TO_DIRECT(srcpaddr); + src_pte = &src_pte[pmap_pte_index(addr)]; + + if (src_pte == NULL || *src_pte == 0) + continue; + + dstmpte = NULL; + while (addr < va_next) { + unsigned pdepindex; + pt_entry_t ptetemp; + + + ptetemp = *src_pte; + + /* + * we only virtual copy managed pages + */ + if (pte_test(&ptetemp, PTE_MANAGED)) { + /* Calculate pagetable page index */ + pdepindex = pmap_pde_pindex(addr); + + /* Get the page directory entry. */ + dst_pde = pmap_pde(dst_pmap, addr); + + if (dst_pde != NULL && *dst_pde != 0) { + dstmpte = PHYS_TO_VM_PAGE( + MIPS_DIRECT_TO_PHYS(*dst_pde)); + } else + dstmpte = NULL; + + if (dstmpte != NULL && + dstmpte->pindex == pdepindex) { + /* + * The page table is mapped so just + * increment the hold count. + */ + dstmpte->wire_count++; + } else { + /* + * The page table isn't mapped, or it + * has been deallocated. + */ + dstmpte = pmap_allocpte(dst_pmap, + addr, NULL); + + /* + * If we are having memory alloc issues + * then abandon trying to copy the page + * tables. + */ + if (dstmpte == NULL) + goto out; + } + /* + * Now that we have a page directory, get the + * pte. + */ + + /* dst_pte = pmap_pte(dst_pmap, addr); */ + dst_pte = (pt_entry_t *) + MIPS_PHYS_TO_DIRECT( + VM_PAGE_TO_PHYS(dstmpte)); + dst_pte = &dst_pte[pmap_pte_index(addr)]; + + /* Try and insert the pv_entry. */ + m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(ptetemp)); + if (*dst_pte == 0 && + pmap_try_insert_pv_entry(dst_pmap, addr, m, + &lock)) { + /* + * Populate the entry. + * + * Clear the wired, modified, and + * accessed (referenced) bits + * during the copy. + */ + pte_clear(&ptetemp, PTE_W | PTE_D | + PTE_REF); + *dst_pte = ptetemp; + /* Update stats. */ + pmap_resident_count_inc(dst_pmap, 1); + } else { + SLIST_INIT(&free); + if (pmap_unwire_ptp(dst_pmap, addr, + dstmpte, &free)) { + pmap_invalidate_page(dst_pmap, + addr); + pmap_free_zero_pages(&free); + } + goto out; + } + /* Check the wire_count to see if we're done. */ + if (dstmpte->wire_count >= srcmpte->wire_count) + break; + } + addr += PAGE_SIZE; + src_pte++; + } + } +out: + if (lock != NULL) + rw_wunlock(lock); + rw_runlock(&pvh_global_lock); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); +#endif /* #if 0 */ +} + +/* + * pmap_zero_page zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. + * + * Use XKPHYS for 64 bit. + */ +void +pmap_zero_page(vm_page_t m) +{ + vm_offset_t va; + vm_paddr_t phys = VM_PAGE_TO_PHYS(m); + + va = MIPS_PHYS_TO_DIRECT(phys); + sched_pin(); + bzero((caddr_t)va, PAGE_SIZE); + mips_dcache_wbinv_range(va, PAGE_SIZE); + sched_unpin(); +} + +/* + * pmap_zero_page_area zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. + * + * off and size may not cover an area beyond a single hardware page. 
+ */ +void +pmap_zero_page_area(vm_page_t m, int off, int size) +{ + vm_offset_t va; + vm_paddr_t phys = VM_PAGE_TO_PHYS(m); + + va = MIPS_PHYS_TO_DIRECT(phys); + sched_pin(); + bzero((char *)(caddr_t)va + off, size); + mips_dcache_wbinv_range(va + off, size); + sched_unpin(); +} + +void +pmap_zero_page_idle(vm_page_t m) +{ + vm_offset_t va; + vm_paddr_t phys = VM_PAGE_TO_PHYS(m); + + va = MIPS_PHYS_TO_DIRECT(phys); + sched_pin(); + bzero((caddr_t)va, PAGE_SIZE); + mips_dcache_wbinv_range(va, PAGE_SIZE); + sched_unpin(); +} + +/* + * pmap_copy_page copies the specified (machine independent) + * page by mapping the page into virtual memory and using + * bcopy to copy the page, one machine dependent page at a + * time. + * + * Use XKPHYS for 64 bit. + */ +#define PMAP_COPY_TAGS 0x00000001 +static void +pmap_copy_page_internal(vm_page_t src, vm_page_t dst, int flags) +{ + vm_offset_t va_src, va_dst; + vm_paddr_t phys_src = VM_PAGE_TO_PHYS(src); + vm_paddr_t phys_dst = VM_PAGE_TO_PHYS(dst); + + /* easy case, all can be accessed via KSEG0 */ + /* + * Flush all caches for VA that are mapped to this page + * to make sure that data in SDRAM is up to date + */ + sched_pin(); + pmap_flush_pvcache(src); + mips_dcache_wbinv_range_index(MIPS_PHYS_TO_DIRECT(phys_dst), PAGE_SIZE); + va_src = MIPS_PHYS_TO_DIRECT(phys_src); + va_dst = MIPS_PHYS_TO_DIRECT(phys_dst); +#ifdef CPU_CHERI + if (flags & PMAP_COPY_TAGS) + cheri_bcopy((caddr_t)va_src, (caddr_t)va_dst, PAGE_SIZE); + else +#else + bcopy((caddr_t)va_src, (caddr_t)va_dst, PAGE_SIZE); +#endif + mips_dcache_wbinv_range(va_dst, PAGE_SIZE); + sched_unpin(); +} + +/* + * With CHERI, it is sometimes desirable to explicitly propagate tags between + * pages (e.g., during copy-on-write), but not other times (e.g., copying data + * from VM to buffer cache). There is more playing out here yet to do (e.g., + * if any filesystems learn to preserve tags) but these KPIs allow us to + * capture that difference in the mean time. + */ +void +pmap_copy_page(vm_page_t src, vm_page_t dst) +{ + + pmap_copy_page_internal(src, dst, 0); +} + +#ifdef CPU_CHERI +void +pmap_copy_page_tags(vm_page_t src, vm_page_t dst) +{ + + pmap_copy_page_internal(src, dst, PMAP_COPY_TAGS); +} +#endif + +int unmapped_buf_allowed; + +static void +pmap_copy_pages_internal(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize, int flags) +{ + char *a_cp, *b_cp; + vm_page_t a_m, b_m; + vm_offset_t a_pg_offset, b_pg_offset; + vm_paddr_t a_phys, b_phys; + int cnt; + + sched_pin(); + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_m = ma[a_offset >> PAGE_SHIFT]; + a_phys = VM_PAGE_TO_PHYS(a_m); + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_m = mb[b_offset >> PAGE_SHIFT]; + b_phys = VM_PAGE_TO_PHYS(b_m); + pmap_flush_pvcache(a_m); + mips_dcache_wbinv_range_index(MIPS_PHYS_TO_DIRECT(b_phys), + PAGE_SIZE); + a_cp = (char *)MIPS_PHYS_TO_DIRECT(a_phys) + a_pg_offset; + b_cp = (char *)MIPS_PHYS_TO_DIRECT(b_phys) + b_pg_offset; +#ifdef CPU_CHERI + if (flags & PMAP_COPY_TAGS) + cheri_bcopy(a_cp, b_cp, cnt); + else +#else + bcopy(a_cp, b_cp, cnt); +#endif + mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } + sched_unpin(); +} + +/* + * As with pmap_copy_page(), CHERI requires tagged and non-tagged versions + * depending on the circumstances. 
+ */ +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + + pmap_copy_pages_internal(ma, a_offset, mb, b_offset, xfersize, 0); +} + +#ifdef CPU_CHERI +void +pmap_copy_pages_tags(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + + pmap_copy_pages_internal(ma, a_offset, mb, b_offset, xfersize, + PMAP_COPY_TAGS); +} +#endif + +/* + * Returns true if the pmap's pv is one of the first + * 16 pvs linked to from this page. This count may + * be changed upwards or downwards in the future; it + * is only necessary that true be returned for a small + * subset of pmaps for proper page aging. + */ +boolean_t +pmap_page_exists_quick(pmap_t pmap, vm_page_t m) +{ + struct md_page *pvh; + struct rwlock *lock; + pv_entry_t pv; + int loops = 0; + boolean_t rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("%s: page %p is not managed", __func__, m)); + rv = FALSE; + rw_rlock(&pvh_global_lock); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + } + rw_runlock(lock); + rw_runlock(&pvh_global_lock); + return (rv); +} + +/* + * pmap_page_wired_mappings: + * + * Return the number of managed mappings to the given physical page + * that are wired. + */ +int +pmap_page_wired_mappings(vm_page_t m) +{ + struct rwlock *lock; + pv_entry_t pv; + pmap_t pmap; + pt_entry_t *pte; + int count, md_gen; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (0); + rw_rlock(&pvh_global_lock); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); +restart: + count = 0; + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + if (pte_test(pte, PTE_W)) + count++; + PMAP_UNLOCK(pmap); + } + if ((m->flags & PG_FICTITIOUS) == 0) { + struct md_page *pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + int pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pd_entry_t *pde = pmap_pde(pmap, pv->pv_va); + if (pte_test((pt_entry_t *)pde, PTE_W)) + count++; + PMAP_UNLOCK(pmap); + } + } + rw_runlock(lock); + rw_runlock(&pvh_global_lock); + return (count); +} + +/* + * Returns TRUE if the given page is mapped individually or as part of + * a 2mpage. Otherwise, returns FALSE. 
+ */ +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + struct rwlock *lock; + boolean_t rv; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (FALSE); + rw_rlock(&pvh_global_lock); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + rv = !TAILQ_EMPTY(&m->md.pv_list) || + ((m->flags & PG_FICTITIOUS) == 0 && + !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); + rw_runlock(lock); + rw_runlock(&pvh_global_lock); + return (rv); +} + +/* + * Destroy all managed, non-wired mappings in the given user-space + * pmap. This pmap cannot be active on any processor besides the + * caller. + * + * This function cannot be applied to the kernel pmap. Moreover, it + * is not intended for general use. It is only to be used during + * process termination. Consequently, it can be implemented in ways + * that make it faster than pmap_remove(). First, it can more quickly + * destroy mappings by iterating over the pmap's collection of PV + * entries, rather than searching the page table. Second, it doesn't + * have to test and clear the page table entries atomically, because + * no processor is currently accessing the user address space. In + * particular, a page table entry's dirty bit won't change state once + * this function starts. + */ +void +pmap_remove_pages(pmap_t pmap) +{ + pd_entry_t ptepde, *pde; + pt_entry_t *pte, tpte; + struct spglist free; + vm_page_t m, mpte, mt; + pv_entry_t pv; + struct md_page *pvh; + struct pv_chunk *pc, *npc; + struct rwlock *lock; + int bit; + uint64_t inuse, bitmask; + int allfree, field, freed, idx; + boolean_t superpage; + vm_paddr_t pa; + + /* + * Assert that the given pmap is only active on the current + * CPU. Unfortunately, we cannot block another CPU from + * activating the pmap while this function is executing. + */ + KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), + ("%s: non-current pmap %p", __func__, pmap)); + + lock = NULL; + SLIST_INIT(&free); + rw_rlock(&pvh_global_lock); + PMAP_LOCK(pmap); + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; + freed = 0; + for (field = 0; field < _NPCM; field++) { + inuse = ~pc->pc_map[field] & pc_freemask[field]; + while (inuse != 0) { + bit = ffsl(inuse) - 1; + bitmask = 1UL << bit; + idx = field * sizeof(inuse) * NBBY + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pde = pmap_segmap(pmap, pv->pv_va); + ptepde = *pde; + pde = pmap_pdpe_to_pde(pde, pv->pv_va); + if (pde_is_1m_superpage(pde)) { + superpage = TRUE; + pte = (pt_entry_t *)pde; + } else { + superpage = FALSE; + ptepde = *pde; + pte = pmap_pde_to_pte(pde, pv->pv_va); + } + tpte = *pte; + if (!pte_is_valid(pte)) { + panic("%s: bad %s pte va %lx pte %lx", + __func__, superpage ? "superpage" : + "", pv->pv_va, tpte); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (pte_test(&tpte, PTE_W)) { + allfree = 0; + continue; + } + + pa = TLBLO_PTE_TO_PA(tpte); + if (superpage) + pa &= ~PDRMASK; + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m->phys_addr == pa, + ("%s: vm_page_t %p phys_addr mismatch " + "%016jx %016jx", __func__, m, + (uintmax_t)m->phys_addr, (uintmax_t)tpte)); + + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("%s: bad tpte %#jx", __func__, + (uintmax_t)tpte)); + + /* Clear PTE */ + if (superpage) + pte_store(pte, 0); + else + pte_store(pte, is_kernel_pmap(pmap) ? 
+ PTE_G : 0); + + /* + * Update the vm_page_t clean and reference bits + */ + if (pte_test(&tpte, PTE_D) && + !pte_test(&tpte, PTE_RO)) { + if (superpage) { + vm_page_t mt; + + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } else + vm_page_dirty(m); + } + + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + + /* Mark free */ + pc->pc_map[field] |= bitmask; + if (superpage) { + pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); + pvh = pa_to_pvh(TLBLO_PTE_TO_PA(tpte)); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + if (TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->wire_count == NPTEPG, + ("%s: pte page wire count error", + __func__)); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, &free, FALSE); + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + } + } else { + pmap_resident_count_dec(pmap, 1); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if ((m->aflags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); + freed++; + } + } + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + if (allfree) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); + } + } + if (lock != NULL) + rw_wunlock(lock); + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + rw_runlock(&pvh_global_lock); + pmap_free_zero_pages(&free); +} + +/* + * Clear the write and modified bits in each of the given page's mappings. + */ +void +pmap_remove_write(vm_page_t m) +{ + struct md_page *pvh; + vm_offset_t va; + pv_entry_t next_pv; + int pvh_gen; + pmap_t pmap; + struct rwlock *lock; + pd_entry_t *pde; + pt_entry_t oldpte, *pte; + pv_entry_t pv; + int md_gen; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_write: page %p is not managed", m)); + + /* + * If the page is not exclusive busied, then PGA_WRITEABLE cannot be + * set by another thread while the object is locked. Thus, + * if PGA_WRITEABLE is clear, no page table entries need updating. 
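+ *
+ * The pv list walks below may have to drop the pv list lock in order
+ * to take a pmap lock.  The pvh->pv_gen and m->md.pv_gen generation
+ * counters are sampled before unlocking and rechecked afterwards; if
+ * they changed, the pv lists were modified while unlocked and the
+ * whole walk is restarted.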
 */
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
+		return;
+	rw_rlock(&pvh_global_lock);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+retry_pv_loop:
+	rw_wlock(lock);
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		goto small_mappings;
+	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			pvh_gen = pvh->pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen) {
+				PMAP_UNLOCK(pmap);
+				rw_wunlock(lock);
+				goto retry_pv_loop;
+			}
+		}
+		va = pv->pv_va;
+		pde = pmap_pde(pmap, va);
+		if (pde_is_1m_superpage(pde) && !pde_test(pde, PTE_RO))
+			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
+		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+		    ("inconsistent pv lock %p %p for page %p",
+		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+		PMAP_UNLOCK(pmap);
+	}
+small_mappings:
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			pvh_gen = pvh->pv_gen;
+			md_gen = m->md.pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen ||
+			    md_gen != m->md.pv_gen) {
+				PMAP_UNLOCK(pmap);
+				rw_wunlock(lock);
+				goto retry_pv_loop;
+			}
+		}
+		pde = pmap_pde(pmap, pv->pv_va);
+		KASSERT(!pde_is_superpage(pde),
+		    ("%s: found a superpage in page %p's pv list",
+		    __func__, m));
+		pte = pmap_pde_to_pte(pde, pv->pv_va);
+		KASSERT(pte != NULL && pte_is_valid(pte),
+		    ("%s: page on pv_list has no pte", __func__));
+retry:
+		oldpte = *pte;
+		if (!pte_test(&oldpte, PTE_RO)) {
+			if (!atomic_cmpset_long(pte, oldpte,
+			    ((oldpte & ~PTE_D) | PTE_RO)))
+				goto retry;
+			if (pte_test(&oldpte, PTE_D))
+				vm_page_dirty(m);
+			pmap_invalidate_page(pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	vm_page_aflag_clear(m, PGA_WRITEABLE);
+	rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
+}
+
+#define	PMAP_TS_REFERENCED_MAX	5
+
+/*-
+ * pmap_ts_referenced:
+ *
+ *	Return a count of pages that have been referenced, and reset the
+ *	reference bit.  It is not necessary for every reference bit to be
+ *	reset, but it is necessary that 0 only be returned when there truly
+ *	are no referenced pages.
+ *
+ *	XXX: The exact number of flags to check and reset is a matter that
+ *	should be tested and standardized at some point in the future for
+ *	optimal aging of shared pages.
+ */
+int
+pmap_ts_referenced(vm_page_t m)
+{
+	struct md_page *pvh;
+	pv_entry_t pv, pvf;
+	pmap_t pmap;
+	struct rwlock *lock;
+	pd_entry_t *pde;
+	pt_entry_t *pte;
+	vm_offset_t va;
+	vm_paddr_t pa;
+	int cleared, md_gen, not_cleared, pvh_gen;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("pmap_ts_referenced: page %p is not managed", m));
+	cleared = 0;
+	pa = VM_PAGE_TO_PHYS(m);
+	lock = PHYS_TO_PV_LIST_LOCK(pa);
+	pvh = pa_to_pvh(pa);
+	rw_rlock(&pvh_global_lock);
+	rw_wlock(lock);
+retry:
+	not_cleared = 0;
+	if ((m->flags & PG_FICTITIOUS) != 0 ||
+	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
+		goto small_mappings;
+	pv = pvf;
+	do {
+		if (pvf == NULL)
+			pvf = pv;
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			pvh_gen = pvh->pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen) {
+				PMAP_UNLOCK(pmap);
+				goto retry;
+			}
+		}
+		va = pv->pv_va;
+		pde = pmap_pde(pmap, pv->pv_va);
+		if (pte_is_ref((pt_entry_t *)pde)) {
+			/*
+			 * Since this reference bit is shared by 512 4KB
+			 * pages, it should not be cleared every time it is
+			 * tested.  Apply a simple "hash" function on the
+			 * physical page number, the virtual superpage number,
+			 * and the pmap address to select one 4KB page out of
+			 * the 512 on which testing the reference bit will
+			 * result in clearing that reference bit.  This
+			 * function is designed to avoid the selection of the
+			 * same 4KB page for every 2MB page mapping.
+			 *
+			 * On demotion, a mapping that hasn't been referenced
+			 * is simply destroyed.  To avoid the possibility of a
+			 * subsequent page fault on a demoted wired mapping,
+			 * always leave its reference bit set.  Moreover,
+			 * since the superpage is wired, the current state of
+			 * its reference bit won't affect page replacement.
+			 */
+			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
+			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
+			    !pde_test(pde, PTE_W)) {
+				atomic_clear_long((pt_entry_t *)pde, PTE_REF);
+				pmap_invalidate_page(pmap, pv->pv_va);
+				cleared++;
+			} else
+				not_cleared++;
+
+		}
+		PMAP_UNLOCK(pmap);
+		/* Rotate the PV list if it has more than one entry. */
+		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
+			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+		}
+		if ((cleared + not_cleared) >= PMAP_TS_REFERENCED_MAX)
+			goto out;
+	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
+small_mappings:
+	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
+		goto out;
+	pv = pvf;
+	do {
+		if (pvf == NULL)
+			pvf = pv;
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			pvh_gen = pvh->pv_gen;
+			md_gen = m->md.pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen ||
+			    md_gen != m->md.pv_gen) {
+				PMAP_UNLOCK(pmap);
+				goto retry;
+			}
+		}
+
+		pde = pmap_pde(pmap, pv->pv_va);
+		KASSERT(!pde_is_superpage(pde),
+		    ("pmap_ts_referenced: found superpage in page %p's pv list",
+		    m));
+		pte = pmap_pde_to_pte(pde, pv->pv_va);
+		if (pte_is_ref(pte)) {
+			atomic_clear_long(pte, PTE_REF);
+			pmap_invalidate_page(pmap, pv->pv_va);
+			cleared++;
+		}
+		PMAP_UNLOCK(pmap);
+		/* Rotate the PV list if it has more than one entry.
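
The selector above deliberately clears the shared referenced bit for only one of the 512 4KB pages of a 2MB mapping. Here is a stand-alone evaluation of that hash, with PAGE_SHIFT, PDRSHIFT and NPTEPG assumed to be 12, 21 and 512; the authoritative values live in the MIPS pmap headers.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PDRSHIFT	21
#define NPTEPG		512

static int
selected_for_clearing(uint64_t pa, uint64_t va, uintptr_t pmap)
{

	/* Same expression as the superpage case in pmap_ts_referenced(). */
	return ((((pa >> PAGE_SHIFT) ^ (va >> PDRSHIFT) ^ pmap) &
	    (NPTEPG - 1)) == 0);
}

int
main(void)
{
	uintptr_t pmap = (uintptr_t)0x1234500;	/* stand-in pmap address */
	uint64_t va = 2UL << PDRSHIFT;		/* a 2MB-aligned user VA */
	uint64_t pa, base = 1UL << 30;		/* superpage-aligned phys base */
	int hits = 0;

	/* Walk the 512 4KB physical pages backing one 2MB mapping. */
	for (pa = base; pa < base + (1UL << PDRSHIFT); pa += 1UL << PAGE_SHIFT)
		hits += selected_for_clearing(pa, va, pmap);
	printf("4KB pages that would clear the shared bit: %d of %d\n",
	    hits, NPTEPG);
	return (0);
}

Running this prints "1 of 512": exactly one page within the superpage is picked to age the mapping, and different mappings pick different pages.
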
*/ + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + } + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + + not_cleared < PMAP_TS_REFERENCED_MAX); +out: + rw_wunlock(lock); + rw_runlock(&pvh_global_lock); + return (cleared + not_cleared); +} + +static boolean_t +pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) +{ + struct rwlock *lock; + pv_entry_t pv; + struct md_page *pvh; + pt_entry_t *pte; + pmap_t pmap; + int md_gen, pvh_gen; + boolean_t rv; + + rv = FALSE; + rw_rlock(&pvh_global_lock); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); +restart: + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + if (modified) { + rv = pte_test(pte, PTE_D) && !pte_test(pte, PTE_RO); + if (accessed) + rv = rv && pte_is_valid(pte) && pte_is_ref(pte); + } else if (accessed) { + rv = pte_is_valid(pte) && pte_is_ref(pte); + } + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = (pt_entry_t *)pmap_pde(pmap, pv->pv_va); + if (modified) { + rv = pte_test(pte, PTE_D) && + !pte_test(pte, PTE_RO); + if (accessed) + rv = rv && pte_is_valid(pte) && + pte_is_ref(pte); + } else if (accessed) { + rv = pte_is_valid(pte) && + pte_is_ref(pte); + } + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + } +out: + rw_runlock(lock); + rw_runlock(&pvh_global_lock); + return (rv); +} + + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. + */ +boolean_t +pmap_is_modified(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_modified: page %p is not managed", m)); + + /* + * If the page is not exclusive busied, then PGA_WRITEABLE cannot be + * concurrently set while the object is locked. Thus, if PGA_WRITEABLE + * is clear, no PTEs can have PTE_D set. + */ + VM_OBJECT_ASSERT_WLOCKED(m->object); + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) + return (FALSE); + return (pmap_page_test_mappings(m, FALSE, TRUE)); +} + +/* + * pmap_is_prefaultable: + * + * Return whether or not the specified virtual address is elgible + * for prefault. + */ +boolean_t +pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) +{ + pd_entry_t *pde; + pt_entry_t *pte; + boolean_t rv; + + rv = FALSE; + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, addr); + if (pde != NULL && *pde != 0) { + if (pde_is_1m_superpage(pde)) + pte = (pt_entry_t *)pde; + else + pte = pmap_pde_to_pte(pde, addr); + rv = (*pte == 0) || (*pte == PTE_G); + } + PMAP_UNLOCK(pmap); + return (rv); +} + +/* + * pmap_is_referenced: + * + * Return whether or not the specified physical page was referenced + * in any physical maps. 
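
The pv-list walkers above follow a common protocol when the pmap lock cannot be taken while the pv-list lock is held: record the list generation, drop the list lock, take the pmap lock, retake the list lock, and restart if the generation moved. The sketch below models that protocol in user space with pthread mutexes and a plain counter standing in for the kernel primitives; it is single-threaded, so the trylock always succeeds, and all names are illustrative.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pmap_lock = PTHREAD_MUTEX_INITIALIZER;
static int pv_gen;		/* bumped whenever the pv list changes */

static void
walk_pv_list(void)
{
	int saved_gen;

	pthread_mutex_lock(&list_lock);
restart:
	if (pthread_mutex_trylock(&pmap_lock) != 0) {
		/* Lock order is pmap before list: drop, relock, revalidate. */
		saved_gen = pv_gen;
		pthread_mutex_unlock(&list_lock);
		pthread_mutex_lock(&pmap_lock);
		pthread_mutex_lock(&list_lock);
		if (saved_gen != pv_gen) {
			/* The list changed while it was unlocked; start over. */
			pthread_mutex_unlock(&pmap_lock);
			goto restart;
		}
	}
	printf("both locks held, pv list unchanged (gen %d)\n", pv_gen);
	pthread_mutex_unlock(&pmap_lock);
	pthread_mutex_unlock(&list_lock);
}

int
main(void)
{
	walk_pv_list();
	return (0);
}
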
+ */ +boolean_t +pmap_is_referenced(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_referenced: page %p is not managed", m)); + return (pmap_page_test_mappings(m, TRUE, FALSE)); +} + +/* + * Apply the given advice to the specified range of addresses within the + * given pmap. Depending on the advice, clear the referenced and/or + * modified flags in each mapping and set the mapped page's dirty field. + */ +void +pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) +{ + struct rwlock *lock; + pd_entry_t *pde, *pdpe, oldpde; + pt_entry_t *pte; + vm_offset_t va_next; + vm_page_t m; + boolean_t anychanged, pv_lists_locked; + + if (advice != MADV_DONTNEED && advice != MADV_FREE) + return; + + pv_lists_locked = FALSE; +resume: + anychanged = FALSE; + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + pdpe = pmap_segmap(pmap, sva); + if (*pdpe == 0) { + va_next = (sva + NBSEG) & ~SEGMASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; + + pde = pmap_pdpe_to_pde(pdpe, sva); + oldpde = *pde; + if (pde == NULL || *pde == 0) + continue; + else if (pde_is_1m_superpage(pde)) { + if (!pde_test(&oldpde, PTE_MANAGED)) + continue; + if (!pv_lists_locked) { + pv_lists_locked = TRUE; + if (!rw_try_rlock(&pvh_global_lock)) { + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + rw_rlock(&pvh_global_lock); + goto resume; + } + } + lock = NULL; + if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { + if (lock != NULL) + rw_wunlock(lock); + + /* + * The superpage mapping was destroyed. + */ + continue; + } + + /* + * Unless the page mappings are wired, remove the + * mapping to a single page so that a subsequent + * access may repromote. Since the underlying page + * table page is fully populated, this removal never + * frees a page table page. + */ + if (!pde_test(&oldpde, PTE_W)) { + pte = pmap_pde_to_pte(pde, sva); + KASSERT(pte_test(pte, PTE_VALID), + ("pmap_advise: invalid PTE")); + pmap_remove_pte(pmap, pte, sva, *pde, NULL, + &lock); + anychanged = TRUE; + } + if (lock != NULL) + rw_wunlock(lock); + } + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being write protected. + */ + if (va_next > eva) + va_next = eva; + + for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + if (!pte_is_valid(pte) || !pte_test(pte, PTE_MANAGED)) + continue; + else if (pte_test(pte, PTE_D) && + !pte_test(pte, PTE_RO)) { + if (advice == MADV_DONTNEED) { + /* + * Future calls to pmap_is_modified() + * can be avoided by making the page + * dirty now. + */ + m = PHYS_TO_VM_PAGE( + TLBLO_PTE_TO_PA(*pte)); + vm_page_dirty(m); + } + pte_atomic_clear(pte, PTE_D | PTE_REF); + } else if (pte_is_ref(pte)) + pte_atomic_clear(pte, PTE_REF); + else + continue; + if (pte_test(pte, PTE_G)) + pmap_invalidate_page(pmap, sva); + else + anychanged = TRUE; + } + } + if (anychanged) + pmap_invalidate_all(pmap); + if (pv_lists_locked) { + rw_runlock(&pvh_global_lock); + } + PMAP_UNLOCK(pmap); +} + +/* + * Clear the modify bits on the specified physical page. 
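
pmap_advise() above walks the range one page-directory span at a time, rounding up to the next NBPDR boundary and clamping against both address wrap-around and the end of the range. The same stride arithmetic in isolation, with NBPDR and PDRMASK assumed to describe a 2MB span:

#include <stdint.h>
#include <stdio.h>

#define PDRSHIFT	21
#define NBPDR		(1ULL << PDRSHIFT)	/* assumed 2MB directory span */
#define PDRMASK		(NBPDR - 1)

int
main(void)
{
	uint64_t sva = 0x10003000ULL;		/* unaligned start */
	uint64_t eva = 0x10003000ULL + 3 * NBPDR;
	uint64_t va_next;

	for (; sva < eva; sva = va_next) {
		/* Advance to the next 2MB boundary ... */
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)	/* ... handling wrap-around ... */
			va_next = eva;
		if (va_next > eva)	/* ... and clamping to the end. */
			va_next = eva;
		printf("chunk %#jx .. %#jx\n", (uintmax_t)sva,
		    (uintmax_t)va_next);
	}
	return (0);
}
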
 */
+void
+pmap_clear_modify(vm_page_t m)
+{
+	struct md_page *pvh;
+	pmap_t pmap;
+	pv_entry_t next_pv, pv;
+	pd_entry_t oldpde, *pde;
+	pt_entry_t oldpte, *pte;
+	struct rwlock *lock;
+	vm_offset_t va;
+	int md_gen, pvh_gen;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("pmap_clear_modify: page %p is not managed", m));
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	KASSERT(!vm_page_xbusied(m),
+	    ("pmap_clear_modify: page %p is exclusive busied", m));
+
+	/*
+	 * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set.
+	 * If the object containing the page is locked and the page is not
+	 * write busied, then PGA_WRITEABLE cannot be concurrently set.
+	 */
+	if ((m->aflags & PGA_WRITEABLE) == 0)
+		return;
+	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+	rw_rlock(&pvh_global_lock);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_wlock(lock);
+restart:
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		goto small_mappings;
+	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			pvh_gen = pvh->pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen) {
+				PMAP_UNLOCK(pmap);
+				goto restart;
+			}
+		}
+		va = pv->pv_va;
+		pde = pmap_pde(pmap, va);
+		oldpde = *pde;
+		if (!pde_test(pde, PTE_RO)) {
+			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
+				if (!pde_test(&oldpde, PTE_W)) {
+					/*
+					 * Write protect the mapping to a
+					 * single page so that a subsequent
+					 * write access may repromote.
+					 */
+					va += VM_PAGE_TO_PHYS(m) -
+					    TLBLO_PDE_TO_PA(oldpde);
+					pte = pmap_pde_to_pte(pde, va);
+					oldpte = *pte;
+					if (pte_test(&oldpte, PTE_VALID)) {
+						while (!atomic_cmpset_long(pte,
+						    oldpte,
+						    (oldpte & ~PTE_D) | PTE_RO))
+							oldpte = *pte;
+						vm_page_dirty(m);
+						pmap_invalidate_page(pmap, va);
+					}
+				}
+			}
+		}
+		PMAP_UNLOCK(pmap);
+	}
+small_mappings:
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			md_gen = m->md.pv_gen;
+			pvh_gen = pvh->pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) {
+				PMAP_UNLOCK(pmap);
+				goto restart;
+			}
+		}
+		pde = pmap_pde(pmap, pv->pv_va);
+		KASSERT(!pde_is_superpage(pde), ("pmap_clear_modify: found"
+		    " a superpage in page %p's pv list", m));
+		pte = pmap_pde_to_pte(pde, pv->pv_va);
+		if (pte_test(pte, PTE_D) && !pte_test(pte, PTE_RO)) {
+			pte_atomic_clear(pte, PTE_D);
+			pmap_invalidate_page(pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
+}
+
+/*
+ * Miscellaneous support routines follow
+ */
+
+/*
+ * Map a set of physical memory pages into the kernel virtual
+ * address space. Return a pointer to where it is mapped. This
+ * routine is intended to be used for mapping device memory,
+ * NOT real memory.
+ *
+ * Use XKPHYS uncached for 64 bit.
+ */
+void *
+pmap_mapdev(vm_paddr_t pa, vm_size_t size)
+{
+
+	return ((void *)MIPS_PHYS_TO_DIRECT_UNCACHED(pa));
+}
+
+void
+pmap_unmapdev(vm_offset_t va, vm_size_t size)
+{
+
+	/* Nothing to do for mips64. */
+}
+
+/*
+ * Sets the memory attribute for the specified page.
+ */
+void
+pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
+{
+	/*
+	 * Set the memattr field so the appropriate bits are set in the
+	 * PTE as mappings are created.
+	 */
+	m->md.pv_memattr = ma;
+
+	/*
+	 * It is assumed that this function is only called before any mappings
+	 * are established.
If this is not the case then this function will + * need to walk the pv_list and make each of the existing mappings + * uncacheable, sync the cache (with mips_icache_sync_all() and + * mips_dcache_wbinv_all()) and most likely invalidate TLB entries for + * any of the current mappings it modifies. + */ + if (TAILQ_FIRST(&m->md.pv_list) != NULL) + panic("Can't change memattr on page with existing mappings"); +} + +/* + * perform the pmap work for mincore + */ +int +pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) +{ + pd_entry_t *pdep; + pt_entry_t *ptep, pte; + vm_paddr_t pa; + int val; + + PMAP_LOCK(pmap); +retry: + pdep = pmap_pde(pmap, addr); + if (pdep != NULL) { + if (pde_is_1m_superpage(pdep)) { + pte = (pt_entry_t)*pdep; + pa = TLBLO_PTE_TO_PA(pte); + val = MINCORE_SUPER; + } else { + ptep = pmap_pde_to_pte(pdep, addr); + pte = (ptep != NULL) ? *ptep : 0; + pa = TLBLO_PTE_TO_PA(pte); + val = 0; + } + } else { + pte = 0; + pa = 0; + val = 0; + } + if (pte_is_valid(&pte)) { + val |= MINCORE_INCORE; + if (pte_test(&pte, PTE_D)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if (pte_is_ref(&pte)) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && + pte_test(&pte, PTE_MANAGED)) { + /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ + if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) + goto retry; + } else + PA_UNLOCK_COND(*locked_pa); + PMAP_UNLOCK(pmap); + return (val); +} + +void +pmap_activate(struct thread *td) +{ + pmap_t pmap, oldpmap; + struct proc *p = td->td_proc; + u_int cpuid; + + critical_enter(); + + pmap = vmspace_pmap(p->p_vmspace); + oldpmap = PCPU_GET(curpmap); + cpuid = PCPU_GET(cpuid); + + if (oldpmap) + CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); + CPU_SET_ATOMIC(cpuid, &pmap->pm_active); + pmap_asid_alloc(pmap); + if (td == curthread) { + PCPU_SET(segbase, pmap->pm_segtab); + mips_wr_entryhi(pmap->pm_asid[cpuid].asid); + } + + PCPU_SET(curpmap, pmap); + critical_exit(); +} + +void +pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) +{ +} + +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. 
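
pmap_mincore() above builds its return value from the PTE state: MINCORE_SUPER for a superpage mapping, MINCORE_INCORE when the entry is valid, and the MODIFIED/REFERENCED pairs from the dirty and referenced bits. A small sketch of that composition follows; the MINCORE_* and PTE_* numeric values are assumptions chosen for illustration, not taken from the headers.

#include <stdint.h>
#include <stdio.h>

#define MINCORE_INCORE		 0x1
#define MINCORE_REFERENCED	 0x2
#define MINCORE_MODIFIED	 0x4
#define MINCORE_REFERENCED_OTHER 0x8
#define MINCORE_MODIFIED_OTHER	 0x10
#define MINCORE_SUPER		 0x20

#define PTE_VALID	0x1	/* assumed software-valid bit */
#define PTE_D		0x2	/* assumed dirty bit */
#define PTE_REF		0x4	/* assumed referenced bit */

static int
mincore_value(uint64_t pte, int superpage)
{
	int val;

	val = superpage ? MINCORE_SUPER : 0;
	if (pte & PTE_VALID) {
		val |= MINCORE_INCORE;
		if (pte & PTE_D)
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if (pte & PTE_REF)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	return (val);
}

int
main(void)
{
	printf("clean 4K page:   %#x\n", mincore_value(PTE_VALID | PTE_REF, 0));
	printf("dirty superpage: %#x\n",
	    mincore_value(PTE_VALID | PTE_D | PTE_REF, 1));
	return (0);
}
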
+ */ +void +pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t superpage_offset; + + if (size < NBSEG) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & SEGMASK; + if (size - ((NBSEG - superpage_offset) & SEGMASK) < NBSEG || + (*addr & SEGMASK) == superpage_offset) + return; + if ((*addr & SEGMASK) < superpage_offset) + *addr = (*addr & ~SEGMASK) + superpage_offset; + else + *addr = ((*addr + SEGMASK) & ~SEGMASK) + superpage_offset; +} + +#ifdef DDB +DB_SHOW_COMMAND(ptable, ddb_pid_dump) +{ + pmap_t pmap; + struct thread *td = NULL; + struct proc *p; + int i, j, k; + vm_paddr_t pa; + vm_offset_t va; + + if (have_addr) { + td = db_lookup_thread(addr, TRUE); + if (td == NULL) { + db_printf("Invalid pid or tid"); + return; + } + p = td->td_proc; + if (p->p_vmspace == NULL) { + db_printf("No vmspace for process"); + return; + } + pmap = vmspace_pmap(p->p_vmspace); + } else + pmap = kernel_pmap; + + db_printf("pmap:%p segtab:%p asid:%x generation:%x\n", + pmap, pmap->pm_segtab, pmap->pm_asid[0].asid, + pmap->pm_asid[0].gen); + for (i = 0; i < NPDEPG; i++) { + pd_entry_t *pdpe; + pt_entry_t *pde; + pt_entry_t pte; + + pdpe = (pd_entry_t *)pmap->pm_segtab[i]; + if (pdpe == NULL) + continue; + db_printf("[%4d] %p\n", i, pdpe); + for (j = 0; j < NPDEPG; j++) { + pde = (pt_entry_t *)pdpe[j]; + if (pde == NULL) + continue; + db_printf("\t[%4d] %p\n", j, pde); + for (k = 0; k < NPTEPG; k++) { + pte = pde[k]; + if (pte == 0 || !pte_is_valid(&pte)) + continue; + pa = TLBLO_PTE_TO_PA(pte); + va = ((u_long)i << SEGSHIFT) | (j << PDRSHIFT) | (k << PAGE_SHIFT); + db_printf("\t\t[%04d] va: %p pte: %8jx pa:%jx\n", + k, (void *)va, (uintmax_t)pte, (uintmax_t)pa); + } + } + } +} +#endif + +#if defined(DEBUG) + +static void pads(pmap_t pm); +void pmap_pvdump(vm_offset_t pa); + +/* print address space of pmap*/ +static void +pads(pmap_t pm) +{ + unsigned va, i, j; + pt_entry_t *ptep; + + if (pm == kernel_pmap) + return; + for (i = 0; i < NPTEPG; i++) + if (pm->pm_segtab[i]) + for (j = 0; j < NPTEPG; j++) { + va = (i << SEGSHIFT) + (j << PAGE_SHIFT); + if (pm == kernel_pmap && va < KERNBASE) + continue; + if (pm != kernel_pmap && + va >= VM_MAXUSER_ADDRESS) + continue; + ptep = pmap_pte(pm, va); + if (pte_is_valid(ptep)) + printf("%x:%x ", va, *(int *)ptep); + } + +} + +void +pmap_pvdump(vm_offset_t pa) +{ + register pv_entry_t pv; + vm_page_t m; + + printf("pa %x", pa); + m = PHYS_TO_VM_PAGE(pa); + for (pv = TAILQ_FIRST(&m->md.pv_list); pv; + pv = TAILQ_NEXT(pv, pv_list)) { + printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); + pads(pv->pv_pmap); + } + printf(" "); +} + +/* N/C */ +#endif + + +/* + * Allocate TLB address space tag (called ASID or TLBPID) and return it. + * It takes almost as much or more time to search the TLB for a + * specific ASID and flush those entries as it does to flush the entire TLB. + * Therefore, when we allocate a new ASID, we just take the next number. When + * we run out of numbers, we flush the TLB, increment the generation count + * and start over. ASID zero is reserved for kernel use. 
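
pmap_align_superpage() above shifts the start of a mapping so that the object offset and the virtual address share the same offset within a superpage-sized region, which is what later allows promotion. The same arithmetic in a stand-alone form, parameterized by the superpage size and skipping the OBJ_COLORED adjustment:

#include <stdint.h>
#include <stdio.h>

static uint64_t
align_superpage(uint64_t offset, uint64_t addr, uint64_t size,
    uint64_t spgsize)
{
	uint64_t mask = spgsize - 1;
	uint64_t super_off = offset & mask;

	if (size < spgsize)
		return (addr);
	/* Not enough of the range would land inside a full superpage. */
	if (size - ((spgsize - super_off) & mask) < spgsize ||
	    (addr & mask) == super_off)
		return (addr);
	if ((addr & mask) < super_off)
		return ((addr & ~mask) + super_off);
	return (((addr + mask) & ~mask) + super_off);
}

int
main(void)
{
	uint64_t spg = 1ULL << 21;	/* assume a 2MB superpage for the demo */
	uint64_t addr = 0x20010000ULL;
	uint64_t aligned;

	aligned = align_superpage(0x00180000ULL, addr, 16 * spg, spg);
	printf("addr %#jx -> %#jx\n", (uintmax_t)addr, (uintmax_t)aligned);
	return (0);
}
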
+ */ +static void +pmap_asid_alloc(pmap) + pmap_t pmap; +{ + if (pmap->pm_asid[PCPU_GET(cpuid)].asid != PMAP_ASID_RESERVED && + pmap->pm_asid[PCPU_GET(cpuid)].gen == PCPU_GET(asid_generation)); + else { + if (PCPU_GET(next_asid) == pmap_max_asid) { + tlb_invalidate_all_user(NULL); + PCPU_SET(asid_generation, + (PCPU_GET(asid_generation) + 1) & ASIDGEN_MASK); + if (PCPU_GET(asid_generation) == 0) { + PCPU_SET(asid_generation, 1); + } + PCPU_SET(next_asid, 1); /* 0 means invalid */ + } + pmap->pm_asid[PCPU_GET(cpuid)].asid = PCPU_GET(next_asid); + pmap->pm_asid[PCPU_GET(cpuid)].gen = PCPU_GET(asid_generation); + PCPU_SET(next_asid, PCPU_GET(next_asid) + 1); + } +} + +static pt_entry_t +init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot) +{ + pt_entry_t rw; + + if (!(prot & VM_PROT_WRITE)) + rw = PTE_VALID | PTE_RO; + else if ((m->oflags & VPO_UNMANAGED) == 0) { + if ((access & VM_PROT_WRITE) != 0) + rw = PTE_VALID | PTE_D; + else + rw = PTE_VALID; + } else { + /* + * Needn't emulate a reference/modified bit for unmanaged + * pages. + */ + rw = PTE_VALID | PTE_D; + pte_ref_set(&rw); + } + + return (rw); +} + +/* + * pmap_emulate_modified : do dirty bit emulation + * + * On SMP, update just the local TLB, other CPUs will update their + * TLBs from PTE lazily, if they get the exception. + * Returns 0 in case of sucess, 1 if the page is read only and we + * need to fault. + */ +int +pmap_emulate_modified(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t *pde; + pt_entry_t *pte; + + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if (pde_is_1m_superpage(pde)) + pte = (pt_entry_t *)pde; + else + pte = pmap_pde_to_pte(pde, va); + if (pte == NULL) + panic("pmap_emulate_modified: can't find PTE"); +#ifdef SMP + /* It is possible that some other CPU changed m-bit */ + if (!pte_is_valid(pte) || pte_test(pte, PTE_D)) { + tlb_update(pmap, va, *pte); + PMAP_UNLOCK(pmap); + return (0); + } +#else + if (!pte_is_valid(pte) || pte_test(pte, PTE_D)) { + tlb_update(pmap, va, *pte); + PMAP_UNLOCK(pmap); + return (0); + } +#endif + if (pte_test(pte, PTE_RO)) { + PMAP_UNLOCK(pmap); + return (1); + } + pte_atomic_set(pte, PTE_D); /* mark it referenced and modified */ + pte_ref_atomic_set(pte); + tlb_update(pmap, va, *pte); + if (!pte_test(pte, PTE_MANAGED)) + panic("pmap_emulate_modified: unmanaged page"); + PMAP_UNLOCK(pmap); + return (0); +} + +/* + * pmap_emulate_referenced: do reference bit emulation + * + * Returns 0 in case of success. Returns 1 if we need to fault. + */ +int +pmap_emulate_referenced(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t *pde; + pt_entry_t *pte; + + if (is_kernel_pmap(pmap)) + return (1); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if (pde == NULL || *pde == NULL) { + /* Invalid page directory. */ + goto dofault; + } + if (pde_is_1m_superpage(pde)) { + pte = (pt_entry_t *)pde; + } else + pte = pmap_pde_to_pte(pde, va); + if (pte == NULL) { + /* Invalid page table. */ + goto dofault; + } + if (!pte_is_valid(pte)) { + /* Invalid PTE. */ + goto dofault; + } + /* Check to see if already marked by other CPU. 
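
The ASID allocator above simply hands out the next number and, when the space is exhausted, flushes the user TLB, bumps the generation (skipping zero), and restarts at 1, with ASID zero reserved for the kernel. Below is a user-space model with a deliberately tiny assumed ASID space so the rollover is easy to see; the constants are stand-ins, not the hardware limits.

#include <stdio.h>

#define PMAP_MAX_ASID	8	/* assumed; the real limit comes from the CPU */
#define ASIDGEN_MASK	0xff

struct asid_state {
	unsigned next_asid;
	unsigned generation;
};

static unsigned
asid_alloc(struct asid_state *st)
{

	if (st->next_asid == PMAP_MAX_ASID) {
		/* The kernel would flush the user TLB here. */
		st->generation = (st->generation + 1) & ASIDGEN_MASK;
		if (st->generation == 0)
			st->generation = 1;
		st->next_asid = 1;	/* 0 is reserved for the kernel */
	}
	return (st->next_asid++);
}

int
main(void)
{
	struct asid_state st = { .next_asid = 1, .generation = 1 };
	unsigned asid;
	int i;

	for (i = 0; i < 10; i++) {
		asid = asid_alloc(&st);
		printf("asid %u gen %u\n", asid, st.generation);
	}
	return (0);
}

After eight allocations the generation advances and the ASID numbering starts over, which is when every pmap with a stale generation must be assigned a fresh ASID.
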
*/ + if (!pte_is_ref(pte)) + pte_ref_atomic_set(pte); + + tlb_update(pmap, va, *pte); + PMAP_UNLOCK(pmap); + + return (0); + +dofault: + PMAP_UNLOCK(pmap); + return (1); +} + +void +pmap_flush_pvcache(vm_page_t m) +{ + pv_entry_t pv; + + if (m != NULL) { + for (pv = TAILQ_FIRST(&m->md.pv_list); pv; + pv = TAILQ_NEXT(pv, pv_next)) { + mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); + } + } +} diff --git a/sys/mips/mips/swtch.S b/sys/mips/mips/swtch.S index 056d1f0..be95660 100644 --- a/sys/mips/mips/swtch.S +++ b/sys/mips/mips/swtch.S @@ -315,36 +315,91 @@ blocked_loop: * NOTE: This is hard coded to UPAGES == 2. * Also, there should be no TLB faults at this point. */ - MTC0 v0, MIPS_COP_0_TLB_HI # VPN = va + MTC0 v0, MIPS_COP_0_TLB_HI # VPN = va HAZARD_DELAY tlbp # probe VPN HAZARD_DELAY - mfc0 s0, MIPS_COP_0_TLB_INDEX + mfc0 s0, MIPS_COP_0_TLB_INDEX HAZARD_DELAY - PTR_LI t1, MIPS_KSEG0_START # invalidate tlb entry - bltz s0, entry0set + # MIPS_KSEG0_START + (2 * index * PAGE_SIZE) -> MIPS_COP_0_TLB_HI + PTR_LI t1, MIPS_KSEG0_START # invalidate tlb entry +#ifdef KSTACK_LARGE_PAGE + bltz s0, inval_nxt1 +#else + bltz s0, entry0set +#endif + nop + sll s0, PAGE_SHIFT + 1 + PTR_ADDU t1, s0 + MTC0 t1, MIPS_COP_0_TLB_HI + PTE_MTC0 zero, MIPS_COP_0_TLB_LO0 + PTE_MTC0 zero, MIPS_COP_0_TLB_LO1 + MTC0 zero, MIPS_COP_0_TLB_PG_MASK + HAZARD_DELAY + tlbwi + HAZARD_DELAY + +#ifdef KSTACK_LARGE_PAGE +/* + * With a KSTACK_PAGE_SIZE of 16K and PAGE_SIZE of 4K it is possible that + * a second TLB entry is currently mapping the kernel thread stack as a + * regular 4K sized page(s). Check for this case and, if so, invalidate + * that TLB entry as well. + */ +#if (PAGE_SIZE != 4096) && (KSTACK_PAGE_SIZE != 16384) +#error PAGE_SIZE is not 4K or KSTACK_PAGE_SIZE is not 16K. +#endif +inval_nxt1: + move v1, v0 + PTR_ADDU v1, PAGE_SIZE * 2 + MTC0 v1, MIPS_COP_0_TLB_HI # VPN = va + HAZARD_DELAY + tlbp # probe VPN + HAZARD_DELAY + mfc0 s0, MIPS_COP_0_TLB_INDEX + HAZARD_DELAY + + # MIPS_KSEG0_START + (2 * index * PAGE_SIZE) -> MIPS_COP_0_TLB_HI + PTR_LI t1, MIPS_KSEG0_START # invalidate tlb entry + bltz s0, entry0set nop - sll s0, PAGE_SHIFT + 1 - addu t1, s0 - MTC0 t1, MIPS_COP_0_TLB_HI + sll s0, PAGE_SHIFT + 1 + PTR_ADDU t1, s0 + MTC0 t1, MIPS_COP_0_TLB_HI PTE_MTC0 zero, MIPS_COP_0_TLB_LO0 PTE_MTC0 zero, MIPS_COP_0_TLB_LO1 + MTC0 zero, MIPS_COP_0_TLB_PG_MASK HAZARD_DELAY tlbwi HAZARD_DELAY - MTC0 v0, MIPS_COP_0_TLB_HI # set VPN again +#endif /* KSTACK_LARGE_PAGE */ entry0set: + MTC0 v0, MIPS_COP_0_TLB_HI # set VPN again + HAZARD_DELAY /* SMP!! - Works only for unshared TLB case - i.e. no v-cpus */ - mtc0 zero, MIPS_COP_0_TLB_INDEX # TLB entry #0 + mtc0 zero, MIPS_COP_0_TLB_INDEX # TLB entry #0 HAZARD_DELAY PTE_MTC0 a1, MIPS_COP_0_TLB_LO0 # upte[0] HAZARD_DELAY PTE_MTC0 a2, MIPS_COP_0_TLB_LO1 # upte[1] +#ifdef KSTACK_LARGE_PAGE + HAZARD_DELAY + li t1, KSTACK_TLBMASK_MASK + MTC0 t1, MIPS_COP_0_TLB_PG_MASK HAZARD_DELAY +#else + MTC0 zero, MIPS_COP_0_TLB_PG_MASK + HAZARD_DELAY +#endif tlbwi # set TLB entry #0 HAZARD_DELAY + +#ifdef KSTACK_LARGE_PAGE + MTC0 zero, MIPS_COP_0_TLB_PG_MASK + HAZARD_DELAY +#endif /* * Now running on new u struct. 
*/ diff --git a/sys/mips/mips/tlb.c b/sys/mips/mips/tlb.c index 1ad8a11..77c77fa 100644 --- a/sys/mips/mips/tlb.c +++ b/sys/mips/mips/tlb.c @@ -40,10 +40,14 @@ #include #include +#include "opt_vm.h" + #if defined(CPU_CNMIPS) #define MIPS_MAX_TLB_ENTRIES 128 #elif defined(CPU_NLM) #define MIPS_MAX_TLB_ENTRIES (2048 + 128) +#elif defined(CPU_CHERI) +#define MIPS_MAX_TLB_ENTRIES 144 #else #define MIPS_MAX_TLB_ENTRIES 64 #endif @@ -102,20 +106,31 @@ tlb_insert_wired(unsigned i, vm_offset_t va, pt_entry_t pte0, pt_entry_t pte1) { register_t asid; register_t s; + uint32_t pagemask; + unsigned long mask, size; - va &= ~PAGE_MASK; + KASSERT((TLBLO_PTE_TO_IDX(pte0) == TLBLO_PTE_TO_IDX(pte1)), + ("%s: pte0 and pte1 page sizes don't match", __func__)); + + /* Compute the page mask and size. */ + pagemask = TLBLO_PTE_TO_MASK(pte0); + mask = (unsigned long)pagemask | PAGE_MASK; /* OR it with lower 12 bits */ + size = mask + 1; + + va &= ~mask; s = intr_disable(); asid = mips_rd_entryhi() & TLBHI_ASID_MASK; mips_wr_index(i); - mips_wr_pagemask(0); + mips_wr_pagemask(pagemask); mips_wr_entryhi(TLBHI_ENTRY(va, 0)); mips_wr_entrylo0(pte0); mips_wr_entrylo1(pte1); tlb_write_indexed(); mips_wr_entryhi(asid); + mips_wr_pagemask(0); intr_restore(s); } @@ -137,7 +152,6 @@ tlb_invalidate_address(struct pmap *pmap, vm_offset_t va) i = mips_rd_index(); if (i >= 0) tlb_invalidate_one(i); - mips_wr_entryhi(asid); intr_restore(s); } @@ -298,29 +312,53 @@ tlb_update(struct pmap *pmap, vm_offset_t va, pt_entry_t pte) register_t asid; register_t s; int i; + uint32_t pagemask; + unsigned long mask, size; + pt_entry_t pte0, pte1; - va &= ~PAGE_MASK; + /* Compute the page mask and size. */ + pagemask = TLBLO_PTE_TO_MASK(pte); + mask = (unsigned long)pagemask | PAGE_MASK; /* OR it with lower 12 bits */ + size = mask + 1; + + va &= ~mask; pte &= ~TLBLO_SWBITS_MASK; s = intr_disable(); asid = mips_rd_entryhi() & TLBHI_ASID_MASK; - mips_wr_pagemask(0); + mips_wr_pagemask(pagemask); mips_wr_entryhi(TLBHI_ENTRY(va, pmap_asid(pmap))); tlb_probe(); i = mips_rd_index(); if (i >= 0) { tlb_read(); + pte0 = mips_rd_entrylo0(); + pte1 = mips_rd_entrylo1(); + KASSERT((TLBLO_PTE_TO_IDX(pte) == TLBLO_PTE_TO_IDX(pte0) && + TLBLO_PTE_TO_IDX(pte) == TLBLO_PTE_TO_IDX(pte1)), + ("%s: pte, pte0 and pte1 page sizes don't match", __func__)); - if ((va & PAGE_SIZE) == 0) { + if ((va & size) == 0) { mips_wr_entrylo0(pte); + if (pagemask & ~PAGE_MASK) { + /* Superpage */ + pte1 = (pte1 & ~(PTE_VR | PTE_D)) | (pte & (PTE_VR | PTE_D)); + mips_wr_entrylo1(pte1); + } } else { mips_wr_entrylo1(pte); + if (pagemask & ~PAGE_MASK) { + /* Superpage */ + pte0 = (pte0 & ~(PTE_VR | PTE_D)) | (pte & (PTE_VR | PTE_D)); + mips_wr_entrylo0(pte0); + } } tlb_write_indexed(); } mips_wr_entryhi(asid); + mips_wr_pagemask(0); intr_restore(s); } diff --git a/sys/mips/mips/trap.c b/sys/mips/mips/trap.c index 98fe812..517cda8 100644 --- a/sys/mips/mips/trap.c +++ b/sys/mips/mips/trap.c @@ -81,6 +81,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #ifdef DDB @@ -262,7 +263,7 @@ char *trap_type[] = { "reserved 21", "reserved 22", "watch", - "reserved 24", + "machine check", "reserved 25", "reserved 26", "reserved 27", @@ -630,6 +631,33 @@ trap(struct trapframe *trapframe) #endif panic("MCHECK\n"); break; + case T_MCHECK + T_USER: + { + uint32_t status = mips_rd_status(); + + if (status & MIPS_SR_TS) { + /* + * Machine Check exception caused by TLB + * detecting a match for multiple entries. 
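
tlb_update() above derives the mapping size by ORing the PTE's page-mask field with the 4K offset bits and then uses (va & size) to decide whether the VA belongs to the even (EntryLo0) or odd (EntryLo1) half of the TLB pair. The same arithmetic in isolation; the 1M value passed below is an assumed stand-in for whatever TLBLO_PTE_TO_MASK() actually encodes, so treat the second call as illustrative only.

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK	0xfffUL		/* 4K offset bits */

static void
classify(uint64_t va, uint32_t pagemask)
{
	uint64_t mask, size;

	mask = (uint64_t)pagemask | PAGE_MASK;
	size = mask + 1;		/* bytes covered by one EntryLo */
	printf("va %#jx: base %#jx, %s half, %ju KB per EntryLo\n",
	    (uintmax_t)va, (uintmax_t)(va & ~mask),
	    (va & size) == 0 ? "even (lo0)" : "odd (lo1)",
	    (uintmax_t)(size >> 10));
}

int
main(void)
{
	classify(0x10001000ULL, 0);		/* 4K pages */
	classify(0x10102000ULL, 0xff000);	/* assumed 1M-page mask value */
	return (0);
}
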
+ * + * Attempt to recover by flushing the user TLB + * and resetting the status bit. + */ + printf("Machine Check in User - Dup TLB entry. " + "Recovering...\n"); + pmap = &p->p_vmspace->vm_pmap; + tlb_invalidate_all_user(pmap); + mips_wr_status(status & ~MIPS_SR_TS); + + return (trapframe->pc); + } else { +#ifdef DDB + kdb_trap(type, 0, trapframe); +#endif + panic("MCHECK\n"); + } + } + break; case T_TLB_MOD: /* check for kernel address */ if (KERNLAND(trapframe->badvaddr)) { @@ -692,10 +720,23 @@ trap(struct trapframe *trapframe) case T_TLB_LD_MISS + T_USER: ftype = VM_PROT_READ; - goto dofault; + goto checkrefbit; case T_TLB_ST_MISS + T_USER: ftype = VM_PROT_WRITE; + +checkrefbit: + /* + * Was this trap caused by the PTE_VR bit not being set? + */ + if (pmap_emulate_referenced(&p->p_vmspace->vm_pmap, + trapframe->badvaddr) == 0) { + if (!usermode) { + return (trapframe->pc); + } + goto out; + } + dofault: { vm_offset_t va; @@ -1418,9 +1459,16 @@ get_mapping_info(vm_offset_t va, pd_entry_t **pdepp, pt_entry_t **ptepp) struct proc *p = curproc; pdep = (&(p->p_vmspace->vm_pmap.pm_segtab[(va >> SEGSHIFT) & (NPDEPG - 1)])); - if (*pdep) - ptep = pmap_pte(&p->p_vmspace->vm_pmap, va); - else + if (*pdep) { +#if VM_NRESERVLEVEL > 0 + pd_entry_t *pde = &pdep[(va >> PDRSHIFT) & (NPDEPG - 1)]; + + if (pde_is_superpage(pde)) + ptep = (pt_entry_t *)pde; + else +#endif /* VM_NRESERVLEVEL > 0 */ + ptep = pmap_pte(&p->p_vmspace->vm_pmap, va); + } else ptep = (pt_entry_t *)0; *pdepp = pdep; diff --git a/sys/mips/mips/uma_machdep.c b/sys/mips/mips/uma_machdep.c index e70dded..a055c05 100644 --- a/sys/mips/mips/uma_machdep.c +++ b/sys/mips/mips/uma_machdep.c @@ -52,17 +52,27 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { +#ifdef MIPS64_NEW_PMAP + m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); +#else /* ! MIPS64_NEW_PMAP */ m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags); +#endif /* ! MIPS64_NEW_PMAP */ if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else +#ifdef MIPS64_NEW_PMAP + VM_WAIT; +#else /* ! MIPS64_NEW_PMAP */ pmap_grow_direct_page_cache(); +#endif /* ! MIPS64_NEW_PMAP */ } else break; } pa = VM_PAGE_TO_PHYS(m); + if ((wait & M_NODUMP) == 0) + dump_add_page(pa); va = (void *)MIPS_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); diff --git a/sys/mips/mips/vm_machdep.c b/sys/mips/mips/vm_machdep.c index 26cdbff..a24366b 100644 --- a/sys/mips/mips/vm_machdep.c +++ b/sys/mips/mips/vm_machdep.c @@ -247,7 +247,6 @@ void cpu_thread_swapin(struct thread *td) { pt_entry_t *pte; - int i; /* * The kstack may be at a different physical address now. @@ -255,10 +254,21 @@ cpu_thread_swapin(struct thread *td) * part of the thread struct so cpu_switch() can quickly map in * the pcb struct and kernel stack. */ +#ifdef KSTACK_LARGE_PAGE + /* Just one entry for one large kernel page. */ + pte = pmap_pte(kernel_pmap, td->td_kstack); + td->td_md.md_upte[0] = *pte & ~TLBLO_SWBITS_MASK; + td->td_md.md_upte[1] = 1; + +#else + + int i; + for (i = 0; i < KSTACK_PAGES; i++) { pte = pmap_pte(kernel_pmap, td->td_kstack + i * PAGE_SIZE); td->td_md.md_upte[i] = *pte & ~TLBLO_SWBITS_MASK; } +#endif /* ! 
KSTACK_LARGE_PAGE */ } void @@ -270,17 +280,31 @@ void cpu_thread_alloc(struct thread *td) { pt_entry_t *pte; - int i; - KASSERT((td->td_kstack & (1 << PAGE_SHIFT)) == 0, ("kernel stack must be aligned.")); + KASSERT((td->td_kstack & ((KSTACK_PAGE_SIZE * 2) - 1) ) == 0, + ("kernel stack must be aligned.")); td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_pages * PAGE_SIZE) - 1; td->td_frame = &td->td_pcb->pcb_regs; - for (i = 0; i < KSTACK_PAGES; i++) { - pte = pmap_pte(kernel_pmap, td->td_kstack + i * PAGE_SIZE); - td->td_md.md_upte[i] = *pte & ~TLBLO_SWBITS_MASK; +#ifdef KSTACK_LARGE_PAGE + /* Just one entry for one large kernel page. */ + pte = pmap_pte(kernel_pmap, td->td_kstack); + td->td_md.md_upte[0] = *pte & ~TLBLO_SWBITS_MASK; + td->td_md.md_upte[1] = 1; + +#else + + { + int i; + + for (i = 0; i < KSTACK_PAGES; i++) { + pte = pmap_pte(kernel_pmap, td->td_kstack + i * + PAGE_SIZE); + td->td_md.md_upte[i] = *pte & ~TLBLO_SWBITS_MASK; + } } +#endif /* ! KSTACK_LARGE_PAGE */ } void diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index c9ee890..2ca1292 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -315,6 +315,161 @@ SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0, #define KSTACK_MAX_PAGES 32 #endif +#if defined(__mips__) + +static vm_offset_t +vm_kstack_valloc(int pages) +{ + vm_offset_t ks; + + /* + * We need to align the kstack's mapped address to fit within + * a single TLB entry. + */ + if (vmem_xalloc(kernel_arena, + (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, + KSTACK_PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, + M_BESTFIT | M_NOWAIT, &ks)) { + return (0); + } + + return (ks); +} + +#ifdef KSTACK_LARGE_PAGE + +#define KSTACK_OBJT OBJT_PHYS + +static int +vm_kstack_palloc(vm_object_t ksobj, vm_offset_t ks, int allocflags, int pages, + vm_page_t ma[]) +{ + vm_page_t m, end_m; + int i; + + KASSERT((ksobj != NULL), ("vm_kstack_palloc: invalid VM object")); + VM_OBJECT_ASSERT_WLOCKED(ksobj); + + allocflags = (allocflags & ~VM_ALLOC_CLASS_MASK) | VM_ALLOC_NORMAL; + + for (i = 0; i < pages; i++) { +retrylookup: + if ((m = vm_page_lookup(ksobj, i)) == NULL) + break; + if (vm_page_busied(m)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. + */ + vm_page_aflag_set(m, PGA_REFERENCED); + vm_page_lock(m); + VM_OBJECT_WUNLOCK(ksobj); + vm_page_busy_sleep(m, "pgrbwt"); + VM_OBJECT_WLOCK(ksobj); + goto retrylookup; + } else { + if ((allocflags & VM_ALLOC_WIRED) != 0) { + vm_page_lock(m); + vm_page_wire(m); + vm_page_unlock(m); + } + ma[i] = m; + } + } + if (i == pages) + return (i); + + KASSERT((i == 0), ("vm_kstack_palloc: ksobj already has kstack pages")); + + for (;;) { + m = vm_page_alloc_contig(ksobj, 0, allocflags, + atop(KSTACK_PAGE_SIZE), 0ul, ~0ul, KSTACK_PAGE_SIZE * 2, 0, + VM_MEMATTR_DEFAULT); + if (m != NULL) + break; + VM_OBJECT_WUNLOCK(ksobj); + VM_WAIT; + VM_OBJECT_WLOCK(ksobj); + } + end_m = m + atop(KSTACK_PAGE_SIZE); + for (i = 0; m < end_m; m++) { + m->pindex = (vm_pindex_t)i; + if ((allocflags & VM_ALLOC_NOBUSY) != 0) + m->valid = VM_PAGE_BITS_ALL; + ma[i] = m; + i++; + } + return (i); +} + +#else /* ! 
KSTACK_LARGE_PAGE */ + +#define KSTACK_OBJT OBJT_DEFAULT + +static int +vm_kstack_palloc(vm_object_t ksobj, vm_offset_t ks, int allocflags, int pages, + vm_page_t ma[]) +{ + int i; + + KASSERT((ksobj != NULL), ("vm_kstack_palloc: invalid VM object")); + VM_OBJECT_ASSERT_WLOCKED(ksobj); + + allocflags = (allocflags & ~VM_ALLOC_CLASS_MASK) | VM_ALLOC_NORMAL; + + for (i = 0; i < pages; i++) { + /* + * Get a kernel stack page. + */ + ma[i] = vm_page_grab(ksobj, i, allocflags); + if (allocflags & VM_ALLOC_NOBUSY) + ma[i]->valid = VM_PAGE_BITS_ALL; + } + + return (i); +} +#endif /* ! KSTACK_LARGE_PAGE */ + +#else /* ! __mips__ */ + +#define KSTACK_OBJT OBJT_DEFAULT + +static vm_offset_t +vm_kstack_valloc(int pages) +{ + vm_offset_t ks; + + ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); + + return(ks); +} + +static int +vm_kstack_palloc(vm_object_t ksobj, vm_offset_t ks, int allocflags, int pages, + vm_page_t ma[]) +{ + int i; + + KASSERT((ksobj != NULL), ("vm_kstack_palloc: invalid VM object")); + VM_OBJECT_ASSERT_WLOCKED(ksobj); + + allocflags = (allocflags & ~VM_ALLOC_CLASS_MASK) | VM_ALLOC_NORMAL; + + for (i = 0; i < pages; i++) { + /* + * Get a kernel stack page. + */ + ma[i] = vm_page_grab(ksobj, i, allocflags); + if (allocflags & VM_ALLOC_NOBUSY) + ma[i]->valid = VM_PAGE_BITS_ALL; + } + + return (i); +} +#endif /* ! __mips__ */ + + /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and @@ -325,9 +480,8 @@ vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; - vm_page_t m, ma[KSTACK_MAX_PAGES]; + vm_page_t ma[KSTACK_MAX_PAGES]; struct kstack_cache_entry *ks_ce; - int i; /* Bounds check */ if (pages <= 1) @@ -353,24 +507,12 @@ vm_thread_new(struct thread *td, int pages) /* * Allocate an object for the kstack. */ - ksobj = vm_object_allocate(OBJT_DEFAULT, pages); - + ksobj = vm_object_allocate(KSTACK_OBJT, pages); + /* * Get a kernel virtual address for this thread's kstack. */ -#if defined(__mips__) - /* - * We need to align the kstack's mapped address to fit within - * a single TLB entry. - */ - if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, - PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, - M_BESTFIT | M_NOWAIT, &ks)) { - ks = 0; - } -#else - ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); -#endif + ks = vm_kstack_valloc(pages); if (ks == 0) { printf("vm_thread_new: kstack allocation failed\n"); vm_object_deallocate(ksobj); @@ -389,21 +531,15 @@ vm_thread_new(struct thread *td, int pages) * want to deallocate them. */ td->td_kstack_pages = pages; - /* - * For the length of the stack, link in a real page of ram for each - * page of stack. - */ + VM_OBJECT_WLOCK(ksobj); - for (i = 0; i < pages; i++) { - /* - * Get a kernel stack page. 
- */ - m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY | - VM_ALLOC_NORMAL | VM_ALLOC_WIRED); - ma[i] = m; - m->valid = VM_PAGE_BITS_ALL; - } + pages = vm_kstack_palloc(ksobj, ks, (VM_ALLOC_NOBUSY | VM_ALLOC_WIRED), + pages, ma); VM_OBJECT_WUNLOCK(ksobj); + if (pages == 0) { + printf("vm_thread_new: vm_kstack_palloc() failed\n"); + return (0); + } pmap_qenter(ks, ma, pages); return (1); } @@ -576,9 +712,9 @@ vm_thread_swapin(struct thread *td) pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_WLOCK(ksobj); - for (i = 0; i < pages; i++) - ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | - VM_ALLOC_WIRED); + rv = vm_kstack_palloc(ksobj, td->td_kstack, (VM_ALLOC_NORMAL | + VM_ALLOC_WIRED), pages, ma); + KASSERT(rv != 0, ("vm_thread_swapin: vm_kstack_palloc() failed")); for (i = 0; i < pages; i++) { if (ma[i]->valid != VM_PAGE_BITS_ALL) { vm_page_assert_xbusied(ma[i]);
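
For MIPS, vm_kstack_valloc() above asks vmem for a KVA aligned to twice the kernel-stack page size, which is what lets the whole stack sit in a single even/odd TLB pair and lets cpu_thread_alloc() cache just one PTE pair. The sketch below checks that property under an assumed 16K stack page; the constants are illustrative stand-ins for the KSTACK_* macros, not their definitions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE		(1ULL << 12)
#define KSTACK_PAGE_SIZE	(1ULL << 14)	/* assumed 16K stack page */

int
main(void)
{
	/* A KVA as returned by an allocator aligned to KSTACK_PAGE_SIZE * 2. */
	uint64_t ks = 0xc000000000050000ULL & ~(KSTACK_PAGE_SIZE * 2 - 1);

	printf("kstack va %#jx sits in the %s half of its TLB pair\n",
	    (uintmax_t)ks,
	    (ks & KSTACK_PAGE_SIZE) == 0 ? "even (EntryLo0)" : "odd (EntryLo1)");
	printf("TLB entries needed: %llu with 4K pages, 1 with a 16K page\n",
	    (unsigned long long)(KSTACK_PAGE_SIZE / PAGE_SIZE / 2));
	return (0);
}
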