diff --git a/sys/compat/linuxkpi/common/include/linux/gfp.h b/sys/compat/linuxkpi/common/include/linux/gfp.h index a20e3606ca0..06f4777a7d8 100644 --- a/sys/compat/linuxkpi/common/include/linux/gfp.h +++ b/sys/compat/linuxkpi/common/include/linux/gfp.h @@ -56,6 +56,7 @@ #define __GFP_IO 0 #define __GFP_NO_KSWAPD 0 +#define __GFP_KSWAPD_RECLAIM 0 #define __GFP_WAIT M_WAITOK #define __GFP_DMA32 (1U << 24) /* LinuxKPI only */ #define __GFP_BITS_SHIFT 25 @@ -74,7 +75,7 @@ #define GFP_TEMPORARY M_NOWAIT #define GFP_NATIVE_MASK (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_ZERO) #define GFP_TRANSHUGE 0 -#define GFP_TRANSHUGE_LIGHT 0 +#define GFP_TRANSHUGE_LIGHT 0 CTASSERT((__GFP_DMA32 & GFP_NATIVE_MASK) == 0); CTASSERT((__GFP_BITS_MASK & GFP_NATIVE_MASK) == GFP_NATIVE_MASK); diff --git a/sys/compat/linuxkpi/common/include/linux/io.h b/sys/compat/linuxkpi/common/include/linux/io.h index 3c75e86b1a1..76b84304f3b 100644 --- a/sys/compat/linuxkpi/common/include/linux/io.h +++ b/sys/compat/linuxkpi/common/include/linux/io.h @@ -42,6 +42,32 @@ * XXX This is all x86 specific. It should be bus space access. */ + +/* rmb and wmb are declared in machine/atomic.h, so should be included first. */ +#ifndef __io_br +#define __io_br() __compiler_membar() +#endif + +#ifndef __io_ar +#ifdef rmb +#define __io_ar() rmb() +#else +#define __io_ar() __compiler_membar() +#endif +#endif + +#ifndef __io_bw +#ifdef wmb +#define __io_bw() wmb() +#else +#define __io_bw() __compiler_membar() +#endif +#endif + +#ifndef __io_aw +#define __io_aw() __compiler_membar() +#endif + /* Access MMIO registers atomically without barriers and byte swapping. */ static inline uint8_t @@ -112,9 +138,9 @@ readb(const volatile void *addr) { uint8_t v; - __compiler_membar(); + __io_br(); v = *(const volatile uint8_t *)addr; - __compiler_membar(); + __io_ar(); return (v); } #define readb(addr) readb(addr) @@ -123,9 +149,9 @@ readb(const volatile void *addr) static inline void writeb(uint8_t v, volatile void *addr) { - __compiler_membar(); + __io_bw(); *(volatile uint8_t *)addr = v; - __compiler_membar(); + __io_aw(); } #define writeb(v, addr) writeb(v, addr) @@ -135,9 +161,9 @@ readw(const volatile void *addr) { uint16_t v; - __compiler_membar(); - v = *(const volatile uint16_t *)addr; - __compiler_membar(); + __io_br(); + v = le16toh(__raw_readw(addr)); + __io_ar(); return (v); } #define readw(addr) readw(addr) @@ -146,9 +172,9 @@ readw(const volatile void *addr) static inline void writew(uint16_t v, volatile void *addr) { - __compiler_membar(); - *(volatile uint16_t *)addr = v; - __compiler_membar(); + __io_bw(); + __raw_writew(htole16(v), addr); + __io_aw(); } #define writew(v, addr) writew(v, addr) @@ -158,9 +184,9 @@ readl(const volatile void *addr) { uint32_t v; - __compiler_membar(); - v = *(const volatile uint32_t *)addr; - __compiler_membar(); + __io_br(); + v = le32toh(__raw_readl(addr)); + __io_ar(); return (v); } #define readl(addr) readl(addr) @@ -169,9 +195,9 @@ readl(const volatile void *addr) static inline void writel(uint32_t v, volatile void *addr) { - __compiler_membar(); - *(volatile uint32_t *)addr = v; - __compiler_membar(); + __io_bw(); + __raw_writel(htole32(v), addr); + __io_aw(); } #define writel(v, addr) writel(v, addr) @@ -183,9 +209,9 @@ readq(const volatile void *addr) { uint64_t v; - __compiler_membar(); - v = *(const volatile uint64_t *)addr; - __compiler_membar(); + __io_br(); + v = le64toh(__raw_readq(addr)); + __io_ar(); return (v); } #define readq(addr) readq(addr) @@ -193,9 +219,9 @@ readq(const volatile void 
*addr) static inline void writeq(uint64_t v, volatile void *addr) { - __compiler_membar(); - *(volatile uint64_t *)addr = v; - __compiler_membar(); + __io_bw(); + __raw_writeq(htole64(v), addr); + __io_aw(); } #define writeq(v, addr) writeq(v, addr) #endif @@ -206,7 +232,7 @@ writeq(uint64_t v, volatile void *addr) static inline uint8_t readb_relaxed(const volatile void *addr) { - return (*(const volatile uint8_t *)addr); + return (__raw_readb(addr)); } #define readb_relaxed(addr) readb_relaxed(addr) @@ -214,7 +240,7 @@ readb_relaxed(const volatile void *addr) static inline void writeb_relaxed(uint8_t v, volatile void *addr) { - *(volatile uint8_t *)addr = v; + __raw_writeb(v, addr); } #define writeb_relaxed(v, addr) writeb_relaxed(v, addr) @@ -222,7 +248,7 @@ writeb_relaxed(uint8_t v, volatile void *addr) static inline uint16_t readw_relaxed(const volatile void *addr) { - return (*(const volatile uint16_t *)addr); + return (le16toh(__raw_readw(addr))); } #define readw_relaxed(addr) readw_relaxed(addr) @@ -230,7 +256,7 @@ readw_relaxed(const volatile void *addr) static inline void writew_relaxed(uint16_t v, volatile void *addr) { - *(volatile uint16_t *)addr = v; + __raw_writew(htole16(v), addr); } #define writew_relaxed(v, addr) writew_relaxed(v, addr) @@ -238,7 +264,7 @@ writew_relaxed(uint16_t v, volatile void *addr) static inline uint32_t readl_relaxed(const volatile void *addr) { - return (*(const volatile uint32_t *)addr); + return (le32toh(__raw_readl(addr))); } #define readl_relaxed(addr) readl_relaxed(addr) @@ -246,7 +272,7 @@ readl_relaxed(const volatile void *addr) static inline void writel_relaxed(uint32_t v, volatile void *addr) { - *(volatile uint32_t *)addr = v; + __raw_writel(htole32(v), addr); } #define writel_relaxed(v, addr) writel_relaxed(v, addr) @@ -256,14 +282,14 @@ writel_relaxed(uint32_t v, volatile void *addr) static inline uint64_t readq_relaxed(const volatile void *addr) { - return (*(const volatile uint64_t *)addr); + return (le64toh(__raw_readq(addr))); } #define readq_relaxed(addr) readq_relaxed(addr) static inline void writeq_relaxed(uint64_t v, volatile void *addr) { - *(volatile uint64_t *)addr = v; + __raw_writeq(htole64(v), addr); } #define writeq_relaxed(v, addr) writeq_relaxed(v, addr) #endif @@ -290,7 +316,13 @@ ioread16(const volatile void *addr) static inline uint16_t ioread16be(const volatile void *addr) { - return (bswap16(readw(addr))); + uint16_t v; + + __io_br(); + v = (be16toh(__raw_readw(addr))); + __io_ar(); + + return (v); } #define ioread16be(addr) ioread16be(addr) @@ -306,7 +338,13 @@ ioread32(const volatile void *addr) static inline uint32_t ioread32be(const volatile void *addr) { - return (bswap32(readl(addr))); + uint32_t v; + + __io_br(); + v = (be32toh(__raw_readl(addr))); + __io_ar(); + + return (v); } #define ioread32be(addr) ioread32be(addr) @@ -338,7 +376,9 @@ iowrite32(uint32_t v, volatile void *addr) static inline void iowrite32be(uint32_t v, volatile void *addr) { - writel(bswap32(v), addr); + __io_bw(); + __raw_writel(htobe32(v), addr); + __io_aw(); } #define iowrite32be(v, addr) iowrite32be(v, addr) diff --git a/sys/compat/linuxkpi/common/include/linux/pci.h b/sys/compat/linuxkpi/common/include/linux/pci.h index 60feeeca600..29467020596 100644 --- a/sys/compat/linuxkpi/common/include/linux/pci.h +++ b/sys/compat/linuxkpi/common/include/linux/pci.h @@ -193,6 +193,7 @@ struct pci_driver { struct pci_bus { struct pci_dev *self; + int domain; int number; }; @@ -259,26 +260,6 @@ linux_pci_find_irq_dev(unsigned int irq) return 
(found); } -static inline unsigned long -pci_resource_start(struct pci_dev *pdev, int bar) -{ - struct resource_list_entry *rle; - - if ((rle = linux_pci_get_bar(pdev, bar)) == NULL) - return (0); - return rle->start; -} - -static inline unsigned long -pci_resource_len(struct pci_dev *pdev, int bar) -{ - struct resource_list_entry *rle; - - if ((rle = linux_pci_get_bar(pdev, bar)) == NULL) - return (0); - return rle->count; -} - static inline int pci_resource_type(struct pci_dev *pdev, int bar) { @@ -438,6 +419,9 @@ pci_disable_msix(struct pci_dev *pdev) pdev->dev.msix_max = 0; } +unsigned long pci_resource_start(struct pci_dev *pdev, int bar); +unsigned long pci_resource_len(struct pci_dev *pdev, int bar); + static inline bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) { @@ -623,7 +607,7 @@ static inline void pci_disable_sriov(struct pci_dev *dev) /* XXX This should not be necessary. */ #define pcix_set_mmrbc(d, v) 0 #define pcix_get_max_mmrbc(d) 0 -#define pcie_set_readrq(d, v) 0 +#define pcie_set_readrq(d, v) pci_set_max_read_req(&(d)->dev, (v)) #define PCI_DMA_BIDIRECTIONAL 0 #define PCI_DMA_TODEVICE 1 diff --git a/sys/compat/linuxkpi/common/src/linux_pci.c b/sys/compat/linuxkpi/common/src/linux_pci.c index 72651a238da..172b21b0ff3 100644 --- a/sys/compat/linuxkpi/common/src/linux_pci.c +++ b/sys/compat/linuxkpi/common/src/linux_pci.c @@ -29,12 +29,12 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include -#include #include #include #include @@ -72,6 +72,7 @@ static device_method_t pci_methods[] = { DEVMETHOD(device_suspend, linux_pci_suspend), DEVMETHOD(device_resume, linux_pci_resume), DEVMETHOD(device_shutdown, linux_pci_shutdown), + DEVMETHOD(bus_translate_resource, bus_generic_translate_resource), DEVMETHOD_END }; @@ -131,7 +132,6 @@ linux_pci_attach(device_t dev) struct pci_driver *pdrv; const struct pci_device_id *id; device_t parent; - devclass_t devclass; int error; linux_set_current(curthread); @@ -140,7 +140,6 @@ linux_pci_attach(device_t dev) pdev = device_get_softc(dev); parent = device_get_parent(dev); - devclass = device_get_devclass(parent); if (pdrv->isdrm) { dinfo = device_get_ivars(parent); device_set_ivars(dev, dinfo); @@ -175,6 +174,7 @@ linux_pci_attach(device_t dev) pbus = malloc(sizeof(*pbus), M_DEVBUF, M_WAITOK | M_ZERO); pbus->self = pdev; pbus->number = pci_get_bus(dev); + pbus->domain = pci_get_domain(dev); pdev->bus = pbus; } @@ -302,6 +302,37 @@ linux_pci_register_driver(struct pci_driver *pdrv) return (_linux_pci_register_driver(pdrv, dc)); } +unsigned long +pci_resource_start(struct pci_dev *pdev, int bar) +{ + struct resource_list_entry *rle; + unsigned long newstart; + device_t dev; + + if ((rle = linux_pci_get_bar(pdev, bar)) == NULL) + return (0); + dev = pci_find_dbsf(pdev->bus->domain, pdev->bus->number, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + MPASS(dev != NULL); + if (bus_generic_translate_resource(dev, rle->type, + rle->start, &newstart)) { + device_printf(pdev->dev.bsddev, "translate of %#lx failed\n", + rle->start); + return (0); + } + return (newstart); +} + +unsigned long +pci_resource_len(struct pci_dev *pdev, int bar) +{ + struct resource_list_entry *rle; + + if ((rle = linux_pci_get_bar(pdev, bar)) == NULL) + return (0); + return (rle->count); +} + int linux_pci_register_drm_driver(struct pci_driver *pdrv) { diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc index 2e68645d90b..e6d566ac282 100644 --- a/sys/conf/files.powerpc +++ b/sys/conf/files.powerpc @@ -108,6 
+108,7 @@ powerpc/aim/locore.S optional aim no-obj powerpc/aim/aim_machdep.c optional aim powerpc/aim/mmu_oea.c optional aim powerpc powerpc/aim/mmu_oea64.c optional aim +powerpc/aim/mmu_radix.c optional aim powerpc64 powerpc/aim/moea64_if.m optional aim powerpc/aim/moea64_native.c optional aim powerpc/aim/mp_cpudep.c optional aim diff --git a/sys/conf/ldscript.powerpc64 b/sys/conf/ldscript.powerpc64 index 866bc05e622..40181632d87 100644 --- a/sys/conf/ldscript.powerpc64 +++ b/sys/conf/ldscript.powerpc64 @@ -6,12 +6,17 @@ OUTPUT_ARCH(powerpc:common64) ENTRY(__start) SEARCH_DIR(/usr/lib); PROVIDE (__stack = 0); +PHDRS +{ + text PT_LOAD ; + dynamic PT_DYNAMIC ; +} SECTIONS { /* Low-address wrapper for bootloaders (kexec/kboot) that can't parse ELF */ . = kernbase - 0x100; - .kboot : { *(.text.kboot) } + .kboot : { *(.text.kboot) } :text /* Read-only sections, merged into text segment: */ . = kernbase; @@ -19,6 +24,7 @@ SECTIONS .text : { + *(.glink) *(.text) *(.stub) /* .gnu.warning sections are handled specially by elf32.em. */ @@ -29,7 +35,6 @@ SECTIONS PROVIDE (etext = .); /* Do not emit PT_INTERP section, which confuses some loaders (kexec-lite) */ - .interpX : { *(.interp) } : NONE /DISCARD/ : { *(.interp) } /* Also delete notes */ @@ -95,7 +100,7 @@ SECTIONS . = ALIGN(4096); .got : ALIGN(8) { __tocbase = .; *(.got .toc) } - .dynamic : { *(.dynamic) } + .dynamic : { *(.dynamic) } :text :dynamic /* Put .ctors and .dtors next to the .got2 section, so that the pointers get relocated with -mrelocatable. Also put in the .fixup pointers. The current compiler no longer needs this, but keep it around for 2.7.2 */ diff --git a/sys/dev/fdt/fdt_slicer.c b/sys/dev/fdt/fdt_slicer.c index ebbfa45e2ec..9442aebdb6e 100644 --- a/sys/dev/fdt/fdt_slicer.c +++ b/sys/dev/fdt/fdt_slicer.c @@ -182,4 +182,5 @@ static moduledata_t fdt_slicer_mod = { }; DECLARE_MODULE(fdt_slicer, fdt_slicer_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); +MODULE_DEPEND(fdt_slicer, g_flashmap, 0, 0, 0); MODULE_VERSION(fdt_slicer, 1); diff --git a/sys/dev/ofw/ofwpci.c b/sys/dev/ofw/ofwpci.c index 7ebfe4ab86a..3b9538727c2 100644 --- a/sys/dev/ofw/ofwpci.c +++ b/sys/dev/ofw/ofwpci.c @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -76,6 +77,8 @@ static int ofw_pci_deactivate_resource(device_t, device_t, int, int, struct resource *); static int ofw_pci_adjust_resource(device_t, device_t, int, struct resource *, rman_res_t, rman_res_t); +static int ofw_pci_translate_resource(device_t bus, int type, + rman_res_t start, rman_res_t *newstart); #ifdef __powerpc__ static bus_space_tag_t ofw_pci_bus_get_bus_tag(device_t, device_t); @@ -116,6 +119,7 @@ static device_method_t ofw_pci_methods[] = { DEVMETHOD(bus_activate_resource, ofw_pci_activate_resource), DEVMETHOD(bus_deactivate_resource, ofw_pci_deactivate_resource), DEVMETHOD(bus_adjust_resource, ofw_pci_adjust_resource), + DEVMETHOD(bus_translate_resource, ofw_pci_translate_resource), #ifdef __powerpc__ DEVMETHOD(bus_get_bus_tag, ofw_pci_bus_get_bus_tag), #endif @@ -406,7 +410,6 @@ ofw_pci_alloc_resource(device_t bus, device_t child, int type, int *rid, struct rman *rm; int needactivate; - needactivate = flags & RF_ACTIVE; flags &= ~RF_ACTIVE; @@ -424,11 +427,8 @@ ofw_pci_alloc_resource(device_t bus, device_t child, int type, int *rid, return (bus_generic_alloc_resource(bus, child, type, rid, start, end, count, flags | needactivate)); } - rv = rman_reserve_resource(rm, start, end, count, flags, child); if (rv == NULL) { - 
device_printf(bus, "failed to reserve resource for %s\n", - device_get_nameunit(child)); return (NULL); } @@ -436,9 +436,6 @@ ofw_pci_alloc_resource(device_t bus, device_t child, int type, int *rid, if (needactivate) { if (bus_activate_resource(child, type, *rid, rv) != 0) { - device_printf(bus, - "failed to activate resource for %s\n", - device_get_nameunit(child)); rman_release_resource(rv); return (NULL); } @@ -478,6 +475,45 @@ ofw_pci_release_resource(device_t bus, device_t child, int type, int rid, return (rman_release_resource(res)); } +static int +ofw_pci_translate_resource(device_t bus, int type, rman_res_t start, + rman_res_t *newstart) +{ + struct ofw_pci_softc *sc; + struct ofw_pci_range *rp; + int space; + + sc = device_get_softc(bus); + + /* + * Map this through the ranges list + */ + for (rp = sc->sc_range; rp < sc->sc_range + sc->sc_nrange && + rp->pci_hi != 0; rp++) { + if (start < rp->pci || start >= rp->pci + rp->size) + continue; + + switch (rp->pci_hi & OFW_PCI_PHYS_HI_SPACEMASK) { + case OFW_PCI_PHYS_HI_SPACE_IO: + space = SYS_RES_IOPORT; + break; + case OFW_PCI_PHYS_HI_SPACE_MEM32: + case OFW_PCI_PHYS_HI_SPACE_MEM64: + space = SYS_RES_MEMORY; + break; + default: + space = -1; + } + + if (type == space) { + start += (rp->host - rp->pci); + break; + } + } + *newstart = start; + return (0); +} + static int ofw_pci_activate_resource(device_t bus, device_t child, int type, int rid, struct resource *res) diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index 3e74436e5e5..e797b0a28e2 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -169,6 +169,7 @@ static device_method_t pci_methods[] = { DEVMETHOD(bus_suspend_child, pci_suspend_child), DEVMETHOD(bus_resume_child, pci_resume_child), DEVMETHOD(bus_rescan, pci_rescan_method), + DEVMETHOD(bus_translate_resource, bus_generic_translate_resource), /* PCI interface */ DEVMETHOD(pci_read_config, pci_read_config_method), diff --git a/sys/dev/pci/pci_pci.c b/sys/dev/pci/pci_pci.c index 18bf1196422..da0773c407d 100644 --- a/sys/dev/pci/pci_pci.c +++ b/sys/dev/pci/pci_pci.c @@ -106,6 +106,7 @@ static device_method_t pcib_methods[] = { DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + DEVMETHOD(bus_translate_resource, bus_generic_translate_resource), /* pcib interface */ DEVMETHOD(pcib_maxslots, pcib_ari_maxslots), diff --git a/sys/dev/pci/vga_pci.c b/sys/dev/pci/vga_pci.c index 6019ffb6770..afd9559c0e2 100644 --- a/sys/dev/pci/vga_pci.c +++ b/sys/dev/pci/vga_pci.c @@ -142,11 +142,35 @@ vga_pci_is_boot_display(device_t dev) return (1); } +static void +vga_pci_reset(device_t dev) +{ + int ps; + /* + * FLR is unsupported on GPUs so attempt a power-management reset by cycling + * the device in/out of D3 state. + * PCI spec says we can only go into D3 state from D0 state. + * Transition from D[12] into D0 before going to D3 state. 
+ */ + ps = pci_get_powerstate(dev); + if (ps != PCI_POWERSTATE_D0 && ps != PCI_POWERSTATE_D3) + pci_set_powerstate(dev, PCI_POWERSTATE_D0); + if (pci_get_powerstate(dev) != PCI_POWERSTATE_D3) + pci_set_powerstate(dev, PCI_POWERSTATE_D3); + pci_set_powerstate(dev, ps); +} + + void * vga_pci_map_bios(device_t dev, size_t *size) { - int rid; + struct vga_resource *vr; struct resource *res; + device_t pcib; + uint32_t rom_addr; + uint16_t config; + volatile char *bios; + int i, rid, found; #if defined(__amd64__) || defined(__i386__) if (vga_pci_is_boot_display(dev)) { @@ -164,21 +188,96 @@ vga_pci_map_bios(device_t dev, size_t *size) } #endif - rid = PCIR_BIOS; + pcib = device_get_parent(device_get_parent(dev)); + if (device_get_devclass(device_get_parent(pcib)) == + devclass_find("pci")) { + /* + * The parent bridge is a PCI-to-PCI bridge: check the + * value of the "VGA Enable" bit. + */ + config = pci_read_config(pcib, PCIR_BRIDGECTL_1, 2); + if ((config & PCIB_BCR_VGA_ENABLE) == 0) { + config |= PCIB_BCR_VGA_ENABLE; + pci_write_config(pcib, PCIR_BRIDGECTL_1, config, 2); + } + } + + switch(pci_read_config(dev, PCIR_HDRTYPE, 1)) { + case PCIM_HDRTYPE_BRIDGE: + rid = PCIR_BIOS_1; + break; + case PCIM_HDRTYPE_CARDBUS: + rid = 0; + break; + default: + rid = PCIR_BIOS; + break; + } + if (rid == 0) + return (NULL); res = vga_pci_alloc_resource(dev, NULL, SYS_RES_MEMORY, &rid, 0, ~0, 1, RF_ACTIVE); + if (res == NULL) { + device_printf(dev, "vga_pci_alloc_resource failed\n"); return (NULL); } + bios = rman_get_virtual(res); + *size = rman_get_size(res); + for (found = i = 0; i < hz; i++) { + found = (bios[0] == 0x55 && bios[1] == 0xaa); + if (found) + break; + pause("vgabios", 1); + } + if (found) + return (__DEVOLATILE(void *, bios)); + if (bootverbose) + device_printf(dev, "initial rom mapping failed -- resetting\n"); + /* + * Enable ROM decode + */ + vga_pci_reset(dev); + rom_addr = pci_read_config(dev, rid, 4); + rom_addr &= 0x7ff; + rom_addr |= rman_get_start(res) | 0x1; + pci_write_config(dev, rid, rom_addr, 4); + vr = lookup_res(device_get_softc(dev), rid); + vga_pci_release_resource(dev, NULL, SYS_RES_MEMORY, rid, + vr->vr_res); + + /* + * re-allocate + */ + res = vga_pci_alloc_resource(dev, NULL, SYS_RES_MEMORY, &rid, 0, + ~0, 1, RF_ACTIVE); + if (res == NULL) { + device_printf(dev, "vga_pci_alloc_resource failed\n"); + return (NULL); + } + bios = rman_get_virtual(res); *size = rman_get_size(res); - return (rman_get_virtual(res)); + for (found = i = 0; i < 3*hz; i++) { + found = (bios[0] == 0x55 && bios[1] == 0xaa); + if (found) + break; + pause("vgabios", 1); + } + if (found) + return (__DEVOLATILE(void *, bios)); + device_printf(dev, "rom mapping failed\n"); + vr = lookup_res(device_get_softc(dev), rid); + vga_pci_release_resource(dev, NULL, SYS_RES_MEMORY, rid, + vr->vr_res); + return (NULL); } void vga_pci_unmap_bios(device_t dev, void *bios) { struct vga_resource *vr; + int rid; if (bios == NULL) { return; @@ -192,16 +291,28 @@ vga_pci_unmap_bios(device_t dev, void *bios) return; } #endif - + switch(pci_read_config(dev, PCIR_HDRTYPE, 1)) { + case PCIM_HDRTYPE_BRIDGE: + rid = PCIR_BIOS_1; + break; + case PCIM_HDRTYPE_CARDBUS: + rid = 0; + break; + default: + rid = PCIR_BIOS; + break; + } + if (rid == 0) + return; /* * Look up the PCIR_BIOS resource in our softc. It should match * the address we returned previously. 
*/ - vr = lookup_res(device_get_softc(dev), PCIR_BIOS); + vr = lookup_res(device_get_softc(dev), rid); KASSERT(vr->vr_res != NULL, ("vga_pci_unmap_bios: bios not mapped")); KASSERT(rman_get_virtual(vr->vr_res) == bios, ("vga_pci_unmap_bios: mismatch")); - vga_pci_release_resource(dev, NULL, SYS_RES_MEMORY, PCIR_BIOS, + vga_pci_release_resource(dev, NULL, SYS_RES_MEMORY, rid, vr->vr_res); } diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m index 501e8f82791..0b093b717c7 100644 --- a/sys/kern/bus_if.m +++ b/sys/kern/bus_if.m @@ -66,6 +66,7 @@ CODE { panic("bus_add_child is not implemented"); } + }; /** @@ -396,6 +397,23 @@ METHOD int adjust_resource { rman_res_t _end; }; + +/** + * @brief translate a resource value + * + * + * @param _dev the device associated with the resource + * @param _type the type of resource + * @param _start the starting address of the resource range + * @param _newstart the new starting address of the resource range + */ +METHOD int translate_resource { + device_t _dev; + int _type; + rman_res_t _start; + rman_res_t *_newstart; +}; + /** * @brief Release a resource * diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c index ba177f7895b..c84ccb1c57a 100644 --- a/sys/kern/subr_bus.c +++ b/sys/kern/subr_bus.c @@ -3864,6 +3864,23 @@ bus_generic_resume(device_t dev) return (0); } +/** + * @brief Wrapper function for BUS_TRANSLATE_RESOURCE(). + * + * This function simply calls the BUS_TRANSLATE_RESOURCE() method of the + * parent of @p dev. + */ +int +bus_generic_translate_resource(device_t dev, int type, rman_res_t start, + rman_res_t *newstart) +{ + if (dev->parent == NULL) { + device_printf(dev, "no parent for %lx - EINVAL\n", start); + return (EINVAL); + } + return (BUS_TRANSLATE_RESOURCE(dev->parent, type, start, newstart)); +} + /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * @@ -4989,6 +5006,7 @@ static kobj_method_t root_methods[] = { KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), KOBJMETHOD(bus_setup_intr, root_setup_intr), KOBJMETHOD(bus_child_present, root_child_present), + KOBJMETHOD(bus_translate_resource, bus_generic_translate_resource), KOBJMETHOD(bus_get_cpus, root_get_cpus), KOBJMETHOD_END diff --git a/sys/powerpc/aim/aim_machdep.c b/sys/powerpc/aim/aim_machdep.c index e4772825914..121b7559363 100644 --- a/sys/powerpc/aim/aim_machdep.c +++ b/sys/powerpc/aim/aim_machdep.c @@ -136,6 +136,8 @@ __FBSDID("$FreeBSD$"); struct bat battable[16]; #endif +int radix_mmu = 0; + #ifndef __powerpc64__ /* Bits for running on 64-bit systems in 32-bit mode. */ extern void *testppc64, *testppc64size; @@ -421,9 +423,14 @@ aim_cpu_init(vm_offset_t toc) * in case the platform module had a better idea of what we * should do. 
*/ - if (cpu_features2 & PPC_FEATURE2_ARCH_3_00) - pmap_mmu_install(MMU_TYPE_P9H, BUS_PROBE_GENERIC); - else if (cpu_features & PPC_FEATURE_64) + if (cpu_features2 & PPC_FEATURE2_ARCH_3_00) { + radix_mmu = 1; + TUNABLE_INT_FETCH("radix_mmu", &radix_mmu); + if (radix_mmu) + pmap_mmu_install(MMU_TYPE_RADIX, BUS_PROBE_GENERIC); + else + pmap_mmu_install(MMU_TYPE_P9H, BUS_PROBE_GENERIC); + } else if (cpu_features & PPC_FEATURE_64) pmap_mmu_install(MMU_TYPE_G5, BUS_PROBE_GENERIC); else pmap_mmu_install(MMU_TYPE_OEA, BUS_PROBE_GENERIC); diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c index d4c88eb8941..0503aaa6ad4 100644 --- a/sys/powerpc/aim/mmu_oea.c +++ b/sys/powerpc/aim/mmu_oea.c @@ -362,6 +362,7 @@ static mmu_method_t moea_methods[] = { MMUMETHOD(mmu_page_set_memattr, moea_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea_quick_remove_page), + MMUMETHOD(mmu_page_is_mapped, moea_page_is_mapped), /* Internal interfaces */ MMUMETHOD(mmu_bootstrap, moea_bootstrap), @@ -1102,6 +1103,12 @@ moea_quick_remove_page(mmu_t mmu, vm_offset_t addr) { } +boolean_t +moea_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); +} + /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 85a0814a887..e71a6306a4b 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -285,6 +285,7 @@ void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void moea64_scan_init(mmu_t mmu); vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m); void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr); +boolean_t moea64_page_is_mapped(mmu_t mmu, vm_page_t m); static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, @@ -328,6 +329,7 @@ static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_page_set_memattr, moea64_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea64_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea64_quick_remove_page), + MMUMETHOD(mmu_page_is_mapped, moea64_page_is_mapped), /* Internal interfaces */ MMUMETHOD(mmu_mapdev, moea64_mapdev), @@ -749,7 +751,7 @@ moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelen mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); - if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz) + if (2 * VM_PHYSSEG_MAX < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; @@ -1344,6 +1346,12 @@ moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr) sched_unpin(); } +boolean_t +moea64_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); +} + /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. 
If specified the page diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c new file mode 100644 index 00000000000..e52ac30a9bb --- /dev/null +++ b/sys/powerpc/aim/mmu_radix.c @@ -0,0 +1,6205 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD$"); + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INVARIANTS +#include +#endif + +#define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit)) +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) +#define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit)) + +#include "opt_ddb.h" +#ifdef DDB +static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va); +#endif + +int nkpt = 64; +SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, + "Number of kernel page table pages allocated on bootup"); + +caddr_t crashdumpmap; +vm_paddr_t dmaplimit; + +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +static int pg_ps_enabled = 1; +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pg_ps_enabled, 0, "Are large page mappings enabled?"); +#ifdef INVARIANTS +#define VERBOSE_PMAP 0 +#define VERBOSE_PROTECT 0 +static int pmap_logging; +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN, + &pmap_logging, 0, "verbose debug logging"); +#endif + +static u_int64_t KPTphys; /* phys addr of kernel level 1 */ + +//static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ + +static vm_offset_t qframe = 0; +static struct mtx qframe_mtx; +static epoch_t pmap_epoch; + +void mmu_radix_activate(mmu_t mmu, struct thread *); +void mmu_radix_advise(mmu_t mmu, pmap_t, vm_offset_t, vm_offset_t, int); +void mmu_radix_align_superpage(mmu_t mmu, vm_object_t, vm_ooffset_t, vm_offset_t 
*, + vm_size_t); +void mmu_radix_clear_modify(mmu_t, vm_page_t); +void mmu_radix_copy(mmu_t, pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); +int mmu_radix_map_user_ptr(mmu_t mmu, pmap_t pm, + volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); +int mmu_radix_decode_kernel_ptr(mmu_t, vm_offset_t, int *, vm_offset_t *); +int mmu_radix_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); +void mmu_radix_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, + vm_prot_t); +void mmu_radix_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); +vm_paddr_t mmu_radix_extract(mmu_t, pmap_t pmap, vm_offset_t va); +vm_page_t mmu_radix_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); +void mmu_radix_kenter(mmu_t, vm_offset_t, vm_paddr_t); +vm_paddr_t mmu_radix_kextract(mmu_t, vm_offset_t); +void mmu_radix_kremove(mmu_t, vm_offset_t); +boolean_t mmu_radix_is_modified(mmu_t, vm_page_t); +boolean_t mmu_radix_is_prefaultable(mmu_t, pmap_t, vm_offset_t); +boolean_t mmu_radix_is_referenced(mmu_t, vm_page_t); +void mmu_radix_object_init_pt(mmu_t, pmap_t, vm_offset_t, vm_object_t, + vm_pindex_t, vm_size_t); +boolean_t mmu_radix_page_exists_quick(mmu_t, pmap_t, vm_page_t); +void mmu_radix_page_init(mmu_t, vm_page_t); +boolean_t mmu_radix_page_is_mapped(mmu_t, vm_page_t m); +void mmu_radix_page_set_memattr(mmu_t, vm_page_t, vm_memattr_t); +int mmu_radix_page_wired_mappings(mmu_t, vm_page_t); +void mmu_radix_pinit(mmu_t, pmap_t); +void mmu_radix_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); +boolean_t mmu_radix_ps_enabled(mmu_t, pmap_t); +void mmu_radix_qenter(mmu_t, vm_offset_t, vm_page_t *, int); +void mmu_radix_qremove(mmu_t, vm_offset_t, int); +vm_offset_t mmu_radix_quick_enter_page(mmu_t, vm_page_t); +void mmu_radix_quick_remove_page(mmu_t, vm_offset_t); +boolean_t mmu_radix_ts_referenced(mmu_t, vm_page_t); +void mmu_radix_release(mmu_t, pmap_t); +void mmu_radix_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); +void mmu_radix_remove_all(mmu_t, vm_page_t); +void mmu_radix_remove_pages(mmu_t, pmap_t); +void mmu_radix_remove_write(mmu_t, vm_page_t); +void mmu_radix_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); +void mmu_radix_zero_page(mmu_t, vm_page_t); +void mmu_radix_zero_page_area(mmu_t, vm_page_t, int, int); +int mmu_radix_change_attr(mmu_t, vm_offset_t, vm_size_t, vm_memattr_t); + +#include "mmu_oea64.h" +#include "mmu_if.h" +#include "moea64_if.h" + +/* + * Kernel MMU interface + */ + +static void mmu_radix_bootstrap(mmu_t mmup, + vm_offset_t kernelstart, vm_offset_t kernelend); + +static void mmu_radix_copy_page(mmu_t, vm_page_t, vm_page_t); +static void mmu_radix_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); +static void mmu_radix_growkernel(mmu_t, vm_offset_t); +static void mmu_radix_init(mmu_t); +static int mmu_radix_mincore(mmu_t, pmap_t, vm_offset_t, vm_paddr_t *); +static vm_offset_t mmu_radix_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); +static void mmu_radix_pinit0(mmu_t, pmap_t); + +static void *mmu_radix_mapdev(mmu_t, vm_paddr_t, vm_size_t); +static void *mmu_radix_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); +static void mmu_radix_unmapdev(mmu_t, vm_offset_t, vm_size_t); +static void mmu_radix_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t ma); +static boolean_t mmu_radix_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); +static void mmu_radix_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, + void **va); 
+static void mmu_radix_scan_init(mmu_t mmu); +static void mmu_radix_cpu_bootstrap(mmu_t, int ap); + +static mmu_method_t mmu_radix_methods[] = { + MMUMETHOD(mmu_bootstrap, mmu_radix_bootstrap), + MMUMETHOD(mmu_copy_page, mmu_radix_copy_page), + MMUMETHOD(mmu_copy_pages, mmu_radix_copy_pages), + MMUMETHOD(mmu_cpu_bootstrap, mmu_radix_cpu_bootstrap), + MMUMETHOD(mmu_growkernel, mmu_radix_growkernel), + MMUMETHOD(mmu_init, mmu_radix_init), + MMUMETHOD(mmu_map, mmu_radix_map), + MMUMETHOD(mmu_mincore, mmu_radix_mincore), + MMUMETHOD(mmu_pinit, mmu_radix_pinit), + MMUMETHOD(mmu_pinit0, mmu_radix_pinit0), + + MMUMETHOD(mmu_mapdev, mmu_radix_mapdev), + MMUMETHOD(mmu_mapdev_attr, mmu_radix_mapdev_attr), + MMUMETHOD(mmu_unmapdev, mmu_radix_unmapdev), + MMUMETHOD(mmu_kenter_attr, mmu_radix_kenter_attr), + MMUMETHOD(mmu_dev_direct_mapped,mmu_radix_dev_direct_mapped), + MMUMETHOD(mmu_scan_init, mmu_radix_scan_init), + MMUMETHOD(mmu_dumpsys_map, mmu_radix_dumpsys_map), + MMUMETHOD(mmu_page_is_mapped, mmu_radix_page_is_mapped), + MMUMETHOD(mmu_ps_enabled, mmu_radix_ps_enabled), + MMUMETHOD(mmu_object_init_pt, mmu_radix_object_init_pt), + MMUMETHOD(mmu_protect, mmu_radix_protect), + /* pmap dispatcher interface */ + MMUMETHOD(mmu_clear_modify, mmu_radix_clear_modify), + MMUMETHOD(mmu_copy, mmu_radix_copy), + MMUMETHOD(mmu_enter, mmu_radix_enter), + MMUMETHOD(mmu_enter_object, mmu_radix_enter_object), + MMUMETHOD(mmu_enter_quick, mmu_radix_enter_quick), + MMUMETHOD(mmu_extract, mmu_radix_extract), + MMUMETHOD(mmu_extract_and_hold, mmu_radix_extract_and_hold), + MMUMETHOD(mmu_is_modified, mmu_radix_is_modified), + MMUMETHOD(mmu_is_prefaultable, mmu_radix_is_prefaultable), + MMUMETHOD(mmu_is_referenced, mmu_radix_is_referenced), + MMUMETHOD(mmu_ts_referenced, mmu_radix_ts_referenced), + MMUMETHOD(mmu_page_exists_quick,mmu_radix_page_exists_quick), + MMUMETHOD(mmu_page_init, mmu_radix_page_init), + MMUMETHOD(mmu_page_wired_mappings, mmu_radix_page_wired_mappings), + MMUMETHOD(mmu_qenter, mmu_radix_qenter), + MMUMETHOD(mmu_qremove, mmu_radix_qremove), + MMUMETHOD(mmu_release, mmu_radix_release), + MMUMETHOD(mmu_remove, mmu_radix_remove), + MMUMETHOD(mmu_remove_all, mmu_radix_remove_all), + MMUMETHOD(mmu_remove_write, mmu_radix_remove_write), + MMUMETHOD(mmu_unwire, mmu_radix_unwire), + MMUMETHOD(mmu_zero_page, mmu_radix_zero_page), + MMUMETHOD(mmu_zero_page_area, mmu_radix_zero_page_area), + MMUMETHOD(mmu_activate, mmu_radix_activate), + MMUMETHOD(mmu_quick_enter_page, mmu_radix_quick_enter_page), + MMUMETHOD(mmu_quick_remove_page, mmu_radix_quick_remove_page), + MMUMETHOD(mmu_page_set_memattr, mmu_radix_page_set_memattr), + + /* Internal interfaces */ + MMUMETHOD(mmu_kenter, mmu_radix_kenter), + MMUMETHOD(mmu_kextract, mmu_radix_kextract), + MMUMETHOD(mmu_kremove, mmu_radix_kremove), + MMUMETHOD(mmu_change_attr, mmu_radix_change_attr), + MMUMETHOD(mmu_map_user_ptr, mmu_radix_map_user_ptr), + MMUMETHOD(mmu_decode_kernel_ptr, mmu_radix_decode_kernel_ptr), + { 0, 0 } +}; + +MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods, 0); + +#define METHODVOID(m) mmu_radix_ ## m(mmu_t mmup) + +static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, + struct rwlock **lockp); +static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va); +static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *); +static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp); +static int 
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, + pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp); +static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); +static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde, + struct spglist *free); +static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp); + +static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e, + u_int flags, struct rwlock **lockp); +#if VM_NRESERVLEVEL > 0 +static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +#endif +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate); + +static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, struct rwlock **lockp); +static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, + u_int flags, vm_page_t m, struct rwlock **lockp); + +static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); +static void free_pv_chunk(struct pv_chunk *pc); +static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); +static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct spglist *free); +static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); + +static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start); +static void pmap_invalidate_all(pmap_t pmap); +static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush); + +/* + * Internal flags for pmap_enter()'s helper functions. + */ +#define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ +#define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ + +#define UNIMPLEMENTED() panic("%s not implemented", __func__) +#define UNTESTED() panic("%s not yet tested", __func__) + + + +/* Number of supported PID bits */ +static unsigned int isa3_pid_bits; + +/* PID to start allocating from */ +static unsigned int isa3_base_pid; + +#define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4) +#define PROCTAB_ENTRIES (1ul << isa3_pid_bits) + + +/* + * Map of physical memory regions. + */ +static struct mem_region *regions, *pregions; +static u_int phys_avail_count; +static int regions_sz, pregions_sz; +static struct pate *isa3_parttab; +static struct prte *isa3_proctab; +static vmem_t *asid_arena; + +extern void bs_remap_earlyboot(void); + +#define RADIX_PGD_SIZE_SHIFT 16 +#define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT) + +#define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3) +#define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t)) +#define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t)) + +#define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */ +#define NUPDPE (NUPML1E * NL2EPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */ + +/* POWER9 only permits a 64k partition table size. 
*/ +#define PARTTAB_SIZE_SHIFT 16 +#define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT) + +#define PARTTAB_HR (1UL << 63) /* host uses radix */ +#define PARTTAB_GR (1UL << 63) /* guest uses radix must match host */ + +/* TLB flush actions. Used as argument to tlbiel_all() */ +enum { + TLB_INVAL_SCOPE_LPID = 0, /* invalidate TLBs for current LPID */ + TLB_INVAL_SCOPE_GLOBAL = 1, /* invalidate all TLBs */ +}; + +#define NPV_LIST_LOCKS MAXCPU +static int pmap_initialized; +static vm_paddr_t proctab0pa; +static vm_paddr_t parttab_phys; +CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); + +/* + * Data for the pv entry allocation mechanism. + * Updates to pv_invl_gen are protected by the pv_list_locks[] + * elements, but reads are not. + */ +static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); +static struct mtx __exclusive_cache_line pv_chunks_mutex; +static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; +static struct md_page *pv_table; +static struct md_page pv_dummy; + +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +#define pa_index(pa) ((pa) >> PDRSHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) + +#define PHYS_TO_PV_LIST_LOCK(pa) \ + (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) + +#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ + struct rwlock **_lockp = (lockp); \ + struct rwlock *_new_lock; \ + \ + _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ + if (_new_lock != *_lockp) { \ + if (*_lockp != NULL) \ + rw_wunlock(*_lockp); \ + *_lockp = _new_lock; \ + rw_wlock(*_lockp); \ + } \ +} while (0) + +#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) + +#define RELEASE_PV_LIST_LOCK(lockp) do { \ + struct rwlock **_lockp = (lockp); \ + \ + if (*_lockp != NULL) { \ + rw_wunlock(*_lockp); \ + *_lockp = NULL; \ + } \ +} while (0) + +#define VM_PAGE_TO_PV_LIST_LOCK(m) \ + PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) + +/* + * We support 52 bits, hence: + * bits 52 - 31 = 21, 0b10101 + * RTS encoding details + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long + */ +#define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) + + +static int powernv_enabled = 1; + +static inline void +tlbiel_radix_set_isa300(uint32_t set, uint32_t is, + uint32_t pid, uint32_t ric, uint32_t prs) +{ + uint64_t rb; + uint64_t rs; + + rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); + rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); + + __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) + : "memory"); +} + +static void +tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) +{ + uint32_t set; + + __asm __volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and the entire Page Walk Cache + * and partition table entries. Then flush the remaining sets of the + * TLB. + */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + + /* Do the same for process scoped entries. 
*/ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + + __asm __volatile("ptesync": : :"memory"); +} + +static void +mmu_radix_tlbiel_flush(int scope) +{ + int is; + + MPASS(scope == TLB_INVAL_SCOPE_LPID || + scope == TLB_INVAL_SCOPE_GLOBAL); + is = scope + 2; + + tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is); + __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static void +mmu_radix_init_amor(void) +{ + /* + * In HV mode, we init AMOR (Authority Mask Override Register) so that + * the hypervisor and guest can setup IAMR (Instruction Authority Mask + * Register), enable key 0 and set it to 1. + * + * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) + */ + mtspr(SPR_AMOR, (3ul << 62)); +} + +static void +mmu_radix_init_iamr(void) +{ + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPR_IAMR, (1ul << 62)); +} + +static void +mmu_radix_pid_set(pmap_t pmap) +{ + + mtspr(SPR_PID, pmap->pm_pid); + isync(); +} + +/* Quick sort callout for comparing physical addresses. */ +static int +pa_cmp(const void *a, const void *b) +{ + const vm_paddr_t *pa = a, *pb = b; + + if (*pa < *pb) + return (-1); + else if (*pa > *pb) + return (1); + else + return (0); +} + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \ + *(u_long *)(ptep) = (u_long)((pte) | PG_V | RPTE_LEAF); \ +} while (0) +/* + * NB: should only be used for adding directories - not for direct mappings + */ +#define pde_store(ptep, pa) do { \ + *(u_long *)(ptep) = (u_long)(pa|RPTE_VALID|RPTE_SHIFT); \ +} while (0) + +#define pte_clear(ptep) do { \ + *(u_long *)(ptep) = (u_long)(0); \ +} while (0) + +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \ + PG_M | PG_A | RPTE_EAA_MASK | PG_V) + + +static void +pmap_epoch_init(void *arg __unused) +{ + pmap_epoch = epoch_alloc(EPOCH_PREEMPT | EPOCH_LOCKED); +} +SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_ANY, pmap_epoch_init, NULL); + +static bool +pmap_not_in_di(void) +{ + + return (curthread->td_md.md_invl_gen.gen == 0); +} + +#define PMAP_ASSERT_NOT_IN_DI() \ + KASSERT(pmap_not_in_di(), ("DI already started")) + +static void +pmap_delayed_invl_started(epoch_tracker_t et) +{ + epoch_enter_preempt(pmap_epoch, et); + curthread->td_md.md_invl_gen.gen = 1; +} + +static void +pmap_delayed_invl_finished(epoch_tracker_t et) +{ + curthread->td_md.md_invl_gen.gen = 0; + epoch_exit_preempt(pmap_epoch, et); +} + +static void +pmap_delayed_invl_wait(vm_page_t m __unused) +{ + epoch_wait_preempt(pmap_epoch); +} + +static __inline void +pmap_resident_count_inc(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pmap->pm_stats.resident_count += count; +} + +static __inline void +pmap_resident_count_dec(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(pmap->pm_stats.resident_count >= count, + ("pmap %p resident count underflow %ld %d", pmap, + pmap->pm_stats.resident_count, count)); + pmap->pm_stats.resident_count -= count; +} + +static void 
+pagezero(vm_offset_t va) +{ + va = trunc_page(va); + int off; + + for (off = 0; off < PAGE_SIZE; off += cacheline_size) + __asm __volatile("dcbz 0,%0" :: "r"(va + off)); +} + +static uint64_t +allocpages(int n) +{ + u_int64_t ret; + + ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE); + for (int i = 0; i < n; i++) + pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE)); + return (ret); +} + +static pt_entry_t * +kvtopte(vm_offset_t va) +{ + pt_entry_t *l3e; + + l3e = pmap_pml3e(kernel_pmap, va); + if ((*l3e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l3e_to_pte(l3e, va)); +} + +void +mmu_radix_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) +{ + pt_entry_t *pte; + + pte = kvtopte(va); + MPASS(pte != NULL); + *pte = pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | RPTE_EAA_W | \ + RPTE_EAA_P | PG_M | PG_A; +} + +boolean_t +mmu_radix_ps_enabled(mmu_t mmu, pmap_t pmap) +{ + return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); +} + +static pt_entry_t * +pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + + va &= PG_PS_FRAME; + l3e = pmap_pml3e(pmap, va); + if (l3e == NULL || (*l3e & PG_V) == 0) + return (NULL); + + if (*l3e & RPTE_LEAF) { + *is_l3e = 1; + return (l3e); + } + *is_l3e = 0; + va &= PG_FRAME; + pte = pmap_l3e_to_pte(l3e, va); + if (pte == NULL || (*pte & PG_V) == 0) + return (NULL); + return (pte); +} + +int +pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags) +{ + pt_entry_t *pte; + pt_entry_t startpte, origpte, newpte; + vm_page_t m; + int is_l3e; + + startpte = 0; + retry: + if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL) + return (KERN_INVALID_ADDRESS); + origpte = newpte = *pte; + if (startpte == 0) { + startpte = origpte; + if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) || + ((flags & VM_PROT_READ) && (startpte & PG_A))) { + pmap_invalidate_all(pmap); +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n", + __func__, pmap, va, flags, origpte); +#endif + return (KERN_FAILURE); + } + } +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va, + flags, origpte); +#endif + PMAP_LOCK(pmap); + if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL || + *pte != origpte) { + PMAP_UNLOCK(pmap); + return (KERN_FAILURE); + } + m = PHYS_TO_VM_PAGE(newpte & PG_FRAME); + MPASS(m != NULL); + switch (flags) { + case VM_PROT_READ: + if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0) + goto protfail; + newpte |= PG_A; + vm_page_aflag_set(m, PGA_REFERENCED); + break; + case VM_PROT_WRITE: + if ((newpte & RPTE_EAA_W) == 0) + goto protfail; + if (is_l3e) + goto protfail; + newpte |= PG_M; + vm_page_dirty(m); + break; + case VM_PROT_EXECUTE: + if ((newpte & RPTE_EAA_X) == 0) + goto protfail; + newpte |= PG_A; + vm_page_aflag_set(m, PGA_REFERENCED); + break; + } + + if (!atomic_cmpset_long(pte, origpte, newpte)) + goto retry; + ptesync(); + PMAP_UNLOCK(pmap); + if (startpte == newpte) + return (KERN_FAILURE); + return (0); + protfail: + PMAP_UNLOCK(pmap); + return (KERN_PROTECTION_FAILURE); +} + +/* + * Returns TRUE if the given page is mapped individually or as part of + * a 2mpage. Otherwise, returns FALSE. 
+ */ +boolean_t +mmu_radix_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + struct rwlock *lock; + boolean_t rv; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (FALSE); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + rv = !TAILQ_EMPTY(&m->md.pv_list) || + ((m->flags & PG_FICTITIOUS) == 0 && + !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); + rw_runlock(lock); + return (rv); +} + +/* + * Determine the appropriate bits to set in a PTE or PDE for a specified + * caching mode. + */ +static int +pmap_cache_bits(vm_memattr_t ma) +{ + if (ma != VM_MEMATTR_DEFAULT) { + switch (ma) { + case VM_MEMATTR_UNCACHEABLE: + return (RPTE_ATTR_GUARDEDIO); + case VM_MEMATTR_CACHEABLE: + return (RPTE_ATTR_MEM); + case VM_MEMATTR_WRITE_BACK: + case VM_MEMATTR_PREFETCHABLE: + case VM_MEMATTR_WRITE_COMBINING: + return (RPTE_ATTR_UNGUARDEDIO); + } + } + return (0); +} + +static void +pmap_invalidate_page(pmap_t pmap, vm_offset_t start) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpg_kernel_4k(start); + else + radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); + ttusync(); +} + +static void +pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpg_kernel_2m(start); + else + radix_tlbie_invlpg_user_2m(pmap->pm_pid, start); + ttusync(); +} + +static void +pmap_invalidate_pwc(pmap_t pmap) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpwc_kernel(); + else + radix_tlbie_invlpwc_user(pmap->pm_pid); + ttusync(); +} + +static void +pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end) +{ + if (((start - end) >> PAGE_SHIFT) > 8) { + pmap_invalidate_all(pmap); + return; + } + ptesync(); + if (pmap == kernel_pmap) { + while (start < end) { + radix_tlbie_invlpg_kernel_4k(start); + start += PAGE_SIZE; + } + } else { + while (start < end) { + radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); + start += PAGE_SIZE; + } + } + ttusync(); +} + +static void +pmap_invalidate_all(pmap_t pmap) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_flush_kernel(); + else + radix_tlbie_flush_user(pmap->pm_pid); + ttusync(); +} + +static void +pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e) +{ + + /* + * When the PDE has PG_PROMOTED set, the 2MB page mapping was created + * by a promotion that did not invalidate the 512 4KB page mappings + * that might exist in the TLB. Consequently, at this point, the TLB + * may hold both 4KB and 2MB page mappings for the address range [va, + * va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here. + * In contrast, when PG_PROMOTED is clear, the TLB will not hold any + * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a + * single INVLPG suffices to invalidate the 2MB page mapping from the + * TLB. + */ + ptesync(); + if ((l3e & PG_PROMOTED) != 0) + pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1); + else + pmap_invalidate_page_2m(pmap, va); + + pmap_invalidate_pwc(pmap); +} + +static __inline struct pv_chunk * +pv_to_chunk(pv_entry_t pv) +{ + + return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); +} + +#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) + +#define PC_FREE0 0xfffffffffffffffful +#define PC_FREE1 0xfffffffffffffffful +#define PC_FREE2 0x000000fffffffffful + +static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; + +/* + * Ensure that the number of spare PV entries in the specified pmap meets or + * exceeds the given count, "needed". 
+ * + * The given PV list lock may be released. + */ +static void +reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +{ + struct pch new_tail; + struct pv_chunk *pc; + vm_page_t m; + int avail, free; + bool reclaimed; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); + + /* + * Newly allocated PV chunks must be stored in a private list until + * the required number of PV chunks have been allocated. Otherwise, + * reclaim_pv_chunk() could recycle one of these chunks. In + * contrast, these chunks must be added to the pmap upon allocation. + */ + TAILQ_INIT(&new_tail); +retry: + avail = 0; + TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { + // if ((cpu_feature2 & CPUID2_POPCNT) == 0) + bit_count((bitstr_t *)pc->pc_map, 0, + sizeof(pc->pc_map) * NBBY, &free); +#if 0 + free = popcnt_pc_map_pq(pc->pc_map); +#endif + if (free == 0) + break; + avail += free; + if (avail >= needed) + break; + } + for (reclaimed = false; avail < needed; avail += _NPCPV) { + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + reclaimed = true; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0; + pc->pc_map[1] = PC_FREE1; + pc->pc_map[2] = PC_FREE2; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); + + /* + * The reclaim might have freed a chunk from the current pmap. + * If that chunk contained available entries, we need to + * re-count the number of available entries. + */ + if (reclaimed) + goto retry; + } + if (!TAILQ_EMPTY(&new_tail)) { + mtx_lock(&pv_chunks_mutex); + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); + } +} + +/* + * First find and then remove the pv entry for the specified pmap and virtual + * address from the specified pv list. Returns the pv entry if found and NULL + * otherwise. This operation can be performed on pv lists for either 4KB or + * 2MB page mappings. + */ +static __inline pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { +#ifdef INVARIANTS + if (PV_PMAP(pv) == NULL) { + printf("corrupted pv_chunk/pv %p\n", pv); + printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":"); + } + MPASS(PV_PMAP(pv) != NULL); + MPASS(pv->pv_va != 0); +#endif + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + break; + } + } + return (pv); +} + +/* + * After demotion from a 2MB page mapping to 512 4KB page mappings, + * destroy the pv entry for the 2MB page mapping and reinstantiate the pv + * entries for each of the 4KB page mappings. + */ +static void +pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + struct pv_chunk *pc; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + int bit, field; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((pa & L3_PAGE_MASK) == 0, + ("pmap_pv_demote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the 2mpage's pv entry for this mapping to the first + * page's pv list. 
Once this transfer begins, the pv list lock + * must not be released until the last pv entry is reinstantiated. + */ + pvh = pa_to_pvh(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + + m->md.pv_gen++; + /* Instantiate the remaining NPTEPG - 1 pv entries. */ + PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); + va_last = va + L3_PAGE_SIZE - PAGE_SIZE; + for (;;) { + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || + pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); + for (field = 0; field < _NPCM; field++) { + while (pc->pc_map[field]) { + bit = cnttzd(pc->pc_map[field]); + pc->pc_map[field] &= ~(1ul << bit); + pv = &pc->pc_pventry[field * 64 + bit]; + va += PAGE_SIZE; + pv->pv_va = va; + m++; + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + + m->md.pv_gen++; + if (va == va_last) + goto out; + } + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } +out: + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); +} + +static void +reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di, + epoch_tracker_t et) +{ + + if (pmap == NULL) + return; + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + if (start_di) + pmap_delayed_invl_finished(et); +} + +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. + * + * Returns NULL if PV entries were reclaimed from the specified pmap. + * + * We do not, however, unmap 2mpages because subsequent accesses will + * allocate per-page pv entries until repromotion occurs, thereby + * exacerbating the shortage of free pv entries. + */ +static int active_reclaims = 0; +static vm_page_t +reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) +{ + struct pv_chunk *pc, *pc_marker, *pc_marker_end; + struct pv_chunk_header pc_marker_b, pc_marker_end_b; + struct md_page *pvh; + pml3_entry_t *l3e; + pmap_t next_pmap, pmap; + pt_entry_t *pte, tpte; + pv_entry_t pv; + vm_offset_t va; + vm_page_t m, m_pc; + struct spglist free; + uint64_t inuse; + int bit, field, freed; + bool start_di; + struct epoch_tracker et; + + PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); + pmap = NULL; + m_pc = NULL; + SLIST_INIT(&free); + bzero(&pc_marker_b, sizeof(pc_marker_b)); + bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); + pc_marker = (struct pv_chunk *)&pc_marker_b; + pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; + + /* + * A delayed invalidation block should already be active if + * pmap_advise() or pmap_remove() called this function by way + * of pmap_demote_l3e_locked(). 
+ */ + start_di = pmap_not_in_di(); + + mtx_lock(&pv_chunks_mutex); + active_reclaims++; + TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); + TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); + while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && + SLIST_EMPTY(&free)) { + next_pmap = pc->pc_pmap; + if (next_pmap == NULL) { + /* + * The next chunk is a marker. However, it is + * not our marker, so active_reclaims must be + * > 1. Consequently, the next_chunk code + * will not rotate the pv_chunks list. + */ + goto next_chunk; + } + mtx_unlock(&pv_chunks_mutex); + + /* + * A pv_chunk can only be removed from the pc_lru list + * when both pc_chunks_mutex is owned and the + * corresponding pmap is locked. + */ + if (pmap != next_pmap) { + reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, + start_di, &et); + pmap = next_pmap; + /* Avoid deadlock and lock recursion. */ + if (pmap > locked_pmap) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_LOCK(pmap); + if (start_di) + pmap_delayed_invl_started(&et); + mtx_lock(&pv_chunks_mutex); + continue; + } else if (pmap != locked_pmap) { + if (PMAP_TRYLOCK(pmap)) { + if (start_di) + pmap_delayed_invl_started(&et); + mtx_lock(&pv_chunks_mutex); + continue; + } else { + pmap = NULL; /* pmap is not locked */ + mtx_lock(&pv_chunks_mutex); + pc = TAILQ_NEXT(pc_marker, pc_lru); + if (pc == NULL || + pc->pc_pmap != next_pmap) + continue; + goto next_chunk; + } + } else if (start_di) + pmap_delayed_invl_started(&et); + } + + /* + * Destroy every non-wired, 4 KB page mapping in the chunk. + */ + freed = 0; + for (field = 0; field < _NPCM; field++) { + for (inuse = ~pc->pc_map[field] & pc_freemask[field]; + inuse != 0; inuse &= ~(1UL << bit)) { + bit = cnttzd(inuse); + pv = &pc->pc_pventry[field * 64 + bit]; + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + if ((*l3e & RPTE_LEAF) != 0) + continue; + pte = pmap_l3e_to_pte(l3e, va); + if ((*pte & PG_W) != 0) + continue; + tpte = pte_load_clear(pte); + m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if ((tpte & PG_A) != 0) + vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + + m->md.pv_gen++; + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) { + vm_page_aflag_clear(m, + PGA_WRITEABLE); + } + } + pc->pc_map[field] |= 1UL << bit; + pmap_unuse_pt(pmap, va, *l3e, &free); + freed++; + } + } + if (freed == 0) { + mtx_lock(&pv_chunks_mutex); + goto next_chunk; + } + /* Every freed mapping is for a 4 KB page. */ + pmap_resident_count_dec(pmap, freed); + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && + pc->pc_map[2] == PC_FREE2) { + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* Entire chunk is free; return it. */ + m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + mtx_lock(&pv_chunks_mutex); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + break; + } + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + mtx_lock(&pv_chunks_mutex); + /* One freed pv entry in locked_pmap is sufficient. 
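reclaim_pv_chunk() above avoids deadlock by only ever blocking on a pmap lock that sorts after the one it already holds; for a lower-sorting pmap it settles for a trylock and moves on. A condensed sketch of that ordering rule (not part of the patch; helper name invented, and the pv-list lock release and marker handling are omitted):

	static bool
	lock_victim_pmap(pmap_t victim, pmap_t locked)
	{
		if (victim == locked)
			return (true);			/* caller already holds it */
		if (victim > locked) {
			PMAP_LOCK(victim);		/* higher address: safe to sleep */
			return (true);
		}
		return (PMAP_TRYLOCK(victim) != 0);	/* lower address: never block */
	}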
*/ + if (pmap == locked_pmap) + break; +next_chunk: + TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); + TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); + if (active_reclaims == 1 && pmap != NULL) { + /* + * Rotate the pv chunks list so that we do not + * scan the same pv chunks that could not be + * freed (because they contained a wired + * and/or superpage mapping) on every + * invocation of reclaim_pv_chunk(). + */ + while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { + MPASS(pc->pc_pmap != NULL); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + } + } + } + TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); + TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); + active_reclaims--; + mtx_unlock(&pv_chunks_mutex); + reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di, &et); + if (m_pc == NULL && !SLIST_EMPTY(&free)) { + m_pc = SLIST_FIRST(&free); + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + /* Recycle a freed page table page. */ + m_pc->wire_count = 1; + } + vm_page_free_pages_toq(&free, true); + return (m_pc); +} + +/* + * free the pv_entry back to the free list + */ +static void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + struct pv_chunk *pc; + int idx, field, bit; + +#ifdef VERBOSE_PV + if (pmap != kernel_pmap) + printf("%s(%p, %p)\n", __func__, pmap, pv); +#endif + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_frees, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, 1)); + PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 64; + bit = idx % 64; + pc->pc_map[field] |= 1ul << bit; + if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || + pc->pc_map[2] != PC_FREE2) { + /* 98% of the time, pc is already at the head of the list. */ + if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + } + return; + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); +} + +static void +free_pv_chunk(struct pv_chunk *pc) +{ + vm_page_t m; + + mtx_lock(&pv_chunks_mutex); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* entire chunk is free, return it */ + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + vm_page_unwire(m, PQ_NONE); + vm_page_free(m); +} + +/* + * Returns a new PV entry, allocating a new PV chunk from the system when + * needed. If this PV chunk allocation fails and a PV list lock pointer was + * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is + * returned. + * + * The given PV list lock may be released. 
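A set bit in pc_map means the corresponding pv entry is free, so the bookkeeping in free_pv_entry() above and get_pv_entry() below reduces to a find-first-set and an index split. A minimal sketch of both directions (not part of the patch; helper names invented, with __builtin_ctzll standing in for the cnttzd used in the patch):

	static inline int
	pv_bitmap_alloc(uint64_t pc_map[3])
	{
		for (int field = 0; field < 3; field++) {
			if (pc_map[field] != 0) {
				int bit = __builtin_ctzll(pc_map[field]);

				pc_map[field] &= ~(1ULL << bit);	/* now in use */
				return (field * 64 + bit);		/* index into pc_pventry[] */
			}
		}
		return (-1);					/* chunk exhausted */
	}

	static inline void
	pv_bitmap_free(uint64_t pc_map[3], int idx)
	{
		pc_map[idx / 64] |= 1ULL << (idx % 64);		/* free again */
	}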
+ */ +static pv_entry_t +get_pv_entry(pmap_t pmap, struct rwlock **lockp) +{ + int bit, field; + pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); +retry: + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = cnttzd(pc->pc_map[field]); + break; + } + } + if (field < _NPCM) { + pv = &pc->pc_pventry[field * 64 + bit]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && + pc->pc_map[2] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, + pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); + MPASS(PV_PMAP(pv) != NULL); + return (pv); + } + } + /* No free items, allocate another chunk */ + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + if (lockp == NULL) { + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ + pc->pc_map[1] = PC_FREE1; + pc->pc_map[2] = PC_FREE2; + mtx_lock(&pv_chunks_mutex); + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); + MPASS(PV_PMAP(pv) != NULL); + return (pv); +} + +#if VM_NRESERVLEVEL > 0 +/* + * After promotion from 512 4KB page mappings to a single 2MB page mapping, + * replace the many pv entries for the 4KB page mappings by a single pv entry + * for the 2MB page mapping. + */ +static void +pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + KASSERT((pa & L3_PAGE_MASK) == 0, + ("pmap_pv_promote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the first page's pv entry for this mapping to the 2mpage's + * pv list. Aside from avoiding the cost of a call to get_pv_entry(), + * a transfer avoids the possibility that get_pv_entry() calls + * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the + * mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + /* Free the remaining NPTEPG - 1 pv entries. */ + va_last = va + L3_PAGE_SIZE - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} +#endif /* VM_NRESERVLEVEL > 0 */ + +/* + * First find and then destroy the pv entry for the specified pmap and virtual + * address. This operation can be performed on pv lists for either 4KB or 2MB + * page mappings. 
+ */ +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); + free_pv_entry(pmap, pv); +} + +/* + * Conditionally create the PV entry for a 4KB page mapping if the required + * memory can be allocated without resorting to reclamation. + */ +static boolean_t +pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct rwlock **lockp) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, NULL)) != NULL) { + pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + return (TRUE); + } else + return (FALSE); +} + +vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX]; +#ifdef INVARIANTS +static void +validate_addr(vm_paddr_t addr, vm_size_t size) +{ + vm_paddr_t end = addr + size; + bool found = false; + + for (int i = 0; i < 2 * phys_avail_count; i += 2) { + if (addr >= phys_avail_debug[i] && + end <= phys_avail_debug[i + 1]) { + found = true; + break; + } + } + KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array", + addr, end)); +} +#else +static void validate_addr(vm_paddr_t addr, vm_size_t size) {} +#endif +#define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A) + +static vm_paddr_t +alloc_pt_page(void) +{ + vm_paddr_t page; + + page = allocpages(1); + pagezero(PHYS_TO_DMAP(page)); + return (page); +} + +static void +mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end) +{ + pt_entry_t *pte, pteval; + vm_paddr_t page; + + if (bootverbose) + printf("%s %lx -> %lx\n", __func__, start, end); + while (start < end) { + pteval = start | DMAP_PAGE_BITS; + pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start)); + if ((*pte & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start)); + if ((start & L2_PAGE_MASK) == 0 && + end - start >= L2_PAGE_SIZE) { + start += L2_PAGE_SIZE; + goto done; + } else if ((*pte & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + + pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start)); + if ((start & L3_PAGE_MASK) == 0 && + end - start >= L3_PAGE_SIZE) { + start += L3_PAGE_SIZE; + goto done; + } else if ((*pte & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start)); + start += PAGE_SIZE; + done: + pte_store(pte, pteval); + } +} + +static void +mmu_radix_dmap_populate(vm_size_t hwphyssz) +{ + vm_paddr_t start, end; + + for (int i = 0; i < pregions_sz; i++) { + start = pregions[i].mr_start; + end = start + pregions[i].mr_size; + if (hwphyssz && start >= hwphyssz) + break; + if (hwphyssz && hwphyssz < end) + end = hwphyssz; + mmu_radix_dmap_range(start, end); + } +} + +static void +mmu_radix_setup_pagetables(vm_size_t hwphyssz) +{ + vm_paddr_t ptpages, pages; + pt_entry_t *pte; + vm_paddr_t l1phys; + + bzero(kernel_pmap, sizeof(struct pmap)); + PMAP_LOCK_INIT(kernel_pmap); + + ptpages = allocpages(2); + l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); + validate_addr(l1phys, RADIX_PGD_SIZE); + if (bootverbose) + printf("l1phys=%lx\n", l1phys); + MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); + for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); + kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); + + 
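For reference, the mapping sizes that mmu_radix_dmap_range() above (and the 2MB/1GB sysctl counters further down) rely on, spelled out. The SK_ names are invented and the values are inferred from the 512-entry page-table pages and the sysctl descriptions rather than copied from the headers:

	#define	SK_PAGE_SIZE	4096UL				/* base page */
	#define	SK_NPTEPG	512UL				/* PTEs per table page */
	#define	SK_L3_PAGE_SIZE	(SK_NPTEPG * SK_PAGE_SIZE)	/* 2MB leaf (RPTE_LEAF at L3) */
	#define	SK_L2_PAGE_SIZE	(512UL * SK_L3_PAGE_SIZE)	/* 1GB leaf (RPTE_LEAF at L2) */

This is also why kernel_vm_end advances by nkpt * L3_PAGE_SIZE below: each of the nkpt preallocated page-table pages maps 512 4KB pages, i.e. 2MB of KVA.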
mmu_radix_dmap_populate(hwphyssz); + + /* + * Create page tables for first 128MB of KVA + */ + pages = ptpages; + pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); + *pte = (pages | RPTE_VALID | RPTE_SHIFT); + pages += PAGE_SIZE; + pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); + *pte = (pages | RPTE_VALID | RPTE_SHIFT); + pages += PAGE_SIZE; + pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); + /* + * the kernel page table pages need to be preserved in + * phys_avail and not overlap with previous allocations + */ + pages = allocpages(nkpt); + if (bootverbose) { + printf("phys_avail after dmap populate and nkpt allocation\n"); + for (int j = 0; j < 2 * phys_avail_count; j+=2) + printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", + j, phys_avail[j], j + 1, phys_avail[j + 1]); + } + KPTphys = pages; + for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) + *pte = (pages | RPTE_VALID | RPTE_SHIFT); + kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; + if (bootverbose) + printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); + /* + * Add a physical memory segment (vm_phys_seg) corresponding to the + * preallocated kernel page table pages so that vm_page structures + * representing these pages will be created. The vm_page structures + * are required for promotion of the corresponding kernel virtual + * addresses to superpage mappings. + */ + vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); +} + +static void +mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) +{ + vm_paddr_t kpstart, kpend; + vm_size_t physsz, hwphyssz; + //uint64_t l2virt; + int rm_pavail, proctab_size; + int i, j; + + kpstart = start & ~DMAP_BASE_ADDRESS; + kpend = end & ~DMAP_BASE_ADDRESS; + + /* Get physical memory regions from firmware */ + mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); + CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); + + if (2 * VM_PHYSSEG_MAX < regions_sz) + panic("mmu_radix_early_bootstrap: phys_avail too small"); + + if (bootverbose) + for (int i = 0; i < regions_sz; i++) + printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", + i, regions[i].mr_start, i, regions[i].mr_size); + /* + * XXX workaround a simulator bug + */ + for (int i = 0; i < regions_sz; i++) + if (regions[i].mr_start & PAGE_MASK) { + regions[i].mr_start += PAGE_MASK; + regions[i].mr_start &= ~PAGE_MASK; + regions[i].mr_size &= ~PAGE_MASK; + } + if (bootverbose) + for (int i = 0; i < pregions_sz; i++) + printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", + i, pregions[i].mr_start, i, pregions[i].mr_size); + + phys_avail_count = 0; + physsz = 0; + hwphyssz = 0; + TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); + for (i = 0, j = 0; i < regions_sz; i++) { + if (bootverbose) + printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", + i, regions[i].mr_start, i, regions[i].mr_size); + + if (regions[i].mr_size < PAGE_SIZE) + continue; + + CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", + regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); + if (hwphyssz != 0 && + (physsz + regions[i].mr_size) >= hwphyssz) { + if (physsz < hwphyssz) { + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + + (hwphyssz - physsz); + physsz = hwphyssz; + phys_avail_count++; + } + break; + } + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; + + phys_avail_count++; + physsz += regions[i].mr_size; + j += 2; + } + + /* Check for overlap with the kernel and 
exception vectors */ + rm_pavail = 0; + for (j = 0; j < 2 * phys_avail_count; j+=2) { + if (phys_avail[j] < EXC_LAST) + phys_avail[j] += EXC_LAST; + + if (phys_avail[j] >= kpstart && + phys_avail[j + 1] <= kpend) { + phys_avail[j] = phys_avail[j + 1] = ~0; + rm_pavail++; + continue; + } + + if (kpstart >= phys_avail[j] && + kpstart < phys_avail[j + 1]) { + if (kpend < phys_avail[j + 1]) { + phys_avail[2 * phys_avail_count] = + (kpend & ~PAGE_MASK) + PAGE_SIZE; + phys_avail[2 * phys_avail_count + 1] = + phys_avail[j + 1]; + phys_avail_count++; + } + + phys_avail[j + 1] = kpstart & ~PAGE_MASK; + } + + if (kpend >= phys_avail[j] && + kpend < phys_avail[j + 1]) { + if (kpstart > phys_avail[j]) { + phys_avail[2 * phys_avail_count] = phys_avail[j]; + phys_avail[2 * phys_avail_count + 1] = + kpstart & ~PAGE_MASK; + phys_avail_count++; + } + + phys_avail[j] = (kpend & ~PAGE_MASK) + + PAGE_SIZE; + } + } + qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); + for (i = 0; i < 2 * phys_avail_count; i++) + phys_avail_debug[i] = phys_avail[i]; + + /* Remove physical available regions marked for removal (~0) */ + if (rm_pavail) { + phys_avail_count -= rm_pavail; + for (i = 2 * phys_avail_count; + i < 2*(phys_avail_count + rm_pavail); i+=2) + phys_avail[i] = phys_avail[i + 1] = 0; + } + if (bootverbose) { + printf("phys_avail ranges after filtering:\n"); + for (j = 0; j < 2 * phys_avail_count; j+=2) + printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", + j, phys_avail[j], j + 1, phys_avail[j + 1]); + } + physmem = btoc(physsz); + + /* XXX assume we're running non-virtualized and + * we don't support BHYVE + */ + if (isa3_pid_bits == 0) + isa3_pid_bits = 20; + parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); + validate_addr(parttab_phys, PARTTAB_SIZE); + for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); + + proctab_size = 1UL << PROCTAB_SIZE_SHIFT; + proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); + validate_addr(proctab0pa, proctab_size); + for (int i = 0; i < proctab_size/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); + + mmu_radix_setup_pagetables(hwphyssz); +} + +static void +mmu_radix_late_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t end) +{ + int i; + vm_paddr_t pa; + void *dpcpu; + vm_offset_t va; + + /* + * Set up the Open Firmware pmap and add its mappings if not in real + * mode. + */ + if (bootverbose) + printf("%s enter\n", __func__); + + /* + * Calculate the last available physical address. + */ + Maxmem = 0; + for (i = 0; phys_avail[i + 2] != 0; i += 2) + Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); + + /* + * Set the start and end of kva. + */ + virtual_avail = VM_MIN_KERNEL_ADDRESS; + virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; + + /* + * Remap any early IO mappings (console framebuffer, etc.) + */ + bs_remap_earlyboot(); + + /* + * Allocate a kernel stack with a guard page for thread0 and map it + * into the kernel page map. + */ + pa = allocpages(kstack_pages); + va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; + virtual_avail = va + kstack_pages * PAGE_SIZE; + CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); + thread0.td_kstack = va; + for (i = 0; i < kstack_pages; i++) { + mmu_radix_kenter(mmu, va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + thread0.td_kstack_pages = kstack_pages; + + /* + * Allocate virtual address space for the message buffer. 
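phys_avail[] is a flat array of (start, end) pairs, which is why the filtering loops above step by two. Carving the kernel image out of a region that fully contains it turns one pair into two; a simplified sketch of just that case (not part of the patch; helper name invented, page rounding omitted):

	static void
	carve_out_kernel(vm_paddr_t avail[], int *pair_count, int j,
	    vm_paddr_t kstart, vm_paddr_t kend)
	{
		if (kstart > avail[j] && kend < avail[j + 1]) {
			/* Append the tail that follows the kernel... */
			avail[2 * *pair_count] = kend;
			avail[2 * *pair_count + 1] = avail[j + 1];
			(*pair_count)++;
			/* ...and truncate the original pair to the head. */
			avail[j + 1] = kstart;
		}
	}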
+ */ + pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); + msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); + + /* + * Allocate virtual address space for the dynamic percpu area. + */ + pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); + dpcpu = (void *)PHYS_TO_DMAP(pa); + dpcpu_init(dpcpu, curcpu); + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +} + +static void +mmu_parttab_init(void) +{ + uint64_t ptcr; + + isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); + + if (bootverbose) + printf("%s parttab: %p\n", __func__, isa3_parttab); + ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); + if (bootverbose) + printf("setting ptcr %lx\n", ptcr); + mtspr(SPR_PTCR, ptcr); +} + +static void +mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) +{ + uint64_t prev; + + if (bootverbose) + printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, + lpid, pagetab, proctab); + prev = be64toh(isa3_parttab[lpid].pagetab); + isa3_parttab[lpid].pagetab = htobe64(pagetab); + isa3_parttab[lpid].proctab = htobe64(proctab); + + if (prev & PARTTAB_HR) { + __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + } else { + __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + } + ttusync(); +} + +static void +mmu_radix_parttab_init(void) +{ + uint64_t pagetab; + + mmu_parttab_init(); + pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ + RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; + mmu_parttab_update(0, pagetab, 0); +} + +static void +mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) +{ + uint64_t pagetab, proctab; + + pagetab = be64toh(isa3_parttab[0].pagetab); + proctab = proctabpa | table_size | PARTTAB_GR; + mmu_parttab_update(0, pagetab, proctab); +} + +static void +mmu_radix_proctab_init(void) +{ + + isa3_base_pid = 1; + + isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); + isa3_proctab->proctab0 = + htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | + RADIX_PGD_INDEX_SHIFT); + + mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); + + __asm __volatile("ptesync" : : : "memory"); + __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); + __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); + if (bootverbose) + printf("process table %p and kernel radix PDE: %p\n", + isa3_proctab, kernel_pmap->pm_pml1); + mtmsr(mfmsr() | PSL_DR ); + mtmsr(mfmsr() & ~PSL_DR); + kernel_pmap->pm_pid = isa3_base_pid; + isa3_base_pid++; +} + +void +mmu_radix_advise(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + int advice) +{ + struct rwlock *lock; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t oldl3e, *l3e; + pt_entry_t *pte; + vm_offset_t va, va_next; + vm_page_t m; + boolean_t anychanged; + struct epoch_tracker et; + + if (advice != MADV_DONTNEED && advice != MADV_FREE) + return; + anychanged = FALSE; + pmap_delayed_invl_started(&et); + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + 
L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + l3e = pmap_l2e_to_l3e(l2e, sva); + oldl3e = *l3e; + if ((oldl3e & PG_V) == 0) + continue; + else if ((oldl3e & RPTE_LEAF) != 0) { + if ((oldl3e & PG_MANAGED) == 0) + continue; + lock = NULL; + if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { + if (lock != NULL) + rw_wunlock(lock); + + /* + * The large page mapping was destroyed. + */ + continue; + } + + /* + * Unless the page mappings are wired, remove the + * mapping to a single page so that a subsequent + * access may repromote. Since the underlying page + * table page is fully populated, this removal never + * frees a page table page. + */ + if ((oldl3e & PG_W) == 0) { + pte = pmap_l3e_to_pte(l3e, sva); + KASSERT((*pte & PG_V) != 0, + ("pmap_advise: invalid PTE")); + pmap_remove_pte(pmap, pte, sva, *l3e, NULL, + &lock); + anychanged = TRUE; + } + if (lock != NULL) + rw_wunlock(lock); + } + if (va_next > eva) + va_next = eva; + va = va_next; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; + pte++, sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + + if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) + goto maybe_invlrng; + else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if (advice == MADV_DONTNEED) { + /* + * Future calls to pmap_is_modified() + * can be avoided by making the page + * dirty now. + */ + m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); + vm_page_dirty(m); + } + atomic_clear_long(pte, PG_M | PG_A); + } else if ((*pte & PG_A) != 0) + atomic_clear_long(pte, PG_A); + else + goto maybe_invlrng; + anychanged = TRUE; + continue; +maybe_invlrng: + if (va != va_next) { + anychanged = true; + va = va_next; + } + } + if (va != va_next) + anychanged = true; + } + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + pmap_delayed_invl_finished(&et); +} + +/* + * Routines used in machine-dependent code + */ +static void +mmu_radix_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t end) +{ + uint64_t lpcr; + + if (bootverbose) + printf("%s\n", __func__); + hw_direct_map = 1; + mmu_radix_early_bootstrap(start, end); + if (bootverbose) + printf("early bootstrap complete\n"); + if (powernv_enabled) { + lpcr = mfspr(SPR_LPCR); + mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + mmu_radix_parttab_init(); + mmu_radix_init_amor(); + if (bootverbose) + printf("powernv init complete\n"); + } + mmu_radix_init_iamr(); + mmu_radix_proctab_init(); + mmu_radix_pid_set(kernel_pmap); + /* XXX assume CPU_FTR_HVMODE */ + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); + + mmu_radix_late_bootstrap(mmu, start, end); + if (bootverbose) + printf("%s done\n", __func__); + pmap_bootstrapped = 1; + dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); +} + +static void +mmu_radix_cpu_bootstrap(mmu_t mmu, int ap) +{ + uint64_t lpcr; + uint64_t ptcr; + + if (powernv_enabled) { + lpcr = mfspr(SPR_LPCR); + mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + + ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); + mtspr(SPR_PTCR, ptcr); + mmu_radix_init_amor(); + } + mmu_radix_init_iamr(); + mmu_radix_pid_set(kernel_pmap); + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); +} + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, + "2MB page mapping counters"); + +static u_long pmap_l3e_demotions; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l3e_demotions, 0, "2MB page demotions"); + +static u_long pmap_l3e_mappings; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_l3e_mappings, 0, "2MB page mappings"); + +static u_long 
pmap_l3e_p_failures; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_l3e_p_failures, 0, "2MB page promotion failures"); + +static u_long pmap_l3e_promotions; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_l3e_promotions, 0, "2MB page promotions"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, + "1GB page mapping counters"); + +static u_long pmap_l2e_demotions; +SYSCTL_ULONG(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l2e_demotions, 0, "1GB page demotions"); + +void +mmu_radix_clear_modify(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pmap_t pmap; + pv_entry_t next_pv, pv; + pml3_entry_t oldl3e, *l3e; + pt_entry_t oldpte, *pte; + struct rwlock *lock; + vm_offset_t va; + int md_gen, pvh_gen; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_clear_modify: page %p is not managed", m)); + VM_OBJECT_ASSERT_WLOCKED(m->object); + KASSERT(!vm_page_xbusied(m), + ("pmap_clear_modify: page %p is exclusive busied", m)); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + + /* + * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. + * If the object containing the page is locked and the page is not + * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. + */ + if ((m->aflags & PGA_WRITEABLE) == 0) + return; + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_wlock(lock); +restart: + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + oldl3e = *l3e; + if ((oldl3e & PG_RW) != 0) { + if (pmap_demote_l3e_locked(pmap, l3e, va, &lock)) { + if ((oldl3e & PG_W) == 0) { + /* + * Write protect the mapping to a + * single page so that a subsequent + * write access may repromote. 
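Throughout this file a mapping only counts as having dirtied its page when both PG_M and PG_RW are set, since the modified bit is not meaningful on a read-only PTE. A one-line helper restating the test used above in reclaim_pv_chunk() and mmu_radix_clear_modify() (sketch only, name invented):

	static inline bool
	pte_was_dirtied(pt_entry_t pte)
	{
		/* Only writeable mappings carry a meaningful modified bit. */
		return ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW));
	}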
+ */ + va += VM_PAGE_TO_PHYS(m) - (oldl3e & + PG_PS_FRAME); + pte = pmap_l3e_to_pte(l3e, va); + oldpte = *pte; + if ((oldpte & PG_V) != 0) { + while (!atomic_cmpset_long(pte, + oldpte, + (oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))) + oldpte = *pte; + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); + } + } + } + } + PMAP_UNLOCK(pmap); + } + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_clear_modify: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + atomic_clear_long(pte, PG_M); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + rw_wunlock(lock); +} + +void +mmu_radix_copy(mmu_t mmu, pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, + vm_size_t len, vm_offset_t src_addr) +{ + struct rwlock *lock; + struct spglist free; + vm_offset_t addr; + vm_offset_t end_addr = src_addr + len; + vm_offset_t va_next; + vm_page_t dst_pdpg, dstmpte, srcmpte; + bool invalidate_all; + + CTR6(KTR_PMAP, "%s(%p, %p, %#x, %#x, %#x)", __func__, dst_pmap, + src_pmap, dst_addr, len, src_addr); + + if (dst_addr != src_addr) + return; +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", + __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); +#endif + lock = NULL; + invalidate_all = false; + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + + for (addr = src_addr; addr < end_addr; addr = va_next) { + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t srcptepaddr, *l3e; + pt_entry_t *src_pte, *dst_pte; + + l1e = pmap_pml1e(src_pmap, addr); + if ((*l1e & PG_V) == 0) { + va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, addr); + if ((*l2e & PG_V) == 0) { + va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + continue; + } + + va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + + l3e = pmap_l2e_to_l3e(l2e, addr); + srcptepaddr = *l3e; + if (srcptepaddr == 0) + continue; + + if (srcptepaddr & RPTE_LEAF) { + if ((addr & L3_PAGE_MASK) != 0 || + addr + L3_PAGE_SIZE > end_addr) + continue; + dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); + if (dst_pdpg == NULL) + break; + l3e = (pml3_entry_t *) + PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); + l3e = &l3e[pmap_pml3e_index(addr)]; + if (*l3e == 0 && ((srcptepaddr & PG_MANAGED) == 0 || + pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, + PMAP_ENTER_NORECLAIM, &lock))) { + *l3e = srcptepaddr & ~PG_W; + pmap_resident_count_inc(dst_pmap, + L3_PAGE_SIZE / PAGE_SIZE); + atomic_add_long(&pmap_l3e_mappings, 1); + } else + dst_pdpg->wire_count--; + continue; + } + + srcptepaddr &= PG_FRAME; + srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); + KASSERT(srcmpte->wire_count > 0, + ("pmap_copy: source page table page is unused")); + + if (va_next > end_addr) + va_next = end_addr; + + src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); + src_pte = &src_pte[pmap_pte_index(addr)]; + dstmpte = NULL; + while (addr < 
va_next) { + pt_entry_t ptetemp; + ptetemp = *src_pte; + /* + * we only virtual copy managed pages + */ + if ((ptetemp & PG_MANAGED) != 0) { + if (dstmpte != NULL && + dstmpte->pindex == pmap_l3e_pindex(addr)) + dstmpte->wire_count++; + else if ((dstmpte = pmap_allocpte(dst_pmap, + addr, NULL)) == NULL) + goto out; + dst_pte = (pt_entry_t *) + PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); + dst_pte = &dst_pte[pmap_pte_index(addr)]; + if (*dst_pte == 0 && + pmap_try_insert_pv_entry(dst_pmap, addr, + PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), + &lock)) { + /* + * Clear the wired, modified, and + * accessed (referenced) bits + * during the copy. + */ + *dst_pte = ptetemp & ~(PG_W | PG_M | + PG_A); + pmap_resident_count_inc(dst_pmap, 1); + } else { + SLIST_INIT(&free); + if (pmap_unwire_ptp(dst_pmap, addr, + dstmpte, &free)) { + /* + * Although "addr" is not + * mapped, paging-structure + * caches could nonetheless + * have entries that refer to + * the freed page table pages. + * Invalidate those entries. + */ + invalidate_all = true; + vm_page_free_pages_toq(&free, + true); + } + goto out; + } + if (dstmpte->wire_count >= srcmpte->wire_count) + break; + } + addr += PAGE_SIZE; + if (__predict_false((addr & L3_PAGE_MASK) == 0)) + src_pte = pmap_pte(src_pmap, addr); + else + src_pte++; + } + } +out: + if (invalidate_all) + pmap_invalidate_all(dst_pmap); + if (lock != NULL) + rw_wunlock(lock); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); +} + +static void +mmu_radix_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) +{ + vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); + vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); + + CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); + /* + * XXX slow + */ + bcopy((void *)src, (void *)dst, PAGE_SIZE); +} + +static void +mmu_radix_copy_pages(mmu_t mmu, vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + + CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, + a_offset, mb, b_offset, xfersize); + UNIMPLEMENTED(); +} + +#if VM_NRESERVLEVEL > 0 +/* + * Tries to promote the 512, contiguous 4KB page mappings that are within a + * single page table page (PTP) to a single 2MB page mapping. For promotion + * to occur, two conditions must be met: (1) the 4KB page mappings must map + * aligned, contiguous physical memory and (2) the 4KB page mappings must have + * identical characteristics. + */ +static int +pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, + struct rwlock **lockp) +{ + pml3_entry_t newpde; + pt_entry_t *firstpte, oldpte, pa, *pte; + vm_page_t mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Examine the first PTE in the specified PTP. Abort if this PTE is + * either invalid, unused, or does not map the first 4KB physical page + * within a 2MB page. + */ + firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); +setpde: + newpde = *firstpte; + if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + if ((newpde & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared without + * a TLB invalidation. + */ + if (!atomic_cmpset_long(firstpte, newpde, (newpde | RPTE_EAA_R) & ~RPTE_EAA_W)) + goto setpde; + newpde &= ~RPTE_EAA_W; + } + + /* + * Examine each of the other PTEs in the specified PTP. Abort if this + * PTE maps an unexpected 4KB physical page or does not have identical + * characteristics to the first PTE. 
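The scan that follows walks the 512 PTEs from the last one down and insists that each maps the physical page exactly one page below its successor, so that together they cover a single naturally aligned 2MB physical run. A condensed sketch of just that contiguity check (not part of the patch; the real loop also compares the PG_A/PG_V and PG_PTE_PROMOTE attribute bits and may clear PG_RW along the way):

	static bool
	l3e_ptes_contiguous(pt_entry_t *firstpte)
	{
		vm_paddr_t expect;
		pt_entry_t *pte;

		expect = (*firstpte & PG_FRAME) + L3_PAGE_SIZE - PAGE_SIZE;
		for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
			if ((*pte & PG_FRAME) != expect)
				return (false);		/* not one 2MB run */
			expect -= PAGE_SIZE;
		}
		return (true);
	}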
+ */ + pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; + for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { +setpte: + oldpte = *pte; + if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + if ((oldpte & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared + * without a TLB invalidation. + */ + if (!atomic_cmpset_long(pte, oldpte, (oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)) + goto setpte; + oldpte &= ~RPTE_EAA_W; + CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" + " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | + (va & ~L3_PAGE_MASK), pmap); + } + if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + pa -= PAGE_SIZE; + } + + /* + * Save the page table page in its current state until the PDE + * mapping the superpage is demoted by pmap_demote_pde() or + * destroyed by pmap_remove_pde(). + */ + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_promote_l3e: page table page is out of range")); + KASSERT(mpte->pindex == pmap_l3e_pindex(va), + ("pmap_promote_l3e: page table page's pindex is wrong")); + if (pmap_insert_pt_page(pmap, mpte)) { + CTR2(KTR_PMAP, + "pmap_promote_l3e: failure for va %#lx in pmap %p", va, + pmap); + goto fail; + } + + /* + * Promote the pv entries. + */ + if ((newpde & PG_MANAGED) != 0) + pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); + + pte_store(pde, PG_PROMOTED | newpde); + atomic_add_long(&pmap_l3e_promotions, 1); + CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" + " in pmap %p", va, pmap); + return (0); + fail: + atomic_add_long(&pmap_l3e_p_failures, 1); + return (KERN_FAILURE); +} +#endif /* VM_NRESERVLEVEL > 0 */ + +int +mmu_radix_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct rwlock *lock; + pml3_entry_t *l3e; + pt_entry_t *pte; + pt_entry_t newpte, origpte; + pv_entry_t pv; + vm_paddr_t opa, pa; + vm_page_t mpte, om; + int rv, retrycount; + boolean_t nosleep, invalidate_all, invalidate_page; + + va = trunc_page(va); + retrycount = 0; + invalidate_page = invalidate_all = false; + CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, + m, prot, flags, psind); + KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || + va >= kmi.clean_eva, + ("pmap_enter: managed mapping within the clean submap")); + if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) + VM_OBJECT_ASSERT_LOCKED(m->object); + KASSERT((flags & PMAP_ENTER_RESERVED) == 0, + ("pmap_enter: flags %u has reserved bits set", flags)); + pa = VM_PAGE_TO_PHYS(m); + newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); + if ((flags & VM_PROT_WRITE) != 0) + newpte |= PG_M; + if ((flags & VM_PROT_READ) != 0) + newpte |= PG_A; + if (prot & VM_PROT_READ) + newpte |= RPTE_EAA_R; + if ((prot & VM_PROT_WRITE) != 0) + newpte |= RPTE_EAA_W; + KASSERT((newpte & (PG_M | PG_RW)) != PG_M, + ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); + + if (prot & VM_PROT_EXECUTE) + newpte |= PG_X; + if ((flags & PMAP_ENTER_WIRED) != 0) + newpte |= PG_W; + if (va >= DMAP_MIN_ADDRESS) + newpte |= RPTE_EAA_P; + newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); + /* + * Set modified bit 
gratuitously for writeable mappings if + * the page is unmanaged. We do not want to take a fault + * to do the dirty bit accounting for these mappings. + */ + if ((m->oflags & VPO_UNMANAGED) != 0) { + if ((newpte & PG_RW) != 0) + newpte |= PG_M; + } else + newpte |= PG_MANAGED; + + lock = NULL; + PMAP_LOCK(pmap); + if (psind == 1) { + /* Assert the required virtual and physical alignment. */ + KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); + KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); + rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); + goto out; + } + mpte = NULL; + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ +retry: + l3e = pmap_pml3e(pmap, va); + if (l3e != NULL && (*l3e & PG_V) != 0 && ((*l3e & RPTE_LEAF) == 0 || + pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { + pte = pmap_l3e_to_pte(l3e, va); + if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { + mpte = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); + mpte->wire_count++; + } + } else if (va < VM_MAXUSER_ADDRESS) { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. + */ + nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; + mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), + nosleep ? NULL : &lock); + if (mpte == NULL && nosleep) { + rv = KERN_RESOURCE_SHORTAGE; + goto out; + } + if (__predict_false(retrycount++ == 6)) + panic("too many retries"); + invalidate_all = true; + goto retry; + } else + panic("pmap_enter: invalid page directory va=%#lx", va); + + origpte = *pte; + pv = NULL; + + /* + * Is the specified virtual address already mapped? + */ + if ((origpte & PG_V) != 0) { +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) { + printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" + " asid=%lu curpid=%d name=%s origpte0x%lx\n", + pmap, va, m, prot, flags, psind, pmap->pm_pid, + curproc->p_pid, curproc->p_comm, origpte); + pmap_pte_walk(pmap->pm_pml1, va); + } +#endif + /* + * Wiring change, just update stats. We don't worry about + * wiring PT pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is wired, + * the PT page will be also. + */ + if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) + pmap->pm_stats.wired_count++; + else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) + pmap->pm_stats.wired_count--; + + /* + * Remove the extra PT page reference. + */ + if (mpte != NULL) { + mpte->wire_count--; + KASSERT(mpte->wire_count > 0, + ("pmap_enter: missing reference to page table page," + " va: 0x%lx", va)); + } + + /* + * Has the physical page changed? + */ + opa = origpte & PG_FRAME; + if (opa == pa) { + /* + * No, might be a protection or wiring change. + */ + if ((origpte & PG_MANAGED) != 0 && + (newpte & PG_RW) != 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { + if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { + if (!atomic_cmpset_long(pte, origpte, newpte)) + goto retry; + if ((newpte & PG_M) != (origpte & PG_M)) + vm_page_dirty(m); + if ((newpte & PG_A) != (origpte & PG_A)) + vm_page_aflag_set(m, PGA_REFERENCED); + ptesync(); + } else + invalidate_all = true; + if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) + goto unchanged; + } + goto validate; + } + + /* + * The physical page has changed. Temporarily invalidate + * the mapping. This ensures that all threads sharing the + * pmap keep a consistent view of the mapping, which is + * necessary for the correct handling of COW faults. 
It + * also permits reuse of the old mapping's PV entry, + * avoiding an allocation. + * + * For consistency, handle unmanaged mappings the same way. + */ + origpte = pte_load_clear(pte); + KASSERT((origpte & PG_FRAME) == opa, + ("pmap_enter: unexpected pa update for %#lx", va)); + if ((origpte & PG_MANAGED) != 0) { + om = PHYS_TO_VM_PAGE(opa); + + /* + * The pmap lock is sufficient to synchronize with + * concurrent calls to pmap_page_test_mappings() and + * pmap_ts_referenced(). + */ + if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(om); + if ((origpte & PG_A) != 0) + vm_page_aflag_set(om, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); + pv = pmap_pvh_remove(&om->md, pmap, va); + if ((newpte & PG_MANAGED) == 0) + free_pv_entry(pmap, pv); +#ifdef INVARIANTS + else if (origpte & PG_MANAGED) { + if (pv == NULL) { + pmap_page_print_mappings(om); + MPASS(pv != NULL); + } + } +#endif + if ((om->aflags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&om->md.pv_list) && + ((om->flags & PG_FICTITIOUS) != 0 || + TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) + vm_page_aflag_clear(om, PGA_WRITEABLE); + } + if ((origpte & PG_A) != 0) + invalidate_page = true; + origpte = 0; + } else { + if (pmap != kernel_pmap) { +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", + pmap, va, m, prot, flags, psind, + pmap->pm_pid, curproc->p_pid, + curproc->p_comm); +#endif + } + + /* + * Increment the counters. + */ + if ((newpte & PG_W) != 0) + pmap->pm_stats.wired_count++; + pmap_resident_count_inc(pmap, 1); + } + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((newpte & PG_MANAGED) != 0) { + if (pv == NULL) { + pv = get_pv_entry(pmap, &lock); + pv->pv_va = va; + } +#ifdef VERBOSE_PV + else + printf("reassigning pv: %p to pmap: %p\n", + pv, pmap); +#endif + CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if ((newpte & PG_RW) != 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + } + + /* + * Update the PTE. + */ + if ((origpte & PG_V) != 0) { +validate: + origpte = pte_load_store(pte, newpte); + KASSERT((origpte & PG_FRAME) == pa, + ("pmap_enter: unexpected pa update for %#lx", va)); + if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == + (PG_M | PG_RW)) { + if ((origpte & PG_MANAGED) != 0) + vm_page_dirty(m); + invalidate_page = true; + + /* + * Although the PTE may still have PG_RW set, TLB + * invalidation may nonetheless be required because + * the PTE no longer has PG_M set. + */ + } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { + /* + * Removing capabilities requires invalidation on POWER + */ + invalidate_page = true; + goto unchanged; + } + if ((origpte & PG_A) != 0) + invalidate_page = true; + } else { + pte_store(pte, newpte); + ptesync(); + } +unchanged: + +#if VM_NRESERVLEVEL > 0 + /* + * If both the page table page and the reservation are fully + * populated, then attempt promotion. 
+ */ + if ((mpte == NULL || mpte->wire_count == NPTEPG) && + mmu_radix_ps_enabled(mmu, pmap) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0 && + pmap_promote_l3e(pmap, l3e, va, &lock) == 0) + invalidate_all = true; +#endif + if (invalidate_all) + pmap_invalidate_all(pmap); + else if (invalidate_page) + pmap_invalidate_page(pmap, va); + + rv = KERN_SUCCESS; +out: + if (lock != NULL) + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + + return (rv); +} + + +/* + * Tries to create a read- and/or execute-only 2MB page mapping. Returns true + * if successful. Returns false if (1) a page table page cannot be allocated + * without sleeping, (2) a mapping already exists at the specified virtual + * address, or (3) a PV entry cannot be allocated without reclaiming another + * PV entry. + */ +static bool +pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + struct rwlock **lockp) +{ + pml3_entry_t newpde; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | + RPTE_LEAF | PG_V; + if ((m->oflags & VPO_UNMANAGED) == 0) + newpde |= PG_MANAGED; + if (prot & VM_PROT_EXECUTE) + newpde |= PG_X; + if (prot & VM_PROT_READ) + newpde |= RPTE_EAA_R; + if (va >= DMAP_MIN_ADDRESS) + newpde |= RPTE_EAA_P; + return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | + PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == + KERN_SUCCESS); +} + +/* + * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if + * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE + * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and + * a mapping already exists at the specified virtual address. Returns + * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table + * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if + * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. + * + * The parameter "m" is only used when creating a managed, writeable mapping. + */ +static int +pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, + vm_page_t m, struct rwlock **lockp) +{ + struct spglist free; + pml3_entry_t oldl3e, *l3e; + vm_page_t mt, pdpg; + struct epoch_tracker et; + + KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_enter_pde: newpde is missing PG_M")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? + NULL : lockp)) == NULL) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); + l3e = &l3e[pmap_pml3e_index(va)]; + oldl3e = *l3e; + if ((oldl3e & PG_V) != 0) { + KASSERT(pdpg->wire_count > 1, + ("pmap_enter_pde: pdpg's wire count is too low")); + if ((flags & PMAP_ENTER_NOREPLACE) != 0) { + pdpg->wire_count--; + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_FAILURE); + } + /* Break the existing mapping(s). */ + SLIST_INIT(&free); + if ((oldl3e & RPTE_LEAF) != 0) { + /* + * The reference to the PD page that was acquired by + * pmap_allocl3e() ensures that it won't be freed. + * However, if the PDE resulted from a promotion, then + * a reserved PT page could be freed. 
+ */ + (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); + } else { + pmap_delayed_invl_started(&et); + if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, + &free, lockp)) + pmap_invalidate_all(pmap); + pmap_delayed_invl_finished(&et); + } + vm_page_free_pages_toq(&free, true); + if (va >= VM_MAXUSER_ADDRESS) { + mt = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); + if (pmap_insert_pt_page(pmap, mt)) { + /* + * XXX Currently, this can't happen because + * we do not perform pmap_enter(psind == 1) + * on the kernel pmap. + */ + panic("pmap_enter_pde: trie insert failed"); + } + } else + KASSERT(*l3e == 0, ("pmap_enter_pde: non-zero pde %p", + l3e)); + } + if ((newpde & PG_MANAGED) != 0) { + /* + * Abort this mapping if its PV entry could not be created. + */ + if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { + /* + * Although "va" is not mapped, paging- + * structure caches could nonetheless have + * entries that refer to the freed page table + * pages. Invalidate those entries. + */ + pmap_invalidate_page(pmap, va); + vm_page_free_pages_toq(&free, true); + } + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + if ((newpde & PG_RW) != 0) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + vm_page_aflag_set(mt, PGA_WRITEABLE); + } + } + + /* + * Increment counters. + */ + if ((newpde & PG_W) != 0) + pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; + pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); + + /* + * Map the superpage. (This is not a promoted mapping; there will not + * be any lingering 4KB page mappings in the TLB.) + */ + pte_store(l3e, newpde); + + atomic_add_long(&pmap_l3e_mappings, 1); + CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" + " in pmap %p", va, pmap); + return (KERN_SUCCESS); +} + +void +mmu_radix_enter_object(mmu_t mmu, pmap_t pmap, vm_offset_t start, + vm_offset_t end, vm_page_t m_start, vm_prot_t prot) +{ + + struct rwlock *lock; + vm_offset_t va; + vm_page_t m, mpte; + vm_pindex_t diff, psize; + bool invalidate; + VM_OBJECT_ASSERT_LOCKED(m_start->object); + + CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, + end, m_start, prot); + + invalidate = false; + psize = atop(end - start); + mpte = NULL; + m = m_start; + lock = NULL; + PMAP_LOCK(pmap); + while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { + va = start + ptoa(diff); + if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && + m->psind == 1 && mmu_radix_ps_enabled(mmu, pmap) && + pmap_enter_2mpage(pmap, va, m, prot, &lock)) + m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; + else + mpte = pmap_enter_quick_locked(pmap, va, m, prot, + mpte, &lock, &invalidate); + m = TAILQ_NEXT(m, listq); + } + ptesync(); + if (lock != NULL) + rw_wunlock(lock); + if (invalidate) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +static vm_page_t +pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) +{ + struct spglist free; + pt_entry_t *pte; + vm_paddr_t pa; + + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || + (m->oflags & VPO_UNMANAGED) != 0, + ("pmap_enter_quick_locked: managed mapping within the clean submap")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * In the case that a page table page is not + * resident, we are creating it here. 
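mmu_radix_enter_object() above only attempts a 2MB mapping when the virtual address is superpage aligned, the whole superpage fits inside the requested range, and the backing reservation is fully populated. A small restatement of that test (sketch, name invented; the real check additionally requires mmu_radix_ps_enabled()):

	static inline bool
	can_try_2m_mapping(vm_offset_t va, vm_offset_t end, vm_page_t m)
	{
		return ((va & L3_PAGE_MASK) == 0 &&	/* 2MB-aligned VA */
		    va + L3_PAGE_SIZE <= end &&		/* superpage fits the range */
		    m->psind == 1);			/* reservation fully populated */
	}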
+ */ + if (va < VM_MAXUSER_ADDRESS) { + vm_pindex_t ptepindex; + pml3_entry_t *ptepa; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_l3e_pindex(va); + if (mpte && (mpte->pindex == ptepindex)) { + mpte->wire_count++; + } else { + /* + * Get the page directory entry + */ + ptepa = pmap_pml3e(pmap, va); + + /* + * If the page table page is mapped, we just increment + * the hold count, and activate it. Otherwise, we + * attempt to allocate a page table page. If this + * attempt fails, we don't retry. Instead, we give up. + */ + if (ptepa && (*ptepa & PG_V) != 0) { + if (*ptepa & RPTE_LEAF) + return (NULL); + mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); + mpte->wire_count++; + } else { + /* + * Pass NULL instead of the PV list lock + * pointer, because we don't intend to sleep. + */ + mpte = _pmap_allocpte(pmap, ptepindex, NULL); + if (mpte == NULL) + return (mpte); + } + } + pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); + pte = &pte[pmap_pte_index(va)]; + } else { + mpte = NULL; + pte = pmap_pte(pmap, va); + } + if (*pte) { + if (mpte != NULL) { + mpte->wire_count--; + mpte = NULL; + } + return (mpte); + } + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->oflags & VPO_UNMANAGED) == 0 && + !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { + if (mpte != NULL) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, mpte, &free)) { + /* + * Although "va" is not mapped, paging- + * structure caches could nonetheless have + * entries that refer to the freed page table + * pages. Invalidate those entries. + */ + *invalidate = true; + vm_page_free_pages_toq(&free, true); + } + mpte = NULL; + } + return (mpte); + } + + /* + * Increment counters + */ + pmap_resident_count_inc(pmap, 1); + + pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); + if (prot & VM_PROT_EXECUTE) + pa |= PG_X; + else + pa |= RPTE_EAA_R; + if ((m->oflags & VPO_UNMANAGED) == 0) + pa |= PG_MANAGED; + + pte_store(pte, pa); + return (mpte); +} + +void +mmu_radix_enter_quick(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot) +{ + struct rwlock *lock; + bool invalidate; + + lock = NULL; + invalidate = false; + PMAP_LOCK(pmap); + (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock, &invalidate); + ptesync(); + if (lock != NULL) + rw_wunlock(lock); + if (invalidate) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +vm_paddr_t +mmu_radix_extract(mmu_t mmu, pmap_t pmap, vm_offset_t va) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + vm_paddr_t pa; + + l3e = pmap_pml3e(pmap, va); + if (__predict_false(l3e == NULL)) + return (0); + if (*l3e & RPTE_LEAF) { + pa = (*l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK); + pa |= (va & L3_PAGE_MASK); + } else { + /* + * Beware of a concurrent promotion that changes the + * PDE at this point! For example, vtopte() must not + * be used to access the PTE because it would use the + * new PDE. It is, however, safe to use the old PDE + * because the page table page is preserved by the + * promotion. 
+ */ + pte = pmap_l3e_to_pte(l3e, va); + if (__predict_false(pte == NULL)) + return (0); + pa = *pte; + pa = (pa & PG_FRAME) | (va & PAGE_MASK); + pa |= (va & PAGE_MASK); + } + return (pa); +} + +vm_page_t +mmu_radix_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pml3_entry_t l3e, *l3ep; + pt_entry_t pte; + vm_paddr_t pa; + vm_page_t m; + + pa = 0; + m = NULL; + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); + PMAP_LOCK(pmap); +retry: + l3ep = pmap_pml3e(pmap, va); + if (l3ep != NULL && (l3e = *l3ep)) { + if (l3e & RPTE_LEAF) { + if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) { + if (vm_page_pa_tryrelock(pmap, (l3e & + PG_PS_FRAME) | (va & L3_PAGE_MASK), &pa)) + goto retry; + m = PHYS_TO_VM_PAGE(pa); + } + } else { + pte = *pmap_l3e_to_pte(l3ep, va); + if ((pte & PG_V) && + ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { + if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, + &pa)) + goto retry; + m = PHYS_TO_VM_PAGE(pa); + } + } + if (m != NULL) + vm_page_hold(m); + } + PA_UNLOCK_COND(pa); + PMAP_UNLOCK(pmap); + return (m); +} + +static void +mmu_radix_growkernel(mmu_t mmu, vm_offset_t addr) +{ + vm_paddr_t paddr; + vm_page_t nkpg; + pml3_entry_t *l3e; + pml2_entry_t *l2e; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, va); + if (VM_MIN_KERNEL_ADDRESS < addr && + addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) + return; + + addr = roundup2(addr, L3_PAGE_SIZE); + if (addr - 1 >= vm_map_max(kernel_map)) + addr = vm_map_max(kernel_map); + while (kernel_vm_end < addr) { + l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); + if ((*l2e & PG_V) == 0) { + /* We need a new PDP entry */ + nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, + VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (nkpg == NULL) + panic("pmap_growkernel: no memory to grow kernel"); + if ((nkpg->flags & PG_ZERO) == 0) + mmu_radix_zero_page(mmu, nkpg); + paddr = VM_PAGE_TO_PHYS(nkpg); + pde_store(l2e, paddr); + continue; /* try again */ + } + l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); + if ((*l3e & PG_V) != 0) { + kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { + kernel_vm_end = vm_map_max(kernel_map); + break; + } + continue; + } + + nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), + VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (nkpg == NULL) + panic("pmap_growkernel: no memory to grow kernel"); + if ((nkpg->flags & PG_ZERO) == 0) + mmu_radix_zero_page(mmu, nkpg); + paddr = VM_PAGE_TO_PHYS(nkpg); + pde_store(l3e, paddr); + + kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { + kernel_vm_end = vm_map_max(kernel_map); + break; + } + } + ptesync(); +} + +static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); +static uma_zone_t zone_radix_pgd; + +static int +radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, + int flags) +{ + + for (int i = 0; i < count; i++) { + vm_page_t m = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, + 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE, + VM_MEMATTR_DEFAULT); + /* XXX zero on alloc here so we don't have to later */ + store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + } + return (count); +} + +static void +radix_pgd_release(void *arg __unused, void **store, int count) +{ + 
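	/*
	 * Undo radix_pgd_import() above: unwire every page backing each
	 * cached PGD and return the whole run to the physical allocator.
	 */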
vm_page_t m; + struct spglist free; + int page_count; + + SLIST_INIT(&free); + page_count = RADIX_PGD_SIZE/PAGE_SIZE; + + for (int i = 0; i < count; i++) { + /* + * XXX selectively remove dmap and KVA entries so we don't + * need to bzero + */ + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); + for (int j = page_count-1; j >= 0; j--) { + vm_page_unwire_noq(&m[j]); + SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); + } + vm_page_free_pages_toq(&free, false); + } +} + +static void +mmu_radix_init(mmu_t mmu) +{ + vm_page_t mpte; + vm_size_t s; + int error, i, pv_npg; + + /* L1TF, reserve page @0 unconditionally */ + vm_page_blacklist_add(0, bootverbose); + + zone_radix_pgd = uma_zcache_create("radix_pgd_cache", + RADIX_PGD_SIZE, NULL, NULL, +#ifdef INVARIANTS + trash_init, trash_fini, +#else + NULL, NULL, +#endif + radix_pgd_import, radix_pgd_release, + NULL, UMA_ZONE_NOBUCKET); + + /* + * Initialize the vm page array entries for the kernel pmap's + * page table pages. + */ + PMAP_LOCK(kernel_pmap); + for (i = 0; i < nkpt; i++) { + mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_init: page table page is out of range size: %lu", + vm_page_array_size)); + mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; + mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); + MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); + //pmap_insert_pt_page(kernel_pmap, mpte); + mpte->wire_count = 1; + } + PMAP_UNLOCK(kernel_pmap); + vm_wire_add(nkpt); + + CTR1(KTR_PMAP, "%s()", __func__); + TAILQ_INIT(&pv_dummy.pv_list); + + /* + * Are large page mappings enabled? + */ + TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + if (pg_ps_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("pmap_init: can't assign to pagesizes[1]")); + pagesizes[1] = L3_PAGE_SIZE; + } + + /* + * Initialize the pv chunk list mutex. + */ + mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); + + /* + * Initialize the pool of pv list locks. + */ + for (i = 0; i < NPV_LIST_LOCKS; i++) + rw_init(&pv_list_locks[i], "pmap pv list"); + + /* + * Calculate the size of the pv head table for superpages. + */ + pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); + + /* + * Allocate memory for the pv head table for superpages. 
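	 * The table has pv_npg entries, one md_page per potential 2MB
	 * superpage frame up to the end of the last vm_phys segment, as
	 * computed just above.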
+ */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + TAILQ_INIT(&pv_dummy.pv_list); + + pmap_initialized = 1; + mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); + error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, + (vmem_addr_t *)&qframe); + + if (error != 0) + panic("qframe allocation failed"); + asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + mask = 0; + if (modified) + mask |= PG_RW | PG_M; + if (accessed) + mask |= PG_V | PG_A; + rv = (*pte & mask) == mask; + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pml3e(pmap, pv->pv_va); + mask = 0; + if (modified) + mask |= PG_RW | PG_M; + if (accessed) + mask |= PG_V | PG_A; + rv = (*pte & mask) == mask; + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + } +out: + rw_runlock(lock); + return (rv); +} + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. + */ +boolean_t +mmu_radix_is_modified(mmu_t mmu, vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_modified: page %p is not managed", m)); + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + /* + * If the page is not exclusive busied, then PGA_WRITEABLE cannot be + * concurrently set while the object is locked. Thus, if PGA_WRITEABLE + * is clear, no PTEs can have PG_M set. + */ + VM_OBJECT_ASSERT_WLOCKED(m->object); + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) + return (FALSE); + return (pmap_page_test_mappings(m, FALSE, TRUE)); +} + +boolean_t +mmu_radix_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t addr) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + boolean_t rv; + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); + rv = FALSE; + PMAP_LOCK(pmap); + l3e = pmap_pml3e(pmap, addr); + if (l3e != NULL && (*l3e & (RPTE_LEAF | PG_V)) == PG_V) { + pte = pmap_l3e_to_pte(l3e, addr); + rv = (*pte & PG_V) == 0; + } + PMAP_UNLOCK(pmap); + return (rv); +} + +boolean_t +mmu_radix_is_referenced(mmu_t mmu, vm_page_t m) +{ + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_referenced: page %p is not managed", m)); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + return (pmap_page_test_mappings(m, TRUE, FALSE)); +} + +/* + * pmap_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. + * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * As an optimization, update the page's dirty field if a modified bit is + * found while counting reference bits. This opportunistic update can be + * performed at low cost and can eliminate the need for some future calls + * to pmap_is_modified(). 
However, since this function stops after + * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some + * dirty pages. Those dirty pages will only be detected by a future call + * to pmap_is_modified(). + * + * A DI block is not needed within this function, because + * invalidations are performed before the PV list lock is + * released. + */ +boolean_t +mmu_radix_ts_referenced(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv, pvf; + pmap_t pmap; + struct rwlock *lock; + pml3_entry_t oldl3e, *l3e; + pt_entry_t *pte; + vm_paddr_t pa; + int cleared, md_gen, not_cleared, pvh_gen; + struct spglist free; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_ts_referenced: page %p is not managed", m)); + SLIST_INIT(&free); + cleared = 0; + pa = VM_PAGE_TO_PHYS(m); + lock = PHYS_TO_PV_LIST_LOCK(pa); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); + rw_wlock(lock); +retry: + not_cleared = 0; + if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) + goto small_mappings; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + oldl3e = *l3e; + if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + /* + * Although "oldpde" is mapping a 2MB page, because + * this function is called at a 4KB page granularity, + * we only update the 4KB page under test. + */ + vm_page_dirty(m); + } + if ((oldl3e & PG_A) != 0) { + /* + * Since this reference bit is shared by 512 4KB + * pages, it should not be cleared every time it is + * tested. Apply a simple "hash" function on the + * physical page number, the virtual superpage number, + * and the pmap address to select one 4KB page out of + * the 512 on which testing the reference bit will + * result in clearing that reference bit. This + * function is designed to avoid the selection of the + * same 4KB page for every 2MB page mapping. + * + * On demotion, a mapping that hasn't been referenced + * is simply destroyed. To avoid the possibility of a + * subsequent page fault on a demoted wired mapping, + * always leave its reference bit set. Moreover, + * since the superpage is wired, the current state of + * its reference bit won't affect page replacement. + */ + if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ + (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && + (oldl3e & PG_W) == 0) { + atomic_clear_long(l3e, PG_A); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + } else + not_cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. 
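		 * Moving the entry just examined to the tail spreads
		 * reference-bit sampling across all of the page's mappings
		 * over successive calls instead of always starting with the
		 * same pmap.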
*/ + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + } + if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) + goto out; + } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); +small_mappings: + if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) + goto out; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, + ("pmap_ts_referenced: found a 2mpage in page %p's pv list", + m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if ((*pte & PG_A) != 0) { + atomic_clear_long(pte, PG_A); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. */ + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + } + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + + not_cleared < PMAP_TS_REFERENCED_MAX); +out: + rw_wunlock(lock); + vm_page_free_pages_toq(&free, true); + return (cleared + not_cleared); +} + +static vm_offset_t +mmu_radix_map(mmu_t mmu, vm_offset_t *virt __unused, vm_paddr_t start, + vm_paddr_t end, int prot __unused) +{ + + CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, + prot); + return (PHYS_TO_DMAP(start)); +} + +void +mmu_radix_object_init_pt(mmu_t mmu, pmap_t pmap, vm_offset_t addr, + vm_object_t object, vm_pindex_t pindex, vm_size_t size) +{ + pml3_entry_t *l3e; + vm_paddr_t pa, ptepa; + vm_page_t p, pdpg; + vm_memattr_t ma; + + CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, + object, pindex, size); + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, + ("pmap_object_init_pt: non-device object")); + /* NB: size can be logically ored with addr here */ + if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { + if (!mmu_radix_ps_enabled(mmu, pmap)) + return; + if (!vm_object_populate(object, pindex, pindex + atop(size))) + return; + p = vm_page_lookup(object, pindex); + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("pmap_object_init_pt: invalid page %p", p)); + ma = p->md.mdpg_cache_attrs; + + /* + * Abort the mapping if the first page is not physically + * aligned to a 2MB page boundary. + */ + ptepa = VM_PAGE_TO_PHYS(p); + if (ptepa & L3_PAGE_MASK) + return; + + /* + * Skip the first page. Abort the mapping if the rest of + * the pages are not physically contiguous or have differing + * memory attributes. + */ + p = TAILQ_NEXT(p, listq); + for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; + pa += PAGE_SIZE) { + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("pmap_object_init_pt: invalid page %p", p)); + if (pa != VM_PAGE_TO_PHYS(p) || + ma != p->md.mdpg_cache_attrs) + return; + p = TAILQ_NEXT(p, listq); + } + + PMAP_LOCK(pmap); + for (pa = ptepa | pmap_cache_bits(ma); + pa < ptepa + size; pa += L3_PAGE_SIZE) { + pdpg = pmap_allocl3e(pmap, addr, NULL); + if (pdpg == NULL) { + /* + * The creation of mappings below is only an + * optimization. 
If a page directory page + * cannot be allocated without blocking, + * continue on to the next mapping rather than + * blocking. + */ + addr += L3_PAGE_SIZE; + continue; + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); + l3e = &l3e[pmap_pml3e_index(addr)]; + if ((*l3e & PG_V) == 0) { + pa |= PG_M | PG_A | PG_RW; + pte_store(l3e, pa); + pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); + atomic_add_long(&pmap_l3e_mappings, 1); + } else { + /* Continue on if the PDE is already valid. */ + pdpg->wire_count--; + KASSERT(pdpg->wire_count > 0, + ("pmap_object_init_pt: missing reference " + "to page directory page, va: 0x%lx", addr)); + } + addr += L3_PAGE_SIZE; + } + ptesync(); + PMAP_UNLOCK(pmap); + } +} + +boolean_t +mmu_radix_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) +{ + struct md_page *pvh; + struct rwlock *lock; + pv_entry_t pv; + int loops = 0; + boolean_t rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_page_exists_quick: page %p is not managed", m)); + CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); + rv = FALSE; + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + } + rw_runlock(lock); + return (rv); +} + +void +mmu_radix_page_init(mmu_t mmu, vm_page_t m) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + TAILQ_INIT(&m->md.pv_list); + m->md.pv_magic = 0xCAFEBABE; + m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; +} + +int +mmu_radix_page_wired_mappings(mmu_t mmu, vm_page_t m) +{ + struct rwlock *lock; + struct md_page *pvh; + pmap_t pmap; + pt_entry_t *pte; + pv_entry_t pv; + int count, md_gen, pvh_gen; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (0); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); +restart: + count = 0; + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + if ((*pte & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pml3e(pmap, pv->pv_va); + if ((*pte & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + } + rw_runlock(lock); + return (count); +} + +static void +mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) +{ + isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); +} + +void +mmu_radix_pinit(mmu_t mmu, pmap_t pmap) +{ + vmem_addr_t pid; + vm_paddr_t l1pa; + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + + /* + * allocate the page directory page + */ + pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); + + for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) + pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); + 
pmap->pm_root.rt_root = 0; + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + pmap->pm_flags = PMAP_PDE_SUPERPAGE; + vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); + + pmap->pm_pid = pid; + l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); + mmu_radix_update_proctab(pid, l1pa); + __asm __volatile("ptesync;isync" : : : "memory"); +} + +/* + * This routine is called if the desired page table page does not exist. + * + * If page table page allocation fails, this routine may sleep before + * returning NULL. It sleeps only if a lock pointer was given. + * + * Note: If a page allocation fails at page table level two or three, + * one or two pages may be held during the wait, only to be released + * afterwards. This conservative approach is easily argued to avoid + * race conditions. + */ +static vm_page_t +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) +{ + vm_page_t m, pdppg, pdpg; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Allocate a page table page. + */ + if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if (lockp != NULL) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_UNLOCK(pmap); + PMAP_ASSERT_NOT_IN_DI(); + vm_wait(NULL); + PMAP_LOCK(pmap); + } + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. + */ + return (NULL); + } + if ((m->flags & PG_ZERO) == 0) + mmu_radix_zero_page(NULL, m); + + /* + * Map the pagetable page into the process address space, if + * it isn't already there. + */ + + if (ptepindex >= (NUPDE + NUPDPE)) { + pml1_entry_t *l1e; + vm_pindex_t pml1index; + + /* Wire up a new PDPE page */ + pml1index = ptepindex - (NUPDE + NUPDPE); + l1e = &pmap->pm_pml1[pml1index]; + pde_store(l1e, VM_PAGE_TO_PHYS(m)); + + } else if (ptepindex >= NUPDE) { + vm_pindex_t pml1index; + vm_pindex_t pdpindex; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + + /* Wire up a new l2e page */ + pdpindex = ptepindex - NUPDE; + pml1index = pdpindex >> RPTE_SHIFT; + + l1e = &pmap->pm_pml1[pml1index]; + if ((*l1e & PG_V) == 0) { + /* Have to allocate a new pdp, recurse */ + if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + /* Add reference to l2e page */ + pdppg = PHYS_TO_VM_PAGE(*l1e & PG_FRAME); + pdppg->wire_count++; + } + l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); + + /* Now find the pdp page */ + l2e = &l2e[pdpindex & RPTE_MASK]; + pde_store(l2e, VM_PAGE_TO_PHYS(m)); + + } else { + vm_pindex_t pml1index; + vm_pindex_t pdpindex; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + + /* Wire up a new PTE page */ + pdpindex = ptepindex >> RPTE_SHIFT; + pml1index = pdpindex >> RPTE_SHIFT; + + /* First, find the pdp and check that its valid. 
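		 * ptepindex is split twice by RPTE_SHIFT above: pdpindex
		 * selects the L2 slot and pml1index the L1 slot, mirroring
		 * the ordinary three-level walk down to the page table page.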
*/ + l1e = &pmap->pm_pml1[pml1index]; + if ((*l1e & PG_V) == 0) { + /* Have to allocate a new pd, recurse */ + if (_pmap_allocpte(pmap, NUPDE + pdpindex, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); + l2e = &l2e[pdpindex & RPTE_MASK]; + } else { + l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); + l2e = &l2e[pdpindex & RPTE_MASK]; + if ((*l2e & PG_V) == 0) { + /* Have to allocate a new pd, recurse */ + if (_pmap_allocpte(pmap, NUPDE + pdpindex, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + /* Add reference to the pd page */ + pdpg = PHYS_TO_VM_PAGE(*l2e & PG_FRAME); + pdpg->wire_count++; + } + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(*l2e & PG_FRAME); + + /* Now we know where the page directory page is */ + l3e = &l3e[ptepindex & RPTE_MASK]; + pde_store(l3e, VM_PAGE_TO_PHYS(m)); + } + + pmap_resident_count_inc(pmap, 1); + return (m); +} +static vm_page_t +pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t pdpindex, ptepindex; + pml2_entry_t *pdpe; + vm_page_t pdpg; + +retry: + pdpe = pmap_pml2e(pmap, va); + if (pdpe != NULL && (*pdpe & PG_V) != 0) { + /* Add a reference to the pd page. */ + pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); + pdpg->wire_count++; + } else { + /* Allocate a pd page. */ + ptepindex = pmap_l3e_pindex(va); + pdpindex = ptepindex >> RPTE_SHIFT; + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + if (pdpg == NULL && lockp != NULL) + goto retry; + } + return (pdpg); +} + +static vm_page_t +pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t ptepindex; + pml3_entry_t *pd; + vm_page_t m; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_l3e_pindex(va); +retry: + /* + * Get the page directory entry + */ + pd = pmap_pml3e(pmap, va); + + /* + * This supports switching from a 2MB page to a + * normal 4K page. + */ + if (pd != NULL && (*pd & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { + if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { + /* + * Invalidation of the 2MB page mapping may have caused + * the deallocation of the underlying PD page. + */ + pd = NULL; + } + } + + /* + * If the page table page is mapped, we just increment the + * hold count, and activate it. + */ + if (pd != NULL && (*pd & PG_V) != 0) { + m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); + m->wire_count++; + } else { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. 
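		 * _pmap_allocpte() may drop the pmap lock and sleep when a
		 * lock pointer is supplied, so on a NULL return the caller
		 * loops via "goto retry" and re-walks the page table.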
+ */ + m = _pmap_allocpte(pmap, ptepindex, lockp); + if (m == NULL && lockp != NULL) + goto retry; + } + return (m); +} + +static void +mmu_radix_pinit0(mmu_t mmu, pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + PMAP_LOCK_INIT(pmap); + pmap->pm_pml1 = kernel_pmap->pm_pml1; + pmap->pm_pid = kernel_pmap->pm_pid; + + pmap->pm_root.rt_root = 0; + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + kernel_pmap->pm_flags = + pmap->pm_flags = PMAP_PDE_SUPERPAGE; +} +/* + * pmap_protect_l3e: do the things to protect a 2mpage in a process + */ +static boolean_t +pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) +{ + pt_entry_t newpde, oldpde; + vm_offset_t eva, va; + vm_page_t m; + boolean_t anychanged; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & L3_PAGE_MASK) == 0, + ("pmap_protect_l3e: sva is not 2mpage aligned")); + anychanged = FALSE; +retry: + oldpde = newpde = *l3e; + if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == + (PG_MANAGED | PG_M | PG_RW)) { + eva = sva + L3_PAGE_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) + vm_page_dirty(m); + } + if ((prot & VM_PROT_WRITE) == 0) { + newpde &= ~(PG_RW | PG_M); + newpde |= RPTE_EAA_R; + } + if (prot & VM_PROT_EXECUTE) + newpde |= PG_X; + if (newpde != oldpde) { + /* + * As an optimization to future operations on this PDE, clear + * PG_PROMOTED. The impending invalidation will remove any + * lingering 4KB page mappings from the TLB. + */ + if (!atomic_cmpset_long(l3e, oldpde, newpde & ~PG_PROMOTED)) + goto retry; + anychanged = TRUE; + } + return (anychanged); +} + +void +mmu_radix_protect(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + vm_prot_t prot) +{ + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t ptpaddr, *l3e; + pt_entry_t *pte; + boolean_t anychanged; + + CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, start, end, + prot); + + KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); + if (prot == VM_PROT_NONE) { + mmu_radix_remove(mmu, pmap, sva, eva); + return; + } + + if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == + (VM_PROT_WRITE|VM_PROT_EXECUTE)) + return; + +#ifdef INVARIANTS + if (VERBOSE_PROTECT || pmap_logging) + printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", + pmap, sva, eva, prot, pmap->pm_pid); +#endif + anychanged = FALSE; + + /* + * Although this function delays and batches the invalidation + * of stale TLB entries, it does not need to call + * pmap_delayed_invl_started() and + * pmap_delayed_invl_finished(), because it does not + * ordinarily destroy mappings. Stale TLB entries from + * protection-only changes need only be invalidated before the + * pmap lock is released, because protection-only changes do + * not destroy PV entries. Even operations that iterate over + * a physical page's PV list of mappings, like + * pmap_remove_write(), acquire the pmap lock for each + * mapping. Consequently, for protection-only changes, the + * pmap lock suffices to synchronize both page table and TLB + * updates. + * + * This function only destroys a mapping if pmap_demote_l3e() + * fails. In that case, stale TLB entries are immediately + * invalidated. 
+ */ + + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + + l3e = pmap_l2e_to_l3e(l2e, sva); + ptpaddr = *l3e; + + /* + * Weed out invalid mappings. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & RPTE_LEAF) != 0) { + /* + * Are we protecting the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + if (pmap_protect_l3e(pmap, l3e, sva, prot)) + anychanged = TRUE; + continue; + } else if (!pmap_demote_l3e(pmap, l3e, sva)) { + /* + * The large page mapping was destroyed. + */ + continue; + } + } + + if (va_next > eva) + va_next = eva; + + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + pt_entry_t obits, pbits; + vm_page_t m; + +retry: + MPASS(pte == pmap_pte(pmap, sva)); + obits = pbits = *pte; + if ((pbits & PG_V) == 0) + continue; + + if ((prot & VM_PROT_WRITE) == 0) { + if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == + (PG_MANAGED | PG_M | PG_RW)) { + m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); + vm_page_dirty(m); + } + pbits &= ~(PG_RW | PG_M); + pbits |= RPTE_EAA_R; + } + if (prot & VM_PROT_EXECUTE) + pbits |= PG_X; + + if (pbits != obits) { + if (!atomic_cmpset_long(pte, obits, pbits)) + goto retry; + if (obits & (PG_A|PG_M)) { + anychanged = TRUE; +#ifdef INVARIANTS + if (VERBOSE_PROTECT || pmap_logging) + printf("%#lx %#lx -> %#lx\n", + sva, obits, pbits); +#endif + } + } + } + } + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +void +mmu_radix_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *ma, int count) +{ + + CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, m, count); + pt_entry_t oldpte, pa, *pte; + vm_page_t m; + uint64_t cache_bits, attr_bits; + vm_offset_t va; + + oldpte = 0; + attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; + va = sva; + pte = kvtopte(va); + while (va < sva + PAGE_SIZE * count) { + if (__predict_false((va & L3_PAGE_MASK) == 0)) + pte = kvtopte(va); + MPASS(pte == pmap_pte(kernel_pmap, va)); + + /* + * XXX there has to be a more efficient way than traversing + * the page table every time - but go for correctness for + * today + */ + + m = *ma++; + cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); + pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; + if (*pte != pa) { + oldpte |= *pte; + pte_store(pte, pa); + } + va += PAGE_SIZE; + pte++; + } + if (__predict_false((oldpte & RPTE_VALID) != 0)) + pmap_invalidate_range(kernel_pmap, sva, sva + count * + PAGE_SIZE); + else + ptesync(); +} + +void +mmu_radix_qremove(mmu_t mmu, vm_offset_t sva, int count) +{ + vm_offset_t va; + pt_entry_t *pte; + + CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); + KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); + + va = sva; + pte = kvtopte(va); + while (va < sva + PAGE_SIZE * count) { + if (__predict_false((va & L3_PAGE_MASK) == 0)) + pte = kvtopte(va); + pte_clear(pte); + pte++; + va += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/*************************************************** + * Page table page management 
routines..... + ***************************************************/ +/* + * Schedule the specified unused page table page to be freed. Specifically, + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. + */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, + boolean_t set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + SLIST_INSERT_HEAD(free, m, plinks.s.ss); +} + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. + */ +static __inline int +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_insert(&pmap->pm_root, mpte)); +} + +/* + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. + */ +static __inline vm_page_t +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_remove(&pmap->pm_root, pmap_l3e_pindex(va))); +} + +/* + * Decrements a page table page's wire count, which is used to record the + * number of valid page table entries within the page. If the wire count + * drops to zero, then the page table page is unmapped. Returns TRUE if the + * page table page was unmapped and FALSE otherwise. + */ +static inline boolean_t +pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + --m->wire_count; + if (m->wire_count == 0) { + _pmap_unwire_ptp(pmap, va, m, free); + return (TRUE); + } else + return (FALSE); +} + +static void +_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* + * unmap the page table page + */ + if (m->pindex >= (NUPDE + NUPDPE)) { + /* PDP page */ + pml1_entry_t *pml1; + pml1 = pmap_pml1e(pmap, va); + *pml1 = 0; + } else if (m->pindex >= NUPDE) { + /* PD page */ + pml2_entry_t *l2e; + l2e = pmap_pml2e(pmap, va); + *l2e = 0; + } else { + /* PTE page */ + pml3_entry_t *l3e; + l3e = pmap_pml3e(pmap, va); + *l3e = 0; + } + pmap_resident_count_dec(pmap, 1); + if (m->pindex < NUPDE) { + /* We just released a PT, unhold the matching PD */ + vm_page_t pdpg; + + pdpg = PHYS_TO_VM_PAGE(*pmap_pml2e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pdpg, free); + } + if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { + /* We just released a PD, unhold the matching PDP */ + vm_page_t pdppg; + + pdppg = PHYS_TO_VM_PAGE(*pmap_pml1e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pdppg, free); + } + + /* + * Put page on a list so that it is released after + * *ALL* TLB shootdown is done + */ + pmap_add_delayed_free_list(m, free, TRUE); +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. 
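 * A page table page's wire_count tracks the number of valid entries it
 * holds; pmap_unwire_ptp() above frees the page, and recursively unwires
 * its parent, once that count reaches zero.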
+ */ +static int +pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, + struct spglist *free) +{ + vm_page_t mpte; + + if (va >= VM_MAXUSER_ADDRESS) + return (0); + KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); + mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); + return (pmap_unwire_ptp(pmap, va, mpte, free)); +} + +void +mmu_radix_release(mmu_t mmu, pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); + KASSERT(vm_radix_is_empty(&pmap->pm_root), + ("pmap_release: pmap has reserved page table page(s)")); + + pmap_invalidate_all(pmap); + isa3_proctab[pmap->pm_pid].proctab0 = 0; + uma_zfree(zone_radix_pgd, pmap->pm_pml1); + vmem_free(asid_arena, pmap->pm_pid, 1); +} + +/* + * Create the PV entry for a 2MB page mapping. Always returns true unless the + * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns + * false if the PV entry cannot be allocated without resorting to reclamation. + */ +static bool +pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_paddr_t pa; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? + NULL : lockp)) == NULL) + return (false); + pv->pv_va = va; + pa = pde & PG_PS_FRAME; + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + return (true); +} + +/* + * Fills a page table page with mappings to consecutive physical pages. + */ +static void +pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) +{ + pt_entry_t *pte; + + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { + *pte = newpte; + newpte += PAGE_SIZE; + } +} + +static boolean_t +pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) +{ + struct rwlock *lock; + boolean_t rv; + + lock = NULL; + rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); + if (lock != NULL) + rw_wunlock(lock); + return (rv); +} + +static boolean_t +pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, + struct rwlock **lockp) +{ + pml3_entry_t oldpde; + pt_entry_t *firstpte; + vm_paddr_t mptepa; + vm_page_t mpte; + struct spglist free; + vm_offset_t sva; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpde = *l3e; + KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), + ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", + oldpde)); + if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { + KASSERT((oldpde & PG_W) == 0, + ("pmap_demote_l3e: page table page for a wired mapping" + " is missing")); + + /* + * Invalidate the 2MB page mapping and return "failure" if the + * mapping was never accessed or the allocation of the new + * page table page fails. If the 2MB page mapping belongs to + * the direct map region of the kernel's address space, then + * the page allocation request specifies the highest possible + * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is + * normal. Page table pages are preallocated for every other + * part of the kernel address space, so the direct map region + * is the only part of the kernel address space that must be + * handled here. 
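	 * For example (illustrative physical address): demoting a 2MB leaf
	 * whose frame is 0x40200000 makes pmap_fill_ptp() below populate
	 * 512 PTEs for 0x40200000, 0x40201000, ..., 0x403ff000, each
	 * carrying the attribute bits of the old leaf.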
+ */ + if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, + pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < + DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + SLIST_INIT(&free); + sva = trunc_2mpage(va); + pmap_remove_l3e(pmap, l3e, sva, &free, lockp); + pmap_invalidate_l3e_page(pmap, sva, oldpde); + vm_page_free_pages_toq(&free, true); + CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + if (va < VM_MAXUSER_ADDRESS) + pmap_resident_count_inc(pmap, 1); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); + KASSERT((oldpde & PG_A) != 0, + ("pmap_demote_l3e: oldpde is missing PG_A")); + KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_l3e: oldpde is missing PG_M")); + + /* + * If the page table page is new, initialize it. + */ + if (mpte->wire_count == 1) { + mpte->wire_count = NPTEPG; + pmap_fill_ptp(firstpte, oldpde); + } + + KASSERT((*firstpte & PG_FRAME) == (oldpde & PG_FRAME), + ("pmap_demote_l3e: firstpte and newpte map different physical" + " addresses")); + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + if ((*firstpte & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) + pmap_fill_ptp(firstpte, oldpde); + + /* + * The spare PV entries must be reserved prior to demoting the + * mapping, that is, prior to changing the PDE. Otherwise, the state + * of the PDE and the PV lists will be inconsistent, which can result + * in reclaim_pv_chunk() attempting to remove a PV entry from the + * wrong PV list and pmap_pv_demote_l3e() failing to find the expected + * PV entry for the 2MB page mapping that is being demoted. + */ + if ((oldpde & PG_MANAGED) != 0) + reserve_pv_entries(pmap, NPTEPG - 1, lockp); + + /* + * Demote the mapping. This pmap is locked. The old PDE has + * PG_A set. If the old PDE has PG_RW set, it also has PG_M + * set. Thus, there is no danger of a race with another + * processor changing the setting of PG_A and/or PG_M between + * the read above and the store below. + */ + pde_store(l3e, mptepa); + ptesync(); + /* + * Demote the PV entry. + */ + if ((oldpde & PG_MANAGED) != 0) + pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); + + + atomic_add_long(&pmap_l3e_demotions, 1); + CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* + * pmap_remove_kernel_pde: Remove a kernel superpage mapping. + */ +static void +pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) +{ + vm_paddr_t mptepa; + vm_page_t mpte; + + KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte = pmap_remove_pt_page(pmap, va); + if (mpte == NULL) + panic("pmap_remove_kernel_pde: Missing pt page."); + + mptepa = VM_PAGE_TO_PHYS(mpte); + + /* + * Initialize the page table page. + */ + pagezero(PHYS_TO_DMAP(mptepa)); + + /* + * Demote the mapping. 
+ */ + pde_store(l3e, mptepa); + ptesync(); +} + +/* + * pmap_remove_l3e: do the things to unmap a superpage in a process + */ +static int +pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pml3_entry_t oldpde; + vm_offset_t eva, va; + vm_page_t m, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & L3_PAGE_MASK) == 0, + ("pmap_remove_l3e: sva is not 2mpage aligned")); + oldpde = pte_load_clear(pdq); + if (oldpde & PG_W) + pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); + pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); + if (oldpde & PG_MANAGED) { + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); + pvh = pa_to_pvh(oldpde & PG_PS_FRAME); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + L3_PAGE_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) { + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpde & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + pmap_remove_kernel_l3e(pmap, pdq, sva); + } else { + mpte = pmap_remove_pt_page(pmap, sva); + if (mpte != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_l3e: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + } + } + return (pmap_unuse_pt(pmap, sva, *pmap_pml2e(pmap, sva), free)); +} + + +/* + * pmap_remove_pte: do the things to unmap a page in a process + */ +static int +pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, + pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pt_entry_t oldpte; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpte = pte_load_clear(ptq); + if (oldpte & RPTE_WIRED) + pmap->pm_stats.wired_count -= 1; + pmap_resident_count_dec(pmap, 1); + if (oldpte & RPTE_MANAGED) { + m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); + if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpte & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + return (pmap_unuse_pt(pmap, va, ptepde, free)); +} + +/* + * Remove a single page from a process address space + */ +static bool +pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, + struct spglist *free) +{ + struct rwlock *lock; + pt_entry_t *pte; + bool invalidate_all; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((*l3e & RPTE_VALID) == 0) { + return (false); + } + pte = pmap_l3e_to_pte(l3e, va); + if ((*pte & RPTE_VALID) == 0) { + return (false); + } + lock = NULL; + + invalidate_all = pmap_remove_pte(pmap, pte, va, *l3e, free, &lock); + if (lock != NULL) + rw_wunlock(lock); + if (!invalidate_all) + pmap_invalidate_page(pmap, va); + return (invalidate_all); +} + +/* + * Removes the specified range of addresses from the page table page. 
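 * Returns true if the TLB was invalidated for the entire pmap rather
 * than just the scanned range.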
+ */ +static bool +pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) +{ + pt_entry_t *pte; + vm_offset_t va; + bool anyvalid; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + anyvalid = false; + va = eva; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, + sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + if (*pte == 0) { + if (va != eva) { + anyvalid = true; + va = eva; + } + continue; + } + if (va == eva) + va = sva; + if (pmap_remove_pte(pmap, pte, sva, *l3e, free, lockp)) { + anyvalid = true; + sva += PAGE_SIZE; + break; + } + } + if (anyvalid) + pmap_invalidate_all(pmap); + else if (va != eva) + pmap_invalidate_range(pmap, va, sva); + return (anyvalid); +} + + +void +mmu_radix_remove(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + struct rwlock *lock; + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t ptpaddr, *l3e; + struct spglist free; + struct epoch_tracker et; + bool anyvalid; + + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, start, end); + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = false; + SLIST_INIT(&free); + + /* XXX something fishy here */ + sva = (sva + PAGE_MASK) & ~PAGE_MASK; + eva = (eva + PAGE_MASK) & ~PAGE_MASK; + + pmap_delayed_invl_started(&et); + PMAP_LOCK(pmap); + + /* + * special handling of removing one page. a very + * common operation and easy to short circuit some + * code. + */ + if (sva + PAGE_SIZE == eva) { + l3e = pmap_pml3e(pmap, sva); + if (l3e && (*l3e & RPTE_LEAF) == 0) { + anyvalid = pmap_remove_page(pmap, sva, l3e, &free); + goto out; + } + } + + lock = NULL; + for (; sva < eva; sva = va_next) { + + if (pmap->pm_stats.resident_count == 0) + break; + l1e = pmap_pml1e(pmap, sva); + if (l1e == NULL || (*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, sva); + if (l2e == NULL || (*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + /* + * Calculate index for next page table. + */ + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + + l3e = pmap_l2e_to_l3e(l2e, sva); + ptpaddr = *l3e; + + /* + * Weed out invalid mappings. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & RPTE_LEAF) != 0) { + /* + * Are we removing the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + pmap_remove_l3e(pmap, l3e, sva, &free, &lock); + continue; + } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, + &lock)) { + /* The large page mapping was destroyed. */ + continue; + } else + ptpaddr = *l3e; + } + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being removed. 
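		 * For example (illustrative addresses): with sva 0x10010000
		 * and eva 0x10500000, va_next is 0x10200000, so this pass
		 * removes only the 4KB mappings up to the next 2MB boundary
		 * and the outer loop then advances to it.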
+ */ + if (va_next > eva) + va_next = eva; + + if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) + anyvalid = true; + } + if (lock != NULL) + rw_wunlock(lock); +out: + if (anyvalid) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + pmap_delayed_invl_finished(&et); + vm_page_free_pages_toq(&free, true); +} + +void +mmu_radix_remove_all(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv; + pmap_t pmap; + struct rwlock *lock; + pt_entry_t *pte, tpte; + pml3_entry_t *l3e; + vm_offset_t va; + struct spglist free; + int pvh_gen, md_gen; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_all: page %p is not managed", m)); + SLIST_INIT(&free); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); +retry: + rw_wlock(lock); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + goto retry; + } + } + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); + PMAP_UNLOCK(pmap); + } + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + goto retry; + } + } + pmap_resident_count_dec(pmap, 1); + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_remove_all: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + tpte = pte_load_clear(pte); + if (tpte & PG_W) + pmap->pm_stats.wired_count--; + if (tpte & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + pmap_unuse_pt(pmap, pv->pv_va, *l3e, &free); + pmap_invalidate_page(pmap, pv->pv_va); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + free_pv_entry(pmap, pv); + PMAP_UNLOCK(pmap); + } + vm_page_aflag_clear(m, PGA_WRITEABLE); + rw_wunlock(lock); + pmap_delayed_invl_wait(m); + vm_page_free_pages_toq(&free, true); +} + +/* + * Destroy all managed, non-wired mappings in the given user-space + * pmap. This pmap cannot be active on any processor besides the + * caller. + * + * This function cannot be applied to the kernel pmap. Moreover, it + * is not intended for general use. It is only to be used during + * process termination. Consequently, it can be implemented in ways + * that make it faster than pmap_remove(). First, it can more quickly + * destroy mappings by iterating over the pmap's collection of PV + * entries, rather than searching the page table. Second, it doesn't + * have to test and clear the page table entries atomically, because + * no processor is currently accessing the user address space. In + * particular, a page table entry's dirty bit won't change state once + * this function starts. + * + * Although this function destroys all of the pmap's managed, + * non-wired mappings, it can delay and batch the invalidation of TLB + * entries without calling pmap_delayed_invl_started() and + * pmap_delayed_invl_finished(). 
Because the pmap is not active on + * any other processor, none of these TLB entries will ever be used + * before their eventual invalidation. Consequently, there is no need + * for either pmap_remove_all() or pmap_remove_write() to wait for + * that eventual TLB invalidation. + */ + +void +mmu_radix_remove_pages(mmu_t mmu, pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + pml3_entry_t ptel3e; + pt_entry_t *pte, tpte; + struct spglist free; + vm_page_t m, mpte, mt; + pv_entry_t pv; + struct md_page *pvh; + struct pv_chunk *pc, *npc; + struct rwlock *lock; + int64_t bit; + uint64_t inuse, bitmask; + int allfree, field, freed, idx; + boolean_t superpage; + vm_paddr_t pa; + + /* + * Assert that the given pmap is only active on the current + * CPU. Unfortunately, we cannot block another CPU from + * activating the pmap while this function is executing. + */ + KASSERT(pmap->pm_pid == mfspr(SPR_PID), + ("non-current asid %lu - expected %lu", pmap->pm_pid, + mfspr(SPR_PID))); + + lock = NULL; + + SLIST_INIT(&free); + PMAP_LOCK(pmap); + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; + freed = 0; + for (field = 0; field < _NPCM; field++) { + inuse = ~pc->pc_map[field] & pc_freemask[field]; + while (inuse != 0) { + bit = cnttzd(inuse); + bitmask = 1UL << bit; + idx = field * 64 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pte = pmap_pml2e(pmap, pv->pv_va); + ptel3e = *pte; + pte = pmap_l2e_to_l3e(pte, pv->pv_va); + tpte = *pte; + if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { + superpage = FALSE; + ptel3e = tpte; + pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & + PG_FRAME); + pte = &pte[pmap_pte_index(pv->pv_va)]; + tpte = *pte; + } else { + /* + * Keep track whether 'tpte' is a + * superpage explicitly instead of + * relying on RPTE_LEAF being set. + * + * This is because RPTE_LEAF is numerically + * identical to PG_PTE_PAT and thus a + * regular page could be mistaken for + * a superpage. + */ + superpage = TRUE; + } + + if ((tpte & PG_V) == 0) { + panic("bad pte va %lx pte %lx", + pv->pv_va, tpte); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & PG_W) { + allfree = 0; + continue; + } + + if (superpage) + pa = tpte & PG_PS_FRAME; + else + pa = tpte & PG_FRAME; + + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m->phys_addr == pa, + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); + + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", + (uintmax_t)tpte)); + + pte_clear(pte); + + /* + * Update the vm_page_t clean/reference bits. 
+ */ + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if (superpage) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } else + vm_page_dirty(m); + } + + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + + /* Mark free */ + pc->pc_map[field] |= bitmask; + if (superpage) { + pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); + pvh = pa_to_pvh(tpte & PG_PS_FRAME); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + if ((mt->aflags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + mpte = pmap_remove_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pages: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, &free, FALSE); + } + } else { + pmap_resident_count_dec(pmap, 1); +#ifdef VERBOSE_PV + printf("freeing pv (%p, %p)\n", + pmap, pv); +#endif + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if ((m->aflags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); + freed++; + } + } + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + if (allfree) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); + } + } + if (lock != NULL) + rw_wunlock(lock); + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + vm_page_free_pages_toq(&free, true); +} + +void +mmu_radix_remove_write(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pmap_t pmap; + struct rwlock *lock; + pv_entry_t next_pv, pv; + pml3_entry_t *l3e; + pt_entry_t oldpte, *pte; + int pvh_gen, md_gen; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_write: page %p is not managed", m)); + + /* + * If the page is not exclusive busied, then PGA_WRITEABLE cannot be + * set by another thread while the object is locked. Thus, + * if PGA_WRITEABLE is clear, no page table entries need updating. + */ + VM_OBJECT_ASSERT_WLOCKED(m->object); + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) + return; + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); +retry_pv_loop: + rw_wlock(lock); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + rw_wunlock(lock); + goto retry_pv_loop; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + if ((*l3e & PG_RW) != 0) + (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + PMAP_UNLOCK(pmap); + } + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || + md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + rw_wunlock(lock); + goto retry_pv_loop; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, + ("pmap_remove_write: found a 2mpage in page %p's pv list", + m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); +retry: + oldpte = *pte; + if (oldpte & PG_RW) { + if (!atomic_cmpset_long(pte, oldpte, + (oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))) + goto retry; + if ((oldpte & PG_M) != 0) + vm_page_dirty(m); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + rw_wunlock(lock); + vm_page_aflag_clear(m, PGA_WRITEABLE); + pmap_delayed_invl_wait(m); +} + +/* + * Clear the wired attribute from the mappings for the specified range of + * addresses in the given pmap. Every valid mapping within that range + * must have the wired attribute set. In contrast, invalid mappings + * cannot have the wired attribute set, so they are ignored. + * + * The wired attribute of the page table entry is not a hardware + * feature, so there is no need to invalidate any TLB entries. + * Since pmap_demote_l3e() for the wired entry must never fail, + * pmap_delayed_invl_started()/finished() calls around the + * function are not needed. + */ +void +mmu_radix_unwire(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, start, end); + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + l3e = pmap_l2e_to_l3e(l2e, sva); + if ((*l3e & PG_V) == 0) + continue; + if ((*l3e & RPTE_LEAF) != 0) { + if ((*l3e & PG_W) == 0) + panic("pmap_unwire: pde %#jx is missing PG_W", + (uintmax_t)*l3e); + + /* + * Are we unwiring the entire large page? If not, + * demote the mapping and fall through. 
+ */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + atomic_clear_long(l3e, PG_W); + pmap->pm_stats.wired_count -= L3_PAGE_SIZE / + PAGE_SIZE; + continue; + } else if (!pmap_demote_l3e(pmap, l3e, sva)) + panic("pmap_unwire: demotion failed"); + } + if (va_next > eva) + va_next = eva; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + if ((*pte & PG_V) == 0) + continue; + if ((*pte & PG_W) == 0) + panic("pmap_unwire: pte %#jx is missing PG_W", + (uintmax_t)*pte); + + /* + * PG_W must be cleared atomically. Although the pmap + * lock synchronizes access to PG_W, another processor + * could be setting PG_M and/or PG_A concurrently. + */ + atomic_clear_long(pte, PG_W); + pmap->pm_stats.wired_count--; + } + } + PMAP_UNLOCK(pmap); +} + +void +mmu_radix_zero_page(mmu_t mmu, vm_page_t m) +{ + vm_offset_t addr; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + pagezero(addr); +} + +void +mmu_radix_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) +{ + caddr_t addr; + + CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); + MPASS(off + size <= PAGE_SIZE); + addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + memset(addr + off, 0, size); +} + + + + +static int +mmu_radix_mincore(mmu_t mmu, pmap_t pmap, vm_offset_t addr, + vm_paddr_t *locked_pa) +{ + pml3_entry_t *l3ep; + pt_entry_t pte; + vm_paddr_t pa; + int val; + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); + PMAP_LOCK(pmap); +retry: + l3ep = pmap_pml3e(pmap, addr); + if (l3ep != NULL && (*l3ep & PG_V)) { + if (*l3ep & RPTE_LEAF) { + pte = *l3ep; + /* Compute the physical address of the 4KB page. */ + pa = ((*l3ep & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & + PG_FRAME; + val = MINCORE_SUPER; + } else { + pte = *pmap_l3e_to_pte(l3ep, addr); + pa = pte & PG_FRAME; + val = 0; + } + } else { + pte = 0; + pa = 0; + val = 0; + } + if ((pte & PG_V) != 0) { + val |= MINCORE_INCORE; + if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if ((pte & PG_A) != 0) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && + (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { + /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ + if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) + goto retry; + } else + PA_UNLOCK_COND(*locked_pa); + PMAP_UNLOCK(pmap); + return (val); +} + +void +mmu_radix_activate(mmu_t mmu, struct thread *td) +{ + pmap_t pmap; + uint32_t curpid; + + CTR2(KTR_PMAP, "%s(%p)", __func__, td); + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + curpid = mfspr(SPR_PID); + if (pmap->pm_pid > isa3_base_pid && + curpid != pmap->pm_pid) { +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("activated pid=%lu\n", pmap->pm_pid); +#endif + mmu_radix_pid_set(pmap); + } + critical_exit(); +} + +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. 
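+ * On radix a superpage is L3_PAGE_SIZE (2MB); the address is adjusted
+ * so that its offset within a 2MB page matches that of the object
+ * offset.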
+ */ +void +mmu_radix_align_superpage(mmu_t mmu, vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + + CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, + size); + vm_offset_t superpage_offset; + + if (size < L3_PAGE_SIZE) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & L3_PAGE_MASK; + if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE || + (*addr & L3_PAGE_MASK) == superpage_offset) + return; + if ((*addr & L3_PAGE_MASK) < superpage_offset) + *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset; + else + *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset; +} + +static void * +mmu_radix_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) +{ + vm_offset_t va, tmpva, ppa, offset; + + ppa = trunc_page(pa); + offset = pa & PAGE_MASK; + size = roundup2(offset + size, PAGE_SIZE); + if (pa < powerpc_ptob(Maxmem)) + panic("bad pa: %#lx less than Maxmem %#lx\n", + pa, powerpc_ptob(Maxmem)); + va = kva_alloc(size); + if (bootverbose) + printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr); + KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr)); + + if (!va) + panic("%s: Couldn't alloc kernel virtual memory", __func__); + + for (tmpva = va; size > 0;) { + mmu_radix_kenter_attr(mmu, tmpva, ppa, attr); + size -= PAGE_SIZE; + tmpva += PAGE_SIZE; + ppa += PAGE_SIZE; + } + ptesync(); + + return ((void *)(va + offset)); +} + +static void * +mmu_radix_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) +{ + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); + + return (mmu_radix_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT)); +} + +void +mmu_radix_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) +{ + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); + m->md.mdpg_cache_attrs = ma; + + /* + * If "m" is a normal page, update its direct mapping. This update + * can be relied upon to perform any cache operations that are + * required for data coherence. + */ + if ((m->flags & PG_FICTITIOUS) == 0 && + mmu_radix_change_attr(mmu, PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), + PAGE_SIZE, m->md.mdpg_cache_attrs)) + panic("memory attribute change on the direct map failed"); +} + +static void +mmu_radix_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) +{ + vm_offset_t offset; + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size); + /* If we gave a direct map region in pmap_mapdev, do nothing */ + if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) + return; + + offset = va & PAGE_MASK; + size = round_page(offset + size); + va = trunc_page(va); + + if (pmap_initialized) + kva_free(va, size); +} + +static __inline void +pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask) +{ + uint64_t opte, npte; + + /* + * The cache mode bits are all in the low 32-bits of the + * PTE, so we can just spin on updating the low 32-bits. + */ + do { + opte = *pte; + npte = opte & ~mask; + npte |= cache_bits; + } while (npte != opte && !atomic_cmpset_long(pte, opte, npte)); +} + +/* + * Tries to demote a 1GB page mapping. 
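+ * The 1GB (L2) leaf is replaced by a newly allocated page directory
+ * page containing 2MB (L3) leaf entries covering the same range.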
+ */ +static boolean_t +pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va) +{ + pml2_entry_t oldpdpe; + pml3_entry_t *firstpde, newpde, *pde; + vm_paddr_t pdpgpa; + vm_page_t pdpg; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpdpe = *l2e; + KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), + ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); + pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT, + VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + if (pdpg == NULL) { + CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + pdpgpa = VM_PAGE_TO_PHYS(pdpg); + firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa); + KASSERT((oldpdpe & PG_A) != 0, + ("pmap_demote_pdpe: oldpdpe is missing PG_A")); + KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_pdpe: oldpdpe is missing PG_M")); + newpde = oldpdpe; + + /* + * Initialize the page directory page. + */ + for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { + *pde = newpde; + newpde += L3_PAGE_SIZE; + } + + /* + * Demote the mapping. + */ + pde_store(l2e, pdpgpa); + + /* + * Flush PWC --- XXX revisit + */ + pmap_invalidate_all(pmap); + + pmap_l2e_demotions++; + CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +vm_paddr_t +mmu_radix_kextract(mmu_t mmu, vm_offset_t va) +{ + pml3_entry_t l3e; + vm_paddr_t pa; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, va); + if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { + pa = DMAP_TO_PHYS(va); + } else { + l3e = *pmap_pml3e(kernel_pmap, va); + if (l3e & RPTE_LEAF) { + pa = (l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK); + pa |= (va & L3_PAGE_MASK); + } else { + /* + * Beware of a concurrent promotion that changes the + * PDE at this point! For example, vtopte() must not + * be used to access the PTE because it would use the + * new PDE. It is, however, safe to use the old PDE + * because the page table page is preserved by the + * promotion. + */ + pa = *pmap_l3e_to_pte(&l3e, va); + pa = (pa & PG_FRAME) | (va & PAGE_MASK); + pa |= (va & PAGE_MASK); + } + } + return (pa); +} + +static pt_entry_t +mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) +{ + + if (ma != VM_MEMATTR_DEFAULT) { + return pmap_cache_bits(ma); + } + + /* + * Assume the page is cache inhibited and access is guarded unless + * it's in our available memory array. 
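+ * Addresses outside the pregions array are mapped with
+ * RPTE_ATTR_GUARDEDIO; ordinary RAM gets RPTE_ATTR_MEM.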
+ */ + for (int i = 0; i < pregions_sz; i++) { + if ((pa >= pregions[i].mr_start) && + (pa < (pregions[i].mr_start + pregions[i].mr_size))) + return (RPTE_ATTR_MEM); + } + return (RPTE_ATTR_GUARDEDIO); +} + +static void +mmu_radix_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) +{ + pt_entry_t *pte, pteval; + uint64_t cache_bits; + + pte = kvtopte(va); + MPASS(pte != NULL); + pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; + cache_bits = mmu_radix_calc_wimg(pa, ma); + pte_store(pte, pteval | cache_bits); +} + +void +mmu_radix_kremove(mmu_t mmu, vm_offset_t va) +{ + pt_entry_t *pte; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, va); + + pte = kvtopte(va); + pte_clear(pte); +} + +int mmu_radix_map_user_ptr(mmu_t mmu, pmap_t pm, + volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) +{ + if ((uintptr_t)uaddr + ulen >= VM_MAXUSER_ADDRESS) + return (EFAULT); + + *kaddr = (void *)(uintptr_t)uaddr; + if (klen) + *klen = ulen; + + return (0); +} + +int +mmu_radix_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, + int *is_user, vm_offset_t *decoded) +{ + + CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); + *decoded = addr; + *is_user = (addr < VM_MAXUSER_ADDRESS); + return (0); +} + +static boolean_t +mmu_radix_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) +{ + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); + return (mem_valid(pa, size)); +} + +static void +mmu_radix_scan_init(mmu_t mmup) +{ + + CTR1(KTR_PMAP, "%s()", __func__); + UNIMPLEMENTED(); +} + +static void +mmu_radix_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, + void **va) +{ + CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); + UNIMPLEMENTED(); +} + +vm_offset_t +mmu_radix_quick_enter_page(mmu_t mmu, vm_page_t m) +{ + vm_paddr_t paddr; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + paddr = VM_PAGE_TO_PHYS(m); + return (PHYS_TO_DMAP(paddr)); +} + +void +mmu_radix_quick_remove_page(mmu_t mmu, vm_offset_t addr __unused) +{ + /* no work to do here */ + CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); +} + +static void +pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) +{ + cpu_flush_dcache((void *)sva, eva - sva); +} + +int +mmu_radix_change_attr(mmu_t mmu, vm_offset_t va, vm_size_t size, + vm_memattr_t mode) +{ + int error; + + CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode); + PMAP_LOCK(kernel_pmap); + error = pmap_change_attr_locked(va, size, mode, true); + PMAP_UNLOCK(kernel_pmap); + return (error); +} + +static int +pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush) +{ + vm_offset_t base, offset, tmpva; + vm_paddr_t pa_start, pa_end, pa_end1; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + int cache_bits, error; + boolean_t changed; + + PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); + base = trunc_page(va); + offset = va & PAGE_MASK; + size = round_page(offset + size); + + /* + * Only supported on kernel virtual addresses, including the direct + * map but excluding the recursive map. + */ + if (base < DMAP_MIN_ADDRESS) + return (EINVAL); + + cache_bits = pmap_cache_bits(mode); + changed = FALSE; + + /* + * Pages that aren't mapped aren't supported. Also break down 2MB pages + * into 4KB pages if required. + */ + for (tmpva = base; tmpva < base + size; ) { + l2e = pmap_pml2e(kernel_pmap, tmpva); + if (l2e == NULL || *l2e == 0) + return (EINVAL); + if (*l2e & RPTE_LEAF) { + /* + * If the current 1GB page already has the required + * memory type, then we need not demote this page. 
Just + * increment tmpva to the next 1GB page frame. + */ + if ((*l2e & RPTE_ATTR_MASK) == cache_bits) { + tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; + continue; + } + + /* + * If the current offset aligns with a 1GB page frame + * and there is at least 1GB left within the range, then + * we need not break down this page into 2MB pages. + */ + if ((tmpva & L2_PAGE_MASK) == 0 && + tmpva + L2_PAGE_MASK < base + size) { + tmpva += L2_PAGE_MASK; + continue; + } + if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva)) + return (ENOMEM); + } + l3e = pmap_l2e_to_l3e(l2e, tmpva); + KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n", + tmpva, l2e)); + if (*l3e == 0) + return (EINVAL); + if (*l3e & RPTE_LEAF) { + /* + * If the current 2MB page already has the required + * memory type, then we need not demote this page. Just + * increment tmpva to the next 2MB page frame. + */ + if ((*l3e & RPTE_ATTR_MASK) == cache_bits) { + tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; + continue; + } + + /* + * If the current offset aligns with a 2MB page frame + * and there is at least 2MB left within the range, then + * we need not break down this page into 4KB pages. + */ + if ((tmpva & L3_PAGE_MASK) == 0 && + tmpva + L3_PAGE_MASK < base + size) { + tmpva += L3_PAGE_SIZE; + continue; + } + if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva)) + return (ENOMEM); + } + pte = pmap_l3e_to_pte(l3e, tmpva); + if (*pte == 0) + return (EINVAL); + tmpva += PAGE_SIZE; + } + error = 0; + + /* + * Ok, all the pages exist, so run through them updating their + * cache mode if required. + */ + pa_start = pa_end = 0; + for (tmpva = base; tmpva < base + size; ) { + l2e = pmap_pml2e(kernel_pmap, tmpva); + if (*l2e & RPTE_LEAF) { + if ((*l2e & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(l2e, cache_bits, + RPTE_ATTR_MASK); + changed = TRUE; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (*l2e & PG_PS_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = *l2e & PG_PS_FRAME; + pa_end = pa_start + L2_PAGE_SIZE; + } else if (pa_end == (*l2e & PG_PS_FRAME)) + pa_end += L2_PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = *l2e & PG_PS_FRAME; + pa_end = pa_start + L2_PAGE_SIZE; + } + } + tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; + continue; + } + l3e = pmap_l2e_to_l3e(l2e, tmpva); + if (*l3e & RPTE_LEAF) { + if ((*l3e & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(l3e, cache_bits, + RPTE_ATTR_MASK); + changed = TRUE; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (*l3e & PG_PS_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = *l3e & PG_PS_FRAME; + pa_end = pa_start + L3_PAGE_SIZE; + } else if (pa_end == (*l3e & PG_PS_FRAME)) + pa_end += L3_PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = *l3e & PG_PS_FRAME; + pa_end = pa_start + L3_PAGE_SIZE; + } + } + tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; + } else { + pte = pmap_l3e_to_pte(l3e, tmpva); + if ((*pte & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(pte, cache_bits, + RPTE_ATTR_MASK); + changed = TRUE; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (*pte & PG_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. 
*/ + pa_start = *pte & PG_FRAME; + pa_end = pa_start + PAGE_SIZE; + } else if (pa_end == (*pte & PG_FRAME)) + pa_end += PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = *pte & PG_FRAME; + pa_end = pa_start + PAGE_SIZE; + } + } + tmpva += PAGE_SIZE; + } + } + if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { + pa_end1 = MIN(pa_end, dmaplimit); + if (pa_start != pa_end1) + error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), + pa_end1 - pa_start, mode, flush); + } + + /* + * Flush CPU caches if required to make sure any data isn't cached that + * shouldn't be, etc. + */ + if (changed) { + pmap_invalidate_all(kernel_pmap); + + if (flush) + pmap_invalidate_cache_range(base, tmpva); + + } + return (error); +} + +#ifdef DDB +#include +#include + +static void +pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va) +{ + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + + l1e = &l1[pmap_pml1e_index(va)]; + db_printf("VA %#016lx l1e %#016lx", va, *l1e); + if ((*l1e & PG_V) == 0) { + db_printf("\n"); + return; + } + l2e = pmap_l1e_to_l2e(l1e, va); + db_printf(" l2e %#016lx", *l2e); + if ((*l2e & PG_V) == 0 || (*l2e & RPTE_LEAF) != 0) { + db_printf("\n"); + return; + } + l3e = pmap_l2e_to_l3e(l2e, va); + db_printf(" l3e %#016lx", *l3e); + if ((*l3e & PG_V) == 0 || (*l3e & RPTE_LEAF) != 0) { + db_printf("\n"); + return; + } + pte = pmap_l3e_to_pte(l3e, va); + db_printf(" pte %#016lx\n", *pte); +} + +void +pmap_page_print_mappings(vm_page_t m) +{ + pmap_t pmap; + pv_entry_t pv; + + db_printf("page %p(%lx)\n", m, m->phys_addr); + /* need to elide locks if running in ddb */ + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + db_printf("pv: %p ", pv); + db_printf("va: %#016lx ", pv->pv_va); + pmap = PV_PMAP(pv); + db_printf("pmap %p ", pmap); + if (pmap != NULL) { + db_printf("asid: %lu\n", pmap->pm_pid); + pmap_pte_walk(pmap->pm_pml1, pv->pv_va); + } + } +} + +DB_SHOW_COMMAND(pte, pmap_print_pte) +{ + vm_offset_t va; + pmap_t pmap; + + if (!have_addr) { + db_printf("show pte addr\n"); + return; + } + va = (vm_offset_t)addr; + + if (va >= DMAP_MIN_ADDRESS) + pmap = kernel_pmap; + else if (kdb_thread != NULL) + pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); + else + pmap = vmspace_pmap(curthread->td_proc->p_vmspace); + + pmap_pte_walk(pmap->pm_pml1, va); +} + +#endif + diff --git a/sys/powerpc/aim/mp_cpudep.c b/sys/powerpc/aim/mp_cpudep.c index eb90cbdbba2..0461a95eb55 100644 --- a/sys/powerpc/aim/mp_cpudep.c +++ b/sys/powerpc/aim/mp_cpudep.c @@ -98,6 +98,8 @@ cpudep_ap_early_bootstrap(void) mtspr(SPR_LPCR, lpcr); isync(); + mtspr(SPR_HID0, mfspr(SPR_HID0) | HID0_SPECEXEC); + isync(); } #endif break; diff --git a/sys/powerpc/booke/pmap.c b/sys/powerpc/booke/pmap.c index 5705c222c19..1b29c7a4140 100644 --- a/sys/powerpc/booke/pmap.c +++ b/sys/powerpc/booke/pmap.c @@ -391,6 +391,7 @@ static int mmu_booke_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int mmu_booke_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); +static boolean_t mmu_booke_page_is_mapped(mmu_t mmu, vm_page_t m); static mmu_method_t mmu_booke_methods[] = { @@ -432,6 +433,7 @@ static mmu_method_t mmu_booke_methods[] = { MMUMETHOD(mmu_deactivate, mmu_booke_deactivate), MMUMETHOD(mmu_quick_enter_page, 
mmu_booke_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, mmu_booke_quick_remove_page), + MMUMETHOD(mmu_page_is_mapped, mmu_booke_page_is_mapped), /* Internal interfaces */ MMUMETHOD(mmu_bootstrap, mmu_booke_bootstrap), @@ -2326,6 +2328,13 @@ mmu_booke_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, return (0); } +static boolean_t +mmu_booke_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + + return (!TAILQ_EMPTY(&(m)->md.pv_list)); +} + /* * Initialize pmap associated with process 0. */ diff --git a/sys/powerpc/conf/GENERIC64 b/sys/powerpc/conf/GENERIC64 index 05447f451e1..07cd7da638f 100644 --- a/sys/powerpc/conf/GENERIC64 +++ b/sys/powerpc/conf/GENERIC64 @@ -35,6 +35,7 @@ options POWERNV #Non-virtualized OpenPOWER systems options FDT #Flattened Device Tree options SCHED_ULE #ULE scheduler +options NUMA #Non-Uniform Memory Architecture support options PREEMPTION #Enable kernel thread preemption options VIMAGE # Subsystem virtualization, e.g. VNET options INET #InterNETworking diff --git a/sys/powerpc/include/cpu.h b/sys/powerpc/include/cpu.h index 00f8437c8c4..fdc47900590 100644 --- a/sys/powerpc/include/cpu.h +++ b/sys/powerpc/include/cpu.h @@ -89,6 +89,7 @@ extern u_long cpu_features2; #define PPC_FEATURE2_HAS_IEEE128 0x00400000 #define PPC_FEATURE2_DARN 0x00200000 #define PPC_FEATURE2_SCV 0x00100000 +#define PPC_FEATURE2_MMU_RADIX 0x00080000 #define PPC_FEATURE2_HTM_NOSUSPEND 0x01000000 #define PPC_FEATURE_BITMASK \ diff --git a/sys/powerpc/include/cpufunc.h b/sys/powerpc/include/cpufunc.h index 204c48013b8..9c53a526a2f 100644 --- a/sys/powerpc/include/cpufunc.h +++ b/sys/powerpc/include/cpufunc.h @@ -185,6 +185,34 @@ powerpc_sync(void) __asm __volatile ("sync" : : : "memory"); } +static __inline int +cntlzd(uint64_t word) +{ + uint64_t result; + /* cntlzd %0, %1 */ + __asm __volatile(".long 0x7c000074 | (%1 << 21) | (%0 << 16)" : + "=r"(result) : "r"(word)); + + return (int)result; +} + +static __inline int +cnttzd(uint64_t word) +{ + uint64_t result; + /* cnttzd %0, %1 */ + __asm __volatile(".long 0x7c000474 | (%1 << 21) | (%0 << 16)" : + "=r"(result) : "r"(word)); + + return (int)result; +} + +static __inline void +ptesync(void) +{ + __asm __volatile("ptesync"); +} + static __inline register_t intr_disable(void) { diff --git a/sys/powerpc/include/hid.h b/sys/powerpc/include/hid.h index 1b038111534..67a845f025c 100644 --- a/sys/powerpc/include/hid.h +++ b/sys/powerpc/include/hid.h @@ -34,6 +34,7 @@ /* Hardware Implementation Dependent registers for the PowerPC */ #define HID0_RADIX 0x0080000000000000 /* Enable Radix page tables (POWER9) */ +#define HID0_SPECEXEC 0x0010000000000000 /* Enable speculative execution (POWER9) */ #define HID0_EMCP 0x80000000 /* Enable machine check pin */ #define HID0_DBP 0x40000000 /* Disable 60x bus parity generation */ diff --git a/sys/powerpc/include/intr_machdep.h b/sys/powerpc/include/intr_machdep.h index 6ece0fa804f..7ac542537ab 100644 --- a/sys/powerpc/include/intr_machdep.h +++ b/sys/powerpc/include/intr_machdep.h @@ -54,7 +54,7 @@ u_int powerpc_get_irq(uint32_t, u_int); void powerpc_dispatch_intr(u_int, struct trapframe *); int powerpc_enable_intr(void); int powerpc_setup_intr(const char *, u_int, driver_filter_t, driver_intr_t, - void *, enum intr_type, void **); + void *, enum intr_type, void **, int); int powerpc_teardown_intr(void *); int powerpc_bind_intr(u_int irq, u_char cpu); int powerpc_config_intr(int, enum intr_trigger, enum intr_polarity); diff --git a/sys/powerpc/include/mmuvar.h b/sys/powerpc/include/mmuvar.h index 
0d0b49e89a3..9bde047b3ef 100644 --- a/sys/powerpc/include/mmuvar.h +++ b/sys/powerpc/include/mmuvar.h @@ -117,6 +117,7 @@ DATA_SET(mmu_set, name) #define MMU_TYPE_OEA "mmu_oea" /* 32-bit OEA */ #define MMU_TYPE_G5 "mmu_g5" /* 64-bit bridge (ibm 970) */ #define MMU_TYPE_P9H "mmu_p9h" /* 64-bit native ISA 3.0 (POWER9) hash */ +#define MMU_TYPE_RADIX "mmu_radix" /* 64-bit native ISA 3.0 (POWER9) radix */ #define MMU_TYPE_8xx "mmu_8xx" /* 8xx quicc TLB */ #endif /* _MACHINE_MMUVAR_H_ */ diff --git a/sys/powerpc/include/ofw_machdep.h b/sys/powerpc/include/ofw_machdep.h index 0fe659f7e17..80a1707cac2 100644 --- a/sys/powerpc/include/ofw_machdep.h +++ b/sys/powerpc/include/ofw_machdep.h @@ -47,7 +47,11 @@ boolean_t OF_bootstrap(void); void OF_reboot(void); void ofw_mem_regions(struct mem_region *, int *, struct mem_region *, int *); +void ofw_numa_mem_regions(struct numa_mem_region *, int *); void ofw_quiesce(void); /* Must be called before VM is up! */ void ofw_save_trap_vec(char *); +int ofw_pcibus_get_domain(device_t dev, device_t child, int *domain); +int ofw_pcibus_get_cpus(device_t dev, device_t child, enum cpu_sets op, + size_t setsize, cpuset_t *cpuset); #endif /* _MACHINE_OFW_MACHDEP_H_ */ diff --git a/sys/powerpc/include/param.h b/sys/powerpc/include/param.h index 9039d307f58..29f4ab2f5d7 100644 --- a/sys/powerpc/include/param.h +++ b/sys/powerpc/include/param.h @@ -82,7 +82,7 @@ #endif /* SMP || KLD_MODULE */ #ifndef MAXMEMDOM -#define MAXMEMDOM 1 +#define MAXMEMDOM 8 #endif #define ALIGNBYTES _ALIGNBYTES @@ -106,12 +106,25 @@ #define PAGE_SIZE (1L << PAGE_SHIFT) /* Page size */ #define PAGE_MASK (PAGE_SIZE - 1) #define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t))) +#define NPDEPG (PAGE_SIZE/(sizeof (pt_entry_t))) -#define MAXPAGESIZES 1 /* maximum number of supported page sizes */ +#define L1_PAGE_SIZE_SHIFT 39 +#define L1_PAGE_SIZE (1UL<> PAGE_SHIFT) #define ptoa(x) ((x) << PAGE_SHIFT) diff --git a/sys/powerpc/include/platform.h b/sys/powerpc/include/platform.h index 7fa8dfa8d9d..fb32400afd4 100644 --- a/sys/powerpc/include/platform.h +++ b/sys/powerpc/include/platform.h @@ -45,9 +45,16 @@ struct mem_region { uint64_t mr_size; }; +struct numa_mem_region { + uint64_t mr_start; + uint64_t mr_size; + uint64_t mr_domain; +}; + /* Documentation for these functions is in platform_if.m */ void mem_regions(struct mem_region **, int *, struct mem_region **, int *); +void numa_mem_regions(struct numa_mem_region **, int *); vm_offset_t platform_real_maxaddr(void); u_long platform_timebase_freq(struct cpuref *); diff --git a/sys/powerpc/include/pmap.h b/sys/powerpc/include/pmap.h index ae397169930..96434f02103 100644 --- a/sys/powerpc/include/pmap.h +++ b/sys/powerpc/include/pmap.h @@ -75,6 +75,28 @@ #include #include #include +#ifdef __powerpc64__ +#include +#endif + + +/* + * The radix page table structure is described by levels 1-4. + * See Fig 33. on p. 1002 of Power ISA v3.0B + * + * Page directories and tables must be size aligned. 
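+ * Non-leaf entries reference the next level through their NLB field;
+ * leaf entries are marked with RPTE_LEAF.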
+ */ + +/* Root page directory - 64k -- each entry covers 512GB */ +typedef uint64_t pml1_entry_t; +/* l2 page directory - 4k -- each entry covers 1GB */ +typedef uint64_t pml2_entry_t; +/* l3 page directory - 4k -- each entry covers 2MB */ +typedef uint64_t pml3_entry_t; +/* l4 page directory - 256B/4k -- each entry covers 64k/4k */ +typedef uint64_t pml4_entry_t; + +typedef uint64_t pt_entry_t; struct pmap; typedef struct pmap *pmap_t; @@ -133,31 +155,75 @@ RB_PROTOTYPE(pvo_tree, pvo_entry, pvo_plink, pvo_vaddr_compare); ((void)((pvo)->pvo_vaddr |= (i)|PVO_PTEGIDX_VALID)) #define PVO_VSID(pvo) ((pvo)->pvo_vpn >> 16) -struct pmap { - struct pmap_statistics pm_stats; - struct mtx pm_mtx; - - #ifdef __powerpc64__ - struct slbtnode *pm_slb_tree_root; - struct slb **pm_slb; - int pm_slb_len; - #else +struct pmap { + struct pmap_statistics pm_stats; + struct mtx pm_mtx; +#ifdef __powerpc64__ + union { + /* HPT support */ + struct { + struct slbtnode *pm_slb_tree_root; + struct slb **pm_slb; + struct pvo_tree pmap_pvo; + struct pmap *pmap_phys; + cpuset_t pm_active; + int pm_slb_len; + }; + /* Radix support */ + struct { + pml1_entry_t *pm_pml1; /* KVA of root page directory */ + struct vm_radix pm_root; /* spare page table pages */ + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + uint64_t pm_pid; /* PIDR value */ + int pm_flags; + }; + } __aligned(CACHE_LINE_SIZE); +#else register_t pm_sr[16]; - #endif - cpuset_t pm_active; +#endif +}; - struct pmap *pmap_phys; - struct pvo_tree pmap_pvo; +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. + */ +#define _NPCM 3 +#define _NPCPV 168 +#define PV_CHUNK_HEADER \ + pmap_t pc_pmap; \ + TAILQ_ENTRY(pv_chunk) pc_list; \ + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ \ + TAILQ_ENTRY(pv_chunk) pc_lru; + +struct pv_chunk_header { + PV_CHUNK_HEADER +}; +struct pv_chunk { + PV_CHUNK_HEADER + struct pv_entry pc_pventry[_NPCPV]; }; struct md_page { - volatile int32_t mdpg_attrs; vm_memattr_t mdpg_cache_attrs; - struct pvo_head mdpg_pvoh; + union { + struct { + volatile int32_t mdpg_attrs; + struct pvo_head mdpg_pvoh; + }; + struct { + int pv_gen; /* (p) */ + int pv_magic; + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ + }; + }; }; #define pmap_page_get_memattr(m) ((m)->md.mdpg_cache_attrs) -#define pmap_page_is_mapped(m) (!LIST_EMPTY(&(m)->md.mdpg_pvoh)) /* * Return the VSID corresponding to a given virtual address. @@ -218,7 +284,6 @@ struct md_page { }; #define pmap_page_get_memattr(m) VM_MEMATTR_DEFAULT -#define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) #else /* @@ -243,7 +308,7 @@ extern struct pmap kernel_pmap_store; #define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) #define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, \ (pmap == kernel_pmap) ? 
"kernelpmap" : \ - "pmap", NULL, MTX_DEF) + "pmap", NULL, MTX_DEF | MTX_DUPOK) #define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) @@ -268,6 +333,9 @@ void pmap_deactivate(struct thread *); vm_paddr_t pmap_kextract(vm_offset_t); int pmap_dev_direct_mapped(vm_paddr_t, vm_size_t); boolean_t pmap_mmu_install(char *name, int prio); +bool pmap_ps_enabled(pmap_t pmap); +int pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags); +boolean_t pmap_page_is_mapped(vm_page_t m); #define vtophys(va) pmap_kextract((vm_offset_t)(va)) @@ -276,17 +344,19 @@ boolean_t pmap_mmu_install(char *name, int prio); * For more Ram increase the lmb or this value. */ -extern vm_paddr_t phys_avail[PHYS_AVAIL_SZ]; +extern vm_paddr_t phys_avail[]; extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; extern vm_offset_t msgbuf_phys; extern int pmap_bootstrapped; +extern int radix_mmu; vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size); void pmap_early_io_unmap(vm_offset_t va, vm_size_t size); void pmap_track_page(pmap_t pmap, vm_offset_t va); +void pmap_page_print_mappings(vm_page_t m); static inline int pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) diff --git a/sys/powerpc/include/pmap_private.h b/sys/powerpc/include/pmap_private.h new file mode 100644 index 00000000000..e1fab90862a --- /dev/null +++ b/sys/powerpc/include/pmap_private.h @@ -0,0 +1,314 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018, Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _MACHINE_PMAP_PRIVATE_H_ +#define _MACHINE_PMAP_PRIVATE_H_ + +#define PG_W RPTE_WIRED +#define PG_V RPTE_VALID +#define PG_MANAGED RPTE_MANAGED +#define PG_PROMOTED RPTE_PROMOTED +#define PG_M RPTE_C +#define PG_A RPTE_R +#define PG_X RPTE_EAA_X +#define PG_RW RPTE_EAA_W +#define PG_PTE_CACHE RPTE_ATTR_MASK + +#define RPTE_SHIFT 9 +#define NLS_MASK ((1UL<<5)-1) +#define RPTE_ENTRIES (1UL<> L3_PAGE_SIZE_SHIFT); +} + +static __inline vm_pindex_t +pmap_pml3e_index(vm_offset_t va) +{ + + return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK); +} + +static __inline vm_pindex_t +pmap_pml2e_index(vm_offset_t va) +{ + return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK); +} + +static __inline vm_pindex_t +pmap_pml1e_index(vm_offset_t va) +{ + return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT); +} + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & RPTE_MASK); +} + +/* Return a pointer to the PT slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va) +{ + pt_entry_t *pte; + vm_paddr_t ptepa; + + ptepa = (*l3e & NLB_MASK); + pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa); + return (&pte[pmap_pte_index(va)]); +} + +/* Return a pointer to the PD slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va) +{ + pt_entry_t *l3e; + vm_paddr_t l3pa; + + l3pa = (*l2e & NLB_MASK); + l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa); + return (&l3e[pmap_pml3e_index(va)]); +} + +/* Return a pointer to the PD slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va) +{ + pt_entry_t *l2e; + vm_paddr_t l2pa; + + l2pa = (*l1e & NLB_MASK); + + l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa); + return (&l2e[pmap_pml2e_index(va)]); +} + +static __inline pml1_entry_t * +pmap_pml1e(pmap_t pmap, vm_offset_t va) +{ + + return (&pmap->pm_pml1[pmap_pml1e_index(va)]); +} + +static pt_entry_t * +pmap_pml2e(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l1e; + + l1e = pmap_pml1e(pmap, va); + if (l1e == NULL || (*l1e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l1e_to_l2e(l1e, va)); +} + +static __inline pt_entry_t * +pmap_pml3e(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l2e; + + l2e = pmap_pml2e(pmap, va); + if (l2e == NULL || (*l2e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l2e_to_l3e(l2e, va)); +} + +static __inline pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l3e; + + l3e = pmap_pml3e(pmap, va); + if (l3e == NULL || (*l3e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l3e_to_pte(l3e, va)); +} + +#endif diff --git a/sys/powerpc/include/proc.h b/sys/powerpc/include/proc.h index d01b6cda4dc..a2a64cee29a 100644 --- a/sys/powerpc/include/proc.h +++ b/sys/powerpc/include/proc.h @@ -37,12 +37,18 @@ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ +struct pmap_invl_gen { + u_long gen; /* (k) */ + LIST_ENTRY(pmap_invl_gen) link; /* (pp) */ +}; + /* * Machine-dependent part of the proc structure */ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_msr; /* (k) */ + struct pmap_invl_gen md_invl_gen; }; struct mdproc { diff --git a/sys/powerpc/include/pte.h b/sys/powerpc/include/pte.h index 32eecaedb92..77840273041 100644 --- a/sys/powerpc/include/pte.h +++ b/sys/powerpc/include/pte.h @@ -70,6 +70,13 @@ struct pate { u_int64_t proctab; }; +/* Process table entry */ +struct prte { + u_int64_t proctab0; + u_int64_t 
proctab1; +}; + + #endif /* LOCORE */ /* 32-bit PTE definitions */ @@ -143,6 +150,10 @@ struct pate { #define RPTE_R 0x0000000000000100ULL #define RPTE_C 0x0000000000000080ULL +#define RPTE_MANAGED RPTE_SW1 +#define RPTE_WIRED RPTE_SW2 +#define RPTE_PROMOTED RPTE_SW3 + #define RPTE_ATTR_MASK 0x0000000000000030ULL #define RPTE_ATTR_MEM 0x0000000000000000ULL /* PTE M */ #define RPTE_ATTR_SAO 0x0000000000000010ULL /* PTE WIM */ @@ -157,10 +168,13 @@ struct pate { #define RPDE_VALID RPTE_VALID #define RPDE_LEAF RPTE_LEAF /* is a PTE: always 0 */ -#define RPDE_NLB_MASK 0x0FFFFFFFFFFFFF00ULL +#define RPDE_NLB_MASK 0x00FFFFFFFFFFFF00ULL #define RPDE_NLB_SHIFT 8 #define RPDE_NLS_MASK 0x000000000000001FULL +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) + #ifndef LOCORE typedef struct pte pte_t; diff --git a/sys/powerpc/include/smp.h b/sys/powerpc/include/smp.h index 1bdb8cca331..7f7e767519b 100644 --- a/sys/powerpc/include/smp.h +++ b/sys/powerpc/include/smp.h @@ -52,6 +52,7 @@ void ipi_selected(cpuset_t cpus, int ipi); struct cpuref { uintptr_t cr_hwref; u_int cr_cpuid; + u_int cr_domain; }; void pmap_cpu_bootstrap(int); diff --git a/sys/powerpc/include/spr.h b/sys/powerpc/include/spr.h index 29eedc4d27d..65ab58e7a03 100644 --- a/sys/powerpc/include/spr.h +++ b/sys/powerpc/include/spr.h @@ -91,6 +91,12 @@ * 6 for 6xx/7xx series and 8 for 8xx and 8xxx series. */ +#ifdef CONFIG_40X +#define SPR_PID 0x3b1 /* 4.. Process ID */ +#else +#define SPR_PID 0x30 /* 4.. Process ID */ +#endif + #define SPR_MQ 0x000 /* .6. 601 MQ register */ #define SPR_XER 0x001 /* 468 Fixed Point Exception Register */ #define SPR_RTCU_R 0x004 /* .6. 601 RTC Upper - Read */ @@ -117,7 +123,14 @@ #define SRR1_ISI_PFAULT 0x40000000 /* ISI page not found */ #define SRR1_ISI_NOEXECUTE 0x10000000 /* Memory marked no-execute */ #define SRR1_ISI_PP 0x08000000 /* PP bits forbid access */ +#define SPR_CFAR 0x1c /* Come From Address Register */ +#define SPR_AMR 0x1d /* Authority Mask Register */ +#define SPR_UAMOR 0x9d /* User Authority Mask Override Register */ +#define SPR_AMOR 0x15d /* Authority Mask Override Register */ + #define SPR_DECAR 0x036 /* ..8 Decrementer auto reload */ +#define SPR_IAMR 0x03D /* Instr. Authority Mask Reg */ + #define SPR_EIE 0x050 /* ..8 Exception Interrupt ??? */ #define SPR_EID 0x051 /* ..8 Exception Interrupt ??? */ #define SPR_NRI 0x052 /* ..8 Exception Interrupt ??? */ @@ -240,7 +253,11 @@ #define LPCR_PECE_EXT (1ULL << 14) /* External exceptions */ #define LPCR_PECE_DECR (1ULL << 13) /* Decrementer exceptions */ #define LPCR_PECE_ME (1ULL << 12) /* Machine Check and Hypervisor */ - /* Maintenance exceptions */ + /* Maintenance exceptions */ +#define LPCR_UPRT (1ULL << 22) /* Use Process Table (ISA 3) */ +#define LPCR_HR (1ULL << 20) /* Host Radix mode */ + + #define SPR_LPID 0x13f /* Logical Partitioning Control */ #define SPR_PTCR 0x1d0 /* Partition Table Control Register */ @@ -419,7 +436,7 @@ #define SPR_MMCR2 0x3b0 /* .6. Monitor Mode Control Register 2 */ #define SPR_MMCR2_THRESHMULT_32 0x80000000 /* Multiply MMCR0 threshold by 32 */ #define SPR_MMCR2_THRESHMULT_2 0x00000000 /* Multiply MMCR0 threshold by 2 */ -#define SPR_PID 0x3b1 /* 4.. Process ID */ + #define SPR_PMC5 0x3b1 /* .6. Performance Counter Register 5 */ #define SPR_PMC6 0x3b2 /* .6. Performance Counter Register 6 */ #define SPR_CCR0 0x3b3 /* 4.. 
Core Configuration Register 0 */ diff --git a/sys/powerpc/include/sr.h b/sys/powerpc/include/sr.h index 6917861139b..caf7fef50d8 100644 --- a/sys/powerpc/include/sr.h +++ b/sys/powerpc/include/sr.h @@ -53,7 +53,7 @@ #define KERNEL2_SEGMENT (0xfffff0 + KERNEL2_SR) #define EMPTY_SEGMENT 0xfffff0 #ifdef __powerpc64__ -#define USER_ADDR 0xeffffffff0000000UL +#define USER_ADDR 0xc00ffffff0000000UL #else #define USER_ADDR ((uintptr_t)USER_SR << ADDR_SR_SHFT) #endif diff --git a/sys/powerpc/include/vmparam.h b/sys/powerpc/include/vmparam.h index 3a710e28953..271ca75b3da 100644 --- a/sys/powerpc/include/vmparam.h +++ b/sys/powerpc/include/vmparam.h @@ -83,8 +83,8 @@ #if !defined(LOCORE) #ifdef __powerpc64__ #define VM_MIN_ADDRESS (0x0000000000000000UL) -#define VM_MAXUSER_ADDRESS (0x3ffffffffffff000UL) -#define VM_MAX_ADDRESS (0xffffffffffffffffUL) +#define VM_MAXUSER_ADDRESS (0x000fffffc0000000UL) +#define VM_MAX_ADDRESS (0xc00fffffc0000000UL) #else #define VM_MIN_ADDRESS ((vm_offset_t)0) #define VM_MAXUSER_ADDRESS VM_MAXUSER_ADDRESS32 @@ -95,7 +95,7 @@ #ifdef BOOKE #define VM_MIN_ADDRESS 0 #ifdef __powerpc64__ -#define VM_MAXUSER_ADDRESS 0x3ffffffffffff000 +#define VM_MAXUSER_ADDRESS 0x000fffffffffffff #else #define VM_MAXUSER_ADDRESS 0x7ffff000 #endif @@ -107,11 +107,24 @@ #ifdef __powerpc64__ #ifndef LOCORE -#define VM_MIN_KERNEL_ADDRESS 0xe000000000000000UL -#define VM_MAX_KERNEL_ADDRESS 0xe0000007ffffffffUL +/* + * Virtual addresses of things. Derived from the page directory and + * page table indexes from pmap.h for precision. + * + * kernel map should be able to start at 0xc008000000000000 - + * but at least the functional simulator doesn't like it + * + * 0x0000000000000000 - 0x000fffffffffffff user map + * 0xc000000000000000 - 0xc007ffffffffffff direct map + * 0xc008000000000000 - 0xc00fffffffffffff kernel map + * + */ + +#define VM_MIN_KERNEL_ADDRESS 0xc008000000000000UL +#define VM_MAX_KERNEL_ADDRESS 0xc0080007ffffffffUL #else -#define VM_MIN_KERNEL_ADDRESS 0xe000000000000000 -#define VM_MAX_KERNEL_ADDRESS 0xe0000007ffffffff +#define VM_MIN_KERNEL_ADDRESS 0xc008000000000000 +#define VM_MAX_KERNEL_ADDRESS 0xc0080007ffffffff #endif #define VM_MAX_SAFE_KERNEL_ADDRESS VM_MAX_KERNEL_ADDRESS #endif @@ -149,7 +162,7 @@ struct pmap_physseg { }; #endif -#define VM_PHYSSEG_MAX 16 /* 1? */ +#define VM_PHYSSEG_MAX 63 /* 1? */ /* * The physical address space is densely populated on 32-bit systems, @@ -180,14 +193,33 @@ struct pmap_physseg { /* * The largest allocation size is 4MB. */ +#ifdef __powerpc64__ +#define VM_NFREEORDER 13 +#else #define VM_NFREEORDER 11 +#endif +#ifndef VM_NRESERVLEVEL +#ifdef __powerpc64__ +#define VM_NRESERVLEVEL 1 +#else /* * Disable superpage reservations. */ -#ifndef VM_NRESERVLEVEL #define VM_NRESERVLEVEL 0 #endif +#endif + +/* + * Level 0 reservations consist of 512 pages. + */ +#ifndef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER 9 +#endif + +#ifdef SMP +#define PA_LOCK_COUNT 256 +#endif #ifndef VM_INITIAL_PAGEIN #define VM_INITIAL_PAGEIN 16 @@ -220,7 +252,19 @@ struct pmap_physseg { VM_MIN_KERNEL_ADDRESS + 1) * 2 / 5) #endif +#ifdef __powerpc64__ +#define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */ +#else #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ +#endif + +/* + * Use a fairly large batch size since we expect ppc64 systems to have lots of + * memory. 
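+ * The override below applies only to powerpc64; 32-bit systems keep
+ * the default batch size.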
+ */ +#ifdef __powerpc64__ +#define VM_BATCHQUEUE_SIZE 31 +#endif /* * On 32-bit OEA, the only purpose for which sf_buf is used is to implement @@ -243,7 +287,8 @@ struct pmap_physseg { #ifndef LOCORE #ifdef __powerpc64__ #define DMAP_BASE_ADDRESS 0xc000000000000000UL -#define DMAP_MAX_ADDRESS 0xcfffffffffffffffUL +#define DMAP_MIN_ADDRESS DMAP_BASE_ADDRESS +#define DMAP_MAX_ADDRESS 0xc007ffffffffffffUL #else #define DMAP_BASE_ADDRESS 0x00000000UL #define DMAP_MAX_ADDRESS 0xbfffffffUL diff --git a/sys/powerpc/ofw/ofw_machdep.c b/sys/powerpc/ofw/ofw_machdep.c index c647ce81790..6ee49378584 100644 --- a/sys/powerpc/ofw/ofw_machdep.c +++ b/sys/powerpc/ofw/ofw_machdep.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -222,9 +223,57 @@ parse_ofw_memory(phandle_t node, const char *prop, struct mem_region *output) j++; } - sz = j*sizeof(output[0]); - return (sz); + return (j); +} + +static int +parse_numa_ofw_memory(phandle_t node, const char *prop, + struct numa_mem_region *output) +{ + cell_t address_cells, size_cells; + cell_t OFmem[4 * PHYS_AVAIL_SZ]; + int sz, i, j; + phandle_t phandle; + + sz = 0; + + /* + * Get #address-cells from root node, defaulting to 1 if it cannot + * be found. + */ + phandle = OF_finddevice("/"); + if (OF_getencprop(phandle, "#address-cells", &address_cells, + sizeof(address_cells)) < (ssize_t)sizeof(address_cells)) + address_cells = 1; + if (OF_getencprop(phandle, "#size-cells", &size_cells, + sizeof(size_cells)) < (ssize_t)sizeof(size_cells)) + size_cells = 1; + + /* + * Get memory. + */ + if (node == -1 || (sz = OF_getencprop(node, prop, + OFmem, sizeof(OFmem))) <= 0) + panic("Physical memory map not found"); + + i = 0; + j = 0; + while (i < sz/sizeof(cell_t)) { + output[j].mr_start = OFmem[i++]; + if (address_cells == 2) { + output[j].mr_start <<= 32; + output[j].mr_start += OFmem[i++]; + } + output[j].mr_size = OFmem[i++]; + if (size_cells == 2) { + output[j].mr_size <<= 32; + output[j].mr_size += OFmem[i++]; + } + j++; + } + + return (j); } #ifdef FDT @@ -402,6 +451,51 @@ excise_fdt_reserved(struct mem_region *avail, int asz) } #endif +/* + * This is called during powerpc_init, before the system is really initialized. + * It shall provide the total and the available regions of RAM. + * The available regions need not take the kernel into account. + */ +void +ofw_numa_mem_regions(struct numa_mem_region *memp, int *memsz) +{ + phandle_t phandle; + int res, count, msz; + char name[31]; + cell_t associativity[5]; + struct numa_mem_region *curmemp; + + msz = 0; + /* + * Get memory from all the /memory nodes. + */ + for (phandle = OF_child(OF_peer(0)); phandle != 0; + phandle = OF_peer(phandle)) { + if (OF_getprop(phandle, "name", name, sizeof(name)) <= 0) + continue; + if (strncmp(name, "memory@", strlen("memory@")) != 0) + continue; + + count = parse_numa_ofw_memory(phandle, "reg", &memp[msz]); + if (count == 0) + continue; + curmemp = &memp[msz]; + res = OF_getproplen(phandle, "ibm,associativity"); + if (res <= 0) + continue; + MPASS(count == 1); + OF_getencprop(phandle, "ibm,associativity", + associativity, res); + curmemp->mr_domain = associativity[3] - 1; + if (bootverbose) + printf("%s %#lx-%#lx domain(%lu)\n", + name, curmemp->mr_start, + curmemp->mr_start + curmemp->mr_size, + curmemp->mr_domain); + msz += count; + } + *memsz = msz; +} /* * This is called during powerpc_init, before the system is really initialized. * It shall provide the total and the available regions of RAM. 
@@ -430,7 +524,7 @@ ofw_mem_regions(struct mem_region *memp, int *memsz, continue; res = parse_ofw_memory(phandle, "reg", &memp[msz]); - msz += res/sizeof(struct mem_region); + msz += res; /* * On POWER9 Systems we might have both linux,usable-memory and @@ -446,7 +540,7 @@ ofw_mem_regions(struct mem_region *memp, int *memsz, &availp[asz]); else res = parse_ofw_memory(phandle, "reg", &availp[asz]); - asz += res/sizeof(struct mem_region); + asz += res; } #ifdef FDT diff --git a/sys/powerpc/ofw/ofw_pcibus.c b/sys/powerpc/ofw/ofw_pcibus.c index ed4e66dde20..7bbc4275c2f 100644 --- a/sys/powerpc/ofw/ofw_pcibus.c +++ b/sys/powerpc/ofw/ofw_pcibus.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -80,6 +81,8 @@ static device_method_t ofw_pcibus_methods[] = { DEVMETHOD(bus_child_deleted, ofw_pcibus_child_deleted), DEVMETHOD(bus_child_pnpinfo_str, ofw_pcibus_child_pnpinfo_str_method), DEVMETHOD(bus_rescan, bus_null_rescan), + DEVMETHOD(bus_get_cpus, ofw_pcibus_get_cpus), + DEVMETHOD(bus_get_domain, ofw_pcibus_get_domain), /* PCI interface */ DEVMETHOD(pci_alloc_devinfo, ofw_pcibus_alloc_devinfo), @@ -382,3 +385,76 @@ ofw_pcibus_get_devinfo(device_t bus, device_t dev) return (&dinfo->opd_obdinfo); } +static int +ofw_pcibus_parse_associativity(device_t dev, int *domain) +{ + phandle_t node; + cell_t associativity[5]; + int res; + + if ((node = ofw_bus_get_node(dev)) == -1) { + device_printf(dev, "no ofw node found\n"); + return (ENXIO); + } + res = OF_getproplen(node, "ibm,associativity"); + if (res <= 0) + return (ENXIO); + OF_getencprop(node, "ibm,associativity", + associativity, res); + + *domain = associativity[3] - 1; + if (bootverbose) + device_printf(dev, "domain(%d)\n", *domain); + return (0); +} + +int +ofw_pcibus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, + cpuset_t *cpuset) +{ + int d, error; + + error = ofw_pcibus_parse_associativity(child, &d); + if (error) + return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); + + switch (op) { + case LOCAL_CPUS: + if (setsize != sizeof(cpuset_t)) + return (EINVAL); + *cpuset = cpuset_domain[d]; + return (0); + case INTR_CPUS: + error = bus_generic_get_cpus(dev, child, op, setsize, cpuset); + if (error != 0) + return (error); + if (setsize != sizeof(cpuset_t)) + return (EINVAL); + CPU_AND(cpuset, &cpuset_domain[d]); + return (0); + default: + return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); + } + return (0); +} + +/* + * Fetch the NUMA domain for the given device 'dev'. + * + * If a device has a _PXM method, map that to a NUMA domain. + * Otherwise, pass the request up to the parent. + * If there's no matching domain or the domain cannot be + * determined, return ENOENT. 
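+ * On Open Firmware the domain comes from the "ibm,associativity"
+ * property rather than an ACPI _PXM method.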
+ */ +int +ofw_pcibus_get_domain(device_t dev, device_t child, int *domain) +{ + int d, error; + + error = ofw_pcibus_parse_associativity(child, &d); + /* No ofw node; go up a level */ + if (error) + return (bus_generic_get_domain(dev, child, domain)); + *domain = d; + return (0); +} diff --git a/sys/powerpc/powernv/opal_dev.c b/sys/powerpc/powernv/opal_dev.c index f3c386772a1..09ab3eeb021 100644 --- a/sys/powerpc/powernv/opal_dev.c +++ b/sys/powerpc/powernv/opal_dev.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -92,7 +93,35 @@ static driver_t opaldev_driver = { static devclass_t opaldev_devclass; -DRIVER_MODULE(opaldev, ofwbus, opaldev_driver, opaldev_devclass, 0, 0); +EARLY_DRIVER_MODULE(opaldev, ofwbus, opaldev_driver, opaldev_devclass, 0, 0, + BUS_PASS_BUS); + +static void opal_heartbeat(void); +static struct proc *opal_hb_proc; +static struct kproc_desc opal_heartbeat_kp = { + "opal_heartbeat", + opal_heartbeat, + &opal_hb_proc +}; + +SYSINIT(opal_heartbeat_setup, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, + &opal_heartbeat_kp); + +static int opal_heartbeat_ms; + +static void +opal_heartbeat(void) +{ + + if (opal_heartbeat_ms == 0) + kproc_exit(0); + + while (1) { + opal_call(OPAL_POLL_EVENTS, 0); /* Turn the OPAL state crank */ + tsleep(opal_hb_proc, 0, "opal", + MSEC_2_TICKS(opal_heartbeat_ms)); + } +} static int opaldev_probe(device_t dev) @@ -153,6 +182,8 @@ opaldev_attach(device_t dev) EVENTHANDLER_REGISTER(shutdown_final, opal_shutdown, NULL, SHUTDOWN_PRI_LAST); + OF_getencprop(ofw_bus_get_node(dev), "ibm,heartbeat-ms", + &opal_heartbeat_ms, sizeof(opal_heartbeat_ms)); /* Bind to interrupts */ for (i = 0; (irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, RF_ACTIVE)) != NULL; i++) @@ -313,6 +344,7 @@ opal_intr(void *xintr) opal_call(OPAL_HANDLE_INTERRUPT, (uint32_t)(uint64_t)xintr, vtophys(&events)); /* XXX: do something useful with this information */ + wakeup(opal_hb_proc); /* Notify heartbeat monitor */ } diff --git a/sys/powerpc/powernv/opal_pci.c b/sys/powerpc/powernv/opal_pci.c index 50ee3904dc7..096836faae9 100644 --- a/sys/powerpc/powernv/opal_pci.c +++ b/sys/powerpc/powernv/opal_pci.c @@ -143,12 +143,18 @@ static device_method_t opalpci_methods[] = { DEVMETHOD(pcib_map_msi, opalpci_map_msi), DEVMETHOD(pcib_route_interrupt, opalpci_route_interrupt), + /* Bus interface */ + DEVMETHOD(bus_get_cpus, ofw_pcibus_get_cpus), + DEVMETHOD(bus_get_domain, ofw_pcibus_get_domain), + /* PIC interface for MSIs */ DEVMETHOD(pic_enable, opalpic_pic_enable), DEVMETHOD(pic_eoi, opalpic_pic_eoi), /* Bus interface */ DEVMETHOD(bus_get_dma_tag, opalpci_get_dma_tag), + DEVMETHOD(bus_get_cpus, ofw_pcibus_get_cpus), + DEVMETHOD(bus_get_domain, ofw_pcibus_get_domain), DEVMETHOD_END }; @@ -367,7 +373,7 @@ opalpci_attach(device_t dev) tce_size = max_tce_size(dev); maxmem = roundup2(powerpc_ptob(Maxmem), tce_size); entries = round_pow2(maxmem / tce_size); - tce_tbl_size = max(entries * sizeof(uint64_t), 4096); + tce_tbl_size = MAX(entries * sizeof(uint64_t), 4096); if (entries > OPAL_PCI_TCE_MAX_ENTRIES) panic("POWERNV supports only %jdGB of memory space\n", (uintmax_t)((OPAL_PCI_TCE_MAX_ENTRIES * tce_size) >> 30)); diff --git a/sys/powerpc/powernv/platform_powernv.c b/sys/powerpc/powernv/platform_powernv.c index 1291eb02e4e..43cdcc8108b 100644 --- a/sys/powerpc/powernv/platform_powernv.c +++ b/sys/powerpc/powernv/platform_powernv.c @@ -65,6 +65,7 @@ static int powernv_probe(platform_t); static int powernv_attach(platform_t); 
void powernv_mem_regions(platform_t, struct mem_region *phys, int *physsz, struct mem_region *avail, int *availsz); +static void powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz); static u_long powernv_timebase_freq(platform_t, struct cpuref *cpuref); static int powernv_smp_first_cpu(platform_t, struct cpuref *cpuref); static int powernv_smp_next_cpu(platform_t, struct cpuref *cpuref); @@ -83,6 +84,7 @@ static platform_method_t powernv_methods[] = { PLATFORMMETHOD(platform_probe, powernv_probe), PLATFORMMETHOD(platform_attach, powernv_attach), PLATFORMMETHOD(platform_mem_regions, powernv_mem_regions), + PLATFORMMETHOD(platform_numa_mem_regions, powernv_numa_mem_regions), PLATFORMMETHOD(platform_timebase_freq, powernv_timebase_freq), PLATFORMMETHOD(platform_smp_ap_init, powernv_smp_ap_init), @@ -250,6 +252,13 @@ powernv_mem_regions(platform_t plat, struct mem_region *phys, int *physsz, ofw_mem_regions(phys, physsz, avail, availsz); } +static void +powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz) +{ + + ofw_numa_mem_regions(phys, physsz); +} + static u_long powernv_timebase_freq(platform_t plat, struct cpuref *cpuref) { @@ -313,15 +322,13 @@ powernv_cpuref_init(void) if (res > 0 && strcmp(buf, "cpu") == 0) { res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s"); if (res > 0) { - - OF_getencprop(cpu, "ibm,ppc-interrupt-server#s", interrupt_servers, res); for (a = 0; a < res/sizeof(cell_t); a++) { tmp_cpuref[tmp_cpuref_cnt].cr_hwref = interrupt_servers[a]; tmp_cpuref[tmp_cpuref_cnt].cr_cpuid = tmp_cpuref_cnt; - + tmp_cpuref[tmp_cpuref_cnt].cr_domain = interrupt_servers[a] >> 11; if (interrupt_servers[a] == (uint32_t)powernv_boot_pir) bsp = tmp_cpuref_cnt; @@ -335,11 +342,13 @@ powernv_cpuref_init(void) for (a = bsp; a < tmp_cpuref_cnt; a++) { platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref; platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt; + platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain; platform_cpuref_cnt++; } for (a = 0; a < bsp; a++) { platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref; platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt; + platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain; platform_cpuref_cnt++; } @@ -356,6 +365,7 @@ powernv_smp_first_cpu(platform_t plat, struct cpuref *cpuref) cpuref->cr_cpuid = 0; cpuref->cr_hwref = platform_cpuref[0].cr_hwref; + cpuref->cr_domain = platform_cpuref[0].cr_domain; return (0); } diff --git a/sys/powerpc/powerpc/busdma_machdep.c b/sys/powerpc/powerpc/busdma_machdep.c index cc5212198da..63eb16e884d 100644 --- a/sys/powerpc/powerpc/busdma_machdep.c +++ b/sys/powerpc/powerpc/busdma_machdep.c @@ -350,7 +350,7 @@ bus_dma_tag_set_domain(bus_dma_tag_t dmat, int domain) int bus_dma_tag_destroy(bus_dma_tag_t dmat) { - bus_dma_tag_t dmat_copy; + bus_dma_tag_t dmat_copy __unused; int error; error = 0; @@ -515,11 +515,9 @@ bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags, if (flags & BUS_DMA_ZERO) mflags |= M_ZERO; -#ifdef NOTYET if (flags & BUS_DMA_NOCACHE) attr = VM_MEMATTR_UNCACHEABLE; else -#endif attr = VM_MEMATTR_DEFAULT; /* diff --git a/sys/powerpc/powerpc/cpu.c b/sys/powerpc/powerpc/cpu.c index 8d54593cc32..b35da1c7617 100644 --- a/sys/powerpc/powerpc/cpu.c +++ b/sys/powerpc/powerpc/cpu.c @@ -185,7 +185,7 @@ static const struct cputab models[] = { PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_HTM | PPC_FEATURE2_DSCR | PPC_FEATURE2_ISEL | 
PPC_FEATURE2_TAR | PPC_FEATURE2_HAS_VEC_CRYPTO | PPC_FEATURE2_ARCH_3_00 | PPC_FEATURE2_HAS_IEEE128 | - PPC_FEATURE2_DARN, cpu_powerx_setup }, + PPC_FEATURE2_DARN | PPC_FEATURE2_MMU_RADIX, cpu_powerx_setup }, { "Motorola PowerPC 7400", MPC7400, REVFMT_MAJMIN, PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU, 0, cpu_6xx_setup }, { "Motorola PowerPC 7410", MPC7410, REVFMT_MAJMIN, @@ -674,6 +674,8 @@ cpu_powerx_setup(int cpuid, uint16_t vers) cpu_idle_hook = cpu_idle_power9; mtspr(SPR_LPCR, mfspr(SPR_LPCR) | LPCR_PECE_WAKESET); isync(); + mtspr(SPR_HID0, mfspr(SPR_HID0) | HID0_SPECEXEC); + isync(); break; default: return; diff --git a/sys/powerpc/powerpc/intr_machdep.c b/sys/powerpc/powerpc/intr_machdep.c index 39c315f431f..34d53745f69 100644 --- a/sys/powerpc/powerpc/intr_machdep.c +++ b/sys/powerpc/powerpc/intr_machdep.c @@ -97,16 +97,17 @@ struct powerpc_intr { struct intr_event *event; long *cntp; void *priv; /* PIC-private data */ - u_int irq; device_t pic; + u_int irq; u_int intline; u_int vector; u_int cntindex; - cpuset_t cpu; - enum intr_trigger trig; - enum intr_polarity pol; int fwcode; int ipi; + int pi_domain; + enum intr_trigger trig; + enum intr_polarity pol; + cpuset_t pi_cpuset; }; struct pic { @@ -203,7 +204,7 @@ smp_intr_init(void *dummy __unused) for (vector = 0; vector < nvectors; vector++) { i = powerpc_intrs[vector]; if (i != NULL && i->event != NULL && i->pic == root_pic) - PIC_BIND(i->pic, i->intline, i->cpu, &i->priv); + PIC_BIND(i->pic, i->intline, i->pi_cpuset, &i->priv); } } SYSINIT(smp_intr_init, SI_SUB_SMP, SI_ORDER_ANY, smp_intr_init, NULL); @@ -256,9 +257,9 @@ intr_lookup(u_int irq) i->ipi = 0; #ifdef SMP - i->cpu = all_cpus; + i->pi_cpuset = all_cpus; #else - CPU_SETOF(0, &i->cpu); + CPU_SETOF(0, &i->pi_cpuset); #endif for (vector = 0; vector < num_io_irqs && vector <= nvectors; @@ -347,12 +348,12 @@ powerpc_assign_intr_cpu(void *arg, int cpu) struct powerpc_intr *i = arg; if (cpu == NOCPU) - i->cpu = all_cpus; + i->pi_cpuset = all_cpus; else - CPU_SETOF(cpu, &i->cpu); + CPU_SETOF(cpu, &i->pi_cpuset); if (!cold && i->pic != NULL && i->pic == root_pic) - PIC_BIND(i->pic, i->intline, i->cpu, &i->priv); + PIC_BIND(i->pic, i->intline, i->pi_cpuset, &i->priv); return (0); #else @@ -469,7 +470,8 @@ powerpc_enable_intr(void) error = powerpc_setup_intr("IPI", MAP_IRQ(piclist[n].node, piclist[n].irqs), powerpc_ipi_handler, NULL, NULL, - INTR_TYPE_MISC | INTR_EXCL, &ipi_cookie); + INTR_TYPE_MISC | INTR_EXCL, &ipi_cookie, + 0 /* domain XXX */); if (error) { printf("unable to setup IPI handler\n"); return (error); @@ -512,7 +514,8 @@ powerpc_enable_intr(void) int powerpc_setup_intr(const char *name, u_int irq, driver_filter_t filter, - driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) + driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep, + int domain) { struct powerpc_intr *i; int error, enable = 0; @@ -533,7 +536,13 @@ powerpc_setup_intr(const char *name, u_int irq, driver_filter_t filter, error = intr_event_add_handler(i->event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); - + if (error) + return (error); + i->pi_domain = domain; + if (strcmp(name, "IPI") != 0) { + CPU_ZERO(&i->pi_cpuset); + CPU_COPY(&cpuset_domain[domain], &i->pi_cpuset); + } mtx_lock(&intr_table_lock); intrcnt_setname(i->event->ie_fullname, i->cntindex); mtx_unlock(&intr_table_lock); @@ -551,7 +560,7 @@ powerpc_setup_intr(const char *name, u_int irq, driver_filter_t filter, PIC_CONFIG(i->pic, i->intline, i->trig, i->pol); if (i->pic == 
root_pic) - PIC_BIND(i->pic, i->intline, i->cpu, &i->priv); + PIC_BIND(i->pic, i->intline, i->pi_cpuset, &i->priv); if (enable) PIC_ENABLE(i->pic, i->intline, i->vector, diff --git a/sys/powerpc/powerpc/machdep.c b/sys/powerpc/powerpc/machdep.c index 8567876e248..1fc775f5509 100644 --- a/sys/powerpc/powerpc/machdep.c +++ b/sys/powerpc/powerpc/machdep.c @@ -123,12 +123,24 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include +/* + * The number of PHYSMAP entries must be one less than the number of + * PHYSSEG entries because the PHYSMAP entry that spans the largest + * physical address that is accessible by ISA DMA is split into two + * PHYSSEG entries. + */ +#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) + + +vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; + int cold = 1; #ifdef __powerpc64__ int cacheline_size = 128; @@ -143,7 +155,7 @@ extern vm_paddr_t kernload; extern void *ap_pcpu; -struct pcpu __pcpu[MAXCPU]; +struct pcpu __pcpu[MAXCPU] __aligned(PAGE_SIZE); static char init_kenv[2048]; static struct trapframe frame0; diff --git a/sys/powerpc/powerpc/mmu_if.m b/sys/powerpc/powerpc/mmu_if.m index 4320fc7299e..94de7e3f86b 100644 --- a/sys/powerpc/powerpc/mmu_if.m +++ b/sys/powerpc/powerpc/mmu_if.m @@ -130,6 +130,11 @@ CODE { { return (0); } + + static boolean_t mmu_null_ps_enabled(mmu_t mmu) + { + return (FALSE); + } }; @@ -1014,3 +1019,12 @@ METHOD int change_attr { vm_memattr_t _mode; } DEFAULT mmu_null_change_attr; +METHOD boolean_t page_is_mapped { + mmu_t _mmu; + vm_page_t _pg; +} DEFAULT; + +METHOD boolean_t ps_enabled { + mmu_t _mmu; + pmap_t _pmap; +} DEFAULT mmu_null_ps_enabled; diff --git a/sys/powerpc/powerpc/mp_machdep.c b/sys/powerpc/powerpc/mp_machdep.c index 2a7dd91cfba..e64d968593e 100644 --- a/sys/powerpc/powerpc/mp_machdep.c +++ b/sys/powerpc/powerpc/mp_machdep.c @@ -182,6 +182,15 @@ cpu_mp_start(void) pc->pc_bsp = 1; } pc->pc_hwref = cpu.cr_hwref; + + if (vm_ndomains > 1) + pc->pc_domain = cpu.cr_domain; + else + pc->pc_domain = 0; + + CPU_SET(pc->pc_cpuid, &cpuset_domain[pc->pc_domain]); + KASSERT(pc->pc_domain < MAXMEMDOM, ("bad domain value %d\n", + pc->pc_domain)); CPU_SET(pc->pc_cpuid, &all_cpus); next: error = platform_smp_next_cpu(&cpu); @@ -205,7 +214,7 @@ cpu_mp_announce(void) pc = pcpu_find(i); if (pc == NULL) continue; - printf("cpu%d: dev=%x", i, (int)pc->pc_hwref); + printf("cpu%d: dev=%x domain=%d ", i, (int)pc->pc_hwref, pc->pc_domain); if (pc->pc_bsp) printf(" (BSP)"); printf("\n"); diff --git a/sys/powerpc/powerpc/nexus.c b/sys/powerpc/powerpc/nexus.c index f5a3da8eb27..d48684d955b 100644 --- a/sys/powerpc/powerpc/nexus.c +++ b/sys/powerpc/powerpc/nexus.c @@ -38,11 +38,13 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -67,6 +69,11 @@ static bus_teardown_intr_t nexus_teardown_intr; static bus_activate_resource_t nexus_activate_resource; static bus_deactivate_resource_t nexus_deactivate_resource; static bus_space_tag_t nexus_get_bus_tag(device_t, device_t); +static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t, + cpuset_t *); +static int nexus_translate_resource(device_t bus, int type, rman_res_t start, + rman_res_t *newstart); + #ifdef SMP static bus_bind_intr_t nexus_bind_intr; #endif @@ -87,8 +94,10 @@ static device_method_t nexus_methods[] = { #ifdef SMP DEVMETHOD(bus_bind_intr, nexus_bind_intr), #endif + DEVMETHOD(bus_translate_resource, nexus_translate_resource), DEVMETHOD(bus_config_intr, nexus_config_intr), 
DEVMETHOD(bus_get_bus_tag, nexus_get_bus_tag), + DEVMETHOD(bus_get_cpus, nexus_get_cpus), /* ofw_bus interface */ DEVMETHOD(ofw_bus_map_intr, nexus_ofw_map_intr), @@ -127,11 +136,13 @@ nexus_setup_intr(device_t bus __unused, device_t child, struct resource *r, int flags, driver_filter_t *filt, driver_intr_t *intr, void *arg, void **cookiep) { - int error; + int error, domain; if (r == NULL) panic("%s: NULL interrupt resource!", __func__); + if (cookiep != NULL) + *cookiep = NULL; if ((rman_get_flags(r) & RF_SHAREABLE) == 0) flags |= INTR_EXCL; @@ -140,8 +151,13 @@ nexus_setup_intr(device_t bus __unused, device_t child, struct resource *r, if (error) return (error); + if (bus_get_domain(child, &domain) != 0) { + if(bootverbose) + device_printf(child, "no domain found\n"); + domain = 0; + } error = powerpc_setup_intr(device_get_nameunit(child), - rman_get_start(r), filt, intr, arg, flags, cookiep); + rman_get_start(r), filt, intr, arg, flags, cookiep, domain); return (error); } @@ -164,6 +180,24 @@ nexus_get_bus_tag(device_t bus __unused, device_t child __unused) return(&bs_be_tag); } +static int +nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, + cpuset_t *cpuset) +{ + + switch (op) { +#ifdef SMP + case INTR_CPUS: + if (setsize != sizeof(cpuset_t)) + return (EINVAL); + *cpuset = all_cpus; + return (0); +#endif + default: + return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); + } +} + #ifdef SMP static int nexus_bind_intr(device_t bus __unused, device_t child __unused, @@ -234,3 +268,11 @@ nexus_deactivate_resource(device_t bus __unused, device_t child __unused, return (rman_deactivate_resource(r)); } +static int +nexus_translate_resource(device_t bus, int type, rman_res_t start, + rman_res_t *newstart) +{ + + *newstart = start; + return (0); +} diff --git a/sys/powerpc/powerpc/platform.c b/sys/powerpc/powerpc/platform.c index 252978f47a3..85035a7ef1d 100644 --- a/sys/powerpc/powerpc/platform.c +++ b/sys/powerpc/powerpc/platform.c @@ -48,13 +48,16 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include +#include #include #include #include #include #include +#include #include "platform_if.h" @@ -67,9 +70,12 @@ static char plat_name[64] = ""; SYSCTL_STRING(_hw, OID_AUTO, platform, CTLFLAG_RD | CTLFLAG_TUN, plat_name, 0, "Platform currently in use"); +static struct mem_affinity mem_info[VM_PHYSSEG_MAX + 1]; +static int vm_locality_table[MAXMEMDOM * MAXMEMDOM]; static struct mem_region pregions[PHYS_AVAIL_SZ]; +static struct numa_mem_region numa_pregions[PHYS_AVAIL_SZ]; static struct mem_region aregions[PHYS_AVAIL_SZ]; -static int npregions, naregions; +static int nnumapregions, npregions, naregions; /* * Memory region utilities: determine if two regions overlap, @@ -112,6 +118,54 @@ mr_cmp(const void *a, const void *b) return (0); } +void +numa_mem_regions(struct numa_mem_region **phys, int *physsz) +{ + struct mem_affinity *mi; + int i, j, maxdom, ndomain, offset; + + nnumapregions = 0; + PLATFORM_NUMA_MEM_REGIONS(plat_obj, numa_pregions, &nnumapregions); + + if (physsz != NULL) + *physsz = nnumapregions; + if (phys != NULL) + *phys = numa_pregions; + if (physsz == NULL || phys == NULL) { + printf("unset value\n"); + return; + } + maxdom = 0; + for (i = 0; i < nnumapregions; i++) + if (numa_pregions[i].mr_domain > maxdom) + maxdom = numa_pregions[i].mr_domain; + + mi = mem_info; + for (i = 0; i < nnumapregions; i++, mi++) { + mi->start = numa_pregions[i].mr_start; + mi->end = numa_pregions[i].mr_start + numa_pregions[i].mr_size; + mi->domain = 
numa_pregions[i].mr_domain; + } + offset = 0; + vm_locality_table[offset] = 10; + ndomain = maxdom + 1; + if (ndomain > 1) { + for (i = 0; i < ndomain; i++) { + for (j = 0; j < ndomain; j++) { + /* + * Not sure what these values should actually be + */ + if (i == j) + vm_locality_table[offset] = 10; + else + vm_locality_table[offset] = 21; + offset++; + } + } + } + vm_phys_register_domains(ndomain, mem_info, vm_locality_table); +} + void mem_regions(struct mem_region **phys, int *physsz, struct mem_region **avail, int *availsz) @@ -252,7 +306,7 @@ platform_smp_probe_threads(void) struct cpu_group * cpu_topo(void) { - return (PLATFORM_SMP_TOPO(plat_obj)); + return (PLATFORM_SMP_TOPO(plat_obj)); } #endif diff --git a/sys/powerpc/powerpc/platform_if.m b/sys/powerpc/powerpc/platform_if.m index 33da5cc0298..c2ae28fe523 100644 --- a/sys/powerpc/powerpc/platform_if.m +++ b/sys/powerpc/powerpc/platform_if.m @@ -130,6 +130,22 @@ METHOD void mem_regions { int *_availsz; }; + +/** + * @brief Return the system's physical memory map. + * + * It shall provide the total RAM with the corresponding domains. + * + * @param _memp Array of physical memory chunks + * @param _memsz Number of physical memory chunks + */ + +METHOD void numa_mem_regions { + platform_t _plat; + struct numa_mem_region *_memp; + int *_memsz; +}; + /** * @brief Return the maximum address accessible in real mode * (for use with hypervisors) diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c index 3cfa741d2dd..099c8c0d8e3 100644 --- a/sys/powerpc/powerpc/pmap_dispatch.c +++ b/sys/powerpc/powerpc/pmap_dispatch.c @@ -75,7 +75,6 @@ struct pmap kernel_pmap_store; vm_offset_t msgbuf_phys; vm_offset_t kernel_vm_end; -vm_paddr_t phys_avail[PHYS_AVAIL_SZ]; vm_offset_t virtual_avail; vm_offset_t virtual_end; @@ -589,6 +588,20 @@ pmap_change_attr(vm_offset_t addr, vm_size_t size, vm_memattr_t mode) return (MMU_CHANGE_ATTR(mmu_obj, addr, size, mode)); } +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + return (MMU_PAGE_IS_MAPPED(mmu_obj, m)); +} + +bool +pmap_ps_enabled(pmap_t pmap) +{ + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + return (MMU_PS_ENABLED(mmu_obj, pmap)); +} + /* * MMU install routines. Highest priority wins, equal priority also * overrides allowing last-set to win. diff --git a/sys/powerpc/powerpc/trap.c b/sys/powerpc/powerpc/trap.c index 111e69a7b32..7b5ff87fed0 100644 --- a/sys/powerpc/powerpc/trap.c +++ b/sys/powerpc/powerpc/trap.c @@ -262,7 +262,9 @@ trap(struct trapframe *frame) #if defined(__powerpc64__) && defined(AIM) case EXC_ISE: case EXC_DSE: - if (handle_user_slb_spill(&p->p_vmspace->vm_pmap, + /* DSE/ISE are automatically fatal with radix pmap. */ + if (radix_mmu || + handle_user_slb_spill(&p->p_vmspace->vm_pmap, (type == EXC_ISE) ? frame->srr0 : frame->dar) != 0){ sig = SIGSEGV; ucode = SEGV_MAPERR; @@ -422,6 +424,9 @@ trap(struct trapframe *frame) break; #if defined(__powerpc64__) && defined(AIM) case EXC_DSE: + /* DSE on radix mmu is automatically fatal. 
*/ + if (radix_mmu) + break; if (td->td_pcb->pcb_cpu.aim.usr_vsid != 0 && (frame->dar & SEGMENT_MASK) == USER_ADDR) { __asm __volatile ("slbmte %0, %1" :: @@ -802,7 +807,33 @@ trap_pfault(struct trapframe *frame, int user) else ftype = VM_PROT_READ; } +#if defined(__powerpc64__) && defined(AIM) + if (radix_mmu && pmap_nofault(&p->p_vmspace->vm_pmap, eva, ftype) == 0) + return (0); +#endif + if (__predict_false((td->td_pflags & TDP_NOFAULTING) == 0)) { + /* + * If we get a page fault while in a critical section, then + * it is most likely a fatal kernel page fault. The kernel + * is already going to panic trying to get a sleep lock to + * do the VM lookup, so just consider it a fatal trap so the + * kernel can print out a useful trap message and even get + * to the debugger. + * + * If we get a page fault while holding a non-sleepable + * lock, then it is most likely a fatal kernel page fault. + * If WITNESS is enabled, then it's going to whine about + * bogus LORs with various VM locks, so just skip to the + * fatal trap handling directly. + */ + if (td->td_critnest != 0 || + WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, + "Kernel page fault") != 0) { + trap_fatal(frame); + return (-1); + } + } if (user) { KASSERT(p->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); map = &p->p_vmspace->vm_map; @@ -830,7 +861,7 @@ trap_pfault(struct trapframe *frame, int user) if (!user && handle_onfault(frame)) return (0); - return (SIGSEGV); + return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } /* diff --git a/sys/powerpc/pseries/xics.c b/sys/powerpc/pseries/xics.c index fc9a82dd2b4..419eb54d06c 100644 --- a/sys/powerpc/pseries/xics.c +++ b/sys/powerpc/pseries/xics.c @@ -223,21 +223,22 @@ xicp_attach(device_t dev) return (ENXIO); } - if (OF_hasprop(phandle, "ibm,interrupt-server-ranges")) { - OF_getencprop(phandle, "ibm,interrupt-server-ranges", - sc->cpu_range, sizeof(sc->cpu_range)); - sc->cpu_range[1] += sc->cpu_range[0]; - device_printf(dev, "Handling CPUs %d-%d\n", sc->cpu_range[0], - sc->cpu_range[1]-1); #ifdef POWERNV - } else if (ofw_bus_is_compatible(dev, "ibm,opal-intc")) { + if (ofw_bus_is_compatible(dev, "ibm,opal-intc")) { /* * For now run POWER9 XIVE interrupt controller in XICS * compatibility mode. 
*/ sc->xics_emu = true; opal_call(OPAL_XIVE_RESET, OPAL_XIVE_XICS_MODE_EMU); + } #endif + if (OF_hasprop(phandle, "ibm,interrupt-server-ranges")) { + OF_getencprop(phandle, "ibm,interrupt-server-ranges", + sc->cpu_range, sizeof(sc->cpu_range)); + sc->cpu_range[1] += sc->cpu_range[0]; + device_printf(dev, "Handling CPUs %d-%d\n", sc->cpu_range[0], + sc->cpu_range[1]-1); } else { sc->cpu_range[0] = 0; sc->cpu_range[1] = mp_ncpus; diff --git a/sys/sys/bus.h b/sys/sys/bus.h index 4ad4231a951..bf91fc49636 100644 --- a/sys/sys/bus.h +++ b/sys/sys/bus.h @@ -426,6 +426,8 @@ struct resource * bus_generic_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); +int bus_generic_translate_resource(device_t dev, int type, rman_res_t start, + rman_res_t *newstart); int bus_generic_attach(device_t dev); int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu); diff --git a/sys/sys/pcpu.h b/sys/sys/pcpu.h index 19bc4a8b907..1c24b5908b1 100644 --- a/sys/sys/pcpu.h +++ b/sys/sys/pcpu.h @@ -222,7 +222,7 @@ extern struct pcpu *cpuid_to_pcpu[]; #define UMA_PCPU_ALLOC_SIZE PAGE_SIZE #ifdef CTASSERT -#if defined(__i386__) || defined(__amd64__) +#if defined(__i386__) || defined(__amd64__) || defined(__powerpc64__) /* Required for counters(9) to work on x86. */ CTASSERT(sizeof(struct pcpu) == UMA_PCPU_ALLOC_SIZE); #else diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 50e93159640..c1de4bc2e09 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -270,7 +270,7 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { vm_page_t m, m_map; -#if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ +#if (defined(__aarch64__) || defined(__amd64__) || defined(__powerpc64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 vm_page_t m_super; @@ -286,7 +286,7 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, return (KERN_FAILURE); m_map = m; psind = 0; -#if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ +#if (defined(__aarch64__) || defined(__amd64__) || defined(__powerpc64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 if ((m->flags & PG_FICTITIOUS) == 0 && diff --git a/sys/x86/x86/nexus.c b/sys/x86/x86/nexus.c index 91877499ca6..eac83fc0d4e 100644 --- a/sys/x86/x86/nexus.c +++ b/sys/x86/x86/nexus.c @@ -118,6 +118,8 @@ static int nexus_map_resource(device_t bus, device_t child, int type, struct resource_map *map); static int nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map *map); +static int nexus_translate_resource(device_t bus, int type, rman_res_t s, + rman_res_t *news); static int nexus_release_resource(device_t, device_t, int, int, struct resource *); static int nexus_setup_intr(device_t, device_t, struct resource *, int flags, @@ -158,6 +160,7 @@ static device_method_t nexus_methods[] = { DEVMETHOD(bus_alloc_resource, nexus_alloc_resource), DEVMETHOD(bus_adjust_resource, nexus_adjust_resource), DEVMETHOD(bus_release_resource, nexus_release_resource), + DEVMETHOD(bus_translate_resource, nexus_translate_resource), DEVMETHOD(bus_activate_resource, nexus_activate_resource), DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource), DEVMETHOD(bus_map_resource,
nexus_map_resource), @@ -551,6 +554,15 @@ nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, return (0); } +static int +nexus_translate_resource(device_t bus, int type, rman_res_t s, + rman_res_t *news) +{ + /* bus == phys */ + *news = s; + return (0); +} + static int nexus_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r)
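A note on the NUMA locality table: numa_mem_regions() in sys/powerpc/powerpc/platform.c above fills a row-major distance matrix with SLIT-style values (10 for a domain relative to itself, 21 between distinct domains, with the in-tree comment admitting the remote value is a guess) and passes it to vm_phys_register_domains(). The following is a minimal userland-only sketch of that table layout, not FreeBSD code; MAXMEMDOM, LOCAL_DIST and REMOTE_DIST here are illustrative stand-ins for the kernel's values.

#include <stdio.h>

#define MAXMEMDOM	8	/* illustrative stand-in for the kernel constant */
#define LOCAL_DIST	10	/* distance from a domain to itself, as in the patch */
#define REMOTE_DIST	21	/* distance between different domains, as in the patch */

/*
 * Fill a row-major ndomain x ndomain locality matrix the same way the
 * patch does before handing it to vm_phys_register_domains().
 */
static void
fill_locality_table(int *table, int ndomain)
{
	int i, j;

	for (i = 0; i < ndomain; i++)
		for (j = 0; j < ndomain; j++)
			table[i * ndomain + j] = (i == j) ? LOCAL_DIST : REMOTE_DIST;
}

int
main(void)
{
	int table[MAXMEMDOM * MAXMEMDOM];
	int ndomain = 2;	/* e.g. a two-node POWER9 system */
	int i, j;

	fill_locality_table(table, ndomain);
	for (i = 0; i < ndomain; i++) {
		for (j = 0; j < ndomain; j++)
			printf("%4d", table[i * ndomain + j]);
		printf("\n");
	}
	return (0);
}

For a single-domain system the table degenerates to the single entry 10, matching the unconditional vm_locality_table[0] = 10 assignment that precedes the ndomain > 1 check in the patch.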