Index: include/pmap.h =================================================================== --- include/pmap.h (.../head/sys/powerpc) (revision 279138) +++ include/pmap.h (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) @@ -86,11 +86,19 @@ struct pvo_entry { LIST_ENTRY(pvo_entry) pvo_vlink; /* Link to common virt page */ +#ifndef __powerpc64__ LIST_ENTRY(pvo_entry) pvo_olink; /* Link to overflow entry */ +#endif RB_ENTRY(pvo_entry) pvo_plink; /* Link to pmap entries */ - union { - struct pte pte; /* 32 bit PTE */ - struct lpte lpte; /* 64 bit PTE */ + struct { +#ifndef __powerpc64__ + /* 32-bit fields */ + struct pte pte; +#endif + /* 64-bit fields */ + uintptr_t slot; + vm_paddr_t pa; + vm_prot_t prot; } pvo_pte; pmap_t pvo_pmap; /* Owning pmap */ vm_offset_t pvo_vaddr; /* VA of entry */ @@ -101,12 +109,17 @@ int pvo_vaddr_compare(struct pvo_entry *, struct pvo_entry *); RB_PROTOTYPE(pvo_tree, pvo_entry, pvo_plink, pvo_vaddr_compare); +/* Used by 32-bit PMAP */ #define PVO_PTEGIDX_MASK 0x007UL /* which PTEG slot */ #define PVO_PTEGIDX_VALID 0x008UL /* slot is valid */ +/* Used by 64-bit PMAP */ +#define PVO_HID 0x008UL /* PVO entry in alternate hash*/ +/* Used by both */ #define PVO_WIRED 0x010UL /* PVO entry is wired */ #define PVO_MANAGED 0x020UL /* PVO entry is managed */ #define PVO_BOOTSTRAP 0x080UL /* PVO entry allocated during bootstrap */ +#define PVO_DEAD 0x100UL /* waiting to be deleted */ #define PVO_LARGE 0x200UL /* large page */ #define PVO_VADDR(pvo) ((pvo)->pvo_vaddr & ~ADDR_POFF) #define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK) @@ -135,7 +148,7 @@ }; struct md_page { - u_int64_t mdpg_attrs; + volatile int32_t mdpg_attrs; vm_memattr_t mdpg_cache_attrs; struct pvo_head mdpg_pvoh; }; Index: ps3/mmu_ps3.c =================================================================== --- ps3/mmu_ps3.c (.../head/sys/powerpc) (revision 279138) +++ ps3/mmu_ps3.c (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) @@ -67,15 +67,10 @@ static void mps3_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend); static void mps3_cpu_bootstrap(mmu_t mmup, int ap); -static void mps3_pte_synch(mmu_t, uintptr_t pt, struct lpte *pvo_pt); -static void mps3_pte_clear(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn, uint64_t ptebit); -static void mps3_pte_unset(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static void mps3_pte_change(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static int mps3_pte_insert(mmu_t, u_int ptegidx, struct lpte *pvo_pt); -static uintptr_t mps3_pvo_to_pte(mmu_t, const struct pvo_entry *pvo); +static int64_t mps3_pte_synch(mmu_t, struct pvo_entry *); +static int64_t mps3_pte_clear(mmu_t, struct pvo_entry *, uint64_t ptebit); +static int64_t mps3_pte_unset(mmu_t, struct pvo_entry *); +static int mps3_pte_insert(mmu_t, struct pvo_entry *); static mmu_method_t mps3_methods[] = { @@ -85,9 +80,7 @@ MMUMETHOD(moea64_pte_synch, mps3_pte_synch), MMUMETHOD(moea64_pte_clear, mps3_pte_clear), MMUMETHOD(moea64_pte_unset, mps3_pte_unset), - MMUMETHOD(moea64_pte_change, mps3_pte_change), MMUMETHOD(moea64_pte_insert, mps3_pte_insert), - MMUMETHOD(moea64_pvo_to_pte, mps3_pvo_to_pte), { 0, 0 } }; @@ -94,11 +87,15 @@ MMU_DEF_INHERIT(ps3_mmu, "mmu_ps3", mps3_methods, 0, oea64_mmu); +static struct mtx mps3_table_lock; + static void mps3_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { uint64_t final_pteg_count; + mtx_init(&mps3_table_lock, "page table", NULL, MTX_DEF); + 
moea64_early_bootstrap(mmup, kernelstart, kernelend); lv1_construct_virtual_address_space( @@ -151,72 +148,113 @@ } } -static void -mps3_pte_synch(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt) +static int64_t +mps3_pte_synch_locked(struct pvo_entry *pvo) { uint64_t halfbucket[4], rcbits; PTESYNC(); - lv1_read_htab_entries(mps3_vas_id, slot & ~0x3UL, &halfbucket[0], - &halfbucket[1], &halfbucket[2], &halfbucket[3], &rcbits); + lv1_read_htab_entries(mps3_vas_id, pvo->pvo_pte.slot & ~0x3UL, + &halfbucket[0], &halfbucket[1], &halfbucket[2], &halfbucket[3], + &rcbits); + /* Check if present in page table */ + if ((halfbucket[pvo->pvo_pte.slot & 0x3] & LPTE_AVPN_MASK) != + ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & + LPTE_AVPN_MASK)) + return (-1); + if (!(halfbucket[pvo->pvo_pte.slot & 0x3] & LPTE_VALID)) + return (-1); + /* - * rcbits contains the low 12 bits of each PTEs 2nd part, + * rcbits contains the low 12 bits of each PTE's 2nd part, * spaced at 16-bit intervals */ - KASSERT((halfbucket[slot & 0x3] & LPTE_AVPN_MASK) == - (pvo_pt->pte_hi & LPTE_AVPN_MASK), - ("PTE upper word %#lx != %#lx\n", - halfbucket[slot & 0x3], pvo_pt->pte_hi)); + return ((rcbits >> ((3 - (pvo->pvo_pte.slot & 0x3))*16)) & + (LPTE_CHG | LPTE_REF)); +} - pvo_pt->pte_lo |= (rcbits >> ((3 - (slot & 0x3))*16)) & - (LPTE_CHG | LPTE_REF); +static int64_t +mps3_pte_synch(mmu_t mmu, struct pvo_entry *pvo) +{ + int64_t retval; + + mtx_lock(&mps3_table_lock); + retval = mps3_pte_synch_locked(pvo); + mtx_unlock(&mps3_table_lock); + + return (retval); } -static void -mps3_pte_clear(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn, - u_int64_t ptebit) +static int64_t +mps3_pte_clear(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit) { + int64_t refchg; + struct lpte pte; - lv1_write_htab_entry(mps3_vas_id, slot, pvo_pt->pte_hi, - pvo_pt->pte_lo & ~ptebit); + mtx_lock(&mps3_table_lock); + + refchg = mps3_pte_synch_locked(pvo); + if (refchg < 0) { + mtx_unlock(&mps3_table_lock); + return (refchg); + } + + moea64_pte_from_pvo(pvo, &pte); + + pte.pte_lo |= refchg; + pte.pte_lo &= ~ptebit; + /* XXX: race on RC bits between write and sync. Anything to do? */ + lv1_write_htab_entry(mps3_vas_id, pvo->pvo_pte.slot, pte.pte_hi, + pte.pte_lo); + mtx_unlock(&mps3_table_lock); + + return (refchg); } -static void -mps3_pte_unset(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) +static int64_t +mps3_pte_unset(mmu_t mmu, struct pvo_entry *pvo) { + int64_t refchg; - mps3_pte_synch(mmu, slot, pvo_pt); - pvo_pt->pte_hi &= ~LPTE_VALID; - lv1_write_htab_entry(mps3_vas_id, slot, 0, 0); + mtx_lock(&mps3_table_lock); + refchg = mps3_pte_synch_locked(pvo); + if (refchg < 0) { + moea64_pte_overflow--; + mtx_unlock(&mps3_table_lock); + return (-1); + } + /* XXX: race on RC bits between unset and sync. Anything to do? 
*/ + lv1_write_htab_entry(mps3_vas_id, pvo->pvo_pte.slot, 0, 0); + mtx_unlock(&mps3_table_lock); moea64_pte_valid--; -} -static void -mps3_pte_change(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) -{ - - mps3_pte_synch(mmu, slot, pvo_pt); - lv1_write_htab_entry(mps3_vas_id, slot, pvo_pt->pte_hi, - pvo_pt->pte_lo); + return (refchg & (LPTE_REF | LPTE_CHG)); } static int -mps3_pte_insert(mmu_t mmu, u_int ptegidx, struct lpte *pvo_pt) +mps3_pte_insert(mmu_t mmu, struct pvo_entry *pvo) { int result; - struct lpte evicted; - struct pvo_entry *pvo; + struct lpte pte, evicted; uint64_t index; - pvo_pt->pte_hi |= LPTE_VALID; - pvo_pt->pte_hi &= ~LPTE_HID; + if (pvo->pvo_vaddr & PVO_HID) { + /* Hypercall needs primary PTEG */ + pvo->pvo_vaddr &= ~PVO_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + } + + pvo->pvo_pte.slot &= ~7UL; + moea64_pte_from_pvo(pvo, &pte); evicted.pte_hi = 0; PTESYNC(); - result = lv1_insert_htab_entry(mps3_vas_id, ptegidx << 3, - pvo_pt->pte_hi, pvo_pt->pte_lo, LPTE_LOCKED | LPTE_WIRED, 0, + mtx_lock(&mps3_table_lock); + result = lv1_insert_htab_entry(mps3_vas_id, pvo->pvo_pte.slot, + pte.pte_hi, pte.pte_lo, LPTE_LOCKED | LPTE_WIRED, 0, &index, &evicted.pte_hi, &evicted.pte_lo); + mtx_unlock(&mps3_table_lock); if (result != 0) { /* No freeable slots in either PTEG? We're hosed. */ @@ -227,84 +265,19 @@ /* * See where we ended up. */ - if (index >> 3 != ptegidx) - pvo_pt->pte_hi |= LPTE_HID; + if ((index & ~7UL) != pvo->pvo_pte.slot) + pvo->pvo_vaddr |= PVO_HID; + pvo->pvo_pte.slot = index; moea64_pte_valid++; - if (!evicted.pte_hi) - return (index & 0x7); - - /* - * Synchronize the sacrifice PTE with its PVO, then mark both - * invalid. The PVO will be reused when/if the VM system comes - * here after a fault. - */ - - ptegidx = index >> 3; /* Where the sacrifice PTE was found */ - if (evicted.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; /* PTEs indexed by primary */ - - KASSERT((evicted.pte_hi & (LPTE_WIRED | LPTE_LOCKED)) == 0, - ("Evicted a wired PTE")); - - result = 0; - LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) { - if (!PVO_PTEGIDX_ISSET(pvo)) - continue; - - if (pvo->pvo_pte.lpte.pte_hi == (evicted.pte_hi | LPTE_VALID)) { - KASSERT(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID, - ("Invalid PVO for valid PTE!")); - pvo->pvo_pte.lpte.pte_hi &= ~LPTE_VALID; - pvo->pvo_pte.lpte.pte_lo |= - evicted.pte_lo & (LPTE_REF | LPTE_CHG); - PVO_PTEGIDX_CLR(pvo); - moea64_pte_valid--; - moea64_pte_overflow++; - result = 1; - break; - } + if (evicted.pte_hi) { + KASSERT((evicted.pte_hi & (LPTE_WIRED | LPTE_LOCKED)) == 0, + ("Evicted a wired PTE")); + moea64_pte_valid--; + moea64_pte_overflow++; } - KASSERT(result == 1, ("PVO for sacrifice PTE not found")); - - return (index & 0x7); + return (0); } -static __inline u_int -va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) -{ - uint64_t hash; - int shift; - - shift = large ? 
moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); -} - -uintptr_t -mps3_pvo_to_pte(mmu_t mmu, const struct pvo_entry *pvo) -{ - uint64_t vsid; - u_int ptegidx; - - /* If the PTEG index is not set, then there is no page table entry */ - if (!PVO_PTEGIDX_ISSET(pvo)) - return (-1); - - vsid = PVO_VSID(pvo); - ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); - - /* - * We can find the actual pte entry without searching by grabbing - * the PTEG index from 3 unused bits in pvo_vaddr and by - * noticing the HID bit. - */ - if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; - - return ((ptegidx << 3) | PVO_PTEGIDX_GET(pvo)); -} - Index: aim/moea64_if.m =================================================================== --- aim/moea64_if.m (.../head/sys/powerpc) (revision 279138) +++ aim/moea64_if.m (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) @@ -1,5 +1,5 @@ #- -# Copyright (c) 2010 Nathan Whitehorn +# Copyright (c) 2010,2015 Nathan Whitehorn # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,72 +44,78 @@ INTERFACE moea64; +CODE { + static moea64_pte_replace_t moea64_pte_replace_default; + static int64_t moea64_pte_replace_default(mmu_t mmu, + struct pvo_entry *pvo, int flags) + { + int64_t refchg; + + refchg = MOEA64_PTE_UNSET(mmu, pvo); + MOEA64_PTE_INSERT(mmu, pvo); + + return (refchg); + } +} + /** - * Copy ref/changed bits from PTE referenced by _pt_cookie to _pvo_pt. + * Return ref/changed bits from PTE referenced by _pvo if _pvo is currently in + * the page table. Returns -1 if _pvo not currently present in the page table. */ -METHOD void pte_synch { +METHOD int64_t pte_synch { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; + struct pvo_entry *_pvo; }; /** * Clear bits ptebit (a mask) from the low word of the PTE referenced by - * _pt_cookie. Note that _pvo_pt is for reference use only -- the bit should - * NOT be cleared there. + * _pvo. Return previous values of ref/changed bits or -1 if _pvo is not + * currently in the page table. */ -METHOD void pte_clear { +METHOD int64_t pte_clear { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; - uint64_t _vpn; + struct pvo_entry *_pvo; uint64_t _ptebit; }; /** - * Invalidate the PTE referenced by _pt_cookie, synchronizing its validity - * and ref/changed bits after completion. + * Invalidate the PTE referenced by _pvo, returning its ref/changed bits. + * Returns -1 if PTE not currently present in page table. */ -METHOD void pte_unset { +METHOD int64_t pte_unset { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; - uint64_t _vpn; + struct pvo_entry *_pvo; }; /** - * Update the PTE referenced by _pt_cookie with the values in _pvo_pt, - * making sure that the values of ref/changed bits are preserved and - * synchronized back to _pvo_pt. + * Update the reference PTE to correspond to the contents of _pvo. Has the + * same ref/changed semantics as pte_unset() (and should clear R/C bits). May + * change the PVO's location in the page table or return with it unmapped if + * PVO_WIRED is not set. By default, does unset() followed by insert(). 
+ * + * _flags is a bitmask describing what level of page invalidation should occur: + * 0 means no invalidation is required + * MOEA64_PTE_PROT_UPDATE signifies that the page protection bits are changing + * MOEA64_PTE_INVALIDATE requires an invalidation of the same strength as + * pte_unset() followed by pte_insert() */ -METHOD void pte_change { +METHOD int64_t pte_replace { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; - uint64_t _vpn; -}; - + struct pvo_entry *_pvo; + int _flags; +} DEFAULT moea64_pte_replace_default; /** - * Insert the PTE _pvo_pt into the PTEG group _ptegidx, returning the index - * of the PTE in its group at completion, or -1 if no slots were free. Must - * not replace PTEs marked LPTE_WIRED or LPTE_LOCKED, and must set LPTE_HID - * and LPTE_VALID appropriately in _pvo_pt. + * Insert a PTE corresponding to _pvo into the page table, returning any errors + * encountered and (optionally) setting the PVO slot value to some + * representation of where the entry was placed. + * + * Must not replace PTEs marked LPTE_WIRED. If an existing valid PTE is spilled, + * must synchronize ref/changed bits as in pte_unset(). */ METHOD int pte_insert { mmu_t _mmu; - u_int _ptegidx; - struct lpte *_pvo_pt; + struct pvo_entry *_pvo; }; -/** - * Return the page table reference cookie corresponding to _pvo, or -1 if - * the _pvo is not currently in the page table. - */ -METHOD uintptr_t pvo_to_pte { - mmu_t _mmu; - const struct pvo_entry *_pvo; -}; - - Index: aim/mmu_oea64.c =================================================================== --- aim/mmu_oea64.c (.../head/sys/powerpc) (revision 279138) +++ aim/mmu_oea64.c (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) @@ -1,87 +1,28 @@ /*- - * Copyright (c) 2001 The NetBSD Foundation, Inc. + * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * - * This code is derived from software contributed to The NetBSD Foundation - * by Matt Thomas of Allegro Networks, Inc. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/*- - * Copyright (C) 1995, 1996 Wolfgang Solfrank. - * Copyright (C) 1995, 1996 TooLs GmbH. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by TooLs GmbH. - * 4. The name of TooLs GmbH may not be used to endorse or promote products - * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/*- - * Copyright (C) 2001 Benno Rice. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ #include __FBSDID("$FreeBSD$"); @@ -166,18 +107,25 @@ /* * Locking semantics: - * -- Read lock: if no modifications are being made to either the PVO lists - * or page table or if any modifications being made result in internal - * changes (e.g. wiring, protection) such that the existence of the PVOs - * is unchanged and they remain associated with the same pmap (in which - * case the changes should be protected by the pmap lock) - * -- Write lock: required if PTEs/PVOs are being inserted or removed. + * + * There are two locks of interest: the page locks and the pmap locks, which + * protect their individual PVO lists and are locked in that order. The contents + * of all PVO entries are protected by the locks of their respective pmaps. + * The pmap of any PVO is guaranteed not to change so long as the PVO is linked + * into any list. + * */ -#define LOCK_TABLE_RD() rw_rlock(&moea64_table_lock) -#define UNLOCK_TABLE_RD() rw_runlock(&moea64_table_lock) -#define LOCK_TABLE_WR() rw_wlock(&moea64_table_lock) -#define UNLOCK_TABLE_WR() rw_wunlock(&moea64_table_lock) +#define PV_LOCK_COUNT PA_LOCK_COUNT*3 +static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; + +#define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[pa_index(pa) % PV_LOCK_COUNT])) +#define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) +#define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) +#define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) +#define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) +#define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) +#define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) struct ofw_map { cell_t om_va; @@ -202,9 +150,8 @@ extern void bs_remap_earlyboot(void); /* - * Lock for the pteg and pvo tables. + * Lock for the SLB tables. */ -struct rwlock moea64_table_lock; struct mtx moea64_slb_mutex; /* @@ -216,10 +163,8 @@ /* * PVO data. */ -struct pvo_head *moea64_pvo_table; /* pvo entries by pteg index */ -uma_zone_t moea64_upvo_zone; /* zone for pvo entries for unmanaged pages */ -uma_zone_t moea64_mpvo_zone; /* zone for pvo entries for managed pages */ +uma_zone_t moea64_pvo_zone; /* zone for pvo entries */ static struct pvo_entry *moea64_bpvo_pool; static int moea64_bpvo_pool_index = 0; @@ -261,7 +206,6 @@ vm_offset_t moea64_scratchpage_va[2]; struct pvo_entry *moea64_scratchpage_pvo[2]; -uintptr_t moea64_scratchpage_pte[2]; struct mtx moea64_scratchpage_mtx; uint64_t moea64_large_page_mask = 0; @@ -271,16 +215,17 @@ /* * PVO calls. */ -static int moea64_pvo_enter(mmu_t, pmap_t, uma_zone_t, struct pvo_head *, - vm_offset_t, vm_offset_t, uint64_t, int, int8_t); -static void moea64_pvo_remove(mmu_t, struct pvo_entry *); +static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, + struct pvo_head *pvo_head); +static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo); +static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo); static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); /* * Utility routines. 
*/ -static boolean_t moea64_query_bit(mmu_t, vm_page_t, u_int64_t); -static u_int moea64_clear_bit(mmu_t, vm_page_t, u_int64_t); +static boolean_t moea64_query_bit(mmu_t, vm_page_t, uint64_t); +static u_int moea64_clear_bit(mmu_t, vm_page_t, uint64_t); static void moea64_kremove(mmu_t, vm_offset_t); static void moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_size_t sz); @@ -388,43 +333,91 @@ MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0); -static __inline u_int -va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) +static struct pvo_head * +vm_page_to_pvoh(vm_page_t m) { + + mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); + return (&m->md.mdpg_pvoh); +} + +static struct pvo_entry * +alloc_pvo_entry(int bootstrap) +{ + struct pvo_entry *pvo; + + if (!moea64_initialized || bootstrap) { + if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { + panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd", + moea64_bpvo_pool_index, moea64_bpvo_pool_size, + moea64_bpvo_pool_size * sizeof(struct pvo_entry)); + } + pvo = &moea64_bpvo_pool[ + atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; + bzero(pvo, sizeof(*pvo)); + pvo->pvo_vaddr = PVO_BOOTSTRAP; + } else { + pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT); + bzero(pvo, sizeof(*pvo)); + } + + return (pvo); +} + + +static void +init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) +{ + uint64_t vsid; uint64_t hash; int shift; - shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pvo->pvo_pmap = pmap; + va &= ~ADDR_POFF; + pvo->pvo_vaddr |= va; + vsid = va_to_vsid(pmap, va); + pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) + | (vsid << 16); + + shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : + ADDR_PIDX_SHFT; + hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); + pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } -static __inline struct pvo_head * -vm_page_to_pvoh(vm_page_t m) +static void +free_pvo_entry(struct pvo_entry *pvo) { - return (&m->md.mdpg_pvoh); + if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) + uma_zfree(moea64_pvo_zone, pvo); } -static __inline void -moea64_pte_create(struct lpte *pt, uint64_t vsid, vm_offset_t va, - uint64_t pte_lo, int flags) +void +moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { - /* - * Construct a PTE. Default to IMB initially. Valid bit only gets - * set when the real pte is set in memory. - * - * Note: Don't set the valid bit for correct operation of tlb update. 
- */ - pt->pte_hi = (vsid << LPTE_VSID_SHIFT) | - (((uint64_t)(va & ADDR_PIDX) >> ADDR_API_SHFT64) & LPTE_API); + lpte->pte_hi = (pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & + LPTE_AVPN_MASK; + lpte->pte_hi |= LPTE_VALID; + + if (pvo->pvo_vaddr & PVO_LARGE) + lpte->pte_hi |= LPTE_BIG; + if (pvo->pvo_vaddr & PVO_WIRED) + lpte->pte_hi |= LPTE_WIRED; + if (pvo->pvo_vaddr & PVO_HID) + lpte->pte_hi |= LPTE_HID; - if (flags & PVO_LARGE) - pt->pte_hi |= LPTE_BIG; + lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ + if (pvo->pvo_pte.prot & VM_PROT_WRITE) + lpte->pte_lo |= LPTE_BW; + else + lpte->pte_lo |= LPTE_BR; - pt->pte_lo = pte_lo; + if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) + lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t @@ -489,6 +482,7 @@ { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; + struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; @@ -542,8 +536,11 @@ moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; - if (moea64_pvo_find_va(kernel_pmap, - translations[i].om_va + off) != NULL) + PMAP_LOCK(kernel_pmap); + pvo = moea64_pvo_find_va(kernel_pmap, + translations[i].om_va + off); + PMAP_UNLOCK(kernel_pmap); + if (pvo != NULL) continue; moea64_kenter(mmup, translations[i].om_va + off, @@ -606,6 +603,7 @@ moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { + struct pvo_entry *pvo; register_t msr; vm_paddr_t pa; vm_offset_t size, off; @@ -617,7 +615,6 @@ DISABLE_TRANS(msr); if (hw_direct_map) { - LOCK_TABLE_WR(); PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + @@ -624,6 +621,10 @@ pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; + pvo = alloc_pvo_entry(1 /* bootstrap */); + pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; + init_pvo_entry(pvo, kernel_pmap, pa); + /* * Set memory access as guarded if prefetch within * the page could exit the available physmem area. @@ -636,18 +637,14 @@ pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; - moea64_pvo_enter(mmup, kernel_pmap, moea64_upvo_zone, - NULL, pa, pa, pte_lo, - PVO_WIRED | PVO_LARGE, 0); + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | + VM_PROT_EXECUTE; + pvo->pvo_pte.pa = pa | pte_lo; + moea64_pvo_enter(mmup, pvo, NULL); } } PMAP_UNLOCK(kernel_pmap); - UNLOCK_TABLE_WR(); } else { - size = sizeof(struct pvo_head) * moea64_pteg_count; - off = (vm_offset_t)(moea64_pvo_table); - for (pa = off; pa < off + size; pa += PAGE_SIZE) - moea64_kenter(mmup, pa, pa); size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) @@ -782,8 +779,6 @@ void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { - vm_size_t size; - register_t msr; int i; /* @@ -792,28 +787,14 @@ moea64_pteg_mask = moea64_pteg_count - 1; /* - * Allocate pv/overflow lists. + * Initialize SLB table lock and page locks */ - size = sizeof(struct pvo_head) * moea64_pteg_count; - - moea64_pvo_table = (struct pvo_head *)moea64_bootstrap_alloc(size, - PAGE_SIZE); - CTR1(KTR_PMAP, "moea64_bootstrap: PVO table at %p", moea64_pvo_table); - - DISABLE_TRANS(msr); - for (i = 0; i < moea64_pteg_count; i++) - LIST_INIT(&moea64_pvo_table[i]); - ENABLE_TRANS(msr); - - /* - * Initialize the lock that synchronizes access to the pteg and pvo - * tables. 
- */ - rw_init_flags(&moea64_table_lock, "pmap tables", RW_RECURSE); mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); + for (i = 0; i < PV_LOCK_COUNT; i++) + mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* - * Initialise the unmanaged pvo pool. + * Initialise the bootstrap pvo pool. */ moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), 0); @@ -974,7 +955,7 @@ /* * Allocate some things for page zeroing. We put this directly - * in the page table, marked with LPTE_LOCKED, to avoid any + * in the page table and use MOEA64_PTE_REPLACE to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. */ @@ -988,24 +969,17 @@ moea64_kenter(mmup, moea64_scratchpage_va[i], 0); + PMAP_LOCK(kernel_pmap); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); - LOCK_TABLE_RD(); - moea64_scratchpage_pte[i] = MOEA64_PVO_TO_PTE( - mmup, moea64_scratchpage_pvo[i]); - moea64_scratchpage_pvo[i]->pvo_pte.lpte.pte_hi - |= LPTE_LOCKED; - MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[i], - &moea64_scratchpage_pvo[i]->pvo_pte.lpte, - moea64_scratchpage_pvo[i]->pvo_vpn); - UNLOCK_TABLE_RD(); + PMAP_UNLOCK(kernel_pmap); } } } /* - * Activate a user pmap. The pmap must be activated before its address - * space can be accessed in any way. + * Activate a user pmap. This mostly involves setting some non-CPU + * state. */ void moea64_activate(mmu_t mmu, struct thread *td) @@ -1040,35 +1014,33 @@ moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; - uintptr_t pt; + vm_page_t m; + int64_t refchg; - LOCK_TABLE_RD(); + key.pvo_vaddr = sva; PMAP_LOCK(pm); - key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_WIRED) == 0) - panic("moea64_unwire: pte %p is missing LPTE_WIRED", - &pvo->pvo_pte.lpte); - pvo->pvo_pte.lpte.pte_hi &= ~LPTE_WIRED; - if (pt != -1) { - /* - * The PTE's wired attribute is not a hardware - * feature, so there is no need to invalidate any TLB - * entries. 
- */ - MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); + refchg = MOEA64_PTE_REPLACE(mmu, pvo, 0 /* No invalidation */); + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + if (refchg < 0) + refchg = LPTE_CHG; + m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + + refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } - UNLOCK_TABLE_RD(); PMAP_UNLOCK(pm); } @@ -1085,13 +1057,10 @@ KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); - moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo &= - ~(LPTE_WIMG | LPTE_RPGN); - moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo |= + moea64_scratchpage_pvo[which]->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; - MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[which], - &moea64_scratchpage_pvo[which]->pvo_pte.lpte, - moea64_scratchpage_pvo[which]->pvo_vpn); + MOEA64_PTE_REPLACE(mmup, moea64_scratchpage_pvo[which], + MOEA64_PTE_INVALIDATE); isync(); } @@ -1245,48 +1214,79 @@ moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { + struct pvo_entry *pvo, *oldpvo; struct pvo_head *pvo_head; - uma_zone_t zone; uint64_t pte_lo; - u_int pvo_flags; int error; if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); + pvo = alloc_pvo_entry(0); + pvo->pvo_pmap = NULL; /* to be filled in later */ + pvo->pvo_pte.prot = prot; + + pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); + pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo; + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo->pvo_vaddr |= PVO_WIRED; + if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; - zone = moea64_upvo_zone; - pvo_flags = 0; } else { - pvo_head = vm_page_to_pvoh(m); - zone = moea64_mpvo_zone; - pvo_flags = PVO_MANAGED; + pvo_head = &m->md.mdpg_pvoh; + pvo->pvo_vaddr |= PVO_MANAGED; } + + for (;;) { + PV_PAGE_LOCK(m); + PMAP_LOCK(pmap); + if (pvo->pvo_pmap == NULL) + init_pvo_entry(pvo, pmap, va); + if (prot & VM_PROT_WRITE) + if (pmap_bootstrapped && + (m->oflags & VPO_UNMANAGED) == 0) + vm_page_aflag_set(m, PGA_WRITEABLE); - pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); + oldpvo = moea64_pvo_find_va(pmap, va); + if (oldpvo != NULL) { + if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && + oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && + oldpvo->pvo_pte.prot == prot) { + /* Identical mapping already exists */ + error = 0; - if (prot & VM_PROT_WRITE) { - pte_lo |= LPTE_BW; - if (pmap_bootstrapped && - (m->oflags & VPO_UNMANAGED) == 0) - vm_page_aflag_set(m, PGA_WRITEABLE); - } else - pte_lo |= LPTE_BR; + /* If not in page table, reinsert it */ + if (MOEA64_PTE_SYNCH(mmu, oldpvo) < 0) { + moea64_pte_overflow--; + MOEA64_PTE_INSERT(mmu, oldpvo); + } - if ((prot & VM_PROT_EXECUTE) == 0) - pte_lo |= LPTE_NOEXEC; + /* Then just clean up and go home */ + PV_PAGE_UNLOCK(m); + PMAP_UNLOCK(pmap); + free_pvo_entry(pvo); + break; + } - if ((flags & PMAP_ENTER_WIRED) != 0) - pvo_flags |= PVO_WIRED; + /* Otherwise, need to kill it first */ + KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " + "mapping does not match new mapping")); + moea64_pvo_remove_from_pmap(mmu, oldpvo); + } + error = moea64_pvo_enter(mmu, pvo, pvo_head); + PV_PAGE_UNLOCK(m); + PMAP_UNLOCK(pmap); - for (;;) { - 
LOCK_TABLE_WR(); - PMAP_LOCK(pmap); - error = moea64_pvo_enter(mmu, pmap, zone, pvo_head, va, - VM_PAGE_TO_PHYS(m), pte_lo, pvo_flags, psind); - PMAP_UNLOCK(pmap); - UNLOCK_TABLE_WR(); + /* Free any dead pages */ + if (oldpvo != NULL) { + PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, oldpvo); + PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + free_pvo_entry(oldpvo); + } + if (error != ENOMEM) break; if ((flags & PMAP_ENTER_NOSLEEP) != 0) @@ -1394,9 +1394,9 @@ if (pvo == NULL) pa = 0; else - pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | - (va - PVO_VADDR(pvo)); + pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); + return (pa); } @@ -1417,13 +1417,11 @@ PMAP_LOCK(pmap); retry: pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); - if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) && - ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) == LPTE_RW || - (prot & VM_PROT_WRITE) == 0)) { + if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { if (vm_page_pa_tryrelock(pmap, - pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, &pa)) + pvo->pvo_pte.pa & LPTE_RPGN, &pa)) goto retry; - m = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); + m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); vm_page_hold(m); } PA_UNLOCK_COND(pa); @@ -1436,6 +1434,11 @@ static void * moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { + struct pvo_entry *pvo; + vm_offset_t va; + vm_page_t m; + int pflags, needed_lock; + /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside @@ -1442,11 +1445,7 @@ * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. */ - vm_offset_t va; - vm_page_t m; - int pflags, needed_lock; - *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; @@ -1463,17 +1462,21 @@ va = VM_PAGE_TO_PHYS(m); - LOCK_TABLE_WR(); + pvo = alloc_pvo_entry(1 /* bootstrap */); + + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; + pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; + if (needed_lock) PMAP_LOCK(kernel_pmap); - moea64_pvo_enter(installed_mmu, kernel_pmap, moea64_upvo_zone, - NULL, va, VM_PAGE_TO_PHYS(m), LPTE_M, PVO_WIRED | PVO_BOOTSTRAP, - 0); + init_pvo_entry(pvo, kernel_pmap, va); + pvo->pvo_vaddr |= PVO_WIRED; + moea64_pvo_enter(installed_mmu, pvo, NULL); + if (needed_lock) PMAP_UNLOCK(kernel_pmap); - UNLOCK_TABLE_WR(); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); @@ -1489,17 +1492,13 @@ CTR0(KTR_PMAP, "moea64_init"); - moea64_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), + moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); - moea64_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, - UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { installed_mmu = mmu; - uma_zone_set_allocf(moea64_upvo_zone,moea64_uma_page_alloc); - uma_zone_set_allocf(moea64_mpvo_zone,moea64_uma_page_alloc); + uma_zone_set_allocf(moea64_pvo_zone,moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 @@ -1515,7 +1514,8 @@ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); - return (moea64_query_bit(mmu, m, PTE_REF)); + + return (moea64_query_bit(mmu, m, LPTE_REF)); } boolean_t @@ -1540,11 +1540,12 @@ moea64_is_prefaultable(mmu_t mmu, 
pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; - boolean_t rv; + boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); - rv = pvo == NULL || (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0; + if (pvo != NULL) + rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } @@ -1576,9 +1577,8 @@ moea64_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; - uintptr_t pt; + int64_t refchg, ret; pmap_t pmap; - uint64_t lo = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); @@ -1592,30 +1592,28 @@ if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; powerpc_sync(); - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); + refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); - if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP; - pvo->pvo_pte.lpte.pte_lo |= LPTE_BR; - if (pt != -1) { - MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); - lo |= pvo->pvo_pte.lpte.pte_lo; - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_CHG; - MOEA64_PTE_CHANGE(mmu, pt, - &pvo->pvo_pte.lpte, pvo->pvo_vpn); - if (pvo->pvo_pmap == kernel_pmap) - isync(); - } + if (!(pvo->pvo_vaddr & PVO_DEAD) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + pvo->pvo_pte.prot &= ~VM_PROT_WRITE; + ret = MOEA64_PTE_REPLACE(mmu, pvo, + MOEA64_PTE_PROT_UPDATE); + if (ret < 0) + ret = LPTE_CHG; + refchg |= ret; + if (pvo->pvo_pmap == kernel_pmap) + isync(); } - if ((lo & LPTE_CHG) != 0) - vm_page_dirty(m); PMAP_UNLOCK(pmap); } - UNLOCK_TABLE_RD(); + if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) + vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); + PV_PAGE_UNLOCK(m); } /* @@ -1646,8 +1644,7 @@ moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; - struct pvo_head *pvo_head; - uintptr_t pt; + int64_t refchg; pmap_t pmap; uint64_t lo; @@ -1656,25 +1653,36 @@ return; } - pvo_head = vm_page_to_pvoh(m); lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); - LOCK_TABLE_RD(); - LIST_FOREACH(pvo, pvo_head, pvo_vlink) { + + PV_PAGE_LOCK(m); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_WIMG; - pvo->pvo_pte.lpte.pte_lo |= lo; - if (pt != -1) { - MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); + if (!(pvo->pvo_vaddr & PVO_DEAD)) { + pvo->pvo_pte.pa &= ~LPTE_WIMG; + pvo->pvo_pte.pa |= lo; + refchg = MOEA64_PTE_REPLACE(mmu, pvo, + MOEA64_PTE_INVALIDATE); + if (refchg < 0) + refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? 
+ LPTE_CHG : 0; + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + refchg |= + atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } - UNLOCK_TABLE_RD(); m->md.mdpg_cache_attrs = ma; + PV_PAGE_UNLOCK(m); } /* @@ -1683,18 +1691,30 @@ void moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_offset_t pa, vm_memattr_t ma) { - uint64_t pte_lo; int error; + struct pvo_entry *pvo, *oldpvo; - pte_lo = moea64_calc_wimg(pa, ma); + pvo = alloc_pvo_entry(0); + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; + pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); + pvo->pvo_vaddr |= PVO_WIRED; - LOCK_TABLE_WR(); PMAP_LOCK(kernel_pmap); - error = moea64_pvo_enter(mmu, kernel_pmap, moea64_upvo_zone, - NULL, va, pa, pte_lo, PVO_WIRED, 0); + oldpvo = moea64_pvo_find_va(kernel_pmap, va); + if (oldpvo != NULL) + moea64_pvo_remove_from_pmap(mmu, oldpvo); + init_pvo_entry(pvo, kernel_pmap, va); + error = moea64_pvo_enter(mmu, pvo, NULL); PMAP_UNLOCK(kernel_pmap); - UNLOCK_TABLE_WR(); + /* Free any dead pages */ + if (oldpvo != NULL) { + PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, oldpvo); + PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + free_pvo_entry(oldpvo); + } + if (error != 0 && error != ENOENT) panic("moea64_kenter: failed to enter va %#zx pa %#zx: %d", va, pa, error); @@ -1728,7 +1748,7 @@ pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); - pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va - PVO_VADDR(pvo)); + pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } @@ -1748,8 +1768,8 @@ * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' - * unchanged. We cannot and therefore do not; *virt is updated with the - * first usable address after the mapped region. + * unchanged. Other architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, @@ -1757,8 +1777,22 @@ { vm_offset_t sva, va; + if (hw_direct_map) { + /* + * Check if every page in the region is covered by the direct + * map. The direct map covers all of physical memory. Use + * moea64_calc_wimg() as a shortcut to see if the page is in + * physical memory as a way to see if the direct map covers it. 
+ */ + for (va = pa_start; va < pa_end; va += PAGE_SIZE) + if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) + break; + if (va == pa_end) + return (pa_start); + } sva = *virt; va = sva; + /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(mmu, va, pa_start); *virt = va; @@ -1784,9 +1818,9 @@ ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { - if (pvo->pvo_pmap == pmap) { + if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } @@ -1793,7 +1827,7 @@ if (++loops >= 16) break; } - UNLOCK_TABLE_RD(); + PV_PAGE_UNLOCK(m); return (rv); } @@ -1810,11 +1844,11 @@ count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) - if ((pvo->pvo_vaddr & PVO_WIRED) != 0) + if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; - UNLOCK_TABLE_RD(); + PV_PAGE_UNLOCK(m); return (count); } @@ -1926,45 +1960,32 @@ static void moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { - uintptr_t pt; - struct vm_page *pg; - uint64_t oldlo; + struct vm_page *pg; + vm_prot_t oldprot; + int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* - * Grab the PTE pointer before we diddle with the cached PTE - * copy. + * Change the protection of the page. */ - pt = MOEA64_PVO_TO_PTE(mmu, pvo); + oldprot = pvo->pvo_pte.prot; + pvo->pvo_pte.prot = prot; + pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); /* - * Change the protection of the page. + * If the PVO is in the page table, update mapping */ - oldlo = pvo->pvo_pte.lpte.pte_lo; - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP; - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_NOEXEC; - if ((prot & VM_PROT_EXECUTE) == 0) - pvo->pvo_pte.lpte.pte_lo |= LPTE_NOEXEC; - if (prot & VM_PROT_WRITE) - pvo->pvo_pte.lpte.pte_lo |= LPTE_BW; - else - pvo->pvo_pte.lpte.pte_lo |= LPTE_BR; + refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); + if (refchg < 0) + refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; - pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); - - /* - * If the PVO is in the page table, update that pte as well. - */ - if (pt != -1) - MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && - (pvo->pvo_pte.lpte.pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(mmu, pm, PVO_VADDR(pvo), - pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, PAGE_SIZE); + pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE); } /* @@ -1971,14 +1992,13 @@ * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. 
*/ - if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED && - (oldlo & LPTE_PP) != LPTE_BR && !(prot & VM_PROT_WRITE)) { - if (pg != NULL) { - if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG) - vm_page_dirty(pg); - if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF) - vm_page_aflag_set(pg, PGA_REFERENCED); - } + if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && + (oldprot & VM_PROT_WRITE)) { + refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(pg); + if (refchg & LPTE_REF) + vm_page_aflag_set(pg, PGA_REFERENCED); } } @@ -1999,7 +2019,6 @@ return; } - LOCK_TABLE_RD(); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); @@ -2007,7 +2026,6 @@ tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(mmu, pm, pvo, prot); } - UNLOCK_TABLE_RD(); PMAP_UNLOCK(pm); } @@ -2078,16 +2096,33 @@ void moea64_remove_pages(mmu_t mmu, pmap_t pm) { - struct pvo_entry *pvo, *tpvo; + struct pvo_entry *pvo, *tpvo; + struct pvo_tree tofree; - LOCK_TABLE_WR(); + RB_INIT(&tofree); + PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { - if (!(pvo->pvo_vaddr & PVO_WIRED)) - moea64_pvo_remove(mmu, pvo); + if (pvo->pvo_vaddr & PVO_WIRED) + continue; + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(mmu, pvo); + RB_INSERT(pvo_tree, &tofree, pvo); } - UNLOCK_TABLE_WR(); PMAP_UNLOCK(pm); + + RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { + PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, pvo); + PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); + RB_REMOVE(pvo_tree, &tofree, pvo); + free_pvo_entry(pvo); + } } /* @@ -2096,7 +2131,8 @@ void moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo, *tpvo, key; + struct pvo_tree tofree; /* * Perform an unsynchronized read. This is, however, safe. 
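
The two-pass teardown introduced in moea64_remove_pages() above, and continued in moea64_remove() in the next hunk, is the heart of the new locking scheme: PVOs are pulled out of the page table and the pmap's RB tree while the pmap lock is held, parked on a local tree, and only afterwards delinked from their vm_page under the per-page PV lock. The following condensed C sketch restates that ordering for reference only; it is not part of the patch, and the helper name pvo_teardown_sketch is invented for illustration (the real callers are moea64_remove_pages() and moea64_remove()).

/*
 * Illustrative sketch of the two-pass PVO teardown used by this patch
 * (condensed from moea64_remove_pages(); not part of the diff).
 */
static void
pvo_teardown_sketch(mmu_t mmu, pmap_t pm)
{
	struct pvo_entry *pvo, *tpvo;
	struct pvo_tree tofree;

	RB_INIT(&tofree);

	/*
	 * Pass 1: under the pmap lock, unset each PTE and unlink the PVO
	 * from the pmap (moea64_pvo_remove_from_pmap() also marks it
	 * PVO_DEAD), deferring all vm_page work.
	 */
	PMAP_LOCK(pm);
	RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) {
		moea64_pvo_remove_from_pmap(mmu, pvo);
		RB_INSERT(pvo_tree, &tofree, pvo);
	}
	PMAP_UNLOCK(pm);

	/*
	 * Pass 2: delink from the vm_page and free.  The documented lock
	 * order is page (PV) lock before pmap lock, so the page-side
	 * cleanup happens here, where no pmap lock is held.
	 */
	RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) {
		PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN);
		moea64_pvo_remove_from_page(mmu, pvo);
		PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN);
		RB_REMOVE(pvo_tree, &tofree, pvo);
		free_pvo_entry(pvo);
	}
}
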
@@ -2104,16 +2140,32 @@ if (pm->pm_stats.resident_count == 0) return; - LOCK_TABLE_WR(); + key.pvo_vaddr = sva; + + RB_INIT(&tofree); + PMAP_LOCK(pm); - key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); - moea64_pvo_remove(mmu, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(mmu, pvo); + RB_INSERT(pvo_tree, &tofree, pvo); } - UNLOCK_TABLE_WR(); PMAP_UNLOCK(pm); + + RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { + PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, pvo); + PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); + RB_REMOVE(pvo_tree, &tofree, pvo); + free_pvo_entry(pvo); + } } /* @@ -2124,20 +2176,32 @@ moea64_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo, *next_pvo; + struct pvo_head freequeue; + int wasdead; pmap_t pmap; - LOCK_TABLE_WR(); + LIST_INIT(&freequeue); + + PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); - moea64_pvo_remove(mmu, pvo); + wasdead = (pvo->pvo_vaddr & PVO_DEAD); + if (!wasdead) + moea64_pvo_remove_from_pmap(mmu, pvo); + moea64_pvo_remove_from_page(mmu, pvo); + if (!wasdead) + LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); + } - UNLOCK_TABLE_WR(); - if ((m->aflags & PGA_WRITEABLE) && moea64_is_modified(mmu, m)) - vm_page_dirty(m); - vm_page_aflag_clear(m, PGA_WRITEABLE); - vm_page_aflag_clear(m, PGA_EXECUTABLE); + KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); + KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable")); + PV_PAGE_UNLOCK(m); + + /* Clean up UMA allocations */ + LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) + free_pvo_entry(pvo); } /* @@ -2187,141 +2251,20 @@ } static int -moea64_pvo_enter(mmu_t mmu, pmap_t pm, uma_zone_t zone, - struct pvo_head *pvo_head, vm_offset_t va, vm_offset_t pa, - uint64_t pte_lo, int flags, int8_t psind __unused) +moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head) { - struct pvo_entry *pvo; - uintptr_t pt; - uint64_t vsid; - int first; - u_int ptegidx; - int i; - int bootstrap; + int first, err; - /* - * One nasty thing that can happen here is that the UMA calls to - * allocate new PVOs need to map more memory, which calls pvo_enter(), - * which calls UMA... - * - * We break the loop by detecting recursion and allocating out of - * the bootstrap pool. - */ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT(moea64_pvo_find_va(pvo->pvo_pmap, PVO_VADDR(pvo)) == NULL, + ("Existing mapping for VA %#jx", (uintmax_t)PVO_VADDR(pvo))); - first = 0; - bootstrap = (flags & PVO_BOOTSTRAP); - - if (!moea64_initialized) - bootstrap = 1; - - PMAP_LOCK_ASSERT(pm, MA_OWNED); - rw_assert(&moea64_table_lock, RA_WLOCKED); - - /* - * Compute the PTE Group index. - */ - va &= ~ADDR_POFF; - vsid = va_to_vsid(pm, va); - ptegidx = va_to_pteg(vsid, va, flags & PVO_LARGE); - - /* - * Remove any existing mapping for this page. Reuse the pvo entry if - * there is a mapping. 
- */ moea64_pvo_enter_calls++; - LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) { - if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { - if ((pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) == pa && - (pvo->pvo_pte.lpte.pte_lo & (LPTE_NOEXEC | LPTE_PP)) - == (pte_lo & (LPTE_NOEXEC | LPTE_PP))) { - /* - * The physical page and protection are not - * changing. Instead, this may be a request - * to change the mapping's wired attribute. - */ - pt = -1; - if ((flags & PVO_WIRED) != 0 && - (pvo->pvo_vaddr & PVO_WIRED) == 0) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_vaddr |= PVO_WIRED; - pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED; - pm->pm_stats.wired_count++; - } else if ((flags & PVO_WIRED) == 0 && - (pvo->pvo_vaddr & PVO_WIRED) != 0) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_vaddr &= ~PVO_WIRED; - pvo->pvo_pte.lpte.pte_hi &= ~LPTE_WIRED; - pm->pm_stats.wired_count--; - } - if (!(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) { - KASSERT(pt == -1, - ("moea64_pvo_enter: valid pt")); - /* Re-insert if spilled */ - i = MOEA64_PTE_INSERT(mmu, ptegidx, - &pvo->pvo_pte.lpte); - if (i >= 0) - PVO_PTEGIDX_SET(pvo, i); - moea64_pte_overflow--; - } else if (pt != -1) { - /* - * The PTE's wired attribute is not a - * hardware feature, so there is no - * need to invalidate any TLB entries. - */ - MOEA64_PTE_CHANGE(mmu, pt, - &pvo->pvo_pte.lpte, pvo->pvo_vpn); - } - return (0); - } - moea64_pvo_remove(mmu, pvo); - break; - } - } - /* - * If we aren't overwriting a mapping, try to allocate. - */ - if (bootstrap) { - if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { - panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd", - moea64_bpvo_pool_index, moea64_bpvo_pool_size, - moea64_bpvo_pool_size * sizeof(struct pvo_entry)); - } - pvo = &moea64_bpvo_pool[moea64_bpvo_pool_index]; - moea64_bpvo_pool_index++; - bootstrap = 1; - } else { - pvo = uma_zalloc(zone, M_NOWAIT); - } - - if (pvo == NULL) - return (ENOMEM); - - moea64_pvo_entries++; - pvo->pvo_vaddr = va; - pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) - | (vsid << 16); - pvo->pvo_pmap = pm; - LIST_INSERT_HEAD(&moea64_pvo_table[ptegidx], pvo, pvo_olink); - pvo->pvo_vaddr &= ~ADDR_POFF; - - if (flags & PVO_WIRED) - pvo->pvo_vaddr |= PVO_WIRED; - if (pvo_head != NULL) - pvo->pvo_vaddr |= PVO_MANAGED; - if (bootstrap) - pvo->pvo_vaddr |= PVO_BOOTSTRAP; - if (flags & PVO_LARGE) - pvo->pvo_vaddr |= PVO_LARGE; - - moea64_pte_create(&pvo->pvo_pte.lpte, vsid, va, - (uint64_t)(pa) | pte_lo, flags); - - /* * Add to pmap list */ - RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo); + RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Remember if the list was empty and therefore will be the first @@ -2333,24 +2276,21 @@ LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } - if (pvo->pvo_vaddr & PVO_WIRED) { - pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED; - pm->pm_stats.wired_count++; - } - pm->pm_stats.resident_count++; + if (pvo->pvo_vaddr & PVO_WIRED) + pvo->pvo_pmap->pm_stats.wired_count++; + pvo->pvo_pmap->pm_stats.resident_count++; /* - * We hope this succeeds but it isn't required. + * Insert it into the hardware page table */ - i = MOEA64_PTE_INSERT(mmu, ptegidx, &pvo->pvo_pte.lpte); - if (i >= 0) { - PVO_PTEGIDX_SET(pvo, i); - } else { + err = MOEA64_PTE_INSERT(mmu, pvo); + if (err != 0) { panic("moea64_pvo_enter: overflow"); - moea64_pte_overflow++; } - if (pm == kernel_pmap) + moea64_pvo_entries++; + + if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ @@ -2359,7 +2299,8 @@ * as virtual memory is switched on. 
*/ if (!pmap_bootstrapped) - moea64_bootstrap_slb_prefault(va, flags & PVO_LARGE); + moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), + pvo->pvo_vaddr & PVO_LARGE); #endif return (first ? ENOENT : 0); @@ -2366,24 +2307,28 @@ } static void -moea64_pvo_remove(mmu_t mmu, struct pvo_entry *pvo) +moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; - uintptr_t pt; + int32_t refchg; + KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); - rw_assert(&moea64_table_lock, RA_WLOCKED); + KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* - * If there is an active pte entry, we need to deactivate it (and - * save the ref & cfg bits). + * If there is an active pte entry, we need to deactivate it */ - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - if (pt != -1) { - MOEA64_PTE_UNSET(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); - PVO_PTEGIDX_CLR(pvo); - } else { - moea64_pte_overflow--; + refchg = MOEA64_PTE_UNSET(mmu, pvo); + if (refchg < 0) { + /* + * If it was evicted from the page table, be pessimistic and + * dirty the page. + */ + if (pvo->pvo_pte.prot & VM_PROT_WRITE) + refchg = LPTE_CHG; + else + refchg = 0; } /* @@ -2399,36 +2344,50 @@ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* - * Remove this from the overflow list and return it to the pool - * if we aren't going to reuse it. + * Mark this for the next sweep */ - LIST_REMOVE(pvo, pvo_olink); + pvo->pvo_vaddr |= PVO_DEAD; + /* Send RC bits to VM */ + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + if (pg != NULL) { + refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(pg); + if (refchg & LPTE_REF) + vm_page_aflag_set(pg, PGA_REFERENCED); + } + } +} + +static void +moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo) +{ + struct vm_page *pg; + + KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); + + /* Use NULL pmaps as a sentinel for races in page deletion */ + if (pvo->pvo_pmap == NULL) + return; + pvo->pvo_pmap = NULL; + /* - * Update vm about the REF/CHG bits if the page is managed. + * Update vm about page writeability/executability if managed */ - pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); + PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN); + pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); - if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED && pg != NULL) { + if ((pvo->pvo_vaddr & PVO_MANAGED) && pg != NULL) { LIST_REMOVE(pvo, pvo_vlink); - if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) { - if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG) - vm_page_dirty(pg); - if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF) - vm_page_aflag_set(pg, PGA_REFERENCED); - if (LIST_EMPTY(vm_page_to_pvoh(pg))) - vm_page_aflag_clear(pg, PGA_WRITEABLE); - } if (LIST_EMPTY(vm_page_to_pvoh(pg))) - vm_page_aflag_clear(pg, PGA_EXECUTABLE); + vm_page_aflag_clear(pg, PGA_WRITEABLE | PGA_EXECUTABLE); } moea64_pvo_entries--; moea64_pvo_remove_calls++; - - if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) - uma_zfree((pvo->pvo_vaddr & PVO_MANAGED) ? 
moea64_mpvo_zone : - moea64_upvo_zone, pvo); } static struct pvo_entry * @@ -2436,34 +2395,34 @@ { struct pvo_entry key; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t -moea64_query_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) +moea64_query_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; - uintptr_t pt; + int64_t ret; + boolean_t rv; - LOCK_TABLE_RD(); - LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { - /* - * See if we saved the bit off. If so, return success. - */ - if (pvo->pvo_pte.lpte.pte_lo & ptebit) { - UNLOCK_TABLE_RD(); - return (TRUE); - } - } + /* + * See if this bit is stored in the page already. + */ + if (m->md.mdpg_attrs & ptebit) + return (TRUE); /* - * No luck, now go through the hard part of looking at the PTEs - * themselves. Sync so that any pending REF/CHG bits are flushed to - * the PTEs. + * Examine each PTE. Sync so that any pending REF/CHG bits are + * flushed to the PTEs. */ + rv = FALSE; powerpc_sync(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + ret = 0; /* * See if this pvo has a valid PTE. if so, fetch the @@ -2471,20 +2430,22 @@ * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - if (pt != -1) { - MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); - if (pvo->pvo_pte.lpte.pte_lo & ptebit) { - PMAP_UNLOCK(pvo->pvo_pmap); - UNLOCK_TABLE_RD(); - return (TRUE); + if (!(pvo->pvo_vaddr & PVO_DEAD)) + ret = MOEA64_PTE_SYNCH(mmu, pvo); + PMAP_UNLOCK(pvo->pvo_pmap); + + if (ret > 0) { + atomic_set_32(&m->md.mdpg_attrs, + ret & (LPTE_CHG | LPTE_REF)); + if (ret & ptebit) { + rv = TRUE; + break; } } - PMAP_UNLOCK(pvo->pvo_pmap); } + PV_PAGE_UNLOCK(m); - UNLOCK_TABLE_RD(); - return (FALSE); + return (rv); } static u_int @@ -2492,39 +2453,33 @@ { u_int count; struct pvo_entry *pvo; - uintptr_t pt; + int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so - * we can reset the right ones). note that since the pvo entries and - * list heads are accessed via BAT0 and are never placed in the page - * table, we don't have to worry about further accesses setting the - * REF/CHG bits. + * we can reset the right ones). */ powerpc_sync(); /* - * For each pvo entry, clear the pvo's ptebit. If this pvo has a - * valid pte clear the ptebit from the valid pte. + * For each pvo entry, clear the pte's ptebit. 
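Both moea64_query_bit() and moea64_clear_bit() now lean on the per-page cache introduced in md_page.mdpg_attrs: referenced/changed bits harvested from individual PTEs are OR-ed into the cache with atomic_set_32(), queries consult the cache before touching the page table at all, and clear_bit() drops the bit from the cache once every mapping has been cleared. A standalone model of that pattern, using C11 atomics in place of the kernel's atomic(9) routines (illustration only, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_REF 0x100u	/* stand-ins for LPTE_REF/LPTE_CHG */
#define EX_CHG 0x080u

struct ex_page {
	_Atomic uint32_t attrs;		/* models mdpg_attrs */
};

/* Merge bits harvested from one hardware PTE into the page's cache. */
static void
ex_merge(struct ex_page *pg, uint32_t harvested)
{
	atomic_fetch_or(&pg->attrs, harvested & (EX_REF | EX_CHG));
}

/* Models moea64_query_bit(): cheap cache hit first, harvest otherwise. */
static bool
ex_query(struct ex_page *pg, uint32_t bit, uint32_t hw_bits)
{
	if (atomic_load(&pg->attrs) & bit)
		return (true);
	ex_merge(pg, hw_bits);	/* in the kernel: MOEA64_PTE_SYNCH per PVO */
	return ((atomic_load(&pg->attrs) & bit) != 0);
}

/* Models the tail of moea64_clear_bit(): drop the bit from the cache. */
static void
ex_clear(struct ex_page *pg, uint32_t bit)
{
	atomic_fetch_and(&pg->attrs, ~bit);
}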
*/ count = 0; - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + ret = 0; + PMAP_LOCK(pvo->pvo_pmap); - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - if (pt != -1) { - MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); - if (pvo->pvo_pte.lpte.pte_lo & ptebit) { - count++; - MOEA64_PTE_CLEAR(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn, ptebit); - } - } - pvo->pvo_pte.lpte.pte_lo &= ~ptebit; + if (!(pvo->pvo_vaddr & PVO_DEAD)) + ret = MOEA64_PTE_CLEAR(mmu, pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); + + if (ret > 0 && (ret & ptebit)) + count++; } + atomic_clear_32(&m->md.mdpg_attrs, ptebit); + PV_PAGE_UNLOCK(m); - UNLOCK_TABLE_RD(); return (count); } @@ -2540,8 +2495,7 @@ for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { - if (pvo == NULL || - (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) != ppa) { + if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) { error = EFAULT; break; } @@ -2613,9 +2567,8 @@ lim = round_page(va); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); - if (pvo != NULL && !(pvo->pvo_pte.lpte.pte_lo & LPTE_I)) { - pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | - (va & ADDR_POFF); + if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { + pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF); moea64_syncicache(mmu, pm, va, pa, len); } va += len; @@ -2656,7 +2609,8 @@ /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); - dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; + dump_map[0].pa_size = round_page((uintptr_t)_end) - + dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr; @@ -2672,7 +2626,7 @@ continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); - if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) + if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } @@ -2685,8 +2639,7 @@ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); - if (pvo == NULL || - !(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) + if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } @@ -2693,3 +2646,4 @@ dump_map[2].pa_size = va - dump_map[2].pa_start; } } + Index: aim/mmu_oea64.h =================================================================== --- aim/mmu_oea64.h (.../head/sys/powerpc) (revision 279138) +++ aim/mmu_oea64.h (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) @@ -38,8 +38,17 @@ /* Allocate physical memory for use in moea64_bootstrap.
*/ vm_offset_t moea64_bootstrap_alloc(vm_size_t, u_int); +/* Set an LPTE structure to match the contents of a PVO */ +void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte); /* + * Flags + */ + +#define MOEA64_PTE_PROT_UPDATE 1 +#define MOEA64_PTE_INVALIDATE 2 + +/* * Bootstrap subroutines * * An MMU_BOOTSTRAP() implementation looks like this: @@ -68,7 +77,6 @@ * State variables */ -extern struct pvo_head *moea64_pvo_table; extern int moea64_large_page_shift; extern uint64_t moea64_large_page_size; extern u_int moea64_pteg_count; Index: aim/moea64_native.c =================================================================== --- aim/moea64_native.c (.../head/sys/powerpc) (revision 279138) +++ aim/moea64_native.c (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) @@ -99,6 +99,8 @@ #include #include #include +#include +#include #include @@ -179,29 +181,25 @@ /* * PTEG data. */ -static struct lpteg *moea64_pteg_table; +static volatile struct lpte *moea64_pteg_table; +static struct rwlock moea64_eviction_lock; /* * PTE calls. */ -static int moea64_pte_insert_native(mmu_t, u_int, struct lpte *); -static uintptr_t moea64_pvo_to_pte_native(mmu_t, const struct pvo_entry *); -static void moea64_pte_synch_native(mmu_t, uintptr_t pt, - struct lpte *pvo_pt); -static void moea64_pte_clear_native(mmu_t, uintptr_t pt, - struct lpte *pvo_pt, uint64_t vpn, uint64_t ptebit); -static void moea64_pte_change_native(mmu_t, uintptr_t pt, - struct lpte *pvo_pt, uint64_t vpn); -static void moea64_pte_unset_native(mmu_t mmu, uintptr_t pt, - struct lpte *pvo_pt, uint64_t vpn); +static int moea64_pte_insert_native(mmu_t, struct pvo_entry *); +static int64_t moea64_pte_synch_native(mmu_t, struct pvo_entry *); +static int64_t moea64_pte_clear_native(mmu_t, struct pvo_entry *, uint64_t); +static int64_t moea64_pte_replace_native(mmu_t, struct pvo_entry *, int); +static int64_t moea64_pte_unset_native(mmu_t mmu, struct pvo_entry *); /* * Utility routines. */ -static void moea64_bootstrap_native(mmu_t mmup, - vm_offset_t kernelstart, vm_offset_t kernelend); -static void moea64_cpu_bootstrap_native(mmu_t, int ap); -static void tlbia(void); +static void moea64_bootstrap_native(mmu_t mmup, + vm_offset_t kernelstart, vm_offset_t kernelend); +static void moea64_cpu_bootstrap_native(mmu_t, int ap); +static void tlbia(void); static mmu_method_t moea64_native_methods[] = { /* Internal interfaces */ @@ -211,9 +209,8 @@ MMUMETHOD(moea64_pte_synch, moea64_pte_synch_native), MMUMETHOD(moea64_pte_clear, moea64_pte_clear_native), MMUMETHOD(moea64_pte_unset, moea64_pte_unset_native), - MMUMETHOD(moea64_pte_change, moea64_pte_change_native), + MMUMETHOD(moea64_pte_replace, moea64_pte_replace_native), MMUMETHOD(moea64_pte_insert, moea64_pte_insert_native), - MMUMETHOD(moea64_pvo_to_pte, moea64_pvo_to_pte_native), { 0, 0 } }; @@ -221,99 +218,140 @@ MMU_DEF_INHERIT(oea64_mmu_native, MMU_TYPE_G5, moea64_native_methods, 0, oea64_mmu); -static __inline u_int -va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) +static int64_t +moea64_pte_synch_native(mmu_t mmu, struct pvo_entry *pvo) { - uint64_t hash; - int shift; + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + uint64_t ptelo; - shift = large ? 
moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); -} + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); -static void -moea64_pte_synch_native(mmu_t mmu, uintptr_t pt_cookie, struct lpte *pvo_pt) -{ - struct lpte *pt = (struct lpte *)pt_cookie; + moea64_pte_from_pvo(pvo, &properpt); - pvo_pt->pte_lo |= pt->pte_lo & (LPTE_REF | LPTE_CHG); + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + rw_runlock(&moea64_eviction_lock); + return (-1); + } + + PTESYNC(); + ptelo = be64toh(pt->pte_lo); + + rw_runlock(&moea64_eviction_lock); + + return (ptelo & (LPTE_REF | LPTE_CHG)); } -static void -moea64_pte_clear_native(mmu_t mmu, uintptr_t pt_cookie, struct lpte *pvo_pt, - uint64_t vpn, uint64_t ptebit) +static int64_t +moea64_pte_clear_native(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit) { - struct lpte *pt = (struct lpte *)pt_cookie; + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + uint64_t ptelo; - /* - * As shown in Section 7.6.3.2.3 - */ - pt->pte_lo &= ~ptebit; - critical_enter(); - TLBIE(vpn); - critical_exit(); -} + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); -static void -moea64_pte_set_native(struct lpte *pt, struct lpte *pvo_pt) -{ + moea64_pte_from_pvo(pvo, &properpt); - pvo_pt->pte_hi |= LPTE_VALID; + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + rw_runlock(&moea64_eviction_lock); + return (-1); + } - /* - * Update the PTE as defined in section 7.6.3.1. - * Note that the REF/CHG bits are from pvo_pt and thus should have - * been saved so this routine can restore them (if desired). - */ - pt->pte_lo = pvo_pt->pte_lo; - EIEIO(); - pt->pte_hi = pvo_pt->pte_hi; - PTESYNC(); + if (ptebit == LPTE_REF) { + /* See "Resetting the Reference Bit" in arch manual */ + PTESYNC(); + /* 2-step here safe: precision is not guaranteed */ + ptelo = be64toh(pt->pte_lo); - /* Keep statistics for unlocked pages */ - if (!(pvo_pt->pte_hi & LPTE_LOCKED)) - moea64_pte_valid++; + /* One-byte store to avoid touching the C bit */ + ((volatile uint8_t *)(&pt->pte_lo))[6] = + ((uint8_t *)(&properpt.pte_lo))[6]; + rw_runlock(&moea64_eviction_lock); + + critical_enter(); + TLBIE(pvo->pvo_vpn); + critical_exit(); + } else { + rw_runlock(&moea64_eviction_lock); + ptelo = moea64_pte_unset_native(mmu, pvo); + moea64_pte_insert_native(mmu, pvo); + } + + return (ptelo & (LPTE_REF | LPTE_CHG)); } -static void -moea64_pte_unset_native(mmu_t mmu, uintptr_t pt_cookie, struct lpte *pvo_pt, - uint64_t vpn) +static int64_t +moea64_pte_unset_native(mmu_t mmu, struct pvo_entry *pvo) { - struct lpte *pt = (struct lpte *)pt_cookie; + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + uint64_t ptelo; + moea64_pte_from_pvo(pvo, &properpt); + + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + moea64_pte_overflow--; + rw_runlock(&moea64_eviction_lock); + return (-1); + } + /* - * Invalidate the pte. + * Invalidate the pte, briefly locking it to collect RC bits. No + * atomics needed since this is protected against eviction by the lock.
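The single-byte store in moea64_pte_clear_native() above deserves a note: assuming the usual encodings (LPTE_REF == 0x100, LPTE_CHG == 0x80), the R bit lives entirely in byte 6 of the big-endian low doubleword and the C bit in byte 7, so rewriting byte 6 alone can never wipe out a change bit the hardware sets concurrently. A small self-contained check of that layout:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t ref = 0x100, chg = 0x80;	/* assumed LPTE_REF/LPTE_CHG */
	int i;

	for (i = 0; i < 8; i++) {
		/* Byte i of a big-endian uint64_t holds bits 63-8*i .. 56-8*i. */
		uint8_t ref_byte = (ref >> (8 * (7 - i))) & 0xff;
		uint8_t chg_byte = (chg >> (8 * (7 - i))) & 0xff;

		if (i == 6)	/* R is the low bit of byte 6; C is absent */
			assert(ref_byte == 0x01 && chg_byte == 0x00);
		if (i == 7)	/* C sits in byte 7; R is absent */
			assert(ref_byte == 0x00 && chg_byte == 0x80);
	}
	return (0);
}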
*/ isync(); critical_enter(); - pvo_pt->pte_hi &= ~LPTE_VALID; - pt->pte_hi &= ~LPTE_VALID; + pt->pte_hi = (pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED; PTESYNC(); - TLBIE(vpn); + TLBIE(pvo->pvo_vpn); + ptelo = be64toh(pt->pte_lo); + *((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */ critical_exit(); + rw_runlock(&moea64_eviction_lock); - /* - * Save the reg & chg bits. - */ - moea64_pte_synch_native(mmu, pt_cookie, pvo_pt); + /* Keep statistics */ + moea64_pte_valid--; - /* Keep statistics for unlocked pages */ - if (!(pvo_pt->pte_hi & LPTE_LOCKED)) - moea64_pte_valid--; + return (ptelo & (LPTE_CHG | LPTE_REF)); } -static void -moea64_pte_change_native(mmu_t mmu, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn) +static int64_t +moea64_pte_replace_native(mmu_t mmu, struct pvo_entry *pvo, int flags) { + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + int64_t ptelo; - /* - * Invalidate the PTE - */ - moea64_pte_unset_native(mmu, pt, pvo_pt, vpn); - moea64_pte_set_native((struct lpte *)pt, pvo_pt); + if (flags == 0) { + /* Just some software bits changing. */ + moea64_pte_from_pvo(pvo, &properpt); + + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + rw_runlock(&moea64_eviction_lock); + return (-1); + } + pt->pte_hi = properpt.pte_hi; + ptelo = pt->pte_lo; + rw_runlock(&moea64_eviction_lock); + } else { + /* Otherwise, need reinsertion and deletion */ + ptelo = moea64_pte_unset_native(mmu, pvo); + moea64_pte_insert_native(mmu, pvo); + } + + return (ptelo); } static void @@ -380,6 +418,7 @@ size = moea64_pteg_count * sizeof(struct lpteg); CTR2(KTR_PMAP, "moea64_bootstrap: %d PTEGs, %d bytes", moea64_pteg_count, size); + rw_init(&moea64_eviction_lock, "pte eviction"); /* * We now need to allocate memory. This memory, to be allocated, @@ -388,9 +427,10 @@ * as a measure of last resort. We do this a couple times. */ - moea64_pteg_table = (struct lpteg *)moea64_bootstrap_alloc(size, size); + moea64_pteg_table = (struct lpte *)moea64_bootstrap_alloc(size, size); DISABLE_TRANS(msr); - bzero((void *)moea64_pteg_table, moea64_pteg_count * sizeof(struct lpteg)); + bzero(__DEVOLATILE(void *, moea64_pteg_table), moea64_pteg_count * + sizeof(struct lpteg)); ENABLE_TRANS(msr); CTR1(KTR_PMAP, "moea64_bootstrap: PTEG table at %p", moea64_pteg_table); @@ -446,181 +486,173 @@ TLBSYNC(); } -static uintptr_t -moea64_pvo_to_pte_native(mmu_t mmu, const struct pvo_entry *pvo) +static int +atomic_pte_lock(volatile struct lpte *pte, uint64_t bitmask, uint64_t *oldhi) { - struct lpte *pt; - int pteidx, ptegidx; - uint64_t vsid; + int ret; + uint32_t oldhihalf; - /* If the PTEG index is not set, then there is no page table entry */ - if (!PVO_PTEGIDX_ISSET(pvo)) - return (-1); - /* - * Calculate the ptegidx + * Note: in principle, if just the locked bit were set here, we + * could avoid needing the eviction lock. However, eviction occurs + * so rarely that it isn't worth bothering about in practice. */ - vsid = PVO_VSID(pvo); - ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo), - pvo->pvo_vaddr & PVO_LARGE); - /* - * We can find the actual pte entry without searching by grabbing - * the PTEG index from 3 unused bits in pvo_vaddr and by - * noticing the HID bit. - */ - if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; + __asm __volatile ( + "1:\tlwarx %1, 0, %3\n\t" /* load old value */ + "and. %0,%1,%4\n\t" /* check if any bits set */ + "bne 2f\n\t" /* exit if any set */ + "stwcx. 
%5, 0, %3\n\t" /* attempt to store */ + "bne- 1b\n\t" /* spin if failed */ + "li %0, 1\n\t" /* success - retval = 1 */ + "b 3f\n\t" /* we've succeeded */ + "2:\n\t" + "stwcx. %1, 0, %3\n\t" /* clear reservation (74xx) */ + "li %0, 0\n\t" /* failure - retval = 0 */ + "3:\n\t" + : "=&r" (ret), "=&r"(oldhihalf), "=m" (pte->pte_hi) + : "r" ((volatile char *)&pte->pte_hi + 4), + "r" ((uint32_t)bitmask), "r" ((uint32_t)LPTE_LOCKED), + "m" (pte->pte_hi) + : "cr0", "cr1", "cr2", "memory"); - pteidx = (ptegidx << 3) | PVO_PTEGIDX_GET(pvo); + *oldhi = (pte->pte_hi & 0xffffffff00000000ULL) | oldhihalf; - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) && - !PVO_PTEGIDX_ISSET(pvo)) { - panic("moea64_pvo_to_pte: pvo %p has valid pte in pvo but no " - "valid pte index", pvo); - } - - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0 && - PVO_PTEGIDX_ISSET(pvo)) { - panic("moea64_pvo_to_pte: pvo %p has valid pte index in pvo " - "pvo but no valid pte", pvo); - } - - pt = &moea64_pteg_table[pteidx >> 3].pt[pteidx & 7]; - if ((pt->pte_hi ^ (pvo->pvo_pte.lpte.pte_hi & ~LPTE_VALID)) == - LPTE_VALID) { - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0) { - panic("moea64_pvo_to_pte: pvo %p has valid pte in " - "moea64_pteg_table %p but invalid in pvo", pvo, pt); - } - - if (((pt->pte_lo ^ pvo->pvo_pte.lpte.pte_lo) & - ~(LPTE_M|LPTE_CHG|LPTE_REF)) != 0) { - panic("moea64_pvo_to_pte: pvo %p pte does not match " - "pte %p in moea64_pteg_table difference is %#x", - pvo, pt, - (uint32_t)(pt->pte_lo ^ pvo->pvo_pte.lpte.pte_lo)); - } - - return ((uintptr_t)pt); - } - - if (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) { - panic("moea64_pvo_to_pte: pvo %p has invalid pte %p in " - "moea64_pteg_table but valid in pvo", pvo, pt); - } - - return (-1); + return (ret); } -static __inline int -moea64_pte_spillable_ident(u_int ptegidx) +static uintptr_t +moea64_insert_to_pteg_native(struct lpte *pvo_pt, uintptr_t slotbase, + uint64_t mask) { - struct lpte *pt; - int i, j, k; + volatile struct lpte *pt; + uint64_t oldptehi, va; + uintptr_t k; + int i, j; /* Start at a random slot */ i = mftb() % 8; - k = -1; for (j = 0; j < 8; j++) { - pt = &moea64_pteg_table[ptegidx].pt[(i + j) % 8]; - if (pt->pte_hi & (LPTE_LOCKED | LPTE_WIRED)) - continue; + k = slotbase + (i + j) % 8; + pt = &moea64_pteg_table[k]; + /* Invalidate and seize lock only if no bits in mask set */ + if (atomic_pte_lock(pt, mask, &oldptehi)) /* Lock obtained */ + break; + } - /* This is a candidate, so remember it */ - k = (i + j) % 8; + if (j == 8) + return (-1); - /* Try to get a page that has not been used lately */ - if (!(pt->pte_lo & LPTE_REF)) - return (k); + if (oldptehi & LPTE_VALID) { + KASSERT(!(oldptehi & LPTE_WIRED), ("Unmapped wired entry")); + /* + * Need to invalidate old entry completely: see + * "Modifying a Page Table Entry". Need to reconstruct + * the virtual address for the outgoing entry to do that. + */ + if (oldptehi & LPTE_BIG) + va = oldptehi >> moea64_large_page_shift; + else + va = oldptehi >> ADDR_PIDX_SHFT; + if (oldptehi & LPTE_HID) + va = (((k >> 3) ^ moea64_pteg_mask) ^ va) & + VSID_HASH_MASK; + else + va = ((k >> 3) ^ va) & VSID_HASH_MASK; + va |= (oldptehi & LPTE_AVPN_MASK) << + (ADDR_API_SHFT64 - ADDR_PIDX_SHFT); + PTESYNC(); + TLBIE(va); + moea64_pte_valid--; + moea64_pte_overflow++; } - + + /* + * Update the PTE as per "Adding a Page Table Entry". Lock is released + * by setting the high doubleword.
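The lwarx/stwcx. loop in atomic_pte_lock() above amounts to a conditional try-lock on the low 32 bits of pte_hi: if any of the caller-supplied bits are already set, the lock attempt fails (reporting the old contents through *oldhi); otherwise the low half is replaced outright with just the lock bit, which simultaneously clears LPTE_VALID. That is why the same primitive serves both the ordinary insert path and the "invalidate and seize" eviction path. A rough C11-atomics model of the same logic (EX_LOCKED is a stand-in for LPTE_LOCKED; illustration only, not the kernel's code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_LOCKED 0x40u

/*
 * Try to lock the (32-bit) word: fails without modifying it if any bit
 * in busy_mask is already set, otherwise replaces the contents with the
 * lock bit alone (so a formerly valid entry also becomes invalid).
 * The pre-existing value is reported through *old either way.
 */
static bool
ex_pte_trylock(_Atomic uint32_t *word, uint32_t busy_mask, uint32_t *old)
{
	uint32_t cur = atomic_load(word);

	for (;;) {
		*old = cur;
		if (cur & busy_mask)
			return (false);	/* e.g. valid, wired or locked */
		if (atomic_compare_exchange_weak(word, &cur, EX_LOCKED))
			return (true);
		/* cur was reloaded by the failed exchange; retry */
	}
}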
+ */ + pt->pte_lo = pvo_pt->pte_lo; + EIEIO(); + pt->pte_hi = pvo_pt->pte_hi; + PTESYNC(); + + /* Keep statistics */ + moea64_pte_valid++; + return (k); } static int -moea64_pte_insert_native(mmu_t mmu, u_int ptegidx, struct lpte *pvo_pt) +moea64_pte_insert_native(mmu_t mmu, struct pvo_entry *pvo) { - struct lpte *pt; - struct pvo_entry *pvo; - u_int pteg_bktidx; - int i; + struct lpte insertpt; + uintptr_t slot; + /* Initialize PTE */ + moea64_pte_from_pvo(pvo, &insertpt); + + /* Make sure further insertion is locked out during evictions */ + rw_rlock(&moea64_eviction_lock); + /* * First try primary hash. */ - pteg_bktidx = ptegidx; - for (pt = moea64_pteg_table[pteg_bktidx].pt, i = 0; i < 8; i++, pt++) { - if ((pt->pte_hi & (LPTE_VALID | LPTE_LOCKED)) == 0) { - pvo_pt->pte_hi &= ~LPTE_HID; - moea64_pte_set_native(pt, pvo_pt); - return (i); - } + pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */ + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_VALID | LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_runlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } /* * Now try secondary hash. */ - pteg_bktidx ^= moea64_pteg_mask; - for (pt = moea64_pteg_table[pteg_bktidx].pt, i = 0; i < 8; i++, pt++) { - if ((pt->pte_hi & (LPTE_VALID | LPTE_LOCKED)) == 0) { - pvo_pt->pte_hi |= LPTE_HID; - moea64_pte_set_native(pt, pvo_pt); - return (i); - } + pvo->pvo_vaddr ^= PVO_HID; + insertpt.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_VALID | LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_runlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } /* * Out of luck. Find a PTE to sacrifice. */ - pteg_bktidx = ptegidx; - i = moea64_pte_spillable_ident(pteg_bktidx); - if (i < 0) { - pteg_bktidx ^= moea64_pteg_mask; - i = moea64_pte_spillable_ident(pteg_bktidx); + + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&moea64_eviction_lock)) { + rw_runlock(&moea64_eviction_lock); + rw_wlock(&moea64_eviction_lock); } - if (i < 0) { - /* No freeable slots in either PTEG? We're hosed. */ - panic("moea64_pte_insert: overflow"); - return (-1); + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_wunlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } - if (pteg_bktidx == ptegidx) - pvo_pt->pte_hi &= ~LPTE_HID; - else - pvo_pt->pte_hi |= LPTE_HID; - - /* - * Synchronize the sacrifice PTE with its PVO, then mark both - * invalid. The PVO will be reused when/if the VM system comes - * here after a fault. - */ - pt = &moea64_pteg_table[pteg_bktidx].pt[i]; - - if (pt->pte_hi & LPTE_HID) - pteg_bktidx ^= moea64_pteg_mask; /* PTEs indexed by primary */ - - LIST_FOREACH(pvo, &moea64_pvo_table[pteg_bktidx], pvo_olink) { - if (pvo->pvo_pte.lpte.pte_hi == pt->pte_hi) { - KASSERT(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID, - ("Invalid PVO for valid PTE!")); - moea64_pte_unset_native(mmu, (uintptr_t)pt, - &pvo->pvo_pte.lpte, pvo->pvo_vpn); - PVO_PTEGIDX_CLR(pvo); - moea64_pte_overflow++; - break; - } + /* Try other hash table. Now we're getting desperate... 
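The insertion paths switch between the two candidate PTEGs by XOR-ing pvo_pte.slot with (moea64_pteg_mask << 3) and toggling the HID/PVO_HID bit. That works because, as in the va_to_pteg() this patch removes, the secondary hash is the complement of the primary hash within the table mask, and each PTEG owns eight consecutive slots. A short standalone illustration of the arithmetic (the mask and inputs below are made-up example values, not the kernel's):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t pteg_mask = 0xffff;		/* example: 65536 PTEGs */
	uint64_t vsid_hash = 0x123456789abcdefULL & 0x7fffffffffULL;
	uint64_t page_idx = 0x3c2a1;		/* VA bits 27..12 for 4K pages */
	uint64_t hash, primary, secondary;

	hash = vsid_hash ^ page_idx;
	primary = hash & pteg_mask;
	secondary = ~hash & pteg_mask;

	/* The two PTEGs are complements of one another within the mask... */
	assert((primary ^ pteg_mask) == secondary);
	/* ...so their 8-slot groups differ by exactly pteg_mask << 3. */
	assert(((primary << 3) ^ (pteg_mask << 3)) == (secondary << 3));
	return (0);
}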
*/ + pvo->pvo_vaddr ^= PVO_HID; + insertpt.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_wunlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } - KASSERT(pvo->pvo_pte.lpte.pte_hi == pt->pte_hi, - ("Unable to find PVO for spilled PTE")); - - /* - * Set the new PTE. - */ - moea64_pte_set_native(pt, pvo_pt); - - return (i); + /* No freeable slots in either PTEG? We're hosed. */ + rw_wunlock(&moea64_eviction_lock); + panic("moea64_pte_insert: overflow"); + return (-1); } Index: pseries/mmu_phyp.c =================================================================== --- pseries/mmu_phyp.c (.../head/sys/powerpc) (revision 279138) +++ pseries/mmu_phyp.c (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -59,6 +59,8 @@ extern int n_slbs; +static struct rwlock mphyp_eviction_lock; + /* * Kernel MMU interface */ @@ -66,19 +68,11 @@ static void mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend); static void mphyp_cpu_bootstrap(mmu_t mmup, int ap); -static void mphyp_pte_synch(mmu_t, uintptr_t pt, struct lpte *pvo_pt); -static void mphyp_pte_clear(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn, u_int64_t ptebit); -static void mphyp_pte_unset(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static void mphyp_pte_change(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static int mphyp_pte_insert(mmu_t, u_int ptegidx, struct lpte *pvo_pt); -static uintptr_t mphyp_pvo_to_pte(mmu_t, const struct pvo_entry *pvo); +static int64_t mphyp_pte_synch(mmu_t, struct pvo_entry *pvo); +static int64_t mphyp_pte_clear(mmu_t, struct pvo_entry *pvo, uint64_t ptebit); +static int64_t mphyp_pte_unset(mmu_t, struct pvo_entry *pvo); +static int mphyp_pte_insert(mmu_t, struct pvo_entry *pvo); -#define VSID_HASH_MASK 0x0000007fffffffffULL - - static mmu_method_t mphyp_methods[] = { MMUMETHOD(mmu_bootstrap, mphyp_bootstrap), MMUMETHOD(mmu_cpu_bootstrap, mphyp_cpu_bootstrap), @@ -86,16 +80,33 @@ MMUMETHOD(moea64_pte_synch, mphyp_pte_synch), MMUMETHOD(moea64_pte_clear, mphyp_pte_clear), MMUMETHOD(moea64_pte_unset, mphyp_pte_unset), - MMUMETHOD(moea64_pte_change, mphyp_pte_change), MMUMETHOD(moea64_pte_insert, mphyp_pte_insert), - MMUMETHOD(moea64_pvo_to_pte, mphyp_pvo_to_pte), + /* XXX: pmap_copy_page, pmap_init_page with H_PAGE_INIT */ + { 0, 0 } }; MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, 0, oea64_mmu); +static int brokenkvm = 0; + static void +print_kvm_bug_warning(void *data) +{ + + if (brokenkvm) + printf("WARNING: Running on a broken hypervisor that does " + "not support mandatory H_CLEAR_MOD and H_CLEAR_REF " + "hypercalls. 
Performance will be suboptimal.\n"); +} + +SYSINIT(kvmbugwarn1, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, + print_kvm_bug_warning, NULL); +SYSINIT(kvmbugwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_kvm_bug_warning, + NULL); + +static void mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { uint64_t final_pteg_count = 0; @@ -106,6 +117,8 @@ phandle_t dev, node, root; int idx, len, res; + rw_init(&mphyp_eviction_lock, "pte eviction"); + moea64_early_bootstrap(mmup, kernelstart, kernelend); root = OF_peer(0); @@ -185,6 +198,10 @@ moea64_mid_bootstrap(mmup, kernelstart, kernelend); moea64_late_bootstrap(mmup, kernelstart, kernelend); + + /* Test for broken versions of KVM that don't conform to the spec */ + if (phyp_hcall(H_CLEAR_MOD, 0, 0) == H_FUNCTION) + brokenkvm = 1; } static void @@ -209,72 +226,105 @@ } } -static void -mphyp_pte_synch(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt) +static int64_t +mphyp_pte_synch(mmu_t mmu, struct pvo_entry *pvo) { struct lpte pte; uint64_t junk; __asm __volatile("ptesync"); - phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pte.pte_hi, &pte.pte_lo, - &junk); + phyp_pft_hcall(H_READ, 0, pvo->pvo_pte.slot, 0, 0, &pte.pte_hi, + &pte.pte_lo, &junk); + if ((pte.pte_hi & LPTE_AVPN_MASK) != + ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & + LPTE_AVPN_MASK)) + return (-1); + if (!(pte.pte_hi & LPTE_VALID)) + return (-1); - pvo_pt->pte_lo |= pte.pte_lo & (LPTE_CHG | LPTE_REF); + return (pte.pte_lo & (LPTE_CHG | LPTE_REF)); } -static void -mphyp_pte_clear(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn, - u_int64_t ptebit) +static int64_t +mphyp_pte_clear(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit) { + int64_t refchg; + uint64_t ptelo, junk; + int err; - if (ptebit & LPTE_CHG) - phyp_hcall(H_CLEAR_MOD, 0, slot); - if (ptebit & LPTE_REF) - phyp_hcall(H_CLEAR_REF, 0, slot); + /* + * This involves two steps (synch and clear) so we need the entry + * not to change in the middle. We are protected against deliberate + * unset by virtue of holding the pmap lock. Protection against + * incidental unset (page table eviction) comes from holding the + * shared eviction lock. + */ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + rw_rlock(&mphyp_eviction_lock); + + refchg = mphyp_pte_synch(mmu, pvo); + if (refchg < 0) { + rw_runlock(&mphyp_eviction_lock); + return (refchg); + } + + if (brokenkvm) { + /* + * No way to clear either bit, which is total madness. + * Pessimistically claim that, once modified, it stays so + * forever and that it is never referenced. 
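The fallback for hypervisors that lack H_CLEAR_MOD/H_CLEAR_REF is deliberately asymmetric: under-reporting "referenced" only degrades page-replacement decisions, while under-reporting "changed" could lose data, so the change bit is left sticky and the reference bit is suppressed. A tiny pure-function model of that policy (the EX_* values are illustrative stand-ins for LPTE_REF/LPTE_CHG):

#include <stdint.h>

#define EX_REF 0x100LL
#define EX_CHG 0x080LL

/*
 * Model of the no-clear fallback: pass eviction (-1) through unchanged,
 * keep any harvested change bit (it can never be cleared, so it stays
 * set forever), and never report the reference bit.
 */
static int64_t
ex_refchg_no_clear(int64_t harvested)
{
	if (harvested < 0)
		return (harvested);
	return (harvested & ~EX_REF);
}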
+ */ + rw_runlock(&mphyp_eviction_lock); + return (refchg & ~LPTE_REF); + } + + if (ptebit & LPTE_CHG) { + err = phyp_pft_hcall(H_CLEAR_MOD, 0, pvo->pvo_pte.slot, 0, 0, + &ptelo, &junk, &junk); + KASSERT(err == H_SUCCESS, + ("Error clearing page change bit: %d", err)); + refchg |= (ptelo & LPTE_CHG); + } + if (ptebit & LPTE_REF) { + err = phyp_pft_hcall(H_CLEAR_REF, 0, pvo->pvo_pte.slot, 0, 0, + &ptelo, &junk, &junk); + KASSERT(err == H_SUCCESS, + ("Error clearing page reference bit: %d", err)); + refchg |= (ptelo & LPTE_REF); + } + + rw_runlock(&mphyp_eviction_lock); + + return (refchg); } -static void -mphyp_pte_unset(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) +static int64_t +mphyp_pte_unset(mmu_t mmu, struct pvo_entry *pvo) { struct lpte pte; uint64_t junk; int err; - pvo_pt->pte_hi &= ~LPTE_VALID; - err = phyp_pft_hcall(H_REMOVE, 1UL << 31, slot, - pvo_pt->pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo, - &junk); - KASSERT(err == H_SUCCESS, ("Error removing page: %d", err)); + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); - pvo_pt->pte_lo |= pte.pte_lo & (LPTE_CHG | LPTE_REF); -} + moea64_pte_from_pvo(pvo, &pte); -static void -mphyp_pte_change(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) -{ - struct lpte evicted; - uint64_t index, junk; - int64_t result; + err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot, + pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo, + &junk); + KASSERT(err == H_SUCCESS || err == H_NOT_FOUND, + ("Error removing page: %d", err)); - /* - * NB: this is protected by the global table lock, so this two-step - * is safe, except for the scratch-page case. No CPUs on which we run - * this code should be using scratch pages. - */ - KASSERT(!(pvo_pt->pte_hi & LPTE_LOCKED), - ("Locked pages not supported on PHYP")); + if (err == H_NOT_FOUND) { + moea64_pte_overflow--; + return (-1); + } - /* XXX: optimization using H_PROTECT for common case? 
*/ - mphyp_pte_unset(mmu, slot, pvo_pt, vpn); - pvo_pt->pte_hi |= LPTE_VALID; - result = phyp_pft_hcall(H_ENTER, H_EXACT, slot, pvo_pt->pte_hi, - pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); - if (result != H_SUCCESS) - panic("mphyp_pte_change() insertion failure: %ld\n", result); + return (pte.pte_lo & (LPTE_REF | LPTE_CHG)); } -static __inline int -mphyp_pte_spillable_ident(u_int ptegidx, struct lpte *to_evict) +static uintptr_t +mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict) { uint64_t slot, junk, k; struct lpte pt; @@ -284,9 +334,9 @@ i = mftb() % 8; k = -1; for (j = 0; j < 8; j++) { - slot = (ptegidx << 3) + (i + j) % 8; - phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi, &pt.pte_lo, - &junk); + slot = ptegbase + (i + j) % 8; + phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi, + &pt.pte_lo, &junk); if (pt.pte_hi & LPTE_WIRED) continue; @@ -295,7 +345,7 @@ k = slot; /* Try to get a page that has not been used lately */ - if (!(pt.pte_lo & LPTE_REF)) { + if (!(pt.pte_hi & LPTE_VALID) || !(pt.pte_lo & LPTE_REF)) { memcpy(to_evict, &pt, sizeof(struct lpte)); return (k); } @@ -310,44 +360,50 @@ } static int -mphyp_pte_insert(mmu_t mmu, u_int ptegidx, struct lpte *pvo_pt) +mphyp_pte_insert(mmu_t mmu, struct pvo_entry *pvo) { int64_t result; - struct lpte evicted; - struct pvo_entry *pvo; - uint64_t index, junk; - u_int pteg_bktidx; + struct lpte evicted, pte; + uint64_t index, junk, lastptelo; - /* Check for locked pages, which we can't support on this system */ - KASSERT(!(pvo_pt->pte_hi & LPTE_LOCKED), - ("Locked pages not supported on PHYP")); + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); /* Initialize PTE */ - pvo_pt->pte_hi |= LPTE_VALID; - pvo_pt->pte_hi &= ~LPTE_HID; + moea64_pte_from_pvo(pvo, &pte); evicted.pte_hi = 0; + /* Make sure further insertion is locked out during evictions */ + rw_rlock(&mphyp_eviction_lock); + /* * First try primary hash. */ - pteg_bktidx = ptegidx; - result = phyp_pft_hcall(H_ENTER, 0, pteg_bktidx << 3, pvo_pt->pte_hi, - pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); - if (result == H_SUCCESS) - return (index & 0x07); + pvo->pvo_pte.slot &= ~7UL; /* Base slot address */ + result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi, + pte.pte_lo, &index, &evicted.pte_lo, &junk); + if (result == H_SUCCESS) { + rw_runlock(&mphyp_eviction_lock); + pvo->pvo_pte.slot = index; + return (0); + } KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld " - "(ptegidx: %#x/%#x, PTE %#lx/%#lx", result, ptegidx, - moea64_pteg_count, pvo_pt->pte_hi, pvo_pt->pte_lo)); + "(ptegidx: %#zx/%#x, PTE %#lx/%#lx", result, pvo->pvo_pte.slot, + moea64_pteg_count, pte.pte_hi, pte.pte_lo)); /* * Next try secondary hash. */ - pteg_bktidx ^= moea64_pteg_mask; - pvo_pt->pte_hi |= LPTE_HID; - result = phyp_pft_hcall(H_ENTER, 0, pteg_bktidx << 3, - pvo_pt->pte_hi, pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); - if (result == H_SUCCESS) - return (index & 0x07); + pvo->pvo_vaddr ^= PVO_HID; + pte.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + + result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, + pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk); + if (result == H_SUCCESS) { + rw_runlock(&mphyp_eviction_lock); + pvo->pvo_pte.slot = index; + return (0); + } KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld", result)); @@ -354,93 +410,51 @@ /* * Out of luck. Find a PTE to sacrifice. 
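mphyp_pte_spillable_ident() picks the sacrifice by scanning the eight slots of a PTEG from a pseudo-random starting point, skipping wired entries and preferring a slot that is either invalid or has not been referenced recently, with any non-wired slot kept as a fallback. A standalone model of that selection (the EX_* bits stand in for LPTE_VALID, LPTE_WIRED and LPTE_REF, which in the real PTE live in two different words):

#include <stdint.h>

#define EX_VALID 0x001u
#define EX_WIRED 0x010u
#define EX_REF   0x100u

/* flags[] holds per-slot state bits; returns a slot index, or -1. */
static int
ex_pick_victim(const uint32_t flags[8], unsigned start)
{
	int fallback = -1;
	unsigned i, slot;

	for (i = 0; i < 8; i++) {
		slot = (start + i) % 8;
		if (flags[slot] & EX_WIRED)
			continue;		/* never evict wired entries */
		fallback = (int)slot;		/* usable, but maybe active */
		if (!(flags[slot] & EX_VALID) || !(flags[slot] & EX_REF))
			return ((int)slot);	/* empty or not recently used */
	}
	return (fallback);
}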
*/ - pteg_bktidx = ptegidx; - index = mphyp_pte_spillable_ident(pteg_bktidx, &evicted); + + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&mphyp_eviction_lock)) { + rw_runlock(&mphyp_eviction_lock); + rw_wlock(&mphyp_eviction_lock); + } + + index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted); if (index == -1L) { - pteg_bktidx ^= moea64_pteg_mask; - index = mphyp_pte_spillable_ident(pteg_bktidx, &evicted); + /* Try other hash table? */ + pvo->pvo_vaddr ^= PVO_HID; + pte.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted); } if (index == -1L) { /* No freeable slots in either PTEG? We're hosed. */ + rw_wunlock(&mphyp_eviction_lock); panic("mphyp_pte_insert: overflow"); return (-1); } - if (pteg_bktidx == ptegidx) - pvo_pt->pte_hi &= ~LPTE_HID; - else - pvo_pt->pte_hi |= LPTE_HID; - - /* - * Synchronize the sacrifice PTE with its PVO, then mark both - * invalid. The PVO will be reused when/if the VM system comes - * here after a fault. - */ - - if (evicted.pte_hi & LPTE_HID) - pteg_bktidx ^= moea64_pteg_mask; /* PTEs indexed by primary */ - - LIST_FOREACH(pvo, &moea64_pvo_table[pteg_bktidx], pvo_olink) { - if (pvo->pvo_pte.lpte.pte_hi == evicted.pte_hi) { - KASSERT(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID, - ("Invalid PVO for valid PTE!")); - mphyp_pte_unset(mmu, index, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); - PVO_PTEGIDX_CLR(pvo); - moea64_pte_overflow++; - break; - } + /* Victim acquired: update page before waving goodbye */ + if (evicted.pte_hi & LPTE_VALID) { + result = phyp_pft_hcall(H_REMOVE, H_AVPN, index, + evicted.pte_hi & LPTE_AVPN_MASK, 0, &junk, &lastptelo, + &junk); + moea64_pte_overflow++; + KASSERT(result == H_SUCCESS, + ("Error evicting page: %d", (int)result)); } - KASSERT((pvo->pvo_pte.lpte.pte_hi | LPTE_VALID) == evicted.pte_hi, - ("Unable to find PVO for spilled PTE")); - /* * Set the new PTE. */ - result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pvo_pt->pte_hi, - pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); + result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi, + pte.pte_lo, &index, &evicted.pte_lo, &junk); + rw_wunlock(&mphyp_eviction_lock); /* All clear */ + + pvo->pvo_pte.slot = index; if (result == H_SUCCESS) - return (index & 0x07); + return (0); panic("Page replacement error: %ld", result); - return (-1); + return (result); } -static __inline u_int -va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) -{ - uint64_t hash; - int shift; - - shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); -} - -static uintptr_t -mphyp_pvo_to_pte(mmu_t mmu, const struct pvo_entry *pvo) -{ - uint64_t vsid; - u_int ptegidx; - - /* If the PTEG index is not set, then there is no page table entry */ - if (!PVO_PTEGIDX_ISSET(pvo)) - return (-1); - - vsid = PVO_VSID(pvo); - ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); - - /* - * We can find the actual pte entry without searching by grabbing - * the PTEG index from 3 unused bits in pvo_vaddr and by - * noticing the HID bit. - */ - if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; - - return ((ptegidx << 3) | PVO_PTEGIDX_GET(pvo)); -} - Index: . =================================================================== --- . (.../head/sys/powerpc) (revision 279138) +++ . (.../user/nwhitehorn/ppc64-pmap-rework) (revision 279138) Property changes on: . 
___________________________________________________________________ Added: svn:mergeinfo Merged /user/peter/kinfo/sys/powerpc:r185413-185547 Merged /projects/largeSMP/sys/powerpc:r221273-222812,222815-223757 Merged /user/dfr/xenhvm/6/sys/powerpc:r189304,189451 Merged /user/dfr/xenhvm/7/sys/powerpc:r188574-189614 Merged /user/np/cxl_tuning/sys/powerpc:r254336,254386,254736 Merged /user/mav/ata/sys/powerpc:r189793-190578 Merged /projects/clang350-import/sys/powerpc:r274961-276476 Merged /user/alfred/9-alfred/sys/powerpc:r242488 Merged /user/thompsa/usb/sys/powerpc:r187190 Merged /user/ed/newcons/sys/powerpc:r219886-259015 Merged /projects/head_mfi/sys/powerpc:r227068,227574,227579-227580,227612,227905,228108,228208,228279,228310,228320,231988,232412-232414,232888,233016,233620 Merged /projects/cambria/sys/powerpc:r186008-186350 Merged /projects/random_number_generator/sys/powerpc:r254613-256400 Merged /user/piso/sys/powerpc:r186543,186723,186725-186726,186742,186770-186771,186774,186777-186779,187984-187985,190555,190572,190589,190592,190614,190625,190830 Merged /projects/quota64/sys/powerpc:r184125-207707 Merged /user/jimharris/isci/sys/powerpc:r228377-230794 Merged /projects/clang-sparc64/sys/powerpc:r262258-262612 Merged /user/piso/ipfw/sys/powerpc:r190918,190921,190923,190926 Merged /projects/multi-fibv6/head/sys/powerpc:r230929-231848 Merged /projects/ipfw/sys/powerpc:r267383-272837 Merged /projects/pf/head/sys/powerpc:r251993,263908,264198 Merged /head/sys/powerpc:r278877-279136