Index: conf/files =================================================================== --- conf/files (.../head/sys) (revision 248227) +++ conf/files (.../user/attilio/vmcontention/sys) (revision 248227) @@ -3630,6 +3630,7 @@ vm/vm_page.c standard vm/vm_pageout.c standard vm/vm_pager.c standard vm/vm_phys.c standard +vm/vm_radix.c standard vm/vm_reserv.c standard vm/vm_unix.c standard vm/vm_zeroidle.c standard Index: amd64/include/pmap.h =================================================================== --- amd64/include/pmap.h (.../head/sys) (revision 248227) +++ amd64/include/pmap.h (.../user/attilio/vmcontention/sys) (revision 248227) @@ -150,6 +150,8 @@ #include #include +#include + typedef u_int64_t pd_entry_t; typedef u_int64_t pt_entry_t; typedef u_int64_t pdp_entry_t; @@ -250,7 +252,7 @@ struct pmap { cpuset_t pm_active; /* active on cpus */ /* spare u_int here due to padding */ struct pmap_statistics pm_stats; /* pmap statistics */ - vm_page_t pm_root; /* spare page table pages */ + struct vm_radix pm_root; /* spare page table pages */ }; typedef struct pmap *pmap_t; Index: amd64/amd64/pmap.c =================================================================== --- amd64/amd64/pmap.c (.../head/sys) (revision 248227) +++ amd64/amd64/pmap.c (.../user/attilio/vmcontention/sys) (revision 248227) @@ -131,6 +131,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -1497,7 +1498,8 @@ pmap_free_zero_pages(vm_page_t free) while (free != NULL) { m = free; - free = m->right; + free = (void *)m->object; + m->object = NULL; /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } @@ -1516,7 +1518,7 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; - m->right = *free; + m->object = (void *)*free; *free = m; } @@ -1526,31 +1528,12 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ -static void +static __inline void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { - vm_page_t root; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - root = pmap->pm_root; - if (root == NULL) { - mpte->left = NULL; - mpte->right = NULL; - } else { - root = vm_page_splay(mpte->pindex, root); - if (mpte->pindex < root->pindex) { - mpte->left = root->left; - mpte->right = root; - root->left = NULL; - } else if (mpte->pindex == root->pindex) - panic("pmap_insert_pt_page: pindex already inserted"); - else { - mpte->right = root->right; - mpte->left = root; - root->right = NULL; - } - } - pmap->pm_root = mpte; + vm_radix_insert(&pmap->pm_root, mpte->pindex, mpte); } /* @@ -1558,19 +1541,12 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) * specified pmap's collection of idle page table pages. Returns NULL if there * is no page table page corresponding to the specified virtual address. */ -static vm_page_t +static __inline vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) { - vm_page_t mpte; - vm_pindex_t pindex = pmap_pde_pindex(va); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { - mpte = vm_page_splay(pindex, mpte); - if ((pmap->pm_root = mpte)->pindex != pindex) - mpte = NULL; - } - return (mpte); + return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); } /* @@ -1578,25 +1554,12 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) * of idle page table pages. The specified page table page must be a member of * the pmap's collection. 
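The pmap_free_zero_pages()/pmap_add_delayed_free_list() hunks above thread the delayed free list through the page's temporarily unused object pointer, since the m->right splay link they used to borrow no longer exists. A standalone userland sketch of that idiom, reusing an existing pointer field as the free-list link and clearing it before release; the struct and field names below are illustrative stand-ins, not the kernel's.

#include <stdio.h>
#include <stddef.h>

struct page {                   /* stand-in for struct vm_page */
    void *object;               /* reused as the list link while queued */
    int   id;
};

/* Queue a page on the delayed free list, as pmap_add_delayed_free_list(). */
static void
delayed_free(struct page *p, struct page **freelist)
{
    p->object = (void *)*freelist;      /* link through the spare field */
    *freelist = p;
}

/* Drain the list, as pmap_free_zero_pages(): unlink before releasing. */
static void
free_all(struct page *freelist)
{
    struct page *p;

    while (freelist != NULL) {
        p = freelist;
        freelist = (struct page *)p->object;
        p->object = NULL;               /* clear the borrowed field */
        printf("freeing page %d\n", p->id);
    }
}

int
main(void)
{
    struct page a = { NULL, 1 }, b = { NULL, 2 };
    struct page *freelist = NULL;

    delayed_free(&a, &freelist);
    delayed_free(&b, &freelist);
    free_all(freelist);                 /* prints page 2, then page 1 */
    return (0);
}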
*/ -static void +static __inline void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) { - vm_page_t root; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if (mpte != pmap->pm_root) { - root = vm_page_splay(mpte->pindex, pmap->pm_root); - KASSERT(mpte == root, - ("pmap_remove_pt_page: mpte %p is missing from pmap %p", - mpte, pmap)); - } - if (mpte->left == NULL) - root = mpte->right; - else { - root = vm_page_splay(mpte->pindex, mpte->left); - root->right = mpte->right; - } - pmap->pm_root = root; + vm_radix_remove(&pmap->pm_root, mpte->pindex); } /* @@ -1693,7 +1656,7 @@ pmap_pinit0(pmap_t pmap) PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); - pmap->pm_root = NULL; + pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); @@ -1734,7 +1697,7 @@ pmap_pinit(pmap_t pmap) /* install self-referential address mapping entry(s) */ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; - pmap->pm_root = NULL; + pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); @@ -1976,7 +1939,7 @@ pmap_release(pmap_t pmap) KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); - KASSERT(pmap->pm_root == NULL, + KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); @@ -2273,7 +2236,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock } if (m_pc == NULL && free != NULL) { m_pc = free; - free = m_pc->right; + free = (void *)m_pc->object; /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&cnt.v_wire_count, 1); Index: vm/vm_reserv.c =================================================================== --- vm/vm_reserv.c (.../head/sys) (revision 248227) +++ vm/vm_reserv.c (.../user/attilio/vmcontention/sys) (revision 248227) @@ -57,6 +57,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include /* @@ -342,33 +343,22 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pind /* * Look for an existing reservation. 
*/ - msucc = NULL; - mpred = object->root; - while (mpred != NULL) { - KASSERT(mpred->pindex != pindex, + mpred = vm_radix_lookup_le(&object->rtree, pindex); + if (mpred != NULL) { + KASSERT(mpred->pindex < pindex, ("vm_reserv_alloc_contig: pindex already allocated")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; - else if (mpred->pindex < pindex) { - if (msucc != NULL || - (msucc = TAILQ_NEXT(mpred, listq)) == NULL) - break; - KASSERT(msucc->pindex != pindex, - ("vm_reserv_alloc_contig: pindex already allocated")); - rv = vm_reserv_from_page(msucc); - if (rv->object == object && - vm_reserv_has_pindex(rv, pindex)) - goto found; - else if (pindex < msucc->pindex) - break; - } else if (msucc == NULL) { - msucc = mpred; - mpred = TAILQ_PREV(msucc, pglist, listq); - continue; - } - msucc = NULL; - mpred = object->root = vm_page_splay(pindex, object->root); + msucc = TAILQ_NEXT(mpred, listq); + } else + msucc = vm_radix_lookup_ge(&object->rtree, pindex); + if (msucc != NULL) { + KASSERT(msucc->pindex > pindex, + ("vm_reserv_alloc_page: pindex already allocated")); + rv = vm_reserv_from_page(msucc); + if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) + goto found; } /* @@ -508,33 +498,22 @@ vm_reserv_alloc_page(vm_object_t object, vm_pindex /* * Look for an existing reservation. */ - msucc = NULL; - mpred = object->root; - while (mpred != NULL) { - KASSERT(mpred->pindex != pindex, + mpred = vm_radix_lookup_le(&object->rtree, pindex); + if (mpred != NULL) { + KASSERT(mpred->pindex < pindex, ("vm_reserv_alloc_page: pindex already allocated")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; - else if (mpred->pindex < pindex) { - if (msucc != NULL || - (msucc = TAILQ_NEXT(mpred, listq)) == NULL) - break; - KASSERT(msucc->pindex != pindex, - ("vm_reserv_alloc_page: pindex already allocated")); - rv = vm_reserv_from_page(msucc); - if (rv->object == object && - vm_reserv_has_pindex(rv, pindex)) - goto found; - else if (pindex < msucc->pindex) - break; - } else if (msucc == NULL) { - msucc = mpred; - mpred = TAILQ_PREV(msucc, pglist, listq); - continue; - } - msucc = NULL; - mpred = object->root = vm_page_splay(pindex, object->root); + msucc = TAILQ_NEXT(mpred, listq); + } else + msucc = vm_radix_lookup_ge(&object->rtree, pindex); + if (msucc != NULL) { + KASSERT(msucc->pindex > pindex, + ("vm_reserv_alloc_page: pindex already allocated")); + rv = vm_reserv_from_page(msucc); + if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) + goto found; } /* Index: vm/vm_object.c =================================================================== --- vm/vm_object.c (.../head/sys) (revision 248227) +++ vm/vm_object.c (.../user/attilio/vmcontention/sys) (revision 248227) @@ -94,6 +94,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -167,8 +168,8 @@ vm_object_zdtor(void *mem, int size, void *arg) object = (vm_object_t)mem; KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); - KASSERT(object->root == NULL, - ("object %p has resident pages in its tree", object)); + KASSERT(vm_radix_is_empty(&object->rtree), + ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", @@ -199,11 +200,11 @@ vm_object_zinit(void *mem, int size, int flags) rw_init_flags(&object->lock, "vm object", RW_DUPOK); /* These are true for any object 
that has been freed */ - object->root = NULL; + object->rtree.rt_root = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; - object->cache = NULL; + object->cache.rt_root = 0; return (0); } @@ -295,6 +296,8 @@ vm_object_init(void) NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); + + vm_radix_init(); } void @@ -742,7 +745,7 @@ vm_object_terminate(vm_object_t object) * modified by the preceding loop. */ if (object->resident_page_count != 0) { - object->root = NULL; + vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) Index: vm/vm_radix.c =================================================================== --- vm/vm_radix.c (.../head/sys) (revision 0) +++ vm/vm_radix.c (.../user/attilio/vmcontention/sys) (revision 248227) @@ -0,0 +1,752 @@ +/* + * Copyright (c) 2013 EMC Corp. + * Copyright (c) 2011 Jeffrey Roberson + * Copyright (c) 2008 Mayur Shardul + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * Path-compressed radix trie implementation. + * The following code is not generalized into a general purpose library + * because there are way too many parameters embedded that should really + * be decided by the library consumers. At the same time, consumers + * of this code must achieve highest possible performance. + * + * The implementation takes into account the following rationale: + * - Size of the nodes should be as small as possible. + * - There is no bias toward lookup operations over inserts or removes, + * and vice-versa. + * - On average not many nodes are expected to be full, hence level + * compression may just complicate things. + */ + +#include + +#include "opt_ddb.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +/* + * These widths should allow the pointers to a node's children to fit within + * a single cache line. The extra levels from a narrow width should not be + * a problem thanks to path compression. 
+ */ +#ifdef __LP64__ +#define VM_RADIX_WIDTH 4 +#else +#define VM_RADIX_WIDTH 3 +#endif + +#define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH) +#define VM_RADIX_MASK (VM_RADIX_COUNT - 1) +#define VM_RADIX_LIMIT \ + (howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH) - 1) + +/* Flag bits stored in node pointers. */ +#define VM_RADIX_ISLEAF 0x1 +#define VM_RADIX_FLAGS 0x1 +#define VM_RADIX_PAD VM_RADIX_FLAGS + +/* Returns one unit associated with specified level. */ +#define VM_RADIX_UNITLEVEL(lev) \ + ((vm_pindex_t)1 << ((VM_RADIX_LIMIT - (lev)) * VM_RADIX_WIDTH)) + +struct vm_radix_node { + void *rn_child[VM_RADIX_COUNT]; /* Child nodes. */ + vm_pindex_t rn_owner; /* Owner of record. */ + uint16_t rn_count; /* Valid children. */ + uint16_t rn_clev; /* Current level. */ +}; + +static uma_zone_t vm_radix_node_zone; + +/* + * Allocate a radix node. Pre-allocation should ensure that the request + * will always be satisfied. + */ +static __inline struct vm_radix_node * +vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel) +{ + struct vm_radix_node *rnode; + + rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO); + + /* + * The required number of nodes should already be pre-allocated + * by vm_radix_prealloc(). However, UMA can hold a few nodes + * in per-CPU buckets, which will not be accessible by the + * current CPU. Thus, the allocation could return NULL when + * the pre-allocated pool is close to exhaustion. Anyway, + * in practice this should never occur because a new node + * is not always required for insert. Thus, the pre-allocated + * pool should have some extra pages that prevent this from + * becoming a problem. + */ + if (rnode == NULL) + panic("%s: uma_zalloc() returned NULL for a new node", + __func__); + rnode->rn_owner = owner; + rnode->rn_count = count; + rnode->rn_clev = clevel; + return (rnode); +} + +/* + * Free radix node. + */ +static __inline void +vm_radix_node_put(struct vm_radix_node *rnode) +{ + + uma_zfree(vm_radix_node_zone, rnode); +} + +/* + * Return the position in the array for a given level. + */ +static __inline int +vm_radix_slot(vm_pindex_t index, uint16_t level) +{ + + return ((index >> ((VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH)) & + VM_RADIX_MASK); +} + +/* Trims the key after the specified level. */ +static __inline vm_pindex_t +vm_radix_trimkey(vm_pindex_t index, uint16_t level) +{ + vm_pindex_t ret; + + ret = index; + if (level < VM_RADIX_LIMIT) { + ret >>= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH; + ret <<= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH; + } + return (ret); +} + +/* + * Get the root node for a radix tree. + */ +static __inline struct vm_radix_node * +vm_radix_getroot(struct vm_radix *rtree) +{ + + return ((struct vm_radix_node *)(rtree->rt_root & ~VM_RADIX_FLAGS)); +} + +/* + * Set the root node for a radix tree. + */ +static __inline void +vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode) +{ + + rtree->rt_root = (uintptr_t)rnode; +} + +/* + * Returns the associated page extracted from rnode if available, + * and NULL otherwise. + */ +static __inline vm_page_t +vm_radix_node_page(struct vm_radix_node *rnode) +{ + + return ((((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0) ? + (vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS) : NULL); +} + +/* + * Adds the page as a child of the provided node. 
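A child slot in the node above holds either another node or a page, and the two are told apart by the low bit of the stored pointer (VM_RADIX_ISLEAF), which is always clear for a sufficiently aligned allocation. A standalone sketch of that pointer-tagging trick; the leaf type and helper names are illustrative, not the kernel's.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ISLEAF  0x1             /* low tag bit, as VM_RADIX_ISLEAF */
#define FLAGS   0x1             /* as VM_RADIX_FLAGS */

struct leaf { int value; };     /* stand-in for a vm_page */

/* Tag a leaf pointer before storing it in a child slot. */
static void *
mkleaf(struct leaf *l)
{
    return ((void *)((uintptr_t)l | ISLEAF));
}

/* As vm_radix_node_page(): recover the leaf, or NULL for internal nodes. */
static struct leaf *
asleaf(void *child)
{
    if (((uintptr_t)child & ISLEAF) != 0)
        return ((struct leaf *)((uintptr_t)child & ~(uintptr_t)FLAGS));
    return (NULL);
}

int
main(void)
{
    struct leaf *l = malloc(sizeof(*l));
    void *child;

    l->value = 42;
    child = mkleaf(l);          /* tagged pointer stored in the slot */
    printf("leaf value: %d\n", asleaf(child)->value);
    free(l);
    return (0);
}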
+ */ +static __inline void +vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev, + vm_page_t page) +{ + int slot; + + slot = vm_radix_slot(index, clev); + rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF); +} + +/* + * Returns the slot where two keys differ. + * It cannot accept 2 equal keys. + */ +static __inline uint16_t +vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2) +{ + uint16_t clev; + + KASSERT(index1 != index2, ("%s: passing the same key value %jx", + __func__, (uintmax_t)index1)); + + index1 ^= index2; + for (clev = 0; clev <= VM_RADIX_LIMIT ; clev++) + if (vm_radix_slot(index1, clev)) + return (clev); + panic("%s: cannot reach this point", __func__); + return (0); +} + +/* + * Returns TRUE if it can be determined that key does not belong to the + * specified rnode. Otherwise, returns FALSE. + */ +static __inline boolean_t +vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx) +{ + + if (rnode->rn_clev > 0) { + idx = vm_radix_trimkey(idx, rnode->rn_clev - 1); + idx -= rnode->rn_owner; + if (idx != 0) + return (TRUE); + } + return (FALSE); +} + +/* + * Adjusts the idx key to the first upper level available, based on a valid + * initial level and map of available levels. + * Returns a value bigger than 0 to signal that there are not valid levels + * available. + */ +static __inline int +vm_radix_addlev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev) +{ + vm_pindex_t wrapidx; + + for (; levels[ilev] == FALSE || + vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1); ilev--) + if (ilev == 0) + break; + KASSERT(ilev > 0 || levels[0], + ("%s: levels back-scanning problem", __func__)); + if (ilev == 0 && vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1)) + return (1); + wrapidx = *idx; + *idx = vm_radix_trimkey(*idx, ilev); + *idx += VM_RADIX_UNITLEVEL(ilev); + return (*idx < wrapidx); +} + +/* + * Adjusts the idx key to the first lower level available, based on a valid + * initial level and map of available levels. + * Returns a value bigger than 0 to signal that there are not valid levels + * available. + */ +static __inline int +vm_radix_declev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev) +{ + vm_pindex_t wrapidx; + + for (; levels[ilev] == FALSE || + vm_radix_slot(*idx, ilev) == 0; ilev--) + if (ilev == 0) + break; + KASSERT(ilev > 0 || levels[0], + ("%s: levels back-scanning problem", __func__)); + if (ilev == 0 && vm_radix_slot(*idx, ilev) == 0) + return (1); + wrapidx = *idx; + *idx = vm_radix_trimkey(*idx, ilev); + *idx |= VM_RADIX_UNITLEVEL(ilev) - 1; + *idx -= VM_RADIX_UNITLEVEL(ilev); + return (*idx > wrapidx); +} + +/* + * Internal helper for vm_radix_reclaim_allnodes(). + * This function is recursive. + */ +static void +vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode) +{ + int slot; + + for (slot = 0; slot < VM_RADIX_COUNT && rnode->rn_count != 0; slot++) { + if (rnode->rn_child[slot] == NULL) + continue; + if (vm_radix_node_page(rnode->rn_child[slot]) == NULL) + vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]); + rnode->rn_count--; + } + vm_radix_node_put(rnode); +} + +#ifdef INVARIANTS +/* + * Radix node zone destructor. + */ +static void +vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused) +{ + struct vm_radix_node *rnode; + + rnode = mem; + KASSERT(rnode->rn_count == 0, + ("vm_radix_node_put: Freeing node %p with %d children\n", mem, + rnode->rn_count)); +} +#endif + +/* + * Pre-allocate intermediate nodes from the UMA slab zone. 
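The helpers above carry the path compression: a node records the common prefix of everything below it in rn_owner, meaningful down to rn_clev, so vm_radix_keybarr() can reject a key with one trimmed comparison instead of walking the skipped levels, and vm_radix_keydiff() finds the level where a new intermediate node must sit. A self-contained sketch of the same arithmetic over plain uint64_t keys, assuming the LP64 parameters (4-bit slots, 16 levels); names and values are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define WIDTH   4                       /* VM_RADIX_WIDTH on LP64 */
#define LIMIT   (64 / WIDTH - 1)        /* VM_RADIX_LIMIT == 15 */
#define MASK    ((1 << WIDTH) - 1)

/* Same computation as vm_radix_slot(): the level'th nibble, MSB first. */
static int
slot(uint64_t index, int level)
{
    return ((index >> ((LIMIT - level) * WIDTH)) & MASK);
}

/* Same computation as vm_radix_trimkey(): zero everything below 'level'. */
static uint64_t
trimkey(uint64_t index, int level)
{
    if (level < LIMIT) {
        index >>= (LIMIT - level) * WIDTH;
        index <<= (LIMIT - level) * WIDTH;
    }
    return (index);
}

/* First level at which two distinct keys differ, as vm_radix_keydiff(). */
static int
keydiff(uint64_t k1, uint64_t k2)
{
    int lev;

    k1 ^= k2;
    for (lev = 0; lev <= LIMIT; lev++)
        if (slot(k1, lev) != 0)
            return (lev);
    return (-1);                /* unreachable for distinct keys */
}

/*
 * Prefix test corresponding to vm_radix_keybarr(): true if 'key' cannot
 * live under a node whose owner and level are 'owner' and 'clev'.
 */
static bool
keybarr(uint64_t owner, int clev, uint64_t key)
{
    if (clev > 0)
        return (trimkey(key, clev - 1) != owner);
    return (false);
}

int
main(void)
{
    uint64_t a = 0x1234, b = 0x1239, c = 0x5000;
    int clev = keydiff(a, b);                   /* first differing nibble */
    uint64_t owner = trimkey(a, clev - 1);      /* shared prefix */

    printf("keys differ at level %d, owner %#jx\n", clev, (uintmax_t)owner);
    printf("can 0x5000 live under that node? %s\n",
        keybarr(owner, clev, c) ? "no" : "yes");
    return (0);
}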
+ */ +static void +vm_radix_prealloc(void *arg __unused) +{ + + if (!uma_zone_reserve_kva(vm_radix_node_zone, cnt.v_page_count)) + panic("%s: unable to create new zone", __func__); + uma_prealloc(vm_radix_node_zone, cnt.v_page_count); +} +SYSINIT(vm_radix_prealloc, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_prealloc, + NULL); + +/* + * Initialize the UMA slab zone. + * Until vm_radix_prealloc() is called, the zone will be served by the + * UMA boot-time pre-allocated pool of pages. + */ +void +vm_radix_init(void) +{ + + vm_radix_node_zone = uma_zcreate("RADIX NODE", + sizeof(struct vm_radix_node), NULL, +#ifdef INVARIANTS + vm_radix_node_zone_dtor, +#else + NULL, +#endif + NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_NOFREE); +} + +/* + * Inserts the key-value pair into the trie. + * Panics if the key already exists. + */ +void +vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, vm_page_t page) +{ + vm_pindex_t newind; + struct vm_radix_node *rnode, *tmp, *tmp2; + vm_page_t m; + int slot; + uint16_t clev; + + KASSERT(index == page->pindex, ("%s: index != page->pindex", + __func__)); + + /* + * The owner of record for root is not really important because it + * will never be used. + */ + rnode = vm_radix_getroot(rtree); + if (rnode == NULL) { + rnode = vm_radix_node_get(0, 1, 0); + vm_radix_setroot(rtree, rnode); + vm_radix_addpage(rnode, index, 0, page); + return; + } + while (rnode != NULL) { + if (vm_radix_keybarr(rnode, index)) + break; + slot = vm_radix_slot(index, rnode->rn_clev); + m = vm_radix_node_page(rnode->rn_child[slot]); + if (m != NULL) { + if (m->pindex == index) + panic("%s: key %jx is already present", + __func__, (uintmax_t)index); + clev = vm_radix_keydiff(m->pindex, index); + tmp = vm_radix_node_get(vm_radix_trimkey(index, + clev - 1), 2, clev); + rnode->rn_child[slot] = tmp; + vm_radix_addpage(tmp, index, clev, page); + vm_radix_addpage(tmp, m->pindex, clev, m); + return; + } + if (rnode->rn_child[slot] == NULL) { + rnode->rn_count++; + vm_radix_addpage(rnode, index, rnode->rn_clev, page); + return; + } + rnode = rnode->rn_child[slot]; + } + if (rnode == NULL) + panic("%s: path traversal ended unexpectedly", __func__); + + /* + * Scan the trie from the top and find the parent to insert + * the new object. + */ + newind = rnode->rn_owner; + clev = vm_radix_keydiff(newind, index); + slot = VM_RADIX_COUNT; + for (rnode = vm_radix_getroot(rtree); ; rnode = tmp) { + KASSERT(rnode != NULL, ("%s: edge cannot be NULL in the scan", + __func__)); + KASSERT(clev >= rnode->rn_clev, + ("%s: unexpected trie depth: clev: %d, rnode->rn_clev: %d", + __func__, clev, rnode->rn_clev)); + slot = vm_radix_slot(index, rnode->rn_clev); + tmp = rnode->rn_child[slot]; + KASSERT(tmp != NULL && vm_radix_node_page(tmp) == NULL, + ("%s: unexpected lookup interruption", __func__)); + if (tmp->rn_clev > clev) + break; + } + KASSERT(rnode != NULL && tmp != NULL && slot < VM_RADIX_COUNT, + ("%s: invalid scan parameters rnode: %p, tmp: %p, slot: %d", + __func__, (void *)rnode, (void *)tmp, slot)); + + /* + * A new node is needed because the right insertion level is reached. + * Setup the new intermediate node and add the 2 children: the + * new object and the older edge. + */ + tmp2 = vm_radix_node_get(vm_radix_trimkey(index, clev - 1), 2, + clev); + rnode->rn_child[slot] = tmp2; + vm_radix_addpage(tmp2, index, clev, page); + slot = vm_radix_slot(newind, clev); + tmp2->rn_child[slot] = tmp; +} + +/* + * Returns the value stored at the index. If the index is not present, + * NULL is returned. 
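A worked example of the collision path above, assuming the LP64 parameters (VM_RADIX_WIDTH 4): inserting pindex 0x1234 into a trie that already holds 0x1239 as a leaf makes vm_radix_keydiff() report level 15, since only the lowest nibble differs; the leaf's slot in its parent is then replaced by a node with rn_clev 15, rn_owner 0x1230 and rn_count 2, whose slots 4 and 9 hold the two pages, and the fourteen levels between the parent and the new node are never materialized.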
+ */ +vm_page_t +vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index) +{ + struct vm_radix_node *rnode; + vm_page_t m; + int slot; + + rnode = vm_radix_getroot(rtree); + while (rnode != NULL) { + if (vm_radix_keybarr(rnode, index)) + return (NULL); + slot = vm_radix_slot(index, rnode->rn_clev); + rnode = rnode->rn_child[slot]; + m = vm_radix_node_page(rnode); + if (m != NULL) { + if (m->pindex == index) + return (m); + else + return (NULL); + } + } + return (NULL); +} + +/* + * Look up the nearest entry at a position bigger than or equal to index. + */ +vm_page_t +vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index) +{ + vm_pindex_t inc; + vm_page_t m; + struct vm_radix_node *rnode; + int slot; + uint16_t difflev; + boolean_t maplevels[VM_RADIX_LIMIT + 1]; +#ifdef INVARIANTS + int loops = 0; +#endif + +restart: + KASSERT(++loops < 1000, ("%s: too many loops", __func__)); + for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++) + maplevels[difflev] = FALSE; + rnode = vm_radix_getroot(rtree); + while (rnode != NULL) { + maplevels[rnode->rn_clev] = TRUE; + + /* + * If the keys differ before the current bisection node + * the search key might rollback to the earlierst + * available bisection node, or to the smaller value + * in the current domain (if the owner is bigger than the + * search key). + * The maplevels array records any node has been seen + * at a given level. This aids the search for a valid + * bisection node. + */ + if (vm_radix_keybarr(rnode, index)) { + difflev = vm_radix_keydiff(index, rnode->rn_owner); + if (index > rnode->rn_owner) { + if (vm_radix_addlev(&index, maplevels, + difflev) > 0) + break; + } else + index = vm_radix_trimkey(rnode->rn_owner, + difflev); + goto restart; + } + slot = vm_radix_slot(index, rnode->rn_clev); + m = vm_radix_node_page(rnode->rn_child[slot]); + if (m != NULL && m->pindex >= index) + return (m); + if (rnode->rn_child[slot] != NULL && m == NULL) { + rnode = rnode->rn_child[slot]; + continue; + } + + /* + * Look for an available edge or page within the current + * bisection node. + */ + if (slot < (VM_RADIX_COUNT - 1)) { + inc = VM_RADIX_UNITLEVEL(rnode->rn_clev); + index = vm_radix_trimkey(index, rnode->rn_clev); + index += inc; + slot++; + for (;; index += inc, slot++) { + m = vm_radix_node_page(rnode->rn_child[slot]); + if (m != NULL && m->pindex >= index) + return (m); + if ((rnode->rn_child[slot] != NULL && + m == NULL) || slot == (VM_RADIX_COUNT - 1)) + break; + } + } + + /* + * If a valid page or edge bigger than the search slot is + * found in the traversal, skip to the next higher-level key. + */ + if (slot == (VM_RADIX_COUNT - 1) && + (rnode->rn_child[slot] == NULL || m != NULL)) { + if (rnode->rn_clev == 0 || vm_radix_addlev(&index, + maplevels, rnode->rn_clev - 1) > 0) + break; + goto restart; + } + rnode = rnode->rn_child[slot]; + } + return (NULL); +} + +/* + * Look up the nearest entry at a position less than or equal to index. 
+ */ +vm_page_t +vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index) +{ + vm_pindex_t inc; + vm_page_t m; + struct vm_radix_node *rnode; + int slot; + uint16_t difflev; + boolean_t maplevels[VM_RADIX_LIMIT + 1]; +#ifdef INVARIANTS + int loops = 0; +#endif + +restart: + KASSERT(++loops < 1000, ("%s: too many loops", __func__)); + for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++) + maplevels[difflev] = FALSE; + rnode = vm_radix_getroot(rtree); + while (rnode != NULL) { + maplevels[rnode->rn_clev] = TRUE; + + /* + * If the keys differ before the current bisection node + * the search key might rollback to the earlierst + * available bisection node, or to the higher value + * in the current domain (if the owner is smaller than the + * search key). + * The maplevels array records any node has been seen + * at a given level. This aids the search for a valid + * bisection node. + */ + if (vm_radix_keybarr(rnode, index)) { + difflev = vm_radix_keydiff(index, rnode->rn_owner); + if (index > rnode->rn_owner) { + index = vm_radix_trimkey(rnode->rn_owner, + difflev); + index |= VM_RADIX_UNITLEVEL(difflev) - 1; + } else if (vm_radix_declev(&index, maplevels, + difflev) > 0) + break; + goto restart; + } + slot = vm_radix_slot(index, rnode->rn_clev); + m = vm_radix_node_page(rnode->rn_child[slot]); + if (m != NULL && m->pindex <= index) + return (m); + if (rnode->rn_child[slot] != NULL && m == NULL) { + rnode = rnode->rn_child[slot]; + continue; + } + + /* + * Look for an available edge or page within the current + * bisection node. + */ + if (slot > 0) { + inc = VM_RADIX_UNITLEVEL(rnode->rn_clev); + index = vm_radix_trimkey(index, rnode->rn_clev); + index |= inc - 1; + index -= inc; + slot--; + for (;; index -= inc, slot--) { + m = vm_radix_node_page(rnode->rn_child[slot]); + if (m != NULL && m->pindex <= index) + return (m); + if ((rnode->rn_child[slot] != NULL && + m == NULL) || slot == 0) + break; + } + } + + /* + * If a valid page or edge smaller than the search slot is + * found in the traversal, skip to the next higher-level key. + */ + if (slot == 0 && (rnode->rn_child[slot] == NULL || m != NULL)) { + if (rnode->rn_clev == 0 || vm_radix_declev(&index, + maplevels, rnode->rn_clev - 1) > 0) + break; + goto restart; + } + rnode = rnode->rn_child[slot]; + } + return (NULL); +} + +/* + * Remove the specified index from the tree. + * Panics if the key is not present. 
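The contract of the two lookups above is what lets vm_reserv_alloc_contig()/vm_reserv_alloc_page() earlier in the patch, and vm_page_insert() further down, find a pindex's neighbors directly instead of splaying and walking listq. A toy model of that contract over a small sorted key set (the trie returns the same answers without a scan); keys and names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

static const uint64_t keys[] = { 3, 8, 21, 64 };
#define NKEYS   (sizeof(keys) / sizeof(keys[0]))

/* Nearest key greater than or equal to index, as vm_radix_lookup_ge(). */
static const uint64_t *
lookup_ge(uint64_t index)
{
    size_t i;

    for (i = 0; i < NKEYS; i++)
        if (keys[i] >= index)
            return (&keys[i]);
    return (NULL);
}

/* Nearest key less than or equal to index, as vm_radix_lookup_le(). */
static const uint64_t *
lookup_le(uint64_t index)
{
    size_t i;

    for (i = NKEYS; i > 0; i--)
        if (keys[i - 1] <= index)
            return (&keys[i - 1]);
    return (NULL);
}

int
main(void)
{
    const uint64_t *pred, *succ;
    uint64_t pindex = 10;

    pred = lookup_le(pindex);           /* nearest key <= 10 -> 8 */
    succ = lookup_ge(pindex);           /* nearest key >= 10 -> 21 */
    printf("pindex %ju: le %ju, ge %ju\n", (uintmax_t)pindex,
        pred != NULL ? (uintmax_t)*pred : (uintmax_t)0,
        succ != NULL ? (uintmax_t)*succ : (uintmax_t)0);
    return (0);
}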
+ */ +void +vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index) +{ + struct vm_radix_node *rnode, *parent; + vm_page_t m; + int i, slot; + + parent = NULL; + rnode = vm_radix_getroot(rtree); + for (;;) { + if (rnode == NULL) + panic("vm_radix_remove: impossible to locate the key"); + slot = vm_radix_slot(index, rnode->rn_clev); + m = vm_radix_node_page(rnode->rn_child[slot]); + if (m != NULL && m->pindex == index) { + rnode->rn_child[slot] = NULL; + rnode->rn_count--; + if (rnode->rn_count > 1) + break; + if (parent == NULL) { + if (rnode->rn_count == 0) { + vm_radix_node_put(rnode); + vm_radix_setroot(rtree, NULL); + } + break; + } + for (i = 0; i < VM_RADIX_COUNT; i++) + if (rnode->rn_child[i] != NULL) + break; + KASSERT(i != VM_RADIX_COUNT, + ("%s: invalid node configuration", __func__)); + slot = vm_radix_slot(index, parent->rn_clev); + KASSERT(parent->rn_child[slot] == rnode, + ("%s: invalid child value", __func__)); + parent->rn_child[slot] = rnode->rn_child[i]; + rnode->rn_count--; + rnode->rn_child[i] = NULL; + vm_radix_node_put(rnode); + break; + } + if (m != NULL && m->pindex != index) + panic("%s: invalid key found", __func__); + parent = rnode; + rnode = rnode->rn_child[slot]; + } +} + +/* + * Remove and free all the nodes from the radix tree. + * This function is recursive but there is a tight control on it as the + * maximum depth of the tree is fixed. + */ +void +vm_radix_reclaim_allnodes(struct vm_radix *rtree) +{ + struct vm_radix_node *root; + + root = vm_radix_getroot(rtree); + if (root == NULL) + return; + vm_radix_reclaim_allnodes_int(root); + vm_radix_setroot(rtree, NULL); +} + +#ifdef DDB +/* + * Show details about the given radix node. + */ +DB_SHOW_COMMAND(radixnode, db_show_radixnode) +{ + struct vm_radix_node *rnode; + int i; + + if (!have_addr) + return; + rnode = (struct vm_radix_node *)addr; + db_printf("radixnode %p, owner %jx, children count %u, level %u:\n", + (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count, + rnode->rn_clev); + for (i = 0; i < VM_RADIX_COUNT; i++) + if (rnode->rn_child[i] != NULL) + db_printf("slot: %d, val: %p, page: %p, clev: %d\n", + i, (void *)rnode->rn_child[i], + (void *)vm_radix_node_page(rnode->rn_child[i]), + rnode->rn_clev); +} +#endif /* DDB */ Index: vm/vm_object.h =================================================================== --- vm/vm_object.h (.../head/sys) (revision 248227) +++ vm/vm_object.h (.../user/attilio/vmcontention/sys) (revision 248227) @@ -72,6 +72,8 @@ #include #include +#include + /* * Types defined: * @@ -79,10 +81,10 @@ * * The root of cached pages pool is protected by both the per-object lock * and the free pages queue mutex. - * On insert in the cache splay tree, the per-object lock is expected + * On insert in the cache radix trie, the per-object lock is expected * to be already held and the free pages queue mutex will be * acquired during the operation too. - * On remove and lookup from the cache splay tree, only the free + * On remove and lookup from the cache radix trie, only the free * pages queue mutex is expected to be locked. 
* These rules allow for reliably checking for the presence of cached * pages with only the per-object lock held, thereby reducing contention @@ -101,7 +103,7 @@ struct vm_object { LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(, vm_page) memq; /* list of resident pages */ - vm_page_t root; /* root of the resident page splay tree */ + struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ int ref_count; /* How many refs?? */ @@ -116,7 +118,7 @@ struct vm_object { vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ - vm_page_t cache; /* (o + f) root of the cache page splay tree */ + struct vm_radix cache; /* (o + f) root of the cache page radix trie */ void *handle; union { /* @@ -246,7 +248,7 @@ static __inline boolean_t vm_object_cache_is_empty(vm_object_t object) { - return (object->cache == NULL); + return (vm_radix_is_empty(&object->cache)); } vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); Index: vm/_vm_radix.h =================================================================== --- vm/_vm_radix.h (.../head/sys) (revision 0) +++ vm/_vm_radix.h (.../user/attilio/vmcontention/sys) (revision 248227) @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2013 EMC Corp. + * Copyright (c) 2011 Jeffrey Roberson + * Copyright (c) 2008 Mayur Shardul + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef __VM_RADIX_H_ +#define __VM_RADIX_H_ + +/* + * Radix tree root. + */ +struct vm_radix { + uintptr_t rt_root; +}; + +#ifdef _KERNEL + +static __inline boolean_t +vm_radix_is_empty(struct vm_radix *rtree) +{ + + return (rtree->rt_root == 0); +} + +#endif /* _KERNEL */ +#endif /* !__VM_RADIX_H_ */ Index: vm/vm_radix.h =================================================================== --- vm/vm_radix.h (.../head/sys) (revision 0) +++ vm/vm_radix.h (.../user/attilio/vmcontention/sys) (revision 248227) @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 EMC Corp. 
+ * Copyright (c) 2011 Jeffrey Roberson + * Copyright (c) 2008 Mayur Shardul + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef _VM_RADIX_H_ +#define _VM_RADIX_H_ + +#include + +#ifdef _KERNEL + +void vm_radix_init(void); +void vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, + vm_page_t page); +vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index); +vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index); +vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index); +void vm_radix_reclaim_allnodes(struct vm_radix *rtree); +void vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index); + +#endif /* _KERNEL */ +#endif /* !_VM_RADIX_H_ */ Index: vm/vm_page.c =================================================================== --- vm/vm_page.c (.../head/sys) (revision 248227) +++ vm/vm_page.c (.../user/attilio/vmcontention/sys) (revision 248227) @@ -110,6 +110,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -794,63 +795,6 @@ vm_page_dirty_KBI(vm_page_t m) } /* - * vm_page_splay: - * - * Implements Sleator and Tarjan's top-down splay algorithm. Returns - * the vm_page containing the given pindex. If, however, that - * pindex is not found in the vm_object, returns a vm_page that is - * adjacent to the pindex, coming before or after it. - */ -vm_page_t -vm_page_splay(vm_pindex_t pindex, vm_page_t root) -{ - struct vm_page dummy; - vm_page_t lefttreemax, righttreemin, y; - - if (root == NULL) - return (root); - lefttreemax = righttreemin = &dummy; - for (;; root = y) { - if (pindex < root->pindex) { - if ((y = root->left) == NULL) - break; - if (pindex < y->pindex) { - /* Rotate right. */ - root->left = y->right; - y->right = root; - root = y; - if ((y = root->left) == NULL) - break; - } - /* Link into the new root's right tree. */ - righttreemin->left = root; - righttreemin = root; - } else if (pindex > root->pindex) { - if ((y = root->right) == NULL) - break; - if (pindex > y->pindex) { - /* Rotate left. */ - root->right = y->left; - y->left = root; - root = y; - if ((y = root->right) == NULL) - break; - } - /* Link into the new root's left tree. 
*/ - lefttreemax->right = root; - lefttreemax = root; - } else - break; - } - /* Assemble the new root. */ - lefttreemax->right = root->left; - righttreemin->left = root->right; - root->left = dummy.right; - root->right = dummy.left; - return (root); -} - -/* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. @@ -865,7 +809,7 @@ vm_page_dirty_KBI(vm_page_t m) void vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { - vm_page_t root; + vm_page_t neighbor; VM_OBJECT_ASSERT_WLOCKED(object); if (m->object != NULL) @@ -880,28 +824,19 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm /* * Now link into the object's ordered list of backed pages. */ - root = object->root; - if (root == NULL) { - m->left = NULL; - m->right = NULL; + if (object->resident_page_count == 0) { TAILQ_INSERT_TAIL(&object->memq, m, listq); } else { - root = vm_page_splay(pindex, root); - if (pindex < root->pindex) { - m->left = root->left; - m->right = root; - root->left = NULL; - TAILQ_INSERT_BEFORE(root, m, listq); - } else if (pindex == root->pindex) - panic("vm_page_insert: offset already allocated"); - else { - m->right = root->right; - m->left = root; - root->right = NULL; - TAILQ_INSERT_AFTER(&object->memq, root, m, listq); - } + neighbor = vm_radix_lookup_ge(&object->rtree, pindex); + if (neighbor != NULL) { + KASSERT(pindex < neighbor->pindex, + ("vm_page_insert: offset %ju not minor than %ju", + (uintmax_t)pindex, (uintmax_t)neighbor->pindex)); + TAILQ_INSERT_BEFORE(neighbor, m, listq); + } else + TAILQ_INSERT_TAIL(&object->memq, m, listq); } - object->root = m; + vm_radix_insert(&object->rtree, pindex, m); /* * Show that the object has one more resident page. @@ -937,7 +872,6 @@ void vm_page_remove(vm_page_t m) { vm_object_t object; - vm_page_t next, prev, root; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_lock_assert(m, MA_OWNED); @@ -952,42 +886,7 @@ vm_page_remove(vm_page_t m) /* * Now remove from the object's list of backed pages. */ - if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { - /* - * Since the page's successor in the list is also its parent - * in the tree, its right subtree must be empty. - */ - next->left = m->left; - KASSERT(m->right == NULL, - ("vm_page_remove: page %p has right child", m)); - } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && - prev->right == m) { - /* - * Since the page's predecessor in the list is also its parent - * in the tree, its left subtree must be empty. - */ - KASSERT(m->left == NULL, - ("vm_page_remove: page %p has left child", m)); - prev->right = m->right; - } else { - if (m != object->root) - vm_page_splay(m->pindex, object->root); - if (m->left == NULL) - root = m->right; - else if (m->right == NULL) - root = m->left; - else { - /* - * Move the page's successor to the root, because - * pages are usually removed in ascending order. 
- */ - if (m->right != next) - vm_page_splay(m->pindex, m->right); - next->left = m->left; - root = next; - } - object->root = root; - } + vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); /* @@ -1015,15 +914,9 @@ vm_page_remove(vm_page_t m) vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { - vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(object); - if ((m = object->root) != NULL && m->pindex != pindex) { - m = vm_page_splay(pindex, m); - if ((object->root = m)->pindex != pindex) - m = NULL; - } - return (m); + return (vm_radix_lookup(&object->rtree, pindex)); } /* @@ -1040,13 +933,8 @@ vm_page_find_least(vm_object_t object, vm_pindex_t vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(object); - if ((m = TAILQ_FIRST(&object->memq)) != NULL) { - if (m->pindex < pindex) { - m = vm_page_splay(pindex, object->root); - if ((object->root = m)->pindex < pindex) - m = TAILQ_NEXT(m, listq); - } - } + if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) + m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } @@ -1126,45 +1014,18 @@ vm_page_rename(vm_page_t m, vm_object_t new_object void vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { - vm_page_t m, m_next; + vm_page_t m; boolean_t empty; mtx_lock(&vm_page_queue_free_mtx); - if (__predict_false(vm_object_cache_is_empty(object))) { + if (__predict_false(vm_radix_is_empty(&object->cache))) { mtx_unlock(&vm_page_queue_free_mtx); return; } - m = object->cache = vm_page_splay(start, object->cache); - if (m->pindex < start) { - if (m->right == NULL) - m = NULL; - else { - m_next = vm_page_splay(start, m->right); - m_next->left = m; - m->right = NULL; - m = object->cache = m_next; - } - } - - /* - * At this point, "m" is either (1) a reference to the page - * with the least pindex that is greater than or equal to - * "start" or (2) NULL. - */ - for (; m != NULL && (m->pindex < end || end == 0); m = m_next) { - /* - * Find "m"'s successor and remove "m" from the - * object's cache. - */ - if (m->right == NULL) { - object->cache = m->left; - m_next = NULL; - } else { - m_next = vm_page_splay(start, m->right); - m_next->left = m->left; - object->cache = m_next; - } - /* Convert "m" to a free page. */ + while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) { + if (end != 0 && m->pindex >= end) + break; + vm_radix_remove(&object->cache, m->pindex); m->object = NULL; m->valid = 0; /* Clear PG_CACHED and set PG_FREE. 
*/ @@ -1174,7 +1035,7 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t cnt.v_cache_count--; cnt.v_free_count++; } - empty = vm_object_cache_is_empty(object); + empty = vm_radix_is_empty(&object->cache); mtx_unlock(&vm_page_queue_free_mtx); if (object->type == OBJT_VNODE && empty) vdrop(object->handle); @@ -1189,15 +1050,9 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t static inline vm_page_t vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) { - vm_page_t m; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - if ((m = object->cache) != NULL && m->pindex != pindex) { - m = vm_page_splay(pindex, m); - if ((object->cache = m)->pindex != pindex) - m = NULL; - } - return (m); + return (vm_radix_lookup(&object->cache, pindex)); } /* @@ -1209,28 +1064,11 @@ vm_page_cache_lookup(vm_object_t object, vm_pindex static void vm_page_cache_remove(vm_page_t m) { - vm_object_t object; - vm_page_t root; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT((m->flags & PG_CACHED) != 0, ("vm_page_cache_remove: page %p is not cached", m)); - object = m->object; - if (m != object->cache) { - root = vm_page_splay(m->pindex, object->cache); - KASSERT(root == m, - ("vm_page_cache_remove: page %p is not cached in object %p", - m, object)); - } - if (m->left == NULL) - root = m->right; - else if (m->right == NULL) - root = m->left; - else { - root = vm_page_splay(m->pindex, m->left); - root->right = m->right; - } - object->cache = root; + vm_radix_remove(&m->object->cache, m->pindex); m->object = NULL; cnt.v_cache_count--; } @@ -1250,7 +1088,7 @@ void vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, vm_object_t new_object) { - vm_page_t m, m_next; + vm_page_t m; /* * Insertion into an object's collection of cached pages @@ -1258,53 +1096,24 @@ vm_page_cache_transfer(vm_object_t orig_object, vm * not. */ VM_OBJECT_ASSERT_WLOCKED(new_object); - KASSERT(vm_object_cache_is_empty(new_object), + KASSERT(vm_radix_is_empty(&new_object->cache), ("vm_page_cache_transfer: object %p has cached pages", new_object)); mtx_lock(&vm_page_queue_free_mtx); - if ((m = orig_object->cache) != NULL) { + while ((m = vm_radix_lookup_ge(&orig_object->cache, + offidxstart)) != NULL) { /* * Transfer all of the pages with offset greater than or * equal to 'offidxstart' from the original object's * cache to the new object's cache. */ - m = vm_page_splay(offidxstart, m); - if (m->pindex < offidxstart) { - orig_object->cache = m; - new_object->cache = m->right; - m->right = NULL; - } else { - orig_object->cache = m->left; - new_object->cache = m; - m->left = NULL; - } - while ((m = new_object->cache) != NULL) { - if ((m->pindex - offidxstart) >= new_object->size) { - /* - * Return all of the cached pages with - * offset greater than or equal to the - * new object's size to the original - * object's cache. - */ - new_object->cache = m->left; - m->left = orig_object->cache; - orig_object->cache = m; - break; - } - m_next = vm_page_splay(m->pindex, m->right); - /* Update the page's object and offset. */ - m->object = new_object; - m->pindex -= offidxstart; - if (m_next == NULL) - break; - m->right = NULL; - m_next->left = m; - new_object->cache = m_next; - } - KASSERT(vm_object_cache_is_empty(new_object) || - new_object->type == OBJT_SWAP, - ("vm_page_cache_transfer: object %p's type is incompatible" - " with cached pages", new_object)); + if ((m->pindex - offidxstart) >= new_object->size) + break; + vm_radix_remove(&orig_object->cache, m->pindex); + /* Update the page's object and offset. 
*/ + m->object = new_object; + m->pindex -= offidxstart; + vm_radix_insert(&new_object->cache, m->pindex, m); } mtx_unlock(&vm_page_queue_free_mtx); } @@ -2324,7 +2133,7 @@ void vm_page_cache(vm_page_t m) { vm_object_t object; - vm_page_t next, prev, root; + boolean_t cache_was_empty; vm_page_lock_assert(m, MA_OWNED); object = m->object; @@ -2359,42 +2168,7 @@ vm_page_cache(vm_page_t m) * Remove the page from the object's collection of resident * pages. */ - if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { - /* - * Since the page's successor in the list is also its parent - * in the tree, its right subtree must be empty. - */ - next->left = m->left; - KASSERT(m->right == NULL, - ("vm_page_cache: page %p has right child", m)); - } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && - prev->right == m) { - /* - * Since the page's predecessor in the list is also its parent - * in the tree, its left subtree must be empty. - */ - KASSERT(m->left == NULL, - ("vm_page_cache: page %p has left child", m)); - prev->right = m->right; - } else { - if (m != object->root) - vm_page_splay(m->pindex, object->root); - if (m->left == NULL) - root = m->right; - else if (m->right == NULL) - root = m->left; - else { - /* - * Move the page's successor to the root, because - * pages are usually removed in ascending order. - */ - if (m->right != next) - vm_page_splay(m->pindex, m->right); - next->left = m->left; - root = next; - } - object->root = root; - } + vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); object->resident_page_count--; @@ -2412,25 +2186,8 @@ vm_page_cache(vm_page_t m) mtx_lock(&vm_page_queue_free_mtx); m->flags |= PG_CACHED; cnt.v_cache_count++; - root = object->cache; - if (root == NULL) { - m->left = NULL; - m->right = NULL; - } else { - root = vm_page_splay(m->pindex, root); - if (m->pindex < root->pindex) { - m->left = root->left; - m->right = root; - root->left = NULL; - } else if (__predict_false(m->pindex == root->pindex)) - panic("vm_page_cache: offset already cached"); - else { - m->right = root->right; - m->left = root; - root->right = NULL; - } - } - object->cache = m; + cache_was_empty = vm_radix_is_empty(&object->cache); + vm_radix_insert(&object->cache, m->pindex, m); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) { #else @@ -2448,9 +2205,9 @@ vm_page_cache(vm_page_t m) * the object's only resident page. 
*/ if (object->type == OBJT_VNODE) { - if (root == NULL && object->resident_page_count != 0) + if (cache_was_empty && object->resident_page_count != 0) vhold(object->handle); - else if (root != NULL && object->resident_page_count == 0) + else if (!cache_was_empty && object->resident_page_count == 0) vdrop(object->handle); } } Index: vm/vm_page.h =================================================================== --- vm/vm_page.h (.../head/sys) (revision 248227) +++ vm/vm_page.h (.../user/attilio/vmcontention/sys) (revision 248227) @@ -128,8 +128,6 @@ typedef uint64_t vm_page_bits_t; struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* page queue or free list (Q) */ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ - struct vm_page *left; /* splay tree link (O) */ - struct vm_page *right; /* splay tree link (O) */ vm_object_t object; /* which object am I in (O,P)*/ vm_pindex_t pindex; /* offset into object (O,P) */ @@ -404,7 +402,6 @@ void vm_page_requeue(vm_page_t m); void vm_page_requeue_locked(vm_page_t m); void vm_page_set_valid_range(vm_page_t m, int base, int size); void vm_page_sleep(vm_page_t m, const char *msg); -vm_page_t vm_page_splay(vm_pindex_t, vm_page_t); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_unhold_pages(vm_page_t *ma, int count); void vm_page_unwire (vm_page_t, int); Index: i386/i386/pmap.c =================================================================== --- i386/i386/pmap.c (.../head/sys) (revision 248227) +++ i386/i386/pmap.c (.../user/attilio/vmcontention/sys) (revision 248227) @@ -133,6 +133,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -1573,7 +1574,8 @@ pmap_free_zero_pages(vm_page_t free) while (free != NULL) { m = free; - free = m->right; + free = (void *)m->object; + m->object = NULL; /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } @@ -1592,7 +1594,7 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; - m->right = *free; + m->object = (void *)*free; *free = m; } @@ -1602,31 +1604,12 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ -static void +static __inline void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { - vm_page_t root; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - root = pmap->pm_root; - if (root == NULL) { - mpte->left = NULL; - mpte->right = NULL; - } else { - root = vm_page_splay(mpte->pindex, root); - if (mpte->pindex < root->pindex) { - mpte->left = root->left; - mpte->right = root; - root->left = NULL; - } else if (mpte->pindex == root->pindex) - panic("pmap_insert_pt_page: pindex already inserted"); - else { - mpte->right = root->right; - mpte->left = root; - root->right = NULL; - } - } - pmap->pm_root = mpte; + vm_radix_insert(&pmap->pm_root, mpte->pindex, mpte); } /* @@ -1634,19 +1617,12 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) * specified pmap's collection of idle page table pages. Returns NULL if there * is no page table page corresponding to the specified virtual address. 
*/ -static vm_page_t +static __inline vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) { - vm_page_t mpte; - vm_pindex_t pindex = va >> PDRSHIFT; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { - mpte = vm_page_splay(pindex, mpte); - if ((pmap->pm_root = mpte)->pindex != pindex) - mpte = NULL; - } - return (mpte); + return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); } /* @@ -1654,21 +1630,12 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) * of idle page table pages. The specified page table page must be a member of * the pmap's collection. */ -static void +static __inline void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) { - vm_page_t root; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if (mpte != pmap->pm_root) - vm_page_splay(mpte->pindex, pmap->pm_root); - if (mpte->left == NULL) - root = mpte->right; - else { - root = vm_page_splay(mpte->pindex, mpte->left); - root->right = mpte->right; - } - pmap->pm_root = root; + vm_radix_remove(&pmap->pm_root, mpte->pindex); } /* @@ -1755,7 +1722,7 @@ pmap_pinit0(pmap_t pmap) #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif - pmap->pm_root = NULL; + pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); @@ -1794,9 +1761,9 @@ pmap_pinit(pmap_t pmap) KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif - pmap->pm_root = NULL; + pmap->pm_root.rt_root = 0; } - KASSERT(pmap->pm_root == NULL, + KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* @@ -2060,7 +2027,7 @@ pmap_release(pmap_t pmap) KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); - KASSERT(pmap->pm_root == NULL, + KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); pmap_lazyfix(pmap); @@ -2343,7 +2310,7 @@ out: } if (m_pc == NULL && pv_vafree != 0 && free != NULL) { m_pc = free; - free = m_pc->right; + free = (void *)m_pc->object; /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&cnt.v_wire_count, 1); Index: i386/xen/pmap.c =================================================================== --- i386/xen/pmap.c (.../head/sys) (revision 248227) +++ i386/xen/pmap.c (.../user/attilio/vmcontention/sys) (revision 248227) @@ -1335,7 +1335,8 @@ pmap_free_zero_pages(vm_page_t free) while (free != NULL) { m = free; - free = m->right; + free = (void *)m->object; + m->object = NULL; vm_page_free_zero(m); } } @@ -1393,7 +1394,7 @@ _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ - m->right = *free; + m->object = (void *)*free; *free = m; } @@ -2090,7 +2091,7 @@ out: } if (m_pc == NULL && pv_vafree != 0 && free != NULL) { m_pc = free; - free = m_pc->right; + free = (void *)m_pc->object; /* Recycle a freed page table page. 
*/ m_pc->wire_count = 1; atomic_add_int(&cnt.v_wire_count, 1); } Index: i386/include/pmap.h =================================================================== --- i386/include/pmap.h (.../head/sys) (revision 248227) +++ i386/include/pmap.h (.../user/attilio/vmcontention/sys) (revision 248227) @@ -159,6 +159,8 @@ #include #include +#include + #ifdef PAE typedef uint64_t pdpt_entry_t; @@ -441,7 +443,7 @@ struct pmap { pdpt_entry_t *pm_pdpt; /* KVA of page directory pointer table */ #endif - vm_page_t pm_root; /* spare page table pages */ + struct vm_radix pm_root; /* spare page table pages */ }; typedef struct pmap *pmap_t;