Property changes on: share/man/man9 ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/share/man/man9:r178928 Index: share/man/man9/vm_map_find.9 =================================================================== --- share/man/man9/vm_map_find.9 (revision 183520) +++ share/man/man9/vm_map_find.9 (working copy) @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd July 19, 2003 +.Dd May 10, 2008 .Dt VM_MAP_FIND 9 .Os .Sh NAME @@ -38,7 +38,7 @@ .Ft int .Fo vm_map_find .Fa "vm_map_t map" "vm_object_t object" "vm_ooffset_t offset" -.Fa "vm_offset_t *addr" "vm_size_t length" "boolean_t find_space" +.Fa "vm_offset_t *addr" "vm_size_t length" "int find_space" .Fa "vm_prot_t prot" "vm_prot_t max" "int cow" .Fc .Sh DESCRIPTION @@ -70,11 +70,25 @@ .Pp If .Fa find_space -is -.Dv TRUE , +is either +.Dv VMFS_ALIGNED_SPACE +or +.Dv VMFS_ANY_SPACE , the function will call .Xr vm_map_findspace 9 to discover a free region. +Moreover, if +.Fa find_space +is +.Dv VMFS_ALIGNED_SPACE , +the address of the free region will be optimized for the use of superpages. +Otherwise, if +.Fa find_space +is +.Dv VMFS_NO_SPACE , +.Xr vm_map_insert 9 +is called with the given address, +.Fa addr . .Sh IMPLEMENTATION NOTES This function acquires a lock on .Fa map @@ -90,9 +104,14 @@ .Fn vm_map_find function returns .Dv KERN_SUCCESS -if space for the mapping could be found and -the mapping was successfully created. -If space could not be found in the map, +if the mapping was successfully created. +If space could not be found or +.Fa find_space +was +.Dv VMFS_NO_SPACE +and the given address, +.Fa addr , +was already mapped, .Dv KERN_NO_SPACE will be returned. If the discovered range turned out to be bogus, Property changes on: lib/libc ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/lib/libc:r177680 Index: lib/libc/sys/mincore.2 =================================================================== --- lib/libc/sys/mincore.2 (revision 183520) +++ lib/libc/sys/mincore.2 (working copy) @@ -72,6 +72,8 @@ Page has been referenced. .It Dv MINCORE_MODIFIED_OTHER Page has been modified. +.It Dv MINCORE_SUPER +Page is part of a "super" page. (only i386 & amd64) .El .Pp The information returned by Property changes on: sys ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/sys:r172779,173846,174249,174543,174799,174821,174825,174938-174940,174982,175041,175056,175067,175119,175155,175325,175404,176803,176967,177261,177342,177414,177529,177534,177624,177659,177680,177684,177702,177851,177917,177932,177956,178070,178493,178630,178637,178875,178928,178933,178935,178947,179019,179074,179076,179471,179749,179777,180170,180498,183169,183207 Index: sys/arm/arm/pmap.c =================================================================== --- sys/arm/arm/pmap.c (revision 183520) +++ sys/arm/arm/pmap.c (working copy) @@ -3325,8 +3325,8 @@ */ void -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) +pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, + vm_prot_t prot, boolean_t wired) { vm_page_lock_queues(); Index: sys/arm/include/vmparam.h =================================================================== --- sys/arm/include/vmparam.h (revision 183520) +++ sys/arm/include/vmparam.h (working copy) @@ -85,6 +85,13 @@ */ #define VM_NFREEORDER 9 +/* + * Disable superpage reservations. 
+ */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 0 +#endif + #define UPT_MAX_ADDRESS VADDR(UPTPTDI + 3, 0) #define UPT_MIN_ADDRESS VADDR(UPTPTDI, 0) Index: sys/powerpc/powerpc/pmap_dispatch.c =================================================================== --- sys/powerpc/powerpc/pmap_dispatch.c (revision 183520) +++ sys/powerpc/powerpc/pmap_dispatch.c (working copy) @@ -107,8 +107,8 @@ } void -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t p, vm_prot_t prot, - boolean_t wired) +pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t p, + vm_prot_t prot, boolean_t wired) { MMU_ENTER(mmu_obj, pmap, va, p, prot, wired); } Index: sys/powerpc/include/vmparam.h =================================================================== --- sys/powerpc/include/vmparam.h (revision 183520) +++ sys/powerpc/include/vmparam.h (working copy) @@ -131,6 +131,13 @@ */ #define VM_NFREEORDER 11 +/* + * Disable superpage reservations. + */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 0 +#endif + #ifndef VM_INITIAL_PAGEIN #define VM_INITIAL_PAGEIN 16 #endif Index: sys/sparc64/sparc64/pmap.c =================================================================== --- sys/sparc64/sparc64/pmap.c (revision 183520) +++ sys/sparc64/sparc64/pmap.c (working copy) @@ -1280,8 +1280,8 @@ * will be wired down. */ void -pmap_enter(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) +pmap_enter(pmap_t pm, vm_offset_t va, vm_prot_t access, vm_page_t m, + vm_prot_t prot, boolean_t wired) { vm_page_lock_queues(); Index: sys/sparc64/include/vmparam.h =================================================================== --- sys/sparc64/include/vmparam.h (revision 183520) +++ sys/sparc64/include/vmparam.h (working copy) @@ -122,6 +122,13 @@ #define VM_NFREEORDER 12 /* + * Disable superpage reservations. + */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 0 +#endif + +/* * Address space layout. * * UltraSPARC I and II implement a 44 bit virtual address space. 
The address Index: sys/conf/files =================================================================== --- sys/conf/files (revision 183520) +++ sys/conf/files (working copy) @@ -2052,9 +2052,9 @@ vm/vm_object.c standard vm/vm_page.c standard vm/vm_pageout.c standard -vm/vm_pageq.c standard vm/vm_pager.c standard vm/vm_phys.c standard +vm/vm_reserv.c standard vm/vm_unix.c standard vm/vm_zeroidle.c standard vm/vnode_pager.c standard Index: sys/conf/options =================================================================== --- sys/conf/options (revision 183520) +++ sys/conf/options (working copy) @@ -555,6 +555,8 @@ VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h +VM_NRESERVLEVEL opt_vm.h +VM_LEVEL_0_ORDER opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h MALLOC_PROFILE opt_vm.h Index: sys/kern/kern_malloc.c =================================================================== --- sys/kern/kern_malloc.c (revision 183520) +++ sys/kern/kern_malloc.c (working copy) @@ -623,7 +623,7 @@ init_param3(vm_kmem_size / PAGE_SIZE); kmem_map = kmem_suballoc(kernel_map, &kmembase, &kmemlimit, - vm_kmem_size); + vm_kmem_size, TRUE); kmem_map->system_map = 1; #ifdef DEBUG_MEMGUARD Index: sys/kern/kern_exec.c =================================================================== --- sys/kern/kern_exec.c (revision 183520) +++ sys/kern/kern_exec.c (working copy) @@ -31,6 +31,7 @@ #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_mac.h" +#include "opt_vm.h" #include #include @@ -855,6 +856,12 @@ if (object == NULL) return (EACCES); VM_OBJECT_LOCK(object); +#if VM_NRESERVLEVEL > 0 + if ((object->flags & OBJ_COLORED) == 0) { + object->flags |= OBJ_COLORED; + object->pg_color = 0; + } +#endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; Index: sys/ia64/ia64/pmap.c =================================================================== --- sys/ia64/ia64/pmap.c (revision 183520) +++ sys/ia64/ia64/pmap.c (working copy) @@ -1512,8 +1512,8 @@ * insert this page into the given map NOW. */ void -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) +pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, + vm_prot_t prot, boolean_t wired) { pmap_t oldpmap; vm_offset_t pa; Index: sys/ia64/include/vmparam.h =================================================================== --- sys/ia64/include/vmparam.h (revision 183520) +++ sys/ia64/include/vmparam.h (working copy) @@ -145,6 +145,13 @@ #define VM_NFREEORDER 16 /* + * Disable superpage reservations. + */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 0 +#endif + +/* * Manipulating region bits of an address. */ #define IA64_RR_BASE(n) (((u_int64_t) (n)) << 61) Index: sys/sun4v/sun4v/pmap.c =================================================================== --- sys/sun4v/sun4v/pmap.c (revision 183520) +++ sys/sun4v/sun4v/pmap.c (working copy) @@ -1039,8 +1039,8 @@ * will be wired down. 
*/ void -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) +pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, + vm_prot_t prot, boolean_t wired) { vm_paddr_t pa, opa; uint64_t tte_data, otte_data; Index: sys/sun4v/include/vmparam.h =================================================================== --- sys/sun4v/include/vmparam.h (revision 183520) +++ sys/sun4v/include/vmparam.h (working copy) @@ -122,6 +122,13 @@ #define VM_NFREEORDER 12 /* + * Disable superpage reservations. + */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 0 +#endif + +/* * Address space layout. * * UltraSPARC I and II implement a 44 bit virtual address space. The address Index: sys/vm/vm_pageq.c =================================================================== --- sys/vm/vm_pageq.c (revision 183520) +++ sys/vm/vm_pageq.c (working copy) @@ -1,115 +0,0 @@ -/*- - * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS - * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct vpgqueues vm_page_queues[PQ_MAXCOUNT]; - -void -vm_pageq_init(void) -{ - int i; - - vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count; - vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count; - vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count; - - for (i = 0; i < PQ_COUNT; i++) { - TAILQ_INIT(&vm_page_queues[i].pl); - } -} - -void -vm_pageq_requeue(vm_page_t m) -{ - int queue = VM_PAGE_GETQUEUE(m); - struct vpgqueues *vpq; - - if (queue != PQ_NONE) { - vpq = &vm_page_queues[queue]; - TAILQ_REMOVE(&vpq->pl, m, pageq); - TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); - } -} - -/* - * vm_pageq_enqueue: - */ -void -vm_pageq_enqueue(int queue, vm_page_t m) -{ - struct vpgqueues *vpq; - - vpq = &vm_page_queues[queue]; - VM_PAGE_SETQUEUE2(m, queue); - TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); - ++*vpq->cnt; -} - -/* - * vm_pageq_remove: - * - * Remove a page from its queue. 
- * - * The queue containing the given page must be locked. - * This routine may not block. - */ -void -vm_pageq_remove(vm_page_t m) -{ - int queue = VM_PAGE_GETQUEUE(m); - struct vpgqueues *pq; - - if (queue != PQ_NONE) { - VM_PAGE_SETQUEUE2(m, PQ_NONE); - pq = &vm_page_queues[queue]; - TAILQ_REMOVE(&pq->pl, m, pageq); - (*pq->cnt)--; - } -} Index: sys/vm/vm_kern.c =================================================================== --- sys/vm/vm_kern.c (revision 183520) +++ sys/vm/vm_kern.c (working copy) @@ -109,8 +109,8 @@ size = round_page(size); addr = vm_map_min(map); - result = vm_map_find(map, NULL, 0, - &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + result = vm_map_find(map, NULL, 0, &addr, size, VMFS_ANY_SPACE, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); if (result != KERN_SUCCESS) { return (0); } @@ -221,21 +221,20 @@ * parent Map to take range from * min, max Returned endpoints of map * size Size of range to find + * superpage_align Request that min is superpage aligned */ vm_map_t -kmem_suballoc(parent, min, max, size) - vm_map_t parent; - vm_offset_t *min, *max; - vm_size_t size; +kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max, + vm_size_t size, boolean_t superpage_align) { int ret; vm_map_t result; size = round_page(size); - *min = (vm_offset_t) vm_map_min(parent); - ret = vm_map_find(parent, NULL, (vm_offset_t) 0, - min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); + *min = vm_map_min(parent); + ret = vm_map_find(parent, NULL, 0, min, size, superpage_align ? + VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (ret != KERN_SUCCESS) { printf("kmem_suballoc: bad status return of %d.\n", ret); panic("kmem_suballoc"); @@ -261,9 +260,6 @@ * (kmem_object). This, combined with the fact that only malloc uses * this routine, ensures that we will never block in map or object waits. * - * Note that this still only works in a uni-processor environment and - * when called at splhigh(). - * * We don't worry about expanding the map (adding entries) since entries * for wired maps are statically allocated. * @@ -413,7 +409,8 @@ /* * Because this is kernel_pmap, this call will not block. */ - pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, 1); + pmap_enter(kernel_pmap, addr + i, VM_PROT_ALL, m, VM_PROT_ALL, + TRUE); vm_page_wakeup(m); } VM_OBJECT_UNLOCK(kmem_object); Index: sys/vm/memguard.c =================================================================== --- sys/vm/memguard.c (revision 183520) +++ sys/vm/memguard.c (working copy) @@ -174,7 +174,7 @@ size *= PAGE_SIZE; memguard_map = kmem_suballoc(parent_map, (vm_offset_t *)&base, - (vm_offset_t *)&limit, (vm_size_t)size); + (vm_offset_t *)&limit, (vm_size_t)size, FALSE); memguard_map->system_map = 1; memguard_mapsize = size; memguard_mapused = 0; Index: sys/vm/vm_pageout.c =================================================================== --- sys/vm/vm_pageout.c (revision 183520) +++ sys/vm/vm_pageout.c (working copy) @@ -567,14 +567,14 @@ pmap_remove_all(p); vm_page_deactivate(p); } else { - vm_pageq_requeue(p); + vm_page_requeue(p); } } else { vm_page_activate(p); vm_page_flag_clear(p, PG_REFERENCED); if (p->act_count < (ACT_MAX - ACT_ADVANCE)) p->act_count += ACT_ADVANCE; - vm_pageq_requeue(p); + vm_page_requeue(p); } } else if (p->queue == PQ_INACTIVE) { pmap_remove_all(p); @@ -763,7 +763,7 @@ * A held page may be undergoing I/O, so skip it. 
*/ if (m->hold_count) { - vm_pageq_requeue(m); + vm_page_requeue(m); addl_page_shortage++; continue; } @@ -878,7 +878,7 @@ * the thrash point for a heavily loaded machine. */ vm_page_flag_set(m, PG_WINATCFLS); - vm_pageq_requeue(m); + vm_page_requeue(m); } else if (maxlaunder > 0) { /* * We always want to try to flush some dirty pages if @@ -906,7 +906,7 @@ */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(object); - vm_pageq_requeue(m); + vm_page_requeue(m); continue; } @@ -999,7 +999,7 @@ * be undergoing I/O, so skip it */ if (m->hold_count) { - vm_pageq_requeue(m); + vm_page_requeue(m); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; @@ -1080,7 +1080,7 @@ (m->oflags & VPO_BUSY) || (m->hold_count != 0)) { VM_OBJECT_UNLOCK(object); - vm_pageq_requeue(m); + vm_page_requeue(m); m = next; continue; } @@ -1117,7 +1117,7 @@ * page activation count stats. */ if (actcount && (object->ref_count != 0)) { - vm_pageq_requeue(m); + vm_page_requeue(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm || @@ -1134,7 +1134,7 @@ vm_page_deactivate(m); } } else { - vm_pageq_requeue(m); + vm_page_requeue(m); } } VM_OBJECT_UNLOCK(object); @@ -1320,7 +1320,7 @@ (m->oflags & VPO_BUSY) || (m->hold_count != 0)) { VM_OBJECT_UNLOCK(object); - vm_pageq_requeue(m); + vm_page_requeue(m); m = next; continue; } @@ -1336,7 +1336,7 @@ m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; - vm_pageq_requeue(m); + vm_page_requeue(m); } else { if (m->act_count == 0) { /* @@ -1352,7 +1352,7 @@ vm_page_deactivate(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); - vm_pageq_requeue(m); + vm_page_requeue(m); } } VM_OBJECT_UNLOCK(object); Index: sys/vm/vm_map.c =================================================================== --- sys/vm/vm_map.c (revision 183520) +++ sys/vm/vm_map.c (working copy) @@ -1172,13 +1172,12 @@ int vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, - vm_offset_t *addr /* IN/OUT */, vm_size_t length, vm_prot_t prot, + vm_offset_t start, vm_size_t length, vm_prot_t prot, vm_prot_t max, int cow) { - vm_offset_t start, end; + vm_offset_t end; int result; - start = *addr; vm_map_lock(map); end = start + length; VM_MAP_RANGE_CHECK(map, start, end); @@ -1201,7 +1200,7 @@ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ - vm_size_t length, boolean_t find_space, vm_prot_t prot, + vm_size_t length, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t start; @@ -1209,15 +1208,20 @@ start = *addr; vm_map_lock(map); - if (find_space) { - if (vm_map_findspace(map, start, length, addr)) { - vm_map_unlock(map); - return (KERN_NO_SPACE); + do { + if (find_space != VMFS_NO_SPACE) { + if (vm_map_findspace(map, start, length, addr)) { + vm_map_unlock(map); + return (KERN_NO_SPACE); + } + if (find_space == VMFS_ALIGNED_SPACE) + pmap_align_superpage(object, offset, addr, + length); + start = *addr; } - start = *addr; - } - result = vm_map_insert(map, object, offset, - start, start + length, prot, max, cow); + result = vm_map_insert(map, object, offset, start, start + + length, prot, max, cow); + } while (result == KERN_NO_SPACE && find_space == VMFS_ALIGNED_SPACE); vm_map_unlock(map); return (result); } Index: sys/vm/vm_map.h =================================================================== --- sys/vm/vm_map.h (revision 183520) +++ sys/vm/vm_map.h (working copy) @@ -320,6 +320,13 @@ #define 
VM_FAULT_DIRTY 8 /* Dirty the page */ /* + * The following "find_space" options are supported by vm_map_find() + */ +#define VMFS_NO_SPACE 0 /* don't find; use the given range */ +#define VMFS_ANY_SPACE 1 /* find a range with any alignment */ +#define VMFS_ALIGNED_SPACE 2 /* find a superpage-aligned range */ + +/* * vm_map_wire and vm_map_unwire option flags */ #define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ @@ -332,8 +339,10 @@ boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t); vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t); int vm_map_delete (vm_map_t, vm_offset_t, vm_offset_t); -int vm_map_find (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t, vm_prot_t, vm_prot_t, int); -int vm_map_fixed (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int); +int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, + int, vm_prot_t, vm_prot_t, int); +int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t, + vm_prot_t, vm_prot_t, int); int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *); int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t); void vm_map_init (struct vm_map *, vm_offset_t, vm_offset_t); Index: sys/vm/vm_phys.c =================================================================== --- sys/vm/vm_phys.c (revision 183520) +++ sys/vm/vm_phys.c (working copy) @@ -54,6 +54,7 @@ #include #include #include +#include struct vm_freelist { struct pglist pl; @@ -464,11 +465,13 @@ } /* - * Remove the given physical page "m" from the free lists. + * Search for the given physical page "m" in the free lists. If the search + * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return + * FALSE, indicating that "m" is not in the free lists. * * The free page queues must be locked. */ -void +boolean_t vm_phys_unfree_page(vm_page_t m) { struct vm_freelist *fl; @@ -486,21 +489,21 @@ */ seg = &vm_phys_segs[m->segind]; for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && - order < VM_NFREEORDER; ) { + order < VM_NFREEORDER - 1; ) { order++; pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); - KASSERT(pa >= seg->start && pa < seg->end, - ("vm_phys_unfree_page: paddr %#jx is not within segment %p", - (uintmax_t)pa, seg)); - m_set = &seg->first_page[atop(pa - seg->start)]; + if (pa >= seg->start) + m_set = &seg->first_page[atop(pa - seg->start)]; + else + return (FALSE); } - KASSERT(m_set->order >= order, ("vm_phys_unfree_page: page %p's order" - " (%d) is less than expected (%d)", m_set, m_set->order, order)); + if (m_set->order < order) + return (FALSE); + if (m_set->order == VM_NFREEORDER) + return (FALSE); KASSERT(m_set->order < VM_NFREEORDER, ("vm_phys_unfree_page: page %p has unexpected order %d", m_set, m_set->order)); - KASSERT(order < VM_NFREEORDER, - ("vm_phys_unfree_page: order %d is out of range", order)); /* * Next, remove "m_set" from the free lists. Finally, extract @@ -527,6 +530,7 @@ fl[order].lcnt++; } KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); + return (TRUE); } /* @@ -604,6 +608,9 @@ /* Compute the queue that is the best fit for npages. 
*/ for (order = 0; (1 << order) < npages; order++); mtx_lock(&vm_page_queue_free_mtx); +#if VM_NRESERVLEVEL > 0 +retry: +#endif for (flind = 0; flind < vm_nfreelists; flind++) { for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { @@ -661,6 +668,10 @@ } } } +#if VM_NRESERVLEVEL > 0 + if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary)) + goto retry; +#endif mtx_unlock(&vm_page_queue_free_mtx); return (NULL); done: Index: sys/vm/vm_phys.h =================================================================== --- sys/vm/vm_phys.h (revision 183520) +++ sys/vm/vm_phys.h (working copy) @@ -38,6 +38,8 @@ #ifndef _VM_PHYS_H_ #define _VM_PHYS_H_ +#ifdef _KERNEL + void vm_phys_add_page(vm_paddr_t pa); vm_page_t vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high, @@ -47,7 +49,8 @@ void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); void vm_phys_set_pool(int pool, vm_page_t m, int order); -void vm_phys_unfree_page(vm_page_t m); +boolean_t vm_phys_unfree_page(vm_page_t m); boolean_t vm_phys_zero_pages_idle(void); +#endif /* _KERNEL */ #endif /* !_VM_PHYS_H_ */ Index: sys/vm/vm_mmap.c =================================================================== --- sys/vm/vm_mmap.c (revision 183520) +++ sys/vm/vm_mmap.c (working copy) @@ -466,7 +466,7 @@ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; - int len; + size_t len; int flags; }; #endif @@ -1401,17 +1401,15 @@ maxprot |= VM_PROT_EXECUTE; #endif - if (fitit) - *addr = pmap_addr_hint(object, *addr, size); - if (flags & MAP_STACK) rv = vm_map_stack(map, *addr, size, prot, maxprot, docow | MAP_STACK_GROWS_DOWN); else if (fitit) - rv = vm_map_find(map, object, foff, addr, size, TRUE, - prot, maxprot, docow); + rv = vm_map_find(map, object, foff, addr, size, + object != NULL && object->type == OBJT_DEVICE ? 
+ VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow); else - rv = vm_map_fixed(map, object, foff, addr, size, + rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot, docow); if (rv != KERN_SUCCESS) { Index: sys/vm/pmap.h =================================================================== --- sys/vm/pmap.h (revision 183520) +++ sys/vm/pmap.h (working copy) @@ -90,14 +90,16 @@ */ extern vm_offset_t kernel_vm_end; +void pmap_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, + vm_size_t); void pmap_change_wiring(pmap_t, vm_offset_t, boolean_t); void pmap_clear_modify(vm_page_t m); void pmap_clear_reference(vm_page_t m); void pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); void pmap_copy_page(vm_page_t, vm_page_t); -void pmap_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, - boolean_t); -void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, +void pmap_enter(pmap_t, vm_offset_t, vm_prot_t, vm_page_t, + vm_prot_t, boolean_t); +void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot); Index: sys/vm/vm_init.c =================================================================== --- sys/vm/vm_init.c (revision 183520) +++ sys/vm/vm_init.c (working copy) @@ -186,16 +186,17 @@ panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva, - (nbuf*BKVASIZE) + (nswbuf*MAXPHYS)); + nbuf * BKVASIZE + nswbuf * MAXPHYS, FALSE); buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva, - &kmi->buffer_eva, (nbuf*BKVASIZE)); + &kmi->buffer_eva, nbuf * BKVASIZE, FALSE); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva, - (nswbuf*MAXPHYS)); + nswbuf * MAXPHYS, FALSE); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, - (exec_map_entries*(ARG_MAX+(PAGE_SIZE*3)))); - pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva); + exec_map_entries * (ARG_MAX + (PAGE_SIZE * 3)), FALSE); + pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva, + FALSE); /* * XXX: Mbuf system machine-specific initializations should Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c (revision 183520) +++ sys/vm/vm_object.c (working copy) @@ -65,6 +65,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_vm.h" + #include #include #include @@ -90,6 +92,7 @@ #include #include #include +#include #include #define EASY_SCAN_FACTOR 8 @@ -170,6 +173,11 @@ KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages", object)); +#if VM_NRESERVLEVEL > 0 + KASSERT(LIST_EMPTY(&object->rvq), + ("object %p has reservations", + object)); +#endif KASSERT(object->cache == NULL, ("object %p has cached pages", object)); @@ -220,6 +228,9 @@ object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; +#if VM_NRESERVLEVEL > 0 + LIST_INIT(&object->rvq); +#endif object->cache = NULL; mtx_lock(&vm_object_list_mtx); @@ -241,10 +252,18 @@ VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); +#if VM_NRESERVLEVEL > 0 + kernel_object->flags |= OBJ_COLORED; + kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); +#endif VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object"); 
_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); +#if VM_NRESERVLEVEL > 0 + kmem_object->flags |= OBJ_COLORED; + kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); +#endif /* * The lock portion of struct vm_object must be type stable due @@ -676,6 +695,10 @@ } vm_page_unlock_queues(); +#if VM_NRESERVLEVEL > 0 + if (__predict_false(!LIST_EMPTY(&object->rvq))) + vm_reserv_break_all(object); +#endif if (__predict_false(object->cache != NULL)) vm_page_cache_free(object, 0, 0); @@ -1262,7 +1285,13 @@ LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; source->generation++; +#if VM_NRESERVLEVEL > 0 + result->flags |= source->flags & (OBJ_NEEDGIANT | OBJ_COLORED); + result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & + ((1 << (VM_NFREEORDER - 1)) - 1); +#else result->flags |= source->flags & OBJ_NEEDGIANT; +#endif VM_OBJECT_UNLOCK(source); } @@ -1574,7 +1603,15 @@ continue; } +#if VM_NRESERVLEVEL > 0 /* + * Rename the reservation. + */ + vm_reserv_rename(p, object, backing_object, + backing_offset_index); +#endif + + /* * Page does not exist in parent, rename the * page from the backing object to the main object. * @@ -1676,7 +1713,15 @@ */ vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT); +#if VM_NRESERVLEVEL > 0 /* + * Break any reservations from backing_object. + */ + if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) + vm_reserv_break_all(backing_object); +#endif + + /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { Index: sys/vm/vm_reserv.c =================================================================== --- sys/vm/vm_reserv.c (revision 183520) +++ sys/vm/vm_reserv.c (working copy) @@ -1,6 +1,6 @@ /*- * Copyright (c) 2002-2006 Rice University - * Copyright (c) 2007 Alan L. Cox + * Copyright (c) 2007-2008 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, @@ -170,6 +170,7 @@ static boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex); static void vm_reserv_populate(vm_reserv_t rv); +static void vm_reserv_reclaim(vm_reserv_t rv); /* * Describes the current state of the partially-populated reservation queue. @@ -568,6 +569,37 @@ } /* + * Breaks the given partially-populated reservation, releasing its cached and + * free pages to the physical memory allocator. + * + * The free page queue lock must be held. + */ +static void +vm_reserv_reclaim(vm_reserv_t rv) +{ + int i; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + KASSERT(rv->inpartpopq, + ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv)); + TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); + rv->inpartpopq = FALSE; + KASSERT(rv->object != NULL, + ("vm_reserv_reclaim: reserv %p is free", rv)); + LIST_REMOVE(rv, objq); + rv->object = NULL; + for (i = 0; i < VM_LEVEL_0_NPAGES; i++) { + if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) + vm_phys_free_pages(&rv->pages[i], 0); + else + rv->popcnt--; + } + KASSERT(rv->popcnt == 0, + ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv)); + vm_reserv_reclaimed++; +} + +/* * Breaks the reservation at the head of the partially-populated reservation * queue, releasing its cached and free pages to the physical memory * allocator. Returns TRUE if a reservation is broken and FALSE otherwise. @@ -575,38 +607,69 @@ * The free page queue lock must be held. 
*/ boolean_t -vm_reserv_reclaim(void) +vm_reserv_reclaim_inactive(void) { vm_reserv_t rv; - int i; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) { - KASSERT(rv->inpartpopq, - ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", - rv)); - TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); - rv->inpartpopq = FALSE; - KASSERT(rv->object != NULL, - ("vm_reserv_reclaim: reserv %p is free", rv)); - LIST_REMOVE(rv, objq); - rv->object = NULL; - for (i = 0; i < VM_LEVEL_0_NPAGES; i++) { - if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) - vm_phys_free_pages(&rv->pages[i], 0); - else - rv->popcnt--; - } - KASSERT(rv->popcnt == 0, - ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", - rv)); - vm_reserv_reclaimed++; + vm_reserv_reclaim(rv); return (TRUE); } return (FALSE); } /* + * Searches the partially-populated reservation queue for the least recently + * active reservation with unused pages, i.e., cached or free, that satisfy the + * given request for contiguous physical memory. If a satisfactory reservation + * is found, it is broken. Returns TRUE if a reservation is broken and FALSE + * otherwise. + * + * The free page queue lock must be held. + */ +boolean_t +vm_reserv_reclaim_contig(vm_paddr_t size, vm_paddr_t low, vm_paddr_t high, + unsigned long alignment, unsigned long boundary) +{ + vm_paddr_t pa, pa_length; + vm_reserv_t rv; + int i; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + if (size > VM_LEVEL_0_SIZE - PAGE_SIZE) + return (FALSE); + TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) { + pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]); + if (pa + PAGE_SIZE - size < low) { + /* this entire reservation is too low; go to next */ + continue; + } + pa_length = 0; + for (i = 0; i < VM_LEVEL_0_NPAGES; i++) + if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) { + pa_length += PAGE_SIZE; + if (pa_length == PAGE_SIZE) { + pa = VM_PAGE_TO_PHYS(&rv->pages[i]); + if (pa + size > high) { + /* skip to next reservation */ + break; + } else if (pa < low || + (pa & (alignment - 1)) != 0 || + ((pa ^ (pa + size - 1)) & + ~(boundary - 1)) != 0) + pa_length = 0; + } else if (pa_length >= size) { + vm_reserv_reclaim(rv); + return (TRUE); + } + } else + pa_length = 0; + } + return (FALSE); +} + +/* * Transfers the reservation underlying the given page to a new object. * * The object must be locked. 
Index: sys/vm/vm_extern.h =================================================================== --- sys/vm/vm_extern.h (revision 183520) +++ sys/vm/vm_extern.h (working copy) @@ -63,7 +63,8 @@ void kmem_free_wakeup(vm_map_t, vm_offset_t, vm_size_t); void kmem_init(vm_offset_t, vm_offset_t); vm_offset_t kmem_malloc(vm_map_t, vm_size_t, boolean_t); -vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t); +vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t, + boolean_t); void swapout_procs(int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); Index: sys/vm/vm_object.h =================================================================== --- sys/vm/vm_object.h (revision 183520) +++ sys/vm/vm_object.h (working copy) @@ -100,6 +100,7 @@ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ + LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ vm_page_t cache; /* root of the cache page splay tree */ void *handle; union { @@ -143,6 +144,7 @@ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */ #define OBJ_CLEANING 0x0200 +#define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_NEEDGIANT 0x8000 /* object requires Giant */ Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c (revision 183520) +++ sys/vm/vm_fault.c (working copy) @@ -74,6 +74,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_vm.h" + #include #include #include @@ -429,6 +431,13 @@ */ fs.m = NULL; if (!vm_page_count_severe()) { +#if VM_NRESERVLEVEL > 0 + if ((fs.object->flags & OBJ_COLORED) == 0) { + fs.object->flags |= OBJ_COLORED; + fs.object->pg_color = atop(vaddr) - + fs.pindex; + } +#endif fs.m = vm_page_alloc(fs.object, fs.pindex, (fs.vp || fs.object->backing_object)? VM_ALLOC_NORMAL: VM_ALLOC_ZERO); } @@ -475,7 +484,8 @@ fs.pindex < fs.entry->lastr + VM_FAULT_READ)) && (fs.first_object == fs.object || (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object))) && - fs.first_object->type != OBJT_DEVICE) { + fs.first_object->type != OBJT_DEVICE && + fs.first_object->type != OBJT_PHYS) { vm_pindex_t firstpindex, tmppindex; if (fs.first_pindex < 2 * VM_FAULT_READ) @@ -499,7 +509,6 @@ break; if (mt->busy || (mt->oflags & VPO_BUSY) || - (mt->flags & (PG_FICTITIOUS | PG_UNMANAGED)) || mt->hold_count || mt->wire_count) continue; @@ -879,7 +888,7 @@ * back on the active queue until later so that the pageout daemon * won't find it (yet). */ - pmap_enter(fs.map->pmap, vaddr, fs.m, prot, wired); + pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) { vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); } @@ -1107,6 +1116,10 @@ */ dst_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(dst_entry->end - dst_entry->start)); +#if VM_NRESERVLEVEL > 0 + dst_object->flags |= OBJ_COLORED; + dst_object->pg_color = atop(dst_entry->start); +#endif VM_OBJECT_LOCK(dst_object); dst_entry->object.vm_object = dst_object; @@ -1164,9 +1177,10 @@ VM_OBJECT_UNLOCK(dst_object); /* - * Enter it in the pmap... + * Enter it in the pmap as a read and/or execute access. 
*/ - pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE); + pmap_enter(dst_map->pmap, vaddr, prot & ~VM_PROT_WRITE, dst_m, + prot, FALSE); /* * Mark it no longer busy, and put it on the active list. Index: sys/vm/device_pager.c =================================================================== --- sys/vm/device_pager.c (revision 183520) +++ sys/vm/device_pager.c (working copy) @@ -146,10 +146,14 @@ object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object == NULL) { /* - * Allocate object and associate it with the pager. + * Allocate object and associate it with the pager. Initialize + * the object's pg_color based upon the physical address of the + * device's memory. */ mtx_unlock(&dev_pager_mtx); object1 = vm_object_allocate(OBJT_DEVICE, pindex); + object1->flags |= OBJ_COLORED; + object1->pg_color = atop(paddr) - OFF_TO_IDX(off - PAGE_SIZE); mtx_lock(&dev_pager_mtx); object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object != NULL) { Index: sys/vm/vm_reserv.h =================================================================== --- sys/vm/vm_reserv.h (revision 183520) +++ sys/vm/vm_reserv.h (working copy) @@ -1,6 +1,6 @@ /*- * Copyright (c) 2002-2006 Rice University - * Copyright (c) 2007 Alan L. Cox + * Copyright (c) 2007-2008 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, @@ -48,7 +48,10 @@ void vm_reserv_init(void); int vm_reserv_level_iffullpop(vm_page_t m); boolean_t vm_reserv_reactivate_page(vm_page_t m); -boolean_t vm_reserv_reclaim(void); +boolean_t vm_reserv_reclaim_contig(vm_paddr_t size, vm_paddr_t low, + vm_paddr_t high, unsigned long alignment, + unsigned long boundary); +boolean_t vm_reserv_reclaim_inactive(void); void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset); vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, Index: sys/vm/vm_page.c =================================================================== --- sys/vm/vm_page.c (revision 183520) +++ sys/vm/vm_page.c (working copy) @@ -1,6 +1,7 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. + * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. @@ -99,6 +100,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_vm.h" + #include #include #include @@ -118,6 +121,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +133,7 @@ * page structure. */ +struct vpgqueues vm_page_queues[PQ_COUNT]; struct mtx vm_page_queue_mtx; struct mtx vm_page_queue_free_mtx; @@ -142,6 +147,8 @@ SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); +static void vm_page_enqueue(int queue, vm_page_t m); + /* * vm_set_page_size: * @@ -261,7 +268,11 @@ * Initialize the queue headers for the free queue, the active queue * and the inactive queue. 
*/ - vm_pageq_init(); + for (i = 0; i < PQ_COUNT; i++) + TAILQ_INIT(&vm_page_queues[i].pl); + vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count; + vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count; + vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count; /* * Allocate memory for use when boot strapping the kernel memory @@ -325,6 +336,13 @@ mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; +#if VM_NRESERVLEVEL > 0 + /* + * Allocate memory for the reservation management system's data + * structures. + */ + new_end = vm_reserv_startup(&vaddr, new_end, high_water); +#endif #ifdef __amd64__ /* * pmap_map on amd64 comes out of the direct-map, not kvm like i386, @@ -380,6 +398,12 @@ } } freeenv(list); +#if VM_NRESERVLEVEL > 0 + /* + * Initialize the reservation management system. + */ + vm_reserv_init(); +#endif return (vaddr); } @@ -1043,14 +1067,35 @@ mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } - vm_phys_unfree_page(m); - vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); + if (vm_phys_unfree_page(m)) + vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); +#if VM_NRESERVLEVEL > 0 + else if (!vm_reserv_reactivate_page(m)) +#else + else +#endif + panic("vm_page_alloc: cache page %p is missing" + " from the free queue", m); } else if ((req & VM_ALLOC_IFCACHED) != 0) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); - } else +#if VM_NRESERVLEVEL > 0 + } else if (object == NULL || object->type == OBJT_DEVICE || + (object->flags & OBJ_COLORED) == 0 || + (m = vm_reserv_alloc_page(object, pindex)) == NULL) { +#else + } else { +#endif m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); +#if VM_NRESERVLEVEL > 0 + if (m == NULL && vm_reserv_reclaim_inactive()) { + m = vm_phys_alloc_pages(object != NULL ? + VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, + 0); + } +#endif + } } else { /* * Not allocatable, give up. @@ -1188,6 +1233,67 @@ } /* + * vm_page_requeue: + * + * If the given page is contained within a page queue, move it to the tail + * of that queue. + * + * The page queues must be locked. + */ +void +vm_page_requeue(vm_page_t m) +{ + int queue = VM_PAGE_GETQUEUE(m); + struct vpgqueues *vpq; + + if (queue != PQ_NONE) { + vpq = &vm_page_queues[queue]; + TAILQ_REMOVE(&vpq->pl, m, pageq); + TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); + } +} + +/* + * vm_pageq_remove: + * + * Remove a page from its queue. + * + * The queue containing the given page must be locked. + * This routine may not block. + */ +void +vm_pageq_remove(vm_page_t m) +{ + int queue = VM_PAGE_GETQUEUE(m); + struct vpgqueues *pq; + + if (queue != PQ_NONE) { + VM_PAGE_SETQUEUE2(m, PQ_NONE); + pq = &vm_page_queues[queue]; + TAILQ_REMOVE(&pq->pl, m, pageq); + (*pq->cnt)--; + } +} + +/* + * vm_page_enqueue: + * + * Add the given page to the specified queue. + * + * The page queues must be locked. + */ +static void +vm_page_enqueue(int queue, vm_page_t m) +{ + struct vpgqueues *vpq; + + vpq = &vm_page_queues[queue]; + VM_PAGE_SETQUEUE2(m, queue); + TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); + ++*vpq->cnt; +} + +/* * vm_page_activate: * * Put the specified page on the active list (if appropriate). 
@@ -1207,7 +1313,7 @@ if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; - vm_pageq_enqueue(PQ_ACTIVE, m); + vm_page_enqueue(PQ_ACTIVE, m); } } else { if (m->act_count < ACT_INIT) @@ -1310,18 +1416,21 @@ } if (m->hold_count != 0) { m->flags &= ~PG_ZERO; - vm_pageq_enqueue(PQ_HOLD, m); + vm_page_enqueue(PQ_HOLD, m); } else { m->flags |= PG_FREE; mtx_lock(&vm_page_queue_free_mtx); cnt.v_free_count++; - if ((m->flags & PG_ZERO) != 0) { +#if VM_NRESERVLEVEL > 0 + if (!vm_reserv_free_page(m)) +#else + if (TRUE) +#endif vm_phys_free_pages(m, 0); + if ((m->flags & PG_ZERO) != 0) ++vm_page_zero_count; - } else { - vm_phys_free_pages(m, 0); + else vm_page_zero_idle_wakeup(); - } vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } @@ -1400,10 +1509,10 @@ if (m->flags & PG_UNMANAGED) { ; } else if (activate) - vm_pageq_enqueue(PQ_ACTIVE, m); + vm_page_enqueue(PQ_ACTIVE, m); else { vm_page_flag_clear(m, PG_WINATCFLS); - vm_pageq_enqueue(PQ_INACTIVE, m); + vm_page_enqueue(PQ_INACTIVE, m); } } } else { @@ -1564,7 +1673,6 @@ vm_page_flag_set(m, PG_CACHED); vm_page_flag_clear(m, PG_ZERO); mtx_lock(&vm_page_queue_free_mtx); - vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0); cnt.v_cache_count++; root = object->cache; if (root == NULL) { @@ -1585,7 +1693,14 @@ } } object->cache = m; - vm_phys_free_pages(m, 0); +#if VM_NRESERVLEVEL > 0 + if (!vm_reserv_free_page(m)) { +#else + if (TRUE) { +#endif + vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0); + vm_phys_free_pages(m, 0); + } vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); Index: sys/vm/vm_page.h =================================================================== --- sys/vm/vm_page.h (revision 183520) +++ sys/vm/vm_page.h (working copy) @@ -162,7 +162,6 @@ #define PQ_ACTIVE 2 #define PQ_HOLD 3 #define PQ_COUNT 4 -#define PQ_MAXCOUNT 4 /* Returns the real queue a page is on. 
*/ #define VM_PAGE_GETQUEUE(m) ((m)->queue) @@ -181,7 +180,7 @@ int *cnt; }; -extern struct vpgqueues vm_page_queues[PQ_MAXCOUNT]; +extern struct vpgqueues vm_page_queues[PQ_COUNT]; extern struct mtx vm_page_queue_free_mtx; /* @@ -311,10 +310,7 @@ void vm_page_dirty(vm_page_t m); void vm_page_wakeup(vm_page_t m); -void vm_pageq_init(void); -void vm_pageq_enqueue(int queue, vm_page_t m); void vm_pageq_remove(vm_page_t m); -void vm_pageq_requeue(vm_page_t m); void vm_page_activate (vm_page_t); vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int); @@ -331,6 +327,7 @@ vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); void vm_page_remove (vm_page_t); void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t); +void vm_page_requeue(vm_page_t m); void vm_page_sleep(vm_page_t m, const char *msg); vm_page_t vm_page_splay(vm_pindex_t, vm_page_t); vm_offset_t vm_page_startup(vm_offset_t vaddr); Index: sys/vm/vm.h =================================================================== --- sys/vm/vm.h (revision 183520) +++ sys/vm/vm.h (working copy) @@ -114,6 +114,9 @@ typedef struct vm_page *vm_page_t; #endif /* _KERNEL */ +struct vm_reserv; +typedef struct vm_reserv *vm_reserv_t; + /* * Information passed from the machine-independant VM initialization code * for use by machine-dependant code (mainly for MMU support) Index: sys/vm/vnode_pager.c =================================================================== --- sys/vm/vnode_pager.c (revision 183520) +++ sys/vm/vnode_pager.c (working copy) @@ -404,21 +404,6 @@ pmap_zero_page_area(m, base, size); /* - * XXX work around SMP data integrity race - * by unmapping the page from user processes. - * The garbage we just cleared may be mapped - * to a user process running on another cpu - * and this code is not running through normal - * I/O channels which handle SMP issues for - * us, so unmap page to synchronize all cpus. - * - * XXX should vm_pager_unmap_page() have - * dealt with this? - */ - vm_page_lock_queues(); - pmap_remove_all(m); - - /* * Clear out partial-page dirty bits. This * has the side effect of setting the valid * bits, but that is ok. There are a bunch @@ -431,6 +416,7 @@ * bits. This would prevent bogus_page * replacement from working properly. 
*/ + vm_page_lock_queues(); vm_page_set_validclean(m, base, size); if (m->dirty != 0) m->dirty = VM_PAGE_BITS_ALL; Index: sys/i386/include/pmap.h =================================================================== --- sys/i386/include/pmap.h (revision 183520) +++ sys/i386/include/pmap.h (working copy) @@ -82,6 +82,13 @@ #define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ /* + * Promotion to a 2 or 4MB (PDE) page mapping requires that the corresponding + * 4KB (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \ + PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V) + +/* * Page Protection Exception bits */ @@ -213,6 +220,9 @@ #ifdef PAE +#define pde_cmpset(pdep, old, new) \ + atomic_cmpset_64((pdep), (old), (new)) + static __inline pt_entry_t pte_load(pt_entry_t *ptep) { @@ -269,6 +279,9 @@ #else /* PAE */ +#define pde_cmpset(pdep, old, new) \ + atomic_cmpset_int((pdep), (old), (new)) + static __inline pt_entry_t pte_load(pt_entry_t *ptep) { @@ -316,7 +329,7 @@ struct pv_chunk; struct md_page { - int pv_list_count; + int pv_unused; TAILQ_HEAD(,pv_entry) pv_list; }; @@ -331,6 +344,7 @@ pdpt_entry_t *pm_pdpt; /* KVA of page director pointer table */ #endif + vm_page_t pm_root; /* spare page table pages */ }; typedef struct pmap *pmap_t; @@ -394,7 +408,6 @@ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; -#define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) void pmap_bootstrap(vm_paddr_t); @@ -407,6 +420,7 @@ void *pmap_mapbios(vm_paddr_t, vm_size_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); void pmap_unmapdev(vm_offset_t, vm_size_t); pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2; void pmap_set_pg(void); Index: sys/i386/include/vmparam.h =================================================================== --- sys/i386/include/vmparam.h (revision 183520) +++ sys/i386/include/vmparam.h (working copy) @@ -123,6 +123,25 @@ #endif /* + * Enable superpage reservations: 1 level. + */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 1 +#endif + +/* + * Level 0 reservations consist of 512 pages under PAE and 1024 pages + * otherwise. + */ +#ifndef VM_LEVEL_0_ORDER +#ifdef PAE +#define VM_LEVEL_0_ORDER 9 +#else +#define VM_LEVEL_0_ORDER 10 +#endif +#endif + +/* * Kernel physical load address. */ #ifndef KERNLOAD Index: sys/i386/i386/pmap.c =================================================================== --- sys/i386/i386/pmap.c (revision 183520) +++ sys/i386/i386/pmap.c (working copy) @@ -5,7 +5,7 @@ * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. - * Copyright (c) 2005 Alan L. Cox + * Copyright (c) 2005-2008 Alan L. Cox * All rights reserved. 
* * This code is derived from software contributed to Berkeley by @@ -112,6 +112,7 @@ #include #include #include +#include #include #include #include @@ -135,6 +136,7 @@ #include #include #include +#include #include #include @@ -158,11 +160,7 @@ #define PMAP_SHPGPERPROC 200 #endif -#if defined(DIAGNOSTIC) -#define PMAP_DIAGNOSTIC -#endif - -#if !defined(PMAP_DIAGNOSTIC) +#if !defined(DIAGNOSTIC) #define PMAP_INLINE __gnu89_inline #else #define PMAP_INLINE @@ -175,6 +173,9 @@ #define PV_STAT(x) do { } while (0) #endif +#define pa_index(pa) ((pa) >> PDRSHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) + /* * Get PDEs and PTEs for user/kernel address space */ @@ -210,10 +211,17 @@ static uma_zone_t pdptzone; #endif +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +static int pg_ps_enabled; +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0, + "Are large page mappings enabled?"); + /* * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ @@ -263,11 +271,29 @@ static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); +static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, + vm_offset_t va); +static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static boolean_t pmap_is_modified_pvh(struct md_page *pvh); +static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); +static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, + vm_prot_t prot); +static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + vm_page_t *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, vm_page_t *free); +static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, vm_page_t *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, @@ -355,6 +381,7 @@ #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif + kernel_pmap->pm_root = NULL; kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); LIST_INIT(&allpmaps); @@ -525,7 +552,6 @@ { TAILQ_INIT(&m->md.pv_list); - m->md.pv_list_count = 0; } #ifdef PAE @@ -606,8 +632,24 @@ void pmap_init(void) { + vm_page_t mpte; + vm_size_t s; + int i, pv_npg; /* + * Initialize the vm page array entries for the kernel pmap's + * page table pages. 
+ */ + for (i = 0; i < nkpt; i++) { + mpte = PHYS_TO_VM_PAGE(PTD[i + KPTDI] & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_init: page table page is out of range")); + mpte->pindex = i + KPTDI; + mpte->phys_addr = PTD[i + KPTDI] & PG_FRAME; + } + + /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. @@ -618,6 +660,26 @@ pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); + /* + * Are large page mappings enabled? + */ + TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + + /* + * Calculate the size of the pv head table for superpages. + */ + for (i = 0; phys_avail[i + 1]; i += 2); + pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR; + + /* + * Allocate memory for the pv head table for superpages. + */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_alloc(kernel_map, s); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE * pv_maxchunks); @@ -633,12 +695,30 @@ } -SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); +SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, + "2/4MB page mapping counters"); + +static u_long pmap_pde_demotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_pde_demotions, 0, "2/4MB page demotions"); + +static u_long pmap_pde_mappings; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_pde_mappings, 0, "2/4MB page mappings"); + +static u_long pmap_pde_p_failures; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); + +static u_long pmap_pde_promotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_pde_promotions, 0, "2/4MB page promotions"); + /*************************************************** * Low level helper routines..... ***************************************************/ @@ -1161,11 +1241,104 @@ while (free != NULL) { m = free; free = m->right; - vm_page_free_zero(m); + /* Preserve the page's PG_ZERO setting. */ + vm_page_free_toq(m); } } /* + * Schedule the specified unused page table page to be freed. Specifically, + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. + */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + m->right = *free; + *free = m; +} + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. 
+ */ +static void +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +{ + vm_page_t root; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + root = pmap->pm_root; + if (root == NULL) { + mpte->left = NULL; + mpte->right = NULL; + } else { + root = vm_page_splay(mpte->pindex, root); + if (mpte->pindex < root->pindex) { + mpte->left = root->left; + mpte->right = root; + root->left = NULL; + } else if (mpte->pindex == root->pindex) + panic("pmap_insert_pt_page: pindex already inserted"); + else { + mpte->right = root->right; + mpte->left = root; + root->right = NULL; + } + } + pmap->pm_root = mpte; +} + +/* + * Looks for a page table page mapping the specified virtual address in the + * specified pmap's collection of idle page table pages. Returns NULL if there + * is no page table page corresponding to the specified virtual address. + */ +static vm_page_t +pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +{ + vm_page_t mpte; + vm_pindex_t pindex = va >> PDRSHIFT; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { + mpte = vm_page_splay(pindex, mpte); + if ((pmap->pm_root = mpte)->pindex != pindex) + mpte = NULL; + } + return (mpte); +} + +/* + * Removes the specified page table page from the specified pmap's collection + * of idle page table pages. The specified page table page must be a member of + * the pmap's collection. + */ +static void +pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) +{ + vm_page_t root; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (mpte != pmap->pm_root) + vm_page_splay(mpte->pindex, pmap->pm_root); + if (mpte->left == NULL) + root = mpte->right; + else { + root = vm_page_splay(mpte->pindex, mpte->left); + root->right = mpte->right; + } + pmap->pm_root = root; +} + +/* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ @@ -1209,8 +1382,7 @@ * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ - m->right = *free; - *free = m; + pmap_add_delayed_free_list(m, free, TRUE); return 1; } @@ -1241,6 +1413,7 @@ #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif + pmap->pm_root = NULL; pmap->pm_active = 0; PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); @@ -1284,7 +1457,10 @@ KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif + pmap->pm_root = NULL; } + KASSERT(pmap->pm_root == NULL, + ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) @@ -1405,10 +1581,8 @@ * normal 4K page. 
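 * Previously the 2/4MB mapping was simply cleared and the entire TLB
 * flushed; it is now demoted instead, so the rest of the superpage
 * region stays mapped while the caller works on its 4KB page.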
*/ if (ptepa & PG_PS) { - pmap->pm_pdir[ptepindex] = 0; - ptepa = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - pmap_invalidate_all(kernel_pmap); + (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); + ptepa = pmap->pm_pdir[ptepindex]; } /* @@ -1542,6 +1716,8 @@ KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); + KASSERT(pmap->pm_root == NULL, + ("pmap_release: pmap has reserved page table page(s)")); pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); @@ -1628,13 +1804,10 @@ continue; } - /* - * This index is bogus, but out of the way - */ - nkpg = vm_page_alloc(NULL, nkpt, + nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if (!nkpg) + if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; @@ -1728,6 +1901,8 @@ static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { + struct md_page *pvh; + pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; @@ -1747,26 +1922,27 @@ else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; + pde = pmap_pde(pmap, va); + KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, va); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); - if (tpte & PG_M) { - KASSERT((tpte & PG_RW), - ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", - va, (uintmax_t)tpte)); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } free = NULL; pmap_unuse_pt(pmap, va, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); - m->md.pv_list_count--; + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); @@ -1908,25 +2084,112 @@ return (pv); } -static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +static __inline pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; - PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pmap == PV_PMAP(pv) && va == pv->pv_va) + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; + } } - KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count--; - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); + return (pv); +} + +static void +pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT((pa & PDRMASK) == 0, + ("pmap_pv_demote_pde: pa is not 4mpage aligned")); + + /* + * Transfer the 4mpage's pv entry for this mapping to the first + * page's pv list. 
+ */ + pvh = pa_to_pvh(pa); + va = trunc_4mpage(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + /* Instantiate the remaining NPTEPG - 1 pv entries. */ + va_last = va + NBPDR - PAGE_SIZE; + do { + m++; + KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + va += PAGE_SIZE; + pmap_insert_entry(pmap, va, m); + } while (va < va_last); +} + +static void +pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT((pa & PDRMASK) == 0, + ("pmap_pv_promote_pde: pa is not 4mpage aligned")); + + /* + * Transfer the first page's pv entry for this mapping to the + * 4mpage's pv list. Aside from avoiding the cost of a call + * to get_pv_entry(), a transfer avoids the possibility that + * get_pv_entry() calls pmap_collect() and that pmap_collect() + * removes one of the mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = trunc_4mpage(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + /* Free the remaining NPTEPG - 1 pv entries. */ + va_last = va + NBPDR - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} + +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } +static void +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + struct md_page *pvh; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } +} + /* * Create a pv entry for page at pa for * (pmap, va). @@ -1941,7 +2204,6 @@ pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count++; } /* @@ -1958,13 +2220,228 @@ (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* + * Create the pv entries for each of the pages within a superpage. + */ +static boolean_t +pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (pv_entry_count < pv_entry_high_water && + (pv = get_pv_entry(pmap, TRUE)) != NULL) { + pv->pv_va = va; + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + return (TRUE); + } else + return (FALSE); +} + +/* + * Tries to demote a 2- or 4MB page mapping. 
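 * Demotion reinstalls the saved page table page in place of the PDE
 * and, if the superpage's attributes have changed, rewrites its 4KB
 * PTEs to match.  It is invoked whenever an operation with 4KB
 * granularity (a partial pmap_remove()/pmap_protect(),
 * pmap_remove_all(), pmap_remove_write(), pmap_ts_referenced(),
 * pmap_clear_modify()/pmap_clear_reference(), pmap_change_wiring(), or
 * pmap_allocpte()) hits a superpage mapping.  FALSE is returned only
 * when no saved page table page exists, in which case the mapping is
 * destroyed rather than demoted.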
+ */ +static boolean_t +pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + pd_entry_t newpde, oldpde; + pmap_t allpmaps_entry; + pt_entry_t *firstpte, newpte, *pte; + vm_paddr_t mptepa; + vm_page_t free, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte = pmap_lookup_pt_page(pmap, va); + if (mpte != NULL) + pmap_remove_pt_page(pmap, mpte); + else { + KASSERT((*pde & PG_W) == 0, + ("pmap_demote_pde: page table page for a wired mapping" + " is missing")); + free = NULL; + pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); + pmap_invalidate_page(pmap, trunc_4mpage(va)); + pmap_free_zero_pages(free); + CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return (FALSE); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + + /* + * Temporarily map the page table page (mpte) into the kernel's + * address space at either PADDR1 or PADDR2. + */ + if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) { + if ((*PMAP1 & PG_FRAME) != mptepa) { + *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; +#ifdef SMP + PMAP1cpu = PCPU_GET(cpuid); +#endif + invlcaddr(PADDR1); + PMAP1changed++; + } else +#ifdef SMP + if (PMAP1cpu != PCPU_GET(cpuid)) { + PMAP1cpu = PCPU_GET(cpuid); + invlcaddr(PADDR1); + PMAP1changedcpu++; + } else +#endif + PMAP1unchanged++; + firstpte = PADDR1; + } else { + mtx_lock(&PMAP2mutex); + if ((*PMAP2 & PG_FRAME) != mptepa) { + *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; + pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); + } + firstpte = PADDR2; + } + oldpde = *pde; + newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; + KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V), + ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V")); + KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_pde: oldpde is missing PG_M")); + KASSERT((oldpde & PG_PS) != 0, + ("pmap_demote_pde: oldpde is missing PG_PS")); + newpte = oldpde & ~PG_PS; + if ((newpte & PG_PDE_PAT) != 0) + newpte ^= PG_PDE_PAT | PG_PTE_PAT; + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), + ("pmap_demote_pde: firstpte and newpte map different physical" + " addresses")); + if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { + *pte = newpte; + newpte += PAGE_SIZE; + } + + /* + * Demote the mapping. This pmap is locked. The old PDE has + * PG_A set. If the old PDE has PG_RW set, it also has PG_M + * set. Thus, there is no danger of a race with another + * processor changing the setting of PG_A and/or PG_M between + * the read above and the store below. + */ + if (pmap == kernel_pmap) { + /* + * A harmless race exists between this loop and the bcopy() + * in pmap_pinit() that initializes the kernel segment of + * the new page table. Specifically, that bcopy() may copy + * the new PDE from the PTD, which is first in allpmaps, to + * the new page table before this loop updates that new + * page table. 
+ */ + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) { + pde = pmap_pde(allpmaps_entry, va); + KASSERT(*pde == newpde || (*pde & PG_PTE_PROMOTE) == + (oldpde & PG_PTE_PROMOTE), + ("pmap_demote_pde: pde was %#jx, expected %#jx", + (uintmax_t)*pde, (uintmax_t)oldpde)); + pde_store(pde, newpde); + } + mtx_unlock_spin(&allpmaps_lock); + } else + pde_store(pde, newpde); + if (firstpte == PADDR2) + mtx_unlock(&PMAP2mutex); + + /* + * Invalidate the recursive mapping of the page table page. + */ + pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); + + /* + * Demote the pv entry. This depends on the earlier demotion + * of the mapping. Specifically, the (re)creation of a per- + * page pv entry might trigger the execution of pmap_collect(), + * which might reclaim a newly (re)created per-page pv entry + * and destroy the associated mapping. In order to destroy + * the mapping, the PDE must have already changed from mapping + * the 2mpage to referencing the page table page. + */ + if ((oldpde & PG_MANAGED) != 0) + pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); + + pmap_pde_demotions++; + CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* + * pmap_remove_pde: do the things to unmap a superpage in a process + */ +static void +pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + vm_page_t *free) +{ + struct md_page *pvh; + pd_entry_t oldpde; + vm_offset_t eva, va; + vm_page_t m, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("pmap_remove_pde: sva is not 4mpage aligned")); + oldpde = pte_load_clear(pdq); + if (oldpde & PG_W) + pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; + + /* + * Machines that don't support invlpg, also don't support + * PG_G. + */ + if (oldpde & PG_G) + pmap_invalidate_page(kernel_pmap, sva); + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + if (oldpde & PG_MANAGED) { + pvh = pa_to_pvh(oldpde & PG_PS_FRAME); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) { + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpde & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + if (!pmap_demote_pde(pmap, pdq, sva)) + panic("pmap_remove_pde: failed demotion"); + } else { + mpte = pmap_lookup_pt_page(pmap, sva); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pde: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + atomic_subtract_int(&cnt.v_wire_count, 1); + } + } +} + +/* * pmap_remove_pte: do the things to unmap a page in a process */ static int @@ -1987,12 +2464,8 @@ pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); - if (oldpte & PG_M) { - KASSERT((oldpte & PG_RW), - ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", - va, (uintmax_t)oldpte)); + if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); @@ -2081,10 +2554,25 @@ * Check for large page. 
*/ if ((ptpaddr & PG_PS) != 0) { - pmap->pm_pdir[pdirindex] = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - anyvalid = 1; - continue; + /* + * Are we removing the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == pdnxt && eva >= pdnxt) { + /* + * The TLB entry for a PG_G mapping is + * invalidated by pmap_remove_pde(). + */ + if ((ptpaddr & PG_G) == 0) + anyvalid = 1; + pmap_remove_pde(pmap, + &pmap->pm_pdir[pdirindex], sva, &free); + continue; + } else if (!pmap_demote_pde(pmap, + &pmap->pm_pdir[pdirindex], sva)) { + /* The large page mapping was destroyed. */ + continue; + } } /* @@ -2135,26 +2623,34 @@ void pmap_remove_all(vm_page_t m) { + struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; + pd_entry_t *pde; + vm_offset_t va; vm_page_t free; -#if defined(PMAP_DIAGNOSTIC) - /* - * XXX This makes pmap_remove_all() illegal for non-managed pages! - */ - if (m->flags & PG_FICTITIOUS) { - panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", - VM_PAGE_TO_PHYS(m)); - } -#endif + KASSERT((m->flags & PG_FICTITIOUS) == 0, + ("pmap_remove_all: page %p is fictitious", m)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + (void)pmap_demote_pde(pmap, pde, va); + PMAP_UNLOCK(pmap); + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) @@ -2165,18 +2661,13 @@ /* * Update the vm_page_t clean and reference bits. */ - if (tpte & PG_M) { - KASSERT((tpte & PG_RW), - ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", - pv->pv_va, (uintmax_t)tpte)); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } free = NULL; pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count--; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } @@ -2185,6 +2676,56 @@ } /* + * pmap_protect_pde: do the things to protect a 4mpage in a process + */ +static boolean_t +pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) +{ + pd_entry_t newpde, oldpde; + vm_offset_t eva, va; + vm_page_t m; + boolean_t anychanged; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("pmap_protect_pde: sva is not 4mpage aligned")); + anychanged = FALSE; +retry: + oldpde = newpde = *pde; + if (oldpde & PG_MANAGED) { + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) { + /* + * In contrast to the analogous operation on a 4KB page + * mapping, the mapping's PG_A flag is not cleared and + * the page's PG_REFERENCED flag is not set. The + * reason is that pmap_demote_pde() expects that a 2/4MB + * page mapping with a stored page table page has PG_A + * set. 
+ */ + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + } + } + if ((prot & VM_PROT_WRITE) == 0) + newpde &= ~(PG_RW | PG_M); +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + newpde |= pg_nx; +#endif + if (newpde != oldpde) { + if (!pde_cmpset(pde, oldpde, newpde)) + goto retry; + if (oldpde & PG_G) + pmap_invalidate_page(pmap, sva); + else + anychanged = TRUE; + } + return (anychanged); +} + +/* * Set the physical protection on the * specified range of this map as requested. */ @@ -2237,14 +2778,24 @@ * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { - if ((prot & VM_PROT_WRITE) == 0) - pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pmap->pm_pdir[pdirindex] |= pg_nx; -#endif - anychanged = 1; - continue; + /* + * Are we protecting the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == pdnxt && eva >= pdnxt) { + /* + * The TLB entry for a PG_G mapping is + * invalidated by pmap_protect_pde(). + */ + if (pmap_protect_pde(pmap, + &pmap->pm_pdir[pdirindex], sva, prot)) + anychanged = 1; + continue; + } else if (!pmap_demote_pde(pmap, + &pmap->pm_pdir[pdirindex], sva)) { + /* The large page mapping was destroyed. */ + continue; + } } if (pdnxt > eva) @@ -2270,7 +2821,7 @@ vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } - if ((pbits & PG_M) != 0) { + if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); @@ -2308,6 +2859,141 @@ } /* + * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are + * within a single page table page (PTP) to a single 2- or 4MB page mapping. + * For promotion to occur, two conditions must be met: (1) the 4KB page + * mappings must map aligned, contiguous physical memory and (2) the 4KB page + * mappings must have identical characteristics. + * + * Managed (PG_MANAGED) mappings within the kernel address space are not + * promoted. The reason is that kernel PDEs are replicated in each pmap but + * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel + * pmap. + */ +static void +pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + pd_entry_t newpde; + pmap_t allpmaps_entry; + pt_entry_t *firstpte, oldpte, pa, *pte; + vm_offset_t oldpteva; + vm_page_t mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Examine the first PTE in the specified PTP. Abort if this PTE is + * either invalid, unused, or does not map the first 4KB physical page + * within a 2- or 4MB page. + */ + firstpte = vtopte(trunc_4mpage(va)); +setpde: + newpde = *firstpte; + if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + if ((newpde & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared without + * a TLB invalidation. + */ + if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & + ~PG_RW)) + goto setpde; + newpde &= ~PG_RW; + } + + /* + * Examine each of the other PTEs in the specified PTP. Abort if this + * PTE maps an unexpected 4KB physical page or does not have identical + * characteristics to the first PTE. 
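 * For example (assuming non-PAE constants: NBPDR == 4MB,
 * NPTEPG == 1024, PAGE_SIZE == 4KB): to promote the page table page
 * backing va 0x20000000-0x203fffff onto pa 0x08000000, pte[0] must map
 * 0x08000000 and pte[1023] must map 0x083ff000, all with PG_A and PG_V
 * set and with identical PG_PTE_PROMOTE attributes.  The loop below
 * therefore walks from pte[NPTEPG - 1] down to pte[1], decrementing the
 * expected physical address by PAGE_SIZE at each step.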
+ */ + pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; + for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { +setpte: + oldpte = *pte; + if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + if ((oldpte & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared + * without a TLB invalidation. + */ + if (!atomic_cmpset_int((u_int *)pte, oldpte, + oldpte & ~PG_RW)) + goto setpte; + oldpte &= ~PG_RW; + oldpteva = (oldpte & PG_FRAME & PDRMASK) | + (va & ~PDRMASK); + CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" + " in pmap %p", oldpteva, pmap); + } + if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + pa -= PAGE_SIZE; + } + + /* + * Save the page table page in its current state until the PDE + * mapping the superpage is demoted by pmap_demote_pde() or + * destroyed by pmap_remove_pde(). + */ + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_promote_pde: page table page is out of range")); + KASSERT(mpte->pindex == va >> PDRSHIFT, + ("pmap_promote_pde: page table page's pindex is wrong")); + pmap_insert_pt_page(pmap, mpte); + + /* + * Promote the pv entries. + */ + if ((newpde & PG_MANAGED) != 0) + pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); + + /* + * Propagate the PAT index to its proper position. + */ + if ((newpde & PG_PTE_PAT) != 0) + newpde ^= PG_PDE_PAT | PG_PTE_PAT; + + /* + * Map the superpage. + */ + if (pmap == kernel_pmap) { + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) { + pde = pmap_pde(allpmaps_entry, va); + pde_store(pde, PG_PS | newpde); + } + mtx_unlock_spin(&allpmaps_lock); + } else + pde_store(pde, PG_PS | newpde); + + pmap_pde_promotions++; + CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" + " in pmap %p", va, pmap); +} + +/* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. @@ -2320,8 +3006,8 @@ * insert this page into the given map NOW. 
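 * The new third argument, access, tells pmap_enter() what kind of
 * access triggered the mapping.  In the update path below, PG_M is now
 * preset along with PG_A when the access is a write, so the processor
 * does not have to take a second fault merely to set the dirty bit.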
*/ void -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) +pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, + vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; pd_entry_t *pde; @@ -2332,12 +3018,9 @@ boolean_t invlva; va = trunc_page(va); -#ifdef PMAP_DIAGNOSTIC - if (va > VM_MAX_KERNEL_ADDRESS) - panic("pmap_enter: toobig"); - if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) - panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); -#endif + KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, + ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); mpte = NULL; @@ -2352,16 +3035,6 @@ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, M_WAITOK); } -#if 0 && defined(PMAP_DIAGNOSTIC) - else { - pd_entry_t *pdeaddr = pmap_pde(pmap, va); - origpte = *pdeaddr; - if ((origpte & PG_V) == 0) { - panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", - pmap->pm_pdir[PTDPTDI], origpte, va); - } - } -#endif pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) @@ -2372,7 +3045,7 @@ * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { - panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", + panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } @@ -2473,9 +3146,12 @@ * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { + newpte |= PG_A; + if ((access & VM_PROT_WRITE) != 0) + newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; - origpte = pte_load_store(pte, newpte | PG_A); + origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); @@ -2487,10 +3163,7 @@ invlva = TRUE; #endif } - if (origpte & PG_M) { - KASSERT((origpte & PG_RW), - ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", - va, (uintmax_t)origpte)); + if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) @@ -2499,14 +3172,78 @@ if (invlva) pmap_invalidate_page(pmap, va); } else - pte_store(pte, newpte | PG_A); + pte_store(pte, newpte); } + + /* + * If both the page table page and the reservation are fully + * populated, then attempt promotion. + */ + if ((mpte == NULL || mpte->wire_count == NPTEPG) && + pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) + pmap_promote_pde(pmap, pde, va); + sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* + * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and + * FALSE otherwise. Fails if (1) a page table page cannot be allocated without + * blocking, (2) a mapping already exists at the specified virtual address, or + * (3) a pv entry cannot be allocated without reclaiming another pv entry. + */ +static boolean_t +pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) +{ + pd_entry_t *pde, newpde; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pde = pmap_pde(pmap, va); + if (*pde != 0) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V; + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { + newpde |= PG_MANAGED; + + /* + * Abort this mapping if its PV entry could not be created. 
+ */ + if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + } +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + newpde |= pg_nx; +#endif + if (va < VM_MAXUSER_ADDRESS) + newpde |= PG_U; + + /* + * Increment counters. + */ + pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; + + /* + * Map the superpage. + */ + pde_store(pde, newpde); + + pmap_pde_mappings++; + CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is @@ -2522,6 +3259,7 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { + vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; @@ -2531,8 +3269,15 @@ m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, - prot, mpte); + va = start + ptoa(diff); + if ((va & PDRMASK) == 0 && va + NBPDR <= end && + (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && + pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && + pmap_enter_pde(pmap, va, m, prot)) + m = &m[NBPDR / PAGE_SIZE - 1]; + else + mpte = pmap_enter_quick_locked(pmap, va, m, prot, + mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); @@ -2596,7 +3341,7 @@ */ if (ptepa) { if (ptepa & PG_PS) - panic("pmap_enter_quick: unexpected mapping into 4MB page"); + return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { @@ -2760,9 +3505,29 @@ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { + pd_entry_t *pde; pt_entry_t *pte; + boolean_t are_queues_locked; + are_queues_locked = FALSE; +retry: PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if ((*pde & PG_PS) != 0) { + if (!wired != ((*pde & PG_W) == 0)) { + if (!are_queues_locked) { + are_queues_locked = TRUE; + if (!mtx_trylock(&vm_page_queue_mtx)) { + PMAP_UNLOCK(pmap); + vm_page_lock_queues(); + goto retry; + } + } + if (!pmap_demote_pde(pmap, pde, va)) + panic("pmap_change_wiring: demotion failed"); + } else + goto out; + } pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) @@ -2776,6 +3541,9 @@ */ pmap_pte_set_w(pte, wired); pmap_pte_release(pte); +out: + if (are_queues_locked) + vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } @@ -2819,8 +3587,8 @@ pd_entry_t srcptepaddr; unsigned ptepindex; - if (addr >= UPT_MIN_ADDRESS) - panic("pmap_copy: invalid to pmap_copy page tables"); + KASSERT(addr < UPT_MIN_ADDRESS, + ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) @@ -2832,7 +3600,10 @@ continue; if (srcptepaddr & PG_PS) { - if (dst_pmap->pm_pdir[ptepindex] == 0) { + if (dst_pmap->pm_pdir[ptepindex] == 0 && + ((srcptepaddr & PG_MANAGED) == 0 || + pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & + PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += @@ -2842,8 +3613,8 @@ } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); - if (srcmpte->wire_count == 0) - panic("pmap_copy: source page table page is unused"); + KASSERT(srcmpte->wire_count > 0, + ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; @@ -3018,6 +3789,7 @@ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { + struct 
md_page *pvh; pv_entry_t pv; int loops = 0; @@ -3033,10 +3805,39 @@ if (loops >= 16) break; } + if (loops < 16) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + if (PV_PMAP(pv) == pmap) + return (TRUE); + loops++; + if (loops >= 16) + break; + } + } return (FALSE); } /* + * Returns TRUE if the given page is mapped individually or as part of + * a 4mpage. Otherwise, returns FALSE. + */ +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + struct md_page *pvh; + + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) + return (FALSE); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + return (!TAILQ_EMPTY(&pvh->pv_list)); + } else + return (TRUE); +} + +/* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but @@ -3048,8 +3849,10 @@ pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; - vm_page_t m, free = NULL; + vm_page_t free = NULL; + vm_page_t m, mpte, mt; pv_entry_t pv; + struct md_page *pvh; struct pv_chunk *pc, *npc; int field, idx; int32_t bit; @@ -3074,8 +3877,12 @@ pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; - pte = vtopte(pv->pv_va); + pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; + if ((tpte & PG_PS) == 0) { + pte = vtopte(pv->pv_va); + tpte = *pte & ~PG_PTE_PAT; + } if (tpte == 0) { printf( @@ -3102,27 +3909,52 @@ ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); - pmap->pm_stats.resident_count--; - pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ - if (tpte & PG_M) - vm_page_dirty(m); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if ((tpte & PG_PS) != 0) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } else + vm_page_dirty(m); + } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; - m->md.pv_list_count--; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); - - pmap_unuse_pt(pmap, pv->pv_va, &free); + if ((tpte & PG_PS) != 0) { + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + pvh = pa_to_pvh(tpte & PG_PS_FRAME); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + if (TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_flag_clear(mt, PG_WRITEABLE); + } + mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pages: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, &free, FALSE); + atomic_subtract_int(&cnt.v_wire_count, 1); + } + } else { + pmap->pm_stats.resident_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } + pmap_unuse_pt(pmap, pv->pv_va, &free); + } } } if (allfree) { @@ -3153,22 +3985,35 @@ boolean_t pmap_is_modified(vm_page_t m) { + + if (m->flags & PG_FICTITIOUS) + return (FALSE); + if (pmap_is_modified_pvh(&m->md)) + return (TRUE); + return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); +} + +/* + * Returns TRUE if any of the given mappings were used to modify + * physical memory. Otherwise, returns FALSE. Both page and 2mpage + * mappings are supported. 
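 * Note that throughout this change a mapping is considered modified
 * only when both PG_M and PG_RW are set; the old code asserted that a
 * set PG_M implied PG_RW and panicked otherwise, while the new code
 * simply ignores a dirty bit on a read-only mapping.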
+ */ +static boolean_t +pmap_is_modified_pvh(struct md_page *pvh) +{ pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; + mtx_assert(&vm_page_queue_mtx, MA_OWNED); rv = FALSE; - if (m->flags & PG_FICTITIOUS) - return (rv); - sched_pin(); - mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); - rv = (*pte & PG_M) != 0; + rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; @@ -3186,12 +4031,14 @@ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { + pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); - if (*pmap_pde(pmap, addr)) { + pde = pmap_pde(pmap, addr); + if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } @@ -3205,18 +4052,34 @@ void pmap_remove_write(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; + pv_entry_t next_pv, pv; pmap_t pmap; + pd_entry_t *pde; pt_entry_t oldpte, *pte; + vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if ((*pde & PG_RW) != 0) + (void)pmap_demote_pde(pmap, pde, va); + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; @@ -3254,15 +4117,49 @@ int pmap_ts_referenced(vm_page_t m) { + struct md_page *pvh; pv_entry_t pv, pvf, pvn; pmap_t pmap; + pd_entry_t oldpde, *pde; pt_entry_t *pte; + vm_offset_t va; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_A) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + if ((oldpde & PG_W) == 0) { + /* + * Remove the mapping to a single page + * so that a subsequent access may + * repromote. Since the underlying + * page table page is fully populated, + * this removal never frees a page + * table page. 
+ */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_PS_FRAME); + pmap_remove_page(pmap, va, NULL); + rtval++; + if (rtval > 4) { + PMAP_UNLOCK(pmap); + return (rtval); + } + } + } + } + PMAP_UNLOCK(pmap); + } if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { @@ -3271,6 +4168,9 @@ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" + " found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); @@ -3292,19 +4192,62 @@ void pmap_clear_modify(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; + pv_entry_t next_pv, pv; pmap_t pmap; - pt_entry_t *pte; + pd_entry_t oldpde, *pde; + pt_entry_t oldpte, *pte; + vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_RW) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + if ((oldpde & PG_W) == 0) { + /* + * Write protect the mapping to a + * single page so that a subsequent + * write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_PS_FRAME); + pte = pmap_pte_quick(pmap, va); + oldpte = *pte; + if ((oldpte & PG_V) != 0) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_RW and PG_M are among the least + * significant 32 bits. + */ + while (!atomic_cmpset_int((u_int *)pte, + oldpte, + oldpte & ~(PG_M | PG_RW))) + oldpte = *pte; + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); + } + } + } + } + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & PG_M) != 0) { + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant @@ -3326,17 +4269,46 @@ void pmap_clear_reference(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; + pv_entry_t next_pv, pv; pmap_t pmap; + pd_entry_t oldpde, *pde; pt_entry_t *pte; + vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_A) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + /* + * Remove the mapping to a single page so + * that a subsequent access may repromote. + * Since the underlying page table page is + * fully populated, this removal never frees + * a page table page. 
+ */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_PS_FRAME); + pmap_remove_page(pmap, va, NULL); + } + } + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { /* @@ -3486,31 +4458,44 @@ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { + pd_entry_t *pdep; pt_entry_t *ptep, pte; + vm_paddr_t pa; vm_page_t m; int val = 0; PMAP_LOCK(pmap); - ptep = pmap_pte(pmap, addr); - pte = (ptep != NULL) ? *ptep : 0; - pmap_pte_release(ptep); + pdep = pmap_pde(pmap, addr); + if (*pdep != 0) { + if (*pdep & PG_PS) { + pte = *pdep; + val = MINCORE_SUPER; + /* Compute the physical address of the 4KB page. */ + pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & + PG_FRAME; + } else { + ptep = pmap_pte(pmap, addr); + pte = *ptep; + pmap_pte_release(ptep); + pa = pte & PG_FRAME; + } + } else { + pte = 0; + pa = 0; + } PMAP_UNLOCK(pmap); if (pte != 0) { - vm_paddr_t pa; - - val = MINCORE_INCORE; + val |= MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; - pa = pte & PG_FRAME; - m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ - if (pte & PG_M) + if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* @@ -3584,7 +4569,31 @@ return addr; } +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. + */ +void +pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t superpage_offset; + if (size < NBPDR) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & PDRMASK; + if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || + (*addr & PDRMASK) == superpage_offset) + return; + if ((*addr & PDRMASK) < superpage_offset) + *addr = (*addr & ~PDRMASK) + superpage_offset; + else + *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; +} + + #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { Index: sys/amd64/include/pmap.h =================================================================== --- sys/amd64/include/pmap.h (revision 183520) +++ sys/amd64/include/pmap.h (working copy) @@ -57,7 +57,7 @@ #define PG_NC_PCD 0x010 /* PCD Cache disable */ #define PG_A 0x020 /* A Accessed */ #define PG_M 0x040 /* D Dirty */ -#define PG_PS 0x080 /* PS Page size (0=4k,1=4M) */ +#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ #define PG_PTE_PAT 0x080 /* PAT PAT index */ #define PG_G 0x100 /* G Global */ #define PG_AVAIL1 0x200 /* / Available for system */ @@ -76,6 +76,13 @@ #define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ /* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \ + PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V) + +/* * Page Protection Exception bits */ @@ -231,7 +238,7 @@ struct pv_chunk; struct md_page { - int pv_list_count; + int pv_unused; TAILQ_HEAD(,pv_entry) pv_list; }; @@ -242,6 +249,7 @@ u_int pm_active; /* active on cpus */ /* spare u_int here due to padding */ struct pmap_statistics pm_stats; /* pmap statistics */ + vm_page_t pm_root; /* spare page table pages */ }; typedef struct pmap *pmap_t; 
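As a quick way to see the effect of the pmap_mincore() change above from userland, the following sketch (not part of the patch; it only assumes the mincore(2) interface, the MINCORE_SUPER flag used above, and that vm.pmap.pg_ps_enabled has been set) counts how many pages of an anonymous mapping are reported as superpage-mapped.  Whether any pages report MINCORE_SUPER depends on promotion actually having taken place.

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <err.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		size_t len = 32 * 1024 * 1024;		/* 32MB of anonymous memory */
		size_t pagesz = (size_t)getpagesize();
		size_t i, npages = len / pagesz, super = 0;
		char *p, *vec;

		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_PRIVATE, -1, 0);
		if (p == MAP_FAILED)
			err(1, "mmap");
		memset(p, 1, len);			/* fault every page in */
		if ((vec = malloc(npages)) == NULL)
			err(1, "malloc");
		if (mincore(p, len, vec) != 0)
			err(1, "mincore");
		for (i = 0; i < npages; i++)
			if (vec[i] & MINCORE_SUPER)
				super++;
		printf("%zu of %zu pages are superpage-mapped\n", super, npages);
		return (0);
	}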
@@ -302,7 +310,6 @@ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; -#define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) void pmap_bootstrap(vm_paddr_t *); @@ -316,6 +323,7 @@ void *pmap_mapbios(vm_paddr_t, vm_size_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); void pmap_unmapdev(vm_offset_t, vm_size_t); void pmap_invalidate_page(pmap_t, vm_offset_t); void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); Index: sys/amd64/include/vmparam.h =================================================================== --- sys/amd64/include/vmparam.h (revision 183520) +++ sys/amd64/include/vmparam.h (working copy) @@ -132,6 +132,20 @@ #define VM_NFREEORDER 13 /* + * Enable superpage reservations: 1 level. + */ +#ifndef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 1 +#endif + +/* + * Level 0 reservations consist of 512 pages. + */ +#ifndef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER 9 +#endif + +/* * Virtual addresses of things. Derived from the page directory and * page table indexes from pmap.h for precision. * Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c (revision 183520) +++ sys/amd64/amd64/pmap.c (working copy) @@ -7,7 +7,7 @@ * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. - * Copyright (c) 2005 Alan L. Cox + * Copyright (c) 2005-2008 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -107,10 +107,12 @@ #include "opt_msgbuf.h" #include "opt_pmap.h" +#include "opt_vm.h" #include #include #include +#include #include #include #include @@ -134,6 +136,7 @@ #include #include #include +#include #include #include @@ -149,11 +152,7 @@ #define PMAP_SHPGPERPROC 200 #endif -#if defined(DIAGNOSTIC) -#define PMAP_DIAGNOSTIC -#endif - -#if !defined(PMAP_DIAGNOSTIC) +#if !defined(DIAGNOSTIC) #define PMAP_INLINE __gnu89_inline #else #define PMAP_INLINE @@ -166,17 +165,25 @@ #define PV_STAT(x) do { } while (0) #endif +#define pa_index(pa) ((pa) >> PDRSHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) + struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ -static int nkpt; static int ndmpdp; static vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end; pt_entry_t pg_nx; +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +static int pg_ps_enabled; +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0, + "Are large page mappings enabled?"); + static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ @@ -189,6 +196,7 @@ * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; /* @@ -205,11 +213,29 @@ static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); +static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static 
void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, + vm_offset_t va); +static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static boolean_t pmap_is_modified_pvh(struct md_page *pvh); +static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); +static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, + vm_prot_t prot); +static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + vm_page_t *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free); +static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, @@ -291,8 +317,6 @@ pmap_pml4e(pmap_t pmap, vm_offset_t va) { - if (!pmap) - return NULL; return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } @@ -313,7 +337,7 @@ pml4_entry_t *pml4e; pml4e = pmap_pml4e(pmap, va); - if (pml4e == NULL || (*pml4e & PG_V) == 0) + if ((*pml4e & PG_V) == 0) return NULL; return (pmap_pml4e_to_pdpe(pml4e, va)); } @@ -365,21 +389,6 @@ } -static __inline pt_entry_t * -pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde) -{ - pd_entry_t *pde; - - pde = pmap_pde(pmap, va); - if (pde == NULL || (*pde & PG_V) == 0) - return NULL; - *ptepde = *pde; - if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ - return ((pt_entry_t *)pde); - return (pmap_pde_to_pte(pde, va)); -} - - PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { @@ -514,9 +523,9 @@ */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); + kernel_pmap->pm_root = NULL; kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); - nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary @@ -603,7 +612,6 @@ { TAILQ_INIT(&m->md.pv_list); - m->md.pv_list_count = 0; } /* @@ -614,8 +622,28 @@ void pmap_init(void) { + pd_entry_t *pd; + vm_page_t mpte; + vm_size_t s; + int i, pv_npg; /* + * Initialize the vm page array entries for the kernel pmap's + * page table pages. + */ + pd = pmap_pde(kernel_pmap, VM_MIN_KERNEL_ADDRESS); + for (i = 0; i < NKPT; i++) { + if ((pd[i] & (PG_PS | PG_V)) == (PG_PS | PG_V)) + continue; + mpte = PHYS_TO_VM_PAGE(pd[i] & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_init: page table page is out of range")); + mpte->pindex = pmap_pde_pindex(VM_MIN_KERNEL_ADDRESS) + i; + mpte->phys_addr = pd[i] & PG_FRAME; + } + + /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. @@ -624,9 +652,28 @@ pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); + + /* + * Are large page mappings enabled? 
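 * pg_ps_enabled defaults to 0 and its sysctl is read-only, so superpage
 * promotion has to be enabled from the loader, e.g. by adding
 * vm.pmap.pg_ps_enabled=1 to /boot/loader.conf.  Once enabled, promotion
 * and demotion activity can be watched through the vm.pmap.pde counters
 * defined below (sysctl vm.pmap.pde).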
+ */ + TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + + /* + * Calculate the size of the pv head table for superpages. + */ + for (i = 0; phys_avail[i + 1]; i += 2); + pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR; + + /* + * Allocate memory for the pv head table for superpages. + */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_alloc(kernel_map, s); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); } -SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pmap_pventry_proc(SYSCTL_HANDLER_ARGS) { @@ -657,7 +704,26 @@ SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW, &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc"); +SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, + "2MB page mapping counters"); +static u_long pmap_pde_demotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_pde_demotions, 0, "2MB page demotions"); + +static u_long pmap_pde_mappings; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_pde_mappings, 0, "2MB page mappings"); + +static u_long pmap_pde_p_failures; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_pde_p_failures, 0, "2MB page promotion failures"); + +static u_long pmap_pde_promotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_pde_promotions, 0, "2MB page promotions"); + + /*************************************************** * Low level helper routines..... ***************************************************/ @@ -959,17 +1025,25 @@ vm_paddr_t pmap_kextract(vm_offset_t va) { - pd_entry_t *pde; + pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { - pde = vtopde(va); - if (*pde & PG_PS) { - pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); + pde = *vtopde(va); + if (pde & PG_PS) { + pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { - pa = *vtopte(va); + /* + * Beware of a concurrent promotion that changes the + * PDE at this point! For example, vtopte() must not + * be used to access the PTE because it would use the + * new PDE. It is, however, safe to use the old PDE + * because the page table page is preserved by the + * promotion. + */ + pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } @@ -1091,11 +1165,108 @@ while (free != NULL) { m = free; free = m->right; - vm_page_free_zero(m); + /* Preserve the page's PG_ZERO setting. */ + vm_page_free_toq(m); } } /* + * Schedule the specified unused page table page to be freed. Specifically, + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. + */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + m->right = *free; + *free = m; +} + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. 
+ */ +static void +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +{ + vm_page_t root; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + root = pmap->pm_root; + if (root == NULL) { + mpte->left = NULL; + mpte->right = NULL; + } else { + root = vm_page_splay(mpte->pindex, root); + if (mpte->pindex < root->pindex) { + mpte->left = root->left; + mpte->right = root; + root->left = NULL; + } else if (mpte->pindex == root->pindex) + panic("pmap_insert_pt_page: pindex already inserted"); + else { + mpte->right = root->right; + mpte->left = root; + root->right = NULL; + } + } + pmap->pm_root = mpte; +} + +/* + * Looks for a page table page mapping the specified virtual address in the + * specified pmap's collection of idle page table pages. Returns NULL if there + * is no page table page corresponding to the specified virtual address. + */ +static vm_page_t +pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +{ + vm_page_t mpte; + vm_pindex_t pindex = pmap_pde_pindex(va); + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { + mpte = vm_page_splay(pindex, mpte); + if ((pmap->pm_root = mpte)->pindex != pindex) + mpte = NULL; + } + return (mpte); +} + +/* + * Removes the specified page table page from the specified pmap's collection + * of idle page table pages. The specified page table page must be a member of + * the pmap's collection. + */ +static void +pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) +{ + vm_page_t root; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (mpte != pmap->pm_root) { + root = vm_page_splay(mpte->pindex, pmap->pm_root); + KASSERT(mpte == root, + ("pmap_remove_pt_page: mpte %p is missing from pmap %p", + mpte, pmap)); + } + if (mpte->left == NULL) + root = mpte->right; + else { + root = vm_page_splay(mpte->pindex, mpte->left); + root->right = mpte->right; + } + pmap->pm_root = root; +} + +/* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ @@ -1171,8 +1342,7 @@ * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ - m->right = *free; - *free = m; + pmap_add_delayed_free_list(m, free, TRUE); return 1; } @@ -1199,6 +1369,7 @@ PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); + pmap->pm_root = NULL; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); @@ -1235,6 +1406,7 @@ /* install self-referential address mapping entry(s) */ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; + pmap->pm_root = NULL; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); @@ -1410,7 +1582,7 @@ { vm_pindex_t ptepindex; pd_entry_t *pd; - vm_page_t m, free; + vm_page_t m; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, @@ -1430,21 +1602,21 @@ * This supports switching from a 2MB page to a * normal 4K page. */ - if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { - *pd = 0; - pd = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - free = NULL; - pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va), &free); - pmap_invalidate_all(kernel_pmap); - pmap_free_zero_pages(free); + if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { + if (!pmap_demote_pde(pmap, pd, va)) { + /* + * Invalidation of the 2MB page mapping may have caused + * the deallocation of the underlying PD page. 
+ */ + pd = NULL; + } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ - if (pd != 0 && (*pd & PG_V) != 0) { + if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { @@ -1477,6 +1649,8 @@ KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); + KASSERT(pmap->pm_root == NULL, + ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); @@ -1524,10 +1698,8 @@ mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; - nkpt = 0; while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); - nkpt++; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; @@ -1541,10 +1713,10 @@ pde = pmap_pde(kernel_pmap, kernel_vm_end); if (pde == NULL) { /* We need a new PDP entry */ - nkpg = vm_page_alloc(NULL, nkpt, + nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if (!nkpg) + if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); @@ -1563,17 +1735,11 @@ continue; } - /* - * This index is bogus, but out of the way - */ - nkpg = vm_page_alloc(NULL, nkpt, + nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if (!nkpg) + if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - - nkpt++; - if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); @@ -1650,11 +1816,16 @@ * drastic measures to free some pages so we can allocate * another pv entry chunk. This is normally called to * unmap inactive pages, and if necessary, active pages. + * + * We do not, however, unmap 2mpages because subsequent accesses will + * allocate per-page pv entries until repromotion occurs, thereby + * exacerbating the shortage of free pv entries. 
*/ static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { - pd_entry_t ptepde; + struct md_page *pvh; + pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; @@ -1673,29 +1844,27 @@ else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; - pte = pmap_pte_pde(pmap, va, &ptepde); - if (pte == NULL) { - panic("null pte in pmap_collect"); - } + pde = pmap_pde(pmap, va); + KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_pde_to_pte(pde, va); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#lx", tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); - if (tpte & PG_M) { - KASSERT((tpte & PG_RW), - ("pmap_collect: modified page not writable: va: %#lx, pte: %#lx", - va, tpte)); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } free = NULL; - pmap_unuse_pt(pmap, va, ptepde, &free); + pmap_unuse_pt(pmap, va, *pde, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); - m->md.pv_list_count--; + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); @@ -1830,25 +1999,133 @@ return (pv); } -static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +/* + * First find and then remove the pv entry for the specified pmap and virtual + * address from the specified pv list. Returns the pv entry if found and NULL + * otherwise. This operation can be performed on pv lists for either 4KB or + * 2MB page mappings. + */ +static __inline pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; - PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pmap == PV_PMAP(pv) && va == pv->pv_va) + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; + } } - KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count--; - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); + return (pv); +} + +/* + * After demotion from a 2MB page mapping to 512 4KB page mappings, + * destroy the pv entry for the 2MB page mapping and reinstantiate the pv + * entries for each of the 4KB page mappings. + */ +static void +pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT((pa & PDRMASK) == 0, + ("pmap_pv_demote_pde: pa is not 2mpage aligned")); + + /* + * Transfer the 2mpage's pv entry for this mapping to the first + * page's pv list. + */ + pvh = pa_to_pvh(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + /* Instantiate the remaining NPTEPG - 1 pv entries. 
*/ + va_last = va + NBPDR - PAGE_SIZE; + do { + m++; + KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + va += PAGE_SIZE; + pmap_insert_entry(pmap, va, m); + } while (va < va_last); +} + +/* + * After promotion from 512 4KB page mappings to a single 2MB page mapping, + * replace the many pv entries for the 4KB page mappings by a single pv entry + * for the 2MB page mapping. + */ +static void +pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT((pa & PDRMASK) == 0, + ("pmap_pv_promote_pde: pa is not 2mpage aligned")); + + /* + * Transfer the first page's pv entry for this mapping to the + * 2mpage's pv list. Aside from avoiding the cost of a call + * to get_pv_entry(), a transfer avoids the possibility that + * get_pv_entry() calls pmap_collect() and that pmap_collect() + * removes one of the mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + /* Free the remaining NPTEPG - 1 pv entries. */ + va_last = va + NBPDR - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} + +/* + * First find and then destroy the pv entry for the specified pmap and virtual + * address. This operation can be performed on pv lists for either 4KB or 2MB + * page mappings. + */ +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } +static void +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + struct md_page *pvh; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } +} + /* * Create a pv entry for page at pa for * (pmap, va). @@ -1863,7 +2140,6 @@ pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count++; } /* @@ -1880,13 +2156,176 @@ (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* + * Create the pv entry for a 2MB page mapping. + */ +static boolean_t +pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (pv_entry_count < pv_entry_high_water && + (pv = get_pv_entry(pmap, TRUE)) != NULL) { + pv->pv_va = va; + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + return (TRUE); + } else + return (FALSE); +} + +/* + * Tries to demote a 2MB page mapping. 
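+ * Returns TRUE on success.  If the page table page saved at promotion
+ * time cannot be found, the 2MB page mapping is destroyed instead and
+ * FALSE is returned.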
+ */ +static boolean_t +pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + pd_entry_t newpde, oldpde; + pt_entry_t *firstpte, newpte, *pte; + vm_paddr_t mptepa; + vm_page_t free, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte = pmap_lookup_pt_page(pmap, va); + if (mpte != NULL) + pmap_remove_pt_page(pmap, mpte); + else { + KASSERT((*pde & PG_W) == 0, + ("pmap_demote_pde: page table page for a wired mapping" + " is missing")); + free = NULL; + pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free); + pmap_invalidate_page(pmap, trunc_2mpage(va)); + pmap_free_zero_pages(free); + CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); + oldpde = *pde; + newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; + KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V), + ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V")); + KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_pde: oldpde is missing PG_M")); + KASSERT((oldpde & PG_PS) != 0, + ("pmap_demote_pde: oldpde is missing PG_PS")); + newpte = oldpde & ~PG_PS; + if ((newpte & PG_PDE_PAT) != 0) + newpte ^= PG_PDE_PAT | PG_PTE_PAT; + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), + ("pmap_demote_pde: firstpte and newpte map different physical" + " addresses")); + if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { + *pte = newpte; + newpte += PAGE_SIZE; + } + + /* + * Demote the mapping. This pmap is locked. The old PDE has + * PG_A set. If the old PDE has PG_RW set, it also has PG_M + * set. Thus, there is no danger of a race with another + * processor changing the setting of PG_A and/or PG_M between + * the read above and the store below. + */ + pde_store(pde, newpde); + + /* + * Invalidate a stale mapping of the page table page. + */ + pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); + + /* + * Demote the pv entry. This depends on the earlier demotion + * of the mapping. Specifically, the (re)creation of a per- + * page pv entry might trigger the execution of pmap_collect(), + * which might reclaim a newly (re)created per-page pv entry + * and destroy the associated mapping. In order to destroy + * the mapping, the PDE must have already changed from mapping + * the 2mpage to referencing the page table page. + */ + if ((oldpde & PG_MANAGED) != 0) + pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); + + pmap_pde_demotions++; + CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* + * pmap_remove_pde: do the things to unmap a superpage in a process + */ +static int +pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + vm_page_t *free) +{ + struct md_page *pvh; + pd_entry_t oldpde; + vm_offset_t eva, va; + vm_page_t m, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("pmap_remove_pde: sva is not 2mpage aligned")); + oldpde = pte_load_clear(pdq); + if (oldpde & PG_W) + pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; + + /* + * Machines that don't support invlpg, also don't support + * PG_G. 
+ */ + if (oldpde & PG_G) + pmap_invalidate_page(kernel_pmap, sva); + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + if (oldpde & PG_MANAGED) { + pvh = pa_to_pvh(oldpde & PG_PS_FRAME); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) { + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpde & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + if (!pmap_demote_pde(pmap, pdq, sva)) + panic("pmap_remove_pde: failed demotion"); + } else { + mpte = pmap_lookup_pt_page(pmap, sva); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pde: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + atomic_subtract_int(&cnt.v_wire_count, 1); + } + } + return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); +} + +/* * pmap_remove_pte: do the things to unmap a page in a process */ static int @@ -1909,12 +2348,8 @@ pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); - if (oldpte & PG_M) { - KASSERT((oldpte & PG_RW), - ("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx", - va, oldpte)); + if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); @@ -2022,11 +2457,24 @@ * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { - *pde = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - pmap_unuse_pt(pmap, sva, *pdpe, &free); - anyvalid = 1; - continue; + /* + * Are we removing the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == va_next && eva >= va_next) { + /* + * The TLB entry for a PG_G mapping is + * invalidated by pmap_remove_pde(). + */ + if ((ptpaddr & PG_G) == 0) + anyvalid = 1; + pmap_remove_pde(pmap, pde, sva, &free); + continue; + } else if (!pmap_demote_pde(pmap, pde, sva)) { + /* The large page mapping was destroyed. */ + continue; + } else + ptpaddr = *pde; } /* @@ -2076,30 +2524,34 @@ void pmap_remove_all(vm_page_t m) { + struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; - pd_entry_t ptepde; + pd_entry_t *pde; + vm_offset_t va; vm_page_t free; -#if defined(PMAP_DIAGNOSTIC) - /* - * XXX This makes pmap_remove_all() illegal for non-managed pages! 
- */ - if (m->flags & PG_FICTITIOUS) { - panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx", - VM_PAGE_TO_PHYS(m)); + KASSERT((m->flags & PG_FICTITIOUS) == 0, + ("pmap_remove_all: page %p is fictitious", m)); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + (void)pmap_demote_pde(pmap, pde, va); + PMAP_UNLOCK(pmap); } -#endif - mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; - pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde); - if (pte == NULL) { - panic("null pte in pmap_remove_all"); - } + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; @@ -2109,18 +2561,13 @@ /* * Update the vm_page_t clean and reference bits. */ - if (tpte & PG_M) { - KASSERT((tpte & PG_RW), - ("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx", - pv->pv_va, tpte)); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } free = NULL; - pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); + pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - m->md.pv_list_count--; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } @@ -2128,6 +2575,54 @@ } /* + * pmap_protect_pde: do the things to protect a 2mpage in a process + */ +static boolean_t +pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) +{ + pd_entry_t newpde, oldpde; + vm_offset_t eva, va; + vm_page_t m; + boolean_t anychanged; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("pmap_protect_pde: sva is not 2mpage aligned")); + anychanged = FALSE; +retry: + oldpde = newpde = *pde; + if (oldpde & PG_MANAGED) { + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) { + /* + * In contrast to the analogous operation on a 4KB page + * mapping, the mapping's PG_A flag is not cleared and + * the page's PG_REFERENCED flag is not set. The + * reason is that pmap_demote_pde() expects that a 2MB + * page mapping with a stored page table page has PG_A + * set. + */ + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + } + } + if ((prot & VM_PROT_WRITE) == 0) + newpde &= ~(PG_RW | PG_M); + if ((prot & VM_PROT_EXECUTE) == 0) + newpde |= pg_nx; + if (newpde != oldpde) { + if (!atomic_cmpset_long(pde, oldpde, newpde)) + goto retry; + if (oldpde & PG_G) + pmap_invalidate_page(pmap, sva); + else + anychanged = TRUE; + } + return (anychanged); +} + +/* * Set the physical protection on the * specified range of this map as requested. */ @@ -2189,12 +2684,22 @@ * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { - if ((prot & VM_PROT_WRITE) == 0) - *pde &= ~(PG_M|PG_RW); - if ((prot & VM_PROT_EXECUTE) == 0) - *pde |= pg_nx; - anychanged = 1; - continue; + /* + * Are we protecting the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == va_next && eva >= va_next) { + /* + * The TLB entry for a PG_G mapping is + * invalidated by pmap_protect_pde(). 
+ */ + if (pmap_protect_pde(pmap, pde, sva, prot)) + anychanged = 1; + continue; + } else if (!pmap_demote_pde(pmap, pde, sva)) { + /* The large page mapping was destroyed. */ + continue; + } } if (va_next > eva) @@ -2216,7 +2721,7 @@ vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } - if ((pbits & PG_M) != 0) { + if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); @@ -2246,6 +2751,119 @@ } /* + * Tries to promote the 512, contiguous 4KB page mappings that are within a + * single page table page (PTP) to a single 2MB page mapping. For promotion + * to occur, two conditions must be met: (1) the 4KB page mappings must map + * aligned, contiguous physical memory and (2) the 4KB page mappings must have + * identical characteristics. + */ +static void +pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + pd_entry_t newpde; + pt_entry_t *firstpte, oldpte, pa, *pte; + vm_offset_t oldpteva; + vm_page_t mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Examine the first PTE in the specified PTP. Abort if this PTE is + * either invalid, unused, or does not map the first 4KB physical page + * within a 2MB page. + */ + firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); +setpde: + newpde = *firstpte; + if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return; + } + if ((newpde & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared without + * a TLB invalidation. + */ + if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) + goto setpde; + newpde &= ~PG_RW; + } + + /* + * Examine each of the other PTEs in the specified PTP. Abort if this + * PTE maps an unexpected 4KB physical page or does not have identical + * characteristics to the first PTE. + */ + pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; + for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { +setpte: + oldpte = *pte; + if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return; + } + if ((oldpte & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared + * without a TLB invalidation. + */ + if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) + goto setpte; + oldpte &= ~PG_RW; + oldpteva = (oldpte & PG_FRAME & PDRMASK) | + (va & ~PDRMASK); + CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" + " in pmap %p", oldpteva, pmap); + } + if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return; + } + pa -= PAGE_SIZE; + } + + /* + * Save the page table page in its current state until the PDE + * mapping the superpage is demoted by pmap_demote_pde() or + * destroyed by pmap_remove_pde(). + */ + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_promote_pde: page table page is out of range")); + KASSERT(mpte->pindex == pmap_pde_pindex(va), + ("pmap_promote_pde: page table page's pindex is wrong")); + pmap_insert_pt_page(pmap, mpte); + + /* + * Promote the pv entries. + */ + if ((newpde & PG_MANAGED) != 0) + pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); + + /* + * Propagate the PAT index to its proper position. 
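+ * Bit 7 holds the PAT index in a 4KB PTE but serves as PG_PS in a PDE,
+ * so the PAT bit must move to bit 12 in the 2MB page mapping.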
+ */ + if ((newpde & PG_PTE_PAT) != 0) + newpde ^= PG_PDE_PAT | PG_PTE_PAT; + + /* + * Map the superpage. + */ + pde_store(pde, PG_PS | newpde); + + pmap_pde_promotions++; + CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" + " in pmap %p", va, pmap); +} + +/* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. @@ -2258,8 +2876,8 @@ * insert this page into the given map NOW. */ void -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) +pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, + vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; pd_entry_t *pde; @@ -2270,12 +2888,9 @@ boolean_t invlva; va = trunc_page(va); -#ifdef PMAP_DIAGNOSTIC - if (va > VM_MAX_KERNEL_ADDRESS) - panic("pmap_enter: toobig"); - if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) - panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); -#endif + KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, + ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); mpte = NULL; @@ -2289,31 +2904,15 @@ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, M_WAITOK); } -#if 0 && defined(PMAP_DIAGNOSTIC) - else { - pd_entry_t *pdeaddr = pmap_pde(pmap, va); - origpte = *pdeaddr; - if ((origpte & PG_V) == 0) { - panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n", - origpte, va); - } - } -#endif pde = pmap_pde(pmap, va); - if (pde != NULL) { + if (pde != NULL && (*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) panic("pmap_enter: attempted pmap_enter on 2MB page"); pte = pmap_pde_to_pte(pde, va); } else - pte = NULL; + panic("pmap_enter: invalid page directory va=%#lx", va); - /* - * Page Directory table entry not valid, we need a new PT page - */ - if (pte == NULL) - panic("pmap_enter: invalid page directory va=%#lx\n", va); - pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; @@ -2409,9 +3008,12 @@ * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { + newpte |= PG_A; + if ((access & VM_PROT_WRITE) != 0) + newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; - origpte = pte_load_store(pte, newpte | PG_A); + origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); @@ -2419,10 +3021,7 @@ PG_NX) == 0 && (newpte & PG_NX))) invlva = TRUE; } - if (origpte & PG_M) { - KASSERT((origpte & PG_RW), - ("pmap_enter: modified page not writable: va: %#lx, pte: %#lx", - va, origpte)); + if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((newpte & PG_RW) == 0) @@ -2431,13 +3030,90 @@ if (invlva) pmap_invalidate_page(pmap, va); } else - pte_store(pte, newpte | PG_A); + pte_store(pte, newpte); } + + /* + * If both the page table page and the reservation are fully + * populated, then attempt promotion. + */ + if ((mpte == NULL || mpte->wire_count == NPTEPG) && + pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) + pmap_promote_pde(pmap, pde, va); + vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* + * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE + * otherwise. Fails if (1) a page table page cannot be allocated without + * blocking, (2) a mapping already exists at the specified virtual address, or + * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
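+ * The caller (see pmap_enter_object()) is expected to have checked the
+ * 2MB virtual and physical alignment and the reservation's population.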
+ */ +static boolean_t +pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) +{ + pd_entry_t *pde, newpde; + vm_page_t free, mpde; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); + pde = &pde[pmap_pde_index(va)]; + if ((*pde & PG_V) != 0) { + KASSERT(mpde->wire_count > 1, + ("pmap_enter_pde: mpde's wire count is too low")); + mpde->wire_count--; + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V; + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { + newpde |= PG_MANAGED; + + /* + * Abort this mapping if its PV entry could not be created. + */ + if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { + free = NULL; + if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) { + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(free); + } + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + } + if ((prot & VM_PROT_EXECUTE) == 0) + newpde |= pg_nx; + if (va < VM_MAXUSER_ADDRESS) + newpde |= PG_U; + + /* + * Increment counters. + */ + pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; + + /* + * Map the superpage. + */ + pde_store(pde, newpde); + + pmap_pde_mappings++; + CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is @@ -2453,6 +3129,7 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { + vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; @@ -2462,8 +3139,15 @@ m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, - prot, mpte); + va = start + ptoa(diff); + if ((va & PDRMASK) == 0 && va + NBPDR <= end && + (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && + pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && + pmap_enter_pde(pmap, va, m, prot)) + m = &m[NBPDR / PAGE_SIZE - 1]; + else + mpte = pmap_enter_quick_locked(pmap, va, m, prot, + mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); @@ -2527,7 +3211,7 @@ */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) - panic("pmap_enter_quick: unexpected mapping into 2MB page"); + return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { @@ -2708,14 +3392,35 @@ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { + pd_entry_t *pde; pt_entry_t *pte; + boolean_t are_queues_locked; + are_queues_locked = FALSE; + /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. 
*/ +retry: PMAP_LOCK(pmap); - pte = pmap_pte(pmap, va); + pde = pmap_pde(pmap, va); + if ((*pde & PG_PS) != 0) { + if (!wired != ((*pde & PG_W) == 0)) { + if (!are_queues_locked) { + are_queues_locked = TRUE; + if (!mtx_trylock(&vm_page_queue_mtx)) { + PMAP_UNLOCK(pmap); + vm_page_lock_queues(); + goto retry; + } + } + if (!pmap_demote_pde(pmap, pde, va)) + panic("pmap_change_wiring: demotion failed"); + } else + goto out; + } + pte = pmap_pde_to_pte(pde, va); if (wired && (*pte & PG_W) == 0) { pmap->pm_stats.wired_count++; atomic_set_long(pte, PG_W); @@ -2723,6 +3428,9 @@ pmap->pm_stats.wired_count--; atomic_clear_long(pte, PG_W); } +out: + if (are_queues_locked) + vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } @@ -2766,8 +3474,8 @@ pdp_entry_t *pdpe; pd_entry_t srcptepaddr, *pde; - if (addr >= UPT_MIN_ADDRESS) - panic("pmap_copy: invalid to pmap_copy page tables"); + KASSERT(addr < UPT_MIN_ADDRESS, + ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { @@ -2801,7 +3509,9 @@ pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); pde = &pde[pmap_pde_index(addr)]; - if (*pde == 0) { + if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || + pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & + PG_PS_FRAME))) { *pde = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; @@ -2811,8 +3521,8 @@ } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); - if (srcmpte->wire_count == 0) - panic("pmap_copy: source page table page is unused"); + KASSERT(srcmpte->wire_count > 0, + ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; @@ -2932,6 +3642,7 @@ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { + struct md_page *pvh; pv_entry_t pv; int loops = 0; @@ -2947,10 +3658,39 @@ if (loops >= 16) break; } + if (loops < 16) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + if (PV_PMAP(pv) == pmap) + return (TRUE); + loops++; + if (loops >= 16) + break; + } + } return (FALSE); } /* + * Returns TRUE if the given page is mapped individually or as part of + * a 2mpage. Otherwise, returns FALSE. + */ +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + struct md_page *pvh; + + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) + return (FALSE); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + return (!TAILQ_EMPTY(&pvh->pv_list)); + } else + return (TRUE); +} + +/* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but @@ -2961,9 +3701,12 @@ void pmap_remove_pages(pmap_t pmap) { + pd_entry_t *pde; pt_entry_t *pte, tpte; - vm_page_t m, free = NULL; + vm_page_t free = NULL; + vm_page_t m, mpte, mt; pv_entry_t pv; + struct md_page *pvh; struct pv_chunk *pc, *npc; int field, idx; int64_t bit; @@ -2987,8 +3730,14 @@ pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; - pte = vtopte(pv->pv_va); - tpte = *pte; + pde = vtopde(pv->pv_va); + tpte = *pde; + if ((tpte & PG_PS) != 0) + pte = pde; + else { + pte = vtopte(pv->pv_va); + tpte = *pte & ~PG_PTE_PAT; + } if (tpte == 0) { printf( @@ -3015,27 +3764,54 @@ ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); - pmap->pm_stats.resident_count--; - pte_clear(pte); /* * Update the vm_page_t clean/reference bits. 
*/ - if (tpte & PG_M) - vm_page_dirty(m); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if ((tpte & PG_PS) != 0) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } else + vm_page_dirty(m); + } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; - m->md.pv_list_count--; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); - pmap_unuse_pt(pmap, pv->pv_va, - *vtopde(pv->pv_va), &free); + if ((tpte & PG_PS) != 0) { + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + pvh = pa_to_pvh(tpte & PG_PS_FRAME); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + if (TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_flag_clear(mt, PG_WRITEABLE); + } + mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pages: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, &free, FALSE); + atomic_subtract_int(&cnt.v_wire_count, 1); + } + pmap_unuse_pt(pmap, pv->pv_va, + *pmap_pdpe(pmap, pv->pv_va), &free); + } else { + pmap->pm_stats.resident_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } + pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); + } } } if (allfree) { @@ -3064,21 +3840,34 @@ boolean_t pmap_is_modified(vm_page_t m) { + + if (m->flags & PG_FICTITIOUS) + return (FALSE); + if (pmap_is_modified_pvh(&m->md)) + return (TRUE); + return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); +} + +/* + * Returns TRUE if any of the given mappings were used to modify + * physical memory. Otherwise, returns FALSE. Both page and 2mpage + * mappings are supported. 
+ */ +static boolean_t +pmap_is_modified_pvh(struct md_page *pvh) +{ pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; + mtx_assert(&vm_page_queue_mtx, MA_OWNED); rv = FALSE; - if (m->flags & PG_FICTITIOUS) - return (rv); - - mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); - rv = (*pte & PG_M) != 0; + rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; @@ -3102,8 +3891,8 @@ rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); - if (pde != NULL && (*pde & PG_V)) { - pte = vtopte(addr); + if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { + pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); @@ -3116,18 +3905,34 @@ void pmap_remove_write(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; pmap_t pmap; + pv_entry_t next_pv, pv; + pd_entry_t *pde; pt_entry_t oldpte, *pte; + vm_offset_t va; if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if ((*pde & PG_RW) != 0) + (void)pmap_demote_pde(pmap, pde, va); + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); - pte = pmap_pte(pmap, pv->pv_va); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { @@ -3158,14 +3963,48 @@ int pmap_ts_referenced(vm_page_t m) { + struct md_page *pvh; pv_entry_t pv, pvf, pvn; pmap_t pmap; + pd_entry_t oldpde, *pde; pt_entry_t *pte; + vm_offset_t va; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_A) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + if ((oldpde & PG_W) == 0) { + /* + * Remove the mapping to a single page + * so that a subsequent access may + * repromote. Since the underlying + * page table page is fully populated, + * this removal never frees a page + * table page. 
+ */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_PS_FRAME); + pmap_remove_page(pmap, va, pde, NULL); + rtval++; + if (rtval > 4) { + PMAP_UNLOCK(pmap); + return (rtval); + } + } + } + } + PMAP_UNLOCK(pmap); + } if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { @@ -3174,7 +4013,10 @@ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); - pte = pmap_pte(pmap, pv->pv_va); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" + " found a 2mpage in page %p's pv list", m)); + pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); @@ -3194,18 +4036,56 @@ void pmap_clear_modify(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; pmap_t pmap; - pt_entry_t *pte; + pv_entry_t next_pv, pv; + pd_entry_t oldpde, *pde; + pt_entry_t oldpte, *pte; + vm_offset_t va; if ((m->flags & PG_FICTITIOUS) != 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_RW) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + if ((oldpde & PG_W) == 0) { + /* + * Write protect the mapping to a + * single page so that a subsequent + * write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_PS_FRAME); + pte = pmap_pde_to_pte(pde, va); + oldpte = *pte; + if ((oldpte & PG_V) != 0) { + while (!atomic_cmpset_long(pte, + oldpte, + oldpte & ~(PG_M | PG_RW))) + oldpte = *pte; + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); + } + } + } + } + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); - pte = pmap_pte(pmap, pv->pv_va); - if (*pte & PG_M) { + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_pde_to_pte(pde, pv->pv_va); + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } @@ -3221,17 +4101,46 @@ void pmap_clear_reference(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; pmap_t pmap; + pv_entry_t next_pv, pv; + pd_entry_t oldpde, *pde; pt_entry_t *pte; + vm_offset_t va; if ((m->flags & PG_FICTITIOUS) != 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_A) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + /* + * Remove the mapping to a single page so + * that a subsequent access may repromote. + * Since the underlying page table page is + * fully populated, this removal never frees + * a page table page. 
+ */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_PS_FRAME); + pmap_remove_page(pmap, va, pde, NULL); + } + } + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); - pte = pmap_pte(pmap, pv->pv_va); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_pde_to_pte(pde, pv->pv_va); if (*pte & PG_A) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); @@ -3422,30 +4331,42 @@ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { - pt_entry_t *ptep, pte; + pd_entry_t *pdep; + pt_entry_t pte; + vm_paddr_t pa; vm_page_t m; int val = 0; PMAP_LOCK(pmap); - ptep = pmap_pte(pmap, addr); - pte = (ptep != NULL) ? *ptep : 0; + pdep = pmap_pde(pmap, addr); + if (pdep != NULL && (*pdep & PG_V)) { + if (*pdep & PG_PS) { + pte = *pdep; + val = MINCORE_SUPER; + /* Compute the physical address of the 4KB page. */ + pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & + PG_FRAME; + } else { + pte = *pmap_pde_to_pte(pdep, addr); + pa = pte & PG_FRAME; + } + } else { + pte = 0; + pa = 0; + } PMAP_UNLOCK(pmap); if (pte != 0) { - vm_paddr_t pa; - - val = MINCORE_INCORE; + val |= MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; - pa = pte & PG_FRAME; - m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ - if (pte & PG_M) + if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* @@ -3512,3 +4433,27 @@ addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } + +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. + */ +void +pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t superpage_offset; + + if (size < NBPDR) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & PDRMASK; + if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || + (*addr & PDRMASK) == superpage_offset) + return; + if ((*addr & PDRMASK) < superpage_offset) + *addr = (*addr & ~PDRMASK) + superpage_offset; + else + *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; +} Index: sys/sys/mman.h =================================================================== --- sys/sys/mman.h (revision 183520) +++ sys/sys/mman.h (working copy) @@ -139,6 +139,7 @@ #define MINCORE_MODIFIED 0x4 /* Page has been modified by us */ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ +#define MINCORE_SUPER 0x20 /* Page is a "super" page */ #endif /* __BSD_VISIBLE */ /*