Index: i386/i386/pmap.c =================================================================== RCS file: /home/ncvs/src/sys/i386/i386/pmap.c,v retrieving revision 1.549 diff -u -r1.549 pmap.c --- i386/i386/pmap.c 12 Apr 2006 04:22:51 -0000 1.549 +++ i386/i386/pmap.c 25 Apr 2006 23:20:43 -0000 @@ -168,6 +168,13 @@ #define PMAP_INLINE #endif +#define PV_STATS +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + /* * Get PDEs and PTEs for user/kernel address space */ @@ -206,9 +213,8 @@ /* * Data for the pv entry allocation mechanism */ -static uma_zone_t pvzone; -static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +static int shpgperproc = PMAP_SHPGPERPROC; /* * All those kernel PT submaps that BSD is so fond of @@ -254,8 +260,8 @@ "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; -static PMAP_INLINE void free_pv_entry(pv_entry_t pv); -static pv_entry_t get_pv_entry(pmap_t locked_pmap); +static PMAP_INLINE void free_pv_entry(pmap_t pmap, pv_entry_t pv); +static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); static void pmap_clear_ptes(vm_page_t m, int bit); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); @@ -341,7 +347,7 @@ kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif kernel_pmap->pm_active = -1; /* don't allow deactivation */ - TAILQ_INIT(&kernel_pmap->pm_pvlist); + TAILQ_INIT(&kernel_pmap->pm_pvchunk); LIST_INIT(&allpmaps); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); @@ -484,20 +490,16 @@ void pmap_init(void) { - int shpgperproc = PMAP_SHPGPERPROC; /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ - pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); - uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); #ifdef PAE pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, @@ -508,6 +510,37 @@ } +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); +static int +pmap_pventry_proc(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if (error == 0 && req->newptr) { + shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc; + pv_entry_high_water = 9 * (pv_entry_max / 10); + } + return (error); +} +SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW, + &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries"); + +static int +pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if (error == 0 && req->newptr) { + pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + pv_entry_high_water = 9 * (pv_entry_max / 10); + } + return (error); +} +SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW, + &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc"); + /*************************************************** * Low level helper routines..... 
***************************************************/ @@ -1007,7 +1040,7 @@ #endif pmap->pm_active = 0; PCPU_SET(curpmap, pmap); - TAILQ_INIT(&pmap->pm_pvlist); + TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); @@ -1087,7 +1120,7 @@ } pmap->pm_active = 0; - TAILQ_INIT(&pmap->pm_pvlist); + TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } @@ -1428,60 +1461,83 @@ * page management routines. ***************************************************/ -/* - * free the pv_entry back to the free list - */ -static PMAP_INLINE void -free_pv_entry(pv_entry_t pv) +CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); +CTASSERT(_NPCM == 11); + +static __inline struct pv_chunk * +pv_to_chunk(pv_entry_t pv) { - pv_entry_count--; - uma_zfree(pvzone, pv); + + return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); } +#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) + +#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ +#define PC_FREE10 0x0000fffful /* Free values for index 10 */ + +static uint64_t pc_freemask[11] = { + PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, + PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, + PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, + PC_FREE0_9, PC_FREE10 +}; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, + "Current number of pv entries"); + +#ifdef PV_STATS +static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, + "Current number of pv entry chunks"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, + "Current number of pv entry chunks allocated"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, + "Current number of pv entry chunks frees"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, + "Number of times tried to get a chunk page but failed."); + +static long pv_entry_frees, pv_entry_allocs; +static int pv_entry_spare; + +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, + "Current number of pv entry frees"); +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, + "Current number of pv entry allocs"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, + "Current number of spare pv entries"); + +static int pmap_collect_inactive, pmap_collect_active; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, + "Current number of times pmap_collect called on inactive queue"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, + "Current number of times pmap_collect called on active queue"); +#endif + /* - * get a new pv_entry, allocating a block from the system - * when needed. + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. This is normally called to + * unmap inactive pages, and if necessary, active pages.
*/ -static pv_entry_t -get_pv_entry(pmap_t locked_pmap) +static void +pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { - static const struct timeval printinterval = { 60, 0 }; - static struct timeval lastprint; - struct vpgqueues *vpq; pmap_t pmap; pt_entry_t *pte, tpte; - pv_entry_t allocated_pv, next_pv, pv; + pv_entry_t next_pv, pv; vm_offset_t va; vm_page_t m; - PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); - mtx_assert(&vm_page_queue_mtx, MA_OWNED); - allocated_pv = uma_zalloc(pvzone, M_NOWAIT); - if (allocated_pv != NULL) { - pv_entry_count++; - if (pv_entry_count > pv_entry_high_water) - pagedaemon_wakeup(); - else - return (allocated_pv); - } - - /* - * Reclaim pv entries: At first, destroy mappings to inactive - * pages. After that, if a pv entry is still needed, destroy - * mappings to active pages. - */ - if (ratecheck(&lastprint, &printinterval)) - printf("Approaching the limit on PV entries, " - "increase the vm.pmap.shpgperproc tunable.\n"); - vpq = &vm_page_queues[PQ_INACTIVE]; -retry: sched_pin(); TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; - pmap = pv->pv_pmap; + pmap = PV_PMAP(pv); /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); @@ -1491,17 +1547,16 @@ pte = pmap_pte_quick(pmap, va); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, - ("get_pv_entry: wired pte %#jx", (uintmax_t)tpte)); + ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); if (tpte & PG_M) { KASSERT((tpte & PG_RW), - ("get_pv_entry: modified page not writable: va: %#x, pte: %#jx", + ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", va, (uintmax_t)tpte)); vm_page_dirty(m); } pmap_invalidate_page(pmap, va); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); @@ -1509,48 +1564,166 @@ pmap_unuse_pt(pmap, va); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); - if (allocated_pv == NULL) - allocated_pv = pv; - else - free_pv_entry(pv); + free_pv_entry(locked_pmap, pv); } } sched_unpin(); - if (allocated_pv == NULL) { - if (vpq == &vm_page_queues[PQ_INACTIVE]) { - vpq = &vm_page_queues[PQ_ACTIVE]; - goto retry; - } - panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable"); - } - return (allocated_pv); } -static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) + +/* + * free the pv_entry back to the free list + */ +static PMAP_INLINE void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + vm_page_t m; + struct pv_chunk *pc; + int idx, field, bit; + + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + pv_entry_count--; + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 32; + bit = idx % 32; + pc->pc_map[field] |= 1ul << bit; + /* move to head of list */ + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + for (idx = 0; idx < _NPCM; idx++) + if (pc->pc_map[idx] != pc_freemask[idx]) + return; + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + /* entire chunk is free, return it */ + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + vm_page_lock_queues(); + vm_page_unwire(m, 0); + vm_page_free(m); + vm_page_unlock_queues(); + kmem_free(kernel_map, 
(vm_offset_t)pc, PAGE_SIZE); +} + +/* + * get a new pv_entry, allocating a block from the system + * when needed. + */ +static pv_entry_t +get_pv_entry(pmap_t pmap, int try) { + static const struct timeval printinterval = { 60, 0 }; + static struct timeval lastprint; + static vm_pindex_t colour; + int bit, field; pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); - if (m->md.pv_list_count < pmap->pm_stats.resident_count) { - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pmap == pv->pv_pmap && va == pv->pv_va) + PV_STAT(pv_entry_allocs++); + pv_entry_count++; + if (pv_entry_count > pv_entry_high_water) + pagedaemon_wakeup(); + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = bsfl(pc->pc_map[field]); break; + } } - } else { - TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { - if (va == pv->pv_va) - break; + if (field < _NPCM) { + pv = &pc->pc_pventry[field * 32 + bit]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + for (field = 0; field < _NPCM; field++) + if (pc->pc_map[field] != 0) { + PV_STAT(pv_entry_spare--); + return (pv); /* not full, return */ + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(pv_entry_spare--); + return (pv); + } + } + /* No free items, allocate another chunk */ + pc = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE); + if (pc == NULL) { + if (try) { + pv_entry_count--; + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + panic("get_pv_entry: out of kvm for pv entry chunk!"); + } + m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + if (m == NULL) { + if (try) { + pv_entry_count--; + PV_STAT(pc_chunk_tryfail++); + kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE); + return (NULL); } + /* + * Reclaim pv entries: At first, destroy mappings to inactive + * pages. After that, if a pv chunk entry is still needed, + * destroy mappings to active pages. 
+ */ + if (ratecheck(&lastprint, &printinterval)) + printf("Approaching the limit on PV entries, consider " + "increasing sysctl vm.pmap.shpgperproc or " + "vm.pmap.pv_entry_max\n"); + PV_STAT(pmap_collect_inactive++); + pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]); + m = vm_page_alloc(NULL, colour, + VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + if (m == NULL) { + PV_STAT(pmap_collect_active++); + pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]); + m = vm_page_alloc(NULL, colour, + VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + if (m == NULL) + panic("get_pv_entry: increase vm.pmap.shpgperproc"); + } + } + PV_STAT(pc_chunk_count++); + PV_STAT(pc_chunk_allocs++); + colour++; + pmap_qenter((vm_offset_t)pc, &m, 1); + pc->pc_pmap = pmap; + pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ + for (field = 1; field < _NPCM; field++) + pc->pc_map[field] = pc_freemask[field]; + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(pv_entry_spare += _NPCPV - 1); + return (pv); +} + +static void +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) + break; } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); - free_pv_entry(pv); + free_pv_entry(pmap, pv); } /* @@ -1562,13 +1735,10 @@ { pv_entry_t pv; - pv = get_pv_entry(pmap); - pv->pv_va = va; - pv->pv_pmap = pmap; - PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); + pv = get_pv_entry(pmap, FALSE); + pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } @@ -1584,11 +1754,8 @@ PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && - (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) { - pv_entry_count++; + (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; - pv->pv_pmap = pmap; - TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); @@ -1764,6 +1931,7 @@ pmap_remove_all(vm_page_t m) { register pv_entry_t pv; + pmap_t pmap; pt_entry_t *pte, tpte; #if defined(PMAP_DIAGNOSTIC) @@ -1778,12 +1946,13 @@ mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - PMAP_LOCK(pv->pv_pmap); - pv->pv_pmap->pm_stats.resident_count--; - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pmap->pm_stats.resident_count--; + pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) - pv->pv_pmap->pm_stats.wired_count--; + pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); @@ -1796,13 +1965,12 @@ pv->pv_va, (uintmax_t)tpte)); vm_page_dirty(m); } - pmap_invalidate_page(pv->pv_pmap, pv->pv_va); - TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); + pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; - pmap_unuse_pt(pmap, pv->pv_va); - PMAP_UNLOCK(pv->pv_pmap); - free_pv_entry(pv); + pmap_unuse_pt(pmap, pv->pv_va); + PMAP_UNLOCK(pmap); + free_pv_entry(pmap, pv); } vm_page_flag_clear(m,
PG_WRITEABLE); sched_unpin(); @@ -2577,7 +2745,7 @@ mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pv->pv_pmap == pmap) { + if (PV_PMAP(pv) == pmap) { return TRUE; } loops++; @@ -2587,7 +2755,6 @@ return (FALSE); } -#define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code @@ -2601,69 +2768,95 @@ { pt_entry_t *pte, tpte; vm_page_t m; - pv_entry_t pv, npv; + pv_entry_t pv; + struct pv_chunk *pc, *npc; + int field, idx; + int32_t bit; + uint32_t inuse, bitmask; + int allfree; -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } -#endif vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); - for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { - -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY - pte = vtopte(pv->pv_va); -#else - pte = pmap_pte_quick(pmap, pv->pv_va); -#endif - tpte = *pte; - - if (tpte == 0) { - printf("TPTE at %p IS ZERO @ VA %08x\n", - pte, pv->pv_va); - panic("bad pte"); - } + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; + for (field = 0; field < _NPCM; field++) { + inuse = (~(pc->pc_map[field])) & pc_freemask[field]; + while (inuse != 0) { + bit = bsfl(inuse); + bitmask = 1UL << bit; + idx = field * 32 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pte = vtopte(pv->pv_va); + tpte = *pte; + + if (tpte == 0) { + printf( + "TPTE at %p IS ZERO @ VA %08x\n", + pte, pv->pv_va); + panic("bad pte"); + } /* * We cannot remove wired pages from a process' mapping at this time */ - if (tpte & PG_W) { - npv = TAILQ_NEXT(pv, pv_plist); - continue; - } - - m = PHYS_TO_VM_PAGE(tpte); - KASSERT(m->phys_addr == (tpte & PG_FRAME), - ("vm_page_t %p phys_addr mismatch %016jx %016jx", - m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); - - KASSERT(m < &vm_page_array[vm_page_array_size], - ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); + if (tpte & PG_W) { + allfree = 0; + continue; + } - pmap->pm_stats.resident_count--; + m = PHYS_TO_VM_PAGE(tpte); + KASSERT(m->phys_addr == (tpte & PG_FRAME), + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); + + KASSERT(m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", + (uintmax_t)tpte)); - pte_clear(pte); + pmap->pm_stats.resident_count--; - /* - * Update the vm_page_t clean and reference bits. - */ - if (tpte & PG_M) { - vm_page_dirty(m); - } + pte_clear(pte); - npv = TAILQ_NEXT(pv, pv_plist); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); + /* + * Update the vm_page_t clean/reference bits. 
+ */ + if (tpte & PG_M) + vm_page_dirty(m); - m->md.pv_list_count--; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); + /* Mark free */ + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + pv_entry_count--; + pc->pc_map[field] |= bitmask; + m->md.pv_list_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); - pmap_unuse_pt(pmap, pv->pv_va); - free_pv_entry(pv); + pmap_unuse_pt(pmap, pv->pv_va); + } + } + if (allfree) { + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + vm_page_lock_queues(); + vm_page_unwire(m, 0); + vm_page_free(m); + vm_page_unlock_queues(); + kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE); + } } sched_unpin(); pmap_invalidate_all(pmap); @@ -2682,6 +2875,7 @@ { pv_entry_t pv; pt_entry_t *pte; + pmap_t pmap; boolean_t rv; rv = FALSE; @@ -2691,10 +2885,11 @@ sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - PMAP_LOCK(pv->pv_pmap); - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & PG_M) != 0; - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); if (rv) break; } @@ -2733,6 +2928,7 @@ pmap_clear_ptes(vm_page_t m, int bit) { register pv_entry_t pv; + pmap_t pmap; pt_entry_t pbits, *pte; if ((m->flags & PG_FICTITIOUS) || @@ -2746,8 +2942,9 @@ * setting RO do we need to clear the VAC? */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - PMAP_LOCK(pv->pv_pmap); - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); retry: pbits = *pte; if (pbits & bit) { @@ -2766,9 +2963,9 @@ } else { atomic_clear_int((u_int *)pte, bit); } - pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + pmap_invalidate_page(pmap, pv->pv_va); } - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); } if (bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); @@ -2808,6 +3005,7 @@ pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; + pmap_t pmap; pt_entry_t *pte; pt_entry_t v; int rtval = 0; @@ -2828,20 +3026,21 @@ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); - PMAP_LOCK(pv->pv_pmap); - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); - pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + pmap_invalidate_page(pmap, pv->pv_va); rtval++; if (rtval > 4) { - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); break; } } - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } sched_unpin(); @@ -3129,13 +3328,15 @@ vm_paddr_t pa; { pv_entry_t pv; + pmap_t pmap; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); - pads(pv->pv_pmap); + pmap = PV_PMAP(pv); + printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); + pads(pmap); } printf(" "); } Index: i386/include/pmap.h =================================================================== RCS file: /home/ncvs/src/sys/i386/include/pmap.h,v retrieving revision 1.118 diff -u -r1.118 pmap.h --- i386/include/pmap.h 6 Dec 2005 
21:09:01 -0000 1.118 +++ i386/include/pmap.h 25 Apr 2006 23:20:43 -0000 @@ -283,6 +283,7 @@ * Pmap stuff */ struct pv_entry; +struct pv_chunk; struct md_page { int pv_list_count; @@ -292,7 +293,7 @@ struct pmap { struct mtx pm_mtx; pd_entry_t *pm_pdir; /* KVA of page directory */ - TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ u_int pm_active; /* active on cpus */ struct pmap_statistics pm_stats; /* pmap statistics */ LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ @@ -325,12 +326,24 @@ * mappings of that page. An entry is a pv_entry_t, the list is pv_table. */ typedef struct pv_entry { - pmap_t pv_pmap; /* pmap where mapping lies */ vm_offset_t pv_va; /* virtual address for mapping */ TAILQ_ENTRY(pv_entry) pv_list; - TAILQ_ENTRY(pv_entry) pv_plist; } *pv_entry_t; +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. + */ +#define _NPCM 11 +#define _NPCPV 336 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint32_t pc_map[_NPCM]; /* bitmap; 1 = free */ + uint32_t pc_spare[2]; + struct pv_entry pc_pventry[_NPCPV]; +}; + #ifdef _KERNEL #define NPPROVMTRR 8
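
The sketches below are user-space models of the techniques this patch introduces; none of them are kernel code, and any identifier not taken from the patch is a placeholder. This first one mirrors the arithmetic that the new vm.pmap.pv_entry_max and vm.pmap.shpgperproc sysctl handlers keep in sync: writing either value recomputes the other through pv_entry_max = shpgperproc * maxproc + cnt.v_page_count, and the high-water mark that wakes the pagedaemon stays at 90% of the maximum. The maxproc and v_page_count values are invented for the example.

/*
 * Sketch of the arithmetic the vm.pmap.pv_entry_max and vm.pmap.shpgperproc
 * sysctl handlers keep consistent.  maxproc and v_page_count are placeholder
 * values; in the kernel they come from the running system.
 */
#include <stdio.h>

static int maxproc = 6164;              /* placeholder */
static int v_page_count = 255000;       /* placeholder: pages of RAM */

static int shpgperproc = 200;           /* PMAP_SHPGPERPROC default */
static int pv_entry_max;
static int pv_entry_high_water;

/* Mirrors pmap_shpgperproc_proc(): shpgperproc was just written. */
static void
update_from_shpgperproc(void)
{
        pv_entry_max = shpgperproc * maxproc + v_page_count;
        pv_entry_high_water = 9 * (pv_entry_max / 10);
}

/* Mirrors pmap_pventry_proc(): pv_entry_max was just written. */
static void
update_from_pv_entry_max(void)
{
        shpgperproc = (pv_entry_max - v_page_count) / maxproc;
        pv_entry_high_water = 9 * (pv_entry_max / 10);
}

int
main(void)
{
        update_from_shpgperproc();
        printf("pv_entry_max %d, high water %d\n",
            pv_entry_max, pv_entry_high_water);

        /* Doubling the max implies a proportionally larger share factor. */
        pv_entry_max *= 2;
        update_from_pv_entry_max();
        printf("shpgperproc %d, high water %d\n",
            shpgperproc, pv_entry_high_water);
        return (0);
}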
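
The new CTASSERTs pin sizeof(struct pv_chunk) to PAGE_SIZE and _NPCM to 11. A minimal model of that layout follows, with every pointer-sized field represented as a uint32_t to mirror i386/ILP32 sizes (the struct names are stand-ins, not the kernel definitions): a 64-byte header plus 336 twelve-byte entries is exactly 4096 bytes, and 336 slots need ceil(336 / 32) = 11 map words, the last of which has only 16 meaningful bits, which is why PC_FREE10 is 0x0000ffff.

/*
 * Illustrative model of the pv_chunk layout.  Pointer fields are modeled as
 * uint32_t so the sizes match i386/ILP32 regardless of the host it runs on.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define _NPCM     11            /* 32-bit map words per chunk */
#define _NPCPV    336           /* pv entries per chunk: 10 * 32 + 16 */

struct model_pv_entry {
        uint32_t pv_va;         /* vm_offset_t pv_va */
        uint32_t tqe_next;      /* TAILQ_ENTRY(pv_entry) pv_list */
        uint32_t tqe_prev;
};

struct model_pv_chunk {
        uint32_t pc_pmap;       /* pmap_t pc_pmap */
        uint32_t pc_next;       /* TAILQ_ENTRY(pv_chunk) pc_list */
        uint32_t pc_prev;
        uint32_t pc_map[_NPCM]; /* bitmap; 1 = free */
        uint32_t pc_spare[2];
        struct model_pv_entry pc_pventry[_NPCPV];
};

int
main(void)
{
        /* Header: 4 + 8 + 44 + 8 = 64 bytes; entries: 336 * 12 = 4032. */
        printf("entry %zu, header %zu, chunk %zu\n",
            sizeof(struct model_pv_entry),
            sizeof(struct model_pv_chunk) -
            _NPCPV * sizeof(struct model_pv_entry),
            sizeof(struct model_pv_chunk));

        /* The two CTASSERTs from the patch, evaluated on the model. */
        assert(sizeof(struct model_pv_chunk) == PAGE_SIZE);
        assert((_NPCPV + 31) / 32 == _NPCM);
        return (0);
}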
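
pv_to_chunk() and PV_PMAP() depend on each chunk being exactly one page mapped at a page-aligned address: masking the low PAGE_MASK bits off any pv_entry pointer lands on the chunk header, which records the owning pmap once per 336 entries instead of once per entry. Below is a user-space sketch of the same pointer arithmetic; malloc() plus manual rounding stands in for the kernel's page-aligned chunk mapping, and struct chunk and the owner variable are placeholders.

/*
 * User-space sketch of the pv_to_chunk()/PV_PMAP() trick: a chunk is one
 * page-aligned page, so rounding any entry address down to a page boundary
 * yields the address of its chunk header.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)
#define NSLOTS    336

/* Mock chunk header; only the owner pointer matters for this example. */
struct chunk {
        void *pc_pmap;                          /* owning "pmap" */
        uint32_t pc_map[11];
        struct { uintptr_t pv_va; } pc_pventry[NSLOTS];
};

/* Same arithmetic as the patch: mask the entry address down to a page. */
static struct chunk *
pv_to_chunk(void *pv)
{
        return ((struct chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

int
main(void)
{
        struct chunk *pc;
        void *raw, *pv;
        int owner;                              /* stands in for a pmap */

        assert(sizeof(struct chunk) <= PAGE_SIZE);

        /* Page-align a malloc'ed region; the kernel maps a real page here. */
        raw = malloc(2 * PAGE_SIZE);
        if (raw == NULL)
                return (1);
        pc = (struct chunk *)(((uintptr_t)raw + PAGE_MASK) &
            ~(uintptr_t)PAGE_MASK);
        pc->pc_pmap = &owner;

        /* Any entry inside the chunk maps back to the same header. */
        pv = &pc->pc_pventry[123];
        assert(pv_to_chunk(pv) == pc);
        assert(pv_to_chunk(pv)->pc_pmap == &owner);     /* PV_PMAP(pv) */
        printf("entry %p -> chunk %p\n", pv, (void *)pv_to_chunk(pv));

        free(raw);
        return (0);
}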
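
Inside a chunk, get_pv_entry() and free_pv_entry() treat pc_map[] as a free bitmap: a set bit marks a free slot, allocation takes the lowest set bit (bsfl() in the patch) and clears it, and a chunk whose map equals pc_freemask[] again contains no live entries and can be unmapped and freed. A small model of that bookkeeping, with ffs(3) standing in for bsfl() and without the chunk list or statistics:

/*
 * Model of the per-chunk bitmap allocator: one bit per pv_entry slot,
 * 1 = free.  The chunk, freemask and indices are local stand-ins.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>

#define NPCM   11
#define NPCPV  336

static const uint32_t freemask[NPCM] = {
        0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu,
        0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu,
        0x0000ffffu             /* only 336 - 10 * 32 = 16 valid bits */
};

struct chunk {
        uint32_t pc_map[NPCM];
};

/* Allocate a slot: lowest set bit wins.  Returns -1 if the chunk is full. */
static int
chunk_alloc(struct chunk *pc)
{
        int field, bit;

        for (field = 0; field < NPCM; field++) {
                if (pc->pc_map[field] != 0) {
                        bit = ffs(pc->pc_map[field]) - 1;
                        pc->pc_map[field] &= ~(1u << bit);
                        return (field * 32 + bit);
                }
        }
        return (-1);
}

/* Free a slot by index; returns 1 if the whole chunk is now free. */
static int
chunk_free(struct chunk *pc, int idx)
{
        int field;

        pc->pc_map[idx / 32] |= 1u << (idx % 32);
        for (field = 0; field < NPCM; field++)
                if (pc->pc_map[field] != freemask[field])
                        return (0);
        return (1);
}

int
main(void)
{
        struct chunk pc;
        int i, idx[NPCPV];

        /* A fresh chunk starts with every slot free. */
        for (i = 0; i < NPCM; i++)
                pc.pc_map[i] = freemask[i];

        for (i = 0; i < NPCPV; i++)
                idx[i] = chunk_alloc(&pc);
        assert(idx[0] == 0 && idx[NPCPV - 1] == NPCPV - 1);
        assert(chunk_alloc(&pc) == -1);                 /* chunk is full */

        for (i = 0; i < NPCPV - 1; i++)
                assert(chunk_free(&pc, idx[i]) == 0);
        assert(chunk_free(&pc, idx[NPCPV - 1]) == 1);   /* all free again */
        printf("chunk model OK\n");
        return (0);
}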
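
The rewritten pmap_remove_pages() no longer walks a per-pmap pv list; it walks the pmap's chunks and visits only the allocated slots by inverting each map word against pc_freemask[] (inuse = ~pc_map[field] & pc_freemask[field]) and peeling bits off with bsfl(). A sketch of that loop shape, again with ffs(3) in place of bsfl() and a placeholder visit() for the per-mapping teardown:

/*
 * Sketch of the "walk every allocated slot" loop: invert the free bitmap
 * against the valid-bit mask and peel bits off one at a time.
 */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>

#define NPCM 11

static const uint32_t freemask[NPCM] = {
        0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu,
        0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu,
        0x0000ffffu
};

/* Placeholder for the real per-mapping work (pte lookup, teardown, ...). */
static void
visit(int idx)
{
        printf("allocated slot %d\n", idx);
}

static void
walk_allocated(const uint32_t pc_map[NPCM])
{
        uint32_t inuse, bitmask;
        int field, bit;

        for (field = 0; field < NPCM; field++) {
                /* 0 bits in pc_map are allocated; mask off the pad bits. */
                inuse = ~pc_map[field] & freemask[field];
                while (inuse != 0) {
                        bit = ffs(inuse) - 1;
                        bitmask = 1u << bit;
                        inuse &= ~bitmask;
                        visit(field * 32 + bit);
                }
        }
}

int
main(void)
{
        uint32_t map[NPCM];
        int i;

        for (i = 0; i < NPCM; i++)
                map[i] = freemask[i];
        map[0] &= ~0x5u;        /* pretend slots 0 and 2 are allocated */
        map[10] &= ~0x8000u;    /* and slot 335 */
        walk_allocated(map);
        return (0);
}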
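
When a new chunk page cannot be allocated, get_pv_entry() either fails softly (the try case used by the opportunistic pv insertion path) or falls back to pmap_collect() on the inactive queue and then the active queue before panicking. The sketch below shows only that control flow; page_alloc(), reclaim_queue() and the failure counter are placeholders for vm_page_alloc(), pmap_collect() and real memory pressure.

/*
 * Shape of the fallback path when no chunk page is available: an optional
 * "try" mode fails softly, otherwise reclaim from the inactive queue, then
 * the active queue, before giving up.
 */
#include <stdio.h>
#include <stdlib.h>

enum { QUEUE_INACTIVE, QUEUE_ACTIVE };

static int failures_left = 2;   /* pretend the first two allocations fail */

static void *
page_alloc(void)
{
        if (failures_left > 0) {
                failures_left--;
                return (NULL);
        }
        return (malloc(4096));
}

static void
reclaim_queue(int queue)
{
        printf("reclaiming from %s queue\n",
            queue == QUEUE_INACTIVE ? "inactive" : "active");
}

static void *
alloc_chunk_page(int try)
{
        void *m;

        if ((m = page_alloc()) != NULL)
                return (m);
        if (try)
                return (NULL);          /* caller tolerates failure */
        reclaim_queue(QUEUE_INACTIVE);
        if ((m = page_alloc()) != NULL)
                return (m);
        reclaim_queue(QUEUE_ACTIVE);
        if ((m = page_alloc()) != NULL)
                return (m);
        /* The kernel panics here; report and exit in the model. */
        fprintf(stderr, "out of pv chunk pages\n");
        exit(1);
}

int
main(void)
{
        void *m = alloc_chunk_page(0);

        printf("got page at %p after reclamation\n", m);
        free(m);
        return (0);
}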