diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 6950c82..24f88cf 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -130,11 +130,13 @@ struct faultstate {
 	vm_map_entry_t entry;
 	int lookup_still_valid;
 	struct vnode *vp;
+	int vfslocked;
 };
 
 static inline void
 release_page(struct faultstate *fs)
 {
+
 	vm_page_wakeup(fs->m);
 	vm_page_lock_queues();
 	vm_page_deactivate(fs->m);
@@ -145,6 +147,7 @@ release_page(struct faultstate *fs)
 static inline void
 unlock_map(struct faultstate *fs)
 {
+
 	if (fs->lookup_still_valid) {
 		vm_map_lookup_done(fs->map, fs->entry);
 		fs->lookup_still_valid = FALSE;
@@ -169,13 +172,11 @@ unlock_and_deallocate(struct faultstate *fs)
 	vm_object_deallocate(fs->first_object);
 	unlock_map(fs);
 	if (fs->vp != NULL) {
-		int vfslocked;
-
-		vfslocked = VFS_LOCK_GIANT(fs->vp->v_mount);
 		vput(fs->vp);
 		fs->vp = NULL;
-		VFS_UNLOCK_GIANT(vfslocked);
 	}
+	VFS_UNLOCK_GIANT(fs->vfslocked);
+	fs->vfslocked = 0;
 }
 
 /*
@@ -216,12 +217,17 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 	vm_object_t next_object;
 	vm_page_t marray[VM_FAULT_READ];
 	int hardfault;
-	int faultcount;
+	int faultcount, ahead, behind;
 	struct faultstate fs;
+	struct vnode *vp;
+	int locked, error;
 
 	hardfault = 0;
 	growstack = TRUE;
 	PCPU_INC(cnt.v_vm_faults);
+	fs.vp = NULL;
+	fs.vfslocked = 0;
+	faultcount = behind = 0;
 
 RetryFault:;
 
@@ -287,21 +293,9 @@ RetryFault:;
 	 * Bump the paging-in-progress count to prevent size changes (e.g.
 	 * truncation operations) during I/O.  This must be done after
	 * obtaining the vnode lock in order to avoid possible deadlocks.
-	 *
-	 * XXX vnode_pager_lock() can block without releasing the map lock.
 	 */
-	if (fs.first_object->flags & OBJ_NEEDGIANT)
-		mtx_lock(&Giant);
 	VM_OBJECT_LOCK(fs.first_object);
 	vm_object_reference_locked(fs.first_object);
-	fs.vp = vnode_pager_lock(fs.first_object);
-	KASSERT(fs.vp == NULL || !fs.map->system_map,
-	    ("vm_fault: vnode-backed object mapped by system map"));
-	KASSERT((fs.first_object->flags & OBJ_NEEDGIANT) == 0 ||
-	    !fs.map->system_map,
-	    ("vm_fault: Object requiring giant mapped by system map"));
-	if (fs.first_object->flags & OBJ_NEEDGIANT)
-		mtx_unlock(&Giant);
 	vm_object_pip_add(fs.first_object, 1);
 
 	fs.lookup_still_valid = TRUE;
@@ -378,14 +372,6 @@ RetryFault:;
 				fs.first_m = NULL;
 			}
 			unlock_map(&fs);
-			if (fs.vp != NULL) {
-				int vfslck;
-
-				vfslck = VFS_LOCK_GIANT(fs.vp->v_mount);
-				vput(fs.vp);
-				fs.vp = NULL;
-				VFS_UNLOCK_GIANT(vfslck);
-			}
 			VM_OBJECT_LOCK(fs.object);
 			if (fs.m == vm_page_lookup(fs.object,
 			    fs.pindex)) {
@@ -439,7 +425,9 @@ RetryFault:;
 			}
#endif
 			fs.m = vm_page_alloc(fs.object, fs.pindex,
-			    (fs.vp || fs.object->backing_object)? VM_ALLOC_NORMAL: VM_ALLOC_ZERO);
+			    (fs.object->type == OBJT_VNODE ||
+			    fs.object->backing_object != NULL) ?
+			    VM_ALLOC_NORMAL : VM_ALLOC_ZERO);
 		}
 		if (fs.m == NULL) {
 			unlock_and_deallocate(&fs);
@@ -462,7 +450,6 @@ readrest:
 		if (TRYPAGER) {
 			int rv;
 			int reqpage = 0;
-			int ahead, behind;
 			u_char behavior = vm_map_entry_behavior(fs.entry);
 
 			if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
@@ -525,6 +512,65 @@ readrest:
 			}
 			if (is_first_object_locked)
 				VM_OBJECT_UNLOCK(fs.first_object);
+
+			/*
+			 * Call the pager to retrieve the data, if any, after
+			 * releasing the lock on the map.  We hold a ref on
+			 * fs.object and the pages are VPO_BUSY'd.
+			 */
+			unlock_map(&fs);
+
+vnode_lock:
+			if (fs.object->type == OBJT_VNODE) {
+				vp = fs.object->handle;
+				if (vp == fs.vp)
+					goto vnode_locked;
+				else if (fs.vp != NULL) {
+					vput(fs.vp);
+					fs.vp = NULL;
+				}
+				locked = VOP_ISLOCKED(vp);
+
+				if (VFS_NEEDSGIANT(vp->v_mount) && !fs.vfslocked) {
+					fs.vfslocked = 1;
+					if (!mtx_trylock(&Giant)) {
+						VM_OBJECT_UNLOCK(fs.object);
+						mtx_lock(&Giant);
+						VM_OBJECT_LOCK(fs.object);
+						goto vnode_lock;
+					}
+				}
+				if (locked != LK_EXCLUSIVE)
+					locked = LK_SHARED;
+				/* Do not sleep for vnode lock while fs.m is busy */
+				error = vget(vp, locked | LK_RETRY |
+				    LK_CANRECURSE | LK_NOWAIT, curthread);
+				if (error == EBUSY) {
+					int vfslocked;
+
+					vfslocked = fs.vfslocked;
+					fs.vfslocked = 0; /* Keep Giant */
+					vhold(vp);
+					release_page(&fs);
+					unlock_and_deallocate(&fs);
+					error = vget(vp, locked | LK_RETRY |
+					    LK_CANRECURSE, curthread);
+					vdrop(vp);
+					fs.vp = vp;
+					fs.vfslocked = vfslocked;
+					KASSERT(error == 0,
+					    ("vm_fault: vget failed"));
+					goto RetryFault;
+				}
+				fs.vp = vp;
+			}
+vnode_locked:
+			KASSERT(fs.vp == NULL || !fs.map->system_map,
+			    ("vm_fault: vnode-backed object mapped by system map"));
+			KASSERT((fs.first_object->flags & OBJ_NEEDGIANT) == 0 ||
+			    !fs.map->system_map,
+			    ("vm_fault: Object requiring giant mapped by system map"));
+
 			/*
 			 * now we find out if any other pages should be paged
 			 * in at this time this routine checks to see if the
@@ -545,25 +591,8 @@ readrest:
 			faultcount = vm_fault_additional_pages(
 			    fs.m, behind, ahead, marray, &reqpage);
 
-			/*
-			 * update lastr imperfectly (we do not know how much
-			 * getpages will actually read), but good enough.
-			 *
-			 * XXX The following assignment modifies the map
-			 * without holding a write lock on it.
-			 */
-			fs.entry->lastr = fs.pindex + faultcount - behind;
-
-			/*
-			 * Call the pager to retrieve the data, if any, after
-			 * releasing the lock on the map.  We hold a ref on
-			 * fs.object and the pages are VPO_BUSY'd.
-			 */
-			unlock_map(&fs);
-
-			rv = faultcount ?
-			    vm_pager_get_pages(fs.object, marray, faultcount,
-			    reqpage) : VM_PAGER_FAIL;
+			rv = faultcount ? vm_pager_get_pages(fs.object, marray,
+			    faultcount, reqpage) : VM_PAGER_FAIL;
 
 			if (rv == VM_PAGER_OK) {
 				/*
@@ -837,6 +866,15 @@ readrest:
 			prot &= retry_prot;
 		}
 	}
+	/*
+	 * update lastr imperfectly (we do not know how much
+	 * getpages will actually read), but good enough.
+	 *
+	 * XXX The following assignment modifies the map
+	 * without holding a write lock on it.
+	 */
+	fs.entry->lastr = fs.pindex + faultcount - behind;
+
 	if (prot & VM_PROT_WRITE) {
 		vm_object_set_writeable_dirty(fs.object);
 
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 9a1ac63..b438058 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -271,7 +271,7 @@ kmem_malloc(map, size, flags)
 	int flags;
 {
 	vm_offset_t offset, i;
-	vm_map_entry_t entry;
+	vm_map_entry_t entry, freelist;
 	vm_offset_t addr;
 	vm_page_t m;
 	int pflags;
@@ -355,8 +355,10 @@ retry:
 				vm_page_unlock_queues();
 			}
 			VM_OBJECT_UNLOCK(kmem_object);
-			vm_map_delete(map, addr, addr + size);
+			freelist = NULL;
+			vm_map_delete(map, addr, addr + size, &freelist);
 			vm_map_unlock(map);
+			vm_map_entry_free_freelist(map, freelist);
 			return (0);
 		}
 		if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
@@ -455,14 +457,18 @@ kmem_free_wakeup(map, addr, size)
 	vm_offset_t addr;
 	vm_size_t size;
 {
+	vm_map_entry_t freelist;
 
+	freelist = NULL;
 	vm_map_lock(map);
-	(void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
+	(void) vm_map_delete(map, trunc_page(addr), round_page(addr + size),
+	    &freelist);
 	if (map->needs_wakeup) {
 		map->needs_wakeup = FALSE;
 		vm_map_wakeup(map);
 	}
 	vm_map_unlock(map);
+	vm_map_entry_free_freelist(map, freelist);
 }
 
 /*
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 18ba489..43dad2b 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -1261,16 +1261,19 @@ vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_size_t length, vm_prot_t prot,
     vm_prot_t max, int cow)
 {
+	vm_map_entry_t freelist;
 	vm_offset_t end;
 	int result;
 
-	vm_map_lock(map);
 	end = start + length;
+	freelist = NULL;
+	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
-	(void) vm_map_delete(map, start, end);
+	(void) vm_map_delete(map, start, end, &freelist);
 	result = vm_map_insert(map, object, offset, start, end, prot,
 	    max, cow);
 	vm_map_unlock(map);
+	vm_map_entry_free_freelist(map, freelist);
 	return (result);
 }
 
@@ -1350,6 +1353,16 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
 			entry->offset = prev->offset;
 			if (entry->prev != &map->header)
 				vm_map_entry_resize_free(map, entry->prev);
+
+			/*
+			 * If the backing object is the vnode object,
+			 * vm_object_deallocate() results in a call to
+			 * vrele().  Because the reference to the
+			 * object is not last, vrele() does not lock
+			 * the vnode, and map lock can be kept without
+			 * causing vnode lock to be taken after the
+			 * map lock.
+			 */
 			if (prev->object.vm_object)
 				vm_object_deallocate(prev->object.vm_object);
 			vm_map_entry_dispose(map, prev);
@@ -1371,6 +1384,10 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
 			vm_map_entry_unlink(map, next);
 			entry->end = next->end;
 			vm_map_entry_resize_free(map, entry);
+
+			/*
+			 * See comment above.
+			 */
 			if (next->object.vm_object)
 				vm_object_deallocate(next->object.vm_object);
 			vm_map_entry_dispose(map, next);
@@ -2290,6 +2307,7 @@ vm_map_sync(
 	vm_size_t size;
 	vm_object_t object;
 	vm_ooffset_t offset;
+	unsigned int last_timestamp;
 
 	vm_map_lock_read(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
@@ -2324,8 +2342,7 @@ vm_map_sync(
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current != &map->header && current->start < end;) {
 		offset = current->offset + (start - current->start);
 		size = (end <= current->end ? end : current->end) - start;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -2345,8 +2362,16 @@ vm_map_sync(
 		} else {
 			object = current->object.vm_object;
 		}
+		vm_object_reference(object);
+		last_timestamp = map->timestamp;
+		vm_map_unlock_read(map);
 		vm_object_sync(object, offset, size, syncio, invalidate);
 		start += size;
+		vm_object_deallocate(object);
+		vm_map_lock_read(map);
+		if (last_timestamp == map->timestamp ||
+		    !vm_map_lookup_entry(map, start, &current))
+			current = current->next;
 	}
 
 	vm_map_unlock_read(map);
@@ -2370,6 +2395,23 @@ vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
 	entry->wired_count = 0;
 }
 
+void
+vm_map_entry_free_freelist(vm_map_t map, vm_map_entry_t freelist)
+{
+	vm_map_entry_t e;
+	vm_object_t object;
+
+	while (freelist != NULL) {
+		e = freelist;
+		freelist = freelist->next;
+		if ((e->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+			object = e->object.vm_object;
+			vm_object_deallocate(object);
+		}
+		vm_map_entry_dispose(map, e);
+	}
+}
+
 /*
  *	vm_map_entry_delete:	[ internal use only ]
  *
@@ -2402,10 +2444,8 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 				object->size = offidxstart;
 		}
 		VM_OBJECT_UNLOCK(object);
-		vm_object_deallocate(object);
-	}
-
-	vm_map_entry_dispose(map, entry);
+	} else
+		entry->object.vm_object = NULL;
 }
 
 /*
@@ -2415,7 +2455,8 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
  *	map.
  */
 int
-vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
+vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end,
+    vm_map_entry_t *freelist)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t first_entry;
@@ -2492,6 +2533,8 @@ vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
 		 * modify bits will be set in the wrong object!)
 		 */
 		vm_map_entry_delete(map, entry);
+		entry->next = *freelist;
+		*freelist = entry;
 		entry = next;
 	}
 	return (KERN_SUCCESS);
@@ -2506,12 +2549,15 @@ vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
 int
 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
+	vm_map_entry_t freelist;
 	int result;
 
+	freelist = NULL;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
-	result = vm_map_delete(map, start, end);
+	result = vm_map_delete(map, start, end, &freelist);
 	vm_map_unlock(map);
+	vm_map_entry_free_freelist(map, freelist);
 	return (result);
 }
 
@@ -2679,6 +2725,7 @@ vmspace_fork(struct vmspace *vm1)
 	vm_map_entry_t old_entry;
 	vm_map_entry_t new_entry;
 	vm_object_t object;
+	int locked;
 
 	vm_map_lock(old_map);
 
@@ -2689,6 +2736,8 @@ vmspace_fork(struct vmspace *vm1)
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
 	new_map = &vm2->vm_map;	/* XXX */
+	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
+	KASSERT(locked, ("vmspace_fork: lock failed"));
 	new_map->timestamp = 1;
 
 	old_entry = old_map->header.next;
@@ -2726,6 +2775,12 @@ vmspace_fork(struct vmspace *vm1)
 				/* Transfer the second reference too. */
 				vm_object_reference(
 				    old_entry->object.vm_object);
+
+				/*
+				 * As in vm_map_simplify_entry(), the
+				 * vnode lock may not be acquired in
+				 * this call to vm_object_deallocate().
+				 */
 				vm_object_deallocate(object);
 				object = old_entry->object.vm_object;
 			}
@@ -2738,7 +2793,8 @@ vmspace_fork(struct vmspace *vm1)
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
-			new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
+			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
+			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wired_count = 0;
 
 			/*
@@ -2764,7 +2820,8 @@ vmspace_fork(struct vmspace *vm1)
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
-			new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
+			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
+			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			vm_map_entry_link(new_map, new_map->header.prev,
@@ -2778,6 +2835,8 @@ vmspace_fork(struct vmspace *vm1)
 	}
unlock_and_return:
 	vm_map_unlock(old_map);
+	if (vm2 != NULL)
+		vm_map_unlock(new_map);
 	return (vm2);
 }
 
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 0c3ddc8..af317bf 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -157,6 +157,8 @@ vm_map_entry_system_wired_count(vm_map_entry_t entry)
 {
 	return (entry->wired_count - vm_map_entry_user_wired_count(entry));
 }
+
+void vm_map_entry_free_freelist(vm_map_t map, vm_map_entry_t freelist);
 #endif	/* _KERNEL */
 
 /*
@@ -336,7 +338,7 @@ long vmspace_wired_count(struct vmspace *vmspace);
 #ifdef _KERNEL
 boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t);
 vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t);
-int vm_map_delete (vm_map_t, vm_offset_t, vm_offset_t);
+int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t, vm_map_entry_t *);
 int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
     int, vm_prot_t, vm_prot_t, int);
 int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
@@ -359,7 +361,7 @@ void vm_map_startup (void);
 int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t);
 int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
 int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
-void vm_map_simplify_entry (vm_map_t, vm_map_entry_t);
+void vm_map_simplify_entry(vm_map_t, vm_map_entry_t);
 void vm_init2 (void);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_growstack (struct proc *p, vm_offset_t addr);
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 2d668c4..4ee9dfc 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -552,6 +552,7 @@ munmap(td, uap)
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
 	vm_map_t map;
+	vm_map_entry_t freelist;
 
 	addr = (vm_offset_t) uap->addr;
 	size = uap->len;
@@ -571,6 +572,7 @@ munmap(td, uap)
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
 		return (EINVAL);
+	freelist = NULL;
 	vm_map_lock(map);
#ifdef HWPMC_HOOKS
 	/*
@@ -593,8 +595,9 @@ munmap(td, uap)
 	}
#endif
 	/* returns nothing but KERN_SUCCESS anyway */
-	vm_map_delete(map, addr, addr + size);
+	vm_map_delete(map, addr, addr + size, &freelist);
 	vm_map_unlock(map);
+	vm_map_entry_free_freelist(map, freelist);
 	return (0);
 }
 
diff --git a/sys/vm/vm_unix.c b/sys/vm/vm_unix.c
index cd98be9..8092102 100644
--- a/sys/vm/vm_unix.c
+++ b/sys/vm/vm_unix.c
@@ -72,6 +72,7 @@ obreak(td, uap)
 	struct obreak_args *uap;
 {
 	struct vmspace *vm = td->td_proc->p_vmspace;
+	vm_map_entry_t freelist;
 	vm_offset_t new, old, base;
 	rlim_t datalim, vmemlim;
 	int rv;
@@ -85,6 +86,7 @@ obreak(td, uap)
 
 	do_map_wirefuture = FALSE;
 	new = round_page((vm_offset_t)uap->nsize);
+	freelist = NULL;
 	vm_map_lock(&vm->vm_map);
 	base = round_page((vm_offset_t) vm->vm_daddr);
 
@@ -138,7 +140,7 @@ obreak(td, uap)
 				do_map_wirefuture = TRUE;
 		}
 	} else if (new < old) {
-		rv = vm_map_delete(&vm->vm_map, new, old);
+		rv = vm_map_delete(&vm->vm_map, new, old, &freelist);
 		if (rv != KERN_SUCCESS) {
 			error = ENOMEM;
 			goto done;
@@ -147,6 +149,7 @@ obreak(td, uap)
 	}
done:
 	vm_map_unlock(&vm->vm_map);
+	vm_map_entry_free_freelist(&vm->vm_map, freelist);
 
 	if (do_map_wirefuture)
 		(void) vm_map_wire(&vm->vm_map, old, new,
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 4721646..a8aef29 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -367,6 +367,7 @@ vnode_pager_setsize(vp, nsize)
 
 	if ((object = vp->v_object) == NULL)
 		return;
+	ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode");
 	VM_OBJECT_LOCK(object);
 	if (nsize == object->un_pager.vnp.vnp_size) {
 		/*
@@ -1173,56 +1174,3 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
 	}
 	return rtvals[0];
 }
-
-struct vnode *
-vnode_pager_lock(vm_object_t first_object)
-{
-	struct vnode *vp;
-	vm_object_t backing_object, object;
-	int locked, lockf;
-
-	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
-	for (object = first_object; object != NULL; object = backing_object) {
-		if (object->type != OBJT_VNODE) {
-			if ((backing_object = object->backing_object) != NULL)
-				VM_OBJECT_LOCK(backing_object);
-			if (object != first_object)
-				VM_OBJECT_UNLOCK(object);
-			continue;
-		}
-	retry:
-		if (object->flags & OBJ_DEAD) {
-			if (object != first_object)
-				VM_OBJECT_UNLOCK(object);
-			return NULL;
-		}
-		vp = object->handle;
-		locked = VOP_ISLOCKED(vp);
-		VI_LOCK(vp);
-		VM_OBJECT_UNLOCK(object);
-		if (first_object != object)
-			VM_OBJECT_UNLOCK(first_object);
-		VFS_ASSERT_GIANT(vp->v_mount);
-		if (locked == LK_EXCLUSIVE)
-			lockf = LK_CANRECURSE | LK_INTERLOCK | LK_RETRY |
-			    LK_EXCLUSIVE;
-		else
-			lockf = LK_CANRECURSE | LK_INTERLOCK | LK_RETRY |
-			    LK_SHARED;
-		if (vget(vp, lockf, curthread)) {
-			VM_OBJECT_LOCK(first_object);
-			if (object != first_object)
-				VM_OBJECT_LOCK(object);
-			if (object->type != OBJT_VNODE) {
-				if (object != first_object)
-					VM_OBJECT_UNLOCK(object);
-				return NULL;
-			}
-			printf("vnode_pager_lock: retrying\n");
-			goto retry;
-		}
-		VM_OBJECT_LOCK(first_object);
-		return (vp);
-	}
-	return NULL;
-}
diff --git a/sys/vm/vnode_pager.h b/sys/vm/vnode_pager.h
index aa9be03..88ae306 100644
--- a/sys/vm/vnode_pager.h
+++ b/sys/vm/vnode_pager.h
@@ -39,7 +39,6 @@
 #define _VNODE_PAGER_	1
 
 #ifdef _KERNEL
-struct vnode *vnode_pager_lock(vm_object_t);
 
 /*
  * XXX Generic routines; currently called by badly written FS code; these
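
The recurring change in the vm_map.c, vm_kern.c, vm_mmap.c and vm_unix.c hunks above is that vm_map_delete() now only unlinks entries onto a caller-supplied freelist, and vm_map_entry_free_freelist() drops the backing-object references after the map lock has been released, so vm_object_deallocate() (and a possible vrele()) can never run while a map lock is held. The fragment below is a minimal stand-alone sketch of that deferred-release idiom; struct entry, struct object, object_release(), map_delete_all() and the pthread mutex are hypothetical stand-ins for illustration only, not the kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

/* Hypothetical stand-ins for vm_map_entry_t and vm_object_t. */
struct object {
	int	refcnt;
};

struct entry {
	struct entry	*next;
	struct object	*obj;
};

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *map_entries;	/* the "map": a singly linked list */

/* Dropping the last reference may need other locks (vrele() in the kernel). */
static void
object_release(struct object *obj)
{

	if (--obj->refcnt == 0)
		free(obj);
}

/*
 * Unlink every entry while the map lock is held and collect the entries on
 * *freelist, mirroring vm_map_delete(map, start, end, &freelist).  No object
 * reference is dropped here.
 */
static void
map_delete_all(struct entry **freelist)
{
	struct entry *e;

	while ((e = map_entries) != NULL) {
		map_entries = e->next;
		e->next = *freelist;
		*freelist = e;
	}
}

/* Counterpart of vm_map_entry_free_freelist(): run after unlocking the map. */
static void
free_freelist(struct entry *freelist)
{
	struct entry *e;

	while ((e = freelist) != NULL) {
		freelist = e->next;
		object_release(e->obj);
		free(e);
	}
}

int
main(void)
{
	struct entry *e, *freelist;
	int i;

	for (i = 0; i < 3; i++) {
		e = malloc(sizeof(*e));
		e->obj = malloc(sizeof(*e->obj));
		e->obj->refcnt = 1;
		e->next = map_entries;
		map_entries = e;
	}

	freelist = NULL;
	pthread_mutex_lock(&map_lock);
	map_delete_all(&freelist);	/* no object releases under the lock */
	pthread_mutex_unlock(&map_lock);
	free_freelist(freelist);	/* releases happen with the lock dropped */

	printf("entries deleted, objects released after unlock\n");
	return (0);
}

The same ordering discipline is why vm_map_entry_delete() in the patch no longer calls vm_object_deallocate() or vm_map_entry_dispose() itself.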