diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 0c98d56..94d448d 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -74,6 +74,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -91,6 +92,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -475,11 +477,23 @@ vm_map_process_deferred(void)
 {
 	struct thread *td;
 	vm_map_entry_t entry;
+	vm_object_t object;
 
 	td = curthread;
-
 	while ((entry = td->td_map_def_user) != NULL) {
 		td->td_map_def_user = entry->next;
+		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
+			/*
+			 * Decrement the object's writemappings and
+			 * possibly the vnode's v_writecount.
+			 */
+			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
+			    ("Submap with writecount"));
+			object = entry->object.vm_object;
+			KASSERT(object != NULL, ("No object for writecount"));
+			vnode_pager_release_writecount(object, entry->start,
+			    entry->end);
+		}
 		vm_map_entry_deallocate(entry, FALSE);
 	}
 }
@@ -1178,6 +1192,8 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 		inheritance = VM_INHERIT_SHARE;
 	else
 		inheritance = VM_INHERIT_DEFAULT;
+	if (cow & MAP_VN_WRITECOUNT)
+		protoeflags |= MAP_ENTRY_VN_WRITECNT;
 
 	cred = NULL;
 	KASSERT((object != kmem_object && object != kernel_object) ||
@@ -1516,6 +1532,11 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
 			 * references. Thus, the map lock can be kept
 			 * without causing a lock-order reversal with
 			 * the vnode lock.
+			 *
+			 * Since we count the number of virtual page
+			 * mappings in object->un_pager.vnp.writemappings,
+			 * the writemappings value should not be adjusted
+			 * when the entry is disposed of.
 			 */
 			if (prev->object.vm_object)
 				vm_object_deallocate(prev->object.vm_object);
@@ -1627,6 +1648,13 @@ _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		vm_object_reference(new_entry->object.vm_object);
+		/*
+		 * The object->un_pager.vnp.writemappings for the
+		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
+		 * kept as is here. The virtual pages are
+		 * re-distributed among the clipped entries, so the sum is
+		 * left the same.
+		 */
 	}
 }
 
@@ -2900,6 +2928,7 @@ vm_map_copy_entry(
 	vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
+	vm_map_entry_t fake_entry;
 	vm_offset_t size;
 	struct ucred *cred;
 	int charged;
@@ -2965,6 +2994,27 @@ vm_map_copy_entry(
 			src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->offset = src_entry->offset;
+			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
+				/*
+				 * MAP_ENTRY_VN_WRITECNT cannot
+				 * indicate write reference from
+				 * src_entry, since the entry is
+				 * marked as needs copy. Allocate a
+				 * fake entry that is used to
+				 * decrement object->un_pager.vnp.writecount
+				 * at the appropriate time. Attach
+				 * fake_entry to the deferred list.
+				 */
+				fake_entry = vm_map_entry_create(dst_map);
+				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
+				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
+				vm_object_reference(src_object);
+				fake_entry->object.vm_object = src_object;
+				fake_entry->start = src_entry->start;
+				fake_entry->end = src_entry->end;
+				fake_entry->next = curthread->td_map_def_user;
+				curthread->td_map_def_user = fake_entry;
+			}
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
@@ -3043,6 +3093,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 	vm_map_lock(old_map);
 	if (old_map->busy)
 		vm_map_wait_busy(old_map);
+	new_map = NULL; /* silence gcc */
 	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
 	if (vm2 == NULL)
 		goto unlock_and_return;
@@ -3122,6 +3173,16 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wired_count = 0;
+			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
+				object = new_entry->object.vm_object;
+				KASSERT(((struct vnode *)object->handle)->
+				    v_writecount > 0,
+				    ("vmspace_fork: v_writecount"));
+				KASSERT(object->un_pager.vnp.writemappings > 0,
+				    ("vmspace_fork: vnp.writecount"));
+				vnode_pager_update_writecount(object,
+				    new_entry->start, new_entry->end);
+			}
 
 			/*
 			 * Insert the entry into the new map -- we know we're
@@ -3146,8 +3207,11 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
+			/*
+			 * Copied entry is COW over the old object.
+			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
-			    MAP_ENTRY_IN_TRANSITION);
+			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
@@ -3161,9 +3225,15 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 		old_entry = old_entry->next;
 	}
 unlock_and_return:
-	vm_map_unlock(old_map);
+	/*
+	 * Use inlined vm_map_unlock() to postpone handling the deferred
+	 * map entries, which cannot be done until both old_map and
+	 * new_map locks are released.
+	 */
+	sx_xunlock(&old_map->lock);
 	if (vm2 != NULL)
-		vm_map_unlock(new_map);
+		sx_xunlock(&new_map->lock);
+	vm_map_process_deferred();
 
 	return (vm2);
 }
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 11ff632..0b7fdf9 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -139,6 +139,7 @@ struct vm_map_entry {
 #define	MAP_ENTRY_GROWS_UP	0x2000	/* Bottom-up stacks */
 
 #define	MAP_ENTRY_WIRE_SKIPPED	0x4000
+#define	MAP_ENTRY_VN_WRITECNT	0x8000	/* writeable vnode mapping */
 
 #ifdef	_KERNEL
 static __inline u_char
@@ -315,6 +316,7 @@ long vmspace_wired_count(struct vmspace *vmspace);
 #define	MAP_DISABLE_SYNCER	0x0020
 #define	MAP_DISABLE_COREDUMP	0x0100
 #define	MAP_PREFAULT_MADVISE	0x0200	/* from (user) madvise request */
+#define	MAP_VN_WRITECOUNT	0x0400
 #define	MAP_STACK_GROWS_DOWN	0x1000
 #define	MAP_STACK_GROWS_UP	0x2000
 #define	MAP_ACC_CHARGED		0x4000
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 6ed24f6..c3648fd 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -81,6 +81,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 
 #ifdef HWPMC_HOOKS
 #include
@@ -93,7 +94,7 @@ struct sbrk_args {
 #endif
 
 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
+    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
     int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
@@ -1218,28 +1219,34 @@ sys_munlock(td, uap)
 /*
  * vm_mmap_vnode()
  *
- * MPSAFE
- *
  * Helper function for vm_mmap. Perform sanity check specific for mmap
  * operations on vnodes.
+ *
+ * For VCHR vnodes, lock is held over call to vm_mmap_cdev() to keep
+ * vp->v_rdev valid.
  */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
+    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
+    boolean_t *writecounted)
 {
 	struct vattr va;
 	vm_object_t obj;
 	vm_offset_t foff;
 	struct mount *mp;
 	struct ucred *cred;
-	int error, flags;
-	int vfslocked;
+	int error, flags, locktype, vfslocked;
 
 	mp = vp->v_mount;
 	cred = td->td_ucred;
+
+	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
+		locktype = LK_EXCLUSIVE;
+	else
+		locktype = LK_SHARED;
 	vfslocked = VFS_LOCK_GIANT(mp);
-	if ((error = vget(vp, LK_SHARED, td)) != 0) {
+	if ((error = vget(vp, locktype, td)) != 0) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
@@ -1256,8 +1263,20 @@ vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 		}
 		if (obj->handle != vp) {
 			vput(vp);
-			vp = (struct vnode*)obj->handle;
-			vget(vp, LK_SHARED, td);
+			vp = (struct vnode *)obj->handle;
+			/*
+			 * Bypass filesystems obey the mpsafety of the
+			 * underlying fs.
+			 */
+			error = vget(vp, locktype, td);
+			if (error != 0) {
+				VFS_UNLOCK_GIANT(vfslocked);
+				return (error);
+			}
+			if (locktype == LK_EXCLUSIVE) {
+				*writecounted = TRUE;
+				vnode_pager_update_writecount(obj, 0, objsize);
+			}
 		}
 	} else if (vp->v_type == VCHR) {
 		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
@@ -1293,7 +1312,8 @@ vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 		objsize = round_page(va.va_size);
 		if (va.va_nlink == 0)
 			flags |= MAP_NOSYNC;
-	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, td->td_ucred);
+	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
+	    td->td_ucred);
 	if (obj == NULL) {
 		error = ENOMEM;
 		goto done;
@@ -1432,6 +1452,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	int rv = KERN_SUCCESS;
 	int docow, error;
 	struct thread *td = curthread;
+	boolean_t writecounted;
 
 	if (size == 0)
 		return (0);
@@ -1470,6 +1491,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 			return (EINVAL);
 		fitit = FALSE;
 	}
+	writecounted = FALSE;
+
 	/*
 	 * Lookup/allocate object.
 	 */
@@ -1480,7 +1503,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		break;
 	case OBJT_VNODE:
 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object);
+		    handle, &foff, &object, &writecounted);
 		break;
 	case OBJT_SWAP:
 		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
@@ -1520,6 +1543,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	/* Shared memory is also shared with children. */
 	if (flags & MAP_SHARED)
 		docow |= MAP_INHERIT_SHARE;
+	if (writecounted)
+		docow |= MAP_VN_WRITECOUNT;
 
 	if (flags & MAP_STACK)
 		rv = vm_map_stack(map, *addr, size, prot, maxprot,
@@ -1537,7 +1562,12 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		 * Lose the object reference. Will destroy the
 		 * object if it's an unnamed anonymous mapping
 		 * or named anonymous without other references.
+		 *
+		 * If entry was accounted in the vnode writecount,
+		 * undo this now too.
 		 */
+		if (writecounted)
+			vnode_pager_release_writecount(object, 0, size);
 		vm_object_deallocate(object);
 	}
 
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 599df51..b59f7fe 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -112,6 +112,7 @@ struct vm_object {
 		 */
 		struct {
 			off_t vnp_size;
+			vm_ooffset_t writemappings;
 		} vnp;
 
 		/*
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 929fa4f..609205a 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -222,6 +222,7 @@ retry:
 		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
 
 		object->un_pager.vnp.vnp_size = size;
+		object->un_pager.vnp.writemappings = 0;
 
 		object->handle = handle;
 		VI_LOCK(vp);
@@ -268,10 +269,16 @@ vnode_pager_dealloc(object)
 		wakeup(object);
 	}
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
+	if (object->un_pager.vnp.writemappings > 0) {
+		object->un_pager.vnp.writemappings = 0;
+		vp->v_writecount--;
+	}
 	vp->v_object = NULL;
 	vp->v_vflag &= ~VV_TEXT;
+	VM_OBJECT_UNLOCK(object);
 	while (refs-- > 0)
 		vunref(vp);
+	VM_OBJECT_LOCK(object);
 }
 
 static boolean_t
@@ -1215,3 +1222,81 @@ vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written)
 	}
 	VM_OBJECT_UNLOCK(obj);
 }
+
+void
+vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end)
+{
+	struct vnode *vp;
+	vm_ooffset_t old_wm;
+
+	VM_OBJECT_LOCK(object);
+	if (object->type != OBJT_VNODE) {
+		VM_OBJECT_UNLOCK(object);
+		return;
+	}
+	old_wm = object->un_pager.vnp.writemappings;
+	object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
+	vp = object->handle;
+	if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
+		ASSERT_VOP_ELOCKED(vp, "v_writecount inc");
+		vp->v_writecount++;
+	} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
+		ASSERT_VOP_ELOCKED(vp, "v_writecount dec");
+		vp->v_writecount--;
+	}
+	VM_OBJECT_UNLOCK(object);
+}
+
+void
+vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end)
+{
+	struct vnode *vp;
+	struct mount *mp;
+	vm_offset_t inc;
+	int vfslocked;
+
+	VM_OBJECT_LOCK(object);
+
+	/*
+	 * First, recheck the object type to account for the race when
+	 * the vnode is reclaimed.
+	 */
+	if (object->type != OBJT_VNODE) {
+		VM_OBJECT_UNLOCK(object);
+		return;
+	}
+
+	/*
+	 * Optimize for the case when writemappings is not going to
+	 * zero.
+	 */
+	inc = end - start;
+	if (object->un_pager.vnp.writemappings != inc) {
+		object->un_pager.vnp.writemappings -= inc;
+		VM_OBJECT_UNLOCK(object);
+		return;
+	}
+
+	vp = object->handle;
+	vhold(vp);
+	VM_OBJECT_UNLOCK(object);
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	mp = NULL;
+	vn_start_write(vp, &mp, V_WAIT);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+	/*
+	 * Decrement the object's writemappings, by swapping the start
+	 * and end arguments for vnode_pager_update_writecount(). If
+	 * there was not a race with vnode reclaimation, then the
+	 * vnode's v_writecount is decremented.
+	 */
+	vnode_pager_update_writecount(object, end, start);
+	VOP_UNLOCK(vp, 0);
+	vdrop(vp);
+	if (mp != NULL)
+		vn_finished_write(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
+}
diff --git a/sys/vm/vnode_pager.h b/sys/vm/vnode_pager.h
index 5e3d5eb..995838d 100644
--- a/sys/vm/vnode_pager.h
+++ b/sys/vm/vnode_pager.h
@@ -50,7 +50,11 @@ int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m,
 	    int count, boolean_t sync,
 	    int *rtvals);
 
+void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end);
 void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written);
+void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end);
 
 #endif				/* _KERNEL */
 #endif				/* _VNODE_PAGER_ */
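The sketch below is not part of the patch; it is a minimal user-space model of the bookkeeping that vnode_pager_update_writecount() and vnode_pager_release_writecount() perform: each vnode-backed object keeps a byte count of writable shared mappings in writemappings, and the vnode's v_writecount is only bumped on the 0 -> nonzero transition and dropped on the nonzero -> 0 transition (the release path just swaps the start and end arguments). All names here are illustrative stand-ins, and the object locking, VFS_LOCK_GIANT handling, and vnode-reclamation race checks of the real code are omitted.

```c
/*
 * Minimal user-space model of the writemappings / v_writecount
 * bookkeeping from the patch above.  Names are illustrative only;
 * locking and vnode reclamation handling are omitted.
 */
#include <assert.h>
#include <stdio.h>

struct model_vnode {
	int v_writecount;		/* objects holding writable mappings */
};

struct model_object {
	struct model_vnode *vp;
	long long writemappings;	/* bytes of writable shared mappings */
};

/* Mirrors vnode_pager_update_writecount(): adjust by (end - start). */
static void
model_update_writecount(struct model_object *obj, long long start,
    long long end)
{
	long long old_wm;

	old_wm = obj->writemappings;
	obj->writemappings += end - start;
	if (old_wm == 0 && obj->writemappings != 0)
		obj->vp->v_writecount++;	/* 0 -> nonzero transition */
	else if (old_wm != 0 && obj->writemappings == 0)
		obj->vp->v_writecount--;	/* nonzero -> 0 transition */
}

/* Mirrors the swapped start/end trick in vnode_pager_release_writecount(). */
static void
model_release_writecount(struct model_object *obj, long long start,
    long long end)
{
	model_update_writecount(obj, end, start);
}

int
main(void)
{
	struct model_vnode vn = { 0 };
	struct model_object obj = { &vn, 0 };

	/* Two writable shared mappings of the same file. */
	model_update_writecount(&obj, 0, 4096);
	model_update_writecount(&obj, 8192, 16384);
	assert(vn.v_writecount == 1);	/* only the first mapping bumps it */

	/* Tear them down; v_writecount drops only with the last mapping. */
	model_release_writecount(&obj, 0, 4096);
	assert(vn.v_writecount == 1);
	model_release_writecount(&obj, 8192, 16384);
	assert(vn.v_writecount == 0);

	printf("v_writecount=%d writemappings=%lld\n",
	    vn.v_writecount, obj.writemappings);
	return (0);
}
```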