diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c
index 2e74844..887aab6 100644
--- a/sys/fs/unionfs/union_subr.c
+++ b/sys/fs/unionfs/union_subr.c
@@ -951,7 +951,9 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
 		vput(vp);
 		goto unionfs_vn_create_on_upper_free_out1;
 	}
+	VI_LOCK(vp);
 	vp->v_writecount++;
+	VI_UNLOCK(vp);
 	*vpp = vp;
 
 unionfs_vn_create_on_upper_free_out1:
@@ -1086,7 +1088,9 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
 		}
 	}
 	VOP_CLOSE(uvp, FWRITE, cred, td);
+	VI_LOCK(uvp);
 	uvp->v_writecount--;
+	VI_UNLOCK(uvp);
 
 	vn_finished_write(mp);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index ee383ee..f13fa04 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2245,9 +2245,18 @@ vputx(struct vnode *vp, int func)
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	VI_LOCK(vp);
 
-	/* Skip this v_writecount check if we're going to panic below. */
+#if 0
+	/*
+	 * Skip this v_writecount check if we're going to panic below.
+	 *
+	 * XXXKIB Disable the assert.  The v_writecount counts the number
+	 * of pages mapped writeable, so it can now become (much) bigger
+	 * than v_usecount.  Perhaps the useful assert is that
+	 * v_writecount == 0 when v_usecount goes to 0.
+	 */
 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
 	    vp, ("vputx: missed vn_close"));
+#endif
 	error = 0;
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
@@ -2496,6 +2505,19 @@ loop:
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
+			if (vp->v_object != NULL) {
+				VM_OBJECT_LOCK(vp->v_object);
+				vm_object_page_clean(vp->v_object, 0, 0, 0);
+				VM_OBJECT_UNLOCK(vp->v_object);
+			}
+			error = VOP_FSYNC(vp, MNT_WAIT, td);
+			if (error != 0) {
+				VOP_UNLOCK(vp, 0);
+				vdrop(vp);
+				MNT_VNODE_FOREACH_ABORT(mp, mvp);
+				return (error);
+			}
+
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 			VI_LOCK(vp);
@@ -2721,8 +2743,9 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
-	printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
-	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
+	printf(" usecount %d, writecount %jd, refcount %d mountedhere %p\n",
+	    vp->v_usecount, (uintmax_t)vp->v_writecount, vp->v_holdcnt,
+	    vp->v_mountedhere);
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_vflag & VV_ROOT)
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 31ad276..b13fd39 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -4599,16 +4599,22 @@ sys_fhopen(td, uap)
 	if (error)
 		goto bad;
 
-	if (fmode & FWRITE)
+	if (fmode & FWRITE) {
+		VI_LOCK(vp);
 		vp->v_writecount++;
+		VI_UNLOCK(vp);
+	}
 
 	/*
 	 * end of vn_open code
 	 */
 
 	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
-		if (fmode & FWRITE)
+		if (fmode & FWRITE) {
+			VI_LOCK(vp);
 			vp->v_writecount--;
+			VI_UNLOCK(vp);
+		}
 		goto bad;
 	}
 	/* An extra reference on `nfp' has been held for us by falloc(). */
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index e33592a..05471bd 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -245,8 +245,11 @@ restart:
 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
 		goto bad;
 
-	if (fmode & FWRITE)
+	if (fmode & FWRITE) {
+		VI_LOCK(vp);
 		vp->v_writecount++;
+		VI_UNLOCK(vp);
+	}
 	*flagp = fmode;
 	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
 	if (!mpsafe)
@@ -308,7 +311,9 @@ vn_close(vp, flags, file_cred, td)
 	if (flags & FWRITE) {
 		VNASSERT(vp->v_writecount > 0, vp,
 		    ("vn_close: negative writecount"));
+		VI_LOCK(vp);
 		vp->v_writecount--;
+		VI_UNLOCK(vp);
 	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 	vput(vp);
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 0ef4979..477f18e 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -149,9 +149,14 @@ struct vnode {
 	struct	lock *v_vnlock;		/* u pointer to vnode lock */
 	int	v_holdcnt;		/* i prevents recycling. */
 	int	v_usecount;		/* i ref count of users */
-	u_long	v_iflag;		/* i vnode flags (see below) */
-	u_long	v_vflag;		/* v vnode flags */
-	int	v_writecount;		/* v ref count of writers */
+	u_int	v_iflag;		/* i vnode flags (see below) */
+	u_int	v_vflag;		/* v vnode flags */
+	uint64_t v_writecount;		/* i/v ref count of writers */
+	/*
+	 * v_writecount is protected by the vnode interlock.  When
+	 * v_writecount is decremented from 1 to 0, or incremented
+	 * from 0 to 1, the vnode lock must be held.
+	 */
 
 	/*
 	 * The machinery of being a vnode
diff --git a/sys/ufs/ufs/ufs_extattr.c b/sys/ufs/ufs/ufs_extattr.c
index 777f385..5fb4184 100644
--- a/sys/ufs/ufs/ufs_extattr.c
+++ b/sys/ufs/ufs/ufs_extattr.c
@@ -334,7 +334,9 @@ ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp,
 		return (error);
 	}
 
+	VI_LOCK(vp);
 	vp->v_writecount++;
+	VI_UNLOCK(vp);
 
 	vref(vp);
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index d62576f..60c67b0 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -74,6 +74,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -475,11 +476,39 @@ vm_map_process_deferred(void)
 {
 	struct thread *td;
 	vm_map_entry_t entry;
+	vm_object_t object;
+	struct vnode *vp;
+	struct mount *mp;
+	int vfslocked;
 
 	td = curthread;
 	while ((entry = td->td_map_def_user) != NULL) {
 		td->td_map_def_user = entry->next;
+		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
+		    (object = entry->object.vm_object) != NULL) {
+			mp = NULL;
+			vfslocked = 0;
+			VM_OBJECT_LOCK(object);
+			if (object->type == OBJT_VNODE &&
+			    (entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
+				vp = object->handle;
+				vhold(vp);
+				VM_OBJECT_UNLOCK(object);
+				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+				vn_start_write(vp, &mp, V_WAIT);
+				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+				vm_object_update_writecount(object,
+				    entry->start - entry->end);
+				VOP_UNLOCK(vp, 0);
+				vdrop(vp);
+			} else
+				VM_OBJECT_UNLOCK(object);
+			if (mp != NULL)
+				vn_finished_write(mp);
+			VFS_UNLOCK_GIANT(vfslocked);
+		}
 		vm_map_entry_deallocate(entry, FALSE);
 	}
 }
@@ -1231,6 +1260,13 @@ charged:
 		    (prev_entry->protection == prot) &&
 		    (prev_entry->max_protection == max)) {
 			map->size += (end - prev_entry->end);
+			if ((prev_entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
+				ASSERT_VOP_ELOCKED(object->handle,
+				    "vm_map_insert inc v_writecount");
+				vm_object_update_writecount(
+				    prev_entry->object.vm_object,
+				    end - prev_entry->end);
+			}
 			prev_entry->end = end;
 			vm_map_entry_resize_free(map, prev_entry);
 			vm_map_simplify_entry(map, prev_entry);
@@ -1290,6 +1326,15 @@ charged:
 	 */
 	vm_map_entry_link(map, prev_entry, new_entry);
 	map->size += new_entry->end - new_entry->start;
+	if (object != NULL && object->type == OBJT_VNODE &&
+	    (new_entry->max_protection & VM_PROT_WRITE) != 0 &&
+	    (cow & MAP_COPY_ON_WRITE) == 0) {
+		ASSERT_VOP_ELOCKED(object->handle,
+		    "vm_map_insert inc v_writecount");
+		vm_object_update_writecount(object, new_entry->end -
+		    new_entry->start);
+		new_entry->eflags |= MAP_ENTRY_VN_WRITECNT;
+	}
 
 	/*
 	 * It may be possible to merge the new entry with the next and/or
@@ -1511,6 +1556,12 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
 			 * references.  Thus, the map lock can be kept
 			 * without causing a lock-order reversal with
 			 * the vnode lock.
+			 *
+			 * Since we count the number of virtual page
+			 * mappings in v_writecount, neither v_writecount
+			 * nor object->un_pager.vnp.writemappings should
+			 * be adjusted when the entry is disposed.
 			 */
 			if (prev->object.vm_object)
 				vm_object_deallocate(prev->object.vm_object);
@@ -1622,6 +1673,14 @@ _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		vm_object_reference(new_entry->object.vm_object);
+
+		/*
+		 * The v_writecount for the vnode referenced by the
+		 * object of a MAP_ENTRY_VN_WRITECNT entry is left
+		 * as is.  The virtual pages are re-distributed
+		 * among the clipped entries, so the sum is left
+		 * the same.
+		 */
 	}
 }
@@ -2895,6 +2954,7 @@ vm_map_copy_entry(
     vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
+	vm_map_entry_t fake_entry;
 	vm_offset_t size;
 	struct ucred *cred;
 	int charged;
@@ -2960,6 +3020,27 @@ vm_map_copy_entry(
 			src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->offset = src_entry->offset;
+			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
+				/*
+				 * MAP_ENTRY_VN_WRITECNT cannot
+				 * indicate a write reference from
+				 * src_entry, since the entry is
+				 * marked as needing a copy.
+				 * Allocate a fake entry that is used
+				 * to decrement v_writecount at the
+				 * appropriate time.  Attach
+				 * fake_entry to the deferred list.
+				 */
+				fake_entry = vm_map_entry_create(dst_map);
+				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
+				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
+				vm_object_reference(src_object);
+				fake_entry->object.vm_object = src_object;
+				fake_entry->start = src_entry->start;
+				fake_entry->end = src_entry->end;
+				fake_entry->next = curthread->td_map_def_user;
+				curthread->td_map_def_user = fake_entry;
+			}
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
@@ -3117,6 +3198,14 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wired_count = 0;
+			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
+				object = new_entry->object.vm_object;
+				KASSERT(((struct vnode *)object->handle)->
+				    v_writecount > 0,
+				    ("vmspace_fork: v_writecount"));
+				vm_object_update_writecount(object,
+				    new_entry->end - new_entry->start);
+			}
 
 			/*
 			 * Insert the entry into the new map -- we know we're
@@ -3141,8 +3230,11 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
+			/*
+			 * The copied entry is COW over the old object.
+			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
-			    MAP_ENTRY_IN_TRANSITION);
+			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
@@ -3156,9 +3248,13 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 		old_entry = old_entry->next;
 	}
 unlock_and_return:
-	vm_map_unlock(old_map);
+	/*
+	 * Inlined vm_map_unlock() so that the deferred entries are
+	 * processed only after both maps have been unlocked.
+	 */
+	sx_xunlock(&old_map->lock);
 	if (vm2 != NULL)
-		vm_map_unlock(new_map);
+		sx_xunlock(&new_map->lock);
+	vm_map_process_deferred();
 
 	return (vm2);
 }
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 5311e02..90ad5c3 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -139,6 +139,7 @@ struct vm_map_entry {
 #define	MAP_ENTRY_GROWS_UP	0x2000	/* Bottom-up stacks */
 
 #define	MAP_ENTRY_WIRE_SKIPPED	0x4000
+#define	MAP_ENTRY_VN_WRITECNT	0x8000	/* writeable vnode mapping */
 
 #ifdef	_KERNEL
 static __inline u_char
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index e85b681..f219a44 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -93,7 +93,8 @@ struct sbrk_args {
 #endif
 
 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
+    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, struct vnode **,
+    int *);
 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
     int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
@@ -1222,25 +1223,36 @@ sys_munlock(td, uap)
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on vnodes.
+ *
+ * For regular files, the function returns with *rvp locked, since
+ * vm_map_insert() must bump v_writecount for shared writable mappings.
+ *
+ * For VCHR vnodes, the lock is held over the call to vm_mmap_cdev()
+ * to keep vp->v_rdev valid.
 */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
+    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
+    struct vnode **rvp, int *vfslocked)
 {
 	struct vattr va;
 	vm_object_t obj;
 	vm_offset_t foff;
 	struct mount *mp;
 	struct ucred *cred;
-	int error, flags;
-	int vfslocked;
+	int error, flags, locktype;
 
 	mp = vp->v_mount;
 	cred = td->td_ucred;
-	vfslocked = VFS_LOCK_GIANT(mp);
-	if ((error = vget(vp, LK_SHARED, td)) != 0) {
-		VFS_UNLOCK_GIANT(vfslocked);
+
+	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
+		locktype = LK_EXCLUSIVE;
+	else
+		locktype = LK_SHARED;
+	*vfslocked = VFS_LOCK_GIANT(mp);
+	if ((error = vget(vp, locktype, td)) != 0) {
+		VFS_UNLOCK_GIANT(*vfslocked);
 		return (error);
 	}
 	foff = *foffp;
@@ -1257,11 +1269,15 @@ vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 		if (obj->handle != vp) {
 			vput(vp);
 			vp = (struct vnode*)obj->handle;
-			vget(vp, LK_SHARED, td);
+			vget(vp, locktype, td);
 		}
+		*rvp = vp;
 	} else if (vp->v_type == VCHR) {
 		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
 		    vp->v_rdev, foffp, objp);
+		vput(vp);
+		VFS_UNLOCK_GIANT(*vfslocked);
+		*rvp = NULL;
 		if (error == 0)
 			goto mark_atime;
 		goto done;
@@ -1305,8 +1321,11 @@ mark_atime:
 	vfs_mark_atime(vp, cred);
 
 done:
-	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
+	if (error != 0 && *rvp != NULL) {
+		vput(vp);
+		VFS_UNLOCK_GIANT(*vfslocked);
+		*rvp = NULL;
+	}
 
 	return (error);
 }
@@ -1430,8 +1449,9 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	boolean_t fitit;
 	vm_object_t object = NULL;
 	int rv = KERN_SUCCESS;
-	int docow, error;
+	int docow, error, vfslocked;
 	struct thread *td = curthread;
+	struct vnode *vp;
 
 	if (size == 0)
 		return (0);
@@ -1470,6 +1490,9 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 			return (EINVAL);
 		fitit = FALSE;
 	}
+	vfslocked = 0;
+	vp = NULL;
+
 	/*
 	 * Lookup/allocate object.
 	 */
@@ -1480,7 +1503,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		break;
 	case OBJT_VNODE:
 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object);
+		    handle, &foff, &object, &vp, &vfslocked);
 		break;
 	case OBJT_SWAP:
 		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
@@ -1529,6 +1552,10 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		rv = vm_map_fixed(map, object, foff, *addr, size,
 		    prot, maxprot, docow);
+	if (vp != NULL) {
+		vput(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
 
 	if (rv != KERN_SUCCESS) {
 		/*
 		 * Lose the object reference.  Will destroy the
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 1a8ce65..3450dde 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -2132,6 +2132,29 @@ vm_object_set_writeable_dirty(vm_object_t object)
 	vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 }
 
+void
+vm_object_update_writecount(vm_object_t object, vm_offset_t inc)
+{
+	struct vnode *vp;
+	u_long pages;
+
+	if (object->type != OBJT_VNODE)
+		return;
+	vp = object->handle;
+	pages = inc / PAGE_SIZE;
+	VM_OBJECT_LOCK(object);
+	object->un_pager.vnp.writemappings += pages;
+	VI_LOCK(vp);
+#ifdef DEBUG_VFS_LOCKS
+	if ((vp->v_writecount == 0 && pages > 0) ||
+	    (vp->v_writecount != 0 && vp->v_writecount + pages == 0))
+		ASSERT_VOP_ELOCKED(vp, "v_writecount");
+#endif
+	vp->v_writecount += pages;
+	VI_UNLOCK(vp);
+	VM_OBJECT_UNLOCK(object);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include 
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 599df51..315948f 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -112,6 +112,7 @@ struct vm_object {
 		 */
 		struct {
 			off_t vnp_size;
+			uint64_t writemappings;
 		} vnp;
 
 		/*
@@ -240,6 +241,7 @@ void vm_object_split(vm_map_entry_t);
 void vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t,
     boolean_t);
 void vm_object_madvise (vm_object_t, vm_pindex_t, int, int);
+void vm_object_update_writecount(vm_object_t object, vm_offset_t inc);
 #endif				/* _KERNEL */
 
 #endif				/* _VM_OBJECT_ */
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 929fa4f..b023aed 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -268,10 +268,16 @@ vnode_pager_dealloc(object)
 		wakeup(object);
 	}
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
+	VI_LOCK(vp);
+	vp->v_writecount -= object->un_pager.vnp.writemappings;
+	VI_UNLOCK(vp);
+	object->un_pager.vnp.writemappings = 0;
 	vp->v_object = NULL;
 	vp->v_vflag &= ~VV_TEXT;
+	VM_OBJECT_UNLOCK(object);
 	while (refs-- > 0)
 		vunref(vp);
+	VM_OBJECT_LOCK(object);
 }
 
 static boolean_t
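
Not part of the patch: a minimal user-space sketch of the accounting model the diff introduces, for anyone who wants to see the arithmetic in isolation. The names (fake_vnode, fake_object, writecount_update) are hypothetical stand-ins for struct vnode, the OBJT_VNODE vm_object and vm_object_update_writecount(); the real code does this under the vnode interlock and the VM object lock, and the kernel helper takes an unsigned vm_offset_t rather than the signed delta used here for simplicity. The point illustrated is that v_writecount now carries both open-for-write references and writeable-mapped pages (length / PAGE_SIZE), with the unmap side applied later from vm_map_process_deferred().

/*
 * Illustrative sketch only -- not part of the patch.  Builds as a
 * plain C program; all names here are hypothetical.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096

struct fake_vnode {
	int64_t	v_writecount;	/* open-for-write refs + writeable-mapped pages */
};

struct fake_object {
	int64_t	writemappings;	/* pages mapped writeable through this object */
	struct fake_vnode *vp;
};

/* Simplified analogue of vm_object_update_writecount(); delta is in bytes. */
static void
writecount_update(struct fake_object *obj, int64_t delta)
{
	int64_t pages = delta / PAGE_SIZE;

	obj->writemappings += pages;
	obj->vp->v_writecount += pages;
	assert(obj->vp->v_writecount >= 0);
}

int
main(void)
{
	struct fake_vnode vn = { .v_writecount = 1 };	/* one open(O_RDWR) */
	struct fake_object obj = { .writemappings = 0, .vp = &vn };

	/* mmap(MAP_SHARED, PROT_WRITE) of 64KB: the vm_map_insert() side. */
	writecount_update(&obj, 64 * 1024);
	printf("after map:   v_writecount=%jd\n", (intmax_t)vn.v_writecount);

	/* munmap(): vm_map_process_deferred() later passes the negative span. */
	writecount_update(&obj, -(64 * 1024));

	/* vn_close(): the plain open reference is dropped separately. */
	vn.v_writecount--;
	printf("after unmap: v_writecount=%jd\n", (intmax_t)vn.v_writecount);
	return (0);
}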