commit dc322ed2448d22707376d39e1eb2dd4fd4d1d6fd Author: Andriy Gapon Date: Fri Nov 16 12:11:45 2012 +0200 [experiment!!!] zfs: introduce zfs_freebsd_lock and zfs_freebsd_unlock... These vnode operations acquire and release z_teardown_lock in addition to the normal operations on the vnode lock. This is a half-step towards fixing lock order problems between VM page busying "locks" and ZFS internal locks. In particular, now z_teardown_lock is always acquired prior to page busying (the same as the vnode lock). Ideally I wanted z_teardown_lock to be acquired before the vnode lock, but this is not possible to implement using VOP_LOCK1 approach. Additional effect of this change is that now all ZFS vnode ops are guaranteed to be protected with z_teardown_lock. Previously some FreeBSD-specific operations accessed znodes and zfsvfs without calling ZFS_ENTER first. Consequentially, after this change ZFS_ENTER calls in the vnode ops has become redundant. They are still needed in ZFS vfs ops. This change should fix a deadlock between vn_pages_remove/vnode_pager_setsize called from zfs_rezget with z_teardown_lock held and putpages operation blocked on z_teardown_lock while entering zfs_write. This change has also made z_teardown_inactive_lock redundant. The only remaining LOR between the page busying and ZFS is with ZFS range locks. This should be fixed by introducing VOP_RANGELOCK to FreeBSD VFS, placing calls to this operation at strategic places and implementing the op for ZFS using the ZFS rangelock. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 21724ef..eff9890 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -1809,16 +1809,18 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } - +#ifdef sun rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); - +#endif /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { +#ifdef sun rw_exit(&zfsvfs->z_teardown_inactive_lock); +#endif rrw_exit(&zfsvfs->z_teardown_lock, FTAG); return (EIO); } @@ -1847,7 +1849,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rrw_exit(&zfsvfs->z_teardown_lock, FTAG); +#ifdef sun rw_exit(&zfsvfs->z_teardown_inactive_lock); +#endif } /* @@ -2196,8 +2200,9 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) int err; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); +#ifdef sun ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - +#endif err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, &zfsvfs->z_os); if (err) { @@ -2250,7 +2255,9 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) bail: /* release the VOPs */ +#ifdef sun rw_exit(&zfsvfs->z_teardown_inactive_lock); +#endif rrw_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index ade9c68..07ddeb5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -4671,13 +4671,11 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } @@ -4688,7 +4686,6 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) * Fast path to recycle a vnode of a removed file. */ mutex_exit(&zp->z_lock); - rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } @@ -4711,7 +4708,6 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) dmu_tx_commit(tx); } } - rw_exit(&zfsvfs->z_teardown_inactive_lock); } #ifdef sun @@ -6474,19 +6470,20 @@ zfs_freebsd_reclaim(ap) /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); - /* - * z_teardown_inactive_lock protects from a race with - * zfs_znode_dmu_fini in zfsvfs_teardown during - * force unmount. - */ - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); - rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; + + /* + * Drop z_teardown_lock acquired in zfs_freebsd_lock, + * because zfs_freebsd_unlock won't be called on this vnode. + */ + lockmgr_assert(vp->v_vnlock, KA_XLOCKED | KA_NOTRECURSED); + rrw_exit(&zfsvfs->z_teardown_lock, vp); + return (0); } @@ -7008,6 +7005,66 @@ zfs_freebsd_aclcheck(ap) return (EOPNOTSUPP); } +int +zfs_freebsd_lock(ap) + struct vop_lock1_args /* { + struct vnode *a_vp; + int a_flags; + char *file; + int line; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + zfsvfs_t *zfsvfs; + int error; + int op; + + op = ap->a_flags & LK_TYPE_MASK; + if (op == LK_UPGRADE) { + /* + * Failure to upgrade means losing shared lock. + * We drop z_teardown_lock now and re-acquire it + * if the upgrade is successful. + */ + ASSERT(VTOZ(vp) != NULL && VTOZ(vp)->z_zfsvfs != NULL); + zfsvfs = VTOZ(vp)->z_zfsvfs; + rrw_exit(&zfsvfs->z_teardown_lock, vp); + } + error = vop_stdlock(ap); + if (op == LK_DOWNGRADE) { + ASSERT(error == 0); + return (error); + } + ASSERT(op == LK_SHARED || op == LK_EXCLUSIVE || op == LK_UPGRADE); + if (error != 0 || (vp->v_iflag & VI_DOOMED) != 0) + return (error); + ASSERT(VTOZ(vp) != NULL && VTOZ(vp)->z_zfsvfs != NULL); + zfsvfs = VTOZ(vp)->z_zfsvfs; + + /* + * zfsvfs_teardown -> zil_close -> zil_commit -> + * zfs_get_data -> zfs_zget -> zfs_znode_alloc + */ + if (!RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)) + rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, vp); + return (0); +} + +int +zfs_freebsd_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; + + if (!RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)) + rrw_exit(&zfsvfs->z_teardown_lock, vp); + return (vop_stdunlock(ap)); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; @@ -7053,6 +7110,8 @@ struct vop_vector zfs_vnodeops = { .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, + .vop_lock1 = zfs_freebsd_lock, + .vop_unlock = zfs_freebsd_unlock, }; struct vop_vector zfs_fifoops = { @@ -7070,6 +7129,8 @@ struct vop_vector zfs_fifoops = { .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, + .vop_lock1 = zfs_freebsd_lock, + .vop_unlock = zfs_freebsd_unlock, }; /* @@ -7082,4 +7143,6 @@ struct vop_vector zfs_shareops = { .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, + .vop_lock1 = zfs_freebsd_lock, + .vop_unlock = zfs_freebsd_unlock, }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index c2ea9ef..76a4126 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -597,7 +597,7 @@ zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || zp->z_unlinked || - RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + RRW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL;