diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h index d3955d7..6281f8e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -207,8 +207,6 @@ typedef struct znode { list_node_t z_link_node; /* all znodes in fs link */ sa_handle_t *z_sa_hdl; /* handle to sa data */ boolean_t z_is_sa; /* are we native sa? */ - /* FreeBSD-specific field. */ - struct task z_task; } znode_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 90a6f7a..a204966 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -1867,18 +1867,6 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) zfsvfs->z_unmounted = B_TRUE; rrw_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); - -#ifdef __FreeBSD__ - /* - * Some znodes might not be fully reclaimed, wait for them. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - while (list_head(&zfsvfs->z_all_znodes) != NULL) { - msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0, - "zteardown", 0); - } - mutex_exit(&zfsvfs->z_znodes_lock); -#endif } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 3322b42..383f11b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -4576,10 +4576,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. 
*/ - VI_LOCK(vp); - ASSERT(vp->v_count <= 1); - vp->v_count = 0; - VI_UNLOCK(vp); + ASSERT0(vp->v_count); vrecycle(vp); rw_exit(&zfsvfs->z_teardown_inactive_lock); return; @@ -6206,28 +6203,6 @@ zfs_freebsd_inactive(ap) return (0); } -static void -zfs_reclaim_complete(void *arg, int pending) -{ - znode_t *zp = arg; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - if (zp->z_sa_hdl != NULL) { - ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - } - zfs_znode_free(zp); - rw_exit(&zfsvfs->z_teardown_inactive_lock); - /* - * If the file system is being unmounted, there is a process waiting - * for us, wake it up. - */ - if (zfsvfs->z_unmounted) - wakeup_one(zfsvfs); -} - static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { @@ -6238,9 +6213,7 @@ zfs_freebsd_reclaim(ap) vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - boolean_t rlocked; - - rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + boolean_t doinginact; ASSERT(zp != NULL); @@ -6249,42 +6222,40 @@ zfs_freebsd_reclaim(ap) */ vnode_destroy_vobject(vp); + VI_LOCK(vp); + doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; + VI_UNLOCK(vp); + mutex_enter(&zp->z_lock); zp->z_vnode = NULL; mutex_exit(&zp->z_lock); - if (zp->z_unlinked) { - ; /* Do nothing. */ - } else if (!rlocked) { - TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); - taskqueue_enqueue(taskqueue_thread, &zp->z_task); - } else if (zp->z_sa_hdl == NULL) { - zfs_znode_free(zp); - } else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ { - int locked; + if (doinginact) { + ASSERT(RW_READ_HELD(&zfsvfs->z_teardown_inactive_lock)); + ASSERT(zp->z_sa_hdl == NULL || zp->z_unlinked); - locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : - ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); - if (locked == 0) { - /* - * Lock can't be obtained due to deadlock possibility, - * so defer znode destruction. 
- */ - TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); - taskqueue_enqueue(taskqueue_thread, &zp->z_task); - } else { - zfs_znode_dmu_fini(zp); - if (locked == 1) - ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); + /* + * Note: the order of the checks below must match + * zfs_inactive/zfs_zinactive. + */ + if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); + else if (zp->z_unlinked) + ; /* Do nothing, zfs_zinactive handles this. */ + } else { + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl != NULL) { + ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); } + zfs_znode_free(zp); + rw_exit(&zfsvfs->z_teardown_inactive_lock); } - VI_LOCK(vp); + vp->v_data = NULL; ASSERT(vp->v_holdcnt >= 1); - VI_UNLOCK(vp); - if (rlocked) - rw_exit(&zfsvfs->z_teardown_inactive_lock); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index 3f50a91..cfef70c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -856,6 +856,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } } + getnewvnode_reserve(1); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); @@ -1042,6 +1043,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, KASSERT(err == 0, ("insmntque() failed: error %d", err)); } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + getnewvnode_drop_reserve(); } /* @@ -1146,18 +1148,22 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; - int err; + vnode_t *vp; sa_handle_t *hdl; - int first = 1; - - *zpp = NULL; + struct thread *td; + int locked; + int err; + td = curthread; + getnewvnode_reserve(1); again: + *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { 
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); return (err); } @@ -1168,6 +1174,7 @@ again: doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); return (EINVAL); } @@ -1175,7 +1182,6 @@ again: if (hdl != NULL) { zp = sa_get_userdata(hdl); - /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't @@ -1189,48 +1195,47 @@ again: if (zp->z_unlinked) { err = ENOENT; } else { - vnode_t *vp; - int dying = 0; - vp = ZTOV(zp); - if (vp == NULL) - dying = 1; - else { - VN_HOLD(vp); - if ((vp->v_iflag & VI_DOOMED) != 0) { - dying = 1; - /* - * Don't VN_RELE() vnode here, because - * it can call vn_lock() which creates - * LOR between vnode lock and znode - * lock. We will VN_RELE() the vnode - * after droping znode lock. - */ - } - } - if (dying) { - if (first) { - ZFS_LOG(1, "dying znode detected (zp=%p)", zp); - first = 0; - } - /* - * znode is dying so we can't reuse it, we must - * wait until destruction is completed. - */ - sa_buf_rele(db, NULL); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - if (vp != NULL) - VN_RELE(vp); - tsleep(zp, 0, "zcollide", 1); - goto again; - } *zpp = zp; err = 0; } sa_buf_rele(db, NULL); + + /* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */ + if (err == 0) + VI_LOCK(vp); + mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + if (err == 0) { + locked = VOP_ISLOCKED(vp); + if (locked != LK_EXCLUSIVE) + locked = LK_SHARED; + /* + * ZFS marks vnode lock as recursive, so no need for + * LK_CANRECURSE. + * LK_RETRY is because we want VI_DOOMED vnodes, so + * that we can decide what to do next. + */ + err = vget(vp, locked | LK_INTERLOCK | LK_RETRY, td); + ASSERT3S(err, ==, 0); + if ((vp->v_iflag & VI_DOOMED) != 0) { + /* + * Retry if this vnode is already recycled: just + * let it die - the znode is gone by now. 
+ * Otherwise, it is current thread that is + * recycling the vnode, so the znode should + * stay good for us. + */ + if (vp->v_data == NULL) { + vput(vp); + goto again; + } + } + VOP_UNLOCK(vp, 0); + } + getnewvnode_drop_reserve(); return (err); } @@ -1266,6 +1271,7 @@ again: } } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); return (err); } diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 24960fd..66e3201 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -154,6 +154,8 @@ userret(struct thread *td, struct trapframe *frame) ("userret: Returning with sleep disabled")); KASSERT(td->td_pinned == 0, ("userret: Returning with with pinned thread")); + KASSERT(td->td_vp_reserv == 0, + ("userret: Returning while holding vnode reservation")); #ifdef VIMAGE /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */ VNET_ASSERT(curvnet == NULL, diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 5c781c2..0de53f0 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -935,34 +935,22 @@ vtryrecycle(struct vnode *vp) } /* - * Return the next vnode from the free list. + * Wait for available vnodes. */ -int -getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, - struct vnode **vpp) +static int +getnewvnode_wait(int suspended) { - struct vnode *vp = NULL; - struct bufobj *bo; - CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); - mtx_lock(&vnode_free_list_mtx); - /* - * Lend our context to reclaim vnodes if they've exceeded the max. - */ - if (freevnodes > wantfreevnodes) - vnlru_free(1); - /* - * Wait for available vnodes. - */ + mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (numvnodes > desiredvnodes) { - if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) { + if (suspended) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. 
*/ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); - goto alloc; + return (0); } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ @@ -970,16 +958,75 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); -#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ - if (numvnodes > desiredvnodes) { - mtx_unlock(&vnode_free_list_mtx); - return (ENFILE); + } + return (numvnodes > desiredvnodes ? ENFILE : 0); +} + +void +getnewvnode_reserve(u_int count) +{ + struct thread *td; + + td = curthread; + mtx_lock(&vnode_free_list_mtx); + while (count > 0) { + if (getnewvnode_wait(0) == 0) { + count--; + td->td_vp_reserv++; + numvnodes++; } -#endif } -alloc: + mtx_unlock(&vnode_free_list_mtx); +} + +void +getnewvnode_drop_reserve(void) +{ + struct thread *td; + + td = curthread; + mtx_lock(&vnode_free_list_mtx); + KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large")); + numvnodes -= td->td_vp_reserv; + mtx_unlock(&vnode_free_list_mtx); + td->td_vp_reserv = 0; +} + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, + struct vnode **vpp) +{ + struct vnode *vp; + struct bufobj *bo; + struct thread *td; + int error; + + CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); + vp = NULL; + td = curthread; + if (td->td_vp_reserv > 0) { + td->td_vp_reserv -= 1; + goto alloc; + } + mtx_lock(&vnode_free_list_mtx); + /* + * Lend our context to reclaim vnodes if they've exceeded the max. + */ + if (freevnodes > wantfreevnodes) + vnlru_free(1); + error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & + MNTK_SUSPEND)); +#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. 
 */ + if (error != 0) { + mtx_unlock(&vnode_free_list_mtx); + return (error); + } +#endif numvnodes++; mtx_unlock(&vnode_free_list_mtx); +alloc: vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 4c4aa2f..0aae0ce 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -272,6 +272,7 @@ struct thread { struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ + u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 1bde8b9..029458f 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -600,6 +600,8 @@ void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); +void getnewvnode_reserve(u_int count); +void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp);