diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
index 15b1ecafb1f..0ba0338a984 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -154,6 +154,7 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
 		vput(vp);
 		return (error);
 	}
+	vn_seqc_write_begin(vp);
 	VOP_UNLOCK(vp);
 
 	/*
@@ -206,6 +207,7 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
+		vn_seqc_write_end(vp);
 		vput(vp);
 		vfs_unbusy(mp);
 		vfs_freeopts(mp->mnt_optnew);
@@ -241,6 +243,7 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
 	vfs_event_signal(NULL, VQ_MOUNT, 0);
 	if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
 		panic("mount: lost mount");
+	vn_seqc_write_end(vp);
 	VOP_UNLOCK(vp);
 	vfs_op_exit(mp);
 	vfs_unbusy(mp);
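The hunks above are the writer half of the new protocol: every transition a lockless lookup could observe half-done is bracketed with vn_seqc_write_begin()/vn_seqc_write_end(), error paths included. A minimal sketch of that discipline, outside the patch (illustrative only; publish_mount and the direct field store are stand-ins for the real mount plumbing):

    /*
     * Sketch: fence a lookup-visible transition with the vnode seqc.
     * A reader that samples the counter mid-update sees an "in modify"
     * value and falls back to the locked lookup path.
     */
    static void
    publish_mount(struct vnode *cover, struct mount *mp)
    {
        vn_seqc_write_begin(cover);     /* counter becomes "in modify" */
        cover->v_mountedhere = mp;      /* the transition itself */
        vn_seqc_write_end(cover);       /* counter settles at a new value */
    }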
diff --git a/sys/fs/tmpfs/tmpfs.h b/sys/fs/tmpfs/tmpfs.h
index f94992dfc81..2b0d1a7b18b 100644
--- a/sys/fs/tmpfs/tmpfs.h
+++ b/sys/fs/tmpfs/tmpfs.h
@@ -526,6 +526,9 @@ VP_TO_TMPFS_NODE(struct vnode *vp)
 	return (node);
 }
 
+#define VP_TO_TMPFS_NODE_SMR(vp) \
+	(struct tmpfs_node *)vn_load_v_data_smr(vp)
+
 static inline struct tmpfs_node *
 VP_TO_TMPFS_DIR(struct vnode *vp)
 {
diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c
index 06f5f989074..2ef4ae75adb 100644
--- a/sys/fs/tmpfs/tmpfs_subr.c
+++ b/sys/fs/tmpfs/tmpfs_subr.c
@@ -75,6 +75,7 @@ static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED;
 
 static uma_zone_t tmpfs_dirent_pool;
 static uma_zone_t tmpfs_node_pool;
+VFS_SMR_DECLARE;
 
 static int
 tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
@@ -131,6 +132,7 @@ tmpfs_subr_init(void)
 	tmpfs_node_pool = uma_zcreate("TMPFS node",
 	    sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor,
 	    tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0);
+	VFS_SMR_ZONE_SET(tmpfs_node_pool);
 }
 
 void
@@ -288,7 +290,7 @@ tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type,
 	if ((mp->mnt_kern_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
-	nnode = uma_zalloc_arg(tmpfs_node_pool, tmp, M_WAITOK);
+	nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK);
 
 	/* Generic initialization. */
 	nnode->tn_type = type;
@@ -435,7 +437,7 @@ tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node,
 		panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type);
 	}
 
-	uma_zfree(tmpfs_node_pool, node);
+	uma_zfree_smr(tmpfs_node_pool, node);
 	TMPFS_LOCK(tmp);
 	tmpfs_free_tmp(tmp);
 	return (true);
@@ -1619,10 +1621,11 @@ tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred,
 int
 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p)
 {
-	int error;
+	int error, newmode;
 	struct tmpfs_node *node;
 
 	ASSERT_VOP_ELOCKED(vp, "chmod");
+	ASSERT_VOP_IN_SEQC(vp);
 
 	node = VP_TO_TMPFS_NODE(vp);
 
@@ -1656,9 +1659,9 @@ tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p)
 		return (error);
 	}
 
-
-	node->tn_mode &= ~ALLPERMS;
-	node->tn_mode |= mode & ALLPERMS;
+	newmode = node->tn_mode & ~ALLPERMS;
+	newmode |= mode & ALLPERMS;
+	atomic_store_short(&node->tn_mode, newmode);
 
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
@@ -1684,6 +1687,7 @@ tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
 	gid_t ogid;
 
 	ASSERT_VOP_ELOCKED(vp, "chown");
+	ASSERT_VOP_IN_SEQC(vp);
 
 	node = VP_TO_TMPFS_NODE(vp);
 
@@ -1730,7 +1734,7 @@ tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
 	if ((node->tn_mode & (S_ISUID | S_ISGID)) &&
 	    (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID))
-			node->tn_mode &= ~(S_ISUID | S_ISGID);
+			atomic_store_short(&node->tn_mode, node->tn_mode & ~(S_ISUID | S_ISGID));
 	}
 
 	ASSERT_VOP_ELOCKED(vp, "chown2");
diff --git a/sys/fs/tmpfs/tmpfs_vfsops.c b/sys/fs/tmpfs/tmpfs_vfsops.c
index c36ec68f928..fee923e6132 100644
--- a/sys/fs/tmpfs/tmpfs_vfsops.c
+++ b/sys/fs/tmpfs/tmpfs_vfsops.c
@@ -462,6 +462,8 @@ tmpfs_mount(struct mount *mp)
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
 	    MNTK_TEXT_REFS | MNTK_NOMSYNC;
+	if (!nonc)
+		mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 	MNT_IUNLOCK(mp);
 	mp->mnt_data = tmp;
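tmpfs_chmod() and tmpfs_chown() above stop doing plain read-modify-write on tn_mode because tmpfs_fplookup_vexec (added below) now reads the field with no vnode lock held. A plain `tn_mode &= ...; tn_mode |= ...;` sequence is two separate stores, so a lockless reader could observe the intermediate value; computing into a local and publishing with a single atomic_store_short() closes that window. Schematically (a restatement of the hunks above, not new patch content):

    /* Before: two visible stores -- a lockless reader can see the gap. */
    node->tn_mode &= ~ALLPERMS;
    node->tn_mode |= mode & ALLPERMS;

    /* After: one atomic transition, paired with atomic_load_short() readers. */
    newmode = (node->tn_mode & ~ALLPERMS) | (mode & ALLPERMS);
    atomic_store_short(&node->tn_mode, newmode);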
diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c
index a5e21bd1d88..c2dc30fb6cf 100644
--- a/sys/fs/tmpfs/tmpfs_vnops.c
+++ b/sys/fs/tmpfs/tmpfs_vnops.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -64,6 +65,7 @@ __FBSDID("$FreeBSD$");
 #include
 
 SYSCTL_DECL(_vfs_tmpfs);
+VFS_SMR_DECLARE;
 
 static volatile int tmpfs_rename_restarts;
 SYSCTL_INT(_vfs_tmpfs, OID_AUTO, rename_restarts, CTLFLAG_RD,
@@ -317,6 +319,32 @@ tmpfs_close(struct vop_close_args *v)
 	return (0);
 }
 
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+int
+tmpfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+	struct vnode *vp;
+	struct tmpfs_node *node;
+	struct ucred *cred;
+	mode_t all_x, mode;
+
+	vp = v->a_vp;
+	node = VP_TO_TMPFS_NODE_SMR(vp);
+	if (__predict_false(node == NULL))
+		return (EAGAIN);
+
+	all_x = S_IXUSR | S_IXGRP | S_IXOTH;
+	mode = atomic_load_short(&node->tn_mode);
+	if (__predict_true((mode & all_x) == all_x))
+		return (0);
+
+	cred = v->a_cred;
+	return (vaccess_vexec_smr(mode, node->tn_uid, node->tn_gid, cred));
+}
+
 int
 tmpfs_access(struct vop_access_args *v)
 {
@@ -427,6 +455,7 @@ tmpfs_setattr(struct vop_setattr_args *v)
 	int error;
 
 	MPASS(VOP_ISLOCKED(vp));
+	ASSERT_VOP_IN_SEQC(vp);
 
 	error = 0;
 
@@ -806,12 +835,15 @@ tmpfs_rename(struct vop_rename_args *v)
 	struct tmpfs_node *tnode;
 	struct tmpfs_node *tdnode;
 	int error;
+	bool want_seqc_end;
 
 	MPASS(VOP_ISLOCKED(tdvp));
 	MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp)));
 	MPASS(fcnp->cn_flags & HASBUF);
 	MPASS(tcnp->cn_flags & HASBUF);
 
+	want_seqc_end = false;
+
 	/*
 	 * Disallow cross-device renames.
 	 * XXX Why isn't this done by the caller?
@@ -852,6 +884,13 @@ tmpfs_rename(struct vop_rename_args *v)
 		}
 	}
 
+	if (tvp != NULL)
+		vn_seqc_write_begin(tvp);
+	vn_seqc_write_begin(tdvp);
+	vn_seqc_write_begin(fvp);
+	vn_seqc_write_begin(fdvp);
+	want_seqc_end = true;
+
 	tmp = VFS_TO_TMPFS(tdvp->v_mount);
 	tdnode = VP_TO_TMPFS_DIR(tdvp);
 	tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp);
@@ -1065,6 +1104,14 @@ tmpfs_rename(struct vop_rename_args *v)
 		VOP_UNLOCK(fdvp);
 
 out:
+	if (want_seqc_end) {
+		if (tvp != NULL)
+			vn_seqc_write_end(tvp);
+		vn_seqc_write_end(tdvp);
+		vn_seqc_write_end(fvp);
+		vn_seqc_write_end(fdvp);
+	}
+
 	/*
 	 * Release target nodes.
 	 * XXX: I don't understand when tdvp can be the same as tvp, but
@@ -1621,6 +1668,7 @@ struct vop_vector tmpfs_vnodeop_entries = {
 	.vop_mknod = tmpfs_mknod,
 	.vop_open = tmpfs_open,
 	.vop_close = tmpfs_close,
+	.vop_fplookup_vexec = tmpfs_fplookup_vexec,
 	.vop_access = tmpfs_access,
 	.vop_getattr = tmpfs_getattr,
 	.vop_setattr = tmpfs_setattr,
diff --git a/sys/fs/tmpfs/tmpfs_vnops.h b/sys/fs/tmpfs/tmpfs_vnops.h
index 2f89e15629d..0fa9739c0bc 100644
--- a/sys/fs/tmpfs/tmpfs_vnops.h
+++ b/sys/fs/tmpfs/tmpfs_vnops.h
@@ -49,6 +49,7 @@ extern struct vop_vector tmpfs_vnodeop_entries;
 extern struct vop_vector tmpfs_vnodeop_nonc_entries;
 
 vop_access_t	tmpfs_access;
+vop_fplookup_vexec_t	tmpfs_fplookup_vexec;
 vop_getattr_t	tmpfs_getattr;
 vop_setattr_t	tmpfs_setattr;
 vop_pathconf_t	tmpfs_pathconf;
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index e2b57609c9f..1f422bbf57c 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -102,8 +102,8 @@ MALLOC_DECLARE(M_FADVISE);
 
 static __read_mostly uma_zone_t file_zone;
 static __read_mostly uma_zone_t filedesc0_zone;
-static __read_mostly uma_zone_t pwd_zone;
-static __read_mostly smr_t pwd_smr;
+__read_mostly uma_zone_t pwd_zone;
+VFS_SMR_DECLARE;
 
 static int closefp(struct filedesc *fdp, int fd, struct file *fp,
     struct thread *td, int holdleaders);
@@ -3343,18 +3343,27 @@ pwd_hold(struct thread *td)
 
 	fdp = td->td_proc->p_fd;
 
-	smr_enter(pwd_smr);
-	pwd = smr_entered_load(&fdp->fd_pwd, pwd_smr);
+	vfs_smr_enter();
+	pwd = vfs_smr_entered_load(&fdp->fd_pwd);
 	MPASS(pwd != NULL);
 	if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
-		smr_exit(pwd_smr);
+		vfs_smr_exit();
 		return (pwd);
 	}
-	smr_exit(pwd_smr);
+	vfs_smr_exit();
 	FILEDESC_SLOCK(fdp);
 	pwd = pwd_hold_filedesc(fdp);
 	MPASS(pwd != NULL);
-	FILEDESC_SUNLOCK(fdp);
+	return (pwd);
+}
+
+struct pwd *
+pwd_get_smr(void)
+{
+	struct pwd *pwd;
+
+	pwd = vfs_smr_entered_load(&curproc->p_fd->fd_pwd);
+	MPASS(pwd != NULL);
 	return (pwd);
 }
 
@@ -4368,7 +4377,11 @@ filelistinit(void *dummy)
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
-	pwd_smr = uma_zone_get_smr(pwd_zone);
+	/*
+	 * XXXMJG this is a temporary hack due to boot ordering issues against
+	 * the vnode zone.
+	 */
+	vfs_smr = uma_zone_get_smr(pwd_zone);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index c4945200ec4..16c493a1e39 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -67,6 +68,11 @@ __FBSDID("$FreeBSD$");
 #include
 #endif
 
+#include
+
+#include
+#include
+
 #ifdef DDB
 #include
 #endif
@@ -100,6 +106,10 @@ SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
     "char *");
 
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
+SDT_PROBE_DECLARE(vfs, namei, lookup, return);
+
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
@@ -2835,3 +2845,861 @@ DB_SHOW_COMMAND(vpath, db_show_vpath)
 }
 
 #endif
+
+extern uma_zone_t namei_zone;
+
+static bool __read_frequently cache_fast_lookup = true;
+SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
+    &cache_fast_lookup, 0, "");
+
+#define CACHE_FPL_FAILED	-2020
+
+static void
+cache_fpl_cleanup_cnp(struct componentname *cnp)
+{
+
+	uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+	cnp->cn_pnbuf = NULL;
+	cnp->cn_nameptr = NULL;
+#endif
+}
+
+static void
+cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
+{
+	struct componentname *cnp;
+
+	cnp = &ndp->ni_cnd;
+	while (*(cnp->cn_nameptr) == '/') {
+		cnp->cn_nameptr++;
+		ndp->ni_pathlen--;
+	}
+
+	*dpp = ndp->ni_rootdir;
+}
+
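pwd_get_smr() above returns the pwd without taking a reference, so the result is only stable while the caller stays inside the vfs_smr section (pwd_hold() shows the full dance when a long-lived reference is needed). The intended consumer-side pattern, as a sketch (illustrative, not part of the patch):

    vfs_smr_enter();
    pwd = pwd_get_smr();        /* unreferenced; valid only inside SMR */
    dvp = pwd->pwd_cdir;        /* may be read, must not be ref'd here */
    /* ... lockless work ... */
    vfs_smr_exit();             /* pwd and dvp may be freed past this point */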
+/*
+ * Components of nameidata (or objects it can point to) which may
+ * need restoring in case fast path lookup fails.
+ */
+struct nameidata_saved {
+	int cn_flags;
+	long cn_namelen;
+	char *cn_nameptr;
+	size_t ni_pathlen;
+};
+
+struct cache_fpl {
+	int line;
+	enum cache_fpl_status status;
+	bool in_smr;
+	struct nameidata *ndp;
+	struct nameidata_saved snd;
+	struct componentname *cnp;
+	struct vnode *dvp;
+	seqc_t dvp_seqc;
+	struct vnode *tvp;
+	seqc_t tvp_seqc;
+	struct pwd *pwd;
+};
+
+static void
+cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+
+	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
+	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
+	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
+	snd->ni_pathlen = fpl->ndp->ni_pathlen;
+}
+
+static void
+cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+
+	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
+	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
+	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
+	fpl->ndp->ni_pathlen = snd->ni_pathlen;
+}
+
+#ifdef INVARIANTS
+#define cache_fpl_smr_assert_entered(fpl) ({			\
+	struct cache_fpl *_fpl = (fpl);				\
+	MPASS(_fpl->in_smr == true);				\
+	VFS_SMR_ASSERT_ENTERED();				\
+})
+#define cache_fpl_smr_assert_not_entered(fpl) ({		\
+	struct cache_fpl *_fpl = (fpl);				\
+	MPASS(_fpl->in_smr == false);				\
+	VFS_SMR_ASSERT_NOT_ENTERED();				\
+})
+#else
+#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
+#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
+#endif
+
+#define cache_fpl_smr_enter(fpl) ({				\
+	struct cache_fpl *_fpl = (fpl);				\
+	MPASS(_fpl->in_smr == false);				\
+	vfs_smr_enter();					\
+	_fpl->in_smr = true;					\
+})
+
+#define cache_fpl_smr_exit(fpl) ({				\
+	struct cache_fpl *_fpl = (fpl);				\
+	MPASS(_fpl->in_smr == true);				\
+	vfs_smr_exit();						\
+	_fpl->in_smr = false;					\
+})
+
+static int
+cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
+{
+
+	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
+		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
+		    ("%s: converting to abort from %d at %d, set at %d\n",
+		    __func__, fpl->status, line, fpl->line));
+	}
+	fpl->status = CACHE_FPL_STATUS_ABORTED;
+	fpl->line = line;
+	return (CACHE_FPL_FAILED);
+}
+
+#define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
+
+static int
+cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
+{
+
+	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+	    ("%s: setting to partial at %d, but already set to %d at %d\n",
+	    __func__, line, fpl->status, fpl->line));
+	cache_fpl_smr_assert_entered(fpl);
+	fpl->status = CACHE_FPL_STATUS_PARTIAL;
+	fpl->line = line;
+	return (CACHE_FPL_FAILED);
+}
+
+#define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
+
+static int
+cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
+{
+
+	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+	    ("%s: setting to handled at %d, but already set to %d at %d\n",
+	    __func__, line, fpl->status, fpl->line));
+	cache_fpl_smr_assert_not_entered(fpl);
+	MPASS(error != CACHE_FPL_FAILED);
+	fpl->status = CACHE_FPL_STATUS_HANDLED;
+	fpl->line = line;
+	return (error);
+}
+
+#define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
+
+#define CACHE_FPL_SUPPORTED_CN_FLAGS \
+	(LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
+
+static bool
+cache_can_fplookup(struct cache_fpl *fpl)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+	struct thread *td;
+
+	ndp = fpl->ndp;
+	cnp = fpl->cnp;
+	td = cnp->cn_thread;
+
+	if (!cache_fast_lookup) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+#ifdef MAC
+	if (mac_vnode_check_lookup_enabled()) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+#endif
+	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+	if (cnp->cn_nameiop != LOOKUP) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+	if (ndp->ni_dirfd != AT_FDCWD) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+	if (IN_CAPABILITY_MODE(td)) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+	if (AUDITING_TD(td)) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+	if (ndp->ni_startdir != NULL) {
+		cache_fpl_aborted(fpl);
+		return (false);
+	}
+	return (true);
+}
+
+static bool
+cache_fplookup_vnode_supported(struct vnode *vp)
+{
+
+	return (vp->v_type != VLNK);
+}
+
+/*
+ * The target vnode is not supported, prepare for the slow path to take over.
+ */
+static int
+cache_fplookup_partial_setup(struct cache_fpl *fpl)
+{
+	struct componentname *cnp;
+	enum vgetstate dvs;
+	struct vnode *dvp;
+	struct pwd *pwd;
+	seqc_t dvp_seqc;
+
+	cnp = fpl->cnp;
+	dvp = fpl->dvp;
+	dvp_seqc = fpl->dvp_seqc;
+
+	dvs = vget_prep_smr(dvp);
+	if (dvs == VGET_NONE) {
+		cache_fpl_smr_exit(fpl);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	cache_fpl_smr_exit(fpl);
+
+	vget_finish_ref(dvp, dvs);
+	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
+		vrele(dvp);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	pwd = pwd_hold(curthread);
+	if (fpl->pwd != pwd) {
+		vrele(dvp);
+		pwd_drop(pwd);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	fpl->ndp->ni_startdir = dvp;
+	return (0);
+}
+
+static int
+cache_fplookup_final(struct cache_fpl *fpl)
+{
+	struct componentname *cnp;
+	enum vgetstate tvs;
+	struct vnode *dvp, *tvp;
+	seqc_t dvp_seqc, tvp_seqc;
+	int error;
+
+	cnp = fpl->cnp;
+	dvp = fpl->dvp;
+	dvp_seqc = fpl->dvp_seqc;
+	tvp = fpl->tvp;
+	tvp_seqc = fpl->tvp_seqc;
+
+	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
+
+	tvs = vget_prep_smr(tvp);
+	if (tvs == VGET_NONE) {
+		return (cache_fpl_partial(fpl));
+	}
+
+	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
+		cache_fpl_smr_exit(fpl);
+		vget_abort(tvp, tvs);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	cache_fpl_smr_exit(fpl);
+
+	if ((cnp->cn_flags & LOCKLEAF) != 0) {
+		error = vget_finish(tvp, cnp->cn_lkflags, tvs);
+		if (error != 0) {
+			return (cache_fpl_aborted(fpl));
+		}
+	} else {
+		vget_finish_ref(tvp, tvs);
+	}
+
+	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
+		if ((cnp->cn_flags & LOCKLEAF) != 0)
+			vput(tvp);
+		else
+			vrele(tvp);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	return (cache_fpl_handled(fpl, 0));
+}
+
+static int
+cache_fplookup_next(struct cache_fpl *fpl)
+{
+	struct componentname *cnp;
+	struct namecache *ncp;
+	struct negstate *negstate;
+	struct vnode *dvp, *tvp;
+	u_char nc_flag;
+	uint32_t hash;
+	bool neg_hot;
+
+	cnp = fpl->cnp;
+	dvp = fpl->dvp;
+
+	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
+		fpl->tvp = dvp;
+		fpl->tvp_seqc = vn_seqc_read_any(dvp);
+		if (seqc_in_modify(fpl->tvp_seqc)) {
+			return (cache_fpl_aborted(fpl));
+		}
+		return (0);
+	}
+
+	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+
+	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+		counter_u64_add(numchecks, 1);
+		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+			break;
+	}
+
+	/*
+	 * If there is no entry we have to punt to the slow path to perform
+	 * actual lookup. Should there be nothing with this name a negative
+	 * entry will be created.
+	 */
+	if (__predict_false(ncp == NULL)) {
+		return (cache_fpl_partial(fpl));
+	}
+
+	tvp = atomic_load_ptr(&ncp->nc_vp);
+	nc_flag = atomic_load_char(&ncp->nc_flag);
+	if ((nc_flag & NCF_NEGATIVE) != 0) {
+		negstate = NCP2NEGSTATE(ncp);
+		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
+		if (__predict_false(cache_ncp_invalid(ncp))) {
+			return (cache_fpl_partial(fpl));
+		}
+		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
+			return (cache_fpl_partial(fpl));
+		}
+		if (!neg_hot) {
+			/*
+			 * TODO
+			 * Promoting to hot negative requires locks, thus is
+			 * left not yet supported for simplicity.
+			 */
+			return (cache_fpl_partial(fpl));
+		}
+		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+		    ncp->nc_name);
+		counter_u64_add(numneghits, 1);
+		cache_fpl_smr_exit(fpl);
+		return (cache_fpl_handled(fpl, ENOENT));
+	}
+
+	if (__predict_false(cache_ncp_invalid(ncp))) {
+		return (cache_fpl_partial(fpl));
+	}
+
+	fpl->tvp = tvp;
+	fpl->tvp_seqc = vn_seqc_read_any(tvp);
+	if (seqc_in_modify(fpl->tvp_seqc)) {
+		return (cache_fpl_partial(fpl));
+	}
+
+	if (!cache_fplookup_vnode_supported(tvp)) {
+		return (cache_fpl_partial(fpl));
+	}
+
+	counter_u64_add(numposhits, 1);
+	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
+	return (0);
+}
+
+static bool
+cache_fplookup_mp_supported(struct mount *mp)
+{
+
+	if (mp == NULL)
+		return (false);
+	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
+		return (false);
+	if ((mp->mnt_flag & MNT_UNION) != 0)
+		return (false);
+	return (true);
+}
+
+/*
+ * Walk up the mount stack (if any).
+ *
+ * Correctness is provided in the following ways:
+ * - all vnodes are protected from freeing with SMR
+ * - struct mount objects are type stable making them always safe to access
+ * - stability of the particular mount is provided by busying it
+ * - relationship between the vnode which is mounted on and the mount is
+ *   verified with the vnode sequence counter after busying
+ * - association between root vnode of the mount and the mount is protected
+ *   by busy
+ *
+ * From that point on we can read the sequence counter of the root vnode
+ * and get the next mount on the stack (if any) using the same protection.
+ *
+ * By the end of successful walk we are guaranteed the reached state was
+ * indeed present at least at some point which matches the regular lookup.
+ */
+static int
+cache_fplookup_climb_mount(struct cache_fpl *fpl)
+{
+	struct mount *mp, *prev_mp;
+	struct vnode *vp;
+	seqc_t vp_seqc;
+
+	vp = fpl->tvp;
+	vp_seqc = fpl->tvp_seqc;
+	if (vp->v_type != VDIR)
+		return (0);
+
+	mp = atomic_load_ptr(&vp->v_mountedhere);
+	if (mp == NULL)
+		return (0);
+
+	prev_mp = NULL;
+	for (;;) {
+		if (!vfs_op_thread_enter(mp)) {
+			if (prev_mp != NULL)
+				vfs_op_thread_exit(prev_mp);
+			return (cache_fpl_partial(fpl));
+		}
+		if (prev_mp != NULL)
+			vfs_op_thread_exit(prev_mp);
+		if (!vn_seqc_consistent(vp, vp_seqc)) {
+			vfs_op_thread_exit(mp);
+			return (cache_fpl_partial(fpl));
+		}
+		if (!cache_fplookup_mp_supported(mp)) {
+			vfs_op_thread_exit(mp);
+			return (cache_fpl_partial(fpl));
+		}
+		vp = atomic_load_ptr(&mp->mnt_rootvnode);
+		if (vp == NULL || VN_IS_DOOMED(vp)) {
+			vfs_op_thread_exit(mp);
+			return (cache_fpl_partial(fpl));
+		}
+		vp_seqc = vn_seqc_read_any(vp);
+		if (seqc_in_modify(vp_seqc)) {
+			vfs_op_thread_exit(mp);
+			return (cache_fpl_partial(fpl));
+		}
+		prev_mp = mp;
+		mp = atomic_load_ptr(&vp->v_mountedhere);
+		if (mp == NULL)
+			break;
+	}
+
+	vfs_op_thread_exit(prev_mp);
+	fpl->tvp = vp;
+	fpl->tvp_seqc = vp_seqc;
+	return (0);
+}
+
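The climb loop above leans on a strict validation order; any failed step simply punts to the slow path. Compressed to its skeleton (a non-compilable sketch of one iteration, error handling elided):

    vfs_op_thread_enter(mp);                  /* 1. pin the mount (cheap busy)     */
    vn_seqc_consistent(vp, vp_seqc);          /* 2. vp -> mp link still in place?  */
    cache_fplookup_mp_supported(mp);          /* 3. fs still opted in?             */
    vp = atomic_load_ptr(&mp->mnt_rootvnode); /* 4. root stable while mp is busied */
    vp_seqc = vn_seqc_read_any(vp);           /* 5. snapshot for the next round    */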
+/*
+ * Parse the path.
+ *
+ * The code is mostly copy-pasted from regular lookup, see lookup().
+ * The structure is maintained along with comments for easier maintenance.
+ * Deduplicating the code will become feasible after fast path lookup
+ * becomes more feature-complete.
+ */
+static int
+cache_fplookup_parse(struct cache_fpl *fpl)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+	char *cp;
+	char *prev_ni_next;		/* saved ndp->ni_next */
+	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */
+
+	ndp = fpl->ndp;
+	cnp = fpl->cnp;
+
+	/*
+	 * Search a new directory.
+	 *
+	 * The last component of the filename is left accessible via
+	 * cnp->cn_nameptr for callers that need the name. Callers needing
+	 * the name set the SAVENAME flag. When done, they assume
+	 * responsibility for freeing the pathname buffer.
+	 */
+	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+		continue;
+	cnp->cn_namelen = cp - cnp->cn_nameptr;
+	if (cnp->cn_namelen > NAME_MAX) {
+		cache_fpl_smr_exit(fpl);
+		return (cache_fpl_handled(fpl, ENAMETOOLONG));
+	}
+	prev_ni_pathlen = ndp->ni_pathlen;
+	ndp->ni_pathlen -= cnp->cn_namelen;
+	KASSERT(ndp->ni_pathlen <= PATH_MAX,
+	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
+	prev_ni_next = ndp->ni_next;
+	ndp->ni_next = cp;
+
+	/*
+	 * Replace multiple slashes by a single slash and trailing slashes
+	 * by a null. This must be done before VOP_LOOKUP() because some
+	 * fs's don't know about trailing slashes. Remember if there were
+	 * trailing slashes to handle symlinks, existing non-directories
+	 * and non-existing files that won't be directories specially later.
+	 */
+	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+		cp++;
+		ndp->ni_pathlen--;
+		if (*cp == '\0') {
+			/*
+			 * TODO
+			 * Regular lookup performs the following:
+			 * *ndp->ni_next = '\0';
+			 * cnp->cn_flags |= TRAILINGSLASH;
+			 *
+			 * Which is problematic since it modifies data read
+			 * from userspace. Then if fast path lookup was to
+			 * abort we would have to either restore it or convey
+			 * the flag. Since this is a corner case just ignore
+			 * it for simplicity.
+			 */
+			return (cache_fpl_partial(fpl));
+		}
+	}
+	ndp->ni_next = cp;
+
+	cnp->cn_flags |= MAKEENTRY;
+
+	if (cnp->cn_namelen == 2 &&
+	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+		cnp->cn_flags |= ISDOTDOT;
+	else
+		cnp->cn_flags &= ~ISDOTDOT;
+	if (*ndp->ni_next == 0)
+		cnp->cn_flags |= ISLASTCN;
+	else
+		cnp->cn_flags &= ~ISLASTCN;
+
+	/*
+	 * Check for degenerate name (e.g. / or "")
+	 * which is a way of talking about a directory,
+	 * e.g. like "/." or ".".
+	 *
+	 * TODO
+	 * Another corner case handled by the regular lookup
+	 */
+	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
+		return (cache_fpl_partial(fpl));
+	}
+	return (0);
+}
+
+static void
+cache_fplookup_parse_advance(struct cache_fpl *fpl)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+
+	ndp = fpl->ndp;
+	cnp = fpl->cnp;
+
+	cnp->cn_nameptr = ndp->ni_next;
+	while (*cnp->cn_nameptr == '/') {
+		cnp->cn_nameptr++;
+		ndp->ni_pathlen--;
+	}
+}
+
+static int
+cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+	struct mount *mp;
+	int error;
+
+	error = CACHE_FPL_FAILED;
+	ndp = fpl->ndp;
+	ndp->ni_lcf = 0;
+	cnp = fpl->cnp;
+	cnp->cn_lkflags = LK_SHARED;
+	if ((cnp->cn_flags & LOCKSHARED) == 0)
+		cnp->cn_lkflags = LK_EXCLUSIVE;
+
+	cache_fpl_checkpoint(fpl, &fpl->snd);
+
+	fpl->dvp = dvp;
+	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
+	if (seqc_in_modify(fpl->dvp_seqc)) {
+		cache_fpl_aborted(fpl);
+		goto out;
+	}
+	mp = atomic_load_ptr(&fpl->dvp->v_mount);
+	if (!cache_fplookup_mp_supported(mp)) {
+		cache_fpl_aborted(fpl);
+		goto out;
+	}
+
+	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+	for (;;) {
+		error = cache_fplookup_parse(fpl);
+		if (__predict_false(error != 0)) {
+			break;
+		}
+
+		if (cnp->cn_flags & ISDOTDOT) {
+			error = cache_fpl_partial(fpl);
+			break;
+		}
+
+		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
+		if (__predict_false(error != 0)) {
+			switch (error) {
+			case EAGAIN:
+			case EOPNOTSUPP: /* can happen when racing against vgone */
+				cache_fpl_partial(fpl);
+				break;
+			default:
+				/*
+				 * See the API contract for VOP_FPLOOKUP_VEXEC.
+				 */
+				if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
+					error = cache_fpl_aborted(fpl);
+				} else {
+					cache_fpl_smr_exit(fpl);
+					cache_fpl_handled(fpl, error);
+				}
+				break;
+			}
+			break;
+		}
+
+		error = cache_fplookup_next(fpl);
+		if (__predict_false(error != 0)) {
+			break;
+		}
+
+		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+		error = cache_fplookup_climb_mount(fpl);
+		if (__predict_false(error != 0)) {
+			break;
+		}
+
+		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+		if (cnp->cn_flags & ISLASTCN) {
+			error = cache_fplookup_final(fpl);
+			break;
+		}
+
+		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
+			error = cache_fpl_aborted(fpl);
+			break;
+		}
+
+		fpl->dvp = fpl->tvp;
+		fpl->dvp_seqc = fpl->tvp_seqc;
+
+		cache_fplookup_parse_advance(fpl);
+		cache_fpl_checkpoint(fpl, &fpl->snd);
+	}
+out:
+	switch (fpl->status) {
+	case CACHE_FPL_STATUS_UNSET:
+		__assert_unreachable();
+		break;
+	case CACHE_FPL_STATUS_PARTIAL:
+		cache_fpl_smr_assert_entered(fpl);
+		return (cache_fplookup_partial_setup(fpl));
+	case CACHE_FPL_STATUS_ABORTED:
+		if (fpl->in_smr)
+			cache_fpl_smr_exit(fpl);
+		return (CACHE_FPL_FAILED);
+	case CACHE_FPL_STATUS_HANDLED:
+		cache_fpl_smr_assert_not_entered(fpl);
+		if (__predict_false(error != 0)) {
+			ndp->ni_dvp = NULL;
+			ndp->ni_vp = NULL;
+			cache_fpl_cleanup_cnp(cnp);
+			return (error);
+		}
+		ndp->ni_dvp = fpl->dvp;
+		ndp->ni_vp = fpl->tvp;
+		if (cnp->cn_flags & SAVENAME)
+			cnp->cn_flags |= HASBUF;
+		else
+			cache_fpl_cleanup_cnp(cnp);
+		return (error);
+	}
+}
+
+/*
+ * Fast path lookup protected with SMR and sequence counters.
+ *
+ * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
+ *
+ * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
+ * outlined below.
+ *
+ * Traditional vnode lookup conceptually looks like this:
+ *
+ * vn_lock(current);
+ * for (;;) {
+ *	next = find();
+ *	vn_lock(next);
+ *	vn_unlock(current);
+ *	current = next;
+ *	if (last)
+ *		break;
+ * }
+ * return (current);
+ *
+ * Each jump to the next vnode is safe memory-wise and atomic with respect to
+ * any modifications thanks to holding respective locks.
+ *
+ * The same guarantee can be provided with a combination of safe memory
+ * reclamation and sequence counters instead. If all operations which affect
+ * the relationship between the current vnode and the one we are looking for
+ * also modify the counter, we can verify whether all the conditions held as
+ * we made the jump. This includes things like permissions, mount points etc.
+ * Counter modification is provided by enclosing relevant places in
+ * vn_seqc_write_begin()/end() calls.
+ *
+ * Thus this translates to:
+ *
+ * vfs_smr_enter();
+ * dvp_seqc = seqc_read_any(dvp);
+ * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
+ *	abort();
+ * for (;;) {
+ *	tvp = find();
+ *	tvp_seqc = seqc_read_any(tvp);
+ *	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
+ *		abort();
+ *	if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
+ *		abort();
+ *	dvp = tvp; // we know nothing of importance has changed
+ *	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
+ *	if (last)
+ *		break;
+ * }
+ * vget(); // secure the vnode
+ * if (!seqc_consistent(tvp, tvp_seqc) // final check
+ *	abort();
+ * // at this point we know nothing has changed for any parent<->child pair
+ * // as they were crossed during the lookup, meaning we matched the guarantee
+ * // of the locked variant
+ * return (tvp);
+ *
+ * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
+ * - they are called while within vfs_smr protection which they must never exit
+ * - EAGAIN can be returned to denote checking could not be performed, it is
+ *   always valid to return it
+ * - if the sequence counter has not changed the result must be valid
+ * - if the sequence counter has changed both false positives and false negatives
+ *   are permitted (since the result will be rejected later)
+ * - for simple cases of unix permission checks vaccess_vexec_smr can be used
+ *
+ * Caveats to watch out for:
+ * - vnodes are passed unlocked and unreferenced with nothing stopping
+ *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
+ *   to use atomic_load_ptr to fetch it.
+ * - the aforementioned object can also get freed, meaning absent other means it
+ *   should be protected with vfs_smr
+ * - either safely checking permissions as they are modified or guaranteeing
+ *   their stability is left to the routine
+ */
+int
+cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
+    struct pwd **pwdp)
+{
+	struct cache_fpl fpl;
+	struct pwd *pwd;
+	struct vnode *dvp;
+	struct componentname *cnp;
+	struct nameidata_saved orig;
+	int error;
+
+	*status = CACHE_FPL_STATUS_UNSET;
+	bzero(&fpl, sizeof(fpl));
+	fpl.status = CACHE_FPL_STATUS_UNSET;
+	fpl.ndp = ndp;
+	fpl.cnp = &ndp->ni_cnd;
+	MPASS(curthread == fpl.cnp->cn_thread);
+
+	if (!cache_can_fplookup(&fpl)) {
+		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+		*status = fpl.status;
+		return (EOPNOTSUPP);
+	}
+
+	cache_fpl_checkpoint(&fpl, &orig);
+
+	cache_fpl_smr_enter(&fpl);
+	pwd = pwd_get_smr();
+	fpl.pwd = pwd;
+	ndp->ni_rootdir = pwd->pwd_rdir;
+	ndp->ni_topdir = pwd->pwd_jdir;
+
+	cnp = fpl.cnp;
+	cnp->cn_nameptr = cnp->cn_pnbuf;
+	if (cnp->cn_pnbuf[0] == '/') {
+		cache_fpl_handle_root(ndp, &dvp);
+	} else {
+		MPASS(ndp->ni_dirfd == AT_FDCWD);
+		dvp = pwd->pwd_cdir;
+	}
+
+	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
+
+	error = cache_fplookup_impl(dvp, &fpl);
+	cache_fpl_smr_assert_not_entered(&fpl);
+	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+
+	*status = fpl.status;
+	switch (fpl.status) {
+	case CACHE_FPL_STATUS_UNSET:
+		__assert_unreachable();
+		break;
+	case CACHE_FPL_STATUS_HANDLED:
+		SDT_PROBE3(vfs, namei, lookup, return, error,
+		    (error == 0 ? ndp->ni_vp : NULL), true);
+		break;
+	case CACHE_FPL_STATUS_PARTIAL:
+		*pwdp = fpl.pwd;
+		cache_fpl_restore(&fpl, &fpl.snd);
+		break;
+	case CACHE_FPL_STATUS_ABORTED:
+		cache_fpl_restore(&fpl, &orig);
+		break;
+	}
+	return (error);
+}
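The comment above already gives the algorithm in pseudocode; the heart of it is the seqc read/validate protocol. The following is a self-contained userspace toy model of that protocol (an assumption-laden simplification: the kernel's seqc.h uses per-CPU critical sections and explicit fences, none of which are modeled here), compilable with any C11 compiler:

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef _Atomic uint32_t seqc_t;

    /* An odd snapshot means a writer is mid-update. */
    static bool seqc_in_modify(uint32_t s) { return ((s & 1) != 0); }
    static uint32_t seqc_read_any(seqc_t *sp) { return (atomic_load(sp)); }
    static bool seqc_consistent(seqc_t *sp, uint32_t old)
    {
        return (seqc_read_any(sp) == old);
    }
    static void seqc_write_begin(seqc_t *sp) { atomic_fetch_add(sp, 1); } /* -> odd */
    static void seqc_write_end(seqc_t *sp) { atomic_fetch_add(sp, 1); }  /* -> even */

    int
    main(void)
    {
        seqc_t seqc = 0;
        uint32_t snap;

        snap = seqc_read_any(&seqc);            /* reader snapshots while idle */
        assert(!seqc_in_modify(snap));
        assert(seqc_consistent(&seqc, snap));   /* nothing changed: snapshot holds */

        seqc_write_begin(&seqc);                /* writer starts a transition */
        assert(seqc_in_modify(seqc_read_any(&seqc))); /* new readers bail out */
        seqc_write_end(&seqc);                  /* transition complete */

        assert(!seqc_consistent(&seqc, snap));  /* the old snapshot is rejected */
        return (0);
    }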
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index fa58f7576ce..13e1d6ddce9 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -71,9 +71,9 @@ __FBSDID("$FreeBSD$");
 #undef NAMEI_DIAGNOSTIC
 
 SDT_PROVIDER_DECLARE(vfs);
-SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
-    "unsigned long");
-SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
+SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
+    "unsigned long", "bool");
+SDT_PROBE_DEFINE3(vfs, namei, lookup, return, "int", "struct vnode *", "bool");
 
 /* Allocation zone for namei. */
 uma_zone_t namei_zone;
@@ -280,77 +280,21 @@ namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
 	return (0);
 }
 
-/*
- * Convert a pathname into a pointer to a locked vnode.
- *
- * The FOLLOW flag is set when symbolic links are to be followed
- * when they occur at the end of the name translation process.
- * Symbolic links are always followed for all other pathname
- * components other than the last.
- *
- * The segflg defines whether the name is to be copied from user
- * space or kernel space.
- *
- * Overall outline of namei:
- *
- *	copy in name
- *	get starting directory
- *	while (!done && !error) {
- *		call lookup to search path.
- *		if symbolic link, massage name in buffer and continue
- *	}
- */
-int
-namei(struct nameidata *ndp)
+static int
+namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
 {
-	char *cp;			/* pointer into pathname argument */
-	struct vnode *dp;		/* the directory we are searching */
-	struct iovec aiov;		/* uio for reading symbolic links */
 	struct componentname *cnp;
 	struct file *dfp;
 	struct thread *td;
-	struct proc *p;
 	struct pwd *pwd;
 	cap_rights_t rights;
 	struct filecaps dirfd_caps;
-	struct uio auio;
-	int error, linklen, startdir_used;
+	int error, startdir_used;
 
 	cnp = &ndp->ni_cnd;
 	td = cnp->cn_thread;
-	p = td->td_proc;
-	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
-	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
-	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
-	    ("namei: nameiop contaminated with flags"));
-	KASSERT((cnp->cn_flags & OPMASK) == 0,
-	    ("namei: flags contaminated with nameiops"));
-	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
-	    ndp->ni_startdir->v_type == VBAD);
-	TAILQ_INIT(&ndp->ni_cap_tracker);
-	ndp->ni_lcf = 0;
-
-	/* We will set this ourselves if we need it. */
-	cnp->cn_flags &= ~TRAILINGSLASH;
-
-	/*
-	 * Get a buffer for the name to be translated, and copy the
-	 * name into the buffer.
-	 */
-	if ((cnp->cn_flags & HASBUF) == 0)
-		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
-	if (ndp->ni_segflg == UIO_SYSSPACE)
-		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
-		    &ndp->ni_pathlen);
-	else
-		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
-		    &ndp->ni_pathlen);
-	/*
-	 * Don't allow empty pathnames.
-	 */
-	if (error == 0 && *cnp->cn_pnbuf == '\0')
-		error = ENOENT;
+	*pwdp = NULL;
 
 #ifdef CAPABILITY_MODE
 	/*
@@ -366,31 +310,19 @@ namei(struct nameidata *ndp)
 	 * previously walked by us, which prevents an escape from
 	 * the relative root.
 	 */
-	if (error == 0 && IN_CAPABILITY_MODE(td) &&
-	    (cnp->cn_flags & NOCAPCHECK) == 0) {
+	if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
 		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 		if (ndp->ni_dirfd == AT_FDCWD) {
 #ifdef KTRACE
 			if (KTRPOINT(td, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
-			error = ECAPMODE;
+			return (ECAPMODE);
 		}
 	}
 #endif
-	if (error != 0) {
-		namei_cleanup_cnp(cnp);
-		ndp->ni_vp = NULL;
-		return (error);
-	}
-	ndp->ni_loopcnt = 0;
-#ifdef KTRACE
-	if (KTRPOINT(td, KTR_NAMEI)) {
-		KASSERT(cnp->cn_thread == curthread,
-		    ("namei not using curthread"));
-		ktrnamei(cnp->cn_pnbuf);
-	}
-#endif
+	error = 0;
+
 	/*
 	 * Get starting point for the translation.
 	 */
@@ -402,19 +334,16 @@ namei(struct nameidata *ndp)
 	ndp->ni_rootdir = pwd->pwd_rdir;
 	ndp->ni_topdir = pwd->pwd_jdir;
 
-	startdir_used = 0;
-	dp = NULL;
-	cnp->cn_nameptr = cnp->cn_pnbuf;
 	if (cnp->cn_pnbuf[0] == '/') {
 		ndp->ni_resflags |= NIRES_ABS;
-		error = namei_handle_root(ndp, &dp);
+		error = namei_handle_root(ndp, dpp);
 	} else {
 		if (ndp->ni_startdir != NULL) {
-			dp = ndp->ni_startdir;
+			*dpp = ndp->ni_startdir;
 			startdir_used = 1;
 		} else if (ndp->ni_dirfd == AT_FDCWD) {
-			dp = pwd->pwd_cdir;
-			vrefact(dp);
+			*dpp = pwd->pwd_cdir;
+			vrefact(*dpp);
 		} else {
 			rights = ndp->ni_rightsneeded;
 			cap_rights_set_one(&rights, CAP_LOOKUP);
@@ -441,8 +370,8 @@ namei(struct nameidata *ndp)
 			} else if (dfp->f_vnode == NULL) {
 				error = ENOTDIR;
 			} else {
-				dp = dfp->f_vnode;
-				vrefact(dp);
+				*dpp = dfp->f_vnode;
+				vrefact(*dpp);
 
 				if ((dfp->f_flag & FSEARCH) != 0)
 					cnp->cn_flags |= NOEXECCHECK;
@@ -464,7 +393,7 @@ namei(struct nameidata *ndp)
 			}
 #endif
 		}
-		if (error == 0 && dp->v_type != VDIR)
+		if (error == 0 && (*dpp)->v_type != VDIR)
 			error = ENOTDIR;
 	}
 	if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
@@ -476,7 +405,7 @@ namei(struct nameidata *ndp)
 		cap_rights_set_one(&rights, CAP_LOOKUP);
 		error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
 		    &dirfd_caps, &ndp->ni_beneath_latch);
-		if (error == 0 && dp->v_type != VDIR) {
+		if (error == 0 && (*dpp)->v_type != VDIR) {
 			vrele(ndp->ni_beneath_latch);
 			error = ENOTDIR;
 		}
@@ -488,15 +417,15 @@ namei(struct nameidata *ndp)
 	 * If we are auditing the kernel pathname, save the user pathname.
 	 */
 	if (cnp->cn_flags & AUDITVNODE1)
-		AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+		AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
 	if (cnp->cn_flags & AUDITVNODE2)
-		AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+		AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
 	if (ndp->ni_startdir != NULL && !startdir_used)
 		vrele(ndp->ni_startdir);
 	if (error != 0) {
-		if (dp != NULL)
-			vrele(dp);
-		goto out;
+		if (*dpp != NULL)
+			vrele(*dpp);
+		return (error);
 	}
 	MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
 	    NI_LCF_BENEATH_ABS);
@@ -505,8 +434,130 @@ namei(struct nameidata *ndp)
 	    ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
 	    (cnp->cn_flags & BENEATH) != 0))
 		ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
-	SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
-	    cnp->cn_flags);
+	SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
+	    cnp->cn_flags, false);
+	*pwdp = pwd;
+	return (0);
+}
+
+/*
+ * Convert a pathname into a pointer to a locked vnode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all other pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ *	copy in name
+ *	get starting directory
+ *	while (!done && !error) {
+ *		call lookup to search path.
+ *		if symbolic link, massage name in buffer and continue
+ *	}
+ */
+int
+namei(struct nameidata *ndp)
+{
+	char *cp;			/* pointer into pathname argument */
+	struct vnode *dp;		/* the directory we are searching */
+	struct iovec aiov;		/* uio for reading symbolic links */
+	struct componentname *cnp;
+	struct thread *td;
+	struct proc *p;
+	struct pwd *pwd;
+	struct uio auio;
+	int error, linklen;
+	enum cache_fpl_status status;
+
+	cnp = &ndp->ni_cnd;
+	td = cnp->cn_thread;
+	p = td->td_proc;
+	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+	    ("namei: nameiop contaminated with flags"));
+	KASSERT((cnp->cn_flags & OPMASK) == 0,
+	    ("namei: flags contaminated with nameiops"));
+	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
+	    ndp->ni_startdir->v_type == VBAD);
+	TAILQ_INIT(&ndp->ni_cap_tracker);
+	ndp->ni_lcf = 0;
+	ndp->ni_loopcnt = 0;
+	dp = NULL;
+
+	/* We will set this ourselves if we need it. */
+	cnp->cn_flags &= ~TRAILINGSLASH;
+
+	ndp->ni_vp = NULL;
+
+	/*
+	 * Get a buffer for the name to be translated, and copy the
+	 * name into the buffer.
+	 */
+	if ((cnp->cn_flags & HASBUF) == 0)
+		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+	if (ndp->ni_segflg == UIO_SYSSPACE)
+		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+		    &ndp->ni_pathlen);
+	else
+		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+		    &ndp->ni_pathlen);
+
+	if (error != 0) {
+		namei_cleanup_cnp(cnp);
+		return (error);
+	}
+
+	cnp->cn_nameptr = cnp->cn_pnbuf;
+
+	/*
+	 * Don't allow empty pathnames.
+	 */
+	if (*cnp->cn_pnbuf == '\0') {
+		namei_cleanup_cnp(cnp);
+		return (ENOENT);
+	}
+
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_NAMEI)) {
+		KASSERT(cnp->cn_thread == curthread,
+		    ("namei not using curthread"));
+		ktrnamei(cnp->cn_pnbuf);
+	}
+#endif
+
+	/*
+	 * First try looking up the target without locking any vnodes.
+	 *
+	 * We may need to start from scratch or pick up where it left off.
+	 */
+	error = cache_fplookup(ndp, &status, &pwd);
+	switch (status) {
+	case CACHE_FPL_STATUS_UNSET:
+		__assert_unreachable();
+		break;
+	case CACHE_FPL_STATUS_HANDLED:
+		return (error);
+	case CACHE_FPL_STATUS_PARTIAL:
+		dp = ndp->ni_startdir;
+		break;
+	case CACHE_FPL_STATUS_ABORTED:
+		error = namei_setup(ndp, &dp, &pwd);
+		if (error != 0) {
+			namei_cleanup_cnp(cnp);
+			return (error);
+		}
+		break;
+	}
+
+	/*
+	 * Locked lookup.
+	 */
 	for (;;) {
 		ndp->ni_startdir = dp;
 		error = lookup(ndp);
@@ -526,8 +577,8 @@ namei(struct nameidata *ndp)
 			error = ENOTCAPABLE;
 		}
 		nameicap_cleanup(ndp, true);
-		SDT_PROBE2(vfs, namei, lookup, return, error,
-		    (error == 0 ? ndp->ni_vp : NULL));
+		SDT_PROBE3(vfs, namei, lookup, return, error,
+		    (error == 0 ? ndp->ni_vp : NULL), false);
 		pwd_drop(pwd);
 		return (error);
 	}
@@ -602,7 +653,7 @@ namei(struct nameidata *ndp)
 		MPASS(error != 0);
 		namei_cleanup_cnp(cnp);
 		nameicap_cleanup(ndp, true);
-		SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
+		SDT_PROBE3(vfs, namei, lookup, return, error, NULL, false);
 		pwd_drop(pwd);
 		return (error);
 	}
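The vfs_mount.c changes that follow extend the same writer-side discipline to the covered vnode and the mount root: the counter must be entered before any lookup-visible state changes and ended exactly once on every exit path. Reduced to its skeleton (illustrative; do_mount_work is a hypothetical stand-in for the real body):

    vn_seqc_write_begin(vp);        /* covered vnode enters "in modify" */
    error = do_mount_work(mp, vp);  /* hypothetical stand-in for the real work */
    if (error != 0) {
        vn_seqc_write_end(vp);      /* error path must unwind the counter */
        return (error);
    }
    /* ... publish the mount ... */
    vn_seqc_write_end(vp);          /* success path ends it too, exactly once */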
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 61b76f82ebc..7e8106abcc5 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -947,6 +947,7 @@ vfs_domount_first(
 		vput(vp);
 		return (error);
 	}
+	vn_seqc_write_begin(vp);
 	VOP_UNLOCK(vp);
 
 	/* Allocate and initialize the filesystem. */
@@ -979,9 +980,11 @@ vfs_domount_first(
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
+		vn_seqc_write_end(vp);
 		vrele(vp);
 		return (error);
 	}
+	vn_seqc_write_begin(newdp);
 	VOP_UNLOCK(newdp);
 
 	if (mp->mnt_opt != NULL)
@@ -1018,6 +1021,8 @@ vfs_domount_first(
 	EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
 	VOP_UNLOCK(newdp);
 	mountcheckdirs(vp, newdp);
+	vn_seqc_write_end(vp);
+	vn_seqc_write_end(newdp);
 	vrele(newdp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
@@ -1094,7 +1099,9 @@ vfs_domount_update(
 	VOP_UNLOCK(vp);
 
 	vfs_op_enter(mp);
+	vn_seqc_write_begin(vp);
 
+	rootvp = NULL;
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
@@ -1108,8 +1115,6 @@ vfs_domount_update(
 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	rootvp = vfs_cache_root_clear(mp);
 	MNT_IUNLOCK(mp);
-	if (rootvp != NULL)
-		vrele(rootvp);
 
 	mp->mnt_optnew = *optlist;
 	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
@@ -1233,6 +1238,11 @@ vfs_domount_update(
 		vfs_deallocate_syncvnode(mp);
 end:
 	vfs_op_exit(mp);
+	if (rootvp != NULL) {
+		vn_seqc_write_end(rootvp);
+		vrele(rootvp);
+	}
+	vn_seqc_write_end(vp);
 	vfs_unbusy(mp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
@@ -1723,14 +1733,19 @@ dounmount(struct mount *mp, int flags, struct thread *td)
 	}
 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
 	rootvp = vfs_cache_root_clear(mp);
+	if (coveredvp != NULL)
+		vn_seqc_write_begin(coveredvp);
 	if (flags & MNT_NONBUSY) {
 		MNT_IUNLOCK(mp);
 		error = vfs_check_usecounts(mp);
 		MNT_ILOCK(mp);
 		if (error != 0) {
+			vn_seqc_write_end(coveredvp);
 			dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
-			if (rootvp != NULL)
+			if (rootvp != NULL) {
+				vn_seqc_write_end(rootvp);
 				vrele(rootvp);
+			}
 			return (error);
 		}
 	}
@@ -1759,22 +1774,19 @@ dounmount(struct mount *mp, int flags, struct thread *td)
 	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
 	    __func__, __FILE__, __LINE__));
 
-	if (rootvp != NULL)
+	/*
+	 * We want to keep the vnode around so that we can vn_seqc_write_end
+	 * after we are done with unmount. Downgrade our reference to a mere
+	 * hold count so that we don't interfere with anything.
+	 */
+	if (rootvp != NULL) {
+		vhold(rootvp);
 		vrele(rootvp);
+	}
 
 	if (mp->mnt_flag & MNT_EXPUBLIC)
 		vfs_setpublicfs(NULL, NULL, NULL);
 
-	/*
-	 * From now, we can claim that the use reference on the
-	 * coveredvp is ours, and the ref can be released only by
-	 * successfull unmount by us, or left for later unmount
-	 * attempt.  The previously acquired hold reference is no
-	 * longer needed to protect the vnode from reuse.
-	 */
-	if (coveredvp != NULL)
-		vdrop(coveredvp);
-
 	vfs_periodic(mp, MNT_WAIT);
 	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
@@ -1809,8 +1821,15 @@ dounmount(struct mount *mp, int flags, struct thread *td)
 		}
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
-		if (coveredvp)
+		if (coveredvp) {
+			vn_seqc_write_end(coveredvp);
 			VOP_UNLOCK(coveredvp);
+			vdrop(coveredvp);
+		}
+		if (rootvp != NULL) {
+			vn_seqc_write_end(rootvp);
+			vdrop(rootvp);
+		}
 		return (error);
 	}
 	mtx_lock(&mountlist_mtx);
@@ -1819,7 +1838,13 @@ dounmount(struct mount *mp, int flags, struct thread *td)
 	EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
 	if (coveredvp != NULL) {
 		coveredvp->v_mountedhere = NULL;
+		vn_seqc_write_end(coveredvp);
 		VOP_UNLOCK(coveredvp);
+		vdrop(coveredvp);
+	}
+	if (rootvp != NULL) {
+		vn_seqc_write_end(rootvp);
+		vdrop(rootvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
 	if (rootvnode != NULL && mp == rootvnode->v_mount) {
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5f964c534df..c0f51ab7b5a 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -664,8 +664,8 @@ vntblinit(void *dummy __unused)
 	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
-	vfs_smr = uma_zone_get_smr(vnode_zone);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+	uma_zone_set_smr(vnode_zone, vfs_smr);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
@@ -1760,6 +1760,12 @@ freevnode(struct vnode *vp)
 	 * so as not to contaminate the freshly allocated vnode.
	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
+	/*
+	 * Paired with vgone.
+	 */
+	vn_seqc_write_end_locked(vp);
+	VNPASS(vp->v_seqc_users == 0, vp);
+
 	bo = &vp->v_bufobj;
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
@@ -2889,6 +2895,22 @@ vget_prep(struct vnode *vp)
 	return (vs);
 }
 
+void
+vget_abort(struct vnode *vp, enum vgetstate vs)
+{
+
+	switch (vs) {
+	case VGET_USECOUNT:
+		vrele(vp);
+		break;
+	case VGET_HOLDCNT:
+		vdrop(vp);
+		break;
+	default:
+		__assert_unreachable();
+	}
+}
+
 int
 vget(struct vnode *vp, int flags, struct thread *td)
 {
@@ -2900,7 +2922,7 @@ vget(struct vnode *vp, int flags, struct thread *td)
 	return (vget_finish(vp, flags, vs));
 }
 
-static int __noinline
+static void __noinline
 vget_finish_vchr(struct vnode *vp)
 {
 
@@ -2916,7 +2938,7 @@ vget_finish_vchr(struct vnode *vp)
 #else
 		refcount_release(&vp->v_holdcnt);
 #endif
-		return (0);
+		return;
 	}
 
 	VI_LOCK(vp);
@@ -2928,18 +2950,17 @@ vget_finish_vchr(struct vnode *vp)
 		refcount_release(&vp->v_holdcnt);
 #endif
 		VI_UNLOCK(vp);
-		return (0);
+		return;
 	}
 	v_incr_devcount(vp);
 	refcount_acquire(&vp->v_usecount);
 	VI_UNLOCK(vp);
-	return (0);
 }
 
 int
 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 {
-	int error, old;
+	int error;
 
 	if ((flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(vp, __func__);
@@ -2951,20 +2972,32 @@ vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 
 	error = vn_lock(vp, flags);
 	if (__predict_false(error != 0)) {
-		if (vs == VGET_USECOUNT)
-			vrele(vp);
-		else
-			vdrop(vp);
+		vget_abort(vp, vs);
 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
 		    vp);
 		return (error);
 	}
 
+	vget_finish_ref(vp, vs);
+	return (0);
+}
+
+void
+vget_finish_ref(struct vnode *vp, enum vgetstate vs)
+{
+	int old;
+
+	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
+	VNPASS(vp->v_holdcnt > 0, vp);
+	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
+
 	if (vs == VGET_USECOUNT)
-		return (0);
+		return;
 
-	if (__predict_false(vp->v_type == VCHR))
-		return (vget_finish_vchr(vp));
+	if (__predict_false(vp->v_type == VCHR)) {
+		vget_finish_vchr(vp);
+		return;
+	}
 
 	/*
 	 * We hold the vnode. If the usecount is 0 it will be utilized to keep
@@ -2981,7 +3014,6 @@ vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 		refcount_release(&vp->v_holdcnt);
 #endif
 	}
-	return (0);
 }
 
 /*
@@ -3999,6 +4031,10 @@ vgonel(struct vnode *vp)
 	 */
 	if (vp->v_irflag & VIRF_DOOMED)
 		return;
+	/*
+	 * Paired with freevnode.
+	 */
+	vn_seqc_write_begin_locked(vp);
 	vunlazy_gone(vp);
 	vp->v_irflag |= VIRF_DOOMED;
 
@@ -4141,8 +4177,9 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 	printf("%p: ", (void *)vp);
 	printf("type %s\n", typename[vp->v_type]);
 	holdcnt = atomic_load_int(&vp->v_holdcnt);
-	printf("    usecount %d, writecount %d, refcount %d",
-	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS);
+	printf("    usecount %d, writecount %d, refcount %d seqc users %d",
+	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
+	    vp->v_seqc_users);
 	switch (vp->v_type) {
 	case VDIR:
 		printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4394,6 +4431,7 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
 	MNT_KERN_FLAG(MNTK_MARKER);
 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
+	MNT_KERN_FLAG(MNTK_FPLOOKUP);
 	MNT_KERN_FLAG(MNTK_NOASYNC);
 	MNT_KERN_FLAG(MNTK_UNMOUNT);
 	MNT_KERN_FLAG(MNTK_MWAIT);
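The vget_prep_smr()/vget_finish_ref()/vget_abort() split above exists so that a vnode found while inside vfs_smr can be turned into a real reference without ever sleeping inside the SMR section. The consumer-side pattern used by cache_fplookup_final(), in isolation (sketch only; the fallback label stands for punting to the locked path):

    vs = vget_prep_smr(vp);        /* try to secure a hold while in SMR */
    if (vs == VGET_NONE)           /* vnode was freed from under us */
        goto fallback;
    vfs_smr_exit();                /* hold acquired; SMR no longer needed */
    vget_finish_ref(vp, vs);       /* convert the hold into a usecount ref */
    if (!vn_seqc_consistent(vp, vp_seqc)) {
        vrele(vp);                 /* something changed mid-walk: discard */
        goto fallback;
    }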
@@ -5209,6 +5247,38 @@ vn_isdisk(struct vnode *vp, int *errp)
 	return (error == 0);
 }
 
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ *
+ * We never deny as priv_check_cred calls are not yet supported, see vaccess.
+ */
+int
+vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
+{
+
+	VFS_SMR_ASSERT_ENTERED();
+
+	/* Check the owner. */
+	if (cred->cr_uid == file_uid) {
+		if (file_mode & S_IXUSR)
+			return (0);
+		return (EAGAIN);
+	}
+
+	/* Otherwise, check the groups (first match) */
+	if (groupmember(file_gid, cred)) {
+		if (file_mode & S_IXGRP)
+			return (0);
+		return (EAGAIN);
+	}
+
+	/* Otherwise, check everyone else. */
+	if (file_mode & S_IXOTH)
+		return (0);
+	return (EAGAIN);
+}
+
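Per the API contract, vaccess_vexec_smr() may only answer "yes" (0) or "retry on the slow path" (EAGAIN), and the owner/group/other tiers are first-match: a matching tier that lacks the execute bit does not fall through to the next one. A compilable userspace restatement of that logic (struct ucred and groupmember() are stubbed with toy equivalents; this is an illustration, not kernel code):

    #include <assert.h>
    #include <errno.h>
    #include <stdbool.h>
    #include <sys/stat.h>   /* S_IXUSR, S_IXGRP, S_IXOTH */

    struct cred { unsigned uid; unsigned gids[4]; int ngids; };

    static bool
    groupmember(unsigned gid, const struct cred *cr)
    {
        for (int i = 0; i < cr->ngids; i++)
            if (cr->gids[i] == gid)
                return (true);
        return (false);
    }

    /* Mirrors the three-tier check above: only 0 or EAGAIN, never EACCES. */
    static int
    vexec_check(unsigned mode, unsigned fuid, unsigned fgid, const struct cred *cr)
    {
        if (cr->uid == fuid)
            return ((mode & S_IXUSR) ? 0 : EAGAIN);
        if (groupmember(fgid, cr))
            return ((mode & S_IXGRP) ? 0 : EAGAIN);
        return ((mode & S_IXOTH) ? 0 : EAGAIN);
    }

    int
    main(void)
    {
        struct cred cr = { .uid = 1001, .gids = { 20 }, .ngids = 1 };

        assert(vexec_check(0700, 1001, 0, &cr) == 0);       /* owner tier */
        assert(vexec_check(0077, 1001, 0, &cr) == EAGAIN);  /* no fallthrough */
        assert(vexec_check(0010, 42, 20, &cr) == 0);        /* group tier */
        assert(vexec_check(0001, 42, 99, &cr) == 0);        /* other tier */
        return (0);
    }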
vn_seqc_write_begin(dvp); +} + void vop_mkdir_post(void *ap, int rc) { - struct vop_mkdir_args *a = ap; + struct vop_mkdir_args *a; + struct vnode *dvp; + a = ap; + dvp = a->a_dvp; + vn_seqc_write_end(dvp); if (!rc) - VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); + VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); +} + +void +vop_mknod_pre(void *ap) +{ + struct vop_mknod_args *a; + struct vnode *dvp; + + a = ap; + dvp = a->a_dvp; + vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { - struct vop_mknod_args *a = ap; + struct vop_mknod_args *a; + struct vnode *dvp; + a = ap; + dvp = a->a_dvp; + vn_seqc_write_end(dvp); if (!rc) - VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); + VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { - struct vop_reclaim_args *a = ap; + struct vop_reclaim_args *a; + struct vnode *vp; + a = ap; + vp = a->a_vp; + ASSERT_VOP_IN_SEQC(vp); if (!rc) - VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); + VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); +} + +void +vop_remove_pre(void *ap) +{ + struct vop_remove_args *a; + struct vnode *dvp, *vp; + + a = ap; + dvp = a->a_dvp; + vp = a->a_vp; + vn_seqc_write_begin(dvp); + vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { - struct vop_remove_args *a = ap; + struct vop_remove_args *a; + struct vnode *dvp, *vp; + a = ap; + dvp = a->a_dvp; + vp = a->a_vp; + vn_seqc_write_end(dvp); + vn_seqc_write_end(vp); if (!rc) { - VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); - VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); + VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } @@ -5674,42 +5890,128 @@ vop_rename_post(void *ap, int rc) vdrop(a->a_tvp); } +void +vop_rmdir_pre(void *ap) +{ + struct vop_rmdir_args *a; + struct vnode *dvp, *vp; + + a = ap; + dvp = a->a_dvp; + vp = a->a_vp; + vn_seqc_write_begin(dvp); + vn_seqc_write_begin(vp); +} + void vop_rmdir_post(void *ap, int rc) { - struct vop_rmdir_args *a = ap; + struct vop_rmdir_args *a; + struct vnode *dvp, *vp; + a = ap; + dvp = a->a_dvp; + vp = a->a_vp; + vn_seqc_write_end(dvp); + vn_seqc_write_end(vp); if (!rc) { - VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); - VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); + VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); + VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } +void +vop_setattr_pre(void *ap) +{ + struct vop_setattr_args *a; + struct vnode *vp; + + a = ap; + vp = a->a_vp; + vn_seqc_write_begin(vp); +} + void vop_setattr_post(void *ap, int rc) { - struct vop_setattr_args *a = ap; + struct vop_setattr_args *a; + struct vnode *vp; + a = ap; + vp = a->a_vp; + vn_seqc_write_end(vp); if (!rc) - VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); + VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); +} + +void +vop_setacl_pre(void *ap) +{ + struct vop_setacl_args *a; + struct vnode *vp; + + a = ap; + vp = a->a_vp; + vn_seqc_write_begin(vp); +} + +void +vop_setacl_post(void *ap, int rc __unused) +{ + struct vop_setacl_args *a; + struct vnode *vp; + + a = ap; + vp = a->a_vp; + vn_seqc_write_end(vp); +} + +void +vop_setextattr_pre(void *ap) +{ + struct vop_setextattr_args *a; + struct vnode *vp; + + a = ap; + vp = a->a_vp; + vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { - struct vop_setextattr_args *a = ap; + struct vop_setextattr_args *a; + struct vnode *vp; + a = ap; + vp = a->a_vp; + vn_seqc_write_end(vp); if (!rc) - VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); + VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); +} + +void +vop_symlink_pre(void *ap) +{ + struct vop_symlink_args *a; + struct vnode *dvp; + + a = ap; + dvp = a->a_dvp; 
+ vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { - struct vop_symlink_args *a = ap; + struct vop_symlink_args *a; + struct vnode *dvp; + a = ap; + dvp = a->a_dvp; + vn_seqc_write_end(dvp); if (!rc) - VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); + VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void @@ -6262,6 +6564,8 @@ vfs_cache_root_clear(struct mount *mp) */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; + if (vp != NULL) + vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } @@ -6558,3 +6862,44 @@ vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); } + +void +vn_seqc_write_begin_locked(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __func__); + VNPASS(vp->v_holdcnt > 0, vp); + VNPASS(vp->v_seqc_users >= 0, vp); + vp->v_seqc_users++; + if (vp->v_seqc_users == 1) + seqc_sleepable_write_begin(&vp->v_seqc); +} + +void +vn_seqc_write_begin(struct vnode *vp) +{ + + VI_LOCK(vp); + vn_seqc_write_begin_locked(vp); + VI_UNLOCK(vp); +} + +void +vn_seqc_write_end_locked(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __func__); + VNPASS(vp->v_seqc_users > 0, vp); + vp->v_seqc_users--; + if (vp->v_seqc_users == 0) + seqc_sleepable_write_end(&vp->v_seqc); +} + +void +vn_seqc_write_end(struct vnode *vp) +{ + + VI_LOCK(vp); + vn_seqc_write_end_locked(vp); + VI_UNLOCK(vp); +} diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index bfa81016946..5c0649fdada 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -88,6 +88,7 @@ vop_cachedlookup { %% create dvp E E E %% create vpp - L - +%! create pre vop_create_pre %! create post vop_create_post vop_create { @@ -99,6 +100,8 @@ vop_create { %% whiteout dvp E E E +%! whiteout pre vop_whiteout_pre +%! whiteout post vop_whiteout_post vop_whiteout { IN struct vnode *dvp; @@ -109,6 +112,7 @@ vop_whiteout { %% mknod dvp E E E %% mknod vpp - L - +%! mknod pre vop_mknod_pre %! mknod post vop_mknod_post vop_mknod { @@ -142,6 +146,17 @@ vop_close { }; +%% fplookup_vexec vp - - - +%! fplookup_vexec pre vop_fplookup_vexec_pre +%! fplookup_vexec post vop_fplookup_vexec_post + +vop_fplookup_vexec { + IN struct vnode *vp; + IN struct ucred *cred; + IN struct thread *td; +}; + + %% access vp L L L vop_access { @@ -172,6 +187,7 @@ vop_getattr { %% setattr vp E E E +%! setattr pre vop_setattr_pre %! setattr post vop_setattr_post vop_setattr { @@ -260,6 +276,7 @@ vop_fsync { %% remove dvp E E E %% remove vp E E E +%! remove pre vop_remove_pre %! remove post vop_remove_post vop_remove { @@ -271,6 +288,7 @@ vop_remove { %% link tdvp E E E %% link vp E E E +%! link pre vop_link_pre %! link post vop_link_post vop_link { @@ -295,6 +313,7 @@ vop_rename { %% mkdir dvp E E E %% mkdir vpp - E - +%! mkdir pre vop_mkdir_pre %! mkdir post vop_mkdir_post vop_mkdir { @@ -307,6 +326,7 @@ vop_mkdir { %% rmdir dvp E E E %% rmdir vp E E E +%! rmdir pre vop_rmdir_pre %! rmdir post vop_rmdir_post vop_rmdir { @@ -318,6 +338,7 @@ vop_rmdir { %% symlink dvp E E E %% symlink vpp - E - +%! symlink pre vop_symlink_pre %! symlink post vop_symlink_post vop_symlink { @@ -523,6 +544,8 @@ vop_getacl { %% setacl vp E E E +%! setacl pre vop_setacl_pre +%! setacl post vop_setacl_post vop_setacl { IN struct vnode *vp; @@ -589,6 +612,7 @@ vop_openextattr { %% deleteextattr vp E E E +%! deleteextattr pre vop_deleteextattr_pre %! deleteextattr post vop_deleteextattr_post vop_deleteextattr { @@ -601,6 +625,7 @@ vop_deleteextattr { %% setextattr vp E E E +%! setextattr pre vop_setextattr_pre %! 
setextattr post vop_setextattr_post vop_setextattr { diff --git a/sys/security/mac/mac_framework.h b/sys/security/mac/mac_framework.h index 1ab82dd709d..e917eeb3c89 100644 --- a/sys/security/mac/mac_framework.h +++ b/sys/security/mac/mac_framework.h @@ -422,13 +422,14 @@ int mac_vnode_check_listextattr(struct ucred *cred, struct vnode *vp, int mac_vnode_check_lookup_impl(struct ucred *cred, struct vnode *dvp, struct componentname *cnp); extern bool mac_vnode_check_lookup_fp_flag; +#define mac_vnode_check_lookup_enabled() __predict_false(mac_vnode_check_lookup_fp_flag) static inline int mac_vnode_check_lookup(struct ucred *cred, struct vnode *dvp, struct componentname *cnp) { mac_vnode_assert_locked(dvp, "mac_vnode_check_lookup"); - if (__predict_false(mac_vnode_check_lookup_fp_flag)) + if (mac_vnode_check_lookup_enabled()) return (mac_vnode_check_lookup_impl(cred, dvp, cnp)); return (0); } diff --git a/sys/sys/_seqc.h b/sys/sys/_seqc.h new file mode 100644 index 00000000000..4e00dd4e57e --- /dev/null +++ b/sys/sys/_seqc.h @@ -0,0 +1,6 @@ +#ifndef _SYS__SEQC_H_ +#define _SYS__SEQC_H_ + +typedef uint32_t seqc_t; + +#endif /* _SYS__SEQC_H_ */ diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index 6954b1d23f4..a99b3b0cb04 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -311,6 +311,7 @@ pwd_set(struct filedesc *fdp, struct pwd *newpwd) smr_serialized_store(&fdp->fd_pwd, newpwd, (FILEDESC_XLOCK_ASSERT(fdp), true)); } +struct pwd *pwd_get_smr(void); #endif /* _KERNEL */ diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 6e1517aac4c..a3bc0518a7e 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -420,6 +420,7 @@ void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp); #define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ #define MNTK_VMSETSIZE_BUG 0x00010000 #define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */ +#define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ diff --git a/sys/sys/namei.h b/sys/sys/namei.h index 602d7eff28b..1fa20081a55 100644 --- a/sys/sys/namei.h +++ b/sys/sys/namei.h @@ -108,6 +108,12 @@ struct nameidata { }; #ifdef _KERNEL + +enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL, + CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET }; +int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, + struct pwd **pwdp); + /* * namei operations */ diff --git a/sys/sys/seqc.h b/sys/sys/seqc.h index 82a0ae9c30b..b4b0770ad40 100644 --- a/sys/sys/seqc.h +++ b/sys/sys/seqc.h @@ -36,7 +36,7 @@ /* * seqc_t may be included in structs visible to userspace */ -typedef uint32_t seqc_t; +#include <sys/_seqc.h> #ifdef _KERNEL @@ -111,5 +111,26 @@ seqc_consistent(const seqc_t *seqcp, seqc_t oldseqc) return (seqc_consistent_nomb(seqcp, oldseqc)); } +/* + * Variant which does not use critical section enter/exit, so that the + * writer is allowed to sleep inside the section. 
+ */ +static __inline void +seqc_sleepable_write_begin(seqc_t *seqcp) +{ + + MPASS(!seqc_in_modify(*seqcp)); + *seqcp += 1; + atomic_thread_fence_rel(); +} + +static __inline void +seqc_sleepable_write_end(seqc_t *seqcp) +{ + + atomic_thread_fence_rel(); + *seqcp += 1; + MPASS(!seqc_in_modify(*seqcp)); +} + #endif /* _KERNEL */ #endif /* _SYS_SEQC_H_ */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 19f07d05d7c..b5ebceb1846 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -45,6 +45,7 @@ #include #include #include +#include /* * The vnode is the focus of all file activity in UNIX. There is a @@ -105,6 +106,7 @@ struct vnode { */ enum vtype v_type:8; /* u vnode type */ short v_irflag; /* i frequently read flags */ + seqc_t v_seqc; /* i modification count */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ @@ -175,6 +177,7 @@ struct vnode { short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ + int v_seqc_users; /* i modifications pending */ u_int v_hash; }; @@ -539,6 +542,18 @@ void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) +#define ASSERT_VOP_IN_SEQC(vp) do { \ + struct vnode *_vp = (vp); \ + \ + VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \ +} while (0) + +#define ASSERT_VOP_NOT_IN_SEQC(vp) do { \ + struct vnode *_vp = (vp); \ + \ + VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \ +} while (0) + #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) @@ -546,6 +561,10 @@ void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) + +#define ASSERT_VOP_IN_SEQC(vp) ((void)0) +#define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0) + #endif /* DEBUG_VFS_LOCKS */ @@ -647,6 +666,8 @@ int vn_path_to_global_path(struct thread *td, struct vnode *vp, int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused); +int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, + struct ucred *cred); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused); @@ -663,6 +684,8 @@ int vget(struct vnode *vp, int flags, struct thread *td); enum vgetstate vget_prep_smr(struct vnode *vp); enum vgetstate vget_prep(struct vnode *vp); int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); +void vget_finish_ref(struct vnode *vp, enum vgetstate vs); +void vget_abort(struct vnode *vp, enum vgetstate vs); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); @@ -738,6 +761,13 @@ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); +void vn_seqc_write_begin_locked(struct vnode *vp); +void vn_seqc_write_begin(struct vnode *vp); +void vn_seqc_write_end_locked(struct vnode *vp); +void vn_seqc_write_end(struct vnode *vp); +#define vn_seqc_read_any(vp) seqc_read_any(&(vp)->v_seqc) +#define vn_seqc_consistent(vp, seq) seqc_consistent(&(vp)->v_seqc, seq) + #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, 
end) \ @@ -804,27 +834,43 @@ int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ void vop_close_post(void *a, int rc); +void vop_create_pre(void *a); void vop_create_post(void *a, int rc); +void vop_whiteout_pre(void *a); +void vop_whiteout_post(void *a, int rc); +void vop_deleteextattr_pre(void *a); void vop_deleteextattr_post(void *a, int rc); +void vop_link_pre(void *a); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); +void vop_mkdir_pre(void *a); void vop_mkdir_post(void *a, int rc); +void vop_mknod_pre(void *a); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); +void vop_remove_pre(void *a); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); +void vop_rmdir_pre(void *a); void vop_rmdir_post(void *a, int rc); +void vop_setattr_pre(void *a); void vop_setattr_post(void *a, int rc); +void vop_setacl_pre(void *a); +void vop_setacl_post(void *a, int rc); +void vop_setextattr_pre(void *a); void vop_setextattr_post(void *a, int rc); +void vop_symlink_pre(void *a); void vop_symlink_post(void *a, int rc); int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a); #ifdef DEBUG_VFS_LOCKS +void vop_fplookup_vexec_pre(void *a); +void vop_fplookup_vexec_post(void *a, int rc); void vop_strategy_pre(void *a); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); @@ -832,6 +878,8 @@ void vop_unlock_pre(void *a); void vop_need_inactive_pre(void *a); void vop_need_inactive_post(void *a, int rc); #else +#define vop_fplookup_vexec_pre(x) do { } while (0) +#define vop_fplookup_vexec_post(x, y) do { } while (0) #define vop_strategy_pre(x) do { } while (0) #define vop_lock_pre(x) do { } while (0) #define vop_lock_post(x, y) do { } while (0) @@ -985,10 +1033,18 @@ int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp); #define VFS_SMR() vfs_smr #define vfs_smr_enter() smr_enter(VFS_SMR()) #define vfs_smr_exit() smr_exit(VFS_SMR()) +#define vfs_smr_entered_load(ptr) smr_entered_load((ptr), VFS_SMR()) #define VFS_SMR_ASSERT_ENTERED() SMR_ASSERT_ENTERED(VFS_SMR()) #define VFS_SMR_ASSERT_NOT_ENTERED() SMR_ASSERT_NOT_ENTERED(VFS_SMR()) #define VFS_SMR_ZONE_SET(zone) uma_zone_set_smr((zone), VFS_SMR()) +#define vn_load_v_data_smr(vp) ({ \ + struct vnode *_vp = (vp); \ + \ + VFS_SMR_ASSERT_ENTERED(); \ + (_vp)->v_data; \ +}) + #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 696be51ae6a..8c69212d82e 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; +VFS_SMR_DECLARE; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, @@ -393,6 +394,7 @@ ffs_mount(struct mount *mp) uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + VFS_SMR_ZONE_SET(uma_inode); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); @@ -455,6 +457,7 @@ ffs_mount(struct mount *mp) } MNT_ILOCK(mp); + mp->mnt_kern_flag &= ~MNTK_FPLOOKUP; mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* @@ -795,6 +798,17 @@ ffs_mount(struct mount *mp) } } } + + MNT_ILOCK(mp); + /* + * This is racy versus lookup, see 
ufs_fplookup_vexec for details. + */ + if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0) + panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp); + if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) == 0) + mp->mnt_kern_flag |= MNTK_FPLOOKUP; + MNT_IUNLOCK(mp); + vfs_mountedfrom(mp, fspec); return (0); } @@ -1968,14 +1982,14 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags) ump = VFSTOUFS(mp); fs = ump->um_fs; - ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); + ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? &ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; - uma_zfree(uma_inode, ip); + uma_zfree_smr(uma_inode, ip); return (error); } /* @@ -2004,7 +2018,7 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { - uma_zfree(uma_inode, ip); + uma_zfree_smr(uma_inode, ip); *vpp = NULL; return (error); } @@ -2327,7 +2341,7 @@ ffs_ifree(struct ufsmount *ump, struct inode *ip) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); - uma_zfree(uma_inode, ip); + uma_zfree_smr(uma_inode, ip); } static int dobkgrdwrite = 1; diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 000ded6cbba..c363f4bbb09 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -905,8 +905,10 @@ ffs_write(ap) if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) { - ip->i_mode &= ~(ISUID | ISGID); + vn_seqc_write_begin(vp); + UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID)); DIP_SET(ip, i_mode, ip->i_mode); + vn_seqc_write_end(vp); } } if (error) { @@ -1152,8 +1154,10 @@ ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) { - ip->i_mode &= ~(ISUID | ISGID); + vn_seqc_write_begin(vp); + UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID)); dp->di_mode = ip->i_mode; + vn_seqc_write_end(vp); } } if (error) { diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index defa888b17f..15f60caae1f 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -43,6 +43,7 @@ #include #include #include +#include /* * This must agree with the definition in . @@ -149,6 +150,14 @@ struct inode { #define UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE \ (UFS_INODE_FLAG_LAZY_MASK & ~(IN_LAZYMOD | IN_LAZYACCESS)) +#define UFS_INODE_SET_MODE(ip, mode) do { \ + struct inode *_ip = (ip); \ + int _mode = (mode); \ + \ + ASSERT_VOP_IN_SEQC(ITOV(_ip)); \ + atomic_store_short(&(_ip)->i_mode, _mode); \ +} while (0) + #define UFS_INODE_SET_FLAG(ip, flags) do { \ struct inode *_ip = (ip); \ struct vnode *_vp = ITOV(_ip); \ @@ -229,6 +238,7 @@ struct indir { /* Convert between inode pointers and vnode pointers. 
*/ #define VTOI(vp) ((struct inode *)(vp)->v_data) +#define VTOI_SMR(vp) ((struct inode *)vn_load_v_data_smr(vp)) #define ITOV(ip) ((ip)->i_vnode) /* Determine if soft dependencies are being done */ diff --git a/sys/ufs/ufs/ufs_acl.c b/sys/ufs/ufs/ufs_acl.c index cb077387ae9..68e8ef91534 100644 --- a/sys/ufs/ufs/ufs_acl.c +++ b/sys/ufs/ufs/ufs_acl.c @@ -139,9 +139,11 @@ ufs_sync_acl_from_inode(struct inode *ip, struct acl *acl) void ufs_sync_inode_from_acl(struct acl *acl, struct inode *ip) { + int newmode; - ip->i_mode &= ACL_PRESERVE_MASK; - ip->i_mode |= acl_posix1e_acl_to_mode(acl); + newmode = ip->i_mode & ACL_PRESERVE_MASK; + newmode |= acl_posix1e_acl_to_mode(acl); + UFS_INODE_SET_MODE(ip, newmode); DIP_SET(ip, i_mode, ip->i_mode); } @@ -381,7 +383,7 @@ int ufs_setacl_nfs4_internal(struct vnode *vp, struct acl *aclp, struct thread *td) { int error; - mode_t mode; + mode_t mode, newmode; struct inode *ip = VTOI(vp); KASSERT(acl_nfs4_check(aclp, vp->v_type == VDIR) == 0, @@ -418,8 +420,9 @@ ufs_setacl_nfs4_internal(struct vnode *vp, struct acl *aclp, struct thread *td) acl_nfs4_sync_mode_from_acl(&mode, aclp); - ip->i_mode &= ACL_PRESERVE_MASK; - ip->i_mode |= mode; + newmode = ip->i_mode & ACL_PRESERVE_MASK; + newmode |= mode; + UFS_INODE_SET_MODE(ip, newmode); DIP_SET(ip, i_mode, ip->i_mode); UFS_INODE_SET_FLAG(ip, IN_CHANGE); diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 2e6aa283fc8..1ad3b3cf643 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include @@ -96,10 +97,12 @@ FEATURE(suiddir, "Give all new files in directory the same ownership as the directory"); #endif +VFS_SMR_DECLARE; #include static vop_accessx_t ufs_accessx; +static vop_fplookup_vexec_t ufs_fplookup_vexec; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; @@ -422,6 +425,48 @@ ufs_accessx(ap) return (error); } +/* + * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see + * the comment above cache_fplookup for details. + */ +static int +ufs_fplookup_vexec(ap) + struct vop_fplookup_vexec_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp; + struct inode *ip; + struct ucred *cred; + mode_t all_x, mode; + + vp = ap->a_vp; + ip = VTOI_SMR(vp); + if (__predict_false(ip == NULL)) + return (EAGAIN); + + /* + * XXX ACL race + * + * ACLs are not supported and UFS clears/sets the flag on mount + * and remount. However, we may still be racing with them being + * enabled or disabled, and there are no provisions to make sure + * they got accounted for. + * + * This happens to match the behavior of the locked case, where + * the lookup is racy as well -- mount takes no measures + * to block anyone from progressing. 
+ */ + all_x = S_IXUSR | S_IXGRP | S_IXOTH; + mode = atomic_load_short(&ip->i_mode); + if (__predict_true((mode & all_x) == all_x)) + return (0); + + cred = ap->a_cred; + return (vaccess_vexec_smr(mode, ip->i_uid, ip->i_gid, cred)); +} + /* ARGSUSED */ static int ufs_getattr(ap) @@ -711,7 +756,7 @@ ufs_chmod(vp, mode, cred, td) struct thread *td; { struct inode *ip = VTOI(vp); - int error; + int newmode, error; /* * To modify the permissions on a file, must possess VADMIN @@ -744,8 +789,9 @@ ufs_chmod(vp, mode, cred, td) return (error); } - ip->i_mode &= ~ALLPERMS; - ip->i_mode |= (mode & ALLPERMS); + newmode = ip->i_mode & ~ALLPERMS; + newmode |= (mode & ALLPERMS); + UFS_INODE_SET_MODE(ip, newmode); DIP_SET(ip, i_mode, ip->i_mode); UFS_INODE_SET_FLAG(ip, IN_CHANGE); #ifdef UFS_ACL @@ -869,7 +915,7 @@ ufs_chown(vp, uid, gid, cred, td) UFS_INODE_SET_FLAG(ip, IN_CHANGE); if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { - ip->i_mode &= ~(ISUID | ISGID); + UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID)); DIP_SET(ip, i_mode, ip->i_mode); } } @@ -1111,6 +1157,9 @@ ufs_rename(ap) int error = 0; struct mount *mp; ino_t ino; + bool want_seqc_end; + + want_seqc_end = false; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || @@ -1315,6 +1364,13 @@ ufs_rename(ap) tdp->i_effnlink == 0) panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); + if (tvp != NULL) + vn_seqc_write_begin(tvp); + vn_seqc_write_begin(tdvp); + vn_seqc_write_begin(fvp); + vn_seqc_write_begin(fdvp); + want_seqc_end = true; + /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before @@ -1513,6 +1569,14 @@ ufs_rename(ap) cache_purge_negative(tdvp); unlockout: + if (want_seqc_end) { + if (tvp != NULL) + vn_seqc_write_end(tvp); + vn_seqc_write_end(tdvp); + vn_seqc_write_end(fvp); + vn_seqc_write_end(fdvp); + } + vput(fdvp); vput(fvp); if (tvp) @@ -1556,6 +1620,14 @@ ufs_rename(ap) goto unlockout; releout: + if (want_seqc_end) { + if (tvp != NULL) + vn_seqc_write_end(tvp); + vn_seqc_write_end(tdvp); + vn_seqc_write_end(fvp); + vn_seqc_write_end(fdvp); + } + vrele(fdvp); vrele(fvp); vrele(tdvp); @@ -1590,7 +1662,7 @@ ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); - ip->i_mode = dmode; + UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); @@ -1602,7 +1674,7 @@ ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, /* * Just use the mode as-is. */ - ip->i_mode = dmode; + UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); error = 0; goto out; @@ -1673,7 +1745,7 @@ ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); - ip->i_mode = mode; + UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); ufs_sync_acl_from_inode(ip, acl); break; @@ -1684,7 +1756,7 @@ ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, /* * Just use the mode as-is. 
*/ - ip->i_mode = mode; + UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); error = 0; goto out; @@ -1796,6 +1868,7 @@ ufs_mkdir(ap) error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; + vn_seqc_write_begin(tvp); ip = VTOI(tvp); ip->i_gid = dp->i_gid; DIP_SET(ip, i_gid, dp->i_gid); @@ -1846,6 +1919,7 @@ ufs_mkdir(ap) if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); + vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); @@ -1861,6 +1935,7 @@ ufs_mkdir(ap) if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); + vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); @@ -1868,7 +1943,7 @@ ufs_mkdir(ap) #endif #endif /* !SUIDDIR */ UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); - ip->i_mode = dmode; + UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; @@ -1974,6 +2049,7 @@ ufs_mkdir(ap) bad: if (error == 0) { *ap->a_vpp = tvp; + vn_seqc_write_end(tvp); } else { dp->i_effnlink--; dp->i_nlink--; @@ -1989,6 +2065,7 @@ ufs_mkdir(ap) UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(tvp)) softdep_revert_mkdir(dp, ip); + vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); } @@ -2637,8 +2714,9 @@ ufs_makeinode(mode, dvp, vpp, cnp, callfunc) } #endif #endif /* !SUIDDIR */ + vn_seqc_write_begin(tvp); /* Mostly to cover asserts */ UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); - ip->i_mode = mode; + UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; @@ -2648,7 +2726,7 @@ ufs_makeinode(mode, dvp, vpp, cnp, callfunc) softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID)) { - ip->i_mode &= ~ISGID; + UFS_INODE_SET_MODE(ip, ip->i_mode & ~ISGID); DIP_SET(ip, i_mode, ip->i_mode); } @@ -2688,6 +2766,7 @@ ufs_makeinode(mode, dvp, vpp, cnp, callfunc) error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0); if (error) goto bad; + vn_seqc_write_end(tvp); *vpp = tvp; return (0); @@ -2702,6 +2781,7 @@ ufs_makeinode(mode, dvp, vpp, cnp, callfunc) UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(tvp)) softdep_revert_create(VTOI(dvp), ip); + vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); @@ -2740,6 +2820,7 @@ struct vop_vector ufs_vnodeops = { .vop_write = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_bmap = ufs_bmap, + .vop_fplookup_vexec = ufs_fplookup_vexec, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, .vop_create = ufs_create,
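
The seqc hunks above implement the write side of a sequence counter (seqlock): the counter is odd for as long as a modification is in flight, and the fences order the counter updates against the protected stores, so a lockless reader can detect that it raced a writer. The same protocol can be modeled in userland C11 atomics; this is a minimal illustrative sketch, not the kernel implementation (which additionally has the MPASS checks shown above and, in the non-sleepable variants, holds a critical section):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint32_t seqc;		/* odd while a writer is active */
static _Atomic int protected_val;	/* stands in for the protected fields */

static void
model_write_begin(void)
{

	atomic_fetch_add_explicit(&seqc, 1, memory_order_relaxed);
	/* Publish the odd counter before any of the data stores. */
	atomic_thread_fence(memory_order_release);
}

static void
model_write_end(void)
{

	/* Order the data stores before the counter becoming even again. */
	atomic_thread_fence(memory_order_release);
	atomic_fetch_add_explicit(&seqc, 1, memory_order_relaxed);
}

static void
model_write(int v)
{

	model_write_begin();
	atomic_store_explicit(&protected_val, v, memory_order_relaxed);
	model_write_end();
}

/*
 * Lockless read; returns false when a writer was active or raced us,
 * in which case the caller is expected to take the locked path.
 */
static bool
model_read(int *out)
{
	uint32_t snap;

	snap = atomic_load_explicit(&seqc, memory_order_acquire);
	if ((snap & 1) != 0)		/* seqc_in_modify() analogue */
		return (false);
	*out = atomic_load_explicit(&protected_val, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);
	return (atomic_load_explicit(&seqc, memory_order_relaxed) == snap);
}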
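
The %! pre/post annotations added to vnode_if.src are what make the bracketing universal: the generated VOP wrappers invoke the new vop_*_pre/vop_*_post hooks around whichever implementation the filesystem provides. Roughly, and only as a hand-written approximation (the real generated code goes through the argument descriptors, the bypass path and the DEBUG_VFS_LOCKS assertions):

int
VOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct vattr *vap)
{
	struct vop_mkdir_args a;
	int rc;

	a.a_dvp = dvp;
	a.a_vpp = vpp;
	a.a_cnp = cnp;
	a.a_vap = vap;
	vop_mkdir_pre(&a);		/* vn_seqc_write_begin(dvp) */
	rc = dvp->v_op->vop_mkdir(&a);	/* the filesystem's method */
	vop_mkdir_post(&a, rc);		/* vn_seqc_write_end(dvp) + knote */
	return (rc);
}

Because the hooks live in the wrapper, every filesystem gets consistent seqc coverage for directory-modifying operations without per-filesystem changes; MNTK_FPLOOKUP only gates whether the lockless consumer is allowed to run.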
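
vn_seqc_write_begin()/vn_seqc_write_end() nest: v_seqc_users counts pending writers under the vnode interlock and only the outermost begin/end actually flips v_seqc, so several overlapping sections on the same vnode keep it odd until the last one ends. Inside a section, any field the lockless lookup inspects has to be updated with an atomic store; UFS_INODE_SET_MODE enforces both rules at once (atomic_store_short() plus ASSERT_VOP_IN_SEQC). A composite sketch of the write-side discipline, assembled from the ffs_write and ufs_chmod hunks above (the helper itself is hypothetical):

static void
inode_store_mode(struct vnode *vp, struct inode *ip, int newmode)
{

	ASSERT_VOP_ELOCKED(vp, "inode_store_mode");
	vn_seqc_write_begin(vp);	/* v_seqc goes odd if outermost */
	UFS_INODE_SET_MODE(ip, newmode);	/* atomic, asserted in-seqc */
	DIP_SET(ip, i_mode, ip->i_mode);	/* dinode copy, as above */
	vn_seqc_write_end(vp);		/* v_seqc even again if outermost */
}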
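
The counter only solves half the problem: a lockless reader also dereferences v_data on a vnode that may be getting reclaimed. That is what the uma_zalloc_smr()/uma_zfree_smr() conversions above are for: once a zone is tied to the VFS SMR domain, freed memory is not reused until readers which entered vfs_smr before the free have exited, so VTOI_SMR()/vn_load_v_data_smr() return either NULL or a type-stable (possibly stale) node, and staleness is then caught by the seqc re-check rather than by SMR itself. The opt-in pattern a filesystem follows looks like this (the "foofs" names are hypothetical placeholders):

VFS_SMR_DECLARE;

static uma_zone_t foofs_node_zone;

static void
foofs_subr_init(void)
{

	foofs_node_zone = uma_zcreate("foofs node",
	    sizeof(struct foofs_node), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	/* Route frees through SMR so readers never see reused memory. */
	VFS_SMR_ZONE_SET(foofs_node_zone);
}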
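
On the read side the two mechanisms compose: enter vfs_smr for memory stability, snapshot the counter, do the speculative work, then confirm the snapshot. cache_fplookup itself is not part of this portion of the diff, so the following is only a sketch of the calling convention its declaration and VOP_FPLOOKUP_VEXEC imply; EAGAIN here stands for "abort the fast path and retry locked" (the real code reports that through enum cache_fpl_status):

static int
fpl_vexec_check(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	seqc_t seqc;
	int error;

	VFS_SMR_ASSERT_ENTERED();
	seqc = vn_seqc_read_any(vp);
	if (seqc_in_modify(seqc))
		return (EAGAIN);	/* a writer is active right now */
	error = VOP_FPLOOKUP_VEXEC(vp, cred, td);
	if (error != 0)
		return (error);		/* e.g. EAGAIN on NULL v_data */
	if (!vn_seqc_consistent(vp, seqc))
		return (EAGAIN);	/* vnode modified while we looked */
	return (0);
}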
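
Within ufs_fplookup_vexec, the all_x test is a fast path inside the fast path: when all three execute bits are set, as on a typical 0755 directory, VEXEC is granted to any credential and i_uid/i_gid are never read; only otherwise does vaccess_vexec_smr() perform the ownership-sensitive check. A userland illustration of the bit test:

#include <sys/stat.h>
#include <assert.h>

int
main(void)
{
	const mode_t all_x = S_IXUSR | S_IXGRP | S_IXOTH;	/* 0111 */

	assert((0755 & all_x) == all_x);	/* shortcut: no uid/gid reads */
	assert((0750 & all_x) != all_x);	/* falls back to ownership checks */
	return (0);
}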