diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 6eff71606ac..855c6329356 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -162,6 +163,7 @@ struct namecache_ts { #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_INVALID 0x40 +#define NCF_WIP 0x80 /* * Flags in negstate.neg_flag @@ -179,22 +181,22 @@ cache_ncp_invalidate(struct namecache *ncp) KASSERT((ncp->nc_flag & NCF_INVALID) == 0, ("%s: entry %p already invalid", __func__, ncp)); - ncp->nc_flag |= NCF_INVALID; + atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); atomic_thread_fence_rel(); } /* - * Verify validity of an entry. + * Check whether the entry can be safely used. * * All places which elide locks are supposed to call this after they are * done with reading from an entry. */ static bool -cache_ncp_invalid(struct namecache *ncp) +cache_ncp_canuse(struct namecache *ncp) { atomic_thread_fence_acq(); - return ((ncp->nc_flag & NCF_INVALID) != 0); + return ((ncp->nc_flag & (NCF_INVALID | NCF_WIP)) == 0); } /* @@ -948,16 +950,22 @@ cache_zap_locked(struct namecache *ncp) SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, ncp->nc_name, ncp->nc_vp); TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); - if (ncp == ncp->nc_vp->v_cache_dd) + if (ncp == ncp->nc_vp->v_cache_dd) { + vn_seqc_write_begin(ncp->nc_vp); ncp->nc_vp->v_cache_dd = NULL; + vn_seqc_write_end(ncp->nc_vp); + } } else { SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, ncp->nc_name); cache_negative_remove(ncp); } if (ncp->nc_flag & NCF_ISDOTDOT) { - if (ncp == ncp->nc_dvp->v_cache_dd) + if (ncp == ncp->nc_dvp->v_cache_dd) { + vn_seqc_write_begin(ncp->nc_dvp); ncp->nc_dvp->v_cache_dd = NULL; + vn_seqc_write_end(ncp->nc_dvp); + } } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { @@ -1297,7 +1305,9 @@ cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, mtx_unlock(dvlp2); cache_free(ncp); } else { + vn_seqc_write_begin(dvp); dvp->v_cache_dd = NULL; + vn_seqc_write_end(dvp); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); @@ -1498,7 +1508,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, VOP_UNLOCK(dvp); } if (doing_smr) { - if (cache_ncp_invalid(ncp)) { + if (!cache_ncp_canuse(ncp)) { vfs_smr_exit(); *vpp = NULL; goto retry; @@ -1552,7 +1562,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, */ negstate = NCP2NEGSTATE(ncp); if ((negstate->neg_flag & NEG_HOT) == 0 || - cache_ncp_invalid(ncp)) { + !cache_ncp_canuse(ncp)) { vfs_smr_exit(); doing_smr = false; goto retry_hashed; @@ -1808,6 +1818,7 @@ cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, cache_celockstate_init(&cel); hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); + vn_seqc_write_begin(dvp); ncp = dvp->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); @@ -1816,6 +1827,7 @@ cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, ncp = NULL; } dvp->v_cache_dd = NULL; + vn_seqc_write_end(dvp); cache_enter_unlock(&cel); cache_free(ncp); } @@ -1835,6 +1847,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int flag; int len; u_long lnumcache; + bool want_seqc_end; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, @@ -1876,7 +1889,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); - ncp->nc_flag = flag; + ncp->nc_flag = flag | NCF_WIP; ncp->nc_vp = vp; if (vp == NULL) cache_negative_init(ncp); @@ -1930,7 +1943,9 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); + vn_seqc_write_begin(dvp); dvp->v_cache_dd = ncp; + vn_seqc_write_end(dvp); } if (vp != NULL) { @@ -1941,6 +1956,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, * directory name in it and the name ".." for the * directory's parent. */ + vn_seqc_write_begin(vp); if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd); @@ -1948,9 +1964,18 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, ndd = NULL; } vp->v_cache_dd = ncp; + vn_seqc_write_end(vp); } } else { + want_seqc_end = false; + if (vp->v_cache_dd != NULL) { + vn_seqc_write_begin(vp); + want_seqc_end = true; + } vp->v_cache_dd = NULL; + if (want_seqc_end) { + vn_seqc_write_end(vp); + } } } @@ -1979,13 +2004,19 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, ncp->nc_name); } - atomic_thread_fence_rel(); /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + atomic_thread_fence_rel(); + /* + * Mark the entry as fully constructed. + * It is immutable past this point until its removal. + */ + atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); + cache_enter_unlock(&cel); if (numneg * ncnegfactor > lnumcache) cache_negative_zap_one(); @@ -3136,7 +3167,7 @@ cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, goto out_abort; } - if (__predict_false(cache_ncp_invalid(ncp))) { + if (__predict_false(!cache_ncp_canuse(ncp))) { goto out_abort; } @@ -3382,7 +3413,7 @@ cache_fplookup_next(struct cache_fpl *fpl) if ((nc_flag & NCF_NEGATIVE) != 0) { negstate = NCP2NEGSTATE(ncp); neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); - if (__predict_false(cache_ncp_invalid(ncp))) { + if (__predict_false(!cache_ncp_canuse(ncp))) { return (cache_fpl_partial(fpl)); } if (__predict_false((nc_flag & NCF_WHITE) != 0)) { @@ -3398,7 +3429,7 @@ cache_fplookup_next(struct cache_fpl *fpl) return (cache_fpl_handled(fpl, ENOENT)); } - if (__predict_false(cache_ncp_invalid(ncp))) { + if (__predict_false(!cache_ncp_canuse(ncp))) { return (cache_fpl_partial(fpl)); } @@ -3417,6 +3448,76 @@ cache_fplookup_next(struct cache_fpl *fpl) return (0); } +static int __noinline +cache_fplookup_dotdot(struct cache_fpl *fpl) +{ + struct nameidata *ndp; + struct componentname *cnp; + struct namecache *ncp; + struct vnode *dvp; + struct prison *pr; + u_char nc_flag; + + ndp = fpl->ndp; + cnp = fpl->cnp; + dvp = fpl->dvp; + + /* + * XXX this is racy the same way regular lookup is + */ + for (pr = cnp->cn_cred->cr_prison; pr != NULL; + pr = pr->pr_parent) + if (dvp == pr->pr_root) + break; + + if (dvp == ndp->ni_rootdir || + dvp == ndp->ni_topdir || + dvp == rootvnode || + pr != NULL) { + fpl->tvp = dvp; + fpl->tvp_seqc = vn_seqc_read_any(dvp); + if (seqc_in_modify(fpl->tvp_seqc)) { + return (cache_fpl_aborted(fpl)); + } + return (0); + } + + if ((dvp->v_vflag & VV_ROOT) != 0) { + /* + * TODO + * The opposite of climb mount is needed here. + */ + return (cache_fpl_aborted(fpl)); + } + + ncp = atomic_load_ptr(&dvp->v_cache_dd); + if (ncp == NULL) { + return (cache_fpl_aborted(fpl)); + } + + nc_flag = atomic_load_char(&ncp->nc_flag); + if ((nc_flag & NCF_ISDOTDOT) != 0) { + if (nc_flag & NCF_NEGATIVE) + return (cache_fpl_aborted(fpl)); + fpl->tvp = ncp->nc_vp; + } else { + fpl->tvp = ncp->nc_dvp; + } + + if (__predict_false(!cache_ncp_canuse(ncp))) { + return (cache_fpl_aborted(fpl)); + } + + fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); + if (seqc_in_modify(fpl->tvp_seqc)) { + return (cache_fpl_partial(fpl)); + } + + + counter_u64_add(dotdothits, 1); + return (0); +} + static bool cache_fplookup_mp_supported(struct mount *mp) { @@ -3653,11 +3754,6 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) break; } - if (cnp->cn_flags & ISDOTDOT) { - error = cache_fpl_partial(fpl); - break; - } - VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread); @@ -3682,16 +3778,23 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) break; } - error = cache_fplookup_next(fpl); - if (__predict_false(error != 0)) { - break; - } + if (__predict_false(cnp->cn_flags & ISDOTDOT)) { + error = cache_fplookup_dotdot(fpl); + if (__predict_false(error != 0)) { + break; + } + } else { + error = cache_fplookup_next(fpl); + if (__predict_false(error != 0)) { + break; + } - VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); + VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); - error = cache_fplookup_climb_mount(fpl); - if (__predict_false(error != 0)) { - break; + error = cache_fplookup_climb_mount(fpl); + if (__predict_false(error != 0)) { + break; + } } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index d7a69b49d7f..5e0c801052e 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -6887,7 +6887,7 @@ vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); - VNPASS(vp->v_holdcnt > 0, vp); + VNPASS(!VN_IS_DOOMED(vp) || vp->v_seqc_users > 0 || vp->v_holdcnt > 0, vp); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1)