diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
index c8fc5de..0878e55 100644
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -56,6 +56,7 @@ struct null_node {
 int nullfs_init(struct vfsconf *vfsp);
 int nullfs_uninit(struct vfsconf *vfsp);
 int null_nodeget(struct mount *mp, struct vnode *target, struct vnode **vpp);
+struct vnode *null_hashget(struct mount *mp, struct vnode *lowervp);
 void null_hashrem(struct null_node *xp);
 int null_bypass(struct vop_generic_args *ap);
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index 7e3571d..5f926a6 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -67,7 +67,6 @@ struct mtx null_hashmtx;
 static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table");
 MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part");
 
-static struct vnode * null_hashget(struct mount *, struct vnode *);
 static struct vnode * null_hashins(struct mount *, struct null_node *);
 
 /*
@@ -98,7 +97,7 @@ nullfs_uninit(vfsp)
  * Return a VREF'ed alias for lower vnode if already exists, else 0.
  * Lower vnode should be locked on entry and will be left locked on exit.
  */
-static struct vnode *
+struct vnode *
 null_hashget(mp, lowervp)
        struct mount *mp;
        struct vnode *lowervp;
@@ -209,14 +208,10 @@ null_nodeget(mp, lowervp, vpp)
        struct vnode *vp;
        int error;
 
-       /*
-        * The insmntque1() call below requires the exclusive lock on
-        * the nullfs vnode.
-        */
-       ASSERT_VOP_ELOCKED(lowervp, "lowervp");
-       KASSERT(lowervp->v_usecount >= 1, ("Unreferenced vnode %p\n", lowervp));
+       ASSERT_VOP_LOCKED(lowervp, "lowervp");
+       KASSERT(lowervp->v_usecount >= 1, ("Unreferenced vnode %p", lowervp));
 
-       /* Lookup the hash firstly */
+       /* Look up the hash first. */
        *vpp = null_hashget(mp, lowervp);
        if (*vpp != NULL) {
                vrele(lowervp);
@@ -224,6 +219,19 @@ null_nodeget(mp, lowervp, vpp)
        }
 
        /*
+        * The insmntque1() call below requires the exclusive lock on
+        * the nullfs vnode.  Upgrade the lock now if the hash lookup
+        * failed to provide a ready-to-use vnode.
+        */
+       if (VOP_ISLOCKED(lowervp) != LK_EXCLUSIVE) {
+               vn_lock(lowervp, LK_UPGRADE | LK_RETRY);
+               if ((lowervp->v_iflag & VI_DOOMED) != 0) {
+                       vput(lowervp);
+                       return (ENOENT);
+               }
+       }
+
+       /*
         * We do not serialize vnode creation, instead we will check for
         * duplicates later, when adding new vnode to hash.
         * Note that duplicate can only appear in hash if the lowervp is
@@ -233,8 +241,7 @@ null_nodeget(mp, lowervp, vpp)
         * might cause a bogus v_data pointer to get dereferenced
         * elsewhere if MALLOC should block.
         */
-       xp = malloc(sizeof(struct null_node),
-           M_NULLFSNODE, M_WAITOK);
+       xp = malloc(sizeof(struct null_node), M_NULLFSNODE, M_WAITOK);
 
        error = getnewvnode("null", mp, &null_vnodeops, &vp);
        if (error) {
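The null_nodeget() hunk above is the heart of the shared-lookup support: callers may now arrive with the lower vnode only shared-locked, and the exclusive lock is taken lazily, after the hash lookup misses. The upgrade relies on a standard idiom, sketched below under a hypothetical helper name (not part of the patch): LK_UPGRADE may drop the lock entirely before reacquiring it exclusively, so the vnode can be reclaimed in that window and must be re-validated once the lock is regained.

/*
 * Sketch of the shared-to-exclusive upgrade idiom used by
 * null_nodeget() above.  The helper name is illustrative only.
 */
static int
upgrade_or_bail(struct vnode *vp)
{

        if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
                vn_lock(vp, LK_UPGRADE | LK_RETRY);
                if ((vp->v_iflag & VI_DOOMED) != 0) {
                        /* Reclaimed while the lock was dropped. */
                        vput(vp);
                        return (ENOENT);
                }
        }
        return (0);
}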
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
index 08f722a..9c2851b 100644
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -65,6 +65,7 @@ static vfs_statfs_t    nullfs_statfs;
 static vfs_unmount_t   nullfs_unmount;
 static vfs_vget_t      nullfs_vget;
 static vfs_extattrctl_t        nullfs_extattrctl;
+static vfs_reclaim_lowervp_t nullfs_reclaim_lowervp;
 
 /*
  * Mount null layer
@@ -121,8 +122,10 @@ nullfs_mount(struct mount *mp)
         */
        NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target, curthread);
        error = namei(ndp);
+
        /*
         * Re-lock vnode.
+        * XXXKIB This is deadlock-prone as well.
         */
        if (isvnunlocked)
                vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY);
@@ -146,7 +149,7 @@ nullfs_mount(struct mount *mp)
        }
 
        xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
-           M_NULLFSMNT, M_WAITOK);     /* XXX */
+           M_NULLFSMNT, M_WAITOK);
 
        /*
         * Save reference to underlying FS
@@ -186,10 +189,15 @@ nullfs_mount(struct mount *mp)
        }
        MNT_ILOCK(mp);
        mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
-           (MNTK_MPSAFE | MNTK_SHARED_WRITES);
+           (MNTK_MPSAFE | MNTK_SHARED_WRITES | MNTK_LOOKUP_SHARED |
+           MNTK_EXTENDED_SHARED);
+       mp->mnt_kern_flag |= MNTK_LOOKUP_EXCL_DOTDOT;
        MNT_IUNLOCK(mp);
        mp->mnt_data = xmp;
        vfs_getnewfsid(mp);
+       MNT_ILOCK(xmp->nullm_vfs);
+       TAILQ_INSERT_TAIL(&xmp->nullm_vfs->mnt_uppers, mp, mnt_upper_link);
+       MNT_IUNLOCK(xmp->nullm_vfs);
 
        vfs_mountedfrom(mp, target);
 
@@ -206,14 +214,16 @@ nullfs_unmount(mp, mntflags)
        struct mount *mp;
        int mntflags;
 {
-       void *mntdata;
-       int error;
-       int flags = 0;
+       struct null_mount *mntdata;
+       struct mount *ump;
+       int error, flags;
 
        NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
 
        if (mntflags & MNT_FORCE)
-               flags |= FORCECLOSE;
+               flags = FORCECLOSE;
+       else
+               flags = 0;
 
        /* There is 1 extra root vnode reference (nullm_rootvp). */
        error = vflush(mp, 1, flags, curthread);
@@ -224,9 +234,17 @@ nullfs_unmount(mp, mntflags)
         * Finally, throw away the null_mount structure
         */
        mntdata = mp->mnt_data;
+       ump = mntdata->nullm_vfs;
+       MNT_ILOCK(ump);
+       while ((ump->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) {
+               ump->mnt_kern_flag |= MNTK_VGONE_WAITER;
+               msleep(&ump->mnt_uppers, &ump->mnt_mtx, 0, "vgnupw", 0);
+       }
+       TAILQ_REMOVE(&ump->mnt_uppers, mp, mnt_upper_link);
+       MNT_IUNLOCK(ump);
        mp->mnt_data = NULL;
        free(mntdata, M_NULLFSMNT);
-       return 0;
+       return (0);
 }
 
 static int
@@ -316,13 +334,10 @@ nullfs_vget(mp, ino, flags, vpp)
 
        KASSERT((flags & LK_TYPE_MASK) != 0,
            ("nullfs_vget: no lock requested"));
-       flags &= ~LK_TYPE_MASK;
-       flags |= LK_EXCLUSIVE;
 
        error = VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, flags, vpp);
-       if (error)
+       if (error != 0)
                return (error);
-
        return (null_nodeget(mp, *vpp, vpp));
 }
 
@@ -334,11 +349,11 @@ nullfs_fhtovp(mp, fidp, flags, vpp)
        struct vnode **vpp;
 {
        int error;
-       error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, LK_EXCLUSIVE,
+
+       error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, flags,
            vpp);
-       if (error)
+       if (error != 0)
                return (error);
-
        return (null_nodeget(mp, *vpp, vpp));
 }
 
@@ -350,10 +365,22 @@ nullfs_extattrctl(mp, cmd, filename_vp, namespace, attrname)
        int namespace;
        const char *attrname;
 {
-       return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, filename_vp,
-           namespace, attrname);
+
+       return (VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd,
+           filename_vp, namespace, attrname));
 }
 
+static void
+nullfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp)
+{
+       struct vnode *vp;
+
+       vp = null_hashget(mp, lowervp);
+       if (vp == NULL)
+               return;
+       vgone(vp);
+       vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
+}
 
 static struct vfsops null_vfsops = {
        .vfs_extattrctl =       nullfs_extattrctl,
@@ -367,6 +394,7 @@ static struct vfsops null_vfsops = {
        .vfs_uninit =           nullfs_uninit,
        .vfs_unmount =          nullfs_unmount,
        .vfs_vget =             nullfs_vget,
+       .vfs_reclaim_lowervp =  nullfs_reclaim_lowervp,
 };
 
 VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL);
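nullfs_mount() and nullfs_unmount() above form the two halves of the new upper-mount registration protocol. A sketch of the same protocol as stand-alone helpers, assuming the mnt_uppers list and MNTK_VGONE_* flags introduced in sys/sys/mount.h below (the helper names are hypothetical; "lower" is the mount being stacked over):

static void
stacked_register(struct mount *mp, struct mount *lower)
{

        MNT_ILOCK(lower);
        TAILQ_INSERT_TAIL(&lower->mnt_uppers, mp, mnt_upper_link);
        MNT_IUNLOCK(lower);
}

static void
stacked_unregister(struct mount *mp, struct mount *lower)
{

        MNT_ILOCK(lower);
        /*
         * An unmount must not remove itself from mnt_uppers while
         * the lower mount is walking the list in vgonel(); wait for
         * any in-flight notification to drain first.
         */
        while ((lower->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) {
                lower->mnt_kern_flag |= MNTK_VGONE_WAITER;
                msleep(&lower->mnt_uppers, &lower->mnt_mtx, 0,
                    "vgnupw", 0);
        }
        TAILQ_REMOVE(&lower->mnt_uppers, mp, mnt_upper_link);
        MNT_IUNLOCK(lower);
}

The wait loop is what keeps the notification walk safe: an upper mount can never disappear from mnt_uppers while the lower mount is in the middle of notifying it.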
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index 6f40233..5fb020b 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -665,33 +665,19 @@ null_unlock(struct vop_unlock_args *ap)
 }
 
 /*
- * There is no way to tell that someone issued remove/rmdir operation
- * on the underlying filesystem. For now we just have to release lowervp
- * as soon as possible.
- *
- * Note, we can't release any resources nor remove vnode from hash before
- * appropriate VXLOCK stuff is done because other process can find this
- * vnode in hash during inactivation and may be sitting in vget() and waiting
- * for null_inactive to unlock vnode. Thus we will do all those in VOP_RECLAIM.
+ * Nothing to do on inactivation: the nullfs vnode stays cached and
+ * is destroyed only when its lower vnode is reclaimed.
  */
 static int
-null_inactive(struct vop_inactive_args *ap)
+null_inactive(struct vop_inactive_args *ap __unused)
 {
-       struct vnode *vp = ap->a_vp;
-
-       vp->v_object = NULL;
-
-       /*
-        * If this is the last reference, then free up the vnode
-        * so as not to tie up the lower vnodes.
-        */
-       vrecycle(vp);
-
        return (0);
 }
 
 /*
- * Now, the VXLOCK is in force and we're free to destroy the null vnode.
+ * Now the nullfs vnode and, because of the lock sharing, the lower
+ * vnode too are exclusively locked, and we are free to destroy the
+ * null vnode.
  */
 static int
 null_reclaim(struct vop_reclaim_args *ap)
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index 29ad019..d53dbd0 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -406,11 +406,13 @@ namei(struct nameidata *ndp)
 }
 
 static int
-compute_cn_lkflags(struct mount *mp, int lkflags)
+compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
 {
 
-       if (mp == NULL ||
-           ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
+       if (mp == NULL || ((lkflags & LK_SHARED) &&
+           (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
+           ((cnflags & ISDOTDOT) &&
+           (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
                lkflags &= ~LK_SHARED;
                lkflags |= LK_EXCLUSIVE;
        }
@@ -539,7 +541,8 @@ lookup(struct nameidata *ndp)
        dp = ndp->ni_startdir;
        ndp->ni_startdir = NULLVP;
        vn_lock(dp,
-           compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY));
+           compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
+           cnp->cn_flags));
 
 dirloop:
        /*
@@ -700,7 +703,7 @@ dirloop:
                        VFS_UNLOCK_GIANT(tvfslocked);
                        vn_lock(dp,
                            compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
-                           LK_RETRY));
+                           LK_RETRY, ISDOTDOT));
                }
        }
 
@@ -738,7 +741,8 @@ unionlookup:
                vprint("lookup in", dp);
 #endif
        lkflags_save = cnp->cn_lkflags;
-       cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
+       cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
+           cnp->cn_flags);
        if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
                cnp->cn_lkflags = lkflags_save;
                KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
@@ -757,7 +761,7 @@ unionlookup:
                        VFS_UNLOCK_GIANT(tvfslocked);
                        vn_lock(dp,
                            compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
-                           LK_RETRY));
+                           LK_RETRY, cnp->cn_flags));
                        goto unionlookup;
                }
 
@@ -829,8 +833,8 @@ unionlookup:
                        dvfslocked = 0;
                        vref(vp_crossmp);
                        ndp->ni_dvp = vp_crossmp;
-                       error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags),
-                           &tdp);
+                       error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
+                           cnp->cn_flags), &tdp);
                        vfs_unbusy(mp);
                        if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
                                panic("vp_crossmp exclusively locked or reclaimed");
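The compute_cn_lkflags() change encodes a three-way decision: a shared vnode lock is used only if the filesystem opted in with MNTK_LOOKUP_SHARED, and even then a ".." component falls back to the exclusive lock when the filesystem set MNTK_LOOKUP_EXCL_DOTDOT, as nullfs now does. The same logic as a stand-alone predicate (hypothetical name, for illustration only):

/*
 * May this lookup step take the vnode lock shared?  Mirrors the
 * conditions tested in compute_cn_lkflags() above.
 */
static int
lookup_may_share(struct mount *mp, int cnflags)
{

        if (mp == NULL)
                return (0);
        if ((mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) == 0)
                return (0);
        if ((cnflags & ISDOTDOT) != 0 &&
            (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT) != 0)
                return (0);
        return (1);
}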
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index e9d3abe..f397842 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -481,6 +481,7 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
        mac_mount_create(cred, mp);
 #endif
        arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
+       TAILQ_INIT(&mp->mnt_uppers);
        return (mp);
 }
 
@@ -514,6 +515,7 @@ vfs_mount_destroy(struct mount *mp)
                        vprint("", vp);
                panic("unmount: dangling vnode");
        }
+       KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
        if (mp->mnt_nvnodelistsize != 0)
                panic("vfs_mount_destroy: nonzero nvnodelistsize");
        if (mp->mnt_activevnodelistsize != 0)
@@ -1275,7 +1277,8 @@ dounmount(mp, flags, td)
        }
 
        MNT_ILOCK(mp);
-       if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+       if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
+           !TAILQ_EMPTY(&mp->mnt_uppers)) {
                MNT_IUNLOCK(mp);
                if (coveredvp)
                        VOP_UNLOCK(coveredvp, 0);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 631d3f2..06d16c0 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2688,6 +2688,58 @@ vgone(struct vnode *vp)
        VI_UNLOCK(vp);
 }
 
+static void
+vgonel_reclaim_lowervp_vfs(struct mount *mp __unused,
+    struct vnode *lowervp __unused)
+{
+}
+
+/*
+ * Notify the upper mounts that the lower vnode is being reclaimed.
+ */
+static void
+vgonel_reclaim_lowervp(struct vnode *vp)
+{
+       static struct vfsops vgonel_vfsops = {
+               .vfs_reclaim_lowervp = vgonel_reclaim_lowervp_vfs
+       };
+       struct mount *mp, *ump, *mmp;
+
+       mp = vp->v_mount;
+       if (mp == NULL)
+               return;
+
+       MNT_ILOCK(mp);
+       if (TAILQ_EMPTY(&mp->mnt_uppers))
+               goto unlock;
+       MNT_IUNLOCK(mp);
+       mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
+       mmp->mnt_op = &vgonel_vfsops;
+       mmp->mnt_kern_flag |= MNTK_MARKER;
+       MNT_ILOCK(mp);
+       mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
+       for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
+               if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
+                       ump = TAILQ_NEXT(ump, mnt_upper_link);
+                       continue;
+               }
+               TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
+               MNT_IUNLOCK(mp);
+               VFS_RECLAIM_LOWERVP(ump, vp);
+               MNT_ILOCK(mp);
+               ump = TAILQ_NEXT(mmp, mnt_upper_link);
+               TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
+       }
+       free(mmp, M_TEMP);
+       mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
+       if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
+               mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
+               wakeup(&mp->mnt_uppers);
+       }
+unlock:
+       MNT_IUNLOCK(mp);
+}
+
 /*
  * vgone, with the vp interlock held.
  */
@@ -2712,6 +2764,7 @@ vgonel(struct vnode *vp)
        if (vp->v_iflag & VI_DOOMED)
                return;
        vp->v_iflag |= VI_DOOMED;
+
        /*
         * Check to see if the vnode is in use.  If so, we have to call
         * VOP_CLOSE() and VOP_INACTIVE().
@@ -2719,6 +2772,8 @@ vgonel(struct vnode *vp)
        active = vp->v_usecount;
        oweinact = (vp->v_iflag & VI_OWEINACT);
        VI_UNLOCK(vp);
+       vgonel_reclaim_lowervp(vp);
+
        /*
         * Clean out any buffers associated with the vnode.
         * If the flush fails, just toss the buffers.
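vgonel_reclaim_lowervp() above must drop the mount interlock around each VFS_RECLAIM_LOWERVP() call, since the hook locks vnodes and may sleep; the fake MNTK_MARKER mount pins the iteration position across the unlock. The same walk restated as a stand-alone helper with the protocol spelled out (hypothetical name; "marker" is the preallocated fake mount, and the MNTK_VGONE_UPPER bracketing is omitted for brevity):

static void
notify_uppers(struct mount *mp, struct vnode *vp, struct mount *marker)
{
        struct mount *ump;

        MNT_ILOCK(mp);
        for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
                if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
                        /* A marker left by a concurrent walker. */
                        ump = TAILQ_NEXT(ump, mnt_upper_link);
                        continue;
                }
                /* Pin the position, then drop the interlock. */
                TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, marker,
                    mnt_upper_link);
                MNT_IUNLOCK(mp);
                /* May sleep; the marker keeps the cursor valid. */
                VFS_RECLAIM_LOWERVP(ump, vp);
                MNT_ILOCK(mp);
                /* Resume after the marker and unlink it. */
                ump = TAILQ_NEXT(marker, mnt_upper_link);
                TAILQ_REMOVE(&mp->mnt_uppers, marker, mnt_upper_link);
        }
        MNT_IUNLOCK(mp);
}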
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 4b83ba4..833ac0f 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -188,6 +188,8 @@ struct mount {
 #define        mnt_endzero     mnt_gjprovider
        char            *mnt_gjprovider;        /* gjournal provider name */
        struct lock     mnt_explock;            /* vfs_export walkers lock */
+       TAILQ_ENTRY(mount) mnt_upper_link;      /* (m) entry in lower's uppers */
+       TAILQ_HEAD(, mount) mnt_uppers;         /* (m) upper mounts over us */
 };
 
 /*
@@ -373,6 +375,10 @@ void          __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
 #define        MNTK_NO_IOPF    0x00000100      /* Disallow page faults during reads
                                           and writes. Filesystem shall properly
                                           handle i/o state on EFAULT. */
+#define        MNTK_VGONE_UPPER        0x00000200 /* notifying upper mounts */
+#define        MNTK_VGONE_WAITER       0x00000400 /* waiting for the notification */
+#define        MNTK_MARKER             0x00000800 /* mnt_uppers iteration marker */
+#define        MNTK_LOOKUP_EXCL_DOTDOT 0x00001000 /* dotdot needs exclusive lock */
 #define MNTK_NOASYNC   0x00800000      /* disable async */
 #define MNTK_UNMOUNT   0x01000000      /* unmount in progress */
 #define        MNTK_MWAIT      0x02000000      /* waiting for unmount to finish */
@@ -628,6 +634,7 @@ typedef int vfs_mount_t(struct mount *mp);
 typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op,
     struct sysctl_req *req);
 typedef void vfs_susp_clean_t(struct mount *mp);
+typedef void vfs_reclaim_lowervp_t(struct mount *mp, struct vnode *lowervp);
 
 struct vfsops {
        vfs_mount_t             *vfs_mount;
@@ -645,6 +652,7 @@ struct vfsops {
        vfs_extattrctl_t        *vfs_extattrctl;
        vfs_sysctl_t            *vfs_sysctl;
        vfs_susp_clean_t        *vfs_susp_clean;
+       vfs_reclaim_lowervp_t   *vfs_reclaim_lowervp;
 };
 
 vfs_statfs_t   __vfs_statfs;
@@ -670,6 +678,9 @@ vfs_statfs_t   __vfs_statfs;
 #define        VFS_SUSP_CLEAN(MP) \
        ({if (*(MP)->mnt_op->vfs_susp_clean != NULL) \
               (*(MP)->mnt_op->vfs_susp_clean)(MP); })
+#define        VFS_RECLAIM_LOWERVP(MP, VP) \
+       ({if (*(MP)->mnt_op->vfs_reclaim_lowervp != NULL) \
+              (*(MP)->mnt_op->vfs_reclaim_lowervp)((MP), (VP)); })
 
 #define VFS_NEEDSGIANT_(MP) \
        ((MP) != NULL && ((MP)->mnt_kern_flag & MNTK_MPSAFE) == 0)
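For a stacked filesystem author, the new hook reduces to the contract visible in nullfs_reclaim_lowervp() above: it is invoked with lowervp exclusively locked, it should destroy the cached upper alias if one exists, and because vgone() on a lock-sharing alias leaves lowervp unlocked, it must relock lowervp before returning. A minimal sketch for a hypothetical "stackfs"; stackfs_hashget() stands in for whatever lower-to-upper mapping the filesystem keeps:

static void
stackfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp)
{
        struct vnode *vp;

        /* Find our alias over lowervp, if one is cached. */
        vp = stackfs_hashget(mp, lowervp);
        if (vp == NULL)
                return;
        /*
         * Destroy the alias.  Since the upper vnode shares the lower
         * vnode's lock, vgone() leaves lowervp unlocked, while the
         * contract requires it locked on return.
         */
        vgone(vp);
        vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
}

static struct vfsops stackfs_vfsops = {
        /* ... the usual mount/unmount/root methods ... */
        .vfs_reclaim_lowervp =  stackfs_reclaim_lowervp,
};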