--- //depot/projects/smpng/sys/kern/vfs_cache.c 2008/08/25 16:33:41 +++ //depot/user/jhb/lock/kern/vfs_cache.c 2008/09/15 22:13:44 @@ -304,7 +309,9 @@ * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative cacheing), a status of ENOENT is returned. If the lookup - * fails, a status of zero is returned. + * fails, a status of zero is returned. If the directory vnode is + * recycled out from under us due to a forced unmount, a status of + * EBADF is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is @@ -422,16 +447,24 @@ */ if (dvp == *vpp) { /* lookup on "." */ VREF(*vpp); CACHE_UNLOCK(); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; - if (ltype == VOP_ISLOCKED(*vpp)) - return (-1); - else if (ltype == LK_EXCLUSIVE) - vn_lock(*vpp, LK_UPGRADE | LK_RETRY); + if (ltype != VOP_ISLOCKED(*vpp)) { + if (ltype == LK_EXCLUSIVE) { + vn_lock(*vpp, LK_UPGRADE | LK_RETRY); + if ((*vpp)->v_iflag & VI_DOOMED) { + /* forced unmount */ + vrele(*vpp); + *vpp = NULL; + return (EBADF); + } + } else + vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); + } return (-1); } ltype = 0; /* silence gcc warning */ @@ -671,9 +720,9 @@ error = cache_lookup(dvp, vpp, cnp); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); - if (error == ENOENT) - return (error); - return (0); + if (error == -1) + return (0); + return (error); } --- //depot/projects/smpng/sys/kern/vfs_lookup.c 2008/07/25 18:20:23 +++ //depot/user/jhb/lock/kern/vfs_lookup.c 2008/09/15 22:13:44 @@ -430,6 +430,10 @@ ndp->ni_startdir = NULLVP; vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = EBADF; + goto bad; + } dirloop: /* @@ -556,10 +560,6 @@ } if ((dp->v_vflag & VV_ROOT) == 0) break; - if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ - error = EBADF; - goto bad; - } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; tvfslocked = dvfslocked; @@ -570,6 +570,10 @@ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = EBADF; + goto bad; + } } } @@ -594,8 +598,13 @@ */ if (dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED && - (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT)) + (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT)) { vn_lock(dp, LK_UPGRADE|LK_RETRY); + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = EBADF; + goto bad; + } + } /* * If we're looking up the last component and we need an exclusive * lock, adjust our lkflags. @@ -627,6 +636,10 @@ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = EBADF; + goto bad; + } goto unionlookup; } @@ -804,6 +817,18 @@ if ((cnp->cn_flags & (ISLASTCN | LOCKSHARED | LOCKLEAF)) == (ISLASTCN | LOCKLEAF) && VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { vn_lock(dp, LK_UPGRADE | LK_RETRY); + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = EBADF; + /* + * XXX: Always bad2? The !wantparent case + * above could have already dropped the + * reference. Perhaps better would be to set + * a flag to indicate if a reference is held + * on the parent dir and to use that at bad2 + * and merge the bad and bad2 labels. + */ + goto bad2; + } } if (vfslocked && dvfslocked) VFS_UNLOCK_GIANT(dvfslocked); /* Only need one */ @@ -850,6 +875,10 @@ dp = dvp; cnp->cn_lkflags = LK_EXCLUSIVE; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = EBADF; + goto bad; + } /* * Search a new directory. --- //depot/projects/smpng/sys/nfsclient/nfs_vnops.c 2008/08/25 16:33:41 +++ //depot/user/jhb/lock/nfsclient/nfs_vnops.c 2008/09/15 22:13:44 @@ -869,7 +869,16 @@ *vpp = NULLVP; return (error); } - if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { + error = cache_lookup(dvp, vpp, cnp); + + /* + * XXX: Ignoring ENOENT means no negative name cache entries. + * We do this on purpose because we don't have a way to expire + * them since name cache entries don't have timestamps. + */ + if (error > 0 && error != ENOENT) + return (error); + if (error == -1) { struct vattr vattr; newvp = *vpp; --- //depot/projects/smpng/sys/ufs/ffs/ffs_vfsops.c 2008/08/25 16:33:41 +++ //depot/user/jhb/lock/ufs/ffs/ffs_vfsops.c 2008/08/29 15:04:03 @@ -852,7 +852,7 @@ * Initialize filesystem stat information in mount struct. */ MNT_ILOCK(mp); - mp->mnt_kern_flag |= MNTK_MPSAFE; + mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART --- //depot/projects/smpng/sys/ufs/ufs/dirhash.h 2008/07/25 18:20:23 +++ //depot/user/jhb/lock/ufs/ufs/dirhash.h 2008/09/10 17:22:58 @@ -28,6 +28,9 @@ #ifndef _UFS_UFS_DIRHASH_H_ #define _UFS_UFS_DIRHASH_H_ +#include +#include + /* * For fast operations on large directories, we maintain a hash * that maps the file name to the offset of the directory entry within @@ -80,7 +83,8 @@ ((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK]) struct dirhash { - struct lock dh_lock; /* protects all fields except list & score */ + struct sx dh_lock; /* protects all fields except list & score */ + int dh_refcount; doff_t **dh_hash; /* the hash array (2-level) */ int dh_narrays; /* number of entries in dh_hash */ --- //depot/projects/smpng/sys/ufs/ufs/ufs_dirhash.c 2008/07/25 18:20:23 +++ //depot/user/jhb/lock/ufs/ufs/ufs_dirhash.c 2008/09/10 17:22:58 @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -47,7 +46,9 @@ #include #include #include +#include #include +#include #include #include @@ -98,7 +99,7 @@ #define DIRHASH_BLKALLOC_WAITOK() uma_zalloc(ufsdirhash_zone, M_WAITOK) #define DIRHASH_BLKFREE(ptr) uma_zfree(ufsdirhash_zone, (ptr)) #define DIRHASH_ASSERT_LOCKED(dh) \ - lockmgr_assert(&(dh)->dh_lock, KA_LOCKED) + sx_assert(&(dh)->dh_lock, SA_LOCKED) /* Dirhash list; recently-used entries are near the tail. */ static TAILQ_HEAD(, dirhash) ufsdirhash_list; @@ -111,7 +112,12 @@ * * The relationship between inode and dirhash is protected either by an * exclusive vnode lock or the vnode interlock where a shared vnode lock - * may be used. The dirhash_mtx is acquired after the dirhash lock. + * may be used. The dirhash_mtx is acquired after the dirhash lock. To + * handle teardown races, code wishing to lock the dirhash for an inode + * when using a shared vnode lock must obtain a private reference on the + * dirhash while holding the vnode interlock. They can drop it once they + * have obtained the dirhash lock and verified that the dirhash wasn't + * recycled while they waited for the dirhash lock. * * ufsdirhash_build() acquires a shared lock on the dirhash when it is * successful. This lock is released after a call to ufsdirhash_lookup(). @@ -122,6 +128,23 @@ * The dirhash lock may be held across io operations. */ +static void +ufsdirhash_hold(struct dirhash *dh) +{ + + refcount_acquire(&dh->dh_refcount); +} + +static void +ufsdirhash_drop(struct dirhash *dh) +{ + + if (refcount_release(&dh->dh_refcount)) { + sx_destroy(&dh->dh_lock); + free(dh, M_DIRHASH); + } +} + /* * Release the lock on a dirhash. */ @@ -129,7 +152,7 @@ ufsdirhash_release(struct dirhash *dh) { - lockmgr(&dh->dh_lock, LK_RELEASE, 0); + sx_unlock(&dh->dh_lock); } /* @@ -157,8 +180,9 @@ M_NOWAIT | M_ZERO); if (ndh == NULL) return (NULL); - lockinit(&ndh->dh_lock, PRIBIO, "dirhash", 0, 0); - lockmgr(&ndh->dh_lock, LK_EXCLUSIVE, NULL); + refcount_init(&ndh->dh_refcount, 1); + sx_init(&ndh->dh_lock, "dirhash"); + sx_xlock(&ndh->dh_lock); } /* * Check i_dirhash. If it's NULL just try to use a @@ -173,31 +197,39 @@ continue; return (ndh); } - /* Try to acquire shared on existing hashes. */ - if (lockmgr(&dh->dh_lock, LK_SHARED | LK_INTERLOCK, - VI_MTX(vp))) - continue; + ufsdirhash_hold(dh); + VI_UNLOCK(vp); + + /* Acquire a shared lock on existing hashes. */ + sx_slock(&dh->dh_lock); + /* The hash could've been recycled while we were waiting. */ + VI_LOCK(vp); if (ip->i_dirhash != dh) { + VI_UNLOCK(vp); ufsdirhash_release(dh); + ufsdirhash_drop(dh); continue; } + VI_UNLOCK(vp); + ufsdirhash_drop(dh); + /* If the hash is still valid we've succeeded. */ if (dh->dh_hash != NULL) break; /* * If the hash is NULL it has been recycled. Try to upgrade - * so we can recreate it. If we fail the upgrade another - * thread must've already exclusively locked it. + * so we can recreate it. If we fail the upgrade, drop our + * lock and try again. */ - if (lockmgr(&dh->dh_lock, LK_UPGRADE | LK_SLEEPFAIL, NULL) == 0) + if (sx_try_upgrade(&dh->dh_lock)) break; + sx_sunlock(&dh->dh_lock); } /* Free the preallocated structure if it was not necessary. */ if (ndh) { - lockmgr(&ndh->dh_lock, LK_RELEASE, NULL); - lockdestroy(&ndh->dh_lock); - FREE(ndh, M_DIRHASH); + ufsdirhash_release(ndh); + ufsdirhash_drop(ndh); } return (dh); } @@ -219,7 +251,7 @@ dh = ip->i_dirhash; if (dh == NULL) return (NULL); - lockmgr(&dh->dh_lock, LK_EXCLUSIVE, 0); + sx_xlock(&dh->dh_lock); if (dh->dh_hash != NULL) return (dh); ufsdirhash_free_locked(ip); @@ -238,18 +270,34 @@ vp = ip->i_vnode; for (;;) { + /* Grab a reference on this inode's dirhash if it has one. */ VI_LOCK(vp); dh = ip->i_dirhash; if (dh == NULL) { VI_UNLOCK(vp); return; } - if (lockmgr(&dh->dh_lock, LK_EXCLUSIVE | LK_INTERLOCK, - VI_MTX(vp))) - continue; - if (ip->i_dirhash == dh) + ufsdirhash_hold(dh); + VI_UNLOCK(vp); + + /* Exclusively lock the dirhash. */ + sx_xlock(&dh->dh_lock); + + /* If this dirhash still belongs to this inode, then free it. */ + VI_LOCK(vp); + if (ip->i_dirhash == dh) { + VI_UNLOCK(vp); + ufsdirhash_drop(dh); break; + } + VI_UNLOCK(vp); + + /* + * This inode's dirhash has changed while we were + * waiting for the dirhash lock, so try again. + */ ufsdirhash_release(dh); + ufsdirhash_drop(dh); } ufsdirhash_free_locked(ip); } @@ -382,7 +430,7 @@ TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 1; DIRHASHLIST_UNLOCK(); - lockmgr(&dh->dh_lock, LK_DOWNGRADE, 0); + sx_downgrade(&dh->dh_lock); return (0); fail: @@ -401,6 +449,7 @@ int i; DIRHASH_ASSERT_LOCKED(ip->i_dirhash); + /* * Clear the pointer in the inode to prevent new threads from * finding the dead structure. @@ -410,12 +459,15 @@ dh = ip->i_dirhash; ip->i_dirhash = NULL; VI_UNLOCK(vp); + /* - * Drain waiters. They will abort when they see that ip->i_dirhash - * is NULL after locking. + * At this point, any waiters for the lock should hold their + * own reference on the dirhash structure. They will drop + * that reference once they grab the vnode interlock and see + * that ip->i_dirhash is NULL. */ - lockmgr(&dh->dh_lock, LK_RELEASE, 0); - lockmgr(&dh->dh_lock, LK_DRAIN, 0); + sx_xunlock(&dh->dh_lock); + /* * Handle partially recycled as well as fully constructed hashes. */ @@ -432,14 +484,11 @@ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); ufs_dirhashmem -= dh->dh_memreq; DIRHASHLIST_UNLOCK(); + /* - * Release the lock and reclaim datastructure memory. + * Drop the inode's reference to the data structure. */ - lockmgr(&dh->dh_lock, LK_RELEASE, 0); - lockdestroy(&dh->dh_lock); - FREE(dh, M_DIRHASH); - - return; + ufsdirhash_drop(dh); } /* @@ -1098,7 +1147,7 @@ * If we can't lock it it's in use and we don't want to * recycle it anyway. */ - if (lockmgr(&dh->dh_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { + if (!sx_try_xlock(&dh->dh_lock)) { dh = TAILQ_NEXT(dh, dh_list); continue; } --- //depot/projects/smpng/sys/ufs/ufs/ufs_lookup.c 2008/07/31 20:12:34 +++ //depot/user/jhb/lock/ufs/ufs/ufs_lookup.c 2008/09/10 20:07:10 @@ -362,14 +362,12 @@ slotstatus = FOUND; slotoffset = i_offset; slotsize = ep->d_reclen; - dp->i_reclen = slotsize; enduseful = dp->i_size; ap->a_cnp->cn_flags |= ISWHITEOUT; numdirpasses--; goto notfound; } ino = ep->d_ino; - dp->i_reclen = ep->d_reclen; goto found; } } @@ -390,7 +388,6 @@ endsearch = i_diroff; goto searchloop; } - dp->i_offset = i_offset; if (bp != NULL) brelse(bp); /* @@ -468,6 +465,7 @@ */ if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { ufs_dirbad(dp, i_offset, "i_size too small"); + /* XXX: This needs an exclusive lock, but we panic above. */ dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep); DIP_SET(dp, i_size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; @@ -482,13 +480,13 @@ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1); - dp->i_offset = i_offset; /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { - ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); + if (flags & LOCKPARENT) + ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); /* * Write access to directory required to delete files. */ @@ -500,7 +498,13 @@ * and distance past previous entry (if there * is a previous entry in this block) in dp->i_count. * Save directory inode pointer in ndp->ni_dvp for dirremove(). + * + * Technically we shouldn't be setting these in the + * WANTPARENT case (first lookup in rename()), but any + * lookups that will result in directory changes will + * overwrite these. */ + dp->i_offset = i_offset; if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) dp->i_count = 0; else @@ -542,6 +546,7 @@ * Careful about locking second inode. * This can only occur if the target is ".". */ + dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); if ((error = VFS_VGET(vdp->v_mount, ino, @@ -999,7 +1004,7 @@ int isrmdir; { struct inode *dp; - struct direct *ep; + struct direct *ep, *rep; struct buf *bp; int error; @@ -1020,14 +1025,19 @@ if ((error = UFS_BLKATOFF(dvp, (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) return (error); + + /* Set 'rep' to the entry being removed. */ + if (dp->i_count == 0) + rep = ep; + else + rep = (struct direct *)((char *)ep + ep->d_reclen); #ifdef UFS_DIRHASH /* * Remove the dirhash entry. This is complicated by the fact * that `ep' is the previous entry when dp->i_count != 0. */ if (dp->i_dirhash != NULL) - ufsdirhash_remove(dp, (dp->i_count == 0) ? ep : - (struct direct *)((char *)ep + ep->d_reclen), dp->i_offset); + ufsdirhash_remove(dp, rep, dp->i_offset); #endif if (dp->i_count == 0) { /* @@ -1038,7 +1048,7 @@ /* * Collapse new free space into previous entry. */ - ep->d_reclen += dp->i_reclen; + ep->d_reclen += rep->d_reclen; } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) --- //depot/projects/smpng/sys/ufs/ufs/ufs_vnops.c 2008/08/25 16:33:41 +++ //depot/user/jhb/lock/ufs/ufs/ufs_vnops.c 2008/09/13 15:19:43 @@ -335,13 +335,12 @@ vhold(vp); vn_lock(vp, LK_UPGRADE | LK_RETRY); VI_LOCK(vp); - vdropl(vp); if (vp->v_iflag & VI_DOOMED) { - VI_UNLOCK(vp); + vdropl(vp); error = ENOENT; goto relock; } - VI_UNLOCK(vp); + vdropl(vp); } else relocked = 0; error = getinoquota(ip);