Index: fs/coda/coda_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/fs/coda/coda_vnops.c,v
retrieving revision 1.97
diff -u -r1.97 coda_vnops.c
--- fs/coda/coda_vnops.c	15 Feb 2008 11:58:11 -0000	1.97
+++ fs/coda/coda_vnops.c	6 Apr 2008 10:09:24 -0000
@@ -134,7 +134,7 @@
 	.vop_inactive = coda_inactive,		/* inactive */
 	.vop_reclaim = coda_reclaim,		/* reclaim */
 	.vop_lock1 = coda_lock,			/* lock */
-	.vop_unlock = coda_unlock,		/* unlock */
+	.vop_unlock1 = coda_unlock,		/* unlock */
 	.vop_bmap = VOP_EOPNOTSUPP,		/* bmap */
 	.vop_print = VOP_NULL,			/* print */
 	.vop_islocked = coda_islocked,		/* islocked */
@@ -1581,7 +1581,7 @@
 }
 
 int
-coda_unlock(struct vop_unlock_args *ap)
+coda_unlock(struct vop_unlock1_args *ap)
 {
 	/* true args */
 	struct vnode *vp = ap->a_vp;
Index: fs/coda/coda_vnops.h
===================================================================
RCS file: /home/ncvs/src/sys/fs/coda/coda_vnops.h,v
retrieving revision 1.24
diff -u -r1.24 coda_vnops.h
--- fs/coda/coda_vnops.h	13 Feb 2008 13:06:22 -0000	1.24
+++ fs/coda/coda_vnops.h	6 Apr 2008 10:09:24 -0000
@@ -73,7 +73,7 @@
 vop_strategy_t	coda_strategy;
 vop_reclaim_t	coda_reclaim;
 vop_lock1_t	coda_lock;
-vop_unlock_t	coda_unlock;
+vop_unlock1_t	coda_unlock;
 vop_islocked_t	coda_islocked;
 vop_pathconf_t	coda_pathconf;
 
Index: fs/nullfs/null_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/fs/nullfs/null_vnops.c,v
retrieving revision 1.99
diff -u -r1.99 null_vnops.c
--- fs/nullfs/null_vnops.c	25 Feb 2008 18:45:55 -0000	1.99
+++ fs/nullfs/null_vnops.c	6 Apr 2008 10:09:25 -0000
@@ -581,7 +581,7 @@
  * vnodes below us on the stack.
  */
 static int
-null_unlock(struct vop_unlock_args *ap)
+null_unlock(struct vop_unlock1_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int flags = ap->a_flags;
@@ -744,6 +744,6 @@
 	.vop_rename =		null_rename,
 	.vop_setattr =		null_setattr,
 	.vop_strategy =		VOP_EOPNOTSUPP,
-	.vop_unlock =		null_unlock,
+	.vop_unlock1 =		null_unlock,
 	.vop_vptofh =		null_vptofh,
 };
Index: fs/unionfs/union_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/fs/unionfs/union_vnops.c,v
retrieving revision 1.153
diff -u -r1.153 union_vnops.c
--- fs/unionfs/union_vnops.c	25 Feb 2008 18:45:56 -0000	1.153
+++ fs/unionfs/union_vnops.c	6 Apr 2008 10:09:25 -0000
@@ -1767,7 +1767,7 @@
 }
 
 static int
-unionfs_unlock(struct vop_unlock_args *ap)
+unionfs_unlock(struct vop_unlock1_args *ap)
 {
 	int		error;
 	int		flags;
@@ -2318,7 +2318,7 @@
 	.vop_setlabel =		unionfs_setlabel,
 	.vop_strategy =		unionfs_strategy,
 	.vop_symlink =		unionfs_symlink,
-	.vop_unlock =		unionfs_unlock,
+	.vop_unlock1 =		unionfs_unlock,
 	.vop_whiteout =		unionfs_whiteout,
 	.vop_write =		unionfs_write,
 	.vop_vptofh =		unionfs_vptofh,
Index: kern/kern_exit.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.310
diff -u -r1.310 kern_exit.c
--- kern/kern_exit.c	22 Mar 2008 16:32:52 -0000	1.310
+++ kern/kern_exit.c	6 Apr 2008 10:09:26 -0000
@@ -321,7 +321,7 @@
 					sp->s_ttyvp = NULL;
 					SESS_UNLOCK(p->p_session);
 					sx_xunlock(&proctree_lock);
-					VOP_LOCK(ttyvp, LK_EXCLUSIVE);
+					vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY);
 					VOP_REVOKE(ttyvp, REVOKEALL);
 					vput(ttyvp);
 					sx_xlock(&proctree_lock);
Index: kern/vfs_default.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_default.c,v
retrieving revision 1.143
diff -u -r1.143 vfs_default.c
--- kern/vfs_default.c	22 Mar 2008 09:15:14 -0000	1.143
+++ kern/vfs_default.c	6 Apr 2008 10:09:26 -0000
@@ -95,7 +95,7 @@
 	.vop_readlink =		VOP_EINVAL,
 	.vop_revoke =		VOP_PANIC,
 	.vop_strategy =		vop_nostrategy,
-	.vop_unlock =		vop_stdunlock,
+	.vop_unlock1 =		vop_stdunlock,
 	.vop_vptofh =		vop_stdvptofh,
 };
 
@@ -270,7 +270,7 @@
 /* See above. */
 int
 vop_stdunlock(ap)
-	struct vop_unlock_args /* {
+	struct vop_unlock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 	} */ *ap;
Index: kern/vfs_mount.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_mount.c,v
retrieving revision 1.278
diff -u -r1.278 vfs_mount.c
--- kern/vfs_mount.c	31 Mar 2008 12:01:20 -0000	1.278
+++ kern/vfs_mount.c	6 Apr 2008 10:09:26 -0000
@@ -457,6 +457,7 @@
 
 	mp = (struct mount *)mem;
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
+	mtx_init(&mp->mnt_susplock, "mount suspend lock", NULL, MTX_DEF);
 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
 	return (0);
 }
@@ -469,6 +470,7 @@
 	mp = (struct mount *)mem;
 	lockdestroy(&mp->mnt_lock);
 	mtx_destroy(&mp->mnt_mtx);
+	mtx_destroy(&mp->mnt_susplock);
 }
 
 /*
Index: kern/vfs_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.727
diff -u -r1.727 vfs_subr.c
--- kern/vfs_subr.c	2 Apr 2008 10:40:03 -0000	1.727
+++ kern/vfs_subr.c	6 Apr 2008 10:09:26 -0000
@@ -1015,6 +1015,7 @@
 insmntque1(struct vnode *vp, struct mount *mp,
 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
 {
+	int flags;
 
 	KASSERT(vp->v_mount == NULL,
 		("insmntque: vnode already on per mount vnode list"));
@@ -1034,6 +1035,9 @@
 		("neg mount point vnode list size"));
 	mp->mnt_nvnodelistsize++;
 	MNT_IUNLOCK(mp);
+	flags = 0;
+	if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+		vn_write_start(mp, vp, &flags);
 	return (0);
 }
 
@@ -2142,7 +2146,7 @@
 	 * as VI_DOINGINACT to avoid recursion.
 	 */
 	vp->v_iflag |= VI_OWEINACT;
-	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
+	if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK) == 0) {
 		VI_LOCK(vp);
 		if (vp->v_usecount > 0)
 			vp->v_iflag &= ~VI_OWEINACT;
@@ -2526,6 +2530,8 @@
 	/*
 	 * Delete from old mount point vnode list.
 	 */
+	if (vp->v_mount)
+		vn_write_end(vp->v_mount);
 	delmntque(vp);
 	cache_purge(vp);
 	/*
@@ -3102,7 +3108,7 @@
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
 	.vop_lock1 =	vop_stdlock,	/* lock */
-	.vop_unlock =	vop_stdunlock,	/* unlock */
+	.vop_unlock1 =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
 
@@ -3606,54 +3612,6 @@
 }
 
 void
-vop_lock_pre(void *ap)
-{
-#ifdef DEBUG_VFS_LOCKS
-	struct vop_lock1_args *a = ap;
-
-	if ((a->a_flags & LK_INTERLOCK) == 0)
-		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
-	else
-		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
-#endif
-}
-
-void
-vop_lock_post(void *ap, int rc)
-{
-#ifdef DEBUG_VFS_LOCKS
-	struct vop_lock1_args *a = ap;
-
-	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
-	if (rc == 0)
-		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
-#endif
-}
-
-void
-vop_unlock_pre(void *ap)
-{
-#ifdef DEBUG_VFS_LOCKS
-	struct vop_unlock_args *a = ap;
-
-	if (a->a_flags & LK_INTERLOCK)
-		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
-	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
-#endif
-}
-
-void
-vop_unlock_post(void *ap, int rc)
-{
-#ifdef DEBUG_VFS_LOCKS
-	struct vop_unlock_args *a = ap;
-
-	if (a->a_flags & LK_INTERLOCK)
-		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
-#endif
-}
-
-void
 vop_create_post(void *ap, int rc)
 {
 	struct vop_create_args *a = ap;
Index: kern/vfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.261
diff -u -r1.261 vfs_vnops.c
--- kern/vfs_vnops.c	31 Mar 2008 11:57:18 -0000	1.261
+++ kern/vfs_vnops.c	6 Apr 2008 10:09:26 -0000
@@ -867,7 +867,7 @@
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vn_lock called with no locktype."));
 	do {
-		error = VOP_LOCK1(vp, flags, file, line);
+		error = vop_lock_wrapper(vp, flags, file, line);
 		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
 		    ("LK_RETRY set with incompatible flags %d\n", flags));
@@ -916,209 +916,215 @@
 	return (error);
 }
 
-/*
- * Preparing to start a filesystem write operation. If the operation is
- * permitted, then we bump the count of operations in progress and
- * proceed. If a suspend request is in progress, we wait until the
- * suspension is over, and then proceed.
- */
 int
-vn_start_write(vp, mpp, flags)
-	struct vnode *vp;
-	struct mount **mpp;
-	int flags;
+vn_write_start(struct mount *mp, struct vnode *vp, int *flagsp)
 {
-	struct mount *mp;
+	struct thread *td;
+	int writers;
 	int error;
+	int flags;
 
-	error = 0;
-	/*
-	 * If a vnode is provided, get and return the mount point that
-	 * to which it will write.
-	 */
-	if (vp != NULL) {
-		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
-			*mpp = NULL;
-			if (error != EOPNOTSUPP)
-				return (error);
-			return (0);
-		}
-	}
-	if ((mp = *mpp) == NULL)
+	flags = *flagsp;
+	td = curthread;
+	for (;;) {
+		/*
+		 * If we currently own a write lock proceed as long as
+		 * the suspension is not complete.  If it has completed
+		 * that means we own write locks for other filesystems.
+		 * This prevents deadlocks from suspending between two
+		 * exclusive lock acquisitions.
+		 */
+		if (td->td_vfslocks) {
+			if (mp->mnt_kern_flag & MNTK_SUSPENDED)
+				break;
+		/*
+		 * If we don't own a write lock we need to abort and sleep
+		 * if any of the suspension flags are set.
+		 */
+		} else if (mp->mnt_kern_flag & MNTK_ALLSUSPEND)
+			break;
+		writers = mp->mnt_writers;
+		/*
+		 * If there are no writers we must synchronize the state
+		 * transition to 1 writer using the lock.
+		 */
+		if (writers == 0)
+			break;
+		if (atomic_cmpset_int(&mp->mnt_writers, writers,
+		    writers + 1) == 0)
+			continue;
+		td->td_vfslocks++;
 		return (0);
-	MNT_ILOCK(mp);
-	if (vp == NULL)
-		MNT_REF(mp);
+	}
+	if (flags & LK_INTERLOCK)
+		VI_UNLOCK(vp);
+	*flagsp &= ~LK_INTERLOCK;
+	error = 0;
 	/*
 	 * Check on status of suspension.
 	 */
-	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
-		if (flags & V_NOWAIT) {
-			error = EWOULDBLOCK;
-			goto unlock;
+	MNT_SLOCK(mp);
+	if (mp->mnt_suspender != td) {
+		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+			if (flags & LK_NOWAIT) {
+				error = EBUSY;
+				break;
+			}
+			error = msleep(&mp->mnt_flag, MNT_SMTX(mp), 
+			    (PUSER - 1), "suspfs", 0);
+			if (error)
+				break;
 		}
-		error = msleep(&mp->mnt_flag, MNT_MTX(mp), 
-		    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
-		if (error)
-			goto unlock;
-	}
-	if (flags & V_XSLEEP)
-		goto unlock;
-	mp->mnt_writeopcount++;
-unlock:
-	MNT_REL(mp);
-	MNT_IUNLOCK(mp);
+	}
+	if (error == 0) {
+		atomic_add_int(&mp->mnt_writers, 1);
+		td->td_vfslocks++;
+	}
+	MNT_SUNLOCK(mp);
 	return (error);
 }
 
-/*
- * Secondary suspension. Used by operations such as vop_inactive
- * routines that are needed by the higher level functions. These
- * are allowed to proceed until all the higher level functions have
- * completed (indicated by mnt_writeopcount dropping to zero). At that
- * time, these operations are halted until the suspension is over.
- */
+void
+vn_write_end(struct mount *mp)
+{
+	int writers;
+
+	if (mp->mnt_writers <= 0)
+		panic("vn_write_end: neg cnt");
+	curthread->td_vfslocks--;
+	for (;;) {
+		writers = mp->mnt_writers;
+		if (writers == 1)	/* Last writer: finish under the lock. */
+			break;
+		if (atomic_cmpset_int(&mp->mnt_writers, writers,
+		    writers - 1) == 0)
+			continue;	/* Count changed; reload and retry. */
+		return;
+	}
+	MNT_SLOCK(mp);
+	writers = atomic_fetchadd_int(&mp->mnt_writers, -1);
+	if (writers <= 0)
+		panic("vn_write_end: neg cnt");
+	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && writers == 1)
+		wakeup(&mp->mnt_flag);	/* Old count was 1: none remain. */
+	MNT_SUNLOCK(mp);
+	return;
+}
+
 int
-vn_write_suspend_wait(vp, mp, flags)
-	struct vnode *vp;
-	struct mount *mp;
-	int flags;
+vop_lock_wrapper(struct vnode *vp, int flags, char *file, int line)
 {
+	struct mount *mp;
+	int startwrite;
+	int endwrite;
 	int error;
 
-	if (vp != NULL) {
-		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
-			if (error != EOPNOTSUPP)
-				return (error);
-			return (0);
+#ifdef DEBUG_VFS_LOCKS
+	if ((flags & LK_INTERLOCK) == 0)
+		ASSERT_VI_UNLOCKED(vp, "VOP_LOCK");
+	else
+		ASSERT_VI_LOCKED(vp, "VOP_LOCK");
+#endif
+	endwrite = 0;
+	startwrite = 0;
+	mp = vp->v_mount;
+	if (mp) {
+		switch (flags & LK_TYPE_MASK) {
+		case LK_EXCLUSIVE:
+		case LK_UPGRADE:
+			error = vn_write_start(mp, vp, &flags);
+			if (error)
+				goto out;
+			startwrite = 1;
+			break;
+		case LK_DOWNGRADE:
+			endwrite = 1;
+			break;
+		default:
+			mp = NULL;
+			break;
 		}
+
 	}
-	/*
-	 * If we are not suspended or have not yet reached suspended
-	 * mode, then let the operation proceed.
-	 */
-	if (mp == NULL)
-		return (0);
-	MNT_ILOCK(mp);
-	if (vp == NULL)
-		MNT_REF(mp);
-	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
-		MNT_REL(mp);
-		MNT_IUNLOCK(mp);
-		return (0);
-	}
-	if (flags & V_NOWAIT) {
-		MNT_REL(mp);
-		MNT_IUNLOCK(mp);
-		return (EWOULDBLOCK);
-	}
-	/*
-	 * Wait for the suspension to finish.
-	 */
-	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
-	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
-	vfs_rel(mp);
+	error = VOP_LOCK1(vp, flags, file, line);
+	if (vp->v_mount != mp && startwrite)
+		endwrite = 1;
+	if ((error && startwrite) || endwrite)
+		vn_write_end(mp);
+out:
+#ifdef DEBUG_VFS_LOCKS
+	ASSERT_VI_UNLOCKED(vp, "VOP_LOCK");
+	if (error == 0)
+		ASSERT_VOP_LOCKED(vp, "VOP_LOCK");
+#endif
 	return (error);
 }
 
-/*
- * Secondary suspension. Used by operations such as vop_inactive
- * routines that are needed by the higher level functions. These
- * are allowed to proceed until all the higher level functions have
- * completed (indicated by mnt_writeopcount dropping to zero). At that
- * time, these operations are halted until the suspension is over.
- */
 int
-vn_start_secondary_write(vp, mpp, flags)
+vop_unlock_wrapper(struct vnode *vp, int flags)
+{
+	int error;
+
+#ifdef DEBUG_VFS_LOCKS
+	if (flags & LK_INTERLOCK)
+		ASSERT_VI_LOCKED(vp, "VOP_UNLOCK");
+	ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK");
+#endif
+	if (flags & LK_INTERLOCK) {
+		VI_UNLOCK(vp);
+		flags &= ~LK_INTERLOCK;
+	}
+	if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && vp->v_mount)
+		vn_write_end(vp->v_mount);
+	error = VOP_UNLOCK1(vp, flags);
+#ifdef DEBUG_VFS_LOCKS
+	if (flags & LK_INTERLOCK)
+		ASSERT_VI_UNLOCKED(vp, "VOP_UNLOCK");
+#endif
+	return (error);
+}
+
+int
+vn_start_write(vp, mpp, flags)
 	struct vnode *vp;
 	struct mount **mpp;
 	int flags;
 {
+	return (0);
+}
+
+int
+vn_write_suspend_wait(vp, mp, flags)
+	struct vnode *vp;
 	struct mount *mp;
-	int error;
+	int flags;
+{
+	return (0);
+}
 
- retry:
-	if (vp != NULL) {
-		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
-			*mpp = NULL;
-			if (error != EOPNOTSUPP)
-				return (error);
-			return (0);
-		}
-	}
-	/*
-	 * If we are not suspended or have not yet reached suspended
-	 * mode, then let the operation proceed.
-	 */
-	if ((mp = *mpp) == NULL)
-		return (0);
-	MNT_ILOCK(mp);
-	if (vp == NULL)
-		MNT_REF(mp);
-	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
-		mp->mnt_secondary_writes++;
-		mp->mnt_secondary_accwrites++;
-		MNT_REL(mp);
-		MNT_IUNLOCK(mp);
-		return (0);
-	}
-	if (flags & V_NOWAIT) {
-		MNT_REL(mp);
-		MNT_IUNLOCK(mp);
-		return (EWOULDBLOCK);
-	}
-	/*
-	 * Wait for the suspension to finish.
-	 */
-	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
-		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
-	vfs_rel(mp);
-	if (error == 0)
-		goto retry;
-	return (error);
+int
+vn_start_secondary_write(vp, mpp, flags)
+	struct vnode *vp;
+	struct mount **mpp;
+	int flags;
+{
+	return (0);
 }
 
-/*
- * Filesystem write operation has completed. If we are suspending and this
- * operation is the last one, notify the suspender that the suspension is
- * now in effect.
- */
 void
 vn_finished_write(mp)
 	struct mount *mp;
 {
-	if (mp == NULL)
-		return;
-	MNT_ILOCK(mp);
-	mp->mnt_writeopcount--;
-	if (mp->mnt_writeopcount < 0)
-		panic("vn_finished_write: neg cnt");
-	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
-	    mp->mnt_writeopcount <= 0)
-		wakeup(&mp->mnt_writeopcount);
-	MNT_IUNLOCK(mp);
-}
 
+	return;
+}
 
-/*
- * Filesystem secondary write operation has completed. If we are
- * suspending and this operation is the last one, notify the suspender
- * that the suspension is now in effect.
- */
 void
 vn_finished_secondary_write(mp)
 	struct mount *mp;
 {
-	if (mp == NULL)
-		return;
-	MNT_ILOCK(mp);
-	mp->mnt_secondary_writes--;
-	if (mp->mnt_secondary_writes < 0)
-		panic("vn_finished_secondary_write: neg cnt");
-	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
-	    mp->mnt_secondary_writes <= 0)
-		wakeup(&mp->mnt_secondary_writes);
-	MNT_IUNLOCK(mp);
+
+	return;
 }
 
 
@@ -1138,12 +1144,20 @@
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
+	mp->mnt_suspender = td;
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
-	if (mp->mnt_writeopcount > 0)
-		(void) msleep(&mp->mnt_writeopcount, 
-		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
-	else
-		MNT_IUNLOCK(mp);
+	mp->mnt_suspwrites = curthread->td_vfslocks;
+	if (mp->mnt_suspwrites)
+		atomic_add_int(&mp->mnt_writers, -mp->mnt_suspwrites);
+	MNT_IUNLOCK(mp);
+	MNT_SLOCK(mp);
+	while (mp->mnt_writers) {
+		printf("vfs_write_suspend: %d writers %d suspwrites\n",
+		    mp->mnt_writers, mp->mnt_suspwrites);
+		(void) msleep(&mp->mnt_flag, MNT_SMTX(mp),
+		    (PUSER - 1), "suspwt", 0);
+	}
+	MNT_SUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
 		vfs_write_resume(mp);
 	return (error);
@@ -1161,8 +1175,14 @@
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 				       MNTK_SUSPENDED);
+		MNT_SLOCK(mp);
 		wakeup(&mp->mnt_writeopcount);
 		wakeup(&mp->mnt_flag);
+		mp->mnt_suspender = NULL;
+		if (mp->mnt_suspwrites)
+			atomic_add_int(&mp->mnt_writers, mp->mnt_suspwrites);
+		mp->mnt_suspwrites = 0;
+		MNT_SUNLOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 }
Index: kern/vnode_if.src
===================================================================
RCS file: /home/ncvs/src/sys/kern/vnode_if.src,v
retrieving revision 1.91
diff -u -r1.91 vnode_if.src
--- kern/vnode_if.src	26 Mar 2008 15:23:09 -0000	1.91
+++ kern/vnode_if.src	6 Apr 2008 10:09:28 -0000
@@ -361,8 +361,7 @@
 };
 
 
-%! lock1	pre	vop_lock_pre
-%! lock1	post	vop_lock_post
+# Handled by vop_lock_wrapper
 
 vop_lock1 {
 	IN struct vnode *vp;
@@ -372,10 +371,9 @@
 };
 
 
-%! unlock	pre	vop_unlock_pre
-%! unlock	post	vop_unlock_post
+# Handled by vop_unlock_wrapper
 
-vop_unlock {
+vop_unlock1 {
 	IN struct vnode *vp;
 	IN int flags;
 };
Index: sys/mount.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/mount.h,v
retrieving revision 1.229
diff -u -r1.229 mount.h
--- sys/mount.h	1 Mar 2008 19:47:50 -0000	1.229
+++ sys/mount.h	6 Apr 2008 10:09:36 -0000
@@ -137,6 +137,7 @@
  * Lock reference:
  *	m - mountlist_mtx
  *	i - interlock
+ *	s - mnt_susplock
  *	l - mnt_lock
  *
  * Unmarked fields are considered stable as long as a ref is held.
@@ -145,6 +146,7 @@
 struct mount {
 	struct lock	mnt_lock;		/* mount structure lock */
 	struct mtx	mnt_mtx;		/* mount structure interlock */
+	struct mtx	mnt_susplock;		/* mount suspend lock. */
 	int		mnt_gen;		/* struct mount generation */
 #define	mnt_startzero	mnt_list
 	TAILQ_ENTRY(mount) mnt_list;		/* (m) mount list */
@@ -152,6 +154,7 @@
 	struct vfsconf	*mnt_vfc;		/* configuration info */
 	struct vnode	*mnt_vnodecovered;	/* vnode we mounted on */
 	struct vnode	*mnt_syncer;		/* syncer vnode */
+	struct thread	*mnt_suspender;		/* Thread running suspension. */
 	int		mnt_ref;		/* (i) Reference count */
 	struct vnodelst	mnt_nvnodelist;		/* (i) list of vnodes */
 	int		mnt_nvnodelistsize;	/* (i) # of vnodes */
@@ -159,6 +162,8 @@
 	int		mnt_kern_flag;		/* (i) kernel only flags */
 	u_int		mnt_flag;		/* (i) flags shared with user */
 	u_int		mnt_noasync;		/* (i) # noasync overrides */
+	int		mnt_suspwrites;		/* (s) suspender writes. */
+	volatile int	mnt_writers;		/* (s) pending writers. */
 	struct vfsoptlist *mnt_opt;		/* current mount options */
 	struct vfsoptlist *mnt_optnew;		/* new options passed to fs */
 	int		mnt_maxsymlinklen;	/* max size of short symlink */
@@ -208,6 +213,10 @@
 		wakeup((mp));						\
 } while (0)
 
+#define	MNT_SLOCK(mp)	mtx_lock(&(mp)->mnt_susplock)
+#define	MNT_SUNLOCK(mp)	mtx_unlock(&(mp)->mnt_susplock)
+#define	MNT_SMTX(mp)	(&(mp)->mnt_susplock)
+
 #endif /* _KERNEL */
 
 /*
@@ -320,6 +329,7 @@
 #define	MNTK_MPSAFE	0x20000000	/* Filesystem is MPSAFE. */
 #define	MNTK_NOKNOTE	0x80000000	/* Don't send KNOTEs from VOP hooks */
 #define MNTK_LOOKUP_SHARED	0x40000000 /* FS supports shared lock lookups */
+#define	MNTK_ALLSUSPEND	(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED)
 
 /*
  * Sysctl CTL_VFS definitions.
Index: sys/proc.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/proc.h,v
retrieving revision 1.509
diff -u -r1.509 proc.h
--- sys/proc.h	21 Mar 2008 08:23:25 -0000	1.509
+++ sys/proc.h	6 Apr 2008 10:09:36 -0000
@@ -201,6 +201,7 @@
 	u_char		td_tsqueue;	/* (t) Turnstile queue blocked on. */
 	short		td_locks;	/* (k) Count of non-spin locks. */
 	short		td_rw_rlocks;	/* (k) count of rwlock read locks. */
+	short		td_vfslocks;	/* (k) Count of vfs write locks. */
 	struct turnstile *td_blocked;	/* (t) Lock thread is blocked on. */
 	const char	*td_lockname;	/* (t) Name of lock blocked on. */
 	LIST_HEAD(, turnstile) td_contested;	/* (q) Contested locks. */
Index: sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.334
diff -u -r1.334 vnode.h
--- sys/vnode.h	31 Mar 2008 11:53:03 -0000	1.334
+++ sys/vnode.h	6 Apr 2008 10:09:36 -0000
@@ -628,6 +628,8 @@
 	    int flags);
 int	vn_write_suspend_wait(struct vnode *vp, struct mount *mp,
 	    int flags);
+int	vn_write_start(struct mount *mp, struct vnode *vp, int *flagsp);
+void	vn_write_end(struct mount *mp);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
@@ -639,6 +641,7 @@
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp);
 int	vfs_write_suspend(struct mount *mp);
+int	vop_lock_wrapper(struct vnode *vp, int flags, char *file, int line);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
@@ -648,7 +651,7 @@
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
-int	vop_stdunlock(struct vop_unlock_args *);
+int	vop_stdunlock(struct vop_unlock1_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
@@ -659,12 +662,11 @@
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
+int	vop_unlock_wrapper(struct vnode *vp, int flags);
 
 /* These are called from within the actual VOPS. */
 void	vop_create_post(void *a, int rc);
 void	vop_link_post(void *a, int rc);
-void	vop_lock_pre(void *a);
-void	vop_lock_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
@@ -676,8 +678,6 @@
 void	vop_setattr_post(void *a, int rc);
 void	vop_strategy_pre(void *a);
 void	vop_symlink_post(void *a, int rc);
-void	vop_unlock_post(void *a, int rc);
-void	vop_unlock_pre(void *a);
 
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
@@ -700,7 +700,10 @@
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
-#define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
+#define VOP_LOCK(vp, flags)						\
+    vop_lock_wrapper((vp), (flags), __FILE__, __LINE__)
+
+#define VOP_UNLOCK(vp, flags)	vop_unlock_wrapper((vp), (flags))
 
 
 void	vput(struct vnode *vp);
Index: ufs/ffs/ffs_softdep.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.222
diff -u -r1.222 ffs_softdep.c
--- ufs/ffs/ffs_softdep.c	23 Mar 2008 13:45:24 -0000	1.222
+++ ufs/ffs/ffs_softdep.c	6 Apr 2008 10:09:36 -0000
@@ -713,6 +713,7 @@
 	struct thread *td;
 	int remaining;
 	int vfslocked;
+	int flags;
 
 	td = curthread;
 	td->td_pflags |= TDP_NORUNNINGBUF;
@@ -745,10 +746,14 @@
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 				continue;
 			vfslocked = VFS_LOCK_GIANT(mp);
-			softdep_process_worklist(mp, 0);
-			ump = VFSTOUFS(mp);
-			remaining += ump->softdep_on_worklist -
-				ump->softdep_on_worklist_inprogress;
+			flags = LK_NOWAIT;
+			if (vn_write_start(mp, NULL, &flags) == 0) {
+				softdep_process_worklist(mp, 0);
+				ump = VFSTOUFS(mp);
+				remaining += ump->softdep_on_worklist -
+					ump->softdep_on_worklist_inprogress;
+				vn_write_end(mp);
+			}
 			VFS_UNLOCK_GIANT(vfslocked);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
Index: ufs/ffs/ffs_vfsops.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.340
diff -u -r1.340 ffs_vfsops.c
--- ufs/ffs/ffs_vfsops.c	26 Mar 2008 20:48:07 -0000	1.340
+++ ufs/ffs/ffs_vfsops.c	6 Apr 2008 10:09:36 -0000
@@ -866,7 +866,7 @@
 	 * Initialize filesystem stat information in mount struct.
 	 */
 	MNT_ILOCK(mp);
-	mp->mnt_kern_flag |= MNTK_MPSAFE;
+	mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED;
 	MNT_IUNLOCK(mp);
 #ifdef UFS_EXTATTR
 #ifdef UFS_EXTATTR_AUTOSTART
Index: ufs/ufs/dirhash.h
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/dirhash.h,v
retrieving revision 1.5
diff -u -r1.5 dirhash.h
--- ufs/ufs/dirhash.h	7 Jan 2005 02:29:26 -0000	1.5
+++ ufs/ufs/dirhash.h	6 Apr 2008 10:09:36 -0000
@@ -80,12 +80,13 @@
     ((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK])
 
 struct dirhash {
-	struct mtx dh_mtx;	/* protects all fields except dh_list */
+	struct lock dh_lock;	/* protects all fields except list & score */
 
 	doff_t	**dh_hash;	/* the hash array (2-level) */
 	int	dh_narrays;	/* number of entries in dh_hash */
 	int	dh_hlen;	/* total slots in the 2-level hash array */
 	int	dh_hused;	/* entries in use */
+	int	dh_memreq;	/* Memory used. */
 
 	/* Free space statistics. XXX assumes DIRBLKSIZ is 512. */
 	u_int8_t *dh_blkfree;	/* free DIRALIGN words in each dir block */
Index: ufs/ufs/ufs_dirhash.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_dirhash.c,v
retrieving revision 1.23
diff -u -r1.23 ufs_dirhash.c
--- ufs/ufs/ufs_dirhash.c	31 Oct 2005 15:41:28 -0000	1.23
+++ ufs/ufs/ufs_dirhash.c	6 Apr 2008 10:09:36 -0000
@@ -38,6 +38,7 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/fnv_hash.h>
@@ -88,15 +89,16 @@
 	   doff_t offset);
 static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
 static int ufsdirhash_recycle(int wanted);
+static void ufsdirhash_free_locked(struct inode *ip);
 
 static uma_zone_t	ufsdirhash_zone;
 
 #define DIRHASHLIST_LOCK() 		mtx_lock(&ufsdirhash_mtx)
 #define DIRHASHLIST_UNLOCK() 		mtx_unlock(&ufsdirhash_mtx)
-#define DIRHASH_LOCK(dh)		mtx_lock(&(dh)->dh_mtx)
-#define DIRHASH_UNLOCK(dh) 		mtx_unlock(&(dh)->dh_mtx)
 #define DIRHASH_BLKALLOC_WAITOK() 	uma_zalloc(ufsdirhash_zone, M_WAITOK)
 #define DIRHASH_BLKFREE(ptr) 		uma_zfree(ufsdirhash_zone, (ptr))
+#define	DIRHASH_ASSERT_LOCKED(dh)					\
+    lockmgr_assert(&(dh)->dh_lock, KA_LOCKED)
 
 /* Dirhash list; recently-used entries are near the tail. */
 static TAILQ_HEAD(, dirhash) ufsdirhash_list;
@@ -105,14 +107,152 @@
 static struct mtx	ufsdirhash_mtx;
 
 /*
- * Locking order:
- *	ufsdirhash_mtx
- *	dh_mtx
+ * Locking:
  *
- * The dh_mtx mutex should be acquired either via the inode lock, or via
- * ufsdirhash_mtx. Only the owner of the inode may free the associated
- * dirhash, but anything can steal its memory and set dh_hash to NULL.
+ * The relationship between inode and dirhash is protected either by an
+ * exclusive vnode lock or the vnode interlock where a shared vnode lock
+ * may be used.  The ufsdirhash_mtx is acquired after the dirhash lock.
+ *
+ * ufsdirhash_build() acquires a shared lock on the dirhash when it is
+ * successful.  This lock is released after a call to ufsdirhash_lookup().
+ *
+ * Functions requiring exclusive access use ufsdirhash_acquire() which may
+ * free a dirhash structure that was recycled by ufsdirhash_recycle().
+ *
+ * The dirhash lock may be held across I/O operations.
+ */
+
+/*
+ * Release the lock on a dirhash.
+ */
+static void
+ufsdirhash_release(struct dirhash *dh)
+{
+
+	lockmgr(&dh->dh_lock, LK_RELEASE, 0);
+}
+
+/*
+ * Either acquire an existing hash locked shared or create a new hash and
+ * return it exclusively locked.  May return NULL if the allocation fails.
+ *
+ * The vnode interlock is used to protect the i_dirhash pointer from
+ * simultaneous access while only a shared vnode lock is held.
+ */
+static struct dirhash *
+ufsdirhash_create(struct inode *ip)
+{
+	struct dirhash *ndh;
+	struct dirhash *dh;
+	struct vnode *vp;
+	int error;
+
+	error = 0;
+	ndh = dh = NULL;
+	vp = ip->i_vnode;
+	for (;;) {
+		/* Racy check for i_dirhash to prefetch a dirhash structure. */
+		if (ip->i_dirhash == NULL && ndh == NULL) {
+			MALLOC(ndh, struct dirhash *, sizeof *dh, M_DIRHASH,
+			    M_NOWAIT | M_ZERO);
+			if (ndh == NULL)
+				return (NULL);
+			lockinit(&ndh->dh_lock, PRIBIO, "dirhash", 0, 0);
+			lockmgr(&ndh->dh_lock, LK_EXCLUSIVE, NULL);
+		}
+		/*
+		 * Check i_dirhash.  If it's NULL just try to use a
+		 * preallocated structure.  If none exists loop and try again.
+		 */
+		VI_LOCK(vp);
+		dh = ip->i_dirhash;
+		if (dh == NULL) {
+			ip->i_dirhash = ndh;
+			VI_UNLOCK(vp);
+			if (ndh == NULL)
+				continue;
+			return (ndh);
+		}
+		/* Try to acquire shared on existing hashes. */
+		if (lockmgr(&dh->dh_lock, LK_SHARED | LK_INTERLOCK,
+		    VI_MTX(vp)))
+			continue;
+		/* The hash could've been recycled while we were waiting. */
+		if (ip->i_dirhash != dh) {
+			ufsdirhash_release(dh);
+			continue;
+		}
+		/* If the hash is still valid we've succeeded. */
+		if (dh->dh_hash != NULL)
+			break;
+		/*
+		 * If the hash is NULL it has been recycled.  Try to upgrade
+		 * so we can recreate it.  If we fail the upgrade another
+		 * thread must've already exclusively locked it.
+		 */
+		if (lockmgr(&dh->dh_lock, LK_UPGRADE | LK_SLEEPFAIL, NULL) == 0)
+			break;
+	}
+	/* Free the preallocated structure if it was not necessary. */
+	if (ndh) {
+		lockmgr(&ndh->dh_lock, LK_RELEASE, NULL);
+		lockdestroy(&ndh->dh_lock);
+		FREE(ndh, M_DIRHASH);
+	}
+	return (dh);
+}
+
+/*
+ * Acquire an exclusive lock on an existing hash.  Requires an exclusive
+ * vnode lock to protect the i_dirhash pointer.  Hashes that have been
+ * recycled are reclaimed here and NULL is returned.
  */
+static struct dirhash *
+ufsdirhash_acquire(struct inode *ip)
+{
+	struct dirhash *dh;
+	struct vnode *vp;
+
+	ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__);
+
+	vp = ip->i_vnode;
+	dh = ip->i_dirhash;
+	if (dh == NULL)
+		return (NULL);
+	lockmgr(&dh->dh_lock, LK_EXCLUSIVE, 0);
+	if (dh->dh_hash != NULL)
+		return (dh);
+	ufsdirhash_free_locked(ip);
+	return (NULL);
+}
+
+/*
+ * Acquire exclusively and free the hash pointed to by ip.  Works with a
+ * shared or exclusive vnode lock.
+ */
+void
+ufsdirhash_free(struct inode *ip)
+{
+	struct dirhash *dh;
+	struct vnode *vp;
+
+	vp = ip->i_vnode;
+	for (;;) {
+		VI_LOCK(vp);
+		dh = ip->i_dirhash;
+		if (dh == NULL) {
+			VI_UNLOCK(vp);
+			return;		/* No hash attached; nothing to free. */
+		}
+		if (lockmgr(&dh->dh_lock, LK_EXCLUSIVE | LK_INTERLOCK,
+		    VI_MTX(vp)))
+			continue;	/* Lock failed; re-read i_dirhash. */
+		if (ip->i_dirhash == dh)
+			break;		/* Still the same hash; go free it. */
+		ufsdirhash_release(dh);	/* i_dirhash changed; drop stale lock. */
+	}
+	ufsdirhash_free_locked(ip);
+}
 
 /*
  * Attempt to build up a hash table for the directory contents in
@@ -128,27 +268,23 @@
 	doff_t bmask, pos;
 	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
 
-	/* Check if we can/should use dirhash. */
-	if (ip->i_dirhash == NULL) {
-		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
+	/* Take care of a decreased sysctl value. */
+	while (ufs_dirhashmem > ufs_dirhashmaxmem)
+		if (ufsdirhash_recycle(0) != 0)
 			return (-1);
-	} else {
-		/* Hash exists, but sysctls could have changed. */
-		if (ip->i_size < ufs_mindirhashsize ||
-		    ufs_dirhashmem > ufs_dirhashmaxmem) {
+
+	/* Check if we can/should use dirhash. */
+	if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) ||
+	    ip->i_effnlink == 0) {
+		if (ip->i_dirhash)
 			ufsdirhash_free(ip);
-			return (-1);
-		}
-		/* Check if hash exists and is intact (note: unlocked read). */
-		if (ip->i_dirhash->dh_hash != NULL)
-			return (0);
-		/* Free the old, recycled hash and build a new one. */
-		ufsdirhash_free(ip);
+		return (-1);
 	}
-
-	/* Don't hash removed directories. */
-	if (ip->i_effnlink == 0)
+	dh = ufsdirhash_create(ip);
+	if (dh == NULL)
 		return (-1);
+	if (dh->dh_hash != NULL)
+		return (0);
 
 	vp = ip->i_vnode;
 	/* Allocate 50% more entries than this dir size could ever need. */
@@ -159,7 +295,6 @@
 	nslots = narrays * DH_NBLKOFF;
 	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
 	nblocks = (dirblocks * 3 + 1) / 2;
-
 	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
 	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
 	    nblocks * sizeof(*dh->dh_blkfree);
@@ -167,33 +302,39 @@
 	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
 		DIRHASHLIST_UNLOCK();
 		if (memreqd > ufs_dirhashmaxmem / 2)
-			return (-1);
-
+			goto fail;
 		/* Try to free some space. */
 		if (ufsdirhash_recycle(memreqd) != 0)
-			return (-1);
+			goto fail;
 		/* Enough was freed, and list has been locked. */
 	}
 	ufs_dirhashmem += memreqd;
 	DIRHASHLIST_UNLOCK();
 
+	/* Initialise the hash table and block statistics. */
+	dh->dh_memreq = memreqd;
+	dh->dh_narrays = narrays;
+	dh->dh_hlen = nslots;
+	dh->dh_nblk = nblocks;
+	dh->dh_dirblks = dirblocks;
+	for (i = 0; i < DH_NFSTATS; i++)
+		dh->dh_firstfree[i] = -1;
+	dh->dh_firstfree[DH_NFSTATS] = 0;
+	dh->dh_seqopt = 0;
+	dh->dh_seqoff = 0;
+	dh->dh_score = DH_SCOREINIT;
+
 	/*
 	 * Use non-blocking mallocs so that we will revert to a linear
 	 * lookup on failure rather than potentially blocking forever.
 	 */
-	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
-	if (dh == NULL) {
-		DIRHASHLIST_LOCK();
-		ufs_dirhashmem -= memreqd;
-		DIRHASHLIST_UNLOCK();
-		return (-1);
-	}
-	mtx_init(&dh->dh_mtx, "dirhash", NULL, MTX_DEF);
 	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
 	    M_DIRHASH, M_NOWAIT | M_ZERO);
+	if (dh->dh_hash == NULL)
+		goto fail;
 	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
 	    M_DIRHASH, M_NOWAIT);
-	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
+	if (dh->dh_blkfree == NULL)
 		goto fail;
 	for (i = 0; i < narrays; i++) {
 		if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
@@ -201,22 +342,8 @@
 		for (j = 0; j < DH_NBLKOFF; j++)
 			dh->dh_hash[i][j] = DIRHASH_EMPTY;
 	}
-
-	/* Initialise the hash table and block statistics. */
-	dh->dh_narrays = narrays;
-	dh->dh_hlen = nslots;
-	dh->dh_nblk = nblocks;
-	dh->dh_dirblks = dirblocks;
 	for (i = 0; i < dirblocks; i++)
 		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
-	for (i = 0; i < DH_NFSTATS; i++)
-		dh->dh_firstfree[i] = -1;
-	dh->dh_firstfree[DH_NFSTATS] = 0;
-	dh->dh_seqopt = 0;
-	dh->dh_seqoff = 0;
-	dh->dh_score = DH_SCOREINIT;
-	ip->i_dirhash = dh;
-
 	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
 	pos = 0;
 	while (pos < ip->i_size) {
@@ -254,63 +381,64 @@
 	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
 	dh->dh_onlist = 1;
 	DIRHASHLIST_UNLOCK();
+	lockmgr(&dh->dh_lock, LK_DOWNGRADE, 0);
 	return (0);
 
 fail:
-	if (dh->dh_hash != NULL) {
-		for (i = 0; i < narrays; i++)
-			if (dh->dh_hash[i] != NULL)
-				DIRHASH_BLKFREE(dh->dh_hash[i]);
-		FREE(dh->dh_hash, M_DIRHASH);
-	}
-	if (dh->dh_blkfree != NULL)
-		FREE(dh->dh_blkfree, M_DIRHASH);
-	mtx_destroy(&dh->dh_mtx);
-	FREE(dh, M_DIRHASH);
-	ip->i_dirhash = NULL;
-	DIRHASHLIST_LOCK();
-	ufs_dirhashmem -= memreqd;
-	DIRHASHLIST_UNLOCK();
+	ufsdirhash_free_locked(ip);
 	return (-1);
 }
 
 /*
  * Free any hash table associated with inode 'ip'.
  */
-void
-ufsdirhash_free(struct inode *ip)
+static void
+ufsdirhash_free_locked(struct inode *ip)
 {
 	struct dirhash *dh;
-	int i, mem;
-
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASHLIST_LOCK();
-	DIRHASH_LOCK(dh);
-	if (dh->dh_onlist)
-		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
-	DIRHASH_UNLOCK(dh);
-	DIRHASHLIST_UNLOCK();
-
-	/* The dirhash pointed to by 'dh' is exclusively ours now. */
+	struct vnode *vp;
+	int i;
 
-	mem = sizeof(*dh);
+	DIRHASH_ASSERT_LOCKED(ip->i_dirhash);
+	/*
+	 * Clear the pointer in the inode to prevent new threads from
+	 * finding the dead structure.
+	 */
+	vp = ip->i_vnode;
+	VI_LOCK(vp);
+	dh = ip->i_dirhash;
+	ip->i_dirhash = NULL;
+	VI_UNLOCK(vp);
+	/*
+	 * Drain waiters.  They will abort when they see that ip->i_dirhash
+	 * is NULL after locking.
+	 */
+	lockmgr(&dh->dh_lock, LK_RELEASE, 0);
+	lockmgr(&dh->dh_lock, LK_DRAIN, 0);
+	/*
+	 * Handle recycled and partially built hashes as well as complete ones.
+	 */
 	if (dh->dh_hash != NULL) {
 		for (i = 0; i < dh->dh_narrays; i++)
-			DIRHASH_BLKFREE(dh->dh_hash[i]);
+			if (dh->dh_hash[i] != NULL)
+				DIRHASH_BLKFREE(dh->dh_hash[i]);
 		FREE(dh->dh_hash, M_DIRHASH);
-		FREE(dh->dh_blkfree, M_DIRHASH);
-		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
-		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
-		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
+		if (dh->dh_blkfree != NULL)
+			FREE(dh->dh_blkfree, M_DIRHASH);
 	}
-	mtx_destroy(&dh->dh_mtx);
-	FREE(dh, M_DIRHASH);
-	ip->i_dirhash = NULL;
-
 	DIRHASHLIST_LOCK();
-	ufs_dirhashmem -= mem;
+	if (dh->dh_onlist)
+		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+	ufs_dirhashmem -= dh->dh_memreq;	/* Drop our memory charge. */
 	DIRHASHLIST_UNLOCK();
+	/*
+	 * Release the lock, destroy it and free the dirhash structure.
+	 */
+	lockmgr(&dh->dh_lock, LK_RELEASE, 0);
+	lockdestroy(&dh->dh_lock);
+	FREE(dh, M_DIRHASH);
+
+	return;
 }
 
 /*
@@ -323,6 +451,8 @@
  * prevoffp is non-NULL, the offset of the previous entry within
  * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
  * is the first in a block, the start of the block is used).
+ *
+ * Must be called with the hash locked.  Returns with the hash unlocked.
  */
 int
 ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
@@ -334,48 +464,36 @@
 	struct buf *bp;
 	doff_t blkoff, bmask, offset, prevoff;
 	int i, slot;
+	int error;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return (EJUSTRETURN);
+	dh = ip->i_dirhash;
+	KASSERT(dh != NULL && dh->dh_hash != NULL,
+	    ("ufsdirhash_lookup: Invalid dirhash %p\n", dh));
+	DIRHASH_ASSERT_LOCKED(dh);
 	/*
 	 * Move this dirhash towards the end of the list if it has a
-	 * score higher than the next entry, and acquire the dh_mtx.
-	 * Optimise the case where it's already the last by performing
-	 * an unlocked read of the TAILQ_NEXT pointer.
-	 *
-	 * In both cases, end up holding just dh_mtx.
+	 * score higher than the next entry.  The dh_lock is already held.
 	 */
+	DIRHASHLIST_LOCK();
 	if (TAILQ_NEXT(dh, dh_list) != NULL) {
-		DIRHASHLIST_LOCK();
-		DIRHASH_LOCK(dh);
 		/*
 		 * If the new score will be greater than that of the next
 		 * entry, then move this entry past it. With both mutexes
 		 * held, dh_next won't go away, but its dh_score could
 		 * change; that's not important since it is just a hint.
 		 */
-		if (dh->dh_hash != NULL &&
-		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
+		if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
 		    dh->dh_score >= dh_next->dh_score) {
 			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
 			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
 			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
 			    dh_list);
 		}
-		DIRHASHLIST_UNLOCK();
-	} else {
-		/* Already the last, though that could change as we wait. */
-		DIRHASH_LOCK(dh);
-	}
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return (EJUSTRETURN);
 	}
-
 	/* Update the score. */
 	if (dh->dh_score < DH_SCOREMAX)
 		dh->dh_score++;
+	DIRHASHLIST_UNLOCK();
 
 	vp = ip->i_vnode;
 	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
@@ -410,23 +528,23 @@
 	    slot = WRAPINCR(slot, dh->dh_hlen)) {
 		if (offset == DIRHASH_DEL)
 			continue;
-		DIRHASH_UNLOCK(dh);
-
 		if (offset < 0 || offset >= ip->i_size)
 			panic("ufsdirhash_lookup: bad offset in hash array");
 		if ((offset & ~bmask) != blkoff) {
 			if (bp != NULL)
 				brelse(bp);
 			blkoff = offset & ~bmask;
-			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
-				return (EJUSTRETURN);
+			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) {
+				error = EJUSTRETURN;
+				goto fail;
+			}
 		}
 		dp = (struct direct *)(bp->b_data + (offset & bmask));
 		if (dp->d_reclen == 0 || dp->d_reclen >
 		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
 			/* Corrupted directory. */
-			brelse(bp);
-			return (EJUSTRETURN);
+			error = EJUSTRETURN;
+			goto fail;
 		}
 		if (dp->d_namlen == namelen &&
 		    bcmp(dp->d_name, name, namelen) == 0) {
@@ -436,8 +554,8 @@
 					prevoff = ufsdirhash_getprev(dp,
 					    offset);
 					if (prevoff == -1) {
-						brelse(bp);
-						return (EJUSTRETURN);
+						error = EJUSTRETURN;
+						goto fail;
 					}
 				} else
 					prevoff = offset;
@@ -448,20 +566,12 @@
 			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
 				dh->dh_seqopt = 1;
 			dh->dh_seqoff = offset + DIRSIZ(0, dp);
-
 			*bpp = bp;
 			*offp = offset;
+			ufsdirhash_release(dh);
 			return (0);
 		}
 
-		DIRHASH_LOCK(dh);
-		if (dh->dh_hash == NULL) {
-			DIRHASH_UNLOCK(dh);
-			if (bp != NULL)
-				brelse(bp);
-			ufsdirhash_free(ip);
-			return (EJUSTRETURN);
-		}
 		/*
 		 * When the name doesn't match in the seqopt case, go back
 		 * and search normally.
@@ -471,10 +581,12 @@
 			goto restart;
 		}
 	}
-	DIRHASH_UNLOCK(dh);
+	error = ENOENT;
+fail:
+	ufsdirhash_release(dh);
 	if (bp != NULL)
 		brelse(bp);
-	return (ENOENT);
+	return (error);
 }
 
 /*
@@ -502,29 +614,22 @@
 	doff_t pos, slotstart;
 	int dirblock, error, freebytes, i;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return (-1);
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return (-1);
-	}
+	dh = ip->i_dirhash;
+	KASSERT(dh != NULL && dh->dh_hash != NULL,
+	    ("ufsdirhash_findfree: Invalid dirhash %p\n", dh));
+	DIRHASH_ASSERT_LOCKED(dh);
 
 	/* Find a directory block with the desired free space. */
 	dirblock = -1;
 	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
 		if ((dirblock = dh->dh_firstfree[i]) != -1)
 			break;
-	if (dirblock == -1) {
-		DIRHASH_UNLOCK(dh);
+	if (dirblock == -1)
 		return (-1);
-	}
 
 	KASSERT(dirblock < dh->dh_nblk &&
 	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
 	    ("ufsdirhash_findfree: bad stats"));
-	DIRHASH_UNLOCK(dh);
 	pos = dirblock * DIRBLKSIZ;
 	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
 	if (error)
@@ -582,24 +687,18 @@
 	struct dirhash *dh;
 	int i;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return (-1);
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return (-1);
-	}
+	dh = ip->i_dirhash;
+	DIRHASH_ASSERT_LOCKED(dh);
+	KASSERT(dh != NULL && dh->dh_hash != NULL,
+	    ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh));
 
-	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
-		DIRHASH_UNLOCK(dh);
+	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN)
 		return (-1);
-	}
 
 	for (i = dh->dh_dirblks - 1; i >= 0; i--)
 		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
 			break;
-	DIRHASH_UNLOCK(dh);
+
 	return ((doff_t)(i + 1) * DIRBLKSIZ);
 }
 
@@ -614,15 +713,9 @@
 	struct dirhash *dh;
 	int slot;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	}
-
+	
 	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_add: bad offset"));
 	/*
@@ -630,8 +723,7 @@
 	 * remove the hash entirely and let it be rebuilt later.
 	 */
 	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+		ufsdirhash_free_locked(ip);
 		return;
 	}
 
@@ -645,7 +737,7 @@
 
 	/* Update the per-block summary info. */
 	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -659,14 +751,8 @@
 	struct dirhash *dh;
 	int slot;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	}
 
 	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_remove: bad offset"));
@@ -678,7 +764,7 @@
 
 	/* Update the per-block summary info. */
 	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -692,14 +778,8 @@
 	struct dirhash *dh;
 	int slot;
 
-	if ((dh = ip->i_dirhash) == NULL)
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return;
-	}
 
 	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
 	    newoff < dh->dh_dirblks * DIRBLKSIZ,
@@ -707,7 +787,7 @@
 	/* Find the entry, and update the offset. */
 	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
 	DH_ENTRY(dh, slot) = newoff;
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -720,22 +800,15 @@
 	struct dirhash *dh;
 	int block;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	}
 
 	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_newblk: bad offset"));
 	block = offset / DIRBLKSIZ;
 	if (block >= dh->dh_nblk) {
 		/* Out of space; must rebuild. */
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+		ufsdirhash_free_locked(ip);
 		return;
 	}
 	dh->dh_dirblks = block + 1;
@@ -744,7 +817,7 @@
 	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
 	if (dh->dh_firstfree[DH_NFSTATS] == -1)
 		dh->dh_firstfree[DH_NFSTATS] = block;
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -756,14 +829,8 @@
 	struct dirhash *dh;
 	int block, i;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	}
 
 	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_dirtrunc: bad offset"));
@@ -775,8 +842,7 @@
 	 * if necessary.
 	 */
 	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+		ufsdirhash_free_locked(ip);
 		return;
 	}
 
@@ -794,7 +860,7 @@
 		if (dh->dh_firstfree[i] >= block)
 			panic("ufsdirhash_dirtrunc: first free corrupt");
 	dh->dh_dirblks = block;
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -815,14 +881,8 @@
 
 	if (!ufs_dirhashcheck)
 		return;
-	if ((dh = ip->i_dirhash) == NULL)
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return;
-	}
 
 	block = offset / DIRBLKSIZ;
 	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
@@ -866,7 +926,7 @@
 			panic("ufsdirhash_checkblock: bad first-free");
 	if (dh->dh_firstfree[ffslot] == -1)
 		panic("ufsdirhash_checkblock: missing first-free entry");
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -893,7 +953,7 @@
  * by the value specified by `diff'.
  *
  * The caller must ensure we have exclusive access to `dh'; normally
- * that means that dh_mtx should be held, but this is also called
+ * that means that dh_lock should be held, but this is also called
  * from ufsdirhash_build() where exclusive access can be assumed.
  */
 static void
@@ -937,7 +997,7 @@
 {
 	int slot;
 
-	mtx_assert(&dh->dh_mtx, MA_OWNED);
+	DIRHASH_ASSERT_LOCKED(dh);
 
 	/* Find the entry. */
 	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
@@ -961,7 +1021,7 @@
 {
 	int i;
 
-	mtx_assert(&dh->dh_mtx, MA_OWNED);
+	DIRHASH_ASSERT_LOCKED(dh);
 
 	/* Mark the entry as deleted. */
 	DH_ENTRY(dh, slot) = DIRHASH_DEL;
@@ -1026,21 +1086,22 @@
 	int i, mem, narrays;
 
 	DIRHASHLIST_LOCK();
+	dh = TAILQ_FIRST(&ufsdirhash_list);
 	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
-		/* Find a dirhash, and lock it. */
-		if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
-			DIRHASHLIST_UNLOCK();
-			return (-1);
-		}
-		DIRHASH_LOCK(dh);
-		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
-
 		/* Decrement the score; only recycle if it becomes zero. */
-		if (--dh->dh_score > 0) {
-			DIRHASH_UNLOCK(dh);
+		if (dh == NULL || --dh->dh_score > 0) {
 			DIRHASHLIST_UNLOCK();
 			return (-1);
 		}
+		/*
+		 * If we can't lock it, it's in use and we don't want to
+		 * recycle it anyway.
+		 */
+		if (lockmgr(&dh->dh_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
+			dh = TAILQ_NEXT(dh, dh_list);
+			continue;
+		}
+		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
 
 		/* Remove it from the list and detach its memory. */
 		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
@@ -1050,12 +1111,11 @@
 		blkfree = dh->dh_blkfree;
 		dh->dh_blkfree = NULL;
 		narrays = dh->dh_narrays;
-		mem = narrays * sizeof(*dh->dh_hash) +
-		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
-		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
+		mem = dh->dh_memreq;
+		dh->dh_memreq = 0;
 
 		/* Unlock everything, free the detached memory. */
-		DIRHASH_UNLOCK(dh);
+		ufsdirhash_release(dh);
 		DIRHASHLIST_UNLOCK();
 		for (i = 0; i < narrays; i++)
 			DIRHASH_BLKFREE(hash[i]);
@@ -1065,6 +1125,7 @@
 		/* Account for the returned memory, and repeat if necessary. */
 		DIRHASHLIST_LOCK();
 		ufs_dirhashmem -= mem;
+		dh = TAILQ_FIRST(&ufsdirhash_list);
 	}
 	/* Success; return with list locked. */
 	return (0);
Index: ufs/ufs/ufs_lookup.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_lookup.c,v
retrieving revision 1.86
diff -u -r1.86 ufs_lookup.c
--- ufs/ufs/ufs_lookup.c	13 Jan 2008 14:44:14 -0000	1.86
+++ ufs/ufs/ufs_lookup.c	6 Apr 2008 10:09:36 -0000
@@ -137,6 +137,8 @@
 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
 	enum {NONE, COMPACT, FOUND} slotstatus;
 	doff_t slotoffset;		/* offset of area with free space */
+	doff_t i_diroff;		/* cached i_diroff value. */
+	doff_t i_offset;		/* cached i_offset value. */
 	int slotsize;			/* size of area at slotoffset */
 	int slotfreespace;		/* amount of space free in slot */
 	int slotneeded;			/* size of the entry we're seeking */
@@ -154,6 +156,7 @@
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	ino_t saved_ino;
+	int ltype;
 
 	bp = NULL;
 	slotoffset = -1;
@@ -173,6 +176,7 @@
 	 * we watch for a place to put the new file in
 	 * case it doesn't already exist.
 	 */
+	i_diroff = dp->i_diroff;
 	slotstatus = FOUND;
 	slotfreespace = slotsize = slotneeded = 0;
 	if ((nameiop == CREATE || nameiop == RENAME) &&
@@ -206,13 +210,13 @@
 		numdirpasses = 1;
 		entryoffsetinblock = 0; /* silence compiler warning */
 		switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
-		    &dp->i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
+		    &i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
 		case 0:
 			ep = (struct direct *)((char *)bp->b_data +
-			    (dp->i_offset & bmask));
+			    (i_offset & bmask));
 			goto foundentry;
 		case ENOENT:
-			dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ);
+			i_offset = roundup2(dp->i_size, DIRBLKSIZ);
 			goto notfound;
 		default:
 			/* Something failed; just do a linear search. */
@@ -231,33 +235,32 @@
 	 * profiling time and hence has been removed in the interest
 	 * of simplicity.
 	 */
-	if (nameiop != LOOKUP || dp->i_diroff == 0 ||
-	    dp->i_diroff >= dp->i_size) {
+	if (nameiop != LOOKUP || i_diroff == 0 || i_diroff >= dp->i_size) {
 		entryoffsetinblock = 0;
-		dp->i_offset = 0;
+		i_offset = 0;
 		numdirpasses = 1;
 	} else {
-		dp->i_offset = dp->i_diroff;
-		if ((entryoffsetinblock = dp->i_offset & bmask) &&
-		    (error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp)))
+		i_offset = i_diroff;
+		if ((entryoffsetinblock = i_offset & bmask) &&
+		    (error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp)))
 			return (error);
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
-	prevoff = dp->i_offset;
+	prevoff = i_offset;
 	endsearch = roundup2(dp->i_size, DIRBLKSIZ);
 	enduseful = 0;
 
 searchloop:
-	while (dp->i_offset < endsearch) {
+	while (i_offset < endsearch) {
 		/*
 		 * If necessary, get the next directory block.
 		 */
-		if ((dp->i_offset & bmask) == 0) {
+		if ((i_offset & bmask) == 0) {
 			if (bp != NULL)
 				brelse(bp);
 			error =
-			    UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp);
+			    UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp);
 			if (error)
 				return (error);
 			entryoffsetinblock = 0;
@@ -284,9 +287,9 @@
 		    (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) {
 			int i;
 
-			ufs_dirbad(dp, dp->i_offset, "mangled entry");
+			ufs_dirbad(dp, i_offset, "mangled entry");
 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
-			dp->i_offset += i;
+			i_offset += i;
 			entryoffsetinblock += i;
 			continue;
 		}
@@ -305,15 +308,15 @@
 			if (size > 0) {
 				if (size >= slotneeded) {
 					slotstatus = FOUND;
-					slotoffset = dp->i_offset;
+					slotoffset = i_offset;
 					slotsize = ep->d_reclen;
 				} else if (slotstatus == NONE) {
 					slotfreespace += size;
 					if (slotoffset == -1)
-						slotoffset = dp->i_offset;
+						slotoffset = i_offset;
 					if (slotfreespace >= slotneeded) {
 						slotstatus = COMPACT;
-						slotsize = dp->i_offset +
+						slotsize = i_offset +
 						      ep->d_reclen - slotoffset;
 					}
 				}
@@ -347,7 +350,7 @@
 				if (vdp->v_mount->mnt_maxsymlinklen > 0 &&
 				    ep->d_type == DT_WHT) {
 					slotstatus = FOUND;
-					slotoffset = dp->i_offset;
+					slotoffset = i_offset;
 					slotsize = ep->d_reclen;
 					dp->i_reclen = slotsize;
 					enduseful = dp->i_size;
@@ -360,11 +363,11 @@
 				goto found;
 			}
 		}
-		prevoff = dp->i_offset;
-		dp->i_offset += ep->d_reclen;
+		prevoff = i_offset;
+		i_offset += ep->d_reclen;
 		entryoffsetinblock += ep->d_reclen;
 		if (ep->d_ino)
-			enduseful = dp->i_offset;
+			enduseful = i_offset;
 	}
 notfound:
 	/*
@@ -373,10 +376,11 @@
 	 */
 	if (numdirpasses == 2) {
 		numdirpasses--;
-		dp->i_offset = 0;
-		endsearch = dp->i_diroff;
+		i_offset = 0;
+		endsearch = i_diroff;
 		goto searchloop;
 	}
+	dp->i_offset = i_offset;
 	if (bp != NULL)
 		brelse(bp);
 	/*
@@ -389,6 +393,7 @@
 	      (ap->a_cnp->cn_flags & DOWHITEOUT) &&
 	      (ap->a_cnp->cn_flags & ISWHITEOUT))) &&
 	    (flags & ISLASTCN) && dp->i_effnlink != 0) {
+		ASSERT_VOP_LOCKED(vdp, __FUNCTION__);
 		/*
 		 * Access for write is interpreted as allowing
 		 * creation of files in the directory.
@@ -452,9 +457,9 @@
 	 * Check that directory length properly reflects presence
 	 * of this entry.
 	 */
-	if (dp->i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) {
-		ufs_dirbad(dp, dp->i_offset, "i_size too small");
-		dp->i_size = dp->i_offset + DIRSIZ(OFSFMT(vdp), ep);
+	if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) {
+		ufs_dirbad(dp, i_offset, "i_size too small");
+		dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep);
 		DIP_SET(dp, i_size, dp->i_size);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
@@ -466,13 +471,15 @@
 	 * in the cache as to where the entry was found.
 	 */
 	if ((flags & ISLASTCN) && nameiop == LOOKUP)
-		dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1);
+		dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1);
 
+	dp->i_offset = i_offset;
 	/*
 	 * If deleting, and at end of pathname, return
 	 * parameters which can be used to remove file.
 	 */
 	if (nameiop == DELETE && (flags & ISLASTCN)) {
+		ASSERT_VOP_LOCKED(vdp, __FUNCTION__);
 		/*
 		 * Write access to directory required to delete files.
 		 */
@@ -557,16 +564,28 @@
 	 */
 	pdp = vdp;
 	if (flags & ISDOTDOT) {
+		ltype = VOP_ISLOCKED(pdp);
 		saved_ino = dp->i_ino;
 		VOP_UNLOCK(pdp, 0);	/* race to get the inode */
 		error = VFS_VGET(pdp->v_mount, saved_ino,
 		    cnp->cn_lkflags, &tdp);
-		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(pdp, ltype | LK_RETRY);
 		if (error)
 			return (error);
 		*vpp = tdp;
 	} else if (dp->i_number == dp->i_ino) {
 		VREF(vdp);	/* we want ourself, ie "." */
+		/*
+		 * When we look up "." we can still be asked to lock it
+		 * differently.
+		 */
+		ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE);
+		if (ltype != VOP_ISLOCKED(vdp)) {
+			if (ltype == LK_EXCLUSIVE)
+				vn_lock(vdp, LK_UPGRADE | LK_RETRY);
+			else /* if (ltype == LK_SHARED) */
+				vn_lock(vdp, LK_DOWNGRADE | LK_RETRY);
+		}
 		*vpp = vdp;
 	} else {
 		error = VFS_VGET(pdp->v_mount, dp->i_ino,