--- //depot/projects/smpng/sys/compat/linux/linux_misc.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/compat/linux/linux_misc.c	2009/05/13 17:57:30
@@ -251,8 +251,8 @@
 	locked = 0;
 	vp = NULL;
 
-	NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
-	    UIO_SYSSPACE, library, td);
+	NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE |
+	    AUDITVNODE1, UIO_SYSSPACE, library, td);
 	error = namei(&ni);
 	LFREEPATH(library);
 	if (error)
@@ -312,6 +312,13 @@
 	if (error)
 		goto cleanup;
 
+	/*
+	 * Drop the vnode lock (but not the reference) while we map
+	 * the header.
+	 */
+	locked = 0;
+	VOP_UNLOCK(vp, 0);
+
 	/* Pull in executable header into kernel_map */
 	error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE,
 	    VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
@@ -374,14 +381,9 @@
 	 * XXX: Note that if any of the VM operations fail below we don't
 	 * clear this flag.
 	 */
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_vflag |= VV_TEXT;
-
-	/*
-	 * Lock no longer needed
-	 */
-	locked = 0;
 	VOP_UNLOCK(vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
 
 	/*
 	 * Check if file_offset page aligned. Currently we cannot handle
@@ -460,8 +462,12 @@
 
 cleanup:
 	/* Unlock vnode if needed */
-	if (locked) {
-		VOP_UNLOCK(vp, 0);
+	if (locked || vp != NULL) {
+		if (locked)
+			vput(vp);
+		else
+			/* XXX: Should this do VOP_CLOSE()? */
+			vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 
@@ -470,7 +476,7 @@
 		vm_map_remove(kernel_map, (vm_offset_t)a_out,
 		    (vm_offset_t)a_out + PAGE_SIZE);
 
-	return error;
+	return (error);
 }
 
 #endif	/* __i386__ */
--- //depot/projects/smpng/sys/compat/svr4/svr4_misc.c	2009/02/13 18:22:54
+++ //depot/user/jhb/lock/compat/svr4/svr4_misc.c	2009/02/13 20:21:57
@@ -1611,14 +1611,14 @@
 	struct nameidata nd;
 	int error, *retval = td->td_retval;
 	unsigned int ncopy;
-	int vfslocked;
 
 	NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME | MPSAFE, UIO_USERSPACE,
 	    uap->path, td);
 
 	if ((error = namei(&nd)) != 0)
-		return error;
-	vfslocked = NDHASGIANT(&nd);
+		return (error);
+	NDFREE(&nd, NDF_NO_FREE_PNBUF);
+	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
 
 	ncopy = min(uap->bufsiz, strlen(nd.ni_cnd.cn_pnbuf) + 1);
 	if ((error = copyout(nd.ni_cnd.cn_pnbuf, uap->buf, ncopy)) != 0)
@@ -1627,7 +1627,5 @@
 	*retval = ncopy;
 bad:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
-	vput(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return error;
 }
--- //depot/projects/smpng/sys/contrib/altq/altq/altq_subr.c	2009/05/19 13:40:43
+++ //depot/user/jhb/lock/contrib/altq/altq/altq_subr.c	2009/05/20 17:51:14
@@ -1021,7 +1021,7 @@
 		microtime(&tv_start);
 		start = read_machclk();
 		timo = hz;	/* 1 sec */
-		(void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo);
+		(void)tsleep(&wait, PWAIT, "init_machclk", timo);
 		microtime(&tv_end);
 		end = read_machclk();
 		diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
--- //depot/projects/smpng/sys/fs/cd9660/cd9660_lookup.c	2009/01/28 22:38:17
+++ //depot/user/jhb/lock/fs/cd9660/cd9660_lookup.c	2009/05/11 13:58:44
@@ -374,15 +374,17 @@
 		 */
 		mp = pdp->v_mount;
 		ltype = VOP_ISLOCKED(pdp);
-		for (;;) {
-			error = vfs_busy(mp, MBF_NOWAIT);
-			if (error == 0)
-				break;
+		error = vfs_busy(mp, MBF_NOWAIT);
+		if (error != 0) {
 			VOP_UNLOCK(pdp, 0);
-			pause("vn_vget", 1);
+			error = vfs_busy(mp, 0);
 			vn_lock(pdp, ltype | LK_RETRY);
-			if (pdp->v_iflag & VI_DOOMED)
+			if (error)
+				return (ENOENT);
+			if (pdp->v_iflag & VI_DOOMED) {
+				vfs_unbusy(mp);
 				return (ENOENT);
+			}
 		}
 		VOP_UNLOCK(pdp, 0);
 		error = cd9660_vget_internal(vdp->v_mount, i_ino,
--- //depot/projects/smpng/sys/fs/cd9660/cd9660_vfsops.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/fs/cd9660/cd9660_vfsops.c	2009/05/13 17:57:30
@@ -156,7 +156,8 @@
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible block device.
 	 */
-	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
+	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE,
+	    fspec, td);
 	if ((error = namei(&ndp)))
 		return (error);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
--- //depot/projects/smpng/sys/fs/hpfs/hpfs_vfsops.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/fs/hpfs/hpfs_vfsops.c	2009/05/13 17:57:30
@@ -156,7 +156,8 @@
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible block device.
 	 */
-	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
+	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, from,
+	    td);
 	err = namei(&ndp);
 	if (err) {
 		/* can't get devvp!*/
--- //depot/projects/smpng/sys/fs/msdosfs/denode.h	2009/02/27 21:13:56
+++ //depot/user/jhb/lock/fs/msdosfs/denode.h	2009/03/05 20:57:52
@@ -135,20 +135,26 @@
 /*
  * This is the in memory variant of a dos directory entry.  It is usually
  * contained within a vnode.
+ *
+ * Locking key:
+ * (c) - only changed during initial vnode creation or reclaim
+ * (i) - VI_LOCK()
+ * (v) - vn_lock()
+ * (*) - TBD, but needs a lock of some sort
  */
 struct denode {
-	struct vnode *de_vnode;	/* addr of vnode we are part of */
-	u_long de_flag;		/* flag bits */
-	u_long de_dirclust;	/* cluster of the directory file containing this entry */
-	u_long de_diroffset;	/* offset of this entry in the directory cluster */
-	u_long de_fndoffset;	/* offset of found dir entry */
-	int de_fndcnt;		/* number of slots before de_fndoffset */
-	long de_refcnt;		/* reference count */
-	struct msdosfsmount *de_pmp;	/* addr of our mount struct */
+	struct vnode *de_vnode;	/* (c) addr of vnode we are part of */
+	u_long de_flag;	/* (*) flag bits */
+	u_long de_dirclust;	/* (*) cluster of the directory file containing this entry */
+	u_long de_diroffset;	/* (*) offset of this entry in the directory cluster */
+	u_long de_fndoffset;	/* (*) offset of found dir entry */
+	int de_fndcnt;		/* (*) number of slots before de_fndoffset */
+	long de_refcnt;	/* (*) reference count */
+	struct msdosfsmount *de_pmp;	/* (c) addr of our mount struct */
 	u_char de_Name[12];	/* name, from DOS directory entry */
 	u_char de_Attributes;	/* attributes, from directory entry */
 	u_char de_LowerCase;	/* NT VFAT lower case flags */
-	u_char de_CHun;		/* Hundredth of second of CTime*/
+	u_char de_CHun;	/* Hundredth of second of CTime*/
 	u_short de_CTime;	/* creation time */
 	u_short de_CDate;	/* creation date */
 	u_short de_ADate;	/* access date */
--- //depot/projects/smpng/sys/fs/msdosfs/msdosfs_denode.c	2009/02/27 21:13:56
+++ //depot/user/jhb/lock/fs/msdosfs/msdosfs_denode.c	2009/03/05 20:57:52
@@ -168,6 +168,7 @@
 	ldep->de_dirclust = dirclust;
 	ldep->de_diroffset = diroffset;
 	ldep->de_inode = inode;
+	ldep->de_Name[0] = SLOT_DELETED;
 	fc_purge(ldep, 0);	/* init the fat cache for this denode */
 
 	lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL);
@@ -184,9 +185,8 @@
 		return (error);
 	}
 	if (xvp != NULL) {
-		/* XXX: Not sure this is right */
-		nvp = xvp;
-		ldep->de_vnode = nvp;
+		*depp = VTODE(xvp);
+		return (0);
 	}
 
 	ldep->de_pmp = pmp;
@@ -228,17 +228,12 @@
 		ldep->de_ADate = ldep->de_CDate;
 		ldep->de_MTime = ldep->de_CTime;
 		ldep->de_MDate = ldep->de_CDate;
-		/* leave the other fields as garbage */
+		/* Clear de_Name[0] as this is a real node now. */
+		ldep->de_Name[0] = 0;
+		/* leave the other fields as garbage */
 	} else {
 		error = readep(pmp, dirclust, diroffset, &bp, &direntptr);
 		if (error) {
-			/*
-			 * The denode does not contain anything useful, so
-			 * it would be wrong to leave it on its hash chain.
-			 * Arrange for vput() to just forget about it.
-			 */
-			ldep->de_Name[0] = SLOT_DELETED;
-
 			vput(nvp);
 			*depp = NULL;
 			return (error);
--- //depot/projects/smpng/sys/fs/msdosfs/msdosfs_vfsops.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/fs/msdosfs/msdosfs_vfsops.c	2009/05/13 17:57:30
@@ -314,7 +314,7 @@
 			 * that user has necessary permissions on the device.
 			 */
 			devvp = pmp->pm_devvp;
-			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+			vn_lock(devvp, LK_SHARED | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
@@ -350,7 +350,8 @@
 	 */
 	if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL))
 		return (EINVAL);
-	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
+	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, from,
+	    td);
 	error = namei(&ndp);
 	if (error)
 		return (error);
--- //depot/projects/smpng/sys/fs/msdosfs/msdosfsmount.h	2009/02/27 21:13:56
+++ //depot/user/jhb/lock/fs/msdosfs/msdosfsmount.h	2009/03/05 20:57:52
@@ -63,50 +63,56 @@
 
 /*
  * Layout of the mount control block for a msdos filesystem.
+ *
+ * Locking key:
+ *  (c) - only changed during initial mount/unmount
+ *  (i) - MNT_ILOCK()
+ *  (u) - like (c) but can be changed via MNT_UPDATE request
+ *  (*) - TBD, but needs a lock of some sort
  */
 struct msdosfsmount {
-	struct mount *pm_mountp;/* vfs mount struct for this fs */
-	struct g_consumer *pm_cp;
-	struct bufobj *pm_bo;
-	uid_t pm_uid;		/* uid to set as owner of the files */
-	gid_t pm_gid;		/* gid to set as owner of the files */
-	mode_t pm_mask;		/* mask to and with file protection bits 
+	struct mount *pm_mountp;/* (c) vfs mount struct for this fs */
+	struct g_consumer *pm_cp; /* (c) */
+	struct bufobj *pm_bo;	/* (c) */
+	uid_t pm_uid;		/* (u) uid to set as owner of the files */
+	gid_t pm_gid;		/* (u) gid to set as owner of the files */
+	mode_t pm_mask;	/* (u) mask to and with file protection bits 
 				   for files */
-	mode_t pm_dirmask;	/* mask to and with file protection bits
+	mode_t pm_dirmask;	/* (u) mask to and with file protection bits
 				   for directories */
-	struct vnode *pm_devvp;	/* vnode for character device mounted */
-	struct cdev *pm_dev;	/* character device mounted */
-	struct bpb50 pm_bpb;	/* BIOS parameter blk for this fs */
-	u_long pm_BlkPerSec;	/* How many DEV_BSIZE blocks fit inside a physical sector */
-	u_long pm_FATsecs;	/* actual number of fat sectors */
-	u_long pm_fatblk;	/* block # of first FAT */
-	u_long pm_rootdirblk;	/* block # (cluster # for FAT32) of root directory number */
-	u_long pm_rootdirsize;	/* size in blocks (not clusters) */
-	u_long pm_firstcluster;	/* block number of first cluster */
-	u_long pm_maxcluster;	/* maximum cluster number */
-	u_long pm_freeclustercount;	/* number of free clusters */
-	u_long pm_cnshift;	/* shift file offset right this amount to get a cluster number */
-	u_long pm_crbomask;	/* and a file offset with this mask to get cluster rel offset */
-	u_long pm_bnshift;	/* shift file offset right this amount to get a block number */
-	u_long pm_bpcluster;	/* bytes per cluster */
-	u_long pm_fmod;		/* ~0 if fs is modified, this can rollover to 0	*/
-	u_long pm_fatblocksize;	/* size of fat blocks in bytes */
-	u_long pm_fatblocksec;	/* size of fat blocks in sectors */
-	u_long pm_fatsize;	/* size of fat in bytes */
-	u_int32_t pm_fatmask;	/* mask to use for fat numbers */
-	u_long pm_fsinfo;	/* fsinfo block number */
-	u_long pm_nxtfree;	/* next place to search for a free cluster */
-	u_int pm_fatmult;	/* these 2 values are used in fat */
-	u_int pm_fatdiv;	/*	offset computation */
-	u_int pm_curfat;	/* current fat for FAT32 (0 otherwise) */
-	u_int *pm_inusemap;	/* ptr to bitmap of in-use clusters */
-	u_int pm_flags;		/* see below */
-	void *pm_u2w;	/* Local->Unicode iconv handle */
-	void *pm_w2u;	/* Unicode->Local iconv handle */
-	void *pm_u2d;	/* Unicode->DOS iconv handle */
-	void *pm_d2u;	/* DOS->Local iconv handle */
-	u_int32_t pm_nfileno;	/* next 32-bit fileno */
-	RB_HEAD(msdosfs_filenotree, msdosfs_fileno) pm_filenos; /* 64<->32-bit fileno mapping */
+	struct vnode *pm_devvp; /* (c) vnode for character device mounted */
+	struct cdev *pm_dev;	/* (c) character device mounted */
+	struct bpb50 pm_bpb;	/* (c) BIOS parameter blk for this fs */
+	u_long pm_BlkPerSec;	/* (c) How many DEV_BSIZE blocks fit inside a physical sector */
+	u_long pm_FATsecs;	/* (c) actual number of fat sectors */
+	u_long pm_fatblk;	/* (c) block # of first FAT */
+	u_long pm_rootdirblk;	/* (c) block # (cluster # for FAT32) of root directory number */
+	u_long pm_rootdirsize;	/* (c) size in blocks (not clusters) */
+	u_long pm_firstcluster; /* (c) block number of first cluster */
+	u_long pm_maxcluster;	/* (c) maximum cluster number */
+	u_long pm_freeclustercount; /* (*) number of free clusters */
+	u_long pm_cnshift;	/* (c) shift file offset right this amount to get a cluster number */
+	u_long pm_crbomask;	/* (c) and a file offset with this mask to get cluster rel offset */
+	u_long pm_bnshift;	/* (c) shift file offset right this amount to get a block number */
+	u_long pm_bpcluster;	/* (c) bytes per cluster */
+	u_long pm_fmod;	/* (*) ~0 if fs is modified, this can rollover to 0	*/
+	u_long pm_fatblocksize; /* (c) size of fat blocks in bytes */
+	u_long pm_fatblocksec;	/* (c) size of fat blocks in sectors */
+	u_long pm_fatsize;	/* (c) size of fat in bytes */
+	u_int32_t pm_fatmask;	/* (c) mask to use for fat numbers */
+	u_long pm_fsinfo;	/* (*) fsinfo block number */
+	u_long pm_nxtfree;	/* (*) next place to search for a free cluster */
+	u_int pm_fatmult;	/* (c) these 2 values are used in fat */
+	u_int pm_fatdiv;	/* (c)	offset computation */
+	u_int pm_curfat;	/* (c) current fat for FAT32 (0 otherwise) */
+	u_int *pm_inusemap;	/* (*) ptr to bitmap of in-use clusters */
+	u_int pm_flags;	/* (u) see below */
+	void *pm_u2w;		/* (u) Local->Unicode iconv handle */
+	void *pm_w2u;		/* (u) Unicode->Local iconv handle */
+	void *pm_u2d;		/* (u) Unicode->DOS iconv handle */
+	void *pm_d2u;		/* (u) DOS->Local iconv handle */
+	u_int32_t pm_nfileno;	/* (*) next 32-bit fileno */
+	RB_HEAD(msdosfs_filenotree, msdosfs_fileno) pm_filenos; /* (*) 64<->32-bit fileno mapping */
 };
 
 /*
--- //depot/projects/smpng/sys/fs/ntfs/ntfs_vfsops.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/fs/ntfs/ntfs_vfsops.c	2009/05/13 17:57:30
@@ -181,7 +181,8 @@
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible block device.
 	 */
-	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, curthread);
+	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, from,
+	    curthread);
 	err = namei(&ndp);
 	if (err) {
 		/* can't get devvp!*/
--- //depot/projects/smpng/sys/fs/nullfs/null_vfsops.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/fs/nullfs/null_vfsops.c	2009/05/13 17:57:30
@@ -115,7 +115,8 @@
 	/*
 	 * Find lower node
 	 */
-	NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target, curthread);
+	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE,
+	    target, curthread);
 	error = namei(ndp);
 	/*
 	 * Re-lock vnode.
--- //depot/projects/smpng/sys/fs/udf/udf_vfsops.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/fs/udf/udf_vfsops.c	2009/05/13 17:57:30
@@ -225,7 +225,8 @@
 	/* Check that the mount device exists */
 	if (fspec == NULL)
 		return (EINVAL);
-	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
+	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, fspec,
+	    td);
 	if ((error = namei(ndp)))
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
--- //depot/projects/smpng/sys/kern/kern_acct.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/kern/kern_acct.c	2009/05/13 17:57:30
@@ -412,7 +412,6 @@
 
 	/* (8) The boolean flags that tell how the process terminated, etc. */
 	acct.ac_flagx = p->p_acflag;
-	PROC_UNLOCK(p);
 
 	/* Setup ancillary structure fields. */
 	acct.ac_flagx |= ANVER;
@@ -423,14 +422,17 @@
 	/*
 	 * Eliminate any file size rlimit.
 	 */
-	newlim = lim_alloc();
-	PROC_LOCK(p);
-	oldlim = p->p_limit;
-	lim_copy(newlim, oldlim);
-	newlim->pl_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	p->p_limit = newlim;
+	if (p->p_limit->pl_rlimit[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) {
+		PROC_UNLOCK(p);
+		newlim = lim_alloc();
+		PROC_LOCK(p);
+		oldlim = p->p_limit;
+		lim_copy(newlim, oldlim);
+		newlim->pl_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+		p->p_limit = newlim;
+		lim_free(oldlim);
+	}
 	PROC_UNLOCK(p);
-	lim_free(oldlim);
 
 	/*
 	 * Write the accounting information to the file.
--- //depot/projects/smpng/sys/kern/kern_condvar.c	2009/02/27 15:49:22
+++ //depot/user/jhb/lock/kern/kern_condvar.c	2009/02/27 16:32:01
@@ -122,7 +122,7 @@
 
 	sleepq_lock(cvp);
 
-	cvp->cv_waiters++;
+	cvp->cv_waiters = 1;
 	if (lock == &Giant.lock_object)
 		mtx_assert(&Giant, MA_OWNED);
 	DROP_GIANT();
@@ -184,7 +184,7 @@
 
 	sleepq_lock(cvp);
 
-	cvp->cv_waiters++;
+	cvp->cv_waiters = 1;
 	DROP_GIANT();
 
 	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
@@ -239,7 +239,7 @@
 
 	sleepq_lock(cvp);
 
-	cvp->cv_waiters++;
+	cvp->cv_waiters = 1;
 	if (lock == &Giant.lock_object)
 		mtx_assert(&Giant, MA_OWNED);
 	DROP_GIANT();
@@ -305,7 +305,7 @@
 
 	sleepq_lock(cvp);
 
-	cvp->cv_waiters++;
+	cvp->cv_waiters = 1;
 	if (lock == &Giant.lock_object)
 		mtx_assert(&Giant, MA_OWNED);
 	DROP_GIANT();
@@ -372,7 +372,7 @@
 
 	sleepq_lock(cvp);
 
-	cvp->cv_waiters++;
+	cvp->cv_waiters = 1;
 	if (lock == &Giant.lock_object)
 		mtx_assert(&Giant, MA_OWNED);
 	DROP_GIANT();
@@ -417,10 +417,8 @@
 
 	wakeup_swapper = 0;
 	sleepq_lock(cvp);
-	if (cvp->cv_waiters > 0) {
-		cvp->cv_waiters--;
+	if (cvp->cv_waiters > 0)
 		wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0);
-	}
 	sleepq_release(cvp);
 	if (wakeup_swapper)
 		kick_proc0();
--- //depot/projects/smpng/sys/kern/kern_mtxpool.c	2008/11/03 21:11:59
+++ //depot/user/jhb/lock/kern/kern_mtxpool.c	2009/05/10 13:05:31
@@ -71,16 +71,20 @@
 	int		mtxpool_mask;
 	int		mtxpool_shift;
 	int		mtxpool_next;
-};
+} __aligned(CACHE_LINE_SIZE);
+
+union mtx_c {
+	struct mtx	mtx;
+} __aligned(CACHE_LINE_SIZE);
 
 struct mtx_pool {
 	struct mtxpool_header mtx_pool_header;
-	struct mtx	mtx_pool_ary[1];
+	union mtx_c	mtx_pool_ary[1];
 };
 
 static struct mtx_pool_lockbuilder {
 	struct mtxpool_header mtx_pool_header;
-	struct mtx	mtx_pool_ary[MTX_POOL_LOCKBUILDER_SIZE];
+	union mtx_c	mtx_pool_ary[MTX_POOL_LOCKBUILDER_SIZE];
 } lockbuilder_pool;
 
 #define mtx_pool_size	mtx_pool_header.mtxpool_size
@@ -117,7 +121,7 @@
 	 */
 	p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) &
 	    pool->mtx_pool_mask;
-	return (&pool->mtx_pool_ary[p]);
+	return (&pool->mtx_pool_ary[p].mtx);
 }
 
 static void
@@ -133,7 +137,7 @@
 	pool->mtx_pool_shift = POINTER_BITS - maskbits;
 	pool->mtx_pool_next = 0;
 	for (i = 0; i < pool_size; ++i)
-		mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts);
+		mtx_init(&pool->mtx_pool_ary[i].mtx, mtx_name, NULL, opts);
 }
 
 struct mtx_pool *
@@ -147,7 +151,7 @@
 		pool_size = 128;
 	}
 	pool = malloc(sizeof (struct mtx_pool) +
-	    ((pool_size - 1) * sizeof (struct mtx)),
+	    ((pool_size - 1) * sizeof (union mtx_c)),
 	    M_MTXPOOL, M_WAITOK | M_ZERO);
 	mtx_pool_initialize(pool, mtx_name, pool_size, opts);
 	return pool;
@@ -160,7 +164,7 @@
 	struct mtx_pool *pool = *poolp;
 
 	for (i = pool->mtx_pool_size - 1; i >= 0; --i)
-		mtx_destroy(&pool->mtx_pool_ary[i]);
+		mtx_destroy(&pool->mtx_pool_ary[i].mtx);
 	free(pool, M_MTXPOOL);
 	*poolp = NULL;
 }
@@ -199,7 +203,7 @@
 	 */
 	i = pool->mtx_pool_next;
 	pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask;
-	return (&pool->mtx_pool_ary[i]);
+	return (&pool->mtx_pool_ary[i].mtx);
 }
 
 /*
--- //depot/projects/smpng/sys/kern/kern_mutex.c	2009/03/19 16:52:48
+++ //depot/user/jhb/lock/kern/kern_mutex.c	2009/03/19 17:44:47
@@ -369,21 +369,20 @@
 		 */
 		if (v == MTX_UNOWNED) {
 			turnstile_cancel(ts);
-			cpu_spinwait();
 			continue;
 		}
 
-		MPASS(v != MTX_CONTESTED);
-
 #ifdef ADAPTIVE_MUTEXES
 		/*
-		 * If the current owner of the lock is executing on another
-		 * CPU quit the hard path and try to spin.
+		 * The current lock owner might have started executing
+		 * on another CPU (or the lock could have changed
+		 * owners) while we were waiting on the turnstile
+		 * chain lock.  If so, drop the turnstile lock and try
+		 * again.
 		 */
 		owner = (struct thread *)(v & ~MTX_FLAGMASK);
 		if (TD_IS_RUNNING(owner)) {
 			turnstile_cancel(ts);
-			cpu_spinwait();
 			continue;
 		}
 #endif
@@ -396,7 +395,6 @@
 		if ((v & MTX_CONTESTED) == 0 &&
 		    !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
 			turnstile_cancel(ts);
-			cpu_spinwait();
 			continue;
 		}
 
--- //depot/projects/smpng/sys/kern/kern_rwlock.c	2009/03/19 16:52:48
+++ //depot/user/jhb/lock/kern/kern_rwlock.c	2009/03/19 17:44:47
@@ -322,7 +322,6 @@
 					    (void *)(v + RW_ONE_READER));
 				break;
 			}
-			cpu_spinwait();
 			continue;
 		}
 		lock_profile_obtain_lock_failed(&rw->lock_object,
@@ -380,14 +379,16 @@
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
-		 * If the current owner of the lock is executing on another
-		 * CPU quit the hard path and try to spin.
+		 * The current lock owner might have started executing
+		 * on another CPU (or the lock could have changed
+		 * owners) while we were waiting on the turnstile
+		 * chain lock.  If so, drop the turnstile lock and try
+		 * again.
 		 */
 		if ((v & RW_LOCK_READ) == 0) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				turnstile_cancel(ts);
-				cpu_spinwait();
 				continue;
 			}
 		}
@@ -408,7 +409,6 @@
 			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 			    v | RW_LOCK_READ_WAITERS)) {
 				turnstile_cancel(ts);
-				cpu_spinwait();
 				continue;
 			}
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
@@ -650,14 +650,16 @@
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
-		 * If the current owner of the lock is executing on another
-		 * CPU quit the hard path and try to spin.
+		 * The current lock owner might have started executing
+		 * on another CPU (or the lock could have changed
+		 * owners) while we were waiting on the turnstile
+		 * chain lock.  If so, drop the turnstile lock and try
+		 * again.
 		 */
 		if (!(v & RW_LOCK_READ)) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				turnstile_cancel(ts);
-				cpu_spinwait();
 				continue;
 			}
 		}
@@ -680,7 +682,6 @@
 				break;
 			}
 			turnstile_cancel(ts);
-			cpu_spinwait();
 			continue;
 		}
 		/*
@@ -692,7 +693,6 @@
 			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 			    v | RW_LOCK_WRITE_WAITERS)) {
 				turnstile_cancel(ts);
-				cpu_spinwait();
 				continue;
 			}
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
@@ -797,6 +797,13 @@
 	_rw_assert(rw, RA_RLOCKED, file, line);
 
 	/*
+	 * If we have multiple readers, just fail without doing any
+	 * atomic operations.
+	 */
+	if (RW_READERS(rw->rw_lock) > 1)
+		return (0);
+
+	/*
 	 * Attempt to switch from one reader to a writer.  If there
 	 * are any write waiters, then we will have to lock the
 	 * turnstile first to prevent races with another writer
--- //depot/projects/smpng/sys/kern/kern_shutdown.c	2009/04/07 17:48:51
+++ //depot/user/jhb/lock/kern/kern_shutdown.c	2009/04/07 19:19:09
@@ -115,6 +115,12 @@
 SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW,
 	&sync_on_panic, 0, "Do a sync before rebooting from a panic");
 
+#ifdef INVARIANT_SUPPORT
+int debugger_on_caught_panic = 0;
+SYSCTL_INT(_debug, OID_AUTO, debugger_on_caught_panic, CTLFLAG_RW,
+    &debugger_on_caught_panic, 0, "Run debugger on caught kernel panic");
+#endif
+
 SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment");
 
 /*
@@ -533,6 +539,21 @@
 	if (panicstr)
 		bootopt |= RB_NOSYNC;
 	else {
+#ifdef INVARIANT_SUPPORT
+		if (td->td_pflags & TDP_CATCHPANIC) {
+			td->td_pflags &= ~TDP_CATCHPANIC;
+			td->td_caught_panic = fmt;
+			if (debugger_on_caught_panic) {
+				va_start(ap, fmt);
+				printf("caught panic: ");
+				vprintf(fmt, ap);
+				printf("\n");
+				va_end(ap);
+				kdb_enter(NULL, "caught panic");
+			}				
+			longjmp(td->td_panic_buf, 1);
+		}
+#endif
 		panicstr = fmt;
 		newpanic = 1;
 	}
--- //depot/projects/smpng/sys/kern/kern_thread.c	2009/03/19 16:52:48
+++ //depot/user/jhb/lock/kern/kern_thread.c	2009/03/19 17:44:47
@@ -35,6 +35,7 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
@@ -48,6 +49,7 @@
 #include <sys/umtx.h>
 #include <sys/cpuset.h>
 
+#include <machine/setjmp.h>
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
@@ -164,6 +166,9 @@
 	td->td_sched = (struct td_sched *)&td[1];
 	umtx_thread_init(td);
 	td->td_kstack = 0;
+#ifdef INVARIANT_SUPPORT
+	td->td_panic_buf = malloc(sizeof(struct _jmp_buf), M_SUBPROC, M_WAITOK);
+#endif
 	return (0);
 }
 
@@ -181,6 +186,9 @@
 	sleepq_free(td->td_sleepqueue);
 	umtx_thread_fini(td);
 	seltdfini(td);
+#ifdef INVARIANT_SUPPORT
+	free(td->td_panic_buf, M_SUBPROC);
+#endif
 }
 
 /*
--- //depot/projects/smpng/sys/kern/kern_time.c	2009/04/14 19:06:19
+++ //depot/user/jhb/lock/kern/kern_time.c	2009/04/14 19:48:26
@@ -364,6 +364,7 @@
 	timespecadd(&ts, rqt);
 	TIMESPEC_TO_TIMEVAL(&tv, rqt);
 	for (;;) {
+		/* XXX: pause_catch()? */
 		error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp",
 		    tvtohz(&tv));
 		getnanouptime(&ts2);
--- //depot/projects/smpng/sys/kern/sched_4bsd.c	2009/01/26 15:26:58
+++ //depot/user/jhb/lock/kern/sched_4bsd.c	2009/05/01 19:05:35
@@ -1534,6 +1534,7 @@
 sched_idletd(void *dummy)
 {
 
+	THREAD_NO_SLEEPING();
 	for (;;) {
 		mtx_assert(&Giant, MA_NOTOWNED);
 
--- //depot/projects/smpng/sys/kern/sched_ule.c	2009/05/08 11:53:25
+++ //depot/user/jhb/lock/kern/sched_ule.c	2009/05/08 14:17:47
@@ -2537,6 +2537,7 @@
 	mtx_assert(&Giant, MA_NOTOWNED);
 	td = curthread;
 	tdq = TDQ_SELF();
+	THREAD_NO_SLEEPING();
 	for (;;) {
 #ifdef SMP
 		if (tdq_idled(tdq) == 0)
--- //depot/projects/smpng/sys/kern/subr_turnstile.c	2008/09/17 20:27:47
+++ //depot/user/jhb/lock/kern/subr_turnstile.c	2009/05/01 19:05:35
@@ -678,6 +678,7 @@
 	if (owner)
 		MPASS(owner->td_proc->p_magic == P_MAGIC);
 	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+	KASSERT(!TD_IS_IDLETHREAD(td), ("idle threads cannot block on locks"));
 
 	/*
 	 * If the lock does not already have a turnstile, use this thread's
--- //depot/projects/smpng/sys/kern/subr_witness.c	2009/05/20 17:20:32
+++ //depot/user/jhb/lock/kern/subr_witness.c	2009/05/20 17:51:14
@@ -824,7 +824,7 @@
 	 * it to the pending_locks list.  If it is not too early, then enroll
 	 * the lock now.
 	 */
-	if (witness_watch < 1 || panicstr != NULL ||
+	if (witness_watch == -1 || panicstr != NULL ||
 	    (lock->lo_flags & LO_WITNESS) == 0)
 		lock->lo_witness = NULL;
 	else if (witness_cold) {
@@ -1481,7 +1481,8 @@
 	register_t s;
 	int i, j;
 
-	if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
+	if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
+	    panicstr != NULL)
 		return;
 	td = curthread;
 	class = LOCK_CLASS(lock);
@@ -1500,17 +1501,8 @@
 				goto found;
 		}
 
-	/*
-	 * When disabling WITNESS through witness_watch we could end up in
-	 * having registered locks in the td_sleeplocks queue.
-	 * We have to make sure we flush these queues, so just search for
-	 * eventual register locks and remove them.
-	 */
-	if (witness_watch > 0)
-		panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
-		    lock->lo_name, file, line);
-	else
-		return;
+	panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
+	    lock->lo_name, file, line);
 found:
 
 	/* First, check for shared/exclusive mismatches. */
@@ -1585,7 +1577,7 @@
 	int i, n;
 
 	lle = td->td_sleeplocks;
-	if (lle == NULL || panicstr != NULL)
+	if (lle == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	if (lle->ll_count != 0) {
 		for (n = 0; lle != NULL; lle = lle->ll_next)
--- //depot/projects/smpng/sys/kern/uipc_sockbuf.c	2009/05/08 11:53:25
+++ //depot/user/jhb/lock/kern/uipc_sockbuf.c	2009/05/08 14:17:47
@@ -610,45 +610,36 @@
 /*
  * Append address and data, and optionally, control (ancillary) data to the
  * receive queue of a socket.  If present, m0 must include a packet header
- * with total length.  Returns 0 if no space in sockbuf or insufficient
- * mbufs.
+ * with total length.  Returns 0 if no space in sockbuf; 'sa' is then left
+ * untouched.  The sockaddr should be present in the MT_SONAME mbuf 'sa'.
  */
 int
-sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
-    struct mbuf *m0, struct mbuf *control)
+sbappendaddr_mbuf(struct sockbuf *sb, struct mbuf *sa, struct mbuf *m0,
+    struct mbuf *control)
 {
-	struct mbuf *m, *n, *nlast;
-	int space = asa->sa_len;
+	struct mbuf *n, *nlast;
+	int space = sa->m_len;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
-		panic("sbappendaddr_locked");
+		panic("sbappendaddr_mbuf");
 	if (m0)
 		space += m0->m_pkthdr.len;
 	space += m_length(control, &n);
 
 	if (space > sbspace(sb))
 		return (0);
-#if MSIZE <= 256
-	if (asa->sa_len > MLEN)
-		return (0);
-#endif
-	MGET(m, M_DONTWAIT, MT_SONAME);
-	if (m == 0)
-		return (0);
-	m->m_len = asa->sa_len;
-	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
 	if (n)
 		n->m_next = m0;		/* concatenate data to control */
 	else
 		control = m0;
-	m->m_next = control;
-	for (n = m; n->m_next != NULL; n = n->m_next)
+	sa->m_next = control;
+	for (n = sa; n->m_next != NULL; n = n->m_next)
 		sballoc(sb, n);
 	sballoc(sb, n);
 	nlast = n;
-	SBLINKRECORD(sb, m);
+	SBLINKRECORD(sb, sa);
 
 	sb->sb_mbtail = nlast;
 	SBLASTMBUFCHK(sb);
@@ -664,6 +655,34 @@
  * mbufs.
  */
 int
+sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
+    struct mbuf *m0, struct mbuf *control)
+{
+	struct mbuf *m;
+	int retval;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	m = soputsockaddr(asa);
+	if (m == NULL)
+		return (0);
+	retval = sbappendaddr_mbuf(sb, m, m0, control);
+	if (retval == 0) {
+		/*
+		 * sbappendaddr_mbuf() fails for lack of space before linking
+		 * 'm' into a record, so the copy is still ours to free.
+		 */
+		m_free(m);
+	}
+	return (retval);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket.  If present, m0 must include a packet header
+ * with total length.  Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control)
 {
--- //depot/projects/smpng/sys/kern/uipc_socket.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/kern/uipc_socket.c	2009/05/13 17:57:30
@@ -3146,6 +3146,26 @@
 }
 
 /*
+ * Make a copy of a sockaddr in an mbuf.
+ */
+struct mbuf *
+soputsockaddr(const struct sockaddr *asa)
+{
+	struct mbuf *m;
+
+#if MSIZE <= 256
+	if (asa->sa_len > MLEN)
+		return (NULL);
+#endif
+	MGET(m, M_DONTWAIT, MT_SONAME);
+	if (m == NULL)
+		return (NULL);
+	m->m_len = asa->sa_len;
+	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
+	return (m);
+}
+
+/*
  * Create an external-format (``xsocket'') structure using the information in
  * the kernel-format socket structure pointed to by so.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
--- //depot/projects/smpng/sys/modules/crash/crash.c	2008/08/07 20:34:04
+++ //depot/user/jhb/lock/modules/crash/crash.c	2009/02/02 18:40:33
@@ -46,6 +46,7 @@
 #include <sys/kthread.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
+#include <sys/lockmgr.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
@@ -81,6 +82,8 @@
 static struct thread *kthread;
 static int event;
 
+SYSCTL_NODE(_debug, OID_AUTO, crash, CTLFLAG_RD, 0, "crash tree");
+
 static int	mod_event(struct module *module, int cmd, void *arg);
 static int	load(void *arg);
 static int	unload(void *arg);
@@ -90,6 +93,86 @@
 /* Events. */
 
 static void
+sysctl_tree(void)
+{
+	struct sysctl_ctx_list ctx;
+	struct sysctl_oid *oidp;
+	int i;
+
+	for (i = 0; i < 1000000; i++) {
+		sysctl_ctx_init(&ctx);
+		oidp = SYSCTL_ADD_NODE(&ctx,
+		    SYSCTL_STATIC_CHILDREN(_debug_crash), OID_AUTO, "tree",
+		    CTLFLAG_RD, NULL, "sysctl_tree tree");
+		SYSCTL_ADD_INT(&ctx, SYSCTL_CHILDREN(oidp), OID_AUTO, "max",
+		    CTLFLAG_RD, &event_max, 0, "event_max");
+		DELAY(100000);
+		sysctl_ctx_free(&ctx);
+	}
+}
+CRASH_EVENT("add and remove dynamic sysctls", sysctl_tree);
+
+#ifdef WITNESS
+static void
+witness_interlock(void)
+{
+	struct lock lk;
+	struct mtx ilock;
+
+	mtx_init(&ilock, "ilock", NULL, MTX_DEF | MTX_RECURSE);
+	lockinit(&lk, PZERO, "lk", 0, 0);
+
+	printf("This should not cause a LOR report (interlock)...\n");
+	mtx_lock(&ilock);
+	lockmgr(&lk, LK_EXCLUSIVE | LK_INTERLOCK, &ilock);
+	lockmgr(&lk, LK_RELEASE, 0);
+
+	printf("This should cause a LOR report (no interlock)...\n");
+	mtx_lock(&ilock);
+	lockmgr(&lk, LK_EXCLUSIVE, NULL);
+	mtx_unlock(&ilock);
+	lockmgr(&lk, LK_RELEASE, 0);
+
+	printf("This should cause a LOR report (recursed interlock)...\n");
+	mtx_lock(&ilock);
+	mtx_lock(&ilock);
+	lockmgr(&lk, LK_EXCLUSIVE | LK_INTERLOCK, &ilock);
+	mtx_unlock(&ilock);
+	lockmgr(&lk, LK_RELEASE, 0);
+
+	lockdestroy(&lk);
+	mtx_destroy(&ilock);
+}
+CRASH_EVENT("test handling of lockmgr interlocks with witness",
+    witness_interlock);
+#endif
+
+static int Giant_wchan;
+
+static void
+sleeping_with_unlocked_Giant_interlock(void)
+{
+	struct cv cv;
+
+	cv_init(&cv, "Giant");
+	PANIC_TRY {
+		cv_timedwait(&cv, &Giant, 5);
+	} PANIC_CATCH {
+		sleepq_release(&cv);
+		IGNORE_PANIC_STARTS_WITH("mutex %s not owned");
+	} PANIC_END;
+	cv_destroy(&cv);
+	PANIC_TRY {
+		mtx_sleep(&Giant_wchan, &Giant, 0, "Giant", 5);
+	} PANIC_CATCH {
+		sleepq_release(&Giant_wchan);
+		IGNORE_PANIC_STARTS_WITH("mutex %s not owned");
+	} PANIC_END;
+}
+CRASH_EVENT("using unlocked Giant as sleep interlock",
+    sleeping_with_unlocked_Giant_interlock);
+
+static void
 broadcast(void *cv)
 {
 
@@ -113,8 +196,6 @@
 }
 CRASH_EVENT("try to cv_wait_unlock() with Giant", wait_unlock_Giant);
 
-static int Giant_wchan;
-
 static void
 pdrop_Giant(void)
 {
@@ -768,7 +849,7 @@
 {
 	sx_slock(&foo);
 	if (sx_try_upgrade(&foo) == 0) {
-		printf("crash: umm, upgrade failed?\n");
+		panic("crash: umm, upgrade failed?\n");
 		sx_sunlock(&foo);
 	} else
 		sx_xunlock(&foo);
@@ -788,7 +869,14 @@
 upgrade_excl_foo(void)
 {
 	sx_xlock(&foo);
-	sx_try_upgrade(&foo);
+	PANIC_TRY {
+		sx_try_upgrade(&foo);
+	} PANIC_CATCH {		
+		IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked");
+		/* WITNESS */
+		IGNORE_PANIC_STARTS_WITH("Lock (%s) %s exclusively locked");
+	} PANIC_END;
+	sx_xunlock(&foo);
 }
 CRASH_EVENT("xlock foo, upgrade", upgrade_excl_foo);
 
@@ -796,7 +884,14 @@
 downgrade_shared_foo(void)
 {
 	sx_slock(&foo);
-	sx_downgrade(&foo);
+	PANIC_TRY {
+		sx_downgrade(&foo);
+	} PANIC_CATCH {
+		IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked");
+		/* WITNESS */
+		IGNORE_PANIC_STARTS_WITH("Lock (%s) %s not exclusively locked");
+	} PANIC_END;
+	sx_sunlock(&foo);
 }
 CRASH_EVENT("slock foo, downgrade", downgrade_shared_foo);
 
@@ -805,7 +900,14 @@
 {
 	sx_slock(&foo);
 	sx_try_upgrade(&foo);
-	sx_sunlock(&foo);
+	PANIC_TRY {
+		sx_sunlock(&foo);
+	} PANIC_CATCH {
+		IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked");
+		/* WITNESS */
+		IGNORE_PANIC_STARTS_WITH("Lock (%s) %s exclusively locked");
+	} PANIC_END;
+	sx_xunlock(&foo);
 }
 CRASH_EVENT("slock foo, upgrade, sunlock", sunlock_upgraded_foo);
 
@@ -814,24 +916,37 @@
 {
 	sx_xlock(&foo);
 	sx_downgrade(&foo);
-	sx_xunlock(&foo);
+	PANIC_TRY {
+		sx_xunlock(&foo);
+	} PANIC_CATCH {
+		IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked");
+		/* WITNESS */
+		IGNORE_PANIC_STARTS_WITH("Lock (%s) %s not exclusively locked");
+	} PANIC_END;
+	sx_sunlock(&foo);
+		
 }
 CRASH_EVENT("xlock foo, downgrade, xunlock", xunlock_downgraded_foo);
 
 static void
 double_mtx_init(void)
 {
-	kdb_enter(KDB_WHY_CRASH, "about to init again");
-	mtx_init(&test_mtx, "test", NULL, MTX_DEF);
-	kdb_enter(KDB_WHY_CRASH, "if we haven't panic'd by now, ouch. :(");
-	mtx_destroy(&test_mtx);
+	PANIC_TRY {
+		mtx_init(&test_mtx, "test", NULL, MTX_DEF);
+	} PANIC_CATCH {
+		IGNORE_PANIC("lock \"%s\" %p already initialized");
+	} PANIC_END;
 }
 CRASH_EVENT("re-init of test_mtx", double_mtx_init);
 
 static void
 test_mtx_assert(void)
 {
-	mtx_assert(&Giant, MA_OWNED);
+	PANIC_TRY {
+		mtx_assert(&Giant, MA_OWNED);
+	} PANIC_CATCH {
+		IGNORE_PANIC_STARTS_WITH("mutex %s not owned");
+	} PANIC_END;
 }
 CRASH_EVENT("assert that Giant is locked while it is unlocked",
     test_mtx_assert);
@@ -840,7 +955,13 @@
 test_sx_assert_slocked(void)
 {
 	sx_xlock(&foo);
-	sx_assert(&foo, SX_SLOCKED);
+	PANIC_TRY {		
+		sx_assert(&foo, SX_SLOCKED);
+	} PANIC_CATCH {
+		IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked");
+		/* WITNESS */
+		IGNORE_PANIC_STARTS_WITH("Lock (%s) %s exclusively locked");
+	} PANIC_END;	
 	sx_xunlock(&foo);
 }
 CRASH_EVENT("assert that foo is slocked while it is xlocked",
@@ -857,30 +978,6 @@
 }
 CRASH_EVENT("lock test, slock foo, sunlock foo, unlock test",
     test_sx_and_mtx_order);
-
-static void
-test_witness_removal(void)
-{
-	bzero(&test1_mtx, sizeof(test1_mtx));
-	bzero(&test2_mtx, sizeof(test2_mtx));
-	mtx_init(&test1_mtx, "test1", NULL, MTX_DEF);
-	mtx_init(&test2_mtx, "test2", NULL, MTX_DEF);
-	kdb_enter(KDB_WHY_CRASH, "no order yet");
-	mtx_lock(&Giant);
-	mtx_lock(&test1_mtx);
-	mtx_lock(&test2_mtx);
-	mtx_unlock(&test2_mtx);
-	mtx_unlock(&test1_mtx);
-	mtx_unlock(&Giant);
-	kdb_enter(KDB_WHY_CRASH, "test1 and test2 should be ordered");
-	mtx_destroy(&test1_mtx);
-	kdb_enter(KDB_WHY_CRASH,
-	    "test1 should be gone, test2 should be after Giant");
-	mtx_destroy(&test2_mtx);
-	kdb_enter(KDB_WHY_CRASH, "test1 and test2 should be gone");
-}
-CRASH_EVENT("use test1 and test2 mutexes to test witness removal",
-    test_witness_removal);
 #endif
 
 static void
@@ -1035,7 +1132,11 @@
 
 	printf("Should panic\n");
 	THREAD_NO_SLEEPING();
-	pause("sleep", 1);
+	PANIC_TRY {
+		pause("sleep", 1);
+	} PANIC_CATCH {
+		IGNORE_PANIC("Trying sleep, but thread marked as sleeping prohibited");
+	} PANIC_END;
 	THREAD_SLEEPING_OK();
 }
 CRASH_EVENT("sleep while sleeping is prohibited", test_no_sleeping);
@@ -1047,13 +1148,31 @@
 	printf("Should panic\n");
 	bzero(&test1_mtx, sizeof(test1_mtx));
 	mtx_init(&test1_mtx, "test1", NULL, MTX_SPIN | MTX_RECURSE);
-	if (mtx_trylock(&test1_mtx))
-		printf("Hmm, locked!\n");
-	else
-		printf("Not locked\n");
+	PANIC_TRY {
+		if (mtx_trylock(&test1_mtx))
+			printf("Hmm, locked!\n");
+		else
+			printf("Not locked\n");
+	} PANIC_CATCH {
+		IGNORE_PANIC_STARTS_WITH("mtx_trylock() of spin mutex");
+	} PANIC_END;
+	mtx_destroy(&test1_mtx);
 }
 CRASH_EVENT("try lock on a spin mutex", test_trylock_spin);
 
+static void
+test_passert(void)
+{
+
+	printf("Should panic on \"foo\"\n");
+	PANIC_TRY {
+		panic("foo");
+	} PANIC_CATCH {
+		IGNORE_PANIC("foo");
+	} PANIC_END;	
+}
+CRASH_EVENT("basic test of catching panics", test_passert);
+
 /* Help event should be last so that it is always event 1. */
 
 static void
@@ -1078,8 +1197,6 @@
 }
 CRASH_EVENT(NULL, nop);
 
-SYSCTL_NODE(_debug, OID_AUTO, crash, CTLFLAG_RD, 0, "crash tree");
-
 static int
 sysctl_debug_crash_test(SYSCTL_HANDLER_ARGS)
 {
--- //depot/projects/smpng/sys/netinet/udp_usrreq.c	2009/05/08 11:53:25
+++ //depot/user/jhb/lock/netinet/udp_usrreq.c	2009/05/08 14:17:47
@@ -207,7 +207,7 @@
 {
 	struct sockaddr *append_sa;
 	struct socket *so;
-	struct mbuf *opts = 0;
+	struct mbuf *opts = 0, *msa;
 #ifdef INET6
 	struct sockaddr_in6 udp_in6;
 #endif
@@ -229,15 +229,6 @@
 		return;
 	}
 #endif
-	if (inp->inp_flags & INP_CONTROLOPTS ||
-	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
-#ifdef INET6
-		if (inp->inp_vflag & INP_IPV6)
-			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
-		else
-#endif
-			ip_savecontrol(inp, &opts, ip, n);
-	}
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		bzero(&udp_in6, sizeof(udp_in6));
@@ -248,14 +239,29 @@
 	} else
 #endif
 		append_sa = (struct sockaddr *)udp_in;
+	msa = soputsockaddr(append_sa);
+	if (msa == NULL) {
+		m_freem(n);
+		return;
+	}
+	if (inp->inp_flags & INP_CONTROLOPTS ||
+	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
+#ifdef INET6
+		if (inp->inp_vflag & INP_IPV6)
+			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
+		else
+#endif
+			ip_savecontrol(inp, &opts, ip, n);
+	}
 	m_adj(n, off);
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
-	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
+	if (sbappendaddr_mbuf(&so->so_rcv, msa, n, opts) == 0) {
 		INIT_VNET_INET(so->so_vnet);
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		m_freem(n);
+		m_freem(msa);
 		if (opts)
 			m_freem(opts);
 		UDPSTAT_INC(udps_fullsock);
--- //depot/projects/smpng/sys/notes	2009/02/18 22:05:55
+++ //depot/user/jhb/lock/notes	2009/03/11 22:14:12
@@ -73,3 +73,214 @@
 - jhb_socket - socket hacking
 
 Space reserved for child branches:
+- Fix show witness output to use paging.
+- Add a MTX_NEW flag to bypass the LO_INITIALIZED check.  Here be dragons.
+- Add a MTX_LEAF flag with LO_LEAF flag in WITNESS.
+- Add a pair of functions, mtx_lock_if_sleep() and mtx_unlock_if_sleep(),
+  that use a per-thread array (until the array gets full, at which point
+  it does the normal unlock/lock).  This would be useful to do things like
+
+	mtx_unlock_if_sleep(&foo);
+	malloc(blah, M_WAITOK);
+	mtx_lock_if_sleep(&foo);
+
+  rather than:
+
+	mtx_unlock(&foo);
+	malloc(blah, M_WAITOK);
+	mtx_lock(&foo);
+
+  where foo is only unlocked if malloc() actually blocks.  This is purely
+  an optimization.
+- Allow witness to output witness tree via sysctl or some such
+- Allow witness to output witness tree via sysctl in dot friendly format
+- Split critical section from spinlocks
+  - inline critical_enter?
+    - move prototypes to sys/proc.h if so
+- Try an experiment on UP where we replace normal mutexes with critical_*
+  similar to using spinlock_* for spin mutexes thus effectively giving us
+  a two-layer spl scheme.
+- Reader/writer locks
+  - add "owner of record" where first read locker gets priority propagations
+    until they drop the lock
+- Stephan's VFS shared locking changes:
+  - shared locks should be ok for VOP_ACCESS() and VOP_GETATTR()
+  + VOP_READLINK()?
+    + cd9660
+    + zfs
+    + udf
+    + nfs
+    + ffs
+  + VOP_READDIR()? (getdirentries())
+    + cd9660
+    + zfs
+    + udf
+    + nfs
+    + ffs
+  - VOP_VPTOFH()? (lgetfh() and getfh())
+  - VOP_FHTOVP() should return a share-locked vnode or take a locking flag?
+  - system calls
+    - lookups
+    - vnode locks
+  - VOP_MARKATIME() for zfs can just use ZFS_ACCESSTIME_STAMP()
+- Lock some filesystems
+  - devfs
+    - shared vnode locks
+      - VFS_ROOT
+      - VFS_VGET
+      + VOP_READ
+      + VOP_GETDIRENTRIES
+      + VOP_ACCESS
+      + VOP_GETATTR
+      + VOP_READLINK
+      - see if there are places dm_lock can be shared instead of xlocked
+        - devfs_vptocn
+      - LOOKUP_SHARED
+      - EXTENDED_SHARED?
+  - msdos
+    - MPSAFE
+      - denode (v_data) fields
+      - mount fields
+        - FAT related fields
+          - pm_fmod?
+	  - pm_fsinfo
+	  - pm_nextfree
+	  - pm_inusemap[]
+	- fileno RB tree
+	  - pm_nfileno
+	  - pm_filenos
+      + msdosfs_conv.c
+      - msdosfs_denode.c
+      - msdosfs_fat.c
+      - msdosfs_fileno.c
+      + msdosfs_iconv.c
+      - msdosfs_lookup.c
+      - msdosfs_vfsops.c
+      - msdosfs_vnops.c
+    - LOOKUP_SHARED
+    - EXTENDED_SHARED
+- NFS fun
+  + caching credentials across UIDs
+    - needs testing
+  - v_dd (kan has patch for this)
+- Make all filesystems use vfs_getnewfsid()
+- Locking notes on namei() and lookup():
+  - namei():
+      VREF(dp)
+      for (;;) {
+	ni_startdir = dp;
+	lookup();
+	if (error)
+	   no references or locks
+	if (!symlink)
+	   return (0)
+	read symlink and build path
+	if (error)
+	   break
+	vput(ni_vp)
+	dp = ni_dvp
+      }
+      vput(ni_vp)
+      vrele(ni_dvp)
+  - so lookup() is called with ni_startdir referenced, but not locked.  On
+    return, ni_startdir's reference is always released.  On error no other
+    vnodes are referenced or locked.  On success, ni_vp is locked and
+    referenced and ni_dvp is referenced (symlink).   
+  - lookup():
+      dp = ni_startdir
+      ni_startdir = NULL
+      ni_dvp = NULL
+      vn_lock(dp)
+      if (doomed) goto bad
+    dirloop:
+      if (degenerate name)
+	if (error) goto bad;
+	if (wantparent)
+	    ni_dvp = dp
+	    VREF(dp);
+	ni_vp = dp;
+	if (!(LOCKPARENT | LOCKLEAF))
+	    VOP_UNLOCK(dp)
+	/* ni_dvp is ref'd (locked if LOCKPARENT), ni_vp is ref'd and locked (if LOCKLEAF) */
+	goto success
+      if (ISDOTDOT)
+	for (;;)
+	  if (hitroot)
+	    ni_dvp = dp
+	    VREF(dp)
+	    ni_vp = dp
+	    goto nextname
+          if (!mountpoint root)
+	    break
+          update dp for mountpoint below vnode
+          if (doomed) goto bad
+    unionlookup:
+      ni_dvp = dp
+      ni_vp = NULL
+      if (ISLASTCN)
+	upgrade dp
+	if (doomed) goto bad
+      VOP_LOOKUP(dp, &ni_vp)
+      if (error)
+	if (ENOENT && union case)
+	  switch dp to covered vnode
+	  if (doomed) goto bad
+	  goto unionlookup;
+	  if (error) goto bad
+          if (!LOCKPARENT)
+	     VOP_UNLOCK(dp) /* == ni_dvp */
+	  /* ni_vp is NULL, ni_dvp (dp) is ref'd (locked if LOCKPARENT) */
+	  goto success;
+      dp = ni_vp
+      if (dp is mountpoint)
+	vput(dp)
+	vput(ni_dvp)
+	ni_dvp = vp_crossmp
+	vn_lock(ni_dvp)
+	VFS_ROOT(&tdp)
+	if (error)
+	  /* ni_dvp is ref'd and locked, but ni_vp and dp are not and invalid */
+	  dpunlocked = 1
+	  goto bad2
+        ni_vp = dp = tdp
+      if (symlink)
+	if (error) goto bad2
+	if (ni_dvp != ni_vp)
+	    VOP_UNLOCK(ni_dvp)
+	/* ni_dvp is ref'd, ni_vp is locked and ref'd.  ni_dvp is never locked
+	   because symlink code in namei() assumes it isn't */
+	goto success
+    nextname:
+      /* ni_vp (dp) is locked and ref'd, ni_dvp is ref'd and locked if
+         ni_dvp != dp */
+      if (another component)
+	vput(ni_dvp)
+        goto dirloop /* dp (ni_vp) is now the parent dir for next lookup */
+      if (EROFS) goto bad
+      if (SAVESTART)
+	ni_startdir = ni_dvp
+	VREF(ni_startdir)
+      if (!wantparent)
+	vput(ni_dvp)
+      if (!LOCKPARENT and dp != ni_dvp)
+	VOP_UNLOCK(ni_dvp)
+      if (!LOCKLEAF)
+        VOP_UNLOCK(dp) /* ni_vp */
+
+    success:
+      /*
+       * If ni_dvp is != NULL, should be ref'd and locked if LOCKPARENT.
+       * If ni_vp is != NULL, should be ref'd and locked if LOCKLEAF.
+       * dp == ni_vp unless ni_vp == NULL (CREATE) when dp == ni_dvp
+       */
+      if (need upgrade of leaf)
+	vn_lock(dp) /* ni_vp */
+	if (error)
+	  /* XXX: leak ni_startdir if SAVESTART? */
+	  goto bad
+      return (0)
+
+      
+  - VOP_LOOKUP(dp, &ni_vp) is called with dp locked.  On error, nothing
+    changes.  On success, ni_vp is locked and referenced and dp remains
+    unchanged.
--- //depot/projects/smpng/sys/sys/proc.h	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/sys/proc.h	2009/05/13 17:57:30
@@ -168,6 +168,7 @@
 struct kdtrace_proc;
 struct kdtrace_thread;
 struct cpuset;
+struct _jmp_buf;
 
 /*
  * Kernel runnable context (thread).
@@ -234,6 +235,7 @@
 	struct file	*td_fpop;	/* (k) file referencing cdev under op */
 	int		td_dbgflags;	/* (c) Userland debugger flags */
 	struct osd	td_osd;		/* (k) Object specific data. */
+	const char	*td_caught_panic; /* (k) Caught panic. */
 #define	td_endzero td_base_pri
 
 /* Copied during fork1() or thread_sched_upcall(). */
@@ -277,6 +279,7 @@
 	int		td_errno;	/* Error returned by last syscall. */
 	struct vnet	*td_vnet;	/* (*) Effective vnet. */
 	const char	*td_vnet_lpush;	/* (*) Debugging vnet push / pop. */
+	struct _jmp_buf *td_panic_buf;	/* (k) Jump buffer for PANIC_CATCH(). */
 };
 
 struct mtx *thread_lock_block(struct thread *);
@@ -369,6 +372,7 @@
 #define	TDP_CALLCHAIN	0x00400000 /* Capture thread's callchain */
 #define	TDP_IGNSUSP	0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
 #define	TDP_AUDITREC	0x01000000 /* Audit record pending on thread */
+#define	TDP_CATCHPANIC	0x02000000 /* Catching panics with PANIC_CATCH */
 
 /*
  * Reasons that the current thread can not be run yet.
--- //depot/projects/smpng/sys/sys/sockbuf.h	2008/08/05 21:26:01
+++ //depot/user/jhb/lock/sys/sockbuf.h	2008/09/30 18:25:15
@@ -121,6 +121,8 @@
 void	sbappend_locked(struct sockbuf *sb, struct mbuf *m);
 void	sbappendstream(struct sockbuf *sb, struct mbuf *m);
 void	sbappendstream_locked(struct sockbuf *sb, struct mbuf *m);
+int	sbappendaddr_mbuf(struct sockbuf *sb, struct mbuf *sa, struct mbuf *m0,
+	    struct mbuf *control);
 int	sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
--- //depot/projects/smpng/sys/sys/socketvar.h	2009/05/08 11:53:25
+++ //depot/user/jhb/lock/sys/socketvar.h	2009/05/08 14:17:47
@@ -335,6 +335,7 @@
 	    struct thread *td);
 int	sopoll_generic(struct socket *so, int events,
 	    struct ucred *active_cred, struct thread *td);
+struct mbuf *soputsockaddr(const struct sockaddr *asa);
 int	soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio,
 	    struct mbuf **mp0, struct mbuf **controlp, int *flagsp);
 int	soreceive_dgram(struct socket *so, struct sockaddr **paddr,
--- //depot/projects/smpng/sys/sys/systm.h	2009/05/19 13:40:43
+++ //depot/user/jhb/lock/sys/systm.h	2009/05/20 17:51:14
@@ -71,6 +71,46 @@
 	if (__predict_false(!(exp)))					\
 		panic msg;						\
 } while (0)
+
+#define	PANIC_TRY do {							\
+	switch (setjmp(curthread->td_panic_buf)) {			\
+	case 0:								\
+		curthread->td_pflags |= TDP_CATCHPANIC;
+
+#define	PANIC_CATCH							\
+		curthread->td_pflags &= ~TDP_CATCHPANIC;		\
+		panic("Expected panic did not trigger");		\
+		break;							\
+	case 1:								\
+		curthread->td_pflags &= ~TDP_CATCHPANIC;
+
+#define	IGNORE_PANIC(panicstr)						\
+		if (strcmp(curthread->td_caught_panic, (panicstr)) ==	\
+		    0) {						\
+			printf("Caught expected panic '%s'\n",		\
+			    curthread->td_caught_panic);		\
+			break;						\
+		}
+
+#define	IGNORE_PANIC_STARTS_WITH(panicstr)				\
+		if (strncmp(curthread->td_caught_panic, (panicstr),	\
+		    strlen((panicstr))) == 0) {				\
+			printf("Caught expected panic '%s'\n",		\
+			    curthread->td_caught_panic);		\
+			break;						\
+		}
+
+#define	PANIC_END							\
+		panic("Unexpected panic '%s'",				\
+		    curthread->td_caught_panic);			\
+		break;							\
+	default:							\
+		curthread->td_pflags &= ~TDP_CATCHPANIC;		\
+		panic("Unexpected return value from setjmp()");		\
+	}								\
+	curthread->td_pflags &= ~TDP_CATCHPANIC;			\
+} while (0)
+
 #define	VNASSERT(exp, vp, msg) do {					\
 	if (__predict_false(!(exp))) {					\
 		vn_printf(vp, "VNASSERT failed\n");			\
@@ -81,6 +121,14 @@
 #define	KASSERT(exp,msg) do { \
 } while (0)
 
+#define	PANIC_TRY	/* Stub: guarded code is compiled, never run. */ \
+	while (0) {
+#define	PANIC_CATCH
+#define	IGNORE_PANIC(panicstr)		/* Same arity as the real macro. */
+#define	IGNORE_PANIC_STARTS_WITH(panicstr)
+#define	PANIC_END							\
+	}
+
 #define	VNASSERT(exp, vp, msg) do { \
 } while (0)
 #endif
--- //depot/projects/smpng/sys/ufs/ffs/ffs_vfsops.c	2009/05/13 13:56:17
+++ //depot/user/jhb/lock/ufs/ffs/ffs_vfsops.c	2009/05/13 17:57:30
@@ -278,7 +278,7 @@
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
-			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+			vn_lock(devvp, LK_SHARED | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
--- //depot/projects/smpng/sys/ufs/ufs/ufs_lookup.c	2009/05/08 11:53:25
+++ //depot/user/jhb/lock/ufs/ufs/ufs_lookup.c	2009/05/08 14:17:47
@@ -472,6 +472,7 @@
 	 */
 	if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) {
 		ufs_dirbad(dp, i_offset, "i_size too small");
+		/* XXX: This needs an exclusive lock, but we panic above. */
 		dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep);
 		DIP_SET(dp, i_size, dp->i_size);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;