diff -urN /Projects/clean/src/lib/libc/gen/lockf.c /Projects/M4/src/lib/libc/gen/lockf.c --- /Projects/clean/src/lib/libc/gen/lockf.c 2008-01-19 15:54:31.000000000 +0000 +++ /Projects/M4/src/lib/libc/gen/lockf.c 2008-02-12 09:57:16.000000000 +0000 @@ -74,7 +74,7 @@ fl.l_type = F_WRLCK; if (_fcntl(filedes, F_GETLK, &fl) == -1) return (-1); - if (fl.l_type == F_UNLCK || fl.l_pid == getpid()) + if (fl.l_type == F_UNLCK || (fl.l_sysid == 0 && fl.l_pid == getpid())) return (0); errno = EAGAIN; return (-1); diff -urN /Projects/clean/src/lib/libc/sys/fcntl.2 /Projects/M4/src/lib/libc/sys/fcntl.2 --- /Projects/clean/src/lib/libc/sys/fcntl.2 2008-01-19 15:54:32.000000000 +0000 +++ /Projects/M4/src/lib/libc/sys/fcntl.2 2008-02-12 09:57:18.000000000 +0000 @@ -177,6 +177,7 @@ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ + int l_sysid; /* remote system id or zero for local */ }; .Ed The commands available for advisory record locking are as follows: @@ -264,9 +265,13 @@ means end edge of the region. The .Fa l_pid -field is only used with +and +.Fa l_sysid +fields are only used with .Dv F_GETLK -to return the process ID of the process holding a blocking lock. +to return the process ID of the process holding a blocking lock and +the system ID of the system that owns that process. +Locks created by the local system will have a system ID of zero. After a successful .Dv F_GETLK request, the value of diff -urN /Projects/clean/src/sys/compat/linux/linux_file.c /Projects/M4/src/sys/compat/linux/linux_file.c --- /Projects/clean/src/sys/compat/linux/linux_file.c 2008-01-19 15:54:38.000000000 +0000 +++ /Projects/M4/src/sys/compat/linux/linux_file.c 2008-02-12 09:56:43.000000000 +0000 @@ -1051,6 +1051,7 @@ bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; + bsd_flock->l_sysid = 0; } static void @@ -1107,6 +1108,7 @@ bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; + bsd_flock->l_sysid = 0; } static void diff -urN /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c /Projects/M4/src/sys/compat/svr4/svr4_fcntl.c --- /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c 2008-01-19 15:54:40.000000000 +0000 +++ /Projects/M4/src/sys/compat/svr4/svr4_fcntl.c 2008-02-12 09:56:44.000000000 +0000 @@ -191,7 +191,7 @@ oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; - + oflp->l_sysid = iflp->l_sysid; } static void @@ -217,7 +217,7 @@ oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off64_t) iflp->l_start; oflp->l_len = (svr4_off64_t) iflp->l_len; - oflp->l_sysid = 0; + oflp->l_sysid = iflp->l_sysid; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } diff -urN /Projects/clean/src/sys/conf/options /Projects/M4/src/sys/conf/options --- /Projects/clean/src/sys/conf/options 2008-01-19 15:43:49.000000000 +0000 +++ /Projects/M4/src/sys/conf/options 2008-02-12 09:56:44.000000000 +0000 @@ -54,6 +54,7 @@ KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h SYSCTL_DEBUG opt_sysctl.h +ADVLOCKASYNC_TESTING opt_global.h NO_SYSCTL_DESCR opt_global.h diff -urN /Projects/clean/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c /Projects/M4/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c --- /Projects/clean/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2008-01-19 15:43:55.000000000 +0000 +++ 
/Projects/M4/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2008-02-12 09:56:50.000000000 +0000 @@ -3547,6 +3547,25 @@ return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size)); } +/* + * Advisory record locking support + */ +static int +zfs_freebsd_advlockasync(ap) + struct vop_advlockasync_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + struct task *a_task; + } */ *ap; +{ + znode_t *zp = VTOZ(ap->a_vp); + + return (lf_advlockasync(ap, &(zp->z_lockf), zp->z_phys->zp_size)); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; @@ -3580,6 +3599,7 @@ .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_advlock = zfs_freebsd_advlock, + .vop_advlockasync = zfs_freebsd_advlockasync, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = VOP_EOPNOTSUPP, .vop_fid = zfs_freebsd_fid, diff -urN /Projects/clean/src/sys/fs/msdosfs/msdosfs_vnops.c /Projects/M4/src/sys/fs/msdosfs/msdosfs_vnops.c --- /Projects/clean/src/sys/fs/msdosfs/msdosfs_vnops.c 2008-01-19 15:44:13.000000000 +0000 +++ /Projects/M4/src/sys/fs/msdosfs/msdosfs_vnops.c 2008-02-12 09:57:01.000000000 +0000 @@ -83,6 +83,7 @@ * Prototypes for MSDOSFS vnode operations */ static vop_advlock_t msdosfs_advlock; +static vop_advlockasync_t msdosfs_advlockasync; static vop_create_t msdosfs_create; static vop_mknod_t msdosfs_mknod; static vop_open_t msdosfs_open; @@ -1963,6 +1964,22 @@ } static int +msdosfs_advlockasync(ap) + struct vop_advlockasync_args /* { + struct vnode *a_vp; + u_char a_id; + int a_op; + struct flock *a_fl; + int a_flags; + struct task *a_task; + } */ *ap; +{ + struct denode *dep = VTODE(ap->a_vp); + + return (lf_advlockasync(ap, &dep->de_lockf, dep->de_FileSize)); +} + +static int msdosfs_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; @@ -1987,6 +2004,7 @@ .vop_access = msdosfs_access, .vop_advlock = msdosfs_advlock, + .vop_advlockasync = msdosfs_advlockasync, .vop_bmap = msdosfs_bmap, .vop_cachedlookup = msdosfs_lookup, .vop_open = msdosfs_open, diff -urN /Projects/clean/src/sys/fs/tmpfs/tmpfs_vnops.c /Projects/M4/src/sys/fs/tmpfs/tmpfs_vnops.c --- /Projects/clean/src/sys/fs/tmpfs/tmpfs_vnops.c 2008-01-19 15:44:13.000000000 +0000 +++ /Projects/M4/src/sys/fs/tmpfs/tmpfs_vnops.c 2008-02-12 09:57:01.000000000 +0000 @@ -1446,6 +1446,20 @@ /* --------------------------------------------------------------------- */ static int +tmpfs_advlockasync(struct vop_advlockasync_args *v) +{ + struct vnode *vp = v->a_vp; + + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + return lf_advlockasync(v, &node->tn_lockf, node->tn_size); +} + +/* --------------------------------------------------------------------- */ + +static int tmpfs_vptofh(struct vop_vptofh_args *ap) { struct tmpfs_fid *tfhp; @@ -1493,6 +1507,7 @@ .vop_print = tmpfs_print, .vop_pathconf = tmpfs_pathconf, .vop_advlock = tmpfs_advlock, + .vop_advlockasync = tmpfs_advlockasync, .vop_vptofh = tmpfs_vptofh, .vop_bmap = VOP_EOPNOTSUPP, }; diff -urN /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c /Projects/M4/src/sys/i386/ibcs2/ibcs2_fcntl.c --- /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c 2008-01-19 15:54:41.000000000 +0000 +++ /Projects/M4/src/sys/i386/ibcs2/ibcs2_fcntl.c 2008-02-12 09:57:03.000000000 +0000 @@ -93,7 +93,7 @@ iflp->l_whence = (short)flp->l_whence; iflp->l_start = (ibcs2_off_t)flp->l_start; iflp->l_len = (ibcs2_off_t)flp->l_len; - iflp->l_sysid = 0; + iflp->l_sysid = flp->l_sysid; iflp->l_pid = (ibcs2_pid_t)flp->l_pid; } @@ -127,6 +127,7 @@ 
break; } flp->l_whence = iflp->l_whence; + flp->l_sysid = iflp->l_sysid; } /* convert iBCS2 mode into NetBSD mode */ diff -urN /Projects/clean/src/sys/kern/kern_descrip.c /Projects/M4/src/sys/kern/kern_descrip.c --- /Projects/clean/src/sys/kern/kern_descrip.c 2008-01-19 15:54:42.000000000 +0000 +++ /Projects/M4/src/sys/kern/kern_descrip.c 2008-02-12 09:57:05.000000000 +0000 @@ -69,6 +69,9 @@ #include #include #include +#ifdef ADVLOCKASYNC_TESTING +#include <sys/taskqueue.h> /* XXX for async lock testing */ +#endif #include #include #include @@ -316,28 +319,67 @@ fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; + struct oflock ofl; intptr_t arg; int error; + int cmd; error = 0; + cmd = uap->cmd; switch (uap->cmd) { - case F_GETLK: - case F_SETLK: - case F_SETLKW: - error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + case F_OGETLK: + case F_OSETLK: + case F_OSETLKW: + /* + * Convert old flock structure to new. + */ + error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl)); + fl.l_start = ofl.l_start; + fl.l_len = ofl.l_len; + fl.l_pid = ofl.l_pid; + fl.l_type = ofl.l_type; + fl.l_whence = ofl.l_whence; + fl.l_sysid = 0; + + switch (uap->cmd) { + case F_OGETLK: + cmd = F_GETLK; + break; + case F_OSETLK: + cmd = F_SETLK; + break; + case F_OSETLKW: + cmd = F_SETLKW; + break; + } arg = (intptr_t)&fl; break; + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_REMOTE: + error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + arg = (intptr_t)&fl; + break; default: arg = uap->arg; break; } if (error) return (error); - error = kern_fcntl(td, uap->fd, uap->cmd, arg); + error = kern_fcntl(td, uap->fd, cmd, arg); if (error) return (error); - if (uap->cmd == F_GETLK) + if (uap->cmd == F_OGETLK) { + ofl.l_start = fl.l_start; + ofl.l_len = fl.l_len; + ofl.l_pid = fl.l_pid; + ofl.l_type = fl.l_type; + ofl.l_whence = fl.l_whence; + error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl)); + } else if (uap->cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); + } return (error); } @@ -353,11 +395,61 @@ return (fp); } +#ifdef ADVLOCKASYNC_TESTING + +struct async_flock { + struct task af_task; + struct vnode *af_vp; + struct proc *af_proc; + struct flock af_fl; + int af_error; +}; + +/* + * This async callback happens when a lock which was blocking an async + * lock request is removed. We re-attempt the lock and if it succeeds, + * wakeup the client's thread. 
+ */ +extern void kern_fcntl_callback(void *arg, int pending); +void +kern_fcntl_callback(void *arg, int pending) +{ + struct async_flock *af = (struct async_flock *) arg; + struct vnode *vp; + int error; + + mtx_pool_lock(mtxpool_sleep, af); + vp = af->af_vp; + + if (!vp) { + af->af_error = ECANCELED; + mtx_pool_unlock(mtxpool_sleep, af); + return; + } + + mtx_pool_unlock(mtxpool_sleep, af); + + error = VOP_ADVLOCKASYNC(af->af_vp, (caddr_t)af->af_proc, + F_SETLK, &af->af_fl, F_POSIX, &af->af_task); + + mtx_pool_lock(mtxpool_sleep, af); + af->af_error = error; + mtx_pool_unlock(mtxpool_sleep, af); + + if (error != EINPROGRESS) + wakeup(af); +} + +#endif + int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; +#ifdef ADVLOCKASYNC_TESTING + struct async_flock *af = 0; +#endif struct file *fp; struct proc *p; char *pop; @@ -490,11 +582,16 @@ fdrop(fp, td); break; + case F_SETLK_REMOTE: + flg = F_REMOTE; + goto do_setlk; + case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: + do_setlk: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); @@ -524,6 +621,21 @@ fhold(fp); FILEDESC_SUNLOCK(fdp); vp = fp->f_vnode; +#ifdef ADVLOCKASYNC_TESTING + if (flg & F_WAIT) { + /* + * XXX temporary support for testing async lock + * infrastructure. + */ + af = malloc(sizeof(struct async_flock), + M_TEMP, M_WAITOK); + TASK_INIT(&af->af_task, 0, kern_fcntl_callback, af); + af->af_vp = vp; + af->af_proc = p->p_leader; + af->af_fl = *flp; + flg &= ~F_WAIT; + } +#endif vfslocked = VFS_LOCK_GIANT(vp->v_mount); switch (flp->l_type) { case F_RDLCK: @@ -534,8 +646,13 @@ PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); +#ifdef ADVLOCKASYNC_TESTING + error = VOP_ADVLOCKASYNC(vp, (caddr_t)p->p_leader, + F_SETLK, flp, flg, af ? &af->af_task: NULL); +#else error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); +#endif break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { @@ -545,17 +662,134 @@ PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); +#ifdef ADVLOCKASYNC_TESTING + error = VOP_ADVLOCKASYNC(vp, (caddr_t)p->p_leader, + F_SETLK, flp, flg, af ? &af->af_task: NULL); +#else error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); +#endif break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, - flp, F_POSIX); + flp, flg); + break; + case F_UNLCKSYS: + /* + * Temporary api for testing remote lock + * infrastructure. + */ + if (flg != F_REMOTE) { + error = EINVAL; + break; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } +#ifdef ADVLOCKASYNC_TESTING + /* + * XXX temporary support for testing async lock + * infrastructure. + */ + if (error == EINPROGRESS) { + struct mtx *m = mtx_pool_find(mtxpool_sleep, af); + mtx_lock(m); + error = msleep(af, m, PCATCH, "F_SETLK", 0); + if (error == EINTR) { + /* + * Cancel our async request. This is + * slightly complicated by a potential + * race with our own callback. We deal + * with this as follows: + * + * First, we set af_vp to null - this + * restricts the number of times we + * have to compete with + * kern_fcntl_callback to at most + * twice. + * + * Second, we attempt to cancel the + * lock. Since the vnode interlock + * protects both the cancel and the + * callback trigger in the locking + * code, we are guaranteed that either + * we successfully cancel or that our + * callback has been triggered. 
+ * + * We handle any failure to cancel by + * first ensuring that the callback + * has finished by calling + * taskqueue_drain. We can then + * examine the value of af_error to + * figure out whether we need to + * re-attempt the cancel. + */ + int e; + af->af_vp = NULL; + retry_cancel: + mtx_unlock(m); + e = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_CANCEL, flp, flg); + if (e) { + /* + * We failed to cancel - make + * sure our callback has + * completed before we + * continue. + */ + taskqueue_drain(taskqueue_thread, + &af->af_task); + + mtx_lock(m); + + /* + * If the value of af_error is + * EINPROGRESS, our callback + * has re-registered the async + * lock with the lock manager + * so we must re-attempt the + * cancel. + */ + if (af->af_error == EINPROGRESS) { + goto retry_cancel; + } + + /* + * If we managed to set af_vp + * to null before the + * callback, we will get a + * value of ECANCELED in + * af_error. This means we + * successfully cancelled and + * can report EINTR to the + * caller. + * + * Any other value of af_error + * should be reported to the + * user as it represents the + * success or failure of the + * lock request. + */ + if (af->af_error != ECANCELED) + error = af->af_error; + mtx_unlock(m); + } + } else { + /* + * We were woken up by the callback - + * take our return value from + * af_error. + */ + error = af->af_error; + mtx_unlock(m); + } + free(af, M_TEMP); + } +#endif VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* Check for race with close */ diff -urN /Projects/clean/src/sys/kern/kern_lockf.c /Projects/M4/src/sys/kern/kern_lockf.c --- /Projects/clean/src/sys/kern/kern_lockf.c 2008-01-19 15:54:43.000000000 +0000 +++ /Projects/M4/src/sys/kern/kern_lockf.c 2008-02-12 09:57:05.000000000 +0000 @@ -39,6 +39,7 @@ #include #include +#include <sys/hash.h> #include #include #include @@ -50,6 +51,7 @@ #include #include #include +#include <sys/taskqueue.h> /* * This variable controls the maximum number of processes that will @@ -57,6 +59,7 @@ */ static int maxlockdepth = MAXDEPTH; +#define LOCKF_DEBUG #ifdef LOCKF_DEBUG #include @@ -80,36 +83,215 @@ lf_getblock(struct lockf *); static int lf_getlock(struct lockf *, struct flock *); static int lf_setlock(struct lockf *, struct vnode *, struct lockf **); -static void lf_split(struct lockf *, struct lockf *, struct lockf **); -static void lf_wakelock(struct lockf *); +/*static*/ int lf_cancel(struct lockf *, struct lockf **); +static void lf_split(struct lockf *, struct lockf *, struct lockf **, + int addlock); +static void lf_wakelock(struct lockf *, struct lockf **); +static void lf_clearremotesys(int sysid); #ifdef LOCKF_DEBUG static void lf_print(char *, struct lockf *); static void lf_printlist(char *, struct lockf *); +static void lf_print_owner(struct lock_owner *); #endif /* + * This structure is used to keep track of both local and remote lock + * owners. The lf_owner field of the struct lockf points back at the + * lock owner structure. Each possible lock owner (local proc for + * POSIX fcntl locks, local file for BSD flock locks or <pid,sysid> + * pair for remote locks) is represented by a unique instance of + * struct lock_owner. 
+ * + * Locks: + * (l) locked by lf_lock_owners_mutex + * (p) locked by mtx_pool_lock(mtxpool_sleep, lo) + * (c) const until freeing + */ +#define LOCK_OWNER_HASH_SIZE 256 + +struct lock_owner { + LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */ + int lo_refs; /* (l) Number of locks referring to this */ + int lo_flags; /* (c) Flags passed to lf_advlock */ + caddr_t lo_id; /* (c) Id value passed to lf_advlock */ + pid_t lo_pid; /* (c) Process Id of the lock owner */ + int lo_sysid; /* (c) System Id of the lock owner */ + struct locklist lo_active; /* (p) Active locks for this owner */ + struct locklist lo_pending; /* (p) Pending locks for this owner */ +}; + +LIST_HEAD(lock_owner_list, lock_owner); + +static struct mtx lf_lock_owners_mutex; +static struct lock_owner_list lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */ + +/* + * Initialise the lock owner structures. + */ +static void +lf_init(void *dummy) +{ + int i; + + mtx_init(&lf_lock_owners_mutex, "lock owners lock", NULL, MTX_DEF); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) + LIST_INIT(&lf_lock_owners[i]); +} +SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL) + +/* + * Generate a hash value for a lock owner. + */ +static int +lf_hash_owner(caddr_t id, struct flock *fl, int flags) +{ + uint32_t h; + + if (flags & F_REMOTE) { + h = HASHSTEP(0, fl->l_pid); + h = HASHSTEP(h, fl->l_sysid); + } else if (flags & F_FLOCK) { + h = ((uintptr_t) id) >> 7; + } else { + struct proc *p = (struct proc *) id; + h = HASHSTEP(0, p->p_pid); + h = HASHSTEP(h, 0); + } + + return (h % LOCK_OWNER_HASH_SIZE); +} + +/* + * Return true if a lock owner matches the details passed to + * lf_advlock. + */ +static int +lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl, + int flags) +{ + if (flags & F_REMOTE) { + return lo->lo_pid == fl->l_pid + && lo->lo_sysid == fl->l_sysid; + } else { + return lo->lo_id == id; + } +} + +static void +lf_cleanup_lock(struct lockf *lock) +{ + /* + * Adjust the lock_owner reference count and + * reclaim the entry if this is the last lock + * for that owner. + */ + struct lock_owner *lo = lock->lf_owner; + if (lo) { + mtx_lock(&lf_lock_owners_mutex); + lo->lo_refs--; + if (lo->lo_refs == 0) { +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + printf("lf_cleanup_lock: freeing lock owner %p\n", + lo); +#endif + KASSERT(TAILQ_EMPTY(&lo->lo_active), + ("freeing lock owner with active locks")); + KASSERT(TAILQ_EMPTY(&lo->lo_pending), + ("freeing lock owner with pending locks")); + LIST_REMOVE(lo, lo_link); + free(lo, M_LOCKF); + } + mtx_unlock(&lf_lock_owners_mutex); + } +} + +/* * Advisory record locking support */ int -lf_advlock(ap, head, size) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; - struct lockf **head; - u_quad_t size; +lf_advlockasync(struct vop_advlockasync_args *ap, struct lockf **head, u_quad_t size) { struct flock *fl = ap->a_fl; struct lockf *lock; struct vnode *vp = ap->a_vp; + caddr_t id = ap->a_id; + int flags = ap->a_flags; + int hash; + struct lock_owner *lo; off_t start, end, oadd; struct lockf *clean, *n; int error; /* + * Handle the F_UNLCKSYS case first - no need to mess about + * creating a lock owner for this one. + */ + if (ap->a_op == F_UNLCKSYS) { + lf_clearremotesys(fl->l_sysid); + return (0); + } + + /* + * Map our arguments to an existing lock owner or create one + * if this is the first time we have seen this owner. 
+ */ + hash = lf_hash_owner(id, fl, flags); + mtx_lock(&lf_lock_owners_mutex); + LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link) + if (lf_owner_matches(lo, id, fl, flags)) + break; + if (!lo) { + /* + * We initialise the lock with a reference + * count of one which refers to the new lockf + * structure created below. + */ + lo = malloc(sizeof(struct lock_owner), + M_LOCKF, M_NOWAIT); + if (!lo) { + mtx_unlock(&lf_lock_owners_mutex); + return (ENOMEM); + } + + lo->lo_refs = 1; + lo->lo_flags = flags; + lo->lo_id = id; + if (flags & F_REMOTE) { + lo->lo_pid = fl->l_pid; + lo->lo_sysid = fl->l_sysid; + } else if (flags & F_FLOCK) { + lo->lo_pid = -1; + lo->lo_sysid = 0; + } else { + struct proc *p = (struct proc *) id; + lo->lo_pid = p->p_pid; + lo->lo_sysid = 0; + } + TAILQ_INIT(&lo->lo_active); + TAILQ_INIT(&lo->lo_pending); + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + printf("lf_advlockasync: new lock owner %p ", lo); + lf_print_owner(lo); + printf("\n"); + } +#endif + + LIST_INSERT_HEAD(&lf_lock_owners[hash], + lo, lo_link); + } else { + /* + * We have seen this lock owner before, + * increase its reference count to account for + * the new lockf struct we create below. + */ + lo->lo_refs++; + } + mtx_unlock(&lf_lock_owners_mutex); + + /* * Convert the flock structure into a start and end. */ switch (fl->l_whence) { @@ -165,6 +347,7 @@ clean = NULL; if (ap->a_op == F_SETLK || ap->a_op == F_UNLCK) { MALLOC(clean, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + clean->lf_owner = 0; clean->lf_next = NULL; } /* @@ -173,7 +356,8 @@ MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); lock->lf_start = start; lock->lf_end = end; - lock->lf_id = ap->a_id; + lock->lf_owner = lo; + lock->lf_vnode = vp; /* * XXX The problem is that VTOI is ufs specific, so it will * break LOCKF_DEBUG for all other FS's other than UFS because @@ -185,7 +369,9 @@ lock->lf_head = head; lock->lf_next = (struct lockf *)0; TAILQ_INIT(&lock->lf_blkhd); + lock->lf_async_task = ap->a_task; lock->lf_flags = ap->a_flags; + /* * Do the requested operation. */ @@ -207,6 +393,12 @@ clean = lock; break; + case F_CANCEL: + error = lf_cancel(lock, &clean); + lock->lf_next = clean; + clean = lock; + break; + default: lock->lf_next = clean; clean = lock; @@ -215,6 +407,7 @@ } VI_UNLOCK(vp); for (lock = clean; lock != NULL; ) { + lf_cleanup_lock(lock); n = lock->lf_next; free(lock, M_LOCKF); lock = n; @@ -222,14 +415,26 @@ return (error); } +int +lf_advlock(struct vop_advlock_args *ap, struct lockf **head, u_quad_t size) +{ + struct vop_advlockasync_args a; + + a.a_vp = ap->a_vp; + a.a_id = ap->a_id; + a.a_op = ap->a_op; + a.a_fl = ap->a_fl; + a.a_flags = ap->a_flags; + a.a_task = NULL; + + return (lf_advlockasync(&a, head, size)); +} + /* * Set a byte-range lock. */ static int -lf_setlock(lock, vp, clean) - struct lockf *lock; - struct vnode *vp; - struct lockf **clean; +lf_setlock(struct lockf *lock, struct vnode *vp, struct lockf **clean) { struct lockf *block; struct lockf **head = lock->lf_head; @@ -256,7 +461,8 @@ /* * Free the structure and return if nonblocking. */ - if ((lock->lf_flags & F_WAIT) == 0) { + if ((lock->lf_flags & F_WAIT) == 0 + && lock->lf_async_task == NULL) { lock->lf_next = *clean; *clean = lock; return (EAGAIN); @@ -267,48 +473,57 @@ * For byte-range locks we must check for deadlock. * * Deadlock detection is done by looking through the - * wait channels to see if there are any cycles that - * involve us. MAXDEPTH is set just to make sure we - * do not go off into neverland. 
+ * lock owner pending lists to see if there are any + * cycles that involve us. MAXDEPTH is set just to + * make sure we do not go off into neverland. + * + * This algorithm is simplistic - it only considers + * the first blocking lock and it doesn't follow all + * paths through the lock graph. */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - struct proc *wproc; - struct proc *nproc; - struct thread *td; struct lockf *waitblock; - int i = 0; - - /* The block is waiting on something */ - wproc = (struct proc *)block->lf_id; -restart: - nproc = NULL; - PROC_SLOCK(wproc); - FOREACH_THREAD_IN_PROC(wproc, td) { - thread_lock(td); - while (td->td_wchan && - (td->td_wmesg == lockstr) && - (i++ < maxlockdepth)) { - waitblock = (struct lockf *)td->td_wchan; - /* Get the owner of the blocking lock */ - waitblock = waitblock->lf_next; - if ((waitblock->lf_flags & F_POSIX) == 0) - break; - nproc = (struct proc *)waitblock->lf_id; - if (nproc == (struct proc *)lock->lf_id) { - PROC_SUNLOCK(wproc); - thread_unlock(td); + struct lockf *nblock; + struct lock_owner *lo; + struct lock_owner *nlo; + int i; + + lo = block->lf_owner; + i = 0; + while (lo) { + if (i++ == maxlockdepth) + break; + mtx_pool_lock(mtxpool_sleep, lo); + nlo = NULL; + TAILQ_FOREACH(waitblock, &lo->lo_pending, + lf_olock) { + /* + * Get the owner of the + * blocking lock. + * + * XXX this is unsafe - if + * waitblock is on a different + * vnode to this one, our + * vnode interlock will not + * protect us against changes + * to waitblock->lf_next. + */ + nblock = waitblock->lf_next; + if ((nblock->lf_flags & F_POSIX) == 0) + continue; + nlo = nblock->lf_owner; + if (nlo == lock->lf_owner) { + mtx_pool_unlock(mtxpool_sleep, + lo); lock->lf_next = *clean; *clean = lock; return (EDEADLK); } } - thread_unlock(td); + mtx_pool_unlock(mtxpool_sleep, lo); + lo = nlo; } - PROC_SUNLOCK(wproc); - wproc = nproc; - if (wproc) - goto restart; } /* * For flock type locks, we must first remove @@ -327,12 +542,26 @@ */ lock->lf_next = block; TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_pending, lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 1) { lf_print("lf_setlock: blocking on", block); lf_printlist("lf_setlock", block); } #endif /* LOCKF_DEBUG */ + + if ((lock->lf_flags & F_WAIT) == 0) { + /* + * The caller requested async notification - + * this callback happens when the blocking + * lock is released, allowing the caller to + * make another attempt to take the lock. 
+ */ + return (EINPROGRESS); + } + error = msleep(lock, VI_MTX(vp), priority, lockstr, 0); /* * We may have been awakened by a signal and/or by a @@ -344,6 +573,10 @@ */ if (lock->lf_next) { TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_REMOVE(&lock->lf_owner->lo_pending, lock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); lock->lf_next = NOLOCKF; } if (error) { @@ -381,6 +614,10 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } break; @@ -391,7 +628,7 @@ */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) - lf_wakelock(overlap); + lf_wakelock(overlap, clean); overlap->lf_type = lock->lf_type; lock->lf_next = *clean; *clean = lock; @@ -412,9 +649,13 @@ *prev = lock; lock->lf_next = overlap; overlap->lf_start = lock->lf_end + 1; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } else - lf_split(overlap, lock, clean); - lf_wakelock(overlap); + lf_split(overlap, lock, clean, TRUE); + lf_wakelock(overlap, clean); break; case 3: /* lock contains overlap */ @@ -424,7 +665,7 @@ */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) { - lf_wakelock(overlap); + lf_wakelock(overlap, clean); } else { while (!TAILQ_EMPTY(&overlap->lf_blkhd)) { ltmp = TAILQ_FIRST(&overlap->lf_blkhd); @@ -438,13 +679,21 @@ /* * Add the new lock if necessary and delete the overlap. */ + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + KASSERT(lock->lf_owner == overlap->lf_owner, + ("unexpected lock owner for overlap")); if (needtolink) { *prev = lock; lock->lf_next = overlap->lf_next; prev = &lock->lf_next; + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); needtolink = 0; } else *prev = overlap->lf_next; + TAILQ_REMOVE(&lock->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); overlap->lf_next = *clean; *clean = overlap; continue; @@ -457,7 +706,11 @@ overlap->lf_next = lock; overlap->lf_end = lock->lf_start - 1; prev = &lock->lf_next; - lf_wakelock(overlap); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); + lf_wakelock(overlap, clean); needtolink = 0; continue; @@ -468,9 +721,13 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } overlap->lf_start = lock->lf_end + 1; - lf_wakelock(overlap); + lf_wakelock(overlap, clean); break; } break; @@ -491,12 +748,10 @@ * and remove it (or shrink it), then wakeup anyone we can. */ static int -lf_clearlock(unlock, clean) - struct lockf *unlock; - struct lockf **clean; +lf_clearlock(struct lockf *unlock, struct lockf **clean) { struct lockf **head = unlock->lf_head; - register struct lockf *lf = *head; + struct lockf *lf = *head; struct lockf *overlap, **prev; int ovcase; @@ -513,7 +768,7 @@ /* * Wakeup the list of locks to be retried. 
*/ - lf_wakelock(overlap); + lf_wakelock(overlap, clean); switch (ovcase) { @@ -521,6 +776,10 @@ *prev = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); break; case 2: /* overlap contains lock: split it */ @@ -528,8 +787,7 @@ overlap->lf_start = unlock->lf_end + 1; break; } - lf_split(overlap, unlock, clean); - overlap->lf_next = unlock->lf_next; + lf_split(overlap, unlock, clean, FALSE); break; case 3: /* lock contains overlap */ @@ -537,6 +795,10 @@ lf = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); continue; case 4: /* overlap starts before lock */ @@ -563,11 +825,9 @@ * and if so return its process identifier. */ static int -lf_getlock(lock, fl) - register struct lockf *lock; - register struct flock *fl; +lf_getlock(struct lockf *lock, struct flock *fl) { - register struct lockf *block; + struct lockf *block; #ifdef LOCKF_DEBUG if (lockf_debug & 1) @@ -582,10 +842,8 @@ fl->l_len = 0; else fl->l_len = block->lf_end - block->lf_start + 1; - if (block->lf_flags & F_POSIX) - fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; - else - fl->l_pid = -1; + fl->l_pid = block->lf_owner->lo_pid; + fl->l_sysid = block->lf_owner->lo_sysid; } else { fl->l_type = F_UNLCK; } @@ -593,12 +851,65 @@ } /* + * Cancel an async lock request. + */ +/*static*/ int +lf_cancel(struct lockf *lock, struct lockf **clean) +{ + struct lock_owner *lo = lock->lf_owner; + struct lockf *reallock; + + /* + * We need to match this request with an existing lock + * request. We need to take the pool mutex to protect the + * lock owner's lists. + */ + mtx_pool_lock(mtxpool_sleep, lo); + + TAILQ_FOREACH(reallock, &lo->lo_pending, lf_olock) { + if (reallock->lf_vnode == lock->lf_vnode + && reallock->lf_start == lock->lf_start + && reallock->lf_end == lock->lf_end) { + /* + * Make sure this lock was async and then just + * remove it from its wait lists. + */ + if (!reallock->lf_async_task) { + mtx_pool_unlock(mtxpool_sleep, lo); + return (ENOENT); + } + + /* + * Note that since any other thread must take + * the vnode interlock before it can possibly + * trigger the async callback, we are safe + * from a race with lf_wakelock, i.e. we + * can free the lock (actually our caller does + * this). + */ + TAILQ_REMOVE(&reallock->lf_next->lf_blkhd, reallock, lf_block); + TAILQ_REMOVE(&lo->lo_pending, reallock, lf_olock); + reallock->lf_next = *clean; + *clean = reallock; + mtx_pool_unlock(mtxpool_sleep, lo); + return (0); + } + } + + mtx_pool_unlock(mtxpool_sleep, lo); + + /* + * We didn't find a matching lock - not much we can do here. + */ + return (ENOENT); +} + +/* * Walk the list of locks for an inode and * return the first blocking lock. */ static struct lockf * -lf_getblock(lock) - register struct lockf *lock; +lf_getblock(struct lockf *lock) { struct lockf **prev, *overlap, *lf = *(lock->lf_head); int ovcase; @@ -627,12 +938,8 @@ * may be more than one. 
*/ static int -lf_findoverlap(lf, lock, type, prev, overlap) - register struct lockf *lf; - struct lockf *lock; - int type; - struct lockf ***prev; - struct lockf **overlap; +lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, + struct lockf ***prev, struct lockf **overlap) { off_t start, end; @@ -646,8 +953,8 @@ start = lock->lf_start; end = lock->lf_end; while (lf != NOLOCKF) { - if (((type & SELF) && lf->lf_id != lock->lf_id) || - ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + if (((type & SELF) && lf->lf_owner != lock->lf_owner) || + ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) { *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; @@ -733,14 +1040,13 @@ } /* - * Split a lock and a contained region into - * two or three locks as necessary. + * Split a lock and a contained region into two or three locks as + * necessary. If addlock is TRUE, lock2 is being set so it must be + * added to the list, otherwise it is being cleared. */ static void -lf_split(lock1, lock2, split) - struct lockf *lock1; - struct lockf *lock2; - struct lockf **split; +lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **split, + int addlock) { struct lockf *splitlock; @@ -755,13 +1061,16 @@ */ if (lock1->lf_start == lock2->lf_start) { lock1->lf_start = lock2->lf_end + 1; - lock2->lf_next = lock1; + if (addlock) + lock2->lf_next = lock1; return; } if (lock1->lf_end == lock2->lf_end) { lock1->lf_end = lock2->lf_start - 1; - lock2->lf_next = lock1->lf_next; - lock1->lf_next = lock2; + if (addlock) { + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + } return; } /* @@ -773,6 +1082,15 @@ KASSERT(splitlock != NULL, ("no split")); *split = splitlock->lf_next; bcopy(lock1, splitlock, sizeof *splitlock); + + /* + * Update the lock owner reference count to account for the + * new lock. + */ + mtx_lock(&lf_lock_owners_mutex); + splitlock->lf_owner->lo_refs++; + mtx_unlock(&lf_lock_owners_mutex); + splitlock->lf_start = lock2->lf_end + 1; TAILQ_INIT(&splitlock->lf_blkhd); lock1->lf_end = lock2->lf_start - 1; @@ -780,46 +1098,144 @@ * OK, now link it in */ splitlock->lf_next = lock1->lf_next; - lock2->lf_next = splitlock; - lock1->lf_next = lock2; + mtx_pool_lock(mtxpool_sleep, lock1->lf_owner); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, splitlock, lf_olock); + if (addlock) { + KASSERT(lock1->lf_owner == lock2->lf_owner, + ("unexpected lock owner for split")); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, lock2, + lf_olock); + lock2->lf_next = splitlock; + lock1->lf_next = lock2; + } else { + lock1->lf_next = splitlock; + } + mtx_pool_unlock(mtxpool_sleep, lock1->lf_owner); } /* * Wakeup a blocklist */ static void -lf_wakelock(listhead) - struct lockf *listhead; +lf_wakelock(struct lockf *listhead, struct lockf **clean) { - register struct lockf *wakelock; + struct lockf *wakelock; while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { wakelock = TAILQ_FIRST(&listhead->lf_blkhd); TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); wakelock->lf_next = NOLOCKF; + mtx_pool_lock(mtxpool_sleep, wakelock->lf_owner); + TAILQ_REMOVE(&wakelock->lf_owner->lo_pending, wakelock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, wakelock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_wakelock: awakening", wakelock); #endif /* LOCKF_DEBUG */ - wakeup(wakelock); + if (wakelock->lf_async_task) { + /* + * Perform async notification to allow a + * caller to re-attempt the lock. 
+ */ + taskqueue_enqueue(taskqueue_thread, + wakelock->lf_async_task); + wakelock->lf_next = *clean; + *clean = wakelock; + } else { + wakeup(wakelock); + } + } +} + +struct clearlock { + STAILQ_ENTRY(clearlock) link; + struct vnode *vp; + struct flock fl; +}; +STAILQ_HEAD(clearlocklist, clearlock); + +static void +lf_clearremotesys(int sysid) +{ + int i; + struct lock_owner *lo; + struct lockf *lf; + struct clearlock *cl; + struct clearlocklist locks; + + KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS")); + + /* + * In order to keep the locking simple, we iterate over the + * active lock lists to build a list of locks that need + * releasing. We then call VOP_ADVLOCK for each one in turn. + */ + STAILQ_INIT(&locks); + mtx_lock(&lf_lock_owners_mutex); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) { + LIST_FOREACH(lo, &lf_lock_owners[i], lo_link) { + if (lo->lo_sysid != sysid) + continue; + + mtx_pool_lock(mtxpool_sleep, lo); + TAILQ_FOREACH(lf, &lo->lo_active, lf_olock) { + cl = malloc(sizeof(struct clearlock), + M_LOCKF, M_NOWAIT); + if (!cl) + continue; + cl->vp = lf->lf_vnode; + cl->fl.l_start = lf->lf_start; + if (lf->lf_end == -1) + cl->fl.l_len = 0; + else + cl->fl.l_len = + lf->lf_end - lf->lf_start + 1; + cl->fl.l_whence = SEEK_SET; + cl->fl.l_type = F_UNLCK; + cl->fl.l_pid = lo->lo_pid; + cl->fl.l_sysid = sysid; + STAILQ_INSERT_TAIL(&locks, cl, link); + } + mtx_pool_unlock(mtxpool_sleep, lo); + } + } + mtx_unlock(&lf_lock_owners_mutex); + + while ((cl = STAILQ_FIRST(&locks)) != NULL) { + STAILQ_REMOVE_HEAD(&locks, link); + VOP_ADVLOCK(cl->vp, 0, F_UNLCK, &cl->fl, F_REMOTE); + free(cl, M_LOCKF); + } } #ifdef LOCKF_DEBUG /* + * Print description of a lock owner + */ +static void +lf_print_owner(struct lock_owner *lo) +{ + + if (lo->lo_flags & F_REMOTE) { + printf("remote pid %d, system %d", + lo->lo_pid, lo->lo_sysid); + } else if (lo->lo_flags & F_FLOCK) { + printf("file %p", lo->lo_id); + } else { + printf("local pid %d", lo->lo_pid); + } +} + +/* * Print out a lock. */ static void -lf_print(tag, lock) - char *tag; - register struct lockf *lock; +lf_print(char *tag, struct lockf *lock) { printf("%s: lock %p for ", tag, (void *)lock); - if (lock->lf_flags & F_POSIX) - printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); - else - printf("id %p", (void *)lock->lf_id); + lf_print_owner(lock->lf_owner); if (lock->lf_inode != (struct inode *)0) printf(" in ino %ju on dev <%s>, %s, start %jd, end %jd", (uintmax_t)lock->lf_inode->i_number, @@ -841,11 +1257,9 @@ } static void -lf_printlist(tag, lock) - char *tag; - struct lockf *lock; +lf_printlist(char *tag, struct lockf *lock) { - register struct lockf *lf, *blk; + struct lockf *lf, *blk; if (lock->lf_inode == (struct inode *)0) return; @@ -855,11 +1269,7 @@ devtoname(lock->lf_inode->i_dev)); for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { printf("\tlock %p for ",(void *)lf); - if (lf->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)lf->lf_id)->p_pid); - else - printf("id %p", (void *)lf->lf_id); + lf_print_owner(lf->lf_owner); printf(", %s, start %jd, end %jd", lf->lf_type == F_RDLCK ? "shared" : lf->lf_type == F_WRLCK ? 
"exclusive" : @@ -867,11 +1277,7 @@ "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { printf("\n\t\tlock request %p for ", (void *)blk); - if (blk->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)blk->lf_id)->p_pid); - else - printf("id %p", (void *)blk->lf_id); + lf_print_owner(blk->lf_owner); printf(", %s, start %jd, end %jd", blk->lf_type == F_RDLCK ? "shared" : blk->lf_type == F_WRLCK ? "exclusive" : diff -urN /Projects/clean/src/sys/kern/vnode_if.src /Projects/M4/src/sys/kern/vnode_if.src --- /Projects/clean/src/sys/kern/vnode_if.src 2008-01-19 15:44:20.000000000 +0000 +++ /Projects/M4/src/sys/kern/vnode_if.src 2008-02-12 09:57:05.000000000 +0000 @@ -438,6 +438,18 @@ }; +%% advlockasync vp U U U + +vop_advlockasync { + IN struct vnode *vp; + IN void *id; + IN int op; + IN struct flock *fl; + IN int flags; + IN struct task *task; +}; + + %% reallocblks vp E E E vop_reallocblks { diff -urN /Projects/clean/src/sys/nfs4client/nfs4_vnops.c /Projects/M4/src/sys/nfs4client/nfs4_vnops.c --- /Projects/clean/src/sys/nfs4client/nfs4_vnops.c 2008-01-19 15:44:36.000000000 +0000 +++ /Projects/M4/src/sys/nfs4client/nfs4_vnops.c 2008-02-12 09:57:11.000000000 +0000 @@ -157,6 +157,7 @@ static vop_readlink_t nfs4_readlink; static vop_print_t nfs4_print; static vop_advlock_t nfs4_advlock; +static vop_advlockasync_t nfs4_advlockasync; /* * Global vfs data structures for nfs @@ -165,6 +166,7 @@ .vop_default = &default_vnodeops, .vop_access = nfs4_access, .vop_advlock = nfs4_advlock, + .vop_advlockasync = nfs4_advlockasync, .vop_close = nfs4_close, .vop_create = nfs4_create, .vop_fsync = nfs4_fsync, @@ -2777,6 +2779,22 @@ } /* + * NFS advisory byte-level locks. + */ +static int +nfs4_advlockasync(struct vop_advlockasync_args *ap) +{ + return (EPERM); + + if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { + struct nfsnode *np = VTONFS(ap->a_vp); + + return (lf_advlockasync(ap, &(np->n_lockf), np->n_size)); + } + return (EOPNOTSUPP); +} + +/* * Print out the contents of an nfsnode. */ static int diff -urN /Projects/clean/src/sys/nfsclient/nfs_vnops.c /Projects/M4/src/sys/nfsclient/nfs_vnops.c --- /Projects/clean/src/sys/nfsclient/nfs_vnops.c 2008-01-19 15:44:36.000000000 +0000 +++ /Projects/M4/src/sys/nfsclient/nfs_vnops.c 2008-02-12 09:57:11.000000000 +0000 @@ -129,6 +129,7 @@ static vop_readlink_t nfs_readlink; static vop_print_t nfs_print; static vop_advlock_t nfs_advlock; +static vop_advlockasync_t nfs_advlockasync; /* * Global vfs data structures for nfs @@ -137,6 +138,7 @@ .vop_default = &default_vnodeops, .vop_access = nfs_access, .vop_advlock = nfs_advlock, + .vop_advlockasync = nfs_advlockasync, .vop_close = nfs_close, .vop_create = nfs_create, .vop_fsync = nfs_fsync, @@ -3057,6 +3059,27 @@ } /* + * NFS advisory byte-level locks. + */ +static int +nfs_advlockasync(struct vop_advlockasync_args *ap) +{ + int error; + + mtx_lock(&Giant); + if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { + struct nfsnode *np = VTONFS(ap->a_vp); + + error = lf_advlockasync(ap, &(np->n_lockf), np->n_size); + goto out; + } + error = EOPNOTSUPP; +out: + mtx_unlock(&Giant); + return (error); +} + +/* * Print out the contents of an nfsnode. 
*/ static int diff -urN /Projects/clean/src/sys/sys/fcntl.h /Projects/M4/src/sys/sys/fcntl.h --- /Projects/clean/src/sys/sys/fcntl.h 2008-01-19 15:54:44.000000000 +0000 +++ /Projects/M4/src/sys/sys/fcntl.h 2008-02-12 09:57:13.000000000 +0000 @@ -178,9 +178,13 @@ #define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ #define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ #endif -#define F_GETLK 7 /* get record locking information */ -#define F_SETLK 8 /* set record locking information */ -#define F_SETLKW 9 /* F_SETLK; wait if blocked */ +#define F_OGETLK 7 /* get record locking information */ +#define F_OSETLK 8 /* set record locking information */ +#define F_OSETLKW 9 /* F_SETLK; wait if blocked */ +#define F_GETLK 10 /* get record locking information */ +#define F_SETLK 11 /* set record locking information */ +#define F_SETLKW 12 /* F_SETLK; wait if blocked */ +#define F_SETLK_REMOTE 13 /* debugging support for remote locks */ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ @@ -189,10 +193,13 @@ #define F_RDLCK 1 /* shared or read lock */ #define F_UNLCK 2 /* unlock */ #define F_WRLCK 3 /* exclusive or write lock */ +#define F_UNLCKSYS 4 /* purge locks for a given system ID */ +#define F_CANCEL 5 /* cancel an async lock request */ #ifdef _KERNEL #define F_WAIT 0x010 /* Wait until lock is granted */ #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ +#define F_REMOTE 0x080 /* Lock owner is remote NFS client */ #endif /* @@ -205,6 +212,19 @@ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ + int l_sysid; /* remote system id or zero for local */ +}; + +/* + * Old advisory file segment locking data type, + * before adding l_sysid. + */ +struct oflock { + off_t l_start; /* starting offset */ + off_t l_len; /* len = 0 means until end of file */ + pid_t l_pid; /* lock owner */ + short l_type; /* lock type: read/write, etc. */ + short l_whence; /* type of l_start */ }; diff -urN /Projects/clean/src/sys/sys/lockf.h /Projects/M4/src/sys/sys/lockf.h --- /Projects/clean/src/sys/sys/lockf.h 2008-01-19 15:44:41.000000000 +0000 +++ /Projects/M4/src/sys/sys/lockf.h 2008-02-12 09:57:14.000000000 +0000 @@ -39,6 +39,7 @@ #include struct vop_advlock_args; +struct vop_advlockasync_args; /* * The lockf structure is a kernel structure which contains the information @@ -53,17 +54,21 @@ short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ off_t lf_start; /* Byte # of the start of the lock */ off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ - caddr_t lf_id; /* Id of the resource holding the lock */ + struct lock_owner *lf_owner; /* Owner of the lock */ + struct vnode *lf_vnode; /* File being locked (only valid for active lock) */ struct lockf **lf_head; /* Back pointer to the head of the lockf list */ struct inode *lf_inode; /* Back pointer to the inode */ struct lockf *lf_next; /* Pointer to the next lock on this inode */ struct locklist lf_blkhd; /* List of requests blocked on this lock */ + struct task *lf_async_task;/* Async lock callback */ TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ + TAILQ_ENTRY(lockf) lf_olock;/* Linkage for owner lock lists */ }; /* Maximum length of sleep chains to traverse to try and detect deadlock. 
*/ #define MAXDEPTH 50 int lf_advlock(struct vop_advlock_args *, struct lockf **, u_quad_t); +int lf_advlockasync(struct vop_advlockasync_args *, struct lockf **, u_quad_t); #endif /* !_SYS_LOCKF_H_ */ diff -urN /Projects/clean/src/sys/ufs/ufs/ufs_vnops.c /Projects/M4/src/sys/ufs/ufs/ufs_vnops.c --- /Projects/clean/src/sys/ufs/ufs/ufs_vnops.c 2008-01-19 15:44:30.000000000 +0000 +++ /Projects/M4/src/sys/ufs/ufs/ufs_vnops.c 2008-02-12 09:57:09.000000000 +0000 @@ -92,6 +92,7 @@ static vop_access_t ufs_access; static vop_advlock_t ufs_advlock; +static vop_advlockasync_t ufs_advlockasync; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; @@ -2182,6 +2183,25 @@ } /* + * Advisory record locking support + */ +static int +ufs_advlockasync(ap) + struct vop_advlockasync_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + struct task *a_task; + } */ *ap; +{ + struct inode *ip = VTOI(ap->a_vp); + + return (lf_advlockasync(ap, &(ip->i_lockf), ip->i_size)); +} + +/* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ @@ -2449,6 +2469,7 @@ .vop_write = VOP_PANIC, .vop_access = ufs_access, .vop_advlock = ufs_advlock, + .vop_advlockasync = ufs_advlockasync, .vop_bmap = ufs_bmap, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, diff -urN /Projects/clean/src/tools/regression/file/flock/Makefile /Projects/M4/src/tools/regression/file/flock/Makefile --- /Projects/clean/src/tools/regression/file/flock/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M4/src/tools/regression/file/flock/Makefile 2008-02-12 09:57:19.000000000 +0000 @@ -0,0 +1,8 @@ +# $FreeBSD$ + +PROG= flock +NO_MAN= +WARNS?= 6 +DEBUG_FLAGS= -g -O0 + +.include <bsd.prog.mk> diff -urN /Projects/clean/src/tools/regression/file/flock/flock.c /Projects/M4/src/tools/regression/file/flock/flock.c --- /Projects/clean/src/tools/regression/file/flock/flock.c 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M4/src/tools/regression/file/flock/flock.c 2008-02-12 09:57:19.000000000 +0000 @@ -0,0 +1,1098 @@ +/*- + * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ + * Authors: Doug Rabson <dfr@rabson.org> + * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/wait.h> + +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#ifdef __FreeBSD__ +#define HAVE_SYSID +#include <sys/cdefs.h> +#else +#define __unused +#endif + +static int +make_file(const char *dir, off_t sz) +{ + const char *template = "/flocktempXXXXXX"; + size_t len; + char *filename; + int fd; + + len = strlen(dir) + strlen(template) + 1; + filename = malloc(len); + strcpy(filename, dir); + strcat(filename, template); + fd = mkstemp(filename); + if (fd < 0) + err(1, "mkstemp"); + if (ftruncate(fd, sz) < 0) + err(1, "ftruncate"); + if (unlink(filename) < 0) + err(1, "unlink"); + free(filename); + + return (fd); +} + +static void +ignore_alarm(int __unused sig) +{ +} + +#define FAIL(test) \ + do { \ + if (test) { \ + printf("FAIL (%s)\n", #test); \ + return -1; \ + } \ + } while (0) + +#define SUCCEED \ + do { printf("SUCCEED\n"); return 0; } while (0) + +/* + * Test 1 - F_GETLK on unlocked region + * + * If no lock is found that would prevent this lock from being + * created, the structure is left unchanged by this function call + * except for the lock type which is set to F_UNLCK. + */ +static int +test1(int fd) +{ + struct flock fl1, fl2; + + memset(&fl1, 1, sizeof(fl1)); + fl1.l_type = F_WRLCK; + fl1.l_whence = SEEK_SET; + fl2 = fl1; + + if (fcntl(fd, F_GETLK, &fl1) < 0) + err(1, "F_GETLK"); + + printf("1 - F_GETLK on unlocked region: "); + FAIL(fl1.l_start != fl2.l_start); + FAIL(fl1.l_len != fl2.l_len); + FAIL(fl1.l_pid != fl2.l_pid); + FAIL(fl1.l_type != F_UNLCK); + FAIL(fl1.l_whence != fl2.l_whence); +#ifdef HAVE_SYSID + FAIL(fl1.l_sysid != fl2.l_sysid); +#endif + + SUCCEED; +} + +/* + * Test 2 - F_SETLK on locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test2(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when it's done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return -1 with errno set to either EACCES or + * EAGAIN. 
+ */ + printf("2 - F_SETLK on locked region: "); + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 3 - F_SETLKW on locked region + * + * If a shared or exclusive lock is blocked by other locks, the + * process waits until the request can be satisfied. + * + * XXX this test hangs on FreeBSD NFS filesystems due to limitations + * in FreeBSD's client (and server) lockd implementation. + */ +static int +test3(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("3 - F_SETLKW on locked region: "); + + alarm(1); + + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EINTR); + + SUCCEED; +} + +/* + * Test 4 - F_GETLK on locked region + * + * Get the first lock that blocks the lock. + */ +static int +test4(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 99; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return a lock structure reflecting the lock we + * made in the child process. + */ + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + printf("4 - F_GETLK on locked region: "); + FAIL(fl.l_start != 0); + FAIL(fl.l_len != 99); + FAIL(fl.l_type != F_WRLCK); + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + SUCCEED; +} + +/* + * Test 5 - F_SETLKW simple deadlock + * + * If a blocking shared lock request would cause a deadlock (i.e. 
the + * lock request is blocked by a process which is itself blocked on a + * lock currently owned by the process making the new request), + * EDEADLK is returned. + */ +static int +test5(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. Because our test relies on the child process being + * blocked on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to give the + * child a chance to set up. + * + * To create the deadlock condition, we arrange for the parent + * to lock the first byte of the file and the child to lock + * the second byte. After locking the second byte, the child + * will attempt to lock the first byte of the file, and + * block. The parent will then attempt to lock the second byte + * (owned by the child) which should cause deadlock. + */ + int pid; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * Lock the second byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child)"); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + sleep(1); + + /* + * fcntl should immediately return -1 with errno set to EDEADLK. + */ + printf("5 - F_SETLKW simple deadlock: "); + + fl.l_start = 1; + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + + FAIL(res == 0); + FAIL(errno != EDEADLK); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + SUCCEED; +} + +/* + * Test 6 - F_SETLKW complex deadlock. + * + * This test involves three processes, P, C1 and C2. We set things up so + * that P locks byte zero, C1 locks byte 1 and C2 locks byte 2. We + * also block C2 by attempting to lock byte zero. Lastly, P attempts + * to lock a range including bytes 1 and 2. This represents a deadlock + * (due to C2's blocking attempt to lock byte zero). + */ +static int +test6(int fd) +{ + /* + * Because our test relies on the child process being blocked + * on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to give the + * children a chance to set up. + */ + int pid1, pid2; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid1 = fork(); + if (pid1 < 0) + err(1, "fork"); + + if (pid1 == 0) { + /* + * C1 + * Lock the second byte in the child and then sleep + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child1)"); + pause(); + exit(0); + } + + pid2 = fork(); + if (pid2 < 0) + err(1, "fork"); + + if (pid2 == 0) { + /* + * C2 + * Lock the third byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 2; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child2)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child2)"); + exit(0); + } + + /* + * Wait until the children have set their locks and then + * perform the test. 
+/*
+ * Test 6 - F_SETLKW complex deadlock.
+ *
+ * This test involves three processes, P, C1 and C2. We set things up
+ * so that P locks byte zero, C1 locks byte 1 and C2 locks byte 2. We
+ * also block C2 by attempting to lock byte zero. Lastly, P attempts
+ * to lock a range including bytes 1 and 2. This represents a deadlock
+ * (due to C2's blocking attempt to lock byte zero).
+ */
+static int
+test6(int fd)
+{
+	/*
+	 * Because our test relies on the child processes being blocked
+	 * on the parent's lock, we can't easily use a pipe to
+	 * synchronize, so we just sleep in the parent to give the
+	 * children a chance to set up.
+	 */
+	int pid1, pid2;
+	struct flock fl;
+	int res;
+
+	/*
+	 * Lock the first byte in the parent.
+	 */
+	fl.l_start = 0;
+	fl.l_len = 1;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	if (fcntl(fd, F_SETLK, &fl) < 0)
+		err(1, "F_SETLK 1 (parent)");
+
+	pid1 = fork();
+	if (pid1 < 0)
+		err(1, "fork");
+
+	if (pid1 == 0) {
+		/*
+		 * C1
+		 * Lock the second byte in the child and then sleep.
+		 */
+		fl.l_start = 1;
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child1)");
+		pause();
+		exit(0);
+	}
+
+	pid2 = fork();
+	if (pid2 < 0)
+		err(1, "fork");
+
+	if (pid2 == 0) {
+		/*
+		 * C2
+		 * Lock the third byte in the child and then block on
+		 * the parent's lock.
+		 */
+		fl.l_start = 2;
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child2)");
+		fl.l_start = 0;
+		if (fcntl(fd, F_SETLKW, &fl) < 0)
+			err(1, "F_SETLKW (child2)");
+		exit(0);
+	}
+
+	/*
+	 * Wait until the children have set their locks and then
+	 * perform the test.
+	 */
+	sleep(1);
+
+	/*
+	 * fcntl should immediately return -1 with errno set to
+	 * EDEADLK. If the alarm fires, we failed to detect the
+	 * deadlock.
+	 */
+	alarm(1);
+	printf("6 - F_SETLKW complex deadlock: ");
+
+	fl.l_start = 1;
+	fl.l_len = 2;
+	res = fcntl(fd, F_SETLKW, &fl);
+	kill(pid1, SIGTERM);
+	if (waitpid(pid1, 0, 0) != pid1)
+		err(1, "waitpid");
+	kill(pid2, SIGTERM);
+	if (waitpid(pid2, 0, 0) != pid2)
+		err(1, "waitpid");
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_UNLCK;
+	if (fcntl(fd, F_SETLK, &fl) < 0)
+		err(1, "F_UNLCK");
+
+	FAIL(res == 0);
+	FAIL(errno != EDEADLK);
+
+	/*
+	 * Cancel the alarm to avoid confusing later tests.
+	 */
+	alarm(0);
+
+	SUCCEED;
+}
+
+/*
+ * Test 7 - F_SETLK shared lock on exclusive locked region
+ *
+ * If a shared or exclusive lock cannot be set, fcntl returns
+ * immediately with EACCES or EAGAIN.
+ */
+static int
+test7(int fd)
+{
+	/*
+	 * We create a child process to hold the lock which we will
+	 * test. We use a pipe to communicate with the child.
+	 */
+	int pid;
+	int pfd[2];
+	struct flock fl;
+	char ch;
+	int res;
+
+	if (pipe(pfd) < 0)
+		err(1, "pipe");
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+
+	pid = fork();
+	if (pid < 0)
+		err(1, "fork");
+
+	if (pid == 0) {
+		/*
+		 * We are the child. We set a write lock and then
+		 * write one byte back to the parent to tell it. The
+		 * parent will kill us when it's done.
+		 */
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child)");
+		if (write(pfd[1], "a", 1) < 0)
+			err(1, "writing to pipe (child)");
+		pause();
+		exit(0);
+	}
+
+	/*
+	 * Wait until the child has set its lock and then perform the
+	 * test.
+	 */
+	if (read(pfd[0], &ch, 1) != 1)
+		err(1, "reading from pipe (parent)");
+
+	/*
+	 * fcntl should return -1 immediately with errno set to
+	 * EACCES or EAGAIN, since the child holds an exclusive lock
+	 * on the region.
+	 */
+	printf("7 - F_SETLK shared lock on exclusive locked region: ");
+
+	fl.l_type = F_RDLCK;
+	res = fcntl(fd, F_SETLK, &fl);
+	kill(pid, SIGTERM);
+	if (waitpid(pid, 0, 0) != pid)
+		err(1, "waitpid");
+	close(pfd[0]);
+	close(pfd[1]);
+
+	FAIL(res == 0);
+	FAIL(errno != EACCES && errno != EAGAIN);
+
+	SUCCEED;
+}
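Tests 2 and 7 through 9 between them cover the full compatibility matrix for POSIX record locks held by different processes. The summary below is editorial, added for orientation, and the one-line helper is merely a restatement of the table, not part of the patch.

#include <fcntl.h>

/*
 * Compatibility of a new lock request against an overlapping lock
 * held by another process:
 *
 *                      held F_RDLCK       held F_WRLCK
 *   request F_RDLCK    granted (test 8)   denied (test 7)
 *   request F_WRLCK    denied  (test 9)   denied (test 2)
 *
 * Only two shared (read) locks are compatible.
 */
static int
compatible(short held, short requested)
{
	return (held == F_RDLCK && requested == F_RDLCK);
}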
+ */ + printf("8 - F_SETLK shared lock on share locked region: "); + + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK, &fl); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + FAIL(res != 0); + + SUCCEED; +} + +/* + * Test 9 - F_SETLK exclusive lock on share locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test9(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("9 - F_SETLK exclusive lock on share locked region: "); + + fl.l_type = F_WRLCK; + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 10 - trying to set bogus pid or sysid values + * + * The l_pid and l_sysid fields are only used with F_GETLK to return + * the process ID of the process holding a blocking lock and the + * system ID of the system that owns that process + */ +static int +test10(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_pid = 9999; + fl.l_sysid = 9999; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + printf("10 - trying to set bogus pid or sysid values: "); + + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + SUCCEED; +} + +/* + * Test 11 - remote locks + * + * XXX temporary interface which will be removed when the kernel lockd + * is added. 
+/*
+ * Test 11 - remote locks
+ *
+ * XXX temporary interface which will be removed when the kernel lockd
+ * is added.
+ */
+static int
+test11(int fd)
+{
+	struct flock fl;
+	int res;
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_pid = 9999;
+	fl.l_sysid = 1;
+
+	printf("11 - remote locks: ");
+
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_sysid = 2;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res == 0);
+	FAIL(errno != EACCES && errno != EAGAIN);
+
+	res = fcntl(fd, F_GETLK, &fl);
+	FAIL(res != 0);
+	FAIL(fl.l_pid != 9999);
+	FAIL(fl.l_sysid != 1);
+
+	fl.l_type = F_UNLCK;
+	fl.l_sysid = 1;
+	fl.l_start = 0;
+	fl.l_len = 0;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_pid = 1234;
+	fl.l_sysid = 1;
+	fl.l_start = 0;
+	fl.l_len = 1;
+	fl.l_whence = SEEK_SET;
+	fl.l_type = F_RDLCK;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_sysid = 2;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_type = F_UNLCKSYS;
+	fl.l_sysid = 1;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_type = F_WRLCK;
+	res = fcntl(fd, F_GETLK, &fl);
+	FAIL(res != 0);
+	FAIL(fl.l_pid != 1234);
+	FAIL(fl.l_sysid != 2);
+
+	fl.l_type = F_UNLCKSYS;
+	fl.l_sysid = 2;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	SUCCEED;
+}
+
+/*
+ * Test 12 - F_SETLKW on locked region which is then unlocked
+ *
+ * If a shared or exclusive lock is blocked by other locks, the
+ * process waits until the request can be satisfied.
+ */
+static int
+test12(int fd)
+{
+	/*
+	 * We create a child process to hold the lock which we will
+	 * test. We use a pipe to communicate with the child.
+	 */
+	int pid;
+	int pfd[2];
+	struct flock fl;
+	char ch;
+	int res;
+
+	if (pipe(pfd) < 0)
+		err(1, "pipe");
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+
+	pid = fork();
+	if (pid < 0)
+		err(1, "fork");
+
+	if (pid == 0) {
+		/*
+		 * We are the child. We set a write lock and then
+		 * write one byte back to the parent to tell it. We
+		 * hold the lock for a second and then exit, which
+		 * releases it.
+		 */
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child)");
+		if (write(pfd[1], "a", 1) < 0)
+			err(1, "writing to pipe (child)");
+
+		sleep(1);
+		exit(0);
+	}
+
+	/*
+	 * Wait until the child has set its lock and then perform the
+	 * test.
+	 */
+	if (read(pfd[0], &ch, 1) != 1)
+		err(1, "reading from pipe (parent)");
+
+	/*
+	 * The child exits after a second, releasing its lock; fcntl
+	 * should block until then and succeed.
+	 */
+	printf("12 - F_SETLKW on locked region which is then unlocked: ");
+
+	res = fcntl(fd, F_SETLKW, &fl);
+	kill(pid, SIGTERM);
+	if (waitpid(pid, 0, 0) != pid)
+		err(1, "waitpid");
+	close(pfd[0]);
+	close(pfd[1]);
+	FAIL(res != 0);
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_UNLCK;
+	if (fcntl(fd, F_SETLK, &fl) < 0)
+		err(1, "F_UNLCK");
+
+	SUCCEED;
+}
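Test 12 passes because a process's record locks are released when it exits. The same ownership rule has a well-known pitfall worth noting here: closing any descriptor for a file releases all of the process's locks on it, not just those taken through that descriptor. A sketch of the trap; the path name is invented and the snippet is not part of the patch.

#include <fcntl.h>
#include <unistd.h>

/*
 * Pitfall sketch: the open/close of fd2 silently releases the lock
 * taken through fd, because POSIX record locks belong to the
 * process, not to the file descriptor.
 */
static void
lock_lost_on_close(void)
{
	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
	int fd, fd2;

	fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		return;
	fcntl(fd, F_SETLK, &fl);	/* l_start/l_len zero: whole file */
	fd2 = open("/tmp/lockdemo", O_RDONLY);
	close(fd2);			/* the lock taken via fd is gone */
	close(fd);
}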
+/*
+ * Test 13 - F_SETLKW on locked region, race with owner
+ *
+ * If a shared or exclusive lock is blocked by other locks, the
+ * process waits until the request can be satisfied. This test
+ * repeatedly races a blocked request against the lock owner
+ * exiting (and thereby releasing its lock).
+ */
+static int
+test13(int fd)
+{
+	/*
+	 * We create a child process to hold the lock which we will
+	 * test. We use a pipe to communicate with the child.
+	 */
+	int i;
+	int pid;
+	int pfd[2];
+	struct flock fl;
+	char ch;
+	int res;
+	struct itimerval itv;
+
+	printf("13 - F_SETLKW on locked region, race with owner: ");
+	fflush(stdout);
+
+	for (i = 0; i < 100; i++) {
+		if (pipe(pfd) < 0)
+			err(1, "pipe");
+
+		fl.l_start = 0;
+		fl.l_len = 0;
+		fl.l_type = F_WRLCK;
+		fl.l_whence = SEEK_SET;
+
+		pid = fork();
+		if (pid < 0)
+			err(1, "fork");
+
+		if (pid == 0) {
+			/*
+			 * We are the child. We set a write lock and
+			 * then write one byte back to the parent to
+			 * tell it. We then exit almost immediately,
+			 * releasing the lock as we go.
+			 */
+			if (fcntl(fd, F_SETLK, &fl) < 0)
+				err(1, "F_SETLK (child)");
+			if (write(pfd[1], "a", 1) < 0)
+				err(1, "writing to pipe (child)");
+
+			usleep(1);
+			exit(0);
+		}
+
+		/*
+		 * Wait until the child has set its lock and then
+		 * perform the test.
+		 */
+		if (read(pfd[0], &ch, 1) != 1)
+			err(1, "reading from pipe (parent)");
+
+		/*
+		 * Arrange for a timer signal to arrive while we are
+		 * (probably) blocked in fcntl. The F_SETLKW must
+		 * either succeed (the child exited first) or fail
+		 * with EINTR (the timer fired first).
+		 */
+		itv.it_interval.tv_sec = 0;
+		itv.it_interval.tv_usec = 0;
+		itv.it_value.tv_sec = 0;
+		itv.it_value.tv_usec = 2;
+		setitimer(ITIMER_REAL, &itv, NULL);
+
+		res = fcntl(fd, F_SETLKW, &fl);
+		kill(pid, SIGTERM);
+		if (waitpid(pid, 0, 0) != pid)
+			err(1, "waitpid");
+		close(pfd[0]);
+		close(pfd[1]);
+		FAIL(!(res == 0 || (res == -1 && errno == EINTR)));
+
+		fl.l_start = 0;
+		fl.l_len = 0;
+		fl.l_type = F_UNLCK;
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_UNLCK");
+	}
+	SUCCEED;
+}
+
+int
+main(int argc, const char *argv[])
+{
+	int fd;
+	struct sigaction sa;
+
+	if (argc != 2) {
+		errx(1, "usage: flock <filename>");
+	}
+
+	fd = make_file(argv[1], 1024);
+
+	sa.sa_handler = ignore_alarm;
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = 0;
+	sigaction(SIGALRM, &sa, 0);
+
+	test1(fd);
+	test2(fd);
+	test3(fd);
+	test4(fd);
+	test5(fd);
+	test6(fd);
+	test7(fd);
+	test8(fd);
+	test9(fd);
+	test10(fd);
+	test11(fd);
+	test12(fd);
+	test13(fd);
+
+	return 0;
+}
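A final note on main(): sa_flags is deliberately left at zero when installing the SIGALRM handler. If SA_RESTART were set, a blocked F_SETLKW would, on most systems, be transparently restarted after the handler ran, and tests 3 and 13, which expect EINTR, could hang. A sketch of the distinction, with an invented handler name; this is editorial, not part of the patch.

#include <signal.h>

static void
on_alarm(int sig)
{
	(void)sig;		/* nothing to do; we only want EINTR */
}

static void
install_alarm_handler(int restartable)
{
	struct sigaction sa;

	sa.sa_handler = on_alarm;
	sigemptyset(&sa.sa_mask);
	/*
	 * sa_flags == 0: a pending fcntl(F_SETLKW) fails with EINTR
	 * when SIGALRM arrives. With SA_RESTART the call is typically
	 * restarted and the caller never sees the interruption.
	 */
	sa.sa_flags = restartable ? SA_RESTART : 0;
	sigaction(SIGALRM, &sa, NULL);
}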