diff -urN /Projects/clean/src/lib/libc/gen/lockf.c /Projects/M3/src/lib/libc/gen/lockf.c
--- /Projects/clean/src/lib/libc/gen/lockf.c	2008-01-19 15:54:31.000000000 +0000
+++ /Projects/M3/src/lib/libc/gen/lockf.c	2008-01-30 10:35:45.000000000 +0000
@@ -74,7 +74,7 @@
 	fl.l_type = F_WRLCK;
 	if (_fcntl(filedes, F_GETLK, &fl) == -1)
 		return (-1);
-	if (fl.l_type == F_UNLCK || fl.l_pid == getpid())
+	if (fl.l_type == F_UNLCK || (fl.l_sysid == 0 && fl.l_pid == getpid()))
 		return (0);
 	errno = EAGAIN;
 	return (-1);
diff -urN /Projects/clean/src/lib/libc/sys/fcntl.2 /Projects/M3/src/lib/libc/sys/fcntl.2
--- /Projects/clean/src/lib/libc/sys/fcntl.2	2008-01-19 15:54:32.000000000 +0000
+++ /Projects/M3/src/lib/libc/sys/fcntl.2	2008-01-30 10:35:48.000000000 +0000
@@ -177,6 +177,7 @@
 	pid_t	l_pid;		/* lock owner */
 	short	l_type;		/* lock type: read/write, etc. */
 	short	l_whence;	/* type of l_start */
+	int	l_sysid;	/* remote system id or zero for local */
 };
 .Ed
 The commands available for advisory record locking are as follows:
@@ -264,9 +265,13 @@
 means end edge of the region.
 The
 .Fa l_pid
-field is only used with
+and
+.Fa l_sysid
+fields are only used with
 .Dv F_GETLK
-to return the process ID of the process holding a blocking lock.
+to return the process ID of the process holding a blocking lock and
+the system ID of the system that owns that process.
+Locks created by the local system will have a system ID of zero.
 After a successful
 .Dv F_GETLK
 request, the value of
diff -urN /Projects/clean/src/sys/compat/linux/linux_file.c /Projects/M3/src/sys/compat/linux/linux_file.c
--- /Projects/clean/src/sys/compat/linux/linux_file.c	2008-01-19 15:54:38.000000000 +0000
+++ /Projects/M3/src/sys/compat/linux/linux_file.c	2008-01-30 10:35:05.000000000 +0000
@@ -1051,6 +1051,7 @@
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
+	bsd_flock->l_sysid = 0;
 }
 
 static void
@@ -1107,6 +1108,7 @@
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
+	bsd_flock->l_sysid = 0;
 }
 
 static void
diff -urN /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c /Projects/M3/src/sys/compat/svr4/svr4_fcntl.c
--- /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c	2008-01-19 15:54:40.000000000 +0000
+++ /Projects/M3/src/sys/compat/svr4/svr4_fcntl.c	2008-01-30 10:35:06.000000000 +0000
@@ -191,7 +191,7 @@
 	oflp->l_start = (off_t) iflp->l_start;
 	oflp->l_len = (off_t) iflp->l_len;
 	oflp->l_pid = (pid_t) iflp->l_pid;
-
+	oflp->l_sysid = iflp->l_sysid;
 }
 
 static void
@@ -217,7 +217,7 @@
 	oflp->l_whence = (short) iflp->l_whence;
 	oflp->l_start = (svr4_off64_t) iflp->l_start;
 	oflp->l_len = (svr4_off64_t) iflp->l_len;
-	oflp->l_sysid = 0;
+	oflp->l_sysid = iflp->l_sysid;
 	oflp->l_pid = (svr4_pid_t) iflp->l_pid;
 }
 
diff -urN /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c /Projects/M3/src/sys/i386/ibcs2/ibcs2_fcntl.c
--- /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c	2008-01-19 15:54:41.000000000 +0000
+++ /Projects/M3/src/sys/i386/ibcs2/ibcs2_fcntl.c	2008-01-30 10:35:27.000000000 +0000
@@ -93,7 +93,7 @@
 	iflp->l_whence = (short)flp->l_whence;
 	iflp->l_start = (ibcs2_off_t)flp->l_start;
 	iflp->l_len = (ibcs2_off_t)flp->l_len;
-	iflp->l_sysid = 0;
+	iflp->l_sysid = flp->l_sysid;
 	iflp->l_pid = (ibcs2_pid_t)flp->l_pid;
 }
 
@@ -127,6 +127,7 @@
 		break;
 	}
 	flp->l_whence = iflp->l_whence;
+	flp->l_sysid = iflp->l_sysid;
 }
 
 /* convert iBCS2 mode into NetBSD mode */
diff -urN 
/Projects/clean/src/sys/kern/kern_descrip.c /Projects/M3/src/sys/kern/kern_descrip.c --- /Projects/clean/src/sys/kern/kern_descrip.c 2008-01-19 15:54:42.000000000 +0000 +++ /Projects/M3/src/sys/kern/kern_descrip.c 2008-01-30 10:35:31.000000000 +0000 @@ -316,28 +316,67 @@ fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; + struct oflock ofl; intptr_t arg; int error; + int cmd; error = 0; + cmd = uap->cmd; switch (uap->cmd) { - case F_GETLK: - case F_SETLK: - case F_SETLKW: - error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + case F_OGETLK: + case F_OSETLK: + case F_OSETLKW: + /* + * Convert old flock structure to new. + */ + error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl)); + fl.l_start = ofl.l_start; + fl.l_len = ofl.l_len; + fl.l_pid = ofl.l_pid; + fl.l_type = ofl.l_type; + fl.l_whence = ofl.l_whence; + fl.l_sysid = 0; + + switch (uap->cmd) { + case F_OGETLK: + cmd = F_GETLK; + break; + case F_OSETLK: + cmd = F_SETLK; + break; + case F_OSETLKW: + cmd = F_SETLKW; + break; + } arg = (intptr_t)&fl; break; + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_REMOTE: + error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + arg = (intptr_t)&fl; + break; default: arg = uap->arg; break; } if (error) return (error); - error = kern_fcntl(td, uap->fd, uap->cmd, arg); + error = kern_fcntl(td, uap->fd, cmd, arg); if (error) return (error); - if (uap->cmd == F_GETLK) + if (uap->cmd == F_OGETLK) { + ofl.l_start = fl.l_start; + ofl.l_len = fl.l_len; + ofl.l_pid = fl.l_pid; + ofl.l_type = fl.l_type; + ofl.l_whence = fl.l_whence; + error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl)); + } else if (uap->cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); + } return (error); } @@ -490,11 +529,16 @@ fdrop(fp, td); break; + case F_SETLK_REMOTE: + flg = F_REMOTE; + goto do_setlk; + case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: + do_setlk: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); @@ -550,7 +594,19 @@ break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, - flp, F_POSIX); + flp, flg); + break; + case F_UNLCKSYS: + /* + * Temporary api for testing remote lock + * infrastructure. + */ + if (flg != F_REMOTE) { + error = EINVAL; + break; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCKSYS, flp, flg); break; default: error = EINVAL; diff -urN /Projects/clean/src/sys/kern/kern_lockf.c /Projects/M3/src/sys/kern/kern_lockf.c --- /Projects/clean/src/sys/kern/kern_lockf.c 2008-01-19 15:54:43.000000000 +0000 +++ /Projects/M3/src/sys/kern/kern_lockf.c 2008-01-30 10:35:31.000000000 +0000 @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -57,6 +58,7 @@ */ static int maxlockdepth = MAXDEPTH; +#define LOCKF_DEBUG #ifdef LOCKF_DEBUG #include @@ -80,36 +82,185 @@ lf_getblock(struct lockf *); static int lf_getlock(struct lockf *, struct flock *); static int lf_setlock(struct lockf *, struct vnode *, struct lockf **); -static void lf_split(struct lockf *, struct lockf *, struct lockf **); +static void lf_split(struct lockf *, struct lockf *, struct lockf **, + int addlock); static void lf_wakelock(struct lockf *); +void lf_clearremotesys(int sysid); #ifdef LOCKF_DEBUG static void lf_print(char *, struct lockf *); static void lf_printlist(char *, struct lockf *); +static void lf_print_owner(struct lock_owner *); #endif /* + * This structure is used to keep track of both local and remote lock + * owners. 
The lf_owner field of the struct lockf points back at the + * lock owner structure. Each possible lock owner (local proc for + * POSIX fcntl locks, local file for BSD flock locks or + * pair for remote locks) is represented by a unique instance of + * struct lock_owner. + * + * Locks: + * (l) locked by lf_lock_owners_mutex + * (p) locked by mtx_pool_lock(mtxpool_sleep, lo) + * (c) const until freeing + */ +#define LOCK_OWNER_HASH_SIZE 256 + +struct lock_owner { + LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */ + int lo_refs; /* (l) Number of locks referring to this */ + int lo_flags; /* (c) Flags passwd to lf_advlock */ + caddr_t lo_id; /* (c) Id value passed to lf_advlock */ + pid_t lo_pid; /* (c) Process Id of the lock owner */ + int lo_sysid; /* (c) System Id of the lock owner */ + struct locklist lo_active; /* (p) Active locks for this owner */ + struct locklist lo_pending; /* (p) Pending locks for this owner */ +}; + +LIST_HEAD(lock_owner_list, lock_owner); + +static struct mtx lf_lock_owners_mutex; +static struct lock_owner_list lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */ + +/* + * Initialise the lock owner structures. + */ +static void +lf_init(void *dummy) +{ + int i; + + mtx_init(&lf_lock_owners_mutex, "lock owners lock", NULL, MTX_DEF); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) + LIST_INIT(&lf_lock_owners[i]); +} +SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL) + +/* + * Generate a hash value for a lock owner. + */ +static int +lf_hash_owner(caddr_t id, struct flock *fl, int flags) +{ + uint32_t h; + + if (flags & F_REMOTE) { + h = HASHSTEP(0, fl->l_pid); + h = HASHSTEP(h, fl->l_sysid); + } else if (flags & F_FLOCK) { + h = ((uintptr_t) id) >> 7; + } else { + struct proc *p = (struct proc *) id; + h = HASHSTEP(0, p->p_pid); + h = HASHSTEP(h, 0); + } + + return (h % LOCK_OWNER_HASH_SIZE); +} + +/* + * Return true if a lock owner matches the details passed to + * lf_advlock. + */ +static int +lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl, + int flags) +{ + if (flags & F_REMOTE) { + return lo->lo_pid == fl->l_pid + && lo->lo_sysid == fl->l_sysid; + } else { + return lo->lo_id == id; + } +} + +/* * Advisory record locking support */ int -lf_advlock(ap, head, size) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; - struct lockf **head; - u_quad_t size; +lf_advlock(struct vop_advlock_args *ap, struct lockf **head, u_quad_t size) { struct flock *fl = ap->a_fl; struct lockf *lock; struct vnode *vp = ap->a_vp; + caddr_t id = ap->a_id; + int flags = ap->a_flags; + int hash; + struct lock_owner *lo; off_t start, end, oadd; struct lockf *clean, *n; int error; /* + * Handle the F_UNLKSYS case first - no need to mess about + * creating a lock owner for this one. + */ + if (ap->a_op == F_UNLCKSYS) { + lf_clearremotesys(fl->l_sysid); + return (0); + } + + /* + * Map our arguments to an existing lock owner or create one + * if this is the first time we have seen this owner. + */ + hash = lf_hash_owner(id, fl, flags); + mtx_lock(&lf_lock_owners_mutex); + LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link) + if (lf_owner_matches(lo, id, fl, flags)) + break; + if (!lo) { + /* + * We initialise the lock with a reference + * count of one which refers to the new lockf + * structure created below. 
+ */ + lo = malloc(sizeof(struct lock_owner), + M_LOCKF, M_NOWAIT); + if (!lo) { + mtx_unlock(&lf_lock_owners_mutex); + return (ENOMEM); + } + + lo->lo_refs = 1; + lo->lo_flags = flags; + lo->lo_id = id; + if (flags & F_REMOTE) { + lo->lo_pid = fl->l_pid; + lo->lo_sysid = fl->l_sysid; + } else if (flags & F_FLOCK) { + lo->lo_pid = -1; + lo->lo_sysid = 0; + } else { + struct proc *p = (struct proc *) id; + lo->lo_pid = p->p_pid; + lo->lo_sysid = 0; + } + TAILQ_INIT(&lo->lo_active); + TAILQ_INIT(&lo->lo_pending); + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + printf("lf_advlock: new lock owner %p ", lo); + lf_print_owner(lo); + printf("\n"); + } +#endif + + LIST_INSERT_HEAD(&lf_lock_owners[hash], + lo, lo_link); + } else { + /* + * We have seen this lock owner before, + * increase its reference count to account for + * the new lockf struct we create below. + */ + lo->lo_refs++; + } + mtx_unlock(&lf_lock_owners_mutex); + + /* * Convert the flock structure into a start and end. */ switch (fl->l_whence) { @@ -165,6 +316,7 @@ clean = NULL; if (ap->a_op == F_SETLK || ap->a_op == F_UNLCK) { MALLOC(clean, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + clean->lf_owner = 0; clean->lf_next = NULL; } /* @@ -173,7 +325,8 @@ MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); lock->lf_start = start; lock->lf_end = end; - lock->lf_id = ap->a_id; + lock->lf_owner = lo; + lock->lf_vnode = vp; /* * XXX The problem is that VTOI is ufs specific, so it will * break LOCKF_DEBUG for all other FS's other than UFS because @@ -186,6 +339,7 @@ lock->lf_next = (struct lockf *)0; TAILQ_INIT(&lock->lf_blkhd); lock->lf_flags = ap->a_flags; + /* * Do the requested operation. */ @@ -215,6 +369,31 @@ } VI_UNLOCK(vp); for (lock = clean; lock != NULL; ) { + /* + * Adjust the lock_owner reference count and + * reclaim the entry if this is the last lock + * for that owner. + */ + struct lock_owner *lo = lock->lf_owner; + if (lo) { + mtx_lock(&lf_lock_owners_mutex); + lo->lo_refs--; + if (lo->lo_refs == 0) { +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + printf("lf_advlock: freeing lock owner %p\n", + lo); +#endif + KASSERT(TAILQ_EMPTY(&lo->lo_active), + ("freeing lock owner with active locks")); + KASSERT(TAILQ_EMPTY(&lo->lo_pending), + ("freeing lock owner with pending locks")); + LIST_REMOVE(lo, lo_link); + free(lo, M_LOCKF); + } + mtx_unlock(&lf_lock_owners_mutex); + } + n = lock->lf_next; free(lock, M_LOCKF); lock = n; @@ -226,10 +405,7 @@ * Set a byte-range lock. */ static int -lf_setlock(lock, vp, clean) - struct lockf *lock; - struct vnode *vp; - struct lockf **clean; +lf_setlock(struct lockf *lock, struct vnode *vp, struct lockf **clean) { struct lockf *block; struct lockf **head = lock->lf_head; @@ -267,48 +443,57 @@ * For byte-range locks we must check for deadlock. * * Deadlock detection is done by looking through the - * wait channels to see if there are any cycles that - * involve us. MAXDEPTH is set just to make sure we - * do not go off into neverland. + * lock owner pending lists to see if there are any + * cycles that involve us. MAXDEPTH is set just to + * make sure we do not go off into neverland. + * + * This algorithm is simplistic - it only considers + * the first blocking lock and it doesn't follow all + * paths through the lock graph. 
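+		 * The loop below starts at the owner of the blocking
+		 * lock and repeatedly follows one of that owner's
+		 * pending requests to the owner of the lock it is
+		 * waiting for; if the chain reaches our own lock owner
+		 * within maxlockdepth steps, the request fails with
+		 * EDEADLK.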
*/ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - struct proc *wproc; - struct proc *nproc; - struct thread *td; struct lockf *waitblock; - int i = 0; - - /* The block is waiting on something */ - wproc = (struct proc *)block->lf_id; -restart: - nproc = NULL; - PROC_SLOCK(wproc); - FOREACH_THREAD_IN_PROC(wproc, td) { - thread_lock(td); - while (td->td_wchan && - (td->td_wmesg == lockstr) && - (i++ < maxlockdepth)) { - waitblock = (struct lockf *)td->td_wchan; - /* Get the owner of the blocking lock */ - waitblock = waitblock->lf_next; - if ((waitblock->lf_flags & F_POSIX) == 0) - break; - nproc = (struct proc *)waitblock->lf_id; - if (nproc == (struct proc *)lock->lf_id) { - PROC_SUNLOCK(wproc); - thread_unlock(td); + struct lockf *nblock; + struct lock_owner *lo; + struct lock_owner *nlo; + int i; + + lo = block->lf_owner; + i = 0; + while (lo) { + if (i++ == maxlockdepth) + break; + mtx_pool_lock(mtxpool_sleep, lo); + nlo = NULL; + TAILQ_FOREACH(waitblock, &lo->lo_pending, + lf_olock) { + /* + * Get the owner of the + * blocking lock. + * + * XXX this is unsafe - if + * waitblock is on a different + * vnode to this one, our + * vnode interlock will not + * protect us against changes + * to waitblock->lf_next. + */ + nblock = waitblock->lf_next; + if ((nblock->lf_flags & F_POSIX) == 0) + continue; + nlo = nblock->lf_owner; + if (nlo == lock->lf_owner) { + mtx_pool_unlock(mtxpool_sleep, + lo); lock->lf_next = *clean; *clean = lock; return (EDEADLK); } } - thread_unlock(td); + mtx_pool_unlock(mtxpool_sleep, block->lf_owner); + lo = nlo; } - PROC_SUNLOCK(wproc); - wproc = nproc; - if (wproc) - goto restart; } /* * For flock type locks, we must first remove @@ -327,6 +512,9 @@ */ lock->lf_next = block; TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_pending, lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 1) { lf_print("lf_setlock: blocking on", block); @@ -344,6 +532,10 @@ */ if (lock->lf_next) { TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_REMOVE(&lock->lf_owner->lo_pending, lock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); lock->lf_next = NOLOCKF; } if (error) { @@ -381,6 +573,10 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } break; @@ -412,8 +608,12 @@ *prev = lock; lock->lf_next = overlap; overlap->lf_start = lock->lf_end + 1; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } else - lf_split(overlap, lock, clean); + lf_split(overlap, lock, clean, TRUE); lf_wakelock(overlap); break; @@ -438,13 +638,21 @@ /* * Add the new lock if necessary and delete the overlap. 
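+		 * The overlap was found with the SELF flag set, so it
+		 * is owned by the same lock owner as the new lock (the
+		 * KASSERT below checks this) and a single owner mutex
+		 * covers both of the active list updates.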
*/ + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + KASSERT(lock->lf_owner == overlap->lf_owner, + ("unexpected lock owner for overlap")); if (needtolink) { *prev = lock; lock->lf_next = overlap->lf_next; prev = &lock->lf_next; + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); needtolink = 0; } else *prev = overlap->lf_next; + TAILQ_REMOVE(&lock->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); overlap->lf_next = *clean; *clean = overlap; continue; @@ -457,6 +665,10 @@ overlap->lf_next = lock; overlap->lf_end = lock->lf_start - 1; prev = &lock->lf_next; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); lf_wakelock(overlap); needtolink = 0; continue; @@ -468,6 +680,10 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } overlap->lf_start = lock->lf_end + 1; lf_wakelock(overlap); @@ -491,12 +707,10 @@ * and remove it (or shrink it), then wakeup anyone we can. */ static int -lf_clearlock(unlock, clean) - struct lockf *unlock; - struct lockf **clean; +lf_clearlock(struct lockf *unlock, struct lockf **clean) { struct lockf **head = unlock->lf_head; - register struct lockf *lf = *head; + struct lockf *lf = *head; struct lockf *overlap, **prev; int ovcase; @@ -521,6 +735,10 @@ *prev = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); break; case 2: /* overlap contains lock: split it */ @@ -528,8 +746,7 @@ overlap->lf_start = unlock->lf_end + 1; break; } - lf_split(overlap, unlock, clean); - overlap->lf_next = unlock->lf_next; + lf_split(overlap, unlock, clean, FALSE); break; case 3: /* lock contains overlap */ @@ -537,6 +754,10 @@ lf = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); continue; case 4: /* overlap starts before lock */ @@ -563,11 +784,9 @@ * and if so return its process identifier. */ static int -lf_getlock(lock, fl) - register struct lockf *lock; - register struct flock *fl; +lf_getlock(struct lockf *lock, struct flock *fl) { - register struct lockf *block; + struct lockf *block; #ifdef LOCKF_DEBUG if (lockf_debug & 1) @@ -582,10 +801,8 @@ fl->l_len = 0; else fl->l_len = block->lf_end - block->lf_start + 1; - if (block->lf_flags & F_POSIX) - fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; - else - fl->l_pid = -1; + fl->l_pid = block->lf_owner->lo_pid; + fl->l_sysid = block->lf_owner->lo_sysid; } else { fl->l_type = F_UNLCK; } @@ -597,8 +814,7 @@ * return the first blocking lock. */ static struct lockf * -lf_getblock(lock) - register struct lockf *lock; +lf_getblock(struct lockf *lock) { struct lockf **prev, *overlap, *lf = *(lock->lf_head); int ovcase; @@ -627,12 +843,8 @@ * may be more than one. 
*/ static int -lf_findoverlap(lf, lock, type, prev, overlap) - register struct lockf *lf; - struct lockf *lock; - int type; - struct lockf ***prev; - struct lockf **overlap; +lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, + struct lockf ***prev, struct lockf **overlap) { off_t start, end; @@ -646,8 +858,8 @@ start = lock->lf_start; end = lock->lf_end; while (lf != NOLOCKF) { - if (((type & SELF) && lf->lf_id != lock->lf_id) || - ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + if (((type & SELF) && lf->lf_owner != lock->lf_owner) || + ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) { *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; @@ -733,14 +945,13 @@ } /* - * Split a lock and a contained region into - * two or three locks as necessary. + * Split a lock and a contained region into two or three locks as + * necessary. If addlock is TRUE, lock2 is being set so it must be + * added to the list, otherwise it is being cleared. */ static void -lf_split(lock1, lock2, split) - struct lockf *lock1; - struct lockf *lock2; - struct lockf **split; +lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **split, + int addlock) { struct lockf *splitlock; @@ -755,13 +966,16 @@ */ if (lock1->lf_start == lock2->lf_start) { lock1->lf_start = lock2->lf_end + 1; - lock2->lf_next = lock1; + if (addlock) + lock2->lf_next = lock1; return; } if (lock1->lf_end == lock2->lf_end) { lock1->lf_end = lock2->lf_start - 1; - lock2->lf_next = lock1->lf_next; - lock1->lf_next = lock2; + if (addlock) { + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + } return; } /* @@ -773,6 +987,15 @@ KASSERT(splitlock != NULL, ("no split")); *split = splitlock->lf_next; bcopy(lock1, splitlock, sizeof *splitlock); + + /* + * Update the lock owner reference count to account for the + * new lock. 
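+	 * The reference is dropped again when the split-off entry is
+	 * eventually unlocked and released through the clean list in
+	 * lf_advlock().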
+ */ + mtx_lock(&lf_lock_owners_mutex); + splitlock->lf_owner->lo_refs++; + mtx_unlock(&lf_lock_owners_mutex); + splitlock->lf_start = lock2->lf_end + 1; TAILQ_INIT(&splitlock->lf_blkhd); lock1->lf_end = lock2->lf_start - 1; @@ -780,23 +1003,37 @@ * OK, now link it in */ splitlock->lf_next = lock1->lf_next; - lock2->lf_next = splitlock; - lock1->lf_next = lock2; + mtx_pool_lock(mtxpool_sleep, lock1->lf_owner); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, splitlock, lf_olock); + if (addlock) { + KASSERT(lock1->lf_owner == lock2->lf_owner, + ("unexpected lock owner for split")); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, lock2, + lf_olock); + lock2->lf_next = splitlock; + lock1->lf_next = lock2; + } else { + lock1->lf_next = splitlock; + } + mtx_pool_unlock(mtxpool_sleep, lock1->lf_owner); } /* * Wakeup a blocklist */ static void -lf_wakelock(listhead) - struct lockf *listhead; +lf_wakelock(struct lockf *listhead) { - register struct lockf *wakelock; + struct lockf *wakelock; while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { wakelock = TAILQ_FIRST(&listhead->lf_blkhd); TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); wakelock->lf_next = NOLOCKF; + mtx_pool_lock(mtxpool_sleep, wakelock->lf_owner); + TAILQ_REMOVE(&wakelock->lf_owner->lo_pending, wakelock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, wakelock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_wakelock: awakening", wakelock); @@ -805,21 +1042,94 @@ } } +struct clearlock { + STAILQ_ENTRY(clearlock) link; + struct vnode *vp; + struct flock fl; +}; +STAILQ_HEAD(clearlocklist, clearlock); + +void +lf_clearremotesys(int sysid) +{ + int i; + struct lock_owner *lo; + struct lockf *lf; + struct clearlock *cl; + struct clearlocklist locks; + + KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS")); + + /* + * In order to keep the locking simple, we iterate over the + * active lock lists to build a list of locks that need + * releasing. We then call VOP_ADVLOCK for each one in turn. + */ + STAILQ_INIT(&locks); + mtx_lock(&lf_lock_owners_mutex); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) { + LIST_FOREACH(lo, &lf_lock_owners[i], lo_link) { + if (lo->lo_sysid != sysid) + continue; + + mtx_pool_lock(mtxpool_sleep, lo); + TAILQ_FOREACH(lf, &lo->lo_active, lf_olock) { + cl = malloc(sizeof(struct clearlock), + M_LOCKF, M_NOWAIT); + if (!cl) + continue; + cl->vp = lf->lf_vnode; + cl->fl.l_start = lf->lf_start; + if (lf->lf_end == -1) + cl->fl.l_len = 0; + else + cl->fl.l_len = + lf->lf_end - lf->lf_start; + cl->fl.l_whence = SEEK_SET; + cl->fl.l_type = F_UNLCK; + cl->fl.l_pid = lo->lo_pid; + cl->fl.l_sysid = sysid; + STAILQ_INSERT_TAIL(&locks, cl, link); + } + mtx_pool_unlock(mtxpool_sleep, lo); + } + } + mtx_unlock(&lf_lock_owners_mutex); + + while ((cl = STAILQ_FIRST(&locks)) != NULL) { + STAILQ_REMOVE_HEAD(&locks, link); + VOP_ADVLOCK(cl->vp, 0, F_UNLCK, &cl->fl, F_REMOTE); + free(cl, M_LOCKF); + } +} + #ifdef LOCKF_DEBUG /* + * Print description of a lock owner + */ +static void +lf_print_owner(struct lock_owner *lo) +{ + + if (lo->lo_flags & F_REMOTE) { + printf("remote pid %d, system %d", + lo->lo_pid, lo->lo_sysid); + } else if (lo->lo_flags & F_FLOCK) { + printf("file %p", lo->lo_id); + } else { + printf("local pid %d", lo->lo_pid); + } +} + +/* * Print out a lock. 
*/ static void -lf_print(tag, lock) - char *tag; - register struct lockf *lock; +lf_print(char *tag, struct lockf *lock) { printf("%s: lock %p for ", tag, (void *)lock); - if (lock->lf_flags & F_POSIX) - printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); - else - printf("id %p", (void *)lock->lf_id); + lf_print_owner(lock->lf_owner); if (lock->lf_inode != (struct inode *)0) printf(" in ino %ju on dev <%s>, %s, start %jd, end %jd", (uintmax_t)lock->lf_inode->i_number, @@ -841,11 +1151,9 @@ } static void -lf_printlist(tag, lock) - char *tag; - struct lockf *lock; +lf_printlist(char *tag, struct lockf *lock) { - register struct lockf *lf, *blk; + struct lockf *lf, *blk; if (lock->lf_inode == (struct inode *)0) return; @@ -855,11 +1163,7 @@ devtoname(lock->lf_inode->i_dev)); for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { printf("\tlock %p for ",(void *)lf); - if (lf->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)lf->lf_id)->p_pid); - else - printf("id %p", (void *)lf->lf_id); + lf_print_owner(lock->lf_owner); printf(", %s, start %jd, end %jd", lf->lf_type == F_RDLCK ? "shared" : lf->lf_type == F_WRLCK ? "exclusive" : @@ -867,11 +1171,7 @@ "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { printf("\n\t\tlock request %p for ", (void *)blk); - if (blk->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)blk->lf_id)->p_pid); - else - printf("id %p", (void *)blk->lf_id); + lf_print_owner(blk->lf_owner); printf(", %s, start %jd, end %jd", blk->lf_type == F_RDLCK ? "shared" : blk->lf_type == F_WRLCK ? "exclusive" : diff -urN /Projects/clean/src/sys/sys/fcntl.h /Projects/M3/src/sys/sys/fcntl.h --- /Projects/clean/src/sys/sys/fcntl.h 2008-01-19 15:54:44.000000000 +0000 +++ /Projects/M3/src/sys/sys/fcntl.h 2008-01-30 10:35:43.000000000 +0000 @@ -178,9 +178,13 @@ #define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ #define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ #endif -#define F_GETLK 7 /* get record locking information */ -#define F_SETLK 8 /* set record locking information */ -#define F_SETLKW 9 /* F_SETLK; wait if blocked */ +#define F_OGETLK 7 /* get record locking information */ +#define F_OSETLK 8 /* set record locking information */ +#define F_OSETLKW 9 /* F_SETLK; wait if blocked */ +#define F_GETLK 10 /* get record locking information */ +#define F_SETLK 11 /* set record locking information */ +#define F_SETLKW 12 /* F_SETLK; wait if blocked */ +#define F_SETLK_REMOTE 13 /* debugging support for remote locks */ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ @@ -189,10 +193,12 @@ #define F_RDLCK 1 /* shared or read lock */ #define F_UNLCK 2 /* unlock */ #define F_WRLCK 3 /* exclusive or write lock */ +#define F_UNLCKSYS 4 /* purge locks for a given system ID */ #ifdef _KERNEL #define F_WAIT 0x010 /* Wait until lock is granted */ #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ +#define F_REMOTE 0x080 /* Lock owner is remote NFS client */ #endif /* @@ -205,6 +211,19 @@ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ + int l_sysid; /* remote system id or zero for local */ +}; + +/* + * Old advisory file segment locking data type, + * before adding l_sysid. 
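+ * The F_OGETLK, F_OSETLK and F_OSETLKW commands still take this
+ * layout; the fcntl() code in kern_descrip.c converts it to and from
+ * the new struct flock so that existing binaries keep working.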
+ */ +struct oflock { + off_t l_start; /* starting offset */ + off_t l_len; /* len = 0 means until end of file */ + pid_t l_pid; /* lock owner */ + short l_type; /* lock type: read/write, etc. */ + short l_whence; /* type of l_start */ }; diff -urN /Projects/clean/src/sys/sys/lockf.h /Projects/M3/src/sys/sys/lockf.h --- /Projects/clean/src/sys/sys/lockf.h 2008-01-19 15:44:41.000000000 +0000 +++ /Projects/M3/src/sys/sys/lockf.h 2008-01-30 10:35:43.000000000 +0000 @@ -53,12 +53,14 @@ short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ off_t lf_start; /* Byte # of the start of the lock */ off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ - caddr_t lf_id; /* Id of the resource holding the lock */ + struct lock_owner *lf_owner; /* Owner of the lock */ + struct vnode *lf_vnode; /* File being locked (only valid for active lock) */ struct lockf **lf_head; /* Back pointer to the head of the lockf list */ struct inode *lf_inode; /* Back pointer to the inode */ struct lockf *lf_next; /* Pointer to the next lock on this inode */ struct locklist lf_blkhd; /* List of requests blocked on this lock */ TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ + TAILQ_ENTRY(lockf) lf_olock;/* Linkage for owner lock lists */ }; /* Maximum length of sleep chains to traverse to try and detect deadlock. */ diff -urN /Projects/clean/src/tools/regression/file/flock/Makefile /Projects/M3/src/tools/regression/file/flock/Makefile --- /Projects/clean/src/tools/regression/file/flock/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M3/src/tools/regression/file/flock/Makefile 2008-01-30 10:35:49.000000000 +0000 @@ -0,0 +1,8 @@ +# $FreeBSD$ + +PROG= flock +NO_MAN= +WARNS?= 6 +DEBUG_FLAGS= -g -O0 + +.include diff -urN /Projects/clean/src/tools/regression/file/flock/flock.c /Projects/M3/src/tools/regression/file/flock/flock.c --- /Projects/clean/src/tools/regression/file/flock/flock.c 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M3/src/tools/regression/file/flock/flock.c 2008-01-30 10:35:49.000000000 +0000 @@ -0,0 +1,931 @@ +/*- + * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ + * Authors: Doug Rabson + * Developed with Red Inc: Alfred Perlstein + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#define HAVE_SYSID +#include +#else +#define __unused +#endif + +static int +make_file(const char *dir, off_t sz) +{ + const char *template = "/flocktempXXXXXX"; + size_t len; + char *filename; + int fd; + + len = strlen(dir) + strlen(template) + 1; + filename = malloc(len); + strcpy(filename, dir); + strcat(filename, template); + fd = mkstemp(filename); + if (fd < 0) + err(1, "mkstemp"); + if (ftruncate(fd, sz) < 0) + err(1, "ftruncate"); + if (unlink(filename) < 0) + err(1, "unlink"); + free(filename); + + return (fd); +} + +static void +ignore_alarm(int __unused sig) +{ +} + +#define FAIL(test) \ + do { \ + if (test) { \ + printf("FAIL (%s)\n", #test); \ + return -1; \ + } \ + } while (0) + +#define SUCCEED \ + do { printf("SUCCEED\n"); return 0; } while (0) + +/* + * Test 1 - F_GETLK on unlocked region + * + * If no lock is found that would prevent this lock from being + * created, the structure is left unchanged by this function call + * except for the lock type which is set to F_UNLCK. + */ +static int +test1(int fd) +{ + struct flock fl1, fl2; + + memset(&fl1, 1, sizeof(fl1)); + fl1.l_type = F_WRLCK; + fl1.l_whence = SEEK_SET; + fl2 = fl1; + + if (fcntl(fd, F_GETLK, &fl1) < 0) + err(1, "F_GETLK"); + + printf("1 - F_GETLK on unlocked region: "); + FAIL(fl1.l_start != fl2.l_start); + FAIL(fl1.l_len != fl2.l_len); + FAIL(fl1.l_pid != fl2.l_pid); + FAIL(fl1.l_type != F_UNLCK); + FAIL(fl1.l_whence != fl2.l_whence); +#ifdef HAVE_SYSID + FAIL(fl1.l_sysid != fl2.l_sysid); +#endif + + SUCCEED; +} + +/* + * Test 2 - F_SETLK on locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test2(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return -1 with errno set to either EACCES or + * EAGAIN. + */ + printf("2 - F_SETLK on locked region: "); + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 3 - F_SETLKW on locked region + * + * If a shared or exclusive lock is blocked by other locks, the + * process waits until the request can be satisfied. + * + * XXX this test hangs on FreeBSD NFS filesystems due to limitations + * in FreeBSD's client (and server) lockd implementation. + */ +static int +test3(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. 
+ */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("3 - F_SETLKW on locked region: "); + + alarm(1); + + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EINTR); + + SUCCEED; +} + +/* + * Test 4 - F_GETLK on locked region + * + * Get the first lock that blocks the lock. + */ +static int +test4(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 99; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return a lock structure reflecting the lock we + * made in the child process. + */ + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + printf("4 - F_GETLK on locked region: "); + FAIL(fl.l_start != 0); + FAIL(fl.l_len != 99); + FAIL(fl.l_type != F_WRLCK); + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + SUCCEED; +} + +/* + * Test 5 - F_SETLKW simple deadlock + * + * If a blocking shared lock request would cause a deadlock (i.e. the + * lock request is blocked by a process which is itself blocked on a + * lock currently owned by the process making the new request), + * EDEADLK is returned. + */ +static int +test5(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. Because our test relies on the child process being + * blocked on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to given the + * child a chance to setup. + * + * To create the deadlock condition, we arrange for the parent + * to lock the first byte of the file and the child to lock + * the second byte. After locking the second byte, the child + * will attempt to lock the first byte of the file, and + * block. The parent will then attempt to lock the second byte + * (owned by the child) which should cause deadlock. 
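+	 *
+	 * In short: the parent holds byte 0 and requests byte 1; the
+	 * child holds byte 1 and is blocked requesting byte 0.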
+ */ + int pid; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * Lock the second byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child)"); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + sleep(1); + + /* + * fcntl should immediately return -1 with errno set to EDEADLK. + */ + printf("5 - F_SETLKW simple deadlock: "); + + fl.l_start = 1; + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + + FAIL(res == 0); + FAIL(errno != EDEADLK); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + SUCCEED; +} + +/* + * Test 6 - F_SETLKW complex deadlock. + * + * This test involves three process, P, C1 and C2. We set things up so + * that P locks byte zero, C1 locks byte 1 and C2 locks byte 2. We + * also block C2 by attempting to lock byte zero. Lastly, P attempts + * to lock a range including byte 1 and 2. This represents a deadlock + * (due to C2's blocking attempt to lock byte zero). + */ +static int +test6(int fd) +{ + /* + * Because our test relies on the child process being blocked + * on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to given the + * children a chance to setup. + */ + int pid1, pid2; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid1 = fork(); + if (pid1 < 0) + err(1, "fork"); + + if (pid1 == 0) { + /* + * C1 + * Lock the second byte in the child and then sleep + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child1)"); + pause(); + exit(0); + } + + pid2 = fork(); + if (pid2 < 0) + err(1, "fork"); + + if (pid2 == 0) { + /* + * C2 + * Lock the third byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 2; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child2)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child2)"); + exit(0); + } + + /* + * Wait until the children have set their locks and then + * perform the test. + */ + sleep(1); + + /* + * fcntl should immediately return -1 with errno set to + * EDEADLK. If the alarm fires, we failed to detect the + * deadlock. + */ + alarm(1); + printf("6 - F_SETLKW complex deadlock: "); + + fl.l_start = 1; + fl.l_len = 2; + res = fcntl(fd, F_SETLKW, &fl); + kill(pid1, SIGTERM); + if (waitpid(pid1, 0, 0) != pid1) + err(1, "waitpid"); + kill(pid2, SIGTERM); + if (waitpid(pid2, 0, 0) != pid2) + err(1, "waitpid"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + FAIL(res == 0); + FAIL(errno != EDEADLK); + + /* + * Cancel the alarm to avoid confusing later tests. 
+ */ + alarm(0); + + SUCCEED; +} + +/* + * Test 7 - F_SETLK shared lock on exclusive locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test7(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("7 - F_SETLK shared lock on exclusive locked region: "); + + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 8 - F_SETLK shared lock on share locked region + * + * When a shared lock is set on a segment of a file, other processes + * shall be able to set shared locks on that segment or a portion of + * it. + */ +static int +test8(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("8 - F_SETLK shared lock on share locked region: "); + + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK, &fl); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + FAIL(res != 0); + + SUCCEED; +} + +/* + * Test 9 - F_SETLK exclusive lock on share locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test9(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. 
+ */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("9 - F_SETLK exclusive lock on share locked region: "); + + fl.l_type = F_WRLCK; + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 10 - trying to set bogus pid or sysid values + * + * The l_pid and l_sysid fields are only used with F_GETLK to return + * the process ID of the process holding a blocking lock and the + * system ID of the system that owns that process + */ +static int +test10(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_pid = 9999; + fl.l_sysid = 9999; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + printf("10 - trying to set bogus pid or sysid values: "); + + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + SUCCEED; +} + +/* + * Test 11 - remote locks + * + * XXX temporary interface which will be removed when the kernel lockd + * is added. 
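+ *
+ * The test registers locks for two fake system IDs with
+ * F_SETLK_REMOTE, checks that they conflict with each other and are
+ * reported by F_GETLK, and then removes them again with F_UNLCK and
+ * F_UNLCKSYS.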
+ */ +static int +test11(int fd) +{ + struct flock fl; + int res; + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_pid = 9999; + fl.l_sysid = 1; + + printf("11 - remote locks: "); + + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_sysid = 2; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + res = fcntl(fd, F_GETLK, &fl); + FAIL(res != 0); + FAIL(fl.l_pid != 9999); + FAIL(fl.l_sysid != 1); + + fl.l_type = F_UNLCK; + fl.l_sysid = 1; + fl.l_start = 0; + fl.l_len = 0; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_pid = 1234; + fl.l_sysid = 1; + fl.l_start = 0; + fl.l_len = 1; + fl.l_whence = SEEK_SET; + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_sysid = 2; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_type = F_UNLCKSYS; + fl.l_sysid = 1; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_type = F_WRLCK; + res = fcntl(fd, F_GETLK, &fl); + FAIL(res != 0); + FAIL(fl.l_pid != 1234); + FAIL(fl.l_sysid != 2); + + fl.l_type = F_UNLCKSYS; + fl.l_sysid = 2; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + SUCCEED; +} + +int +main(int argc, const char *argv[]) +{ + int fd; + struct sigaction sa; + + if (argc != 2) { + errx(1, "usage: flock "); + } + + fd = make_file(argv[1], 1024); + + sa.sa_handler = ignore_alarm; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGALRM, &sa, 0); + + test1(fd); + test2(fd); + test3(fd); + test4(fd); + test5(fd); + test6(fd); + test7(fd); + test8(fd); + test9(fd); + test10(fd); + test11(fd); + + return 0; +}
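
As a usage illustration only (not part of the patch), here is a minimal sketch of how an application might act on the extended F_GETLK result. It assumes the l_sysid field and the F_SETLK_REMOTE definition added to sys/fcntl.h above; the file name lockprobe.c and the use of F_SETLK_REMOTE as a feature-test macro are assumptions made for the example, not part of any API.

/* lockprobe.c - report who holds a write lock on a file (illustrative only) */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct flock fl;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: lockprobe filename\n");
		return (1);
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* Probe the whole file for a conflicting write lock. */
	fl.l_start = 0;
	fl.l_len = 0;
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	if (fcntl(fd, F_GETLK, &fl) < 0) {
		perror("F_GETLK");
		return (1);
	}

	if (fl.l_type == F_UNLCK) {
		printf("no conflicting lock\n");
	} else {
#ifdef F_SETLK_REMOTE	/* this tree has struct flock.l_sysid */
		if (fl.l_sysid != 0)
			printf("blocked by remote owner: pid %d, sysid %d\n",
			    (int)fl.l_pid, fl.l_sysid);
		else
#endif
			printf("blocked by local pid %d\n", (int)fl.l_pid);
	}
	close(fd);
	return (0);
}

Built against unpatched headers the #ifdef branch drops out and the program simply reports the owning pid, matching the pre-patch behaviour.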