sys/conf/files | 1 + sys/fs/nfsclient/nfs_clbio.c | 62 +++++-- sys/fs/nfsclient/nfs_clvfsops.c | 3 +- sys/kern/kern_rangelock.c | 246 ++++++++++++++++++++++++++++ sys/kern/kern_thread.c | 3 + sys/kern/subr_syscall.c | 6 + sys/kern/vfs_subr.c | 2 + sys/kern/vfs_vnops.c | 338 ++++++++++++++++++++++++++++++++++----- sys/sys/mount.h | 3 + sys/sys/proc.h | 6 +- sys/sys/rangelock.h | 78 +++++++++ sys/sys/vnode.h | 16 +- sys/ufs/ffs/ffs_vfsops.c | 2 +- sys/ufs/ffs/ffs_vnops.c | 6 +- 14 files changed, 709 insertions(+), 63 deletions(-) diff --git a/sys/conf/files b/sys/conf/files index 3c84cf6..843f2dd 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2523,6 +2523,7 @@ kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard kern/kern_racct.c standard +kern/kern_rangelock.c standard kern/kern_rctl.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard diff --git a/sys/fs/nfsclient/nfs_clbio.c b/sys/fs/nfsclient/nfs_clbio.c index fa9636b..df9677c 100644 --- a/sys/fs/nfsclient/nfs_clbio.c +++ b/sys/fs/nfsclient/nfs_clbio.c @@ -722,7 +722,7 @@ ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) }; if (n > 0) { - error = uiomove(bp->b_data + on, (int)n, uio); + error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio); } if (vp->v_type == VLNK) n = 0; @@ -897,8 +897,9 @@ ncl_write(struct vop_write_args *ap) struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int bcount; - int n, on, error = 0; - off_t tmp_off; + int bp_cached, n, on, error = 0; + size_t orig_resid, local_resid; + off_t orig_size, tmp_off; KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, @@ -950,6 +951,11 @@ flush_and_restart: mtx_unlock(&np->n_mtx); } + orig_resid = uio->uio_resid; + mtx_lock(&np->n_mtx); + orig_size = np->n_size; + mtx_unlock(&np->n_mtx); + /* * If IO_APPEND then load uio_offset. We restart here if we cannot * get the append lock. 
@@ -1127,7 +1133,10 @@ again: * normally. */ + bp_cached = 1; if (on == 0 && n == bcount) { + if ((bp->b_flags & B_CACHE) == 0) + bp_cached = 0; bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; @@ -1178,7 +1187,7 @@ again: * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * - * as an optimization we could theoretically maintain + * As an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. @@ -1193,7 +1202,23 @@ again: goto again; } - error = uiomove((char *)bp->b_data + on, n, uio); + local_resid = uio->uio_resid; + error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio); + + if (error != 0 && !bp_cached) { + /* + * This block has no other content than what + * possibly was written by faulty uiomove. + * Release it, forgetting the data pages, to + * prevent the leak of uninitialized data to + * usermode. + */ + bp->b_ioflags |= BIO_ERROR; + brelse(bp); + uio->uio_offset -= local_resid - uio->uio_resid; + uio->uio_resid = local_resid; + break; + } /* * Since this block is being modified, it must be written @@ -1203,17 +1228,18 @@ again: */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); - if (error) { - bp->b_ioflags |= BIO_ERROR; - brelse(bp); - break; - } + /* + * Get the partial update on the progress made from + * uiomove, if error occurred. + */ + if (error != 0) + n = local_resid - uio->uio_resid; /* * Only update dirtyoff/dirtyend if not a degenerate * condition. 
*/ - if (n) { + if (n > 0) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = min(on, bp->b_dirtyoff); bp->b_dirtyend = max((on + n), bp->b_dirtyend); @@ -1242,8 +1268,22 @@ again: } else { bdwrite(bp); } + + if (error != 0) + break; } while (uio->uio_resid > 0 && n > 0); + if (error != 0) { + if (ioflag & IO_UNIT) { + VATTR_NULL(&vattr); + vattr.va_size = orig_size; + /* IO_SYNC is handled implicitly */ + (void)VOP_SETATTR(vp, &vattr, cred); + uio->uio_offset -= orig_resid - uio->uio_resid; + uio->uio_resid = orig_resid; + } + } + return (error); } diff --git a/sys/fs/nfsclient/nfs_clvfsops.c b/sys/fs/nfsclient/nfs_clvfsops.c index af0e33b..966688f 100644 --- a/sys/fs/nfsclient/nfs_clvfsops.c +++ b/sys/fs/nfsclient/nfs_clvfsops.c @@ -1136,7 +1136,8 @@ nfs_mount(struct mount *mp) out: if (!error) { MNT_ILOCK(mp); - mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED); + mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED | + MNTK_NO_IOPF; MNT_IUNLOCK(mp); } return (error); } diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c new file mode 100644 index 0000000..5e02717 --- /dev/null +++ b/sys/kern/kern_rangelock.c @@ -0,0 +1,246 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include + +struct rl_q_entry { + TAILQ_ENTRY(rl_q_entry) rl_q_link; + off_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +static uma_zone_t rl_entry_zone; + +static void +rangelock_sys_init(void) +{ + + rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} +SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); + +static struct rl_q_entry * +rlqentry_alloc(void) +{ + + return (uma_zalloc(rl_entry_zone, M_WAITOK)); +} + +void +rlqentry_free(struct rl_q_entry *rleq) +{ + + uma_zfree(rl_entry_zone, rleq); +} + +void +rangelock_init(struct rangelock *lock) +{ + + TAILQ_INIT(&lock->rl_waiters); + lock->rl_currdep = NULL; +} + +void +rangelock_destroy(struct rangelock *lock) +{ + + KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +} + +/* + * Verifies the supplied rl_q_entries for compatibility. Returns true + * if rangelock queue entries are not compatible, false if they are. + * + * Two entries are compatible if their ranges do not overlap, or both + * entries are for read. 
+ */ +static int +rangelock_incompatible(const struct rl_q_entry *e1, + const struct rl_q_entry *e2) +{ + + if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ && + (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ) + return (0); + if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start) + return (1); + return (0); +} + +/* + * Recalculate the lock->rl_currdep after an unlock. + */ +static void +rangelock_calc_block(struct rangelock *lock) +{ + struct rl_q_entry *entry, *entry1, *whead; + + if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) && + lock->rl_currdep != NULL) + lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link); + for (entry = lock->rl_currdep; entry != NULL; + entry = TAILQ_NEXT(entry, rl_q_link)) { + TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) { + if (rangelock_incompatible(entry, entry1)) + goto out; + if (entry1 == entry) + break; + } + } +out: + lock->rl_currdep = entry; + TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) { + if (whead == lock->rl_currdep) + break; + if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) { + whead->rl_q_flags |= RL_LOCK_GRANTED; + wakeup(whead); + } + } +} + +static void +rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry, + struct mtx *ilk) +{ + + MPASS(lock != NULL && entry != NULL && ilk != NULL); + mtx_assert(ilk, MA_OWNED); + KASSERT(entry != lock->rl_currdep, ("stuck currdep")); + + TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); + rangelock_calc_block(lock); + mtx_unlock(ilk); + if (curthread->td_rlqe == NULL) + curthread->td_rlqe = entry; + else + rlqentry_free(entry); +} + +void +rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk) +{ + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + + mtx_lock(ilk); + rangelock_unlock_locked(lock, cookie, ilk); +} + +/* + * Unlock the sub-range of granted lock. 
+ */ +void * +rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start, + off_t end, struct mtx *ilk) +{ + struct rl_q_entry *entry; + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + entry = cookie; + KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, + ("Unlocking non-granted lock")); + KASSERT(entry->rl_q_start == start, ("wrong start")); + KASSERT(entry->rl_q_end >= end, ("wrong end")); + + mtx_lock(ilk); + if (entry->rl_q_end == end) { + rangelock_unlock_locked(lock, cookie, ilk); + return (NULL); + } + entry->rl_q_end = end; + rangelock_calc_block(lock); + mtx_unlock(ilk); + return (cookie); +} + +/* + * Add the lock request to the queue of the pending requests for + * rangelock. Sleeps until the request can be granted. + */ +static void * +rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode, + struct mtx *ilk) +{ + struct rl_q_entry *entry; + struct thread *td; + + MPASS(lock != NULL && ilk != NULL); + + td = curthread; + if (td->td_rlqe != NULL) { + entry = td->td_rlqe; + td->td_rlqe = NULL; + } else + entry = rlqentry_alloc(); + MPASS(entry != NULL); + entry->rl_q_flags = mode; + entry->rl_q_start = start; + entry->rl_q_end = end; + + mtx_lock(ilk); + /* + * XXXKIB TODO. Check that thread does not try to enqueue a + * lock which is incompatible with other request from the same + * thread. 
+ */ + + TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link); + if (lock->rl_currdep == NULL) + lock->rl_currdep = entry; + rangelock_calc_block(lock); + while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) + msleep(entry, ilk, 0, "range", 0); + mtx_unlock(ilk); + return (entry); +} + +void * +rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) +{ + + return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk)); +} + +void * +rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) +{ + + return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk)); +} diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index d4c5c4c..8116c15 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -199,6 +200,7 @@ thread_init(void *mem, int size, int flags) td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); + td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); td->td_sched = (struct td_sched *)&td[1]; umtx_thread_init(td); @@ -216,6 +218,7 @@ thread_fini(void *mem, int size) td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); + rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index 5b48595..5aee684 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -182,6 +182,12 @@ syscallret(struct thread *td, int error, struct syscall_args *sa __unused) KASSERT(td->td_locks == 0, ("System call %s returning with %d locks held", syscallname(p, sa->code), td->td_locks)); + KASSERT((td->td_pflags & TDP_NOFAULTING) == 0, + ("System call %s returning with pagefaults disabled", + syscallname(p, sa->code))); + KASSERT((td->td_pflags & TDP_NOSLEEPING) == 0, + ("System call %s returning with sleep disabled", + syscallname(p, sa->code))); 
/* * Handle reschedule and other end-of-syscall issues diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a06ba31..8d999c3 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1027,6 +1027,7 @@ alloc: if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } + rangelock_init(&vp->v_rl); *vpp = vp; return (0); @@ -2468,6 +2469,7 @@ vdropl(struct vnode *vp) /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif + rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index d4b60f1..5b4799e 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -65,10 +65,15 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include +#include #include +#include static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; +static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; @@ -77,8 +82,8 @@ static fo_stat_t vn_statfile; static fo_close_t vn_closefile; struct fileops vnops = { - .fo_read = vn_read, - .fo_write = vn_write, + .fo_read = vn_io_fault, + .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, @@ -367,57 +372,56 @@ sequential_heuristic(struct uio *uio, struct file *fp) * Package up an I/O request on a vnode into a uio and do it. 
*/ int -vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, - aresid, td) - enum uio_rw rw; - struct vnode *vp; - void *base; - int len; - off_t offset; - enum uio_seg segflg; - int ioflg; - struct ucred *active_cred; - struct ucred *file_cred; - ssize_t *aresid; - struct thread *td; +vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, + enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; + void *rl_cookie; int error, lock_flags; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) { + rl_cookie = vn_rangelock_rlock(vp, offset, + offset + len); + } else { + rl_cookie = vn_rangelock_wlock(vp, offset, + offset + len); + } mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out; if (MNT_SHARED_WRITES(mp) || - ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; - } else { + else lock_flags = LK_EXCLUSIVE; - } - vn_lock(vp, lock_flags | LK_RETRY); } else - vn_lock(vp, LK_SHARED | LK_RETRY); + lock_flags = LK_SHARED; + vn_lock(vp, lock_flags | LK_RETRY); + } else + rl_cookie = NULL; - } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) @@ -429,7 +433,7 @@ 
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, } #endif if (error == 0) { - if (file_cred) + if (file_cred != NULL) cred = file_cred; else cred = active_cred; @@ -444,10 +448,13 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { - if (rw == UIO_WRITE && vp->v_type != VCHR) - vn_finished_write(mp); VOP_UNLOCK(vp, 0); + if (mp != NULL) + vn_finished_write(mp); } + out: + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); return (error); } @@ -688,29 +695,270 @@ unlock: return (error); } +static const int io_hold_cnt = 16; + /* - * File table truncate routine. + * The vn_io_fault() is a wrapper around vn_read() and vn_write() to + * prevent the following deadlock: + * + * Assume that the thread A reads from the vnode vp1 into userspace + * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is + * currently not resident, then system ends up with the call chain + * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> + * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) + * which establishes lock order vp1->vn_lock, then vp2->vn_lock. + * If, at the same time, thread B reads from vnode vp2 into buffer buf2 + * backed by the pages of vnode vp1, and some page in buf2 is not + * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. + * + * To prevent the lock order reversal and deadlock, vn_io_fault() does + * not allow page faults to happen during VOP_READ() or VOP_WRITE(). + * Instead, it first tries to do the whole range i/o with pagefaults + * disabled. If all pages in the i/o buffer are resident and mapped, + * VOP will succeed (ignoring the genuine filesystem errors). + * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do + * i/o in chunks, with all pages in the chunk prefaulted and held + * using vm_fault_quick_hold_pages(). 
+ * + * Filesystems using this deadlock avoidance scheme should use the + * array of the held pages from uio, saved in the curthread->td_ma, + * instead of doing uiomove(). A helper function + * vn_io_fault_uiomove() converts uiomove request into + * uiomove_fromphys() over td_ma array. + * + * Since vnode locks do not cover the whole i/o anymore , rangelocks + * make the current i/o request atomic with respect to other i/os and + * truncations. */ static int -vn_truncate(fp, length, active_cred, td) - struct file *fp; - off_t length; - struct ucred *active_cred; +vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + vm_page_t ma[io_hold_cnt + 2]; + struct uio *uio_clone, short_uio; + struct iovec short_iovec[1]; + fo_rdwr_t *doio; + struct vnode *vp; + void *rl_cookie; + struct mount *mp; + vm_page_t *prev_td_ma; + int cnt, error, save, saveheld, prev_td_ma_cnt; + vm_offset_t addr, end; + vm_prot_t prot; + size_t len, resid; + ssize_t adv; + + if (uio->uio_rw == UIO_READ) + doio = vn_read; + else + doio = vn_write; + vp = fp->f_vnode; + if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG || + ((mp = vp->v_mount) != NULL && + (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0)) + return (doio(fp, uio, active_cred, flags, td)); + + /* + * The UFS follows IO_UNIT directive and replays back both + * uio_offset and uio_resid if error encountered during the + * operation. But, since the iovec may be already advanced, + * uio is still in the inconsistent state. + * + * Cache a copy of the original uio, which is advanced to redo + * point using UIO_NOCOPY below. 
+ */ + uio_clone = cloneuio(uio); + resid = uio->uio_resid; + + short_uio.uio_segflg = UIO_USERSPACE; + short_uio.uio_rw = uio->uio_rw; + short_uio.uio_td = uio->uio_td; + + if (uio->uio_rw == UIO_READ) { + prot = VM_PROT_WRITE; + rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } else { + prot = VM_PROT_READ; + if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) + /* For appenders, punt and lock the whole range. */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + else + rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } + + save = vm_fault_disable_pagefaults(); + error = doio(fp, uio, active_cred, flags, td); + if (error != EFAULT) + goto out; + + uio_clone->uio_segflg = UIO_NOCOPY; + uiomove(NULL, resid - uio->uio_resid, uio_clone); + uio_clone->uio_segflg = uio->uio_segflg; + + saveheld = curthread_pflags_set(TDP_UIOHELD); + prev_td_ma = td->td_ma; + prev_td_ma_cnt = td->td_ma_cnt; + + while (uio_clone->uio_resid != 0) { + len = uio_clone->uio_iov->iov_len; + if (len == 0) { + KASSERT(uio_clone->uio_iovcnt >= 1, + ("iovcnt underflow")); + uio_clone->uio_iov++; + uio_clone->uio_iovcnt--; + continue; + } + + addr = (vm_offset_t)uio_clone->uio_iov->iov_base; + end = round_page(addr + len); + cnt = howmany(end - trunc_page(addr), PAGE_SIZE); + /* + * Perfectly misaligned address and length could cause + * both start and end of the chunk to use partial + * page. +2 accounts for such situation. 
+ */ + if (cnt > io_hold_cnt + 2) { + len = io_hold_cnt * PAGE_SIZE; + KASSERT(howmany(round_page(addr + len) - + trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2, + ("cnt overflow")); + } + cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, + addr, len, prot, ma, io_hold_cnt + 2); + if (cnt == -1) { + error = EFAULT; + break; + } + short_uio.uio_iov = &short_iovec[0]; + short_iovec[0].iov_base = (void *)addr; + short_uio.uio_iovcnt = 1; + short_uio.uio_resid = short_iovec[0].iov_len = len; + short_uio.uio_offset = uio_clone->uio_offset; + td->td_ma = ma; + td->td_ma_cnt = cnt; + + error = doio(fp, &short_uio, active_cred, flags, td); + vm_page_unhold_pages(ma, cnt); + adv = len - short_uio.uio_resid; + + uio_clone->uio_iov->iov_base = + (char *)uio_clone->uio_iov->iov_base + adv; + uio_clone->uio_iov->iov_len -= adv; + uio_clone->uio_resid -= adv; + uio_clone->uio_offset += adv; + + uio->uio_resid -= adv; + uio->uio_offset += adv; + + if (error != 0 || adv == 0) + break; + } + td->td_ma = prev_td_ma; + td->td_ma_cnt = prev_td_ma_cnt; + curthread_pflags_restore(saveheld); +out: + vm_fault_enable_pagefaults(save); + vn_rangelock_unlock(vp, rl_cookie); + free(uio_clone, M_IOV); + return (error); +} + +/* + * Helper function to perform the requested uiomove operation using + * the held pages for io->uio_iov[0].iov_base buffer instead of + * copyin/copyout. Access to the pages with uiomove_fromphys() + * instead of iov_base prevents page faults that could occur due to + * pmap_collect() invalidating the mapping created by + * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or + * object cleanup revoking the write access from page mappings. + * + * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() + * instead of plain uiomove(). 
+ */ +int +vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) +{ + struct uio transp_uio; + struct iovec transp_iov[1]; struct thread *td; + size_t adv; + int error, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove(data, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + transp_iov[0].iov_base = data; + transp_uio.uio_iov = &transp_iov[0]; + transp_uio.uio_iovcnt = 1; + if (xfersize > uio->uio_resid) + xfersize = uio->uio_resid; + transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; + transp_uio.uio_offset = 0; + transp_uio.uio_segflg = UIO_SYSSPACE; + /* + * Since transp_iov points to data, and td_ma page array + * corresponds to original uio->uio_iov, we need to invert the + * direction of the i/o operation as passed to + * uiomove_fromphys(). + */ + switch (uio->uio_rw) { + case UIO_WRITE: + transp_uio.uio_rw = UIO_READ; + break; + case UIO_READ: + transp_uio.uio_rw = UIO_WRITE; + break; + } + transp_uio.uio_td = uio->uio_td; + error = uiomove_fromphys(td->td_ma, + ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, + xfersize, &transp_uio); + adv = xfersize - transp_uio.uio_resid; + pgadv = + (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - + (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; + uio->uio_iov->iov_len -= adv; + uio->uio_resid -= adv; + uio->uio_offset += adv; + return (error); +} + +/* + * File table truncate routine. + */ +static int +vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) { struct vattr vattr; struct mount *mp; struct vnode *vp; + void *rl_cookie; int vfslocked; int error; vp = fp->f_vnode; + + /* + * Lock the whole range for truncation. 
Otherwise splitted + * i/o might partially happen before, partially after the + * truncation. + */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); - if (error) { - VFS_UNLOCK_GIANT(vfslocked); - return (error); - } + if (error) + goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; @@ -730,7 +978,9 @@ vn_truncate(fp, length, active_cred, td) out: VOP_UNLOCK(vp, 0); vn_finished_write(mp); +out1: VFS_UNLOCK_GIANT(vfslocked); + vn_rangelock_unlock(vp, rl_cookie); return (error); } diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 319e094..9bf8d08 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -369,6 +369,9 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp); #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ #define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */ +#define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads + and writes. Filesystem shall properly + handle i/o state on EFAULT. */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 0873927..0ba8162 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -213,6 +213,7 @@ struct thread { struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ + struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. 
*/ @@ -311,7 +312,9 @@ struct thread { struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ - struct proc *td_rfppwait_p; /* (k) The vforked child */ + struct proc *td_rfppwait_p; /* (k) The vforked child */ + struct vm_page **td_ma; /* (k) uio pages held */ + int td_ma_cnt; /* (k) size of *td_ma */ }; struct mtx *thread_lock_block(struct thread *); @@ -419,6 +422,7 @@ do { \ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ +#define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ /* * Reasons that the current thread can not be run yet. diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..bf82183 --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct rl_q_entry; + +/* + * The structure representing the range lock. Caller may request the + * read or write access to the range of bytes. Access is granted if + * all existing lock owners are compatible with the request. Two lock + * owners are compatible if their ranges do not overlap, or both + * owners are for read. + * + * Access to the structure itself is synchronized with the externally + * supplied mutex. + * + * rl_waiters is the queue of lock requests in the order of arrival. + * rl_currdep is the first lock request that cannot be granted now due + * to the preceding requests conflicting with it. 
+ */ +struct rangelock { + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +#ifdef _KERNEL + +struct mtx; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct rangelock *lock, void *cookie, + struct mtx *ilk); +void *rangelock_unlock_range(struct rangelock *lock, void *cookie, + off_t start, off_t end, struct mtx *ilk); +void *rangelock_rlock(struct rangelock *lock, off_t start, off_t end, + struct mtx *ilk); +void *rangelock_wlock(struct rangelock *lock, off_t start, off_t end, + struct mtx *ilk); +void rlqentry_free(struct rl_q_entry *rlqe); + +#endif /* _KERNEL */ + +#endif /* _SYS_RANGELOCK_H */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 49f6f5b..97e3e29 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -164,7 +165,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ @@ -676,7 +678,17 @@ int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); - +int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); + +#define vn_rangelock_unlock(vp, cookie) \ + rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) +#define vn_rangelock_unlock_range(vp, cookie, start, end) \ + rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ + VI_MTX(vp)) +#define vn_rangelock_rlock(vp, start, end) \ + rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) +#define vn_rangelock_wlock(vp, start, end) \ + rangelock_wlock(&(vp)->v_rl, (start), (end), 
VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 9aff694..fee8012 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1064,7 +1064,7 @@ ffs_mountfs(devvp, mp, td) */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED | - MNTK_EXTENDED_SHARED; + MNTK_EXTENDED_SHARED | MNTK_NO_IOPF; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 0699eef..6259911 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -567,7 +567,7 @@ ffs_read(ap) xfersize = size; } - error = uiomove((char *)bp->b_data + blkoffset, + error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; @@ -738,8 +738,8 @@ ffs_write(ap) if (size < xfersize) xfersize = size; - error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, + (int)xfersize, uio); /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any