diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index fee3caf..8390526 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -742,7 +742,7 @@ trap_pfault(frame, usermode) PROC_UNLOCK(p); /* Fault in the user page: */ - rv = vm_fault(map, va, ftype, + rv = vm_fault(map, eva, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); @@ -754,7 +754,7 @@ trap_pfault(frame, usermode) * Don't have to worry about process locking or stacks in the * kernel. */ - rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); + rv = vm_fault(map, eva, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); diff --git a/sys/conf/files b/sys/conf/files index a3bd42f..3b9fca5 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1931,6 +1931,7 @@ kern/kern_poll.c optional device_polling kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard +kern/kern_rangelock.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard kern/kern_rwlock.c standard diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c index 76237fb..0d0ef86 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -91,7 +91,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c index a54598c..1c3953d 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c @@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #define MAX_SCHEDULE_TIMEOUT 300 diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c deleted file mode 100644 index e7a3893..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_vm.c +++ /dev/null @@ -1,166 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- -***************************************************************************/ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -/* - * This routine takes a user's map, array of pages, number of pages, and flags - * and then does the following: - * - validate that the user has access to those pages (flags indicates read - * or write) - if not fail - * - validate that count is enough to hold range number of pages - if not fail - * - fault in any non-resident pages - * - if the user is doing a read force a write fault for any COWed pages - * - if the user is doing a read mark all pages as dirty - * - hold all pages - */ -int -vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp, - int count, vm_prot_t prot) -{ - vm_offset_t end, va; - int faults, rv; - pmap_t pmap; - vm_page_t m, *pages; - - pmap = vm_map_pmap(map); - pages = mp; - addr &= ~PAGE_MASK; - /* - * Check that virtual address range is legal - * This check is somewhat bogus as on some architectures kernel - * and user do not share VA - however, it appears that all FreeBSD - * architectures define it - */ - end = addr + (count * PAGE_SIZE); - if (end > VM_MAXUSER_ADDRESS) { - log(LOG_WARNING, "bad address passed to vm_fault_hold_user_pages"); - return (EFAULT); - } - - /* - * First optimistically assume that all pages are resident - * (and R/W if for write) if so just mark pages as held (and - * dirty if for write) and return - */ - vm_page_lock_queues(); - for (pages = mp, faults = 0, va = addr; va < end; - va += PAGE_SIZE, pages++) { - /* - * page queue mutex is recursable so this is OK - * it would be really nice if we had an unlocked - * version of this so we were only acquiring the - * pmap lock 1 time as opposed to potentially - * many dozens of times - */ - *pages = m = pmap_extract_and_hold(pmap, va, prot); - if (m == NULL) { - faults++; - continue; - } - /* - * Preemptively mark dirty - the pages - * will never have the modified bit set if - * they are only changed via DMA - */ - if (prot & VM_PROT_WRITE) - vm_page_dirty(m); - - } - vm_page_unlock_queues(); - - if (faults == 0) - return (0); - - /* - * Pages either have insufficient permissions or are not present - * trigger a fault where neccessary - * - */ - rv = 0; - for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) { - /* - * Account for a very narrow race where the page may be - * taken away from us before it is held - */ - while (*pages == NULL) { - rv = vm_fault(map, va, prot, - (prot & VM_PROT_WRITE) ? 
VM_FAULT_DIRTY : VM_FAULT_NORMAL); - if (rv) - goto error; - *pages = pmap_extract_and_hold(pmap, va, prot); - } - } - return (0); -error: - log(LOG_WARNING, - "vm_fault bad return rv=%d va=0x%zx\n", rv, va); - vm_page_lock_queues(); - for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) - if (*pages) { - vm_page_unhold(*pages); - *pages = NULL; - } - vm_page_unlock_queues(); - return (EFAULT); -} - -void -vm_fault_unhold_pages(vm_page_t *mp, int count) -{ - - KASSERT(count >= 0, ("negative count %d", count)); - vm_page_lock_queues(); - while (count--) { - vm_page_unhold(*mp); - mp++; - } - vm_page_unlock_queues(); -} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h deleted file mode 100644 index 7532e20..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_vm.h +++ /dev/null @@ -1,39 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -$FreeBSD$ - -***************************************************************************/ -#ifndef CXGB_VM_H_ -#define CXGB_VM_H_ - -int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, - vm_page_t *mp, int count, vm_prot_t prot); -void vm_fault_unhold_pages(vm_page_t *mp, int count); - -#endif diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index e967104..153b7da 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -820,7 +820,7 @@ trap_pfault(frame, usermode, eva) PROC_UNLOCK(p); /* Fault in the user page: */ - rv = vm_fault(map, va, ftype, + rv = vm_fault(map, eva, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); @@ -832,7 +832,7 @@ trap_pfault(frame, usermode, eva) * Don't have to worry about process locking or stacks in the * kernel. */ - rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); + rv = vm_fault(map, eva, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c new file mode 100644 index 0000000..7a45c13 --- /dev/null +++ b/sys/kern/kern_rangelock.c @@ -0,0 +1,151 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +void +rangelock_init(struct rangelock *lock) +{ + + TAILQ_INIT(&lock->rl_waiters); + lock->rl_currdep = NULL; +} + +void +rangelock_destroy(struct rangelock *lock) +{ + + KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +} + +static int +rangelock_incompatible(const struct rl_q_entry *e1, const struct rl_q_entry *e2) +{ + + if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ && + (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ) + return (0); +#define IN_RANGE(a, e) (a <= e->rl_q_start && a < e->rl_q_end) + if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) || + IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1)) + return (1); +#undef IN_RANGE + return (0); +} + +static void +rangelock_calc_block(struct rangelock *lock) +{ + struct rl_q_entry *entry, *entry1, *whead; + + if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) && + lock->rl_currdep != NULL) + lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link); + for (entry = lock->rl_currdep; entry; + entry = TAILQ_NEXT(entry, rl_q_link)) { + TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) { + if (rangelock_incompatible(entry, entry1)) + goto out; + if (entry1 == entry) + break; + } + } +out: + lock->rl_currdep = entry; + TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) { + if (whead == lock->rl_currdep) + break; + if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) { + whead->rl_q_flags |= RL_LOCK_GRANTED; + wakeup(whead); + } + } +} + +static void +rangelock_unlock_vp_locked(struct vnode *vp, struct rl_q_entry *entry) +{ + + ASSERT_VI_LOCKED(vp, "rangelock"); + KASSERT(entry != vp->v_rl.rl_currdep, ("stuck currdep")); + TAILQ_REMOVE(&vp->v_rl.rl_waiters, entry, rl_q_link); + rangelock_calc_block(&vp->v_rl); + VI_UNLOCK(vp); +} + +void +rangelock_unlock(struct vnode *vp, void *cookie) +{ + struct rl_q_entry *entry; + + entry = cookie; + VI_LOCK(vp); + rangelock_unlock_vp_locked(vp, entry); +} + +void * +rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, size_t len) +{ + struct rl_q_entry *entry; + + entry = cookie; + VI_LOCK(vp); + KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX")); + KASSERT(entry->rl_q_start == base, ("XXX")); + KASSERT(entry->rl_q_end >= base + len, ("XXX")); + if (entry->rl_q_end == base + len) { + rangelock_unlock_vp_locked(vp, cookie); + return (NULL); + } + entry->rl_q_end = base + len; + rangelock_calc_block(&vp->v_rl); + VI_UNLOCK(vp); + return (cookie); +} + +static void * +rangelock_enqueue(struct vnode *vp, struct rl_q_entry *entry) +{ + + VI_LOCK(vp); + TAILQ_INSERT_TAIL(&vp->v_rl.rl_waiters, entry, rl_q_link); + if (vp->v_rl.rl_currdep == NULL) + vp->v_rl.rl_currdep = entry; + rangelock_calc_block(&vp->v_rl); + while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) + msleep(entry, &vp->v_interlock, 0, "range", 0); + VI_UNLOCK(vp); + return (entry); +} + +void * +rangelock_rlock(struct vnode *vp, struct rl_q_entry *entry, off_t base, + size_t len) +{ + + entry->rl_q_flags = RL_LOCK_READ; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(vp, entry)); +} + +void * +rangelock_wlock(struct vnode *vp, struct rl_q_entry *entry, off_t base, + size_t len) +{ + + entry->rl_q_flags = RL_LOCK_WRITE; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(vp, entry)); +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index ce1afd2..e2cd1a9 100644 --- a/sys/kern/kern_subr.c +++ 
b/sys/kern/kern_subr.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef ZERO_COPY_SOCKETS #include #include @@ -138,7 +139,8 @@ uiomove(void *cp, int n, struct uio *uio) int error = 0; int save = 0; - KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE || + uio->uio_rw == UIO_NOCOPY, ("uiomove: mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, ("uiomove proc")); @@ -164,10 +166,25 @@ uiomove(void *cp, int n, struct uio *uio) case UIO_USERSPACE: if (ticks - PCPU_GET(switchticks) >= hogticks) uio_yield(); + if (td->td_pflags & TDP_VMUIODEADLK) { + td->td_iov_base = (uintptr_t)iov->iov_base; + td->td_iov_len = iov->iov_len; + } if (uio->uio_rw == UIO_READ) error = copyout(cp, iov->iov_base, cnt); else error = copyin(iov->iov_base, cp, cnt); + if (error == EFAULT && td->td_faultaddr != 0 && + (td->td_pflags & TDP_VMUIODEADLK)) { + KASSERT(td->td_faultaddr >= (uintptr_t)iov->iov_base && + td->td_faultaddr < (uintptr_t)iov->iov_base + cnt, + ("faultaddr %jx outside region %p %d\n", + (uintmax_t)td->td_faultaddr, + iov->iov_base, iov->iov_len)); + error = ERESTART; + fwduio(uio, td->td_faultaddr - (uintptr_t) + iov->iov_base); + } if (error) goto out; break; @@ -181,10 +198,7 @@ uiomove(void *cp, int n, struct uio *uio) case UIO_NOCOPY: break; } - iov->iov_base = (char *)iov->iov_base + cnt; - iov->iov_len -= cnt; - uio->uio_resid -= cnt; - uio->uio_offset += cnt; + fwduio(uio, cnt); cp = (char *)cp + cnt; n -= cnt; } @@ -544,6 +558,7 @@ copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop) uio->uio_segflg = UIO_USERSPACE; uio->uio_offset = -1; uio->uio_resid = 0; + uio->uio_flags = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - uio->uio_resid) { free(uio, M_IOV); @@ -569,3 +584,25 @@ cloneuio(struct uio *uiop) bcopy(uiop->uio_iov, uio->uio_iov, iovlen); return (uio); } + +void +fwduio(struct uio *uio, int cnt) +{ + + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + cnt; + uio->uio_iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; +} + +void +copyuio(struct uio *dst, struct uio *src) +{ + struct iovec *dst_iovec; + + dst_iovec = dst->uio_iov; + *dst = *src; + dst->uio_iov = dst_iovec; + bcopy(src->uio_iov, dst->uio_iov, src->uio_iovcnt * + sizeof(struct iovec)); +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 8c26b13..e3867d6 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -871,6 +871,7 @@ vdestroy(struct vnode *vp) /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. 
*/ vp->v_op = NULL; #endif + rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); @@ -1025,6 +1026,7 @@ alloc: if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } + rangelock_init(&vp->v_rl); *vpp = vp; return (0); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3cc6f22..9e94b58 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -60,8 +60,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include +#include + #include static fo_rdwr_t vn_read; @@ -363,37 +367,68 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, int *aresid; struct thread *td; { - struct uio auio; - struct iovec aiov; + struct uio auio, auio_clone; + struct iovec aiov, aiov_clone; struct mount *mp; struct ucred *cred; - int error; + vm_page_t *m_hold; + struct rl_q_entry rl_entry; + void *rl_cookie; + int wired_pages, error; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) + rl_cookie = rangelock_rlock(vp, &rl_entry, + offset, len); + else + rl_cookie = rangelock_wlock(vp, &rl_entry, + offset, len); + } else + rl_cookie = NULL; + + m_hold = NULL; + if (segflg == UIO_USERSPACE) { + m_hold = malloc(sizeof(vm_page_t) * (btoc(len) + 1), M_IOV, + M_WAITOK); + aiov_clone = aiov; + auio_clone = auio; + auio_clone.uio_iov = &aiov_clone; + error = vm_wireuio(&auio, m_hold, + round_page((vm_offset_t)base + len) - + trunc_page((vm_offset_t)base), + &wired_pages); + if (error) { + free(m_hold, M_IOV); + goto out; + } + } + if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out_unwire; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else vn_lock(vp, LK_SHARED | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) @@ -424,6 +459,14 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, vn_finished_write(mp); VOP_UNLOCK(vp, 0); } +out_unwire: + if (segflg == UIO_USERSPACE) { + vm_unwireuio(&auio_clone, m_hold, wired_pages); + free(m_hold, M_IOV); + } +out: + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); return (error); } @@ -485,68 +528,215 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, return (error); } +static int uio_hold_pages = 12; +SYSCTL_INT(_vfs, OID_AUTO, uio_hold_pages, CTLFLAG_RW, &uio_hold_pages, 0, + "The max amount of held pages for one i/o chunk"); +static int uio_short = 128; +SYSCTL_INT(_vfs, OID_AUTO, uio_short, CTLFLAG_RW, &uio_short, 0, + "The length of the short i/o"); + +typedef int (*vn_chunk_func_t)(struct file *, struct uio *, struct ucred *, + int, int, struct thread *); + +static int +do_vn_rw_chunked(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, int ioflag, struct thread *td, vm_page_t *m_hold, + vn_chunk_func_t vn_chunk_func) +{ + struct uio *uio_clone; + int error, 
wire_bytes, io_chunk, total_cnt, cnt; + int first_chunk, wired_pages; + + if (uio->uio_segflg != UIO_USERSPACE || fp->f_vnode->v_type != VREG) + return (vn_chunk_func(fp, uio, active_cred, flags, ioflag, td)); + + uio_clone = cloneuio(uio); + KASSERT(!(td->td_pflags & TDP_VMUIODEADLK), + ("Nested TDP_VMUIODEADLK")); + td->td_pflags |= TDP_VMUIODEADLK; + td->td_faultaddr = 0; + error = vn_chunk_func(fp, uio, active_cred, flags, ioflag, td); + td->td_pflags &= ~TDP_VMUIODEADLK; + if (error != ERESTART || td->td_faultaddr == 0) + goto out; + + first_chunk = 1; + if (uio->uio_flags & UIO_ROLLBACK) { + cnt = uio_clone->uio_resid - uio->uio_resid; + copyuio(uio, uio_clone); + if (cnt > 0) { + uio->uio_rw = UIO_NOCOPY; + uiomove(NULL, cnt, uio); + uio->uio_rw = uio_clone->uio_rw; + first_chunk = 0; + } + } + while (uio->uio_resid > 0) { + io_chunk = min(uio_hold_pages * PAGE_SIZE, uio->uio_resid); /* XXXKIB */ + wire_bytes = round_page(io_chunk); + error = vm_wireuio(uio, m_hold, wire_bytes, &wired_pages); + if (error != 0) { + if (!first_chunk) + error = 0; + break; + } + copyuio(uio_clone, uio); + total_cnt = uio->uio_resid; + uio->uio_resid = io_chunk; + error = vn_chunk_func(fp, uio, active_cred, flags, ioflag, td); + vm_unwireuio(uio_clone, m_hold, wired_pages); + cnt = io_chunk - uio->uio_resid; + uio->uio_resid = total_cnt - cnt; + if (error != 0) { + if (!first_chunk) + error = 0; + break; + } + if (cnt == 0) + break; + first_chunk = 0; + } + out: + free(uio_clone, M_IOV); + return (error); +} + +static struct mtx * +vn_lock_foffset(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + mtx_unlock(mtxp); + return (mtxp); +} + +static void +vn_unlock_foffset(struct file *fp, struct mtx *mtxp) +{ + + mtx_lock(mtxp); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + mtx_unlock(mtxp); +} + +static inline int +vn_read_wired_chunk(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, int ioflag, struct thread *td) +{ + struct vnode *vp; + int error, vfslocked; + + vp = fp->f_vnode; + + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_SHARED | LK_RETRY); + + ioflag |= sequential_heuristic(uio, fp); + +#ifdef MAC + error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_READ(vp, uio, ioflag, fp->f_cred); + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + return (error); +} + /* * File table vnode read routine. 
*/ static int -vn_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - int error, ioflag; + vm_page_t m_hold[uio_hold_pages]; struct mtx *mtxp; - int vfslocked; + struct rl_q_entry rl_entry; + void *rl_cookie; + int ioflag; + int error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); - mtxp = NULL; - vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vp = fp->f_vnode; + /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ if ((flags & FOF_OFFSET) == 0) { - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); - } - fp->f_vnread_flags |= FOFFSET_LOCKED; - mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; } else - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = NULL; /* gcc */ + if (vp->v_type == VREG) + rl_cookie = rangelock_rlock(vp, &rl_entry, uio->uio_offset, + uio->uio_resid); + else + rl_cookie = NULL; + error = do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td, + m_hold, vn_read_wired_chunk); + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); + if ((flags & FOF_OFFSET) == 0) { + fp->f_offset = uio->uio_offset; + vn_unlock_foffset(fp, mtxp); + } + return (error); +} - ioflag |= sequential_heuristic(uio, fp); +static inline int +vn_write_wired_chunk(struct file *fp, struct uio *uio, + struct ucred *active_cred, int flags, int ioflag, struct thread *td) +{ + struct mount *mp; + struct vnode *vp; + int error, vfslocked; + mp = NULL; + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if (vp->v_type == VREG) + bwillwrite(); + if (vp->v_type != VCHR) { + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + goto unlock; + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + ioflag |= sequential_heuristic(uio, fp); #ifdef MAC - error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif - error = VOP_READ(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) { + error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); + if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; - mtx_lock(mtxp); - if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) - wakeup(&fp->f_vnread_flags); - fp->f_vnread_flags = 0; - mtx_unlock(mtxp); - } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); + if (vp->v_type != VCHR) + vn_finished_write(mp); +unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -555,24 +745,18 @@ vn_read(fp, uio, active_cred, flags, td) * File table vnode write routine. 
*/ static int -vn_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - struct mount *mp; - int error, ioflag; - int vfslocked; + vm_page_t m_hold[uio_hold_pages]; + struct rl_q_entry rl_entry; + void *rl_cookie; + int ioflag, error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - if (vp->v_type == VREG) - bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; @@ -583,27 +767,23 @@ vn_write(fp, uio, active_cred, flags, td) if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; - mp = NULL; - if (vp->v_type != VCHR && - (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto unlock; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if ((flags & FOF_OFFSET) == 0) - uio->uio_offset = fp->f_offset; - ioflag |= sequential_heuristic(uio, fp); -#ifdef MAC - error = mac_vnode_check_write(active_cred, fp->f_cred, vp); - if (error == 0) -#endif - error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) - fp->f_offset = uio->uio_offset; - fp->f_nextoff = uio->uio_offset; - VOP_UNLOCK(vp, 0); - if (vp->v_type != VCHR) - vn_finished_write(mp); -unlock: - VFS_UNLOCK_GIANT(vfslocked); + if (vp->v_type == VREG) { + if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET)) + /* + * For appenders, punt and lock the whole + * range. It also protects f_offset. + */ + rl_cookie = rangelock_wlock(vp, &rl_entry, + 0, (size_t)-1); + else + rl_cookie = rangelock_wlock(vp, &rl_entry, + uio->uio_offset, uio->uio_resid); + } else + rl_cookie = NULL; + error = do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td, + m_hold, vn_write_wired_chunk); + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); return (error); } diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile index 2b8750a..d2566fd 100644 --- a/sys/modules/cxgb/tom/Makefile +++ b/sys/modules/cxgb/tom/Makefile @@ -5,7 +5,7 @@ CXGB = ${.CURDIR}/../../../dev/cxgb KMOD= tom SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c -SRCS+= cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c +SRCS+= cxgb_ddp.c cxgb_l2t.c cxgb_tcp_offload.c SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h SRCS+= opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h SRCS+= device_if.h bus_if.h pci_if.h diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 96f811d..9ce96da 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -236,6 +236,9 @@ struct thread { struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ struct osd td_osd; /* (k) Object specific data. */ + vm_offset_t td_faultaddr; /* (k) fault address for TDP_VMUIODEADLK */ + vm_offset_t td_iov_base; /* (k) the region where VMUIODEADLK ... */ + size_t td_iov_len; /* (k) ... is handled */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). */ @@ -353,7 +356,7 @@ do { \ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ -#define TDP_UNUSED80 0x00000080 /* available. 
*/ +#define TDP_VMUIODEADLK 0x00000080 /* Non-blocking vm_fault required. */ #define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..4a5ac1e --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include +#include +#include +#include +#include + +#ifdef _KERNEL + +struct vnode; + +struct rl_q_entry +{ + TAILQ_ENTRY(rl_q_entry) rl_q_link; + size_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct rangelock +{ + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct vnode *vp, void *cookie); +void *rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, + size_t len); +void *rangelock_rlock(struct vnode *vp, struct rl_q_entry *entry, + off_t base, size_t len); +void *rangelock_wlock(struct vnode *vp, struct rl_q_entry *entry, + off_t base, size_t len); +#endif + +#endif diff --git a/sys/sys/uio.h b/sys/sys/uio.h index 871f93a..6a7cce7 100644 --- a/sys/sys/uio.h +++ b/sys/sys/uio.h @@ -68,8 +68,11 @@ struct uio { enum uio_seg uio_segflg; /* address space */ enum uio_rw uio_rw; /* operation */ struct thread *uio_td; /* owner */ + int uio_flags; }; +#define UIO_ROLLBACK 0x0001 + /* * Limits * @@ -100,6 +103,8 @@ int uiomove_frombuf(void *buf, int buflen, struct uio *uio); int uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n, struct uio *uio); int uiomoveco(void *cp, int n, struct uio *uio, int disposable); +void fwduio(struct uio *uip, int cnt); +void copyuio(struct uio *dst, struct uio *src); #else /* !_KERNEL */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 0a3d1dc..af760a5 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 1abb994..34d75e5 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -632,8 +632,8 @@ ffs_write(ap) struct buf *bp; struct thread *td; ufs_lbn_t lbn; - off_t osize; - int seqcount; + off_t osize, s_size; + int seqcount, s_resid; int blkoffset, error, flags, ioflag, resid, size, xfersize; vp = ap->a_vp; @@ -707,6 +707,7 @@ ffs_write(ap) lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; + s_size = ip->i_size; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) @@ -734,8 +735,10 @@ ffs_write(ap) * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. 
*/ - if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) + if ((bp->b_flags & B_CACHE) == 0 /*&& fs->fs_bsize <= xfersize*/) { vfs_bio_clrbuf(bp); + flags |= BA_CLRBUF; + } if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) @@ -750,6 +753,7 @@ ffs_write(ap) if (size < xfersize) xfersize = size; + s_resid = uio->uio_resid; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && @@ -757,6 +761,41 @@ ffs_write(ap) bp->b_flags |= B_RELBUF; } + if (error == ERESTART && !(flags & BA_CLRBUF)) { + /* + * When uiomove() failed due to vm_fault + * cowardly refused to process a dangerous + * page-in, and the previous content of the + * buffer is garbage, e.g. because supposed + * transfer length was big enough to cover the + * whole buffer, discard it. + */ + if (LIST_EMPTY(&bp->b_dep)) { + bp->b_flags |= B_RELBUF | B_NOCACHE | B_INVAL; + brelse(bp); + } else { + /* + * But cannot discard the buffer with + * dependencies. Since the buffer is + * newly allocated, fill it with + * zeros. If the buffer extended the + * file, truncate. + */ + vfs_bio_clrbuf(bp); + if (ioflag & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + ffs_truncate(vp, s_size, + IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred, + uio->uio_td); + } + uio->uio_offset -= s_resid - uio->uio_resid; + uio->uio_resid = s_resid; + uio->uio_flags |= UIO_ROLLBACK; + break; + } + /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer @@ -808,6 +847,8 @@ ffs_write(ap) ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; + if (error == ERESTART) + uio->uio_flags |= UIO_ROLLBACK; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index 475a20e..f2b4d16 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -34,11 +34,13 @@ #define _VM_EXTERN_H_ struct buf; +struct iovec; struct proc; struct vmspace; struct vmtotal; struct mount; struct vnode; +struct uio; #ifdef _KERNEL @@ -56,6 +58,9 @@ void swapout_procs(int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t); +int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, + vm_page_t *mp, int count, vm_prot_t prot); +void vm_fault_unhold_pages(vm_page_t *mp, int count); void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); @@ -84,5 +89,9 @@ int vm_thread_new(struct thread *td, int pages); int vm_thread_new_altkstack(struct thread *td, int pages); void vm_thread_swapin(struct thread *td); void vm_thread_swapout(struct thread *td); +int vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes, + int *wired_pages); +void vm_unwireuio(struct uio *, struct vm_page *m_hold[], int wired_pages); + #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 3a21616..d354c92 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1,4 +1,30 @@ /*- + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson @@ -206,9 +232,9 @@ unlock_and_deallocate(struct faultstate *fs) * The map in question must be referenced, and remains so. * Caller may hold no locks. */ -int -vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, - int fault_flags) +static int +vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags, struct vm_page **m_hold) { vm_prot_t prot; int is_first_object_locked, result; @@ -220,8 +246,20 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int faultcount, ahead, behind; struct faultstate fs; struct vnode *vp; + struct thread *td; int locked, error; + td = curthread; + if (td->td_pflags & TDP_VMUIODEADLK) { + KASSERT(td->td_iov_base <= vaddr && + vaddr < td->td_iov_base + td->td_iov_len, + ("uiomove EFAULT %jx %jx %d\n", (uintmax_t)vaddr, + (uintmax_t)td->td_iov_base, td->td_iov_len)); + td->td_faultaddr = vaddr; + return (KERN_VMUIODEADLOCK); + } + vaddr = trunc_page(vaddr); + hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); @@ -949,6 +987,10 @@ vnode_locked: } else { vm_page_activate(fs.m); } + if (m_hold != NULL) { + *m_hold = fs.m; + vm_page_hold(fs.m); + } vm_page_unlock_queues(); vm_page_wakeup(fs.m); @@ -964,6 +1006,14 @@ vnode_locked: return (KERN_SUCCESS); } +int +vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags) +{ + + return (vm_fault_hold(map, vaddr, fault_type, fault_flags, NULL)); +} + /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. 
It is a "cousin" @@ -1360,3 +1410,108 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) /* return number of pages */ return i; } + +/* + * This routine takes a user's map, array of pages, number of pages, and flags + * and then does the following: + * - validate that the user has access to those pages (flags indicates read + * or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + */ +int +vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp, + int count, vm_prot_t prot) +{ + vm_offset_t end, va; + int faults, rv; + pmap_t pmap; + vm_page_t m, *pages; + + pmap = vm_map_pmap(map); + pages = mp; + addr &= ~PAGE_MASK; + + /* + * Check that virtual address range is legal. + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + end = addr + (count * PAGE_SIZE); + if (end > VM_MAXUSER_ADDRESS) + return (EFAULT); + + /* + * First optimistically assume that all pages are resident + * (and R/W if for write) if so just mark pages as held (and + * dirty if for write) and return. + */ + vm_page_lock_queues(); + for (pages = mp, faults = 0, va = addr; va < end; + va += PAGE_SIZE, pages++) { + /* + * Page queue mutex is recursable so this is OK. + * It would be really nice if we had an unlocked + * version of this so we were only acquiring the + * pmap lock 1 time as opposed to potentially + * many dozens of times. + */ + *pages = m = pmap_extract_and_hold(pmap, va, prot); + if (m == NULL) { + faults++; + continue; + } + + /* + * Preemptively mark dirty - the pages will never have + * the modified bit set if they are only changed via + * DMA. + */ + if (prot & VM_PROT_WRITE) + vm_page_dirty(m); + } + vm_page_unlock_queues(); + + if (faults == 0) + return (0); + + /* + * Pages either have insufficient permissions or are not present + * trigger a fault where neccessary. + */ + rv = 0; + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) { + rv = vm_fault_hold(map, va, prot, (prot & VM_PROT_WRITE) ? 
+ VM_FAULT_DIRTY : VM_FAULT_NORMAL, pages); + if (rv) + goto error; + } + return (0); + +error: + vm_page_lock_queues(); + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) + if (*pages) { + vm_page_unhold(*pages); + *pages = NULL; + } + vm_page_unlock_queues(); + return (EFAULT); +} + +void +vm_fault_unhold_pages(vm_page_t *mp, int count) +{ + + KASSERT(count >= 0, ("negative count %d", count)); + vm_page_lock_queues(); + while (count--) { + vm_page_unhold(*mp); + mp++; + } + vm_page_unlock_queues(); +} diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 2c5821c..3f19a49 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -1374,6 +1374,7 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; + new_entry->pin_count = 0; /* * Insert the new entry into the list @@ -1596,7 +1597,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && - (prev->wired_count == entry->wired_count)) { + (prev->wired_count == entry->wired_count) && + (prev->pin_count == entry->pin_count)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; @@ -1622,7 +1624,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && - (next->wired_count == entry->wired_count)) { + (next->wired_count == entry->wired_count) && + (next->pin_count == entry->pin_count)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); @@ -2796,7 +2799,8 @@ reclip_start: */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && - vm_map_entry_system_wired_count(entry) != 0)) { + vm_map_entry_system_wired_count(entry) != 0) || + entry->pin_count != 0) { entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; @@ -2816,6 +2820,7 @@ reclip_start: holder_entry->max_protection = VM_PROT_NONE; holder_entry->inheritance = VM_INHERIT_NONE; holder_entry->wired_count = 0; + holder_entry->pin_count = 0; vm_map_entry_link(map, entry->prev, holder_entry); } (void) vm_map_unlock_and_wait(map, 0); @@ -3154,6 +3159,7 @@ vmspace_fork(struct vmspace *vm1) new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wired_count = 0; + new_entry->pin_count = 0; /* * Insert the entry into the new map -- we know we're @@ -3181,6 +3187,7 @@ vmspace_fork(struct vmspace *vm1) new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wired_count = 0; + new_entry->pin_count = 0; new_entry->object.vm_object = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); @@ -3865,6 +3872,167 @@ vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) vm_map_unlock_read(map); } +static boolean_t +vm_map_unpin_entries(vm_map_t map, struct uio *uiop, int pinned_entries) +{ + vm_offset_t start; + struct iovec *iov; + vm_map_entry_t entry; + int i, acc, wire; + boolean_t need_wakeup; + + iov = uiop->uio_iov; + need_wakeup = FALSE; + + for (i = 0, acc = 0; acc < pinned_entries; iov++, i++) { + KASSERT(i < uiop->uio_iovcnt, ("wireio: iovcnt overflow %d %d %d", + i, uiop->uio_iovcnt, pinned_entries)); + wire = round_page(iov->iov_len); + if (acc + wire > pinned_entries) + wire = pinned_entries - acc; 
+ start = trunc_page((vm_offset_t)iov->iov_base); + for (;;) { + if (!vm_map_lookup_entry(map, start, &entry)) { +#ifdef INVARIANTS + panic("vm_unwireuio: hole"); +#endif + } + KASSERT(entry->pin_count > 0, ("pin_count %p", entry)); + if (--entry->pin_count == 0 && + (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP)) { + entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; + need_wakeup = TRUE; + } + if (entry->end >= start + wire) { + acc += wire; + break; + } else { + acc += entry->end - start; + wire -= entry->end - start; + start = entry->end; + } + } + } + return (need_wakeup); +} + +/* + * vm_wireuio + * + * Given userspace struct uio, we set up vm state such that after the + * successfull return there will be no page faults during uiomove with + * this uio until vm_unwireuio is called. At most wire_bytes bytes of + * the user address space are held. + * + * Function performs this by first pinning all map entries that will + * be referenced. This guarantees that our ranges of user address + * space cannot be remmapped during the operation. Then, all accessed + * pages are faulted in and held. + */ +int +vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes, + int *wired_pages) +{ + vm_map_t map; + vm_offset_t start, start1; + struct iovec *iov; + vm_map_entry_t entry; + struct vm_page **m_hold1; + int i, acc, wire, wire_pages, pinned_entries, rv, prot; + int error; + boolean_t need_wakeup; + + KASSERT(round_page(wire_bytes) == wire_bytes, + ("wireuio: wire_bytes is not page-size aligned")); + KASSERT(uiop->uio_segflg == UIO_USERSPACE, + ("wireuio: !UIO_USERSPACE")); + + error = 0; + prot = uiop->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; + m_hold1 = m_hold; + pinned_entries = 0; + *wired_pages = 0; + map = &uiop->uio_td->td_proc->p_vmspace->vm_map; + iov = uiop->uio_iov; + + /* + * Do the pass over iov. + */ + for (i = 0, acc = 0; acc < wire_bytes; iov++, i++) { + wire = round_page(iov->iov_len); + if (acc + wire > wire_bytes) + wire = wire_bytes - acc; + acc += wire; + wire_pages = btoc(wire); + start1 = start = trunc_page((vm_offset_t)iov->iov_base); + if (start < vm_map_min(map) || start + wire > vm_map_max(map) || + start > start + wire) { + error = EINVAL; + goto fault; + } + vm_map_lock(map); + + /* + * Pin each entry referenced by addresses in iov. + */ + for (;;) { + if (!vm_map_lookup_entry(map, start1, &entry) || + (entry->eflags & MAP_ENTRY_IS_HOLDER)) + goto fault_map_locked; + entry->pin_count++; + if (entry->end >= start1 + wire) { + pinned_entries += wire; + break; + } else { + pinned_entries += entry->end - start1; + wire -= entry->end - start1; + start1 = entry->end; + } + } + vm_map_unlock(map); + + /* + * If entries are successfully pinned, the + * corresponding pages are faulted in and held. 
+ */ + rv = vm_fault_hold_user_pages(map, start, m_hold1, wire_pages, + prot); + if (rv != KERN_SUCCESS) { + error = EFAULT; + goto fault; + } + *wired_pages += wire_pages; + m_hold1 += wire_pages; + } + return (0); + fault: + vm_map_lock(map); + fault_map_locked: + need_wakeup = vm_map_unpin_entries(map, uiop, pinned_entries); + vm_map_unlock(map); + vm_fault_unhold_pages(m_hold, *wired_pages); + if (need_wakeup) + vm_map_wakeup(map); + return (error); +} + +void +vm_unwireuio(struct uio *uiop, struct vm_page *m_hold[], int wired_pages) +{ + vm_map_t map; + boolean_t need_wakeup; + + map = &uiop->uio_td->td_proc->p_vmspace->vm_map; + + vm_fault_unhold_pages(m_hold, wired_pages); + + vm_map_lock(map); + need_wakeup = vm_map_unpin_entries(map, uiop, ctob(wired_pages)); + vm_map_unlock(map); + if (need_wakeup) + vm_map_wakeup(map); +} + #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index f2c4fd3..9310718 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -114,6 +114,7 @@ struct vm_map_entry { vm_inherit_t inheritance; /* inheritance */ int wired_count; /* can be paged if = 0 */ vm_pindex_t lastr; /* last read */ + unsigned pin_count; /* non-exclusive pin count */ }; #define MAP_ENTRY_NOSYNC 0x0001 @@ -383,5 +384,6 @@ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vmspace_swap_count (struct vmspace *vmspace); + #endif /* _KERNEL */ #endif /* _VM_MAP_ */ diff --git a/sys/vm/vm_param.h b/sys/vm/vm_param.h index 2ff2603..d866925 100644 --- a/sys/vm/vm_param.h +++ b/sys/vm/vm_param.h @@ -125,6 +125,7 @@ struct xswdev { #define KERN_RESOURCE_SHORTAGE 6 #define KERN_NOT_RECEIVER 7 #define KERN_NO_ACCESS 8 +#define KERN_VMUIODEADLOCK 9 #ifndef ASSEMBLER #ifdef _KERNEL diff --git a/tools/regression/file/uio/uio.c b/tools/regression/file/uio/uio.c new file mode 100644 index 0000000..d857605 --- /dev/null +++ b/tools/regression/file/uio/uio.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int chunk_cnt = 1024; +int chunk_size = 1024; + +int +main(int argc, char *argv[]) +{ + struct iovec *wiov, *riov; + char **wdata, **rdata; + int fd, i; + ssize_t io_error; + + if (argc < 2) { + fprintf(stderr, "Usage: uio file [chunk count [chunk size]]\n"); + return (2); + } + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) { + fprintf(stderr, "Failed to create %s: %s\n", + argv[1], strerror(errno)); + return (1); + } + + if (argc > 2) + chunk_cnt = atoi(argv[2]); + if (argc > 3) + chunk_size = atoi(argv[3]); + + wiov = calloc(chunk_cnt, sizeof(*wiov)); + wdata = calloc(chunk_cnt, sizeof(*wdata)); + + riov = calloc(chunk_cnt, sizeof(*riov)); + rdata = calloc(chunk_cnt, sizeof(*rdata)); + + for (i = 0; i < chunk_cnt; i++) { + rdata[i] = malloc(chunk_size); + riov[i].iov_base = rdata[i]; + riov[i].iov_len = chunk_size; + + wdata[i] = malloc(chunk_size); + memset(wdata[i], i, chunk_size); + wiov[i].iov_base = wdata[i]; + wiov[i].iov_len = chunk_size; + } + + io_error = writev(fd, wiov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "write failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated write: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + if (lseek(fd, 0, SEEK_SET) == -1) { + fprintf(stderr, "lseek failed: %s\n", strerror(errno)); + return (1); + } + + io_error = readv(fd, riov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated read: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + for (i = 0; i < chunk_cnt; i++) { + if (memcmp(rdata[i], wdata[i], chunk_size) != 0) { + fprintf(stderr, "chunk %d differs\n", i); + return (1); + } + } + + return (0); +} diff --git a/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c new file mode 100644 index 0000000..9376648 --- /dev/null +++ b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const int blks = 2; + +static void +flush_buffers(int fd) +{ + struct stat st; + char *addr; + int error; + + printf("Flushing buffers\n"); + error = fstat(fd, &st); + if (error == -1) + err(2, "stat"); + fsync(fd); + addr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == (char *)-1) + err(2, "mmap"); + error = msync(addr, st.st_size, MS_SYNC | MS_INVALIDATE); + if (error == -1) + err(2, "msync"); + munmap(addr, st.st_size); +} + +int +main(int argc, char *argv[]) +{ + struct statfs fst; + char *data, *vrfy; + size_t sz; + int fd, i, error, ret; + + if (argc < 2) + errx(2, "Usage: ba_clrbuf file"); + + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) + err(2, "Failed to create %s", argv[1]); + + if (fstatfs(fd, &fst) == -1) + err(2, "stat"); + + sz = fst.f_iosize * blks; + data = malloc(sz); + if (data == NULL) + err(2, "malloc"); + vrfy = malloc(sz); + if (vrfy == NULL) + err(2, "malloc"); + for (i = 0; i < (int)sz; i++) + data[i] = i; + error = write(fd, data, sz); + if (error == -1) + err(2, "write"); + else if (error != (int)sz) + errx(2, "Short write %d %d", error, sz); + + flush_buffers(fd); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0"); + else if (error != 0) + errx(2, "lseek 0 returned %d", error); + error = write(fd, NULL, fst.f_iosize); + printf("faulty write, error %s\n", strerror(errno)); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0/2"); + else if (error != 0) + errx(2, "lseek 0/2 returned %d", error); + error = read(fd, vrfy, sz); + if (error == -1) + err(2, "read"); + else if (error != (int)sz) + errx(2, "short read %d %d", error, sz); + + if (memcmp(data, vrfy, fst.f_iosize) != 0) { + printf("Zero block corrupted, byte at 0 is %x\n", + (unsigned char)vrfy[0]); + ret = 1; + } else { + printf("No corruption\n"); + ret = 0; + } + + return (ret); +}
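
---

Reviewer note (not part of the patch): the diff never states the scenario it defends against, so here is a hedged userspace sketch of the classic shape of the problem -- write(2) on a file using a buffer that is an mmap(2) of the same file range, so uiomove() must fault the buffer in while the write path already holds the vnode locked. The file name and sizes below are arbitrary; whether an unpatched kernel actually wedges depends on filesystem and page-residency details, so treat this as an illustration rather than a guaranteed reproducer.

/*
 * Hedged sketch: write a file's own not-yet-resident mapping back into
 * the same file, forcing copyin() inside uiomove() to take hard faults
 * on pages backed by the vnode being written.
 */
#include <sys/types.h>
#include <sys/mman.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "uio_deadlock.scratch";	/* scratch file, assumption */
	size_t sz = 1024 * 1024;
	char *map;
	int fd;

	fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd == -1)
		err(1, "open");
	if (ftruncate(fd, (off_t)sz) == -1)
		err(1, "ftruncate");

	/*
	 * Map the file but do not touch the pages, so the copyin() done
	 * for the write below faults on a mapping of the same vnode.
	 */
	map = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err(1, "mmap");

	/* File offset is 0, so this writes the mapped range onto itself. */
	if (write(fd, map, sz) != (ssize_t)sz)
		err(1, "write");
	printf("write of the file's own mapping completed\n");

	munmap(map, sz);
	close(fd);
	unlink(path);
	return (0);
}

Rather than letting that fault happen under the vnode lock, the patch wires the user pages up front (vm_wireuio()), or, when a fault would still be needed, fails the copy with ERESTART under TDP_VMUIODEADLK and retries the transfer in bounded, pre-wired chunks via do_vn_rw_chunked().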
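Reviewer note (not part of the patch): kern_rangelock.c grants a queued request only when it does not conflict with any earlier entry on rl_waiters -- two readers are always compatible, anything else conflicts once the byte ranges overlap. Below is a minimal userspace model of that compatibility rule; the names are invented for the sketch, and it uses the standard half-open overlap test, which is the intent behind the patch's IN_RANGE() check.

/*
 * Hedged sketch: the read/write range compatibility rule of the new
 * range lock, modelled in plain userspace C.  The kernel keys the same
 * decision off rl_q_flags and rl_q_start/rl_q_end in struct rl_q_entry.
 */
#include <sys/types.h>

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

enum rl_type { RL_READ, RL_WRITE };

struct range_req {
	enum rl_type	type;
	off_t		start;	/* first byte covered */
	off_t		end;	/* one past the last byte covered */
};

/* Half-open intervals [start, end) overlap iff each starts before the other ends. */
static bool
ranges_overlap(const struct range_req *a, const struct range_req *b)
{

	return (a->start < b->end && b->start < a->end);
}

/* Two requests conflict unless both are reads or their ranges are disjoint. */
static bool
ranges_conflict(const struct range_req *a, const struct range_req *b)
{

	if (a->type == RL_READ && b->type == RL_READ)
		return (false);
	return (ranges_overlap(a, b));
}

int
main(void)
{
	struct range_req r1 = { RL_READ,  0,    4096  };
	struct range_req r2 = { RL_READ,  100,  200   };
	struct range_req w1 = { RL_WRITE, 0,    4096  };
	struct range_req w2 = { RL_WRITE, 8192, 12288 };

	assert(!ranges_conflict(&r1, &r2));	/* readers never block readers */
	assert(ranges_conflict(&r1, &w1));	/* overlapping writer blocks */
	assert(!ranges_conflict(&w1, &w2));	/* disjoint writers proceed */
	printf("range compatibility checks passed\n");
	return (0);
}

In the patch, rangelock_calc_block() walks rl_waiters in queue order and grants everything in front of rl_currdep, the first entry that conflicts with an earlier waiter; that keeps grants FIFO and prevents a stream of readers from starving a queued writer.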
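Reviewer note (not part of the patch): the uiomove()/ffs_write() changes lean on simple bookkeeping -- fwduio() advances iov_base, iov_len, uio_resid, and uio_offset together, and when a chunk must be abandoned ffs_write() rewinds uio_offset by the bytes consumed and restores uio_resid so the caller can retry. The sketch below models that arithmetic for a single-iovec transfer; the struct is a stripped-down stand-in, not the kernel's struct uio.

/*
 * Hedged sketch: the advance/rewind bookkeeping behind fwduio() and the
 * UIO_ROLLBACK retry path, for a single-iovec transfer.
 */
#include <sys/types.h>

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct mini_uio {
	char	*base;		/* iov_base of the single iovec */
	size_t	 len;		/* iov_len */
	size_t	 resid;		/* bytes still to transfer */
	off_t	 offset;	/* file offset of the next byte */
};

/* Mirror of fwduio(): consume cnt bytes from the front of the transfer. */
static void
fwd_uio(struct mini_uio *uio, size_t cnt)
{

	uio->base += cnt;
	uio->len -= cnt;
	uio->resid -= cnt;
	uio->offset += (off_t)cnt;
}

/* Give back whatever the failed chunk consumed since the snapshot. */
static void
rollback_uio(struct mini_uio *uio, size_t saved_resid)
{
	size_t done = saved_resid - uio->resid;

	uio->offset -= (off_t)done;
	uio->resid = saved_resid;
	uio->base -= done;
	uio->len += done;
}

int
main(void)
{
	char buf[8192];
	struct mini_uio uio = { buf, sizeof(buf), sizeof(buf), 1024 };
	size_t s_resid;

	fwd_uio(&uio, 4096);		/* first chunk completed */
	assert(uio.offset == 1024 + 4096 && uio.resid == 4096);

	s_resid = uio.resid;		/* snapshot before the risky chunk */
	fwd_uio(&uio, 1000);		/* chunk partially done, then faults */
	rollback_uio(&uio, s_resid);	/* rewind to the snapshot */
	assert(uio.offset == 1024 + 4096 && uio.resid == 4096 &&
	    uio.base == buf + 4096);
	printf("uio advance/rollback arithmetic checks out\n");
	return (0);
}

In the patch itself the iovec position is re-derived by copyuio() plus a UIO_NOCOPY uiomove() in do_vn_rw_chunked(); the sketch folds that step into rollback_uio() for brevity.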