diff --git a/sys/conf/files b/sys/conf/files index 70113b3..6713e75 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2166,6 +2166,7 @@ kern/kern_poll.c optional device_polling kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard +kern/kern_rangelock.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard kern/kern_rwlock.c standard @@ -2874,6 +2875,7 @@ vm/vm_page.c standard vm/vm_pageout.c standard vm/vm_pager.c standard vm/vm_phys.c standard +vm/vm_readwrite.c standard vm/vm_reserv.c standard vm/vm_unix.c standard vm/vm_zeroidle.c standard diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index ccd9039..a201bd0 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -589,7 +590,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) { struct sf_buf *sf; int rv, offs, len, lastend; - vm_pindex_t i, lastp; + vm_pindex_t i, firstp, lastp; vm_page_t m; u_char *p; @@ -612,18 +613,26 @@ mdstart_swap(struct md_s *sc, struct bio *bp) * we're operating on complete aligned pages). */ offs = bp->bio_offset % PAGE_SIZE; + firstp = bp->bio_offset / PAGE_SIZE; lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE; lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1; + vm_page_t ma[lastp - firstp + 1]; + rv = VM_PAGER_OK; VM_OBJECT_LOCK(sc->object); vm_object_pip_add(sc->object, 1); - for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { + for (i = firstp; i <= lastp; i++) { len = ((i == lastp) ? lastend : PAGE_SIZE) - offs; - m = vm_page_grab(sc->object, i, - VM_ALLOC_NORMAL|VM_ALLOC_RETRY); + /* + * Write cleans pages of the buffer, give it a + * priority. + */ + m = vm_page_grab(sc->object, i, (bp->bio_cmd == BIO_WRITE ? + VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_RETRY); VM_OBJECT_UNLOCK(sc->object); + ma[i - firstp] = m; sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); VM_OBJECT_LOCK(sc->object); @@ -685,6 +694,9 @@ printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid } vm_object_pip_subtract(sc->object, 1); vm_object_set_writeable_dirty(sc->object); + if (rv != VM_PAGER_ERROR && bp->bio_cmd == BIO_WRITE && + vm_page_count_severe()) + vm_pageout_flush(ma, lastp - firstp + 1, IO_SYNC, 0, NULL); VM_OBJECT_UNLOCK(sc->object); return (rv != VM_PAGER_ERROR ? 0 : ENOSPC); } diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c new file mode 100644 index 0000000..fc0ae39 --- /dev/null +++ b/sys/kern/kern_rangelock.c @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include + +struct rl_q_entry { + TAILQ_ENTRY(rl_q_entry) rl_q_link; + size_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +static uma_zone_t rl_entry_zone; + +static void +rangelock_sys_init(void) +{ + + rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + thread0.td_rlqe = rlqentry_alloc(); +} +SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); + +struct rl_q_entry * +rlqentry_alloc() +{ + + return (uma_zalloc(rl_entry_zone, M_WAITOK)); +} + +void +rlqentry_free(struct rl_q_entry *rleq) +{ + + uma_zfree(rl_entry_zone, rleq); +} + +void +rangelock_init(struct rangelock *lock) +{ + + TAILQ_INIT(&lock->rl_waiters); + lock->rl_currdep = NULL; +} + +void +rangelock_destroy(struct rangelock *lock) +{ + + KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +} + +static int +rangelock_incompatible(const struct rl_q_entry *e1, + const struct rl_q_entry *e2) +{ + + if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ && + (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ) + return (0); +#define IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end) + if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) || + IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1)) + return (1); +#undef IN_RANGE + return (0); +} + +static void +rangelock_calc_block(struct rangelock *lock) +{ + struct rl_q_entry *entry, *entry1, *whead; + + if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) && + lock->rl_currdep != NULL) + lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link); + for (entry = lock->rl_currdep; entry; + entry = TAILQ_NEXT(entry, rl_q_link)) { + TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) { + if (rangelock_incompatible(entry, entry1)) + goto out; + if (entry1 == entry) + break; + } + } +out: + lock->rl_currdep = entry; + TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) { + if (whead == lock->rl_currdep) + break; + if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) { + whead->rl_q_flags |= RL_LOCK_GRANTED; + wakeup(whead); + } + } +} + +static void +rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry, + struct mtx *ilk) +{ + + MPASS(lock != NULL && entry != NULL && ilk != NULL); + mtx_assert(ilk, MA_OWNED); + KASSERT(entry != lock->rl_currdep, ("stuck currdep")); + + TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); + rangelock_calc_block(lock); + mtx_unlock(ilk); + if (curthread->td_rlqe == NULL) + curthread->td_rlqe = entry; + else + rlqentry_free(entry); +} + +void +rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk) +{ + struct rl_q_entry *entry; + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + + entry = cookie; + mtx_lock(ilk); + rangelock_unlock_locked(lock, entry, ilk); +} + +void * +rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t base, + size_t len, struct mtx *ilk) +{ + struct 
rl_q_entry *entry; + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + + mtx_lock(ilk); + entry = cookie; + KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX")); + KASSERT(entry->rl_q_start == base, ("XXX")); + KASSERT(entry->rl_q_end >= base + len, ("XXX")); + if (entry->rl_q_end == base + len) { + rangelock_unlock_locked(lock, cookie, ilk); + return (NULL); + } + entry->rl_q_end = base + len; + rangelock_calc_block(lock); + mtx_unlock(ilk); + return (cookie); +} + +static void * +rangelock_enqueue(struct rangelock *lock, struct rl_q_entry *entry, + struct mtx *ilk) +{ + + MPASS(lock != NULL && entry != NULL && ilk != NULL); + + mtx_lock(ilk); + TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link); + if (lock->rl_currdep == NULL) + lock->rl_currdep = entry; + rangelock_calc_block(lock); + while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) + msleep(entry, ilk, 0, "range", 0); + mtx_unlock(ilk); + return (entry); +} + +void * +rangelock_rlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk) +{ + struct rl_q_entry *entry; + struct thread *td; + + td = curthread; + if (td->td_rlqe != NULL) { + entry = td->td_rlqe; + td->td_rlqe = NULL; + } else + entry = rlqentry_alloc(); + entry->rl_q_flags = RL_LOCK_READ; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(lock, entry, ilk)); +} + +void * +rangelock_wlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk) +{ + struct rl_q_entry *entry; + struct thread *td; + + td = curthread; + if (td->td_rlqe != NULL) { + entry = td->td_rlqe; + td->td_rlqe = NULL; + } else + entry = rlqentry_alloc(); + entry->rl_q_flags = RL_LOCK_WRITE; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(lock, entry, ilk)); +} diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 7161a99..ba1869b 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -210,6 +211,7 @@ thread_init(void *mem, int size, int flags) td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); + td->td_rlqe = rlqentry_alloc(); EVENTHANDLER_INVOKE(thread_init, td); td->td_sched = (struct td_sched *)&td[1]; umtx_thread_init(td); @@ -227,6 +229,8 @@ thread_fini(void *mem, int size) td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); + KASSERT(td->td_rlqe != NULL, ("Leaked td_rlqe")); + rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index ba331b1..de1eca9 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -71,8 +71,8 @@ static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); -static int read_max = 32; -SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, +int vfs_read_max = 32; +SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &vfs_read_max, 0, "Cluster read-ahead max block count"); /* Page expended to mark partially backed buffers */ @@ -109,7 +109,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; - maxra = min(read_max, maxra); + maxra = min(vfs_read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; @@ 
-803,7 +803,9 @@ cluster_wbuild(vp, size, start_lbn, len) (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || - ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + ((bp = (vp->v_vflag & VV_MD) ? + trypbuf(&cluster_pbuf_freecnt) : + getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index 195e735..bf038cf 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -78,6 +78,8 @@ static int dirent_exists(struct vnode *vp, const char *dirname, #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) +static int vop_stdextend(struct vop_extend_args *ap); + /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. @@ -121,6 +123,7 @@ struct vop_vector default_vnodeops = { .vop_unlock = vop_stdunlock, .vop_vptocnp = vop_stdvptocnp, .vop_vptofh = vop_stdvptofh, + .vop_extend = vop_stdextend, }; /* @@ -855,6 +858,23 @@ out: return (error); } +static int +vop_stdextend(struct vop_extend_args *ap) +{ + struct vattr vattr, oattr; + int error; + + + error = VOP_GETATTR(ap->a_vp, &oattr, ap->a_cred); + if (error != 0) + return (error); + if (oattr.va_size >= ap->a_size) + return (0); + VATTR_NULL(&vattr); + vattr.va_size = ap->a_size; + return (VOP_SETATTR(ap->a_vp, &vattr, ap->a_cred)); +} + /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index fc413a2..00dffe8 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -878,6 +878,7 @@ vdestroy(struct vnode *vp) /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif + rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); @@ -1032,6 +1033,7 @@ alloc: if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } + rangelock_init(&vp->v_rl); *vpp = vp; return (0); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 42abf6e..855e3f8 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -37,12 +37,14 @@ #include __FBSDID("$FreeBSD$"); +#include #include #include #include #include #include #include +#include #include #include #include @@ -63,6 +65,13 @@ __FBSDID("$FreeBSD$"); #include +#include +#include + +static int vmio_enabled = 1; +SYSCTL_INT(_vfs, OID_AUTO, vmio_enabled, CTLFLAG_RW, &vmio_enabled, 0, + "Use vm pages copyin/out instead of vops for read/write"); + static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_truncate_t vn_truncate; @@ -84,6 +93,9 @@ struct fileops vnops = { .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; +static int vn_write_chunk(struct vnode *, struct uio *, struct ucred *, + struct ucred *, int); + int vn_open(ndp, flagp, cmode, fp) struct nameidata *ndp; @@ -280,17 +292,14 @@ vn_writechk(vp) * Vnode close call */ int -vn_close(vp, flags, file_cred, td) - register struct vnode *vp; - int flags; - struct ucred *file_cred; - struct thread *td; +vn_close(struct vnode *vp, int flags, struct ucred *file_cred, + struct thread *td) { - struct mount *mp; + struct mount *mp, *mp1; int error, lock_flags; - if (!(flags & FWRITE) && vp->v_mount != NULL && - vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED) + if (!(flags & FWRITE) && (mp1 = vp->v_mount) != NULL && + MNT_SHARED_WRITES(mp1)) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; @@ -338,7 +347,7 @@ 
sequential_heuristic(struct uio *uio, struct file *fp) * closely related to the best I/O size for real disks than * to any block size used by software. */ - fp->f_seqcount += howmany(uio->uio_resid, 16384); + fp->f_seqcount += howmany(uio->uio_resid, FRA_BLOCK_SZ); if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; return (fp->f_seqcount << IO_SEQSHIFT); @@ -356,76 +365,71 @@ sequential_heuristic(struct uio *uio, struct file *fp) * Package up an I/O request on a vnode into a uio and do it. */ int -vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, - aresid, td) - enum uio_rw rw; - struct vnode *vp; - void *base; - int len; - off_t offset; - enum uio_seg segflg; - int ioflg; - struct ucred *active_cred; - struct ucred *file_cred; - int *aresid; - struct thread *td; +vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, + enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, int *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; + void *rl_cookie; int error, lock_flags; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) + rl_cookie = vn_rangelock_rlock(vp, offset, len); + else + rl_cookie = vn_rangelock_wlock(vp, offset, len); mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out; if (MNT_SHARED_WRITES(mp) || - ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; - } else { + else lock_flags = LK_EXCLUSIVE; - } - vn_lock(vp, lock_flags | LK_RETRY); } else - vn_lock(vp, LK_SHARED | LK_RETRY); + lock_flags = LK_SHARED; + vn_lock(vp, lock_flags | LK_RETRY); + } else + rl_cookie = NULL; - } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { - if (rw == UIO_READ) - error = mac_vnode_check_read(active_cred, file_cred, - vp); - else + if (rw == UIO_WRITE) error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { - if (file_cred) + if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (rw == UIO_READ) - error = VOP_READ(vp, &auio, ioflg, cred); + error = vn_read_chunk(vp, &auio, active_cred, cred, + ioflg | IO_NODELOCKED); else - error = VOP_WRITE(vp, &auio, ioflg, cred); + error = vn_write_chunk(vp, &auio, active_cred, cred, + ioflg | IO_NODELOCKED); } if (aresid) *aresid = auio.uio_resid; @@ -433,10 +437,13 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { - if (rw == UIO_WRITE && vp->v_type != VCHR) - vn_finished_write(mp); VOP_UNLOCK(vp, 0); + if (mp != NULL) + vn_finished_write(mp); } + out: + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); return (error); } @@ -498,68 +505,148 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, return (error); } +static struct mtx * 
+vn_lock_foffset(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + mtx_unlock(mtxp); + return (mtxp); +} + +static void +vn_unlock_foffset(struct file *fp, struct mtx *mtxp) +{ + + mtx_lock(mtxp); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + mtx_unlock(mtxp); +} + +int +vn_read_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred, + struct ucred *fcred, int ioflag) +{ + int error, vfslocked; + + error = 0; + vfslocked = 0; /* gcc */ + + if ((ioflag & IO_NODELOCKED) == 0) { + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_SHARED | LK_RETRY); + } + +#ifdef MAC + if ((ioflag & IO_NOMACCHECK) == 0) + error = mac_vnode_check_read(active_cred, fcred, vp); +#endif + if (error == 0) { + if (!vmio_enabled || + (error = vnode_pager_read(vp, uio, ioflag)) == EOPNOTSUPP) + error = VOP_READ(vp, uio, ioflag, fcred); + } + if ((ioflag & IO_NODELOCKED) == 0) { + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + } + return (error); +} + /* * File table vnode read routine. */ static int -vn_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - int flags; - struct thread *td; +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - int error, ioflag; struct mtx *mtxp; - int vfslocked; + void *rl_cookie; + int ioflag; + int error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); - mtxp = NULL; - vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vp = fp->f_vnode; + /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. 
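 * The f_offset locking is factored out into vn_lock_foffset() and * vn_unlock_foffset() above so that vn_write() can serialize its f_offset * update in the same way without holding the vnode lock.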
*/ if ((flags & FOF_OFFSET) == 0) { - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); - } - fp->f_vnread_flags |= FOFFSET_LOCKED; - mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; } else - vn_lock(vp, LK_SHARED | LK_RETRY); - + mtxp = NULL; /* gcc */ + if (vp->v_type == VREG) + rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, + uio->uio_resid); + else + rl_cookie = NULL; ioflag |= sequential_heuristic(uio, fp); + error = vn_read_chunk(vp, uio, active_cred, fp->f_cred, ioflag); + fp->f_nextoff = uio->uio_offset; + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); + if ((flags & FOF_OFFSET) == 0) { + fp->f_offset = uio->uio_offset; + vn_unlock_foffset(fp, mtxp); + } + return (error); +} +static int +vn_write_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred, + struct ucred *fcred, int ioflag) +{ + struct mount *mp, *mp1; + int error, lock_flags, vfslocked; + + mp = NULL; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if (vp->v_type == VREG) + bwillwrite(); + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto unlock; + + if (MNT_SHARED_WRITES(mp) || + (mp == NULL && (mp1 = vp->v_mount) != NULL && + MNT_SHARED_WRITES(mp1))) + lock_flags = LK_SHARED; + else + lock_flags = LK_EXCLUSIVE; + vn_lock(vp, lock_flags | LK_RETRY); #ifdef MAC - error = mac_vnode_check_read(active_cred, fp->f_cred, vp); - if (error == 0) + error = mac_vnode_check_write(active_cred, fcred, vp); +#else + error = 0; #endif - error = VOP_READ(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) { - fp->f_offset = uio->uio_offset; - mtx_lock(mtxp); - if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) - wakeup(&fp->f_vnread_flags); - fp->f_vnread_flags = 0; - mtx_unlock(mtxp); + if (error == 0) { + if (!vmio_enabled || + (error = vnode_pager_write(vp, uio, ioflag)) == EOPNOTSUPP) + error = VOP_WRITE(vp, uio, ioflag, fcred); } - fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); + if (vp->v_type != VCHR) + vn_finished_write(mp); +unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -568,24 +655,17 @@ vn_read(fp, uio, active_cred, flags, td) * File table vnode write routine. 
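 * The vnode lock, vn_start_write() and the MAC write check are now handled * in vn_write_chunk(); this wrapper computes ioflag and manages f_offset * and the byte-range lock.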
*/ static int -vn_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - int flags; - struct thread *td; +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - struct mount *mp; - int error, ioflag, lock_flags; - int vfslocked; + struct mtx *mtxp; + void *rl_cookie; + int error, ioflag; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - if (vp->v_type == VREG) - bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; @@ -596,36 +676,32 @@ vn_write(fp, uio, active_cred, flags, td) if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; - mp = NULL; - if (vp->v_type != VCHR && - (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto unlock; - - if ((MNT_SHARED_WRITES(mp) || - ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) && - (flags & FOF_OFFSET) != 0) { - lock_flags = LK_SHARED; - } else { - lock_flags = LK_EXCLUSIVE; - } - - vn_lock(vp, lock_flags | LK_RETRY); - if ((flags & FOF_OFFSET) == 0) + if ((flags & FOF_OFFSET) == 0) { + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; + } else + mtxp = NULL; /* gcc */ ioflag |= sequential_heuristic(uio, fp); -#ifdef MAC - error = mac_vnode_check_write(active_cred, fp->f_cred, vp); - if (error == 0) -#endif - error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) + if (vp->v_type == VREG) { + if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET)) + /* + * For appenders, punt and lock the whole + * range. It also protects f_offset. + */ + rl_cookie = vn_rangelock_wlock(vp, 0, (size_t)-1); + else + rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, + uio->uio_resid); + } else + rl_cookie = NULL; + error = vn_write_chunk(vp, uio, active_cred, fp->f_cred, ioflag); + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); + if ((flags & FOF_OFFSET) == 0) { fp->f_offset = uio->uio_offset; + vn_unlock_foffset(fp, mtxp); + } fp->f_nextoff = uio->uio_offset; - VOP_UNLOCK(vp, 0); - if (vp->v_type != VCHR) - vn_finished_write(mp); -unlock: - VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -633,25 +709,29 @@ unlock: * File table truncate routine. */ static int -vn_truncate(fp, length, active_cred, td) - struct file *fp; - off_t length; - struct ucred *active_cred; - struct thread *td; +vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) { struct vattr vattr; struct mount *mp; struct vnode *vp; + void *rl_cookie; int vfslocked; int error; vp = fp->f_vnode; + + /* + * Lock the range where the shortening takes place. Increasing the + * file size does not need the range lock, but it is faster to lock + * the range than to call VOP_GETATTR to get the current size and + * deal with races.
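+ * The range lock is acquired before the vnode lock and released only + * after the vnode lock has been dropped, i.e. it ranks above the + * vnode lock.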
+ */ + rl_cookie = vn_rangelock_wlock(vp, length, -1); vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); - if (error) { - VFS_UNLOCK_GIANT(vfslocked); - return (error); - } + if (error) + goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; @@ -671,7 +751,9 @@ vn_truncate(fp, length, active_cred, td) out: VOP_UNLOCK(vp, 0); vn_finished_write(mp); +out1: VFS_UNLOCK_GIANT(vfslocked); + vn_rangelock_unlock(vp, rl_cookie); return (error); } diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index 304e009..47657d7 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -618,3 +618,12 @@ vop_vptocnp { INOUT char *buf; INOUT int *buflen; }; + +%% extend vp L L L + +vop_extend { + IN struct vnode *vp; + IN struct ucred *cred; + IN u_quad_t size; + IN int flags; +}; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index f57d6ed..ffcfa80 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -258,6 +258,8 @@ extern const char *buf_wmesg; /* Default buffer lock message */ #include /* XXX for curthread */ #include +extern int vfs_read_max; + /* * Initialize a lock. */ diff --git a/sys/sys/file.h b/sys/sys/file.h index 061ce02..3e7d5c7 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -141,6 +141,8 @@ struct file { #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 +#define FRA_BLOCK_SZ 16384 + #endif /* _KERNEL || _WANT_FILE */ /* diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 48ef012..91ddbe1 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -210,6 +210,7 @@ struct thread { struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ + struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ @@ -386,7 +387,7 @@ do { \ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ -#define TDP_UNUSED80 0x00000080 /* available. */ +#define TDP_VMIO 0x00000080 /* Busied pages for vnode_pager io. */ #define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..a328330 --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include + +#ifdef _KERNEL + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct vnode; +struct rl_q_entry; +struct mtx; + +struct rangelock { + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct rangelock *lock, void *cookie, + struct mtx *ilk); +void *rangelock_unlock_range(struct rangelock *lock, void *cookie, + off_t base, size_t len, struct mtx *ilk); +void *rangelock_rlock(struct rangelock *lock, off_t base, size_t len, + struct mtx *ilk); +void *rangelock_wlock(struct rangelock *lock, off_t base, size_t len, + struct mtx *ilk); + +struct rl_q_entry *rlqentry_alloc(void); +void rlqentry_free(struct rl_q_entry *rlqe); + +#endif /* _KERNEL */ + +#endif /* _SYS_RANGELOCK_H */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 86ff8b6..78e579e 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ @@ -653,6 +655,8 @@ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); +int vn_read_chunk(struct vnode *vp, struct uio *uio, + struct ucred *active_cred, struct ucred *f_cred, int ioflag); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, const struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, @@ -670,6 +674,14 @@ int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); +#define vn_rangelock_unlock(vp, cookie) \ + rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) +#define vn_rangelock_unlock_range(vp, cookie, base, len) \ + rangelock_unlock_range(&(vp)->v_rl, (cookie), (base), (len), VI_MTX(vp)) +#define vn_rangelock_rlock(vp, base, len) \ + rangelock_rlock(&(vp)->v_rl, (base), (len), VI_MTX(vp)) +#define vn_rangelock_wlock(vp, base, len) \ + rangelock_wlock(&(vp)->v_rl, (base), (len), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); diff --git 
a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 6d5f27c..5dcceee 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -641,7 +641,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, if (lastlbn < NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); - if (osize < fs->fs_bsize && osize > 0) { + if (osize < fs->fs_bsize && osize > 0 && dp->di_db[nb] != 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, @@ -708,9 +708,17 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nsize, osize, bp); } } else { - if (ip->i_size < smalllblktosize(fs, lbn + 1)) + if (ip->i_size < smalllblktosize(fs, lbn)) nsize = fragroundup(fs, size); - else + else if (ip->i_size < smalllblktosize(fs, lbn + 1)) { + /* + * Allocate entire tail of the file. + * Write may cover subpart of the extended + * area. + */ + nsize = fragroundup(fs, max(size, + blkoff(fs, ip->i_size))); + } else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index eb14e73..ec0789a 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -4378,7 +4378,7 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ - long oldsize; /* size of new block */ + long oldsize; /* size of old block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; @@ -4506,8 +4506,8 @@ allocdirect_merge(adphead, newadp, oldadp) if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_offset >= NDADDR) - panic("%s %jd != new %jd || old size %ld != new %ld", - "allocdirect_merge: old blkno", + panic("allocdirect_merge: old blkno" + " %jd != new %jd || old size %ld != new %ld", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 15c3f9f..9574839 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -116,7 +116,7 @@ static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; - +static vop_extend_t ffs_extend; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { @@ -128,6 +128,7 @@ struct vop_vector ffs_vnodeops1 = { .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, + .vop_extend = ffs_extend, }; struct vop_vector ffs_fifoops1 = { @@ -153,6 +154,7 @@ struct vop_vector ffs_vnodeops2 = { .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, + .vop_extend = ffs_extend, }; struct vop_vector ffs_fifoops2 = { @@ -170,6 +172,18 @@ struct vop_vector ffs_fifoops2 = { .vop_vptofh = ffs_vptofh, }; +static void +ffs_drop_suid(struct inode *ip, struct ucred *cred) +{ + + if (ip->i_mode & (ISUID | ISGID)) { + if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP_SET(ip, i_mode, ip->i_mode); + } + } +} + /* * Synch an open file. */ @@ -803,13 +817,8 @@ ffs_write(ap) * we clear the setuid and setgid bits as a precaution against * tampering. 
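 * The check is factored into ffs_drop_suid() so that ffs_extend() can * apply the same policy when it grows a file.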
*/ - if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && - ap->a_cred) { - if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) { - ip->i_mode &= ~(ISUID | ISGID); - DIP_SET(ip, i_mode, ip->i_mode); - } - } + if (resid > uio->uio_resid && ap->a_cred != NULL) + ffs_drop_suid(ip, ap->a_cred); if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, @@ -1768,3 +1777,69 @@ vop_vptofh { ufhp->ufid_gen = ip->i_gen; return (0); } + +static int +ffs_extend(struct vop_extend_args *ap) +{ + struct vnode *vp; + struct inode *ip; + struct buf *bp; + struct fs *fs; + off_t osize, xosize; + u_quad_t size; + ufs_lbn_t lastlbn; + ufs2_daddr_t nb; + int error, flags; + + vp = ap->a_vp; + ip = VTOI(vp); + size = ap->a_size; + osize = ip->i_size; + if (osize >= size) + return (0); + + vnode_pager_setsize(vp, size); + fs = ip->i_fs; + flags = ap->a_flags & IO_SYNC; + if (flags != 0) + goto slow; + + lastlbn = lblkno(fs, osize); + if (lastlbn < NDADDR) { + xosize = fragroundup(fs, blkoff(fs, osize)); + if (xosize < fs->fs_bsize && xosize > 0) { + if (ip->i_ump->um_fstype == UFS1) + nb = ip->i_din1->di_db[lastlbn]; + else + nb = ip->i_din2->di_db[lastlbn]; + /* Need to extend fragment */ + if (nb != 0) + goto slow; + } + } + ip->i_size = size; + DIP_SET(ip, i_size, size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + ffs_drop_suid(ip, ap->a_cred); + return (0); + + slow: + error = UFS_BALLOC(vp, size - 1, 1, ap->a_cred, flags|BA_CLRBUF, &bp); + if (error) { + vnode_pager_setsize(vp, osize); + return (error); + } + ip->i_size = size; + DIP_SET(ip, i_size, size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + if (flags & IO_SYNC) { + bwrite(bp); + error = ffs_update(vp, 1); + } else + bawrite(bp); + if (error == 0) + ffs_drop_suid(ip, ap->a_cred); + return (error); +} diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index c396910..af8f100 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -2150,7 +2150,8 @@ ufs_readdir(ap) uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, uio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); } else { struct dirent *dp, *edp; struct uio auio; @@ -2166,7 +2167,8 @@ ufs_readdir(ap) aiov.iov_len = count; dirbuf = malloc(count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; - error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, &auio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; @@ -2188,7 +2190,8 @@ ufs_readdir(ap) free(dirbuf, M_TEMP); } # else - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, uio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index ae8e578..95f15d6 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -87,5 +87,8 @@ struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset); void vm_imgact_unmap_page(struct sf_buf *sf); void vm_thread_dispose(struct thread *td); int vm_thread_new(struct thread *td, int pages); +int vnode_pager_read(struct vnode *vp, struct uio *uio, int ioflags); +int vnode_pager_write(struct vnode *vp, struct uio *uio, int ioflags); + #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ diff --git 
a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index fefc2e7..4c8a420 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -157,18 +157,22 @@ static void unlock_and_deallocate(struct faultstate *fs) { - vm_object_pip_wakeup(fs->object); - VM_OBJECT_UNLOCK(fs->object); - if (fs->object != fs->first_object) { - VM_OBJECT_LOCK(fs->first_object); - vm_page_lock(fs->first_m); - vm_page_free(fs->first_m); - vm_page_unlock(fs->first_m); - vm_object_pip_wakeup(fs->first_object); - VM_OBJECT_UNLOCK(fs->first_object); - fs->first_m = NULL; + if (fs->object != NULL) { + vm_object_pip_wakeup(fs->object); + VM_OBJECT_UNLOCK(fs->object); + if (fs->object != fs->first_object && + fs->first_object != NULL) { + VM_OBJECT_LOCK(fs->first_object); + vm_page_lock(fs->first_m); + vm_page_free(fs->first_m); + vm_page_unlock(fs->first_m); + vm_object_pip_wakeup(fs->first_object); + VM_OBJECT_UNLOCK(fs->first_object); + fs->first_m = NULL; + } + vm_object_deallocate(fs->first_object); + fs->object = fs->first_object = NULL; } - vm_object_deallocate(fs->first_object); unlock_map(fs); if (fs->vp != NULL) { vput(fs->vp); @@ -226,14 +230,15 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int faultcount, ahead, behind, alloc_req; struct faultstate fs; struct vnode *vp; + struct thread *td; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); - fs.vp = NULL; - fs.vfslocked = 0; + memset(&fs, 0, sizeof(fs)); faultcount = behind = 0; + td = curthread; RetryFault:; @@ -248,11 +253,14 @@ RetryFault:; if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); - if (result != KERN_SUCCESS) + if (result != KERN_SUCCESS) { + unlock_and_deallocate(&fs); return (KERN_FAILURE); + } growstack = FALSE; goto RetryFault; } + unlock_and_deallocate(&fs); return (result); } @@ -384,7 +392,8 @@ RetryFault:; */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL && - fs.m->object != kernel_object && fs.m->object != kmem_object) { + fs.m->object != kernel_object && + fs.m->object != kmem_object) { goto readrest; } @@ -547,7 +556,7 @@ vnode_lock: locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | - LK_NOWAIT, curthread); + LK_NOWAIT, td); if (error != 0) { int vfslocked; @@ -557,7 +566,7 @@ vnode_lock: release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | - LK_CANRECURSE, curthread); + LK_CANRECURSE, td); vdrop(vp); fs.vp = vp; fs.vfslocked = vfslocked; @@ -970,9 +979,9 @@ vnode_locked: */ unlock_and_deallocate(&fs); if (hardfault) - curthread->td_ru.ru_majflt++; + td->td_ru.ru_majflt++; else - curthread->td_ru.ru_minflt++; + td->td_ru.ru_minflt++; return (KERN_SUCCESS); } diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index b830202..ee50c02 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -109,9 +109,13 @@ struct vm_object { * VNode pager * * vnp_size - current size of file + * wpos - start write position for seq write detector + * off - offset from wpos for current write */ struct { off_t vnp_size; + off_t wpos; + ssize_t off; } vnp; /* diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c11b024..4e28f8f 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -856,6 +856,8 @@ vm_page_remove(vm_page_t m) m->oflags &= ~VPO_BUSY; vm_page_flash(m); } + if (m->flags & PG_WRITEDIRTY) + vm_writedirty_cleaned(m); /* * Now remove from the object's list of backed pages. 
@@ -1384,6 +1386,19 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) return (m); } +void +vm_wait_queue_free(const char *wmsg) +{ + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + if (!vm_pages_needed) { + vm_pages_needed = 1; + wakeup(&vm_pages_needed); + } + msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, wmsg, + 0); +} + /* * Initialize a page that has been freshly dequeued from a freelist. * The caller has to drop the vnode returned, if it is not NULL. @@ -1488,14 +1503,8 @@ vm_wait(void) vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); - } else { - if (!vm_pages_needed) { - vm_pages_needed = 1; - wakeup(&vm_pages_needed); - } - msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, - "vmwait", 0); - } + } else + vm_wait_queue_free("vmwait"); } /* @@ -2007,6 +2016,9 @@ vm_page_cache(vm_page_t m) if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); + if (m->flags & PG_WRITEDIRTY) + vm_writedirty_cleaned(m); + /* * Insert the page into the object's collection of cached pages * and the physical memory allocator's cache/free page queues. diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 54a15fb..edb02da 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -217,6 +217,7 @@ extern struct vpglocks pa_lock[]; #define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */ #define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */ #define PG_WRITEABLE 0x0010 /* page is mapped writeable */ +#define PG_WRITEDIRTY 0x0020 /* dirtied by vmio write */ #define PG_ZERO 0x0040 /* page is zeroed */ #define PG_REFERENCED 0x0080 /* page has been referenced */ #define PG_UNMANAGED 0x0800 /* No PV management for page */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 8a9bfe1..14d5522 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -111,7 +111,6 @@ __FBSDID("$FreeBSD$"); /* the kernel process "vm_pageout"*/ static void vm_pageout(void); -static int vm_pageout_clean(vm_page_t); static void vm_pageout_scan(int pass); struct proc *pageproc; @@ -215,7 +214,7 @@ static void vm_req_vmdaemon(int req); #endif static void vm_pageout_page_stats(void); -static void +void vm_pageout_init_marker(vm_page_t marker, u_short queue) { @@ -316,7 +315,7 @@ vm_pageout_page_lock(vm_page_t m, vm_page_t *next) * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. 
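 * The function is no longer static: the vmio write daemon in * vm_readwrite.c uses it to launder PG_WRITEDIRTY pages.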
*/ -static int +int vm_pageout_clean(vm_page_t m) { vm_object_t object; @@ -388,7 +387,7 @@ more: vm_page_lock(p); vm_page_test_dirty(p); if (p->dirty == 0 || - p->queue != PQ_INACTIVE || + (p->queue != PQ_INACTIVE && p->queue != PQ_ACTIVE) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; @@ -416,7 +415,7 @@ more: vm_page_lock(p); vm_page_test_dirty(p); if (p->dirty == 0 || - p->queue != PQ_INACTIVE || + (p->queue != PQ_INACTIVE && p->queue != PQ_ACTIVE) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; @@ -531,11 +530,14 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen) if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); - if (vm_page_count_severe()) { - vm_page_lock(mt); + vm_page_lock(mt); + if (mt->queue == PQ_INACTIVE && vm_page_count_severe()) vm_page_try_to_cache(mt); - vm_page_unlock(mt); - } + if ((mt->flags & PG_WRITEDIRTY) != 0 && + (pageout_status[i] == VM_PAGER_OK || + pageout_status[i] == VM_PAGER_BAD)) + vm_writedirty_cleaned(mt); + vm_page_unlock(mt); } } if (prunlen != NULL) @@ -1258,7 +1260,6 @@ unlock_and_continue: vm_pageout_oom(VM_OOM_MEM); } - void vm_pageout_oom(int shortage) { @@ -1478,12 +1479,17 @@ vm_pageout() vm_pageout_page_count = 8; /* + * Try to allow no more than 1/4 of the usable pages for write. + */ + vmio_max_writedirty = cnt.v_page_count / 4; + + /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (cnt.v_page_count > 1024) - cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; + cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 100; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h index 53e051a..4fcf42c 100644 --- a/sys/vm/vm_pageout.h +++ b/sys/vm/vm_pageout.h @@ -77,6 +77,8 @@ extern int vm_pageout_pages_needed; extern int vm_pageout_deficit; extern int vm_pageout_page_count; +extern long vmio_max_writedirty; + /* * Swap out requests */ @@ -94,17 +96,21 @@ extern int vm_pageout_page_count; * Signal pageout-daemon and wait for it. 
*/ +#ifdef _KERNEL extern void pagedaemon_wakeup(void); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() extern void vm_wait(void); extern void vm_waitpfault(void); +extern void vm_wait_queue_free(const char *); -#ifdef _KERNEL boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); int vm_pageout_flush(vm_page_t *, int, int, int, int *); -void vm_pageout_oom(int shortage); +void vm_pageout_oom(int); +int vm_pageout_clean(vm_page_t); +void vm_writedirty_cleaned(vm_page_t); boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); void vm_contig_grow_cache(int, vm_paddr_t, vm_paddr_t); +void vm_pageout_init_marker(vm_page_t marker, u_short queue); #endif #endif /* _VM_VM_PAGEOUT_H_ */ diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 16b6747..4236721 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -883,4 +883,24 @@ DB_SHOW_COMMAND(freepages, db_show_freepages) db_printf("\n"); } } + +DB_SHOW_COMMAND(vpo_dw, vpo_dw) +{ + struct vm_phys_seg *seg; + vm_page_t m; + int segind; + long npages, i; + + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + npages = seg->end - seg->start; + npages /= PAGE_SIZE; + m = seg->first_page; + for (i = 0; i < npages; i++, m++) { + if (m->flags & PG_WRITEDIRTY) + printf("%p\n", m); + } + } +} + #endif diff --git a/sys/vm/vm_readwrite.c b/sys/vm/vm_readwrite.c new file mode 100644 index 0000000..6fc5a11 --- /dev/null +++ b/sys/vm/vm_readwrite.c @@ -0,0 +1,1109 @@ +/*- + * Copyright (c) 2008 Jeffrey Roberson + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_vm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * XXXKIB TODO + * + * 2. VOP_REALLOCBLKS. + * 3. Unset setuid/setgid bits after write. + * 4. Filesystem full handling. 
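+ * vnode_pager_read() and vnode_pager_write() below move data directly + * between the uio and the pages of the vnode's VM object; callers fall + * back to VOP_READ()/VOP_WRITE() when EOPNOTSUPP is returned.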
+ * + */ + +static SYSCTL_NODE(_vfs, OID_AUTO, vmio, CTLFLAG_RW, 0, "VFS VMIO leaf"); + +static int vmio_run = 0; +SYSCTL_INT(_vfs_vmio, OID_AUTO, run, CTLFLAG_RW, &vmio_run, 0, + "Calculate the max sequential run for vnode_pager_read_cluster"); +static int vmio_clrbuf = 1; +SYSCTL_INT(_vfs_vmio, OID_AUTO, clrbuf, CTLFLAG_RW, &vmio_clrbuf, 0, + ""); /* Intentionally undocumented */ +static int vmio_read_pack = 16; +SYSCTL_INT(_vfs_vmio, OID_AUTO, read_pack, CTLFLAG_RW, &vmio_read_pack, 0, + "Length of the page pack for read"); +static int vmio_write_pack = 16; +SYSCTL_INT(_vfs_vmio, OID_AUTO, write_pack, CTLFLAG_RW, &vmio_write_pack, + 0, + "Length of the page pack for write"); +static int vmio_rollbacks1; +SYSCTL_INT(_vfs_vmio, OID_AUTO, rollbacks1, CTLFLAG_RD, &vmio_rollbacks1, + 0, + "Count of times vnode size has to be rolled back for writes " + "while collecting pages"); +static int vmio_rollbacks2; +SYSCTL_INT(_vfs_vmio, OID_AUTO, rollbacks2, CTLFLAG_RD, &vmio_rollbacks2, + 0, + "Count of times vnode size has to be rolled back for writes " + "while reading pages"); +static int vmio_getpages_read; +SYSCTL_INT(_vfs_vmio, OID_AUTO, getpages_read, CTLFLAG_RD, + &vmio_getpages_read, 0, + "Count of times VOP_GETPAGES called for read"); +static int vmio_getpages_write; +SYSCTL_INT(_vfs_vmio, OID_AUTO, getpages_write, CTLFLAG_RD, + &vmio_getpages_write, 0, + "Count of times VOP_GETPAGES called for write"); +static int vmio_reserv_used; +SYSCTL_INT(_vfs_vmio, OID_AUTO, reserv_used, CTLFLAG_RD, + &vmio_reserv_used, 0, + "Count of times reserved page was used by vmio"); +static int vmio_alloc_wait; +SYSCTL_INT(_vfs_vmio, OID_AUTO, alloc_wait, CTLFLAG_RD, &vmio_alloc_wait, + 0, + "Count of times vmio reserved page allocation has to wait"); +static long vmio_writedirty; +SYSCTL_LONG(_vfs_vmio, OID_AUTO, writedirty, CTLFLAG_RD, &vmio_writedirty, + 0, + "Count of pages dirtied by vnode_pager_write"); +long vmio_max_writedirty; +SYSCTL_LONG(_vfs_vmio, OID_AUTO, max_writedirty, CTLFLAG_RW, + &vmio_max_writedirty, 0, + "Maximum allowed system-wide count of pages dirtied by vnode_pager_write"); +static int vmio_writed_wakeups; +SYSCTL_INT(_vfs_vmio, OID_AUTO, writed_wakeups, CTLFLAG_RD, + &vmio_writed_wakeups, 0, + "Count of times vmio write daemon was woken up"); +static int vmio_writed_inact; +SYSCTL_INT(_vfs_vmio, OID_AUTO, writed_inact, CTLFLAG_RD, + &vmio_writed_inact, 0, + "Count of times vmio write daemon cleaned inactive queue"); +static int vmio_writed_act; +SYSCTL_INT(_vfs_vmio, OID_AUTO, writed_act, CTLFLAG_RD, &vmio_writed_act, + 0, + "Count of times vmio write daemon cleaned active queue"); + +static u_int +io_page_bits(int i, vm_offset_t off, ssize_t size) +{ + int start, chunk; + + if (i == 0) { + start = off; + chunk = min(PAGE_SIZE - off, size); + } else if (i * PAGE_SIZE < off + size) { + start = 0; + chunk = PAGE_SIZE; + } else if ((i - 1) * PAGE_SIZE < off + size) { + start = 0; + chunk = (size - off) % PAGE_SIZE; + } else + return (0); + return (vm_page_bits(start, chunk)); +} + +/* + * Blocking allocator of the reserve page. Cannot be called with vnode + * or object lock held. + */ +static void +vnode_alloc_reserv(vm_page_t *reserv) +{ + + while (*reserv == NULL) { + *reserv = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ); + if (*reserv == NULL) { + atomic_add_int(&vmio_alloc_wait, 1); + VM_WAIT; + } + } +} + +/* + * Copied from vm_pageout_scan(). 
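+ * Returns FALSE when the page has left the expected queue and the caller + * must restart its scan; otherwise TRUE, with *target decremented when + * the page could be laundered via vm_pageout_clean().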
+ */ +static boolean_t +vnode_writedirty_clean_page(vm_page_t m, int queue, int *target, + vm_page_t *next) +{ + vm_object_t object; + struct mount *mp; + struct vnode *vp; + struct vm_page marker; + int vfslocked; + + vm_page_lock_assert(m, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + + if (m->queue != queue) + return (FALSE); + *next = TAILQ_NEXT(m, pageq); + + if (m->flags & PG_MARKER) + return (TRUE); + if (m->hold_count) { + vm_page_requeue(m); + return (TRUE); + } + + object = m->object; + if (!VM_OBJECT_TRYLOCK(object) && + (!vm_pageout_fallback_object_lock(m, next) || + m->hold_count != 0)) { + VM_OBJECT_UNLOCK(object); + return (TRUE); + } + if (m->busy || (m->oflags & VPO_BUSY) || !(m->flags & PG_WRITEDIRTY)) { + VM_OBJECT_UNLOCK(object); + return (TRUE); + } + + if (m->dirty != VM_PAGE_BITS_ALL && (m->flags & PG_WRITEABLE) != 0) { + if (pmap_is_modified(m)) + vm_page_dirty(m); + else if (m->dirty == 0) + pmap_remove_all(m); + } + + KASSERT(m->valid != 0, ("VPO_WRITEDIRTY and not valid %p", m)); + if (m->dirty == 0) { + vm_page_flag_clear(m, PG_WRITEDIRTY); + vmio_writedirty--; + VM_OBJECT_UNLOCK(object); + return (TRUE); + } + if (object->flags & OBJ_DEAD) { + VM_OBJECT_UNLOCK(object); + vm_page_requeue(m); + return (TRUE); + } + KASSERT(object->type == OBJT_VNODE, ("VPO_WRITEDIRTY and not vnode")); + + vm_pageout_init_marker(&marker, queue); + TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl, m, &marker, pageq); + vp = object->handle; + vfslocked = 0; + if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { + mp = NULL; + goto unlock_and_continue; + } + KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); + vm_page_unlock_queues(); + vm_page_unlock(m); + vm_object_reference_locked(object); + VM_OBJECT_UNLOCK(object); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK, curthread)) { + VM_OBJECT_LOCK(object); + vm_page_lock(m); + vm_page_lock_queues(); + vp = NULL; + goto unlock_and_continue; + } + VM_OBJECT_LOCK(object); + vm_page_lock(m); + vm_page_lock_queues(); + if (m->queue != queue || m->object != object || + TAILQ_NEXT(m, pageq) != &marker) + goto unlock_and_continue; + if (m->busy || (m->oflags & VPO_BUSY)) + goto unlock_and_continue; + if (m->hold_count) { + vm_page_requeue(m); + goto unlock_and_continue; + } + + vm_page_unlock_queues(); + if (vm_pageout_clean(m) != 0) + (*target)--; + vm_page_lock_queues(); + unlock_and_continue: + VM_OBJECT_UNLOCK(object); + if (mp != NULL) { + vm_page_unlock_queues(); + vm_page_unlock(m); + if (vp != NULL) + vput(vp); + VFS_UNLOCK_GIANT(vfslocked); + vm_object_deallocate(object); + vn_finished_write(mp); + vm_page_lock(m); + vm_page_lock_queues(); + } + *next = TAILQ_NEXT(&marker, pageq); + TAILQ_REMOVE(&vm_page_queues[queue].pl, &marker, pageq); + return (TRUE); +} + +static void +vnode_writedirty_clean_queue(int *target, int queue) +{ + vm_page_t m, next; + boolean_t res; + + vm_page_lock_queues(); + rescan0: + for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); + m != NULL && *target > 0; m = next) { + if (!vm_pageout_page_lock(m, &next)) { + vm_page_unlock(m); + continue; + } + res = vnode_writedirty_clean_page(m, queue, target, &next); + vm_page_unlock(m); + if (!res) + goto rescan0; + } + vm_page_unlock_queues(); +} + +static struct cv wd_speedup; +static struct cv wd_back; + +static void +vnode_writedirty_daemon(void) +{ + int target; + + cv_init(&wd_speedup, "writed"); + cv_init(&wd_back, "vnodeww"); + + vm_page_lock_queues(); + for (;;) { + cv_wait(&wd_speedup, 
&vm_page_queue_mtx); + target = vmio_writedirty - vmio_max_writedirty; + vm_page_unlock_queues(); + atomic_add_int(&vmio_writed_wakeups, 1); + if (target > 0) { + bwillwrite(); + atomic_add_int(&vmio_writed_inact, 1); + vnode_writedirty_clean_queue(&target, PQ_INACTIVE); + } + if (target > 0) { + bwillwrite(); + atomic_add_int(&vmio_writed_act, 1); + vnode_writedirty_clean_queue(&target, PQ_ACTIVE); + } + vm_page_lock_queues(); + cv_broadcast(&wd_back); + } +} + +void +vm_writedirty_cleaned(vm_page_t m) +{ + + vm_page_lock_queues(); + vm_page_flag_clear(m, PG_WRITEDIRTY); + vmio_writedirty--; + cv_broadcast(&wd_back); + vm_page_unlock_queues(); +} + +static struct proc *writedproc; +static struct kproc_desc writed_kp = { + .arg0 = "writed", + .func = vnode_writedirty_daemon, + .global_procpp = &writedproc +}; +SYSINIT(writed, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &writed_kp); + +/* + * Attempt to put backpressure on writes. + */ +static void +vnode_pager_wwait(void) +{ + + if (vmio_writedirty >= vmio_max_writedirty) { + vm_page_lock_queues(); + while (vmio_writedirty >= vmio_max_writedirty) { + cv_signal(&wd_speedup); + cv_wait(&wd_back, &vm_page_queue_mtx); + } + vm_page_unlock_queues(); + } +} + +#define VN_GRAB_NO_VMWAIT 0x0001 + +/* + * Grab a page, waiting until we are woken up due to the page + * changing state. We keep on waiting, if the page continues + * to be in the object. If the page doesn't exist allocate it. + * + * This routine may block, either waiting for busy vnode page, or for + * a page allocation. Later may be disabled with VN_GRAB_NO_VMWAIT + * flag, when vnode lock is held. To ensure progress, reserve page is + * used for ma[0] when wait is disabled and system cannot provide a + * page. + * + * Returns updated page run length in *wp, and filled in ma page + * array. + */ +static void +vnode_grab_pages(struct vnode *vp, vm_page_t *ma, int *wp, vm_pindex_t pindex, + int flags, vm_page_t *reserv) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t pi; + int i; + + KASSERT((flags & VN_GRAB_NO_VMWAIT) || reserv == NULL, + ("vnode_grab_pages: NO_VMWAIT and no reserve")); + + object = vp->v_object; +redo: + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + m = NULL; + for (i = 0, pi = pindex; i < *wp; ) { + if (i > 0) + m = vm_page_next(ma[i - 1]); + if (m == NULL) + m = vm_page_lookup(object, pi); + if (m != NULL) { + if (vm_page_sleep_if_busy(m, TRUE, "pgrnbwt")) + goto redo; + } else { + m = vm_page_alloc(object, pi, VM_ALLOC_NORMAL | + VM_ALLOC_NOBUSY); + } + if (m != NULL) { + ma[i] = m; + i++; + pi++; + continue; + } + if (flags & VN_GRAB_NO_VMWAIT) { + if (i == 0) { + m = *reserv; + *reserv = NULL; + atomic_add_int(&vmio_reserv_used, 1); + m->flags &= ~PG_UNMANAGED; + if (object->memattr != VM_MEMATTR_DEFAULT) + pmap_page_set_memattr(m, + object->memattr); + vm_page_insert(m, object, pindex); + ma[i] = m; + i++; + } + break; + } + VM_OBJECT_UNLOCK(object); + atomic_add_int(&vmio_alloc_wait, 1); + VM_WAIT; + VM_OBJECT_LOCK(object); + goto redo; + } + *wp = i; +} + +/* + * Read a cluster starting at 'ma'. Note that we need to always redo + * page grab because our caller dropped object lock while not holding + * vnode lock. 
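+ * The run length actually obtained is returned in *maxrun and may be
+ * shorter than requested, e.g. when a page in the middle of the run
+ * turns out to be already valid or the run would extend past the end
+ * of the object.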
+ */ +static int +vnode_pager_read_cluster(struct vnode *vp, vm_page_t ma[], vm_pindex_t idx, + int *maxrun, int flags, vm_page_t *reserv) +{ + vm_object_t obj; + vm_page_t m; + daddr_t blkno; + int bsize; + int error; + int run; + int i; + + obj = vp->v_object; + bsize = vp->v_mount->mnt_stat.f_iosize; + error = 0; + blkno = 0; + + if (vmio_run) { + VM_OBJECT_UNLOCK(obj); + error = VOP_BMAP(vp, IDX_TO_OFF(idx)/bsize, NULL, &blkno, &run, + NULL); + VM_OBJECT_LOCK(obj); + run = MIN(run, *maxrun); + if (error || run == 0 || blkno == -1) { +/* printf("vnode_pager_read_cluster short\n"); */ + *maxrun = 1; + vnode_grab_pages(vp, ma, maxrun, idx, + VN_GRAB_NO_VMWAIT, reserv); + error = vm_pager_get_pages(obj, ma, 1, 0); + if (error != VM_PAGER_OK) + return (EIO); + return (0); + } + run = (run + 1) * bsize / PAGE_SIZE; + run = MIN(run, vp->v_mount->mnt_iosize_max / PAGE_SIZE); + } else { + if (*maxrun == 0) + *maxrun = 1; + run = MIN(*maxrun, vp->v_mount->mnt_iosize_max / PAGE_SIZE); + } + if (IDX_TO_OFF(idx) + run * PAGE_SIZE > obj->un_pager.vnp.vnp_size) { + run = (obj->un_pager.vnp.vnp_size - IDX_TO_OFF(idx)) / + PAGE_SIZE; + } + if (run == 0) + run = 1; + vnode_grab_pages(vp, ma, &run, idx, VN_GRAB_NO_VMWAIT, reserv); + for (i = 0; i < run; i++) { + if (i > 0 && ma[i]->valid != 0) { + run = i; + break; + } + vm_page_busy(ma[i]); + } + +/* printf("vnode_pager_read_cluster %d %p %p\n", run, ma, ma[0]); */ + error = vm_pager_get_pages(obj, ma, run, 0); + if (error != VM_PAGER_OK) { + for (i = 0; i < run; i++) { + vm_page_lock(ma[i]); + vm_page_free(ma[i]); + vm_page_unlock(ma[i]); + } + return (EIO); + } + KASSERT(ma[0]->valid == VM_PAGE_BITS_ALL, + ("ma[0]->valid %x", ma[0]->valid)); + vm_page_wakeup(ma[0]); + /* ma[0] cannot be cached */ + for (i = 1; i < run; i++) { + m = vm_page_next(ma[i - 1]); + if (m == NULL || ma[i] != m || m->valid == 0) + break; +/* printf("run %d ma[%d]: obj %p %p pindex %jd p+i %jd valid %x\n", + run, i, obj, ma[i]->object, ma[i]->pindex, ma[0]->pindex + i, ma[i]->valid); */ + } + *maxrun = i; + return (0); +} + +int +vnode_pager_read(struct vnode *vp, struct uio *uio, int ioflags) +{ + vm_object_t obj; + vm_offset_t off; + vm_pindex_t idx; + vm_page_t reserv; + ssize_t size; + int error, seqcount, wpmax, wp, i; + u_int bits; + struct thread *td; + + if (ioflags & (IO_EXT|IO_DIRECT)) + return (EOPNOTSUPP); + + ASSERT_VOP_LOCKED(vp, "vnode_pager_read"); + if (vp->v_iflag & VI_DOOMED) + return (EBADF); + + /* + * Ignore non-regular files. + */ + if (vp->v_type != VREG) + return (EOPNOTSUPP); + obj = vp->v_object; + if (obj == NULL) + return (EOPNOTSUPP); + + seqcount = (ioflags >> IO_SEQSHIFT) * FRA_BLOCK_SZ / PAGE_SIZE; + seqcount = min(vfs_read_max, seqcount); + seqcount = min(vp->v_mount->mnt_iosize_max / PAGE_SIZE, seqcount); + VOP_UNLOCK(vp, 0); + + wpmax = atomic_load_acq_int(&vmio_read_pack); + vm_page_t ma[wpmax + 1]; + + while (vm_page_count_severe()) { + atomic_add_int(&vm_pageout_deficit, MIN(wpmax + 1, + (uio->uio_resid + PAGE_SIZE - 1) >> PAGE_SHIFT)); + VM_WAIT; + } + + error = 0; + reserv = NULL; + td = uio->uio_td; + /* XXXKIB This should be disallowed. 
*/ + if (td == NULL) + td = curthread; + + VM_OBJECT_LOCK(obj); + while (uio->uio_resid > 0) { + wp = wpmax; + + size = obj->un_pager.vnp.vnp_size - uio->uio_offset; + if (size <= 0) + break; + idx = OFF_TO_IDX(uio->uio_offset); + off = uio->uio_offset - IDX_TO_OFF(idx); + size = MIN(MIN(PAGE_SIZE * wp - off, uio->uio_resid), size); + + wp = (size + off + PAGE_SIZE - 1) / PAGE_SIZE; + vnode_grab_pages(vp, ma, &wp, idx, 0, NULL); + find_valid: + for (i = 0; i < wp; i++) { + bits = io_page_bits(i, off, size); + + /* + * Only do read if first page of array is not + * valid for us. We have to drop object lock + * to obtain vnode lock, that allows the pages + * to change identity or validity bits, and we + * can guarantee allocation of only one + * (reserved) page. + */ + if ((ma[i]->valid & bits) != bits) { + if (i != 0) { + wp = i; + break; + } + VM_OBJECT_UNLOCK(obj); + vnode_alloc_reserv(&reserv); + error = vn_lock(vp, LK_SHARED); + VM_OBJECT_LOCK(obj); + if (error != 0) { + error = EBADF; + break; + } + + /* + * Read page, honouring read-ahead settings + * for filedescriptor. + */ + atomic_add_int(&vmio_getpages_read, 1); + error = vnode_pager_read_cluster(vp, ma, idx, + &wp, VN_GRAB_NO_VMWAIT, &reserv); + VOP_UNLOCK(vp, 0); + if (error != 0) + break; + /* + * No need to redo size calculation. + * Despite both vnode and object locks + * were dropped, range lock and file + * descriptor reference shall keep + * file from truncation. + */ + goto find_valid; + } + } + if (error != 0) + break; + KASSERT(wp > 0, ("wp == 0")); +/* printf("vp %p wp %d size %d\n", vp, wp, size); */ + + /* + * Prevent object deallocation and pages swap-out. + */ + vm_object_pip_add(obj, 1); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_hold(ma[i]); + vm_page_unlock(ma[i]); + } + VM_OBJECT_UNLOCK(obj); + + /* + * Recalculate i/o size, since vnode_grab_pages() + * might shortened the page run. + */ + size = MIN(MIN(PAGE_SIZE * wp - off, uio->uio_resid), size); + + /* + * Access user map pages, vnode lock is dropped. + * Possible page fault is safe at this point. Vnode + * rangelock is held, protecting from parallel + * writers. 
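+		 * The pages themselves are held above, so they cannot be
+		 * freed while the copy to userspace is in progress.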
+ */ +/* printf("size %d %d %ju\n", size, uio->uio_resid, (uintmax_t)off); */ + KASSERT((td->td_pflags & TDP_VMIO) == 0, + ("Recursed vnode_pager_read")); + td->td_pflags |= TDP_VMIO; + error = uiomove_fromphys(ma, off, size, uio); + td->td_pflags &= ~TDP_VMIO; + + VM_OBJECT_LOCK(obj); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_unhold(ma[i]); + vm_page_activate(ma[i]); + vm_page_unlock(ma[i]); + } + vm_object_pip_wakeup(obj); + if (error != 0) + break; + } + VM_OBJECT_UNLOCK(obj); + if (reserv != NULL) + vm_page_free(reserv); + vn_lock(vp, LK_SHARED | LK_RETRY); + if (error == 0) + vfs_mark_atime(vp, td->td_ucred); + + return (error); +} + +int +vnode_pager_write(struct vnode *vp, struct uio *uio, int ioflags) +{ + vm_object_t obj; + vm_offset_t off; + vm_pindex_t idx, clean_start, clean_end; + vm_page_t reserv; + struct vattr vattr; + ssize_t size, size1, osize, osize1, resid, sresid, written; + int error, vn_locked, wpmax, wp, i, pflags; + u_int bits; + boolean_t vnode_locked, freed, freed1, first_extend; + struct thread *td; + + if (ioflags & (IO_EXT|IO_INVAL|IO_DIRECT)) + return (EOPNOTSUPP); + ASSERT_VOP_LOCKED(vp, "vnode_pager_write"); + if (vp->v_iflag & VI_DOOMED) + return (EBADF); + if (vp->v_type != VREG) + return (EOPNOTSUPP); + obj = vp->v_object; + if (obj == NULL) + return (EOPNOTSUPP); + vn_locked = VOP_ISLOCKED(vp); + vnode_locked = TRUE; + error = 0; + first_extend = TRUE; + + /* + * Reversed logic from vnode_generic_putpages(). + */ + if (ioflags & IO_SYNC) + pflags = VM_PAGER_PUT_SYNC; + else if (ioflags & IO_ASYNC) + pflags = 0; + else + pflags = VM_PAGER_CLUSTER_OK; + + wpmax = atomic_load_acq_int(&vmio_write_pack); + vm_page_t ma[wpmax + 1]; + + /* + * Try to ensure that enough pages is available in advance. + */ + while (vm_page_count_severe()) { + if (vnode_locked) { + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + } + atomic_add_int(&vm_pageout_deficit, MIN(wpmax + 1, + (uio->uio_resid + PAGE_SIZE - 1) >> PAGE_SHIFT)); + VM_WAIT; + } + + /* + * Allocate first reserve page. + */ + for (reserv = NULL; reserv == NULL; ) { + reserv = vm_page_alloc(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); + if (reserv == NULL) { + if (vnode_locked) { + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + } + atomic_add_int(&vmio_alloc_wait, 1); + VM_WAIT; + } + } + if (!vnode_locked) { + /* + * Since vnode lock was dropped, we are under low free + * pages condition, so more write trottling is due. + */ + vnode_pager_wwait(); + + vn_lock(vp, vn_locked | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + if (reserv != NULL) + vm_page_free(reserv); + return (EBADF); + } + vnode_locked = TRUE; + } + + if (ioflags & IO_APPEND) + uio->uio_offset = obj->un_pager.vnp.vnp_size; + + clean_start = OFF_TO_IDX(uio->uio_offset); + clean_end = OFF_TO_IDX(uio->uio_offset + uio->uio_resid + + PAGE_SIZE - 1); + + td = uio->uio_td; + if (td == NULL) + td = curthread; + + error = vn_rlimit_fsize(vp, uio, td); + if (error != 0) + return (error); + osize = osize1 = obj->un_pager.vnp.vnp_size; + resid = uio->uio_resid; + +io_loop: + while (uio->uio_resid > 0) { + wp = wpmax; + size = uio->uio_resid; + idx = OFF_TO_IDX(uio->uio_offset); + off = uio->uio_offset - IDX_TO_OFF(idx); + size = MIN(PAGE_SIZE * wp - off, uio->uio_resid); + if (!vnode_locked) { + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + error = EBADF; + break; + } + vnode_locked = TRUE; + } + osize1 = obj->un_pager.vnp.vnp_size; + + /* + * Extend the file if writing past end. 
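+		 * The shared vnode lock, if held, is replaced with an
+		 * exclusive one before calling VOP_EXTEND().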
+ */ + if (osize1 < uio->uio_offset + size || first_extend) { + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + } + if (!vnode_locked) { + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + error = EBADF; + break; + } + vnode_locked = TRUE; + } + vattr.va_size = uio->uio_offset + size; + error = VOP_EXTEND(vp, td->td_ucred, uio->uio_offset + + size, ioflags); + first_extend = FALSE; + } + if (error != 0) + break; + + wp = (size + off + PAGE_SIZE - 1) / PAGE_SIZE; + VM_OBJECT_LOCK(obj); + + /* + * Use VN_GRAB_NO_VMWAIT since vnode lock is held. + */ + vnode_grab_pages(vp, ma, &wp, idx, VN_GRAB_NO_VMWAIT, &reserv); + find_valid: + for (i = 0; i < wp; i++) { + /* + * If the page falls into the newly-extended + * range, zero it and mark as valid. There is + * nothing VOP_GETPAGES can read from file. + */ + if (IDX_TO_OFF(ma[i]->pindex) >= osize1) { + if ((ma[i]->flags & PG_ZERO) == 0) + pmap_zero_page(ma[i]); + ma[i]->valid = VM_PAGE_BITS_ALL; + } + + /* + * Pages need to be fully valid, because we + * can only hold them during uiomove later. + * + * The page fault happening in other thread + * after uiomove finished but before valid + * bits are corrected below would cause lost + * of newly written data if page is not fully + * valid. + */ + if (ma[i]->valid == VM_PAGE_BITS_ALL) + continue; + if (!vmio_clrbuf) { + bits = io_page_bits(i, off, size); + if ((ma[i]->valid & ~bits) == (~bits & + VM_PAGE_BITS_ALL)) + continue; + } + if (i != 0) { + wp = i; + break; + } + if (reserv == NULL) + reserv = vm_page_alloc(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); + if (reserv == NULL) { + VM_OBJECT_UNLOCK(obj); + + /* + * Truncate the file back to the + * original size to prevent mmap from + * seeing invalid pages. We are going + * to drop vnode lock. + */ + if (osize1 < uio->uio_offset + size) { + atomic_add_int(&vmio_rollbacks1, 1); + VATTR_NULL(&vattr); + vattr.va_size = osize1; + error = VOP_SETATTR(vp, &vattr, + td->td_ucred); + if (error != 0) + break; + } + KASSERT(vnode_locked, ("lost vnode lock 1")); + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + vnode_pager_wwait(); + vnode_alloc_reserv(&reserv); + goto io_loop; + } + + atomic_add_int(&vmio_getpages_write, 1); + error = vnode_pager_read_cluster(vp, ma, idx, &wp, + VN_GRAB_NO_VMWAIT, &reserv); + if (error != 0) { + VM_OBJECT_UNLOCK(obj); + break; + } + goto find_valid; + } + /* Loop above is exited with unlocked obj if error != 0. */ + if (error != 0) + break; + KASSERT(wp > 0, ("wp == 0")); + + /* + * Prevent the object deallocation and hold the pages. + * Held page can be removed from object, but cannot be + * reused. Range lock taken in vn_truncate() prevents + * most typical race. + * + * XXXKIB Busying the pages there would cause deadlock + * with vm_object_page_remove() or self-lock with + * vm_fault(), but would allow to not require the + * pages to be fully valid before uiomove. + * + * The mmap could see zeroed pages that are inserted + * into extended area after we dropped object lock. + * This could be considered an application race. + */ + vm_object_pip_add(obj, 1); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_hold(ma[i]); + vm_page_unlock(ma[i]); + } + VM_OBJECT_UNLOCK(obj); + + /* + * Recalculate i/o size, since vnode_grab_pages() + * might have shortened the page run. Save previous + * resid to correctly mark written pages regions as + * dirty. 
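+		 * If the run did shrink, the file size set by VOP_EXTEND()
+		 * above is rolled back right after this.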
+ */ + sresid = uio->uio_resid; + size1 = MIN(MIN(PAGE_SIZE * wp - off, sresid), size); + + /* + * Shrunk file in case we allocated less pages then + * the estimation that was used to VOP_EXTEND. + */ + KASSERT(vnode_locked, ("lost vnode lock 2")); + if (size1 < size && osize1 < uio->uio_offset + size) { + atomic_add_int(&vmio_rollbacks2, 1); + VATTR_NULL(&vattr); + vattr.va_size = uio->uio_offset + size1; + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + if (error != 0) { + VM_OBJECT_LOCK(obj); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_unhold(ma[i]); + vm_page_deactivate(ma[i]); + vm_page_unlock(ma[i]); + } + vm_object_pip_wakeup(obj); + VM_OBJECT_UNLOCK(obj); + break; + } + } + size = size1; + + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + + KASSERT((td->td_pflags & TDP_VMIO) == 0, + ("Recursed vnode_pager_write")); +/* printf("W: vp %p off %jd %jd size %jd\n", + vp, (intmax_t)uio->uio_offset, (intmax_t)off, (intmax_t)size); */ + td->td_pflags |= TDP_VMIO; + error = uiomove_fromphys(ma, off, size, uio); + td->td_pflags &= ~TDP_VMIO; + + freed = FALSE; + VM_OBJECT_LOCK(obj); + for (i = 0; i < wp; i++) { + /* + * Note that the page is marked dirty + * regardeless of the possible error from + * uiomove. We must mark the pages that were + * touched by uiomove before fault + * occured. Since we do not record the + * progress of the uiomove till fault, just + * mark them all. + */ + ma[i]->dirty |= io_page_bits(i, off, sresid - + uio->uio_resid); + vm_page_lock_queues(); + if ((ma[i]->flags & PG_WRITEDIRTY) == 0) { + vm_page_flag_set(ma[i], PG_WRITEDIRTY); + vmio_writedirty++; + } + vm_page_unlock_queues(); + freed1 = FALSE; + if (ma[i]->queue == PQ_HOLD) + freed = freed1 = TRUE; + vm_page_lock(ma[i]); + vm_page_unhold(ma[i]); + if (!freed1) + vm_page_activate(ma[i]); + vm_page_unlock(ma[i]); + } + /* See the comment above about page dirtiness. */ + vm_object_set_writeable_dirty(obj); + + /* + * Try to cluster writes. + */ + written = sresid - uio->uio_resid; + if (obj->un_pager.vnp.wpos + obj->un_pager.vnp.off == + uio->uio_offset - written) { + /* + * Sequential writes detected, make a note and + * try to take immediate advantage of it. + */ + if (!freed && OFF_TO_IDX(uio->uio_offset) > + OFF_TO_IDX(uio->uio_offset - written) && + vn_lock(vp, vn_locked | LK_NOWAIT) == 0) { + vm_pageout_flush(ma, wp, pflags, 0, NULL); + VOP_UNLOCK(vp, 0); + } +/* printf("seq write, wpos %jd off %jd written %d\n", (intmax_t)obj->un_pager.vnp.wpos, (intmax_t)obj->un_pager.vnp.off, written); */ + obj->un_pager.vnp.off += written; + } else { + /* + * Not a sequential write situation, still + * might be good to not split large write in + * the daemons struggling under pressure. + */ + if (!freed && wp >= vm_pageout_page_count && + vn_lock(vp, vn_locked | LK_NOWAIT) == 0) { + vm_pageout_flush(ma, wp, pflags, 0, NULL); + VOP_UNLOCK(vp, 0); + } +/* printf("nonseq write, wpos %jd off %jd wp %d\n", (intmax_t)obj->un_pager.vnp.wpos, (intmax_t)obj->un_pager.vnp.off, wp); */ + obj->un_pager.vnp.wpos = uio->uio_offset; + obj->un_pager.vnp.off = 0; + } + vm_object_pip_wakeup(obj); + VM_OBJECT_UNLOCK(obj); + if (error != 0) + break; + KASSERT(!vnode_locked, ("vnode leak 3")); + + vnode_pager_wwait(); + + /* + * Re-fill reserv while vnode lock is dropped. 
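+		 * The allocation may sleep, which is safe here because
+		 * neither the vnode nor the object lock is held anymore.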
+ */ + if (uio->uio_resid != 0) + vnode_alloc_reserv(&reserv); + } + + if (!vnode_locked) + vn_lock(vp, vn_locked | LK_RETRY); + if (reserv != NULL) + vm_page_free(reserv); + if (vp->v_iflag & VI_DOOMED) { + if (error == 0) + error = EBADF; + return (error); + } + if (error == 0) { + if (((ioflags & IO_SYNC) != 0 && + (vp->v_vflag & VV_NOSYNC) == 0) || vm_page_count_severe()) { + VM_OBJECT_LOCK(obj); + vm_object_page_clean(obj, clean_start, clean_end, + OBJPC_SYNC); + VM_OBJECT_UNLOCK(obj); +#if 0 + /* + * XXXKIB The following call is commented out in + * vm_object_page_clean() in the same way. + */ + error = VOP_FSYNC(vp, MNT_WAIT); +#endif + } + } else { + /* + * Roll back on error if atomic write was requested. + */ + VATTR_NULL(&vattr); + vattr.va_size = (ioflags & IO_UNIT) ? osize : osize1; + VOP_SETATTR(vp, &vattr, td->td_ucred); + if (ioflags & IO_UNIT) { + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } + + return (error); +} diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index f497d41..7db9bd9 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -695,6 +695,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) object = vp->v_object; count = bytecount / PAGE_SIZE; +/* printf("vpgg: %p %jd %x %d\n", vp, m[0]->pindex, count, reqpage); */ KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("vnode_pager_generic_getpages does not support devices")); @@ -1087,6 +1088,7 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, object = vp->v_object; count = bytecount / PAGE_SIZE; +/* printf("vpgp: %p %jd %x %d\n", vp, m[0]->pindex, m[0]->dirty, count); */ for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; diff --git a/tools/regression/file/uio/uio.c b/tools/regression/file/uio/uio.c new file mode 100644 index 0000000..d857605 --- /dev/null +++ b/tools/regression/file/uio/uio.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int chunk_cnt = 1024; +int chunk_size = 1024; + +int +main(int argc, char *argv[]) +{ + struct iovec *wiov, *riov; + char **wdata, **rdata; + int fd, i; + ssize_t io_error; + + if (argc < 2) { + fprintf(stderr, "Usage: uio file [chunk count [chunk size]]\n"); + return (2); + } + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) { + fprintf(stderr, "Failed to create %s: %s\n", + argv[1], strerror(errno)); + return (1); + } + + if (argc > 2) + chunk_cnt = atoi(argv[2]); + if (argc > 3) + chunk_size = atoi(argv[3]); + + wiov = calloc(chunk_cnt, sizeof(*wiov)); + wdata = calloc(chunk_cnt, sizeof(*wdata)); + + riov = calloc(chunk_cnt, sizeof(*riov)); + rdata = calloc(chunk_cnt, sizeof(*rdata)); + + for (i = 0; i < chunk_cnt; i++) { + rdata[i] = malloc(chunk_size); + riov[i].iov_base = rdata[i]; + riov[i].iov_len = chunk_size; + + wdata[i] = malloc(chunk_size); + memset(wdata[i], i, chunk_size); + wiov[i].iov_base = wdata[i]; + wiov[i].iov_len = chunk_size; + } + + io_error = writev(fd, wiov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "write failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated write: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + if (lseek(fd, 0, SEEK_SET) == -1) { + fprintf(stderr, "lseek failed: %s\n", strerror(errno)); + return (1); + } + + io_error = readv(fd, riov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated read: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + for (i = 0; i < chunk_cnt; i++) { + if (memcmp(rdata[i], wdata[i], chunk_size) != 0) { + fprintf(stderr, "chunk %d differs\n", i); + return (1); + } + } + + return (0); +} diff --git a/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c new file mode 100644 index 0000000..1b0acbe --- /dev/null +++ b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const int blks = 2; + +static void +flush_buffers(int fd) +{ + struct stat st; + char *addr; + int error; + + printf("Flushing buffers\n"); + error = fstat(fd, &st); + if (error == -1) + err(2, "stat"); + fsync(fd); + addr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == (char *)-1) + err(2, "mmap"); + error = msync(addr, st.st_size, MS_SYNC | MS_INVALIDATE); + if (error == -1) + err(2, "msync"); + munmap(addr, st.st_size); +} + +int +main(int argc, char *argv[]) +{ + struct statfs fst; + char *data, *vrfy; + size_t sz; + int fd, i, error, ret; + + if (argc < 2) + errx(2, "Usage: ba_clrbuf file"); + + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) + err(2, "Failed to create %s", argv[1]); + + if (fstatfs(fd, &fst) == -1) + err(2, "stat"); + + sz = fst.f_iosize * blks; + data = malloc(sz); + if (data == NULL) + err(2, "malloc"); + vrfy = malloc(sz); + if (vrfy == NULL) + err(2, "malloc"); + for (i = 0; i < (int)sz; i++) + data[i] = i; + error = write(fd, data, sz); + if (error == -1) + err(2, "write"); + else if (error != (int)sz) + errx(2, "Short write %d %d", error, sz); + + flush_buffers(fd); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0"); + else if (error != 0) + errx(2, "lseek 0 returned %d", error); + error = write(fd, NULL, fst.f_iosize); + printf("faulty write, error %s\n", strerror(errno)); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0/2"); + else if (error != 0) + errx(2, "lseek 0/2 returned %d", error); + error = read(fd, vrfy, sz); + if (error == -1) + err(2, "read"); + else if (error != (int)sz) + errx(2, "short read %d %d", error, sz); + + if (memcmp(data, vrfy, fst.f_iosize) != 0) { + printf("Zero block corrupted, byte at 0 is %x\n", + (unsigned char)vrfy[0]); + ret = 1; + } else { + printf("No corruption\n"); + ret = 0; + } + + return (ret); +} diff --git a/tools/tools/ufs/fragc/fragc.c b/tools/tools/ufs/fragc/fragc.c new file mode 100644 index 0000000..80ec3ff --- /dev/null +++ b/tools/tools/ufs/fragc/fragc.c @@ -0,0 +1,215 @@ +/* $Id: fragc.c,v 1.9 2010/02/07 14:32:22 kostik Exp kostik $ */ + +/* /usr/local/opt/gcc-4.4.3/bin/gcc -g -Wall -Wextra -O -o fragc fragc.c -lufs */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const int blocksz = 512; + +static int verbose; + +static void +usage(void) +{ + + fprintf(stderr, "Usage: fragc [-v] devname\n"); +} + +static ufs2_daddr_t blks_total; +static ufs2_daddr_t blks_breaks; + +static void +block_pair(struct fs *fs, ufs2_daddr_t *prev, ufs2_daddr_t curr) +{ + + blks_total++; + if (curr != 0) { + if (*prev != 0 && + (*prev) + fs->fs_bsize / fs->fs_fsize != curr) { + blks_breaks++; + if (verbose) + putchar('|'); + } + if (verbose) + printf(" %jd", (intmax_t)curr); + } + 
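+	/* Remember this block so the next call can check contiguity. */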
*prev = curr; +} + +static void +count_indir(struct uufsd *u, struct fs *fs, int level, int maxlevel, + ufs2_daddr_t ib, ufs2_daddr_t *prev) +{ + ufs2_daddr_t *b; + unsigned i; + + if (ib == 0) + return; + b = malloc(fs->fs_bsize); + if (bread(u, ib * fs->fs_fsize / blocksz, b, fs->fs_bsize) == -1) { + printf("\nRead block %jd: %s\n", (intmax_t)ib, u->d_error); + goto out; + } + for (i = 0; i < fs->fs_bsize / sizeof(ufs2_daddr_t); i++) { + if (level == maxlevel) + block_pair(fs, prev, b[i]); + else + count_indir(u, fs, level + 1, maxlevel, b[i], prev); + } + out: + free(b); +} + +static void +count_ino_ufs1(struct uufsd *u, struct fs *fs, struct ufs1_dinode *dp) +{ + ufs2_daddr_t prev; + unsigned i; + + if (dp->di_size == 0) + return; + if ((dp->di_mode & IFMT) == IFLNK && dp->di_size < + (u_int64_t)fs->fs_maxsymlinklen) + return; + + prev = 0; + for (i = 0; i < NDADDR; i++) + block_pair(fs, &prev, dp->di_db[i]); + for (i = 0; i < NIADDR; i++) { + if (0 && verbose) + printf(" [%d]", dp->di_ib[i]); + count_indir(u, fs, 0, i, dp->di_ib[i], &prev); + } +} + +static void +count_ino_ufs2(struct uufsd *u, struct fs *fs, struct ufs2_dinode *dp) +{ + ufs2_daddr_t prev; + unsigned i; + + if (dp->di_size == 0) + return; + if ((dp->di_mode & IFMT) == IFLNK && dp->di_size < + (u_int64_t)fs->fs_maxsymlinklen) + return; + + prev = 0; + for (i = 0; i < NDADDR; i++) + block_pair(fs, &prev, dp->di_db[i]); + for (i = 0; i < NIADDR; i++) { + if (0 && verbose) + printf(" [%jd]", (intmax_t)(dp->di_ib[i])); + count_indir(u, fs, 0, i, dp->di_ib[i], &prev); + } +} + +static void +frag_calc(struct uufsd *u) +{ + struct fs *fs; + struct cg *cg; + void *dino; + int32_t cgno; + uint32_t ino, inoused, cgino, next_cg_ino; + int mode; + u_int8_t *cp; + + fs = &u->d_fs; + if (verbose) + printf("%s UFS%d\n", u->d_name, u->d_ufs); + ino = 0; + for (cgno = 0; cgread(u); cgno++) { + cg = &u->d_cg; + if (u->d_ufs == 1) + inoused = fs->fs_ipg; + else + inoused = cg->cg_initediblk; + if (verbose) + printf("cg %d inodes %u\n", cgno, inoused); + cp = cg_inosused(cg); + next_cg_ino = ino + fs->fs_ipg; + for (cgino = 0; cgino < inoused; cgino++, ino++) { + if ((cp[cgino / CHAR_BIT] & (1 << (cgino % CHAR_BIT))) + != 0 && ino != 0 && ino != 1) { + if (verbose) + printf(" ino %u:", ino); + if (getino(u, &dino, ino, &mode) == -1) { + printf("\nReading ino %u: %s\n", + ino, u->d_error); + return; + } + if (mode == 0) { + printf( +"\nIno %u/%u is allocated in bitmap, but mode is 0\n", + ino, ino % fs->fs_ipg); + continue; + } + if (mode != IFDIR && mode != IFREG && + mode != IFLNK) + continue; + + if (u->d_ufs == 1) + count_ino_ufs1(u, fs, dino); + else + count_ino_ufs2(u, fs, dino); + if (verbose) + putchar('\n'); + } + } + ino = next_cg_ino; + } +} + +int +main(int argc, char *argv[]) +{ + struct uufsd ufsd; + int c; + + verbose = 0; + while ((c = getopt(argc, argv, "hv")) != -1) { + switch (c) { + case 'h': + usage(); + return (0); + case 'v': + verbose = 1; + break; + default: + usage(); + return (2); + } + } + if (optind + 1 != argc) { + usage(); + return (2); + } + + if (ufs_disk_fillout(&ufsd, argv[optind]) == -1) { + fprintf(stderr, "Fillout: %s\n", ufsd.d_error); + return (1); + } + + frag_calc(&ufsd); + + if (ufs_disk_close(&ufsd) == -1) { + fprintf(stderr, "Disk close: %s\n", ufsd.d_error); + return (1); + } + + printf("Total %jd data blocks, %jd breaks, %02.2f%% fragmentation.\n", + (intmax_t)blks_total, (intmax_t)blks_breaks, + (double)blks_breaks * 100.0 / blks_total); + + return (0); +}