Index: sbin/fsck_ffs/suj.c
===================================================================
--- sbin/fsck_ffs/suj.c	(revision 0)
+++ sbin/fsck_ffs/suj.c	(revision 0)
@@ -0,0 +1,2065 @@
+/*-
+ * Copyright (c) 2009 Jeffrey W. Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ffs/fs.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <libufs.h>
+#include <strings.h>
+#include <err.h>
+#include <assert.h>
+
+#include "fsck.h"
+
+static void	ino_decr(ino_t);
+
+#define	SUJ_HASHSIZE	128
+#define	SUJ_HASHMASK	(SUJ_HASHSIZE - 1)
+#define	SUJ_HASH(x)	((x * 2654435761) & SUJ_HASHMASK)
+
+struct suj_seg {
+	TAILQ_ENTRY(suj_seg) ss_next;
+	struct jsegrec	ss_rec;
+	uint8_t		*ss_blk;
+};
+
+struct suj_rec {
+	TAILQ_ENTRY(suj_rec) sr_next;
+	union jrec	*sr_rec;
+};
+TAILQ_HEAD(srechd, suj_rec);
+
+struct suj_ino {
+	LIST_ENTRY(suj_ino) si_next;
+	struct srechd	si_recs;
+	struct srechd	si_movs;
+	ino_t		si_ino;
+	int		si_nlinkadj;
+	int		si_skipparent;
+	int		si_linkadj;
+	int		si_hasrecs;
+	int		si_blkadj;
+};
+LIST_HEAD(inohd, suj_ino);
+
+struct suj_blk {
+	LIST_ENTRY(suj_blk) sb_next;
+	struct srechd	sb_recs;
+	ufs2_daddr_t	sb_blk;
+};
+LIST_HEAD(blkhd, suj_blk);
+
+struct data_blk {
+	LIST_ENTRY(data_blk) db_next;
+	uint8_t		*db_buf;
+	ufs2_daddr_t	db_blk;
+	int		db_size;
+};
+
+struct ino_blk {
+	LIST_ENTRY(ino_blk) ib_next;
+	uint8_t		*ib_buf;
+	int		ib_dirty;
+	ufs2_daddr_t	ib_blk;
+};
+LIST_HEAD(iblkhd, ino_blk);
+
+struct suj_cg {
+	LIST_ENTRY(suj_cg) sc_next;
+	struct blkhd	sc_blkhash[SUJ_HASHSIZE];
+	struct inohd	sc_inohash[SUJ_HASHSIZE];
+	struct iblkhd	sc_iblkhash[SUJ_HASHSIZE];
+	struct ino_blk	*sc_lastiblk;
+	uint8_t		*sc_cgbuf;
+	struct cg	*sc_cgp;
+	int		sc_dirty;
+	int		sc_cgx;
+};
+
+LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE];
+LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE];
+
+TAILQ_HEAD(seghd, suj_seg) allsegs;
+uint64_t oldseq;
+static struct uufsd *disk = NULL;
+static struct fs *fs = NULL;
+
+/*
+ * Summary statistics.
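[Editor's aside: the suj_cg/suj_ino/suj_blk caches above all index into SUJ_HASHSIZE (128, a power of two) buckets via a multiplicative hash; 2654435761 is the well-known Knuth-style constant near 2^32 divided by the golden ratio, so masking off the low bits still spreads consecutive keys. One review nit: the macro does not parenthesize x, which is harmless for the simple arguments used in this file but worth fixing. A minimal standalone sketch, not part of the patch:

	#include <stdint.h>
	#include <stdio.h>

	#define	SUJ_HASHSIZE	128
	#define	SUJ_HASHMASK	(SUJ_HASHSIZE - 1)
	/* As in the patch, but with the argument parenthesized. */
	#define	SUJ_HASH(x)	(((x) * 2654435761U) & SUJ_HASHMASK)

	int
	main(void)
	{
		uint32_t key;

		/* Consecutive keys land in well-spread buckets. */
		for (key = 1; key <= 6; key++)
			printf("key %u -> bucket %u\n", key, SUJ_HASH(key));
		return (0);
	}

End aside.]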
+ */ +uint64_t freefrags; +uint64_t freeblocks; +uint64_t freeinos; +uint64_t freedir; +uint64_t jbytes; +uint64_t jrecs; + +typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int); + +static void * +errmalloc(size_t n) +{ + void *a; + + a = malloc(n); + if (a == NULL) + errx(1, "malloc(%zu)", n); + return (a); +} + +/* + * Open the given provider, load superblock. + */ +static void +opendisk(const char *devnam) +{ + if (disk != NULL) + return; + disk = malloc(sizeof(*disk)); + if (disk == NULL) + errx(1, "malloc(%zu)", sizeof(*disk)); + if (ufs_disk_fillout(disk, devnam) == -1) { + err(1, "ufs_disk_fillout(%s) failed: %s", devnam, + disk->d_error); + } + fs = &disk->d_fs; + /* + * Setup a few things so reply() can work. + */ + bcopy(fs, &sblock, sizeof(sblock)); + fsreadfd = disk->d_fd; + fswritefd = disk->d_fd; +} + +/* + * Mark file system as clean, write the super-block back, close the disk. + */ +static void +closedisk(const char *devnam) +{ + struct csum *cgsum; + int i; + + /* + * Recompute the fs summary info from correct cs summaries. + */ + bzero(&fs->fs_cstotal, sizeof(struct csum_total)); + for (i = 0; i < fs->fs_ncg; i++) { + cgsum = &fs->fs_cs(fs, i); + fs->fs_cstotal.cs_nffree += cgsum->cs_nffree; + fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree; + fs->fs_cstotal.cs_nifree += cgsum->cs_nifree; + fs->fs_cstotal.cs_ndir += cgsum->cs_ndir; + } + /* XXX Don't set clean for now, we don't trust the journal. */ + /* fs->fs_clean = 1; */ + fs->fs_time = time(NULL); + fs->fs_mtime = time(NULL); + if (sbwrite(disk, 0) == -1) + err(1, "sbwrite(%s)", devnam); + if (ufs_disk_close(disk) == -1) + err(1, "ufs_disk_close(%s)", devnam); + free(disk); + disk = NULL; + fs = NULL; + fsreadfd = -1; + fswritefd = -1; +} + +/* + * Lookup a cg by number in the hash so we can keep track of which cgs + * need stats rebuilt. + */ +static struct suj_cg * +cg_lookup(int cgx) +{ + struct cghd *hd; + struct suj_cg *sc; + + if (cgx < 0 || cgx >= fs->fs_ncg) { + abort(); + errx(1, "Bad cg number %d", cgx); + } + hd = &cghash[SUJ_HASH(cgx)]; + LIST_FOREACH(sc, hd, sc_next) + if (sc->sc_cgx == cgx) + return (sc); + sc = errmalloc(sizeof(*sc)); + bzero(sc, sizeof(*sc)); + sc->sc_cgbuf = errmalloc(fs->fs_bsize); + sc->sc_cgp = (struct cg *)sc->sc_cgbuf; + sc->sc_cgx = cgx; + LIST_INSERT_HEAD(hd, sc, sc_next); + if (bread(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, + fs->fs_bsize) == -1) + err(1, "Unable to read cylinder group %d", sc->sc_cgx); + + return (sc); +} + +/* + * Lookup an inode number in the hash and allocate a suj_ino if it does + * not exist. + */ +static struct suj_ino * +ino_lookup(ino_t ino, int creat) +{ + struct suj_ino *sino; + struct inohd *hd; + struct suj_cg *sc; + + sc = cg_lookup(ino_to_cg(fs, ino)); + hd = &sc->sc_inohash[SUJ_HASH(ino)]; + LIST_FOREACH(sino, hd, si_next) + if (sino->si_ino == ino) + return (sino); + if (creat == 0) + return (NULL); + sino = errmalloc(sizeof(*sino)); + bzero(sino, sizeof(*sino)); + sino->si_ino = ino; + sino->si_nlinkadj = 0; + TAILQ_INIT(&sino->si_recs); + TAILQ_INIT(&sino->si_movs); + LIST_INSERT_HEAD(hd, sino, si_next); + + return (sino); +} + +/* + * Lookup a block number in the hash and allocate a suj_blk if it does + * not exist. 
+ */ +static struct suj_blk * +blk_lookup(ufs2_daddr_t blk, int creat) +{ + struct suj_blk *sblk; + struct suj_cg *sc; + struct blkhd *hd; + + sc = cg_lookup(dtog(fs, blk)); + hd = &sc->sc_blkhash[SUJ_HASH(blk)]; + LIST_FOREACH(sblk, hd, sb_next) + if (sblk->sb_blk == blk) + return (sblk); + if (creat == 0) + return (NULL); + sblk = errmalloc(sizeof(*sblk)); + bzero(sblk, sizeof(*sblk)); + sblk->sb_blk = blk; + TAILQ_INIT(&sblk->sb_recs); + LIST_INSERT_HEAD(hd, sblk, sb_next); + + return (sblk); +} + +static uint8_t * +dblk_read(ufs2_daddr_t blk, int size) +{ + struct data_blk *dblk; + struct dblkhd *hd; + + hd = &dbhash[SUJ_HASH(blk)]; + LIST_FOREACH(dblk, hd, db_next) + if (dblk->db_blk == blk) + goto found; + /* + * The inode block wasn't located, allocate a new one. + */ + dblk = errmalloc(sizeof(*dblk)); + bzero(dblk, sizeof(*dblk)); + LIST_INSERT_HEAD(hd, dblk, db_next); + dblk->db_blk = blk; +found: + /* + * I doubt size mismatches can happen in practice but it is trivial + * to handle. + */ + if (size != dblk->db_size) { + if (dblk->db_buf) + free(dblk->db_buf); + dblk->db_buf = errmalloc(size); + dblk->db_size = size; + if (bread(disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1) + err(1, "Failed to read data block %jd", blk); + } + return (dblk->db_buf); +} + +static union dinode * +ino_read(ino_t ino) +{ + struct ino_blk *iblk; + struct iblkhd *hd; + struct suj_cg *sc; + ufs2_daddr_t blk; + int off; + + blk = ino_to_fsba(fs, ino); + sc = cg_lookup(ino_to_cg(fs, ino)); + hd = &sc->sc_iblkhash[SUJ_HASH(blk)]; + LIST_FOREACH(iblk, hd, ib_next) + if (iblk->ib_blk == blk) + goto found; + /* + * The inode block wasn't located, allocate a new one. + */ + iblk = errmalloc(sizeof(*iblk)); + bzero(iblk, sizeof(*iblk)); + iblk->ib_buf = errmalloc(fs->fs_bsize); + iblk->ib_blk = blk; + LIST_INSERT_HEAD(hd, iblk, ib_next); + if (bread(disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1) + err(1, "Failed to read inode block %jd", blk); +found: + sc->sc_lastiblk = iblk; + off = ino_to_fsbo(fs, ino); + if (fs->fs_magic == FS_UFS1_MAGIC) + return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off]; + else + return (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off]; +} + +static void +ino_dirty(ino_t ino) +{ + struct ino_blk *iblk; + struct iblkhd *hd; + struct suj_cg *sc; + ufs2_daddr_t blk; + + blk = ino_to_fsba(fs, ino); + sc = cg_lookup(ino_to_cg(fs, ino)); + iblk = sc->sc_lastiblk; + if (iblk && iblk->ib_blk == blk) { + iblk->ib_dirty = 1; + return; + } + hd = &sc->sc_iblkhash[SUJ_HASH(blk)]; + LIST_FOREACH(iblk, hd, ib_next) { + if (iblk->ib_blk == blk) { + iblk->ib_dirty = 1; + return; + } + } + ino_read(ino); + ino_dirty(ino); +} + +static void +iblk_write(struct ino_blk *iblk) +{ + + if (iblk->ib_dirty == 0) + return; + if (bwrite(disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf, + fs->fs_bsize) == -1) + err(1, "Failed to write inode block %jd", iblk->ib_blk); +} + +/* + * Return 1 if the inode was free and 0 if it is allocated. 
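[Editor's aside: dblk_read() above keeps one buffer per data block and re-reads only when the requested size differs from what is cached; a freshly allocated entry has db_size 0, which can never match a real request, so the first lookup always triggers the read. A standalone sketch of that idea, with a single slot standing in for the hash table and memset() standing in for bread(); all names are hypothetical:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct cached_blk {
		long long blk;	/* cached block address */
		int size;	/* bytes cached; 0 forces a read */
		uint8_t *buf;
	};

	static uint8_t *
	cache_read(struct cached_blk *cb, long long blk, int size)
	{

		if (cb->buf == NULL || cb->blk != blk) {
			cb->blk = blk;
			cb->size = 0;	/* never matches, forces a read */
		}
		if (size != cb->size) {
			free(cb->buf);
			if ((cb->buf = malloc(size)) == NULL)
				abort();
			cb->size = size;
			memset(cb->buf, (int)(blk & 0xff), size); /* fake bread() */
		}
		return (cb->buf);
	}

	int
	main(void)
	{
		struct cached_blk cb = { 0, 0, NULL };

		printf("%d\n", cache_read(&cb, 42, 512)[0]);	/* reads */
		printf("%d\n", cache_read(&cb, 42, 512)[0]);	/* cached copy */
		free(cb.buf);
		return (0);
	}

End aside.]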
+ */ +static int +ino_isfree(ino_t ino) +{ + struct suj_cg *sc; + uint8_t *inosused; + struct cg *cgp; + int cg; + + cg = ino_to_cg(fs, ino); + ino = ino % fs->fs_ipg; + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + inosused = cg_inosused(cgp); + return isclr(inosused, ino); +} + +static int +blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags) +{ + ufs2_daddr_t bstart; + ufs2_daddr_t bend; + ufs2_daddr_t end; + + end = start + frags; + bstart = brec->jb_blkno + brec->jb_oldfrags; + bend = bstart + brec->jb_frags; + if (start < bend && end > bstart) + return (1); + return (0); +} + +static int +blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start, + int frags) +{ + + if (brec->jb_ino != ino || brec->jb_lbn != lbn) + return (0); + if (brec->jb_blkno + brec->jb_oldfrags != start) + return (0); + if (brec->jb_frags != frags) + return (0); + return (1); +} + +static void +blk_setmask(struct jblkrec *brec, int *mask) +{ + int i; + + for (i = brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++) + *mask |= 1 << i; +} + +/* + * Determine whether a given block has been reallocated to a new location. + * Returns a mask of overlapping bits if any frags have been reused or + * zero if the block has not been re-used and the contents can be trusted. + * + * This is used to ensure that an orphaned pointer due to truncate is safe + * to be freed. The mask value can be used to free partial blocks. + */ +static int +blk_isfree(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags) +{ + struct suj_blk *sblk; + struct suj_rec *srec; + struct jblkrec *brec; + int mask; + int off; + + /* + * To be certain we're not freeing a reallocated block we lookup + * this block in the blk hash and see if there is an allocation + * journal record that overlaps with any fragments in the block + * we're concerned with. If any fragments have ben reallocated + * the block has already been freed and re-used for another purpose. + */ + mask = 0; + sblk = blk_lookup(blknum(fs, blk), 0); + if (sblk == NULL) + return (0); + off = blk - sblk->sb_blk; + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + brec = (struct jblkrec *)srec->sr_rec; + /* + * If the block overlaps but does not match + * exactly it's a new allocation. If it matches + * exactly this record refers to the current + * location. + */ + if (blk_overlaps(brec, blk, frags) == 0) + continue; + if (blk_equals(brec, ino, lbn, blk, frags) == 1) + mask = 0; + else + blk_setmask(brec, &mask); + } + if (debug) + printf("blk_isfree: blk %jd sblk %jd off %d mask 0x%X\n", + blk, sblk->sb_blk, off, mask); + return (mask >> off); +} + +/* + * Determine whether it is safe to follow an indirect. It is not safe + * if any part of the indirect has been reallocated or the last journal + * entry was an allocation. Just allocated indirects may not have valid + * pointers yet and all of their children will have their own records. + * + * Returns 1 if it's safe to follow the indirect and 0 otherwise. + */ +static int +blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn) +{ + struct suj_blk *sblk; + struct jblkrec *brec; + + sblk = blk_lookup(blk, 0); + if (sblk == NULL) + return (1); + if (TAILQ_EMPTY(&sblk->sb_recs)) + return (1); + brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec; + if (blk_equals(brec, ino, lbn, blk, fs->fs_frag)) + if (brec->jb_op == JOP_FREEBLK) + return (1); + return (0); +} + +/* + * Clear an inode from the cg bitmap. 
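[Editor's aside: blk_overlaps() and blk_setmask() above do half-open interval arithmetic on fragment ranges: [start, start + frags) overlaps [bstart, bend) iff each range starts before the other ends, and the mask marks fragments jb_oldfrags through jb_oldfrags + jb_frags - 1. A standalone sketch of both, hypothetical names, not part of the patch:

	#include <stdio.h>

	/* Two half-open ranges overlap iff each starts before the
	 * other ends. */
	static int
	ranges_overlap(long long s1, int n1, long long s2, int n2)
	{

		return (s1 < s2 + n2 && s2 < s1 + n1);
	}

	/* Set bits off .. off + cnt - 1, as blk_setmask() does. */
	static int
	frag_mask(int off, int cnt)
	{
		int i, mask;

		mask = 0;
		for (i = off; i < off + cnt; i++)
			mask |= 1 << i;
		return (mask);
	}

	int
	main(void)
	{

		printf("%d\n", ranges_overlap(100, 4, 103, 2)); /* 1: share frag 103 */
		printf("%d\n", ranges_overlap(100, 4, 104, 2)); /* 0: touch, no overlap */
		printf("0x%X\n", frag_mask(2, 3));		/* 0x1C: frags 2-4 */
		return (0);
	}

End aside.]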
If the inode was already clear return + * 0 so the caller knows it does not have to check the inode contents. + */ +static int +ino_free(ino_t ino, int mode) +{ + struct suj_cg *sc; + uint8_t *inosused; + struct cg *cgp; + int cg; + + cg = ino_to_cg(fs, ino); + ino = ino % fs->fs_ipg; + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + inosused = cg_inosused(cgp); + /* + * The bitmap may never have made it to the disk so we have to + * conditionally clear. We can avoid writing the cg in this case. + */ + if (isclr(inosused, ino)) + return (0); + freeinos++; + clrbit(inosused, ino); + if (ino < cgp->cg_irotor) + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree++; + if ((mode & IFMT) == IFDIR) { + freedir++; + cgp->cg_cs.cs_ndir--; + } + sc->sc_dirty = 1; + + return (1); +} + +/* + * Free 'frags' frags starting at filesystem block 'bno' skipping any frags + * set in the mask. + */ +static void +blk_free(ufs2_daddr_t bno, int mask, int frags) +{ + ufs1_daddr_t fragno, cgbno; + struct suj_cg *sc; + struct cg *cgp; + int i, cg; + uint8_t *blksfree; + + if (debug) + printf("Freeing %d frags at blk %jd\n", frags, bno); + cg = dtog(fs, bno); + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp); + + /* + * If it's not allocated we only wrote the journal entry + * and never the bitmaps. Here we unconditionally clear and + * resolve the cg summary later. + */ + if (frags == fs->fs_frag && mask == 0) { + fragno = fragstoblks(fs, cgbno); + ffs_setblock(fs, blksfree, fragno); + freeblocks++; + } else { + /* + * deallocate the fragment + */ + for (i = 0; i < frags; i++) + if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) { + freefrags++; + setbit(blksfree, cgbno + i); + } + } + sc->sc_dirty = 1; +} + +/* + * Fetch an indirect block to find the block at a given lbn. The lbn + * may be negative to fetch a specific indirect block pointer or positive + * to fetch a specific block. + */ +static ufs2_daddr_t +indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn, int level) +{ + ufs2_daddr_t *bap2; + ufs2_daddr_t *bap1; + ufs_lbn_t lbnadd; + ufs_lbn_t base; + int i; + + if (blk == 0) + return (0); + if (cur == lbn) + return (blk); + if (level == 0 && lbn < 0) { + abort(); + errx(1, "Invalid lbn %jd", lbn); + } + bap2 = (void *)dblk_read(blk, fs->fs_bsize); + bap1 = (void *)bap2; + lbnadd = 1; + base = -(cur + level); + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + if (lbn > 0) + i = (lbn - base) / lbnadd; + else + i = (-lbn - base) / lbnadd; + if (i < 0 || i >= NINDIR(fs)) { + abort(); + errx(1, "Invalid indirect index %d produced by lbn %jd", + i, lbn); + } + if (level == 0) + cur = base + (i * lbnadd); + else + cur = -(base + (i * lbnadd)) - (level - 1); + if (fs->fs_magic == FS_UFS1_MAGIC) + blk = bap1[i]; + else + blk = bap2[i]; + if (cur == lbn) + return (blk); + if (level == 0) { + abort(); + errx(1, "Invalid lbn %jd at level 0", lbn); + } + return indir_blkatoff(blk, ino, cur, lbn, level - 1); +} + +/* + * Finds the disk block address at the specified lbn within the inode + * specified by ip. This follows the whole tree and honors di_size and + * di_extsize so it is a true test of reachability. The lbn may be + * negative if an extattr or indirect block is requested. + */ +static ufs2_daddr_t +ino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags) +{ + ufs_lbn_t tmpval; + ufs_lbn_t cur; + ufs_lbn_t next; + int i; + + /* + * Handle extattr blocks first. 
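[Editor's aside: indir_blkatoff() above locates a block by dividing the distance from the start of an indirect tree by the number of data blocks each pointer slot covers, which is NINDIR raised to the tree level. The patch additionally addresses the indirect blocks themselves with negative lbns; this sketch shows only the positive-lbn slot arithmetic, with assumed toy constants:

	#include <stdio.h>

	#define	NDADDR	12	/* direct pointers in a UFS inode */
	#define	NINDIR	4096	/* pointers per indirect block (assumed) */

	/* Which pointer slot covers lbn in the indirect tree at 'level'? */
	static int
	indir_slot(long long lbn, int level)
	{
		long long base, lbnadd;
		int i;

		base = NDADDR;	/* first lbn behind the 1st indirect */
		lbnadd = 1;	/* data blocks per slot at this level */
		for (i = 0; i < level; i++) {
			base += lbnadd * NINDIR;	/* skip shallower trees */
			lbnadd *= NINDIR;
		}
		return ((int)((lbn - base) / lbnadd));
	}

	int
	main(void)
	{

		printf("%d\n", indir_slot(NDADDR, 0));			/* 0 */
		printf("%d\n", indir_slot(NDADDR + NINDIR + 1, 1));	/* 0 */
		printf("%d\n", indir_slot(NDADDR + 3LL * NINDIR, 1));	/* 2 */
		return (0);
	}

End aside.]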
+ */ + if (lbn < 0 && lbn >= -NXADDR) { + lbn = -1 - lbn; + if (lbn > lblkno(fs, ip->dp2.di_extsize - 1)) + return (0); + *frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn)); + return (ip->dp2.di_extb[lbn]); + } + /* + * And now direct and indirect. Verify that the lbn does not + * exceed the size required to store the file by asking for + * the lbn of the last byte. These blocks should be 0 anyway + * so this simply saves the traversal. + */ + if (lbn > 0 && lbn > lblkno(fs, DIP(ip, di_size) - 1)) + return (0); + if (lbn < 0 && -lbn > lblkno(fs, DIP(ip, di_size) - 1)) + return (0); + if (lbn >= 0 && lbn < NDADDR) { + *frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn)); + return (DIP(ip, di_db[lbn])); + } + *frags = fs->fs_frag; + + for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++, + tmpval *= NINDIR(fs), cur = next) { + next = cur + tmpval; + if (lbn == -cur) + return (DIP(ip, di_ib[i])); + /* + * Determine whether the lbn in question is within this tree. + */ + if (lbn < 0 && -lbn >= next) + continue; + if (lbn > 0 && lbn >= next) + continue; + + return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn, i); + } + errx(1, "lbn %jd not in ino", lbn); +} + +/* + * Determine whether a block exists at a particular lbn in an inode. + * Returns 1 if found, 0 if not. lbn may be negative for indirects + * or ext blocks. + */ +static int +blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags) +{ + union dinode *ip; + ufs2_daddr_t nblk; + + ip = ino_read(ino); + + if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0) + return (0); + nblk = ino_blkatoff(ip, ino, lbn, frags); + + return (nblk == blk); +} + +/* + * Determines whether a pointer to an inode exists within a directory + * at a specified offset. Returns the mode of the found entry. + */ +static int +ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot) +{ + union dinode *dip; + struct direct *dp; + ufs2_daddr_t blk; + uint8_t *block; + ufs_lbn_t lbn; + int blksize; + int frags; + int dpoff; + int doff; + + *isdot = 0; + dip = ino_read(parent); + *mode = DIP(dip, di_mode); + if ((*mode & IFMT) != IFDIR) { + if (debug) { + /* This can happen if the parent inode was reallocated. */ + if (*mode != 0) + printf("Directory %d has bad mode %o\n", + parent, *mode); + else + printf("Directory %d zero inode\n", parent); + } + return (0); + } + lbn = lblkno(fs, diroff); + doff = blkoff(fs, diroff); + blksize = sblksize(fs, DIP(dip, di_size), lbn); + if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) { + if (debug) + printf("ino %d absent from %d due to offset %jd" + " exceeding size %jd\n", + child, parent, diroff, DIP(dip, di_size)); + return (0); + } + blk = ino_blkatoff(dip, parent, lbn, &frags); + if (blk <= 0) { + if (debug) + printf("Sparse directory %d", parent); + return (0); + } + block = dblk_read(blk, blksize); + /* + * Walk through the records from the start of the block to be + * certain we hit a valid record and not some junk in the middle + * of a file name. Stop when we reach or pass the expected offset. + */ + dpoff = 0; + do { + dp = (struct direct *)&block[dpoff]; + if (dpoff == doff) + break; + if (dp->d_reclen == 0) + break; + dpoff += dp->d_reclen; + } while (dpoff <= doff); + if (dpoff > fs->fs_bsize) + errx(1, "Corrupt directory block in dir inode %d", parent); + /* Not found. 
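[Editor's aside: ino_isat() above refuses to trust a directory offset until it has walked the d_reclen chain from the start of the block, since an arbitrary offset could land inside a file name. A standalone sketch of that walk; the struct is a simplified stand-in for UFS struct direct:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct dirent_sketch {
		uint32_t d_ino;
		uint16_t d_reclen;	/* distance to the next entry */
		uint8_t	 d_type;
		uint8_t	 d_namlen;
		char	 d_name[8];
	};

	/* Return 1 iff doff is the start of a real entry in the block. */
	static int
	entry_at(uint8_t *block, int blksize, int doff)
	{
		struct dirent_sketch *dp;
		int dpoff;

		dpoff = 0;
		while (dpoff <= doff && dpoff < blksize) {
			dp = (struct dirent_sketch *)&block[dpoff];
			if (dpoff == doff)
				return (1);	/* valid entry here */
			if (dp->d_reclen == 0)
				break;		/* corrupt chain */
			dpoff += dp->d_reclen;
		}
		return (0);	/* doff falls inside an entry */
	}

	int
	main(void)
	{
		uint8_t block[64];
		struct dirent_sketch *dp;

		memset(block, 0, sizeof(block));
		dp = (struct dirent_sketch *)block;
		dp->d_ino = 2; dp->d_reclen = 16;
		dp = (struct dirent_sketch *)&block[16];
		dp->d_ino = 5; dp->d_reclen = 48;
		printf("%d %d\n", entry_at(block, 64, 16),
		    entry_at(block, 64, 8));	/* prints "1 0" */
		return (0);
	}

End aside.]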
*/ + if (dpoff != doff) { + if (debug) + printf("ino %d not found in %d, lbn %jd, dpoff %d\n", + child, parent, lbn, dpoff); + return (0); + } + /* + * We found the item in question. Record the mode and whether it's + * a . or .. link for the caller. + */ + if (dp->d_ino == child) { + if (child == parent) + *isdot = 1; + else if (dp->d_namlen == 2 && + dp->d_name[0] == '.' && dp->d_name[1] == '.') + *isdot = 1; + *mode = DTTOIF(dp->d_type); + return (1); + } + if (debug) + printf("ino %d doesn't match dirent ino %d in parent %d\n", + child, dp->d_ino, parent); + return (0); +} + +#define VISIT_INDIR 0x0001 +#define VISIT_EXT 0x0002 + +/* + * Read an indirect level which may or may not be linked into an inode. + */ +static void +indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags, + ino_visitor visitor, int flags) +{ + ufs2_daddr_t *bap2; + ufs1_daddr_t *bap1; + ufs_lbn_t lbnadd; + ufs2_daddr_t nblk; + ufs_lbn_t nlbn; + int level; + int i; + + /* + * Don't visit indirect blocks with contents we can't trust. This + * should only happen when indir_visit() is called to complete a + * truncate that never finished and not when a pointer is found via + * an inode. + */ + if (blk == 0) + return; + if (blk_isindir(blk, ino, lbn) == 0) { + if (debug) + printf("blk %jd ino %d lbn %jd is not indir.\n", + blk, ino, lbn); + goto out; + } + level = lbn_level(lbn); + if (level == -1) { + abort(); + errx(1, "Invalid level for lbn %jd", lbn); + } + lbnadd = 1; + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + bap1 = (void *)dblk_read(blk, fs->fs_bsize); + bap2 = (void *)bap1; + for (i = 0; i < NINDIR(fs); i++) { + if (fs->fs_magic == FS_UFS1_MAGIC) + nblk = *bap1++; + else + nblk = *bap2++; + if (nblk == 0) + continue; + if (level == 0) { + nlbn = -lbn + i * lbnadd; + (*frags) += fs->fs_frag; + visitor(ino, nlbn, nblk, fs->fs_frag); + } else { + nlbn = (lbn + 1) - (i * lbnadd); + indir_visit(ino, nlbn, nblk, frags, visitor, flags); + } + } +out: + if (flags & VISIT_INDIR) { + (*frags) += fs->fs_frag; + visitor(ino, lbn, blk, fs->fs_frag); + } +} + +/* + * Visit each block in an inode as specified by 'flags' and call a + * callback function. The callback may inspect or free blocks. The + * count of frags found according to the size in the file is returned. + * This is not valid for sparse files but may be used to determine + * the correct di_blocks for a file. + */ +static uint64_t +ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags) +{ + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + uint64_t size; + uint64_t fragcnt; + int mode; + int frags; + int i; + + size = DIP(ip, di_size); + mode = DIP(ip, di_mode) & IFMT; + fragcnt = 0; + if ((flags & VISIT_EXT) && + fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) { + for (i = 0; i < NXADDR; i++) { + if (ip->dp2.di_extb[i] == 0) + continue; + frags = sblksize(fs, ip->dp2.di_extsize, i); + frags = numfrags(fs, frags); + fragcnt += frags; + visitor(ino, -1 - i, ip->dp2.di_extb[i], frags); + } + } + /* Skip datablocks for short links and devices. 
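[Editor's aside: indir_visit()/ino_visit() above separate traversal from policy with an ino_visitor callback, so the same walk can count fragments (null_visit), free blocks, or prune directory entries. A minimal standalone sketch of the pattern; unlike the patch, it threads state through an explicit arg pointer rather than the frags counter:

	#include <stdint.h>
	#include <stdio.h>

	typedef void (*blk_visitor)(int lbn, long long blk, int frags,
	    void *arg);

	/* Walk an array of direct block pointers, skipping holes. */
	static void
	visit_direct(long long *db, int ndb, int frags_per_blk,
	    blk_visitor visitor, void *arg)
	{
		int i;

		for (i = 0; i < ndb; i++)
			if (db[i] != 0)
				visitor(i, db[i], frags_per_blk, arg);
	}

	/* Counting visitor, analogous to null_visit plus the fragcnt. */
	static void
	count_visit(int lbn, long long blk, int frags, void *arg)
	{

		*(uint64_t *)arg += frags;
	}

	int
	main(void)
	{
		long long db[4] = { 100, 0, 228, 356 };	/* one hole */
		uint64_t fragcnt = 0;

		visit_direct(db, 4, 8, count_visit, &fragcnt);
		printf("%llu frags\n", (unsigned long long)fragcnt); /* 24 */
		return (0);
	}

The count is what ino_adjblks() compares against di_blocks to detect half-completed allocations. End aside.]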
*/ + if (mode == IFBLK || mode == IFCHR || + (mode == IFLNK && size < fs->fs_maxsymlinklen)) + return (fragcnt); + for (i = 0; i < NDADDR; i++) { + if (DIP(ip, di_db[i]) == 0) + continue; + frags = sblksize(fs, size, i); + frags = numfrags(fs, frags); + fragcnt += frags; + visitor(ino, i, DIP(ip, di_db[i]), frags); + } + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, + tmpval *= NINDIR(fs), lbn += tmpval) { + if (DIP(ip, di_ib[i]) == 0) + continue; + indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor, + flags); + } + return (fragcnt); +} + +/* + * Null visitor function used when we just want to count blocks. + */ +static void +null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ +} + +/* + * Recalculate di_blocks when we discover that a block allocation or + * free was not successfully completed. The kernel does not roll this back + * because it would be too expensive to compute which indirects were + * reachable at the time the inode was written. + */ +static void +ino_adjblks(ino_t ino) +{ + struct suj_ino *sino; + union dinode *ip; + uint64_t blocks; + uint64_t frags; + + sino = ino_lookup(ino, 1); + if (sino->si_blkadj) + return; + sino->si_blkadj = 1; + ip = ino_read(ino); + /* No need to adjust zero'd inodes. */ + if (DIP(ip, di_mode) == 0) + return; + frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); + blocks = fsbtodb(fs, frags); + if (blocks == DIP(ip, di_blocks)) + return; + if (debug) + printf("ino %d adjusting block count from %jd to %jd\n", + ino, DIP(ip, di_blocks), blocks); + DIP_SET(ip, di_blocks, blocks); + ino_dirty(ino); +} + +static void +blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + int mask; + + mask = blk_isfree(blk, ino, lbn, frags); + if (debug) + printf("blk %jd freemask 0x%X\n", blk, mask); + blk_free(blk, mask, frags); +} + +/* + * Free a block or tree of blocks that was previously rooted in ino at + * the given lbn. If the lbn is an indirect all children are freed + * recursively. + */ +static void +blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow) +{ + uint64_t resid; + int mask; + + mask = blk_isfree(blk, ino, lbn, frags); + if (debug) + printf("blk %jd freemask 0x%X\n", blk, mask); + resid = 0; + if (lbn <= -NDADDR && follow && mask == 0) + indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR); + else + blk_free(blk, mask, frags); +} + +static void +ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + struct suj_ino *sino; + struct suj_rec *srec; + struct jrefrec *rrec; + struct direct *dp; + off_t diroff; + uint8_t *block; + int skipparent; + int isparent; + int dpoff; + int size; + + sino = ino_lookup(ino, 0); + if (sino) + skipparent = sino->si_skipparent; + else + skipparent = 0; + size = lfragtosize(fs, frags); + block = dblk_read(blk, size); + dp = (struct direct *)&block[0]; + for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) { + dp = (struct direct *)&block[dpoff]; + if (dp->d_ino == 0 || dp->d_ino == WINO) + continue; + if (dp->d_namlen == 1 && dp->d_name[0] == '.') + continue; + isparent = dp->d_namlen == 2 && dp->d_name[0] == '.' && + dp->d_name[1] == '.'; + if (isparent && skipparent == 1) + continue; + if (debug) + printf("Directory %d removing inode %d name %s\n", + ino, dp->d_ino, dp->d_name); + /* + * Lookup this inode to see if we have a record for it. + * If not, we've already adjusted it assuming this path + * was valid and we have to adjust once more. 
+ */ + sino = ino_lookup(dp->d_ino, 0); + if (sino == NULL || sino->si_linkadj || sino->si_hasrecs == 0) { + ino_decr(dp->d_ino); + continue; + } + /* + * Tell any child directories we've already removed their + * parent. Don't try to adjust our link down again. + */ + if (isparent == 0) + sino->si_skipparent = 1; + /* + * If we haven't yet processed this inode we need to make + * sure we will successfully discover the lost path. If not + * use nlinkadj to remember. + */ + diroff = lblktosize(fs, lbn) + dpoff; + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + if (rrec->jr_parent == ino && + rrec->jr_diroff == diroff) + break; + } + if (srec == NULL) + sino->si_nlinkadj--; + } +} + +/* + * Truncate an inode, freeing all blocks and decrementing all children's + * link counts. Free the inode back to the cg. + */ +static void +ino_truncate(union dinode *ip, ino_t ino, int mode) +{ + uint32_t gen; + + if (ino == ROOTINO) + errx(1, "Attempting to free ROOTINO"); + if (debug) + printf("Truncating and freeing ino %d, nlink %d, mode %o\n", + ino, DIP(ip, di_nlink), DIP(ip, di_mode)); + + /* We are freeing an inode or directory. */ + if ((DIP(ip, di_mode) & IFMT) == IFDIR) + ino_visit(ip, ino, ino_free_children, 0); + DIP_SET(ip, di_nlink, 0); + ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR); + /* Here we have to clear the inode and release any blocks it holds. */ + gen = DIP(ip, di_gen); + if (fs->fs_magic == FS_UFS1_MAGIC) + bzero(ip, sizeof(struct ufs1_dinode)); + else + bzero(ip, sizeof(struct ufs2_dinode)); + DIP_SET(ip, di_gen, gen); + ino_dirty(ino); + ino_free(ino, mode); + return; +} + +/* + * Adjust an inode's link count down by one when a directory goes away. + */ +static void +ino_decr(ino_t ino) +{ + union dinode *ip; + int reqlink; + int nlink; + int mode; + + ip = ino_read(ino); + nlink = DIP(ip, di_nlink); + mode = DIP(ip, di_mode); + if (nlink < 1) + errx(1, "Inode %d link count %d invalid", ino, nlink); + if (mode == 0) + errx(1, "Inode %d has a link of %d with 0 mode.", ino, nlink); + nlink--; + if ((mode & IFMT) == IFDIR) + reqlink = 2; + else + reqlink = 1; + if (nlink < reqlink) { + if (debug) + printf("ino %d not enough links to live %d < %d\n", + ino, nlink, reqlink); + ino_truncate(ip, ino, mode); + return; + } + DIP_SET(ip, di_nlink, nlink); + ino_dirty(ino); +} + +/* + * Adjust the inode link count to 'nlink'. If the count reaches zero + * free it. + */ +static void +ino_adjust(ino_t ino, int lastmode, nlink_t nlink) +{ + union dinode *ip; + int reqlink; + int mode; + + ip = ino_read(ino); + mode = DIP(ip, di_mode) & IFMT; + if (nlink > LINK_MAX) + errx(1, + "ino %d nlink manipulation error, new link %d, old link %d", + ino, nlink, DIP(ip, di_nlink)); + if (debug) + printf("Adjusting ino %d, nlink %d, old link %d lastmode %o\n", + ino, nlink, DIP(ip, di_nlink), lastmode); + if (mode == 0) { + if (debug) + printf("ino %d, zero inode freeing bitmap\n", ino); + ino_free(ino, lastmode); + return; + } + /* XXX Should be an assert? */ + if (mode != lastmode && debug) + printf("ino %d, mode %o != %o\n", ino, mode, lastmode); + if ((mode & IFMT) == IFDIR) + reqlink = 2; + else + reqlink = 1; + /* If the inode doesn't have enough links to live, free it. */ + if (nlink < reqlink) { + if (debug) + printf("ino %d not enough links to live %d < %d\n", + ino, nlink, reqlink); + ino_truncate(ip, ino, mode); + return; + } + /* If required write the updated link count. 
*/ + if (DIP(ip, di_nlink) == nlink) { + if (debug) + printf("ino %d, link matches, skipping.\n", ino); + return; + } + DIP_SET(ip, di_nlink, nlink); + ino_dirty(ino); +} + +#define DOTDOT_OFFSET DIRECTSIZ(1) + +/* + * Process records available for one inode and determine whether the + * link count is correct or needs adjusting. + * + * XXX Failed to fix zero length directory. Shouldn't .. have been mising? + */ +static void +ino_check(struct suj_ino *sino) +{ + struct suj_rec *srec; + struct jrefrec *rrec; + struct suj_ino *stmp; + nlink_t dotlinks; + int newlinks; + int removes; + int nlink; + ino_t ino; + int isdot; + int isat; + int mode; + + if (sino->si_hasrecs == 0) + return; + ino = sino->si_ino; + /* + * XXX ino_isfree currently is skipping initialized inodes + * that are unreferenced. + */ + if (0 && ino_isfree(ino)) + return; + rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec; + nlink = rrec->jr_nlink; + newlinks = sino->si_nlinkadj; + dotlinks = 0; + removes = 0; + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + isat = ino_isat(rrec->jr_parent, rrec->jr_diroff, + rrec->jr_ino, &mode, &isdot); + if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT)) + errx(1, "Inode mode/directory type mismatch %o != %o", + mode, rrec->jr_mode); + if (debug) + printf("jrefrec: op %d ino %d, nlink %d, parent %d, " + "diroff %jd, mode %o, isat %d, isdot %d\n", + rrec->jr_op, rrec->jr_ino, rrec->jr_nlink, + rrec->jr_parent, rrec->jr_diroff, rrec->jr_mode, + isat, isdot); + mode = rrec->jr_mode & IFMT; + if (rrec->jr_op == JOP_REMREF) + removes++; + newlinks += isat; + if (isdot) + dotlinks += isat; + } + /* + * The number of links that remain are the starting link count + * subtracted by the total number of removes with the total + * links discovered back in. An incomplete remove thus + * makes no change to the link count but an add increases + * by one. + */ + nlink += newlinks; + nlink -= removes; + /* + * If it's a directory with no real names pointing to it go ahead + * and truncate it. This will free any children. + */ + if ((mode & IFMT) == IFDIR && nlink - dotlinks == 0) { + nlink = 0; + /* + * Mark any .. links so they know not to free this inode + * when they are removed. + */ + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + if (rrec->jr_diroff == DOTDOT_OFFSET) { + stmp = ino_lookup(rrec->jr_parent, 0); + if (stmp) + stmp->si_skipparent = 1; + } + } + } + sino->si_linkadj = 1; + ino_adjust(ino, mode, nlink); +} + +/* + * Process records available for one block and determine whether it is + * still allocated and whether the owning inode needs to be updated or + * a free completed. + */ +static void +blk_check(struct suj_blk *sblk) +{ + struct suj_rec *srec; + struct jblkrec *brec; + ufs2_daddr_t blk; + int mask; + int frags; + int isat; + + /* + * Each suj_blk actually contains records for any fragments in that + * block. As a result we must evaluate each record individually. 
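[Editor's aside: ino_check() above recovers the true link count arithmetically: start from jr_nlink in the oldest record, subtract one per remove record, and add one back per reference that ino_isat() actually finds on disk, so an incomplete remove cancels out and a completed add counts once. A toy version of that accounting (the si_nlinkadj correction from ino_free_children is ignored; arrays stand in for the record TAILQ):

	#include <stdio.h>

	struct ref { int is_remove; int found_on_disk; };

	static int
	resolve_nlink(int jr_nlink, struct ref *refs, int n)
	{
		int i, nlink;

		nlink = jr_nlink;
		for (i = 0; i < n; i++) {
			if (refs[i].is_remove)
				nlink--;	/* the "removes" count */
			if (refs[i].found_on_disk)
				nlink++;	/* "newlinks" via ino_isat */
		}
		return (nlink);
	}

	int
	main(void)
	{
		/* A remove that never hit the directory block: net 0. */
		struct ref incomplete_rm[] = { { 1, 1 } };
		/* An add that did complete: net +1. */
		struct ref complete_add[] = { { 0, 1 } };

		printf("%d\n", resolve_nlink(2, incomplete_rm, 1));	/* 2 */
		printf("%d\n", resolve_nlink(1, complete_add, 1));	/* 2 */
		return (0);
	}

End aside.]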
+ */ + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + brec = (struct jblkrec *)srec->sr_rec; + frags = brec->jb_frags; + blk = brec->jb_blkno + brec->jb_oldfrags; + isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags); + if (debug) + printf("op %d blk %jd ino %d lbn %jd frags %d isat %d (%d)\n", + brec->jb_op, blk, brec->jb_ino, brec->jb_lbn, + brec->jb_frags, isat, frags); + /* + * If we found the block at this address we still have to + * determine if we need to free the tail end that was + * added by adding contiguous fragments from the same block. + */ + if (isat == 1) { + if (frags == brec->jb_frags) + continue; + mask = blk_isfree(blk, brec->jb_ino, brec->jb_lbn, + brec->jb_frags); + mask >>= frags; + blk += frags; + frags = brec->jb_frags - frags; + blk_free(blk, mask, frags); + ino_adjblks(brec->jb_ino); + continue; + } + /* + * The block wasn't found, attempt to free it. It won't be + * freed if it was actually reallocated. If this was an + * allocation we don't want to follow indirects as they + * may not be written yet. Any children of the indirect will + * have their own records. If it's a free we need to + * recursively free children. + */ + blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags, + brec->jb_op == JOP_FREEBLK); + ino_adjblks(brec->jb_ino); + } +} + +/* + * Walk the list of inode and block records for this cg, recovering any + * changes which were not complete at the time of crash. + */ +static void +cg_check(struct suj_cg *sc) +{ + struct suj_blk *nextb; + struct suj_ino *nexti; + struct suj_ino *sino; + struct suj_blk *sblk; + int i; + + if (debug) + printf("Recovering cg %d\n", sc->sc_cgx); + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH_SAFE(sino, &sc->sc_inohash[i], si_next, nexti) + ino_check(sino); + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH_SAFE(sblk, &sc->sc_blkhash[i], sb_next, nextb) + blk_check(sblk); +} + +/* + * Write a potentially dirty cg. All inodes must be written before the + * cg maps are so that an allocated inode is never marked free, even if + * we crash during fsck. + */ +static void +cg_write(struct suj_cg *sc) +{ + struct ino_blk *iblk; + ufs1_daddr_t fragno, cgbno, maxbno; + u_int8_t *blksfree; + struct cg *cgp; + int blk; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next) + iblk_write(iblk); + if (sc->sc_dirty == 0) + return; + /* + * Fix the frag and cluster summary. + */ + cgp = sc->sc_cgp; + cgp->cg_cs.cs_nbfree = 0; + cgp->cg_cs.cs_nffree = 0; + bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum)); + maxbno = fragstoblks(fs, fs->fs_fpg); + if (fs->fs_contigsumsize > 0) { + for (i = 1; i <= fs->fs_contigsumsize; i++) + cg_clustersum(cgp)[i] = 0; + bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT)); + } + blksfree = cg_blksfree(cgp); + for (cgbno = 0; cgbno < maxbno; cgbno++) { + if (ffs_isfreeblock(fs, blksfree, cgbno)) + continue; + if (ffs_isblock(fs, blksfree, cgbno)) { + ffs_clusteracct(fs, cgp, cgbno, 1); + cgp->cg_cs.cs_nbfree++; + continue; + } + fragno = blkstofrags(fs, cgbno); + blk = blkmap(fs, blksfree, fragno); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + for (i = 0; i < fs->fs_frag; i++) + if (isset(blksfree, fragno + i)) + cgp->cg_cs.cs_nffree++; + } + /* + * Update the superblock cg summary from our now correct values + * before writing the block. 
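[Editor's aside: the loop in cg_write() above rederives the cg summary purely from the free-fragment bitmap instead of trusting the possibly stale on-disk counters. A standalone sketch of the core idea, assuming for simplicity 8 fragments per block so a fully free block is one 0xff byte (roughly what ffs_isblock() tests in that case); the real loop also rebuilds cg_frsum via ffs_fragacct() and the cluster sums via ffs_clusteracct():

	#include <stdio.h>

	#define	FRAGS_PER_BLK	8

	static void
	rebuild_summary(unsigned char *blksfree, int nblks,
	    int *nbfree, int *nffree)
	{
		int b, i;

		*nbfree = *nffree = 0;
		for (b = 0; b < nblks; b++) {
			if (blksfree[b] == 0xff) {	/* whole block free */
				(*nbfree)++;
				continue;
			}
			for (i = 0; i < FRAGS_PER_BLK; i++)	/* loose frags */
				if (blksfree[b] & (1 << i))
					(*nffree)++;
		}
	}

	int
	main(void)
	{
		unsigned char map[3] = { 0xff, 0x0f, 0x00 };
		int nbfree, nffree;

		rebuild_summary(map, 3, &nbfree, &nffree);
		printf("nbfree %d nffree %d\n", nbfree, nffree); /* 1 and 4 */
		return (0);
	}

End aside.]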
+ */ + fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs; + if (bwrite(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, + fs->fs_bsize) == -1) + err(1, "Unable to write cylinder group %d", sc->sc_cgx); +} + +static void +cg_apply(void (*apply)(struct suj_cg *)) +{ + struct suj_cg *scg; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(scg, &cghash[i], sc_next) + apply(scg); +} + +/* + * Process the unlinked but referenced file list. Freeing all inodes. + */ +static void +ino_unlinked(void) +{ + union dinode *ip; + uint16_t mode; + ino_t inon; + ino_t ino; + + ino = fs->fs_sujfree; + fs->fs_sujfree = 0; + while (ino != 0) { + ip = ino_read(ino); + mode = DIP(ip, di_mode) & IFMT; + inon = DIP(ip, di_freelink); + DIP_SET(ip, di_freelink, 0); + /* + * XXX Should this be an errx? + */ + if (DIP(ip, di_nlink) == 0) { + if (debug) + printf("Freeing unlinked ino %d mode %o\n", + ino, mode); + ino_truncate(ip, ino, mode); + } else if (debug) + printf("Skipping ino %d mode %o with link %d\n", + ino, mode, DIP(ip, di_nlink)); + ino = inon; + } +} + +/* + * If we see two ops for the same inode to the same parent at the same + * offset we could miscount the link with ino_isat() returning twice. + * Keep only the first record because it has the valid link count but keep + * the mode from the final op as that should be the correct mode in case + * it changed. + */ +static void +suj_build_ino(struct jrefrec *refrec) +{ + struct jmvrec *mvrec; + struct suj_rec *srec; + struct suj_ino *sino; + struct suj_rec *srn; + struct jrefrec *rrn; + + if (debug) + printf("suj_build_ino: op %d, ino %d, nlink %d, parent %d, diroff %jd\n", + refrec->jr_op, refrec->jr_ino, refrec->jr_nlink, refrec->jr_parent, + refrec->jr_diroff); + sino = ino_lookup(refrec->jr_ino, 1); + /* + * Search for a mvrec that matches this offset. Whether it's an add + * or a remove we can delete the mvref. It no longer applies to this + * location. + * + * For removes, we have to find the original offset so we can create + * a remove that matches the earlier add so it can be abandoned + * if necessary. We create an add in the new location so we can + * tolerate the directory block as it existed before or after + * the move. + */ + if (!TAILQ_EMPTY(&sino->si_movs)) { + for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + mvrec = (struct jmvrec *)srn->sr_rec; + if (mvrec->jm_parent != refrec->jr_parent || + mvrec->jm_newoff != refrec->jr_diroff) + continue; + TAILQ_REMOVE(&sino->si_movs, srn, sr_next); + if (refrec->jr_op == JOP_REMREF) { + rrn = errmalloc(sizeof(*refrec)); + *rrn = *refrec; + rrn->jr_op = JOP_ADDREF; + suj_build_ino(rrn); + refrec->jr_diroff = mvrec->jm_oldoff; + } + } + } + /* + * We walk backwards so that adds and removes are evaluated in the + * correct order. + */ + for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + rrn = (struct jrefrec *)srn->sr_rec; + if (rrn->jr_parent != refrec->jr_parent || + rrn->jr_diroff != refrec->jr_diroff) + continue; + if (debug) + printf("Discarding dup.\n"); + rrn->jr_mode = refrec->jr_mode; + return; + } + sino->si_hasrecs = 1; + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = (union jrec *)refrec; + TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next); +} + +/* + * Apply a move record to an inode. We must search for adds that preceed us + * and add duplicates because we won't know which location to search first. 
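[Editor's aside: the ino_unlinked() walk earlier in this hunk follows a singly linked list threaded through the inode table: fs_sujfree names the first unlinked-but-open inode and each di_freelink names the next. A standalone sketch of that traversal with plain structs standing in for on-disk inodes:

	#include <stdio.h>

	struct fake_ino { int nlink; int freelink; };

	static void
	walk_unlinked(struct fake_ino *itab, int head)
	{
		int ino, next;

		for (ino = head; ino != 0; ino = next) {
			next = itab[ino].freelink;
			itab[ino].freelink = 0;		/* unthread the list */
			if (itab[ino].nlink == 0)
				printf("would free ino %d\n", ino);
			else
				printf("keep ino %d, nlink %d\n",
				    ino, itab[ino].nlink);
		}
	}

	int
	main(void)
	{
		struct fake_ino itab[8] = {{0, 0}};

		itab[3] = (struct fake_ino){ 0, 5 };	/* unlinked, next is 5 */
		itab[5] = (struct fake_ino){ 1, 0 };	/* relinked, keep */
		walk_unlinked(itab, 3);
		return (0);
	}

End aside.]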
+ * Then we add movs to a queue that is maintained until the moved location + * is removed. If a single record is moved multiple times we only maintain + * one copy that contains the original and final diroffs. + */ +static void +suj_move_ino(struct jmvrec *mvrec) +{ + struct jrefrec *refrec; + struct suj_ino *sino; + struct suj_rec *srec; + struct jmvrec *mvrn; + struct suj_rec *srn; + struct jrefrec *rrn; + + if (debug) + printf("suj_move_ino: ino %d, parent %d, diroff %jd, oldoff %jd\n", + mvrec->jm_ino, mvrec->jm_parent, mvrec->jm_newoff, + mvrec->jm_oldoff); + sino = ino_lookup(mvrec->jm_ino, 0); + if (sino == NULL) + return; + /* + * We walk backwards so we only evaluate the most recent record at + * this offset. + */ + for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + rrn = (struct jrefrec *)srn->sr_rec; + if (rrn->jr_op != JOP_ADDREF) + continue; + if (rrn->jr_parent != mvrec->jm_parent || + rrn->jr_diroff != mvrec->jm_oldoff) + continue; + /* + * When an entry is moved we don't know whether the write + * to move has completed yet. To resolve this we create + * a new add dependency in the new location as if it were added + * twice. Only one will succeed. + */ + refrec = errmalloc(sizeof(*refrec)); + refrec->jr_op = JOP_ADDREF; + refrec->jr_ino = mvrec->jm_ino; + refrec->jr_parent = mvrec->jm_parent; + refrec->jr_diroff = mvrec->jm_newoff; + refrec->jr_mode = rrn->jr_mode; + refrec->jr_nlink = rrn->jr_nlink; + suj_build_ino(refrec); + break; + } + /* + * Add this mvrec to the queue of pending mvs. + */ + for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + mvrn = (struct jmvrec *)srn->sr_rec; + if (mvrn->jm_parent != mvrec->jm_parent || + mvrn->jm_newoff != mvrec->jm_oldoff) + continue; + mvrn->jm_newoff = mvrec->jm_newoff; + return; + } + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = (union jrec *)mvrec; + TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next); +} + +/* + * Modify journal records so they refer to the base block number + * and a start and end frag range. This is to facilitate the discovery + * of overlapping fragment allocations. + */ +static void +suj_build_blk(struct jblkrec *blkrec) +{ + struct suj_rec *srec; + struct suj_blk *sblk; + struct jblkrec *blkrn; + ufs2_daddr_t blk; + int frag; + + if (debug) + printf("suj_build_blk: op %d blkno %jd frags %d oldfrags %d " + "ino %d lbn %jd\n", + blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags, + blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn); + blk = blknum(fs, blkrec->jb_blkno); + frag = fragnum(fs, blkrec->jb_blkno); + sblk = blk_lookup(blk, 1); + /* + * Rewrite the record using oldfrags to indicate the offset into + * the block. Leave jb_frags as the actual allocated count. + */ + blkrec->jb_blkno -= frag; + blkrec->jb_oldfrags = frag; + if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag) + errx(1, "Invalid fragment count %d oldfrags %d", + blkrec->jb_frags, frag); + /* + * Detect dups. If we detect a dup we always discard the oldest + * record as it is superseded by the new record. This speeds up + * later stages but also eliminates free records which are used + * to indicate that the contents of indirects can be trusted. 
+ */ + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + blkrn = (struct jblkrec *)srec->sr_rec; + if (blkrn->jb_ino != blkrec->jb_ino || + blkrn->jb_lbn != blkrec->jb_lbn || + blkrn->jb_blkno != blkrec->jb_blkno || + blkrn->jb_frags != blkrec->jb_frags || + blkrn->jb_oldfrags != blkrec->jb_oldfrags) + continue; + if (debug) + printf("Removed dup.\n"); + /* Discard the free which is a dup with an alloc. */ + if (blkrec->jb_op == JOP_FREEBLK) + return; + TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next); + free(srec); + break; + } + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = (union jrec *)blkrec; + TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next); +} + +/* + * Build up tables of the operations we need to recover. + */ +static void +suj_build(void) +{ + struct suj_seg *seg; + union jrec *rec; + int i; + + TAILQ_FOREACH(seg, &allsegs, ss_next) { + rec = (union jrec *)seg->ss_blk; + rec++; /* skip the segrec. */ + if (debug) + printf("seg %jd has %d records, oldseq %jd.\n", + seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt, + seg->ss_rec.jsr_oldest); + for (i = 0; i < seg->ss_rec.jsr_cnt; i++, rec++) { + switch (rec->rec_jrefrec.jr_op) { + case JOP_ADDREF: + case JOP_REMREF: + suj_build_ino((struct jrefrec *)rec); + break; + case JOP_MVREF: + suj_move_ino((struct jmvrec *)rec); + break; + case JOP_NEWBLK: + case JOP_FREEBLK: + suj_build_blk((struct jblkrec *)rec); + break; + default: + errx(1, "Unknown journal operation %d (%d)", + rec->rec_jrefrec.jr_op, i); + } + } + } +} + +/* + * Prune the journal segments to those we care about based on the + * oldest sequence in the newest segment. Order the segment list + * based on sequence number. + */ +static void +suj_prune(void) +{ + struct suj_seg *seg; + struct suj_seg *segn; + uint64_t newseq; + int discard; + + if (debug) + printf("Pruning up to %jd\n", oldseq); + /* First free the expired segments. */ + TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { + if (seg->ss_rec.jsr_seq >= oldseq) + continue; + TAILQ_REMOVE(&allsegs, seg, ss_next); + free(seg->ss_blk); + free(seg); + } + /* Next ensure that segments are ordered properly. */ + seg = TAILQ_FIRST(&allsegs); + if (seg == NULL) { + if (debug) + printf("Empty journal\n"); + return; + } + newseq = seg->ss_rec.jsr_seq; + for (;;) { + seg = TAILQ_LAST(&allsegs, seghd); + if (seg->ss_rec.jsr_seq >= newseq) + break; + TAILQ_REMOVE(&allsegs, seg, ss_next); + TAILQ_INSERT_HEAD(&allsegs, seg, ss_next); + newseq = seg->ss_rec.jsr_seq; + + } + if (newseq != oldseq) + errx(1, "Journal file sequence mismatch %jd != %jd", + newseq, oldseq); + /* + * The kernel may asynchronously write segments which can create + * gaps in the sequence space. Throw away any segments after the + * gap as the kernel guarantees only those that are contiguously + * reachable are marked as completed. + */ + discard = 0; + TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { + if (!discard && newseq++ == seg->ss_rec.jsr_seq) + continue; + discard = 1; + if (debug) + printf("Journal order mismatch %jd != %jd pruning\n", + newseq-1, seg->ss_rec.jsr_seq); + TAILQ_REMOVE(&allsegs, seg, ss_next); + free(seg->ss_blk); + free(seg); + } + if (debug) + printf("Processing journal segments from %jd to %jd\n", + oldseq, newseq-1); +} + +/* + * Verify the journal inode before attempting to read records. 
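[Editor's aside: suj_prune() above discards segments older than the newest segment's jsr_oldest, rotates the list into sequence order, and then drops everything after the first gap, because the kernel writes segments asynchronously and only guarantees that contiguously reachable ones are complete. A standalone sketch of the gap check, with an array of sequence numbers standing in for the segment TAILQ:

	#include <stdio.h>

	/* Return how many leading segments form an unbroken sequence. */
	static int
	prune_after_gap(unsigned long long *seq, int n,
	    unsigned long long oldest)
	{
		unsigned long long expect;
		int i;

		expect = oldest;
		for (i = 0; i < n; i++, expect++)
			if (seq[i] != expect)
				break;		/* gap: discard i..n-1 */
		return (i);
	}

	int
	main(void)
	{
		unsigned long long seq[] = { 10, 11, 12, 14, 15 };

		/* 13 is missing, so only 10..12 are trustworthy. */
		printf("%d usable\n", prune_after_gap(seq, 5, 10)); /* 3 */
		return (0);
	}

End aside.]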
+ */ +static void +suj_verifyino(union dinode *ip) +{ + + if (DIP(ip, di_nlink) != 1) + errx(1, "Invalid link count %d for journal inode %d", + DIP(ip, di_nlink), fs->fs_sujournal); + + if (DIP(ip, di_mode) != IFREG) + errx(1, "Invalid mode %d for journal inode %d", + DIP(ip, di_mode), fs->fs_sujournal); + + if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) + errx(1, "Invalid size %jd for journal inode %d", + DIP(ip, di_size), fs->fs_sujournal); + + if (DIP(ip, di_modrev) != fs->fs_mtime) + errx(1, "Journal timestamp does not match fs mount time"); + /* XXX Add further checks. */ +} + +struct jblocks { + struct jextent *jb_extent; /* Extent array. */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ +}; +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +struct jblocks *suj_jblocks; + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + int size; + + jblocks = errmalloc(sizeof(*jblocks)); + jblocks->jb_avail = 10; + jblocks->jb_used = 0; + jblocks->jb_head = 0; + jblocks->jb_off = 0; + size = sizeof(struct jextent) * jblocks->jb_avail; + jblocks->jb_extent = errmalloc(size); + bzero(jblocks->jb_extent, size); + + return (jblocks); +} + +/* + * Return the next available disk block and the amount of contiguous + * free space it contains. + */ +static ufs2_daddr_t +jblocks_next(struct jblocks *jblocks, int bytes, int *actual) +{ + struct jextent *jext; + ufs2_daddr_t daddr; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + return (0); + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + + return (daddr); +} + +/* + * Advance the allocation head by a specified number of bytes, consuming + * one journal segment. + */ +static void +jblocks_advance(struct jblocks *jblocks, int bytes) +{ + + jblocks->jb_off += bytes / DEV_BSIZE; +} + +static void +jblocks_destroy(struct jblocks *jblocks) +{ + + free(jblocks->jb_extent); + free(jblocks); +} + +static void +jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks) +{ + struct jextent *jext; + int size; + + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + size = sizeof(struct jextent) * jblocks->jb_avail; + jext = errmalloc(size); + bzero(jext, size); + bcopy(jblocks->jb_extent, jext, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + + return; +} + +/* + * Add a file block from the journal to the extent map. We can't read + * each file block individually because the kernel treats it as a circular + * buffer and segments may span mutliple contiguous blocks. 
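[Editor's aside: jblocks_add() above coalesces consecutive journal file blocks into extents so suj_read() can issue large contiguous reads instead of one read per file block. A standalone sketch of the coalescing step, with a fixed array replacing the patch's growable one:

	#include <stdio.h>

	struct jext { long long daddr; int blocks; };

	/* Append a run of blocks, merging with the last extent when
	 * the new run abuts it.  Returns the new extent count. */
	static int
	extent_add(struct jext *ext, int next, long long daddr, int blocks)
	{

		if (next > 0 &&
		    ext[next - 1].daddr + ext[next - 1].blocks == daddr) {
			ext[next - 1].blocks += blocks;
			return (next);
		}
		ext[next].daddr = daddr;
		ext[next].blocks = blocks;
		return (next + 1);
	}

	int
	main(void)
	{
		struct jext ext[4];
		int i, n;

		n = 0;
		n = extent_add(ext, n, 1000, 8);
		n = extent_add(ext, n, 1008, 8);	/* coalesces */
		n = extent_add(ext, n, 2000, 8);	/* new extent */
		for (i = 0; i < n; i++)
			printf("extent %lld+%d\n", ext[i].daddr,
			    ext[i].blocks);	/* 1000+16, 2000+8 */
		return (0);
	}

End aside.]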
+ */ +static void +suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + + jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags)); +} + +static void +suj_read(void) +{ + uint8_t block[1 * 1024 * 1024]; + struct suj_seg *seg; + struct jsegrec *rec; + ufs2_daddr_t blk; + int recsize; + int size; + + /* + * Read records until we exhaust the journal space. If we find + * an invalid record we start searching for a valid segment header + * at the next block. This is because we don't have a head/tail + * pointer and must recover the information indirectly. At the gap + * between the head and tail we won't necessarily have a valid + * segment. + */ + for (;;) { + size = sizeof(block); + blk = jblocks_next(suj_jblocks, size, &size); + if (blk == 0) + return; + /* + * Read 1MB at a time and scan for records within this block. + */ + if (bread(disk, blk, &block, size) == -1) + err(1, "Error reading journal block %jd", + (intmax_t)blk); + for (rec = (void *)block; size; size -= recsize, + rec = (struct jsegrec *)((uintptr_t)rec + recsize)) { + recsize = DEV_BSIZE; + if (rec->jsr_time != fs->fs_mtime) { + if (debug) + printf("Rec time %jd != fs mtime %jd\n", + rec->jsr_time, fs->fs_mtime); + jblocks_advance(suj_jblocks, recsize); + continue; + } + if (rec->jsr_cnt == 0) { + if (debug) + printf("Found illegal count %d\n", + rec->jsr_cnt); + jblocks_advance(suj_jblocks, recsize); + continue; + } + recsize = roundup2((rec->jsr_cnt + 1) * JREC_SIZE, + DEV_BSIZE); + if (recsize > size) { + /* + * We may just have run out of buffer, restart + * the loop to re-read from this spot. + */ + if (size < fs->fs_bsize && + recsize <= fs->fs_bsize) { + recsize = size; + continue; + } + if (debug) + printf("Found invalid segsize %d > %d\n", + recsize, size); + recsize = DEV_BSIZE; + jblocks_advance(suj_jblocks, recsize); + continue; + } + seg = errmalloc(sizeof(*seg)); + seg->ss_blk = errmalloc(recsize); + seg->ss_rec = *rec; + bcopy((void *)rec, seg->ss_blk, recsize); + if (rec->jsr_oldest > oldseq) + oldseq = rec->jsr_oldest; + TAILQ_INSERT_TAIL(&allsegs, seg, ss_next); + jrecs += rec->jsr_cnt; + jbytes += recsize; + jblocks_advance(suj_jblocks, recsize); + } + } +} + +/* + * Orchestrate the verification of a filesystem via the softupdates journal. + */ +void +suj_check(const char *filesys) +{ + union dinode *jip; + uint64_t blocks; + + opendisk(filesys); + TAILQ_INIT(&allsegs); + /* + * Fetch the journal inode and verify it. + */ + jip = ino_read(fs->fs_sujournal); + printf("SU+J Checking %s\n", filesys); + suj_verifyino(jip); + /* + * Build a list of journal blocks in jblocks before parsing the + * available journal blocks in with suj_read(). 
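[Editor's aside: suj_read() above sizes each segment as a jsegrec plus jsr_cnt records rounded up to whole 512-byte device blocks, which is why it can validate a candidate header and then skip exactly that many bytes. A standalone sketch of the size arithmetic; JREC_SIZE of 32 is an assumption here, and roundup2 is only valid for power-of-two alignments:

	#include <stdio.h>

	#define	JREC_SIZE	32	/* assumed journal record size */
	#define	DEV_BSIZE	512
	#define	roundup2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))

	int
	main(void)
	{
		int cnt;

		/* One header record plus cnt payload records. */
		for (cnt = 1; cnt <= 33; cnt += 16)
			printf("%d records -> %d bytes on disk\n", cnt,
			    roundup2((cnt + 1) * JREC_SIZE, DEV_BSIZE));
		return (0);	/* 512, 1024, 1536 */
	}

End aside.]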
+ */ + printf("Reading %jd byte journal from inode %d.\n", + DIP(jip, di_size), fs->fs_sujournal); + suj_jblocks = jblocks_create(); + blocks = ino_visit(jip, fs->fs_sujournal, suj_add_block, 0); + if (blocks != numfrags(fs, DIP(jip, di_size))) + errx(1, "Sparse journal inode %d.\n", fs->fs_sujournal); + suj_read(); + jblocks_destroy(suj_jblocks); + suj_jblocks = NULL; + if (reply("RECOVER")) { + printf("Building recovery table.\n"); + suj_prune(); + suj_build(); + printf("Resolving unreferenced inode list.\n"); + ino_unlinked(); + printf("Processing journal entries.\n"); + cg_apply(cg_check); + } + if (reply("WRITE CHANGES")) + cg_apply(cg_write); + printf("%jd journal records in %jd bytes for %.2f%% utilization\n", + jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100); + printf("Freed %jd inodes (%jd directories) %jd blocks and %jd frags.\n", + freeinos, freedir, freeblocks, freefrags); + /* Write back superblock. */ + closedisk(filesys); +} Index: sbin/fsck_ffs/gjournal.c =================================================================== --- sbin/fsck_ffs/gjournal.c (revision 202342) +++ sbin/fsck_ffs/gjournal.c (working copy) @@ -96,27 +96,6 @@ struct ufs2_dinode ufs2_zino; static void putcgs(void); /* - * Write current block of inodes. - */ -static int -putino(struct uufsd *disk, ino_t inode) -{ - caddr_t inoblock; - struct fs *fs; - ssize_t ret; - - fs = &disk->d_fs; - inoblock = disk->d_inoblock; - - assert(inoblock != NULL); - assert(inode >= disk->d_inomin && inode <= disk->d_inomax); - ret = bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, inode)), inoblock, - fs->fs_bsize); - - return (ret == -1 ? -1 : 0); -} - -/* * Return cylinder group from the cache or load it if it is not in the * cache yet. * Don't cache more than MAX_CACHED_CGS cylinder groups. @@ -242,13 +221,11 @@ cancelcgs(void) #endif /* - * Open the given provider, load statistics. + * Open the given provider, load superblock. */ static void -getdisk(void) +opendisk(void) { - int i; - if (disk != NULL) return; disk = malloc(sizeof(*disk)); @@ -259,24 +236,6 @@ static void disk->d_error); } fs = &disk->d_fs; - fs->fs_csp = malloc((size_t)fs->fs_cssize); - if (fs->fs_csp == NULL) - err(1, "malloc(%zu)", (size_t)fs->fs_cssize); - bzero(fs->fs_csp, (size_t)fs->fs_cssize); - for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) { - if (bread(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)), - (void *)(((char *)fs->fs_csp) + i), - (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) { - err(1, "bread: %s", disk->d_error); - } - } - if (fs->fs_contigsumsize > 0) { - fs->fs_maxcluster = malloc(fs->fs_ncg * sizeof(int32_t)); - if (fs->fs_maxcluster == NULL) - err(1, "malloc(%zu)", fs->fs_ncg * sizeof(int32_t)); - for (i = 0; i < fs->fs_ncg; i++) - fs->fs_maxcluster[i] = fs->fs_contigsumsize; - } } /* @@ -286,11 +245,6 @@ static void closedisk(void) { - free(fs->fs_csp); - if (fs->fs_contigsumsize > 0) { - free(fs->fs_maxcluster); - fs->fs_maxcluster = NULL; - } fs->fs_clean = 1; if (sbwrite(disk, 0) == -1) err(1, "sbwrite(%s)", devnam); @@ -301,228 +255,7 @@ closedisk(void) fs = NULL; } -/* - * Write the statistics back, call closedisk(). - */ static void -putdisk(void) -{ - int i; - - assert(disk != NULL && fs != NULL); - for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) { - if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)), - (void *)(((char *)fs->fs_csp) + i), - (size_t)(fs->fs_cssize - i < fs->fs_bsize ? 
fs->fs_cssize - i : fs->fs_bsize)) == -1) { - err(1, "bwrite: %s", disk->d_error); - } - } - closedisk(); -} - -#if 0 -/* - * Free memory, close the disk, but don't write anything back. - */ -static void -canceldisk(void) -{ - int i; - - assert(disk != NULL && fs != NULL); - free(fs->fs_csp); - if (fs->fs_contigsumsize > 0) - free(fs->fs_maxcluster); - if (ufs_disk_close(disk) == -1) - err(1, "ufs_disk_close(%s)", devnam); - free(disk); - disk = NULL; - fs = NULL; -} -#endif - -static int -isblock(unsigned char *cp, ufs1_daddr_t h) -{ - unsigned char mask; - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0xff); - case 4: - mask = 0x0f << ((h & 0x1) << 2); - return ((cp[h >> 1] & mask) == mask); - case 2: - mask = 0x03 << ((h & 0x3) << 1); - return ((cp[h >> 2] & mask) == mask); - case 1: - mask = 0x01 << (h & 0x7); - return ((cp[h >> 3] & mask) == mask); - default: - assert(!"isblock: invalid number of fragments"); - } - return (0); -} - -/* - * put a block into the map - */ -static void -setblock(unsigned char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - cp[h] = 0xff; - return; - case 4: - cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); - return; - case 2: - cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); - return; - case 1: - cp[h >> 3] |= (0x01 << (h & 0x7)); - return; - default: - assert(!"setblock: invalid number of fragments"); - } -} - -/* - * check if a block is free - */ -static int -isfreeblock(u_char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0); - case 4: - return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); - case 2: - return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); - case 1: - return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); - default: - assert(!"isfreeblock: invalid number of fragments"); - } - return (0); -} - -/* - * Update the frsum fields to reflect addition or deletion - * of some frags. - */ -void -fragacct(int fragmap, int32_t fraglist[], int cnt) -{ - int inblk; - int field, subfield; - int siz, pos; - - inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; - fragmap <<= 1; - for (siz = 1; siz < fs->fs_frag; siz++) { - if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) - continue; - field = around[siz]; - subfield = inside[siz]; - for (pos = siz; pos <= fs->fs_frag; pos++) { - if ((fragmap & field) == subfield) { - fraglist[siz] += cnt; - pos += siz; - field <<= siz; - subfield <<= siz; - } - field <<= 1; - subfield <<= 1; - } - } -} - -static void -clusteracct(struct cg *cgp, ufs1_daddr_t blkno) -{ - int32_t *sump; - int32_t *lp; - u_char *freemapp, *mapp; - int i, start, end, forw, back, map, bit; - - if (fs->fs_contigsumsize <= 0) - return; - freemapp = cg_clustersfree(cgp); - sump = cg_clustersum(cgp); - /* - * Clear the actual block. - */ - setbit(freemapp, blkno); - /* - * Find the size of the cluster going forward. - */ - start = blkno + 1; - end = start + fs->fs_contigsumsize; - if (end >= cgp->cg_nclusterblks) - end = cgp->cg_nclusterblks; - mapp = &freemapp[start / NBBY]; - map = *mapp++; - bit = 1 << (start % NBBY); - for (i = start; i < end; i++) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != (NBBY - 1)) { - bit <<= 1; - } else { - map = *mapp++; - bit = 1; - } - } - forw = i - start; - /* - * Find the size of the cluster going backward. 
- */ - start = blkno - 1; - end = start - fs->fs_contigsumsize; - if (end < 0) - end = -1; - mapp = &freemapp[start / NBBY]; - map = *mapp--; - bit = 1 << (start % NBBY); - for (i = start; i > end; i--) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != 0) { - bit >>= 1; - } else { - map = *mapp--; - bit = 1 << (NBBY - 1); - } - } - back = start - i; - /* - * Account for old cluster and the possibly new forward and - * back clusters. - */ - i = back + forw + 1; - if (i > fs->fs_contigsumsize) - i = fs->fs_contigsumsize; - sump[i]++; - if (back > 0) - sump[back]--; - if (forw > 0) - sump[forw]--; - /* - * Update cluster summary information. - */ - lp = &sump[fs->fs_contigsumsize]; - for (i = fs->fs_contigsumsize; i > 0; i--) - if (*lp-- > 0) - break; - fs->fs_maxcluster[cgp->cg_cgx] = i; -} - -static void blkfree(ufs2_daddr_t bno, long size) { struct cgchain *cgc; @@ -539,10 +272,10 @@ blkfree(ufs2_daddr_t bno, long size) blksfree = cg_blksfree(cgp); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); - if (!isfreeblock(blksfree, fragno)) + if (!ffs_isfreeblock(fs, blksfree, fragno)) assert(!"blkfree: freeing free block"); - setblock(blksfree, fragno); - clusteracct(cgp, fragno); + ffs_setblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -552,7 +285,7 @@ blkfree(ufs2_daddr_t bno, long size) * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); - fragacct(blk, cgp->cg_frsum, -1); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ @@ -569,16 +302,16 @@ blkfree(ufs2_daddr_t bno, long size) * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); - fragacct(blk, cgp->cg_frsum, 1); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); - if (isblock(blksfree, fragno)) { + if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; - clusteracct(cgp, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -599,7 +332,7 @@ freeindir(ufs2_daddr_t blk, int level) if (bread(disk, fsbtodb(fs, blk), (void *)&sblks, (size_t)fs->fs_bsize) == -1) err(1, "bread: %s", disk->d_error); blks = (ufs2_daddr_t *)&sblks; - for (i = 0; i < howmany(fs->fs_bsize, sizeof(ufs2_daddr_t)); i++) { + for (i = 0; i < NINDIR(fs); i++) { if (blks[i] == 0) break; if (level == 0) @@ -671,7 +404,7 @@ gjournal_check(const char *filesys) int cg, mode; devnam = filesys; - getdisk(); + opendisk(); /* Are there any unreferenced inodes in this file system? */ if (fs->fs_unrefs == 0) { //printf("No unreferenced inodes.\n"); @@ -747,7 +480,7 @@ gjournal_check(const char *filesys) /* Zero-fill the inode. */ *dino = ufs2_zino; /* Write the inode back. */ - if (putino(disk, ino) == -1) + if (putino(disk) == -1) err(1, "putino(cg=%d ino=%d)", cg, ino); if (cgp->cg_unrefs == 0) { //printf("No more unreferenced inodes in cg=%d.\n", cg); @@ -772,5 +505,5 @@ gjournal_check(const char *filesys) /* Write back modified cylinder groups. */ putcgs(); /* Write back updated statistics and super-block. 
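 * libufs now loads the cg summaries into fs_csp itself (see the
 * sblock.c hunks below), so the sbwrite() done by closedisk()
 * flushes them and the separate putdisk() pass is gone.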
*/ - putdisk(); + closedisk(); } Index: sbin/fsck_ffs/main.c =================================================================== --- sbin/fsck_ffs/main.c (revision 202342) +++ sbin/fsck_ffs/main.c (working copy) @@ -256,7 +256,7 @@ checkfilesys(char *filesys) } if (ckclean && skipclean) { /* - * If file system is gjournaled, check it here. + * If file system is gjournaled or su+j, check it here. */ if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0) exit(3); /* Cannot read superblock */ @@ -278,6 +278,18 @@ checkfilesys(char *filesys) "CANNOT RUN FAST FSCK\n"); } } +#if 0 + if ((sblock.fs_flags & FS_SUJ) != 0) { + if (sblock.fs_clean == 1) { + pwarn("FILE SYSTEM CLEAN; SKIPPING CHECKS\n"); + exit(0); + } + suj_check(filesys); + if (chkdoreload(mntp) == 0) + exit(0); + exit(4); + } +#endif } /* * If we are to do a background check: @@ -299,7 +311,7 @@ checkfilesys(char *filesys) pfatal("MOUNTED READ-ONLY, CANNOT RUN IN BACKGROUND\n"); } else if ((fsreadfd = open(filesys, O_RDONLY)) >= 0) { if (readsb(0) != 0) { - if (sblock.fs_flags & FS_NEEDSFSCK) { + if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) { bkgrdflag = 0; pfatal("UNEXPECTED INCONSISTENCY, %s\n", "CANNOT RUN IN BACKGROUND\n"); @@ -481,6 +493,7 @@ checkfilesys(char *filesys) inocleanup(); if (fsmodified) { sblock.fs_time = time(NULL); + sblock.fs_mtime = time(NULL); sbdirty(); } if (cvtlevel && sblk.b_dirty) { Index: sbin/fsck_ffs/pass4.c =================================================================== --- sbin/fsck_ffs/pass4.c (revision 202342) +++ sbin/fsck_ffs/pass4.c (working copy) @@ -72,6 +72,9 @@ pass4(void) for (i = 0; i < inostathead[cg].il_numalloced; i++, inumber++) { if (inumber < ROOTINO) continue; + if (sblock.fs_flags & FS_SUJ && + inumber == sblock.fs_sujournal) + continue; idesc.id_number = inumber; switch (inoinfo(inumber)->ino_state) { Index: sbin/fsck_ffs/pass5.c =================================================================== --- sbin/fsck_ffs/pass5.c (revision 202342) +++ sbin/fsck_ffs/pass5.c (working copy) @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "fsck.h" Index: sbin/fsck_ffs/fsck.h =================================================================== --- sbin/fsck_ffs/fsck.h (revision 202342) +++ sbin/fsck_ffs/fsck.h (working copy) @@ -347,10 +347,6 @@ void direrror(ino_t ino, const char *errmesg); int dirscan(struct inodesc *); int dofix(struct inodesc *, const char *msg); int eascan(struct inodesc *, struct ufs2_dinode *dp); -void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); -void ffs_fragacct(struct fs *, int, int32_t [], int); -int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); -void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); void fileerror(ino_t cwd, ino_t ino, const char *errmesg); int findino(struct inodesc *); int findname(struct inodesc *); @@ -392,3 +388,4 @@ void sblock_init(void); void setinodebuf(ino_t); int setup(char *dev); void gjournal_check(const char *filesys); +void suj_check(const char *filesys); Index: sbin/fsck_ffs/Makefile =================================================================== --- sbin/fsck_ffs/Makefile (revision 202342) +++ sbin/fsck_ffs/Makefile (working copy) @@ -7,12 +7,12 @@ LINKS+= ${BINDIR}/fsck_ffs ${BINDIR}/fsck_4.2bsd MAN= fsck_ffs.8 MLINKS= fsck_ffs.8 fsck_ufs.8 fsck_ffs.8 fsck_4.2bsd.8 SRCS= dir.c ea.c fsutil.c inode.c main.c pass1.c pass1b.c pass2.c pass3.c \ - pass4.c pass5.c setup.c utilities.c ffs_subr.c ffs_tables.c gjournal.c \ - getmntopts.c + pass4.c pass5.c setup.c suj.c 
utilities.c gjournal.c getmntopts.c DPADD= ${LIBUFS} LDADD= -lufs +CFLAGS= -O0 -pipe WARNS?= 2 -CFLAGS+= -I${.CURDIR} -I${.CURDIR}/../mount +CFLAGS+= -I${.CURDIR} -I${.CURDIR}/../mount -g .PATH: ${.CURDIR}/../../sys/ufs/ffs ${.CURDIR}/../mount Index: sbin/fsdb/fsdbutil.c =================================================================== --- sbin/fsdb/fsdbutil.c (revision 202342) +++ sbin/fsdb/fsdbutil.c (working copy) @@ -52,7 +52,7 @@ static const char rcsid[] = #include "fsck.h" static int charsperline(void); -static int printindir(ufs2_daddr_t blk, int level, char *bufp); +static void printindir(ufs2_daddr_t blk, int level, char *bufp); static void printblocks(ino_t inum, union dinode *dp); char ** @@ -226,7 +226,7 @@ charsperline(void) /* * Recursively print a list of indirect blocks. */ -static int +static void printindir(ufs2_daddr_t blk, int level, char *bufp) { struct bufarea buf, *bp; @@ -234,6 +234,9 @@ printindir(ufs2_daddr_t blk, int level, char *bufp int i, j, cpl, charssofar; ufs2_daddr_t blkno; + if (blk == 0) + return; + printf("%jd (%d) =>\n", (intmax_t)blk, level); if (level == 0) { /* for the final indirect level, don't use the cache */ bp = &buf; @@ -251,11 +254,8 @@ printindir(ufs2_daddr_t blk, int level, char *bufp blkno = bp->b_un.b_indir1[i]; else blkno = bp->b_un.b_indir2[i]; - if (blkno == 0) { - if (level == 0) - putchar('\n'); - return 0; - } + if (blkno == 0) + continue; j = sprintf(tempbuf, "%jd", (intmax_t)blkno); if (level == 0) { charssofar += j; @@ -270,13 +270,14 @@ printindir(ufs2_daddr_t blk, int level, char *bufp charssofar += 2; } else { printf(" =>\n"); - if (printindir(blkno, level - 1, bufp) == 0) - return 0; + printindir(blkno, level - 1, bufp); + printf("\n"); + charssofar = 0; } } if (level == 0) putchar('\n'); - return 1; + return; } @@ -309,7 +310,7 @@ printblocks(ino_t inum, union dinode *dp) } } putchar('\n'); - if (DIP(dp, di_ib[0]) == 0) + if (ndb == 0) return; bufp = malloc((unsigned int)sblock.fs_bsize); @@ -317,8 +318,7 @@ printblocks(ino_t inum, union dinode *dp) errx(EEXIT, "cannot allocate indirect block buffer"); printf("Indirect blocks:\n"); for (i = 0; i < NIADDR; i++) - if (printindir(DIP(dp, di_ib[i]), i, bufp) == 0) - break; + printindir(DIP(dp, di_ib[i]), i, bufp); free(bufp); } Index: sbin/mount/mount.c =================================================================== --- sbin/mount/mount.c (revision 202342) +++ sbin/mount/mount.c (working copy) @@ -113,6 +113,7 @@ static struct opt { { MNT_ACLS, "acls" }, { MNT_NFS4ACLS, "nfsv4acls" }, { MNT_GJOURNAL, "gjournal" }, + { MNT_SUJ, "journal" }, /* always soft-updates, journal */ { 0, NULL } }; Index: sbin/tunefs/tunefs.c =================================================================== --- sbin/tunefs/tunefs.c (revision 202342) +++ sbin/tunefs/tunefs.c (working copy) @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -72,16 +73,19 @@ struct uufsd disk; void usage(void); void printfs(void); +int journal_alloc(int64_t size); +void sbdirty(void); int main(int argc, char *argv[]) { - char *avalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue; + char *avalue, *jvalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue; const char *special, *on; const char *name; int active; - int Aflag, aflag, eflag, evalue, fflag, fvalue, Jflag, Lflag, lflag; - int mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag, svalue; + int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, Lflag; + int lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue, 
pflag, sflag; + int svalue, Sflag, Svalue; int ch, found_arg, i; const char *chg[2]; struct ufs_args args; @@ -89,13 +93,13 @@ main(int argc, char *argv[]) if (argc < 3) usage(); - Aflag = aflag = eflag = fflag = Jflag = Lflag = lflag = mflag = 0; - Nflag = nflag = oflag = pflag = sflag = 0; - avalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL; - evalue = fvalue = mvalue = ovalue = svalue = 0; + Aflag = aflag = eflag = fflag = jflag = Jflag = Lflag = lflag = 0; + mflag = Nflag = nflag = oflag = pflag = sflag = 0; + avalue = jvalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL; + evalue = fvalue = mvalue = ovalue = svalue = Svalue = 0; active = 0; found_arg = 0; /* At least one arg is required. */ - while ((ch = getopt(argc, argv, "Aa:e:f:J:L:l:m:N:n:o:ps:")) != -1) + while ((ch = getopt(argc, argv, "Aa:e:f:j:J:L:l:m:N:n:o:ps:S:")) != -1) switch (ch) { case 'A': @@ -135,6 +139,18 @@ main(int argc, char *argv[]) fflag = 1; break; + case 'j': + found_arg = 1; + name = "softdep journaled file system"; + jvalue = optarg; + if (strcmp(jvalue, "enable") && + strcmp(jvalue, "disable")) { + errx(10, "bad %s (options are %s)", + name, "`enable' or `disable'"); + } + jflag = 1; + break; + case 'J': found_arg = 1; name = "gjournaled file system"; @@ -240,6 +256,16 @@ main(int argc, char *argv[]) sflag = 1; break; + case 'S': + found_arg = 1; + name = "Softdep Journal Size"; + Svalue = atoi(optarg); + if (Svalue < SUJ_MIN) + errx(10, "%s must be >= %d (was %s)", + name, SUJ_MIN, optarg); + Sflag = 1; + break; + default: usage(); } @@ -310,6 +336,33 @@ main(int argc, char *argv[]) sblock.fs_avgfilesize = fvalue; } } + if (jflag) { + name = "soft updates journaling"; + if (strcmp(jvalue, "enable") == 0) { + if ((sblock.fs_flags & (FS_DOSOFTDEP | FS_SUJ)) == + (FS_DOSOFTDEP | FS_SUJ)) { + warnx("%s remains unchanged as enabled", name); + } else if (sblock.fs_clean == 0) { + warnx("%s cannot be enabled until fsck is run", + name); + } else if (journal_alloc(Svalue) != 0) { + warnx("%s can not be enabled", name); + } else { + sblock.fs_flags |= FS_DOSOFTDEP | FS_SUJ; + warnx("%s set", name); + } + } else if (strcmp(jvalue, "disable") == 0) { + if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) { + warnx("%s remains unchanged as disabled", name); + } else { + sbdirty(); + sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ); + sblock.fs_sujournal = 0; + sblock.fs_sujfree = 0; + warnx("%s cleared", name); + } + } + } if (Jflag) { name = "gjournal"; if (strcmp(Jvalue, "enable") == 0) { @@ -456,6 +509,229 @@ err: } void +sbdirty(void) +{ + disk.d_fs.fs_flags |= FS_UNCLEAN | FS_NEEDSFSCK; + disk.d_fs.fs_clean = 0; +} + +int blocks; +static char clrbuf[MAXBSIZE]; + +static ufs2_daddr_t +journal_balloc(void) +{ + ufs2_daddr_t blk; + struct cg *cgp; + struct fs *fs; + int valid; + + cgp = &disk.d_cg; + fs = &disk.d_fs; + for (;;) { + blk = cgballoc(&disk); + if (blk > 0) + break; + /* + * If we failed to allocate a block from this cg, move to + * the next. + */ + if (cgwrite(&disk) < 0) { + warn("Failed to write updated cg"); + return (-1); + } + while ((valid = cgread(&disk)) == 1) { + /* + * Try to minimize fragmentation by requiring a minimum + * number of blocks present. 
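+ * Skipping any cg with fewer than blocks / 8 free blocks keeps
+ * the journal out of nearly-full groups, where its blocks would
+ * end up scattered among stray fragments.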
+ */ + if (cgp->cg_cs.cs_nbfree > blocks / 8) + break; + } + if (valid) + continue; + warnx("Failed to find sufficient free blocks for the journal"); + return -1; + } + if (bwrite(&disk, fsbtodb(fs, blk), clrbuf, fs->fs_bsize) <= 0) { + warn("Failed to initialize new block"); + return -1; + } + return (blk); +} + +static int +indir_fill(ufs2_daddr_t blk, int level, int *resid) +{ + char indirbuf[MAXBSIZE]; + ufs1_daddr_t *bap1; + ufs2_daddr_t *bap2; + ufs2_daddr_t nblk; + struct fs *fs; + int ncnt; + int cnt; + int i; + + fs = &disk.d_fs; + bzero(indirbuf, sizeof(indirbuf)); + bap1 = (ufs1_daddr_t *)indirbuf; + bap2 = (void *)bap1; + cnt = 0; + for (i = 0; i < NINDIR(fs) && *resid != 0; i++) { + nblk = journal_balloc(); + if (nblk <= 0) + return (-1); + cnt++; + if (fs->fs_magic == FS_UFS1_MAGIC) + *bap1++ = nblk; + else + *bap2++ = nblk; + if (level != 0) { + ncnt = indir_fill(nblk, level - 1, resid); + if (ncnt <= 0) + return (-1); + cnt += ncnt; + } else + (*resid)--; + } + if (bwrite(&disk, fsbtodb(fs, blk), indirbuf, fs->fs_bsize) <= 0) { + warn("Failed to write indirect"); + return (-1); + } + return (cnt); +} + +int +journal_alloc(int64_t size) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + ufs2_daddr_t blk; + void *ip; + struct cg *cgp; + struct fs *fs; + int resid; + ino_t ino; + int blks; + int mode; + int i; + + fs = &disk.d_fs; + cgp = &disk.d_cg; + ino = 0; + + /* + * If the user didn't supply a size pick one based on the filesystem + * size constrained with hardcoded MIN and MAX values. We opt for + * 1/1024th of the filesystem up to MAX but not exceeding one CG and + * not less than the MIN. + */ + if (size == 0) { + size = (fs->fs_size * fs->fs_bsize) / 1024; + size = MIN(SUJ_MAX, size); + if (size / fs->fs_fsize > fs->fs_fpg) + size = fs->fs_fpg * fs->fs_fsize; + size = MAX(SUJ_MIN, size); + } + resid = blocks = size / fs->fs_bsize; + if (fs->fs_cstotal.cs_nbfree < blocks) { + warn("Insufficient free space for %jd byte journal", size); + return (-1); + } + /* + * Find a cg with enough blocks to satisfy the journal + * size. Presently the journal does not span cgs. + */ + while (cgread(&disk) == 1) { + if (cgp->cg_cs.cs_nifree == 0) + continue; + /* + * Try to minimize fragmentation by requiring at least a + * 1/8th of the blocks be present in each cg we use. + */ + if (cgp->cg_cs.cs_nbfree < blocks / 8) + continue; + ino = cgialloc(&disk); + if (ino <= 0) + break; + printf("Using inode %d in cg %d for %jd byte journal\n", + ino, cgp->cg_cgx, size); + if (getino(&disk, &ip, ino, &mode) != 0) { + warn("Failed to get allocated inode"); + sbdirty(); + goto out; + } + /* + * We leave fields unrelated to the number of allocated + * blocks and size uninitialized. This causes legacy + * fsck implementations to clear the inode. 
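+ * Only di_size, di_mode, di_nlink, di_blocks and the block
+ * pointers are set below; an fsck that predates FS_SUJ sees an
+ * inconsistent inode and reclaims it rather than leaving a
+ * stale journal behind.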
+ */ + dp2 = ip; + dp1 = ip; + if (fs->fs_magic == FS_UFS1_MAGIC) { + bzero(dp1, sizeof(*dp1)); + dp1->di_size = size; + dp1->di_mode = IFREG; + dp1->di_nlink = 1; + } else { + bzero(dp2, sizeof(*dp2)); + dp2->di_size = size; + dp2->di_mode = IFREG; + dp2->di_nlink = 1; + } + for (i = 0; i < NDADDR && resid; i++, resid--) { + blk = journal_balloc(); + if (blk <= 0) + goto out; + if (fs->fs_magic == FS_UFS1_MAGIC) { + dp1->di_db[i] = blk; + dp1->di_blocks++; + } else { + dp2->di_db[i] = blk; + dp2->di_blocks++; + } + } + for (i = 0; i < NIADDR && resid; i++) { + blk = journal_balloc(); + if (blk <= 0) + goto out; + blks = indir_fill(blk, i, &resid) + 1; + if (blks <= 0) { + sbdirty(); + goto out; + } + if (fs->fs_magic == FS_UFS1_MAGIC) { + dp1->di_ib[i] = blk; + dp1->di_blocks += blks; + } else { + dp2->di_ib[i] = blk; + dp2->di_blocks += blks; + } + } + if (fs->fs_magic == FS_UFS1_MAGIC) + dp1->di_blocks *= fs->fs_bsize / disk.d_bsize; + else + dp2->di_blocks *= fs->fs_bsize / disk.d_bsize; + if (putino(&disk) < 0) { + warn("Failed to write inode"); + sbdirty(); + return (-1); + } + if (cgwrite(&disk) < 0) { + warn("Failed to write updated cg"); + sbdirty(); + return (-1); + } + fs->fs_sujournal = ino; + fs->fs_sujfree = 0; + return (0); + } + warnx("Insufficient contiguous free space for the journal."); +out: + return (-1); +} + +void usage(void) { fprintf(stderr, "%s\n%s\n%s\n%s\n", @@ -477,6 +753,8 @@ printfs(void) (sblock.fs_flags & FS_MULTILABEL)? "enabled" : "disabled"); warnx("soft updates: (-n) %s", (sblock.fs_flags & FS_DOSOFTDEP)? "enabled" : "disabled"); + warnx("soft update journaling: (-j) %s", + (sblock.fs_flags & FS_SUJ)? "enabled" : "disabled"); warnx("gjournal: (-J) %s", (sblock.fs_flags & FS_GJOURNAL)? "enabled" : "disabled"); warnx("maximum blocks per file in a cylinder group: (-e) %d", Index: usr.sbin/makefs/ffs/ffs_bswap.c =================================================================== --- usr.sbin/makefs/ffs/ffs_bswap.c (revision 202342) +++ usr.sbin/makefs/ffs/ffs_bswap.c (working copy) @@ -136,8 +136,6 @@ ffs_dinode1_swap(struct ufs1_dinode *o, struct ufs n->di_mode = bswap16(o->di_mode); n->di_nlink = bswap16(o->di_nlink); - n->di_u.oldids[0] = bswap16(o->di_u.oldids[0]); - n->di_u.oldids[1] = bswap16(o->di_u.oldids[1]); n->di_size = bswap64(o->di_size); n->di_atime = bswap32(o->di_atime); n->di_atimensec = bswap32(o->di_atimensec); Index: lib/libufs/inode.c =================================================================== --- lib/libufs/inode.c (revision 202342) +++ lib/libufs/inode.c (working copy) @@ -93,3 +93,19 @@ gotit: switch (disk->d_ufs) { ERROR(disk, "unknown UFS filesystem type"); return (-1); } + +int +putino(struct uufsd *disk) +{ + struct fs *fs; + + fs = &disk->d_fs; + if (disk->d_inoblock == NULL) { + ERROR(disk, "No inode block allocated"); + return (-1); + } + if (bwrite(disk, fsbtodb(fs, ino_to_fsba(&disk->d_fs, disk->d_inomin)), + disk->d_inoblock, disk->d_fs.fs_bsize) <= 0) + return (-1); + return (0); +} Index: lib/libufs/cgroup.c =================================================================== --- lib/libufs/cgroup.c (revision 202342) +++ lib/libufs/cgroup.c (working copy) @@ -40,11 +40,82 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include +ufs2_daddr_t +cgballoc(struct uufsd *disk) +{ + u_int8_t *blksfree; + struct cg *cgp; + struct fs *fs; + long bno; + + fs = &disk->d_fs; + cgp = &disk->d_cg; + blksfree = cg_blksfree(cgp); + for (bno = 0; bno < fs->fs_fpg / fs->fs_frag; bno++) + if 
(ffs_isblock(fs, blksfree, bno)) + goto gotit; + return (0); +gotit: + fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; + ffs_clrblock(fs, blksfree, (long)bno); + ffs_clusteracct(fs, cgp, bno, -1); + cgp->cg_cs.cs_nbfree--; + fs->fs_cstotal.cs_nbfree--; + fs->fs_fmod = 1; + return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, bno)); +} + +ino_t +cgialloc(struct uufsd *disk) +{ + struct ufs2_dinode *dp2; + u_int8_t *inosused; + struct cg *cgp; + struct fs *fs; + ino_t ino; + int i; + + fs = &disk->d_fs; + cgp = &disk->d_cg; + inosused = cg_inosused(cgp); + for (ino = 0; ino < fs->fs_ipg; ino++) + if (isclr(inosused, ino)) + goto gotit; + return (0); +gotit: + if (fs->fs_magic == FS_UFS2_MAGIC && + ino + INOPB(fs) > cgp->cg_initediblk && + cgp->cg_initediblk < cgp->cg_niblk) { + char block[MAXBSIZE]; + bzero(block, (int)fs->fs_bsize); + dp2 = (struct ufs2_dinode *)&block; + for (i = 0; i < INOPB(fs); i++) { + dp2->di_gen = arc4random() / 2 + 1; + dp2++; + } + if (bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, + cgp->cg_cgx * fs->fs_ipg + cgp->cg_initediblk)), + block, fs->fs_bsize)) + return (0); + cgp->cg_initediblk += INOPB(fs); + } + + setbit(inosused, ino); + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree--; + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cgp->cg_cgx).cs_nifree--; + fs->fs_fmod = 1; + + return (ino + (cgp->cg_cgx * fs->fs_ipg)); +} + int cgread(struct uufsd *disk) { @@ -55,14 +126,12 @@ int cgread1(struct uufsd *disk, int c) { struct fs *fs; - off_t ccg; fs = &disk->d_fs; if (c >= fs->fs_ncg) { return (0); } - ccg = fsbtodb(fs, cgtod(fs, c)) * disk->d_bsize; if (bread(disk, fsbtodb(fs, cgtod(fs, c)), disk->d_cgunion.d_buf, fs->fs_bsize) == -1) { ERROR(disk, "unable to read cylinder group"); @@ -73,6 +142,12 @@ cgread1(struct uufsd *disk, int c) } int +cgwrite(struct uufsd *disk) +{ + return (cgwrite1(disk, disk->d_lcg)); +} + +int cgwrite1(struct uufsd *disk, int c) { struct fs *fs; Index: lib/libufs/type.c =================================================================== --- lib/libufs/type.c (revision 202342) +++ lib/libufs/type.c (working copy) @@ -66,6 +66,10 @@ ufs_disk_close(struct uufsd *disk) free((char *)(uintptr_t)disk->d_name); disk->d_name = NULL; } + if (disk->d_sbcsum != NULL) { + free(disk->d_sbcsum); + disk->d_sbcsum = NULL; + } return (0); } @@ -156,6 +160,7 @@ again: if ((ret = stat(name, &st)) < 0) { disk->d_mine = 0; disk->d_ufs = 0; disk->d_error = NULL; + disk->d_sbcsum = NULL; if (oname != name) { name = strdup(name); Index: lib/libufs/libufs.h =================================================================== --- lib/libufs/libufs.h (revision 202342) +++ lib/libufs/libufs.h (working copy) @@ -71,6 +71,7 @@ struct uufsd { int d_fd; /* raw device file descriptor */ long d_bsize; /* device bsize */ ufs2_daddr_t d_sblock; /* superblock location */ + struct csum *d_sbcsum; /* Superblock summary info */ caddr_t d_inoblock; /* inode block */ ino_t d_inomin; /* low inode */ ino_t d_inomax; /* high inode */ @@ -109,14 +110,18 @@ int berase(struct uufsd *, ufs2_daddr_t, ufs2_dadd /* * cgroup.c */ +ufs2_daddr_t cgballoc(struct uufsd *); +ino_t cgialloc(struct uufsd *); int cgread(struct uufsd *); int cgread1(struct uufsd *, int); +int cgwrite(struct uufsd *); int cgwrite1(struct uufsd *, int); /* * inode.c */ int getino(struct uufsd *, void **, ino_t, int *); +int putino(struct uufsd *); /* * sblock.c @@ -132,6 +137,16 @@ int ufs_disk_fillout(struct uufsd *, const char *) int ufs_disk_fillout_blank(struct uufsd *, const char *); int ufs_disk_write(struct uufsd *); +/* + * ffs_subr.c + */
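+/*
+ * These are the kernel's FFS bitmap and cluster-accounting helpers.
+ * The Makefile change below compiles ffs_subr.c and ffs_tables.c
+ * straight from sys/ufs/ffs, so fsck_ffs and tunefs now share one
+ * implementation through libufs.
+ */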
+void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); +void ffs_fragacct(struct fs *, int, int32_t [], int); +int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); + __END_DECLS #endif /* __LIBUFS_H__ */ Index: lib/libufs/Makefile =================================================================== --- lib/libufs/Makefile (revision 202342) +++ lib/libufs/Makefile (working copy) @@ -3,7 +3,7 @@ LIB= ufs SHLIBDIR?= /lib -SRCS= block.c cgroup.c inode.c sblock.c type.c +SRCS= block.c cgroup.c inode.c sblock.c type.c ffs_subr.c ffs_tables.c INCS= libufs.h MAN= bread.3 cgread.3 libufs.3 sbread.3 ufs_disk_close.3 @@ -16,8 +16,11 @@ MLINKS+= ufs_disk_close.3 ufs_disk_fillout.3 MLINKS+= ufs_disk_close.3 ufs_disk_fillout_blank.3 MLINKS+= ufs_disk_close.3 ufs_disk_write.3 -WARNS?= 3 +.PATH: ${.CURDIR}/../../sys/ufs/ffs +WARNS?= 2 + +DEBUG_FLAGS = -g CFLAGS+= -D_LIBUFS .if defined(LIBUFS_DEBUG) CFLAGS+= -D_LIBUFS_DEBUGGING Index: lib/libufs/sblock.c =================================================================== --- lib/libufs/sblock.c (revision 202342) +++ lib/libufs/sblock.c (working copy) @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -49,8 +50,11 @@ static int superblocks[] = SBLOCKSEARCH; int sbread(struct uufsd *disk) { + uint8_t block[MAXBSIZE]; struct fs *fs; int sb, superblock; + int i, size, blks; + uint8_t *space; ERROR(disk, NULL); @@ -86,6 +90,34 @@ sbread(struct uufsd *disk) } disk->d_bsize = fs->fs_fsize / fsbtodb(fs, 1); disk->d_sblock = superblock / disk->d_bsize; + /* + * Read in the superblock summary information. + */ + size = fs->fs_cssize; + blks = howmany(size, fs->fs_fsize); + size += fs->fs_ncg * sizeof(int32_t); + space = malloc(size); + if (space == NULL) { + ERROR(disk, "failed to allocate space for summary information"); + return (-1); + } + fs->fs_csp = (struct csum *)space; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + if (bread(disk, fsbtodb(fs, fs->fs_csaddr + i), block, size) + == -1) { + ERROR(disk, "Failed to read sb summary information"); + free(fs->fs_csp); + return (-1); + } + bcopy(block, space, size); + space += size; + } + fs->fs_maxcluster = (uint32_t *)space; + disk->d_sbcsum = fs->fs_csp; + return (0); } @@ -93,7 +125,8 @@ int sbwrite(struct uufsd *disk, int all) { struct fs *fs; - int i; + int i, blks, size; + uint8_t *space; ERROR(disk, NULL); @@ -107,6 +140,22 @@ sbwrite(struct uufsd *disk, int all) ERROR(disk, "failed to write superblock"); return (-1); } + /* + * Write superblock summary information. 
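+ * This mirrors the read loop in sbread(): the csum area is
+ * written from fs_csaddr in full blocks, with a short final
+ * transfer of (blks - i) * fs_fsize when fs_cssize is not a
+ * multiple of the block size.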
+ */ + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = (uint8_t *)disk->d_sbcsum; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + i), space, size) + == -1) { + ERROR(disk, "Failed to write sb summary information"); + return (-1); + } + space += size; + } if (all) { for (i = 0; i < fs->fs_ncg; i++) if (bwrite(disk, fsbtodb(fs, cgsblock(fs, i)), Index: sys/ufs/ufs/ufs_dirhash.c =================================================================== --- sys/ufs/ufs/ufs_dirhash.c (revision 202342) +++ sys/ufs/ufs/ufs_dirhash.c (working copy) @@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$"); static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables"); -static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); - static int ufs_mindirhashsize = DIRBLKSIZ * 5; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, &ufs_mindirhashsize, Index: sys/ufs/ufs/inode.h =================================================================== --- sys/ufs/ufs/inode.h (revision 202342) +++ sys/ufs/ufs/inode.h (working copy) @@ -120,7 +120,7 @@ struct inode { #define IN_CHANGE 0x0002 /* Inode change time update request. */ #define IN_UPDATE 0x0004 /* Modification time update request. */ #define IN_MODIFIED 0x0008 /* Inode has been modified. */ -#define IN_RENAME 0x0010 /* Inode is being renamed. */ +#define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */ #define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */ #define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */ #define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the @@ -175,6 +175,7 @@ struct indir { /* Determine if soft dependencies are being done */ #define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP) #define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) +#define DOINGSUJ(vp) ((vp)->v_mount->mnt_flag & MNT_SUJ) /* This overlays the fid structure (see mount.h). */ struct ufid { Index: sys/ufs/ufs/dinode.h =================================================================== --- sys/ufs/ufs/dinode.h (revision 202342) +++ sys/ufs/ufs/dinode.h (working copy) @@ -146,7 +146,8 @@ struct ufs2_dinode { ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */ ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */ u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */ - int64_t di_spare[2]; /* 240: Reserved; currently unused */ + ino_t di_freelink; /* 240: SUJ: Next unlinked inode. */ + uint32_t di_spare[3]; /* 244: Reserved; currently unused */ }; /* @@ -167,9 +168,7 @@ struct ufs2_dinode { struct ufs1_dinode { u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ int16_t di_nlink; /* 2: File link count. */ - union { - u_int16_t oldids[2]; /* 4: Ffs: old user and group ids. */ - } di_u; + ino_t di_freelink; /* 4: SUJ: Next unlinked inode. */ u_int64_t di_size; /* 8: File byte count. */ int32_t di_atime; /* 16: Last access time. */ int32_t di_atimensec; /* 20: Last access time. */ @@ -186,7 +185,5 @@ struct ufs1_dinode { u_int32_t di_gid; /* 116: File group. 
*/ u_int64_t di_modrev; /* 120: i_modrev for NFSv4 */ }; -#define di_ogid di_u.oldids[1] -#define di_ouid di_u.oldids[0] #endif /* _UFS_UFS_DINODE_H_ */ Index: sys/ufs/ufs/ufs_vnops.c =================================================================== --- sys/ufs/ufs/ufs_vnops.c (revision 202342) +++ sys/ufs/ufs/ufs_vnops.c (working copy) @@ -114,6 +114,8 @@ static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; static vop_pathconf_t ufsfifo_pathconf; +SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); + /* * A virgin directory (no blushing please). */ @@ -974,6 +976,9 @@ ufs_link(ap) error = EXDEV; goto out; } + if (VTOI(tdvp)->i_effnlink < 2) + panic("ufs_link: Bad link count %d on parent", + VTOI(tdvp)->i_effnlink); ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; @@ -988,11 +993,11 @@ ufs_link(ap) DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) - softdep_change_linkcnt(ip); + softdep_setup_link(VTOI(tdvp), ip); error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); + error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0); } if (error) { @@ -1001,7 +1006,7 @@ ufs_link(ap) DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) - softdep_change_linkcnt(ip); + softdep_revert_link(VTOI(tdvp), ip); } out: return (error); @@ -1043,7 +1048,7 @@ ufs_whiteout(ap) newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; - error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); + error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0); break; case DELETE: @@ -1062,6 +1067,11 @@ ufs_whiteout(ap) return (error); } +static volatile int rename_restarts; +SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, + __DEVOLATILE(int *, &rename_restarts), 0, + "Times rename had to restart due to lock contention"); + /* * Rename system call. * rename("foo", "bar"); @@ -1101,111 +1111,183 @@ ufs_rename(ap) struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; + struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; - struct inode *ip, *xp, *dp; + struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; - int doingdirectory = 0, oldparent = 0, newparent = 0; + off_t endoff; + int doingdirectory, newparent; int error = 0, ioflag; - ino_t fvp_ino; + struct mount *mp; + ino_t ino; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif + endoff = 0; + mp = tdvp->v_mount; + VOP_UNLOCK(tdvp, 0); + if (tvp && tvp != tdvp) + VOP_UNLOCK(tvp, 0); /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; -abortit: - if (tdvp == tvp) - vrele(tdvp); - else - vput(tdvp); - if (tvp) - vput(tvp); - vrele(fdvp); + mp = NULL; + goto releout; + } + error = vfs_busy(mp, 0); + if (error) { + mp = NULL; + goto releout; + } +relock: + /* + * We need to acquire 2 to 4 locks depending on whether tvp is NULL + * and fdvp and tdvp are the same directory. Subsequently we need + * to double-check all paths and in the directory rename case we + * need to verify that we are not creating a directory loop. 
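+ * (For example, mv /a/b /a/b/c/d would orphan the subtree rooted
+ * at /a/b.)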
To + * handle this we acquire all but fdvp using non-blocking + * acquisitions. If we fail to acquire any lock in the path we will + * drop all held locks, acquire the new lock in a blocking fashion, + * and then release it and restart the rename. This acquire/release + * step ensures that we do not spin on a lock waiting for release. + */ + error = vn_lock(fdvp, LK_EXCLUSIVE); + if (error) + goto releout; + if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + VOP_UNLOCK(fdvp, 0); + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto releout; + VOP_UNLOCK(tdvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + /* + * Re-resolve fvp to be certain it still exists and fetch the + * correct vnode. + */ + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + goto releout; + } + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + VOP_UNLOCK(nvp, 0); vrele(fvp); - return (error); + fvp = nvp; + atomic_add_int(&rename_restarts, 1); + goto relock; } - + vrele(fvp); + fvp = nvp; + /* + * Re-resolve tvp and acquire the vnode lock if present. + */ + error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); + if (error != 0 && error != EJUSTRETURN) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + goto releout; + } + /* + * If tvp disappeared we just carry on. + */ + if (error == EJUSTRETURN && tvp != NULL) { + vrele(tvp); + tvp = NULL; + } + /* + * Get the tvp ino if the lookup succeeded. We may have to restart + * if the non-blocking acquire fails. + */ + if (error == 0) { + nvp = NULL; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (tvp) + vrele(tvp); + tvp = nvp; + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + VOP_UNLOCK(nvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + } + fdp = VTOI(fdvp); + fip = VTOI(fvp); + tdp = VTOI(tdvp); + tip = NULL; + if (tvp) + tip = VTOI(tvp); if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; - goto abortit; + goto unlockout; } - /* * Renaming a file to itself has no effect. The upper layers should - * not call us in that case. Temporarily just warn if they do. + * not call us in that case. However, things could change after + * we drop the locks above. */ if (fvp == tvp) { - printf("ufs_rename: fvp == tvp (can't happen)\n"); error = 0; - goto abortit; + goto unlockout; } - - if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) - goto abortit; - dp = VTOI(fdvp); - ip = VTOI(fvp); - if (ip->i_nlink >= LINK_MAX) { - VOP_UNLOCK(fvp, 0); + doingdirectory = 0; + newparent = 0; + ino = fip->i_number; + if (fip->i_nlink >= LINK_MAX) { error = EMLINK; - goto abortit; + goto unlockout; } - if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) - || (dp->i_flags & APPEND)) { - VOP_UNLOCK(fvp, 0); + if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) + || (fdp->i_flags & APPEND)) { error = EPERM; - goto abortit; + goto unlockout; } - if ((ip->i_mode & IFMT) == IFDIR) { + if ((fip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. 
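 * Renaming any of these would allow a directory to be detached
 * from, or nested inside, itself.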
*/ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || - dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || - (ip->i_flag & IN_RENAME)) { - VOP_UNLOCK(fvp, 0); + fdp == fip || + (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; - goto abortit; + goto unlockout; } - ip->i_flag |= IN_RENAME; - oldparent = dp->i_number; + if (fdp->i_number != tdp->i_number) + newparent = tdp->i_number; doingdirectory = 1; } - vrele(fdvp); - - /* - * When the target exists, both the directory - * and target vnodes are returned locked. - */ - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); - - /* - * 1) Bump link count while we're moving stuff - * around. If we crash somewhere before - * completing our work, the link count - * may be wrong, but correctable. - */ - ip->i_effnlink++; - ip->i_nlink++; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | - DOINGASYNC(fvp)))) != 0) { - VOP_UNLOCK(fvp, 0); - goto bad; + if (fvp->v_mountedhere != NULL || (tvp && tvp->v_mountedhere != NULL)) { + error = EXDEV; + goto unlockout; } /* @@ -1214,88 +1296,93 @@ ufs_rename(ap) * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so - * as to be able to change "..". We must repeat the call - * to namei, as the parent directory is unlocked by the - * call to checkpath(). + * as to be able to change "..". */ - error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); - fvp_ino = ip->i_number; - VOP_UNLOCK(fvp, 0); - if (oldparent != dp->i_number) - newparent = dp->i_number; if (doingdirectory && newparent) { - if (error) /* write access check above */ - goto bad; - if (xp != NULL) - vput(tvp); - error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred); + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); if (error) - goto out; + goto unlockout; + error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, + &ino); + /* + * We encountered a lock that we have to wait for. Unlock + * everything else and VGET before restarting. + */ + if (ino) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(fvp, 0); + VOP_UNLOCK(tdvp, 0); + if (tvp) + VOP_UNLOCK(tvp, 0); + error = VFS_VGET(mp, ino, LK_SHARED, &nvp); + if (error == 0) + vput(nvp); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + if (error) + goto unlockout; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); - VREF(tdvp); - error = relookup(tdvp, &tvp, tcnp); - if (error) - goto out; - vrele(tdvp); - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); } + if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || + tdp->i_effnlink == 0) + panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + fip->i_effnlink++; + fip->i_nlink++; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_setup_link(tdp, fip); + error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp))); + if (error) + goto bad; + + /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. 
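 * Below, the tip == NULL branch enters the new name with
 * ufs_direnter(); otherwise the existing target entry is
 * rewritten in place with ufs_dirrewrite().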
*/ - if (xp == NULL) { - if (dp->i_dev != ip->i_dev) + if (tip == NULL) { + if (tdp->i_dev != fip->i_dev) panic("ufs_rename: EXDEV"); - /* - * Account for ".." in new directory. - * When source and destination have the same - * parent we don't fool with the link count. - */ if (doingdirectory && newparent) { - if ((nlink_t)dp->i_nlink >= LINK_MAX) { + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't adjust the link count. The + * actual link modification is completed when + * .. is rewritten below. + */ + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } - dp->i_effnlink++; - dp->i_nlink++; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | - DOINGASYNC(tdvp))); - if (error) - goto bad; } - ufs_makedirentry(ip, tcnp, &newdir); - error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); - if (error) { - if (doingdirectory && newparent) { - dp->i_effnlink--; - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - (void)UFS_UPDATE(tdvp, 1); - } + ufs_makedirentry(fip, tcnp, &newdir); + error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1); + if (error) goto bad; - } - vput(tdvp); + /* Setup tdvp for directory compaction if needed. */ + if (tdp->i_count && tdp->i_endoff && + tdp->i_endoff < tdp->i_size) + endoff = tdp->i_endoff; } else { - if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ - if (xp->i_number == ip->i_number) + if (tip->i_number == fip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller @@ -1303,7 +1390,7 @@ ufs_rename(ap) * destination of the rename. This implements append-only * directories. */ - if ((dp->i_mode & S_ISTXT) && + if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; @@ -1314,9 +1401,9 @@ ufs_rename(ap) * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ - if ((xp->i_mode&IFMT) == IFDIR) { - if ((xp->i_effnlink > 2) || - !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { + if ((tip->i_mode & IFMT) == IFDIR) { + if ((tip->i_effnlink > 2) || + !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } @@ -1329,21 +1416,31 @@ ufs_rename(ap) error = EISDIR; goto bad; } - error = ufs_dirrewrite(dp, xp, ip->i_number, - IFTODT(ip->i_mode), - (doingdirectory && newparent) ? newparent : doingdirectory); - if (error) - goto bad; if (doingdirectory) { if (!newparent) { - dp->i_effnlink--; + tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); + softdep_change_linkcnt(tdp); } - xp->i_effnlink--; + tip->i_effnlink--; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(xp); + softdep_change_linkcnt(tip); } + error = ufs_dirrewrite(tdp, tip, fip->i_number, + IFTODT(fip->i_mode), + (doingdirectory && newparent) ? newparent : doingdirectory); + if (error) { + if (doingdirectory) { + if (!newparent) { + tdp->i_effnlink++; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + } + tip->i_effnlink++; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(tip); + } + } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * Truncate inode. 
The only stuff left in the directory @@ -1357,115 +1454,107 @@ ufs_rename(ap) * them now. */ if (!newparent) { - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; + tdp->i_nlink--; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; } - xp->i_nlink--; - DIP_SET(xp, i_nlink, xp->i_nlink); - xp->i_flag |= IN_CHANGE; + tip->i_nlink--; + DIP_SET(tip, i_nlink, tip->i_nlink); + tip->i_flag |= IN_CHANGE; ioflag = IO_NORMAL; if (!DOINGASYNC(tvp)) ioflag |= IO_SYNC; + /* Don't go to bad here as the new link exists. */ if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_thread)) != 0) - goto bad; + goto unlockout; } - vput(tdvp); - vput(tvp); - xp = NULL; } /* - * 3) Unlink the source. + * 3) Unlink the source. We have to resolve the path again to + * fixup the directory offset and count for ufs_dirremove. */ - fcnp->cn_flags &= ~MODMASK; - fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; - if ((fcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost from startdir"); - VREF(fdvp); - error = relookup(fdvp, &fvp, fcnp); - if (error == 0) - vrele(fdvp); - if (fvp != NULL) { - xp = VTOI(fvp); - dp = VTOI(fdvp); - } else { - /* - * From name has disappeared. IN_RENAME is not sufficient - * to protect against directory races due to timing windows, - * so we have to remove the panic. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. - */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - vrele(ap->a_fvp); - return (0); + if (fdvp == tdvp) { + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) + panic("ufs_rename: from entry went away!"); + if (ino != fip->i_number) + panic("ufs_rename: ino mismatch %d != %d\n", ino, + fip->i_number); } /* - * Ensure that the directory entry still exists and has not - * changed while the new name has been entered. If the source is - * a file then the entry may have been unlinked or renamed. In - * either case there is no further work to be done. If the source - * is a directory then it cannot have been rmdir'ed; the IN_RENAME - * flag ensures that it cannot be moved by another rename or removed - * by a rmdir. + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. */ - if (xp != ip) { + if (doingdirectory && newparent) { /* - * From name resolves to a different inode. IN_RENAME is - * not sufficient protection against timing window races - * so we can't panic here. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. + * If tip exists we simply use its link, otherwise we must + * add a new one. */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - } else { - /* - * If the source is a directory with a - * new parent, the link count of the old - * parent directory must be decremented - * and ".." set to point to the new parent. - */ - if (doingdirectory && newparent) { - xp->i_offset = mastertemplate.dot_reclen; - ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); - cache_purge(fdvp); + if (tip == NULL) { + tdp->i_effnlink++; + tdp->i_nlink++; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_setup_dotdot_link(tdp, fip); + error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | + DOINGASYNC(tdvp))); + /* Don't go to bad here as the new link exists. 
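+ * "bad" would roll back fip's link count bump, but the name
+ * entered in step 2 is already on disk and accounts for it.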
*/ + if (error) + goto unlockout; } - error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); - xp->i_flag &= ~IN_RENAME; + fip->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); + cache_purge(fdvp); } - if (dp) - vput(fdvp); - if (xp) - vput(fvp); - vrele(ap->a_fvp); + error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); + +unlockout: + vput(fdvp); + vput(fvp); + if (tvp) + vput(tvp); + /* + * If compaction or fsync was requested do it now that other locks + * are no longer needed. + */ + if (error == 0 && endoff != 0) { +#ifdef UFS_DIRHASH + if (tdp->i_dirhash != NULL) + ufsdirhash_dirtrunc(tdp, endoff); +#endif + UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred, + td); + } + if (error == 0 && tdp->i_flag & IN_NEEDSYNC) + error = VOP_FSYNC(tdvp, MNT_WAIT, td); + vput(tdvp); + if (mp) + vfs_unbusy(mp); return (error); bad: - if (xp) - vput(ITOV(xp)); - vput(ITOV(dp)); -out: - if (doingdirectory) - ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - ip->i_flag &= ~IN_RENAME; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - vput(fvp); - } else - vrele(fvp); + fip->i_effnlink--; + fip->i_nlink--; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_revert_link(tdp, fip); + goto unlockout; + +releout: + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp) + vrele(tvp); + if (mp) + vfs_unbusy(mp); + return (error); } @@ -1664,8 +1753,7 @@ ufs_mkdir(ap) ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); - if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); @@ -1681,8 +1769,8 @@ ufs_mkdir(ap) DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); + softdep_setup_mkdir(dp, ip); + error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); if (error) goto bad; #ifdef MAC @@ -1791,7 +1879,7 @@ ufs_mkdir(ap) else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); + error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0); bad: if (error == 0) { @@ -1807,8 +1895,6 @@ bad: dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. @@ -1818,7 +1904,8 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_mkdir(dp, ip); + vput(tvp); } out: @@ -1854,10 +1941,13 @@ ufs_rmdir(ap) * tries to remove a locally mounted on directory). 
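 * IN_RENAME is retired: the reworked rename keeps the vnodes it
 * operates on locked throughout, so the i_effnlink checks below
 * stand on their own.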
*/ error = 0; - if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) { + if (ip->i_effnlink < 2) { error = EINVAL; goto out; } + if (dp->i_effnlink < 3) + panic("ufs_dirrem: Bad link count %d on parent", + dp->i_effnlink); if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; @@ -1881,18 +1971,14 @@ ufs_rmdir(ap) */ dp->i_effnlink--; ip->i_effnlink--; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_revert_rmdir(dp, ip); goto out; } cache_purge(dvp); @@ -2401,6 +2487,9 @@ ufs_makeinode(mode, dvp, vpp, cnp) if ((mode & IFMT) == 0) mode |= IFREG; + if (VTOI(dvp)->i_effnlink < 2) + panic("ufs_makeinode: Bad link count %d on parent", + VTOI(dvp)->i_effnlink); error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); @@ -2530,7 +2619,7 @@ ufs_makeinode(mode, dvp, vpp, cnp) ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) { ip->i_mode &= ~ISGID; @@ -2594,7 +2683,7 @@ ufs_makeinode(mode, dvp, vpp, cnp) } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); + error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0); if (error) goto bad; *vpp = tvp; @@ -2610,7 +2699,7 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_create(VTOI(dvp), ip); vput(tvp); return (error); } Index: sys/ufs/ufs/ufsmount.h =================================================================== --- sys/ufs/ufs/ufsmount.h (revision 202342) +++ sys/ufs/ufs/ufsmount.h (working copy) @@ -57,7 +57,11 @@ struct ucred; struct uio; struct vnode; struct ufs_extattr_per_mount; +struct jblocks; +struct inodedep; +TAILQ_HEAD(inodedeplst, inodedep); + /* This structure describes the UFS specific mount structure data. 
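 * The softdep_journal* fields, softdep_jblocks and softdep_unlinked
 * added below carry the per-mount SUJ state: a second work queue for
 * journal records, the journal block map, and the list of inodes
 * awaiting unlinked-list processing.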
*/ struct ufsmount { struct mount *um_mountp; /* filesystem vfs structure */ @@ -75,6 +79,11 @@ struct ufsmount { long um_numindirdeps; /* outstanding indirdeps */ struct workhead softdep_workitem_pending; /* softdep work queue */ struct worklist *softdep_worklist_tail; /* Tail pointer for above */ + struct workhead softdep_journal_pending; /* journal work queue */ + struct worklist *softdep_journal_tail; /* Tail pointer for above */ + struct jblocks *softdep_jblocks; /* Journal block information */ + struct inodedeplst softdep_unlinked; /* Unlinked inodes */ + int softdep_on_journal; /* Items on the journal list */ int softdep_on_worklist; /* Items on the worklist */ int softdep_on_worklist_inprogress; /* Busy items on worklist */ int softdep_deps; /* Total dependency count */ Index: sys/ufs/ufs/ufs_lookup.c =================================================================== --- sys/ufs/ufs/ufs_lookup.c (revision 202342) +++ sys/ufs/ufs/ufs_lookup.c (working copy) @@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, /* true if old FS format...*/ #define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) -static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *, - ino_t *); - static int ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred, struct thread *td) @@ -189,11 +186,11 @@ ufs_lookup(ap) } */ *ap; { - return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); + return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } -static int -ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, +int +ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ @@ -524,6 +521,8 @@ notfound: return (ENOENT); found: + if (dd_ino != NULL) + *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* @@ -546,11 +545,6 @@ found: if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1); - if (dd_ino != NULL) { - *dd_ino = ino; - return (0); - } - /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. @@ -558,17 +552,6 @@ found: if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); - if ((error = VFS_VGET(vdp->v_mount, ino, - LK_EXCLUSIVE, &tdp)) != 0) - return (error); - - error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); - if (error) { - vput(tdp); - return (error); - } - - /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there @@ -585,6 +568,16 @@ found: dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; + if (dd_ino != NULL) + return (0); + if ((error = VFS_VGET(vdp->v_mount, ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); + if (error) { + vput(tdp); + return (error); + } if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; @@ -616,6 +609,8 @@ found: dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); + if (dd_ino != NULL) + return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); @@ -650,6 +645,8 @@ found: cnp->cn_flags |= SAVENAME; return (0); } + if (dd_ino != NULL) + return (0); /* * Step through the translation in the name. We do not `vput' the @@ -681,7 +678,7 @@ found: * to the inode we looked up before vdp lock was * dropped. 
*/ - error = ufs_lookup_(pdp, NULL, cnp, &ino1); + error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); @@ -825,12 +822,13 @@ ufs_makedirentry(ip, cnp, newdirp) * soft dependency code). */ int -ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) +ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename) struct vnode *dvp; struct vnode *tvp; struct direct *dirp; struct componentname *cnp; struct buf *newdirbp; + int isrename; { struct ucred *cr; struct thread *td; @@ -903,22 +901,28 @@ int blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, dp->i_offset, - dirp->d_ino, newdirbp, 1) == 0) { - bdwrite(bp); + dirp->d_ino, newdirbp, 1)) + dp->i_flag |= IN_NEEDSYNC; + if (newdirbp) + bdwrite(newdirbp); + bdwrite(bp); + if ((dp->i_flag & IN_NEEDSYNC) == 0) return (UFS_UPDATE(dvp, 0)); - } - /* We have just allocated a directory block in an - * indirect block. Rather than tracking when it gets - * claimed by the inode, we simply do a VOP_FSYNC - * now to ensure that it is there (in case the user - * does a future fsync). Note that we have to unlock - * the inode for the entry that we just entered, as - * the VOP_FSYNC may need to lock other inodes which - * can lead to deadlock if we also hold a lock on - * the newly entered node. + /* + * We have just allocated a directory block in an + * indirect block. We must prevent holes in the + * directory created if directory entries are + * written out of order. To accomplish this we + * fsync when we extend a directory into indirects. + * During rename it's not safe to drop the tvp lock + * so sync must be delayed until it is. + * + * This synchronous step could be removed if fsck and + * the kernel were taught to fill in sparse + * directories rather than panic. */ - if ((error = bwrite(bp))) - return (error); + if (isrename) + return (0); if (tvp != NULL) VOP_UNLOCK(tvp, 0); error = VOP_FSYNC(dvp, MNT_WAIT, td); @@ -1007,7 +1011,7 @@ int dp->i_offset + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) - softdep_change_directoryentry_offset(dp, dirbuf, + softdep_change_directoryentry_offset(bp, dp, dirbuf, (caddr_t)nep, (caddr_t)ep, dsize); else bcopy((caddr_t)nep, (caddr_t)ep, dsize); @@ -1059,6 +1063,8 @@ int (void) softdep_setup_directory_add(bp, dp, dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); + if (newdirbp != NULL) + bdwrite(newdirbp); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { @@ -1076,7 +1082,8 @@ int * lock other inodes which can lead to deadlock if we also hold a * lock on the newly entered node. */ - if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) { + if (isrename == 0 && error == 0 && + dp->i_endoff && dp->i_endoff < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp, 0); #ifdef UFS_DIRHASH @@ -1117,6 +1124,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir) dp = VTOI(dvp); + /* + * Adjust the link count early so softdep can block if necessary. + */ + if (ip) { + ip->i_effnlink--; + if (DOINGSOFTDEP(dvp)) { + softdep_setup_unlink(dp, ip); + } else { + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + } + } if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to WINO. @@ -1146,6 +1166,9 @@ ufs_dirremove(dvp, ip, flags, isrmdir) if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, rep, dp->i_offset); #endif + if (ip && rep->d_ino != ip->i_number) + panic("ufs_dirremove: ip %d does not match dirent ino %d\n", + ip->i_number, rep->d_ino); if (dp->i_count == 0) { /* * First entry in block: set d_ino to zero. 
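
Taken together, the ufs_dirremove() hunks above and the "out:" hunk that
follows reorder the function so that every link count change, and the
softdep notification that may block, happens before the directory buffer
is locked and edited. A minimal sketch of the resulting shape; the name
dirremove_sketch() and its reduced argument list are inventions for
exposition, not the committed function:

	static int
	dirremove_sketch(struct vnode *dvp, struct inode *ip)
	{
		struct inode *dp = VTOI(dvp);
		struct direct *ep;
		struct buf *bp;
		int error;

		/*
		 * 1) Drop the effective link first; softdep may sleep
		 * and journal the unlink here, before any buffer lock
		 * is held.
		 */
		ip->i_effnlink--;
		if (DOINGSOFTDEP(dvp)) {
			softdep_setup_unlink(dp, ip);
		} else {
			ip->i_nlink--;
			DIP_SET(ip, i_nlink, ip->i_nlink);
			ip->i_flag |= IN_CHANGE;
		}
		/* 2) Only then fetch and edit the directory block. */
		error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset,
		    (char **)&ep, &bp);
		if (error)
			return (error);
		/*
		 * ... zero ep->d_ino or extend the previous entry's
		 * d_reclen, then softdep_setup_remove() and bdwrite()
		 * or bwrite() as in the hunk that follows ...
		 */
		return (0);
	}
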
@@ -1164,31 +1187,20 @@ ufs_dirremove(dvp, ip, flags, isrmdir) dp->i_offset & ~(DIRBLKSIZ - 1)); #endif out: + error = 0; if (DOINGSOFTDEP(dvp)) { - if (ip) { - ip->i_effnlink--; - softdep_change_linkcnt(ip); + if (ip) softdep_setup_remove(bp, dp, ip, isrmdir); - } - if (softdep_slowdown(dvp)) { + if (softdep_slowdown(dvp)) error = bwrite(bp); - } else { + else bdwrite(bp); - error = 0; - } } else { - if (ip) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - } if (flags & DOWHITEOUT) error = bwrite(bp); - else if (DOINGASYNC(dvp) && dp->i_count != 0) { + else if (DOINGASYNC(dvp) && dp->i_count != 0) bdwrite(bp); - error = 0; - } else + else error = bwrite(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; @@ -1221,6 +1233,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) struct vnode *vdp = ITOV(dp); int error; + /* + * Drop the link before we lock the buf so softdep can block if + * necessary. + */ + oip->i_effnlink--; + if (DOINGSOFTDEP(vdp)) { + softdep_setup_unlink(dp, oip); + } else { + oip->i_nlink--; + DIP_SET(oip, i_nlink, oip->i_nlink); + oip->i_flag |= IN_CHANGE; + } + error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); if (error) return (error); @@ -1232,15 +1257,10 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; - oip->i_effnlink--; if (DOINGSOFTDEP(vdp)) { - softdep_change_linkcnt(oip); softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { - oip->i_nlink--; - DIP_SET(oip, i_nlink, oip->i_nlink); - oip->i_flag |= IN_CHANGE; if (DOINGASYNC(vdp)) { bdwrite(bp); error = 0; @@ -1355,25 +1375,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cre /* * Check if source directory is in the path of the target directory. - * Target is supplied locked, source is unlocked. - * The target is always vput before returning. */ int -ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred) +ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { - struct vnode *vp, *vp1; + struct mount *mp; + struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; - vp = ITOV(target); - if (target->i_number == source_ino) { - error = EEXIST; - goto out; - } + vp = tvp = ITOV(target); + mp = vp->v_mount; + *wait_ino = 0; + if (target->i_number == source_ino) + return (EEXIST); + if (target->i_number == parent_ino) + return (0); + if (target->i_number == ROOTINO) + return (0); error = 0; - if (target->i_number == ROOTINO) - goto out; - for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) @@ -1384,9 +1404,13 @@ int } if (dd_ino == ROOTINO) break; - error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1); - if (error != 0) + if (dd_ino == parent_ino) break; + error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1); + if (error != 0) { + *wait_ino = dd_ino; + break; + } /* Recheck that ".." still points to vp1 after relock of vp */ error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) { @@ -1398,14 +1422,14 @@ int vput(vp1); continue; } - vput(vp); + if (vp != tvp) + vput(vp); vp = vp1; } -out: if (error == ENOTDIR) - printf("checkpath: .. not a directory\n"); - if (vp != NULL) + panic("checkpath: .. 
not a directory\n"); + if (vp != tvp) vput(vp); return (error); } Index: sys/ufs/ufs/ufs_extern.h =================================================================== --- sys/ufs/ufs/ufs_extern.h (revision 202342) +++ sys/ufs/ufs/ufs_extern.h (working copy) @@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *); int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, struct buf *, int *, int *); int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **); -int ufs_checkpath(ino_t, struct inode *, struct ucred *); +int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *); void ufs_dirbad(struct inode *, doff_t, char *); int ufs_dirbadentry(struct vnode *, struct direct *, int); int ufs_dirempty(struct inode *, ino_t, struct ucred *); @@ -66,9 +66,11 @@ int ufs_extwrite(struct vop_write_args *); void ufs_makedirentry(struct inode *, struct componentname *, struct direct *); int ufs_direnter(struct vnode *, struct vnode *, struct direct *, - struct componentname *, struct buf *); + struct componentname *, struct buf *, int); int ufs_dirremove(struct vnode *, struct inode *, int, int); int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); +int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *, + ino_t *); int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); int ufs_inactive(struct vop_inactive_args *); int ufs_init(struct vfsconf *); @@ -81,19 +83,33 @@ vfs_root_t ufs_root; int ufs_uninit(struct vfsconf *); int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **); +#include +SYSCTL_DECL(_vfs_ufs); + /* * Soft update function prototypes. */ int softdep_setup_directory_add(struct buf *, struct inode *, off_t, ino_t, struct buf *, int); -void softdep_change_directoryentry_offset(struct inode *, caddr_t, - caddr_t, caddr_t, int); +void softdep_change_directoryentry_offset(struct buf *, struct inode *, + caddr_t, caddr_t, caddr_t, int); void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int); void softdep_setup_directory_change(struct buf *, struct inode *, struct inode *, ino_t, int); void softdep_change_linkcnt(struct inode *); void softdep_releasefile(struct inode *); int softdep_slowdown(struct vnode *); +void softdep_setup_create(struct inode *, struct inode *); +void softdep_setup_dotdot_link(struct inode *, struct inode *); +void softdep_setup_link(struct inode *, struct inode *); +void softdep_setup_mkdir(struct inode *, struct inode *); +void softdep_setup_rmdir(struct inode *, struct inode *); +void softdep_setup_unlink(struct inode *, struct inode *); +void softdep_revert_create(struct inode *, struct inode *); +void softdep_revert_dotdot_link(struct inode *, struct inode *); +void softdep_revert_link(struct inode *, struct inode *); +void softdep_revert_mkdir(struct inode *, struct inode *); +void softdep_revert_rmdir(struct inode *, struct inode *); /* * Flags to low-level allocation routines. 
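
The reworked ufs_checkpath() above never sleeps on a vnode lock while the caller's locks are held: it probes each ".." ancestor with LK_SHARED | LK_NOWAIT and, on contention, reports the blocking inode through *wait_ino so the rename code can back out, wait on that inode, and retry. A minimal userspace sketch of that trylock-and-report shape, with pthread mutexes standing in for vnode locks (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

#define NNODES 8

/* Toy tree: parent[i] is the parent directory of i; node 0 is the root. */
static int parent[NNODES] = { 0, 0, 1, 2, 0, 4, 5, 6 };
static pthread_mutex_t nodelock[NNODES];

/*
 * Decide whether 'source' is an ancestor of 'target' without ever
 * sleeping on a node lock.  The caller holds target's lock (like tvp
 * in rename).  On contention we record the blocker in *wait_node and
 * return -1 so the caller can unlock everything, wait, and retry --
 * the shape of the LK_NOWAIT/wait_ino handoff in ufs_checkpath().
 */
static int
checkpath(int source, int target, int *wait_node)
{
	int node, up, ret;

	*wait_node = -1;
	ret = 0;
	for (node = target; node != 0; node = up) {
		up = parent[node];
		if (up == source) {		/* source is an ancestor */
			ret = 1;
			break;
		}
		if (up == 0)
			break;
		if (pthread_mutex_trylock(&nodelock[up]) != 0) {
			*wait_node = up;	/* caller must wait on this */
			ret = -1;
			break;
		}
		if (node != target)
			pthread_mutex_unlock(&nodelock[node]);
	}
	if (node != target && node != 0)
		pthread_mutex_unlock(&nodelock[node]);
	return (ret);
}

int
main(void)
{
	int i, wait_node;

	for (i = 0; i < NNODES; i++)
		pthread_mutex_init(&nodelock[i], NULL);
	pthread_mutex_lock(&nodelock[7]);	/* caller holds the target */
	/* 7's ancestry is 6 -> 5 -> 4 -> 0, so 4 is an ancestor. */
	printf("source 4, target 7: %d\n", checkpath(4, 7, &wait_node));
	printf("source 3, target 7: %d\n", checkpath(3, 7, &wait_node));
	pthread_mutex_unlock(&nodelock[7]);
	return (0);
}
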
The low 16-bits are reserved Index: sys/ufs/ffs/ffs_vfsops.c =================================================================== --- sys/ufs/ffs/ffs_vfsops.c (revision 202342) +++ sys/ufs/ffs/ffs_vfsops.c (working copy) @@ -79,7 +79,6 @@ static int ffs_reload(struct mount *, struct threa static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); -static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; @@ -331,6 +330,7 @@ ffs_mount(struct mount *mp) MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); + fs->fs_mtime = time_second; fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); @@ -898,6 +898,7 @@ ffs_mountfs(devvp, mp, td) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); + mp->mnt_stat.f_iosize = fs->fs_bsize; if( mp->mnt_flag & MNT_ROOTFS) { /* @@ -909,6 +910,7 @@ ffs_mountfs(devvp, mp, td) } if (ronly == 0) { + fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); @@ -939,7 +941,6 @@ ffs_mountfs(devvp, mp, td) * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ - mp->mnt_stat.f_iosize = fs->fs_bsize; (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ @@ -1039,7 +1040,7 @@ ffs_oldfscompat_read(fs, ump, sblockloc) * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ -static void +void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; @@ -1134,6 +1135,7 @@ ffs_unmount(mp, mntflags) fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); + softdep_unmount(mp); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); @@ -1575,16 +1577,6 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags) DIP_SET(ip, i_gen, ip->i_gen); } } - /* - * Ensure that uid and gid are correct. This is a temporary - * fix until fsck has been changed to do the update. - */ - if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ - fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ - ip->i_uid = ip->i_din1->di_ouid; /* XXX */ - ip->i_gid = ip->i_din1->di_ogid; /* XXX */ - } /* XXX */ - #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* @@ -1728,6 +1720,8 @@ ffs_sbupdate(mp, waitfor, suspended) } fs->fs_fmod = 0; fs->fs_time = time_second; + if (fs->fs_flags & FS_DOSOFTDEP) + softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (suspended) @@ -1869,9 +1863,6 @@ ffs_bufwrite(struct buf *bp) } BO_UNLOCK(bp->b_bufobj); - /* Mark the buffer clean */ - bundirty(bp); - /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the @@ -1912,9 +1903,16 @@ ffs_bufwrite(struct buf *bp) newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES - /* move over the dependencies */ - if (!LIST_EMPTY(&bp->b_dep)) - softdep_move_dependencies(bp, newbp); + /* + * Move over the dependencies. If there are rollbacks, + * leave the parent buffer dirtied as it will need to + * be written again. 
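
Per the comment above, the rewritten ffs_bufwrite() (completed in the next hunk) calls bundirty() on the original buffer only when softdep_move_dependencies() reports that no rollbacks were left behind; otherwise the parent stays dirty because its rolled-back contents must be written again. A small sketch of that conditional-clean background write (hypothetical structures, not the buffer cache API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buffer {
	unsigned char data[512];
	int dirty;
	int rollbacks;	/* dependency fixups still applied to 'data' */
};

/*
 * Write 'bp' in the background: snapshot it, queue the copy for I/O,
 * and mark the original clean only when no rollbacks remain --
 * mirroring the ffs_bufwrite() change, where bundirty() is skipped
 * if softdep_move_dependencies() found rollback state.
 */
static void
background_write(struct buffer *bp)
{
	struct buffer *copy;

	copy = malloc(sizeof(*copy));
	memcpy(copy->data, bp->data, sizeof(copy->data));
	if (bp->rollbacks == 0)
		bp->dirty = 0;	/* safe: the copy is fully up to date */
	/* else: leave bp dirty, it must be written again later. */
	printf("queued copy for I/O, parent %s\n",
	    bp->dirty ? "still dirty" : "clean");
	free(copy);		/* stands in for the actual I/O */
}

int
main(void)
{
	struct buffer b = { .dirty = 1, .rollbacks = 1 };

	background_write(&b);
	b.rollbacks = 0;
	background_write(&b);
	return (0);
}
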
+ */ + if (LIST_EMPTY(&bp->b_dep) || + softdep_move_dependencies(bp, newbp) == 0) + bundirty(bp); +#else + bundirty(bp); #endif /* @@ -1927,8 +1925,11 @@ ffs_bufwrite(struct buf *bp) */ bqrelse(bp); bp = newbp; - } + } else + /* Mark the buffer clean */ + bundirty(bp); + /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); Index: sys/ufs/ffs/ffs_softdep.c =================================================================== --- sys/ufs/ffs/ffs_softdep.c (revision 202342) +++ sys/ufs/ffs/ffs_softdep.c (working copy) @@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$"); #ifndef DEBUG #define DEBUG #endif +#define SUJ_DEBUG #include #include @@ -62,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -130,10 +132,12 @@ softdep_setup_inomapdep(bp, ip, newinum) } void -softdep_setup_blkmapdep(bp, mp, newblkno) +softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; + int frags; + int oldfrags; { panic("softdep_setup_blkmapdep called"); @@ -403,31 +407,13 @@ softdep_get_depcounts(struct mount *mp, * These definitions need to be adapted to the system to which * this file is being ported. */ -/* - * malloc types defined for the softdep system. - */ -static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); -static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); -static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); -static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); -static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); -static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); -static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); -static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); -static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); -static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); -static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); -static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); -static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); -static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block"); -static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes"); #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) #define D_PAGEDEP 0 #define D_INODEDEP 1 -#define D_NEWBLK 2 -#define D_BMSAFEMAP 3 +#define D_BMSAFEMAP 2 +#define D_NEWBLK 3 #define D_ALLOCDIRECT 4 #define D_INDIRDEP 5 #define D_ALLOCINDIR 6 @@ -438,8 +424,68 @@ softdep_get_depcounts(struct mount *mp, #define D_MKDIR 11 #define D_DIRREM 12 #define D_NEWDIRBLK 13 -#define D_LAST D_NEWDIRBLK +#define D_FREEWORK 14 +#define D_FREEDEP 15 +#define D_JADDREF 16 +#define D_JREMREF 17 +#define D_JMVREF 18 +#define D_JNEWBLK 19 +#define D_JFREEBLK 20 +#define D_JFREEFRAG 21 +#define D_JSEG 22 +#define D_JSEGDEP 23 +#define D_SBDEP 24 +#define D_JTRUNC 25 +#define D_LAST D_JTRUNC +unsigned long dep_current[D_LAST + 1]; +unsigned long dep_total[D_LAST + 1]; + + +SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, + "total dependencies allocated"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, + "current dependencies allocated"); + +#define SOFTDEP_TYPE(type, str, long) \ + static MALLOC_DEFINE(M_ ## type, #str, long); \ + 
SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ + &dep_total[D_ ## type], 0, ""); \ + SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ + &dep_current[D_ ## type], 0, ""); + +SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); +SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); +SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, + "Block or frag allocated from cyl group map"); +SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); +SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); +SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); +SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); +SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); +SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); +SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); +SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); +SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); +SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); +SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); +SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); +SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); +SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); +SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); +SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); +SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); +SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); +SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); +SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); +SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); +SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); +SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); + +static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); +static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); + /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX @@ -447,8 +493,8 @@ softdep_get_depcounts(struct mount *mp, static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, + M_BMSAFEMAP, M_NEWBLK, - M_BMSAFEMAP, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, @@ -458,7 +504,19 @@ static struct malloc_type *memtype[] = { M_DIRADD, M_MKDIR, M_DIRREM, - M_NEWDIRBLK + M_NEWDIRBLK, + M_FREEWORK, + M_FREEDEP, + M_JADDREF, + M_JREMREF, + M_JMVREF, + M_JNEWBLK, + M_JFREEBLK, + M_JFREEFRAG, + M_JSEG, + M_JSEGDEP, + M_SBDEP, + M_JTRUNC }; #define DtoM(type) (memtype[type]) @@ -467,17 +525,21 @@ static struct malloc_type *memtype[] = { * Names of malloc types. */ #define TYPENAME(type) \ - ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") + ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ +#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) +#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) + /* * Forward declarations. */ struct inodedep_hashhead; struct newblk_hashhead; struct pagedep_hashhead; +struct bmsafemap_hashhead; /* * Internal function prototypes. 
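
For reference, the first invocation above expands, after preprocessing and modulo whitespace, to one malloc type plus two read-only sysctls, so every dependency type automatically gains debug.softdep.total.* and debug.softdep.current.* counters:

SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");

/* becomes: */
static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
SYSCTL_LONG(_debug_softdep_total, OID_AUTO, pagedep, CTLFLAG_RD,
    &dep_total[D_PAGEDEP], 0, "");
SYSCTL_LONG(_debug_softdep_current, OID_AUTO, pagedep, CTLFLAG_RD,
    &dep_current[D_PAGEDEP], 0, "");
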
@@ -487,59 +549,171 @@ static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct mtx *, int); static void clear_remove(struct thread *); static void clear_inodedeps(struct thread *); +static void unlinked_inodedep(struct mount *, struct inodedep *); +static void clear_unlinked_inodedep(struct inodedep *); +static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); +static void free_pagedep(struct pagedep *); +static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int handle_written_filepage(struct pagedep *, struct buf *); +static int handle_written_sbdep(struct sbdep *, struct buf *); +static void initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); +static int handle_written_indirdep(struct indirdep *, struct buf *, + struct buf**); static int handle_written_inodeblock(struct inodedep *, struct buf *); -static void handle_allocdirect_partdone(struct allocdirect *); +static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); +static void handle_written_jaddref(struct jaddref *, struct jseg *); +static void handle_written_jremref(struct jremref *, struct jseg *); +static void handle_written_jseg(struct jseg *, struct buf *); +static void handle_written_jnewblk(struct jnewblk *, struct jseg *); +static void handle_written_jfreeblk(struct jfreeblk *, struct jseg *); +static void handle_written_jfreefrag(struct jfreefrag *, struct jseg *); +static void complete_jseg(struct jseg *); +static void jseg_write(struct fs *, struct jblocks *, struct jseg *, + uint8_t *); +static void jaddref_write(struct jaddref *, uint8_t *); +static void jremref_write(struct jremref *, uint8_t *); +static void jmvref_write(struct jmvref *, uint8_t *); +static void jtrunc_write(struct jtrunc *, uint8_t *); +static void jnewblk_write(struct jnewblk *, uint8_t *); +static void jfreeblk_write(struct jfreeblk *, uint8_t *); +static void jfreefrag_write(struct jfreefrag *, uint8_t *); +static inline void inoref_write(struct inoref *, struct jrefrec *); +static void handle_allocdirect_partdone(struct allocdirect *, + struct workhead *); +static void cancel_newblk(struct newblk *, struct workhead *); +static void indirdep_complete(struct indirdep *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); +static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); +static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static void handle_workitem_remove(struct dirrem *, struct vnode *); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); -static void free_diradd(struct diradd *); -static void free_allocindir(struct allocindir *, struct inodedep *); +static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *, + struct freeblks *); +static void free_indirdep(struct indirdep *); +static void free_diradd(struct diradd *, struct workhead *); +static void 
merge_diradd(struct inodedep *, struct diradd *); +static void complete_diradd(struct diradd *); +static struct diradd *diradd_lookup(struct pagedep *, int); +static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, + struct jremref *, struct jremref *); +static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, + struct jremref *); +static void cancel_allocindir(struct allocindir *, struct inodedep *, + struct freeblks *); +static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); -static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t, - ufs2_daddr_t *); -static void deallocate_dependencies(struct buf *, struct inodedep *); -static void free_allocdirect(struct allocdirectlst *, - struct allocdirect *, int); +static void free_jremref(struct jremref *); +static void free_jaddref(struct jaddref *); +static void free_jsegdep(struct jsegdep *); +static void free_jseg(struct jseg *); +static void free_jnewblk(struct jnewblk *); +static void free_jfreeblk(struct jfreeblk *); +static void free_jfreefrag(struct jfreefrag *); +static void free_freedep(struct freedep *); +static void journal_jremref(struct dirrem *, struct jremref *, + struct inodedep *); +static void cancel_jnewblk(struct jnewblk *, struct workhead *); +static int cancel_jaddref(struct jaddref *, struct inodedep *, + struct workhead *); +static void cancel_jfreefrag(struct jfreefrag *); +static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); +static int deallocate_dependencies(struct buf *, struct inodedep *, + struct freeblks *); +static void free_newblk(struct newblk *); +static void cancel_allocdirect(struct allocdirectlst *, + struct allocdirect *, struct freeblks *, int); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); +static void freework_freeblock(struct freework *); static void handle_workitem_freeblocks(struct freeblks *, int); +static void handle_complete_freeblocks(struct freeblks *); +static void handle_workitem_indirblk(struct freework *); +static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static void setup_allocindir_phase2(struct buf *, struct inode *, - struct allocindir *); + struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, - ufs2_daddr_t); + ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); -static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long); +static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, + ufs_lbn_t); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); -static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *); -static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t, - struct newblk **); -static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **); +static struct freefrag *allocindir_merge(struct allocindir *, + struct allocindir *); +static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, + struct bmsafemap **); +static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, + int cg); +static int 
newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, + int, struct newblk **); +static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); -static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); +static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int, + struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct mount *mp, int, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int process_worklist_item(struct mount *, int); -static void add_to_worklist(struct worklist *); +static void process_removes(struct vnode *); +static void jwork_move(struct workhead *, struct workhead *); +static void add_to_worklist(struct worklist *, int); +static void remove_from_worklist(struct worklist *); static void softdep_flush(void); static int softdep_speedup(void); +static void worklist_speedup(void); +static int journal_mount(struct mount *, struct fs *, struct ucred *); +static void journal_unmount(struct mount *); +static int journal_space(struct ufsmount *, int); +static void journal_suspend(struct ufsmount *); +static void softdep_prelink(struct vnode *, struct vnode *); +static void add_to_journal(struct worklist *); +static void remove_from_journal(struct worklist *); +static void softdep_process_journal(struct mount *, int); +static struct jremref *newjremref(struct dirrem *, struct inode *, + struct inode *ip, off_t, nlink_t); +static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, + uint16_t); +static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, + uint16_t); +static inline struct jsegdep *inoref_segattach(struct inoref *, struct jseg *); +static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); +static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, + ufs2_daddr_t, int); +static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, + ufs2_daddr_t, long, ufs_lbn_t); +static struct freework *newfreework(struct freeblks *, struct freework *, + ufs_lbn_t, ufs2_daddr_t, int, int); +static void jwait(struct worklist *wk); +static struct inodedep *inodedep_lookup_ip(struct inode *); +static int bmsafemap_rollbacks(struct bmsafemap *); +static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); +static void handle_jwork(struct workhead *); +static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, + struct mkdir **); +static struct jblocks *jblocks_create(void); +static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); +static void jblocks_free(struct jblocks *, struct mount *, int); +static void jblocks_destroy(struct jblocks *); +static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); /* * Exported softdep operations. 
@@ -572,40 +746,128 @@ MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) +#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT +#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE + #else /* DEBUG */ -static void worklist_insert(struct workhead *, struct worklist *); -static void worklist_remove(struct worklist *); +static void worklist_insert(struct workhead *, struct worklist *, int); +static void worklist_remove(struct worklist *, int); -#define WORKLIST_INSERT(head, item) worklist_insert(head, item) -#define WORKLIST_REMOVE(item) worklist_remove(item) +#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) +#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) +#define WORKLIST_REMOVE(item) worklist_remove(item, 1) +#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) static void -worklist_insert(head, item) +worklist_insert(head, item, locked) struct workhead *head; struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if (item->wk_state & ONWORKLIST) - panic("worklist_insert: already on list"); + panic("worklist_insert: %p %s(0x%X) already on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void -worklist_remove(item) +worklist_remove(item, locked) struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if ((item->wk_state & ONWORKLIST) == 0) - panic("worklist_remove: not on list"); + panic("worklist_remove: %p %s(0x%X) not on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* + * Merge two jsegdeps keeping only the oldest one as newer references + * can't be discarded until after older references. + */ +static inline struct jsegdep * +jsegdep_merge(struct jsegdep *one, struct jsegdep *two) +{ + struct jsegdep *swp; + + if (two == NULL) + return (one); + + if (one->jd_seg->js_seq > two->jd_seg->js_seq) { + swp = one; + one = two; + two = swp; + } + WORKLIST_REMOVE(&two->jd_list); + free_jsegdep(two); + + return (one); +} + +/* + * If two freedeps are compatible free one to reduce list size. + */ +static inline struct freedep * +freedep_merge(struct freedep *one, struct freedep *two) +{ + if (two == NULL) + return (one); + + if (one->fd_freework == two->fd_freework) { + WORKLIST_REMOVE(&two->fd_list); + free_freedep(two); + } + return (one); +} + +/* + * Move journal work from one list to another. Duplicate freedeps and + * jsegdeps are coalesced to keep the lists as small as possible. 
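
jsegdep_merge() above keeps whichever reference names the oldest journal segment: segments are reclaimed strictly in sequence order, so holding the oldest reference covers both. A standalone sketch of that keep-the-minimum-sequence merge (hypothetical struct, not the kernel types):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct segref { uint64_t seq; };

/*
 * Keep whichever reference pins the older journal segment and free
 * the other; since segments are retired oldest-first, pinning the
 * oldest is sufficient.  Same shape as jsegdep_merge() above.
 */
static struct segref *
segref_merge(struct segref *one, struct segref *two)
{
	struct segref *swp;

	if (two == NULL)
		return (one);
	if (one->seq > two->seq) {
		swp = one;
		one = two;
		two = swp;
	}
	free(two);
	return (one);
}

int
main(void)
{
	struct segref *a = malloc(sizeof(*a));
	struct segref *b = malloc(sizeof(*b));

	a->seq = 42;
	b->seq = 7;
	a = segref_merge(a, b);		/* keeps seq 7 */
	printf("kept seq %ju\n", (uintmax_t)a->seq);
	free(a);
	return (0);
}
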
+ */ +static void +jwork_move(dst, src) + struct workhead *dst; + struct workhead *src; +{ + struct freedep *freedep; + struct jsegdep *jsegdep; + struct worklist *wkn; + struct worklist *wk; + + KASSERT(dst != src, + ("jwork_move: dst == src")); + freedep = NULL; + jsegdep = NULL; + LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { + if (wk->wk_type == D_JSEGDEP) + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } + + mtx_assert(&lk, MA_OWNED); + while ((wk = LIST_FIRST(src)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(dst, wk); + if (wk->wk_type == D_JSEGDEP) { + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + continue; + } + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } +} + +/* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); @@ -623,13 +885,16 @@ workitem_free(item, type) #ifdef DEBUG if (item->wk_state & ONWORKLIST) - panic("workitem_free: still on list"); + panic("workitem_free: %s(0x%X) still on list", + TYPENAME(item->wk_type), item->wk_state); if (item->wk_type != type) - panic("workitem_free: type mismatch"); + panic("workitem_free: type mismatch %s != %s", + TYPENAME(item->wk_type), TYPENAME(type)); #endif ump = VFSTOUFS(item->wk_mp); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); + dep_current[type]--; free(item, DtoM(type)); } @@ -643,6 +908,8 @@ workitem_alloc(item, type, mp) item->wk_mp = mp; item->wk_state = 0; ACQUIRE_LOCK(&lk); + dep_current[type]++; + dep_total[type]++; VFSTOUFS(mp)->softdep_deps++; VFSTOUFS(mp)->softdep_accdeps++; FREE_LOCK(&lk); @@ -679,23 +946,38 @@ static int stat_inode_bitmap; /* bufs redirtied as static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ -SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); -/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */ +SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, + &max_softdeps, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, + &tickdelay, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, + &maxindirdeps, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, + &stat_worklist_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, + 
&stat_blk_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, + &stat_ino_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, + &stat_blk_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, + &stat_ino_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, + &stat_sync_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, + &stat_indir_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, + &stat_inode_bitmap, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, + &stat_direct_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, + &stat_dir_entry, 0, ""); SYSCTL_DECL(_vfs_ffs); +LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; +static u_long bmsafemap_hash; /* size of hash table - 1 */ + static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); @@ -770,16 +1052,22 @@ softdep_flush(void) } } -static int -softdep_speedup(void) +static void +worklist_speedup(void) { - mtx_assert(&lk, MA_OWNED); if (req_pending == 0) { req_pending = 1; wakeup(&req_pending); } +} +static int +softdep_speedup(void) +{ + + worklist_speedup(); + bd_speedup(); return speedup_syncer(); } @@ -791,15 +1079,17 @@ softdep_flush(void) * and does so in order from first to last. */ static void -add_to_worklist(wk) +add_to_worklist(wk, nodelay) struct worklist *wk; + int nodelay; { struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); ump = VFSTOUFS(wk->wk_mp); if (wk->wk_state & ONWORKLIST) - panic("add_to_worklist: already on list"); + panic("add_to_worklist: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (LIST_EMPTY(&ump->softdep_workitem_pending)) LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); @@ -807,9 +1097,33 @@ static void LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; ump->softdep_on_worklist += 1; + if (nodelay) + worklist_speedup(); } /* + * Remove the item to be processed. If we are removing the last + * item on the list, we need to recalculate the tail pointer. + */ +static void +remove_from_worklist(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + struct worklist *wkend; + + ump = VFSTOUFS(wk->wk_mp); + WORKLIST_REMOVE(wk); + if (wk == ump->softdep_worklist_tail) { + LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) + if (LIST_NEXT(wkend, wk_list) == NULL) + break; + ump->softdep_worklist_tail = wkend; + } + ump->softdep_on_worklist -= 1; +} + +/* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they @@ -838,8 +1152,9 @@ softdep_process_worklist(mp, full) ACQUIRE_LOCK(&lk); loopcount = 1; starttime = time_second; + softdep_process_journal(mp, full?MNT_WAIT:0); while (ump->softdep_on_worklist > 0) { - if ((cnt = process_worklist_item(mp, 0)) == -1) + if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1) break; else matchcnt += cnt; @@ -871,16 +1186,61 @@ softdep_process_worklist(mp, full) * second. Otherwise the other mountpoints may get * excessively backlogged. 
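
add_to_worklist() and remove_from_worklist() above emulate a TAILQ with a plain LIST plus a cached tail pointer: appends are O(1) through the tail, and only the rare removal of the tail itself walks the list to recover it. A compilable userspace sketch of the same queue discipline:

#include <sys/queue.h>
#include <stdio.h>

struct item {
	LIST_ENTRY(item) link;
	int id;
};
LIST_HEAD(itemhd, item);

static struct itemhd pending = LIST_HEAD_INITIALIZER(pending);
static struct item *tail;	/* emulated TAILQ tail pointer */

/* FIFO append on a LIST: O(1) via the cached tail, as in add_to_worklist(). */
static void
enqueue(struct item *it)
{

	if (LIST_EMPTY(&pending))
		LIST_INSERT_HEAD(&pending, it, link);
	else
		LIST_INSERT_AFTER(tail, it, link);
	tail = it;
}

/* Removing the tail restores it by walking, as in remove_from_worklist(). */
static void
dequeue(struct item *it)
{
	struct item *end;

	LIST_REMOVE(it, link);
	if (it == tail) {
		LIST_FOREACH(end, &pending, link)
			if (LIST_NEXT(end, link) == NULL)
				break;
		tail = end;	/* NULL when the list is now empty */
	}
}

int
main(void)
{
	struct item a = { .id = 1 }, b = { .id = 2 };

	enqueue(&a);
	enqueue(&b);
	dequeue(&b);			/* tail falls back to a */
	printf("tail id %d\n", tail->id);
	dequeue(&a);
	return (0);
}
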
*/ - if (!full && starttime != time_second) { - matchcnt = -1; + if (!full && starttime != time_second) break; - } } FREE_LOCK(&lk); return (matchcnt); } /* + * Process all removes associated with a vnode if we are running out of + * journal space. Any other process which attempts to flush these will + * be unable to do so as we have the vnodes locked. + */ +static void +process_removes(vp) + struct vnode *vp; +{ + struct inodedep *inodedep; + struct dirrem *dirrem; + struct mount *mp; + ino_t inum; + + mtx_assert(&lk, MA_OWNED); + + mp = vp->v_mount; + inum = VTOI(vp)->i_number; + for (;;) { + if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) + return; + LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) + if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == + (COMPLETE | ONWORKLIST)) + break; + if (dirrem == NULL) + return; + /* + * If another thread is trying to lock this vnode it will + * fail but we must wait for it to do so before we can + * proceed. + */ + if (dirrem->dm_state & INPROGRESS) { + dirrem->dm_state |= IOWAITING; + msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0); + continue; + } + remove_from_worklist(&dirrem->dm_list); + FREE_LOCK(&lk); + if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) + panic("process_removes: suspended filesystem"); + handle_workitem_remove(dirrem, vp); + vn_finished_secondary_write(mp); + ACQUIRE_LOCK(&lk); + } +} + +/* * Process one item on the worklist. */ static int @@ -888,7 +1248,7 @@ process_worklist_item(mp, flags) struct mount *mp; int flags; { - struct worklist *wk, *wkend; + struct worklist *wk, *wkXXX; struct ufsmount *ump; struct vnode *vp; int matchcnt = 0; @@ -908,11 +1268,14 @@ * inodes, we have to skip over any dirrem requests whose * vnodes are resident and locked. */ + vp = NULL; ump = VFSTOUFS(mp); - vp = NULL; LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { - if (wk->wk_state & INPROGRESS) + if (wk->wk_state & INPROGRESS) { + wkXXX = wk; continue; + } + wkXXX = wk; /* Record the last valid wk pointer. */ if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) break; wk->wk_state |= INPROGRESS; @@ -921,6 +1284,10 @@ ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum, LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); ACQUIRE_LOCK(&lk); + if (wk->wk_state & IOWAITING) { + wk->wk_state &= ~IOWAITING; + wakeup(wk); + } wk->wk_state &= ~INPROGRESS; ump->softdep_on_worklist_inprogress--; if (vp != NULL) @@ -928,21 +1295,7 @@ } if (wk == 0) return (-1); - /* - * Remove the item to be processed. If we are removing the last - * item on the list, we need to recalculate the tail pointer. - * As this happens rarely and usually when the list is short, - * we just run down the list to find it rather than tracking it - * in the above loop.
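
process_removes() and process_worklist_item() above synchronize through the INPROGRESS/IOWAITING pair: a thread that finds the work item busy sets IOWAITING and sleeps; the owner wakes any waiters when it finishes and clears INPROGRESS. A sketch of that handshake, with a condition variable standing in for the kernel's msleep()/wakeup() channel (hypothetical, simplified to one item):

#include <pthread.h>
#include <stdio.h>

#define INPROGRESS	0x1
#define IOWAITING	0x2

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t donecv = PTHREAD_COND_INITIALIZER;
static int state;

/* Waiter side: mirrors the msleep() loop in process_removes(). */
static void
wait_if_busy(void)
{

	pthread_mutex_lock(&lk);
	while (state & INPROGRESS) {
		state |= IOWAITING;
		pthread_cond_wait(&donecv, &lk);
	}
	/* Safe to take over the work item here. */
	pthread_mutex_unlock(&lk);
}

/* Owner side: mirrors the wakeup() in process_worklist_item(). */
static void *
worker(void *arg)
{

	pthread_mutex_lock(&lk);
	/* ... process the item ... */
	if (state & IOWAITING)
		pthread_cond_broadcast(&donecv);
	state &= ~(INPROGRESS | IOWAITING);
	pthread_mutex_unlock(&lk);
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	state = INPROGRESS;
	pthread_create(&td, NULL, worker, NULL);
	wait_if_busy();
	pthread_join(td, NULL);
	printf("item processed and claimed\n");
	return (0);
}
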
- */ - WORKLIST_REMOVE(wk); - if (wk == ump->softdep_worklist_tail) { - LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) - if (LIST_NEXT(wkend, wk_list) == NULL) - break; - ump->softdep_worklist_tail = wkend; - } - ump->softdep_on_worklist -= 1; + remove_from_worklist(wk); FREE_LOCK(&lk); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); @@ -952,6 +1305,8 @@ process_worklist_item(mp, flags) case D_DIRREM: /* removal of a directory entry */ handle_workitem_remove(WK_DIRREM(wk), vp); + if (vp) + vput(vp); break; case D_FREEBLKS: @@ -969,6 +1324,11 @@ process_worklist_item(mp, flags) handle_workitem_freefile(WK_FREEFILE(wk)); break; + case D_FREEWORK: + /* Final block in an indirect was freed. */ + handle_workitem_indirblk(WK_FREEWORK(wk)); + break; + default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); @@ -982,19 +1342,22 @@ process_worklist_item(mp, flags) /* * Move dependencies from one buffer to another. */ -void +int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; + int dirty; - if (!LIST_EMPTY(&newbp->b_dep)) - panic("softdep_move_dependencies: need merge code"); - wktail = 0; + dirty = 0; + wktail = NULL; ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); + if (wk->wk_type == D_BMSAFEMAP && + bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) + dirty = 1; if (wktail == 0) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else @@ -1002,6 +1365,8 @@ softdep_move_dependencies(oldbp, newbp) wktail = wk; } FREE_LOCK(&lk); + + return (dirty); } /* @@ -1198,23 +1563,22 @@ pagedep_find(pagedephd, ino, lbn, mp, flags, paged * This routine must be called with splbio interrupts blocked. 
*/ static int -pagedep_lookup(ip, lbn, flags, pagedeppp) - struct inode *ip; +pagedep_lookup(mp, ino, lbn, flags, pagedeppp) + struct mount *mp; + ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; - struct mount *mp; int ret; int i; mtx_assert(&lk, MA_OWNED); - mp = ITOV(ip)->v_mount; - pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); + pagedephd = PAGEDEP_HASH(mp, ino, lbn); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp || (flags & DEPALLOC) == 0) return (ret); FREE_LOCK(&lk); @@ -1222,12 +1586,12 @@ static int M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(&lk); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp) { WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } - pagedep->pd_ino = ip->i_number; + pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); @@ -1314,10 +1678,13 @@ inodedep_lookup(mp, inum, flags, inodedeppp) inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; - inodedep->id_buf = NULL; + inodedep->id_bmsafemap = NULL; + inodedep->id_mkdiradd = NULL; + LIST_INIT(&inodedep->id_dirremhd); LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); + TAILQ_INIT(&inodedep->id_inoreflst); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); @@ -1336,17 +1703,29 @@ u_long newblk_hash; /* size of hash table - 1 */ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) static int -newblk_find(newblkhd, fs, newblkno, newblkpp) +newblk_find(newblkhd, mp, newblkno, flags, newblkpp) struct newblk_hashhead *newblkhd; - struct fs *fs; + struct mount *mp; ufs2_daddr_t newblkno; + int flags; struct newblk **newblkpp; { struct newblk *newblk; - LIST_FOREACH(newblk, newblkhd, nb_hash) - if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) - break; + LIST_FOREACH(newblk, newblkhd, nb_hash) { + if (newblkno != newblk->nb_newblkno) + continue; + if (mp != newblk->nb_list.wk_mp) + continue; + /* + * If we're creating a new dependency don't match those that + * have already been converted to allocdirects. This is for + * a frag extend. + */ + if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) + continue; + break; + } if (newblk) { *newblkpp = newblk; return (1); @@ -1361,8 +1740,8 @@ static int * Found or allocated entry is returned in newblkpp. 
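
pagedep_lookup() above, and newblk_lookup() just below, share a lookup-or-allocate discipline: the softdep lock is dropped around the sleeping malloc(), so the hash must be searched again after relocking and the fresh allocation discarded if another thread won the race. A userspace sketch of that pattern:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int key; };

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static struct node *
find(int key)
{
	struct node *n;

	for (n = head; n != NULL; n = n->next)
		if (n->key == key)
			return (n);
	return (NULL);
}

/*
 * Lookup with optional allocation.  The lock is dropped around
 * calloc() (which may block), so after reacquiring it we search
 * again and free our allocation if another thread inserted the
 * entry first -- the race handling in newblk_lookup() and
 * pagedep_lookup() above.
 */
static struct node *
lookup(int key, int alloc)
{
	struct node *n, *new;

	pthread_mutex_lock(&lk);
	if ((n = find(key)) != NULL || alloc == 0) {
		pthread_mutex_unlock(&lk);
		return (n);
	}
	pthread_mutex_unlock(&lk);
	new = calloc(1, sizeof(*new));	/* may block; lock not held */
	new->key = key;
	pthread_mutex_lock(&lk);
	if ((n = find(key)) != NULL) {	/* lost the race */
		pthread_mutex_unlock(&lk);
		free(new);
		return (n);
	}
	new->next = head;
	head = new;
	pthread_mutex_unlock(&lk);
	return (new);
}

int
main(void)
{

	printf("inserted key %d\n", lookup(7, 1)->key);
	printf("found key %d\n", lookup(7, 1)->key);
	return (0);
}
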
*/ static int -newblk_lookup(fs, newblkno, flags, newblkpp) - struct fs *fs; +newblk_lookup(mp, newblkno, flags, newblkpp) + struct mount *mp; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; @@ -1370,21 +1749,25 @@ static int struct newblk *newblk; struct newblk_hashhead *newblkhd; - newblkhd = NEWBLK_HASH(fs, newblkno); - if (newblk_find(newblkhd, fs, newblkno, newblkpp)) + newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); + if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(&lk); - newblk = malloc(sizeof(struct newblk), - M_NEWBLK, M_SOFTDEP_FLAGS); + newblk = malloc(sizeof(union allblk), M_NEWBLK, + M_SOFTDEP_FLAGS | M_ZERO); + workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); ACQUIRE_LOCK(&lk); - if (newblk_find(newblkhd, fs, newblkno, newblkpp)) { - free(newblk, M_NEWBLK); + if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { + WORKITEM_FREE(newblk, D_NEWBLK); return (1); } - newblk->nb_state = 0; - newblk->nb_fs = fs; + newblk->nb_freefrag = NULL; + LIST_INIT(&newblk->nb_indirdeps); + LIST_INIT(&newblk->nb_newdirblk); + LIST_INIT(&newblk->nb_jwork); + newblk->nb_state = ATTACHED; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; @@ -1401,10 +1784,10 @@ softdep_initialize() LIST_INIT(&mkdirlisthd); max_softdeps = desiredvnodes * 4; - pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, - &pagedep_hash); + pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); - newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); + newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); + bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; @@ -1428,6 +1811,7 @@ softdep_uninitialize() hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); + hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); } /* @@ -1457,9 +1841,16 @@ softdep_mount(devvp, mp, fs, cred) MNT_IUNLOCK(mp); ump = VFSTOUFS(mp); LIST_INIT(&ump->softdep_workitem_pending); + LIST_INIT(&ump->softdep_journal_pending); + TAILQ_INIT(&ump->softdep_unlinked); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; + if ((fs->fs_flags & FS_SUJ) && + (error = journal_mount(mp, fs, cred)) != 0) { + printf("Failed to start journal: %d\n", error); + return (error); + } /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation @@ -1493,7 +1884,1953 @@ softdep_mount(devvp, mp, fs, cred) return (0); } +void +softdep_unmount(mp) + struct mount *mp; +{ + + if (mp->mnt_flag & MNT_SUJ) + journal_unmount(mp); +} + +struct jblocks { + struct jseglst jb_segs; /* TAILQ of current segments. */ + struct jseg *jb_writeseg; /* Next write to complete. */ + struct jextent *jb_extent; /* Extent array. */ + uint64_t jb_nextseq; /* Next sequence number. */ + uint64_t jb_oldestseq; /* Oldest active sequence number. */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ + int jb_blocks; /* Total disk blocks covered. */ + int jb_free; /* Total disk blocks free. */ + int jb_min; /* Minimum free space. */ + int jb_low; /* Low on space. 
*/ + int jb_age; /* Insertion time of oldest rec. */ + int jb_suspended; /* Did journal suspend writes? */ +}; + +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + + jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); + TAILQ_INIT(&jblocks->jb_segs); + jblocks->jb_avail = 10; + jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + + return (jblocks); +} + +static ufs2_daddr_t +jblocks_alloc(jblocks, bytes, actual) + struct jblocks *jblocks; + int bytes; + int *actual; +{ + ufs2_daddr_t daddr; + struct jextent *jext; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + jblocks->jb_head = 0; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + jblocks->jb_off += freecnt; + jblocks->jb_free -= freecnt; + + return (daddr); +} + +static void +jblocks_free(jblocks, mp, bytes) + struct jblocks *jblocks; + struct mount *mp; + int bytes; +{ + + jblocks->jb_free += bytes / DEV_BSIZE; + if (jblocks->jb_suspended) + worklist_speedup(); + wakeup(jblocks); +} + +static void +jblocks_destroy(jblocks) + struct jblocks *jblocks; +{ + + if (jblocks->jb_extent) + free(jblocks->jb_extent, M_JBLOCKS); + free(jblocks, M_JBLOCKS); +} + +static void +jblocks_add(jblocks, daddr, blocks) + struct jblocks *jblocks; + ufs2_daddr_t daddr; + int blocks; +{ + struct jextent *jext; + + jblocks->jb_blocks += blocks; + jblocks->jb_free += blocks; + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + memcpy(jext, jblocks->jb_extent, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent, M_JBLOCKS); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; +} + /* + * Open and verify the journal file. + */ +static int +journal_mount(mp, fs, cred) + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + struct jblocks *jblocks; + struct vnode *vp; + struct inode *ip; + ufs2_daddr_t blkno; + int bcount; + int error; + int i; + + mp->mnt_flag |= MNT_SUJ; + error = VFS_VGET(mp, fs->fs_sujournal, LK_EXCLUSIVE, &vp); + if (error) + return (error); + ip = VTOI(vp); + if (ip->i_size < SUJ_MIN) { + error = ENOSPC; + goto out; + } + bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ + jblocks = jblocks_create(); + for (i = 0; i < bcount; i++) { + error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); + if (error) + break; + jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); + } + if (error) { + jblocks_destroy(jblocks); + goto out; + } + jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. 
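
jblocks_add() above keeps the journal's disk layout as an extent array: a run that starts exactly where the last extent ends is merged into it, and anything else starts a new extent (doubling the array when it fills). A tiny worked example of the contiguity test:

#include <stdint.h>
#include <stdio.h>

struct extent { int64_t daddr; int blocks; };

/*
 * The merge test used by jblocks_add(): a run starting at 'daddr'
 * extends the extent iff it begins exactly where the extent ends.
 */
static int
extends(struct extent *je, int64_t daddr)
{

	return (je->daddr + je->blocks == daddr);
}

int
main(void)
{
	struct extent e = { 100, 8 };

	/* Adding blocks 108..115 coalesces into one extent {100, 16}. */
	if (extends(&e, 108))
		e.blocks += 8;
	printf("extent at %jd covers %d blocks\n", (intmax_t)e.daddr,
	    e.blocks);
	/* A run at 200 is not contiguous and would start a new extent. */
	printf("contiguous at 200? %d\n", extends(&e, 200));
	return (0);
}
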
*/ + jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ + DIP_SET(ip, i_modrev, fs->fs_mtime); + ip->i_flags |= IN_MODIFIED; + ffs_update(vp, 1); + VFSTOUFS(mp)->softdep_jblocks = jblocks; +out: + vput(vp); + return (error); +} + +static void +journal_unmount(mp) + struct mount *mp; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + if (ump->softdep_jblocks) + jblocks_destroy(ump->softdep_jblocks); + ump->softdep_jblocks = NULL; +} + +/* + * Called when a journal record is ready to be written. Space is allocated + * and the journal entry is created when the journal is flushed to stable + * store. + */ +static void +add_to_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + ump = VFSTOUFS(wk->wk_mp); + if (wk->wk_state & ONWORKLIST) + panic("add_to_journal: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); + wk->wk_state |= ONWORKLIST | DEPCOMPLETE; + if (LIST_EMPTY(&ump->softdep_journal_pending)) { + ump->softdep_jblocks->jb_age = ticks; + LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); + } else + LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); + ump->softdep_journal_tail = wk; + ump->softdep_on_journal += 1; +} + +/* + * Remove an arbitrary item from the journal worklist while maintaining + * the tail pointer. This happens when a new operation obviates the need + * to journal an old operation. + */ +static void +remove_from_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + ump = VFSTOUFS(wk->wk_mp); +#ifdef DEBUG /* XXX Expensive, temporary. */ + { + struct worklist *wkn; + + LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) + if (wkn == wk) + break; + if (wkn == NULL) + panic("remove_from_journal: %p is not in journal", wk); + } +#endif + /* + * We emulate a TAILQ to save space in most structures which do not + * require TAILQ semantics. Here we must update the tail pointer + * when removing the current tail so that it points at the previous + * entry. + */ + if (ump->softdep_journal_tail == wk) + ump->softdep_journal_tail = + (struct worklist *)wk->wk_list.le_prev; + + WORKLIST_REMOVE(wk); + ump->softdep_on_journal -= 1; +} + +static int +journal_space(ump, thresh) + struct ufsmount *ump; + int thresh; +{ + struct jblocks *jblocks; + int avail; + + jblocks = ump->softdep_jblocks; + avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; + avail = jblocks->jb_free - avail; + + return (avail > thresh); +} + +static void +journal_suspend(ump) + struct ufsmount *ump; +{ + struct jblocks *jblocks; + struct mount *mp; + + mp = UFSTOVFS(ump); + jblocks = ump->softdep_jblocks; + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { + mp->mnt_kern_flag |= MNTK_SUSPEND; + mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); + } + jblocks->jb_suspended = 1; + MNT_IUNLOCK(mp); +} + +/* + * Called before any allocation function to be certain that there is + * sufficient space in the journal prior to creating any new records. + * Since in the case of block allocation we may have multiple locked + * buffers at the time of the actual allocation we cannot block + * when the journal records are created. Doing so would create a deadlock + * if any of these buffers needed to be flushed to reclaim space. Instead + * we require a sufficiently large amount of available space such that + * each thread in the system could have passed this allocation check and + * still have sufficient free space. With 20% of a minimum journal size + * of 1MB we have 6553 records available.
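
The (struct worklist *)wk->wk_list.le_prev cast in remove_from_journal() leans on two layout guarantees: a LIST entry's le_prev points at the previous element's le_next field, and wk_list is the first member of struct worklist, so that field's address is the element's address. (When the removed tail is also the first element the cast yields the list head instead, but the list is then empty and add_to_journal() re-seeds the tail through its LIST_EMPTY() branch before the stale value is ever used.) A compilable demonstration:

#include <sys/queue.h>
#include <stdio.h>

struct worklist {
	LIST_ENTRY(worklist) wk_list;	/* must be the first member */
	int wk_id;
};
LIST_HEAD(workhead, worklist);

int
main(void)
{
	struct workhead head = LIST_HEAD_INITIALIZER(head);
	struct worklist a = { .wk_id = 1 }, b = { .wk_id = 2 };
	struct worklist *prev;

	LIST_INSERT_HEAD(&head, &a, wk_list);
	LIST_INSERT_AFTER(&a, &b, wk_list);
	/*
	 * b's le_prev points at a's le_next field; because the entry
	 * sits at offset zero, the cast recovers 'a' itself.
	 */
	prev = (struct worklist *)b.wk_list.le_prev;
	printf("previous element id: %d\n", prev->wk_id);	/* prints 1 */
	return (0);
}
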
+ */ +int +softdep_prealloc(vp, waitok) + struct vnode *vp; + int waitok; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + + if (DOINGSUJ(vp) == 0) + return (0); + ump = VFSTOUFS(vp->v_mount); + jblocks = ump->softdep_jblocks; + ACQUIRE_LOCK(&lk); + if (journal_space(ump, jblocks->jb_low)) { + FREE_LOCK(&lk); + return (0); + } + FREE_LOCK(&lk); + if (waitok == MNT_NOWAIT) + return (ENOSPC); + /* + * Attempt to sync this vnode once to flush any journal + * work attached to it. + */ + ffs_syncvnode(vp, waitok); + ACQUIRE_LOCK(&lk); + process_removes(vp); + if (journal_space(ump, jblocks->jb_low) == 0) { + softdep_speedup(); + if (journal_space(ump, jblocks->jb_min) == 0) + journal_suspend(ump); + } + FREE_LOCK(&lk); + + return (0); +} + +static void +softdep_prelink(dvp, vp) + struct vnode *dvp; + struct vnode *vp; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + + ump = VFSTOUFS(dvp->v_mount); + jblocks = ump->softdep_jblocks; + mtx_assert(&lk, MA_OWNED); + if (journal_space(ump, jblocks->jb_low)) + return; + FREE_LOCK(&lk); + if (vp) + ffs_syncvnode(vp, MNT_NOWAIT); + ffs_syncvnode(dvp, MNT_WAIT); + ACQUIRE_LOCK(&lk); + /* Process vp before dvp as it may create .. removes. */ + if (vp) + process_removes(vp); + process_removes(dvp); + softdep_speedup(); + process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); + process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); + if (journal_space(ump, jblocks->jb_low) == 0) { + softdep_speedup(); + if (journal_space(ump, jblocks->jb_min) == 0) + journal_suspend(ump); + } +} + +static void +jseg_write(fs, jblocks, jseg, data) + struct fs *fs; + struct jblocks *jblocks; + struct jseg *jseg; + uint8_t *data; +{ + struct jsegrec *rec; + + rec = (struct jsegrec *)data; + rec->jsr_seq = jseg->js_seq; + rec->jsr_oldest = jblocks->jb_oldestseq; + rec->jsr_cnt = jseg->js_cnt; + rec->jsr_crc = 0; + rec->jsr_time = fs->fs_mtime; +} + +static inline void +inoref_write(inoref, rec) + struct inoref *inoref; + struct jrefrec *rec; +{ + rec->jr_ino = inoref->if_ino; + rec->jr_parent = inoref->if_parent; + rec->jr_nlink = inoref->if_nlink; + rec->jr_mode = inoref->if_mode; + rec->jr_diroff = inoref->if_diroff; +} + +static void +jaddref_write(jaddref, data) + struct jaddref *jaddref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_ADDREF; + inoref_write(&jaddref->ja_ref, rec); +} + +static void +jremref_write(jremref, data) + struct jremref *jremref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_REMREF; + inoref_write(&jremref->jr_ref, rec); +} + +static void +jmvref_write(jmvref, data) + struct jmvref *jmvref; + uint8_t *data; +{ + struct jmvrec *rec; + + rec = (struct jmvrec *)data; + rec->jm_op = JOP_MVREF; + rec->jm_ino = jmvref->jm_ino; + rec->jm_parent = jmvref->jm_parent; + rec->jm_oldoff = jmvref->jm_oldoff; + rec->jm_newoff = jmvref->jm_newoff; +} + +static void +jnewblk_write(jnewblk, data) + struct jnewblk *jnewblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_NEWBLK; + rec->jb_ino = jnewblk->jn_ino; + rec->jb_blkno = jnewblk->jn_blkno; + rec->jb_lbn = jnewblk->jn_lbn; + rec->jb_frags = jnewblk->jn_frags; + rec->jb_oldfrags = jnewblk->jn_oldfrags; +} + +static void +jfreeblk_write(jfreeblk, data) + struct jfreeblk *jfreeblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreeblk->jf_ino; + rec->jb_blkno = jfreeblk->jf_blkno; + 
	rec->jb_lbn = jfreeblk->jf_lbn;
+	rec->jb_frags = jfreeblk->jf_frags;
+	rec->jb_oldfrags = 0;
+}
+
+static void
+jfreefrag_write(jfreefrag, data)
+	struct jfreefrag *jfreefrag;
+	uint8_t *data;
+{
+	struct jblkrec *rec;
+
+	rec = (struct jblkrec *)data;
+	rec->jb_op = JOP_FREEBLK;
+	rec->jb_ino = jfreefrag->fr_ino;
+	rec->jb_blkno = jfreefrag->fr_blkno;
+	rec->jb_lbn = jfreefrag->fr_lbn;
+	rec->jb_frags = jfreefrag->fr_frags;
+	rec->jb_oldfrags = 0;
+}
+
+static void
+jtrunc_write(jtrunc, data)
+	struct jtrunc *jtrunc;
+	uint8_t *data;
+{
+	struct jtrncrec *rec;
+
+	rec = (struct jtrncrec *)data;
+	rec->jt_op = JOP_TRUNC;
+	rec->jt_ino = jtrunc->jt_ino;
+	rec->jt_size = jtrunc->jt_size;
+	rec->jt_extsize = jtrunc->jt_extsize;
+}
+
+/*
+ * Flush some journal records to disk.
+ */
+static void
+softdep_process_journal(mp, flags)
+	struct mount *mp;
+	int flags;
+{
+	struct jblocks *jblocks;
+	struct ufsmount *ump;
+	struct worklist *wk;
+	struct jseg *jseg;
+	struct buf *bp;
+	uint8_t *data;
+	struct fs *fs;
+	int segwritten;
+	int jrecmin;	/* Minimum write size. */
+	int jrecmax;	/* Maximum write size. */
+	int size;
+	int cnt;
+
+	if ((mp->mnt_flag & MNT_SUJ) == 0)
+		return;
+	ump = VFSTOUFS(mp);
+	fs = ump->um_fs;
+	jblocks = ump->softdep_jblocks;
+	/*
+	 * We write anywhere between a disk block and an fs block.  The upper
+	 * bound is picked to prevent buffer cache fragmentation and limit
+	 * processing time per I/O.
+	 */
+	jrecmax = fs->fs_bsize / JREC_SIZE;
+	jrecmin = DEV_BSIZE / JREC_SIZE;
+	segwritten = 0;
+	while ((cnt = ump->softdep_on_journal) != 0) {
+		/*
+		 * Create a new segment to hold as many as 'cnt' journal
+		 * entries and add them to the segment.  Notice cnt is
+		 * off by one to account for the space required by the
+		 * jsegrec.  If we don't have a full block to log skip it
+		 * unless we haven't written anything in 10 seconds.
+		 */
+		cnt++;
+		if (cnt < jrecmax) {
+			if (segwritten)
+				return;
+			if (flags != MNT_WAIT &&
+			    (ticks - jblocks->jb_age) < hz*10)
+				break;
+		}
+		/*
+		 * Verify some free journal space.  softdep_prealloc() should
+		 * guarantee that we don't run out so this is indicative of
+		 * a problem with the flow control.  Try to recover
+		 * gracefully in any event.
+		 */
+		while (jblocks->jb_free == 0) {
+			if (flags != MNT_WAIT)
+				break;
+			printf("softdep: Out of journal space!\n");
+			softdep_speedup();
+			msleep(jblocks, &lk, PRIBIO, "jblocks", 1);
+		}
+		FREE_LOCK(&lk);
+		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
+		workitem_alloc(&jseg->js_list, D_JSEG, mp);
+		LIST_INIT(&jseg->js_entries);
+		jseg->js_state = ATTACHED;
+		jseg->js_refs = 1;	/* Self reference. */
+		jseg->js_jblocks = jblocks;
+		size = roundup2(cnt * JREC_SIZE, DEV_BSIZE);
+		bp = geteblk(fs->fs_bsize, 0);
+		ACQUIRE_LOCK(&lk);
+		/*
+		 * If there was a race while we were allocating the block
+		 * and jseg, the entry we care about was likely written.
+		 * We bail out in both the WAIT and NOWAIT case and assume
+		 * the caller will loop if the entry it cares about is
+		 * not written.
+		 */
+		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
+			bp->b_flags |= B_INVAL | B_NOCACHE;
+			WORKITEM_FREE(jseg, D_JSEG);
+			FREE_LOCK(&lk);
+			brelse(bp);
+			ACQUIRE_LOCK(&lk);
+			break;
+		}
+		/*
+		 * Calculate the disk block size required for the available
+		 * records rounded to the min size.
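+		 * For example, with 16K fs blocks and 32-byte records
+		 * jrecmin is 16 and jrecmax is 512: 100 pending records
+		 * round up to 112 (seven disk blocks) while 511 or more
+		 * are capped at a single fs block.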
+ */ + cnt = ump->softdep_on_journal + 1; + if (cnt < jrecmax) + cnt = roundup2(cnt, jrecmin); + else + cnt = jrecmax; + size = cnt * JREC_SIZE; + /* + * Allocate a disk block for this journal data and account + * for truncation of the requested size if enough contiguous + * space was not available. + */ + bp->b_blkno = bp->b_lblkno = jblocks_alloc(jblocks, size, + &size); + bp->b_offset = bp->b_blkno * DEV_BSIZE; + bp->b_bcount = size; + bp->b_bufobj = &ump->um_devvp->v_bufobj; + bp->b_flags &= ~B_INVAL; + /* + * Initialize our jseg with as many as cnt - 1 records. + * Assign the next sequence number to it and link it + * in-order. + */ + cnt = MIN(ump->softdep_on_journal, (size / JREC_SIZE) - 1); + jseg->js_buf = bp; + jseg->js_cnt = cnt; + jseg->js_size = size; + jseg->js_seq = jblocks->jb_nextseq++; + if (TAILQ_EMPTY(&jblocks->jb_segs)) + jblocks->jb_oldestseq = jseg->js_seq; + TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); + if (jblocks->jb_writeseg == NULL) + jblocks->jb_writeseg = jseg; + /* + * Start filling in records from the pending list. + */ + data = bp->b_data; + jseg_write(fs, jblocks, jseg, data); + data += JREC_SIZE; + while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) + != NULL) { + remove_from_journal(wk); + wk->wk_state |= IOSTARTED; + WORKLIST_INSERT(&jseg->js_entries, wk); + switch (wk->wk_type) { + case D_JADDREF: + jaddref_write(WK_JADDREF(wk), data); + break; + case D_JREMREF: + jremref_write(WK_JREMREF(wk), data); + break; + case D_JMVREF: + jmvref_write(WK_JMVREF(wk), data); + break; + case D_JNEWBLK: + jnewblk_write(WK_JNEWBLK(wk), data); + break; + case D_JFREEBLK: + jfreeblk_write(WK_JFREEBLK(wk), data); + break; + case D_JFREEFRAG: + jfreefrag_write(WK_JFREEFRAG(wk), data); + break; + case D_JTRUNC: + jtrunc_write(WK_JTRUNC(wk), data); + break; + default: + panic("process_journal: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + data += JREC_SIZE; + if (--cnt == 0) + break; + } + /* + * Write this one buffer and continue. + */ +#if 1 + WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); + FREE_LOCK(&lk); + BO_LOCK(bp->b_bufobj); + bgetvp(ump->um_devvp, bp); + BO_UNLOCK(bp->b_bufobj); + /* XXX Could bawrite here. */ + bwrite(bp); + ACQUIRE_LOCK(&lk); +#else + /* This case simulates the write but does not log anything. */ + handle_written_jseg(jseg, bp); + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); +#endif + segwritten++; + } + /* + * If we've suspended the filesystem because we ran out of journal + * space either try to sync it here to make some progress or + * unsuspend it if we already have. + */ + if (flags == 0 && jblocks && jblocks->jb_suspended) { + if (journal_space(ump, jblocks->jb_min)) { + FREE_LOCK(&lk); + jblocks->jb_suspended = 0; + mp->mnt_susp_owner = curthread; + vfs_write_resume(mp); + ACQUIRE_LOCK(&lk); + return; + } + FREE_LOCK(&lk); + VFS_SYNC(mp, MNT_NOWAIT); + ffs_sbupdate(ump, MNT_WAIT, 0); + ACQUIRE_LOCK(&lk); + } +} + +/* + * Complete a jseg, allowing all dependencies awaiting journal writes + * to proceed. Each journal dependency also attaches a jsegdep to dependent + * structures so that the journal segment can be freed to reclaim space. 
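+ *
+ * The reference protocol, roughly: a jseg is born with one self
+ * reference in softdep_process_journal(), complete_jseg() takes one
+ * additional reference per entry as it hands out jsegdeps (move
+ * records carry no jsegdep and give theirs back), the self reference
+ * is dropped once the entries are processed, and free_jsegdep()
+ * releases the per-entry references.  Only when the count reaches
+ * zero and no older segment remains is the space reclaimed by
+ * free_jseg().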
+ */
+static void
+complete_jseg(jseg)
+	struct jseg *jseg;
+{
+	struct worklist *wk;
+	struct jmvref *jmvref;
+	int waiting;
+	int i;
+
+	i = 0;
+	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
+		WORKLIST_REMOVE(wk);
+		waiting = wk->wk_state & IOWAITING;
+		wk->wk_state &= ~(IOSTARTED | IOWAITING);
+		wk->wk_state |= COMPLETE;
+		KASSERT(i++ < jseg->js_cnt,
+		    ("handle_written_jseg: overflow %d >= %d",
+		    i, jseg->js_cnt));
+		jseg->js_refs++;	/* Ref goes to the jsegdep below. */
+		switch (wk->wk_type) {
+		case D_JADDREF:
+			handle_written_jaddref(WK_JADDREF(wk), jseg);
+			break;
+		case D_JREMREF:
+			handle_written_jremref(WK_JREMREF(wk), jseg);
+			break;
+		case D_JMVREF:
+			jseg->js_refs--;	/* No jsegdep here. */
+			jmvref = WK_JMVREF(wk);
+			LIST_REMOVE(jmvref, jm_deps);
+			free_pagedep(jmvref->jm_pagedep);
+			WORKITEM_FREE(jmvref, D_JMVREF);
+			break;
+		case D_JNEWBLK:
+			handle_written_jnewblk(WK_JNEWBLK(wk), jseg);
+			break;
+		case D_JFREEBLK:
+			handle_written_jfreeblk(WK_JFREEBLK(wk), jseg);
+			break;
+		case D_JFREEFRAG:
+			handle_written_jfreefrag(WK_JFREEFRAG(wk), jseg);
+			break;
+		case D_JTRUNC:
+			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
+			WORKITEM_FREE(wk, D_JTRUNC);
+			break;
+		default:
+			panic("handle_written_jseg: Unknown type %s",
+			    TYPENAME(wk->wk_type));
+			/* NOTREACHED */
+		}
+		if (waiting)
+			wakeup(wk);
+	}
+	/* Release the self reference so the structure may be freed. */
+	free_jseg(jseg);
+}
+
+/*
+ * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
+ * completions in order only.
+ */
+static void
+handle_written_jseg(jseg, bp)
+	struct jseg *jseg;
+	struct buf *bp;
+{
+	struct jblocks *jblocks;
+	struct jseg *jsegn;
+
+	if (jseg->js_refs == 0)
+		panic("handle_written_jseg: No self-reference on %p", jseg);
+	jseg->js_state |= DEPCOMPLETE;
+	/*
+	 * We'll never need this buffer again, set flags so it will be
+	 * discarded.
+	 */
+	bp->b_flags |= B_INVAL | B_NOCACHE;
+	jblocks = jseg->js_jblocks;
+	/*
+	 * Don't allow out of order completions.  If this isn't the first
+	 * block wait for it to write before we're done.
+	 */
+	if (jseg != jblocks->jb_writeseg)
+		return;
+	/* Iterate through available jsegs processing their entries. */
+	do {
+		jsegn = TAILQ_NEXT(jseg, js_next);
+		complete_jseg(jseg);
+		jseg = jsegn;
+	} while (jseg && jseg->js_state & DEPCOMPLETE);
+	jblocks->jb_writeseg = jseg;
+}
+
+static inline struct jsegdep *
+inoref_segattach(inoref, jseg)
+	struct inoref *inoref;
+	struct jseg *jseg;
+{
+	struct jsegdep *jsegdep;
+
+	jsegdep = inoref->if_jsegdep;
+	inoref->if_jsegdep = NULL;
+	jsegdep->jd_seg = jseg;
+
+	return (jsegdep);
+}
+
+/*
+ * Called once a jremref has made it to stable store.  The jremref is marked
+ * complete and we attempt to free it.  Any pagedep writes sleeping waiting
+ * for the jremref to complete will be awoken by free_jremref.
+ */
+static void
+handle_written_jremref(jremref, jseg)
+	struct jremref *jremref;
+	struct jseg *jseg;
+{
+	struct inodedep *inodedep;
+	struct jsegdep *jsegdep;
+	struct dirrem *dirrem;
+
+	/*
+	 * Attach the jsegdep to the jseg.
+	 */
+	jsegdep = inoref_segattach(&jremref->jr_ref, jseg);
+	/*
+	 * Remove us from the inoref list.
+	 */
+	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
+	    0, &inodedep) == 0)
+		panic("handle_written_jremref: Lost inodedep");
+	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+	/*
+	 * Complete the dirrem.
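+	 * The dirrem may move to the worklist only once its last jremref
+	 * has been written (dm_jremrefhd drained) and the directory write
+	 * itself has completed (COMPLETE set); both conditions are
+	 * checked below.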
+ */ + dirrem = jremref->jr_dirrem; + jremref->jr_dirrem = NULL; + LIST_REMOVE(jremref, jr_deps); + jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; + WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd) && + (dirrem->dm_state & COMPLETE) != 0) + add_to_worklist(&dirrem->dm_list, 0); + free_jremref(jremref); +} + +/* + * Called once a jaddref has made it to stable store. The dependency is + * marked complete and any dependent structures are added to the inode + * bufwait list to be completed as soon as it is written. If a bitmap write + * depends on this entry we move the inode into the inodedephd of the + * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. + */ +static void +handle_written_jaddref(jaddref, jseg) + struct jaddref *jaddref; + struct jseg *jseg; +{ + struct jsegdep *jsegdep; + struct inodedep *inodedep; + struct diradd *diradd; + struct mkdir *mkdir; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = inoref_segattach(&jaddref->ja_ref, jseg); + mkdir = NULL; + diradd = NULL; + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("handle_written_jaddref: Lost inodedep."); + if (jaddref->ja_diradd == NULL) + panic("handle_written_jaddref: No dependency"); + if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { + diradd = jaddref->ja_diradd; + WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); + } else if (jaddref->ja_state & MKDIR_PARENT) { + mkdir = jaddref->ja_mkdir; + WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); + } else if (jaddref->ja_state & MKDIR_BODY) + mkdir = jaddref->ja_mkdir; + else + panic("handle_written_jaddref: Unknown dependency %p", + jaddref->ja_diradd); + jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ + /* + * Remove us from the inode list. + */ + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); + /* + * The mkdir may be waiting on the jaddref to clear before freeing. + */ + if (mkdir) { + KASSERT(mkdir->md_list.wk_type == D_MKDIR, + ("handle_written_jaddref: Incorrect type for mkdir %s", + TYPENAME(mkdir->md_list.wk_type))); + mkdir->md_jaddref = NULL; + diradd = mkdir->md_diradd; + mkdir->md_state |= DEPCOMPLETE; + complete_mkdir(mkdir); + } + WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list); + if (jaddref->ja_state & NEWBLOCK) { + inodedep->id_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, + inodedep, id_deps); + } + free_jaddref(jaddref); +} + +/* + * Called once a jnewblk journal is written. The allocdirect or allocindir + * is placed in the bmsafemap to await notification of a written bitmap. + */ +static void +handle_written_jnewblk(jnewblk, jseg) + struct jnewblk *jnewblk; + struct jseg *jseg; +{ + struct bmsafemap *bmsafemap; + struct jsegdep *jsegdep; + struct newblk *newblk; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = jnewblk->jn_jsegdep; + jnewblk->jn_jsegdep = NULL; + jsegdep->jd_seg = jseg; + /* + * Add the written block to the bmsafemap so it can be notified when + * the bitmap is on disk. 
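+	 *
+	 * The intent of this ordering is that the jnewblk record reaches
+	 * stable storage before the bitmap may show the block in use, so
+	 * recovery can distinguish a half-applied allocation from a stale
+	 * map rather than leaking the block.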
+ */ + newblk = jnewblk->jn_newblk; + jnewblk->jn_newblk = NULL; + if (newblk == NULL) + panic("handle_written_jnewblk: No dependency for the segdep."); + + newblk->nb_jnewblk = NULL; + bmsafemap = newblk->nb_bmsafemap; + WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list); + newblk->nb_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + free_jnewblk(jnewblk); +} + +/* + * Cancel a jfreefrag that won't be needed, probably due to colliding with + * an in-flight allocation that has not yet been committed. Divorce us + * from the freefrag and mark it DEPCOMPLETE so that it may be added + * to the worklist. + */ +static void +cancel_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + struct freefrag *freefrag; + + if (jfreefrag->fr_jsegdep) { + free_jsegdep(jfreefrag->fr_jsegdep); + jfreefrag->fr_jsegdep = NULL; + } + freefrag = jfreefrag->fr_freefrag; + jfreefrag->fr_freefrag = NULL; + freefrag->ff_jfreefrag = NULL; + free_jfreefrag(jfreefrag); + freefrag->ff_state |= DEPCOMPLETE; +} + +/* + * Free a jfreefrag when the parent freefrag is rendered obsolete. + */ +static void +free_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + + if (jfreefrag->fr_state & IOSTARTED) + WORKLIST_REMOVE(&jfreefrag->fr_list); + else if (jfreefrag->fr_state & ONWORKLIST) + remove_from_journal(&jfreefrag->fr_list); + if (jfreefrag->fr_freefrag != NULL) + panic("free_jfreefrag: Still attached to a freefrag."); + WORKITEM_FREE(jfreefrag, D_JFREEFRAG); +} + +/* + * Called when the journal write for a jfreefrag completes. The parent + * freefrag is added to the worklist if this completes its dependencies. + */ +static void +handle_written_jfreefrag(jfreefrag, jseg) + struct jfreefrag *jfreefrag; + struct jseg *jseg; +{ + struct jsegdep *jsegdep; + struct freefrag *freefrag; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = jfreefrag->fr_jsegdep; + jfreefrag->fr_jsegdep = NULL; + jsegdep->jd_seg = jseg; + freefrag = jfreefrag->fr_freefrag; + if (freefrag == NULL) + panic("handle_written_jfreefrag: No freefrag."); + freefrag->ff_state |= DEPCOMPLETE; + freefrag->ff_jfreefrag = NULL; + WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); + jfreefrag->fr_freefrag = NULL; + free_jfreefrag(jfreefrag); +} + +/* + * Called when the journal write for a jfreeblk completes. The jfreeblk + * is removed from the freeblks list of pending journal writes and the + * jsegdep is moved to the freeblks jwork to be completed when all blocks + * have been reclaimed. + */ +static void +handle_written_jfreeblk(jfreeblk, jseg) + struct jfreeblk *jfreeblk; + struct jseg *jseg; +{ + struct freeblks *freeblks; + struct jsegdep *jsegdep; + + /* Attach the jsegdep to the jseg. */ + jsegdep = jfreeblk->jf_jsegdep; + jfreeblk->jf_jsegdep = NULL; + jsegdep->jd_seg = jseg; + freeblks = jfreeblk->jf_freeblks; + LIST_REMOVE(jfreeblk, jf_deps); + WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); + /* + * If the freeblks is all journaled, we can add it to the worklist. + */ + if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) && + (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) { + /* Remove from the b_dep that is waiting on this write. 
 */
+		if (freeblks->fb_state & ONWORKLIST)
+			WORKLIST_REMOVE(&freeblks->fb_list);
+		add_to_worklist(&freeblks->fb_list, 1);
+	}
+
+	free_jfreeblk(jfreeblk);
+}
+
+static struct jsegdep *
+newjsegdep(struct worklist *wk)
+{
+	struct jsegdep *jsegdep;
+
+	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
+	jsegdep->jd_seg = NULL;
+
+	return (jsegdep);
+}
+
+static struct jmvref *
+newjmvref(dp, ino, oldoff, newoff)
+	struct inode *dp;
+	ino_t ino;
+	off_t oldoff;
+	off_t newoff;
+{
+	struct jmvref *jmvref;
+
+	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
+	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
+	jmvref->jm_parent = dp->i_number;
+	jmvref->jm_ino = ino;
+	jmvref->jm_oldoff = oldoff;
+	jmvref->jm_newoff = newoff;
+
+	return (jmvref);
+}
+
+/*
+ * Allocate a new jremref that tracks the removal of ip from dp with the
+ * directory entry offset of diroff.  Mark the entry as ATTACHED and
+ * DEPCOMPLETE as we have all the information required for the journal write
+ * and the directory entry has already been removed from the buffer.  The
+ * caller is responsible for linking the jremref into the pagedep and adding
+ * it to the journal to write.  The MKDIR_PARENT flag is set if we're doing
+ * a DOTDOT addition so handle_workitem_remove() can properly assign
+ * the jsegdep when we're done.
+ */
+static struct jremref *
+newjremref(dirrem, dp, ip, diroff, nlink)
+	struct dirrem *dirrem;
+	struct inode *dp;
+	struct inode *ip;
+	off_t diroff;
+	nlink_t nlink;
+{
+	struct jremref *jremref;
+
+	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
+	jremref->jr_state = ATTACHED;
+	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
+	    nlink, ip->i_mode);
+	jremref->jr_dirrem = dirrem;
+
+	return (jremref);
+}
+
+static inline void
+newinoref(inoref, ino, parent, diroff, nlink, mode)
+	struct inoref *inoref;
+	ino_t ino;
+	ino_t parent;
+	off_t diroff;
+	nlink_t nlink;
+	uint16_t mode;
+{
+
+	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
+	inoref->if_diroff = diroff;
+	inoref->if_ino = ino;
+	inoref->if_parent = parent;
+	inoref->if_nlink = nlink;
+	inoref->if_mode = mode;
+}
+
+/*
+ * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
+ * directory offset may not be known until later.  The caller is responsible
+ * for adding the entry to the journal when this information is available.
+ * nlink should be the link count prior to the addition and mode is only
+ * required to have the correct FMT.
+ */
+static struct jaddref *
+newjaddref(dp, ino, diroff, nlink, mode)
+	struct inode *dp;
+	ino_t ino;
+	off_t diroff;
+	int16_t nlink;
+	uint16_t mode;
+{
+	struct jaddref *jaddref;
+
+	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
+	jaddref->ja_state = ATTACHED;
+	jaddref->ja_mkdir = NULL;
+	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
+
+	return (jaddref);
+}
+
+/*
+ * Create a new free dependency for a freework.  The caller is responsible
+ * for adjusting the reference count when it has the lock held.  The freedep
+ * will track an outstanding bitmap write that will ultimately clear the
+ * freework to continue.
+ */ +static struct freedep * +newfreedep(struct freework *freework) +{ + struct freedep *freedep; + + freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); + freedep->fd_freework = freework; + + return (freedep); +} + +/* + * Free a freedep structure once the buffer it is linked to is written. If + * this is the last reference to the freework schedule it for completion. + */ +static void +free_freedep(freedep) + struct freedep *freedep; +{ + + if (--freedep->fd_freework->fw_ref == 0) + add_to_worklist(&freedep->fd_freework->fw_list, 1); + WORKITEM_FREE(freedep, D_FREEDEP); +} + +/* + * Allocate a new freework structure that may be a level in an indirect + * when parent is not NULL or a top level block when it is. The top level + * freework structures are allocated without lk held and before the freeblks + * is visible outside of softdep_setup_freeblocks(). + */ +static struct freework * +newfreework(freeblks, parent, lbn, nb, frags, journal) + struct freeblks *freeblks; + struct freework *parent; + ufs_lbn_t lbn; + ufs2_daddr_t nb; + int frags; + int journal; +{ + struct freework *freework; + + freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); + workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); + freework->fw_freeblks = freeblks; + freework->fw_parent = parent; + freework->fw_lbn = lbn; + freework->fw_blkno = nb; + freework->fw_frags = frags; + freework->fw_ref = 0; + freework->fw_off = 0; + LIST_INIT(&freework->fw_jwork); + + if (parent == NULL) { + WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd, + &freework->fw_list); + freeblks->fb_ref++; + } + if (journal) + newjfreeblk(freeblks, lbn, nb, frags); + + return (freework); +} + +/* + * Allocate a new jfreeblk to journal top level block pointer when truncating + * a file. The caller must add this to the worklist when lk is held. + */ +static struct jfreeblk * +newjfreeblk(freeblks, lbn, blkno, frags) + struct freeblks *freeblks; + ufs_lbn_t lbn; + ufs2_daddr_t blkno; + int frags; +{ + struct jfreeblk *jfreeblk; + + jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp); + jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list); + jfreeblk->jf_state = ATTACHED | DEPCOMPLETE; + jfreeblk->jf_ino = freeblks->fb_previousinum; + jfreeblk->jf_lbn = lbn; + jfreeblk->jf_blkno = blkno; + jfreeblk->jf_frags = frags; + jfreeblk->jf_freeblks = freeblks; + LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps); + + return (jfreeblk); +} + +static void move_newblock_dep(struct jaddref *, struct inodedep *); +/* + * If we're canceling a new bitmap we have to search for another ref + * to move into the bmsafemap dep. This might be better expressed + * with another structure. 
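+ *
+ * For example, two links taken on a newly allocated inode produce two
+ * jaddrefs on the inodedep's inoref list, but only one carries the
+ * NEWBLOCK dependency that holds up the inode bitmap write.  If that
+ * reference is canceled before the bitmap is written, the dependency
+ * must be handed to a surviving jaddref, which is what the search
+ * below does.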
+ */ +static void +move_newblock_dep(jaddref, inodedep) + struct jaddref *jaddref; + struct inodedep *inodedep; +{ + struct inoref *inoref; + struct jaddref *jaddrefn; + + jaddrefn = NULL; + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) { + if ((jaddref->ja_state & NEWBLOCK) && + inoref->if_list.wk_type == D_JADDREF) { + jaddrefn = (struct jaddref *)inoref; + break; + } + } + if (jaddrefn == NULL) + return; + if (inodedep == NULL) + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("move_newblock_dep: Lost inodedep"); + jaddrefn->ja_state &= ~(ATTACHED | UNDONE); + jaddrefn->ja_state |= jaddref->ja_state & + (ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state |= ATTACHED; + LIST_REMOVE(jaddref, ja_bmdeps); + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, + ja_bmdeps); +} + +/* + * Cancel a jaddref either before it has been written or while it is being + * written. This happens when a link is removed before the add reaches + * the disk. The jaddref dependency is kept linked into the bmsafemap + * and inode to prevent the link count or bitmap from reaching the disk + * until handle_workitem_remove() re-adjusts the counts and bitmaps as + * required. + * + * Returns 1 if the canceled addref requires journaling of the remove and + * 0 otherwise. + */ +static int +cancel_jaddref(jaddref, inodedep, wkhd) + struct jaddref *jaddref; + struct inodedep *inodedep; + struct workhead *wkhd; +{ + struct inoref *inoref; + int needsj; + + KASSERT((jaddref->ja_state & COMPLETE) == 0, + ("cancel_jaddref: Canceling complete jaddref")); + if (jaddref->ja_state & (IOSTARTED | COMPLETE)) + needsj = 1; + else + needsj = 0; + /* + * If we're not journaling this remove we must adjust the nlink of + * any reference operation that follows us so that it is consistent + * with the in-memory reference. + */ + if (needsj == 0) + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) + inoref->if_nlink--; + if (jaddref->ja_ref.if_jsegdep) { + free_jsegdep(jaddref->ja_ref.if_jsegdep); + jaddref->ja_ref.if_jsegdep = NULL; + } + if (jaddref->ja_state & NEWBLOCK) + move_newblock_dep(jaddref, inodedep); + if (jaddref->ja_state & IOWAITING) { + jaddref->ja_state &= ~IOWAITING; + wakeup(&jaddref->ja_list); + } + jaddref->ja_mkdir = NULL; + if (jaddref->ja_state & IOSTARTED) { + jaddref->ja_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jaddref->ja_list); + } else + remove_from_journal(&jaddref->ja_list); + jaddref->ja_state |= GOINGAWAY; + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jaddref->ja_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jaddref->ja_list); + + return (needsj); +} + +/* + * Attempt to free a jaddref structure when some work completes. This + * should only succeed once the entry is written and all dependencies have + * been notified. 
+ */ +static void +free_jaddref(jaddref) + struct jaddref *jaddref; +{ + + if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + if (jaddref->ja_ref.if_jsegdep) + panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", + jaddref, jaddref->ja_state); + if (jaddref->ja_state & NEWBLOCK) + LIST_REMOVE(jaddref, ja_bmdeps); + if (jaddref->ja_state & (IOSTARTED | ONWORKLIST)) + panic("free_jaddref: Bad state %p(0x%X)", + jaddref, jaddref->ja_state); + if (jaddref->ja_mkdir != NULL) + panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); + WORKITEM_FREE(jaddref, D_JADDREF); +} + +/* + * Free a jremref structure once it has been written or discarded. + */ +static void +free_jremref(jremref) + struct jremref *jremref; +{ + + if (jremref->jr_ref.if_jsegdep) + free_jsegdep(jremref->jr_ref.if_jsegdep); + if (jremref->jr_state & IOSTARTED) + panic("free_jremref: IO still pending"); + WORKITEM_FREE(jremref, D_JREMREF); +} + +/* + * Free a jnewblk structure. + */ +static void +free_jnewblk(jnewblk) + struct jnewblk *jnewblk; +{ + + if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(jnewblk, jn_deps); + if (jnewblk->jn_newblk != NULL) + panic("free_jnewblk: Dependency still attached."); + WORKITEM_FREE(jnewblk, D_JNEWBLK); +} + +/* + * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk + * is kept linked into the bmsafemap until the free completes, thus + * preventing the modified state from ever reaching disk. The free + * routine must pass this structure via ffs_blkfree() to + * softdep_setup_freeblks() so there is no race in releasing the space. + */ +static void +cancel_jnewblk(jnewblk, wkhd) + struct jnewblk *jnewblk; + struct workhead *wkhd; +{ + + if (jnewblk->jn_jsegdep) { + free_jsegdep(jnewblk->jn_jsegdep); + jnewblk->jn_jsegdep = NULL; + } + if (jnewblk->jn_state & IOWAITING) { + jnewblk->jn_state &= ~IOWAITING; + wakeup(&jnewblk->jn_list); + } + jnewblk->jn_newblk = NULL; + jnewblk->jn_state |= GOINGAWAY; + if (jnewblk->jn_state & IOSTARTED) { + jnewblk->jn_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jnewblk->jn_list); + } else + remove_from_journal(&jnewblk->jn_list); + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jnewblk->jn_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jnewblk->jn_list); +} + +static void +free_jfreeblk(jfreeblk) + struct jfreeblk *jfreeblk; +{ + + WORKITEM_FREE(jfreeblk, D_JFREEBLK); +} + +/* + * Release one reference to a jseg and free it if the count reaches 0. This + * should eventually reclaim journal space as well. + */ +static void +free_jseg(jseg) + struct jseg *jseg; +{ + struct jblocks *jblocks; + + KASSERT(jseg->js_refs > 0, + ("free_jseg: Invalid refcnt %d", jseg->js_refs)); + if (--jseg->js_refs != 0) + return; + /* + * Free only those jsegs which have none allocated before them to + * preserve the journal space ordering. + */ + jblocks = jseg->js_jblocks; + while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { + jblocks->jb_oldestseq = jseg->js_seq; + if (jseg->js_refs != 0) + break; + TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); + jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); + KASSERT(LIST_EMPTY(&jseg->js_entries), + ("free_jseg: Freed jseg has valid entries.")); + WORKITEM_FREE(jseg, D_JSEG); + } +} + +/* + * Release a jsegdep and decrement the jseg count. 
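+ *
+ * For example, if segments 5 through 7 are outstanding and segment 6
+ * loses its last reference first, nothing is reclaimed; once segment
+ * 5 is released both are freed together and jb_oldestseq advances to
+ * 7, preserving the on-disk ordering of the journal.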
+ */
+static void
+free_jsegdep(jsegdep)
+	struct jsegdep *jsegdep;
+{
+
+	if (jsegdep->jd_seg)
+		free_jseg(jsegdep->jd_seg);
+	WORKITEM_FREE(jsegdep, D_JSEGDEP);
+}
+
+/*
+ * Wait for a journal item to make it to disk.  Initiate journal processing
+ * if required.
+ */
+static void
+jwait(wk)
+	struct worklist *wk;
+{
+
+	/*
+	 * If IO has not started we process the journal.  We can't mark the
+	 * worklist item as IOWAITING because we drop the lock while
+	 * processing the journal and the worklist entry may be freed after
+	 * this point.  The caller may call back in and re-issue the request.
+	 */
+	if ((wk->wk_state & IOSTARTED) == 0) {
+		softdep_process_journal(wk->wk_mp, MNT_WAIT);
+		return;
+	}
+	wk->wk_state |= IOWAITING;
+	msleep(wk, &lk, PRIBIO, "jwait", 0);
+}
+
+/*
+ * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
+ * appropriate.  This is a convenience function to reduce duplicate code
+ * for the setup and revert functions below.
+ */
+static struct inodedep *
+inodedep_lookup_ip(ip)
+	struct inode *ip;
+{
+	struct inodedep *inodedep;
+
+	KASSERT(ip->i_nlink >= ip->i_effnlink,
+	    ("inodedep_lookup_ip: bad delta"));
+	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
+	    DEPALLOC, &inodedep);
+	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+
+	return (inodedep);
+}
+
+/*
+ * Create a journal entry that describes a truncate that we're about to
+ * perform.  The inode allocations and frees between here and the completion
+ * of the operation are done asynchronously and without journaling.  At
+ * the end of the operation the vnode is sync'd and the journal space
+ * is released.  Recovery will discover the partially completed truncate
+ * and complete it.
+ */
+void *
+softdep_setup_trunc(vp, length, flags)
+	struct vnode *vp;
+	off_t length;
+	int flags;
+{
+	struct jsegdep *jsegdep;
+	struct jtrunc *jtrunc;
+	struct ufsmount *ump;
+	struct inode *ip;
+
+	softdep_prealloc(vp, MNT_WAIT);
+	ip = VTOI(vp);
+	ump = VFSTOUFS(vp->v_mount);
+	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
+	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
+	jtrunc->jt_ino = ip->i_number;
+	jtrunc->jt_extsize = 0;
+	jtrunc->jt_size = length;
+	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
+		jtrunc->jt_extsize = ip->i_din2->di_extsize;
+	if ((flags & IO_NORMAL) == 0)
+		jtrunc->jt_size = DIP(ip, i_size);
+	ACQUIRE_LOCK(&lk);
+	add_to_journal(&jtrunc->jt_list);
+	while (jsegdep->jd_seg == NULL)
+		jwait(&jtrunc->jt_list);
+	FREE_LOCK(&lk);
+
+	return (jsegdep);
+}
+
+/*
+ * After synchronous truncation is complete we fsync the vnode and
+ * release the jsegdep so the journal space can be freed.
+ */
+int
+softdep_complete_trunc(vp, cookie)
+	struct vnode *vp;
+	void *cookie;
+{
+	int error;
+
+	error = ffs_syncvnode(vp, MNT_WAIT);
+	ACQUIRE_LOCK(&lk);
+	free_jsegdep((struct jsegdep *)cookie);
+	FREE_LOCK(&lk);
+
+	return (error);
+}
+
+/*
+ * Called prior to creating a new inode and linking it to a directory.  The
+ * jaddref structure must already be allocated by softdep_setup_inomapdep
+ * and it is discovered here so we can initialize the mode and update
+ * nlinkdelta.
+ */ +void +softdep_setup_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(ip->i_nlink == 1, + ("softdep_setup_create: Invalid link count.")); + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_create: No addref structure present.")); + jaddref->ja_mode = ip->i_mode; + softdep_prelink(dvp, NULL); + } + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track the addition of a DOTDOT link when + * we are reparenting an inode as part of a rename. This jaddref will be + * found by softdep_setup_directory_change. Adjusts nlinkdelta for + * non-journaling softdep. + */ +void +softdep_setup_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + jaddref = NULL; + /* + * We don't set MKDIR_PARENT as this is not tied to a mkdir and + * is used as a normal link would be. + */ + if (DOINGSUJ(dvp)) + jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (jaddref) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + } + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track a new link to an inode. The directory + * offset is not known until softdep_setup_directory_add or + * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling + * softdep. + */ +void +softdep_setup_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + jaddref = NULL; + if (DOINGSUJ(dvp)) + jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, + ip->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (jaddref) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + } + FREE_LOCK(&lk); +} + +/* + * Called to create the jaddref structures to track . and .. references as + * well as lookup and further initialize the incomplete jaddref created + * by softdep_setup_inomapdep when the inode was allocated. Adjusts + * nlinkdelta for non-journaling softdep. 
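+ *
+ * A single mkdir therefore journals three reference additions: the
+ * new name in the parent (the jaddref created by
+ * softdep_setup_inomapdep, which also guards the inode bitmap), "."
+ * in the new directory (MKDIR_BODY), and ".." naming the parent
+ * (MKDIR_PARENT).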
+ */ +void +softdep_setup_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *dotdotaddref; + struct jaddref *dotaddref; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + dotaddref = dotdotaddref = NULL; + if (DOINGSUJ(dvp)) { + dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, + ip->i_mode); + dotaddref->ja_state |= MKDIR_BODY; + dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + dotdotaddref->ja_state |= MKDIR_PARENT; + } + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL, + ("softdep_setup_mkdir: No addref structure present.")); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_setup_mkdir: bad parent %d", + jaddref->ja_parent)); + jaddref->ja_mode = ip->i_mode; + TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, + if_deps); + } + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, + &dotdotaddref->ja_ref, if_deps); + softdep_prelink(ITOV(dp), NULL); + } + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlinking a directory. + */ +void +softdep_setup_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlink. + */ +void +softdep_setup_unlink(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed non-directory + * creation. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_create: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed dotdot link + * creation. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_dotdot_link: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed link + * addition. Adjusts nlinkdelta for non-journaling softdep. 
+ */ +void +softdep_revert_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_link: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed mkdir + * attempt. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dotdot addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_mkdir: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dot addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to correct nlinkdelta after a failed rmdir. + */ +void +softdep_revert_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + FREE_LOCK(&lk); +} + +/* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem @@ -1536,22 +3873,44 @@ softdep_setup_inomapdep(bp, ip, newinum) { struct inodedep *inodedep; struct bmsafemap *bmsafemap; + struct jaddref *jaddref; + struct mount *mp; + struct fs *fs; + mp = UFSTOVFS(ip->i_ump); + fs = ip->i_ump->um_fs; + jaddref = NULL; + /* + * Allocate the journal reference add structure so that the bitmap + * can be dependent on it. + */ + if (mp->mnt_flag & MNT_SUJ) { + jaddref = newjaddref(ip, newinum, 0, 0, 0); + jaddref->ja_state |= NEWBLOCK; + } + + /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. 
 	 */
 	ACQUIRE_LOCK(&lk);
-	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
-	    &inodedep)))
-		panic("softdep_setup_inomapdep: dependency for new inode "
-		    "already exists");
-	inodedep->id_buf = bp;
+	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
+		panic("softdep_setup_inomapdep: dependency %p for new "
+		    "inode already exists", inodedep);
+	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
+	if (jaddref) {
+		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
+		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+		    if_deps);
+	} else {
+		inodedep->id_state |= ONDEPLIST;
+		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
+	}
+	inodedep->id_bmsafemap = bmsafemap;
 	inodedep->id_state &= ~DEPCOMPLETE;
-	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
-	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 	FREE_LOCK(&lk);
 }
 
@@ -1560,29 +3919,98 @@ softdep_setup_inomapdep(bp, ip, newinum)
  * allocate block or fragment.
  */
 void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 	struct buf *bp;		/* buffer for cylgroup block with block map */
 	struct mount *mp;	/* filesystem doing allocation */
 	ufs2_daddr_t newblkno;	/* number of newly allocated block */
+	int frags;		/* Number of fragments. */
+	int oldfrags;		/* Previous number of fragments for extend. */
 {
 	struct newblk *newblk;
 	struct bmsafemap *bmsafemap;
+	struct jnewblk *jnewblk;
 	struct fs *fs;
 
 	fs = VFSTOUFS(mp)->um_fs;
+	jnewblk = NULL;
 	/*
 	 * Create a dependency for the newly allocated block.
 	 * Add it to the dependency list for the buffer holding
 	 * the cylinder group map from which it was allocated.
 	 */
+	if (mp->mnt_flag & MNT_SUJ) {
+		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
+		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
+		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
+		jnewblk->jn_state = ATTACHED;
+		jnewblk->jn_blkno = newblkno;
+		jnewblk->jn_frags = frags;
+		jnewblk->jn_oldfrags = oldfrags;
+#ifdef SUJ_DEBUG
+		{
+			struct cg *cgp;
+			uint8_t *blksfree;
+			long bno;
+			int i;
+
+			cgp = (struct cg *)bp->b_data;
+			blksfree = cg_blksfree(cgp);
+			bno = dtogd(fs, jnewblk->jn_blkno);
+			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+			    i++) {
+				if (isset(blksfree, bno + i))
+					panic("softdep_setup_blkmapdep: "
+					    "free fragment %d from %d-%d "
+					    "state 0x%X dep %p", i,
+					    jnewblk->jn_oldfrags,
+					    jnewblk->jn_frags,
+					    jnewblk->jn_state,
+					    jnewblk->jn_newblk);
+			}
+		}
+#endif
+	}
 	ACQUIRE_LOCK(&lk);
-	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
+	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
 		panic("softdep_setup_blkmapdep: found block");
-	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
-	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+	bmsafemap = bmsafemap_lookup(mp, bp,
+	    dtog(fs, newblkno));
+	if (jnewblk) {
+		jnewblk->jn_newblk = newblk;
+		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
+	} else {
+		newblk->nb_state |= ONDEPLIST;
+		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+	}
+	newblk->nb_bmsafemap = bmsafemap;
+	newblk->nb_jnewblk = jnewblk;
 	FREE_LOCK(&lk);
 }
 
+#define	BMSAFEMAP_HASH(fs, cg) \
+	(&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
+
+static int
+bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
+	struct bmsafemap_hashhead *bmsafemaphd;
+	struct mount *mp;
+	int cg;
+	struct bmsafemap **bmsafemapp;
+{
+	struct bmsafemap
*bmsafemap; + + LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) + if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) + break; + if (bmsafemap) { + *bmsafemapp = bmsafemap; + return (1); + } + *bmsafemapp = NULL; + + return (0); +} + /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when @@ -1590,27 +4018,43 @@ void * splbio interrupts blocked. */ static struct bmsafemap * -bmsafemap_lookup(mp, bp) +bmsafemap_lookup(mp, bp, cg) struct mount *mp; struct buf *bp; + int cg; { - struct bmsafemap *bmsafemap; + struct bmsafemap_hashhead *bmsafemaphd; + struct bmsafemap *bmsafemap, *collision; struct worklist *wk; + struct fs *fs; mtx_assert(&lk, MA_OWNED); - LIST_FOREACH(wk, &bp->b_dep, wk_list) - if (wk->wk_type == D_BMSAFEMAP) - return (WK_BMSAFEMAP(wk)); + if (bp) + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_BMSAFEMAP) + return (WK_BMSAFEMAP(wk)); + fs = VFSTOUFS(mp)->um_fs; + bmsafemaphd = BMSAFEMAP_HASH(fs, cg); + if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) + return (bmsafemap); FREE_LOCK(&lk); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); bmsafemap->sm_buf = bp; - LIST_INIT(&bmsafemap->sm_allocdirecthd); - LIST_INIT(&bmsafemap->sm_allocindirhd); LIST_INIT(&bmsafemap->sm_inodedephd); + LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); + LIST_INIT(&bmsafemap->sm_newblkwr); + LIST_INIT(&bmsafemap->sm_jaddrefhd); + LIST_INIT(&bmsafemap->sm_jnewblkhd); ACQUIRE_LOCK(&lk); + if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (collision); + } + bmsafemap->sm_cg = cg; + LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } @@ -1645,9 +4089,9 @@ static struct bmsafemap * * unreferenced fragments. 
*/ void -softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) +softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ - ufs_lbn_t lbn; /* block pointer within inode */ + ufs_lbn_t off; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ @@ -1656,34 +4100,33 @@ void { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; - struct bmsafemap *bmsafemap; + struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; + struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; + ufs_lbn_t lbn; + lbn = bp->b_lblkno; mp = UFSTOVFS(ip->i_ump); - adp = malloc(sizeof(struct allocdirect), - M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); - adp->ad_lbn = lbn; - adp->ad_newblkno = newblkno; - adp->ad_oldblkno = oldblkno; - adp->ad_newsize = newsize; - adp->ad_oldsize = oldsize; - adp->ad_state = ATTACHED; - LIST_INIT(&adp->ad_newdirblk); - if (newblkno == oldblkno) - adp->ad_freefrag = NULL; + if (oldblkno && oldblkno != newblkno) + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else - adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + freefrag = NULL; ACQUIRE_LOCK(&lk); - if (lbn >= NDADDR) { + if (off >= NDADDR) { + if (lbn > 0) + panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", + lbn, off); /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { + if (off != lbn) + panic("softdep_setup_allocdirect: lbn %jd != off %jd", + lbn, off); /* * Allocating a direct block. * @@ -1692,26 +4135,39 @@ void * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, off, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } - if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) + if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("softdep_setup_allocdirect: newblk already initialized")); + /* + * Convert the newblk to an allocdirect. + */ + newblk->nb_list.wk_type = D_ALLOCDIRECT; + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + + /* + * Finish initializing the journal. 
+ */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); - + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; - WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); + + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the @@ -1726,24 +4182,25 @@ void */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(&lk); } @@ -1761,10 +4218,11 @@ allocdirect_merge(adphead, newadp, oldadp) struct freefrag *freefrag; struct newdirblk *newdirblk; + freefrag = NULL; mtx_assert(&lk, MA_OWNED); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || - newadp->ad_lbn >= NDADDR) + newadp->ad_offset >= NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, @@ -1779,7 +4237,7 @@ allocdirect_merge(adphead, newadp, oldadp) * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on - * the worklist when it is freed by free_allocdirect. It is + * the worklist when it is freed by free_newblk. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. @@ -1788,8 +4246,8 @@ allocdirect_merge(adphead, newadp, oldadp) * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ + freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { - freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } @@ -1804,32 +4262,118 @@ allocdirect_merge(adphead, newadp, oldadp) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); } - free_allocdirect(adphead, oldadp, 0); + TAILQ_REMOVE(adphead, oldadp, ad_next); + /* + * We need to move any journal dependencies over to the freefrag + * that releases this block if it exists. Otherwise we are + * extending an existing block and we'll wait until that is + * complete to release the journal space and extend the + * new journal to cover this old space as well. 
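+	 * For example, a fragment extended in place keeps one journal
+	 * record for the whole range: the old record's jn_oldfrags are
+	 * inherited below and the old jnewblk is retired.  When the data
+	 * moved to a new block instead, the old journal work rides along
+	 * on the freefrag that will release the superseded fragment.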
+	 */
+	if (freefrag == NULL) {
+		struct jnewblk *jnewblk;
+		struct jnewblk *njnewblk;
+
+		if (oldadp->ad_newblkno != newadp->ad_newblkno)
+			panic("allocdirect_merge: %jd != %jd",
+			    oldadp->ad_newblkno, newadp->ad_newblkno);
+		jnewblk = oldadp->ad_block.nb_jnewblk;
+		cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
+		/*
+		 * If we have an unwritten jnewblk we need to merge the
+		 * frag bits with our own.  The newer adp's journal can not
+		 * be written prior to the old one so no need to check for
+		 * it here.
+		 */
+		if (jnewblk) {
+			njnewblk = newadp->ad_block.nb_jnewblk;
+			if (njnewblk == NULL)
+				panic("allocdirect_merge: No jnewblk");
+			if (jnewblk->jn_state & UNDONE) {
+				njnewblk->jn_state |= UNDONE | NEWBLOCK;
+				njnewblk->jn_state &= ~ATTACHED;
+				jnewblk->jn_state &= ~UNDONE;
+			}
+			njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
+			WORKLIST_REMOVE(&jnewblk->jn_list);
+			jnewblk->jn_state |= ATTACHED | COMPLETE;
+			free_jnewblk(jnewblk);
+		}
+	} else {
+		/*
+		 * We can skip journaling for this freefrag and just complete
+		 * any pending journal work for the allocdirect that is being
+		 * removed after the freefrag completes.
		 */
+		if (freefrag->ff_jfreefrag)
+			cancel_jfreefrag(freefrag->ff_jfreefrag);
+		cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
+	}
+	free_newblk(&oldadp->ad_block);
 }
-
+
 /*
- * Allocate a new freefrag structure if needed.
+ * Allocate a jfreefrag structure to journal a single block free.
  */
+static struct jfreefrag *
+newjfreefrag(freefrag, ip, blkno, size, lbn)
+	struct freefrag *freefrag;
+	struct inode *ip;
+	ufs2_daddr_t blkno;
+	long size;
+	ufs_lbn_t lbn;
+{
+	struct jfreefrag *jfreefrag;
+	struct fs *fs;
+
+	fs = ip->i_fs;
+	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
+	    M_SOFTDEP_FLAGS);
+	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
+	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
+	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
+	jfreefrag->fr_ino = ip->i_number;
+	jfreefrag->fr_lbn = lbn;
+	jfreefrag->fr_blkno = blkno;
+	jfreefrag->fr_frags = numfrags(fs, size);
+	jfreefrag->fr_freefrag = freefrag;
+
+	return (jfreefrag);
+}
+
+/*
+ * Allocate a new freefrag structure.
+ */
 static struct freefrag *
-newfreefrag(ip, blkno, size)
+newfreefrag(ip, blkno, size, lbn)
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	long size;
+	ufs_lbn_t lbn;
 {
 	struct freefrag *freefrag;
 	struct fs *fs;
 
-	if (blkno == 0)
-		return (NULL);
 	fs = ip->i_fs;
 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 		panic("newfreefrag: frag size");
 	freefrag = malloc(sizeof(struct freefrag),
-		M_FREEFRAG, M_SOFTDEP_FLAGS);
+	    M_FREEFRAG, M_SOFTDEP_FLAGS);
 	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
+	freefrag->ff_state = ATTACHED;
+	LIST_INIT(&freefrag->ff_jwork);
 	freefrag->ff_inum = ip->i_number;
 	freefrag->ff_blkno = blkno;
 	freefrag->ff_fragsize = size;
+
+	if (fs->fs_flags & FS_SUJ) {
+		freefrag->ff_jfreefrag =
+		    newjfreefrag(freefrag, ip, blkno, size, lbn);
+	} else {
+		freefrag->ff_state |= DEPCOMPLETE;
+		freefrag->ff_jfreefrag = NULL;
+	}
+
 	return (freefrag);
 }
 
@@ -1842,9 +4386,17 @@ handle_workitem_freefrag(freefrag)
 	struct freefrag *freefrag;
 {
 	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
+	struct workhead wkhd;
+	/*
+	 * It would be illegal to add new completion items to the
+	 * freefrag after it was scheduled to be done so it must be
+	 * safe to modify the list head here.
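+	 * LIST_SWAP() below moves those items onto a local list head so
+	 * they can be handed to ffs_blkfree() and outlive the freefrag,
+	 * which is freed as soon as the fragment is returned to the map.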
+ */ + LIST_INIT(&wkhd); + LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, - freefrag->ff_fragsize, freefrag->ff_inum); + freefrag->ff_fragsize, freefrag->ff_inum, &wkhd); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(&lk); @@ -1856,9 +4408,9 @@ handle_workitem_freefrag(freefrag) * See the description of softdep_setup_allocdirect above for details. */ void -softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) +softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; - ufs_lbn_t lbn; + ufs_lbn_t off; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; @@ -1867,50 +4419,55 @@ void { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; - struct bmsafemap *bmsafemap; + struct freefrag *freefrag; struct inodedep *inodedep; + struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; + ufs_lbn_t lbn; + if (off >= NXADDR) + panic("softdep_setup_allocext: lbn %lld > NXADDR", + (long long)off); + + lbn = bp->b_lblkno; mp = UFSTOVFS(ip->i_ump); - adp = malloc(sizeof(struct allocdirect), - M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); - adp->ad_lbn = lbn; - adp->ad_newblkno = newblkno; - adp->ad_oldblkno = oldblkno; - adp->ad_newsize = newsize; - adp->ad_oldsize = oldsize; - adp->ad_state = ATTACHED | EXTDATA; - LIST_INIT(&adp->ad_newdirblk); - if (newblkno == oldblkno) - adp->ad_freefrag = NULL; + if (oldblkno && oldblkno != newblkno) + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else - adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + freefrag = NULL; ACQUIRE_LOCK(&lk); - if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) + if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("softdep_setup_allocext: newblk already initialized")); + /* + * Convert the newblk to an allocdirect. + */ + newblk->nb_list.wk_type = D_ALLOCDIRECT; + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + adp->ad_state |= EXTDATA; + /* + * Finish initializing the journal. 
+ */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; - if (newblk->nb_state == DEPCOMPLETE) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); - } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); - - WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); - if (lbn >= NXADDR) - panic("softdep_setup_allocext: lbn %lld > NXADDR", - (long long)lbn); + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the @@ -1925,23 +4482,23 @@ void */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } @@ -1975,22 +4532,39 @@ void * Allocate a new allocindir structure. 
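+ * + * Editor's note (assumed softdep.h layout, not shown in this patch): the + * (struct allocindir *)newblk cast below, like the allocdirect cast above, + * is valid only because the journaled dependency structures embed the + * common struct newblk as their first member, roughly: + * + *	struct allocindir { + *		struct newblk	ai_block;	-- nb_list, nb_jnewblk, ... + *		int		ai_offset;	-- slot in the indirect block + *		... + *	};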
*/ static struct allocindir * -newallocindir(ip, ptrno, newblkno, oldblkno) +newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ + ufs_lbn_t lbn; { + struct newblk *newblk; struct allocindir *aip; + struct freefrag *freefrag; + struct jnewblk *jnewblk; - aip = malloc(sizeof(struct allocindir), - M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump)); - aip->ai_state = ATTACHED; + if (oldblkno) + freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); + else + freefrag = NULL; + ACQUIRE_LOCK(&lk); + if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) + panic("new_allocindir: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("newallocindir: newblk already initialized")); + newblk->nb_list.wk_type = D_ALLOCINDIR; + newblk->nb_freefrag = freefrag; + aip = (struct allocindir *)newblk; aip->ai_offset = ptrno; - aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; - aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); return (aip); } @@ -2008,22 +4582,28 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { + struct inodedep *inodedep; struct allocindir *aip; struct pagedep *pagedep; + struct mount *mp; + if (lbn != nbp->b_lblkno) + panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", + lbn, nbp->b_lblkno); ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); - aip = newallocindir(ip, ptrno, newblkno, oldblkno); - ACQUIRE_LOCK(&lk); + mp = UFSTOVFS(ip->i_ump); + aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. 
*/ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); - WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } @@ -2039,38 +4619,68 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { + struct inodedep *inodedep; struct allocindir *aip; + ufs_lbn_t lbn; + lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); - aip = newallocindir(ip, ptrno, newblkno, 0); - ACQUIRE_LOCK(&lk); - WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + aip = newallocindir(ip, ptrno, newblkno, 0, lbn); + inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } +static void +indirdep_complete(indirdep) + struct indirdep *indirdep; +{ + struct allocindir *aip; + + LIST_REMOVE(indirdep, ir_next); + indirdep->ir_state &= ~ONDEPLIST; + + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + free_newblk(&aip->ai_block); + } + /* + * If this indirdep is not attached to a buf it was simply waiting + * on completion to clear completehd. free_indirdep() asserts + * that nothing is dangling. + */ + if ((indirdep->ir_state & ONWORKLIST) == 0) + free_indirdep(indirdep); +} + /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void -setup_allocindir_phase2(bp, ip, aip) +setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ + struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ + ufs_lbn_t lbn; /* Logical block number for this block. 
*/ { struct worklist *wk; + struct fs *fs; + struct newblk *newblk; struct indirdep *indirdep, *newindirdep; - struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; - struct newblk *newblk; + struct mount *mp; ufs2_daddr_t blkno; + mp = UFSTOVFS(ip->i_ump); + fs = ip->i_fs; mtx_assert(&lk, MA_OWNED); if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); - for (indirdep = NULL, newindirdep = NULL; ; ) { + for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; @@ -2079,49 +4689,41 @@ static void } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; + newindirdep = NULL; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); - newindirdep = NULL; + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, + &newblk)) { + indirdep->ir_state |= ONDEPLIST; + LIST_INSERT_HEAD(&newblk->nb_indirdeps, + indirdep, ir_next); + } else + indirdep->ir_state |= DEPCOMPLETE; } if (indirdep) { - if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, - &newblk) == 0) - panic("setup_allocindir: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { - aip->ai_state |= DEPCOMPLETE; - aip->ai_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - aip->ai_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, - aip, ai_deps); - } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); aip->ai_indirdep = indirdep; /* * Check to see if there is an existing dependency * for this block. If there is, merge the old - * dependency into the new one. + * dependency into the new one. This happens + * as a result of reallocblk only. */ if (aip->ai_oldblkno == 0) oldaip = NULL; else - LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) + LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, + ai_next) if (oldaip->ai_offset == aip->ai_offset) break; - freefrag = NULL; - if (oldaip != NULL) { - if (oldaip->ai_newblkno != aip->ai_oldblkno) - panic("setup_allocindir_phase2: blkno"); - aip->ai_oldblkno = oldaip->ai_oldblkno; - freefrag = aip->ai_freefrag; - aip->ai_freefrag = oldaip->ai_freefrag; - oldaip->ai_freefrag = NULL; - free_allocindir(oldaip, NULL); - } + if (oldaip != NULL) + freefrag = allocindir_merge(aip, oldaip); LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); + KASSERT(aip->ai_offset >= 0 && + aip->ai_offset < NINDIR(ip->i_ump->um_fs), + ("setup_allocindir_phase2: Bad offset %d", + aip->ai_offset)); + KASSERT(indirdep->ir_savebp != NULL, + ("setup_allocindir_phase2 NULL ir_savebp")); if (ip->i_ump->um_fstype == UFS1) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; @@ -2148,13 +4750,16 @@ static void } newindirdep = malloc(sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); - workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, - UFSTOVFS(ip->i_ump)); + workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; if (ip->i_ump->um_fstype == UFS1) newindirdep->ir_state |= UFS1FMT; + newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); + LIST_INIT(&newindirdep->ir_writehd); + LIST_INIT(&newindirdep->ir_completehd); + LIST_INIT(&newindirdep->ir_jwork); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); @@ -2169,6 +4774,51 @@ static void } /* + * Merge two allocindirs which refer to the same block. Move newblock + * dependencies and setup the freefrags appropriately. 
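+ * + * Editor's worked example (derived from the code below): if an unwritten + * dependency already records A -> B for a pointer slot and a reallocation + * now records B -> C for the same slot, the surviving allocindir becomes + * A -> C; B never needs to reach the disk, so its journal work is canceled + * and the returned freefrag releases B.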
+ */ +static struct freefrag * +allocindir_merge(aip, oldaip) + struct allocindir *aip; + struct allocindir *oldaip; +{ + struct newdirblk *newdirblk; + struct freefrag *freefrag; + struct worklist *wk; + + if (oldaip->ai_newblkno != aip->ai_oldblkno) + panic("allocindir_merge: blkno"); + aip->ai_oldblkno = oldaip->ai_oldblkno; + freefrag = aip->ai_freefrag; + aip->ai_freefrag = oldaip->ai_freefrag; + oldaip->ai_freefrag = NULL; + KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); + /* + * If we are tracking a new directory-block allocation, + * move it from the old allocindir to the new allocindir. + */ + if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { + newdirblk = WK_NEWDIRBLK(wk); + WORKLIST_REMOVE(&newdirblk->db_list); + if (!LIST_EMPTY(&oldaip->ai_newdirblk)) + panic("allocindir_merge: extra newdirblk"); + WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list); + } + /* + * We can skip journaling for this freefrag and just complete + * any pending journal work for the allocindir that is being + * removed after the freefrag completes. + */ + if (freefrag->ff_jfreefrag) + cancel_jfreefrag(freefrag->ff_jfreefrag); + LIST_REMOVE(oldaip, ai_next); + cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork); + free_newblk(&oldaip->ai_block); + + return (freefrag); +} + +/* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before @@ -2206,6 +4856,7 @@ softdep_setup_freeblocks(ip, length, flags) struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; + struct jfreeblk *jfreeblk; struct bufobj *bo; struct vnode *vp; struct buf *bp; @@ -2213,6 +4864,13 @@ softdep_setup_freeblocks(ip, length, flags) ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; + ufs2_daddr_t blkno; + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + long oldextsize; + long oldsize; + int frags; + int needj; fs = ip->i_fs; mp = UFSTOVFS(ip->i_ump); @@ -2221,32 +4879,53 @@ softdep_setup_freeblocks(ip, length, flags) freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); + LIST_INIT(&freeblks->fb_jfreeblkhd); + LIST_INIT(&freeblks->fb_jwork); freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; + freeblks->fb_chkcnt = 0; ACQUIRE_LOCK(&lk); + /* + * If we're truncating a removed file that will never be written + * we don't need to journal the block frees. The canceled journals + * for the allocations will suffice. 
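+ * + * Editor's note (a worked example for the indirect-block loop below): + * indirect tree roots are keyed by negative lbns, -lbn - i with lbn + * starting at NDADDR, so with NDADDR == 12 and NINDIR(fs) == n the + * single, double and triple indirect roots land at -12, -(12 + n) - 1 + * and -(12 + n + n*n) - 2 respectively.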
+ */ + inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || + (fs->fs_flags & FS_SUJ) == 0) + needj = 0; + else + needj = 1; num_freeblkdep++; FREE_LOCK(&lk); extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks = DIP(ip, i_blocks) - extblocks; - if ((flags & IO_NORMAL) == 0) { - freeblks->fb_oldsize = 0; - freeblks->fb_chkcnt = 0; - } else { - freeblks->fb_oldsize = ip->i_size; + if ((flags & IO_NORMAL) != 0) { + oldsize = ip->i_size; ip->i_size = 0; DIP_SET(ip, i_size, 0); freeblks->fb_chkcnt = datablocks; for (i = 0; i < NDADDR; i++) { - freeblks->fb_dblks[i] = DIP(ip, i_db[i]); + blkno = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], 0); + if (blkno == 0) + continue; + frags = sblksize(fs, oldsize, i); + frags = numfrags(fs, frags); + newfreework(freeblks, NULL, i, blkno, frags, needj); } - for (i = 0; i < NIADDR; i++) { - freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; + i++, tmpval *= NINDIR(fs)) { + blkno = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], 0); + if (blkno) + newfreework(freeblks, NULL, -lbn - i, blkno, + fs->fs_frag, needj); + lbn += tmpval; } /* * If the file was removed, then the space being freed was @@ -2259,17 +4938,23 @@ softdep_setup_freeblocks(ip, length, flags) UFS_UNLOCK(ip->i_ump); } } - if ((flags & IO_EXT) == 0) { - freeblks->fb_oldextsize = 0; - } else { - freeblks->fb_oldextsize = ip->i_din2->di_extsize; + if ((flags & IO_EXT) != 0) { + oldextsize = ip->i_din2->di_extsize; ip->i_din2->di_extsize = 0; freeblks->fb_chkcnt += extblocks; for (i = 0; i < NXADDR; i++) { - freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; + blkno = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; + if (blkno == 0) + continue; + frags = sblksize(fs, oldextsize, i); + frags = numfrags(fs, frags); + newfreework(freeblks, NULL, -1 - i, blkno, frags, + needj); } } + if (LIST_EMPTY(&freeblks->fb_jfreeblkhd)) + needj = 0; DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); /* * Push the zero'ed inode to to its disk buffer so that we are free @@ -2304,7 +4989,9 @@ softdep_setup_freeblocks(ip, length, flags) */ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) - WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); + WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); + else if (needj) + freeblks->fb_state |= DEPCOMPLETE | COMPLETE; /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated @@ -2318,14 +5005,19 @@ softdep_setup_freeblocks(ip, length, flags) merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) - free_allocdirect(&inodedep->id_inoupdt, adp, delay); + cancel_allocdirect(&inodedep->id_inoupdt, adp, + freeblks, delay); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) - free_allocdirect(&inodedep->id_extupdt, adp, delay); + cancel_allocdirect(&inodedep->id_extupdt, adp, + freeblks, delay); } + LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps) + add_to_journal(&jfreeblk->jf_list); + FREE_LOCK(&lk); bdwrite(bp); /* @@ -2349,9 +5041,9 @@ restart: BO_UNLOCK(bo); ACQUIRE_LOCK(&lk); (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); - deallocate_dependencies(bp, inodedep); + if (deallocate_dependencies(bp, inodedep, freeblks)) + bp->b_flags |= 
B_INVAL | B_NOCACHE; FREE_LOCK(&lk); - bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); BO_LOCK(bo); goto restart; @@ -2361,7 +5053,7 @@ restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); - if(delay) { + if (delay) { freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk @@ -2371,16 +5063,16 @@ restart: * the request here than in the !delay case. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) - add_to_worklist(&freeblks->fb_list); + add_to_worklist(&freeblks->fb_list, 1); } FREE_LOCK(&lk); /* - * If the inode has never been written to disk (delay == 0), - * then we can process the freeblks now that we have deleted - * the dependencies. + * If the inode has never been written to disk (delay == 0) and + * we're not waiting on any journal writes, then we can process the + * freeblks now that we have deleted the dependencies. */ - if (!delay) + if (!delay && !needj) handle_workitem_freeblocks(freeblks, 0); } @@ -2389,19 +5081,23 @@ restart: * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's - * associated with related dependencies do not occur. + * associated with related dependencies do not occur. Returns 1 if + * all dependencies were cleared, 0 otherwise. */ -static void -deallocate_dependencies(bp, inodedep) +static int +deallocate_dependencies(bp, inodedep, freeblks) struct buf *bp; struct inodedep *inodedep; + struct freeblks *freeblks; { struct worklist *wk; struct indirdep *indirdep; + struct newdirblk *newdirblk; struct allocindir *aip; struct pagedep *pagedep; + struct jremref *jremref; + struct jmvref *jmvref; struct dirrem *dirrem; - struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); @@ -2410,47 +5106,24 @@ restart: case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); - /* - * None of the indirect pointers will ever be visible, - * so they can simply be tossed. GOINGAWAY ensures - * that allocated pointers will be saved in the buffer - * cache until they are freed. Note that they will - * only be able to be found by their physical address - * since the inode mapping the logical address will - * be gone. The save buffer used for the safe copy - * was allocated in setup_allocindir_phase2 using - * the physical address so it could be used for this - * purpose. Hence we swap the safe copy with the real - * copy, allowing the safe copy to be freed and holding - * on to the real copy for later use in indir_trunc. - */ - if (indirdep->ir_state & GOINGAWAY) - panic("deallocate_dependencies: already gone"); - indirdep->ir_state |= GOINGAWAY; - VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; - while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) - free_allocindir(aip, inodedep); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); - bcopy(bp->b_data, indirdep->ir_savebp->b_data, - bp->b_bcount); - WORKLIST_REMOVE(wk); - WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); + cancel_indirdep(indirdep, bp, inodedep, freeblks); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* - * None of the directory additions will ever be - * visible, so they can simply be tossed. + * There should be no directory add dependencies present + * as the directory could not be truncated until all + * children were removed. 
*/ + KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, + ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) - while ((dap = - LIST_FIRST(&pagedep->pd_diraddhd[i]))) - free_diradd(dap); - while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) - free_diradd(dap); + KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, + ("deallocate_dependencies: diraddhd != NULL")); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. @@ -2458,36 +5131,47 @@ restart: * can be dumped directly onto the work list. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + /* + * If there are any dirrems we wait for + * the journal write to complete and + * then restart the buf scan as the lock + * has been dropped. + */ + while ((jremref = + LIST_FIRST(&dirrem->dm_jremrefhd)) + != NULL) { + jwait(&jremref->jr_list); + return (0); + } LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == - ALLCOMPLETE) - add_to_worklist(&dirrem->dm_list); - else + ALLCOMPLETE) { + dirrem->dm_state |= COMPLETE; + add_to_worklist(&dirrem->dm_list, 0); + } else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } if ((pagedep->pd_state & NEWBLOCK) != 0) { - LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) - if (wk->wk_type == D_NEWDIRBLK && - WK_NEWDIRBLK(wk)->db_pagedep == - pagedep) - break; - if (wk != NULL) { - WORKLIST_REMOVE(wk); - free_newdirblk(WK_NEWDIRBLK(wk)); - } else - panic("deallocate_dependencies: " - "lost pagedep"); + newdirblk = pagedep->pd_newdirblk; + WORKLIST_REMOVE(&newdirblk->db_list); + free_newdirblk(newdirblk); } + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) + != NULL) { + jwait(&jmvref->jm_list); + return (0); + } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); continue; case D_ALLOCINDIR: - free_allocindir(WK_ALLOCINDIR(wk), inodedep); + aip = WK_ALLOCINDIR(wk); + cancel_allocindir(aip, inodedep, freeblks); continue; case D_ALLOCDIRECT: @@ -2502,46 +5186,155 @@ restart: /* NOTREACHED */ } } + + return (1); } /* - * Free an allocdirect. Generate a new freefrag work request if appropriate. - * This routine must be called with splbio interrupts blocked. + * An allocdirect is being canceled due to a truncate. We must make sure + * the journal entry is released in concert with the blkfree that releases + * the storage. Completed journal entries must not be released until the + * space is no longer pointed to by the inode or in the bitmap. */ static void -free_allocdirect(adphead, adp, delay) +cancel_allocdirect(adphead, adp, freeblks, delay) struct allocdirectlst *adphead; struct allocdirect *adp; + struct freeblks *freeblks; int delay; { + struct freework *freework; + struct newblk *newblk; + struct worklist *wk; + ufs_lbn_t lbn; + + TAILQ_REMOVE(adphead, adp, ad_next); + newblk = (struct newblk *)adp; + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the freework to be + * freed when freework_freeblock() is called. If the journal has + * been written we can simply reclaim the journal space when the + * freeblks work is complete. 
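+ * + * Editor's sketch of the two cases below (an assumption): when nb_jnewblk + * is NULL the journal record was already written, so any surviving + * jsegdeps simply move to fb_jwork and retire with the freeblks; otherwise + * the unwritten record is attached to the freework with the matching lbn + * so freework_freeblock() can hand it to ffs_blkfree() together with the + * bitmap update.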
+ */ + if (newblk->nb_jnewblk == NULL) { + cancel_newblk(newblk, &freeblks->fb_jwork); + goto found; + } + lbn = newblk->nb_jnewblk->jn_lbn; + /* + * Find the correct freework structure so it releases the canceled + * journal when the bitmap is cleared. This preserves rollback + * until the allocation is reverted. + */ + LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { + freework = WK_FREEWORK(wk); + if (freework->fw_lbn != lbn) + continue; + cancel_newblk(newblk, &freework->fw_jwork); + goto found; + } + panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn); +found: + if (delay) + WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, + &newblk->nb_list); + else + free_newblk(newblk); + return; +} + + +static void +cancel_newblk(newblk, wkhd) + struct newblk *newblk; + struct workhead *wkhd; +{ + struct indirdep *indirdep; + struct allocindir *aip; + + while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { + indirdep->ir_state &= ~ONDEPLIST; + LIST_REMOVE(indirdep, ir_next); + /* + * If an indirdep is not on the buf worklist we need to + * free it here as deallocate_dependencies() will never + * find it. These pointers were never visible on disk and + * can be discarded immediately. + */ + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + cancel_newblk(&aip->ai_block, wkhd); + free_newblk(&aip->ai_block); + } + /* + * If this indirdep is not attached to a buf it was simply + * waiting on completion to clear completehd. free_indirdep() + * asserts that nothing is dangling. + */ + if ((indirdep->ir_state & ONWORKLIST) == 0) + free_indirdep(indirdep); + } + if (newblk->nb_state & ONDEPLIST) { + newblk->nb_state &= ~ONDEPLIST; + LIST_REMOVE(newblk, nb_deps); + } + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + /* + * If the journal entry hasn't been written we hold onto the dep + * until it is safe to free along with the other journal work. + */ + if (newblk->nb_jnewblk != NULL) { + cancel_jnewblk(newblk->nb_jnewblk, wkhd); + newblk->nb_jnewblk = NULL; + } + if (!LIST_EMPTY(&newblk->nb_jwork)) + jwork_move(wkhd, &newblk->nb_jwork); +} + +/* + * Free a newblk. Generate a new freefrag work request if appropriate. + * This must be called after the inode pointer and any direct block pointers + * are valid or fully removed via truncate or frag extension. 
+ */ +static void +free_newblk(newblk) + struct newblk *newblk; +{ + struct indirdep *indirdep; struct newdirblk *newdirblk; + struct freefrag *freefrag; struct worklist *wk; mtx_assert(&lk, MA_OWNED); - if ((adp->ad_state & DEPCOMPLETE) == 0) - LIST_REMOVE(adp, ad_deps); - TAILQ_REMOVE(adphead, adp, ad_next); - if ((adp->ad_state & COMPLETE) == 0) - WORKLIST_REMOVE(&adp->ad_list); - if (adp->ad_freefrag != NULL) { - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &adp->ad_freefrag->ff_list); - else - add_to_worklist(&adp->ad_freefrag->ff_list); + if (newblk->nb_state & ONDEPLIST) + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + LIST_REMOVE(newblk, nb_hash); + if ((freefrag = newblk->nb_freefrag) != NULL) { + freefrag->ff_state |= COMPLETE; + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); } - if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { + if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); - if (!LIST_EMPTY(&adp->ad_newdirblk)) - panic("free_allocdirect: extra newdirblk"); - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &newdirblk->db_list); - else - free_newdirblk(newdirblk); + if (!LIST_EMPTY(&newblk->nb_newdirblk)) + panic("free_newblk: extra newdirblk"); + free_newdirblk(newdirblk); } - WORKITEM_FREE(adp, D_ALLOCDIRECT); + while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { + indirdep->ir_state |= DEPCOMPLETE; + indirdep_complete(indirdep); + } + KASSERT(newblk->nb_jnewblk == NULL, + ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); + handle_jwork(&newblk->nb_jwork); + newblk->nb_list.wk_type = D_NEWBLK; + WORKITEM_FREE(newblk, D_NEWBLK); } /* @@ -2554,6 +5347,7 @@ free_newdirblk(newdirblk) { struct pagedep *pagedep; struct diradd *dap; + struct worklist *wk; int i; mtx_assert(&lk, MA_OWNED); @@ -2571,17 +5365,25 @@ free_newdirblk(newdirblk) pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * If no dependencies remain, the pagedep will be freed. */ for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) break; - if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { + if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 && + LIST_EMPTY(&pagedep->pd_jmvrefhd)) { + KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL, + ("free_newdirblk: Freeing non-free pagedep %p", pagedep)); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } + /* Should only ever be one item in the list. */ + while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { + WORKLIST_REMOVE(wk); + handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); + } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } @@ -2608,6 +5410,7 @@ softdep_freefile(pvp, ino, mode) freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; + LIST_INIT(&freefile->fx_jwork); if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendinginodes += 1; @@ -2618,11 +5421,29 @@ softdep_freefile(pvp, ino, mode) * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either - * case we can free the file immediately. + * case we can free the file immediately. 
If the journal was + * canceled before being written the inode will never make it to + * disk and we must send the canceled journal entries to + * ffs_freefile() to be cleared in conjunction with the bitmap. + * Any blocks waiting on the inode to write can be safely freed + * here as it will never be written. */ ACQUIRE_LOCK(&lk); - if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 || - check_inode_unwritten(inodedep)) { + inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); + /* + * Remove this inode from the unlinked list and set + * GOINGAWAY as appropriate to indicate that this inode + * will never be written. + */ + if (inodedep && inodedep->id_state & UNLINKED) { + clear_unlinked_inodedep(inodedep); + inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); + if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0) { + inodedep->id_state |= GOINGAWAY; + handle_bufwait(inodedep, &freefile->fx_jwork); + } + } + if (inodedep == NULL || check_inode_unwritten(inodedep)) { FREE_LOCK(&lk); handle_workitem_freefile(freefile); return; @@ -2654,7 +5475,8 @@ check_inode_unwritten(inodedep) { mtx_assert(&lk, MA_OWNED); - if ((inodedep->id_state & DEPCOMPLETE) != 0 || + + if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || @@ -2662,9 +5484,9 @@ check_inode_unwritten(inodedep) !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || + inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0) return (0); - /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". @@ -2673,9 +5495,11 @@ check_inode_unwritten(inodedep) inodedep->id_savedino1 == NULL) return (0); + if (inodedep->id_state & ONDEPLIST) + LIST_REMOVE(inodedep, id_deps); + inodedep->id_state &= ~ONDEPLIST; inodedep->id_state |= ALLCOMPLETE; - LIST_REMOVE(inodedep, id_deps); - inodedep->id_buf = NULL; + inodedep->id_bmsafemap = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { @@ -2696,17 +5520,23 @@ free_inodedep(inodedep) { mtx_assert(&lk, MA_OWNED); - if ((inodedep->id_state & ONWORKLIST) != 0 || + if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || + !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || + !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || - inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) + inodedep->id_mkdiradd != NULL || + inodedep->id_nlinkdelta != 0 || + inodedep->id_savedino1 != NULL) return (0); + if (inodedep->id_state & ONDEPLIST) + LIST_REMOVE(inodedep, id_deps); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); num_inodedep -= 1; @@ -2714,6 +5544,123 @@ free_inodedep(inodedep) } /* + * Free the block referenced by a freework structure. The parent freeblks + * structure is released and completed when the final cg bitmap reaches + * the disk. This routine may be freeing a jnewblk which never made it to + * disk in which case we do not have to wait as the operation is undone + * in memory immediately. 
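+ * + * Editor's sketch of the two completion paths below (an assumption): with + * fw_jwork non-empty, a canceled jnewblk rides on wkhd into ffs_blkfree() + * and the freework retires immediately in memory; with fw_jwork empty, the + * freework itself is queued on wkhd and retires only once the cg bitmap + * buffer reaches the disk.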
+ */ +static void +freework_freeblock(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct workhead wkhd; + struct fs *fs; + int complete; + int pending; + int bsize; + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + complete = 0; + LIST_INIT(&wkhd); + /* + * If we are canceling an existing jnewblk pass it to the free + * routine, otherwise pass the freeblk which will ultimately + * release the freeblks + */ + if (!LIST_EMPTY(&freework->fw_jwork)) { + LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list); + complete = 1; + } else + WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list); + bsize = lfragtosize(fs, freework->fw_frags); + pending = btodb(bsize); + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= pending; + FREE_LOCK(&lk); + /* + * extattr blocks don't show up in pending blocks. XXX why? + */ + if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) { + UFS_LOCK(ump); + fs->fs_pendingblocks -= pending; + UFS_UNLOCK(ump); + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, + bsize, freeblks->fb_previousinum, &wkhd); + if (complete == 0) + return; + /* + * The jnewblk will be discarded and the bits in the map never + * made it to disk. We can immediately free the freeblk. + */ + ACQUIRE_LOCK(&lk); + handle_written_freework(freework); + FREE_LOCK(&lk); +} + +/* + * Start, continue, or finish the process of freeing an indirect block tree. + * The free operation may be paused at any point with fw_off containing the + * offset to restart from. This enables us to implement some flow control + * for large truncates which may fan out and generate a huge number of + * dependencies. + */ +static void +handle_workitem_indirblk(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct fs *fs; + + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + if (freework->fw_off == NINDIR(fs)) + freework_freeblock(freework); + else + indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), + freework->fw_lbn); +} + +/* + * Called when a freework structure attached to a cg buf is written. The + * ref on either the parent or the freeblks structure is released and + * either may be added to the worklist if it is the final ref. + */ +static void +handle_written_freework(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct freework *parent; + + freeblks = freework->fw_freeblks; + parent = freework->fw_parent; + if (parent) { + if (--parent->fw_ref != 0) + parent = NULL; + freeblks = NULL; + } else if (--freeblks->fb_ref != 0) + freeblks = NULL; + WORKITEM_FREE(freework, D_FREEWORK); + /* + * Don't delay these block frees or it takes an intolerable amount + * of time to process truncates and free their journal entries. + */ + if (freeblks) + add_to_worklist(&freeblks->fb_list, 1); + if (parent) + add_to_worklist(&parent->fw_list, 1); +} + +/* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. 
As mentioned above, @@ -2726,99 +5673,79 @@ handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { + struct freework *freework; + struct worklist *wk; + + KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd), + ("handle_workitem_freeblocks: Journal entries not written.")); + if (LIST_EMPTY(&freeblks->fb_freeworkhd)) { + handle_complete_freeblocks(freeblks); + return; + } + freeblks->fb_ref++; + while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { + KASSERT(wk->wk_type == D_FREEWORK, + ("handle_workitem_freeblocks: Unknown type %s", + TYPENAME(wk->wk_type))); + WORKLIST_REMOVE_UNLOCKED(wk); + freework = WK_FREEWORK(wk); + if (freework->fw_lbn <= -NDADDR) + handle_workitem_indirblk(freework); + else + freework_freeblock(freework); + } + ACQUIRE_LOCK(&lk); + if (--freeblks->fb_ref != 0) + freeblks = NULL; + FREE_LOCK(&lk); + if (freeblks) + handle_complete_freeblocks(freeblks); +} + +/* + * Once all of the freework workitems are complete we can retire the + * freeblocks dependency and any journal work awaiting completion. This + * can not be called until all other dependencies are stable on disk. + */ +static void +handle_complete_freeblocks(freeblks) + struct freeblks *freeblks; +{ struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; - int i, nblocks, level, bsize; - ufs2_daddr_t bn, blocksreleased = 0; - int error, allerror = 0; - ufs_lbn_t baselbns[NIADDR], tmpval; - int fs_pendingblocks; + int flags; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; - fs_pendingblocks = 0; - tmpval = 1; - baselbns[0] = NDADDR; - for (i = 1; i < NIADDR; i++) { - tmpval *= NINDIR(fs); - baselbns[i] = baselbns[i - 1] + tmpval; - } - nblocks = btodb(fs->fs_bsize); - blocksreleased = 0; + flags = LK_NOWAIT; + /* - * Release all extended attribute blocks or frags. - */ - if (freeblks->fb_oldextsize > 0) { - for (i = (NXADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_eblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldextsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - blocksreleased += btodb(bsize); - } - } - /* - * Release all data blocks or frags. - */ - if (freeblks->fb_oldsize > 0) { - /* - * Indirect blocks first. - */ - for (level = (NIADDR - 1); level >= 0; level--) { - if ((bn = freeblks->fb_iblks[level]) == 0) - continue; - if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), - level, baselbns[level], &blocksreleased)) != 0) - allerror = error; - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, - fs->fs_bsize, freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - blocksreleased += nblocks; - } - /* - * All direct blocks or frags. - */ - for (i = (NDADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_dblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - fs_pendingblocks += btodb(bsize); - blocksreleased += btodb(bsize); - } - } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); - /* * If we still have not finished background cleanup, then check * to see if the block count needs to be adjusted. 
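+ * + * Editor's note (an assumption): fb_chkcnt starts as the block count + * charged to the inode and each freework subtracts what it actually + * releases, so a nonzero residue here means some charged blocks were not + * released by this truncation and i_blocks must be corrected by the + * difference.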
*/ - if (freeblks->fb_chkcnt != blocksreleased && - (fs->fs_flags & FS_UNCLEAN) != 0 && + if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 && ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, - (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) - == 0) { + (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) { ip = VTOI(vp); - DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \ - freeblks->fb_chkcnt - blocksreleased); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt); ip->i_flag |= IN_CHANGE; vput(vp); } #ifdef INVARIANTS - if (freeblks->fb_chkcnt != blocksreleased && + if (freeblks->fb_chkcnt != 0 && ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) printf("handle_workitem_freeblocks: block count\n"); - if (allerror) - softdep_error("handle_workitem_freeblks", allerror); #endif /* INVARIANTS */ ACQUIRE_LOCK(&lk); + /* + * All of the freeblock deps must be complete prior to this call + * so it's now safe to complete earlier outstanding journal entries. + */ + handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); num_freeblkdep--; FREE_LOCK(&lk); @@ -2830,29 +5757,39 @@ handle_workitem_freeblocks(freeblks, flags) * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ -static int -indir_trunc(freeblks, dbn, level, lbn, countp) - struct freeblks *freeblks; +static void +indir_trunc(freework, dbn, lbn) + struct freework *freework; ufs2_daddr_t dbn; - int level; ufs_lbn_t lbn; - ufs2_daddr_t *countp; { + struct workhead wkhd; + struct jnewblk *jnewblk; + struct freeblks *freeblks; struct buf *bp; struct fs *fs; + struct worklist *wkn; struct worklist *wk; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1 = 0; - ufs2_daddr_t nb, *bap2 = 0; + ufs2_daddr_t nb, nnb, *bap2 = 0; ufs_lbn_t lbnadd; int i, nblocks, ufs1fmt; - int error, allerror = 0; int fs_pendingblocks; + int freedeps; + int level; + int cnt; + LIST_INIT(&wkhd); + level = lbn_level(lbn); + if (level == -1) + panic("indir_trunc: Invalid lbn %jd\n", lbn); + freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; + freedeps = 0; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); @@ -2877,13 +5814,14 @@ handle_workitem_freeblocks(freeblks, flags) ACQUIRE_LOCK(&lk); if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { if (wk->wk_type != D_INDIRDEP || - (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || - (indirdep->ir_state & GOINGAWAY) == 0) - panic("indir_trunc: lost indirdep"); - WORKLIST_REMOVE(wk); - WORKITEM_FREE(indirdep, D_INDIRDEP); + (wk->wk_state & GOINGAWAY) == 0) + panic("indir_trunc: lost indirdep %p", wk); + indirdep = WK_INDIRDEP(wk); + LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list); + free_indirdep(indirdep); if (!LIST_EMPTY(&bp->b_dep)) - panic("indir_trunc: dangling dep"); + panic("indir_trunc: dangling dep %p", + LIST_FIRST(&bp->b_dep)); ump->um_numindirdeps -= 1; FREE_LOCK(&lk); } else { @@ -2892,11 +5830,10 @@ handle_workitem_freeblocks(freeblks, flags) brelse(bp); #endif FREE_LOCK(&lk); - error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, - NOCRED, &bp); - if (error) { + if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, + NOCRED, &bp) != 0) { brelse(bp); - return (error); + return; } } /* @@ -2909,57 +5846,245 @@ handle_workitem_freeblocks(freeblks, flags) ufs1fmt = 0; bap2 = (ufs2_daddr_t *)bp->b_data; } - nblocks = btodb(fs->fs_bsize); - for (i = NINDIR(fs) - 1; i >= 0; i--) { - if 
(ufs1fmt) + /* + * Reclaim indirect blocks which never made it to disk. + */ + cnt = 0; + LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) { + struct workhead freewk; + if (wk->wk_type != D_JNEWBLK) + continue; + WORKLIST_REMOVE_UNLOCKED(wk); + LIST_INIT(&freewk); + WORKLIST_INSERT_UNLOCKED(&freewk, wk); + jnewblk = WK_JNEWBLK(wk); + if (jnewblk->jn_lbn > 0) + i = (jnewblk->jn_lbn - -lbn) / lbnadd; + else + i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd; + KASSERT(i >= 0 && i < NINDIR(fs), + ("indir_trunc: Index out of range %d parent %jd lbn %jd", + i, lbn, jnewblk->jn_lbn)); + /* Clear the pointer so it isn't found below. */ + if (ufs1fmt) { nb = bap1[i]; - else + bap1[i] = 0; + } else { nb = bap2[i]; + bap2[i] = 0; + } + KASSERT(nb == jnewblk->jn_blkno, + ("indir_trunc: Block mismatch %jd != %jd", + nb, jnewblk->jn_blkno)); + ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno, + fs->fs_bsize, freeblks->fb_previousinum, &freewk); + cnt++; + } + ACQUIRE_LOCK(&lk); + freework->fw_ref += NINDIR(fs) + 1; + /* Any remaining journal work can be completed with freeblks. */ + jwork_move(&freeblks->fb_jwork, &wkhd); + FREE_LOCK(&lk); + nblocks = btodb(fs->fs_bsize); + if (ufs1fmt) + nb = bap1[0]; + else + nb = bap2[0]; + /* + * Reclaim on disk blocks. + */ + for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { + if (i != NINDIR(fs) - 1) { + if (ufs1fmt) + nnb = bap1[i+1]; + else + nnb = bap2[i+1]; + } else + nnb = 0; if (nb == 0) continue; + cnt++; if (level != 0) { - if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), - level - 1, lbn + (i * lbnadd), countp)) != 0) - allerror = error; + struct freework *nfreework; + ufs_lbn_t nlbn; + + nlbn = (lbn + 1) - (i * lbnadd); + nfreework = newfreework(freeblks, freework, nlbn, nb, + fs->fs_frag, 0); + freedeps++; + indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); + } else { + struct freedep *freedep; + + /* + * Attempt to aggregate freedep dependencies for + * all blocks being released to the same CG. + */ + LIST_INIT(&wkhd); + if (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb))) { + freedep = newfreedep(freework); + WORKLIST_INSERT_UNLOCKED(&wkhd, + &freedep->fd_list); + freedeps++; + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, + fs->fs_bsize, freeblks->fb_previousinum, &wkhd); + fs_pendingblocks += nblocks; } - ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, - freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - *countp += nblocks; } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); + ACQUIRE_LOCK(&lk); + freework->fw_off = i; + if (level == 0) + fs_pendingblocks = (nblocks * cnt); + freework->fw_ref += freedeps; + freework->fw_ref -= NINDIR(fs) + 1; + if (freework->fw_ref != 0) + freework = NULL; + FREE_LOCK(&lk); + if (fs_pendingblocks) { + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= fs_pendingblocks; + FREE_LOCK(&lk); + UFS_LOCK(ump); + fs->fs_pendingblocks -= fs_pendingblocks; + UFS_UNLOCK(ump); + } bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); - return (allerror); + if (freework) + handle_workitem_indirblk(freework); + return; } /* - * Free an allocindir. - * This routine must be called with splbio interrupts blocked. + * Cancel an allocindir when it is removed via truncation. 
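+ * + * Editor's note (an assumption): unlike cancel_allocdirect(), an unwritten + * jnewblk is parked on the owning indirdep's ir_jwork here, so that + * indir_trunc() can pass it to ffs_blkfree() when the pointer block itself + * is reclaimed.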
*/ static void -free_allocindir(aip, inodedep) +cancel_allocindir(aip, inodedep, freeblks) struct allocindir *aip; struct inodedep *inodedep; + struct freeblks *freeblks; { - struct freefrag *freefrag; + struct newblk *newblk; - mtx_assert(&lk, MA_OWNED); - if ((aip->ai_state & DEPCOMPLETE) == 0) - LIST_REMOVE(aip, ai_deps); - if (aip->ai_state & ONWORKLIST) - WORKLIST_REMOVE(&aip->ai_list); + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the indirdep to be + * freed when indir_trunc() is called. If the journal has already + * been written we can simply reclaim the journal space when the + * freeblks work is complete. + */ LIST_REMOVE(aip, ai_next); + newblk = (struct newblk *)aip; + if (newblk->nb_jnewblk == NULL) + cancel_newblk(newblk, &freeblks->fb_jwork); + else + cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork); + if (inodedep && inodedep->id_state & DEPCOMPLETE) + WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list); + else + free_newblk(newblk); +} + +/* + * Create the mkdir dependencies for . and .. in a new directory. Link them + * into a newdirblk so any subsequent additions are tracked properly. The + * caller is responsible for adding the mkdir1 dependency to the journal + * and updating id_mkdiradd. This function returns with lk held. + */ +static struct mkdir * +setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) + struct diradd *dap; + ino_t newinum; + ino_t dinum; + struct buf *newdirbp; + struct mkdir **mkdirp; +{ + struct newblk *newblk; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct newdirblk *newdirblk = 0; + struct mkdir *mkdir1, *mkdir2; + struct worklist *wk; + struct jaddref *jaddref; + struct mount *mp; + + mp = dap->da_list.wk_mp; + newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, + M_SOFTDEP_FLAGS); + workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); + mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); + mkdir1->md_state = ATTACHED | MKDIR_BODY; + mkdir1->md_diradd = dap; + mkdir1->md_jaddref = NULL; + mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); + mkdir2->md_state = ATTACHED | MKDIR_PARENT; + mkdir2->md_diradd = dap; + mkdir2->md_jaddref = NULL; + if ((mp->mnt_flag & MNT_SUJ) == 0) { + mkdir1->md_state |= DEPCOMPLETE; + mkdir2->md_state |= DEPCOMPLETE; + } + /* + * Dependency on "." and ".." being written to disk. + */ + mkdir1->md_buf = newdirbp; + ACQUIRE_LOCK(&lk); + LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); + /* + * We must link the pagedep, allocdirect, and newdirblk for + * the initial file page so the pointer to the new directory + * is not written until the directory contents are live and + * any subsequent additions are not marked live until the + * block is reachable via the inode. 
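+ * + * Editor's sketch of the links established below (an assumption): + * + *	mkdir1 (MKDIR_BODY)   -> newdirbp holding "." and ".." + *	mkdir2 (MKDIR_PARENT) -> the dotdot jaddref when journaling, else + *	    the parent inodedep's bufwait list + *	newdirblk -> the pagedep (NEWBLOCK) and the first block's newblk, + *	    carrying mkdir1 on its db_mkdir list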
+ */ + if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0) + panic("setup_newdir: lost pagedep"); + LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) + if (wk->wk_type == D_ALLOCDIRECT) + break; + if (wk == NULL) + panic("setup_newdir: lost allocdirect"); + newblk = WK_NEWBLK(wk); + pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; + newdirblk->db_pagedep = pagedep; + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); + WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); + /* + * Look up the inodedep for the parent directory so that we + * can link mkdir2 into the pending dotdot jaddref or + * the inode write if there is none. If the inode is + * ALLCOMPLETE and no jaddref is present all dependencies have + * been satisfied and mkdir2 can be freed. + */ + inodedep_lookup(mp, dinum, 0, &inodedep); + if (mp->mnt_flag & MNT_SUJ) { if (inodedep == NULL) - add_to_worklist(&freefrag->ff_list); - else - WORKLIST_INSERT(&inodedep->id_bufwait, - &freefrag->ff_list); + panic("setup_newdir: Lost parent."); + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && + (jaddref->ja_state & MKDIR_PARENT), + ("setup_newdir: bad dotdot jaddref %p", jaddref)); + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + mkdir2->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir2; + } else if (inodedep == NULL || + (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + dap->da_state &= ~MKDIR_PARENT; + WORKITEM_FREE(mkdir2, D_MKDIR); + } else { + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } - WORKITEM_FREE(aip, D_ALLOCINDIR); + *mkdirp = mkdir2; + + return (mkdir1); } /* @@ -2998,12 +6123,14 @@ softdep_setup_directory_add(bp, dp, diroffset, new ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; - struct allocdirect *adp; + struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; struct mkdir *mkdir1, *mkdir2; + struct jaddref *jaddref; struct mount *mp; + int isindir; /* * Whiteouts have no dependencies. @@ -3013,6 +6140,8 @@ softdep_setup_directory_add(bp, dp, diroffset, new bdwrite(newdirbp); return (0); } + jaddref = NULL; + mkdir1 = mkdir2 = NULL; mp = UFSTOVFS(dp->i_ump); fs = dp->i_fs; lbn = lblkno(fs, diroffset); @@ -3023,111 +6152,123 @@ softdep_setup_directory_add(bp, dp, diroffset, new dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; - if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { + LIST_INIT(&dap->da_jwork); + isindir = bp->b_lblkno >= NDADDR; + if (isnewblk && + (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); } + /* + * If we're creating a new directory setup the dependencies and set + * the dap state to wait for them. Otherwise it's COMPLETE and + * we can move on. 
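+ * + * Editor's note (an assumption): MKDIR_BODY clears once the block holding + * "." and ".." is written and MKDIR_PARENT once the parent's link count + * is; the diradd moves to the pending list only when da_state reaches + * ALLCOMPLETE.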
+ */ if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; - mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, - M_SOFTDEP_FLAGS); - workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); - mkdir1->md_state = MKDIR_BODY; - mkdir1->md_diradd = dap; - mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, - M_SOFTDEP_FLAGS); - workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); - mkdir2->md_state = MKDIR_PARENT; - mkdir2->md_diradd = dap; - /* - * Dependency on "." and ".." being written to disk. - */ - mkdir1->md_buf = newdirbp; - ACQUIRE_LOCK(&lk); - LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); - WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); - FREE_LOCK(&lk); - bdwrite(newdirbp); - /* - * Dependency on link count increase for parent directory - */ - ACQUIRE_LOCK(&lk); - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0 - || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { - dap->da_state &= ~MKDIR_PARENT; - WORKITEM_FREE(mkdir2, D_MKDIR); - } else { - LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); - WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); - } + mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, + &mkdir2); } /* * Link into parent directory pagedep to await its being written. */ - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); +#ifdef DEBUG + if (diradd_lookup(pagedep, offset) != NULL) + panic("softdep_setup_directory_add: %p already at off %d\n", + diradd_lookup(pagedep, offset), offset); +#endif dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); /* - * Link into its inodedep. Put it on the id_bufwait list if the inode - * is not yet written. If it is written, do the post-inode write - * processing to put it on the id_pendinghd list. + * If we're journaling, link the diradd into the jaddref so it + * may be completed after the journal entry is written. Otherwise, + * link the diradd into its inodedep. If the inode is not yet + * written place it on the bufwait list, otherwise do the post-inode + * write processing to put it on the id_pendinghd list. */ - (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); - if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) + if (mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_add: bad jaddref %p", jaddref)); + jaddref->ja_diroff = diroffset; + jaddref->ja_diradd = dap; + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); - if (isnewblk) { + /* + * Add the journal entries for . and .. links now that the primary + * link is written. 
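+ * + * Editor's note (the resulting journal order, per the code below): the + * jaddref for the primary name was added above, then "..", then ".", + * since the write of the first directory block commits both of the + * latter links at once.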
+ */ + if (mkdir1 != NULL && mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, + inoreflst, if_deps); + KASSERT(jaddref != NULL && + jaddref->ja_ino == jaddref->ja_parent && + (jaddref->ja_state & MKDIR_BODY), + ("softdep_setup_directory_add: bad dot jaddref %p", + jaddref)); + mkdir1->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir1; /* - * Directories growing into indirect blocks are rare - * enough and the frequency of new block allocation - * in those cases even more rare, that we choose not - * to bother tracking them. Rather we simply force the - * new directory entry to disk. + * It is important that the dotdot journal entry + * is added prior to the dot entry since dot writes + * both the dot and dotdot links. These both must + * be added after the primary link for the journal + * to remain consistent. */ - if (lbn >= NDADDR) { - FREE_LOCK(&lk); - /* - * We only have a new allocation when at the - * beginning of a new block, not when we are - * expanding into an existing block. - */ - if (blkoff(fs, diroffset) == 0) - return (1); - return (0); - } + add_to_journal(&mkdir2->md_jaddref->ja_list); + add_to_journal(&jaddref->ja_list); + } + /* + * If we are adding a new directory remember this diradd so that if + * we rename it we can keep the dot and dotdot dependencies. If + * we are adding a new name for an inode that has a mkdiradd we + * must be in rename and we have to move the dot and dotdot + * dependencies to this new name. The old name is being orphaned + * soon. + */ + if (mkdir1 != NULL) { + if (inodedep->id_mkdiradd != NULL) + panic("softdep_setup_directory_add: Existing mkdir"); + inodedep->id_mkdiradd = dap; + } else if (inodedep->id_mkdiradd) + merge_diradd(inodedep, dap); + if (newdirblk) { /* - * We only have a new allocation when at the beginning - * of a new fragment, not when we are expanding into an - * existing fragment. Also, there is nothing to do if we - * are already tracking this block. + * There is nothing to do if we are already tracking + * this block. */ - if (fragoff(fs, diroffset) != 0) { - FREE_LOCK(&lk); - return (0); - } if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(&lk); return (0); } - /* - * Find our associated allocdirect and have it track us. - */ - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) - panic("softdep_setup_directory_add: lost inodedep"); - adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); - if (adp == NULL || adp->ad_lbn != lbn) + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) + == 0) panic("softdep_setup_directory_add: lost entry"); + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; - WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); + FREE_LOCK(&lk); + /* + * If we extended into an indirect signal direnter to sync. + */ + if (isindir) + return (1); + return (0); } FREE_LOCK(&lk); return (0); @@ -3141,7 +6282,8 @@ softdep_setup_directory_add(bp, dp, diroffset, new * occur while the move is in progress. */ void -softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) +softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) + struct buf *bp; /* Buffer holding directory block. 
*/ struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ @@ -3150,40 +6292,204 @@ void { int offset, oldoffset, newoffset; struct pagedep *pagedep; + struct jmvref *jmvref; struct diradd *dap; + struct direct *de; + struct mount *mp; ufs_lbn_t lbn; + int flags; - ACQUIRE_LOCK(&lk); + mp = UFSTOVFS(dp->i_ump); + de = (struct direct *)oldloc; + jmvref = NULL; + flags = 0; + /* + * Moves are always journaled as it would be too complex to + * determine if any affected adds or removes are present in the + * journal. + */ + if (mp->mnt_flag & MNT_SUJ) { + flags = DEPALLOC; + jmvref = newjmvref(dp, de->d_ino, + dp->i_offset + (oldloc - base), + dp->i_offset + (newloc - base)); + } lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) - goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); + ACQUIRE_LOCK(&lk); + if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) { + if (pagedep) + WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + goto done; + } + dap = diradd_lookup(pagedep, oldoffset); + if (dap) { + dap->da_offset = newoffset; + newoffset = DIRADDHASH(newoffset); + oldoffset = DIRADDHASH(oldoffset); + if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && + newoffset != oldoffset) { + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], + dap, da_pdlist); + } + } +done: + if (jmvref) { + jmvref->jm_pagedep = pagedep; + LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); + add_to_journal(&jmvref->jm_list); + } + bcopy(oldloc, newloc, entrysize); + FREE_LOCK(&lk); +} - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { - if (dap->da_offset != oldoffset) - continue; - dap->da_offset = newoffset; - if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) - break; +/* + * Move the mkdir dependencies and journal work from one diradd to another + * when renaming a directory. The new name must depend on the mkdir deps + * completing as the old name did. Directories can only have one valid link + * at a time so one must be canonical. + */ +static void +merge_diradd(inodedep, newdap) + struct inodedep *inodedep; + struct diradd *newdap; +{ + struct diradd *olddap; + struct mkdir *mkdir, *nextmd; + short state; + + olddap = inodedep->id_mkdiradd; + inodedep->id_mkdiradd = newdap; + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + newdap->da_state &= ~DEPCOMPLETE; + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { + nextmd = LIST_NEXT(mkdir, md_mkdirs); + if (mkdir->md_diradd != olddap) + continue; + mkdir->md_diradd = newdap; + state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); + newdap->da_state |= state; + olddap->da_state &= ~state; + if ((olddap->da_state & + (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; + } + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) + panic("merge_diradd: unfound ref"); + } + /* + * Any mkdir related journal items are not safe to be freed until + * the new name is stable. + */ + jwork_move(&newdap->da_jwork, &olddap->da_jwork); + olddap->da_state |= DEPCOMPLETE; + complete_diradd(olddap); +} + +/* + * Move the diradd to the pending list when all diradd dependencies are + * complete. 
+ */ +static void +complete_diradd(dap) + struct diradd *dap; +{ + struct pagedep *pagedep; + + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], - dap, da_pdlist); - break; + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } - if (dap == NULL) { +} - LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { - if (dap->da_offset == oldoffset) { - dap->da_offset = newoffset; - break; +/* + * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal + * add entries and conditionally journal the remove. + */ +static void +cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) + struct diradd *dap; + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct inoref *inoref; + struct mkdir *mkdir; + + /* + * If no remove references were allocated we're on a non-journaled + * filesystem and can skip the cancel step. + */ + if (jremref == NULL) { + free_diradd(dap, NULL); + return; + } + /* + * Cancel the primary name and free it if it does not require + * journaling. + */ + if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, + 0, &inodedep) != 0) { + /* Abort the addref that references this diradd. */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if (inoref->if_list.wk_type != D_JADDREF) + continue; + jaddref = (struct jaddref *)inoref; + if (jaddref->ja_diradd != dap) + continue; + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(jremref); + jremref = NULL; } + break; } } -done: - bcopy(oldloc, newloc, entrysize); - FREE_LOCK(&lk); + /* + * Cancel subordinate names and free them if they do not require + * journaling. + */ + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { + if (mkdir->md_diradd != dap) + continue; + if ((jaddref = mkdir->md_jaddref) == NULL) + continue; + mkdir->md_jaddref = NULL; + if (mkdir->md_state & MKDIR_PARENT) { + if (cancel_jaddref(jaddref, NULL, + &dirrem->dm_jwork) == 0) { + free_jremref(dotdotremref); + dotdotremref = NULL; + } + } else { + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(dotremref); + dotremref = NULL; + } + } + } + } + + if (jremref) + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); + jwork_move(&dirrem->dm_jwork, &dap->da_jwork); + free_diradd(dap, &dirrem->dm_jwork); } /* @@ -3191,8 +6497,9 @@ void * with splbio interrupts blocked.
*/ static void -free_diradd(dap) +free_diradd(dap, wkhd) struct diradd *dap; + struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; @@ -3200,32 +6507,48 @@ static void struct mkdir *mkdir, *nextmd; mtx_assert(&lk, MA_OWNED); - WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); + if (dap->da_state & ONWORKLIST) + WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + dirrem->dm_state |= COMPLETE; + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) - (void) free_inodedep(inodedep); + if (inodedep->id_mkdiradd == dap) + inodedep->id_mkdiradd = NULL; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; - dap->da_state &= ~mkdir->md_state; - WORKLIST_REMOVE(&mkdir->md_list); + dap->da_state &= + ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + if (mkdir->md_jaddref != NULL) + panic("free_diradd: Unexpected jaddref"); WORKITEM_FREE(mkdir, D_MKDIR); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } + if (inodedep) + free_inodedep(inodedep); + /* + * Free any journal segments waiting for the directory write. + */ + handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } @@ -3254,11 +6577,24 @@ softdep_setup_remove(bp, dp, ip, isrmdir) int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; + struct inodedep *inodedep; + int direct; /* - * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. + * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want + * newdirrem() to set up the full directory remove which requires + * isrmdir > 1. */ - dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); + dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem); + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. + */ + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) + panic("softdep_setup_remove: Lost inodedep."); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active @@ -3280,12 +6616,148 @@ softdep_setup_remove(bp, dp, ip, isrmdir) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; + direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(&lk); - handle_workitem_remove(dirrem, NULL); + if (direct) + handle_workitem_remove(dirrem, NULL); } } /* + * Check for an entry matching 'offset' on both the pd_diraddhd list and the + * pd_pendinghd list of a pagedep. + */ +static struct diradd * +diradd_lookup(pagedep, offset) + struct pagedep *pagedep; + int offset; +{ + struct diradd *dap; + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) + if (dap->da_offset == offset) + return (dap); + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) + if (dap->da_offset == offset) + return (dap); + return (NULL); } + +/* + * Search for a ..
diradd dependency in a directory that is being removed. + * If the directory was renamed to a new parent we have a diradd rather + * than a mkdir for the .. entry. We need to cancel it now before + * it is found in truncate(). + */ +static struct jremref * +cancel_diradd_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct pagedep *pagedep; + struct diradd *dap; + struct worklist *wk; + + if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0, + &pagedep) == 0) + return (jremref); + dap = diradd_lookup(pagedep, DOTDOT_OFFSET); + if (dap == NULL) + return (jremref); + cancel_diradd(dap, dirrem, jremref, NULL, NULL); + /* + * Mark any journal work as belonging to the parent so it is freed + * with the .. reference. + */ + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + wk->wk_state |= MKDIR_PARENT; + return (NULL); +} + +/* + * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to + * replace it with a dirrem/diradd pair as a result of re-parenting a + * directory. This ensures that we don't simultaneously have a mkdir and + * a diradd for the same .. entry. + */ +static struct jremref * +cancel_mkdir_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct mkdir *mkdir; + struct diradd *dap; + + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) + panic("cancel_mkdir_dotdot: Lost inodedep"); + dap = inodedep->id_mkdiradd; + if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) + return (jremref); + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; + mkdir = LIST_NEXT(mkdir, md_mkdirs)) + if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) + break; + if (mkdir == NULL) + panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); + if ((jaddref = mkdir->md_jaddref) != NULL) { + mkdir->md_jaddref = NULL; + if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, + &inodedep) == 0) + panic("cancel_mkdir_dotdot: Lost parent inodedep"); + if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { + journal_jremref(dirrem, jremref, inodedep); + jremref = NULL; + } + } + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + mkdir->md_state |= ALLCOMPLETE; + complete_mkdir(mkdir); + return (jremref); +} + +static void +journal_jremref(dirrem, jremref, inodedep) + struct dirrem *dirrem; + struct jremref *jremref; + struct inodedep *inodedep; +{ + + if (inodedep == NULL) + if (inodedep_lookup(jremref->jr_list.wk_mp, + jremref->jr_ref.if_ino, 0, &inodedep) == 0) + panic("journal_jremref: Lost inodedep"); + LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); + add_to_journal(&jremref->jr_list); +} + +static void +dirrem_journal(dirrem, jremref, dotremref, dotdotremref) + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + + + if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, + &inodedep) == 0) + panic("dirrem_journal: Lost inodedep"); + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); +} + +/* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. 
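 *
 * Editor's sketch, not part of this change: the link counts recorded
 * by the newjremref() calls in newdirrem() below.  Because the caller
 * has already decremented i_effnlink, each journal record names the
 * pre-remove count: effnlink + 1 for a plain remove, effnlink + 2 for
 * the primary name of an rmdir.  The helper below is hypothetical and
 * standalone.
 */
#if 0
static int
jremref_primary_nlink(int effnlink, int isrmdir)
{
	/* The extra link for a directory covers its own "." entry. */
	return (effnlink + 1 + (isrmdir ? 1 : 0));
}
#endif
/*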
*/ @@ -3303,12 +6775,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; + struct vnode *dvp; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); + dvp = ITOV(dp); /* * If we are over our limit, try to improve the situation. * Limiting the number of dirrem structures will also limit @@ -3321,34 +6798,75 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) FREE_LOCK(&lk); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount); + workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); + LIST_INIT(&dirrem->dm_jremrefhd); + LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; - + /* + * Allocate remove reference structures to track journal write + * dependencies. We will always have one for the link and + * when doing directories we will always have one more for dot. + * When renaming a directory we skip the dotdot link change so + * this is not needed. + */ + jremref = dotremref = dotdotremref = NULL; + if (DOINGSUJ(dvp)) { + if (isrmdir) { + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 2); + dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, + ip->i_effnlink + 1); + } else + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 1); + if (isrmdir > 1) { + dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, + dp->i_effnlink + 1); + dotdotremref->jr_state |= MKDIR_PARENT; + } + } ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* + * If we're renaming a .. link to a new directory, cancel any + * existing MKDIR_PARENT mkdir. If it has already been canceled + * the jremref is preserved for any potential diradd in this + * location. This can not coincide with a rmdir. + */ + if (dp->i_offset == DOTDOT_OFFSET) { + if (isrmdir) + panic("newdirrem: .. directory change during remove?"); + jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); + } + /* + * If we're removing a directory search for the .. dependency now and + * cancel it. Any pending journal work will be added to the dirrem + * to be completed when the workitem remove completes. + */ + if (isrmdir > 1) + dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); + /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can - * be de-allocated. Check for an entry on both the pd_dirraddhd - * list and the pd_pendinghd list. + * be de-allocated. */ - - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) - if (dap->da_offset == offset) - break; + dap = diradd_lookup(pagedep, offset); if (dap == NULL) { - - LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) - if (dap->da_offset == offset) - break; - if (dap == NULL) - return (dirrem); + /* + * Link the jremref structures into the dirrem so they are + * written prior to the pagedep. + */ + if (jremref) + dirrem_journal(dirrem, jremref, dotremref, + dotdotremref); + return (dirrem); } /* * Must be ATTACHED at this point. 
@@ -3373,7 +6891,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; - free_diradd(dap); + cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); +#ifdef SUJ_DEBUG + if (isrmdir == 0) { + struct worklist *wk; + + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) + panic("bad wk %p (0x%X)\n", wk, wk->wk_state); + } +#endif + return (dirrem); } @@ -3407,6 +6935,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; + struct jaddref *jaddref; struct mount *mp; offset = blkoff(dp->i_fs, dp->i_offset); @@ -3422,6 +6951,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; + LIST_INIT(&dap->da_jwork); } /* @@ -3454,11 +6984,21 @@ softdep_setup_directory_change(bp, dp, ip, newinum dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } FREE_LOCK(&lk); return; } + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. A valid nlinkdelta ensures that this lookup + * will not fail. + */ + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) + panic("softdep_setup_directory_change: Lost inodedep."); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active @@ -3483,15 +7023,29 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } /* - * Link into its inodedep. Put it on the id_bufwait list if the inode + * Look up the jaddref for this journal entry. We must finish + * initializing it and make the diradd write dependent on it. + * If we're not journaling, put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ - if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 || - (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); + if (mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_change: bad jaddref %p", + jaddref)); + jaddref->ja_diroff = dp->i_offset; + jaddref->ja_diradd = dap; + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], + dap, da_pdlist); + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); @@ -3500,6 +7054,13 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } + /* + * If we're making a new name for a directory that has not been + * committed we need to move the dot and dotdot references to + * this new name.
+ */ + if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) + merge_diradd(inodedep, dap); FREE_LOCK(&lk); } @@ -3516,8 +7077,7 @@ softdep_change_linkcnt(ip) struct inodedep *inodedep; ACQUIRE_LOCK(&lk); - (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, - DEPALLOC, &inodedep); + inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; @@ -3574,6 +7134,304 @@ softdep_releasefile(ip) } /* + * Attach a sbdep dependency to the superblock buf so that we can keep + * track of the head of the linked list of referenced but unlinked inodes. + */ +void +softdep_setup_sbupdate(ump, fs, bp) + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; +{ + struct sbdep *sbdep; + struct worklist *wk; + + if ((fs->fs_flags & FS_SUJ) == 0) + return; + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_SBDEP) + break; + if (wk != NULL) + return; + sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); + sbdep->sb_fs = fs; + sbdep->sb_ump = ump; + ACQUIRE_LOCK(&lk); + WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); + FREE_LOCK(&lk); +} + +/* + * Return the first unlinked inodedep which is ready to be the head of the + * list. The inodedep and all those after it must have valid next pointers. + */ +static struct inodedep * +first_unlinked_inodedep(ump) + struct ufsmount *ump; +{ + struct inodedep *inodedep; + struct inodedep *idp; + + for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); + inodedep; inodedep = idp) { + if ((inodedep->id_state & UNLINKNEXT) == 0) + return (NULL); + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) + break; + if ((inodedep->id_state & UNLINKPREV) == 0) + panic("first_unlinked_inodedep: prev != next"); + } + if (inodedep == NULL) + return (NULL); + + return (inodedep); +} + +/* + * Set the sujfree unlinked head pointer prior to writing a superblock. + */ +static void +initiate_write_sbdep(sbdep) + struct sbdep *sbdep; +{ + struct inodedep *inodedep; + struct fs *bpfs; + struct fs *fs; + + bpfs = sbdep->sb_fs; + fs = sbdep->sb_ump->um_fs; + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if (inodedep) { + fs->fs_sujfree = inodedep->id_ino; + inodedep->id_state |= UNLINKPREV; + } else + fs->fs_sujfree = 0; + bpfs->fs_sujfree = fs->fs_sujfree; +} + +/* + * After a superblock is written, determine whether it must be written again + * due to a changing unlinked list head. + */ +static int +handle_written_sbdep(sbdep, bp) + struct sbdep *sbdep; + struct buf *bp; +{ + struct inodedep *inodedep; + struct mount *mp; + struct fs *fs; + + fs = sbdep->sb_fs; + mp = UFSTOVFS(sbdep->sb_ump); + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || + (inodedep == NULL && fs->fs_sujfree != 0)) { + bdirty(bp); + return (1); + } + WORKITEM_FREE(sbdep, D_SBDEP); + if (fs->fs_sujfree == 0) + return (0); + if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) + panic("handle_written_sbdep: lost inodedep"); + /* + * Now that we have a record of this inode in stable store we can + * discard any pending work.
+ */ + for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { + if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) + panic("handle_written_sbdep: Bad inodedep %p (0x%X)", + inodedep, inodedep->id_state); + if (handle_bufwait(inodedep, NULL) != NULL) + panic("handle_written_sbdep: freefile on " + "unlinked inodedep"); + } + + return (0); +} + +/* + * Mark an inodedep as unlinked and insert it into the in-memory unlinked + * list. + */ +static void +unlinked_inodedep(mp, inodedep) + struct mount *mp; + struct inodedep *inodedep; +{ + struct ufsmount *ump; + + if ((mp->mnt_flag & MNT_SUJ) == 0) + return; + ump = VFSTOUFS(mp); + ump->um_fs->fs_fmod = 1; + inodedep->id_state |= UNLINKED; + TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); +} + +/* + * Remove an inodedep from the unlinked inodedep list. This may require + * disk writes if the inode has made it that far. + */ +static void +clear_unlinked_inodedep(inodedep) + struct inodedep *inodedep; +{ + struct ufsmount *ump; + struct inodedep *idp; + struct inodedep *idn; + struct fs *fs; + struct buf *bp; + ino_t ino; + ino_t nino; + ino_t pino; + int error; + + ump = VFSTOUFS(inodedep->id_list.wk_mp); + fs = ump->um_fs; + ino = inodedep->id_ino; + error = 0; + for (;;) { + /* + * If nothing has yet been written simply remove us from + * the in memory list and return. This is the most common + * case where handle_workitem_remove() loses the final + * reference. + */ + if ((inodedep->id_state & UNLINKLINKS) == 0) + break; + /* + * If we have a NEXT pointer and no PREV pointer we can simply + * clear NEXT's PREV and remove ourselves from the list. Be + * careful not to clear PREV if the superblock points at + * next as well. + */ + idn = TAILQ_NEXT(inodedep, id_unlinked); + if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { + if (idn && fs->fs_sujfree != idn->id_ino) + idn->id_state &= ~UNLINKPREV; + break; + } + /* + * Here we have an inodedep which is actually linked into + * the list. We must remove it by forcing a write to the + * link before us, whether it be the superblock or an inode. + * Unfortunately the list may change while we're waiting + * on the buf lock for either resource so we must loop until + * we lock the right one. If both the superblock and an + * inode point to this inode we must clear the inode first + * followed by the superblock. + */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + pino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + pino = idp->id_ino; + FREE_LOCK(&lk); + if (pino == 0) + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + else + error = bread(ump->um_devvp, + fsbtodb(fs, ino_to_fsba(fs, pino)), + (int)fs->fs_bsize, NOCRED, &bp); + ACQUIRE_LOCK(&lk); + if (error) + break; + /* If the list has changed restart the loop. */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + nino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + nino = idp->id_ino; + if (nino != pino || + (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); + continue; + } + /* + * Remove us from the in memory list. After this we cannot + * access the inodedep. + */ + idn = TAILQ_NEXT(inodedep, id_unlinked); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + /* + * Determine the next inode number.
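 *
 * Editor's sketch, not part of this change: the on-disk shape of the
 * unlinked-inode list manipulated here.  fs_sujfree names the head,
 * each inode's di_freelink names its successor, and 0 terminates the
 * chain.  The types and helper are toy stand-ins.
 */
#if 0
#include <stdint.h>

struct toy_dinode {
	uint32_t di_freelink;	/* inode number of next unlinked inode */
};

static uint32_t
toy_next_unlinked(const struct toy_dinode *itab, uint32_t sujfree,
    uint32_t ino)
{
	/* Inode 0 is never allocated, so 0 doubles as the terminator. */
	return (ino == 0 ? sujfree : itab[ino].di_freelink);
}
#endif
/*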
+ */ + nino = 0; + if (idn) { + /* + * If next isn't on the list we can just clear prev's + * state and schedule it to be fixed later. No need + * to synchronously write if we're not in the real + * list. + */ + if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { + idp->id_state &= ~UNLINKNEXT; + if ((idp->id_state & ONWORKLIST) == 0) + WORKLIST_INSERT(&bp->b_dep, + &idp->id_list); + FREE_LOCK(&lk); + bawrite(bp); + ACQUIRE_LOCK(&lk); + return; + } + nino = idn->id_ino; + } + FREE_LOCK(&lk); + /* + * The predecessor's next pointer is manually updated here + * so that the NEXT flag is never cleared for an element + * that is in the list. + */ + if (pino == 0) { + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + } else if (fs->fs_magic == FS_UFS1_MAGIC) + ((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + else + ((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + /* + * If the bwrite fails we have no recourse to recover. The + * filesystem is corrupted already. + */ + bwrite(bp); + ACQUIRE_LOCK(&lk); + /* + * If the superblock pointer still needs to be cleared force + * a write here. + */ + if (fs->fs_sujfree == ino) { + FREE_LOCK(&lk); + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + bwrite(bp); + ACQUIRE_LOCK(&lk); + } + if (fs->fs_sujfree != ino) + return; + panic("clear_unlinked_inodedep: Failed to clear free head"); + } + if (inodedep->id_ino == fs->fs_sujfree) + panic("clear_unlinked_inodedep: Freeing head of free list"); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + return; +} + +/* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ @@ -3584,23 +7442,55 @@ handle_workitem_remove(dirrem, xp) { struct thread *td = curthread; struct inodedep *inodedep; + struct workhead dotdotwk; + struct worklist *wk; + struct ufsmount *ump; + struct mount *mp; struct vnode *vp; struct inode *ip; ino_t oldinum; int error; + if (dirrem->dm_state & ONWORKLIST) + panic("handle_workitem_remove: dirrem %p still on worklist", + dirrem); + oldinum = dirrem->dm_oldinum; + mp = dirrem->dm_list.wk_mp; + ump = VFSTOUFS(mp); if ((vp = xp) == NULL && - (error = ffs_vgetf(dirrem->dm_list.wk_mp, - dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { + (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ)) != 0) { softdep_error("handle_workitem_remove: vget", error); return; } ip = VTOI(vp); ACQUIRE_LOCK(&lk); - if ((inodedep_lookup(dirrem->dm_list.wk_mp, - dirrem->dm_oldinum, 0, &inodedep)) == 0) + if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); + if (dirrem->dm_state & ONDEPLIST) + LIST_REMOVE(dirrem, dm_inonext); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_workitem_remove: Journal entries not written.")); + /* + * Move all dependencies waiting on the remove to complete + * from the dirrem to the inode inowait list to be completed + * after the inode has been updated and written to disk. Any + * marked MKDIR_PARENT are saved to be completed when the .. ref + * is removed. 
+ */ + LIST_INIT(&dotdotwk); + while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { + WORKLIST_REMOVE(wk); + if (wk->wk_state & MKDIR_PARENT) { + wk->wk_state &= ~MKDIR_PARENT; + WORKLIST_INSERT(&dotdotwk, wk); + continue; + } + WORKLIST_INSERT(&inodedep->id_inowait, wk); + } + LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); + /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { @@ -3609,12 +7499,16 @@ handle_workitem_remove(dirrem, xp) ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; num_dirrem -= 1; + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: worklist not empty. %s", + TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); - vput(vp); - return; + goto out; } /* * Directory deletion. Decrement reference count for both the @@ -3628,6 +7522,8 @@ handle_workitem_remove(dirrem, xp) ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(&lk); if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0) @@ -3639,36 +7535,47 @@ handle_workitem_remove(dirrem, xp) * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: DIRCHG and worklist not empty.")); num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); - vput(vp); - return; + goto out; } + dirrem->dm_state = ONDEPLIST; + dirrem->dm_oldinum = dirrem->dm_dirinum; /* - * If the inodedep does not exist, then the zero'ed inode has - * been written to disk. If the allocated inode has never been - * written to disk, then the on-disk inode is zero'ed. In either - * case we can remove the file immediately. + * Place the dirrem on the parent's diremhd list. */ - dirrem->dm_state = 0; - oldinum = dirrem->dm_oldinum; - dirrem->dm_oldinum = dirrem->dm_dirinum; - if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum, - 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { + if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) + panic("handle_workitem_remove: lost dir inodedep"); + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); + /* + * If the allocated inode has never been written to disk, then + * the on-disk inode is zero'ed and we can remove the file + * immediately. When journaling if the inode has been marked + * unlinked and not DEPCOMPLETE we know it can never be written. 
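 *
 * Editor's sketch, not part of this change: the "never reached disk"
 * test applied just below, restated over plain flag bits.  The bit
 * values are illustrative, not the kernel's.
 */
#if 0
#define T_DEPCOMPLETE	0x0001
#define T_UNLINKED	0x0002

static int
toy_inode_never_written(int have_inodedep, int state)
{
	/* No inodedep: the zeroed inode already reached the disk.
	 * UNLINKED without DEPCOMPLETE: the bitmap write that would
	 * make the inode visible never happened. */
	return (have_inodedep == 0 ||
	    (state & (T_DEPCOMPLETE | T_UNLINKED)) == T_UNLINKED);
}
#endif
/*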
+ */ + inodedep_lookup(mp, oldinum, 0, &inodedep); + if (inodedep == NULL || + (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || + check_inode_unwritten(inodedep)) { if (xp != NULL) - add_to_worklist(&dirrem->dm_list); + add_to_worklist(&dirrem->dm_list, 0); FREE_LOCK(&lk); - vput(vp); - if (xp == NULL) + if (xp == NULL) { + vput(vp); handle_workitem_remove(dirrem, NULL); + } return; } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); ip->i_flag |= IN_CHANGE; +out: ffs_update(vp, 0); - vput(vp); + if (xp == NULL) + vput(vp); } /* @@ -3689,6 +7596,7 @@ static void handle_workitem_freefile(freefile) struct freefile *freefile; { + struct workhead wkhd; struct fs *fs; struct inodedep *idp; struct ufsmount *ump; @@ -3701,13 +7609,15 @@ handle_workitem_freefile(freefile) error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(&lk); if (error) - panic("handle_workitem_freefile: inodedep survived"); + panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); + LIST_INIT(&wkhd); + LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, - freefile->fx_oldinum, freefile->fx_mode)) != 0) + freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefile, D_FREEFILE); @@ -3757,8 +7667,10 @@ softdep_disk_io_initiation(bp) { struct worklist *wk; struct worklist marker; - struct indirdep *indirdep; struct inodedep *inodedep; + struct freeblks *freeblks; + struct jfreeblk *jfreeblk; + struct newblk *newblk; /* * We only care about write operations. There should never @@ -3767,6 +7679,10 @@ softdep_disk_io_initiation(bp) if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("softdep_disk_io_initiation: Writing buffer with " + "background write in progress: %p", bp); + marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ @@ -3792,46 +7708,58 @@ softdep_disk_io_initiation(bp) continue; case D_INDIRDEP: - indirdep = WK_INDIRDEP(wk); - if (indirdep->ir_state & GOINGAWAY) - panic("disk_io_initiation: indirdep gone"); + initiate_write_indirdep(WK_INDIRDEP(wk), bp); + continue; + + case D_BMSAFEMAP: + initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); + continue; + + case D_JSEG: + WK_JSEG(wk)->js_buf = NULL; + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd); /* - * If there are no remaining dependencies, this - * will be writing the real pointers, so the - * dependency can be freed. + * We have to wait for the jfreeblks to be journaled + * before we can write an inodeblock with updated + * pointers. Be careful to arrange the marker so + * we revisit the jfreeblk if it's not removed by + * the first jwait(). 
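 *
 * Editor's sketch, not part of this change: the marker pattern used by
 * the D_FREEBLKS and D_ALLOCDIRECT/D_ALLOCINDIR cases below, standalone
 * over <sys/queue.h>; the item type is illustrative.
 */
#if 0
#include <sys/queue.h>

struct titem {
	LIST_ENTRY(titem) t_list;
};

static void
toy_mark_for_revisit(struct titem *marker, struct titem *wk)
{
	/* Park the marker just before the item we are about to sleep
	 * on; the scan resumes at LIST_NEXT(marker) and so sees the
	 * item again if the journal wait did not retire it. */
	LIST_REMOVE(marker, t_list);
	LIST_INSERT_BEFORE(wk, marker, t_list);
}
#endif
/*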
*/ - if (LIST_EMPTY(&indirdep->ir_deplisthd)) { - struct buf *bp; - - bp = indirdep->ir_savebp; - bp->b_flags |= B_INVAL | B_NOCACHE; - /* inline expand WORKLIST_REMOVE(wk); */ - wk->wk_state &= ~ONWORKLIST; - LIST_REMOVE(wk, wk_list); - WORKITEM_FREE(indirdep, D_INDIRDEP); - FREE_LOCK(&lk); - brelse(bp); - ACQUIRE_LOCK(&lk); - continue; + if (jfreeblk != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&jfreeblk->jf_list); } + continue; + case D_ALLOCDIRECT: + case D_ALLOCINDIR: /* - * Replace up-to-date version with safe version. + * We have to wait for the jnewblk to be journaled + * before we can write to a block otherwise the + * contents may be confused with an earlier file + * at recovery time. Handle the marker as described + * above. */ - FREE_LOCK(&lk); - indirdep->ir_saveddata = malloc(bp->b_bcount, - M_INDIRDEP, M_SOFTDEP_FLAGS); - ACQUIRE_LOCK(&lk); - indirdep->ir_state &= ~ATTACHED; - indirdep->ir_state |= UNDONE; - bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); - bcopy(indirdep->ir_savebp->b_data, bp->b_data, - bp->b_bcount); + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&newblk->nb_jnewblk->jn_list); + } continue; + case D_SBDEP: + initiate_write_sbdep(WK_SBDEP(wk)); + continue; + case D_MKDIR: - case D_BMSAFEMAP: - case D_ALLOCDIRECT: - case D_ALLOCINDIR: + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: continue; default: @@ -3855,6 +7783,9 @@ initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { + struct jremref *jremref; + struct jmvref *jmvref; + struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; @@ -3869,6 +7800,18 @@ initiate_write_filepage(pagedep, bp) return; } pagedep->pd_state |= IOSTARTED; + /* + * Wait for all journal remove dependencies to hit the disk. + * We can not allow any potentially conflicting directory adds + * to be visible before removes and rollback is too difficult. + * lk may be dropped and re-acquired, however we hold the buf + * locked so the dependency can not go away. + */ + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) + while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) + jwait(&jremref->jr_list); + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) + jwait(&jmvref->jm_list); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) @@ -3905,6 +7848,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; + struct inoref *inoref; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS @@ -3918,7 +7862,21 @@ initiate_write_inodeblock_ufs1(inodedep, bp) fs = inodedep->id_fs; dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); + /* + * If we're on the unlinked list but have not yet written our + * next pointer initialize it here. + */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if (inon) + dp->di_freelink = inon->id_ino; + else + dp->di_freelink = 0; + } + /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. 
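 *
 * Editor's sketch, not part of this change: the rollback performed
 * just below when the inode bitmap is still unwritten -- publish a
 * zeroed dinode while keeping the real copy, preserving only the
 * fields recovery depends on.  The struct is a toy stand-in for the
 * ufs1 dinode.
 */
#if 0
#include <stdint.h>
#include <string.h>

struct toy_dinode {
	uint32_t di_gen;	/* generation number */
	uint32_t di_freelink;	/* unlinked-list successor */
	/* ... remaining fields elided ... */
};

static void
toy_rollback_dinode(struct toy_dinode *ondisk, struct toy_dinode *saved)
{
	*saved = *ondisk;			/* keep the real copy */
	memset(ondisk, 0, sizeof(*ondisk));	/* zeros go to disk */
	ondisk->di_gen = saved->di_gen;		/* generation survives */
	ondisk->di_freelink = saved->di_freelink; /* so does the chain */
}
#endif
/*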
*/ @@ -3933,6 +7891,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; + dp->di_freelink = inodedep->id_savedino1->di_freelink; return; } /* @@ -3940,32 +7899,40 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; - if (TAILQ_EMPTY(&inodedep->id_inoupdt)) + inodedep->id_savednlink = dp->di_nlink; + if (TAILQ_EMPTY(&inodedep->id_inoupdt) && + TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* + * Revert the link count to that of the first unwritten journal entry. + */ + inoref = TAILQ_FIRST(&inodedep->id_inoreflst); + if (inoref) + dp->di_nlink = inoref->if_nlink; + /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn - NDADDR, - dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -3981,14 +7948,14 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4012,8 +7979,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4030,7 +7997,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * postpone fsck, we are stuck with this argument. 
*/ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_lbn - NDADDR] = 0; + dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* @@ -4051,6 +8018,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp) struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; + struct inoref *inoref; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS @@ -4064,7 +8032,21 @@ initiate_write_inodeblock_ufs2(inodedep, bp) fs = inodedep->id_fs; dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); + /* + * If we're on the unlinked list but have not yet written our + * next pointer initialize it here. + */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if (inon) + dp->di_freelink = inon->id_ino; + else + dp->di_freelink = 0; + } + + /* + * If the bitmap is not yet written, then the allocated + * inode cannot be written to disk. */ @@ -4079,6 +8061,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp) *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; + dp->di_freelink = inodedep->id_savedino2->di_freelink; return; } /* @@ -4086,25 +8069,38 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; + inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && - TAILQ_EMPTY(&inodedep->id_extupdt)) + TAILQ_EMPTY(&inodedep->id_extupdt) && + TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* + * Revert the link count to that of the first unwritten journal entry. + * + * XXX What if it is canceled? Could entries after it be expired + * before we remove this? Thus leaving us with an incorrect link on + * disk with no journal entries to cover it? + */ + inoref = TAILQ_FIRST(&inodedep->id_inoreflst); + if (inoref) + dp->di_nlink = inoref->if_nlink; + + /* + * Set the ext data dependencies to busy.
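 *
 * Editor's sketch, not part of this change: the INVARIANTS bookkeeping
 * idiom used by the loops below.  Each dependency's block offset sets
 * a bit in 'deplist' so the rollback loops can assert that every
 * pointer they touch is covered by a dependency.  The helper is
 * hypothetical.
 */
#if 0
static int
toy_record_dep(long *deplist, int offset)
{
	if (*deplist & (1L << offset))
		return (-1);	/* duplicate offset: ordering bug */
	*deplist |= 1L << offset;
	return (0);
}
#endif
/*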
*/ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4120,12 +8116,12 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NXADDR; i++) { + dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4142,8 +8138,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; @@ -4154,24 +8150,24 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", - (intmax_t)adp->ad_lbn - NDADDR, - (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4187,14 +8183,14 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = 
TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); @@ -4218,8 +8214,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4236,16 +8232,363 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_lbn - NDADDR] = 0; + dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* + * Cancel an indirdep as a result of truncation. Release all of the + * children allocindirs and place their journal work on the appropriate + * list. + */ +static void +cancel_indirdep(indirdep, bp, inodedep, freeblks) + struct indirdep *indirdep; + struct buf *bp; + struct inodedep *inodedep; + struct freeblks *freeblks; +{ + struct allocindir *aip; + + /* + * None of the indirect pointers will ever be visible, + * so they can simply be tossed. GOINGAWAY ensures + * that allocated pointers will be saved in the buffer + * cache until they are freed. Note that they will + * only be able to be found by their physical address + * since the inode mapping the logical address will + * be gone. The save buffer used for the safe copy + * was allocated in setup_allocindir_phase2 using + * the physical address so it could be used for this + * purpose. Hence we swap the safe copy with the real + * copy, allowing the safe copy to be freed and holding + * on to the real copy for later use in indir_trunc. + */ + if (indirdep->ir_state & GOINGAWAY) + panic("cancel_indirdep: already gone"); + if (indirdep->ir_state & ONDEPLIST) { + indirdep->ir_state &= ~ONDEPLIST; + LIST_REMOVE(indirdep, ir_next); + } + indirdep->ir_state |= GOINGAWAY; + VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; + while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); + WORKLIST_REMOVE(&indirdep->ir_list); + WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); + indirdep->ir_savebp = NULL; +} + +/* + * Free an indirdep once it no longer has new pointers to track. 
+ */ +static void +free_indirdep(indirdep) + struct indirdep *indirdep; +{ + + KASSERT(LIST_EMPTY(&indirdep->ir_jwork), + ("free_indirdep: Journal work not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_completehd), + ("free_indirdep: Complete head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_writehd), + ("free_indirdep: write head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_donehd), + ("free_indirdep: done head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), + ("free_indirdep: deplist head not empty.")); + KASSERT(indirdep->ir_savebp == NULL, + ("free_indirdep: %p ir_savebp != NULL", indirdep)); + KASSERT((indirdep->ir_state & ONDEPLIST) == 0, + ("free_indirdep: %p still on deplist.", indirdep)); + if (indirdep->ir_state & ONWORKLIST) + WORKLIST_REMOVE(&indirdep->ir_list); + WORKITEM_FREE(indirdep, D_INDIRDEP); +} + +/* + * Called before a write to an indirdep. This routine is responsible for + * rolling back pointers to a safe state which includes only those + * allocindirs which have been completed. + */ +static void +initiate_write_indirdep(indirdep, bp) + struct indirdep *indirdep; + struct buf *bp; +{ + + if (indirdep->ir_state & GOINGAWAY) + panic("disk_io_initiation: indirdep gone"); + + /* + * If there are no remaining dependencies, this will be writing + * the real pointers. + */ + if (LIST_EMPTY(&indirdep->ir_deplisthd)) + return; + /* + * Replace up-to-date version with safe version. + */ + FREE_LOCK(&lk); + indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, + M_SOFTDEP_FLAGS); + ACQUIRE_LOCK(&lk); + indirdep->ir_state &= ~ATTACHED; + indirdep->ir_state |= UNDONE; + bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); + bcopy(indirdep->ir_savebp->b_data, bp->b_data, + bp->b_bcount); +} + +/* + * Called when an inode has been cleared in a cg bitmap. This finally + * eliminates any canceled jaddrefs + */ +void +softdep_setup_inofree(mp, bp, ino, wkhd) + struct mount *mp; + struct buf *bp; + ino_t ino; + struct workhead *wkhd; +{ + struct worklist *wk, *wkn; + struct inodedep *inodedep; + uint8_t *inosused; + struct cg *cgp; + struct fs *fs; + + ACQUIRE_LOCK(&lk); + fs = VFSTOUFS(mp)->um_fs; + cgp = (struct cg *)bp->b_data; + inosused = cg_inosused(cgp); + if (isset(inosused, ino % fs->fs_ipg)) + panic("softdep_setup_inofree: inode %d not freed.", ino); + if (inodedep_lookup(mp, ino, 0, &inodedep)) + panic("softdep_setup_inofree: ino %d has existing inodedep %p", + ino, inodedep); + if (wkhd) { + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JADDREF) + continue; + WORKLIST_REMOVE(wk); + /* + * We can free immediately even if the jaddref + * isn't attached in a background write as now + * the bitmaps are reconciled. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jaddref(WK_JADDREF(wk)); + } + jwork_move(&bp->b_dep, wkhd); + } + FREE_LOCK(&lk); +} + + +/* + * Called via ffs_blkfree() after a set of frags has been cleared from a cg + * map. Any dependencies waiting for the write to clear are added to the + * buf's list and any jnewblks that are being canceled are discarded + * immediately. 
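 *
 * Editor's sketch, not part of this change: the half-open overlap test
 * used by the SUJ_DEBUG check in softdep_setup_blkfree() below, over
 * plain integers.  [b0, b1) is the fragment range being freed and
 * [j0, j1) the range claimed by a pending jnewblk.
 */
#if 0
static int
toy_frag_ranges_overlap(long b0, long b1, long j0, long j1)
{
	return ((b0 >= j0 && b0 < j1) || (b1 > j0 && b1 <= j1));
}
#endif
/*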
+ */ +void +softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) + struct mount *mp; + struct buf *bp; + ufs2_daddr_t blkno; + int frags; + struct workhead *wkhd; +{ + struct jnewblk *jnewblk; + struct worklist *wk, *wkn; +#ifdef SUJ_DEBUG + struct bmsafemap *bmsafemap; + struct fs *fs; + uint8_t *blksfree; + struct cg *cgp; + ufs2_daddr_t jstart; + ufs2_daddr_t jend; + ufs2_daddr_t end; + long bno; + int i; +#endif + + ACQUIRE_LOCK(&lk); + /* + * Detach any jnewblks which have been canceled. They must linger + * until the bitmap is cleared again by ffs_blkfree() to prevent + * an unjournaled allocation from hitting the disk. + */ + if (wkhd) { + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JNEWBLK) + continue; + jnewblk = WK_JNEWBLK(wk); + KASSERT(jnewblk->jn_state & GOINGAWAY, + ("softdep_setup_blkfree: jnewblk not canceled.")); + WORKLIST_REMOVE(wk); +#ifdef SUJ_DEBUG + /* + * Assert that this block is free in the bitmap + * before we discard the jnewblk. + */ + fs = VFSTOUFS(mp)->um_fs; + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; + i < jnewblk->jn_frags; i++) { + if (isset(blksfree, bno + i)) + continue; + panic("softdep_setup_blkfree: not free"); + } +#endif + /* + * Even if it's not attached we can free immediately + * as the new bitmap is correct. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jnewblk(jnewblk); + } + /* + * The buf must be locked by the caller otherwise these could + * be added while it's being written and the write would + * complete them before they made it to disk. + */ + jwork_move(&bp->b_dep, wkhd); + } + +#ifdef SUJ_DEBUG + /* + * Assert that we are not freeing a block which has an outstanding + * allocation dependency. + */ + fs = VFSTOUFS(mp)->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); + end = blkno + frags; + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + /* + * Don't match against blocks that will be freed when the + * background write is done. + */ + if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == + (COMPLETE | DEPCOMPLETE)) + continue; + jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; + jend = jnewblk->jn_blkno + jnewblk->jn_frags; + if ((blkno >= jstart && blkno < jend) || + (end > jstart && end <= jend)) { + printf("state 0x%X %jd - %d %d dep %p\n", + jnewblk->jn_state, jnewblk->jn_blkno, + jnewblk->jn_oldfrags, jnewblk->jn_frags, + jnewblk->jn_newblk); + panic("softdep_setup_blkfree: " + "%jd-%jd(%d) overlaps with %jd-%jd", + blkno, end, frags, jstart, jend); + } + } +#endif + FREE_LOCK(&lk); +} + +static void +initiate_write_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; /* The cg block. */ +{ + struct jaddref *jaddref; + struct jnewblk *jnewblk; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + int cleared; + ino_t ino; + long bno; + int i; + + if (bmsafemap->sm_state & IOSTARTED) + panic("initiate_write_bmsafemap: Already started\n"); + bmsafemap->sm_state |= IOSTARTED; + /* + * Clear any inode allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { + ino = jaddref->ja_ino % fs->fs_ipg; + /* + * If this is a background copy the inode may not + * be marked used yet. 
+ */ + if (isset(inosused, ino)) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir--; + cgp->cg_cs.cs_nifree++; + clrbit(inosused, ino); + jaddref->ja_state &= ~ATTACHED; + jaddref->ja_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: inode %d " + "marked free", jaddref->ja_ino); + } + } + /* + * Clear any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + bno = dtogd(fs, jnewblk->jn_blkno); + cleared = 0; + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (isclr(blksfree, bno + i)) { + cleared = 1; + setbit(blksfree, bno + i); + } + } + /* + * We may not clear the block if it's a background + * copy. In that case there is no reason to detach + * it. + */ + if (cleared) { + jnewblk->jn_state &= ~ATTACHED; + jnewblk->jn_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: block %jd " + "marked free", jnewblk->jn_blkno); + } + } + /* + * Move allocation lists to the written lists so they can be + * cleared once the block write is complete. + */ + LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, + inodedep, id_deps); + LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, + newblk, nb_deps); +} + +/* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. + * */ static void softdep_disk_write_complete(bp) @@ -4254,12 +8597,7 @@ softdep_disk_write_complete(bp) struct worklist *wk; struct worklist *owk; struct workhead reattach; - struct newblk *newblk; - struct allocindir *aip; - struct allocdirect *adp; - struct indirdep *indirdep; - struct inodedep *inodedep; - struct bmsafemap *bmsafemap; + struct buf *sbp; /* * If an error occurred while doing the write, then the data @@ -4271,8 +8609,9 @@ softdep_disk_write_complete(bp) /* * This lock must not be released anywhere in this code segment. 
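+ * + * In outline, the completion loop below drains and dispatches the + * buffer's dependency list, guarding against an item that re-queues + * itself without making progress (the same item coming off the head + * twice in a row), roughly: + * + * owk = NULL; + * while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { + * WORKLIST_REMOVE(wk); + * if (wk == owk) + * panic("no progress"); + * owk = wk; + * switch (wk->wk_type) { ... } + * }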
*/ + sbp = NULL; + owk = NULL; ACQUIRE_LOCK(&lk); - owk = NULL; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); if (wk == owk) @@ -4291,33 +8630,8 @@ softdep_disk_write_complete(bp) continue; case D_BMSAFEMAP: - bmsafemap = WK_BMSAFEMAP(wk); - while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { - newblk->nb_state |= DEPCOMPLETE; - newblk->nb_bmsafemap = NULL; - LIST_REMOVE(newblk, nb_deps); - } - while ((adp = - LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - LIST_REMOVE(adp, ad_deps); - handle_allocdirect_partdone(adp); - } - while ((aip = - LIST_FIRST(&bmsafemap->sm_allocindirhd))) { - aip->ai_state |= DEPCOMPLETE; - aip->ai_buf = NULL; - LIST_REMOVE(aip, ai_deps); - handle_allocindir_partdone(aip); - } - while ((inodedep = - LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { - inodedep->id_state |= DEPCOMPLETE; - LIST_REMOVE(inodedep, id_deps); - inodedep->id_buf = NULL; - } - WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); continue; case D_MKDIR: @@ -4325,37 +8639,47 @@ softdep_disk_write_complete(bp) continue; case D_ALLOCDIRECT: - adp = WK_ALLOCDIRECT(wk); - adp->ad_state |= COMPLETE; - handle_allocdirect_partdone(adp); + wk->wk_state |= COMPLETE; + handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); continue; case D_ALLOCINDIR: - aip = WK_ALLOCINDIR(wk); - aip->ai_state |= COMPLETE; - handle_allocindir_partdone(aip); + wk->wk_state |= COMPLETE; + handle_allocindir_partdone(WK_ALLOCINDIR(wk)); continue; case D_INDIRDEP: - indirdep = WK_INDIRDEP(wk); - if (indirdep->ir_state & GOINGAWAY) - panic("disk_write_complete: indirdep gone"); - bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); - free(indirdep->ir_saveddata, M_INDIRDEP); - indirdep->ir_saveddata = 0; - indirdep->ir_state &= ~UNDONE; - indirdep->ir_state |= ATTACHED; - while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { - handle_allocindir_partdone(aip); - if (aip == LIST_FIRST(&indirdep->ir_donehd)) - panic("disk_write_complete: not gone"); - } - WORKLIST_INSERT(&reattach, wk); - if ((bp->b_flags & B_DELWRI) == 0) - stat_indir_blk_ptrs++; - bdirty(bp); + if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) + WORKLIST_INSERT(&reattach, wk); continue; + case D_FREEBLKS: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(wk, 1); + continue; + + case D_FREEWORK: + handle_written_freework(WK_FREEWORK(wk)); + break; + + case D_FREEDEP: + free_freedep(WK_FREEDEP(wk)); + continue; + + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + + case D_JSEG: + handle_written_jseg(WK_JSEG(wk), bp); + continue; + + case D_SBDEP: + if (handle_written_sbdep(WK_SBDEP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); + continue; + default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); @@ -4370,6 +8694,8 @@ softdep_disk_write_complete(bp) WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(&lk); + if (sbp) + brelse(sbp); } /* @@ -4378,18 +8704,17 @@ softdep_disk_write_complete(bp) * splbio interrupts blocked. */ static void -handle_allocdirect_partdone(adp) +handle_allocdirect_partdone(adp, wkhd) struct allocdirect *adp; /* the completed allocdirect */ + struct workhead *wkhd; /* Work to do when inode is written.
*/ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; - long bsize, delay; + long bsize; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - if (adp->ad_buf != NULL) - panic("handle_allocdirect_partdone: dangling dep"); /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode @@ -4439,25 +8764,27 @@ static void return; } /* - * If we have found the just finished dependency, then free + * If we have found the just finished dependency, then queue * it along with anything that follows it that is complete. - * If the inode still has a bitmap dependency, then it has - * never been written to disk, hence the on-disk inode cannot - * reference the old fragment so we can free it without delay. + * Since the pointer has not yet been written in the inode + * as the dependency prevents it, place the allocdirect on the + * bufwait list where it will be freed once the pointer is + * valid. */ - delay = (inodedep->id_state & DEPCOMPLETE); + if (wkhd == NULL) + wkhd = &inodedep->id_bufwait; for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - free_allocdirect(listhead, adp, delay); + TAILQ_REMOVE(listhead, adp, ad_next); + WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); } } /* - * Called from within softdep_disk_write_complete above. Note that - * this routine is always called from interrupt level with further - * splbio interrupts blocked. + * Called from within softdep_disk_write_complete above. This routine + * completes successfully written allocindirs. */ static void handle_allocindir_partdone(aip) @@ -4467,11 +8794,9 @@ handle_allocindir_partdone(aip) if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; - if (aip->ai_buf != NULL) - panic("handle_allocindir_partdone: dangling dependency"); indirdep = aip->ai_indirdep; + LIST_REMOVE(aip, ai_next); if (indirdep->ir_state & UNDONE) { - LIST_REMOVE(aip, ai_next); LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } @@ -4481,13 +8806,130 @@ handle_allocindir_partdone(aip) else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; - LIST_REMOVE(aip, ai_next); - if (aip->ai_freefrag != NULL) - add_to_worklist(&aip->ai_freefrag->ff_list); - WORKITEM_FREE(aip, D_ALLOCINDIR); + /* + * Await the pointer write before freeing the allocindir. + */ + LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); } /* + * Release segments held on a jwork list. + */ +static void +handle_jwork(wkhd) + struct workhead *wkhd; +{ + struct worklist *wk; + + while ((wk = LIST_FIRST(wkhd)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + default: + panic("handle_jwork: Unknown type %s\n", + TYPENAME(wk->wk_type)); + } + } +} + +/* + * Handle the bufwait list on an inode when it is safe to release items + * held there. This normally happens after an inode block is written but + * may be delayed and handled later if there are pending journal items that + * are not yet safe to be released.
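+ * + * Several of the cases below reduce to "release now, or park on the + * caller's list until the bitmap write makes it safe". With the + * simplified list types sketched earlier (not the kernel API), the + * shape is: + * + * static void + * release_or_park(struct whead *src, struct whead *later) + * { + * struct witem *w; + * + * while ((w = LIST_FIRST(src)) != NULL) { + * LIST_REMOVE(w, w_next); + * if (later != NULL) + * LIST_INSERT_HEAD(later, w, w_next); + * else + * free(w); + * } + * }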
+ */ +static struct freefile * +handle_bufwait(inodedep, refhd) + struct inodedep *inodedep; + struct workhead *refhd; +{ + struct jaddref *jaddref; + struct freefile *freefile; + struct worklist *wk; + + freefile = NULL; + while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_FREEFILE: + /* + * We defer adding freefile to the worklist + * until all other additions have been made to + * ensure that it will be done after all the + * old blocks have been freed. + */ + if (freefile != NULL) + panic("handle_bufwait: freefile"); + freefile = WK_FREEFILE(wk); + continue; + + case D_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); + continue; + + case D_DIRADD: + diradd_inode_written(WK_DIRADD(wk), inodedep); + continue; + + case D_FREEFRAG: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(wk, 0); + continue; + + case D_DIRREM: + wk->wk_state |= COMPLETE; + add_to_worklist(wk, 0); + continue; + + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + free_newblk(WK_NEWBLK(wk)); + continue; + + case D_JNEWBLK: + wk->wk_state |= COMPLETE; + free_jnewblk(WK_JNEWBLK(wk)); + continue; + + /* + * Save freed journal segments and add references on + * the supplied list which will delay their release + * until the cg bitmap is cleared on disk. + */ + case D_JSEGDEP: + if (refhd == NULL) + free_jsegdep(WK_JSEGDEP(wk)); + else + WORKLIST_INSERT(refhd, wk); + continue; + + case D_JADDREF: + jaddref = WK_JADDREF(wk); + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + /* + * Transfer any jaddrefs to the list to be freed with + * the bitmap if we're handling a removed file. + */ + if (refhd == NULL) { + wk->wk_state |= COMPLETE; + free_jaddref(jaddref); + } else + WORKLIST_INSERT(refhd, wk); + continue; + + default: + panic("handle_bufwait: Unknown type %p(%s)", + wk, TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + return (freefile); +} +/* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further @@ -4498,12 +8940,17 @@ handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { - struct worklist *wk, *filefree; + struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; + struct workhead wkhd; int hadchanges, fstype; + ino_t freelink; + LIST_INIT(&wkhd); + hadchanges = 0; + freefile = NULL; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; @@ -4511,12 +8958,30 @@ handle_written_inodeblock(inodedep, bp) fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp1->di_freelink; } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp2->di_freelink; } /* + * If we wrote a freelink pointer during the last write, record it + * here. If we did not, keep the buffer dirty until we do.
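+ * + * The test below distills to "does the freelink that reached the disk + * match our in-memory successor on the unlinked list"; with inon the + * next inodedep on that list, the write is only considered recorded + * when: + * + * (inon == NULL && freelink == 0) || + * (inon != NULL && inon->id_ino == freelink)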
+ */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if ((inon == NULL && freelink == 0) || + (inon && inon->id_ino == freelink)) { + if (inon) + inon->id_state |= UNLINKPREV; + inodedep->id_state |= UNLINKNEXT; + } else + hadchanges = 1; + } + /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until @@ -4524,6 +8989,7 @@ handle_written_inodeblock(inodedep, bp) * corresponding updates written to disk. */ if (inodedep->id_savedino1 != NULL) { + hadchanges = 1; if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else @@ -4533,6 +8999,13 @@ handle_written_inodeblock(inodedep, bp) if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); + /* + * If the inode is clear here and GOINGAWAY it will never + * be written. Process the bufwait and clear any pending + * work which may include the freefile. + */ + if (inodedep->id_state & GOINGAWAY) + goto bufwait; return (1); } inodedep->id_state |= COMPLETE; @@ -4540,50 +9013,49 @@ handle_written_inodeblock(inodedep, bp) * Roll forward anything that had to be rolled back before * the inode could be updated. */ - hadchanges = 0; for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { - if (adp->ad_lbn < NDADDR) { - if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", - (intmax_t)adp->ad_lbn, - dp1->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp1->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, - dp1->di_ib[adp->ad_lbn - NDADDR]); - dp1->di_ib[adp->ad_lbn - NDADDR] = + (intmax_t)adp->ad_offset - NDADDR, + dp1->di_ib[adp->ad_offset - NDADDR]); + dp1->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } else { - if (adp->ad_lbn < NDADDR) { - if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, + (intmax_t)adp->ad_offset - NDADDR, (intmax_t) - dp2->di_ib[adp->ad_lbn - NDADDR]); - dp2->di_ib[adp->ad_lbn - NDADDR] = + dp2->di_ib[adp->ad_offset - NDADDR]); + dp2->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } @@ -4595,13 +9067,13 @@ handle_written_inodeblock(inodedep, bp) nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) 
panic("handle_written_inodeblock: new entry"); - if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) + if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; @@ -4613,12 +9085,23 @@ handle_written_inodeblock(inodedep, bp) */ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); + if (inodedep->id_savednlink > LINK_MAX) + panic("handle_written_inodeblock: Invalid link count " + "%d for inodedep %p", inodedep->id_savednlink, inodedep); if (fstype == UFS1) { + if (dp1->di_nlink != inodedep->id_savednlink) { + dp1->di_nlink = inodedep->id_savednlink; + hadchanges = 1; + } if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { + if (dp2->di_nlink != inodedep->id_savednlink) { + dp2->di_nlink = inodedep->id_savednlink; + hadchanges = 1; + } if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; @@ -4630,6 +9113,7 @@ handle_written_inodeblock(inodedep, bp) } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; + inodedep->id_savednlink = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in @@ -4637,69 +9121,49 @@ handle_written_inodeblock(inodedep, bp) */ if (hadchanges) bdirty(bp); +bufwait: /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples - * before the old ones have been deleted. + * before the old ones have been deleted. Completely + * unlinked inodes are not processed until the unlinked + * inode list is written or the last reference is removed. */ - filefree = NULL; - while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { - WORKLIST_REMOVE(wk); - switch (wk->wk_type) { - - case D_FREEFILE: - /* - * We defer adding filefree to the worklist until - * all other additions have been made to ensure - * that it will be done after all the old blocks - * have been freed. 
- */ - if (filefree != NULL) - panic("handle_written_inodeblock: filefree"); - filefree = wk; - continue; - - case D_MKDIR: - handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); - continue; - - case D_DIRADD: - diradd_inode_written(WK_DIRADD(wk), inodedep); - continue; - - case D_FREEBLKS: - wk->wk_state |= COMPLETE; - if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE) - continue; - /* -- fall through -- */ - case D_FREEFRAG: - case D_DIRREM: - add_to_worklist(wk); - continue; - - case D_NEWDIRBLK: - free_newdirblk(WK_NEWDIRBLK(wk)); - continue; - - default: - panic("handle_written_inodeblock: Unknown type %s", - TYPENAME(wk->wk_type)); - /* NOTREACHED */ + if ((inodedep->id_state & UNLINKED) == 0) { + freefile = handle_bufwait(inodedep, NULL); + if (freefile && !LIST_EMPTY(&wkhd)) { + WORKLIST_INSERT(&wkhd, &freefile->fx_list); + freefile = NULL; + } } - if (filefree != NULL) { + /* + * Move rolled forward dependency completions to the bufwait list + * now that those that were already written have been processed. + */ + if (!LIST_EMPTY(&wkhd) && hadchanges == 0) + panic("handle_written_inodeblock: bufwait but no changes"); + jwork_move(&inodedep->id_bufwait, &wkhd); + + if (freefile != NULL) { + /* + * If the inode is goingaway it was never written. Fake up + * the state here so free_inodedep() can succeed. + */ + if (inodedep->id_state & GOINGAWAY) + inodedep->id_state |= COMPLETE | DEPCOMPLETE; if (free_inodedep(inodedep) == 0) - panic("handle_written_inodeblock: live inodedep"); - add_to_worklist(filefree); + panic("handle_written_inodeblock: live inodedep %p", + inodedep); + add_to_worklist(&freefile->fx_list, 0); return (0); } @@ -4707,12 +9171,101 @@ handle_written_inodeblock(inodedep, bp) * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || - (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && - TAILQ_FIRST(&inodedep->id_extupdt) == 0)) + (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && + TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && + TAILQ_FIRST(&inodedep->id_extupdt) == 0 && + LIST_FIRST(&inodedep->id_bufwait) == 0)) return (0); return (hadchanges); } +static int +handle_written_indirdep(indirdep, bp, bpp) + struct indirdep *indirdep; + struct buf *bp; + struct buf **bpp; +{ + struct allocindir *aip; + int chgs; + + if (indirdep->ir_state & GOINGAWAY) + panic("disk_write_complete: indirdep gone"); + chgs = 0; + /* + * If there were rollbacks revert them here. + */ + if (indirdep->ir_saveddata) { + bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); + free(indirdep->ir_saveddata, M_INDIRDEP); + indirdep->ir_saveddata = 0; + chgs = 1; + } + indirdep->ir_state &= ~UNDONE; + indirdep->ir_state |= ATTACHED; + /* + * Move allocindirs with written pointers to the completehd if + * the indirdep's pointer is not yet written. Otherwise + * free them here. + */ + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { + LIST_REMOVE(aip, ai_next); + if ((indirdep->ir_state & DEPCOMPLETE) == 0) { + LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, + ai_next); + continue; + } + free_newblk(&aip->ai_block); + } + /* + * Move allocindirs that have finished dependency processing from + * the done list to the write list after updating the pointers.
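+ * + * "Updating the pointers" is a plain array store into the saved copy + * of the indirect block, sized by filesystem type; for UFS2 it is the + * assignment already used in handle_allocindir_partdone(): + * + * ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = + * aip->ai_newblkno;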
+ */ + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { + handle_allocindir_partdone(aip); + if (aip == LIST_FIRST(&indirdep->ir_donehd)) + panic("disk_write_complete: not gone"); + chgs = 1; + } + /* + * If this indirdep has been detached from its newblk during + * I/O we need to keep this dep attached to the buffer so + * deallocate_dependencies can find it and properly resolve + * any outstanding dependencies. + */ + if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0) + chgs = 1; + if ((bp->b_flags & B_DELWRI) == 0) + stat_indir_blk_ptrs++; + /* + * If there were no changes we can discard the savedbp and detach + * ourselves from the buf. We are only carrying completed pointers + * in this case. + */ + if (chgs == 0) { + struct buf *sbp; + + sbp = indirdep->ir_savebp; + sbp->b_flags |= B_INVAL | B_NOCACHE; + indirdep->ir_savebp = NULL; + if (*bpp != NULL) + panic("handle_written_indirdep: bp already exists."); + *bpp = sbp; + } else + bdirty(bp); + /* + * If there are no fresh dependencies and none waiting on writes + * we can free the indirdep. + */ + if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) { + if (indirdep->ir_state & ONDEPLIST) + LIST_REMOVE(indirdep, ir_next); + free_indirdep(indirdep); + return (0); + } + + return (chgs); +} + /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. @@ -4722,50 +9275,200 @@ diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { - struct pagedep *pagedep; dap->da_state |= COMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); - } + complete_diradd(dap); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* - * Handle the completion of a mkdir dependency. + * Returns true if the bmsafemap will have rollbacks when written. Must + * only be called with lk and the buf lock on the cg held. */ +static int +bmsafemap_rollbacks(bmsafemap) + struct bmsafemap *bmsafemap; +{ + + return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | + !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); +} + +/* + * Complete a write to a bmsafemap structure. Roll forward any bitmap + * changes if it's not a background write. Set all written dependencies + * to DEPCOMPLETE and free the structure if possible. + */ +static int +handle_written_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; +{ + struct newblk *newblk; + struct inodedep *inodedep; + struct jaddref *jaddref, *jatmp; + struct jnewblk *jnewblk, *jntmp; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + ino_t ino; + long bno; + int chgs; + int i; + + if ((bmsafemap->sm_state & IOSTARTED) == 0) + panic("handle_written_bmsafemap: Not started\n"); + chgs = 0; + bmsafemap->sm_state &= ~IOSTARTED; + /* + * Restore unwritten inode allocation pending jaddref writes.
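+ * + * Each restore below is the exact inverse of the rollback made in + * initiate_write_bmsafemap(): + * + * rollback: clrbit(inosused, ino); cg_cs.cs_nifree++; -> UNDONE + * roll-forward: setbit(inosused, ino); cg_cs.cs_nifree--; -> ATTACHED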
+ */ + if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, + ja_bmdeps, jatmp) { + if ((jaddref->ja_state & UNDONE) == 0) + continue; + ino = jaddref->ja_ino % fs->fs_ipg; + if (isset(inosused, ino)) + panic("handle_written_bmsafemap: " + "re-allocated inode"); + if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir++; + cgp->cg_cs.cs_nifree--; + setbit(inosused, ino); + chgs = 1; + } + jaddref->ja_state &= ~UNDONE; + jaddref->ja_state |= ATTACHED; + free_jaddref(jaddref); + } + } + /* + * Restore any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, + jntmp) { + if ((jnewblk->jn_state & UNDONE) == 0) + continue; + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (bp->b_xflags & BX_BKGRDMARKER) + break; + if ((jnewblk->jn_state & NEWBLOCK) == 0 && + isclr(blksfree, bno + i)) + panic("handle_written_bmsafemap: " + "re-allocated fragment"); + clrbit(blksfree, bno + i); + chgs = 1; + } + jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); + jnewblk->jn_state |= ATTACHED; + free_jnewblk(jnewblk); + } + } + while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { + newblk->nb_state |= DEPCOMPLETE; + newblk->nb_state &= ~ONDEPLIST; + newblk->nb_bmsafemap = NULL; + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_list.wk_type == D_ALLOCDIRECT) + handle_allocdirect_partdone( + WK_ALLOCDIRECT(&newblk->nb_list), NULL); + else if (newblk->nb_list.wk_type == D_ALLOCINDIR) + handle_allocindir_partdone( + WK_ALLOCINDIR(&newblk->nb_list)); + else if (newblk->nb_list.wk_type != D_NEWBLK) + panic("handle_written_bmsafemap: Unexpected type: %s", + TYPENAME(newblk->nb_list.wk_type)); + } + while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { + inodedep->id_state |= DEPCOMPLETE; + inodedep->id_state &= ~ONDEPLIST; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_bmsafemap = NULL; + } + if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && + LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && + LIST_EMPTY(&bmsafemap->sm_newblkhd) && + LIST_EMPTY(&bmsafemap->sm_inodedephd)) { + if (chgs) + bdirty(bp); + LIST_REMOVE(bmsafemap, sm_hash); + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (0); + } + bdirty(bp); + return (1); +} + +/* + * Try to free a mkdir dependency. 
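+ * + * A mkdir gates its diradd on two writes, MKDIR_BODY (the "." and ".." + * block) and MKDIR_PARENT (the parent's bumped link count); the diradd + * becomes DEPCOMPLETE only once both bits have cleared, as done below: + * + * dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + * if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + * dap->da_state |= DEPCOMPLETE;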
+ */ static void -handle_written_mkdir(mkdir, type) +complete_mkdir(mkdir) struct mkdir *mkdir; - int type; { struct diradd *dap; - struct pagedep *pagedep; - if (mkdir->md_state != type) - panic("handle_written_mkdir: bad type"); + if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; - dap->da_state &= ~type; - if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + complete_diradd(dap); } - LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* + * Handle the completion of a mkdir dependency. + */ +static void +handle_written_mkdir(mkdir, type) + struct mkdir *mkdir; + int type; +{ + + if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) + panic("handle_written_mkdir: bad type"); + mkdir->md_state |= COMPLETE; + complete_mkdir(mkdir); +} + +static void +free_pagedep(pagedep) + struct pagedep *pagedep; +{ + int i; + + if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) + return; + for (i = 0; i < DAHASHSZ; i++) + if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) + return; + if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) + return; + if (!LIST_EMPTY(&pagedep->pd_dirremhd)) + return; + if (!LIST_EMPTY(&pagedep->pd_pendinghd)) + return; + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, D_PAGEDEP); +} + +/* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. @@ -4790,8 +9493,11 @@ handle_written_filepage(pagedep, bp) */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); + dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_written_filepage: Journal entries not written.")); + add_to_worklist(&dirrem->dm_list, 0); } /* * Free any directory additions that have been committed. @@ -4800,7 +9506,7 @@ handle_written_filepage(pagedep, bp) */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * Uncommitted directory entries must be restored. */ @@ -4845,7 +9551,8 @@ handle_written_filepage(pagedep, bp) * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. 
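+ * + * Whether here or in free_pagedep() above, a pagedep may be reclaimed + * only when nothing at all still hangs off it; as a boolean sketch: + * + * busy = (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) != 0 || + * !LIST_EMPTY(&pagedep->pd_jmvrefhd) || + * !LIST_EMPTY(&pagedep->pd_dirremhd) || + * !LIST_EMPTY(&pagedep->pd_pendinghd); + * for (i = 0; i < DAHASHSZ; i++) + * busy |= !LIST_EMPTY(&pagedep->pd_diraddhd[i]);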
*/ - if ((pagedep->pd_state & NEWBLOCK) == 0) { + if ((pagedep->pd_state & NEWBLOCK) == 0 && + LIST_EMPTY(&pagedep->pd_jmvrefhd)) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } @@ -4880,8 +9587,8 @@ softdep_load_inodeblock(ip) */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(&lk); - if (inodedep_lookup(UFSTOVFS(ip->i_ump), - ip->i_number, 0, &inodedep) == 0) { + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) { FREE_LOCK(&lk); return; } @@ -4908,6 +9615,7 @@ softdep_update_inodeblock(ip, bp, waitfor) int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; + struct inoref *inoref; struct worklist *wk; struct mount *mp; struct buf *ibp; @@ -4922,6 +9630,7 @@ softdep_update_inodeblock(ip, bp, waitfor) */ mp = UFSTOVFS(ip->i_ump); ACQUIRE_LOCK(&lk); +again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); if (ip->i_effnlink != ip->i_nlink) @@ -4931,6 +9640,19 @@ softdep_update_inodeblock(ip, bp, waitfor) if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* + * If we're flushing all dependencies we must also move any waiting + * for journal writes onto the bufwait list prior to I/O. + */ + if (waitfor) { + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto again; + } + } + } + /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ @@ -4945,10 +9667,12 @@ softdep_update_inodeblock(ip, bp, waitfor) */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), + NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), + NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk @@ -4971,11 +9695,11 @@ softdep_update_inodeblock(ip, bp, waitfor) return; } retry: - if ((inodedep->id_state & DEPCOMPLETE) != 0) { + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { FREE_LOCK(&lk); return; } - ibp = inodedep->id_buf; + ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, &lk, MNT_WAIT); if (ibp == NULL) { /* @@ -5007,13 +9731,13 @@ merge_inode_lists(newlisthead, oldlisthead) newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { - if (listadp->ad_lbn < newadp->ad_lbn) { + if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); - if (listadp->ad_lbn == newadp->ad_lbn) { + if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; @@ -5036,6 +9760,7 @@ softdep_fsync(vp) { struct inodedep *inodedep; struct pagedep *pagedep; + struct inoref *inoref; struct worklist *wk; struct diradd *dap; struct mount *mp; @@ -5052,17 +9777,24 @@ softdep_fsync(vp) fs = ip->i_fs; mp = vp->v_mount; ACQUIRE_LOCK(&lk); +restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return 
(0); } + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (!LIST_EMPTY(&inodedep->id_inowait) || - !LIST_EMPTY(&inodedep->id_bufwait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) - panic("softdep_fsync: pending ops"); + panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; @@ -5254,8 +9986,8 @@ int softdep_sync_metadata(struct vnode *vp) { struct pagedep *pagedep; - struct allocdirect *adp; struct allocindir *aip; + struct newblk *newblk; struct buf *bp, *nbp; struct worklist *wk; struct bufobj *bo; @@ -5319,27 +10051,15 @@ loop: switch (wk->wk_type) { case D_ALLOCDIRECT: - adp = WK_ALLOCDIRECT(wk); - if (adp->ad_state & DEPCOMPLETE) - continue; - nbp = adp->ad_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; + case D_ALLOCINDIR: + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + goto restart; } - ACQUIRE_LOCK(&lk); - continue; - - case D_ALLOCINDIR: - aip = WK_ALLOCINDIR(wk); - if (aip->ai_state & DEPCOMPLETE) + if (newblk->nb_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; @@ -5355,10 +10075,16 @@ loop: case D_INDIRDEP: restart: - LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { - if (aip->ai_state & DEPCOMPLETE) + LIST_FOREACH(aip, + &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { + newblk = (struct newblk *)aip; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + goto restart; + } + if (newblk->nb_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, MNT_WAIT); if (nbp == NULL) goto restart; @@ -5371,14 +10097,6 @@ loop: } continue; - case D_INODEDEP: - if ((error = flush_inodedep_deps(wk->wk_mp, - WK_INODEDEP(wk)->id_ino)) != 0) { - FREE_LOCK(&lk); - break; - } - continue; - case D_PAGEDEP: /* * We are trying to sync a directory that may @@ -5400,48 +10118,6 @@ loop: } continue; - case D_MKDIR: - /* - * This case should never happen if the vnode has - * been properly sync'ed. However, if this function - * is used at a place where the vnode has not yet - * been sync'ed, this dependency can show up. So, - * rather than panic, just flush it. - */ - nbp = WK_MKDIR(wk)->md_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; - } - ACQUIRE_LOCK(&lk); - continue; - - case D_BMSAFEMAP: - /* - * This case should never happen if the vnode has - * been properly sync'ed. However, if this function - * is used at a place where the vnode has not yet - * been sync'ed, this dependency can show up. So, - * rather than panic, just flush it. 
- */ - nbp = WK_BMSAFEMAP(wk)->sm_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; - } - ACQUIRE_LOCK(&lk); - continue; - default: panic("softdep_sync_metadata: Unknown type %s", TYPENAME(wk->wk_type)); @@ -5489,7 +10165,8 @@ loop: BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); - return (0); + return ffs_update(vp, 1); + /* return (0); */ } /* @@ -5502,6 +10179,7 @@ flush_inodedep_deps(mp, ino) ino_t ino; { struct inodedep *inodedep; + struct inoref *inoref; int error, waitfor; /* @@ -5522,8 +10200,16 @@ flush_inodedep_deps(mp, ino) return (error); FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); +restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || @@ -5555,13 +10241,19 @@ flush_deplist(listhead, waitfor, errorp) int *errorp; { struct allocdirect *adp; + struct newblk *newblk; struct buf *bp; mtx_assert(&lk, MA_OWNED); TAILQ_FOREACH(adp, listhead, ad_next) { - if (adp->ad_state & DEPCOMPLETE) + newblk = (struct newblk *)adp; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + return (1); + } + if (newblk->nb_state & DEPCOMPLETE) continue; - bp = adp->ad_buf; + bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) @@ -5582,6 +10274,100 @@ flush_deplist(listhead, waitfor, errorp) } /* + * Flush dependencies associated with an allocdirect block. + */ +static int +flush_newblk_dep(vp, mp, lbn) + struct vnode *vp; + struct mount *mp; + ufs_lbn_t lbn; +{ + struct newblk *newblk; + struct bufobj *bo; + struct inode *ip; + struct buf *bp; + ufs2_daddr_t blkno; + int error; + + error = 0; + bo = &vp->v_bufobj; + ip = VTOI(vp); + blkno = DIP(ip, i_db[lbn]); + if (blkno == 0) + panic("flush_newblk_dep: Missing block"); + ACQUIRE_LOCK(&lk); + /* + * Loop until all dependencies related to this block are satisfied. + * We must be careful to restart after each sleep in case a write + * completes some part of this process for us. + */ + for (;;) { + if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { + FREE_LOCK(&lk); + break; + } + if (newblk->nb_list.wk_type != D_ALLOCDIRECT) + panic("flush_newblk_deps: Bad newblk %p", newblk); + /* + * Flush the journal. + */ + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + continue; + } + /* + * Write the bitmap dependency. + */ + if ((newblk->nb_state & DEPCOMPLETE) == 0) { + bp = newblk->nb_bmsafemap->sm_buf; + bp = getdirtybuf(bp, &lk, MNT_WAIT); + if (bp == NULL) + continue; + FREE_LOCK(&lk); + error = bwrite(bp); + if (error) + break; + ACQUIRE_LOCK(&lk); + continue; + } + /* + * Write the buffer. 
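+ * + * That is: look the in-core buffer up, lock it, and either push it if + * dirty or release it if clean; with LK_SLEEPFAIL an ENOLCK return + * means we slept, so the outer loop must restart (locking elided in + * this sketch): + * + * bp = gbincore(bo, lbn); + * if (bp != NULL && (bp->b_flags & B_DELWRI)) { + * bremfree(bp); + * error = bwrite(bp); + * }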
+ */ + FREE_LOCK(&lk); + BO_LOCK(bo); + bp = gbincore(bo, lbn); + if (bp != NULL) { + error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | + LK_INTERLOCK, BO_MTX(bo)); + if (error == ENOLCK) { + ACQUIRE_LOCK(&lk); + continue; /* Slept, retry */ + } + if (error != 0) + break; /* Failed */ + if (bp->b_flags & B_DELWRI) { + bremfree(bp); + error = bwrite(bp); + if (error) + break; + } else + BUF_UNLOCK(bp); + } else + BO_UNLOCK(bo); + /* + * We have to wait for the direct pointers to + * point at the newdirblk before the dependency + * will go away. + */ + error = ffs_update(vp, MNT_WAIT); + if (error) + break; + ACQUIRE_LOCK(&lk); + } + return (error); +} + +/* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ @@ -5592,16 +10378,16 @@ flush_pagedep_deps(pvp, mp, diraddhdp) struct diraddhd *diraddhdp; { struct inodedep *inodedep; + struct inoref *inoref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; - struct bufobj *bo; int error = 0; struct buf *bp; ino_t inum; - struct worklist *wk; ump = VFSTOUFS(mp); +restart: while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry @@ -5609,7 +10395,7 @@ flush_pagedep_deps(pvp, mp, diraddhdp) */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(&lk); - if ((error = ffs_update(pvp, 1)) != 0) + if ((error = ffs_update(pvp, MNT_WAIT)) != 0) break; ACQUIRE_LOCK(&lk); /* @@ -5623,84 +10409,51 @@ flush_pagedep_deps(pvp, mp, diraddhdp) /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be - * committed in its parent. We do not want or need - * the full semantics of a synchronous ffs_syncvnode as - * that may end up here again, once for each directory - * level in the filesystem. Instead, we push the blocks - * and wait for them to clear. We have to fsync twice - * because the first call may choose to defer blocks - * that still have dependencies, but deferral will - * happen at most once. + * committed in its parent. */ inum = dap->da_newinum; + if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) + panic("flush_pagedep_deps: lost inode1"); + /* + * Wait for any pending journal adds to complete so we don't + * cause rollbacks while syncing. + */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(&lk); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; - if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) || - (error=ffs_syncvnode(vp, MNT_NOWAIT))) { - vput(vp); - break; - } - bo = &vp->v_bufobj; - BO_LOCK(bo); - drain_output(vp); + error = flush_newblk_dep(vp, mp, 0); /* - * If first block is still dirty with a D_MKDIR - * dependency then it needs to be written now. + * If we still have the dependency we might need to + * update the vnode to sync the new link count to + * disk. 
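+ * + * The overall ordering for committing the new name is thus: + * + * jwait(&inoref->if_list); 1) journal entries + * flush_newblk_dep(vp, mp, 0); 2) "." and ".." block + * ffs_update(vp, MNT_WAIT); 3) inode / link count + * + * and only then may the diradd move on to the pending list.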
*/ - for (;;) { - error = 0; - bp = gbincore(bo, 0); - if (bp == NULL) - break; /* First block not present */ - error = BUF_LOCK(bp, - LK_EXCLUSIVE | - LK_SLEEPFAIL | - LK_INTERLOCK, - BO_MTX(bo)); - BO_LOCK(bo); - if (error == ENOLCK) - continue; /* Slept, retry */ - if (error != 0) - break; /* Failed */ - if ((bp->b_flags & B_DELWRI) == 0) { - BUF_UNLOCK(bp); - break; /* Buffer not dirty */ - } - for (wk = LIST_FIRST(&bp->b_dep); - wk != NULL; - wk = LIST_NEXT(wk, wk_list)) - if (wk->wk_type == D_MKDIR) - break; - if (wk == NULL) - BUF_UNLOCK(bp); /* Dependency gone */ - else { - /* - * D_MKDIR dependency remains, - * must write buffer to stable - * storage. - */ - BO_UNLOCK(bo); - bremfree(bp); - error = bwrite(bp); - BO_LOCK(bo); - } - break; - } - BO_UNLOCK(bo); + if (error == 0 && dap == LIST_FIRST(diraddhdp)) + error = ffs_update(vp, MNT_WAIT); vput(vp); if (error != 0) - break; /* Flushing of first block failed */ + break; ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; - if (dap->da_state & MKDIR_BODY) - panic("flush_pagedep_deps: MKDIR_BODY"); + if (dap->da_state & MKDIR_BODY) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, + &inodedep); + panic("flush_pagedep_deps: MKDIR_BODY " + "inodedep %p dap %p vp %p", + inodedep, dap, vp); + } } /* * Flush the inode on which the directory entry depends. @@ -5719,8 +10472,8 @@ retry: * If the inode still has bitmap dependencies, * push them to disk. */ - if ((inodedep->id_state & DEPCOMPLETE) == 0) { - bp = inodedep->id_buf; + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { + bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, MNT_WAIT); if (bp == NULL) goto retry; @@ -5733,24 +10486,29 @@ retry: } /* * If the inode is still sitting in a buffer waiting - * to be written, push it to disk. + * to be written or waiting for the link count to be + * adjusted update it here to flush it to disk. */ - FREE_LOCK(&lk); - if ((error = bread(ump->um_devvp, - fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), - (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) { - brelse(bp); - break; + if (dap == LIST_FIRST(diraddhdp)) { + FREE_LOCK(&lk); + if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ))) + break; + error = ffs_update(vp, MNT_WAIT); + vput(vp); + if (error) + break; + ACQUIRE_LOCK(&lk); } - if ((error = bwrite(bp)) != 0) - break; - ACQUIRE_LOCK(&lk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ - if (dap == LIST_FIRST(diraddhdp)) - panic("flush_pagedep_deps: flush failed"); + if (dap == LIST_FIRST(diraddhdp)) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); + panic("flush_pagedep_deps: failed to flush " + "inodedep %p ino %d dap %p", inodedep, inum, dap); + } } if (error) ACQUIRE_LOCK(&lk); @@ -6100,10 +10858,13 @@ softdep_count_dependencies(bp, wantcount) int wantcount; { struct worklist *wk; + struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct indirdep *indirdep; + struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; + struct dirrem *dirrem; struct diradd *dap; int i, retval; @@ -6132,6 +10893,12 @@ softdep_count_dependencies(bp, wantcount) if (!wantcount) goto out; } + if (TAILQ_FIRST(&inodedep->id_inoreflst)) { + /* Add reference dependency. 
*/ + retval += 1; + if (!wantcount) + goto out; + } + continue; case D_INDIRDEP: @@ -6147,6 +10914,14 @@ softdep_count_dependencies(bp, wantcount) case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + if (LIST_FIRST(&dirrem->dm_jremrefhd)) { + /* Journal remove ref dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { @@ -6159,14 +10934,44 @@ softdep_count_dependencies(bp, wantcount) continue; case D_BMSAFEMAP: + bmsafemap = WK_BMSAFEMAP(wk); + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { + /* Add reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { + /* Allocate block dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) { + /* Freeblk journal dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: case D_ALLOCDIRECT: case D_ALLOCINDIR: case D_MKDIR: + case D_JSEG: + case D_SBDEP: /* never a dependency on these blocks */ continue; default: - panic("softdep_check_for_rollback: Unexpected type %s", + panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } @@ -6382,6 +11187,45 @@ softdep_error(func, error) #ifdef DDB +static void +inodedep_print(struct inodedep *inodedep, int verbose) +{ + db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d" + " saveino %p\n", + inodedep, inodedep->id_fs, inodedep->id_state, + (intmax_t)inodedep->id_ino, + (intmax_t)fsbtodb(inodedep->id_fs, + ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), + inodedep->id_nlinkdelta, inodedep->id_savednlink, + inodedep->id_savedino1); + + if (verbose == 0) + return; + + db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " + "mkdiradd %p\n", + LIST_FIRST(&inodedep->id_pendinghd), + LIST_FIRST(&inodedep->id_bufwait), + LIST_FIRST(&inodedep->id_inowait), + TAILQ_FIRST(&inodedep->id_inoreflst), + inodedep->id_mkdiradd); + db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", + TAILQ_FIRST(&inodedep->id_inoupdt), + TAILQ_FIRST(&inodedep->id_newinoupdt), + TAILQ_FIRST(&inodedep->id_extupdt), + TAILQ_FIRST(&inodedep->id_newextupdt)); +} + +DB_SHOW_COMMAND(inodedep, db_show_inodedep) +{ + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + inodedep_print((struct inodedep*)addr, 1); +} + DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) { struct inodedep_hashhead *inodedephd; @@ -6395,15 +11239,62 @@ DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) LIST_FOREACH(inodedep, inodedephd, id_hash) { if (fs != NULL && fs != inodedep->id_fs) continue; - db_printf("%p fs %p st %x ino %jd inoblk %jd\n", - inodedep, inodedep->id_fs, inodedep->id_state, - (intmax_t)inodedep->id_ino, - (intmax_t)fsbtodb(inodedep->id_fs, - ino_to_fsba(inodedep->id_fs, inodedep->id_ino))); + inodedep_print(inodedep, 0); } } } +DB_SHOW_COMMAND(worklist, db_show_worklist) +{ + struct worklist *wk; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + wk = (struct worklist *)addr; + db_printf("worklist: %p type %s state 0x%X\n", + wk, TYPENAME(wk->wk_type), wk->wk_state); +} + +DB_SHOW_COMMAND(workhead, db_show_workhead) +{ + struct workhead *wkhd; + struct worklist *wk; + int i; + + if (have_addr == 0) { + db_printf("Address required\n"); + return;
} + wkhd = (struct workhead *)addr; + wk = LIST_FIRST(wkhd); + for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) + db_printf("worklist: %p type %s state 0x%X\n", + wk, TYPENAME(wk->wk_type), wk->wk_state); + if (i == 100) + db_printf("workhead overflow\n"); +} + + +DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) +{ + struct jaddref *jaddref; + struct diradd *diradd; + struct mkdir *mkdir; + + LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { + diradd = mkdir->md_diradd; + db_printf("mkdir: %p state 0x%X dap %p state 0x%X", + mkdir, mkdir->md_state, diradd, diradd->da_state); + if ((jaddref = mkdir->md_jaddref) != NULL) + db_printf(" jaddref %p jaddref state 0x%X", + jaddref, jaddref->ja_state); + db_printf("\n"); + } +} + #endif /* DDB */ #endif /* SOFTUPDATES */ Index: sys/ufs/ffs/ffs_vnops.c =================================================================== --- sys/ufs/ffs/ffs_vnops.c (revision 202342) +++ sys/ufs/ffs/ffs_vnops.c (working copy) @@ -225,6 +225,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor) wait = (waitfor == MNT_WAIT); lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); bo = &vp->v_bufobj; + ip->i_flag &= ~IN_NEEDSYNC; /* * Flush all dirty buffers associated with a vnode. Index: sys/ufs/ffs/ffs_extern.h =================================================================== --- sys/ufs/ffs/ffs_extern.h (revision 202342) +++ sys/ufs/ffs/ffs_extern.h (working copy) @@ -47,6 +47,7 @@ struct ucred; struct vnode; struct vop_fsync_args; struct vop_reallocblks_args; +struct workhead; int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int, struct ucred *, ufs2_daddr_t *); @@ -56,20 +57,23 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_st struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, - ufs2_daddr_t, long, ino_t); + ufs2_daddr_t, long, ino_t, struct workhead *); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); void ffs_bdflush(struct bufobj *, struct buf *); int ffs_copyonwrite(struct vnode *, struct buf *); int ffs_flushfiles(struct mount *, int, struct thread *); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t, - int); + int, struct workhead *); int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); int ffs_mountroot(void); +void ffs_oldfscompat_write(struct fs *, struct ufsmount *); int ffs_reallocblks(struct vop_reallocblks_args *); int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, int, int, struct ucred *, struct buf **); @@ -103,12 +107,14 @@ extern struct vop_vector ffs_fifoops2; int softdep_check_suspend(struct mount *, struct vnode *, int, int, int, int); +int softdep_complete_trunc(struct vnode *, void *); void softdep_get_depcounts(struct mount *, int *, int *); void softdep_initialize(void); void softdep_uninitialize(void); int softdep_mount(struct vnode *, struct mount *, struct fs *, struct ucred *); -void softdep_move_dependencies(struct buf *, struct buf *);
+void softdep_unmount(struct mount *); +int softdep_move_dependencies(struct buf *, struct buf *); int softdep_flushworklist(struct mount *, int *, struct thread *); int softdep_flushfiles(struct mount *, int, struct thread *); void softdep_update_inodeblock(struct inode *, struct buf *, int); @@ -117,7 +123,8 @@ void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *); void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t); -void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t); +void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, + int, int); void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t, @@ -126,11 +133,18 @@ void softdep_setup_allocindir_meta(struct buf *, s struct buf *, int, ufs2_daddr_t); void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t, struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *); +void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int, + struct workhead *); +void softdep_setup_inofree(struct mount *, struct buf *, ino_t, + struct workhead *); +void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *); +void *softdep_setup_trunc(struct vnode *vp, off_t length, int flags); void softdep_fsync_mountdev(struct vnode *); int softdep_sync_metadata(struct vnode *); int softdep_process_worklist(struct mount *, int); int softdep_fsync(struct vnode *); int softdep_waitidle(struct mount *); +int softdep_prealloc(struct vnode *, int); int ffs_rdonly(struct inode *); Index: sys/ufs/ffs/ffs_alloc.c =================================================================== --- sys/ufs/ffs/ffs_alloc.c (revision 202342) +++ sys/ufs/ffs/ffs_alloc.c (working copy) @@ -94,23 +94,23 @@ __FBSDID("$FreeBSD$"); #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, int cg, ufs2_daddr_t bpref, - int size); + int size, int rsize); -static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int); +static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int, int); static ufs2_daddr_t - ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t); + ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif -static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int); -static void ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *, - ufs1_daddr_t, int); +static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int, + int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, int, ufs2_daddr_t, int, int); static void ffs_fserr(struct fs *, ino_t, char *); static ufs2_daddr_t ffs_hashalloc - (struct inode *, int, ufs2_daddr_t, int, allocfcn_t *); -static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int); + (struct inode *, int, ufs2_daddr_t, int, int, allocfcn_t *); +static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int, + int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); @@ -187,7 +187,7 @@ retry: cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); - bno = 
ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); if (ip->i_flag & IN_SPACECOUNTED) { @@ -385,16 +385,12 @@ retry: panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } - bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, - ip->i_number); - if (nsize < request) - ffs_blkfree(ump, fs, ip->i_devvp, - bno + numfrags(fs, nsize), - (long)(request - nsize), ip->i_number); + ip->i_number, NULL); delta = btodb(nsize - osize); if (ip->i_flag & IN_SPACECOUNTED) { UFS_LOCK(ump); @@ -483,6 +479,14 @@ ffs_reallocblks(ap) if (doreallocblks == 0) return (ENOSPC); + /* + * We can't wait in softdep prealloc as it may fsync and recurse + * here. Instead we simply fail to reallocate blocks if this + * rare condition arises. + */ + if (DOINGSOFTDEP(ap->a_vp)) + if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) + return (ENOSPC); if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1) return (ffs_reallocblks_ufs1(ap)); return (ffs_reallocblks_ufs2(ap)); @@ -583,7 +587,7 @@ ffs_reallocblks_ufs1(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -669,7 +673,7 @@ ffs_reallocblks_ufs1(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -791,7 +795,7 @@ ffs_reallocblks_ufs2(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -877,7 +881,7 @@ ffs_reallocblks_ufs2(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -964,7 +968,7 @@ ffs_valloc(pvp, mode, cred, vpp) if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } - ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; @@ -1273,11 +1277,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap) */ /*VARARGS5*/ static ufs2_daddr_t -ffs_hashalloc(ip, cg, pref, size, allocator) +ffs_hashalloc(ip, cg, pref, size, rsize, allocator) struct inode *ip; int cg; ufs2_daddr_t pref; - int size; /* size for data blocks, mode for inodes */ + int size; /* Search size for data blocks, mode for inodes */ + int rsize; /* Real allocated size. 
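+				   May be smaller than size: for example,
+				   the ffs_realloccg() call above searches
+				   with size == request but passes
+				   rsize == nsize, so only nsize frags are
+				   taken and the old tail no longer has to
+				   be freed separately.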
*/ allocfcn_t *allocator; { struct fs *fs; @@ -1293,7 +1298,7 @@ static ufs2_daddr_t /* * 1: preferred cylinder group */ - result = (*allocator)(ip, cg, pref, size); + result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* @@ -1303,7 +1308,7 @@ static ufs2_daddr_t cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } @@ -1314,7 +1319,7 @@ static ufs2_daddr_t */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; @@ -1396,7 +1401,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, + frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); @@ -1414,11 +1420,12 @@ fail: * and if it is, allocate it. */ static ufs2_daddr_t -ffs_alloccg(ip, cg, bpref, size) +ffs_alloccg(ip, cg, bpref, size, rsize) struct inode *ip; int cg; ufs2_daddr_t bpref; int size; + int rsize; { struct fs *fs; struct cg *cgp; @@ -1446,7 +1453,7 @@ static ufs2_daddr_t cgp->cg_old_time = cgp->cg_time = time_second; if (size == fs->fs_bsize) { UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); @@ -1470,21 +1477,14 @@ static ufs2_daddr_t if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); - bno = dtogd(fs, blkno); - for (i = frags; i < fs->fs_frag; i++) - setbit(blksfree, bno + i); - i = fs->fs_frag - frags; - cgp->cg_cs.cs_nffree += i; - fs->fs_cstotal.cs_nffree += i; - fs->fs_cs(fs, cg).cs_nffree += i; - fs->fs_fmod = 1; - cgp->cg_frsum[i]++; + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } + KASSERT(size == rsize, + ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; @@ -1502,7 +1502,7 @@ static ufs2_daddr_t ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); @@ -1524,10 +1524,11 @@ fail: * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t -ffs_alloccgblk(ip, bp, bpref) +ffs_alloccgblk(ip, bp, bpref, size) struct inode *ip; struct buf *bp; ufs2_daddr_t bpref; + int size; { struct fs *fs; struct cg *cgp; @@ -1535,6 +1536,7 @@ static ufs2_daddr_t ufs1_daddr_t bno; ufs2_daddr_t blkno; u_int8_t *blksfree; + int i; fs = ip->i_fs; ump = ip->i_ump; @@ -1562,16 +1564,32 @@ static ufs2_daddr_t gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); - ffs_clusteracct(ump, fs, cgp, blkno, -1); + ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; + /* + * If the caller didn't want the whole block free the frags here. 
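+	 * For example, with fs_frag == 8 and rsize covering three frags,
+	 * frags 3..7 go back into blksfree, the nffree counters rise by
+	 * five and cg_frsum[5] is bumped, matching the carving that
+	 * ffs_alloccg() used to do itself.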
+ */ + size = numfrags(fs, size); + if (size != fs->fs_frag) { + bno = dtogd(fs, blkno); + for (i = size; i < fs->fs_frag; i++) + setbit(blksfree, bno + i); + i = fs->fs_frag - size; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; + fs->fs_fmod = 1; + cgp->cg_frsum[i]++; + } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, + size, 0); UFS_LOCK(ump); return (blkno); } @@ -1584,11 +1602,12 @@ gotit: * take the first one that we find following bpref. */ static ufs2_daddr_t -ffs_clusteralloc(ip, cg, bpref, len) +ffs_clusteralloc(ip, cg, bpref, len, unused) struct inode *ip; int cg; ufs2_daddr_t bpref; int len; + int unused; { struct fs *fs; struct cg *cgp; @@ -1684,7 +1703,7 @@ static ufs2_daddr_t len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) - if (ffs_alloccgblk(ip, bp, bno + i) != bno + i) + if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); @@ -1708,11 +1727,12 @@ fail: * inode in the specified cylinder group. */ static ufs2_daddr_t -ffs_nodealloccg(ip, cg, ipref, mode) +ffs_nodealloccg(ip, cg, ipref, mode, unused) struct inode *ip; int cg; ufs2_daddr_t ipref; int mode; + int unused; { struct fs *fs; struct cg *cgp; @@ -1815,28 +1835,6 @@ gotit: } /* - * check if a block is free - */ -static int -ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0); - case 4: - return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); - case 2: - return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); - case 1: - return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); - default: - panic("ffs_isfreeblock"); - } - return (0); -} - -/* * Free a block or fragment. * * The specified block or fragment is placed back in the @@ -1844,13 +1842,14 @@ gotit: * block reassembly is checked. */ void -ffs_blkfree(ump, fs, devvp, bno, size, inum) +ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; + struct workhead *dephd; { struct cg *cgp; struct buf *bp; @@ -1917,7 +1916,7 @@ void panic("ffs_blkfree: freeing free block"); } ffs_setblock(fs, blksfree, fragno); - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1957,7 +1956,7 @@ void cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1966,6 +1965,9 @@ void fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, + numfrags(fs, size), dephd); bdwrite(bp); } @@ -2036,7 +2038,8 @@ ffs_vfree(pvp, ino, mode) return (0); } ip = VTOI(pvp); - return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode)); + return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode, + NULL)); } /* @@ -2044,12 +2047,13 @@ ffs_vfree(pvp, ino, mode) * The specified inode is placed back in the free map. 
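 * With journaling, callers may pass a workhead of dependencies that
 * softdep_setup_inofree() will attach to the cylinder group buffer;
 * callers with no such work pass NULL, e.g.:
 *
 *	error = ffs_freefile(ump, fs, devvp, ino, mode, NULL);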
*/ int -ffs_freefile(ump, fs, devvp, ino, mode) +ffs_freefile(ump, fs, devvp, ino, mode, wkhd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ino_t ino; int mode; + struct workhead *wkhd; { struct cg *cgp; struct buf *bp; @@ -2105,6 +2109,9 @@ int fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_inofree(UFSTOVFS(ump), bp, + ino + cg * fs->fs_ipg, wkhd); bdwrite(bp); return (0); } @@ -2218,101 +2225,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz) } /* - * Update the cluster map because of an allocation or free. - * - * Cnt == 1 means free; cnt == -1 means allocating. - */ -void -ffs_clusteracct(ump, fs, cgp, blkno, cnt) - struct ufsmount *ump; - struct fs *fs; - struct cg *cgp; - ufs1_daddr_t blkno; - int cnt; -{ - int32_t *sump; - int32_t *lp; - u_char *freemapp, *mapp; - int i, start, end, forw, back, map, bit; - - mtx_assert(UFS_MTX(ump), MA_OWNED); - - if (fs->fs_contigsumsize <= 0) - return; - freemapp = cg_clustersfree(cgp); - sump = cg_clustersum(cgp); - /* - * Allocate or clear the actual block. - */ - if (cnt > 0) - setbit(freemapp, blkno); - else - clrbit(freemapp, blkno); - /* - * Find the size of the cluster going forward. - */ - start = blkno + 1; - end = start + fs->fs_contigsumsize; - if (end >= cgp->cg_nclusterblks) - end = cgp->cg_nclusterblks; - mapp = &freemapp[start / NBBY]; - map = *mapp++; - bit = 1 << (start % NBBY); - for (i = start; i < end; i++) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != (NBBY - 1)) { - bit <<= 1; - } else { - map = *mapp++; - bit = 1; - } - } - forw = i - start; - /* - * Find the size of the cluster going backward. - */ - start = blkno - 1; - end = start - fs->fs_contigsumsize; - if (end < 0) - end = -1; - mapp = &freemapp[start / NBBY]; - map = *mapp--; - bit = 1 << (start % NBBY); - for (i = start; i > end; i--) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != 0) { - bit >>= 1; - } else { - map = *mapp--; - bit = 1 << (NBBY - 1); - } - } - back = start - i; - /* - * Account for old cluster and the possibly new forward and - * back clusters. - */ - i = back + forw + 1; - if (i > fs->fs_contigsumsize) - i = fs->fs_contigsumsize; - sump[i] += cnt; - if (back > 0) - sump[back] -= cnt; - if (forw > 0) - sump[forw] -= cnt; - /* - * Update cluster summary information. - */ - lp = &sump[fs->fs_contigsumsize]; - for (i = fs->fs_contigsumsize; i > 0; i--) - if (*lp-- > 0) - break; - fs->fs_maxcluster[cgp->cg_cgx] = i; -} - -/* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: @@ -2532,7 +2444,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) #endif /* DEBUG */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, - cmd.value, filetype))) + cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; @@ -2560,7 +2472,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, ROOTINO); + blksize * fs->fs_fsize, ROOTINO, NULL); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; Index: sys/ufs/ffs/softdep.h =================================================================== --- sys/ufs/ffs/softdep.h (revision 202342) +++ sys/ufs/ffs/softdep.h (working copy) @@ -94,22 +94,28 @@ * The ONWORKLIST flag shows whether the structure is currently linked * onto a worklist. 
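 * With journaling the flag set below no longer fits in 16 bits, so
 * wk_state becomes a 24 bit field in struct worklist and the constants
 * below are widened accordingly.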
*/ -#define ATTACHED 0x0001 -#define UNDONE 0x0002 -#define COMPLETE 0x0004 -#define DEPCOMPLETE 0x0008 -#define MKDIR_PARENT 0x0010 /* diradd & mkdir only */ -#define MKDIR_BODY 0x0020 /* diradd & mkdir only */ -#define RMDIR 0x0040 /* dirrem only */ -#define DIRCHG 0x0080 /* diradd & dirrem only */ -#define GOINGAWAY 0x0100 /* indirdep only */ -#define IOSTARTED 0x0200 /* inodedep & pagedep only */ -#define SPACECOUNTED 0x0400 /* inodedep only */ -#define NEWBLOCK 0x0800 /* pagedep only */ -#define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */ -#define UFS1FMT 0x2000 /* indirdep only */ -#define EXTDATA 0x4000 /* allocdirect only */ -#define ONWORKLIST 0x8000 +#define ATTACHED 0x000001 +#define UNDONE 0x000002 +#define COMPLETE 0x000004 +#define DEPCOMPLETE 0x000008 +#define MKDIR_PARENT 0x000010 /* diradd, mkdir, jaddref, jsegdep only */ +#define MKDIR_BODY 0x000020 /* diradd, mkdir, jaddref only */ +#define RMDIR 0x000040 /* dirrem only */ +#define DIRCHG 0x000080 /* diradd, dirrem only */ +#define GOINGAWAY 0x000100 /* indirdep, jremref only */ +#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */ +#define SPACECOUNTED 0x000400 /* inodedep only */ +#define NEWBLOCK 0x000800 /* pagedep, jaddref only */ +#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */ +#define UFS1FMT 0x002000 /* indirdep only */ +#define EXTDATA 0x004000 /* allocdirect only */ +#define ONWORKLIST 0x008000 +#define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */ +#define ONDEPLIST 0x020000 /* Structure is on a dependency list. */ +#define UNLINKED 0x040000 /* inodedep has been unlinked. */ +#define UNLINKNEXT 0x080000 /* inodedep has valid di_freelink */ +#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */ +#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV) #define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) @@ -135,25 +141,38 @@ * and the macros below changed to use it. 
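 * A dependency is identified by wk_type and then cast to its container
 * with the WK_*() macros, e.g. (illustrative only; handle_freework()
 * stands in for a real consumer):
 *
 *	if (wk->wk_type == D_FREEWORK)
 *		handle_freework(WK_FREEWORK(wk));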
*/ struct worklist { + LIST_ENTRY(worklist) wk_list; /* list of work requests */ struct mount *wk_mp; /* Mount we live in */ - LIST_ENTRY(worklist) wk_list; /* list of work requests */ - unsigned short wk_type; /* type of request */ - unsigned short wk_state; /* state flags */ + unsigned int wk_type:8, /* type of request */ + wk_state:24; /* state flags */ }; #define WK_DATA(wk) ((void *)(wk)) #define WK_PAGEDEP(wk) ((struct pagedep *)(wk)) #define WK_INODEDEP(wk) ((struct inodedep *)(wk)) #define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk)) +#define WK_NEWBLK(wk) ((struct newblk *)(wk)) #define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk)) #define WK_INDIRDEP(wk) ((struct indirdep *)(wk)) #define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk)) #define WK_FREEFRAG(wk) ((struct freefrag *)(wk)) #define WK_FREEBLKS(wk) ((struct freeblks *)(wk)) +#define WK_FREEWORK(wk) ((struct freework *)(wk)) #define WK_FREEFILE(wk) ((struct freefile *)(wk)) #define WK_DIRADD(wk) ((struct diradd *)(wk)) #define WK_MKDIR(wk) ((struct mkdir *)(wk)) #define WK_DIRREM(wk) ((struct dirrem *)(wk)) #define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk)) +#define WK_JADDREF(wk) ((struct jaddref *)(wk)) +#define WK_JREMREF(wk) ((struct jremref *)(wk)) +#define WK_JMVREF(wk) ((struct jmvref *)(wk)) +#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk)) +#define WK_JSEG(wk) ((struct jseg *)(wk)) +#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk)) +#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk)) +#define WK_FREEDEP(wk) ((struct freedep *)(wk)) +#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk)) +#define WK_SBDEP(wk) ((struct sbdep *)wk) +#define WK_JTRUNC(wk) ((struct jtrunc *)(wk)) /* * Various types of lists @@ -165,6 +184,15 @@ LIST_HEAD(inodedephd, inodedep); LIST_HEAD(allocindirhd, allocindir); LIST_HEAD(allocdirecthd, allocdirect); TAILQ_HEAD(allocdirectlst, allocdirect); +LIST_HEAD(indirdephd, indirdep); +LIST_HEAD(jaddrefhd, jaddref); +LIST_HEAD(jremrefhd, jremref); +LIST_HEAD(jmvrefhd, jmvref); +LIST_HEAD(jnewblkhd, jnewblk); +LIST_HEAD(jfreeblkhd, jfreeblk); +LIST_HEAD(freeworkhd, freework); +TAILQ_HEAD(jseglst, jseg); +TAILQ_HEAD(inoreflst, inoref); /* * The "pagedep" structure tracks the various dependencies related to @@ -192,9 +220,11 @@ struct pagedep { LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */ ino_t pd_ino; /* associated file */ ufs_lbn_t pd_lbn; /* block within file */ + struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */ struct dirremhd pd_dirremhd; /* dirrem's waiting for page */ struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */ struct diraddhd pd_pendinghd; /* directory entries awaiting write */ + struct jmvrefhd pd_jmvrefhd; /* Dependent journal writes. */ }; /* @@ -248,13 +278,18 @@ struct inodedep { struct worklist id_list; /* buffer holding inode block */ # define id_state id_list.wk_state /* inode dependency state */ LIST_ENTRY(inodedep) id_hash; /* hashed lookup */ + TAILQ_ENTRY(inodedep) id_unlinked; /* Unlinked but ref'd inodes */ struct fs *id_fs; /* associated filesystem */ ino_t id_ino; /* dependent inode */ nlink_t id_nlinkdelta; /* saved effective link count */ + nlink_t id_savednlink; /* Link saved during rollback */ LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */ - struct buf *id_buf; /* related bmsafemap (if pending) */ + struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */ + struct diradd *id_mkdiradd; /* diradd for a mkdir. */ + struct inoreflst id_inoreflst; /* Inode reference adjustments. 
*/ long id_savedextsize; /* ext size saved during rollback */ off_t id_savedsize; /* file size saved during rollback */ + struct dirremhd id_dirremhd; /* Removals pending. */ struct workhead id_pendinghd; /* entries awaiting directory write */ struct workhead id_bufwait; /* operations after inode written */ struct workhead id_inowait; /* operations waiting inode update */ @@ -271,23 +306,6 @@ struct inodedep { #define id_savedino2 id_un.idu_savedino2 /* - * A "newblk" structure is attached to a bmsafemap structure when a block - * or fragment is allocated from a cylinder group. Its state is set to - * DEPCOMPLETE when its cylinder group map is written. It is consumed by - * an associated allocdirect or allocindir allocation which will attach - * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag - * is not set (i.e., its cylinder group map has not been written). - */ -struct newblk { - LIST_ENTRY(newblk) nb_hash; /* hashed lookup */ - struct fs *nb_fs; /* associated filesystem */ - int nb_state; /* state of bitmap dependency */ - ufs2_daddr_t nb_newblkno; /* allocated block number */ - LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */ - struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */ -}; - -/* * A "bmsafemap" structure maintains a list of dependency structures * that depend on the update of a particular cylinder group map. * It has lists for newblks, allocdirects, allocindirs, and inodedeps. @@ -299,14 +317,44 @@ struct inodedep { */ struct bmsafemap { struct worklist sm_list; /* cylgrp buffer */ +# define sm_state sm_list.wk_state + int sm_cg; + LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */ struct buf *sm_buf; /* associated buffer */ struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */ + struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */ struct allocindirhd sm_allocindirhd; /* allocindir deps */ + struct allocindirhd sm_allocindirwr; /* writing allocindir deps */ struct inodedephd sm_inodedephd; /* inodedep deps */ + struct inodedephd sm_inodedepwr; /* writing inodedep deps */ struct newblkhd sm_newblkhd; /* newblk deps */ + struct newblkhd sm_newblkwr; /* writing newblk deps */ + struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */ + struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */ }; /* + * A "newblk" structure is attached to a bmsafemap structure when a block + * or fragment is allocated from a cylinder group. Its state is set to + * DEPCOMPLETE when its cylinder group map is written. It is converted to + * an allocdirect or allocindir allocation once the allocator calls the + * appropriate setup function. + */ +struct newblk { + struct worklist nb_list; +# define nb_state nb_list.wk_state + LIST_ENTRY(newblk) nb_hash; /* hashed lookup */ + LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblks */ + struct jnewblk *nb_jnewblk; /* New block journal entry. */ + struct bmsafemap *nb_bmsafemap;/* cylgrp dep (if pending) */ + struct freefrag *nb_freefrag; /* fragment to be freed (if any) */ + struct indirdephd nb_indirdeps; /* Children indirect blocks. */ + struct workhead nb_newdirblk; /* dir block to notify when written */ + struct workhead nb_jwork; /* Journal work pending. */ + ufs2_daddr_t nb_newblkno; /* new value of block pointer */ +}; + +/* * An "allocdirect" structure is attached to an "inodedep" when a new block * or fragment is allocated and pointed to by the inode described by * "inodedep". The worklist is linked to the buffer that holds the block. 
@@ -334,20 +382,18 @@ struct bmsafemap { * and inodedep->id_pendinghd lists. */ struct allocdirect { - struct worklist ad_list; /* buffer holding block */ -# define ad_state ad_list.wk_state /* block pointer state */ + struct newblk ad_block; /* Common block logic */ +# define ad_state ad_block.nb_list.wk_state /* block pointer state */ TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */ - ufs_lbn_t ad_lbn; /* block within file */ - ufs2_daddr_t ad_newblkno; /* new value of block pointer */ - ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ - long ad_newsize; /* size of new block */ - long ad_oldsize; /* size of old block */ - LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */ - struct buf *ad_buf; /* cylgrp buffer (if pending) */ struct inodedep *ad_inodedep; /* associated inodedep */ - struct freefrag *ad_freefrag; /* fragment to be freed (if any) */ - struct workhead ad_newdirblk; /* dir block to notify when written */ + ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ + int ad_offset; /* Pointer offset in parent. */ + long ad_newsize; /* size of new block */ + long ad_oldsize; /* size of old block */ }; +#define ad_newblkno ad_block.nb_newblkno +#define ad_freefrag ad_block.nb_freefrag +#define ad_newdirblk ad_block.nb_newdirblk /* * A single "indirdep" structure manages all allocation dependencies for @@ -369,10 +415,14 @@ struct allocdirect { struct indirdep { struct worklist ir_list; /* buffer holding indirect block */ # define ir_state ir_list.wk_state /* indirect block pointer state */ - caddr_t ir_saveddata; /* buffer cache contents */ + LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */ + caddr_t ir_saveddata; /* buffer cache contents */ struct buf *ir_savebp; /* buffer holding safe copy */ + struct allocindirhd ir_completehd; /* waiting for indirdep complete */ + struct allocindirhd ir_writehd; /* Waiting for the pointer write. */ struct allocindirhd ir_donehd; /* done waiting to update safecopy */ struct allocindirhd ir_deplisthd; /* allocindir deps for this block */ + struct workhead ir_jwork; /* Journal work pending. */ }; /* @@ -389,31 +439,39 @@ struct indirdep { * can then be freed as it is no longer applicable. */ struct allocindir { - struct worklist ai_list; /* buffer holding indirect block */ -# define ai_state ai_list.wk_state /* indirect block pointer state */ + struct newblk ai_block; /* Common block area */ +# define ai_state ai_block.nb_list.wk_state /* indirect pointer state */ LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */ - int ai_offset; /* pointer offset in indirect block */ - ufs2_daddr_t ai_newblkno; /* new block pointer value */ - ufs2_daddr_t ai_oldblkno; /* old block pointer value */ - struct freefrag *ai_freefrag; /* block to be freed when complete */ struct indirdep *ai_indirdep; /* address of associated indirdep */ - LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */ - struct buf *ai_buf; /* cylgrp buffer (if pending) */ + ufs2_daddr_t ai_oldblkno; /* old value of block pointer */ + int ai_offset; /* Pointer offset in parent. */ }; +#define ai_newblkno ai_block.nb_newblkno +#define ai_freefrag ai_block.nb_freefrag +#define ai_newdirblk ai_block.nb_newdirblk /* + * The allblk union is used to size the newblk structure on allocation so + * that it may be any one of three types. 
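+ * An allocation sized this way may later be converted in place, e.g.
+ * (illustrative sketch; M_NEWBLK and M_SOFTDEP_FLAGS are the softdep.c
+ * malloc conventions):
+ *
+ *	newblk = malloc(sizeof(union allblk), M_NEWBLK,
+ *	    M_SOFTDEP_FLAGS | M_ZERO);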
+ */
+union allblk {
+	struct allocindir	ab_allocindir;
+	struct allocdirect	ab_allocdirect;
+	struct newblk		ab_newblk;
+};
+
+/*
 * A "freefrag" structure is attached to an "inodedep" when a previously
 * allocated fragment is replaced with a larger fragment, rather than extended.
 * The "freefrag" structure is constructed and attached when the replacement
 * block is first allocated. It is processed after the inode claiming the
- * bigger block that replaces it has been written to disk. Note that the
- * ff_state field is is used to store the uid, so may lose data. However,
- * the uid is used only in printing an error message, so is not critical.
- * Keeping it in a short keeps the data structure down to 32 bytes.
+ * bigger block that replaces it has been written to disk.
 */
 struct freefrag {
 	struct worklist ff_list;	/* id_inowait or delayed worklist */
-#	define ff_state ff_list.wk_state /* owning user; should be uid_t */
+#	define ff_state ff_list.wk_state
+	struct jfreefrag *ff_jfreefrag;	/* Associated journal entry. */
+	struct workhead ff_jwork;	/* Journal work pending. */
 	ufs2_daddr_t ff_blkno;		/* fragment physical block number */
 	long ff_fragsize;		/* size of fragment being deleted */
 	ino_t ff_inum;			/* owning inode number */
@@ -423,23 +481,60 @@ struct freefrag {
 * A "freeblks" structure is attached to an "inodedep" when the
 * corresponding file's length is reduced to zero. It records all
 * the information needed to free the blocks of a file after its
- * zero'ed inode has been written to disk.
+ * zero'ed inode has been written to disk. The actual work is done
+ * by child freework structures which are responsible for individual
+ * inode pointers while freeblks is responsible for retiring the
+ * entire operation when it is complete and holding common members.
 */
 struct freeblks {
 	struct worklist fb_list;	/* id_inowait or delayed worklist */
#	define fb_state fb_list.wk_state /* inode and dirty block state */
+	struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */
+	struct workhead fb_freeworkhd;	/* Work items pending */
+	struct workhead fb_jwork;	/* Journal work pending */
 	ino_t fb_previousinum;	/* inode of previous owner of blocks */
 	uid_t fb_uid;		/* uid of previous owner of blocks */
 	struct vnode *fb_devvp;	/* filesystem device vnode */
-	long fb_oldextsize;	/* previous ext data size */
-	off_t fb_oldsize;	/* previous file size */
 	ufs2_daddr_t fb_chkcnt;	/* used to check cnt of blks released */
-	ufs2_daddr_t fb_dblks[NDADDR];	/* direct blk ptrs to deallocate */
-	ufs2_daddr_t fb_iblks[NIADDR];	/* indirect blk ptrs to deallocate */
-	ufs2_daddr_t fb_eblks[NXADDR];	/* indirect blk ptrs to deallocate */
+	int fb_ref;		/* Children outstanding. */
 };

 /*
+ * A "freework" structure handles the release of a tree of blocks or a single
+ * block.  Each indirect block in a tree is allocated its own freework
+ * structure so that the indirect block may be freed only when all of its
+ * children are freed.  In this way we enforce the rule that an allocated
+ * block must have a valid path to a root that is journaled.  Each child
+ * block acquires a reference and when the ref hits zero the parent ref
+ * is decremented.  If there is no parent the freeblks ref is decremented.
+ */
+struct freework {
+	struct worklist fw_list;
+#	define fw_state fw_list.wk_state
+	LIST_ENTRY(freework) fw_next;	/* Queue for freeblks. */
+	struct freeblks *fw_freeblks;	/* Root of operation. */
+	struct freework *fw_parent;	/* Parent indirect. */
+	ufs2_daddr_t fw_blkno;		/* Our block #.
*/ + ufs_lbn_t fw_lbn; /* Original lbn before free. */ + int fw_frags; /* Number of frags. */ + int fw_ref; /* Number of children out. */ + int fw_off; /* Current working position. */ + struct workhead fw_jwork; /* Journal work pending. */ +}; + +/* + * A "freedep" structure is allocated to track the completion of a bitmap + * write for a freework. One freedep may cover many freed blocks so long + * as they reside in the same cylinder group. When the cg is written + * the freedep decrements the ref on the freework which may permit it + * to be freed as well. + */ +struct freedep { + struct worklist fd_list; + struct freework *fd_freework; /* Parent freework. */ +}; + +/* * A "freefile" structure is attached to an inode when its * link count is reduced to zero. It marks the inode as free in * the cylinder group map after the zero'ed inode has been written @@ -450,6 +545,7 @@ struct freefile { mode_t fx_mode; /* mode of inode */ ino_t fx_oldinum; /* inum of the unlinked file */ struct vnode *fx_devvp; /* filesystem device vnode */ + struct workhead fx_jwork; /* journal work pending. */ }; /* @@ -482,12 +578,11 @@ struct freefile { * than zero. * * The overlaying of da_pagedep and da_previous is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. If a - * da_previous entry is present, the pointer to its pagedep is available - * in the associated dirrem entry. If the DIRCHG flag is set, the - * da_previous entry is valid; if not set the da_pagedep entry is valid. - * The DIRCHG flag never changes; it is set when the structure is created - * if appropriate and is never cleared. + * structure down. If a da_previous entry is present, the pointer to its + * pagedep is available in the associated dirrem entry. If the DIRCHG flag + * is set, the da_previous entry is valid; if not set the da_pagedep entry + * is valid. The DIRCHG flag never changes; it is set when the structure + * is created if appropriate and is never cleared. */ struct diradd { struct worklist da_list; /* id_inowait or id_pendinghd list */ @@ -499,6 +594,7 @@ struct diradd { struct dirrem *dau_previous; /* entry being replaced in dir change */ struct pagedep *dau_pagedep; /* pagedep dependency for addition */ } da_un; + struct workhead da_jwork; /* Journal work awaiting completion. */ }; #define da_previous da_un.dau_previous #define da_pagedep da_un.dau_pagedep @@ -525,12 +621,13 @@ struct diradd { * mkdir structures that reference it. The deletion would be faster if the * diradd structure were simply augmented to have two pointers that referenced * the associated mkdir's. However, this would increase the size of the diradd - * structure from 32 to 64-bits to speed a very infrequent operation. + * structure to speed a very infrequent operation. */ struct mkdir { struct worklist md_list; /* id_inowait or buffer holding dir */ # define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */ struct diradd *md_diradd; /* associated diradd */ + struct jaddref *md_jaddref; /* dependent jaddref. */ struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */ LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */ }; @@ -542,20 +639,19 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; * list of the pagedep for the directory page that contains the entry. * It is processed after the directory page with the deleted entry has * been written to disk. - * - * The overlaying of dm_pagedep and dm_dirinum is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. 
It works - * because they are never used concurrently. */ struct dirrem { struct worklist dm_list; /* delayed worklist */ # define dm_state dm_list.wk_state /* state of the old directory entry */ LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */ + LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */ + struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */ ino_t dm_oldinum; /* inum of the removed dir entry */ union { struct pagedep *dmu_pagedep; /* pagedep dependency for remove */ ino_t dmu_dirinum; /* parent inode number (for rmdir) */ } dm_un; + struct workhead dm_jwork; /* Journal work awaiting completion. */ }; #define dm_pagedep dm_un.dmu_pagedep #define dm_dirinum dm_un.dmu_dirinum @@ -577,9 +673,200 @@ struct dirrem { * blocks using a similar scheme with the allocindir structures. Rather * than adding this level of complexity, we simply write those newly * allocated indirect blocks synchronously as such allocations are rare. + * In the case of a new directory the . and .. links are tracked with + * a mkdir rather than a pagedep. In this case we track the mkdir + * so it can be released when it is written. A workhead is used + * to simplify canceling a mkdir that is removed by a subsequent dirrem. */ struct newdirblk { struct worklist db_list; /* id_inowait or pg_newdirblk */ # define db_state db_list.wk_state /* unused */ struct pagedep *db_pagedep; /* associated pagedep */ + struct workhead db_mkdir; }; + +/* + * The inoref structure holds the elements common to jaddref and jremref + * so they may easily be queued in-order on the inodedep. + */ +struct inoref { + struct worklist if_list; +# define if_state if_list.wk_state + TAILQ_ENTRY(inoref) if_deps; /* Links for inodedep. */ + struct jsegdep *if_jsegdep; + off_t if_diroff; /* Directory offset. */ + ino_t if_ino; /* Inode number. */ + ino_t if_parent; /* Parent inode number. */ + nlink_t if_nlink; /* nlink before addition. */ + uint16_t if_mode; /* File mode, needed for IFMT. */ +}; + +/* + * A "jaddref" structure tracks a new reference (link count) on an inode + * and prevents the link count increase and bitmap allocation until a + * journal entry can be written. Once the journal entry is written, + * the inode is put on the pendinghd of the bmsafemap and a diradd or + * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE + * flag is used to indicate that all of the required information for writing + * the journal entry is present. MKDIR_BODY and MKDIR_PARENT are used to + * differentiate . and .. links from regular file names. NEWBLOCK indicates + * a bitmap is still pending. If a new reference is canceled by a delete + * prior to writing the journal the jaddref write is canceled and the + * structure persists to prevent any disk-visible changes until it is + * ultimately released when the file is freed or the link is dropped again. + */ +struct jaddref { + struct inoref ja_ref; +# define ja_list ja_ref.if_list /* Journal pending or jseg entries. */ +# define ja_state ja_ref.if_list.wk_state + LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */ + union { + struct diradd *jau_diradd; /* Pending diradd. 
 */
+		struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */
+	} ja_un;
+};
+#define	ja_diradd	ja_un.jau_diradd
+#define	ja_mkdir	ja_un.jau_mkdir
+#define	ja_diroff	ja_ref.if_diroff
+#define	ja_ino		ja_ref.if_ino
+#define	ja_parent	ja_ref.if_parent
+#define	ja_mode		ja_ref.if_mode
+
+/*
+ * A "jremref" structure tracks a removed reference (unlink) on an
+ * inode and prevents the directory remove from proceeding until the
+ * journal entry is written.  Once the journal has been written the remove
+ * may proceed as normal.
+ */
+struct jremref {
+	struct inoref	jr_ref;
+#	define	jr_list jr_ref.if_list	/* Journal pending or jseg entries. */
+#	define	jr_state jr_ref.if_list.wk_state
+	LIST_ENTRY(jremref) jr_deps;	/* Links for pagedep. */
+	struct dirrem	*jr_dirrem;	/* Back pointer to dirrem. */
+};
+
+struct jmvref {
+	struct worklist	jm_list;
+	LIST_ENTRY(jmvref) jm_deps;
+	struct pagedep	*jm_pagedep;
+	ino_t		jm_parent;
+	ino_t		jm_ino;
+	off_t		jm_oldoff;
+	off_t		jm_newoff;
+};
+
+/*
+ * A "jnewblk" structure tracks a newly allocated block or fragment and
+ * prevents the direct or indirect block pointer as well as the cg bitmap
+ * from being written until it is logged.  After it is logged the jsegdep
+ * is attached to the allocdirect or allocindir until the operation is
+ * completed or reverted.  If the operation is reverted prior to the journal
+ * write the jnewblk structure is maintained to prevent the bitmaps from
+ * reaching the disk.  Ultimately the jnewblk structure will be passed
+ * to the free routine as the in memory cg is modified back to the free
+ * state at which time it can be released.
+ */
+struct jnewblk {
+	struct worklist	jn_list;
+#	define	jn_state jn_list.wk_state
+	struct jsegdep	*jn_jsegdep;
+	LIST_ENTRY(jnewblk) jn_deps;	/* All jnewblks on bmsafemap */
+	struct newblk	*jn_newblk;
+	ino_t		jn_ino;
+	ufs_lbn_t	jn_lbn;
+	ufs2_daddr_t	jn_blkno;
+	int		jn_oldfrags;
+	int		jn_frags;
+};
+
+/*
+ * A "jfreeblk" structure tracks the journal write for freeing a block
+ * or tree of blocks.  The block pointer must not be cleared in the inode
+ * or indirect prior to the jfreeblk being written.
+ */
+struct jfreeblk {
+	struct worklist	jf_list;
+#	define	jf_state jf_list.wk_state
+	struct jsegdep	*jf_jsegdep;
+	struct freeblks	*jf_freeblks;
+	LIST_ENTRY(jfreeblk) jf_deps;
+	ino_t		jf_ino;
+	ufs_lbn_t	jf_lbn;
+	ufs2_daddr_t	jf_blkno;
+	int		jf_frags;
+};
+
+/*
+ * A "jfreefrag" tracks the freeing of a single block when a fragment is
+ * extended or an indirect page is replaced.  It is not part of a larger
+ * freeblks operation.
+ */
+struct jfreefrag {
+	struct worklist	fr_list;
+#	define	fr_state fr_list.wk_state
+	struct jsegdep	*fr_jsegdep;
+	struct freefrag	*fr_freefrag;
+	ino_t		fr_ino;
+	ufs_lbn_t	fr_lbn;
+	ufs2_daddr_t	fr_blkno;
+	int		fr_frags;
+};
+
+/*
+ * A "jtrunc" journals the intent to truncate an inode to a non-zero
+ * value.  This is done synchronously prior to the synchronous partial
+ * truncation process.  The jsegdep is not released until the truncation
+ * is complete and the truncated inode is fsync'd.
+ */
+struct jtrunc {
+	struct worklist	jt_list;
+	struct jsegdep	*jt_jsegdep;
+	ino_t		jt_ino;
+	off_t		jt_size;
+	int		jt_extsize;
+};
+
+/*
+ * A "jsegdep" structure tracks a single reference to a written journal
+ * segment so the journal space can be reclaimed when all dependencies
+ * have been written.
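+ * The oldest outstanding jsegdep bounds jsr_oldest in later segment
+ * headers, which tells recovery where the live journal begins.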
+ */
+struct jsegdep {
+	struct worklist	jd_list;
+#	define	jd_state jd_list.wk_state
+	struct jseg	*jd_seg;
+};
+
+/*
+ * A "jseg" structure contains all of the journal records written in a
+ * single disk write.  jaddref and jremref structures are linked into
+ * js_entries so they may be completed when the write completes.  The
+ * js_deps array contains as many entries as there are ref counts to
+ * reduce the number of allocations required per journal write to one.
+ */
+struct jseg {
+	struct worklist	js_list;	/* b_deps link for journal */
+#	define	js_state js_list.wk_state
+	struct workhead	js_entries;	/* Entries awaiting write */
+	TAILQ_ENTRY(jseg) js_next;
+	struct jblocks	*js_jblocks;	/* Back pointer to block/seg list */
+	struct buf	*js_buf;	/* Buffer while unwritten */
+	uint64_t	js_seq;
+	int		js_size;	/* Allocated size in bytes */
+	int		js_cnt;		/* Total items allocated */
+	int		js_refs;	/* Count of items pending completion */
+};
+
+/*
+ * A 'sbdep' structure tracks the head of the free inode list and
+ * superblock writes.  This makes sure the superblock is always pointing at
+ * the first possible unlinked inode for the suj recovery process.  If a
+ * block write completes and we discover a new head is available the buf
+ * is dirtied and the dep is kept.
+ */
+struct sbdep {
+	struct worklist	sb_list;	/* b_dep linkage */
+	struct fs	*sb_fs;		/* Filesystem pointer within buf. */
+	struct ufsmount	*sb_ump;
+};
Index: sys/ufs/ffs/ffs_subr.c
===================================================================
--- sys/ufs/ffs/ffs_subr.c	(revision 202342)
+++ sys/ufs/ffs/ffs_subr.c	(working copy)
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
 #ifndef _KERNEL
 #include
 #include
-#include "fsck.h"
 #else
 #include
 #include
@@ -223,12 +222,43 @@ ffs_isblock(fs, cp, h)
 		mask = 0x01 << (h & 0x7);
 		return ((cp[h >> 3] & mask) == mask);
 	default:
+#ifdef _KERNEL
 		panic("ffs_isblock");
+#endif
+		break;
 	}
 	return (0);
 }

 /*
+ * check if a block is free
+ */
+int
+ffs_isfreeblock(fs, cp, h)
+	struct fs *fs;
+	u_char *cp;
+	ufs1_daddr_t h;
+{
+
+	switch ((int)fs->fs_frag) {
+	case 8:
+		return (cp[h] == 0);
+	case 4:
+		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+	case 2:
+		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+	case 1:
+		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+	default:
+#ifdef _KERNEL
+		panic("ffs_isfreeblock");
+#endif
+		break;
+	}
+	return (0);
+}
+
+/*
 * take a block out of the map
 */
 void
@@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h)
 		cp[h >> 3] &= ~(0x01 << (h & 0x7));
 		return;
 	default:
+#ifdef _KERNEL
 		panic("ffs_clrblock");
+#endif
+		break;
 	}
 }

@@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h)
 		cp[h >> 3] |= (0x01 << (h & 0x7));
 		return;
 	default:
+#ifdef _KERNEL
 		panic("ffs_setblock");
+#endif
+		break;
 	}
 }
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ */
+void
+ffs_clusteracct(fs, cgp, blkno, cnt)
+	struct fs *fs;
+	struct cg *cgp;
+	ufs1_daddr_t blkno;
+	int cnt;
+{
+	int32_t *sump;
+	int32_t *lp;
+	u_char *freemapp, *mapp;
+	int i, start, end, forw, back, map, bit;
+
+	if (fs->fs_contigsumsize <= 0)
+		return;
+	freemapp = cg_clustersfree(cgp);
+	sump = cg_clustersum(cgp);
+	/*
+	 * Allocate or clear the actual block.
+	 */
+	if (cnt > 0)
+		setbit(freemapp, blkno);
+	else
+		clrbit(freemapp, blkno);
+	/*
+	 * Find the size of the cluster going forward.
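+	 * The scan stops at the first allocated block or after
+	 * fs_contigsumsize bits, whichever comes first.  Freeing a block
+	 * with two free neighbors ahead and one behind, for instance,
+	 * joins a cluster of four: sump[4]++, sump[2]--, sump[1]--.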
+ */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if (end >= cgp->cg_nclusterblks) + end = cgp->cg_nclusterblks; + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + sump[i] += cnt; + if (back > 0) + sump[back] -= cnt; + if (forw > 0) + sump[forw] -= cnt; + /* + * Update cluster summary information. + */ + lp = &sump[fs->fs_contigsumsize]; + for (i = fs->fs_contigsumsize; i > 0; i--) + if (*lp-- > 0) + break; + fs->fs_maxcluster[cgp->cg_cgx] = i; +} Index: sys/ufs/ffs/ffs_balloc.c =================================================================== --- sys/ufs/ffs/ffs_balloc.c (revision 202342) +++ sys/ufs/ffs/ffs_balloc.c (working copy) @@ -120,6 +120,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse if (lbn < 0) return (EFBIG); + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment @@ -418,6 +420,8 @@ fail: * slow, running out of disk space is not expected to be a common * occurence. The error return from fsync is ignored as we already * have an error to return to the user. + * + * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; @@ -473,7 +477,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); } return (error); } @@ -515,6 +519,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse if (lbn < 0) return (EFBIG); + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); + /* * Check for allocating external data. */ @@ -930,6 +937,8 @@ fail: * slow, running out of disk space is not expected to be a common * occurence. The error return from fsync is ignored as we already * have an error to return to the user. + * + * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; @@ -985,7 +994,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); } return (error); } Index: sys/ufs/ffs/ffs_inode.c =================================================================== --- sys/ufs/ffs/ffs_inode.c (revision 202342) +++ sys/ufs/ffs/ffs_inode.c (working copy) @@ -92,15 +92,6 @@ ffs_update(vp, waitfor) fs = ip->i_fs; if (fs->fs_ronly) return (0); - /* - * Ensure that uid and gid are correct. This is a temporary - * fix until fsck has been changed to do the update. 
- */ - if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ - fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ - ip->i_din1->di_ouid = ip->i_uid; /* XXX */ - ip->i_din1->di_ogid = ip->i_gid; /* XXX */ - } /* XXX */ error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { @@ -160,6 +151,7 @@ ffs_truncate(vp, length, flags, cred, td) ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; ufs2_daddr_t count, blocksreleased = 0, datablocks; + void *cookie; struct bufobj *bo; struct fs *fs; struct buf *bp; @@ -173,11 +165,14 @@ ffs_truncate(vp, length, flags, cred, td) fs = ip->i_fs; ump = ip->i_ump; bo = &vp->v_bufobj; + cookie = NULL; ASSERT_VOP_LOCKED(vp, "ffs_truncate"); if (length < 0) return (EINVAL); + if (length > fs->fs_maxfilesize) + return (EFBIG); /* * Historically clients did not have to specify which data * they were truncating. So, if not specified, we assume @@ -212,6 +207,8 @@ ffs_truncate(vp, length, flags, cred, td) panic("ffs_truncate: partial trunc of extdata"); if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) return (error); + if (DOINGSUJ(vp)) + cookie = softdep_setup_trunc(vp, length, flags); osize = ip->i_din2->di_extsize; ip->i_din2->di_blocks -= extblocks; #ifdef QUOTA @@ -227,19 +224,19 @@ ffs_truncate(vp, length, flags, cred, td) } ip->i_flag |= IN_CHANGE; if ((error = ffs_update(vp, 1))) - return (error); + goto out; for (i = 0; i < NXADDR; i++) { if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i], - sblksize(fs, osize, i), ip->i_number); + sblksize(fs, osize, i), ip->i_number, NULL); } } } - if ((flags & IO_NORMAL) == 0) - return (0); - if (length > fs->fs_maxfilesize) - return (EFBIG); + if ((flags & IO_NORMAL) == 0) { + error = 0; + goto out; + } if (vp->v_type == VLNK && (ip->i_size < vp->v_mount->mnt_maxsymlinklen || datablocks == 0)) { @@ -253,24 +250,52 @@ ffs_truncate(vp, length, flags, cred, td) ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) softdep_setup_freeblocks(ip, length, IO_EXT); - return (ffs_update(vp, 1)); + error = ffs_update(vp, 1); + goto out; } if (ip->i_size == length) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) softdep_setup_freeblocks(ip, length, IO_EXT); - return (ffs_update(vp, 0)); + error = ffs_update(vp, 0); + goto out; } if (fs->fs_ronly) panic("ffs_truncate: read-only filesystem"); #ifdef QUOTA error = getinoquota(ip); if (error) - return (error); + goto out; #endif if ((ip->i_flags & SF_SNAPSHOT) != 0) ffs_snapremove(vp); vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + osize = ip->i_size; + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. + */ + if (osize < length) { + vnode_pager_setsize(vp, length); + flags |= BA_CLRBUF; + error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); + if (error) { + vnode_pager_setsize(vp, osize); + goto out; + } + ip->i_size = length; + DIP_SET(ip, i_size, length); + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + error = ffs_update(vp, 1); + goto out; + } if (DOINGSOFTDEP(vp)) { if (length > 0 || softdepslowdown) { /* @@ -283,11 +308,18 @@ ffs_truncate(vp, length, flags, cred, td) * so that it will have no data structures left. 
*/ if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) - return (error); + goto out; UFS_LOCK(ump); if (ip->i_flag & IN_SPACECOUNTED) fs->fs_pendingblocks -= datablocks; UFS_UNLOCK(ump); + /* + * We have to journal the truncation before we change + * any blocks so we don't leave the file partially + * truncated. + */ + if (DOINGSUJ(vp) && cookie == NULL) + cookie = softdep_setup_trunc(vp, length, flags); } else { #ifdef QUOTA (void) chkdq(ip, -datablocks, NOCRED, 0); @@ -301,35 +333,11 @@ ffs_truncate(vp, length, flags, cred, td) OFF_TO_IDX(lblktosize(fs, -extblocks))); vnode_pager_setsize(vp, 0); ip->i_flag |= IN_CHANGE | IN_UPDATE; - return (ffs_update(vp, 0)); + error = ffs_update(vp, 0); + goto out; } } - osize = ip->i_size; /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of osize is 0, length will be at least 1. - */ - if (osize < length) { - vnode_pager_setsize(vp, length); - flags |= BA_CLRBUF; - error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); - if (error) { - vnode_pager_setsize(vp, osize); - return (error); - } - ip->i_size = length; - DIP_SET(ip, i_size, length); - if (bp->b_bufsize == fs->fs_bsize) - bp->b_flags |= B_CLUSTEROK; - if (flags & IO_SYNC) - bwrite(bp); - else - bawrite(bp); - ip->i_flag |= IN_CHANGE | IN_UPDATE; - return (ffs_update(vp, 1)); - } - /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be @@ -345,9 +353,8 @@ ffs_truncate(vp, length, flags, cred, td) lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); - if (error) { - return (error); - } + if (error) + goto out; /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized @@ -359,7 +366,7 @@ ffs_truncate(vp, length, flags, cred, td) if (DOINGSOFTDEP(vp) && lbn < NDADDR && fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT)) != 0) - return (error); + goto out; ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); @@ -445,7 +452,7 @@ ffs_truncate(vp, length, flags, cred, td) if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ip->i_devvp, bn, - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); blocksreleased += nblocks; } } @@ -464,7 +471,8 @@ ffs_truncate(vp, length, flags, cred, td) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); - ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number); + ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number, + NULL); blocksreleased += btodb(bsize); } if (lastblock < 0) @@ -496,7 +504,7 @@ ffs_truncate(vp, length, flags, cred, td) */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ip->i_devvp, bn, - oldspace - newspace, ip->i_number); + oldspace - newspace, ip->i_number, NULL); blocksreleased += btodb(oldspace - newspace); } } @@ -528,7 +536,14 @@ done: #ifdef QUOTA (void) chkdq(ip, -blocksreleased, NOCRED, 0); #endif - return (allerror); + error = allerror; +out: + if (cookie) { + allerror = softdep_complete_trunc(vp, cookie); + if (allerror != 0 && error == 0) + error = allerror; + } + return (error); } /* @@ -638,7 +653,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp blocksreleased += blkcount; } ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); blocksreleased += nblocks; } Index: 
sys/ufs/ffs/fs.h =================================================================== --- sys/ufs/ffs/fs.h (revision 202342) +++ sys/ufs/ffs/fs.h (working copy) @@ -340,7 +340,10 @@ struct fs { int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */ - int32_t fs_sparecon32[26]; /* reserved for future constants */ + int32_t fs_sujournal; /* SUJ journal file */ + int32_t fs_sujfree; /* SUJ free list */ + ufs_time_t fs_mtime; /* Last mount or fsck time. */ + int32_t fs_sparecon32[22]; /* reserved for future constants */ int32_t fs_flags; /* see FS_ flags below */ int32_t fs_contigsumsize; /* size of cluster summary array */ int32_t fs_maxsymlinklen; /* max length of an internal symlink */ @@ -414,6 +417,7 @@ CTASSERT(sizeof(struct fs) == 1376); #define FS_GJOURNAL 0x0040 /* gjournaled file system */ #define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */ #define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */ +#define FS_SUJ 0x200 /* Filesystem using softupdate journal */ /* * Macros to access bits in the fs_active array. @@ -603,8 +607,32 @@ struct cg { ? (fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (size))))) - /* + * Indirect lbns are aligned on NDADDR addresses where single indirects + * are the negated address of the lowest lbn reachable, double indirects + * are this lbn - 1 and triple indirects are this lbn - 2. This yields + * an unusual bit order to determine level. + */ +static inline int +lbn_level(ufs_lbn_t lbn) +{ + if (lbn >= 0) + return 0; + switch (lbn & 0x3) { + case 0: + return (0); + case 1: + break; + case 2: + return (2); + case 3: + return (1); + default: + break; + } + return (-1); +} +/* * Number of inodes in a secondary storage block/fragment. */ #define INOPB(fs) ((fs)->fs_inopb) @@ -615,6 +643,89 @@ struct cg { */ #define NINDIR(fs) ((fs)->fs_nindir) +/* + * Softdep journal record format. + */ + +#define JOP_ADDREF 1 /* Add a reference to an inode. */ +#define JOP_REMREF 2 /* Remove a reference from an inode. */ +#define JOP_NEWBLK 3 /* Allocate a block. */ +#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */ +#define JOP_MVREF 5 /* Move a reference from one off to another. */ +#define JOP_TRUNC 6 /* Partial truncation record. */ + +#define JREC_SIZE 32 /* Record and segment header size. */ + +#define SUJ_MIN (1 * 1024 * 1024) /* Minimum journal size */ +#define SUJ_MAX (64 * SUJ_MIN) /* Maximum journal size */ + +/* + * Size of the segment record header. There is at most one for each disk + * block and at least one for each filesystem block in the journal. The + * segment header is followed by an array of records. 
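+ * At the common 512 byte disk block size this leaves room for the
+ * header plus fifteen more JREC_SIZE (32 byte) records per block.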
+ */ +struct jsegrec { + uint64_t jsr_seq; /* Our sequence number */ + uint64_t jsr_oldest; /* Oldest valid sequence number */ + uint32_t jsr_cnt; /* Count of valid records */ + uint32_t jsr_crc; /* 32bit crc of the valid space */ + ufs_time_t jsr_time; /* timestamp for mount instance */ +}; + +struct jrefrec { + uint32_t jr_op; + ino_t jr_ino; + ino_t jr_parent; + uint16_t jr_nlink; + uint16_t jr_mode; + off_t jr_diroff; + uint64_t jr_unused; +}; + +struct jmvrec { + uint32_t jm_op; + ino_t jm_ino; + ino_t jm_parent; + uint16_t jm_unused; + off_t jm_oldoff; + off_t jm_newoff; +}; + +struct jblkrec { + uint32_t jb_op; + uint32_t jb_ino; + ufs2_daddr_t jb_blkno; + ufs_lbn_t jb_lbn; + uint16_t jb_frags; + uint16_t jb_oldfrags; + uint32_t jb_unused; +}; + +struct jtrncrec { + uint32_t jt_op; + uint32_t jt_ino; + off_t jt_size; + uint32_t jt_extsize; + uint32_t jt_pad[3]; +}; + +union jrec { + struct jsegrec rec_jsegrec; + struct jrefrec rec_jrefrec; + struct jmvrec rec_jmvrec; + struct jblkrec rec_jblkrec; + struct jtrncrec rec_jtrncrec; +}; + +#ifdef CTASSERT +CTASSERT(sizeof(struct jsegrec) == JREC_SIZE); +CTASSERT(sizeof(struct jrefrec) == JREC_SIZE); +CTASSERT(sizeof(struct jmvrec) == JREC_SIZE); +CTASSERT(sizeof(struct jblkrec) == JREC_SIZE); +CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE); +CTASSERT(sizeof(union jrec) == JREC_SIZE); +#endif + extern int inside[], around[]; extern u_char *fragtbl[]; Index: sys/ufs/ffs/ffs_snapshot.c =================================================================== --- sys/ufs/ffs/ffs_snapshot.c (revision 202342) +++ sys/ufs/ffs/ffs_snapshot.c (working copy) @@ -582,7 +582,8 @@ loop: len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, - DIP(xp, i_db[loc]), len, xp->i_number); + DIP(xp, i_db[loc]), len, xp->i_number, + NULL); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } @@ -598,7 +599,7 @@ loop: DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, - xp->i_mode); + xp->i_mode, NULL); VOP_UNLOCK(xvp, 0); vdrop(xvp); if (error) { @@ -700,7 +701,7 @@ out1: copy_fs, vp, xp->i_number, - xp->i_mode); + xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; @@ -1220,7 +1221,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, ex *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); } return (0); } @@ -1500,7 +1501,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, ex *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); } return (0); } Index: sys/kern/vfs_bio.c =================================================================== --- sys/kern/vfs_bio.c (revision 202342) +++ sys/kern/vfs_bio.c (working copy) @@ -216,6 +216,14 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLA static int bd_request; /* + * Request for the buf daemon to write more buffers than is indicated by + * lodirtybuf. This may be necessary to push out excess dependencies or + * defragment the address space where a simple count of the number of dirty + * buffers is insufficient to characterize the demand for flushing them. + */ +static int bd_speedupreq; + +/* * This lock synchronizes access to bd_request. 
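 * It now guards bd_speedupreq as well.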
*/ static struct mtx bdlock; @@ -467,12 +475,20 @@ bd_wakeup(int dirtybuflevel) * bd_speedup - speedup the buffer cache flushing code */ -static __inline void bd_speedup(void) { + int needwake; - bd_wakeup(1); + mtx_lock(&bdlock); + needwake = 0; + if (bd_speedupreq == 0 || bd_request == 0) + needwake = 1; + bd_speedupreq = 1; + bd_request = 1; + if (needwake) + wakeup(&bd_request); + mtx_unlock(&bdlock); } /* @@ -2120,6 +2136,7 @@ buf_do_flush(struct vnode *vp) static void buf_daemon() { + int lodirtysave; /* * This process needs to be suspended prior to shutdown sync. @@ -2137,7 +2154,11 @@ buf_daemon() mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); - + lodirtysave = lodirtybuffers; + if (bd_speedupreq) { + lodirtybuffers = numdirtybuffers / 2; + bd_speedupreq = 0; + } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate @@ -2149,6 +2170,7 @@ buf_daemon() break; uio_yield(); } + lodirtybuffers = lodirtysave; /* * Only clear bd_request if we have reached our low water Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c (revision 202342) +++ sys/kern/vfs_subr.c (working copy) @@ -2833,6 +2833,7 @@ DB_SHOW_COMMAND(mount, db_show_mount) MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); + MNT_FLAG(MNT_SOFTDEP); #undef MNT_FLAG if (flags != 0) { if (buf[0] != '\0') Index: sys/sys/mount.h =================================================================== --- sys/sys/mount.h (revision 202342) +++ sys/sys/mount.h (working copy) @@ -240,6 +240,7 @@ void __mnt_vnode_markerfree(struct vnode #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ #define MNT_NFS4ACLS 0x00000010 +#define MNT_SUJ 0x00000080 /* softdep journaling */ /* * NFS export related mount flags. @@ -275,7 +276,8 @@ void __mnt_vnode_markerfree(struct vnode MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ - MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | MNT_NFS4ACLS) + MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ + MNT_NFS4ACLS | MNT_SUJ) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ Index: sys/sys/buf.h =================================================================== --- sys/sys/buf.h (revision 202342) +++ sys/sys/buf.h (working copy) @@ -493,6 +493,7 @@ int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); +void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **);
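
bd_speedup() is exported above so that subsystems other than the buf daemon
can ask for a more aggressive flush when dependencies pile up.  A minimal
sketch of such a consumer, assuming a hypothetical worklist_speedup() helper
that kicks the softdep work queue:

/*
 * Illustrative only: request that dirty buffers be pushed out faster
 * when journal or worklist space runs low.  worklist_speedup() is a
 * hypothetical helper, not part of this patch.
 */
static void
journal_space_low(void)
{

	worklist_speedup();
	bd_speedup();
}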