Index: sbin/fsck_ffs/suj.c
===================================================================
--- sbin/fsck_ffs/suj.c	(revision 0)
+++ sbin/fsck_ffs/suj.c	(revision 0)
@@ -0,0 +1,2065 @@
+/*-
+ * Copyright (c) 2009 Jeffrey W. Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ffs/fs.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <libufs.h>
+#include <strings.h>
+#include <err.h>
+#include <assert.h>
+
+#include "fsck.h"
+
+static void	ino_decr(ino_t);
+
+#define	SUJ_HASHSIZE	128
+#define	SUJ_HASHMASK	(SUJ_HASHSIZE - 1)
+#define	SUJ_HASH(x)	((x * 2654435761) & SUJ_HASHMASK)
+
+struct suj_seg {
+	TAILQ_ENTRY(suj_seg) ss_next;
+	struct jsegrec	ss_rec;
+	uint8_t		*ss_blk;
+};
+
+struct suj_rec {
+	TAILQ_ENTRY(suj_rec) sr_next;
+	union jrec	*sr_rec;
+};
+TAILQ_HEAD(srechd, suj_rec);
+
+struct suj_ino {
+	LIST_ENTRY(suj_ino) si_next;
+	struct srechd	si_recs;
+	struct srechd	si_movs;
+	ino_t		si_ino;
+	int		si_nlinkadj;
+	int		si_skipparent;
+	int		si_linkadj;
+	int		si_hasrecs;
+	int		si_blkadj;
+};
+LIST_HEAD(inohd, suj_ino);
+
+struct suj_blk {
+	LIST_ENTRY(suj_blk) sb_next;
+	struct srechd	sb_recs;
+	ufs2_daddr_t	sb_blk;
+};
+LIST_HEAD(blkhd, suj_blk);
+
+struct data_blk {
+	LIST_ENTRY(data_blk) db_next;
+	uint8_t		*db_buf;
+	ufs2_daddr_t	db_blk;
+	int		db_size;
+};
+
+struct ino_blk {
+	LIST_ENTRY(ino_blk) ib_next;
+	uint8_t		*ib_buf;
+	int		ib_dirty;
+	ufs2_daddr_t	ib_blk;
+};
+LIST_HEAD(iblkhd, ino_blk);
+
+struct suj_cg {
+	LIST_ENTRY(suj_cg) sc_next;
+	struct blkhd	sc_blkhash[SUJ_HASHSIZE];
+	struct inohd	sc_inohash[SUJ_HASHSIZE];
+	struct iblkhd	sc_iblkhash[SUJ_HASHSIZE];
+	struct ino_blk	*sc_lastiblk;
+	uint8_t		*sc_cgbuf;
+	struct cg	*sc_cgp;
+	int		sc_dirty;
+	int		sc_cgx;
+};
+
+LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE];
+LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE];
+
+TAILQ_HEAD(seghd, suj_seg) allsegs;
+uint64_t oldseq;
+static struct uufsd *disk = NULL;
+static struct fs *fs = NULL;
+
+/*
+ * Summary statistics.
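[Editor's aside: the suj_cg/suj_ino/suj_blk caches above all index into SUJ_HASHSIZE (128, a power of two) buckets via a multiplicative hash; 2654435761 is the well-known Knuth-style constant near 2^32 divided by the golden ratio, so masking off the low bits still spreads consecutive keys. One review nit: the macro does not parenthesize x, which is harmless for the simple arguments used in this file but worth fixing. A minimal standalone sketch, not part of the patch:

	#include <stdint.h>
	#include <stdio.h>

	#define	SUJ_HASHSIZE	128
	#define	SUJ_HASHMASK	(SUJ_HASHSIZE - 1)
	/* As in the patch, but with the argument parenthesized. */
	#define	SUJ_HASH(x)	(((x) * 2654435761U) & SUJ_HASHMASK)

	int
	main(void)
	{
		uint32_t key;

		/* Consecutive keys land in well-spread buckets. */
		for (key = 1; key <= 6; key++)
			printf("key %u -> bucket %u\n", key, SUJ_HASH(key));
		return (0);
	}

End aside.]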
+ */ +uint64_t freefrags; +uint64_t freeblocks; +uint64_t freeinos; +uint64_t freedir; +uint64_t jbytes; +uint64_t jrecs; + +typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int); + +static void * +errmalloc(size_t n) +{ + void *a; + + a = malloc(n); + if (a == NULL) + errx(1, "malloc(%zu)", n); + return (a); +} + +/* + * Open the given provider, load superblock. + */ +static void +opendisk(const char *devnam) +{ + if (disk != NULL) + return; + disk = malloc(sizeof(*disk)); + if (disk == NULL) + errx(1, "malloc(%zu)", sizeof(*disk)); + if (ufs_disk_fillout(disk, devnam) == -1) { + err(1, "ufs_disk_fillout(%s) failed: %s", devnam, + disk->d_error); + } + fs = &disk->d_fs; + /* + * Setup a few things so reply() can work. + */ + bcopy(fs, &sblock, sizeof(sblock)); + fsreadfd = disk->d_fd; + fswritefd = disk->d_fd; +} + +/* + * Mark file system as clean, write the super-block back, close the disk. + */ +static void +closedisk(const char *devnam) +{ + struct csum *cgsum; + int i; + + /* + * Recompute the fs summary info from correct cs summaries. + */ + bzero(&fs->fs_cstotal, sizeof(struct csum_total)); + for (i = 0; i < fs->fs_ncg; i++) { + cgsum = &fs->fs_cs(fs, i); + fs->fs_cstotal.cs_nffree += cgsum->cs_nffree; + fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree; + fs->fs_cstotal.cs_nifree += cgsum->cs_nifree; + fs->fs_cstotal.cs_ndir += cgsum->cs_ndir; + } + /* XXX Don't set clean for now, we don't trust the journal. */ + /* fs->fs_clean = 1; */ + fs->fs_time = time(NULL); + fs->fs_mtime = time(NULL); + if (sbwrite(disk, 0) == -1) + err(1, "sbwrite(%s)", devnam); + if (ufs_disk_close(disk) == -1) + err(1, "ufs_disk_close(%s)", devnam); + free(disk); + disk = NULL; + fs = NULL; + fsreadfd = -1; + fswritefd = -1; +} + +/* + * Lookup a cg by number in the hash so we can keep track of which cgs + * need stats rebuilt. + */ +static struct suj_cg * +cg_lookup(int cgx) +{ + struct cghd *hd; + struct suj_cg *sc; + + if (cgx < 0 || cgx >= fs->fs_ncg) { + abort(); + errx(1, "Bad cg number %d", cgx); + } + hd = &cghash[SUJ_HASH(cgx)]; + LIST_FOREACH(sc, hd, sc_next) + if (sc->sc_cgx == cgx) + return (sc); + sc = errmalloc(sizeof(*sc)); + bzero(sc, sizeof(*sc)); + sc->sc_cgbuf = errmalloc(fs->fs_bsize); + sc->sc_cgp = (struct cg *)sc->sc_cgbuf; + sc->sc_cgx = cgx; + LIST_INSERT_HEAD(hd, sc, sc_next); + if (bread(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, + fs->fs_bsize) == -1) + err(1, "Unable to read cylinder group %d", sc->sc_cgx); + + return (sc); +} + +/* + * Lookup an inode number in the hash and allocate a suj_ino if it does + * not exist. + */ +static struct suj_ino * +ino_lookup(ino_t ino, int creat) +{ + struct suj_ino *sino; + struct inohd *hd; + struct suj_cg *sc; + + sc = cg_lookup(ino_to_cg(fs, ino)); + hd = &sc->sc_inohash[SUJ_HASH(ino)]; + LIST_FOREACH(sino, hd, si_next) + if (sino->si_ino == ino) + return (sino); + if (creat == 0) + return (NULL); + sino = errmalloc(sizeof(*sino)); + bzero(sino, sizeof(*sino)); + sino->si_ino = ino; + sino->si_nlinkadj = 0; + TAILQ_INIT(&sino->si_recs); + TAILQ_INIT(&sino->si_movs); + LIST_INSERT_HEAD(hd, sino, si_next); + + return (sino); +} + +/* + * Lookup a block number in the hash and allocate a suj_blk if it does + * not exist. 
+ */ +static struct suj_blk * +blk_lookup(ufs2_daddr_t blk, int creat) +{ + struct suj_blk *sblk; + struct suj_cg *sc; + struct blkhd *hd; + + sc = cg_lookup(dtog(fs, blk)); + hd = &sc->sc_blkhash[SUJ_HASH(blk)]; + LIST_FOREACH(sblk, hd, sb_next) + if (sblk->sb_blk == blk) + return (sblk); + if (creat == 0) + return (NULL); + sblk = errmalloc(sizeof(*sblk)); + bzero(sblk, sizeof(*sblk)); + sblk->sb_blk = blk; + TAILQ_INIT(&sblk->sb_recs); + LIST_INSERT_HEAD(hd, sblk, sb_next); + + return (sblk); +} + +static uint8_t * +dblk_read(ufs2_daddr_t blk, int size) +{ + struct data_blk *dblk; + struct dblkhd *hd; + + hd = &dbhash[SUJ_HASH(blk)]; + LIST_FOREACH(dblk, hd, db_next) + if (dblk->db_blk == blk) + goto found; + /* + * The inode block wasn't located, allocate a new one. + */ + dblk = errmalloc(sizeof(*dblk)); + bzero(dblk, sizeof(*dblk)); + LIST_INSERT_HEAD(hd, dblk, db_next); + dblk->db_blk = blk; +found: + /* + * I doubt size mismatches can happen in practice but it is trivial + * to handle. + */ + if (size != dblk->db_size) { + if (dblk->db_buf) + free(dblk->db_buf); + dblk->db_buf = errmalloc(size); + dblk->db_size = size; + if (bread(disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1) + err(1, "Failed to read data block %jd", blk); + } + return (dblk->db_buf); +} + +static union dinode * +ino_read(ino_t ino) +{ + struct ino_blk *iblk; + struct iblkhd *hd; + struct suj_cg *sc; + ufs2_daddr_t blk; + int off; + + blk = ino_to_fsba(fs, ino); + sc = cg_lookup(ino_to_cg(fs, ino)); + hd = &sc->sc_iblkhash[SUJ_HASH(blk)]; + LIST_FOREACH(iblk, hd, ib_next) + if (iblk->ib_blk == blk) + goto found; + /* + * The inode block wasn't located, allocate a new one. + */ + iblk = errmalloc(sizeof(*iblk)); + bzero(iblk, sizeof(*iblk)); + iblk->ib_buf = errmalloc(fs->fs_bsize); + iblk->ib_blk = blk; + LIST_INSERT_HEAD(hd, iblk, ib_next); + if (bread(disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1) + err(1, "Failed to read inode block %jd", blk); +found: + sc->sc_lastiblk = iblk; + off = ino_to_fsbo(fs, ino); + if (fs->fs_magic == FS_UFS1_MAGIC) + return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off]; + else + return (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off]; +} + +static void +ino_dirty(ino_t ino) +{ + struct ino_blk *iblk; + struct iblkhd *hd; + struct suj_cg *sc; + ufs2_daddr_t blk; + + blk = ino_to_fsba(fs, ino); + sc = cg_lookup(ino_to_cg(fs, ino)); + iblk = sc->sc_lastiblk; + if (iblk && iblk->ib_blk == blk) { + iblk->ib_dirty = 1; + return; + } + hd = &sc->sc_iblkhash[SUJ_HASH(blk)]; + LIST_FOREACH(iblk, hd, ib_next) { + if (iblk->ib_blk == blk) { + iblk->ib_dirty = 1; + return; + } + } + ino_read(ino); + ino_dirty(ino); +} + +static void +iblk_write(struct ino_blk *iblk) +{ + + if (iblk->ib_dirty == 0) + return; + if (bwrite(disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf, + fs->fs_bsize) == -1) + err(1, "Failed to write inode block %jd", iblk->ib_blk); +} + +/* + * Return 1 if the inode was free and 0 if it is allocated. 
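[Editor's aside: dblk_read() above keeps one buffer per data block and re-reads only when the requested size differs from what is cached; a freshly allocated entry has db_size 0, which can never match a real request, so the first lookup always triggers the read. A standalone sketch of that idea, with a single slot standing in for the hash table and memset() standing in for bread(); all names are hypothetical:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct cached_blk {
		long long blk;	/* cached block address */
		int size;	/* bytes cached; 0 forces a read */
		uint8_t *buf;
	};

	static uint8_t *
	cache_read(struct cached_blk *cb, long long blk, int size)
	{

		if (cb->buf == NULL || cb->blk != blk) {
			cb->blk = blk;
			cb->size = 0;	/* never matches, forces a read */
		}
		if (size != cb->size) {
			free(cb->buf);
			if ((cb->buf = malloc(size)) == NULL)
				abort();
			cb->size = size;
			memset(cb->buf, (int)(blk & 0xff), size); /* fake bread() */
		}
		return (cb->buf);
	}

	int
	main(void)
	{
		struct cached_blk cb = { 0, 0, NULL };

		printf("%d\n", cache_read(&cb, 42, 512)[0]);	/* reads */
		printf("%d\n", cache_read(&cb, 42, 512)[0]);	/* cached copy */
		free(cb.buf);
		return (0);
	}

End aside.]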
+ */ +static int +ino_isfree(ino_t ino) +{ + struct suj_cg *sc; + uint8_t *inosused; + struct cg *cgp; + int cg; + + cg = ino_to_cg(fs, ino); + ino = ino % fs->fs_ipg; + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + inosused = cg_inosused(cgp); + return isclr(inosused, ino); +} + +static int +blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags) +{ + ufs2_daddr_t bstart; + ufs2_daddr_t bend; + ufs2_daddr_t end; + + end = start + frags; + bstart = brec->jb_blkno + brec->jb_oldfrags; + bend = bstart + brec->jb_frags; + if (start < bend && end > bstart) + return (1); + return (0); +} + +static int +blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start, + int frags) +{ + + if (brec->jb_ino != ino || brec->jb_lbn != lbn) + return (0); + if (brec->jb_blkno + brec->jb_oldfrags != start) + return (0); + if (brec->jb_frags != frags) + return (0); + return (1); +} + +static void +blk_setmask(struct jblkrec *brec, int *mask) +{ + int i; + + for (i = brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++) + *mask |= 1 << i; +} + +/* + * Determine whether a given block has been reallocated to a new location. + * Returns a mask of overlapping bits if any frags have been reused or + * zero if the block has not been re-used and the contents can be trusted. + * + * This is used to ensure that an orphaned pointer due to truncate is safe + * to be freed. The mask value can be used to free partial blocks. + */ +static int +blk_isfree(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags) +{ + struct suj_blk *sblk; + struct suj_rec *srec; + struct jblkrec *brec; + int mask; + int off; + + /* + * To be certain we're not freeing a reallocated block we lookup + * this block in the blk hash and see if there is an allocation + * journal record that overlaps with any fragments in the block + * we're concerned with. If any fragments have ben reallocated + * the block has already been freed and re-used for another purpose. + */ + mask = 0; + sblk = blk_lookup(blknum(fs, blk), 0); + if (sblk == NULL) + return (0); + off = blk - sblk->sb_blk; + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + brec = (struct jblkrec *)srec->sr_rec; + /* + * If the block overlaps but does not match + * exactly it's a new allocation. If it matches + * exactly this record refers to the current + * location. + */ + if (blk_overlaps(brec, blk, frags) == 0) + continue; + if (blk_equals(brec, ino, lbn, blk, frags) == 1) + mask = 0; + else + blk_setmask(brec, &mask); + } + if (debug) + printf("blk_isfree: blk %jd sblk %jd off %d mask 0x%X\n", + blk, sblk->sb_blk, off, mask); + return (mask >> off); +} + +/* + * Determine whether it is safe to follow an indirect. It is not safe + * if any part of the indirect has been reallocated or the last journal + * entry was an allocation. Just allocated indirects may not have valid + * pointers yet and all of their children will have their own records. + * + * Returns 1 if it's safe to follow the indirect and 0 otherwise. + */ +static int +blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn) +{ + struct suj_blk *sblk; + struct jblkrec *brec; + + sblk = blk_lookup(blk, 0); + if (sblk == NULL) + return (1); + if (TAILQ_EMPTY(&sblk->sb_recs)) + return (1); + brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec; + if (blk_equals(brec, ino, lbn, blk, fs->fs_frag)) + if (brec->jb_op == JOP_FREEBLK) + return (1); + return (0); +} + +/* + * Clear an inode from the cg bitmap. 
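[Editor's aside: blk_overlaps() and blk_setmask() above do half-open interval arithmetic on fragment ranges: [start, start + frags) overlaps [bstart, bend) iff each range starts before the other ends, and the mask marks fragments jb_oldfrags through jb_oldfrags + jb_frags - 1. A standalone sketch of both, hypothetical names, not part of the patch:

	#include <stdio.h>

	/* Two half-open ranges overlap iff each starts before the
	 * other ends. */
	static int
	ranges_overlap(long long s1, int n1, long long s2, int n2)
	{

		return (s1 < s2 + n2 && s2 < s1 + n1);
	}

	/* Set bits off .. off + cnt - 1, as blk_setmask() does. */
	static int
	frag_mask(int off, int cnt)
	{
		int i, mask;

		mask = 0;
		for (i = off; i < off + cnt; i++)
			mask |= 1 << i;
		return (mask);
	}

	int
	main(void)
	{

		printf("%d\n", ranges_overlap(100, 4, 103, 2)); /* 1: share frag 103 */
		printf("%d\n", ranges_overlap(100, 4, 104, 2)); /* 0: touch, no overlap */
		printf("0x%X\n", frag_mask(2, 3));		/* 0x1C: frags 2-4 */
		return (0);
	}

End aside.]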
If the inode was already clear return + * 0 so the caller knows it does not have to check the inode contents. + */ +static int +ino_free(ino_t ino, int mode) +{ + struct suj_cg *sc; + uint8_t *inosused; + struct cg *cgp; + int cg; + + cg = ino_to_cg(fs, ino); + ino = ino % fs->fs_ipg; + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + inosused = cg_inosused(cgp); + /* + * The bitmap may never have made it to the disk so we have to + * conditionally clear. We can avoid writing the cg in this case. + */ + if (isclr(inosused, ino)) + return (0); + freeinos++; + clrbit(inosused, ino); + if (ino < cgp->cg_irotor) + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree++; + if ((mode & IFMT) == IFDIR) { + freedir++; + cgp->cg_cs.cs_ndir--; + } + sc->sc_dirty = 1; + + return (1); +} + +/* + * Free 'frags' frags starting at filesystem block 'bno' skipping any frags + * set in the mask. + */ +static void +blk_free(ufs2_daddr_t bno, int mask, int frags) +{ + ufs1_daddr_t fragno, cgbno; + struct suj_cg *sc; + struct cg *cgp; + int i, cg; + uint8_t *blksfree; + + if (debug) + printf("Freeing %d frags at blk %jd\n", frags, bno); + cg = dtog(fs, bno); + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp); + + /* + * If it's not allocated we only wrote the journal entry + * and never the bitmaps. Here we unconditionally clear and + * resolve the cg summary later. + */ + if (frags == fs->fs_frag && mask == 0) { + fragno = fragstoblks(fs, cgbno); + ffs_setblock(fs, blksfree, fragno); + freeblocks++; + } else { + /* + * deallocate the fragment + */ + for (i = 0; i < frags; i++) + if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) { + freefrags++; + setbit(blksfree, cgbno + i); + } + } + sc->sc_dirty = 1; +} + +/* + * Fetch an indirect block to find the block at a given lbn. The lbn + * may be negative to fetch a specific indirect block pointer or positive + * to fetch a specific block. + */ +static ufs2_daddr_t +indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn, int level) +{ + ufs2_daddr_t *bap2; + ufs2_daddr_t *bap1; + ufs_lbn_t lbnadd; + ufs_lbn_t base; + int i; + + if (blk == 0) + return (0); + if (cur == lbn) + return (blk); + if (level == 0 && lbn < 0) { + abort(); + errx(1, "Invalid lbn %jd", lbn); + } + bap2 = (void *)dblk_read(blk, fs->fs_bsize); + bap1 = (void *)bap2; + lbnadd = 1; + base = -(cur + level); + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + if (lbn > 0) + i = (lbn - base) / lbnadd; + else + i = (-lbn - base) / lbnadd; + if (i < 0 || i >= NINDIR(fs)) { + abort(); + errx(1, "Invalid indirect index %d produced by lbn %jd", + i, lbn); + } + if (level == 0) + cur = base + (i * lbnadd); + else + cur = -(base + (i * lbnadd)) - (level - 1); + if (fs->fs_magic == FS_UFS1_MAGIC) + blk = bap1[i]; + else + blk = bap2[i]; + if (cur == lbn) + return (blk); + if (level == 0) { + abort(); + errx(1, "Invalid lbn %jd at level 0", lbn); + } + return indir_blkatoff(blk, ino, cur, lbn, level - 1); +} + +/* + * Finds the disk block address at the specified lbn within the inode + * specified by ip. This follows the whole tree and honors di_size and + * di_extsize so it is a true test of reachability. The lbn may be + * negative if an extattr or indirect block is requested. + */ +static ufs2_daddr_t +ino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags) +{ + ufs_lbn_t tmpval; + ufs_lbn_t cur; + ufs_lbn_t next; + int i; + + /* + * Handle extattr blocks first. 
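[Editor's aside: indir_blkatoff() above locates a block by dividing the distance from the start of an indirect tree by the number of data blocks each pointer slot covers, which is NINDIR raised to the tree level. The patch additionally addresses the indirect blocks themselves with negative lbns; this sketch shows only the positive-lbn slot arithmetic, with assumed toy constants:

	#include <stdio.h>

	#define	NDADDR	12	/* direct pointers in a UFS inode */
	#define	NINDIR	4096	/* pointers per indirect block (assumed) */

	/* Which pointer slot covers lbn in the indirect tree at 'level'? */
	static int
	indir_slot(long long lbn, int level)
	{
		long long base, lbnadd;
		int i;

		base = NDADDR;	/* first lbn behind the 1st indirect */
		lbnadd = 1;	/* data blocks per slot at this level */
		for (i = 0; i < level; i++) {
			base += lbnadd * NINDIR;	/* skip shallower trees */
			lbnadd *= NINDIR;
		}
		return ((int)((lbn - base) / lbnadd));
	}

	int
	main(void)
	{

		printf("%d\n", indir_slot(NDADDR, 0));			/* 0 */
		printf("%d\n", indir_slot(NDADDR + NINDIR + 1, 1));	/* 0 */
		printf("%d\n", indir_slot(NDADDR + 3LL * NINDIR, 1));	/* 2 */
		return (0);
	}

End aside.]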
+ */ + if (lbn < 0 && lbn >= -NXADDR) { + lbn = -1 - lbn; + if (lbn > lblkno(fs, ip->dp2.di_extsize - 1)) + return (0); + *frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn)); + return (ip->dp2.di_extb[lbn]); + } + /* + * And now direct and indirect. Verify that the lbn does not + * exceed the size required to store the file by asking for + * the lbn of the last byte. These blocks should be 0 anyway + * so this simply saves the traversal. + */ + if (lbn > 0 && lbn > lblkno(fs, DIP(ip, di_size) - 1)) + return (0); + if (lbn < 0 && -lbn > lblkno(fs, DIP(ip, di_size) - 1)) + return (0); + if (lbn >= 0 && lbn < NDADDR) { + *frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn)); + return (DIP(ip, di_db[lbn])); + } + *frags = fs->fs_frag; + + for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++, + tmpval *= NINDIR(fs), cur = next) { + next = cur + tmpval; + if (lbn == -cur) + return (DIP(ip, di_ib[i])); + /* + * Determine whether the lbn in question is within this tree. + */ + if (lbn < 0 && -lbn >= next) + continue; + if (lbn > 0 && lbn >= next) + continue; + + return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn, i); + } + errx(1, "lbn %jd not in ino", lbn); +} + +/* + * Determine whether a block exists at a particular lbn in an inode. + * Returns 1 if found, 0 if not. lbn may be negative for indirects + * or ext blocks. + */ +static int +blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags) +{ + union dinode *ip; + ufs2_daddr_t nblk; + + ip = ino_read(ino); + + if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0) + return (0); + nblk = ino_blkatoff(ip, ino, lbn, frags); + + return (nblk == blk); +} + +/* + * Determines whether a pointer to an inode exists within a directory + * at a specified offset. Returns the mode of the found entry. + */ +static int +ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot) +{ + union dinode *dip; + struct direct *dp; + ufs2_daddr_t blk; + uint8_t *block; + ufs_lbn_t lbn; + int blksize; + int frags; + int dpoff; + int doff; + + *isdot = 0; + dip = ino_read(parent); + *mode = DIP(dip, di_mode); + if ((*mode & IFMT) != IFDIR) { + if (debug) { + /* This can happen if the parent inode was reallocated. */ + if (*mode != 0) + printf("Directory %d has bad mode %o\n", + parent, *mode); + else + printf("Directory %d zero inode\n", parent); + } + return (0); + } + lbn = lblkno(fs, diroff); + doff = blkoff(fs, diroff); + blksize = sblksize(fs, DIP(dip, di_size), lbn); + if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) { + if (debug) + printf("ino %d absent from %d due to offset %jd" + " exceeding size %jd\n", + child, parent, diroff, DIP(dip, di_size)); + return (0); + } + blk = ino_blkatoff(dip, parent, lbn, &frags); + if (blk <= 0) { + if (debug) + printf("Sparse directory %d", parent); + return (0); + } + block = dblk_read(blk, blksize); + /* + * Walk through the records from the start of the block to be + * certain we hit a valid record and not some junk in the middle + * of a file name. Stop when we reach or pass the expected offset. + */ + dpoff = 0; + do { + dp = (struct direct *)&block[dpoff]; + if (dpoff == doff) + break; + if (dp->d_reclen == 0) + break; + dpoff += dp->d_reclen; + } while (dpoff <= doff); + if (dpoff > fs->fs_bsize) + errx(1, "Corrupt directory block in dir inode %d", parent); + /* Not found. 
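[Editor's aside: ino_isat() above refuses to trust a directory offset until it has walked the d_reclen chain from the start of the block, since an arbitrary offset could land inside a file name. A standalone sketch of that walk; the struct is a simplified stand-in for UFS struct direct:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct dirent_sketch {
		uint32_t d_ino;
		uint16_t d_reclen;	/* distance to the next entry */
		uint8_t	 d_type;
		uint8_t	 d_namlen;
		char	 d_name[8];
	};

	/* Return 1 iff doff is the start of a real entry in the block. */
	static int
	entry_at(uint8_t *block, int blksize, int doff)
	{
		struct dirent_sketch *dp;
		int dpoff;

		dpoff = 0;
		while (dpoff <= doff && dpoff < blksize) {
			dp = (struct dirent_sketch *)&block[dpoff];
			if (dpoff == doff)
				return (1);	/* valid entry here */
			if (dp->d_reclen == 0)
				break;		/* corrupt chain */
			dpoff += dp->d_reclen;
		}
		return (0);	/* doff falls inside an entry */
	}

	int
	main(void)
	{
		uint8_t block[64];
		struct dirent_sketch *dp;

		memset(block, 0, sizeof(block));
		dp = (struct dirent_sketch *)block;
		dp->d_ino = 2; dp->d_reclen = 16;
		dp = (struct dirent_sketch *)&block[16];
		dp->d_ino = 5; dp->d_reclen = 48;
		printf("%d %d\n", entry_at(block, 64, 16),
		    entry_at(block, 64, 8));	/* prints "1 0" */
		return (0);
	}

End aside.]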
*/ + if (dpoff != doff) { + if (debug) + printf("ino %d not found in %d, lbn %jd, dpoff %d\n", + child, parent, lbn, dpoff); + return (0); + } + /* + * We found the item in question. Record the mode and whether it's + * a . or .. link for the caller. + */ + if (dp->d_ino == child) { + if (child == parent) + *isdot = 1; + else if (dp->d_namlen == 2 && + dp->d_name[0] == '.' && dp->d_name[1] == '.') + *isdot = 1; + *mode = DTTOIF(dp->d_type); + return (1); + } + if (debug) + printf("ino %d doesn't match dirent ino %d in parent %d\n", + child, dp->d_ino, parent); + return (0); +} + +#define VISIT_INDIR 0x0001 +#define VISIT_EXT 0x0002 + +/* + * Read an indirect level which may or may not be linked into an inode. + */ +static void +indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags, + ino_visitor visitor, int flags) +{ + ufs2_daddr_t *bap2; + ufs1_daddr_t *bap1; + ufs_lbn_t lbnadd; + ufs2_daddr_t nblk; + ufs_lbn_t nlbn; + int level; + int i; + + /* + * Don't visit indirect blocks with contents we can't trust. This + * should only happen when indir_visit() is called to complete a + * truncate that never finished and not when a pointer is found via + * an inode. + */ + if (blk == 0) + return; + if (blk_isindir(blk, ino, lbn) == 0) { + if (debug) + printf("blk %jd ino %d lbn %jd is not indir.\n", + blk, ino, lbn); + goto out; + } + level = lbn_level(lbn); + if (level == -1) { + abort(); + errx(1, "Invalid level for lbn %jd", lbn); + } + lbnadd = 1; + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + bap1 = (void *)dblk_read(blk, fs->fs_bsize); + bap2 = (void *)bap1; + for (i = 0; i < NINDIR(fs); i++) { + if (fs->fs_magic == FS_UFS1_MAGIC) + nblk = *bap1++; + else + nblk = *bap2++; + if (nblk == 0) + continue; + if (level == 0) { + nlbn = -lbn + i * lbnadd; + (*frags) += fs->fs_frag; + visitor(ino, nlbn, nblk, fs->fs_frag); + } else { + nlbn = (lbn + 1) - (i * lbnadd); + indir_visit(ino, nlbn, nblk, frags, visitor, flags); + } + } +out: + if (flags & VISIT_INDIR) { + (*frags) += fs->fs_frag; + visitor(ino, lbn, blk, fs->fs_frag); + } +} + +/* + * Visit each block in an inode as specified by 'flags' and call a + * callback function. The callback may inspect or free blocks. The + * count of frags found according to the size in the file is returned. + * This is not valid for sparse files but may be used to determine + * the correct di_blocks for a file. + */ +static uint64_t +ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags) +{ + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + uint64_t size; + uint64_t fragcnt; + int mode; + int frags; + int i; + + size = DIP(ip, di_size); + mode = DIP(ip, di_mode) & IFMT; + fragcnt = 0; + if ((flags & VISIT_EXT) && + fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) { + for (i = 0; i < NXADDR; i++) { + if (ip->dp2.di_extb[i] == 0) + continue; + frags = sblksize(fs, ip->dp2.di_extsize, i); + frags = numfrags(fs, frags); + fragcnt += frags; + visitor(ino, -1 - i, ip->dp2.di_extb[i], frags); + } + } + /* Skip datablocks for short links and devices. 
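[Editor's aside: indir_visit()/ino_visit() above separate traversal from policy with an ino_visitor callback, so the same walk can count fragments (null_visit), free blocks, or prune directory entries. A minimal standalone sketch of the pattern; unlike the patch, it threads state through an explicit arg pointer rather than the frags counter:

	#include <stdint.h>
	#include <stdio.h>

	typedef void (*blk_visitor)(int lbn, long long blk, int frags,
	    void *arg);

	/* Walk an array of direct block pointers, skipping holes. */
	static void
	visit_direct(long long *db, int ndb, int frags_per_blk,
	    blk_visitor visitor, void *arg)
	{
		int i;

		for (i = 0; i < ndb; i++)
			if (db[i] != 0)
				visitor(i, db[i], frags_per_blk, arg);
	}

	/* Counting visitor, analogous to null_visit plus the fragcnt. */
	static void
	count_visit(int lbn, long long blk, int frags, void *arg)
	{

		*(uint64_t *)arg += frags;
	}

	int
	main(void)
	{
		long long db[4] = { 100, 0, 228, 356 };	/* one hole */
		uint64_t fragcnt = 0;

		visit_direct(db, 4, 8, count_visit, &fragcnt);
		printf("%llu frags\n", (unsigned long long)fragcnt); /* 24 */
		return (0);
	}

The count is what ino_adjblks() compares against di_blocks to detect half-completed allocations. End aside.]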
*/ + if (mode == IFBLK || mode == IFCHR || + (mode == IFLNK && size < fs->fs_maxsymlinklen)) + return (fragcnt); + for (i = 0; i < NDADDR; i++) { + if (DIP(ip, di_db[i]) == 0) + continue; + frags = sblksize(fs, size, i); + frags = numfrags(fs, frags); + fragcnt += frags; + visitor(ino, i, DIP(ip, di_db[i]), frags); + } + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, + tmpval *= NINDIR(fs), lbn += tmpval) { + if (DIP(ip, di_ib[i]) == 0) + continue; + indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor, + flags); + } + return (fragcnt); +} + +/* + * Null visitor function used when we just want to count blocks. + */ +static void +null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ +} + +/* + * Recalculate di_blocks when we discover that a block allocation or + * free was not successfully completed. The kernel does not roll this back + * because it would be too expensive to compute which indirects were + * reachable at the time the inode was written. + */ +static void +ino_adjblks(ino_t ino) +{ + struct suj_ino *sino; + union dinode *ip; + uint64_t blocks; + uint64_t frags; + + sino = ino_lookup(ino, 1); + if (sino->si_blkadj) + return; + sino->si_blkadj = 1; + ip = ino_read(ino); + /* No need to adjust zero'd inodes. */ + if (DIP(ip, di_mode) == 0) + return; + frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); + blocks = fsbtodb(fs, frags); + if (blocks == DIP(ip, di_blocks)) + return; + if (debug) + printf("ino %d adjusting block count from %jd to %jd\n", + ino, DIP(ip, di_blocks), blocks); + DIP_SET(ip, di_blocks, blocks); + ino_dirty(ino); +} + +static void +blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + int mask; + + mask = blk_isfree(blk, ino, lbn, frags); + if (debug) + printf("blk %jd freemask 0x%X\n", blk, mask); + blk_free(blk, mask, frags); +} + +/* + * Free a block or tree of blocks that was previously rooted in ino at + * the given lbn. If the lbn is an indirect all children are freed + * recursively. + */ +static void +blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow) +{ + uint64_t resid; + int mask; + + mask = blk_isfree(blk, ino, lbn, frags); + if (debug) + printf("blk %jd freemask 0x%X\n", blk, mask); + resid = 0; + if (lbn <= -NDADDR && follow && mask == 0) + indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR); + else + blk_free(blk, mask, frags); +} + +static void +ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + struct suj_ino *sino; + struct suj_rec *srec; + struct jrefrec *rrec; + struct direct *dp; + off_t diroff; + uint8_t *block; + int skipparent; + int isparent; + int dpoff; + int size; + + sino = ino_lookup(ino, 0); + if (sino) + skipparent = sino->si_skipparent; + else + skipparent = 0; + size = lfragtosize(fs, frags); + block = dblk_read(blk, size); + dp = (struct direct *)&block[0]; + for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) { + dp = (struct direct *)&block[dpoff]; + if (dp->d_ino == 0 || dp->d_ino == WINO) + continue; + if (dp->d_namlen == 1 && dp->d_name[0] == '.') + continue; + isparent = dp->d_namlen == 2 && dp->d_name[0] == '.' && + dp->d_name[1] == '.'; + if (isparent && skipparent == 1) + continue; + if (debug) + printf("Directory %d removing inode %d name %s\n", + ino, dp->d_ino, dp->d_name); + /* + * Lookup this inode to see if we have a record for it. + * If not, we've already adjusted it assuming this path + * was valid and we have to adjust once more. 
+ */ + sino = ino_lookup(dp->d_ino, 0); + if (sino == NULL || sino->si_linkadj || sino->si_hasrecs == 0) { + ino_decr(dp->d_ino); + continue; + } + /* + * Tell any child directories we've already removed their + * parent. Don't try to adjust our link down again. + */ + if (isparent == 0) + sino->si_skipparent = 1; + /* + * If we haven't yet processed this inode we need to make + * sure we will successfully discover the lost path. If not + * use nlinkadj to remember. + */ + diroff = lblktosize(fs, lbn) + dpoff; + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + if (rrec->jr_parent == ino && + rrec->jr_diroff == diroff) + break; + } + if (srec == NULL) + sino->si_nlinkadj--; + } +} + +/* + * Truncate an inode, freeing all blocks and decrementing all children's + * link counts. Free the inode back to the cg. + */ +static void +ino_truncate(union dinode *ip, ino_t ino, int mode) +{ + uint32_t gen; + + if (ino == ROOTINO) + errx(1, "Attempting to free ROOTINO"); + if (debug) + printf("Truncating and freeing ino %d, nlink %d, mode %o\n", + ino, DIP(ip, di_nlink), DIP(ip, di_mode)); + + /* We are freeing an inode or directory. */ + if ((DIP(ip, di_mode) & IFMT) == IFDIR) + ino_visit(ip, ino, ino_free_children, 0); + DIP_SET(ip, di_nlink, 0); + ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR); + /* Here we have to clear the inode and release any blocks it holds. */ + gen = DIP(ip, di_gen); + if (fs->fs_magic == FS_UFS1_MAGIC) + bzero(ip, sizeof(struct ufs1_dinode)); + else + bzero(ip, sizeof(struct ufs2_dinode)); + DIP_SET(ip, di_gen, gen); + ino_dirty(ino); + ino_free(ino, mode); + return; +} + +/* + * Adjust an inode's link count down by one when a directory goes away. + */ +static void +ino_decr(ino_t ino) +{ + union dinode *ip; + int reqlink; + int nlink; + int mode; + + ip = ino_read(ino); + nlink = DIP(ip, di_nlink); + mode = DIP(ip, di_mode); + if (nlink < 1) + errx(1, "Inode %d link count %d invalid", ino, nlink); + if (mode == 0) + errx(1, "Inode %d has a link of %d with 0 mode.", ino, nlink); + nlink--; + if ((mode & IFMT) == IFDIR) + reqlink = 2; + else + reqlink = 1; + if (nlink < reqlink) { + if (debug) + printf("ino %d not enough links to live %d < %d\n", + ino, nlink, reqlink); + ino_truncate(ip, ino, mode); + return; + } + DIP_SET(ip, di_nlink, nlink); + ino_dirty(ino); +} + +/* + * Adjust the inode link count to 'nlink'. If the count reaches zero + * free it. + */ +static void +ino_adjust(ino_t ino, int lastmode, nlink_t nlink) +{ + union dinode *ip; + int reqlink; + int mode; + + ip = ino_read(ino); + mode = DIP(ip, di_mode) & IFMT; + if (nlink > LINK_MAX) + errx(1, + "ino %d nlink manipulation error, new link %d, old link %d", + ino, nlink, DIP(ip, di_nlink)); + if (debug) + printf("Adjusting ino %d, nlink %d, old link %d lastmode %o\n", + ino, nlink, DIP(ip, di_nlink), lastmode); + if (mode == 0) { + if (debug) + printf("ino %d, zero inode freeing bitmap\n", ino); + ino_free(ino, lastmode); + return; + } + /* XXX Should be an assert? */ + if (mode != lastmode && debug) + printf("ino %d, mode %o != %o\n", ino, mode, lastmode); + if ((mode & IFMT) == IFDIR) + reqlink = 2; + else + reqlink = 1; + /* If the inode doesn't have enough links to live, free it. */ + if (nlink < reqlink) { + if (debug) + printf("ino %d not enough links to live %d < %d\n", + ino, nlink, reqlink); + ino_truncate(ip, ino, mode); + return; + } + /* If required write the updated link count. 
*/ + if (DIP(ip, di_nlink) == nlink) { + if (debug) + printf("ino %d, link matches, skipping.\n", ino); + return; + } + DIP_SET(ip, di_nlink, nlink); + ino_dirty(ino); +} + +#define DOTDOT_OFFSET DIRECTSIZ(1) + +/* + * Process records available for one inode and determine whether the + * link count is correct or needs adjusting. + * + * XXX Failed to fix zero length directory. Shouldn't .. have been mising? + */ +static void +ino_check(struct suj_ino *sino) +{ + struct suj_rec *srec; + struct jrefrec *rrec; + struct suj_ino *stmp; + nlink_t dotlinks; + int newlinks; + int removes; + int nlink; + ino_t ino; + int isdot; + int isat; + int mode; + + if (sino->si_hasrecs == 0) + return; + ino = sino->si_ino; + /* + * XXX ino_isfree currently is skipping initialized inodes + * that are unreferenced. + */ + if (0 && ino_isfree(ino)) + return; + rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec; + nlink = rrec->jr_nlink; + newlinks = sino->si_nlinkadj; + dotlinks = 0; + removes = 0; + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + isat = ino_isat(rrec->jr_parent, rrec->jr_diroff, + rrec->jr_ino, &mode, &isdot); + if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT)) + errx(1, "Inode mode/directory type mismatch %o != %o", + mode, rrec->jr_mode); + if (debug) + printf("jrefrec: op %d ino %d, nlink %d, parent %d, " + "diroff %jd, mode %o, isat %d, isdot %d\n", + rrec->jr_op, rrec->jr_ino, rrec->jr_nlink, + rrec->jr_parent, rrec->jr_diroff, rrec->jr_mode, + isat, isdot); + mode = rrec->jr_mode & IFMT; + if (rrec->jr_op == JOP_REMREF) + removes++; + newlinks += isat; + if (isdot) + dotlinks += isat; + } + /* + * The number of links that remain are the starting link count + * subtracted by the total number of removes with the total + * links discovered back in. An incomplete remove thus + * makes no change to the link count but an add increases + * by one. + */ + nlink += newlinks; + nlink -= removes; + /* + * If it's a directory with no real names pointing to it go ahead + * and truncate it. This will free any children. + */ + if ((mode & IFMT) == IFDIR && nlink - dotlinks == 0) { + nlink = 0; + /* + * Mark any .. links so they know not to free this inode + * when they are removed. + */ + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + if (rrec->jr_diroff == DOTDOT_OFFSET) { + stmp = ino_lookup(rrec->jr_parent, 0); + if (stmp) + stmp->si_skipparent = 1; + } + } + } + sino->si_linkadj = 1; + ino_adjust(ino, mode, nlink); +} + +/* + * Process records available for one block and determine whether it is + * still allocated and whether the owning inode needs to be updated or + * a free completed. + */ +static void +blk_check(struct suj_blk *sblk) +{ + struct suj_rec *srec; + struct jblkrec *brec; + ufs2_daddr_t blk; + int mask; + int frags; + int isat; + + /* + * Each suj_blk actually contains records for any fragments in that + * block. As a result we must evaluate each record individually. 
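[Editor's aside: ino_check() above recovers the true link count arithmetically: start from jr_nlink in the oldest record, subtract one per remove record, and add one back per reference that ino_isat() actually finds on disk, so an incomplete remove cancels out and a completed add counts once. A toy version of that accounting (the si_nlinkadj correction from ino_free_children is ignored; arrays stand in for the record TAILQ):

	#include <stdio.h>

	struct ref { int is_remove; int found_on_disk; };

	static int
	resolve_nlink(int jr_nlink, struct ref *refs, int n)
	{
		int i, nlink;

		nlink = jr_nlink;
		for (i = 0; i < n; i++) {
			if (refs[i].is_remove)
				nlink--;	/* the "removes" count */
			if (refs[i].found_on_disk)
				nlink++;	/* "newlinks" via ino_isat */
		}
		return (nlink);
	}

	int
	main(void)
	{
		/* A remove that never hit the directory block: net 0. */
		struct ref incomplete_rm[] = { { 1, 1 } };
		/* An add that did complete: net +1. */
		struct ref complete_add[] = { { 0, 1 } };

		printf("%d\n", resolve_nlink(2, incomplete_rm, 1));	/* 2 */
		printf("%d\n", resolve_nlink(1, complete_add, 1));	/* 2 */
		return (0);
	}

End aside.]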
+ */ + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + brec = (struct jblkrec *)srec->sr_rec; + frags = brec->jb_frags; + blk = brec->jb_blkno + brec->jb_oldfrags; + isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags); + if (debug) + printf("op %d blk %jd ino %d lbn %jd frags %d isat %d (%d)\n", + brec->jb_op, blk, brec->jb_ino, brec->jb_lbn, + brec->jb_frags, isat, frags); + /* + * If we found the block at this address we still have to + * determine if we need to free the tail end that was + * added by adding contiguous fragments from the same block. + */ + if (isat == 1) { + if (frags == brec->jb_frags) + continue; + mask = blk_isfree(blk, brec->jb_ino, brec->jb_lbn, + brec->jb_frags); + mask >>= frags; + blk += frags; + frags = brec->jb_frags - frags; + blk_free(blk, mask, frags); + ino_adjblks(brec->jb_ino); + continue; + } + /* + * The block wasn't found, attempt to free it. It won't be + * freed if it was actually reallocated. If this was an + * allocation we don't want to follow indirects as they + * may not be written yet. Any children of the indirect will + * have their own records. If it's a free we need to + * recursively free children. + */ + blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags, + brec->jb_op == JOP_FREEBLK); + ino_adjblks(brec->jb_ino); + } +} + +/* + * Walk the list of inode and block records for this cg, recovering any + * changes which were not complete at the time of crash. + */ +static void +cg_check(struct suj_cg *sc) +{ + struct suj_blk *nextb; + struct suj_ino *nexti; + struct suj_ino *sino; + struct suj_blk *sblk; + int i; + + if (debug) + printf("Recovering cg %d\n", sc->sc_cgx); + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH_SAFE(sino, &sc->sc_inohash[i], si_next, nexti) + ino_check(sino); + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH_SAFE(sblk, &sc->sc_blkhash[i], sb_next, nextb) + blk_check(sblk); +} + +/* + * Write a potentially dirty cg. All inodes must be written before the + * cg maps are so that an allocated inode is never marked free, even if + * we crash during fsck. + */ +static void +cg_write(struct suj_cg *sc) +{ + struct ino_blk *iblk; + ufs1_daddr_t fragno, cgbno, maxbno; + u_int8_t *blksfree; + struct cg *cgp; + int blk; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next) + iblk_write(iblk); + if (sc->sc_dirty == 0) + return; + /* + * Fix the frag and cluster summary. + */ + cgp = sc->sc_cgp; + cgp->cg_cs.cs_nbfree = 0; + cgp->cg_cs.cs_nffree = 0; + bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum)); + maxbno = fragstoblks(fs, fs->fs_fpg); + if (fs->fs_contigsumsize > 0) { + for (i = 1; i <= fs->fs_contigsumsize; i++) + cg_clustersum(cgp)[i] = 0; + bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT)); + } + blksfree = cg_blksfree(cgp); + for (cgbno = 0; cgbno < maxbno; cgbno++) { + if (ffs_isfreeblock(fs, blksfree, cgbno)) + continue; + if (ffs_isblock(fs, blksfree, cgbno)) { + ffs_clusteracct(fs, cgp, cgbno, 1); + cgp->cg_cs.cs_nbfree++; + continue; + } + fragno = blkstofrags(fs, cgbno); + blk = blkmap(fs, blksfree, fragno); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + for (i = 0; i < fs->fs_frag; i++) + if (isset(blksfree, fragno + i)) + cgp->cg_cs.cs_nffree++; + } + /* + * Update the superblock cg summary from our now correct values + * before writing the block. 
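[Editor's aside: the loop in cg_write() above rederives the cg summary purely from the free-fragment bitmap instead of trusting the possibly stale on-disk counters. A standalone sketch of the core idea, assuming for simplicity 8 fragments per block so a fully free block is one 0xff byte (roughly what ffs_isblock() tests in that case); the real loop also rebuilds cg_frsum via ffs_fragacct() and the cluster sums via ffs_clusteracct():

	#include <stdio.h>

	#define	FRAGS_PER_BLK	8

	static void
	rebuild_summary(unsigned char *blksfree, int nblks,
	    int *nbfree, int *nffree)
	{
		int b, i;

		*nbfree = *nffree = 0;
		for (b = 0; b < nblks; b++) {
			if (blksfree[b] == 0xff) {	/* whole block free */
				(*nbfree)++;
				continue;
			}
			for (i = 0; i < FRAGS_PER_BLK; i++)	/* loose frags */
				if (blksfree[b] & (1 << i))
					(*nffree)++;
		}
	}

	int
	main(void)
	{
		unsigned char map[3] = { 0xff, 0x0f, 0x00 };
		int nbfree, nffree;

		rebuild_summary(map, 3, &nbfree, &nffree);
		printf("nbfree %d nffree %d\n", nbfree, nffree); /* 1 and 4 */
		return (0);
	}

End aside.]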
+ */ + fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs; + if (bwrite(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, + fs->fs_bsize) == -1) + err(1, "Unable to write cylinder group %d", sc->sc_cgx); +} + +static void +cg_apply(void (*apply)(struct suj_cg *)) +{ + struct suj_cg *scg; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(scg, &cghash[i], sc_next) + apply(scg); +} + +/* + * Process the unlinked but referenced file list. Freeing all inodes. + */ +static void +ino_unlinked(void) +{ + union dinode *ip; + uint16_t mode; + ino_t inon; + ino_t ino; + + ino = fs->fs_sujfree; + fs->fs_sujfree = 0; + while (ino != 0) { + ip = ino_read(ino); + mode = DIP(ip, di_mode) & IFMT; + inon = DIP(ip, di_freelink); + DIP_SET(ip, di_freelink, 0); + /* + * XXX Should this be an errx? + */ + if (DIP(ip, di_nlink) == 0) { + if (debug) + printf("Freeing unlinked ino %d mode %o\n", + ino, mode); + ino_truncate(ip, ino, mode); + } else if (debug) + printf("Skipping ino %d mode %o with link %d\n", + ino, mode, DIP(ip, di_nlink)); + ino = inon; + } +} + +/* + * If we see two ops for the same inode to the same parent at the same + * offset we could miscount the link with ino_isat() returning twice. + * Keep only the first record because it has the valid link count but keep + * the mode from the final op as that should be the correct mode in case + * it changed. + */ +static void +suj_build_ino(struct jrefrec *refrec) +{ + struct jmvrec *mvrec; + struct suj_rec *srec; + struct suj_ino *sino; + struct suj_rec *srn; + struct jrefrec *rrn; + + if (debug) + printf("suj_build_ino: op %d, ino %d, nlink %d, parent %d, diroff %jd\n", + refrec->jr_op, refrec->jr_ino, refrec->jr_nlink, refrec->jr_parent, + refrec->jr_diroff); + sino = ino_lookup(refrec->jr_ino, 1); + /* + * Search for a mvrec that matches this offset. Whether it's an add + * or a remove we can delete the mvref. It no longer applies to this + * location. + * + * For removes, we have to find the original offset so we can create + * a remove that matches the earlier add so it can be abandoned + * if necessary. We create an add in the new location so we can + * tolerate the directory block as it existed before or after + * the move. + */ + if (!TAILQ_EMPTY(&sino->si_movs)) { + for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + mvrec = (struct jmvrec *)srn->sr_rec; + if (mvrec->jm_parent != refrec->jr_parent || + mvrec->jm_newoff != refrec->jr_diroff) + continue; + TAILQ_REMOVE(&sino->si_movs, srn, sr_next); + if (refrec->jr_op == JOP_REMREF) { + rrn = errmalloc(sizeof(*refrec)); + *rrn = *refrec; + rrn->jr_op = JOP_ADDREF; + suj_build_ino(rrn); + refrec->jr_diroff = mvrec->jm_oldoff; + } + } + } + /* + * We walk backwards so that adds and removes are evaluated in the + * correct order. + */ + for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + rrn = (struct jrefrec *)srn->sr_rec; + if (rrn->jr_parent != refrec->jr_parent || + rrn->jr_diroff != refrec->jr_diroff) + continue; + if (debug) + printf("Discarding dup.\n"); + rrn->jr_mode = refrec->jr_mode; + return; + } + sino->si_hasrecs = 1; + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = (union jrec *)refrec; + TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next); +} + +/* + * Apply a move record to an inode. We must search for adds that preceed us + * and add duplicates because we won't know which location to search first. 
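[Editor's aside: the ino_unlinked() walk earlier in this hunk follows a singly linked list threaded through the inode table: fs_sujfree names the first unlinked-but-open inode and each di_freelink names the next. A standalone sketch of that traversal with plain structs standing in for on-disk inodes:

	#include <stdio.h>

	struct fake_ino { int nlink; int freelink; };

	static void
	walk_unlinked(struct fake_ino *itab, int head)
	{
		int ino, next;

		for (ino = head; ino != 0; ino = next) {
			next = itab[ino].freelink;
			itab[ino].freelink = 0;		/* unthread the list */
			if (itab[ino].nlink == 0)
				printf("would free ino %d\n", ino);
			else
				printf("keep ino %d, nlink %d\n",
				    ino, itab[ino].nlink);
		}
	}

	int
	main(void)
	{
		struct fake_ino itab[8] = {{0, 0}};

		itab[3] = (struct fake_ino){ 0, 5 };	/* unlinked, next is 5 */
		itab[5] = (struct fake_ino){ 1, 0 };	/* relinked, keep */
		walk_unlinked(itab, 3);
		return (0);
	}

End aside.]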
+ * Then we add movs to a queue that is maintained until the moved location + * is removed. If a single record is moved multiple times we only maintain + * one copy that contains the original and final diroffs. + */ +static void +suj_move_ino(struct jmvrec *mvrec) +{ + struct jrefrec *refrec; + struct suj_ino *sino; + struct suj_rec *srec; + struct jmvrec *mvrn; + struct suj_rec *srn; + struct jrefrec *rrn; + + if (debug) + printf("suj_move_ino: ino %d, parent %d, diroff %jd, oldoff %jd\n", + mvrec->jm_ino, mvrec->jm_parent, mvrec->jm_newoff, + mvrec->jm_oldoff); + sino = ino_lookup(mvrec->jm_ino, 0); + if (sino == NULL) + return; + /* + * We walk backwards so we only evaluate the most recent record at + * this offset. + */ + for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + rrn = (struct jrefrec *)srn->sr_rec; + if (rrn->jr_op != JOP_ADDREF) + continue; + if (rrn->jr_parent != mvrec->jm_parent || + rrn->jr_diroff != mvrec->jm_oldoff) + continue; + /* + * When an entry is moved we don't know whether the write + * to move has completed yet. To resolve this we create + * a new add dependency in the new location as if it were added + * twice. Only one will succeed. + */ + refrec = errmalloc(sizeof(*refrec)); + refrec->jr_op = JOP_ADDREF; + refrec->jr_ino = mvrec->jm_ino; + refrec->jr_parent = mvrec->jm_parent; + refrec->jr_diroff = mvrec->jm_newoff; + refrec->jr_mode = rrn->jr_mode; + refrec->jr_nlink = rrn->jr_nlink; + suj_build_ino(refrec); + break; + } + /* + * Add this mvrec to the queue of pending mvs. + */ + for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + mvrn = (struct jmvrec *)srn->sr_rec; + if (mvrn->jm_parent != mvrec->jm_parent || + mvrn->jm_newoff != mvrec->jm_oldoff) + continue; + mvrn->jm_newoff = mvrec->jm_newoff; + return; + } + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = (union jrec *)mvrec; + TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next); +} + +/* + * Modify journal records so they refer to the base block number + * and a start and end frag range. This is to facilitate the discovery + * of overlapping fragment allocations. + */ +static void +suj_build_blk(struct jblkrec *blkrec) +{ + struct suj_rec *srec; + struct suj_blk *sblk; + struct jblkrec *blkrn; + ufs2_daddr_t blk; + int frag; + + if (debug) + printf("suj_build_blk: op %d blkno %jd frags %d oldfrags %d " + "ino %d lbn %jd\n", + blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags, + blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn); + blk = blknum(fs, blkrec->jb_blkno); + frag = fragnum(fs, blkrec->jb_blkno); + sblk = blk_lookup(blk, 1); + /* + * Rewrite the record using oldfrags to indicate the offset into + * the block. Leave jb_frags as the actual allocated count. + */ + blkrec->jb_blkno -= frag; + blkrec->jb_oldfrags = frag; + if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag) + errx(1, "Invalid fragment count %d oldfrags %d", + blkrec->jb_frags, frag); + /* + * Detect dups. If we detect a dup we always discard the oldest + * record as it is superseded by the new record. This speeds up + * later stages but also eliminates free records which are used + * to indicate that the contents of indirects can be trusted. 
+ */ + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + blkrn = (struct jblkrec *)srec->sr_rec; + if (blkrn->jb_ino != blkrec->jb_ino || + blkrn->jb_lbn != blkrec->jb_lbn || + blkrn->jb_blkno != blkrec->jb_blkno || + blkrn->jb_frags != blkrec->jb_frags || + blkrn->jb_oldfrags != blkrec->jb_oldfrags) + continue; + if (debug) + printf("Removed dup.\n"); + /* Discard the free which is a dup with an alloc. */ + if (blkrec->jb_op == JOP_FREEBLK) + return; + TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next); + free(srec); + break; + } + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = (union jrec *)blkrec; + TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next); +} + +/* + * Build up tables of the operations we need to recover. + */ +static void +suj_build(void) +{ + struct suj_seg *seg; + union jrec *rec; + int i; + + TAILQ_FOREACH(seg, &allsegs, ss_next) { + rec = (union jrec *)seg->ss_blk; + rec++; /* skip the segrec. */ + if (debug) + printf("seg %jd has %d records, oldseq %jd.\n", + seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt, + seg->ss_rec.jsr_oldest); + for (i = 0; i < seg->ss_rec.jsr_cnt; i++, rec++) { + switch (rec->rec_jrefrec.jr_op) { + case JOP_ADDREF: + case JOP_REMREF: + suj_build_ino((struct jrefrec *)rec); + break; + case JOP_MVREF: + suj_move_ino((struct jmvrec *)rec); + break; + case JOP_NEWBLK: + case JOP_FREEBLK: + suj_build_blk((struct jblkrec *)rec); + break; + default: + errx(1, "Unknown journal operation %d (%d)", + rec->rec_jrefrec.jr_op, i); + } + } + } +} + +/* + * Prune the journal segments to those we care about based on the + * oldest sequence in the newest segment. Order the segment list + * based on sequence number. + */ +static void +suj_prune(void) +{ + struct suj_seg *seg; + struct suj_seg *segn; + uint64_t newseq; + int discard; + + if (debug) + printf("Pruning up to %jd\n", oldseq); + /* First free the expired segments. */ + TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { + if (seg->ss_rec.jsr_seq >= oldseq) + continue; + TAILQ_REMOVE(&allsegs, seg, ss_next); + free(seg->ss_blk); + free(seg); + } + /* Next ensure that segments are ordered properly. */ + seg = TAILQ_FIRST(&allsegs); + if (seg == NULL) { + if (debug) + printf("Empty journal\n"); + return; + } + newseq = seg->ss_rec.jsr_seq; + for (;;) { + seg = TAILQ_LAST(&allsegs, seghd); + if (seg->ss_rec.jsr_seq >= newseq) + break; + TAILQ_REMOVE(&allsegs, seg, ss_next); + TAILQ_INSERT_HEAD(&allsegs, seg, ss_next); + newseq = seg->ss_rec.jsr_seq; + + } + if (newseq != oldseq) + errx(1, "Journal file sequence mismatch %jd != %jd", + newseq, oldseq); + /* + * The kernel may asynchronously write segments which can create + * gaps in the sequence space. Throw away any segments after the + * gap as the kernel guarantees only those that are contiguously + * reachable are marked as completed. + */ + discard = 0; + TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { + if (!discard && newseq++ == seg->ss_rec.jsr_seq) + continue; + discard = 1; + if (debug) + printf("Journal order mismatch %jd != %jd pruning\n", + newseq-1, seg->ss_rec.jsr_seq); + TAILQ_REMOVE(&allsegs, seg, ss_next); + free(seg->ss_blk); + free(seg); + } + if (debug) + printf("Processing journal segments from %jd to %jd\n", + oldseq, newseq-1); +} + +/* + * Verify the journal inode before attempting to read records. 
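[Editor's aside: suj_prune() above discards segments older than the newest segment's jsr_oldest, rotates the list into sequence order, and then drops everything after the first gap, because the kernel writes segments asynchronously and only guarantees that contiguously reachable ones are complete. A standalone sketch of the gap check, with an array of sequence numbers standing in for the segment TAILQ:

	#include <stdio.h>

	/* Return how many leading segments form an unbroken sequence. */
	static int
	prune_after_gap(unsigned long long *seq, int n,
	    unsigned long long oldest)
	{
		unsigned long long expect;
		int i;

		expect = oldest;
		for (i = 0; i < n; i++, expect++)
			if (seq[i] != expect)
				break;		/* gap: discard i..n-1 */
		return (i);
	}

	int
	main(void)
	{
		unsigned long long seq[] = { 10, 11, 12, 14, 15 };

		/* 13 is missing, so only 10..12 are trustworthy. */
		printf("%d usable\n", prune_after_gap(seq, 5, 10)); /* 3 */
		return (0);
	}

End aside.]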
+ */ +static void +suj_verifyino(union dinode *ip) +{ + + if (DIP(ip, di_nlink) != 1) + errx(1, "Invalid link count %d for journal inode %d", + DIP(ip, di_nlink), fs->fs_sujournal); + + if (DIP(ip, di_mode) != IFREG) + errx(1, "Invalid mode %d for journal inode %d", + DIP(ip, di_mode), fs->fs_sujournal); + + if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) + errx(1, "Invalid size %jd for journal inode %d", + DIP(ip, di_size), fs->fs_sujournal); + + if (DIP(ip, di_modrev) != fs->fs_mtime) + errx(1, "Journal timestamp does not match fs mount time"); + /* XXX Add further checks. */ +} + +struct jblocks { + struct jextent *jb_extent; /* Extent array. */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ +}; +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +struct jblocks *suj_jblocks; + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + int size; + + jblocks = errmalloc(sizeof(*jblocks)); + jblocks->jb_avail = 10; + jblocks->jb_used = 0; + jblocks->jb_head = 0; + jblocks->jb_off = 0; + size = sizeof(struct jextent) * jblocks->jb_avail; + jblocks->jb_extent = errmalloc(size); + bzero(jblocks->jb_extent, size); + + return (jblocks); +} + +/* + * Return the next available disk block and the amount of contiguous + * free space it contains. + */ +static ufs2_daddr_t +jblocks_next(struct jblocks *jblocks, int bytes, int *actual) +{ + struct jextent *jext; + ufs2_daddr_t daddr; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + return (0); + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + + return (daddr); +} + +/* + * Advance the allocation head by a specified number of bytes, consuming + * one journal segment. + */ +static void +jblocks_advance(struct jblocks *jblocks, int bytes) +{ + + jblocks->jb_off += bytes / DEV_BSIZE; +} + +static void +jblocks_destroy(struct jblocks *jblocks) +{ + + free(jblocks->jb_extent); + free(jblocks); +} + +static void +jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks) +{ + struct jextent *jext; + int size; + + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + size = sizeof(struct jextent) * jblocks->jb_avail; + jext = errmalloc(size); + bzero(jext, size); + bcopy(jblocks->jb_extent, jext, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + + return; +} + +/* + * Add a file block from the journal to the extent map. We can't read + * each file block individually because the kernel treats it as a circular + * buffer and segments may span mutliple contiguous blocks. 
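[Editor's aside: jblocks_add() above coalesces consecutive journal file blocks into extents so suj_read() can issue large contiguous reads instead of one read per file block. A standalone sketch of the coalescing step, with a fixed array replacing the patch's growable one:

	#include <stdio.h>

	struct jext { long long daddr; int blocks; };

	/* Append a run of blocks, merging with the last extent when
	 * the new run abuts it.  Returns the new extent count. */
	static int
	extent_add(struct jext *ext, int next, long long daddr, int blocks)
	{

		if (next > 0 &&
		    ext[next - 1].daddr + ext[next - 1].blocks == daddr) {
			ext[next - 1].blocks += blocks;
			return (next);
		}
		ext[next].daddr = daddr;
		ext[next].blocks = blocks;
		return (next + 1);
	}

	int
	main(void)
	{
		struct jext ext[4];
		int i, n;

		n = 0;
		n = extent_add(ext, n, 1000, 8);
		n = extent_add(ext, n, 1008, 8);	/* coalesces */
		n = extent_add(ext, n, 2000, 8);	/* new extent */
		for (i = 0; i < n; i++)
			printf("extent %lld+%d\n", ext[i].daddr,
			    ext[i].blocks);	/* 1000+16, 2000+8 */
		return (0);
	}

End aside.]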
+ */ +static void +suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + + jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags)); +} + +static void +suj_read(void) +{ + uint8_t block[1 * 1024 * 1024]; + struct suj_seg *seg; + struct jsegrec *rec; + ufs2_daddr_t blk; + int recsize; + int size; + + /* + * Read records until we exhaust the journal space. If we find + * an invalid record we start searching for a valid segment header + * at the next block. This is because we don't have a head/tail + * pointer and must recover the information indirectly. At the gap + * between the head and tail we won't necessarily have a valid + * segment. + */ + for (;;) { + size = sizeof(block); + blk = jblocks_next(suj_jblocks, size, &size); + if (blk == 0) + return; + /* + * Read 1MB at a time and scan for records within this block. + */ + if (bread(disk, blk, &block, size) == -1) + err(1, "Error reading journal block %jd", + (intmax_t)blk); + for (rec = (void *)block; size; size -= recsize, + rec = (struct jsegrec *)((uintptr_t)rec + recsize)) { + recsize = DEV_BSIZE; + if (rec->jsr_time != fs->fs_mtime) { + if (debug) + printf("Rec time %jd != fs mtime %jd\n", + rec->jsr_time, fs->fs_mtime); + jblocks_advance(suj_jblocks, recsize); + continue; + } + if (rec->jsr_cnt == 0) { + if (debug) + printf("Found illegal count %d\n", + rec->jsr_cnt); + jblocks_advance(suj_jblocks, recsize); + continue; + } + recsize = roundup2((rec->jsr_cnt + 1) * JREC_SIZE, + DEV_BSIZE); + if (recsize > size) { + /* + * We may just have run out of buffer, restart + * the loop to re-read from this spot. + */ + if (size < fs->fs_bsize && + recsize <= fs->fs_bsize) { + recsize = size; + continue; + } + if (debug) + printf("Found invalid segsize %d > %d\n", + recsize, size); + recsize = DEV_BSIZE; + jblocks_advance(suj_jblocks, recsize); + continue; + } + seg = errmalloc(sizeof(*seg)); + seg->ss_blk = errmalloc(recsize); + seg->ss_rec = *rec; + bcopy((void *)rec, seg->ss_blk, recsize); + if (rec->jsr_oldest > oldseq) + oldseq = rec->jsr_oldest; + TAILQ_INSERT_TAIL(&allsegs, seg, ss_next); + jrecs += rec->jsr_cnt; + jbytes += recsize; + jblocks_advance(suj_jblocks, recsize); + } + } +} + +/* + * Orchestrate the verification of a filesystem via the softupdates journal. + */ +void +suj_check(const char *filesys) +{ + union dinode *jip; + uint64_t blocks; + + opendisk(filesys); + TAILQ_INIT(&allsegs); + /* + * Fetch the journal inode and verify it. + */ + jip = ino_read(fs->fs_sujournal); + printf("SU+J Checking %s\n", filesys); + suj_verifyino(jip); + /* + * Build a list of journal blocks in jblocks before parsing the + * available journal blocks in with suj_read(). 
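[Editor's aside: suj_read() above sizes each segment as a jsegrec plus jsr_cnt records rounded up to whole 512-byte device blocks, which is why it can validate a candidate header and then skip exactly that many bytes. A standalone sketch of the size arithmetic; JREC_SIZE of 32 is an assumption here, and roundup2 is only valid for power-of-two alignments:

	#include <stdio.h>

	#define	JREC_SIZE	32	/* assumed journal record size */
	#define	DEV_BSIZE	512
	#define	roundup2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))

	int
	main(void)
	{
		int cnt;

		/* One header record plus cnt payload records. */
		for (cnt = 1; cnt <= 33; cnt += 16)
			printf("%d records -> %d bytes on disk\n", cnt,
			    roundup2((cnt + 1) * JREC_SIZE, DEV_BSIZE));
		return (0);	/* 512, 1024, 1536 */
	}

End aside.]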
+ */ + printf("Reading %jd byte journal from inode %d.\n", + DIP(jip, di_size), fs->fs_sujournal); + suj_jblocks = jblocks_create(); + blocks = ino_visit(jip, fs->fs_sujournal, suj_add_block, 0); + if (blocks != numfrags(fs, DIP(jip, di_size))) + errx(1, "Sparse journal inode %d.\n", fs->fs_sujournal); + suj_read(); + jblocks_destroy(suj_jblocks); + suj_jblocks = NULL; + if (reply("RECOVER")) { + printf("Building recovery table.\n"); + suj_prune(); + suj_build(); + printf("Resolving unreferenced inode list.\n"); + ino_unlinked(); + printf("Processing journal entries.\n"); + cg_apply(cg_check); + } + if (reply("WRITE CHANGES")) + cg_apply(cg_write); + printf("%jd journal records in %jd bytes for %.2f%% utilization\n", + jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100); + printf("Freed %jd inodes (%jd directories) %jd blocks and %jd frags.\n", + freeinos, freedir, freeblocks, freefrags); + /* Write back superblock. */ + closedisk(filesys); +} Index: sbin/fsck_ffs/gjournal.c =================================================================== --- sbin/fsck_ffs/gjournal.c (revision 202342) +++ sbin/fsck_ffs/gjournal.c (working copy) @@ -96,27 +96,6 @@ struct ufs2_dinode ufs2_zino; static void putcgs(void); /* - * Write current block of inodes. - */ -static int -putino(struct uufsd *disk, ino_t inode) -{ - caddr_t inoblock; - struct fs *fs; - ssize_t ret; - - fs = &disk->d_fs; - inoblock = disk->d_inoblock; - - assert(inoblock != NULL); - assert(inode >= disk->d_inomin && inode <= disk->d_inomax); - ret = bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, inode)), inoblock, - fs->fs_bsize); - - return (ret == -1 ? -1 : 0); -} - -/* * Return cylinder group from the cache or load it if it is not in the * cache yet. * Don't cache more than MAX_CACHED_CGS cylinder groups. @@ -242,13 +221,11 @@ cancelcgs(void) #endif /* - * Open the given provider, load statistics. + * Open the given provider, load superblock. */ static void -getdisk(void) +opendisk(void) { - int i; - if (disk != NULL) return; disk = malloc(sizeof(*disk)); @@ -259,24 +236,6 @@ static void disk->d_error); } fs = &disk->d_fs; - fs->fs_csp = malloc((size_t)fs->fs_cssize); - if (fs->fs_csp == NULL) - err(1, "malloc(%zu)", (size_t)fs->fs_cssize); - bzero(fs->fs_csp, (size_t)fs->fs_cssize); - for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) { - if (bread(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)), - (void *)(((char *)fs->fs_csp) + i), - (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) { - err(1, "bread: %s", disk->d_error); - } - } - if (fs->fs_contigsumsize > 0) { - fs->fs_maxcluster = malloc(fs->fs_ncg * sizeof(int32_t)); - if (fs->fs_maxcluster == NULL) - err(1, "malloc(%zu)", fs->fs_ncg * sizeof(int32_t)); - for (i = 0; i < fs->fs_ncg; i++) - fs->fs_maxcluster[i] = fs->fs_contigsumsize; - } } /* @@ -286,11 +245,6 @@ static void closedisk(void) { - free(fs->fs_csp); - if (fs->fs_contigsumsize > 0) { - free(fs->fs_maxcluster); - fs->fs_maxcluster = NULL; - } fs->fs_clean = 1; if (sbwrite(disk, 0) == -1) err(1, "sbwrite(%s)", devnam); @@ -301,228 +255,7 @@ closedisk(void) fs = NULL; } -/* - * Write the statistics back, call closedisk(). - */ static void -putdisk(void) -{ - int i; - - assert(disk != NULL && fs != NULL); - for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) { - if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)), - (void *)(((char *)fs->fs_csp) + i), - (size_t)(fs->fs_cssize - i < fs->fs_bsize ? 
fs->fs_cssize - i : fs->fs_bsize)) == -1) { - err(1, "bwrite: %s", disk->d_error); - } - } - closedisk(); -} - -#if 0 -/* - * Free memory, close the disk, but don't write anything back. - */ -static void -canceldisk(void) -{ - int i; - - assert(disk != NULL && fs != NULL); - free(fs->fs_csp); - if (fs->fs_contigsumsize > 0) - free(fs->fs_maxcluster); - if (ufs_disk_close(disk) == -1) - err(1, "ufs_disk_close(%s)", devnam); - free(disk); - disk = NULL; - fs = NULL; -} -#endif - -static int -isblock(unsigned char *cp, ufs1_daddr_t h) -{ - unsigned char mask; - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0xff); - case 4: - mask = 0x0f << ((h & 0x1) << 2); - return ((cp[h >> 1] & mask) == mask); - case 2: - mask = 0x03 << ((h & 0x3) << 1); - return ((cp[h >> 2] & mask) == mask); - case 1: - mask = 0x01 << (h & 0x7); - return ((cp[h >> 3] & mask) == mask); - default: - assert(!"isblock: invalid number of fragments"); - } - return (0); -} - -/* - * put a block into the map - */ -static void -setblock(unsigned char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - cp[h] = 0xff; - return; - case 4: - cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); - return; - case 2: - cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); - return; - case 1: - cp[h >> 3] |= (0x01 << (h & 0x7)); - return; - default: - assert(!"setblock: invalid number of fragments"); - } -} - -/* - * check if a block is free - */ -static int -isfreeblock(u_char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0); - case 4: - return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); - case 2: - return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); - case 1: - return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); - default: - assert(!"isfreeblock: invalid number of fragments"); - } - return (0); -} - -/* - * Update the frsum fields to reflect addition or deletion - * of some frags. - */ -void -fragacct(int fragmap, int32_t fraglist[], int cnt) -{ - int inblk; - int field, subfield; - int siz, pos; - - inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; - fragmap <<= 1; - for (siz = 1; siz < fs->fs_frag; siz++) { - if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) - continue; - field = around[siz]; - subfield = inside[siz]; - for (pos = siz; pos <= fs->fs_frag; pos++) { - if ((fragmap & field) == subfield) { - fraglist[siz] += cnt; - pos += siz; - field <<= siz; - subfield <<= siz; - } - field <<= 1; - subfield <<= 1; - } - } -} - -static void -clusteracct(struct cg *cgp, ufs1_daddr_t blkno) -{ - int32_t *sump; - int32_t *lp; - u_char *freemapp, *mapp; - int i, start, end, forw, back, map, bit; - - if (fs->fs_contigsumsize <= 0) - return; - freemapp = cg_clustersfree(cgp); - sump = cg_clustersum(cgp); - /* - * Clear the actual block. - */ - setbit(freemapp, blkno); - /* - * Find the size of the cluster going forward. - */ - start = blkno + 1; - end = start + fs->fs_contigsumsize; - if (end >= cgp->cg_nclusterblks) - end = cgp->cg_nclusterblks; - mapp = &freemapp[start / NBBY]; - map = *mapp++; - bit = 1 << (start % NBBY); - for (i = start; i < end; i++) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != (NBBY - 1)) { - bit <<= 1; - } else { - map = *mapp++; - bit = 1; - } - } - forw = i - start; - /* - * Find the size of the cluster going backward. 
- */ - start = blkno - 1; - end = start - fs->fs_contigsumsize; - if (end < 0) - end = -1; - mapp = &freemapp[start / NBBY]; - map = *mapp--; - bit = 1 << (start % NBBY); - for (i = start; i > end; i--) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != 0) { - bit >>= 1; - } else { - map = *mapp--; - bit = 1 << (NBBY - 1); - } - } - back = start - i; - /* - * Account for old cluster and the possibly new forward and - * back clusters. - */ - i = back + forw + 1; - if (i > fs->fs_contigsumsize) - i = fs->fs_contigsumsize; - sump[i]++; - if (back > 0) - sump[back]--; - if (forw > 0) - sump[forw]--; - /* - * Update cluster summary information. - */ - lp = &sump[fs->fs_contigsumsize]; - for (i = fs->fs_contigsumsize; i > 0; i--) - if (*lp-- > 0) - break; - fs->fs_maxcluster[cgp->cg_cgx] = i; -} - -static void blkfree(ufs2_daddr_t bno, long size) { struct cgchain *cgc; @@ -539,10 +272,10 @@ blkfree(ufs2_daddr_t bno, long size) blksfree = cg_blksfree(cgp); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); - if (!isfreeblock(blksfree, fragno)) + if (!ffs_isfreeblock(fs, blksfree, fragno)) assert(!"blkfree: freeing free block"); - setblock(blksfree, fragno); - clusteracct(cgp, fragno); + ffs_setblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -552,7 +285,7 @@ blkfree(ufs2_daddr_t bno, long size) * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); - fragacct(blk, cgp->cg_frsum, -1); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ @@ -569,16 +302,16 @@ blkfree(ufs2_daddr_t bno, long size) * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); - fragacct(blk, cgp->cg_frsum, 1); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); - if (isblock(blksfree, fragno)) { + if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; - clusteracct(cgp, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -599,7 +332,7 @@ freeindir(ufs2_daddr_t blk, int level) if (bread(disk, fsbtodb(fs, blk), (void *)&sblks, (size_t)fs->fs_bsize) == -1) err(1, "bread: %s", disk->d_error); blks = (ufs2_daddr_t *)&sblks; - for (i = 0; i < howmany(fs->fs_bsize, sizeof(ufs2_daddr_t)); i++) { + for (i = 0; i < NINDIR(fs); i++) { if (blks[i] == 0) break; if (level == 0) @@ -671,7 +404,7 @@ gjournal_check(const char *filesys) int cg, mode; devnam = filesys; - getdisk(); + opendisk(); /* Are there any unreferenced inodes in this file system? */ if (fs->fs_unrefs == 0) { //printf("No unreferenced inodes.\n"); @@ -747,7 +480,7 @@ gjournal_check(const char *filesys) /* Zero-fill the inode. */ *dino = ufs2_zino; /* Write the inode back. */ - if (putino(disk, ino) == -1) + if (putino(disk) == -1) err(1, "putino(cg=%d ino=%d)", cg, ino); if (cgp->cg_unrefs == 0) { //printf("No more unreferenced inodes in cg=%d.\n", cg); @@ -772,5 +505,5 @@ gjournal_check(const char *filesys) /* Write back modified cylinder groups. */ putcgs(); /* Write back updated statistics and super-block. 
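 * libufs now loads the cg summaries into fs_csp itself (see the
 * sblock.c hunks below), so the sbwrite() done by closedisk()
 * flushes them and the separate putdisk() pass is gone.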
*/ - putdisk(); + closedisk(); } Index: sbin/fsck_ffs/main.c =================================================================== --- sbin/fsck_ffs/main.c (revision 202342) +++ sbin/fsck_ffs/main.c (working copy) @@ -256,7 +256,7 @@ checkfilesys(char *filesys) } if (ckclean && skipclean) { /* - * If file system is gjournaled, check it here. + * If file system is gjournaled or su+j, check it here. */ if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0) exit(3); /* Cannot read superblock */ @@ -278,6 +278,18 @@ checkfilesys(char *filesys) "CANNOT RUN FAST FSCK\n"); } } +#if 0 + if ((sblock.fs_flags & FS_SUJ) != 0) { + if (sblock.fs_clean == 1) { + pwarn("FILE SYSTEM CLEAN; SKIPPING CHECKS\n"); + exit(0); + } + suj_check(filesys); + if (chkdoreload(mntp) == 0) + exit(0); + exit(4); + } +#endif } /* * If we are to do a background check: @@ -299,7 +311,7 @@ checkfilesys(char *filesys) pfatal("MOUNTED READ-ONLY, CANNOT RUN IN BACKGROUND\n"); } else if ((fsreadfd = open(filesys, O_RDONLY)) >= 0) { if (readsb(0) != 0) { - if (sblock.fs_flags & FS_NEEDSFSCK) { + if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) { bkgrdflag = 0; pfatal("UNEXPECTED INCONSISTENCY, %s\n", "CANNOT RUN IN BACKGROUND\n"); @@ -481,6 +493,7 @@ checkfilesys(char *filesys) inocleanup(); if (fsmodified) { sblock.fs_time = time(NULL); + sblock.fs_mtime = time(NULL); sbdirty(); } if (cvtlevel && sblk.b_dirty) { Index: sbin/fsck_ffs/pass4.c =================================================================== --- sbin/fsck_ffs/pass4.c (revision 202342) +++ sbin/fsck_ffs/pass4.c (working copy) @@ -72,6 +72,9 @@ pass4(void) for (i = 0; i < inostathead[cg].il_numalloced; i++, inumber++) { if (inumber < ROOTINO) continue; + if (sblock.fs_flags & FS_SUJ && + inumber == sblock.fs_sujournal) + continue; idesc.id_number = inumber; switch (inoinfo(inumber)->ino_state) { Index: sbin/fsck_ffs/pass5.c =================================================================== --- sbin/fsck_ffs/pass5.c (revision 202342) +++ sbin/fsck_ffs/pass5.c (working copy) @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "fsck.h" Index: sbin/fsck_ffs/fsck.h =================================================================== --- sbin/fsck_ffs/fsck.h (revision 202342) +++ sbin/fsck_ffs/fsck.h (working copy) @@ -347,10 +347,6 @@ void direrror(ino_t ino, const char *errmesg); int dirscan(struct inodesc *); int dofix(struct inodesc *, const char *msg); int eascan(struct inodesc *, struct ufs2_dinode *dp); -void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); -void ffs_fragacct(struct fs *, int, int32_t [], int); -int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); -void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); void fileerror(ino_t cwd, ino_t ino, const char *errmesg); int findino(struct inodesc *); int findname(struct inodesc *); @@ -392,3 +388,4 @@ void sblock_init(void); void setinodebuf(ino_t); int setup(char *dev); void gjournal_check(const char *filesys); +void suj_check(const char *filesys); Index: sbin/fsck_ffs/Makefile =================================================================== --- sbin/fsck_ffs/Makefile (revision 202342) +++ sbin/fsck_ffs/Makefile (working copy) @@ -7,12 +7,12 @@ LINKS+= ${BINDIR}/fsck_ffs ${BINDIR}/fsck_4.2bsd MAN= fsck_ffs.8 MLINKS= fsck_ffs.8 fsck_ufs.8 fsck_ffs.8 fsck_4.2bsd.8 SRCS= dir.c ea.c fsutil.c inode.c main.c pass1.c pass1b.c pass2.c pass3.c \ - pass4.c pass5.c setup.c utilities.c ffs_subr.c ffs_tables.c gjournal.c \ - getmntopts.c + pass4.c pass5.c setup.c suj.c 
utilities.c gjournal.c getmntopts.c DPADD= ${LIBUFS} LDADD= -lufs +CFLAGS= -O0 -pipe WARNS?= 2 -CFLAGS+= -I${.CURDIR} -I${.CURDIR}/../mount +CFLAGS+= -I${.CURDIR} -I${.CURDIR}/../mount -g .PATH: ${.CURDIR}/../../sys/ufs/ffs ${.CURDIR}/../mount Index: sbin/fsdb/fsdbutil.c =================================================================== --- sbin/fsdb/fsdbutil.c (revision 202342) +++ sbin/fsdb/fsdbutil.c (working copy) @@ -52,7 +52,7 @@ static const char rcsid[] = #include "fsck.h" static int charsperline(void); -static int printindir(ufs2_daddr_t blk, int level, char *bufp); +static void printindir(ufs2_daddr_t blk, int level, char *bufp); static void printblocks(ino_t inum, union dinode *dp); char ** @@ -226,7 +226,7 @@ charsperline(void) /* * Recursively print a list of indirect blocks. */ -static int +static void printindir(ufs2_daddr_t blk, int level, char *bufp) { struct bufarea buf, *bp; @@ -234,6 +234,9 @@ printindir(ufs2_daddr_t blk, int level, char *bufp int i, j, cpl, charssofar; ufs2_daddr_t blkno; + if (blk == 0) + return; + printf("%jd (%d) =>\n", (intmax_t)blk, level); if (level == 0) { /* for the final indirect level, don't use the cache */ bp = &buf; @@ -251,11 +254,8 @@ printindir(ufs2_daddr_t blk, int level, char *bufp blkno = bp->b_un.b_indir1[i]; else blkno = bp->b_un.b_indir2[i]; - if (blkno == 0) { - if (level == 0) - putchar('\n'); - return 0; - } + if (blkno == 0) + continue; j = sprintf(tempbuf, "%jd", (intmax_t)blkno); if (level == 0) { charssofar += j; @@ -270,13 +270,14 @@ printindir(ufs2_daddr_t blk, int level, char *bufp charssofar += 2; } else { printf(" =>\n"); - if (printindir(blkno, level - 1, bufp) == 0) - return 0; + printindir(blkno, level - 1, bufp); + printf("\n"); + charssofar = 0; } } if (level == 0) putchar('\n'); - return 1; + return; } @@ -309,7 +310,7 @@ printblocks(ino_t inum, union dinode *dp) } } putchar('\n'); - if (DIP(dp, di_ib[0]) == 0) + if (ndb == 0) return; bufp = malloc((unsigned int)sblock.fs_bsize); @@ -317,8 +318,7 @@ printblocks(ino_t inum, union dinode *dp) errx(EEXIT, "cannot allocate indirect block buffer"); printf("Indirect blocks:\n"); for (i = 0; i < NIADDR; i++) - if (printindir(DIP(dp, di_ib[i]), i, bufp) == 0) - break; + printindir(DIP(dp, di_ib[i]), i, bufp); free(bufp); } Index: sbin/mount/mount.c =================================================================== --- sbin/mount/mount.c (revision 202342) +++ sbin/mount/mount.c (working copy) @@ -113,6 +113,7 @@ static struct opt { { MNT_ACLS, "acls" }, { MNT_NFS4ACLS, "nfsv4acls" }, { MNT_GJOURNAL, "gjournal" }, + { MNT_SUJ, "journal" }, /* always soft-updates, journal */ { 0, NULL } }; Index: sbin/tunefs/tunefs.c =================================================================== --- sbin/tunefs/tunefs.c (revision 202342) +++ sbin/tunefs/tunefs.c (working copy) @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -72,16 +73,19 @@ struct uufsd disk; void usage(void); void printfs(void); +int journal_alloc(int64_t size); +void sbdirty(void); int main(int argc, char *argv[]) { - char *avalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue; + char *avalue, *jvalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue; const char *special, *on; const char *name; int active; - int Aflag, aflag, eflag, evalue, fflag, fvalue, Jflag, Lflag, lflag; - int mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag, svalue; + int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, Lflag; + int lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue, 
pflag, sflag; + int svalue, Sflag, Svalue; int ch, found_arg, i; const char *chg[2]; struct ufs_args args; @@ -89,13 +93,13 @@ main(int argc, char *argv[]) if (argc < 3) usage(); - Aflag = aflag = eflag = fflag = Jflag = Lflag = lflag = mflag = 0; - Nflag = nflag = oflag = pflag = sflag = 0; - avalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL; - evalue = fvalue = mvalue = ovalue = svalue = 0; + Aflag = aflag = eflag = fflag = jflag = Jflag = Lflag = lflag = 0; + mflag = Nflag = nflag = oflag = pflag = sflag = 0; + avalue = jvalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL; + evalue = fvalue = mvalue = ovalue = svalue = Svalue = 0; active = 0; found_arg = 0; /* At least one arg is required. */ - while ((ch = getopt(argc, argv, "Aa:e:f:J:L:l:m:N:n:o:ps:")) != -1) + while ((ch = getopt(argc, argv, "Aa:e:f:j:J:L:l:m:N:n:o:ps:S:")) != -1) switch (ch) { case 'A': @@ -135,6 +139,18 @@ main(int argc, char *argv[]) fflag = 1; break; + case 'j': + found_arg = 1; + name = "softdep journaled file system"; + jvalue = optarg; + if (strcmp(jvalue, "enable") && + strcmp(jvalue, "disable")) { + errx(10, "bad %s (options are %s)", + name, "`enable' or `disable'"); + } + jflag = 1; + break; + case 'J': found_arg = 1; name = "gjournaled file system"; @@ -240,6 +256,16 @@ main(int argc, char *argv[]) sflag = 1; break; + case 'S': + found_arg = 1; + name = "Softdep Journal Size"; + Svalue = atoi(optarg); + if (Svalue < SUJ_MIN) + errx(10, "%s must be >= %d (was %s)", + name, SUJ_MIN, optarg); + Sflag = 1; + break; + default: usage(); } @@ -310,6 +336,33 @@ main(int argc, char *argv[]) sblock.fs_avgfilesize = fvalue; } } + if (jflag) { + name = "soft updates journaling"; + if (strcmp(jvalue, "enable") == 0) { + if ((sblock.fs_flags & (FS_DOSOFTDEP | FS_SUJ)) == + (FS_DOSOFTDEP | FS_SUJ)) { + warnx("%s remains unchanged as enabled", name); + } else if (sblock.fs_clean == 0) { + warnx("%s cannot be enabled until fsck is run", + name); + } else if (journal_alloc(Svalue) != 0) { + warnx("%s can not be enabled", name); + } else { + sblock.fs_flags |= FS_DOSOFTDEP | FS_SUJ; + warnx("%s set", name); + } + } else if (strcmp(jvalue, "disable") == 0) { + if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) { + warnx("%s remains unchanged as disabled", name); + } else { + sbdirty(); + sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ); + sblock.fs_sujournal = 0; + sblock.fs_sujfree = 0; + warnx("%s cleared", name); + } + } + } if (Jflag) { name = "gjournal"; if (strcmp(Jvalue, "enable") == 0) { @@ -456,6 +509,229 @@ err: } void +sbdirty(void) +{ + disk.d_fs.fs_flags |= FS_UNCLEAN | FS_NEEDSFSCK; + disk.d_fs.fs_clean = 0; +} + +int blocks; +static char clrbuf[MAXBSIZE]; + +static ufs2_daddr_t +journal_balloc(void) +{ + ufs2_daddr_t blk; + struct cg *cgp; + struct fs *fs; + int valid; + + cgp = &disk.d_cg; + fs = &disk.d_fs; + for (;;) { + blk = cgballoc(&disk); + if (blk > 0) + break; + /* + * If we failed to allocate a block from this cg, move to + * the next. + */ + if (cgwrite(&disk) < 0) { + warn("Failed to write updated cg"); + return (-1); + } + while ((valid = cgread(&disk)) == 1) { + /* + * Try to minimize fragmentation by requiring a minimum + * number of blocks present. 
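+ * Skipping any cg with fewer than blocks / 8 free blocks keeps
+ * the journal out of nearly-full groups, where its blocks would
+ * end up scattered among stray fragments.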
+ */ + if (cgp->cg_cs.cs_nbfree > blocks / 8) + break; + } + if (valid) + continue; + warnx("Failed to find sufficient free blocks for the journal"); + return -1; + } + if (bwrite(&disk, fsbtodb(fs, blk), clrbuf, fs->fs_bsize) <= 0) { + warn("Failed to initialize new block"); + return -1; + } + return (blk); +} + +static int +indir_fill(ufs2_daddr_t blk, int level, int *resid) +{ + char indirbuf[MAXBSIZE]; + ufs1_daddr_t *bap1; + ufs2_daddr_t *bap2; + ufs2_daddr_t nblk; + struct fs *fs; + int ncnt; + int cnt; + int i; + + fs = &disk.d_fs; + bzero(indirbuf, sizeof(indirbuf)); + bap1 = (ufs1_daddr_t *)indirbuf; + bap2 = (void *)bap1; + cnt = 0; + for (i = 0; i < NINDIR(fs) && *resid != 0; i++) { + nblk = journal_balloc(); + if (nblk <= 0) + return (-1); + cnt++; + if (fs->fs_magic == FS_UFS1_MAGIC) + *bap1++ = nblk; + else + *bap2++ = nblk; + if (level != 0) { + ncnt = indir_fill(nblk, level - 1, resid); + if (ncnt <= 0) + return (-1); + cnt += ncnt; + } else + (*resid)--; + } + if (bwrite(&disk, fsbtodb(fs, blk), indirbuf, fs->fs_bsize) <= 0) { + warn("Failed to write indirect"); + return (-1); + } + return (cnt); +} + +int +journal_alloc(int64_t size) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + ufs2_daddr_t blk; + void *ip; + struct cg *cgp; + struct fs *fs; + int resid; + ino_t ino; + int blks; + int mode; + int i; + + fs = &disk.d_fs; + cgp = &disk.d_cg; + ino = 0; + + /* + * If the user didn't supply a size pick one based on the filesystem + * size constrained with hardcoded MIN and MAX values. We opt for + * 1/1024th of the filesystem up to MAX but not exceeding one CG and + * not less than the MIN. + */ + if (size == 0) { + size = (fs->fs_size * fs->fs_bsize) / 1024; + size = MIN(SUJ_MAX, size); + if (size / fs->fs_fsize > fs->fs_fpg) + size = fs->fs_fpg * fs->fs_fsize; + size = MAX(SUJ_MIN, size); + } + resid = blocks = size / fs->fs_bsize; + if (fs->fs_cstotal.cs_nbfree < blocks) { + warn("Insufficient free space for %jd byte journal", size); + return (-1); + } + /* + * Find a cg with enough blocks to satisfy the journal + * size. Presently the journal does not span cgs. + */ + while (cgread(&disk) == 1) { + if (cgp->cg_cs.cs_nifree == 0) + continue; + /* + * Try to minimize fragmentation by requiring at least a + * 1/8th of the blocks be present in each cg we use. + */ + if (cgp->cg_cs.cs_nbfree < blocks / 8) + continue; + ino = cgialloc(&disk); + if (ino <= 0) + break; + printf("Using inode %d in cg %d for %jd byte journal\n", + ino, cgp->cg_cgx, size); + if (getino(&disk, &ip, ino, &mode) != 0) { + warn("Failed to get allocated inode"); + sbdirty(); + goto out; + } + /* + * We leave fields unrelated to the number of allocated + * blocks and size uninitialized. This causes legacy + * fsck implementations to clear the inode. 
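+ * Only di_size, di_mode, di_nlink, di_blocks and the block
+ * pointers are set below; an fsck that predates FS_SUJ sees an
+ * inconsistent inode and reclaims it rather than leaving a
+ * stale journal behind.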
+ */ + dp2 = ip; + dp1 = ip; + if (fs->fs_magic == FS_UFS1_MAGIC) { + bzero(dp1, sizeof(*dp1)); + dp1->di_size = size; + dp1->di_mode = IFREG; + dp1->di_nlink = 1; + } else { + bzero(dp2, sizeof(*dp2)); + dp2->di_size = size; + dp2->di_mode = IFREG; + dp2->di_nlink = 1; + } + for (i = 0; i < NDADDR && resid; i++, resid--) { + blk = journal_balloc(); + if (blk <= 0) + goto out; + if (fs->fs_magic == FS_UFS1_MAGIC) { + dp1->di_db[i] = blk; + dp1->di_blocks++; + } else { + dp2->di_db[i] = blk; + dp2->di_blocks++; + } + } + for (i = 0; i < NIADDR && resid; i++) { + blk = journal_balloc(); + if (blk <= 0) + goto out; + blks = indir_fill(blk, i, &resid) + 1; + if (blks <= 0) { + sbdirty(); + goto out; + } + if (fs->fs_magic == FS_UFS1_MAGIC) { + dp1->di_ib[i] = blk; + dp1->di_blocks += blks; + } else { + dp2->di_ib[i] = blk; + dp2->di_blocks += blks; + } + } + if (fs->fs_magic == FS_UFS1_MAGIC) + dp1->di_blocks *= fs->fs_bsize / disk.d_bsize; + else + dp2->di_blocks *= fs->fs_bsize / disk.d_bsize; + if (putino(&disk) < 0) { + warn("Failed to write inode"); + sbdirty(); + return (-1); + } + if (cgwrite(&disk) < 0) { + warn("Failed to write updated cg"); + sbdirty(); + return (-1); + } + fs->fs_sujournal = ino; + fs->fs_sujfree = 0; + return (0); + } + warnx("Insufficient contiguous free space for the journal."); +out: + return (-1); +} + +void usage(void) { fprintf(stderr, "%s\n%s\n%s\n%s\n", @@ -477,6 +753,8 @@ printfs(void) (sblock.fs_flags & FS_MULTILABEL)? "enabled" : "disabled"); warnx("soft updates: (-n) %s", (sblock.fs_flags & FS_DOSOFTDEP)? "enabled" : "disabled"); + warnx("soft update journaling: (-j) %s", + (sblock.fs_flags & FS_SUJ)? "enabled" : "disabled"); warnx("gjournal: (-J) %s", (sblock.fs_flags & FS_GJOURNAL)? "enabled" : "disabled"); warnx("maximum blocks per file in a cylinder group: (-e) %d", Index: usr.sbin/makefs/ffs/ffs_bswap.c =================================================================== --- usr.sbin/makefs/ffs/ffs_bswap.c (revision 202342) +++ usr.sbin/makefs/ffs/ffs_bswap.c (working copy) @@ -136,8 +136,6 @@ ffs_dinode1_swap(struct ufs1_dinode *o, struct ufs n->di_mode = bswap16(o->di_mode); n->di_nlink = bswap16(o->di_nlink); - n->di_u.oldids[0] = bswap16(o->di_u.oldids[0]); - n->di_u.oldids[1] = bswap16(o->di_u.oldids[1]); n->di_size = bswap64(o->di_size); n->di_atime = bswap32(o->di_atime); n->di_atimensec = bswap32(o->di_atimensec); Index: lib/libufs/inode.c =================================================================== --- lib/libufs/inode.c (revision 202342) +++ lib/libufs/inode.c (working copy) @@ -93,3 +93,19 @@ gotit: switch (disk->d_ufs) { ERROR(disk, "unknown UFS filesystem type"); return (-1); } + +int +putino(struct uufsd *disk) +{ + struct fs *fs; + + fs = &disk->d_fs; + if (disk->d_inoblock == NULL) { + ERROR(disk, "No inode block allocated"); + return (-1); + } + if (bwrite(disk, fsbtodb(fs, ino_to_fsba(&disk->d_fs, disk->d_inomin)), + disk->d_inoblock, disk->d_fs.fs_bsize) <= 0) + return (-1); + return (0); +} Index: lib/libufs/cgroup.c =================================================================== --- lib/libufs/cgroup.c (revision 202342) +++ lib/libufs/cgroup.c (working copy) @@ -40,11 +40,82 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include +ufs2_daddr_t +cgballoc(struct uufsd *disk) +{ + u_int8_t *blksfree; + struct cg *cgp; + struct fs *fs; + long bno; + + fs = &disk->d_fs; + cgp = &disk->d_cg; + blksfree = cg_blksfree(cgp); + for (bno = 0; bno < fs->fs_fpg / fs->fs_frag; bno++) + if 
(ffs_isblock(fs, blksfree, bno)) + goto gotit; + return (0); +gotit: + fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; + ffs_clrblock(fs, blksfree, (long)bno); + ffs_clusteracct(fs, cgp, bno, -1); + cgp->cg_cs.cs_nbfree--; + fs->fs_cstotal.cs_nbfree--; + fs->fs_fmod = 1; + return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, bno)); +} + +ino_t +cgialloc(struct uufsd *disk) +{ + struct ufs2_dinode *dp2; + u_int8_t *inosused; + struct cg *cgp; + struct fs *fs; + ino_t ino; + int i; + + fs = &disk->d_fs; + cgp = &disk->d_cg; + inosused = cg_inosused(cgp); + for (ino = 0; ino < fs->fs_ipg; ino++) + if (isclr(inosused, ino)) + goto gotit; + return (0); +gotit: + if (fs->fs_magic == FS_UFS2_MAGIC && + ino + INOPB(fs) > cgp->cg_initediblk && + cgp->cg_initediblk < cgp->cg_niblk) { + char block[MAXBSIZE]; + bzero(block, (int)fs->fs_bsize); + dp2 = (struct ufs2_dinode *)&block; + for (i = 0; i < INOPB(fs); i++) { + dp2->di_gen = arc4random() / 2 + 1; + dp2++; + } + if (bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, + cgp->cg_cgx * fs->fs_ipg + cgp->cg_initediblk)), + block, fs->fs_bsize)) + return (0); + cgp->cg_initediblk += INOPB(fs); + } + + setbit(inosused, ino); + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree--; + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cgp->cg_cgx).cs_nifree--; + fs->fs_fmod = 1; + + return (ino + (cgp->cg_cgx * fs->fs_ipg)); +} + int cgread(struct uufsd *disk) { @@ -55,14 +126,12 @@ int cgread1(struct uufsd *disk, int c) { struct fs *fs; - off_t ccg; fs = &disk->d_fs; if (c >= fs->fs_ncg) { return (0); } - ccg = fsbtodb(fs, cgtod(fs, c)) * disk->d_bsize; if (bread(disk, fsbtodb(fs, cgtod(fs, c)), disk->d_cgunion.d_buf, fs->fs_bsize) == -1) { ERROR(disk, "unable to read cylinder group"); @@ -73,6 +142,12 @@ cgread1(struct uufsd *disk, int c) } int +cgwrite(struct uufsd *disk) +{ + return (cgwrite1(disk, disk->d_lcg)); +} + +int cgwrite1(struct uufsd *disk, int c) { struct fs *fs; Index: lib/libufs/type.c =================================================================== --- lib/libufs/type.c (revision 202342) +++ lib/libufs/type.c (working copy) @@ -66,6 +66,10 @@ ufs_disk_close(struct uufsd *disk) free((char *)(uintptr_t)disk->d_name); disk->d_name = NULL; } + if (disk->d_sbcsum != NULL) { + free(disk->d_sbcsum); + disk->d_sbcsum = NULL; + } return (0); } @@ -156,6 +160,7 @@ again: if ((ret = stat(name, &st)) < 0) { disk->d_mine = 0; disk->d_ufs = 0; disk->d_error = NULL; + disk->d_sbcsum = NULL; if (oname != name) { name = strdup(name); Index: lib/libufs/libufs.h =================================================================== --- lib/libufs/libufs.h (revision 202342) +++ lib/libufs/libufs.h (working copy) @@ -71,6 +71,7 @@ struct uufsd { int d_fd; /* raw device file descriptor */ long d_bsize; /* device bsize */ ufs2_daddr_t d_sblock; /* superblock location */ + struct csum *d_sbcsum; /* Superblock summary info */ caddr_t d_inoblock; /* inode block */ ino_t d_inomin; /* low inode */ ino_t d_inomax; /* high inode */ @@ -109,14 +110,18 @@ int berase(struct uufsd *, ufs2_daddr_t, ufs2_dadd /* * cgroup.c */ +ufs2_daddr_t cgballoc(struct uufsd *); +ino_t cgialloc(struct uufsd *); int cgread(struct uufsd *); int cgread1(struct uufsd *, int); +int cgwrite(struct uufsd *); int cgwrite1(struct uufsd *, int); /* * inode.c */ int getino(struct uufsd *, void **, ino_t, int *); +int putino(struct uufsd *); /* * sblock.c @@ -132,6 +137,16 @@ int ufs_disk_fillout(struct uufsd *, const char *) int ufs_disk_fillout_blank(struct uufsd *, const char *); int ufs_disk_write(struct uufsd *); +/* + * ffs_subr.c + */
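+/*
+ * These are the kernel's FFS bitmap and cluster-accounting helpers.
+ * The Makefile change below compiles ffs_subr.c and ffs_tables.c
+ * straight from sys/ufs/ffs, so fsck_ffs and tunefs now share one
+ * implementation through libufs.
+ */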
+void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); +void ffs_fragacct(struct fs *, int, int32_t [], int); +int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); + __END_DECLS #endif /* __LIBUFS_H__ */ Index: lib/libufs/Makefile =================================================================== --- lib/libufs/Makefile (revision 202342) +++ lib/libufs/Makefile (working copy) @@ -3,7 +3,7 @@ LIB= ufs SHLIBDIR?= /lib -SRCS= block.c cgroup.c inode.c sblock.c type.c +SRCS= block.c cgroup.c inode.c sblock.c type.c ffs_subr.c ffs_tables.c INCS= libufs.h MAN= bread.3 cgread.3 libufs.3 sbread.3 ufs_disk_close.3 @@ -16,8 +16,11 @@ MLINKS+= ufs_disk_close.3 ufs_disk_fillout.3 MLINKS+= ufs_disk_close.3 ufs_disk_fillout_blank.3 MLINKS+= ufs_disk_close.3 ufs_disk_write.3 -WARNS?= 3 +.PATH: ${.CURDIR}/../../sys/ufs/ffs +WARNS?= 2 + +DEBUG_FLAGS = -g CFLAGS+= -D_LIBUFS .if defined(LIBUFS_DEBUG) CFLAGS+= -D_LIBUFS_DEBUGGING Index: lib/libufs/sblock.c =================================================================== --- lib/libufs/sblock.c (revision 202342) +++ lib/libufs/sblock.c (working copy) @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -49,8 +50,11 @@ static int superblocks[] = SBLOCKSEARCH; int sbread(struct uufsd *disk) { + uint8_t block[MAXBSIZE]; struct fs *fs; int sb, superblock; + int i, size, blks; + uint8_t *space; ERROR(disk, NULL); @@ -86,6 +90,34 @@ sbread(struct uufsd *disk) } disk->d_bsize = fs->fs_fsize / fsbtodb(fs, 1); disk->d_sblock = superblock / disk->d_bsize; + /* + * Read in the superblock summary information. + */ + size = fs->fs_cssize; + blks = howmany(size, fs->fs_fsize); + size += fs->fs_ncg * sizeof(int32_t); + space = malloc(size); + if (space == NULL) { + ERROR(disk, "failed to allocate space for summary information"); + return (-1); + } + fs->fs_csp = (struct csum *)space; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + if (bread(disk, fsbtodb(fs, fs->fs_csaddr + i), block, size) + == -1) { + ERROR(disk, "Failed to read sb summary information"); + free(fs->fs_csp); + return (-1); + } + bcopy(block, space, size); + space += size; + } + fs->fs_maxcluster = (uint32_t *)space; + disk->d_sbcsum = fs->fs_csp; + return (0); } @@ -93,7 +125,8 @@ int sbwrite(struct uufsd *disk, int all) { struct fs *fs; - int i; + int i, blks, size; + uint8_t *space; ERROR(disk, NULL); @@ -107,6 +140,22 @@ sbwrite(struct uufsd *disk, int all) ERROR(disk, "failed to write superblock"); return (-1); } + /* + * Write superblock summary information. 
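+ * This mirrors the read loop in sbread(): the csum area is
+ * written from fs_csaddr in full blocks, with a short final
+ * transfer of (blks - i) * fs_fsize when fs_cssize is not a
+ * multiple of the block size.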
+ */ + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = (uint8_t *)disk->d_sbcsum; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + i), space, size) + == -1) { + ERROR(disk, "Failed to write sb summary information"); + return (-1); + } + space += size; + } if (all) { for (i = 0; i < fs->fs_ncg; i++) if (bwrite(disk, fsbtodb(fs, cgsblock(fs, i)), Index: sys/ufs/ufs/ufs_dirhash.c =================================================================== --- sys/ufs/ufs/ufs_dirhash.c (revision 202342) +++ sys/ufs/ufs/ufs_dirhash.c (working copy) @@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$"); static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables"); -static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); - static int ufs_mindirhashsize = DIRBLKSIZ * 5; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, &ufs_mindirhashsize, Index: sys/ufs/ufs/inode.h =================================================================== --- sys/ufs/ufs/inode.h (revision 202342) +++ sys/ufs/ufs/inode.h (working copy) @@ -120,7 +120,7 @@ struct inode { #define IN_CHANGE 0x0002 /* Inode change time update request. */ #define IN_UPDATE 0x0004 /* Modification time update request. */ #define IN_MODIFIED 0x0008 /* Inode has been modified. */ -#define IN_RENAME 0x0010 /* Inode is being renamed. */ +#define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */ #define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */ #define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */ #define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the @@ -175,6 +175,7 @@ struct indir { /* Determine if soft dependencies are being done */ #define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP) #define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) +#define DOINGSUJ(vp) ((vp)->v_mount->mnt_flag & MNT_SUJ) /* This overlays the fid structure (see mount.h). */ struct ufid { Index: sys/ufs/ufs/dinode.h =================================================================== --- sys/ufs/ufs/dinode.h (revision 202342) +++ sys/ufs/ufs/dinode.h (working copy) @@ -146,7 +146,8 @@ struct ufs2_dinode { ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */ ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */ u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */ - int64_t di_spare[2]; /* 240: Reserved; currently unused */ + ino_t di_freelink; /* 240: SUJ: Next unlinked inode. */ + uint32_t di_spare[3]; /* 244: Reserved; currently unused */ }; /* @@ -167,9 +168,7 @@ struct ufs2_dinode { struct ufs1_dinode { u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ int16_t di_nlink; /* 2: File link count. */ - union { - u_int16_t oldids[2]; /* 4: Ffs: old user and group ids. */ - } di_u; + ino_t di_freelink; /* 4: SUJ: Next unlinked inode. */ u_int64_t di_size; /* 8: File byte count. */ int32_t di_atime; /* 16: Last access time. */ int32_t di_atimensec; /* 20: Last access time. */ @@ -186,7 +185,5 @@ struct ufs1_dinode { u_int32_t di_gid; /* 116: File group. 
*/ u_int64_t di_modrev; /* 120: i_modrev for NFSv4 */ }; -#define di_ogid di_u.oldids[1] -#define di_ouid di_u.oldids[0] #endif /* _UFS_UFS_DINODE_H_ */ Index: sys/ufs/ufs/ufs_vnops.c =================================================================== --- sys/ufs/ufs/ufs_vnops.c (revision 202342) +++ sys/ufs/ufs/ufs_vnops.c (working copy) @@ -114,6 +114,8 @@ static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; static vop_pathconf_t ufsfifo_pathconf; +SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); + /* * A virgin directory (no blushing please). */ @@ -974,6 +976,9 @@ ufs_link(ap) error = EXDEV; goto out; } + if (VTOI(tdvp)->i_effnlink < 2) + panic("ufs_link: Bad link count %d on parent", + VTOI(tdvp)->i_effnlink); ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; @@ -988,11 +993,11 @@ ufs_link(ap) DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) - softdep_change_linkcnt(ip); + softdep_setup_link(VTOI(tdvp), ip); error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); + error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0); } if (error) { @@ -1001,7 +1006,7 @@ ufs_link(ap) DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) - softdep_change_linkcnt(ip); + softdep_revert_link(VTOI(tdvp), ip); } out: return (error); @@ -1043,7 +1048,7 @@ ufs_whiteout(ap) newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; - error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); + error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0); break; case DELETE: @@ -1062,6 +1067,11 @@ ufs_whiteout(ap) return (error); } +static volatile int rename_restarts; +SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, + __DEVOLATILE(int *, &rename_restarts), 0, + "Times rename had to restart due to lock contention"); + /* * Rename system call. * rename("foo", "bar"); @@ -1101,111 +1111,183 @@ ufs_rename(ap) struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; + struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; - struct inode *ip, *xp, *dp; + struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; - int doingdirectory = 0, oldparent = 0, newparent = 0; + off_t endoff; + int doingdirectory, newparent; int error = 0, ioflag; - ino_t fvp_ino; + struct mount *mp; + ino_t ino; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif + endoff = 0; + mp = tdvp->v_mount; + VOP_UNLOCK(tdvp, 0); + if (tvp && tvp != tdvp) + VOP_UNLOCK(tvp, 0); /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; -abortit: - if (tdvp == tvp) - vrele(tdvp); - else - vput(tdvp); - if (tvp) - vput(tvp); - vrele(fdvp); + mp = NULL; + goto releout; + } + error = vfs_busy(mp, 0); + if (error) { + mp = NULL; + goto releout; + } +relock: + /* + * We need to acquire 2 to 4 locks depending on whether tvp is NULL + * and fdvp and tdvp are the same directory. Subsequently we need + * to double-check all paths and in the directory rename case we + * need to verify that we are not creating a directory loop. 
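+ * (For example, mv /a/b /a/b/c/d would orphan the subtree rooted
+ * at /a/b.)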
To + * handle this we acquire all but fdvp using non-blocking + * acquisitions. If we fail to acquire any lock in the path we will + * drop all held locks, acquire the new lock in a blocking fashion, + * and then release it and restart the rename. This acquire/release + * step ensures that we do not spin on a lock waiting for release. + */ + error = vn_lock(fdvp, LK_EXCLUSIVE); + if (error) + goto releout; + if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + VOP_UNLOCK(fdvp, 0); + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto releout; + VOP_UNLOCK(tdvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + /* + * Re-resolve fvp to be certain it still exists and fetch the + * correct vnode. + */ + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + goto releout; + } + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + VOP_UNLOCK(nvp, 0); vrele(fvp); - return (error); + fvp = nvp; + atomic_add_int(&rename_restarts, 1); + goto relock; } - + vrele(fvp); + fvp = nvp; + /* + * Re-resolve tvp and acquire the vnode lock if present. + */ + error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); + if (error != 0 && error != EJUSTRETURN) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + goto releout; + } + /* + * If tvp disappeared we just carry on. + */ + if (error == EJUSTRETURN && tvp != NULL) { + vrele(tvp); + tvp = NULL; + } + /* + * Get the tvp ino if the lookup succeeded. We may have to restart + * if the non-blocking acquire fails. + */ + if (error == 0) { + nvp = NULL; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (tvp) + vrele(tvp); + tvp = nvp; + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + VOP_UNLOCK(nvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + } + fdp = VTOI(fdvp); + fip = VTOI(fvp); + tdp = VTOI(tdvp); + tip = NULL; + if (tvp) + tip = VTOI(tvp); if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; - goto abortit; + goto unlockout; } - /* * Renaming a file to itself has no effect. The upper layers should - * not call us in that case. Temporarily just warn if they do. + * not call us in that case. However, things could change after + * we drop the locks above. */ if (fvp == tvp) { - printf("ufs_rename: fvp == tvp (can't happen)\n"); error = 0; - goto abortit; + goto unlockout; } - - if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) - goto abortit; - dp = VTOI(fdvp); - ip = VTOI(fvp); - if (ip->i_nlink >= LINK_MAX) { - VOP_UNLOCK(fvp, 0); + doingdirectory = 0; + newparent = 0; + ino = fip->i_number; + if (fip->i_nlink >= LINK_MAX) { error = EMLINK; - goto abortit; + goto unlockout; } - if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) - || (dp->i_flags & APPEND)) { - VOP_UNLOCK(fvp, 0); + if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) + || (fdp->i_flags & APPEND)) { error = EPERM; - goto abortit; + goto unlockout; } - if ((ip->i_mode & IFMT) == IFDIR) { + if ((fip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. 
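 * Renaming any of these would allow a directory to be detached
 * from, or nested inside, itself.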
*/ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || - dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || - (ip->i_flag & IN_RENAME)) { - VOP_UNLOCK(fvp, 0); + fdp == fip || + (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; - goto abortit; + goto unlockout; } - ip->i_flag |= IN_RENAME; - oldparent = dp->i_number; + if (fdp->i_number != tdp->i_number) + newparent = tdp->i_number; doingdirectory = 1; } - vrele(fdvp); - - /* - * When the target exists, both the directory - * and target vnodes are returned locked. - */ - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); - - /* - * 1) Bump link count while we're moving stuff - * around. If we crash somewhere before - * completing our work, the link count - * may be wrong, but correctable. - */ - ip->i_effnlink++; - ip->i_nlink++; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | - DOINGASYNC(fvp)))) != 0) { - VOP_UNLOCK(fvp, 0); - goto bad; + if (fvp->v_mountedhere != NULL || (tvp && tvp->v_mountedhere != NULL)) { + error = EXDEV; + goto unlockout; } /* @@ -1214,88 +1296,93 @@ ufs_rename(ap) * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so - * as to be able to change "..". We must repeat the call - * to namei, as the parent directory is unlocked by the - * call to checkpath(). + * as to be able to change "..". */ - error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); - fvp_ino = ip->i_number; - VOP_UNLOCK(fvp, 0); - if (oldparent != dp->i_number) - newparent = dp->i_number; if (doingdirectory && newparent) { - if (error) /* write access check above */ - goto bad; - if (xp != NULL) - vput(tvp); - error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred); + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); if (error) - goto out; + goto unlockout; + error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, + &ino); + /* + * We encountered a lock that we have to wait for. Unlock + * everything else and VGET before restarting. + */ + if (ino) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(fvp, 0); + VOP_UNLOCK(tdvp, 0); + if (tvp) + VOP_UNLOCK(tvp, 0); + error = VFS_VGET(mp, ino, LK_SHARED, &nvp); + if (error == 0) + vput(nvp); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + if (error) + goto unlockout; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); - VREF(tdvp); - error = relookup(tdvp, &tvp, tcnp); - if (error) - goto out; - vrele(tdvp); - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); } + if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || + tdp->i_effnlink == 0) + panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + fip->i_effnlink++; + fip->i_nlink++; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_setup_link(tdp, fip); + error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp))); + if (error) + goto bad; + + /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. 
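 * Below, the tip == NULL branch enters the new name with
 * ufs_direnter(); otherwise the existing target entry is
 * rewritten in place with ufs_dirrewrite().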
*/ - if (xp == NULL) { - if (dp->i_dev != ip->i_dev) + if (tip == NULL) { + if (tdp->i_dev != fip->i_dev) panic("ufs_rename: EXDEV"); - /* - * Account for ".." in new directory. - * When source and destination have the same - * parent we don't fool with the link count. - */ if (doingdirectory && newparent) { - if ((nlink_t)dp->i_nlink >= LINK_MAX) { + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't adjust the link count. The + * actual link modification is completed when + * .. is rewritten below. + */ + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } - dp->i_effnlink++; - dp->i_nlink++; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | - DOINGASYNC(tdvp))); - if (error) - goto bad; } - ufs_makedirentry(ip, tcnp, &newdir); - error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); - if (error) { - if (doingdirectory && newparent) { - dp->i_effnlink--; - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - (void)UFS_UPDATE(tdvp, 1); - } + ufs_makedirentry(fip, tcnp, &newdir); + error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1); + if (error) goto bad; - } - vput(tdvp); + /* Setup tdvp for directory compaction if needed. */ + if (tdp->i_count && tdp->i_endoff && + tdp->i_endoff < tdp->i_size) + endoff = tdp->i_endoff; } else { - if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ - if (xp->i_number == ip->i_number) + if (tip->i_number == fip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller @@ -1303,7 +1390,7 @@ ufs_rename(ap) * destination of the rename. This implements append-only * directories. */ - if ((dp->i_mode & S_ISTXT) && + if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; @@ -1314,9 +1401,9 @@ ufs_rename(ap) * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ - if ((xp->i_mode&IFMT) == IFDIR) { - if ((xp->i_effnlink > 2) || - !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { + if ((tip->i_mode & IFMT) == IFDIR) { + if ((tip->i_effnlink > 2) || + !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } @@ -1329,21 +1416,31 @@ ufs_rename(ap) error = EISDIR; goto bad; } - error = ufs_dirrewrite(dp, xp, ip->i_number, - IFTODT(ip->i_mode), - (doingdirectory && newparent) ? newparent : doingdirectory); - if (error) - goto bad; if (doingdirectory) { if (!newparent) { - dp->i_effnlink--; + tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); + softdep_change_linkcnt(tdp); } - xp->i_effnlink--; + tip->i_effnlink--; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(xp); + softdep_change_linkcnt(tip); } + error = ufs_dirrewrite(tdp, tip, fip->i_number, + IFTODT(fip->i_mode), + (doingdirectory && newparent) ? newparent : doingdirectory); + if (error) { + if (doingdirectory) { + if (!newparent) { + tdp->i_effnlink++; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + } + tip->i_effnlink++; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(tip); + } + } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * Truncate inode. 
The only stuff left in the directory @@ -1357,115 +1454,107 @@ ufs_rename(ap) * them now. */ if (!newparent) { - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; + tdp->i_nlink--; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; } - xp->i_nlink--; - DIP_SET(xp, i_nlink, xp->i_nlink); - xp->i_flag |= IN_CHANGE; + tip->i_nlink--; + DIP_SET(tip, i_nlink, tip->i_nlink); + tip->i_flag |= IN_CHANGE; ioflag = IO_NORMAL; if (!DOINGASYNC(tvp)) ioflag |= IO_SYNC; + /* Don't go to bad here as the new link exists. */ if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_thread)) != 0) - goto bad; + goto unlockout; } - vput(tdvp); - vput(tvp); - xp = NULL; } /* - * 3) Unlink the source. + * 3) Unlink the source. We have to resolve the path again to + * fixup the directory offset and count for ufs_dirremove. */ - fcnp->cn_flags &= ~MODMASK; - fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; - if ((fcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost from startdir"); - VREF(fdvp); - error = relookup(fdvp, &fvp, fcnp); - if (error == 0) - vrele(fdvp); - if (fvp != NULL) { - xp = VTOI(fvp); - dp = VTOI(fdvp); - } else { - /* - * From name has disappeared. IN_RENAME is not sufficient - * to protect against directory races due to timing windows, - * so we have to remove the panic. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. - */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - vrele(ap->a_fvp); - return (0); + if (fdvp == tdvp) { + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) + panic("ufs_rename: from entry went away!"); + if (ino != fip->i_number) + panic("ufs_rename: ino mismatch %d != %d\n", ino, + fip->i_number); } /* - * Ensure that the directory entry still exists and has not - * changed while the new name has been entered. If the source is - * a file then the entry may have been unlinked or renamed. In - * either case there is no further work to be done. If the source - * is a directory then it cannot have been rmdir'ed; the IN_RENAME - * flag ensures that it cannot be moved by another rename or removed - * by a rmdir. + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. */ - if (xp != ip) { + if (doingdirectory && newparent) { /* - * From name resolves to a different inode. IN_RENAME is - * not sufficient protection against timing window races - * so we can't panic here. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. + * If tip exists we simply use its link, otherwise we must + * add a new one. */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - } else { - /* - * If the source is a directory with a - * new parent, the link count of the old - * parent directory must be decremented - * and ".." set to point to the new parent. - */ - if (doingdirectory && newparent) { - xp->i_offset = mastertemplate.dot_reclen; - ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); - cache_purge(fdvp); + if (tip == NULL) { + tdp->i_effnlink++; + tdp->i_nlink++; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_setup_dotdot_link(tdp, fip); + error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | + DOINGASYNC(tdvp))); + /* Don't go to bad here as the new link exists. 
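+ * "bad" would roll back fip's link count bump, but the name
+ * entered in step 2 is already on disk and accounts for it.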
*/ + if (error) + goto unlockout; } - error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); - xp->i_flag &= ~IN_RENAME; + fip->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); + cache_purge(fdvp); } - if (dp) - vput(fdvp); - if (xp) - vput(fvp); - vrele(ap->a_fvp); + error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); + +unlockout: + vput(fdvp); + vput(fvp); + if (tvp) + vput(tvp); + /* + * If compaction or fsync was requested do it now that other locks + * are no longer needed. + */ + if (error == 0 && endoff != 0) { +#ifdef UFS_DIRHASH + if (tdp->i_dirhash != NULL) + ufsdirhash_dirtrunc(tdp, endoff); +#endif + UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred, + td); + } + if (error == 0 && tdp->i_flag & IN_NEEDSYNC) + error = VOP_FSYNC(tdvp, MNT_WAIT, td); + vput(tdvp); + if (mp) + vfs_unbusy(mp); return (error); bad: - if (xp) - vput(ITOV(xp)); - vput(ITOV(dp)); -out: - if (doingdirectory) - ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - ip->i_flag &= ~IN_RENAME; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - vput(fvp); - } else - vrele(fvp); + fip->i_effnlink--; + fip->i_nlink--; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_revert_link(tdp, fip); + goto unlockout; + +releout: + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp) + vrele(tvp); + if (mp) + vfs_unbusy(mp); + return (error); } @@ -1664,8 +1753,7 @@ ufs_mkdir(ap) ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); - if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); @@ -1681,8 +1769,8 @@ ufs_mkdir(ap) DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); + softdep_setup_mkdir(dp, ip); + error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); if (error) goto bad; #ifdef MAC @@ -1791,7 +1879,7 @@ ufs_mkdir(ap) else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); + error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0); bad: if (error == 0) { @@ -1807,8 +1895,6 @@ bad: dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. @@ -1818,7 +1904,8 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_mkdir(dp, ip); + vput(tvp); } out: @@ -1854,10 +1941,13 @@ ufs_rmdir(ap) * tries to remove a locally mounted on directory). 
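 * IN_RENAME is retired: the reworked rename keeps the vnodes it
 * operates on locked throughout, so the i_effnlink checks below
 * stand on their own.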
*/ error = 0; - if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) { + if (ip->i_effnlink < 2) { error = EINVAL; goto out; } + if (dp->i_effnlink < 3) + panic("ufs_dirrem: Bad link count %d on parent", + dp->i_effnlink); if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; @@ -1881,18 +1971,14 @@ ufs_rmdir(ap) */ dp->i_effnlink--; ip->i_effnlink--; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_revert_rmdir(dp, ip); goto out; } cache_purge(dvp); @@ -2401,6 +2487,9 @@ ufs_makeinode(mode, dvp, vpp, cnp) if ((mode & IFMT) == 0) mode |= IFREG; + if (VTOI(dvp)->i_effnlink < 2) + panic("ufs_makeinode: Bad link count %d on parent", + VTOI(dvp)->i_effnlink); error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); @@ -2530,7 +2619,7 @@ ufs_makeinode(mode, dvp, vpp, cnp) ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) { ip->i_mode &= ~ISGID; @@ -2594,7 +2683,7 @@ ufs_makeinode(mode, dvp, vpp, cnp) } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); + error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0); if (error) goto bad; *vpp = tvp; @@ -2610,7 +2699,7 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_create(VTOI(dvp), ip); vput(tvp); return (error); } Index: sys/ufs/ufs/ufsmount.h =================================================================== --- sys/ufs/ufs/ufsmount.h (revision 202342) +++ sys/ufs/ufs/ufsmount.h (working copy) @@ -57,7 +57,11 @@ struct ucred; struct uio; struct vnode; struct ufs_extattr_per_mount; +struct jblocks; +struct inodedep; +TAILQ_HEAD(inodedeplst, inodedep); + /* This structure describes the UFS specific mount structure data. 
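 * The softdep_journal* fields, softdep_jblocks and softdep_unlinked
 * added below carry the per-mount SUJ state: a second work queue for
 * journal records, the journal block map, and the list of inodes
 * awaiting unlinked-list processing.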
*/ struct ufsmount { struct mount *um_mountp; /* filesystem vfs structure */ @@ -75,6 +79,11 @@ struct ufsmount { long um_numindirdeps; /* outstanding indirdeps */ struct workhead softdep_workitem_pending; /* softdep work queue */ struct worklist *softdep_worklist_tail; /* Tail pointer for above */ + struct workhead softdep_journal_pending; /* journal work queue */ + struct worklist *softdep_journal_tail; /* Tail pointer for above */ + struct jblocks *softdep_jblocks; /* Journal block information */ + struct inodedeplst softdep_unlinked; /* Unlinked inodes */ + int softdep_on_journal; /* Items on the journal list */ int softdep_on_worklist; /* Items on the worklist */ int softdep_on_worklist_inprogress; /* Busy items on worklist */ int softdep_deps; /* Total dependency count */ Index: sys/ufs/ufs/ufs_lookup.c =================================================================== --- sys/ufs/ufs/ufs_lookup.c (revision 202342) +++ sys/ufs/ufs/ufs_lookup.c (working copy) @@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, /* true if old FS format...*/ #define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) -static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *, - ino_t *); - static int ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred, struct thread *td) @@ -189,11 +186,11 @@ ufs_lookup(ap) } */ *ap; { - return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); + return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } -static int -ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, +int +ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ @@ -524,6 +521,8 @@ notfound: return (ENOENT); found: + if (dd_ino != NULL) + *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* @@ -546,11 +545,6 @@ found: if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1); - if (dd_ino != NULL) { - *dd_ino = ino; - return (0); - } - /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. @@ -558,17 +552,6 @@ found: if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); - if ((error = VFS_VGET(vdp->v_mount, ino, - LK_EXCLUSIVE, &tdp)) != 0) - return (error); - - error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); - if (error) { - vput(tdp); - return (error); - } - - /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there @@ -585,6 +568,16 @@ found: dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; + if (dd_ino != NULL) + return (0); + if ((error = VFS_VGET(vdp->v_mount, ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); + if (error) { + vput(tdp); + return (error); + } if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; @@ -616,6 +609,8 @@ found: dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); + if (dd_ino != NULL) + return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); @@ -650,6 +645,8 @@ found: cnp->cn_flags |= SAVENAME; return (0); } + if (dd_ino != NULL) + return (0); /* * Step through the translation in the name. We do not `vput' the @@ -681,7 +678,7 @@ found: * to the inode we looked up before vdp lock was * dropped. 
*/ - error = ufs_lookup_(pdp, NULL, cnp, &ino1); + error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); @@ -825,12 +822,13 @@ ufs_makedirentry(ip, cnp, newdirp) * soft dependency code). */ int -ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) +ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename) struct vnode *dvp; struct vnode *tvp; struct direct *dirp; struct componentname *cnp; struct buf *newdirbp; + int isrename; { struct ucred *cr; struct thread *td; @@ -903,22 +901,28 @@ int blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, dp->i_offset, - dirp->d_ino, newdirbp, 1) == 0) { - bdwrite(bp); + dirp->d_ino, newdirbp, 1)) + dp->i_flag |= IN_NEEDSYNC; + if (newdirbp) + bdwrite(newdirbp); + bdwrite(bp); + if ((dp->i_flag & IN_NEEDSYNC) == 0) return (UFS_UPDATE(dvp, 0)); - } - /* We have just allocated a directory block in an - * indirect block. Rather than tracking when it gets - * claimed by the inode, we simply do a VOP_FSYNC - * now to ensure that it is there (in case the user - * does a future fsync). Note that we have to unlock - * the inode for the entry that we just entered, as - * the VOP_FSYNC may need to lock other inodes which - * can lead to deadlock if we also hold a lock on - * the newly entered node. + /* + * We have just allocated a directory block in an + * indirect block. We must prevent holes in the + * directory created if directory entries are + * written out of order. To accomplish this we + * fsync when we extend a directory into indirects. + * During rename it's not safe to drop the tvp lock + * so sync must be delayed until it is. + * + * This synchronous step could be removed if fsck and + * the kernel were taught to fill in sparse + * directories rather than panic. */ - if ((error = bwrite(bp))) - return (error); + if (isrename) + return (0); if (tvp != NULL) VOP_UNLOCK(tvp, 0); error = VOP_FSYNC(dvp, MNT_WAIT, td); @@ -1007,7 +1011,7 @@ int dp->i_offset + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) - softdep_change_directoryentry_offset(dp, dirbuf, + softdep_change_directoryentry_offset(bp, dp, dirbuf, (caddr_t)nep, (caddr_t)ep, dsize); else bcopy((caddr_t)nep, (caddr_t)ep, dsize); @@ -1059,6 +1063,8 @@ int (void) softdep_setup_directory_add(bp, dp, dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); + if (newdirbp != NULL) + bdwrite(newdirbp); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { @@ -1076,7 +1082,8 @@ int * lock other inodes which can lead to deadlock if we also hold a * lock on the newly entered node. */ - if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) { + if (isrename == 0 && error == 0 && + dp->i_endoff && dp->i_endoff < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp, 0); #ifdef UFS_DIRHASH @@ -1117,6 +1124,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir) dp = VTOI(dvp); + /* + * Adjust the link count early so softdep can block if necessary. + */ + if (ip) { + ip->i_effnlink--; + if (DOINGSOFTDEP(dvp)) { + softdep_setup_unlink(dp, ip); + } else { + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + } + } if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to WINO. @@ -1146,6 +1166,9 @@ ufs_dirremove(dvp, ip, flags, isrmdir) if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, rep, dp->i_offset); #endif + if (ip && rep->d_ino != ip->i_number) + panic("ufs_dirremove: ip %d does not match dirent ino %d\n", + ip->i_number, rep->d_ino); if (dp->i_count == 0) { /* * First entry in block: set d_ino to zero. 
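
Taken together, the ufs_dirremove() hunks above and the "out:" hunk that
follows reorder the function so that every link count change, and the
softdep notification that may block, happens before the directory buffer
is locked and edited. A minimal sketch of the resulting shape; the name
dirremove_sketch() and its reduced argument list are inventions for
exposition, not the committed function:

	static int
	dirremove_sketch(struct vnode *dvp, struct inode *ip)
	{
		struct inode *dp = VTOI(dvp);
		struct direct *ep;
		struct buf *bp;
		int error;

		/*
		 * 1) Drop the effective link first; softdep may sleep
		 * and journal the unlink here, before any buffer lock
		 * is held.
		 */
		ip->i_effnlink--;
		if (DOINGSOFTDEP(dvp)) {
			softdep_setup_unlink(dp, ip);
		} else {
			ip->i_nlink--;
			DIP_SET(ip, i_nlink, ip->i_nlink);
			ip->i_flag |= IN_CHANGE;
		}
		/* 2) Only then fetch and edit the directory block. */
		error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset,
		    (char **)&ep, &bp);
		if (error)
			return (error);
		/*
		 * ... zero ep->d_ino or extend the previous entry's
		 * d_reclen, then softdep_setup_remove() and bdwrite()
		 * or bwrite() as in the hunk that follows ...
		 */
		return (0);
	}
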
@@ -1164,31 +1187,20 @@ ufs_dirremove(dvp, ip, flags, isrmdir) dp->i_offset & ~(DIRBLKSIZ - 1)); #endif out: + error = 0; if (DOINGSOFTDEP(dvp)) { - if (ip) { - ip->i_effnlink--; - softdep_change_linkcnt(ip); + if (ip) softdep_setup_remove(bp, dp, ip, isrmdir); - } - if (softdep_slowdown(dvp)) { + if (softdep_slowdown(dvp)) error = bwrite(bp); - } else { + else bdwrite(bp); - error = 0; - } } else { - if (ip) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - } if (flags & DOWHITEOUT) error = bwrite(bp); - else if (DOINGASYNC(dvp) && dp->i_count != 0) { + else if (DOINGASYNC(dvp) && dp->i_count != 0) bdwrite(bp); - error = 0; - } else + else error = bwrite(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; @@ -1221,6 +1233,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) struct vnode *vdp = ITOV(dp); int error; + /* + * Drop the link before we lock the buf so softdep can block if + * necessary. + */ + oip->i_effnlink--; + if (DOINGSOFTDEP(vdp)) { + softdep_setup_unlink(dp, oip); + } else { + oip->i_nlink--; + DIP_SET(oip, i_nlink, oip->i_nlink); + oip->i_flag |= IN_CHANGE; + } + error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); if (error) return (error); @@ -1232,15 +1257,10 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; - oip->i_effnlink--; if (DOINGSOFTDEP(vdp)) { - softdep_change_linkcnt(oip); softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { - oip->i_nlink--; - DIP_SET(oip, i_nlink, oip->i_nlink); - oip->i_flag |= IN_CHANGE; if (DOINGASYNC(vdp)) { bdwrite(bp); error = 0; @@ -1355,25 +1375,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cre /* * Check if source directory is in the path of the target directory. - * Target is supplied locked, source is unlocked. - * The target is always vput before returning. */ int -ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred) +ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { - struct vnode *vp, *vp1; + struct mount *mp; + struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; - vp = ITOV(target); - if (target->i_number == source_ino) { - error = EEXIST; - goto out; - } + vp = tvp = ITOV(target); + mp = vp->v_mount; + *wait_ino = 0; + if (target->i_number == source_ino) + return (EEXIST); + if (target->i_number == parent_ino) + return (0); + if (target->i_number == ROOTINO) + return (0); error = 0; - if (target->i_number == ROOTINO) - goto out; - for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) @@ -1384,9 +1404,13 @@ int } if (dd_ino == ROOTINO) break; - error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1); - if (error != 0) + if (dd_ino == parent_ino) break; + error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1); + if (error != 0) { + *wait_ino = dd_ino; + break; + } /* Recheck that ".." still points to vp1 after relock of vp */ error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) { @@ -1398,14 +1422,14 @@ int vput(vp1); continue; } - vput(vp); + if (vp != tvp) + vput(vp); vp = vp1; } -out: if (error == ENOTDIR) - printf("checkpath: .. not a directory\n"); - if (vp != NULL) + panic("checkpath: .. 
not a directory\n"); + if (vp != tvp) vput(vp); return (error); } Index: sys/ufs/ufs/ufs_extern.h =================================================================== --- sys/ufs/ufs/ufs_extern.h (revision 202342) +++ sys/ufs/ufs/ufs_extern.h (working copy) @@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *); int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, struct buf *, int *, int *); int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **); -int ufs_checkpath(ino_t, struct inode *, struct ucred *); +int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *); void ufs_dirbad(struct inode *, doff_t, char *); int ufs_dirbadentry(struct vnode *, struct direct *, int); int ufs_dirempty(struct inode *, ino_t, struct ucred *); @@ -66,9 +66,11 @@ int ufs_extwrite(struct vop_write_args *); void ufs_makedirentry(struct inode *, struct componentname *, struct direct *); int ufs_direnter(struct vnode *, struct vnode *, struct direct *, - struct componentname *, struct buf *); + struct componentname *, struct buf *, int); int ufs_dirremove(struct vnode *, struct inode *, int, int); int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); +int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *, + ino_t *); int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); int ufs_inactive(struct vop_inactive_args *); int ufs_init(struct vfsconf *); @@ -81,19 +83,33 @@ vfs_root_t ufs_root; int ufs_uninit(struct vfsconf *); int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **); +#include +SYSCTL_DECL(_vfs_ufs); + /* * Soft update function prototypes. */ int softdep_setup_directory_add(struct buf *, struct inode *, off_t, ino_t, struct buf *, int); -void softdep_change_directoryentry_offset(struct inode *, caddr_t, - caddr_t, caddr_t, int); +void softdep_change_directoryentry_offset(struct buf *, struct inode *, + caddr_t, caddr_t, caddr_t, int); void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int); void softdep_setup_directory_change(struct buf *, struct inode *, struct inode *, ino_t, int); void softdep_change_linkcnt(struct inode *); void softdep_releasefile(struct inode *); int softdep_slowdown(struct vnode *); +void softdep_setup_create(struct inode *, struct inode *); +void softdep_setup_dotdot_link(struct inode *, struct inode *); +void softdep_setup_link(struct inode *, struct inode *); +void softdep_setup_mkdir(struct inode *, struct inode *); +void softdep_setup_rmdir(struct inode *, struct inode *); +void softdep_setup_unlink(struct inode *, struct inode *); +void softdep_revert_create(struct inode *, struct inode *); +void softdep_revert_dotdot_link(struct inode *, struct inode *); +void softdep_revert_link(struct inode *, struct inode *); +void softdep_revert_mkdir(struct inode *, struct inode *); +void softdep_revert_rmdir(struct inode *, struct inode *); /* * Flags to low-level allocation routines. 
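
The reworked ufs_checkpath() above never sleeps on a vnode lock while the caller's locks are held: it probes each ".." ancestor with LK_SHARED | LK_NOWAIT and, on contention, reports the blocking inode through *wait_ino so the rename code can back out, wait on that inode, and retry. A minimal userspace sketch of that trylock-and-report shape, with pthread mutexes standing in for vnode locks (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

#define NNODES 8

/* Toy tree: parent[i] is the parent directory of i; node 0 is the root. */
static int parent[NNODES] = { 0, 0, 1, 2, 0, 4, 5, 6 };
static pthread_mutex_t nodelock[NNODES];

/*
 * Decide whether 'source' is an ancestor of 'target' without ever
 * sleeping on a node lock.  The caller holds target's lock (like tvp
 * in rename).  On contention we record the blocker in *wait_node and
 * return -1 so the caller can unlock everything, wait, and retry --
 * the shape of the LK_NOWAIT/wait_ino handoff in ufs_checkpath().
 */
static int
checkpath(int source, int target, int *wait_node)
{
	int node, up, ret;

	*wait_node = -1;
	ret = 0;
	for (node = target; node != 0; node = up) {
		up = parent[node];
		if (up == source) {		/* source is an ancestor */
			ret = 1;
			break;
		}
		if (up == 0)
			break;
		if (pthread_mutex_trylock(&nodelock[up]) != 0) {
			*wait_node = up;	/* caller must wait on this */
			ret = -1;
			break;
		}
		if (node != target)
			pthread_mutex_unlock(&nodelock[node]);
	}
	if (node != target && node != 0)
		pthread_mutex_unlock(&nodelock[node]);
	return (ret);
}

int
main(void)
{
	int i, wait_node;

	for (i = 0; i < NNODES; i++)
		pthread_mutex_init(&nodelock[i], NULL);
	pthread_mutex_lock(&nodelock[7]);	/* caller holds the target */
	/* 7's ancestry is 6 -> 5 -> 4 -> 0, so 4 is an ancestor. */
	printf("source 4, target 7: %d\n", checkpath(4, 7, &wait_node));
	printf("source 3, target 7: %d\n", checkpath(3, 7, &wait_node));
	pthread_mutex_unlock(&nodelock[7]);
	return (0);
}
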
The low 16-bits are reserved Index: sys/ufs/ffs/ffs_vfsops.c =================================================================== --- sys/ufs/ffs/ffs_vfsops.c (revision 202342) +++ sys/ufs/ffs/ffs_vfsops.c (working copy) @@ -79,7 +79,6 @@ static int ffs_reload(struct mount *, struct threa static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); -static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; @@ -331,6 +330,7 @@ ffs_mount(struct mount *mp) MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); + fs->fs_mtime = time_second; fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); @@ -898,6 +898,7 @@ ffs_mountfs(devvp, mp, td) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); + mp->mnt_stat.f_iosize = fs->fs_bsize; if( mp->mnt_flag & MNT_ROOTFS) { /* @@ -909,6 +910,7 @@ ffs_mountfs(devvp, mp, td) } if (ronly == 0) { + fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); @@ -939,7 +941,6 @@ ffs_mountfs(devvp, mp, td) * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ - mp->mnt_stat.f_iosize = fs->fs_bsize; (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ @@ -1039,7 +1040,7 @@ ffs_oldfscompat_read(fs, ump, sblockloc) * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ -static void +void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; @@ -1134,6 +1135,7 @@ ffs_unmount(mp, mntflags) fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); + softdep_unmount(mp); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); @@ -1575,16 +1577,6 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags) DIP_SET(ip, i_gen, ip->i_gen); } } - /* - * Ensure that uid and gid are correct. This is a temporary - * fix until fsck has been changed to do the update. - */ - if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ - fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ - ip->i_uid = ip->i_din1->di_ouid; /* XXX */ - ip->i_gid = ip->i_din1->di_ogid; /* XXX */ - } /* XXX */ - #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* @@ -1728,6 +1720,8 @@ ffs_sbupdate(mp, waitfor, suspended) } fs->fs_fmod = 0; fs->fs_time = time_second; + if (fs->fs_flags & FS_DOSOFTDEP) + softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (suspended) @@ -1869,9 +1863,6 @@ ffs_bufwrite(struct buf *bp) } BO_UNLOCK(bp->b_bufobj); - /* Mark the buffer clean */ - bundirty(bp); - /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the @@ -1912,9 +1903,16 @@ ffs_bufwrite(struct buf *bp) newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES - /* move over the dependencies */ - if (!LIST_EMPTY(&bp->b_dep)) - softdep_move_dependencies(bp, newbp); + /* + * Move over the dependencies. If there are rollbacks, + * leave the parent buffer dirtied as it will need to + * be written again. 
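
Per the comment above, the rewritten ffs_bufwrite() (completed in the next hunk) calls bundirty() on the original buffer only when softdep_move_dependencies() reports that no rollbacks were left behind; otherwise the parent stays dirty because its rolled-back contents must be written again. A small sketch of that conditional-clean background write (hypothetical structures, not the buffer cache API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buffer {
	unsigned char data[512];
	int dirty;
	int rollbacks;	/* dependency fixups still applied to 'data' */
};

/*
 * Write 'bp' in the background: snapshot it, queue the copy for I/O,
 * and mark the original clean only when no rollbacks remain --
 * mirroring the ffs_bufwrite() change, where bundirty() is skipped
 * if softdep_move_dependencies() found rollback state.
 */
static void
background_write(struct buffer *bp)
{
	struct buffer *copy;

	copy = malloc(sizeof(*copy));
	memcpy(copy->data, bp->data, sizeof(copy->data));
	if (bp->rollbacks == 0)
		bp->dirty = 0;	/* safe: the copy is fully up to date */
	/* else: leave bp dirty, it must be written again later. */
	printf("queued copy for I/O, parent %s\n",
	    bp->dirty ? "still dirty" : "clean");
	free(copy);		/* stands in for the actual I/O */
}

int
main(void)
{
	struct buffer b = { .dirty = 1, .rollbacks = 1 };

	background_write(&b);
	b.rollbacks = 0;
	background_write(&b);
	return (0);
}
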
+ */ + if (LIST_EMPTY(&bp->b_dep) || + softdep_move_dependencies(bp, newbp) == 0) + bundirty(bp); +#else + bundirty(bp); #endif /* @@ -1927,8 +1925,11 @@ ffs_bufwrite(struct buf *bp) */ bqrelse(bp); bp = newbp; - } + } else + /* Mark the buffer clean */ + bundirty(bp); + /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); Index: sys/ufs/ffs/ffs_softdep.c =================================================================== --- sys/ufs/ffs/ffs_softdep.c (revision 202342) +++ sys/ufs/ffs/ffs_softdep.c (working copy) @@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$"); #ifndef DEBUG #define DEBUG #endif +#define SUJ_DEBUG #include #include @@ -62,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -130,10 +132,12 @@ softdep_setup_inomapdep(bp, ip, newinum) } void -softdep_setup_blkmapdep(bp, mp, newblkno) +softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; + int frags; + int oldfrags; { panic("softdep_setup_blkmapdep called"); @@ -403,31 +407,13 @@ softdep_get_depcounts(struct mount *mp, * These definitions need to be adapted to the system to which * this file is being ported. */ -/* - * malloc types defined for the softdep system. - */ -static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); -static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); -static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); -static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); -static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); -static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); -static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); -static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); -static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); -static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); -static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); -static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); -static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); -static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block"); -static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes"); #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) #define D_PAGEDEP 0 #define D_INODEDEP 1 -#define D_NEWBLK 2 -#define D_BMSAFEMAP 3 +#define D_BMSAFEMAP 2 +#define D_NEWBLK 3 #define D_ALLOCDIRECT 4 #define D_INDIRDEP 5 #define D_ALLOCINDIR 6 @@ -438,8 +424,68 @@ softdep_get_depcounts(struct mount *mp, #define D_MKDIR 11 #define D_DIRREM 12 #define D_NEWDIRBLK 13 -#define D_LAST D_NEWDIRBLK +#define D_FREEWORK 14 +#define D_FREEDEP 15 +#define D_JADDREF 16 +#define D_JREMREF 17 +#define D_JMVREF 18 +#define D_JNEWBLK 19 +#define D_JFREEBLK 20 +#define D_JFREEFRAG 21 +#define D_JSEG 22 +#define D_JSEGDEP 23 +#define D_SBDEP 24 +#define D_JTRUNC 25 +#define D_LAST D_JTRUNC +unsigned long dep_current[D_LAST + 1]; +unsigned long dep_total[D_LAST + 1]; + + +SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, + "total dependencies allocated"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, + "current dependencies allocated"); + +#define SOFTDEP_TYPE(type, str, long) \ + static MALLOC_DEFINE(M_ ## type, #str, long); \ + 
SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ + &dep_total[D_ ## type], 0, ""); \ + SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ + &dep_current[D_ ## type], 0, ""); + +SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); +SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); +SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, + "Block or frag allocated from cyl group map"); +SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); +SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); +SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); +SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); +SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); +SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); +SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); +SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); +SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); +SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); +SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); +SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); +SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); +SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); +SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); +SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); +SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); +SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); +SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); +SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); +SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); +SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); +SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); + +static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); +static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); + /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX @@ -447,8 +493,8 @@ softdep_get_depcounts(struct mount *mp, static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, + M_BMSAFEMAP, M_NEWBLK, - M_BMSAFEMAP, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, @@ -458,7 +504,19 @@ static struct malloc_type *memtype[] = { M_DIRADD, M_MKDIR, M_DIRREM, - M_NEWDIRBLK + M_NEWDIRBLK, + M_FREEWORK, + M_FREEDEP, + M_JADDREF, + M_JREMREF, + M_JMVREF, + M_JNEWBLK, + M_JFREEBLK, + M_JFREEFRAG, + M_JSEG, + M_JSEGDEP, + M_SBDEP, + M_JTRUNC }; #define DtoM(type) (memtype[type]) @@ -467,17 +525,21 @@ static struct malloc_type *memtype[] = { * Names of malloc types. */ #define TYPENAME(type) \ - ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") + ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ +#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) +#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) + /* * Forward declarations. */ struct inodedep_hashhead; struct newblk_hashhead; struct pagedep_hashhead; +struct bmsafemap_hashhead; /* * Internal function prototypes. 
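
For reference, the first invocation above expands, after preprocessing and modulo whitespace, to one malloc type plus two read-only sysctls, so every dependency type automatically gains debug.softdep.total.* and debug.softdep.current.* counters:

SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");

/* becomes: */
static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
SYSCTL_LONG(_debug_softdep_total, OID_AUTO, pagedep, CTLFLAG_RD,
    &dep_total[D_PAGEDEP], 0, "");
SYSCTL_LONG(_debug_softdep_current, OID_AUTO, pagedep, CTLFLAG_RD,
    &dep_current[D_PAGEDEP], 0, "");
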
@@ -487,59 +549,171 @@ static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct mtx *, int); static void clear_remove(struct thread *); static void clear_inodedeps(struct thread *); +static void unlinked_inodedep(struct mount *, struct inodedep *); +static void clear_unlinked_inodedep(struct inodedep *); +static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); +static void free_pagedep(struct pagedep *); +static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int handle_written_filepage(struct pagedep *, struct buf *); +static int handle_written_sbdep(struct sbdep *, struct buf *); +static void initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); +static int handle_written_indirdep(struct indirdep *, struct buf *, + struct buf**); static int handle_written_inodeblock(struct inodedep *, struct buf *); -static void handle_allocdirect_partdone(struct allocdirect *); +static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); +static void handle_written_jaddref(struct jaddref *, struct jseg *); +static void handle_written_jremref(struct jremref *, struct jseg *); +static void handle_written_jseg(struct jseg *, struct buf *); +static void handle_written_jnewblk(struct jnewblk *, struct jseg *); +static void handle_written_jfreeblk(struct jfreeblk *, struct jseg *); +static void handle_written_jfreefrag(struct jfreefrag *, struct jseg *); +static void complete_jseg(struct jseg *); +static void jseg_write(struct fs *, struct jblocks *, struct jseg *, + uint8_t *); +static void jaddref_write(struct jaddref *, uint8_t *); +static void jremref_write(struct jremref *, uint8_t *); +static void jmvref_write(struct jmvref *, uint8_t *); +static void jtrunc_write(struct jtrunc *, uint8_t *); +static void jnewblk_write(struct jnewblk *, uint8_t *); +static void jfreeblk_write(struct jfreeblk *, uint8_t *); +static void jfreefrag_write(struct jfreefrag *, uint8_t *); +static inline void inoref_write(struct inoref *, struct jrefrec *); +static void handle_allocdirect_partdone(struct allocdirect *, + struct workhead *); +static void cancel_newblk(struct newblk *, struct workhead *); +static void indirdep_complete(struct indirdep *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); +static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); +static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static void handle_workitem_remove(struct dirrem *, struct vnode *); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); -static void free_diradd(struct diradd *); -static void free_allocindir(struct allocindir *, struct inodedep *); +static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *, + struct freeblks *); +static void free_indirdep(struct indirdep *); +static void free_diradd(struct diradd *, struct workhead *); +static void 
merge_diradd(struct inodedep *, struct diradd *); +static void complete_diradd(struct diradd *); +static struct diradd *diradd_lookup(struct pagedep *, int); +static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, + struct jremref *, struct jremref *); +static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, + struct jremref *); +static void cancel_allocindir(struct allocindir *, struct inodedep *, + struct freeblks *); +static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); -static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t, - ufs2_daddr_t *); -static void deallocate_dependencies(struct buf *, struct inodedep *); -static void free_allocdirect(struct allocdirectlst *, - struct allocdirect *, int); +static void free_jremref(struct jremref *); +static void free_jaddref(struct jaddref *); +static void free_jsegdep(struct jsegdep *); +static void free_jseg(struct jseg *); +static void free_jnewblk(struct jnewblk *); +static void free_jfreeblk(struct jfreeblk *); +static void free_jfreefrag(struct jfreefrag *); +static void free_freedep(struct freedep *); +static void journal_jremref(struct dirrem *, struct jremref *, + struct inodedep *); +static void cancel_jnewblk(struct jnewblk *, struct workhead *); +static int cancel_jaddref(struct jaddref *, struct inodedep *, + struct workhead *); +static void cancel_jfreefrag(struct jfreefrag *); +static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); +static int deallocate_dependencies(struct buf *, struct inodedep *, + struct freeblks *); +static void free_newblk(struct newblk *); +static void cancel_allocdirect(struct allocdirectlst *, + struct allocdirect *, struct freeblks *, int); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); +static void freework_freeblock(struct freework *); static void handle_workitem_freeblocks(struct freeblks *, int); +static void handle_complete_freeblocks(struct freeblks *); +static void handle_workitem_indirblk(struct freework *); +static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static void setup_allocindir_phase2(struct buf *, struct inode *, - struct allocindir *); + struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, - ufs2_daddr_t); + ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); -static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long); +static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, + ufs_lbn_t); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); -static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *); -static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t, - struct newblk **); -static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **); +static struct freefrag *allocindir_merge(struct allocindir *, + struct allocindir *); +static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, + struct bmsafemap **); +static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, + int cg); +static int 
newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, + int, struct newblk **); +static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); -static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); +static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int, + struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct mount *mp, int, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int process_worklist_item(struct mount *, int); -static void add_to_worklist(struct worklist *); +static void process_removes(struct vnode *); +static void jwork_move(struct workhead *, struct workhead *); +static void add_to_worklist(struct worklist *, int); +static void remove_from_worklist(struct worklist *); static void softdep_flush(void); static int softdep_speedup(void); +static void worklist_speedup(void); +static int journal_mount(struct mount *, struct fs *, struct ucred *); +static void journal_unmount(struct mount *); +static int journal_space(struct ufsmount *, int); +static void journal_suspend(struct ufsmount *); +static void softdep_prelink(struct vnode *, struct vnode *); +static void add_to_journal(struct worklist *); +static void remove_from_journal(struct worklist *); +static void softdep_process_journal(struct mount *, int); +static struct jremref *newjremref(struct dirrem *, struct inode *, + struct inode *ip, off_t, nlink_t); +static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, + uint16_t); +static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, + uint16_t); +static inline struct jsegdep *inoref_segattach(struct inoref *, struct jseg *); +static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); +static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, + ufs2_daddr_t, int); +static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, + ufs2_daddr_t, long, ufs_lbn_t); +static struct freework *newfreework(struct freeblks *, struct freework *, + ufs_lbn_t, ufs2_daddr_t, int, int); +static void jwait(struct worklist *wk); +static struct inodedep *inodedep_lookup_ip(struct inode *); +static int bmsafemap_rollbacks(struct bmsafemap *); +static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); +static void handle_jwork(struct workhead *); +static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, + struct mkdir **); +static struct jblocks *jblocks_create(void); +static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); +static void jblocks_free(struct jblocks *, struct mount *, int); +static void jblocks_destroy(struct jblocks *); +static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); /* * Exported softdep operations. 
@@ -572,40 +746,128 @@ MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) +#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT +#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE + #else /* DEBUG */ -static void worklist_insert(struct workhead *, struct worklist *); -static void worklist_remove(struct worklist *); +static void worklist_insert(struct workhead *, struct worklist *, int); +static void worklist_remove(struct worklist *, int); -#define WORKLIST_INSERT(head, item) worklist_insert(head, item) -#define WORKLIST_REMOVE(item) worklist_remove(item) +#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) +#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) +#define WORKLIST_REMOVE(item) worklist_remove(item, 1) +#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) static void -worklist_insert(head, item) +worklist_insert(head, item, locked) struct workhead *head; struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if (item->wk_state & ONWORKLIST) - panic("worklist_insert: already on list"); + panic("worklist_insert: %p %s(0x%X) already on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void -worklist_remove(item) +worklist_remove(item, locked) struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if ((item->wk_state & ONWORKLIST) == 0) - panic("worklist_remove: not on list"); + panic("worklist_remove: %p %s(0x%X) not on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* + * Merge two jsegdeps keeping only the oldest one as newer references + * can't be discarded until after older references. + */ +static inline struct jsegdep * +jsegdep_merge(struct jsegdep *one, struct jsegdep *two) +{ + struct jsegdep *swp; + + if (two == NULL) + return (one); + + if (one->jd_seg->js_seq > two->jd_seg->js_seq) { + swp = one; + one = two; + two = swp; + } + WORKLIST_REMOVE(&two->jd_list); + free_jsegdep(two); + + return (one); +} + +/* + * If two freedeps are compatible free one to reduce list size. + */ +static inline struct freedep * +freedep_merge(struct freedep *one, struct freedep *two) +{ + if (two == NULL) + return (one); + + if (one->fd_freework == two->fd_freework) { + WORKLIST_REMOVE(&two->fd_list); + free_freedep(two); + } + return (one); +} + +/* + * Move journal work from one list to another. Duplicate freedeps and + * jsegdeps are coalesced to keep the lists as small as possible. 
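
jsegdep_merge() above keeps whichever reference names the oldest journal segment: segments are reclaimed strictly in sequence order, so holding the oldest reference covers both. A standalone sketch of that keep-the-minimum-sequence merge (hypothetical struct, not the kernel types):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct segref { uint64_t seq; };

/*
 * Keep whichever reference pins the older journal segment and free
 * the other; since segments are retired oldest-first, pinning the
 * oldest is sufficient.  Same shape as jsegdep_merge() above.
 */
static struct segref *
segref_merge(struct segref *one, struct segref *two)
{
	struct segref *swp;

	if (two == NULL)
		return (one);
	if (one->seq > two->seq) {
		swp = one;
		one = two;
		two = swp;
	}
	free(two);
	return (one);
}

int
main(void)
{
	struct segref *a = malloc(sizeof(*a));
	struct segref *b = malloc(sizeof(*b));

	a->seq = 42;
	b->seq = 7;
	a = segref_merge(a, b);		/* keeps seq 7 */
	printf("kept seq %ju\n", (uintmax_t)a->seq);
	free(a);
	return (0);
}
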
+ */ +static void +jwork_move(dst, src) + struct workhead *dst; + struct workhead *src; +{ + struct freedep *freedep; + struct jsegdep *jsegdep; + struct worklist *wkn; + struct worklist *wk; + + KASSERT(dst != src, + ("jwork_move: dst == src")); + freedep = NULL; + jsegdep = NULL; + LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { + if (wk->wk_type == D_JSEGDEP) + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } + + mtx_assert(&lk, MA_OWNED); + while ((wk = LIST_FIRST(src)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(dst, wk); + if (wk->wk_type == D_JSEGDEP) { + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + continue; + } + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } +} + +/* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); @@ -623,13 +885,16 @@ workitem_free(item, type) #ifdef DEBUG if (item->wk_state & ONWORKLIST) - panic("workitem_free: still on list"); + panic("workitem_free: %s(0x%X) still on list", + TYPENAME(item->wk_type), item->wk_state); if (item->wk_type != type) - panic("workitem_free: type mismatch"); + panic("workitem_free: type mismatch %s != %s", + TYPENAME(item->wk_type), TYPENAME(type)); #endif ump = VFSTOUFS(item->wk_mp); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); + dep_current[type]--; free(item, DtoM(type)); } @@ -643,6 +908,8 @@ workitem_alloc(item, type, mp) item->wk_mp = mp; item->wk_state = 0; ACQUIRE_LOCK(&lk); + dep_current[type]++; + dep_total[type]++; VFSTOUFS(mp)->softdep_deps++; VFSTOUFS(mp)->softdep_accdeps++; FREE_LOCK(&lk); @@ -679,23 +946,38 @@ static int stat_inode_bitmap; /* bufs redirtied as static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ -SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); -/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */ +SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, + &max_softdeps, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, + &tickdelay, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, + &maxindirdeps, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, + &stat_worklist_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, + 
&stat_blk_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, + &stat_ino_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, + &stat_blk_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, + &stat_ino_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, + &stat_sync_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, + &stat_indir_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, + &stat_inode_bitmap, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, + &stat_direct_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, + &stat_dir_entry, 0, ""); SYSCTL_DECL(_vfs_ffs); +LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; +static u_long bmsafemap_hash; /* size of hash table - 1 */ + static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); @@ -770,16 +1052,22 @@ softdep_flush(void) } } -static int -softdep_speedup(void) +static void +worklist_speedup(void) { - mtx_assert(&lk, MA_OWNED); if (req_pending == 0) { req_pending = 1; wakeup(&req_pending); } +} +static int +softdep_speedup(void) +{ + + worklist_speedup(); + bd_speedup(); return speedup_syncer(); } @@ -791,15 +1079,17 @@ softdep_flush(void) * and does so in order from first to last. */ static void -add_to_worklist(wk) +add_to_worklist(wk, nodelay) struct worklist *wk; + int nodelay; { struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); ump = VFSTOUFS(wk->wk_mp); if (wk->wk_state & ONWORKLIST) - panic("add_to_worklist: already on list"); + panic("add_to_worklist: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (LIST_EMPTY(&ump->softdep_workitem_pending)) LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); @@ -807,9 +1097,33 @@ static void LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; ump->softdep_on_worklist += 1; + if (nodelay) + worklist_speedup(); } /* + * Remove the item to be processed. If we are removing the last + * item on the list, we need to recalculate the tail pointer. + */ +static void +remove_from_worklist(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + struct worklist *wkend; + + ump = VFSTOUFS(wk->wk_mp); + WORKLIST_REMOVE(wk); + if (wk == ump->softdep_worklist_tail) { + LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) + if (LIST_NEXT(wkend, wk_list) == NULL) + break; + ump->softdep_worklist_tail = wkend; + } + ump->softdep_on_worklist -= 1; +} + +/* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they @@ -838,8 +1152,9 @@ softdep_process_worklist(mp, full) ACQUIRE_LOCK(&lk); loopcount = 1; starttime = time_second; + softdep_process_journal(mp, full?MNT_WAIT:0); while (ump->softdep_on_worklist > 0) { - if ((cnt = process_worklist_item(mp, 0)) == -1) + if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1) break; else matchcnt += cnt; @@ -871,16 +1186,61 @@ softdep_process_worklist(mp, full) * second. Otherwise the other mountpoints may get * excessively backlogged. 
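
add_to_worklist() and remove_from_worklist() above emulate a TAILQ with a plain LIST plus a cached tail pointer: appends are O(1) through the tail, and only the rare removal of the tail itself walks the list to recover it. A compilable userspace sketch of the same queue discipline:

#include <sys/queue.h>
#include <stdio.h>

struct item {
	LIST_ENTRY(item) link;
	int id;
};
LIST_HEAD(itemhd, item);

static struct itemhd pending = LIST_HEAD_INITIALIZER(pending);
static struct item *tail;	/* emulated TAILQ tail pointer */

/* FIFO append on a LIST: O(1) via the cached tail, as in add_to_worklist(). */
static void
enqueue(struct item *it)
{

	if (LIST_EMPTY(&pending))
		LIST_INSERT_HEAD(&pending, it, link);
	else
		LIST_INSERT_AFTER(tail, it, link);
	tail = it;
}

/* Removing the tail restores it by walking, as in remove_from_worklist(). */
static void
dequeue(struct item *it)
{
	struct item *end;

	LIST_REMOVE(it, link);
	if (it == tail) {
		LIST_FOREACH(end, &pending, link)
			if (LIST_NEXT(end, link) == NULL)
				break;
		tail = end;	/* NULL when the list is now empty */
	}
}

int
main(void)
{
	struct item a = { .id = 1 }, b = { .id = 2 };

	enqueue(&a);
	enqueue(&b);
	dequeue(&b);			/* tail falls back to a */
	printf("tail id %d\n", tail->id);
	dequeue(&a);
	return (0);
}
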
*/ - if (!full && starttime != time_second) { - matchcnt = -1; + if (!full && starttime != time_second) break; - } } FREE_LOCK(&lk); return (matchcnt); } /* + * Process all removes associated with a vnode if we are running out of + * journal space. Any other process which attempts to flush these will + * be unable to do so as we have the vnodes locked. + */ +static void +process_removes(vp) + struct vnode *vp; +{ + struct inodedep *inodedep; + struct dirrem *dirrem; + struct mount *mp; + ino_t inum; + + mtx_assert(&lk, MA_OWNED); + + mp = vp->v_mount; + inum = VTOI(vp)->i_number; + for (;;) { + if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) + return; + LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) + if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == + (COMPLETE | ONWORKLIST)) + break; + if (dirrem == NULL) + return; + /* + * If another thread is trying to lock this vnode it will + * fail but we must wait for it to do so before we can + * proceed. + */ + if (dirrem->dm_state & INPROGRESS) { + dirrem->dm_state |= IOWAITING; + msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0); + continue; + } + remove_from_worklist(&dirrem->dm_list); + FREE_LOCK(&lk); + if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) + panic("process_removes: suspended filesystem"); + handle_workitem_remove(dirrem, vp); + vn_finished_secondary_write(mp); + ACQUIRE_LOCK(&lk); + } +} + +/* * Process one item on the worklist. */ static int @@ -888,7 +1248,7 @@ process_worklist_item(mp, flags) struct mount *mp; int flags; { - struct worklist *wk, *wkend; + struct worklist *wk, *wkXXX; struct ufsmount *ump; struct vnode *vp; int matchcnt = 0; @@ -908,11 +1268,14 @@ * inodes, we have to skip over any dirrem requests whose * vnodes are resident and locked. */ + vp = NULL; ump = VFSTOUFS(mp); - vp = NULL; LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { - if (wk->wk_state & INPROGRESS) + if (wk->wk_state & INPROGRESS) { + wkXXX = wk; continue; + } + wkXXX = wk; /* Record the last valid wk pointer. */ if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) break; wk->wk_state |= INPROGRESS; @@ -921,6 +1284,10 @@ ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum, LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); ACQUIRE_LOCK(&lk); + if (wk->wk_state & IOWAITING) { + wk->wk_state &= ~IOWAITING; + wakeup(wk); + } wk->wk_state &= ~INPROGRESS; ump->softdep_on_worklist_inprogress--; if (vp != NULL) @@ -928,21 +1295,7 @@ } if (wk == 0) return (-1); - /* - * Remove the item to be processed. If we are removing the last - * item on the list, we need to recalculate the tail pointer. - * As this happens rarely and usually when the list is short, - * we just run down the list to find it rather than tracking it - * in the above loop.
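
process_removes() and process_worklist_item() above synchronize through the INPROGRESS/IOWAITING pair: a thread that finds the work item busy sets IOWAITING and sleeps; the owner wakes any waiters when it finishes and clears INPROGRESS. A sketch of that handshake, with a condition variable standing in for the kernel's msleep()/wakeup() channel (hypothetical, simplified to one item):

#include <pthread.h>
#include <stdio.h>

#define INPROGRESS	0x1
#define IOWAITING	0x2

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t donecv = PTHREAD_COND_INITIALIZER;
static int state;

/* Waiter side: mirrors the msleep() loop in process_removes(). */
static void
wait_if_busy(void)
{

	pthread_mutex_lock(&lk);
	while (state & INPROGRESS) {
		state |= IOWAITING;
		pthread_cond_wait(&donecv, &lk);
	}
	/* Safe to take over the work item here. */
	pthread_mutex_unlock(&lk);
}

/* Owner side: mirrors the wakeup() in process_worklist_item(). */
static void *
worker(void *arg)
{

	pthread_mutex_lock(&lk);
	/* ... process the item ... */
	if (state & IOWAITING)
		pthread_cond_broadcast(&donecv);
	state &= ~(INPROGRESS | IOWAITING);
	pthread_mutex_unlock(&lk);
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	state = INPROGRESS;
	pthread_create(&td, NULL, worker, NULL);
	wait_if_busy();
	pthread_join(td, NULL);
	printf("item processed and claimed\n");
	return (0);
}
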
- */ - WORKLIST_REMOVE(wk); - if (wk == ump->softdep_worklist_tail) { - LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) - if (LIST_NEXT(wkend, wk_list) == NULL) - break; - ump->softdep_worklist_tail = wkend; - } - ump->softdep_on_worklist -= 1; + remove_from_worklist(wk); FREE_LOCK(&lk); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); @@ -952,6 +1305,8 @@ process_worklist_item(mp, flags) case D_DIRREM: /* removal of a directory entry */ handle_workitem_remove(WK_DIRREM(wk), vp); + if (vp) + vput(vp); break; case D_FREEBLKS: @@ -969,6 +1324,11 @@ process_worklist_item(mp, flags) handle_workitem_freefile(WK_FREEFILE(wk)); break; + case D_FREEWORK: + /* Final block in an indirect was freed. */ + handle_workitem_indirblk(WK_FREEWORK(wk)); + break; + default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); @@ -982,19 +1342,22 @@ process_worklist_item(mp, flags) /* * Move dependencies from one buffer to another. */ -void +int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; + int dirty; - if (!LIST_EMPTY(&newbp->b_dep)) - panic("softdep_move_dependencies: need merge code"); - wktail = 0; + dirty = 0; + wktail = NULL; ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); + if (wk->wk_type == D_BMSAFEMAP && + bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) + dirty = 1; if (wktail == 0) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else @@ -1002,6 +1365,8 @@ softdep_move_dependencies(oldbp, newbp) wktail = wk; } FREE_LOCK(&lk); + + return (dirty); } /* @@ -1198,23 +1563,22 @@ pagedep_find(pagedephd, ino, lbn, mp, flags, paged * This routine must be called with splbio interrupts blocked. 
*/ static int -pagedep_lookup(ip, lbn, flags, pagedeppp) - struct inode *ip; +pagedep_lookup(mp, ino, lbn, flags, pagedeppp) + struct mount *mp; + ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; - struct mount *mp; int ret; int i; mtx_assert(&lk, MA_OWNED); - mp = ITOV(ip)->v_mount; - pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); + pagedephd = PAGEDEP_HASH(mp, ino, lbn); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp || (flags & DEPALLOC) == 0) return (ret); FREE_LOCK(&lk); @@ -1222,12 +1586,12 @@ static int M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(&lk); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp) { WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } - pagedep->pd_ino = ip->i_number; + pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); @@ -1314,10 +1678,13 @@ inodedep_lookup(mp, inum, flags, inodedeppp) inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; - inodedep->id_buf = NULL; + inodedep->id_bmsafemap = NULL; + inodedep->id_mkdiradd = NULL; + LIST_INIT(&inodedep->id_dirremhd); LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); + TAILQ_INIT(&inodedep->id_inoreflst); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); @@ -1336,17 +1703,29 @@ u_long newblk_hash; /* size of hash table - 1 */ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) static int -newblk_find(newblkhd, fs, newblkno, newblkpp) +newblk_find(newblkhd, mp, newblkno, flags, newblkpp) struct newblk_hashhead *newblkhd; - struct fs *fs; + struct mount *mp; ufs2_daddr_t newblkno; + int flags; struct newblk **newblkpp; { struct newblk *newblk; - LIST_FOREACH(newblk, newblkhd, nb_hash) - if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) - break; + LIST_FOREACH(newblk, newblkhd, nb_hash) { + if (newblkno != newblk->nb_newblkno) + continue; + if (mp != newblk->nb_list.wk_mp) + continue; + /* + * If we're creating a new dependency don't match those that + * have already been converted to allocdirects. This is for + * a frag extend. + */ + if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) + continue; + break; + } if (newblk) { *newblkpp = newblk; return (1); @@ -1361,8 +1740,8 @@ static int * Found or allocated entry is returned in newblkpp. 
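
pagedep_lookup() above, and newblk_lookup() just below, share a lookup-or-allocate discipline: the softdep lock is dropped around the sleeping malloc(), so the hash must be searched again after relocking and the fresh allocation discarded if another thread won the race. A userspace sketch of that pattern:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int key; };

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static struct node *
find(int key)
{
	struct node *n;

	for (n = head; n != NULL; n = n->next)
		if (n->key == key)
			return (n);
	return (NULL);
}

/*
 * Lookup with optional allocation.  The lock is dropped around
 * calloc() (which may block), so after reacquiring it we search
 * again and free our allocation if another thread inserted the
 * entry first -- the race handling in newblk_lookup() and
 * pagedep_lookup() above.
 */
static struct node *
lookup(int key, int alloc)
{
	struct node *n, *new;

	pthread_mutex_lock(&lk);
	if ((n = find(key)) != NULL || alloc == 0) {
		pthread_mutex_unlock(&lk);
		return (n);
	}
	pthread_mutex_unlock(&lk);
	new = calloc(1, sizeof(*new));	/* may block; lock not held */
	new->key = key;
	pthread_mutex_lock(&lk);
	if ((n = find(key)) != NULL) {	/* lost the race */
		pthread_mutex_unlock(&lk);
		free(new);
		return (n);
	}
	new->next = head;
	head = new;
	pthread_mutex_unlock(&lk);
	return (new);
}

int
main(void)
{

	printf("inserted key %d\n", lookup(7, 1)->key);
	printf("found key %d\n", lookup(7, 1)->key);
	return (0);
}
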
*/ static int -newblk_lookup(fs, newblkno, flags, newblkpp) - struct fs *fs; +newblk_lookup(mp, newblkno, flags, newblkpp) + struct mount *mp; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; @@ -1370,21 +1749,25 @@ static int struct newblk *newblk; struct newblk_hashhead *newblkhd; - newblkhd = NEWBLK_HASH(fs, newblkno); - if (newblk_find(newblkhd, fs, newblkno, newblkpp)) + newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); + if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(&lk); - newblk = malloc(sizeof(struct newblk), - M_NEWBLK, M_SOFTDEP_FLAGS); + newblk = malloc(sizeof(union allblk), M_NEWBLK, + M_SOFTDEP_FLAGS | M_ZERO); + workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); ACQUIRE_LOCK(&lk); - if (newblk_find(newblkhd, fs, newblkno, newblkpp)) { - free(newblk, M_NEWBLK); + if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { + WORKITEM_FREE(newblk, D_NEWBLK); return (1); } - newblk->nb_state = 0; - newblk->nb_fs = fs; + newblk->nb_freefrag = NULL; + LIST_INIT(&newblk->nb_indirdeps); + LIST_INIT(&newblk->nb_newdirblk); + LIST_INIT(&newblk->nb_jwork); + newblk->nb_state = ATTACHED; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; @@ -1401,10 +1784,10 @@ softdep_initialize() LIST_INIT(&mkdirlisthd); max_softdeps = desiredvnodes * 4; - pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, - &pagedep_hash); + pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); - newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); + newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); + bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; @@ -1428,6 +1811,7 @@ softdep_uninitialize() hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); + hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); } /* @@ -1457,9 +1841,16 @@ softdep_mount(devvp, mp, fs, cred) MNT_IUNLOCK(mp); ump = VFSTOUFS(mp); LIST_INIT(&ump->softdep_workitem_pending); + LIST_INIT(&ump->softdep_journal_pending); + TAILQ_INIT(&ump->softdep_unlinked); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; + if ((fs->fs_flags & FS_SUJ) && + (error = journal_mount(mp, fs, cred)) != 0) { + printf("Failed to start journal: %d\n", error); + return (error); + } /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation @@ -1493,7 +1884,1953 @@ softdep_mount(devvp, mp, fs, cred) return (0); } +void +softdep_unmount(mp) + struct mount *mp; +{ + + if (mp->mnt_flag & MNT_SUJ) + journal_unmount(mp); +} + +struct jblocks { + struct jseglst jb_segs; /* TAILQ of current segments. */ + struct jseg *jb_writeseg; /* Next write to complete. */ + struct jextent *jb_extent; /* Extent array. */ + uint64_t jb_nextseq; /* Next sequence number. */ + uint64_t jb_oldestseq; /* Oldest active sequence number. */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ + int jb_blocks; /* Total disk blocks covered. */ + int jb_free; /* Total disk blocks free. */ + int jb_min; /* Minimum free space. */ + int jb_low; /* Low on space. 
*/ + int jb_age; /* Insertion time of oldest rec. */ + int jb_suspended; /* Did journal suspend writes? */ +}; + +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + + jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); + TAILQ_INIT(&jblocks->jb_segs); + jblocks->jb_avail = 10; + jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + + return (jblocks); +} + +static ufs2_daddr_t +jblocks_alloc(jblocks, bytes, actual) + struct jblocks *jblocks; + int bytes; + int *actual; +{ + ufs2_daddr_t daddr; + struct jextent *jext; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + jblocks->jb_head = 0; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + jblocks->jb_off += freecnt; + jblocks->jb_free -= freecnt; + + return (daddr); +} + +static void +jblocks_free(jblocks, mp, bytes) + struct jblocks *jblocks; + struct mount *mp; + int bytes; +{ + + jblocks->jb_free += bytes / DEV_BSIZE; + if (jblocks->jb_suspended) + worklist_speedup(); + wakeup(jblocks); +} + +static void +jblocks_destroy(jblocks) + struct jblocks *jblocks; +{ + + if (jblocks->jb_extent) + free(jblocks->jb_extent, M_JBLOCKS); + free(jblocks, M_JBLOCKS); +} + +static void +jblocks_add(jblocks, daddr, blocks) + struct jblocks *jblocks; + ufs2_daddr_t daddr; + int blocks; +{ + struct jextent *jext; + + jblocks->jb_blocks += blocks; + jblocks->jb_free += blocks; + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + memcpy(jext, jblocks->jb_extent, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent, M_JBLOCKS); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; +} + /* + * Open and verify the journal file. + */ +static int +journal_mount(mp, fs, cred) + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + struct jblocks *jblocks; + struct vnode *vp; + struct inode *ip; + ufs2_daddr_t blkno; + int bcount; + int error; + int i; + + mp->mnt_flag |= MNT_SUJ; + error = VFS_VGET(mp, fs->fs_sujournal, LK_EXCLUSIVE, &vp); + if (error) + return (error); + ip = VTOI(vp); + if (ip->i_size < SUJ_MIN) { + error = ENOSPC; + goto out; + } + bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ + jblocks = jblocks_create(); + for (i = 0; i < bcount; i++) { + error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); + if (error) + break; + jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); + } + if (error) { + jblocks_destroy(jblocks); + goto out; + } + jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. 
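
jblocks_add() above keeps the journal's disk layout as an extent array: a run that starts exactly where the last extent ends is merged into it, and anything else starts a new extent (doubling the array when it fills). A tiny worked example of the contiguity test:

#include <stdint.h>
#include <stdio.h>

struct extent { int64_t daddr; int blocks; };

/*
 * The merge test used by jblocks_add(): a run starting at 'daddr'
 * extends the extent iff it begins exactly where the extent ends.
 */
static int
extends(struct extent *je, int64_t daddr)
{

	return (je->daddr + je->blocks == daddr);
}

int
main(void)
{
	struct extent e = { 100, 8 };

	/* Adding blocks 108..115 coalesces into one extent {100, 16}. */
	if (extends(&e, 108))
		e.blocks += 8;
	printf("extent at %jd covers %d blocks\n", (intmax_t)e.daddr,
	    e.blocks);
	/* A run at 200 is not contiguous and would start a new extent. */
	printf("contiguous at 200? %d\n", extends(&e, 200));
	return (0);
}
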
*/ + jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ + DIP_SET(ip, i_modrev, fs->fs_mtime); + ip->i_flags |= IN_MODIFIED; + ffs_update(vp, 1); + VFSTOUFS(mp)->softdep_jblocks = jblocks; +out: + vput(vp); + return (error); +} + +static void +journal_unmount(mp) + struct mount *mp; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + if (ump->softdep_jblocks) + jblocks_destroy(ump->softdep_jblocks); + ump->softdep_jblocks = NULL; +} + +/* + * Called when a journal record is ready to be written. Space is allocated + * and the journal entry is created when the journal is flushed to stable + * store. + */ +static void +add_to_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + ump = VFSTOUFS(wk->wk_mp); + if (wk->wk_state & ONWORKLIST) + panic("add_to_journal: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); + wk->wk_state |= ONWORKLIST | DEPCOMPLETE; + if (LIST_EMPTY(&ump->softdep_journal_pending)) { + ump->softdep_jblocks->jb_age = ticks; + LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); + } else + LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); + ump->softdep_journal_tail = wk; + ump->softdep_on_journal += 1; +} + +/* + * Remove an arbitrary item from the journal worklist while maintaining + * the tail pointer. This happens when a new operation obviates the need + * to journal an old operation. + */ +static void +remove_from_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + ump = VFSTOUFS(wk->wk_mp); +#ifdef DEBUG /* XXX Expensive, temporary. */ + { + struct worklist *wkn; + + LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) + if (wkn == wk) + break; + if (wkn == NULL) + panic("remove_from_journal: %p is not in journal", wk); + } +#endif + /* + * We emulate a TAILQ to save space in most structures which do not + * require TAILQ semantics. Here we must update the tail pointer + * when removing the current tail so that it points at the previous + * entry. + */ + if (ump->softdep_journal_tail == wk) + ump->softdep_journal_tail = + (struct worklist *)wk->wk_list.le_prev; + + WORKLIST_REMOVE(wk); + ump->softdep_on_journal -= 1; +} + +static int +journal_space(ump, thresh) + struct ufsmount *ump; + int thresh; +{ + struct jblocks *jblocks; + int avail; + + jblocks = ump->softdep_jblocks; + avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; + avail = jblocks->jb_free - avail; + + return (avail > thresh); +} + +static void +journal_suspend(ump) + struct ufsmount *ump; +{ + struct jblocks *jblocks; + struct mount *mp; + + mp = UFSTOVFS(ump); + jblocks = ump->softdep_jblocks; + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { + mp->mnt_kern_flag |= MNTK_SUSPEND; + mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); + } + jblocks->jb_suspended = 1; + MNT_IUNLOCK(mp); +} + +/* + * Called before any allocation function to be certain that there is + * sufficient space in the journal prior to creating any new records. + * Since in the case of block allocation we may have multiple locked + * buffers at the time of the actual allocation we cannot block + * when the journal records are created. Doing so would create a deadlock + * if any of these buffers needed to be flushed to reclaim space. Instead + * we require a sufficiently large amount of available space such that + * each thread in the system could have passed this allocation check and + * still have sufficient free space. With 20% of a minimum journal size + * of 1MB we have 6553 records available.
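
The (struct worklist *)wk->wk_list.le_prev cast in remove_from_journal() leans on two layout guarantees: a LIST entry's le_prev points at the previous element's le_next field, and wk_list is the first member of struct worklist, so that field's address is the element's address. (When the removed tail is also the first element the cast yields the list head instead, but the list is then empty and add_to_journal() re-seeds the tail through its LIST_EMPTY() branch before the stale value is ever used.) A compilable demonstration:

#include <sys/queue.h>
#include <stdio.h>

struct worklist {
	LIST_ENTRY(worklist) wk_list;	/* must be the first member */
	int wk_id;
};
LIST_HEAD(workhead, worklist);

int
main(void)
{
	struct workhead head = LIST_HEAD_INITIALIZER(head);
	struct worklist a = { .wk_id = 1 }, b = { .wk_id = 2 };
	struct worklist *prev;

	LIST_INSERT_HEAD(&head, &a, wk_list);
	LIST_INSERT_AFTER(&a, &b, wk_list);
	/*
	 * b's le_prev points at a's le_next field; because the entry
	 * sits at offset zero, the cast recovers 'a' itself.
	 */
	prev = (struct worklist *)b.wk_list.le_prev;
	printf("previous element id: %d\n", prev->wk_id);	/* prints 1 */
	return (0);
}
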
+ */ +int +softdep_prealloc(vp, waitok) + struct vnode *vp; + int waitok; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + + if (DOINGSUJ(vp) == 0) + return (0); + ump = VFSTOUFS(vp->v_mount); + jblocks = ump->softdep_jblocks; + ACQUIRE_LOCK(&lk); + if (journal_space(ump, jblocks->jb_low)) { + FREE_LOCK(&lk); + return (0); + } + FREE_LOCK(&lk); + if (waitok == MNT_NOWAIT) + return (ENOSPC); + /* + * Attempt to sync this vnode once to flush any journal + * work attached to it. + */ + ffs_syncvnode(vp, waitok); + ACQUIRE_LOCK(&lk); + process_removes(vp); + if (journal_space(ump, jblocks->jb_low) == 0) { + softdep_speedup(); + if (journal_space(ump, jblocks->jb_min) == 0) + journal_suspend(ump); + } + FREE_LOCK(&lk); + + return (0); +} + +static void +softdep_prelink(dvp, vp) + struct vnode *dvp; + struct vnode *vp; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + + ump = VFSTOUFS(dvp->v_mount); + jblocks = ump->softdep_jblocks; + mtx_assert(&lk, MA_OWNED); + if (journal_space(ump, jblocks->jb_low)) + return; + FREE_LOCK(&lk); + if (vp) + ffs_syncvnode(vp, MNT_NOWAIT); + ffs_syncvnode(dvp, MNT_WAIT); + ACQUIRE_LOCK(&lk); + /* Process vp before dvp as it may create .. removes. */ + if (vp) + process_removes(vp); + process_removes(dvp); + softdep_speedup(); + process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); + process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); + if (journal_space(ump, jblocks->jb_low) == 0) { + softdep_speedup(); + if (journal_space(ump, jblocks->jb_min) == 0) + journal_suspend(ump); + } +} + +static void +jseg_write(fs, jblocks, jseg, data) + struct fs *fs; + struct jblocks *jblocks; + struct jseg *jseg; + uint8_t *data; +{ + struct jsegrec *rec; + + rec = (struct jsegrec *)data; + rec->jsr_seq = jseg->js_seq; + rec->jsr_oldest = jblocks->jb_oldestseq; + rec->jsr_cnt = jseg->js_cnt; + rec->jsr_crc = 0; + rec->jsr_time = fs->fs_mtime; +} + +static inline void +inoref_write(inoref, rec) + struct inoref *inoref; + struct jrefrec *rec; +{ + rec->jr_ino = inoref->if_ino; + rec->jr_parent = inoref->if_parent; + rec->jr_nlink = inoref->if_nlink; + rec->jr_mode = inoref->if_mode; + rec->jr_diroff = inoref->if_diroff; +} + +static void +jaddref_write(jaddref, data) + struct jaddref *jaddref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_ADDREF; + inoref_write(&jaddref->ja_ref, rec); +} + +static void +jremref_write(jremref, data) + struct jremref *jremref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_REMREF; + inoref_write(&jremref->jr_ref, rec); +} + +static void +jmvref_write(jmvref, data) + struct jmvref *jmvref; + uint8_t *data; +{ + struct jmvrec *rec; + + rec = (struct jmvrec *)data; + rec->jm_op = JOP_MVREF; + rec->jm_ino = jmvref->jm_ino; + rec->jm_parent = jmvref->jm_parent; + rec->jm_oldoff = jmvref->jm_oldoff; + rec->jm_newoff = jmvref->jm_newoff; +} + +static void +jnewblk_write(jnewblk, data) + struct jnewblk *jnewblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_NEWBLK; + rec->jb_ino = jnewblk->jn_ino; + rec->jb_blkno = jnewblk->jn_blkno; + rec->jb_lbn = jnewblk->jn_lbn; + rec->jb_frags = jnewblk->jn_frags; + rec->jb_oldfrags = jnewblk->jn_oldfrags; +} + +static void +jfreeblk_write(jfreeblk, data) + struct jfreeblk *jfreeblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreeblk->jf_ino; + rec->jb_blkno = jfreeblk->jf_blkno; + 
	rec->jb_lbn = jfreeblk->jf_lbn;
+	rec->jb_frags = jfreeblk->jf_frags;
+	rec->jb_oldfrags = 0;
+}
+
+static void
+jfreefrag_write(jfreefrag, data)
+	struct jfreefrag *jfreefrag;
+	uint8_t *data;
+{
+	struct jblkrec *rec;
+
+	rec = (struct jblkrec *)data;
+	rec->jb_op = JOP_FREEBLK;
+	rec->jb_ino = jfreefrag->fr_ino;
+	rec->jb_blkno = jfreefrag->fr_blkno;
+	rec->jb_lbn = jfreefrag->fr_lbn;
+	rec->jb_frags = jfreefrag->fr_frags;
+	rec->jb_oldfrags = 0;
+}
+
+static void
+jtrunc_write(jtrunc, data)
+	struct jtrunc *jtrunc;
+	uint8_t *data;
+{
+	struct jtrncrec *rec;
+
+	rec = (struct jtrncrec *)data;
+	rec->jt_op = JOP_TRUNC;
+	rec->jt_ino = jtrunc->jt_ino;
+	rec->jt_size = jtrunc->jt_size;
+	rec->jt_extsize = jtrunc->jt_extsize;
+}
+
+/*
+ * Flush some journal records to disk.
+ */
+static void
+softdep_process_journal(mp, flags)
+	struct mount *mp;
+	int flags;
+{
+	struct jblocks *jblocks;
+	struct ufsmount *ump;
+	struct worklist *wk;
+	struct jseg *jseg;
+	struct buf *bp;
+	uint8_t *data;
+	struct fs *fs;
+	int segwritten;
+	int jrecmin;	/* Minimum write size. */
+	int jrecmax;	/* Maximum write size. */
+	int size;
+	int cnt;
+
+	if ((mp->mnt_flag & MNT_SUJ) == 0)
+		return;
+	ump = VFSTOUFS(mp);
+	fs = ump->um_fs;
+	jblocks = ump->softdep_jblocks;
+	/*
+	 * We write anywhere between a disk block and an fs block.  The upper
+	 * bound is picked to prevent buffer cache fragmentation and limit
+	 * processing time per I/O.
+	 */
+	jrecmax = fs->fs_bsize / JREC_SIZE;
+	jrecmin = DEV_BSIZE / JREC_SIZE;
+	segwritten = 0;
+	while ((cnt = ump->softdep_on_journal) != 0) {
+		/*
+		 * Create a new segment to hold as many as 'cnt' journal
+		 * entries and add them to the segment.  Notice cnt is
+		 * off by one to account for the space required by the
+		 * jsegrec.  If we don't have a full block to log skip it
+		 * unless we haven't written anything in 10 seconds.
+		 */
+		cnt++;
+		if (cnt < jrecmax) {
+			if (segwritten)
+				return;
+			if (flags != MNT_WAIT &&
+			    (ticks - jblocks->jb_age) < hz*10)
+				break;
+		}
+		/*
+		 * Verify some free journal space.  softdep_prealloc() should
+		 * guarantee that we don't run out so this is indicative of
+		 * a problem with the flow control.  Try to recover
+		 * gracefully in any event.
+		 */
+		while (jblocks->jb_free == 0) {
+			if (flags != MNT_WAIT)
+				break;
+			printf("softdep: Out of journal space!\n");
+			softdep_speedup();
+			msleep(jblocks, &lk, PRIBIO, "jblocks", 1);
+		}
+		FREE_LOCK(&lk);
+		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
+		workitem_alloc(&jseg->js_list, D_JSEG, mp);
+		LIST_INIT(&jseg->js_entries);
+		jseg->js_state = ATTACHED;
+		jseg->js_refs = 1;	/* Self reference. */
+		jseg->js_jblocks = jblocks;
+		size = roundup2(cnt * JREC_SIZE, DEV_BSIZE);
+		bp = geteblk(fs->fs_bsize, 0);
+		ACQUIRE_LOCK(&lk);
+		/*
+		 * If there was a race while we were allocating the block
+		 * and jseg, the entry we care about was likely written.
+		 * We bail out in both the WAIT and NOWAIT case and assume
+		 * the caller will loop if the entry it cares about is
+		 * not written.
+		 */
+		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
+			bp->b_flags |= B_INVAL | B_NOCACHE;
+			WORKITEM_FREE(jseg, D_JSEG);
+			FREE_LOCK(&lk);
+			brelse(bp);
+			ACQUIRE_LOCK(&lk);
+			break;
+		}
+		/*
+		 * Calculate the disk block size required for the available
+		 * records rounded to the min size.
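+		 * For example, with 16K fs blocks and 32-byte records
+		 * jrecmin is 16 and jrecmax is 512: 100 pending records
+		 * round up to 112 (seven disk blocks) while 511 or more
+		 * are capped at a single fs block.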
+ */ + cnt = ump->softdep_on_journal + 1; + if (cnt < jrecmax) + cnt = roundup2(cnt, jrecmin); + else + cnt = jrecmax; + size = cnt * JREC_SIZE; + /* + * Allocate a disk block for this journal data and account + * for truncation of the requested size if enough contiguous + * space was not available. + */ + bp->b_blkno = bp->b_lblkno = jblocks_alloc(jblocks, size, + &size); + bp->b_offset = bp->b_blkno * DEV_BSIZE; + bp->b_bcount = size; + bp->b_bufobj = &ump->um_devvp->v_bufobj; + bp->b_flags &= ~B_INVAL; + /* + * Initialize our jseg with as many as cnt - 1 records. + * Assign the next sequence number to it and link it + * in-order. + */ + cnt = MIN(ump->softdep_on_journal, (size / JREC_SIZE) - 1); + jseg->js_buf = bp; + jseg->js_cnt = cnt; + jseg->js_size = size; + jseg->js_seq = jblocks->jb_nextseq++; + if (TAILQ_EMPTY(&jblocks->jb_segs)) + jblocks->jb_oldestseq = jseg->js_seq; + TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); + if (jblocks->jb_writeseg == NULL) + jblocks->jb_writeseg = jseg; + /* + * Start filling in records from the pending list. + */ + data = bp->b_data; + jseg_write(fs, jblocks, jseg, data); + data += JREC_SIZE; + while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) + != NULL) { + remove_from_journal(wk); + wk->wk_state |= IOSTARTED; + WORKLIST_INSERT(&jseg->js_entries, wk); + switch (wk->wk_type) { + case D_JADDREF: + jaddref_write(WK_JADDREF(wk), data); + break; + case D_JREMREF: + jremref_write(WK_JREMREF(wk), data); + break; + case D_JMVREF: + jmvref_write(WK_JMVREF(wk), data); + break; + case D_JNEWBLK: + jnewblk_write(WK_JNEWBLK(wk), data); + break; + case D_JFREEBLK: + jfreeblk_write(WK_JFREEBLK(wk), data); + break; + case D_JFREEFRAG: + jfreefrag_write(WK_JFREEFRAG(wk), data); + break; + case D_JTRUNC: + jtrunc_write(WK_JTRUNC(wk), data); + break; + default: + panic("process_journal: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + data += JREC_SIZE; + if (--cnt == 0) + break; + } + /* + * Write this one buffer and continue. + */ +#if 1 + WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); + FREE_LOCK(&lk); + BO_LOCK(bp->b_bufobj); + bgetvp(ump->um_devvp, bp); + BO_UNLOCK(bp->b_bufobj); + /* XXX Could bawrite here. */ + bwrite(bp); + ACQUIRE_LOCK(&lk); +#else + /* This case simulates the write but does not log anything. */ + handle_written_jseg(jseg, bp); + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); +#endif + segwritten++; + } + /* + * If we've suspended the filesystem because we ran out of journal + * space either try to sync it here to make some progress or + * unsuspend it if we already have. + */ + if (flags == 0 && jblocks && jblocks->jb_suspended) { + if (journal_space(ump, jblocks->jb_min)) { + FREE_LOCK(&lk); + jblocks->jb_suspended = 0; + mp->mnt_susp_owner = curthread; + vfs_write_resume(mp); + ACQUIRE_LOCK(&lk); + return; + } + FREE_LOCK(&lk); + VFS_SYNC(mp, MNT_NOWAIT); + ffs_sbupdate(ump, MNT_WAIT, 0); + ACQUIRE_LOCK(&lk); + } +} + +/* + * Complete a jseg, allowing all dependencies awaiting journal writes + * to proceed. Each journal dependency also attaches a jsegdep to dependent + * structures so that the journal segment can be freed to reclaim space. 
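+ *
+ * The reference protocol, roughly: a jseg is born with one self
+ * reference in softdep_process_journal(), complete_jseg() takes one
+ * additional reference per entry as it hands out jsegdeps (move
+ * records carry no jsegdep and give theirs back), the self reference
+ * is dropped once the entries are processed, and free_jsegdep()
+ * releases the per-entry references.  Only when the count reaches
+ * zero and no older segment remains is the space reclaimed by
+ * free_jseg().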
+ */
+static void
+complete_jseg(jseg)
+	struct jseg *jseg;
+{
+	struct worklist *wk;
+	struct jmvref *jmvref;
+	int waiting;
+	int i;
+
+	i = 0;
+	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
+		WORKLIST_REMOVE(wk);
+		waiting = wk->wk_state & IOWAITING;
+		wk->wk_state &= ~(IOSTARTED | IOWAITING);
+		wk->wk_state |= COMPLETE;
+		KASSERT(i++ < jseg->js_cnt,
+		    ("handle_written_jseg: overflow %d >= %d",
+		    i, jseg->js_cnt));
+		jseg->js_refs++;	/* Ref goes to the jsegdep below. */
+		switch (wk->wk_type) {
+		case D_JADDREF:
+			handle_written_jaddref(WK_JADDREF(wk), jseg);
+			break;
+		case D_JREMREF:
+			handle_written_jremref(WK_JREMREF(wk), jseg);
+			break;
+		case D_JMVREF:
+			jseg->js_refs--;	/* No jsegdep here. */
+			jmvref = WK_JMVREF(wk);
+			LIST_REMOVE(jmvref, jm_deps);
+			free_pagedep(jmvref->jm_pagedep);
+			WORKITEM_FREE(jmvref, D_JMVREF);
+			break;
+		case D_JNEWBLK:
+			handle_written_jnewblk(WK_JNEWBLK(wk), jseg);
+			break;
+		case D_JFREEBLK:
+			handle_written_jfreeblk(WK_JFREEBLK(wk), jseg);
+			break;
+		case D_JFREEFRAG:
+			handle_written_jfreefrag(WK_JFREEFRAG(wk), jseg);
+			break;
+		case D_JTRUNC:
+			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
+			WORKITEM_FREE(wk, D_JTRUNC);
+			break;
+		default:
+			panic("handle_written_jseg: Unknown type %s",
+			    TYPENAME(wk->wk_type));
+			/* NOTREACHED */
+		}
+		if (waiting)
+			wakeup(wk);
+	}
+	/* Release the self reference so the structure may be freed. */
+	free_jseg(jseg);
+}
+
+/*
+ * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
+ * completions in order only.
+ */
+static void
+handle_written_jseg(jseg, bp)
+	struct jseg *jseg;
+	struct buf *bp;
+{
+	struct jblocks *jblocks;
+	struct jseg *jsegn;
+
+	if (jseg->js_refs == 0)
+		panic("handle_written_jseg: No self-reference on %p", jseg);
+	jseg->js_state |= DEPCOMPLETE;
+	/*
+	 * We'll never need this buffer again, set flags so it will be
+	 * discarded.
+	 */
+	bp->b_flags |= B_INVAL | B_NOCACHE;
+	jblocks = jseg->js_jblocks;
+	/*
+	 * Don't allow out of order completions.  If this isn't the first
+	 * block wait for it to write before we're done.
+	 */
+	if (jseg != jblocks->jb_writeseg)
+		return;
+	/* Iterate through available jsegs processing their entries. */
+	do {
+		jsegn = TAILQ_NEXT(jseg, js_next);
+		complete_jseg(jseg);
+		jseg = jsegn;
+	} while (jseg && jseg->js_state & DEPCOMPLETE);
+	jblocks->jb_writeseg = jseg;
+}
+
+static inline struct jsegdep *
+inoref_segattach(inoref, jseg)
+	struct inoref *inoref;
+	struct jseg *jseg;
+{
+	struct jsegdep *jsegdep;
+
+	jsegdep = inoref->if_jsegdep;
+	inoref->if_jsegdep = NULL;
+	jsegdep->jd_seg = jseg;
+
+	return (jsegdep);
+}
+
+/*
+ * Called once a jremref has made it to stable store.  The jremref is marked
+ * complete and we attempt to free it.  Any pagedep writes sleeping waiting
+ * for the jremref to complete will be awoken by free_jremref.
+ */
+static void
+handle_written_jremref(jremref, jseg)
+	struct jremref *jremref;
+	struct jseg *jseg;
+{
+	struct inodedep *inodedep;
+	struct jsegdep *jsegdep;
+	struct dirrem *dirrem;
+
+	/*
+	 * Attach the jsegdep to the jseg.
+	 */
+	jsegdep = inoref_segattach(&jremref->jr_ref, jseg);
+	/*
+	 * Remove us from the inoref list.
+	 */
+	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
+	    0, &inodedep) == 0)
+		panic("handle_written_jremref: Lost inodedep");
+	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+	/*
+	 * Complete the dirrem.
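+	 * The dirrem may move to the worklist only once its last jremref
+	 * has been written (dm_jremrefhd drained) and the directory write
+	 * itself has completed (COMPLETE set); both conditions are
+	 * checked below.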
+ */ + dirrem = jremref->jr_dirrem; + jremref->jr_dirrem = NULL; + LIST_REMOVE(jremref, jr_deps); + jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; + WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd) && + (dirrem->dm_state & COMPLETE) != 0) + add_to_worklist(&dirrem->dm_list, 0); + free_jremref(jremref); +} + +/* + * Called once a jaddref has made it to stable store. The dependency is + * marked complete and any dependent structures are added to the inode + * bufwait list to be completed as soon as it is written. If a bitmap write + * depends on this entry we move the inode into the inodedephd of the + * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. + */ +static void +handle_written_jaddref(jaddref, jseg) + struct jaddref *jaddref; + struct jseg *jseg; +{ + struct jsegdep *jsegdep; + struct inodedep *inodedep; + struct diradd *diradd; + struct mkdir *mkdir; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = inoref_segattach(&jaddref->ja_ref, jseg); + mkdir = NULL; + diradd = NULL; + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("handle_written_jaddref: Lost inodedep."); + if (jaddref->ja_diradd == NULL) + panic("handle_written_jaddref: No dependency"); + if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { + diradd = jaddref->ja_diradd; + WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); + } else if (jaddref->ja_state & MKDIR_PARENT) { + mkdir = jaddref->ja_mkdir; + WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); + } else if (jaddref->ja_state & MKDIR_BODY) + mkdir = jaddref->ja_mkdir; + else + panic("handle_written_jaddref: Unknown dependency %p", + jaddref->ja_diradd); + jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ + /* + * Remove us from the inode list. + */ + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); + /* + * The mkdir may be waiting on the jaddref to clear before freeing. + */ + if (mkdir) { + KASSERT(mkdir->md_list.wk_type == D_MKDIR, + ("handle_written_jaddref: Incorrect type for mkdir %s", + TYPENAME(mkdir->md_list.wk_type))); + mkdir->md_jaddref = NULL; + diradd = mkdir->md_diradd; + mkdir->md_state |= DEPCOMPLETE; + complete_mkdir(mkdir); + } + WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list); + if (jaddref->ja_state & NEWBLOCK) { + inodedep->id_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, + inodedep, id_deps); + } + free_jaddref(jaddref); +} + +/* + * Called once a jnewblk journal is written. The allocdirect or allocindir + * is placed in the bmsafemap to await notification of a written bitmap. + */ +static void +handle_written_jnewblk(jnewblk, jseg) + struct jnewblk *jnewblk; + struct jseg *jseg; +{ + struct bmsafemap *bmsafemap; + struct jsegdep *jsegdep; + struct newblk *newblk; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = jnewblk->jn_jsegdep; + jnewblk->jn_jsegdep = NULL; + jsegdep->jd_seg = jseg; + /* + * Add the written block to the bmsafemap so it can be notified when + * the bitmap is on disk. 
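+	 *
+	 * The intent of this ordering is that the jnewblk record reaches
+	 * stable storage before the bitmap may show the block in use, so
+	 * recovery can distinguish a half-applied allocation from a stale
+	 * map rather than leaking the block.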
+ */ + newblk = jnewblk->jn_newblk; + jnewblk->jn_newblk = NULL; + if (newblk == NULL) + panic("handle_written_jnewblk: No dependency for the segdep."); + + newblk->nb_jnewblk = NULL; + bmsafemap = newblk->nb_bmsafemap; + WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list); + newblk->nb_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + free_jnewblk(jnewblk); +} + +/* + * Cancel a jfreefrag that won't be needed, probably due to colliding with + * an in-flight allocation that has not yet been committed. Divorce us + * from the freefrag and mark it DEPCOMPLETE so that it may be added + * to the worklist. + */ +static void +cancel_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + struct freefrag *freefrag; + + if (jfreefrag->fr_jsegdep) { + free_jsegdep(jfreefrag->fr_jsegdep); + jfreefrag->fr_jsegdep = NULL; + } + freefrag = jfreefrag->fr_freefrag; + jfreefrag->fr_freefrag = NULL; + freefrag->ff_jfreefrag = NULL; + free_jfreefrag(jfreefrag); + freefrag->ff_state |= DEPCOMPLETE; +} + +/* + * Free a jfreefrag when the parent freefrag is rendered obsolete. + */ +static void +free_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + + if (jfreefrag->fr_state & IOSTARTED) + WORKLIST_REMOVE(&jfreefrag->fr_list); + else if (jfreefrag->fr_state & ONWORKLIST) + remove_from_journal(&jfreefrag->fr_list); + if (jfreefrag->fr_freefrag != NULL) + panic("free_jfreefrag: Still attached to a freefrag."); + WORKITEM_FREE(jfreefrag, D_JFREEFRAG); +} + +/* + * Called when the journal write for a jfreefrag completes. The parent + * freefrag is added to the worklist if this completes its dependencies. + */ +static void +handle_written_jfreefrag(jfreefrag, jseg) + struct jfreefrag *jfreefrag; + struct jseg *jseg; +{ + struct jsegdep *jsegdep; + struct freefrag *freefrag; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = jfreefrag->fr_jsegdep; + jfreefrag->fr_jsegdep = NULL; + jsegdep->jd_seg = jseg; + freefrag = jfreefrag->fr_freefrag; + if (freefrag == NULL) + panic("handle_written_jfreefrag: No freefrag."); + freefrag->ff_state |= DEPCOMPLETE; + freefrag->ff_jfreefrag = NULL; + WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); + jfreefrag->fr_freefrag = NULL; + free_jfreefrag(jfreefrag); +} + +/* + * Called when the journal write for a jfreeblk completes. The jfreeblk + * is removed from the freeblks list of pending journal writes and the + * jsegdep is moved to the freeblks jwork to be completed when all blocks + * have been reclaimed. + */ +static void +handle_written_jfreeblk(jfreeblk, jseg) + struct jfreeblk *jfreeblk; + struct jseg *jseg; +{ + struct freeblks *freeblks; + struct jsegdep *jsegdep; + + /* Attach the jsegdep to the jseg. */ + jsegdep = jfreeblk->jf_jsegdep; + jfreeblk->jf_jsegdep = NULL; + jsegdep->jd_seg = jseg; + freeblks = jfreeblk->jf_freeblks; + LIST_REMOVE(jfreeblk, jf_deps); + WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); + /* + * If the freeblks is all journaled, we can add it to the worklist. + */ + if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) && + (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) { + /* Remove from the b_dep that is waiting on this write. 
 */
+		if (freeblks->fb_state & ONWORKLIST)
+			WORKLIST_REMOVE(&freeblks->fb_list);
+		add_to_worklist(&freeblks->fb_list, 1);
+	}
+
+	free_jfreeblk(jfreeblk);
+}
+
+static struct jsegdep *
+newjsegdep(struct worklist *wk)
+{
+	struct jsegdep *jsegdep;
+
+	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
+	jsegdep->jd_seg = NULL;
+
+	return (jsegdep);
+}
+
+static struct jmvref *
+newjmvref(dp, ino, oldoff, newoff)
+	struct inode *dp;
+	ino_t ino;
+	off_t oldoff;
+	off_t newoff;
+{
+	struct jmvref *jmvref;
+
+	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
+	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
+	jmvref->jm_parent = dp->i_number;
+	jmvref->jm_ino = ino;
+	jmvref->jm_oldoff = oldoff;
+	jmvref->jm_newoff = newoff;
+
+	return (jmvref);
+}
+
+/*
+ * Allocate a new jremref that tracks the removal of ip from dp with the
+ * directory entry offset of diroff.  Mark the entry as ATTACHED and
+ * DEPCOMPLETE as we have all the information required for the journal write
+ * and the directory entry has already been removed from the buffer.  The
+ * caller is responsible for linking the jremref into the pagedep and adding
+ * it to the journal to write.  The MKDIR_PARENT flag is set if we're doing
+ * a DOTDOT addition so handle_workitem_remove() can properly assign
+ * the jsegdep when we're done.
+ */
+static struct jremref *
+newjremref(dirrem, dp, ip, diroff, nlink)
+	struct dirrem *dirrem;
+	struct inode *dp;
+	struct inode *ip;
+	off_t diroff;
+	nlink_t nlink;
+{
+	struct jremref *jremref;
+
+	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
+	jremref->jr_state = ATTACHED;
+	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
+	    nlink, ip->i_mode);
+	jremref->jr_dirrem = dirrem;
+
+	return (jremref);
+}
+
+static inline void
+newinoref(inoref, ino, parent, diroff, nlink, mode)
+	struct inoref *inoref;
+	ino_t ino;
+	ino_t parent;
+	off_t diroff;
+	nlink_t nlink;
+	uint16_t mode;
+{
+
+	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
+	inoref->if_diroff = diroff;
+	inoref->if_ino = ino;
+	inoref->if_parent = parent;
+	inoref->if_nlink = nlink;
+	inoref->if_mode = mode;
+}
+
+/*
+ * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
+ * directory offset may not be known until later.  The caller is responsible
+ * for adding the entry to the journal when this information is available.
+ * nlink should be the link count prior to the addition and mode is only
+ * required to have the correct FMT.
+ */
+static struct jaddref *
+newjaddref(dp, ino, diroff, nlink, mode)
+	struct inode *dp;
+	ino_t ino;
+	off_t diroff;
+	int16_t nlink;
+	uint16_t mode;
+{
+	struct jaddref *jaddref;
+
+	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
+	jaddref->ja_state = ATTACHED;
+	jaddref->ja_mkdir = NULL;
+	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
+
+	return (jaddref);
+}
+
+/*
+ * Create a new free dependency for a freework.  The caller is responsible
+ * for adjusting the reference count when it has the lock held.  The freedep
+ * will track an outstanding bitmap write that will ultimately clear the
+ * freework to continue.
+ */ +static struct freedep * +newfreedep(struct freework *freework) +{ + struct freedep *freedep; + + freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); + freedep->fd_freework = freework; + + return (freedep); +} + +/* + * Free a freedep structure once the buffer it is linked to is written. If + * this is the last reference to the freework schedule it for completion. + */ +static void +free_freedep(freedep) + struct freedep *freedep; +{ + + if (--freedep->fd_freework->fw_ref == 0) + add_to_worklist(&freedep->fd_freework->fw_list, 1); + WORKITEM_FREE(freedep, D_FREEDEP); +} + +/* + * Allocate a new freework structure that may be a level in an indirect + * when parent is not NULL or a top level block when it is. The top level + * freework structures are allocated without lk held and before the freeblks + * is visible outside of softdep_setup_freeblocks(). + */ +static struct freework * +newfreework(freeblks, parent, lbn, nb, frags, journal) + struct freeblks *freeblks; + struct freework *parent; + ufs_lbn_t lbn; + ufs2_daddr_t nb; + int frags; + int journal; +{ + struct freework *freework; + + freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); + workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); + freework->fw_freeblks = freeblks; + freework->fw_parent = parent; + freework->fw_lbn = lbn; + freework->fw_blkno = nb; + freework->fw_frags = frags; + freework->fw_ref = 0; + freework->fw_off = 0; + LIST_INIT(&freework->fw_jwork); + + if (parent == NULL) { + WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd, + &freework->fw_list); + freeblks->fb_ref++; + } + if (journal) + newjfreeblk(freeblks, lbn, nb, frags); + + return (freework); +} + +/* + * Allocate a new jfreeblk to journal top level block pointer when truncating + * a file. The caller must add this to the worklist when lk is held. + */ +static struct jfreeblk * +newjfreeblk(freeblks, lbn, blkno, frags) + struct freeblks *freeblks; + ufs_lbn_t lbn; + ufs2_daddr_t blkno; + int frags; +{ + struct jfreeblk *jfreeblk; + + jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp); + jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list); + jfreeblk->jf_state = ATTACHED | DEPCOMPLETE; + jfreeblk->jf_ino = freeblks->fb_previousinum; + jfreeblk->jf_lbn = lbn; + jfreeblk->jf_blkno = blkno; + jfreeblk->jf_frags = frags; + jfreeblk->jf_freeblks = freeblks; + LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps); + + return (jfreeblk); +} + +static void move_newblock_dep(struct jaddref *, struct inodedep *); +/* + * If we're canceling a new bitmap we have to search for another ref + * to move into the bmsafemap dep. This might be better expressed + * with another structure. 
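+ *
+ * For example, two links taken on a newly allocated inode produce two
+ * jaddrefs on the inodedep's inoref list, but only one carries the
+ * NEWBLOCK dependency that holds up the inode bitmap write.  If that
+ * reference is canceled before the bitmap is written, the dependency
+ * must be handed to a surviving jaddref, which is what the search
+ * below does.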
+ */ +static void +move_newblock_dep(jaddref, inodedep) + struct jaddref *jaddref; + struct inodedep *inodedep; +{ + struct inoref *inoref; + struct jaddref *jaddrefn; + + jaddrefn = NULL; + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) { + if ((jaddref->ja_state & NEWBLOCK) && + inoref->if_list.wk_type == D_JADDREF) { + jaddrefn = (struct jaddref *)inoref; + break; + } + } + if (jaddrefn == NULL) + return; + if (inodedep == NULL) + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("move_newblock_dep: Lost inodedep"); + jaddrefn->ja_state &= ~(ATTACHED | UNDONE); + jaddrefn->ja_state |= jaddref->ja_state & + (ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state |= ATTACHED; + LIST_REMOVE(jaddref, ja_bmdeps); + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, + ja_bmdeps); +} + +/* + * Cancel a jaddref either before it has been written or while it is being + * written. This happens when a link is removed before the add reaches + * the disk. The jaddref dependency is kept linked into the bmsafemap + * and inode to prevent the link count or bitmap from reaching the disk + * until handle_workitem_remove() re-adjusts the counts and bitmaps as + * required. + * + * Returns 1 if the canceled addref requires journaling of the remove and + * 0 otherwise. + */ +static int +cancel_jaddref(jaddref, inodedep, wkhd) + struct jaddref *jaddref; + struct inodedep *inodedep; + struct workhead *wkhd; +{ + struct inoref *inoref; + int needsj; + + KASSERT((jaddref->ja_state & COMPLETE) == 0, + ("cancel_jaddref: Canceling complete jaddref")); + if (jaddref->ja_state & (IOSTARTED | COMPLETE)) + needsj = 1; + else + needsj = 0; + /* + * If we're not journaling this remove we must adjust the nlink of + * any reference operation that follows us so that it is consistent + * with the in-memory reference. + */ + if (needsj == 0) + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) + inoref->if_nlink--; + if (jaddref->ja_ref.if_jsegdep) { + free_jsegdep(jaddref->ja_ref.if_jsegdep); + jaddref->ja_ref.if_jsegdep = NULL; + } + if (jaddref->ja_state & NEWBLOCK) + move_newblock_dep(jaddref, inodedep); + if (jaddref->ja_state & IOWAITING) { + jaddref->ja_state &= ~IOWAITING; + wakeup(&jaddref->ja_list); + } + jaddref->ja_mkdir = NULL; + if (jaddref->ja_state & IOSTARTED) { + jaddref->ja_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jaddref->ja_list); + } else + remove_from_journal(&jaddref->ja_list); + jaddref->ja_state |= GOINGAWAY; + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jaddref->ja_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jaddref->ja_list); + + return (needsj); +} + +/* + * Attempt to free a jaddref structure when some work completes. This + * should only succeed once the entry is written and all dependencies have + * been notified. 
+ */ +static void +free_jaddref(jaddref) + struct jaddref *jaddref; +{ + + if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + if (jaddref->ja_ref.if_jsegdep) + panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", + jaddref, jaddref->ja_state); + if (jaddref->ja_state & NEWBLOCK) + LIST_REMOVE(jaddref, ja_bmdeps); + if (jaddref->ja_state & (IOSTARTED | ONWORKLIST)) + panic("free_jaddref: Bad state %p(0x%X)", + jaddref, jaddref->ja_state); + if (jaddref->ja_mkdir != NULL) + panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); + WORKITEM_FREE(jaddref, D_JADDREF); +} + +/* + * Free a jremref structure once it has been written or discarded. + */ +static void +free_jremref(jremref) + struct jremref *jremref; +{ + + if (jremref->jr_ref.if_jsegdep) + free_jsegdep(jremref->jr_ref.if_jsegdep); + if (jremref->jr_state & IOSTARTED) + panic("free_jremref: IO still pending"); + WORKITEM_FREE(jremref, D_JREMREF); +} + +/* + * Free a jnewblk structure. + */ +static void +free_jnewblk(jnewblk) + struct jnewblk *jnewblk; +{ + + if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(jnewblk, jn_deps); + if (jnewblk->jn_newblk != NULL) + panic("free_jnewblk: Dependency still attached."); + WORKITEM_FREE(jnewblk, D_JNEWBLK); +} + +/* + * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk + * is kept linked into the bmsafemap until the free completes, thus + * preventing the modified state from ever reaching disk. The free + * routine must pass this structure via ffs_blkfree() to + * softdep_setup_freeblks() so there is no race in releasing the space. + */ +static void +cancel_jnewblk(jnewblk, wkhd) + struct jnewblk *jnewblk; + struct workhead *wkhd; +{ + + if (jnewblk->jn_jsegdep) { + free_jsegdep(jnewblk->jn_jsegdep); + jnewblk->jn_jsegdep = NULL; + } + if (jnewblk->jn_state & IOWAITING) { + jnewblk->jn_state &= ~IOWAITING; + wakeup(&jnewblk->jn_list); + } + jnewblk->jn_newblk = NULL; + jnewblk->jn_state |= GOINGAWAY; + if (jnewblk->jn_state & IOSTARTED) { + jnewblk->jn_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jnewblk->jn_list); + } else + remove_from_journal(&jnewblk->jn_list); + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jnewblk->jn_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jnewblk->jn_list); +} + +static void +free_jfreeblk(jfreeblk) + struct jfreeblk *jfreeblk; +{ + + WORKITEM_FREE(jfreeblk, D_JFREEBLK); +} + +/* + * Release one reference to a jseg and free it if the count reaches 0. This + * should eventually reclaim journal space as well. + */ +static void +free_jseg(jseg) + struct jseg *jseg; +{ + struct jblocks *jblocks; + + KASSERT(jseg->js_refs > 0, + ("free_jseg: Invalid refcnt %d", jseg->js_refs)); + if (--jseg->js_refs != 0) + return; + /* + * Free only those jsegs which have none allocated before them to + * preserve the journal space ordering. + */ + jblocks = jseg->js_jblocks; + while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { + jblocks->jb_oldestseq = jseg->js_seq; + if (jseg->js_refs != 0) + break; + TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); + jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); + KASSERT(LIST_EMPTY(&jseg->js_entries), + ("free_jseg: Freed jseg has valid entries.")); + WORKITEM_FREE(jseg, D_JSEG); + } +} + +/* + * Release a jsegdep and decrement the jseg count. 
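+ *
+ * For example, if segments 5 through 7 are outstanding and segment 6
+ * loses its last reference first, nothing is reclaimed; once segment
+ * 5 is released both are freed together and jb_oldestseq advances to
+ * 7, preserving the on-disk ordering of the journal.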
+ */
+static void
+free_jsegdep(jsegdep)
+	struct jsegdep *jsegdep;
+{
+
+	if (jsegdep->jd_seg)
+		free_jseg(jsegdep->jd_seg);
+	WORKITEM_FREE(jsegdep, D_JSEGDEP);
+}
+
+/*
+ * Wait for a journal item to make it to disk.  Initiate journal processing
+ * if required.
+ */
+static void
+jwait(wk)
+	struct worklist *wk;
+{
+
+	/*
+	 * If IO has not started we process the journal.  We can't mark the
+	 * worklist item as IOWAITING because we drop the lock while
+	 * processing the journal and the worklist entry may be freed after
+	 * this point.  The caller may call back in and re-issue the request.
+	 */
+	if ((wk->wk_state & IOSTARTED) == 0) {
+		softdep_process_journal(wk->wk_mp, MNT_WAIT);
+		return;
+	}
+	wk->wk_state |= IOWAITING;
+	msleep(wk, &lk, PRIBIO, "jwait", 0);
+}
+
+/*
+ * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
+ * appropriate.  This is a convenience function to reduce duplicate code
+ * for the setup and revert functions below.
+ */
+static struct inodedep *
+inodedep_lookup_ip(ip)
+	struct inode *ip;
+{
+	struct inodedep *inodedep;
+
+	KASSERT(ip->i_nlink >= ip->i_effnlink,
+	    ("inodedep_lookup_ip: bad delta"));
+	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
+	    DEPALLOC, &inodedep);
+	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+
+	return (inodedep);
+}
+
+/*
+ * Create a journal entry that describes a truncate that we're about to
+ * perform.  The inode allocations and frees between here and the completion
+ * of the operation are done asynchronously and without journaling.  At
+ * the end of the operation the vnode is sync'd and the journal space
+ * is released.  Recovery will discover the partially completed truncate
+ * and complete it.
+ */
+void *
+softdep_setup_trunc(vp, length, flags)
+	struct vnode *vp;
+	off_t length;
+	int flags;
+{
+	struct jsegdep *jsegdep;
+	struct jtrunc *jtrunc;
+	struct ufsmount *ump;
+	struct inode *ip;
+
+	softdep_prealloc(vp, MNT_WAIT);
+	ip = VTOI(vp);
+	ump = VFSTOUFS(vp->v_mount);
+	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
+	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
+	jtrunc->jt_ino = ip->i_number;
+	jtrunc->jt_extsize = 0;
+	jtrunc->jt_size = length;
+	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
+		jtrunc->jt_extsize = ip->i_din2->di_extsize;
+	if ((flags & IO_NORMAL) == 0)
+		jtrunc->jt_size = DIP(ip, i_size);
+	ACQUIRE_LOCK(&lk);
+	add_to_journal(&jtrunc->jt_list);
+	while (jsegdep->jd_seg == NULL)
+		jwait(&jtrunc->jt_list);
+	FREE_LOCK(&lk);
+
+	return (jsegdep);
+}
+
+/*
+ * After synchronous truncation is complete we fsync the vnode and
+ * release the jsegdep so the journal space can be freed.
+ */
+int
+softdep_complete_trunc(vp, cookie)
+	struct vnode *vp;
+	void *cookie;
+{
+	int error;
+
+	error = ffs_syncvnode(vp, MNT_WAIT);
+	ACQUIRE_LOCK(&lk);
+	free_jsegdep((struct jsegdep *)cookie);
+	FREE_LOCK(&lk);
+
+	return (error);
+}
+
+/*
+ * Called prior to creating a new inode and linking it to a directory.  The
+ * jaddref structure must already be allocated by softdep_setup_inomapdep
+ * and it is discovered here so we can initialize the mode and update
+ * nlinkdelta.
+ */ +void +softdep_setup_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(ip->i_nlink == 1, + ("softdep_setup_create: Invalid link count.")); + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_create: No addref structure present.")); + jaddref->ja_mode = ip->i_mode; + softdep_prelink(dvp, NULL); + } + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track the addition of a DOTDOT link when + * we are reparenting an inode as part of a rename. This jaddref will be + * found by softdep_setup_directory_change. Adjusts nlinkdelta for + * non-journaling softdep. + */ +void +softdep_setup_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + jaddref = NULL; + /* + * We don't set MKDIR_PARENT as this is not tied to a mkdir and + * is used as a normal link would be. + */ + if (DOINGSUJ(dvp)) + jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (jaddref) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + } + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track a new link to an inode. The directory + * offset is not known until softdep_setup_directory_add or + * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling + * softdep. + */ +void +softdep_setup_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + jaddref = NULL; + if (DOINGSUJ(dvp)) + jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, + ip->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (jaddref) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + } + FREE_LOCK(&lk); +} + +/* + * Called to create the jaddref structures to track . and .. references as + * well as lookup and further initialize the incomplete jaddref created + * by softdep_setup_inomapdep when the inode was allocated. Adjusts + * nlinkdelta for non-journaling softdep. 
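+ *
+ * A single mkdir therefore journals three reference additions: the
+ * new name in the parent (the jaddref created by
+ * softdep_setup_inomapdep, which also guards the inode bitmap), "."
+ * in the new directory (MKDIR_BODY), and ".." naming the parent
+ * (MKDIR_PARENT).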
+ */ +void +softdep_setup_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *dotdotaddref; + struct jaddref *dotaddref; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + dotaddref = dotdotaddref = NULL; + if (DOINGSUJ(dvp)) { + dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, + ip->i_mode); + dotaddref->ja_state |= MKDIR_BODY; + dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + dotdotaddref->ja_state |= MKDIR_PARENT; + } + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL, + ("softdep_setup_mkdir: No addref structure present.")); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_setup_mkdir: bad parent %d", + jaddref->ja_parent)); + jaddref->ja_mode = ip->i_mode; + TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, + if_deps); + } + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, + &dotdotaddref->ja_ref, if_deps); + softdep_prelink(ITOV(dp), NULL); + } + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlinking a directory. + */ +void +softdep_setup_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlink. + */ +void +softdep_setup_unlink(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed non-directory + * creation. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_create: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed dotdot link + * creation. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_dotdot_link: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed link + * addition. Adjusts nlinkdelta for non-journaling softdep. 
+ */ +void +softdep_revert_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_link: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed mkdir + * attempt. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dotdot addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_mkdir: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dot addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to correct nlinkdelta after a failed rmdir. + */ +void +softdep_revert_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + FREE_LOCK(&lk); +} + +/* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem @@ -1536,22 +3873,44 @@ softdep_setup_inomapdep(bp, ip, newinum) { struct inodedep *inodedep; struct bmsafemap *bmsafemap; + struct jaddref *jaddref; + struct mount *mp; + struct fs *fs; + mp = UFSTOVFS(ip->i_ump); + fs = ip->i_ump->um_fs; + jaddref = NULL; + /* + * Allocate the journal reference add structure so that the bitmap + * can be dependent on it. + */ + if (mp->mnt_flag & MNT_SUJ) { + jaddref = newjaddref(ip, newinum, 0, 0, 0); + jaddref->ja_state |= NEWBLOCK; + } + + /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. 
 	 */
 	ACQUIRE_LOCK(&lk);
-	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
-	    &inodedep)))
-		panic("softdep_setup_inomapdep: dependency for new inode "
-		    "already exists");
-	inodedep->id_buf = bp;
+	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
+		panic("softdep_setup_inomapdep: dependency %p for new "
+		    "inode already exists", inodedep);
+	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
+	if (jaddref) {
+		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
+		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+		    if_deps);
+	} else {
+		inodedep->id_state |= ONDEPLIST;
+		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
+	}
+	inodedep->id_bmsafemap = bmsafemap;
 	inodedep->id_state &= ~DEPCOMPLETE;
-	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
-	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 	FREE_LOCK(&lk);
 }
 
@@ -1560,29 +3919,98 @@ softdep_setup_inomapdep(bp, ip, newinum)
  * allocate block or fragment.
  */
 void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 	struct buf *bp;		/* buffer for cylgroup block with block map */
 	struct mount *mp;	/* filesystem doing allocation */
 	ufs2_daddr_t newblkno;	/* number of newly allocated block */
+	int frags;		/* Number of fragments. */
+	int oldfrags;		/* Previous number of fragments for extend. */
 {
 	struct newblk *newblk;
 	struct bmsafemap *bmsafemap;
+	struct jnewblk *jnewblk;
 	struct fs *fs;
 
 	fs = VFSTOUFS(mp)->um_fs;
+	jnewblk = NULL;
 	/*
 	 * Create a dependency for the newly allocated block.
 	 * Add it to the dependency list for the buffer holding
 	 * the cylinder group map from which it was allocated.
 	 */
+	if (mp->mnt_flag & MNT_SUJ) {
+		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
+		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
+		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
+		jnewblk->jn_state = ATTACHED;
+		jnewblk->jn_blkno = newblkno;
+		jnewblk->jn_frags = frags;
+		jnewblk->jn_oldfrags = oldfrags;
+#ifdef SUJ_DEBUG
+		{
+			struct cg *cgp;
+			uint8_t *blksfree;
+			long bno;
+			int i;
+
+			cgp = (struct cg *)bp->b_data;
+			blksfree = cg_blksfree(cgp);
+			bno = dtogd(fs, jnewblk->jn_blkno);
+			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+			    i++) {
+				if (isset(blksfree, bno + i))
+					panic("softdep_setup_blkmapdep: "
+					    "free fragment %d from %d-%d "
+					    "state 0x%X dep %p", i,
+					    jnewblk->jn_oldfrags,
+					    jnewblk->jn_frags,
+					    jnewblk->jn_state,
+					    jnewblk->jn_newblk);
+			}
+		}
+#endif
+	}
 	ACQUIRE_LOCK(&lk);
-	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
+	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
 		panic("softdep_setup_blkmapdep: found block");
-	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
-	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+	bmsafemap = bmsafemap_lookup(mp, bp,
+	    dtog(fs, newblkno));
+	if (jnewblk) {
+		jnewblk->jn_newblk = newblk;
+		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
+	} else {
+		newblk->nb_state |= ONDEPLIST;
+		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+	}
+	newblk->nb_bmsafemap = bmsafemap;
+	newblk->nb_jnewblk = jnewblk;
 	FREE_LOCK(&lk);
 }
 
+#define	BMSAFEMAP_HASH(fs, cg) \
+	(&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
+
+static int
+bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
+	struct bmsafemap_hashhead *bmsafemaphd;
+	struct mount *mp;
+	int cg;
+	struct bmsafemap **bmsafemapp;
+{
+	struct bmsafemap
*bmsafemap; + + LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) + if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) + break; + if (bmsafemap) { + *bmsafemapp = bmsafemap; + return (1); + } + *bmsafemapp = NULL; + + return (0); +} + /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when @@ -1590,27 +4018,43 @@ void * splbio interrupts blocked. */ static struct bmsafemap * -bmsafemap_lookup(mp, bp) +bmsafemap_lookup(mp, bp, cg) struct mount *mp; struct buf *bp; + int cg; { - struct bmsafemap *bmsafemap; + struct bmsafemap_hashhead *bmsafemaphd; + struct bmsafemap *bmsafemap, *collision; struct worklist *wk; + struct fs *fs; mtx_assert(&lk, MA_OWNED); - LIST_FOREACH(wk, &bp->b_dep, wk_list) - if (wk->wk_type == D_BMSAFEMAP) - return (WK_BMSAFEMAP(wk)); + if (bp) + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_BMSAFEMAP) + return (WK_BMSAFEMAP(wk)); + fs = VFSTOUFS(mp)->um_fs; + bmsafemaphd = BMSAFEMAP_HASH(fs, cg); + if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) + return (bmsafemap); FREE_LOCK(&lk); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); bmsafemap->sm_buf = bp; - LIST_INIT(&bmsafemap->sm_allocdirecthd); - LIST_INIT(&bmsafemap->sm_allocindirhd); LIST_INIT(&bmsafemap->sm_inodedephd); + LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); + LIST_INIT(&bmsafemap->sm_newblkwr); + LIST_INIT(&bmsafemap->sm_jaddrefhd); + LIST_INIT(&bmsafemap->sm_jnewblkhd); ACQUIRE_LOCK(&lk); + if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (collision); + } + bmsafemap->sm_cg = cg; + LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } @@ -1645,9 +4089,9 @@ static struct bmsafemap * * unreferenced fragments. 
*/ void -softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) +softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ - ufs_lbn_t lbn; /* block pointer within inode */ + ufs_lbn_t off; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ @@ -1656,34 +4100,33 @@ void { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; - struct bmsafemap *bmsafemap; + struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; + struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; + ufs_lbn_t lbn; + lbn = bp->b_lblkno; mp = UFSTOVFS(ip->i_ump); - adp = malloc(sizeof(struct allocdirect), - M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); - adp->ad_lbn = lbn; - adp->ad_newblkno = newblkno; - adp->ad_oldblkno = oldblkno; - adp->ad_newsize = newsize; - adp->ad_oldsize = oldsize; - adp->ad_state = ATTACHED; - LIST_INIT(&adp->ad_newdirblk); - if (newblkno == oldblkno) - adp->ad_freefrag = NULL; + if (oldblkno && oldblkno != newblkno) + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else - adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + freefrag = NULL; ACQUIRE_LOCK(&lk); - if (lbn >= NDADDR) { + if (off >= NDADDR) { + if (lbn > 0) + panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", + lbn, off); /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { + if (off != lbn) + panic("softdep_setup_allocdirect: lbn %jd != off %jd", + lbn, off); /* * Allocating a direct block. * @@ -1692,26 +4135,39 @@ void * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, off, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } - if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) + if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("softdep_setup_allocdirect: newblk already initialized")); + /* + * Convert the newblk to an allocdirect. + */ + newblk->nb_list.wk_type = D_ALLOCDIRECT; + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + + /* + * Finish initializing the journal. 
+ */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); - + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; - WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); + + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the @@ -1726,24 +4182,25 @@ void */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(&lk); } @@ -1761,10 +4218,11 @@ allocdirect_merge(adphead, newadp, oldadp) struct freefrag *freefrag; struct newdirblk *newdirblk; + freefrag = NULL; mtx_assert(&lk, MA_OWNED); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || - newadp->ad_lbn >= NDADDR) + newadp->ad_offset >= NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, @@ -1779,7 +4237,7 @@ allocdirect_merge(adphead, newadp, oldadp) * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on - * the worklist when it is freed by free_allocdirect. It is + * the worklist when it is freed by free_newblk. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. @@ -1788,8 +4246,8 @@ allocdirect_merge(adphead, newadp, oldadp) * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ + freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { - freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } @@ -1804,32 +4262,118 @@ allocdirect_merge(adphead, newadp, oldadp) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); } - free_allocdirect(adphead, oldadp, 0); + TAILQ_REMOVE(adphead, oldadp, ad_next); + /* + * We need to move any journal dependencies over to the freefrag + * that releases this block if it exists. Otherwise we are + * extending an existing block and we'll wait until that is + * complete to release the journal space and extend the + * new journal to cover this old space as well. 
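+	 * For example, a fragment extended in place keeps one journal
+	 * record for the whole range: the old record's jn_oldfrags are
+	 * inherited below and the old jnewblk is retired.  When the data
+	 * moved to a new block instead, the old journal work rides along
+	 * on the freefrag that will release the superseded fragment.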
+	 */
+	if (freefrag == NULL) {
+		struct jnewblk *jnewblk;
+		struct jnewblk *njnewblk;
+
+		if (oldadp->ad_newblkno != newadp->ad_newblkno)
+			panic("allocdirect_merge: %jd != %jd",
+			    oldadp->ad_newblkno, newadp->ad_newblkno);
+		jnewblk = oldadp->ad_block.nb_jnewblk;
+		cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
+		/*
+		 * If we have an unwritten jnewblk we need to merge the
+		 * frag bits with our own.  The newer adp's journal can not
+		 * be written prior to the old one so no need to check for
+		 * it here.
+		 */
+		if (jnewblk) {
+			njnewblk = newadp->ad_block.nb_jnewblk;
+			if (njnewblk == NULL)
+				panic("allocdirect_merge: No jnewblk");
+			if (jnewblk->jn_state & UNDONE) {
+				njnewblk->jn_state |= UNDONE | NEWBLOCK;
+				njnewblk->jn_state &= ~ATTACHED;
+				jnewblk->jn_state &= ~UNDONE;
+			}
+			njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
+			WORKLIST_REMOVE(&jnewblk->jn_list);
+			jnewblk->jn_state |= ATTACHED | COMPLETE;
+			free_jnewblk(jnewblk);
+		}
+	} else {
+		/*
+		 * We can skip journaling for this freefrag and just complete
+		 * any pending journal work for the allocdirect that is being
+		 * removed after the freefrag completes.
		 */
+		if (freefrag->ff_jfreefrag)
+			cancel_jfreefrag(freefrag->ff_jfreefrag);
+		cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
+	}
+	free_newblk(&oldadp->ad_block);
 }
-
+
 /*
- * Allocate a new freefrag structure if needed.
+ * Allocate a jfreefrag structure to journal a single block free.
  */
+static struct jfreefrag *
+newjfreefrag(freefrag, ip, blkno, size, lbn)
+	struct freefrag *freefrag;
+	struct inode *ip;
+	ufs2_daddr_t blkno;
+	long size;
+	ufs_lbn_t lbn;
+{
+	struct jfreefrag *jfreefrag;
+	struct fs *fs;
+
+	fs = ip->i_fs;
+	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
+	    M_SOFTDEP_FLAGS);
+	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
+	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
+	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
+	jfreefrag->fr_ino = ip->i_number;
+	jfreefrag->fr_lbn = lbn;
+	jfreefrag->fr_blkno = blkno;
+	jfreefrag->fr_frags = numfrags(fs, size);
+	jfreefrag->fr_freefrag = freefrag;
+
+	return (jfreefrag);
+}
+
+/*
+ * Allocate a new freefrag structure.
+ */
 static struct freefrag *
-newfreefrag(ip, blkno, size)
+newfreefrag(ip, blkno, size, lbn)
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	long size;
+	ufs_lbn_t lbn;
 {
 	struct freefrag *freefrag;
 	struct fs *fs;
 
-	if (blkno == 0)
-		return (NULL);
 	fs = ip->i_fs;
 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 		panic("newfreefrag: frag size");
 	freefrag = malloc(sizeof(struct freefrag),
-		M_FREEFRAG, M_SOFTDEP_FLAGS);
+	    M_FREEFRAG, M_SOFTDEP_FLAGS);
 	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
+	freefrag->ff_state = ATTACHED;
+	LIST_INIT(&freefrag->ff_jwork);
 	freefrag->ff_inum = ip->i_number;
 	freefrag->ff_blkno = blkno;
 	freefrag->ff_fragsize = size;
+
+	if (fs->fs_flags & FS_SUJ) {
+		freefrag->ff_jfreefrag =
+		    newjfreefrag(freefrag, ip, blkno, size, lbn);
+	} else {
+		freefrag->ff_state |= DEPCOMPLETE;
+		freefrag->ff_jfreefrag = NULL;
+	}
+
 	return (freefrag);
 }
 
@@ -1842,9 +4386,17 @@ handle_workitem_freefrag(freefrag)
 	struct freefrag *freefrag;
 {
 	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
+	struct workhead wkhd;
+	/*
+	 * It would be illegal to add new completion items to the
+	 * freefrag after it was scheduled to be done so it must be
+	 * safe to modify the list head here.
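+	 * LIST_SWAP() below moves those items onto a local list head so
+	 * they can be handed to ffs_blkfree() and outlive the freefrag,
+	 * which is freed as soon as the fragment is returned to the map.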
+ */ + LIST_INIT(&wkhd); + LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, - freefrag->ff_fragsize, freefrag->ff_inum); + freefrag->ff_fragsize, freefrag->ff_inum, &wkhd); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(&lk); @@ -1856,9 +4408,9 @@ handle_workitem_freefrag(freefrag) * See the description of softdep_setup_allocdirect above for details. */ void -softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) +softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; - ufs_lbn_t lbn; + ufs_lbn_t off; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; @@ -1867,50 +4419,55 @@ void { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; - struct bmsafemap *bmsafemap; + struct freefrag *freefrag; struct inodedep *inodedep; + struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; + ufs_lbn_t lbn; + if (off >= NXADDR) + panic("softdep_setup_allocext: lbn %lld > NXADDR", + (long long)off); + + lbn = bp->b_lblkno; mp = UFSTOVFS(ip->i_ump); - adp = malloc(sizeof(struct allocdirect), - M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); - adp->ad_lbn = lbn; - adp->ad_newblkno = newblkno; - adp->ad_oldblkno = oldblkno; - adp->ad_newsize = newsize; - adp->ad_oldsize = oldsize; - adp->ad_state = ATTACHED | EXTDATA; - LIST_INIT(&adp->ad_newdirblk); - if (newblkno == oldblkno) - adp->ad_freefrag = NULL; + if (oldblkno && oldblkno != newblkno) + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else - adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + freefrag = NULL; ACQUIRE_LOCK(&lk); - if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) + if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("softdep_setup_allocext: newblk already initialized")); + /* + * Convert the newblk to an allocdirect. + */ + newblk->nb_list.wk_type = D_ALLOCDIRECT; + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + adp->ad_state |= EXTDATA; + /* + * Finish initializing the journal. 
+ */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; - if (newblk->nb_state == DEPCOMPLETE) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); - } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); - - WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); - if (lbn >= NXADDR) - panic("softdep_setup_allocext: lbn %lld > NXADDR", - (long long)lbn); + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the @@ -1925,23 +4482,23 @@ void */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } @@ -1975,22 +4532,39 @@ void * Allocate a new allocindir structure. 
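+ * + * Editor's note (assumed softdep.h layout, not shown in this patch): the + * (struct allocindir *)newblk cast below, like the allocdirect cast above, + * is valid only because the journaled dependency structures embed the + * common struct newblk as their first member, roughly: + * + *	struct allocindir { + *		struct newblk	ai_block;	-- nb_list, nb_jnewblk, ... + *		int		ai_offset;	-- slot in the indirect block + *		... + *	};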
*/ static struct allocindir * -newallocindir(ip, ptrno, newblkno, oldblkno) +newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ + ufs_lbn_t lbn; { + struct newblk *newblk; struct allocindir *aip; + struct freefrag *freefrag; + struct jnewblk *jnewblk; - aip = malloc(sizeof(struct allocindir), - M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump)); - aip->ai_state = ATTACHED; + if (oldblkno) + freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); + else + freefrag = NULL; + ACQUIRE_LOCK(&lk); + if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) + panic("new_allocindir: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("newallocindir: newblk already initialized")); + newblk->nb_list.wk_type = D_ALLOCINDIR; + newblk->nb_freefrag = freefrag; + aip = (struct allocindir *)newblk; aip->ai_offset = ptrno; - aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; - aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); return (aip); } @@ -2008,22 +4582,28 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { + struct inodedep *inodedep; struct allocindir *aip; struct pagedep *pagedep; + struct mount *mp; + if (lbn != nbp->b_lblkno) + panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", + lbn, nbp->b_lblkno); ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); - aip = newallocindir(ip, ptrno, newblkno, oldblkno); - ACQUIRE_LOCK(&lk); + mp = UFSTOVFS(ip->i_ump); + aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. 
*/ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); - WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } @@ -2039,38 +4619,68 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { + struct inodedep *inodedep; struct allocindir *aip; + ufs_lbn_t lbn; + lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); - aip = newallocindir(ip, ptrno, newblkno, 0); - ACQUIRE_LOCK(&lk); - WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + aip = newallocindir(ip, ptrno, newblkno, 0, lbn); + inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } +static void +indirdep_complete(indirdep) + struct indirdep *indirdep; +{ + struct allocindir *aip; + + LIST_REMOVE(indirdep, ir_next); + indirdep->ir_state &= ~ONDEPLIST; + + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + free_newblk(&aip->ai_block); + } + /* + * If this indirdep is not attached to a buf it was simply waiting + * on completion to clear completehd. free_indirdep() asserts + * that nothing is dangling. + */ + if ((indirdep->ir_state & ONWORKLIST) == 0) + free_indirdep(indirdep); +} + /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void -setup_allocindir_phase2(bp, ip, aip) +setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ + struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ + ufs_lbn_t lbn; /* Logical block number for this block. 
*/ { struct worklist *wk; + struct fs *fs; + struct newblk *newblk; struct indirdep *indirdep, *newindirdep; - struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; - struct newblk *newblk; + struct mount *mp; ufs2_daddr_t blkno; + mp = UFSTOVFS(ip->i_ump); + fs = ip->i_fs; mtx_assert(&lk, MA_OWNED); if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); - for (indirdep = NULL, newindirdep = NULL; ; ) { + for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; @@ -2079,49 +4689,41 @@ static void } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; + newindirdep = NULL; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); - newindirdep = NULL; + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, + &newblk)) { + indirdep->ir_state |= ONDEPLIST; + LIST_INSERT_HEAD(&newblk->nb_indirdeps, + indirdep, ir_next); + } else + indirdep->ir_state |= DEPCOMPLETE; } if (indirdep) { - if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, - &newblk) == 0) - panic("setup_allocindir: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { - aip->ai_state |= DEPCOMPLETE; - aip->ai_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - aip->ai_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, - aip, ai_deps); - } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); aip->ai_indirdep = indirdep; /* * Check to see if there is an existing dependency * for this block. If there is, merge the old - * dependency into the new one. + * dependency into the new one. This happens + * as a result of reallocblk only. */ if (aip->ai_oldblkno == 0) oldaip = NULL; else - LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) + LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, + ai_next) if (oldaip->ai_offset == aip->ai_offset) break; - freefrag = NULL; - if (oldaip != NULL) { - if (oldaip->ai_newblkno != aip->ai_oldblkno) - panic("setup_allocindir_phase2: blkno"); - aip->ai_oldblkno = oldaip->ai_oldblkno; - freefrag = aip->ai_freefrag; - aip->ai_freefrag = oldaip->ai_freefrag; - oldaip->ai_freefrag = NULL; - free_allocindir(oldaip, NULL); - } + if (oldaip != NULL) + freefrag = allocindir_merge(aip, oldaip); LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); + KASSERT(aip->ai_offset >= 0 && + aip->ai_offset < NINDIR(ip->i_ump->um_fs), + ("setup_allocindir_phase2: Bad offset %d", + aip->ai_offset)); + KASSERT(indirdep->ir_savebp != NULL, + ("setup_allocindir_phase2 NULL ir_savebp")); if (ip->i_ump->um_fstype == UFS1) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; @@ -2148,13 +4750,16 @@ static void } newindirdep = malloc(sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); - workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, - UFSTOVFS(ip->i_ump)); + workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; if (ip->i_ump->um_fstype == UFS1) newindirdep->ir_state |= UFS1FMT; + newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); + LIST_INIT(&newindirdep->ir_writehd); + LIST_INIT(&newindirdep->ir_completehd); + LIST_INIT(&newindirdep->ir_jwork); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); @@ -2169,6 +4774,51 @@ static void } /* + * Merge two allocindirs which refer to the same block. Move newblock + * dependencies and setup the freefrags appropriately. 
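+ * + * Editor's worked example (derived from the code below): if an unwritten + * dependency already records A -> B for a pointer slot and a reallocation + * now records B -> C for the same slot, the surviving allocindir becomes + * A -> C; B never needs to reach the disk, so its journal work is canceled + * and the returned freefrag releases B.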
+ */ +static struct freefrag * +allocindir_merge(aip, oldaip) + struct allocindir *aip; + struct allocindir *oldaip; +{ + struct newdirblk *newdirblk; + struct freefrag *freefrag; + struct worklist *wk; + + if (oldaip->ai_newblkno != aip->ai_oldblkno) + panic("allocindir_merge: blkno"); + aip->ai_oldblkno = oldaip->ai_oldblkno; + freefrag = aip->ai_freefrag; + aip->ai_freefrag = oldaip->ai_freefrag; + oldaip->ai_freefrag = NULL; + KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); + /* + * If we are tracking a new directory-block allocation, + * move it from the old allocindir to the new allocindir. + */ + if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { + newdirblk = WK_NEWDIRBLK(wk); + WORKLIST_REMOVE(&newdirblk->db_list); + if (!LIST_EMPTY(&oldaip->ai_newdirblk)) + panic("allocindir_merge: extra newdirblk"); + WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list); + } + /* + * We can skip journaling for this freefrag and just complete + * any pending journal work for the allocindir that is being + * removed after the freefrag completes. + */ + if (freefrag->ff_jfreefrag) + cancel_jfreefrag(freefrag->ff_jfreefrag); + LIST_REMOVE(oldaip, ai_next); + cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork); + free_newblk(&oldaip->ai_block); + + return (freefrag); +} + +/* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before @@ -2206,6 +4856,7 @@ softdep_setup_freeblocks(ip, length, flags) struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; + struct jfreeblk *jfreeblk; struct bufobj *bo; struct vnode *vp; struct buf *bp; @@ -2213,6 +4864,13 @@ softdep_setup_freeblocks(ip, length, flags) ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; + ufs2_daddr_t blkno; + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + long oldextsize; + long oldsize; + int frags; + int needj; fs = ip->i_fs; mp = UFSTOVFS(ip->i_ump); @@ -2221,32 +4879,53 @@ softdep_setup_freeblocks(ip, length, flags) freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); + LIST_INIT(&freeblks->fb_jfreeblkhd); + LIST_INIT(&freeblks->fb_jwork); freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; + freeblks->fb_chkcnt = 0; ACQUIRE_LOCK(&lk); + /* + * If we're truncating a removed file that will never be written + * we don't need to journal the block frees. The canceled journals + * for the allocations will suffice. 
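+ * + * Editor's note (a worked example for the indirect-block loop below): + * indirect tree roots are keyed by negative lbns, -lbn - i with lbn + * starting at NDADDR, so with NDADDR == 12 and NINDIR(fs) == n the + * single, double and triple indirect roots land at -12, -(12 + n) - 1 + * and -(12 + n + n*n) - 2 respectively.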
+ */ + inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || + (fs->fs_flags & FS_SUJ) == 0) + needj = 0; + else + needj = 1; num_freeblkdep++; FREE_LOCK(&lk); extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks = DIP(ip, i_blocks) - extblocks; - if ((flags & IO_NORMAL) == 0) { - freeblks->fb_oldsize = 0; - freeblks->fb_chkcnt = 0; - } else { - freeblks->fb_oldsize = ip->i_size; + if ((flags & IO_NORMAL) != 0) { + oldsize = ip->i_size; ip->i_size = 0; DIP_SET(ip, i_size, 0); freeblks->fb_chkcnt = datablocks; for (i = 0; i < NDADDR; i++) { - freeblks->fb_dblks[i] = DIP(ip, i_db[i]); + blkno = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], 0); + if (blkno == 0) + continue; + frags = sblksize(fs, oldsize, i); + frags = numfrags(fs, frags); + newfreework(freeblks, NULL, i, blkno, frags, needj); } - for (i = 0; i < NIADDR; i++) { - freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; + i++, tmpval *= NINDIR(fs)) { + blkno = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], 0); + if (blkno) + newfreework(freeblks, NULL, -lbn - i, blkno, + fs->fs_frag, needj); + lbn += tmpval; } /* * If the file was removed, then the space being freed was @@ -2259,17 +4938,23 @@ softdep_setup_freeblocks(ip, length, flags) UFS_UNLOCK(ip->i_ump); } } - if ((flags & IO_EXT) == 0) { - freeblks->fb_oldextsize = 0; - } else { - freeblks->fb_oldextsize = ip->i_din2->di_extsize; + if ((flags & IO_EXT) != 0) { + oldextsize = ip->i_din2->di_extsize; ip->i_din2->di_extsize = 0; freeblks->fb_chkcnt += extblocks; for (i = 0; i < NXADDR; i++) { - freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; + blkno = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; + if (blkno == 0) + continue; + frags = sblksize(fs, oldextsize, i); + frags = numfrags(fs, frags); + newfreework(freeblks, NULL, -1 - i, blkno, frags, + needj); } } + if (LIST_EMPTY(&freeblks->fb_jfreeblkhd)) + needj = 0; DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); /* * Push the zero'ed inode to to its disk buffer so that we are free @@ -2304,7 +4989,9 @@ softdep_setup_freeblocks(ip, length, flags) */ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) - WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); + WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); + else if (needj) + freeblks->fb_state |= DEPCOMPLETE | COMPLETE; /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated @@ -2318,14 +5005,19 @@ softdep_setup_freeblocks(ip, length, flags) merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) - free_allocdirect(&inodedep->id_inoupdt, adp, delay); + cancel_allocdirect(&inodedep->id_inoupdt, adp, + freeblks, delay); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) - free_allocdirect(&inodedep->id_extupdt, adp, delay); + cancel_allocdirect(&inodedep->id_extupdt, adp, + freeblks, delay); } + LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps) + add_to_journal(&jfreeblk->jf_list); + FREE_LOCK(&lk); bdwrite(bp); /* @@ -2349,9 +5041,9 @@ restart: BO_UNLOCK(bo); ACQUIRE_LOCK(&lk); (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); - deallocate_dependencies(bp, inodedep); + if (deallocate_dependencies(bp, inodedep, freeblks)) + bp->b_flags |= 
B_INVAL | B_NOCACHE; FREE_LOCK(&lk); - bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); BO_LOCK(bo); goto restart; @@ -2361,7 +5053,7 @@ restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); - if(delay) { + if (delay) { freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk @@ -2371,16 +5063,16 @@ restart: * the request here than in the !delay case. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) - add_to_worklist(&freeblks->fb_list); + add_to_worklist(&freeblks->fb_list, 1); } FREE_LOCK(&lk); /* - * If the inode has never been written to disk (delay == 0), - * then we can process the freeblks now that we have deleted - * the dependencies. + * If the inode has never been written to disk (delay == 0) and + * we're not waiting on any journal writes, then we can process the + * freeblks now that we have deleted the dependencies. */ - if (!delay) + if (!delay && !needj) handle_workitem_freeblocks(freeblks, 0); } @@ -2389,19 +5081,23 @@ restart: * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's - * associated with related dependencies do not occur. + * associated with related dependencies do not occur. Returns 1 if + * all dependencies were cleared, 0 otherwise. */ -static void -deallocate_dependencies(bp, inodedep) +static int +deallocate_dependencies(bp, inodedep, freeblks) struct buf *bp; struct inodedep *inodedep; + struct freeblks *freeblks; { struct worklist *wk; struct indirdep *indirdep; + struct newdirblk *newdirblk; struct allocindir *aip; struct pagedep *pagedep; + struct jremref *jremref; + struct jmvref *jmvref; struct dirrem *dirrem; - struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); @@ -2410,47 +5106,24 @@ restart: case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); - /* - * None of the indirect pointers will ever be visible, - * so they can simply be tossed. GOINGAWAY ensures - * that allocated pointers will be saved in the buffer - * cache until they are freed. Note that they will - * only be able to be found by their physical address - * since the inode mapping the logical address will - * be gone. The save buffer used for the safe copy - * was allocated in setup_allocindir_phase2 using - * the physical address so it could be used for this - * purpose. Hence we swap the safe copy with the real - * copy, allowing the safe copy to be freed and holding - * on to the real copy for later use in indir_trunc. - */ - if (indirdep->ir_state & GOINGAWAY) - panic("deallocate_dependencies: already gone"); - indirdep->ir_state |= GOINGAWAY; - VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; - while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) - free_allocindir(aip, inodedep); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); - bcopy(bp->b_data, indirdep->ir_savebp->b_data, - bp->b_bcount); - WORKLIST_REMOVE(wk); - WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); + cancel_indirdep(indirdep, bp, inodedep, freeblks); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* - * None of the directory additions will ever be - * visible, so they can simply be tossed. + * There should be no directory add dependencies present + * as the directory could not be truncated until all + * children were removed. 
*/ + KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, + ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) - while ((dap = - LIST_FIRST(&pagedep->pd_diraddhd[i]))) - free_diradd(dap); - while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) - free_diradd(dap); + KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, + ("deallocate_dependencies: diraddhd != NULL")); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. @@ -2458,36 +5131,47 @@ restart: * can be dumped directly onto the work list. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + /* + * If there are any dirrems we wait for + * the journal write to complete and + * then restart the buf scan as the lock + * has been dropped. + */ + while ((jremref = + LIST_FIRST(&dirrem->dm_jremrefhd)) + != NULL) { + jwait(&jremref->jr_list); + return (0); + } LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == - ALLCOMPLETE) - add_to_worklist(&dirrem->dm_list); - else + ALLCOMPLETE) { + dirrem->dm_state |= COMPLETE; + add_to_worklist(&dirrem->dm_list, 0); + } else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } if ((pagedep->pd_state & NEWBLOCK) != 0) { - LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) - if (wk->wk_type == D_NEWDIRBLK && - WK_NEWDIRBLK(wk)->db_pagedep == - pagedep) - break; - if (wk != NULL) { - WORKLIST_REMOVE(wk); - free_newdirblk(WK_NEWDIRBLK(wk)); - } else - panic("deallocate_dependencies: " - "lost pagedep"); + newdirblk = pagedep->pd_newdirblk; + WORKLIST_REMOVE(&newdirblk->db_list); + free_newdirblk(newdirblk); } + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) + != NULL) { + jwait(&jmvref->jm_list); + return (0); + } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); continue; case D_ALLOCINDIR: - free_allocindir(WK_ALLOCINDIR(wk), inodedep); + aip = WK_ALLOCINDIR(wk); + cancel_allocindir(aip, inodedep, freeblks); continue; case D_ALLOCDIRECT: @@ -2502,46 +5186,155 @@ restart: /* NOTREACHED */ } } + + return (1); } /* - * Free an allocdirect. Generate a new freefrag work request if appropriate. - * This routine must be called with splbio interrupts blocked. + * An allocdirect is being canceled due to a truncate. We must make sure + * the journal entry is released in concert with the blkfree that releases + * the storage. Completed journal entries must not be released until the + * space is no longer pointed to by the inode or in the bitmap. */ static void -free_allocdirect(adphead, adp, delay) +cancel_allocdirect(adphead, adp, freeblks, delay) struct allocdirectlst *adphead; struct allocdirect *adp; + struct freeblks *freeblks; int delay; { + struct freework *freework; + struct newblk *newblk; + struct worklist *wk; + ufs_lbn_t lbn; + + TAILQ_REMOVE(adphead, adp, ad_next); + newblk = (struct newblk *)adp; + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the freework to be + * freed when freework_freeblock() is called. If the journal has + * been written we can simply reclaim the journal space when the + * freeblks work is complete. 
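+ * + * Editor's sketch of the two cases below (an assumption): when nb_jnewblk + * is NULL the journal record was already written, so any surviving + * jsegdeps simply move to fb_jwork and retire with the freeblks; otherwise + * the unwritten record is attached to the freework with the matching lbn + * so freework_freeblock() can hand it to ffs_blkfree() together with the + * bitmap update.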
+ */ + if (newblk->nb_jnewblk == NULL) { + cancel_newblk(newblk, &freeblks->fb_jwork); + goto found; + } + lbn = newblk->nb_jnewblk->jn_lbn; + /* + * Find the correct freework structure so it releases the canceled + * journal when the bitmap is cleared. This preserves rollback + * until the allocation is reverted. + */ + LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { + freework = WK_FREEWORK(wk); + if (freework->fw_lbn != lbn) + continue; + cancel_newblk(newblk, &freework->fw_jwork); + goto found; + } + panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn); +found: + if (delay) + WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, + &newblk->nb_list); + else + free_newblk(newblk); + return; +} + + +static void +cancel_newblk(newblk, wkhd) + struct newblk *newblk; + struct workhead *wkhd; +{ + struct indirdep *indirdep; + struct allocindir *aip; + + while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { + indirdep->ir_state &= ~ONDEPLIST; + LIST_REMOVE(indirdep, ir_next); + /* + * If an indirdep is not on the buf worklist we need to + * free it here as deallocate_dependencies() will never + * find it. These pointers were never visible on disk and + * can be discarded immediately. + */ + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + cancel_newblk(&aip->ai_block, wkhd); + free_newblk(&aip->ai_block); + } + /* + * If this indirdep is not attached to a buf it was simply + * waiting on completion to clear completehd. free_indirdep() + * asserts that nothing is dangling. + */ + if ((indirdep->ir_state & ONWORKLIST) == 0) + free_indirdep(indirdep); + } + if (newblk->nb_state & ONDEPLIST) { + newblk->nb_state &= ~ONDEPLIST; + LIST_REMOVE(newblk, nb_deps); + } + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + /* + * If the journal entry hasn't been written we hold onto the dep + * until it is safe to free along with the other journal work. + */ + if (newblk->nb_jnewblk != NULL) { + cancel_jnewblk(newblk->nb_jnewblk, wkhd); + newblk->nb_jnewblk = NULL; + } + if (!LIST_EMPTY(&newblk->nb_jwork)) + jwork_move(wkhd, &newblk->nb_jwork); +} + +/* + * Free a newblk. Generate a new freefrag work request if appropriate. + * This must be called after the inode pointer and any direct block pointers + * are valid or fully removed via truncate or frag extension. 
+ */ +static void +free_newblk(newblk) + struct newblk *newblk; +{ + struct indirdep *indirdep; struct newdirblk *newdirblk; + struct freefrag *freefrag; struct worklist *wk; mtx_assert(&lk, MA_OWNED); - if ((adp->ad_state & DEPCOMPLETE) == 0) - LIST_REMOVE(adp, ad_deps); - TAILQ_REMOVE(adphead, adp, ad_next); - if ((adp->ad_state & COMPLETE) == 0) - WORKLIST_REMOVE(&adp->ad_list); - if (adp->ad_freefrag != NULL) { - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &adp->ad_freefrag->ff_list); - else - add_to_worklist(&adp->ad_freefrag->ff_list); + if (newblk->nb_state & ONDEPLIST) + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + LIST_REMOVE(newblk, nb_hash); + if ((freefrag = newblk->nb_freefrag) != NULL) { + freefrag->ff_state |= COMPLETE; + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); } - if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { + if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); - if (!LIST_EMPTY(&adp->ad_newdirblk)) - panic("free_allocdirect: extra newdirblk"); - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &newdirblk->db_list); - else - free_newdirblk(newdirblk); + if (!LIST_EMPTY(&newblk->nb_newdirblk)) + panic("free_newblk: extra newdirblk"); + free_newdirblk(newdirblk); } - WORKITEM_FREE(adp, D_ALLOCDIRECT); + while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { + indirdep->ir_state |= DEPCOMPLETE; + indirdep_complete(indirdep); + } + KASSERT(newblk->nb_jnewblk == NULL, + ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); + handle_jwork(&newblk->nb_jwork); + newblk->nb_list.wk_type = D_NEWBLK; + WORKITEM_FREE(newblk, D_NEWBLK); } /* @@ -2554,6 +5347,7 @@ free_newdirblk(newdirblk) { struct pagedep *pagedep; struct diradd *dap; + struct worklist *wk; int i; mtx_assert(&lk, MA_OWNED); @@ -2571,17 +5365,25 @@ free_newdirblk(newdirblk) pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * If no dependencies remain, the pagedep will be freed. */ for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) break; - if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { + if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 && + LIST_EMPTY(&pagedep->pd_jmvrefhd)) { + KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL, + ("free_newdirblk: Freeing non-free pagedep %p", pagedep)); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } + /* Should only ever be one item in the list. */ + while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { + WORKLIST_REMOVE(wk); + handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); + } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } @@ -2608,6 +5410,7 @@ softdep_freefile(pvp, ino, mode) freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; + LIST_INIT(&freefile->fx_jwork); if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendinginodes += 1; @@ -2618,11 +5421,29 @@ softdep_freefile(pvp, ino, mode) * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either - * case we can free the file immediately. + * case we can free the file immediately. 
If the journal was + * canceled before being written the inode will never make it to + * disk and we must send the canceled journal entries to + * ffs_freefile() to be cleared in conjunction with the bitmap. + * Any blocks waiting on the inode to write can be safely freed + * here as it will never be written. */ ACQUIRE_LOCK(&lk); - if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 || - check_inode_unwritten(inodedep)) { + inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); + /* + * Remove this inode from the unlinked list and set + * GOINGAWAY as appropriate to indicate that this inode + * will never be written. + */ + if (inodedep && inodedep->id_state & UNLINKED) { + clear_unlinked_inodedep(inodedep); + inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); + if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0) { + inodedep->id_state |= GOINGAWAY; + handle_bufwait(inodedep, &freefile->fx_jwork); + } + } + if (inodedep == NULL || check_inode_unwritten(inodedep)) { FREE_LOCK(&lk); handle_workitem_freefile(freefile); return; @@ -2654,7 +5475,8 @@ check_inode_unwritten(inodedep) { mtx_assert(&lk, MA_OWNED); - if ((inodedep->id_state & DEPCOMPLETE) != 0 || + + if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || @@ -2662,9 +5484,9 @@ check_inode_unwritten(inodedep) !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || + inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0) return (0); - /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". @@ -2673,9 +5495,11 @@ check_inode_unwritten(inodedep) inodedep->id_savedino1 == NULL) return (0); + if (inodedep->id_state & ONDEPLIST) + LIST_REMOVE(inodedep, id_deps); + inodedep->id_state &= ~ONDEPLIST; inodedep->id_state |= ALLCOMPLETE; - LIST_REMOVE(inodedep, id_deps); - inodedep->id_buf = NULL; + inodedep->id_bmsafemap = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { @@ -2696,17 +5520,23 @@ free_inodedep(inodedep) { mtx_assert(&lk, MA_OWNED); - if ((inodedep->id_state & ONWORKLIST) != 0 || + if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || + !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || + !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || - inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) + inodedep->id_mkdiradd != NULL || + inodedep->id_nlinkdelta != 0 || + inodedep->id_savedino1 != NULL) return (0); + if (inodedep->id_state & ONDEPLIST) + LIST_REMOVE(inodedep, id_deps); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); num_inodedep -= 1; @@ -2714,6 +5544,123 @@ free_inodedep(inodedep) } /* + * Free the block referenced by a freework structure. The parent freeblks + * structure is released and completed when the final cg bitmap reaches + * the disk. This routine may be freeing a jnewblk which never made it to + * disk in which case we do not have to wait as the operation is undone + * in memory immediately. 
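+ * + * Editor's sketch of the two completion paths below (an assumption): with + * fw_jwork non-empty, a canceled jnewblk rides on wkhd into ffs_blkfree() + * and the freework retires immediately in memory; with fw_jwork empty, the + * freework itself is queued on wkhd and retires only once the cg bitmap + * buffer reaches the disk.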
+ */ +static void +freework_freeblock(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct workhead wkhd; + struct fs *fs; + int complete; + int pending; + int bsize; + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + complete = 0; + LIST_INIT(&wkhd); + /* + * If we are canceling an existing jnewblk pass it to the free + * routine, otherwise pass the freeblk which will ultimately + * release the freeblks + */ + if (!LIST_EMPTY(&freework->fw_jwork)) { + LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list); + complete = 1; + } else + WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list); + bsize = lfragtosize(fs, freework->fw_frags); + pending = btodb(bsize); + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= pending; + FREE_LOCK(&lk); + /* + * extattr blocks don't show up in pending blocks. XXX why? + */ + if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) { + UFS_LOCK(ump); + fs->fs_pendingblocks -= pending; + UFS_UNLOCK(ump); + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, + bsize, freeblks->fb_previousinum, &wkhd); + if (complete == 0) + return; + /* + * The jnewblk will be discarded and the bits in the map never + * made it to disk. We can immediately free the freeblk. + */ + ACQUIRE_LOCK(&lk); + handle_written_freework(freework); + FREE_LOCK(&lk); +} + +/* + * Start, continue, or finish the process of freeing an indirect block tree. + * The free operation may be paused at any point with fw_off containing the + * offset to restart from. This enables us to implement some flow control + * for large truncates which may fan out and generate a huge number of + * dependencies. + */ +static void +handle_workitem_indirblk(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct fs *fs; + + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + if (freework->fw_off == NINDIR(fs)) + freework_freeblock(freework); + else + indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), + freework->fw_lbn); +} + +/* + * Called when a freework structure attached to a cg buf is written. The + * ref on either the parent or the freeblks structure is released and + * either may be added to the worklist if it is the final ref. + */ +static void +handle_written_freework(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct freework *parent; + + freeblks = freework->fw_freeblks; + parent = freework->fw_parent; + if (parent) { + if (--parent->fw_ref != 0) + parent = NULL; + freeblks = NULL; + } else if (--freeblks->fb_ref != 0) + freeblks = NULL; + WORKITEM_FREE(freework, D_FREEWORK); + /* + * Don't delay these block frees or it takes an intolerable amount + * of time to process truncates and free their journal entries. + */ + if (freeblks) + add_to_worklist(&freeblks->fb_list, 1); + if (parent) + add_to_worklist(&parent->fw_list, 1); +} + +/* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. 
As mentioned above, @@ -2726,99 +5673,79 @@ handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { + struct freework *freework; + struct worklist *wk; + + KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd), + ("handle_workitem_freeblocks: Journal entries not written.")); + if (LIST_EMPTY(&freeblks->fb_freeworkhd)) { + handle_complete_freeblocks(freeblks); + return; + } + freeblks->fb_ref++; + while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { + KASSERT(wk->wk_type == D_FREEWORK, + ("handle_workitem_freeblocks: Unknown type %s", + TYPENAME(wk->wk_type))); + WORKLIST_REMOVE_UNLOCKED(wk); + freework = WK_FREEWORK(wk); + if (freework->fw_lbn <= -NDADDR) + handle_workitem_indirblk(freework); + else + freework_freeblock(freework); + } + ACQUIRE_LOCK(&lk); + if (--freeblks->fb_ref != 0) + freeblks = NULL; + FREE_LOCK(&lk); + if (freeblks) + handle_complete_freeblocks(freeblks); +} + +/* + * Once all of the freework workitems are complete we can retire the + * freeblocks dependency and any journal work awaiting completion. This + * can not be called until all other dependencies are stable on disk. + */ +static void +handle_complete_freeblocks(freeblks) + struct freeblks *freeblks; +{ struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; - int i, nblocks, level, bsize; - ufs2_daddr_t bn, blocksreleased = 0; - int error, allerror = 0; - ufs_lbn_t baselbns[NIADDR], tmpval; - int fs_pendingblocks; + int flags; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; - fs_pendingblocks = 0; - tmpval = 1; - baselbns[0] = NDADDR; - for (i = 1; i < NIADDR; i++) { - tmpval *= NINDIR(fs); - baselbns[i] = baselbns[i - 1] + tmpval; - } - nblocks = btodb(fs->fs_bsize); - blocksreleased = 0; + flags = LK_NOWAIT; + /* - * Release all extended attribute blocks or frags. - */ - if (freeblks->fb_oldextsize > 0) { - for (i = (NXADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_eblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldextsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - blocksreleased += btodb(bsize); - } - } - /* - * Release all data blocks or frags. - */ - if (freeblks->fb_oldsize > 0) { - /* - * Indirect blocks first. - */ - for (level = (NIADDR - 1); level >= 0; level--) { - if ((bn = freeblks->fb_iblks[level]) == 0) - continue; - if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), - level, baselbns[level], &blocksreleased)) != 0) - allerror = error; - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, - fs->fs_bsize, freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - blocksreleased += nblocks; - } - /* - * All direct blocks or frags. - */ - for (i = (NDADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_dblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - fs_pendingblocks += btodb(bsize); - blocksreleased += btodb(bsize); - } - } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); - /* * If we still have not finished background cleanup, then check * to see if the block count needs to be adjusted. 
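+ * + * Editor's note (an assumption): fb_chkcnt starts as the block count + * charged to the inode and each freework subtracts what it actually + * releases, so a nonzero residue here means some charged blocks were not + * released by this truncation and i_blocks must be corrected by the + * difference.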
*/ - if (freeblks->fb_chkcnt != blocksreleased && - (fs->fs_flags & FS_UNCLEAN) != 0 && + if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 && ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, - (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) - == 0) { + (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) { ip = VTOI(vp); - DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \ - freeblks->fb_chkcnt - blocksreleased); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt); ip->i_flag |= IN_CHANGE; vput(vp); } #ifdef INVARIANTS - if (freeblks->fb_chkcnt != blocksreleased && + if (freeblks->fb_chkcnt != 0 && ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) printf("handle_workitem_freeblocks: block count\n"); - if (allerror) - softdep_error("handle_workitem_freeblks", allerror); #endif /* INVARIANTS */ ACQUIRE_LOCK(&lk); + /* + * All of the freeblock deps must be complete prior to this call + * so it's now safe to complete earlier outstanding journal entries. + */ + handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); num_freeblkdep--; FREE_LOCK(&lk); @@ -2830,29 +5757,39 @@ handle_workitem_freeblocks(freeblks, flags) * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ -static int -indir_trunc(freeblks, dbn, level, lbn, countp) - struct freeblks *freeblks; +static void +indir_trunc(freework, dbn, lbn) + struct freework *freework; ufs2_daddr_t dbn; - int level; ufs_lbn_t lbn; - ufs2_daddr_t *countp; { + struct workhead wkhd; + struct jnewblk *jnewblk; + struct freeblks *freeblks; struct buf *bp; struct fs *fs; + struct worklist *wkn; struct worklist *wk; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1 = 0; - ufs2_daddr_t nb, *bap2 = 0; + ufs2_daddr_t nb, nnb, *bap2 = 0; ufs_lbn_t lbnadd; int i, nblocks, ufs1fmt; - int error, allerror = 0; int fs_pendingblocks; + int freedeps; + int level; + int cnt; + LIST_INIT(&wkhd); + level = lbn_level(lbn); + if (level == -1) + panic("indir_trunc: Invalid lbn %jd\n", lbn); + freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; + freedeps = 0; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); @@ -2877,13 +5814,14 @@ handle_workitem_freeblocks(freeblks, flags) ACQUIRE_LOCK(&lk); if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { if (wk->wk_type != D_INDIRDEP || - (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || - (indirdep->ir_state & GOINGAWAY) == 0) - panic("indir_trunc: lost indirdep"); - WORKLIST_REMOVE(wk); - WORKITEM_FREE(indirdep, D_INDIRDEP); + (wk->wk_state & GOINGAWAY) == 0) + panic("indir_trunc: lost indirdep %p", wk); + indirdep = WK_INDIRDEP(wk); + LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list); + free_indirdep(indirdep); if (!LIST_EMPTY(&bp->b_dep)) - panic("indir_trunc: dangling dep"); + panic("indir_trunc: dangling dep %p", + LIST_FIRST(&bp->b_dep)); ump->um_numindirdeps -= 1; FREE_LOCK(&lk); } else { @@ -2892,11 +5830,10 @@ handle_workitem_freeblocks(freeblks, flags) brelse(bp); #endif FREE_LOCK(&lk); - error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, - NOCRED, &bp); - if (error) { + if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, + NOCRED, &bp) != 0) { brelse(bp); - return (error); + return; } } /* @@ -2909,57 +5846,245 @@ handle_workitem_freeblocks(freeblks, flags) ufs1fmt = 0; bap2 = (ufs2_daddr_t *)bp->b_data; } - nblocks = btodb(fs->fs_bsize); - for (i = NINDIR(fs) - 1; i >= 0; i--) { - if 
(ufs1fmt) + /* + * Reclaim indirect blocks which never made it to disk. + */ + cnt = 0; + LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) { + struct workhead freewk; + if (wk->wk_type != D_JNEWBLK) + continue; + WORKLIST_REMOVE_UNLOCKED(wk); + LIST_INIT(&freewk); + WORKLIST_INSERT_UNLOCKED(&freewk, wk); + jnewblk = WK_JNEWBLK(wk); + if (jnewblk->jn_lbn > 0) + i = (jnewblk->jn_lbn - -lbn) / lbnadd; + else + i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd; + KASSERT(i >= 0 && i < NINDIR(fs), + ("indir_trunc: Index out of range %d parent %jd lbn %jd", + i, lbn, jnewblk->jn_lbn)); + /* Clear the pointer so it isn't found below. */ + if (ufs1fmt) { nb = bap1[i]; - else + bap1[i] = 0; + } else { nb = bap2[i]; + bap2[i] = 0; + } + KASSERT(nb == jnewblk->jn_blkno, + ("indir_trunc: Block mismatch %jd != %jd", + nb, jnewblk->jn_blkno)); + ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno, + fs->fs_bsize, freeblks->fb_previousinum, &freewk); + cnt++; + } + ACQUIRE_LOCK(&lk); + freework->fw_ref += NINDIR(fs) + 1; + /* Any remaining journal work can be completed with freeblks. */ + jwork_move(&freeblks->fb_jwork, &wkhd); + FREE_LOCK(&lk); + nblocks = btodb(fs->fs_bsize); + if (ufs1fmt) + nb = bap1[0]; + else + nb = bap2[0]; + /* + * Reclaim on disk blocks. + */ + for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { + if (i != NINDIR(fs) - 1) { + if (ufs1fmt) + nnb = bap1[i+1]; + else + nnb = bap2[i+1]; + } else + nnb = 0; if (nb == 0) continue; + cnt++; if (level != 0) { - if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), - level - 1, lbn + (i * lbnadd), countp)) != 0) - allerror = error; + struct freework *nfreework; + ufs_lbn_t nlbn; + + nlbn = (lbn + 1) - (i * lbnadd); + nfreework = newfreework(freeblks, freework, nlbn, nb, + fs->fs_frag, 0); + freedeps++; + indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); + } else { + struct freedep *freedep; + + /* + * Attempt to aggregate freedep dependencies for + * all blocks being released to the same CG. + */ + LIST_INIT(&wkhd); + if (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb))) { + freedep = newfreedep(freework); + WORKLIST_INSERT_UNLOCKED(&wkhd, + &freedep->fd_list); + freedeps++; + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, + fs->fs_bsize, freeblks->fb_previousinum, &wkhd); + fs_pendingblocks += nblocks; } - ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, - freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - *countp += nblocks; } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); + ACQUIRE_LOCK(&lk); + freework->fw_off = i; + if (level == 0) + fs_pendingblocks = (nblocks * cnt); + freework->fw_ref += freedeps; + freework->fw_ref -= NINDIR(fs) + 1; + if (freework->fw_ref != 0) + freework = NULL; + FREE_LOCK(&lk); + if (fs_pendingblocks) { + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= fs_pendingblocks; + FREE_LOCK(&lk); + UFS_LOCK(ump); + fs->fs_pendingblocks -= fs_pendingblocks; + UFS_UNLOCK(ump); + } bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); - return (allerror); + if (freework) + handle_workitem_indirblk(freework); + return; } /* - * Free an allocindir. - * This routine must be called with splbio interrupts blocked. + * Cancel an allocindir when it is removed via truncation. 
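+ * + * Editor's note (an assumption): unlike cancel_allocdirect(), an unwritten + * jnewblk is parked on the owning indirdep's ir_jwork here, so that + * indir_trunc() can pass it to ffs_blkfree() when the pointer block itself + * is reclaimed.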
*/ static void -free_allocindir(aip, inodedep) +cancel_allocindir(aip, inodedep, freeblks) struct allocindir *aip; struct inodedep *inodedep; + struct freeblks *freeblks; { - struct freefrag *freefrag; + struct newblk *newblk; - mtx_assert(&lk, MA_OWNED); - if ((aip->ai_state & DEPCOMPLETE) == 0) - LIST_REMOVE(aip, ai_deps); - if (aip->ai_state & ONWORKLIST) - WORKLIST_REMOVE(&aip->ai_list); + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the indirdep to be + * freed when indir_trunc() is called. If the journal has already + * been written we can simply reclaim the journal space when the + * freeblks work is complete. + */ LIST_REMOVE(aip, ai_next); + newblk = (struct newblk *)aip; + if (newblk->nb_jnewblk == NULL) + cancel_newblk(newblk, &freeblks->fb_jwork); + else + cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork); + if (inodedep && inodedep->id_state & DEPCOMPLETE) + WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list); + else + free_newblk(newblk); +} + +/* + * Create the mkdir dependencies for . and .. in a new directory. Link them + * into a newdirblk so any subsequent additions are tracked properly. The + * caller is responsible for adding the mkdir1 dependency to the journal + * and updating id_mkdiradd. This function returns with lk held. + */ +static struct mkdir * +setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) + struct diradd *dap; + ino_t newinum; + ino_t dinum; + struct buf *newdirbp; + struct mkdir **mkdirp; +{ + struct newblk *newblk; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct newdirblk *newdirblk = 0; + struct mkdir *mkdir1, *mkdir2; + struct worklist *wk; + struct jaddref *jaddref; + struct mount *mp; + + mp = dap->da_list.wk_mp; + newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, + M_SOFTDEP_FLAGS); + workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); + mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); + mkdir1->md_state = ATTACHED | MKDIR_BODY; + mkdir1->md_diradd = dap; + mkdir1->md_jaddref = NULL; + mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); + mkdir2->md_state = ATTACHED | MKDIR_PARENT; + mkdir2->md_diradd = dap; + mkdir2->md_jaddref = NULL; + if ((mp->mnt_flag & MNT_SUJ) == 0) { + mkdir1->md_state |= DEPCOMPLETE; + mkdir2->md_state |= DEPCOMPLETE; + } + /* + * Dependency on "." and ".." being written to disk. + */ + mkdir1->md_buf = newdirbp; + ACQUIRE_LOCK(&lk); + LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); + /* + * We must link the pagedep, allocdirect, and newdirblk for + * the initial file page so the pointer to the new directory + * is not written until the directory contents are live and + * any subsequent additions are not marked live until the + * block is reachable via the inode. 
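+ * + * Editor's sketch of the links established below (an assumption): + * + *	mkdir1 (MKDIR_BODY)   -> newdirbp holding "." and ".." + *	mkdir2 (MKDIR_PARENT) -> the dotdot jaddref when journaling, else + *	    the parent inodedep's bufwait list + *	newdirblk -> the pagedep (NEWBLOCK) and the first block's newblk, + *	    carrying mkdir1 on its db_mkdir list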
+ */ + if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0) + panic("setup_newdir: lost pagedep"); + LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) + if (wk->wk_type == D_ALLOCDIRECT) + break; + if (wk == NULL) + panic("setup_newdir: lost allocdirect"); + newblk = WK_NEWBLK(wk); + pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; + newdirblk->db_pagedep = pagedep; + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); + WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); + /* + * Look up the inodedep for the parent directory so that we + * can link mkdir2 into the pending dotdot jaddref or + * the inode write if there is none. If the inode is + * ALLCOMPLETE and no jaddref is present all dependencies have + * been satisfied and mkdir2 can be freed. + */ + inodedep_lookup(mp, dinum, 0, &inodedep); + if (mp->mnt_flag & MNT_SUJ) { if (inodedep == NULL) - add_to_worklist(&freefrag->ff_list); - else - WORKLIST_INSERT(&inodedep->id_bufwait, - &freefrag->ff_list); + panic("setup_newdir: Lost parent."); + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && + (jaddref->ja_state & MKDIR_PARENT), + ("setup_newdir: bad dotdot jaddref %p", jaddref)); + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + mkdir2->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir2; + } else if (inodedep == NULL || + (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + dap->da_state &= ~MKDIR_PARENT; + WORKITEM_FREE(mkdir2, D_MKDIR); + } else { + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } - WORKITEM_FREE(aip, D_ALLOCINDIR); + *mkdirp = mkdir2; + + return (mkdir1); } /* @@ -2998,12 +6123,14 @@ softdep_setup_directory_add(bp, dp, diroffset, new ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; - struct allocdirect *adp; + struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; struct mkdir *mkdir1, *mkdir2; + struct jaddref *jaddref; struct mount *mp; + int isindir; /* * Whiteouts have no dependencies. @@ -3013,6 +6140,8 @@ softdep_setup_directory_add(bp, dp, diroffset, new bdwrite(newdirbp); return (0); } + jaddref = NULL; + mkdir1 = mkdir2 = NULL; mp = UFSTOVFS(dp->i_ump); fs = dp->i_fs; lbn = lblkno(fs, diroffset); @@ -3023,111 +6152,123 @@ softdep_setup_directory_add(bp, dp, diroffset, new dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; - if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { + LIST_INIT(&dap->da_jwork); + isindir = bp->b_lblkno >= NDADDR; + if (isnewblk && + (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); } + /* + * If we're creating a new directory setup the dependencies and set + * the dap state to wait for them. Otherwise it's COMPLETE and + * we can move on. 
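+ * + * Editor's note (an assumption): MKDIR_BODY clears once the block holding + * "." and ".." is written and MKDIR_PARENT once the parent's link count + * is; the diradd moves to the pending list only when da_state reaches + * ALLCOMPLETE.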
+ */ if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; - mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, - M_SOFTDEP_FLAGS); - workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); - mkdir1->md_state = MKDIR_BODY; - mkdir1->md_diradd = dap; - mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, - M_SOFTDEP_FLAGS); - workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); - mkdir2->md_state = MKDIR_PARENT; - mkdir2->md_diradd = dap; - /* - * Dependency on "." and ".." being written to disk. - */ - mkdir1->md_buf = newdirbp; - ACQUIRE_LOCK(&lk); - LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); - WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); - FREE_LOCK(&lk); - bdwrite(newdirbp); - /* - * Dependency on link count increase for parent directory - */ - ACQUIRE_LOCK(&lk); - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0 - || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { - dap->da_state &= ~MKDIR_PARENT; - WORKITEM_FREE(mkdir2, D_MKDIR); - } else { - LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); - WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); - } + mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, + &mkdir2); } /* * Link into parent directory pagedep to await its being written. */ - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); +#ifdef DEBUG + if (diradd_lookup(pagedep, offset) != NULL) + panic("softdep_setup_directory_add: %p already at off %d\n", + diradd_lookup(pagedep, offset), offset); +#endif dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); /* - * Link into its inodedep. Put it on the id_bufwait list if the inode - * is not yet written. If it is written, do the post-inode write - * processing to put it on the id_pendinghd list. + * If we're journaling, link the diradd into the jaddref so it + * may be completed after the journal entry is written. Otherwise, + * link the diradd into its inodedep. If the inode is not yet + * written place it on the bufwait list, otherwise do the post-inode + * write processing to put it on the id_pendinghd list. */ - (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); - if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) + if (mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_add: bad jaddref %p", jaddref)); + jaddref->ja_diroff = diroffset; + jaddref->ja_diradd = dap; + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); - if (isnewblk) { + /* + * Add the journal entries for . and .. links now that the primary + * link is written. 
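+ * + * Editor's note (the resulting journal order, per the code below): the + * jaddref for the primary name was added above, then "..", then ".", + * since the write of the first directory block commits both of the + * latter links at once.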
+ */ + if (mkdir1 != NULL && mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, + inoreflst, if_deps); + KASSERT(jaddref != NULL && + jaddref->ja_ino == jaddref->ja_parent && + (jaddref->ja_state & MKDIR_BODY), + ("softdep_setup_directory_add: bad dot jaddref %p", + jaddref)); + mkdir1->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir1; /* - * Directories growing into indirect blocks are rare - * enough and the frequency of new block allocation - * in those cases even more rare, that we choose not - * to bother tracking them. Rather we simply force the - * new directory entry to disk. + * It is important that the dotdot journal entry + * is added prior to the dot entry since dot writes + * both the dot and dotdot links. These both must + * be added after the primary link for the journal + * to remain consistent. */ - if (lbn >= NDADDR) { - FREE_LOCK(&lk); - /* - * We only have a new allocation when at the - * beginning of a new block, not when we are - * expanding into an existing block. - */ - if (blkoff(fs, diroffset) == 0) - return (1); - return (0); - } + add_to_journal(&mkdir2->md_jaddref->ja_list); + add_to_journal(&jaddref->ja_list); + } + /* + * If we are adding a new directory remember this diradd so that if + * we rename it we can keep the dot and dotdot dependencies. If + * we are adding a new name for an inode that has a mkdiradd we + * must be in rename and we have to move the dot and dotdot + * dependencies to this new name. The old name is being orphaned + * soon. + */ + if (mkdir1 != NULL) { + if (inodedep->id_mkdiradd != NULL) + panic("softdep_setup_directory_add: Existing mkdir"); + inodedep->id_mkdiradd = dap; + } else if (inodedep->id_mkdiradd) + merge_diradd(inodedep, dap); + if (newdirblk) { /* - * We only have a new allocation when at the beginning - * of a new fragment, not when we are expanding into an - * existing fragment. Also, there is nothing to do if we - * are already tracking this block. + * There is nothing to do if we are already tracking + * this block. */ - if (fragoff(fs, diroffset) != 0) { - FREE_LOCK(&lk); - return (0); - } if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(&lk); return (0); } - /* - * Find our associated allocdirect and have it track us. - */ - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) - panic("softdep_setup_directory_add: lost inodedep"); - adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); - if (adp == NULL || adp->ad_lbn != lbn) + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) + == 0) panic("softdep_setup_directory_add: lost entry"); + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; - WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); + FREE_LOCK(&lk); + /* + * If we extended into an indirect signal direnter to sync. + */ + if (isindir) + return (1); + return (0); } FREE_LOCK(&lk); return (0); @@ -3141,7 +6282,8 @@ softdep_setup_directory_add(bp, dp, diroffset, new * occur while the move is in progress. */ void -softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) +softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) + struct buf *bp; /* Buffer holding directory block. 
*/ struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ @@ -3150,40 +6292,204 @@ void { int offset, oldoffset, newoffset; struct pagedep *pagedep; + struct jmvref *jmvref; struct diradd *dap; + struct direct *de; + struct mount *mp; ufs_lbn_t lbn; + int flags; - ACQUIRE_LOCK(&lk); + mp = UFSTOVFS(dp->i_ump); + de = (struct direct *)oldloc; + jmvref = NULL; + flags = 0; + /* + * Moves are always journaled as it would be too complex to + * determine if any affected adds or removes are present in the + * journal. + */ + if (mp->mnt_flag & MNT_SUJ) { + flags = DEPALLOC; + jmvref = newjmvref(dp, de->d_ino, + dp->i_offset + (oldloc - base), + dp->i_offset + (newloc - base)); + } lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) - goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); + ACQUIRE_LOCK(&lk); + if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) { + if (pagedep) + WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + goto done; + } + dap = diradd_lookup(pagedep, oldoffset); + if (dap) { + dap->da_offset = newoffset; + newoffset = DIRADDHASH(newoffset); + oldoffset = DIRADDHASH(oldoffset); + if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && + newoffset != oldoffset) { + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], + dap, da_pdlist); + } + } +done: + if (jmvref) { + jmvref->jm_pagedep = pagedep; + LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); + add_to_journal(&jmvref->jm_list); + } + bcopy(oldloc, newloc, entrysize); + FREE_LOCK(&lk); +} - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { - if (dap->da_offset != oldoffset) - continue; - dap->da_offset = newoffset; - if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) - break; +/* + * Move the mkdir dependencies and journal work from one diradd to another + * when renaming a directory. The new name must depend on the mkdir deps + * completing as the old name did. Directories can only have one valid link + * at a time so one must be canonical. + */ +static void +merge_diradd(inodedep, newdap) + struct inodedep *inodedep; + struct diradd *newdap; +{ + struct diradd *olddap; + struct mkdir *mkdir, *nextmd; + short state; + + olddap = inodedep->id_mkdiradd; + inodedep->id_mkdiradd = newdap; + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + newdap->da_state &= ~DEPCOMPLETE; + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { + nextmd = LIST_NEXT(mkdir, md_mkdirs); + if (mkdir->md_diradd != olddap) + continue; + mkdir->md_diradd = newdap; + state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); + newdap->da_state |= state; + olddap->da_state &= ~state; + if ((olddap->da_state & + (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; + } + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) + panic("merge_diradd: unfound ref"); + } + /* + * Any mkdir related journal items are not safe to be freed until + * the new name is stable. + */ + jwork_move(&newdap->da_jwork, &olddap->da_jwork); + olddap->da_state |= DEPCOMPLETE; + complete_diradd(olddap); +} + +/* + * Move the diradd to the pending list when all diradd dependencies are + * complete. 
+ */ +static void +complete_diradd(dap) + struct diradd *dap; +{ + struct pagedep *pagedep; + + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], - dap, da_pdlist); - break; + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } - if (dap == NULL) { +} - LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { - if (dap->da_offset == oldoffset) { - dap->da_offset = newoffset; - break; +/* + * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal + * add entries and conditionally journal the remove. + */ +static void +cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) + struct diradd *dap; + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct inoref *inoref; + struct mkdir *mkdir; + + /* + * If no remove references were allocated we're on a non-journaled + * filesystem and can skip the cancel step. + */ + if (jremref == NULL) { + free_diradd(dap, NULL); + return; + } + /* + * Cancel the primary name and free it if it does not require + * journaling. + */ + if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, + 0, &inodedep) != 0) { + /* Abort the addref that references this diradd. */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if (inoref->if_list.wk_type != D_JADDREF) + continue; + jaddref = (struct jaddref *)inoref; + if (jaddref->ja_diradd != dap) + continue; + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(jremref); + jremref = NULL; } + break; } } -done: - bcopy(oldloc, newloc, entrysize); - FREE_LOCK(&lk); + /* + * Cancel subordinate names and free them if they do not require + * journaling. + */ + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { + if (mkdir->md_diradd != dap) + continue; + if ((jaddref = mkdir->md_jaddref) == NULL) + continue; + mkdir->md_jaddref = NULL; + if (mkdir->md_state & MKDIR_PARENT) { + if (cancel_jaddref(jaddref, NULL, + &dirrem->dm_jwork) == 0) { + free_jremref(dotdotremref); + dotdotremref = NULL; + } + } else { + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(dotremref); + dotremref = NULL; + } + } + } + } + + if (jremref) + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); + jwork_move(&dirrem->dm_jwork, &dap->da_jwork); + free_diradd(dap, &dirrem->dm_jwork); } /* @@ -3191,8 +6497,9 @@ void * with splbio interrupts blocked.
*/ static void -free_diradd(dap) +free_diradd(dap, wkhd) struct diradd *dap; + struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; @@ -3200,32 +6507,48 @@ static void struct mkdir *mkdir, *nextmd; mtx_assert(&lk, MA_OWNED); - WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); + if (dap->da_state & ONWORKLIST) + WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + dirrem->dm_state |= COMPLETE; + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) - (void) free_inodedep(inodedep); + if (inodedep->id_mkdiradd == dap) + inodedep->id_mkdiradd = NULL; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; - dap->da_state &= ~mkdir->md_state; - WORKLIST_REMOVE(&mkdir->md_list); + dap->da_state &= + ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + if (mkdir->md_jaddref != NULL) + panic("free_diradd: Unexpected jaddref"); WORKITEM_FREE(mkdir, D_MKDIR); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } + if (inodedep) + free_inodedep(inodedep); + /* + * Free any journal segments waiting for the directory write. + */ + handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } @@ -3254,11 +6577,24 @@ softdep_setup_remove(bp, dp, ip, isrmdir) int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; + struct inodedep *inodedep; + int direct; /* - * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. + * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want + * newdirrem() to set up the full directory remove which requires + * isrmdir > 1. */ - dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); + dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem); + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. + */ + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) + panic("softdep_setup_remove: Lost inodedep."); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active @@ -3280,12 +6616,148 @@ softdep_setup_remove(bp, dp, ip, isrmdir) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; + direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(&lk); - handle_workitem_remove(dirrem, NULL); + if (direct) + handle_workitem_remove(dirrem, NULL); } } /* + * Check for an entry matching 'offset' on both the pd_diraddhd list and the + * pd_pendinghd list of a pagedep. + */ +static struct diradd * +diradd_lookup(pagedep, offset) + struct pagedep *pagedep; + int offset; +{ + struct diradd *dap; + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) + if (dap->da_offset == offset) + return (dap); + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) + if (dap->da_offset == offset) + return (dap); + return (NULL); } + +/* + * Search for a ..
diradd dependency in a directory that is being removed. + * If the directory was renamed to a new parent we have a diradd rather + * than a mkdir for the .. entry. We need to cancel it now before + * it is found in truncate(). + */ +static struct jremref * +cancel_diradd_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct pagedep *pagedep; + struct diradd *dap; + struct worklist *wk; + + if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0, + &pagedep) == 0) + return (jremref); + dap = diradd_lookup(pagedep, DOTDOT_OFFSET); + if (dap == NULL) + return (jremref); + cancel_diradd(dap, dirrem, jremref, NULL, NULL); + /* + * Mark any journal work as belonging to the parent so it is freed + * with the .. reference. + */ + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + wk->wk_state |= MKDIR_PARENT; + return (NULL); +} + +/* + * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to + * replace it with a dirrem/diradd pair as a result of re-parenting a + * directory. This ensures that we don't simultaneously have a mkdir and + * a diradd for the same .. entry. + */ +static struct jremref * +cancel_mkdir_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct mkdir *mkdir; + struct diradd *dap; + + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) + panic("cancel_mkdir_dotdot: Lost inodedep"); + dap = inodedep->id_mkdiradd; + if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) + return (jremref); + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; + mkdir = LIST_NEXT(mkdir, md_mkdirs)) + if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) + break; + if (mkdir == NULL) + panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); + if ((jaddref = mkdir->md_jaddref) != NULL) { + mkdir->md_jaddref = NULL; + if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, + &inodedep) == 0) + panic("cancel_mkdir_dotdot: Lost parent inodedep"); + if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { + journal_jremref(dirrem, jremref, inodedep); + jremref = NULL; + } + } + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + mkdir->md_state |= ALLCOMPLETE; + complete_mkdir(mkdir); + return (jremref); +} + +static void +journal_jremref(dirrem, jremref, inodedep) + struct dirrem *dirrem; + struct jremref *jremref; + struct inodedep *inodedep; +{ + + if (inodedep == NULL) + if (inodedep_lookup(jremref->jr_list.wk_mp, + jremref->jr_ref.if_ino, 0, &inodedep) == 0) + panic("journal_jremref: Lost inodedep"); + LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); + add_to_journal(&jremref->jr_list); +} + +static void +dirrem_journal(dirrem, jremref, dotremref, dotdotremref) + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + + + if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, + &inodedep) == 0) + panic("dirrem_journal: Lost inodedep"); + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); +} + +/* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. 
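 *
 * Editor's sketch, not part of this change: the link counts recorded
 * by the newjremref() calls in newdirrem() below.  Because the caller
 * has already decremented i_effnlink, each journal record names the
 * pre-remove count: effnlink + 1 for a plain remove, effnlink + 2 for
 * the primary name of an rmdir.  The helper below is hypothetical and
 * standalone.
 */
#if 0
static int
jremref_primary_nlink(int effnlink, int isrmdir)
{
	/* The extra link for a directory covers its own "." entry. */
	return (effnlink + 1 + (isrmdir ? 1 : 0));
}
#endif
/*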
*/ @@ -3303,12 +6775,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; + struct vnode *dvp; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); + dvp = ITOV(dp); /* * If we are over our limit, try to improve the situation. * Limiting the number of dirrem structures will also limit @@ -3321,34 +6798,75 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) FREE_LOCK(&lk); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount); + workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); + LIST_INIT(&dirrem->dm_jremrefhd); + LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; - + /* + * Allocate remove reference structures to track journal write + * dependencies. We will always have one for the link and + * when doing directories we will always have one more for dot. + * When renaming a directory we skip the dotdot link change so + * this is not needed. + */ + jremref = dotremref = dotdotremref = NULL; + if (DOINGSUJ(dvp)) { + if (isrmdir) { + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 2); + dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, + ip->i_effnlink + 1); + } else + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 1); + if (isrmdir > 1) { + dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, + dp->i_effnlink + 1); + dotdotremref->jr_state |= MKDIR_PARENT; + } + } ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* + * If we're renaming a .. link to a new directory, cancel any + * existing MKDIR_PARENT mkdir. If it has already been canceled + * the jremref is preserved for any potential diradd in this + * location. This can not coincide with a rmdir. + */ + if (dp->i_offset == DOTDOT_OFFSET) { + if (isrmdir) + panic("newdirrem: .. directory change during remove?"); + jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); + } + /* + * If we're removing a directory search for the .. dependency now and + * cancel it. Any pending journal work will be added to the dirrem + * to be completed when the workitem remove completes. + */ + if (isrmdir > 1) + dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); + /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can - * be de-allocated. Check for an entry on both the pd_dirraddhd - * list and the pd_pendinghd list. + * be de-allocated. */ - - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) - if (dap->da_offset == offset) - break; + dap = diradd_lookup(pagedep, offset); if (dap == NULL) { - - LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) - if (dap->da_offset == offset) - break; - if (dap == NULL) - return (dirrem); + /* + * Link the jremref structures into the dirrem so they are + * written prior to the pagedep. + */ + if (jremref) + dirrem_journal(dirrem, jremref, dotremref, + dotdotremref); + return (dirrem); } /* * Must be ATTACHED at this point. 
@@ -3373,7 +6891,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; - free_diradd(dap); + cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); +#ifdef SUJ_DEBUG + if (isrmdir == 0) { + struct worklist *wk; + + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) + panic("bad wk %p (0x%X)\n", wk, wk->wk_state); + } +#endif + return (dirrem); } @@ -3407,6 +6935,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; + struct jaddref *jaddref; struct mount *mp; offset = blkoff(dp->i_fs, dp->i_offset); @@ -3422,6 +6951,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; + LIST_INIT(&dap->da_jwork); } /* @@ -3454,11 +6984,21 @@ softdep_setup_directory_change(bp, dp, ip, newinum dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } FREE_LOCK(&lk); return; } + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. A valid nlinkdelta ensures that this lookup + * will not fail. + */ + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) + panic("softdep_setup_directory_change: Lost inodedep."); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active @@ -3483,15 +7023,29 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } /* - * Link into its inodedep. Put it on the id_bufwait list if the inode + * Look up the jaddref for this journal entry. We must finish + * initializing it and make the diradd write dependent on it. + * If we're not journaling, put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ - if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 || - (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); + if (mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_change: bad jaddref %p", + jaddref)); + jaddref->ja_diroff = dp->i_offset; + jaddref->ja_diradd = dap; + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], + dap, da_pdlist); + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); @@ -3500,6 +7054,13 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } + /* + * If we're making a new name for a directory that has not been + * committed we need to move the dot and dotdot references to + * this new name.
+ */ + if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) + merge_diradd(inodedep, dap); FREE_LOCK(&lk); } @@ -3516,8 +7077,7 @@ softdep_change_linkcnt(ip) struct inodedep *inodedep; ACQUIRE_LOCK(&lk); - (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, - DEPALLOC, &inodedep); + inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; @@ -3574,6 +7134,304 @@ softdep_releasefile(ip) } /* + * Attach a sbdep dependency to the superblock buf so that we can keep + * track of the head of the linked list of referenced but unlinked inodes. + */ +void +softdep_setup_sbupdate(ump, fs, bp) + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; +{ + struct sbdep *sbdep; + struct worklist *wk; + + if ((fs->fs_flags & FS_SUJ) == 0) + return; + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_SBDEP) + break; + if (wk != NULL) + return; + sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); + sbdep->sb_fs = fs; + sbdep->sb_ump = ump; + ACQUIRE_LOCK(&lk); + WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); + FREE_LOCK(&lk); +} + +/* + * Return the first unlinked inodedep which is ready to be the head of the + * list. The inodedep and all those after it must have valid next pointers. + */ +static struct inodedep * +first_unlinked_inodedep(ump) + struct ufsmount *ump; +{ + struct inodedep *inodedep; + struct inodedep *idp; + + for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); + inodedep; inodedep = idp) { + if ((inodedep->id_state & UNLINKNEXT) == 0) + return (NULL); + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) + break; + if ((inodedep->id_state & UNLINKPREV) == 0) + panic("first_unlinked_inodedep: prev != next"); + } + if (inodedep == NULL) + return (NULL); + + return (inodedep); +} + +/* + * Set the sujfree unlinked head pointer prior to writing a superblock. + */ +static void +initiate_write_sbdep(sbdep) + struct sbdep *sbdep; +{ + struct inodedep *inodedep; + struct fs *bpfs; + struct fs *fs; + + bpfs = sbdep->sb_fs; + fs = sbdep->sb_ump->um_fs; + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if (inodedep) { + fs->fs_sujfree = inodedep->id_ino; + inodedep->id_state |= UNLINKPREV; + } else + fs->fs_sujfree = 0; + bpfs->fs_sujfree = fs->fs_sujfree; +} + +/* + * After a superblock is written, determine whether it must be written again + * due to a changing unlinked list head. + */ +static int +handle_written_sbdep(sbdep, bp) + struct sbdep *sbdep; + struct buf *bp; +{ + struct inodedep *inodedep; + struct mount *mp; + struct fs *fs; + + fs = sbdep->sb_fs; + mp = UFSTOVFS(sbdep->sb_ump); + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || + (inodedep == NULL && fs->fs_sujfree != 0)) { + bdirty(bp); + return (1); + } + WORKITEM_FREE(sbdep, D_SBDEP); + if (fs->fs_sujfree == 0) + return (0); + if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) + panic("handle_written_sbdep: lost inodedep"); + /* + * Now that we have a record of this inode in stable store we can + * discard any pending work.
+ */ + for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { + if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) + panic("handle_written_sbdep: Bad inodedep %p (0x%X)", + inodedep, inodedep->id_state); + if (handle_bufwait(inodedep, NULL) != NULL) + panic("handle_written_sbdep: freefile on " + "unlinked inodedep"); + } + + return (0); +} + +/* + * Mark an inodedep as unlinked and insert it into the in-memory unlinked + * list. + */ +static void +unlinked_inodedep(mp, inodedep) + struct mount *mp; + struct inodedep *inodedep; +{ + struct ufsmount *ump; + + if ((mp->mnt_flag & MNT_SUJ) == 0) + return; + ump = VFSTOUFS(mp); + ump->um_fs->fs_fmod = 1; + inodedep->id_state |= UNLINKED; + TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); +} + +/* + * Remove an inodedep from the unlinked inodedep list. This may require + * disk writes if the inode has made it that far. + */ +static void +clear_unlinked_inodedep(inodedep) + struct inodedep *inodedep; +{ + struct ufsmount *ump; + struct inodedep *idp; + struct inodedep *idn; + struct fs *fs; + struct buf *bp; + ino_t ino; + ino_t nino; + ino_t pino; + int error; + + ump = VFSTOUFS(inodedep->id_list.wk_mp); + fs = ump->um_fs; + ino = inodedep->id_ino; + error = 0; + for (;;) { + /* + * If nothing has yet been written simply remove us from + * the in memory list and return. This is the most common + * case where handle_workitem_remove() loses the final + * reference. + */ + if ((inodedep->id_state & UNLINKLINKS) == 0) + break; + /* + * If we have a NEXT pointer and no PREV pointer we can simply + * clear NEXT's PREV and remove ourselves from the list. Be + * careful not to clear PREV if the superblock points at + * next as well. + */ + idn = TAILQ_NEXT(inodedep, id_unlinked); + if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { + if (idn && fs->fs_sujfree != idn->id_ino) + idn->id_state &= ~UNLINKPREV; + break; + } + /* + * Here we have an inodedep which is actually linked into + * the list. We must remove it by forcing a write to the + * link before us, whether it be the superblock or an inode. + * Unfortunately the list may change while we're waiting + * on the buf lock for either resource so we must loop until + * we lock the right one. If both the superblock and an + * inode point to this inode we must clear the inode first + * followed by the superblock. + */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + pino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + pino = idp->id_ino; + FREE_LOCK(&lk); + if (pino == 0) + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + else + error = bread(ump->um_devvp, + fsbtodb(fs, ino_to_fsba(fs, pino)), + (int)fs->fs_bsize, NOCRED, &bp); + ACQUIRE_LOCK(&lk); + if (error) + break; + /* If the list has changed restart the loop. */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + nino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + nino = idp->id_ino; + if (nino != pino || + (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); + continue; + } + /* + * Remove us from the in memory list. After this we cannot + * access the inodedep. + */ + idn = TAILQ_NEXT(inodedep, id_unlinked); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + /* + * Determine the next inode number.
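 *
 * Editor's sketch, not part of this change: the on-disk shape of the
 * unlinked-inode list manipulated here.  fs_sujfree names the head,
 * each inode's di_freelink names its successor, and 0 terminates the
 * chain.  The types and helper are toy stand-ins.
 */
#if 0
#include <stdint.h>

struct toy_dinode {
	uint32_t di_freelink;	/* inode number of next unlinked inode */
};

static uint32_t
toy_next_unlinked(const struct toy_dinode *itab, uint32_t sujfree,
    uint32_t ino)
{
	/* Inode 0 is never allocated, so 0 doubles as the terminator. */
	return (ino == 0 ? sujfree : itab[ino].di_freelink);
}
#endif
/*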
+ */ + nino = 0; + if (idn) { + /* + * If next isn't on the list we can just clear prev's + * state and schedule it to be fixed later. No need + * to synchronously write if we're not in the real + * list. + */ + if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { + idp->id_state &= ~UNLINKNEXT; + if ((idp->id_state & ONWORKLIST) == 0) + WORKLIST_INSERT(&bp->b_dep, + &idp->id_list); + FREE_LOCK(&lk); + bawrite(bp); + ACQUIRE_LOCK(&lk); + return; + } + nino = idn->id_ino; + } + FREE_LOCK(&lk); + /* + * The predecessor's next pointer is manually updated here + * so that the NEXT flag is never cleared for an element + * that is in the list. + */ + if (pino == 0) { + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + } else if (fs->fs_magic == FS_UFS1_MAGIC) + ((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + else + ((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + /* + * If the bwrite fails we have no recourse to recover. The + * filesystem is corrupted already. + */ + bwrite(bp); + ACQUIRE_LOCK(&lk); + /* + * If the superblock pointer still needs to be cleared force + * a write here. + */ + if (fs->fs_sujfree == ino) { + FREE_LOCK(&lk); + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + bwrite(bp); + ACQUIRE_LOCK(&lk); + } + if (fs->fs_sujfree != ino) + return; + panic("clear_unlinked_inodedep: Failed to clear free head"); + } + if (inodedep->id_ino == fs->fs_sujfree) + panic("clear_unlinked_inodedep: Freeing head of free list"); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + return; +} + +/* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ @@ -3584,23 +7442,55 @@ handle_workitem_remove(dirrem, xp) { struct thread *td = curthread; struct inodedep *inodedep; + struct workhead dotdotwk; + struct worklist *wk; + struct ufsmount *ump; + struct mount *mp; struct vnode *vp; struct inode *ip; ino_t oldinum; int error; + if (dirrem->dm_state & ONWORKLIST) + panic("handle_workitem_remove: dirrem %p still on worklist", + dirrem); + oldinum = dirrem->dm_oldinum; + mp = dirrem->dm_list.wk_mp; + ump = VFSTOUFS(mp); if ((vp = xp) == NULL && - (error = ffs_vgetf(dirrem->dm_list.wk_mp, - dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { + (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ)) != 0) { softdep_error("handle_workitem_remove: vget", error); return; } ip = VTOI(vp); ACQUIRE_LOCK(&lk); - if ((inodedep_lookup(dirrem->dm_list.wk_mp, - dirrem->dm_oldinum, 0, &inodedep)) == 0) + if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); + if (dirrem->dm_state & ONDEPLIST) + LIST_REMOVE(dirrem, dm_inonext); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_workitem_remove: Journal entries not written.")); + /* + * Move all dependencies waiting on the remove to complete + * from the dirrem to the inode inowait list to be completed + * after the inode has been updated and written to disk. Any + * marked MKDIR_PARENT are saved to be completed when the .. ref + * is removed. 
+ */ + LIST_INIT(&dotdotwk); + while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { + WORKLIST_REMOVE(wk); + if (wk->wk_state & MKDIR_PARENT) { + wk->wk_state &= ~MKDIR_PARENT; + WORKLIST_INSERT(&dotdotwk, wk); + continue; + } + WORKLIST_INSERT(&inodedep->id_inowait, wk); + } + LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); + /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { @@ -3609,12 +7499,16 @@ handle_workitem_remove(dirrem, xp) ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; num_dirrem -= 1; + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: worklist not empty. %s", + TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); - vput(vp); - return; + goto out; } /* * Directory deletion. Decrement reference count for both the @@ -3628,6 +7522,8 @@ handle_workitem_remove(dirrem, xp) ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(&lk); if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0) @@ -3639,36 +7535,47 @@ handle_workitem_remove(dirrem, xp) * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: DIRCHG and worklist not empty.")); num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); - vput(vp); - return; + goto out; } + dirrem->dm_state = ONDEPLIST; + dirrem->dm_oldinum = dirrem->dm_dirinum; /* - * If the inodedep does not exist, then the zero'ed inode has - * been written to disk. If the allocated inode has never been - * written to disk, then the on-disk inode is zero'ed. In either - * case we can remove the file immediately. + * Place the dirrem on the parent's diremhd list. */ - dirrem->dm_state = 0; - oldinum = dirrem->dm_oldinum; - dirrem->dm_oldinum = dirrem->dm_dirinum; - if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum, - 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { + if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) + panic("handle_workitem_remove: lost dir inodedep"); + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); + /* + * If the allocated inode has never been written to disk, then + * the on-disk inode is zero'ed and we can remove the file + * immediately. When journaling if the inode has been marked + * unlinked and not DEPCOMPLETE we know it can never be written. 
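 *
 * Editor's sketch, not part of this change: the "never reached disk"
 * test applied just below, restated over plain flag bits.  The bit
 * values are illustrative, not the kernel's.
 */
#if 0
#define T_DEPCOMPLETE	0x0001
#define T_UNLINKED	0x0002

static int
toy_inode_never_written(int have_inodedep, int state)
{
	/* No inodedep: the zeroed inode already reached the disk.
	 * UNLINKED without DEPCOMPLETE: the bitmap write that would
	 * make the inode visible never happened. */
	return (have_inodedep == 0 ||
	    (state & (T_DEPCOMPLETE | T_UNLINKED)) == T_UNLINKED);
}
#endif
/*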
+ */ + inodedep_lookup(mp, oldinum, 0, &inodedep); + if (inodedep == NULL || + (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || + check_inode_unwritten(inodedep)) { if (xp != NULL) - add_to_worklist(&dirrem->dm_list); + add_to_worklist(&dirrem->dm_list, 0); FREE_LOCK(&lk); - vput(vp); - if (xp == NULL) + if (xp == NULL) { + vput(vp); handle_workitem_remove(dirrem, NULL); + } return; } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); ip->i_flag |= IN_CHANGE; +out: ffs_update(vp, 0); - vput(vp); + if (xp == NULL) + vput(vp); } /* @@ -3689,6 +7596,7 @@ static void handle_workitem_freefile(freefile) struct freefile *freefile; { + struct workhead wkhd; struct fs *fs; struct inodedep *idp; struct ufsmount *ump; @@ -3701,13 +7609,15 @@ handle_workitem_freefile(freefile) error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(&lk); if (error) - panic("handle_workitem_freefile: inodedep survived"); + panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); + LIST_INIT(&wkhd); + LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, - freefile->fx_oldinum, freefile->fx_mode)) != 0) + freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefile, D_FREEFILE); @@ -3757,8 +7667,10 @@ softdep_disk_io_initiation(bp) { struct worklist *wk; struct worklist marker; - struct indirdep *indirdep; struct inodedep *inodedep; + struct freeblks *freeblks; + struct jfreeblk *jfreeblk; + struct newblk *newblk; /* * We only care about write operations. There should never @@ -3767,6 +7679,10 @@ softdep_disk_io_initiation(bp) if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("softdep_disk_io_initiation: Writing buffer with " + "background write in progress: %p", bp); + marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ @@ -3792,46 +7708,58 @@ softdep_disk_io_initiation(bp) continue; case D_INDIRDEP: - indirdep = WK_INDIRDEP(wk); - if (indirdep->ir_state & GOINGAWAY) - panic("disk_io_initiation: indirdep gone"); + initiate_write_indirdep(WK_INDIRDEP(wk), bp); + continue; + + case D_BMSAFEMAP: + initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); + continue; + + case D_JSEG: + WK_JSEG(wk)->js_buf = NULL; + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd); /* - * If there are no remaining dependencies, this - * will be writing the real pointers, so the - * dependency can be freed. + * We have to wait for the jfreeblks to be journaled + * before we can write an inodeblock with updated + * pointers. Be careful to arrange the marker so + * we revisit the jfreeblk if it's not removed by + * the first jwait(). 
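 *
 * Editor's sketch, not part of this change: the marker pattern used by
 * the D_FREEBLKS and D_ALLOCDIRECT/D_ALLOCINDIR cases below, standalone
 * over <sys/queue.h>; the item type is illustrative.
 */
#if 0
#include <sys/queue.h>

struct titem {
	LIST_ENTRY(titem) t_list;
};

static void
toy_mark_for_revisit(struct titem *marker, struct titem *wk)
{
	/* Park the marker just before the item we are about to sleep
	 * on; the scan resumes at LIST_NEXT(marker) and so sees the
	 * item again if the journal wait did not retire it. */
	LIST_REMOVE(marker, t_list);
	LIST_INSERT_BEFORE(wk, marker, t_list);
}
#endif
/*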
*/ - if (LIST_EMPTY(&indirdep->ir_deplisthd)) { - struct buf *bp; - - bp = indirdep->ir_savebp; - bp->b_flags |= B_INVAL | B_NOCACHE; - /* inline expand WORKLIST_REMOVE(wk); */ - wk->wk_state &= ~ONWORKLIST; - LIST_REMOVE(wk, wk_list); - WORKITEM_FREE(indirdep, D_INDIRDEP); - FREE_LOCK(&lk); - brelse(bp); - ACQUIRE_LOCK(&lk); - continue; + if (jfreeblk != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&jfreeblk->jf_list); } + continue; + case D_ALLOCDIRECT: + case D_ALLOCINDIR: /* - * Replace up-to-date version with safe version. + * We have to wait for the jnewblk to be journaled + * before we can write to a block otherwise the + * contents may be confused with an earlier file + * at recovery time. Handle the marker as described + * above. */ - FREE_LOCK(&lk); - indirdep->ir_saveddata = malloc(bp->b_bcount, - M_INDIRDEP, M_SOFTDEP_FLAGS); - ACQUIRE_LOCK(&lk); - indirdep->ir_state &= ~ATTACHED; - indirdep->ir_state |= UNDONE; - bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); - bcopy(indirdep->ir_savebp->b_data, bp->b_data, - bp->b_bcount); + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&newblk->nb_jnewblk->jn_list); + } continue; + case D_SBDEP: + initiate_write_sbdep(WK_SBDEP(wk)); + continue; + case D_MKDIR: - case D_BMSAFEMAP: - case D_ALLOCDIRECT: - case D_ALLOCINDIR: + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: continue; default: @@ -3855,6 +7783,9 @@ initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { + struct jremref *jremref; + struct jmvref *jmvref; + struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; @@ -3869,6 +7800,18 @@ initiate_write_filepage(pagedep, bp) return; } pagedep->pd_state |= IOSTARTED; + /* + * Wait for all journal remove dependencies to hit the disk. + * We can not allow any potentially conflicting directory adds + * to be visible before removes and rollback is too difficult. + * lk may be dropped and re-acquired, however we hold the buf + * locked so the dependency can not go away. + */ + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) + while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) + jwait(&jremref->jr_list); + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) + jwait(&jmvref->jm_list); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) @@ -3905,6 +7848,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; + struct inoref *inoref; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS @@ -3918,7 +7862,21 @@ initiate_write_inodeblock_ufs1(inodedep, bp) fs = inodedep->id_fs; dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); + /* + * If we're on the unlinked list but have not yet written our + * next pointer initialize it here. + */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if (inon) + dp->di_freelink = inon->id_ino; + else + dp->di_freelink = 0; + } + /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. 
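 *
 * Editor's sketch, not part of this change: the rollback performed
 * just below when the inode bitmap is still unwritten -- publish a
 * zeroed dinode while keeping the real copy, preserving only the
 * fields recovery depends on.  The struct is a toy stand-in for the
 * ufs1 dinode.
 */
#if 0
#include <stdint.h>
#include <string.h>

struct toy_dinode {
	uint32_t di_gen;	/* generation number */
	uint32_t di_freelink;	/* unlinked-list successor */
	/* ... remaining fields elided ... */
};

static void
toy_rollback_dinode(struct toy_dinode *ondisk, struct toy_dinode *saved)
{
	*saved = *ondisk;			/* keep the real copy */
	memset(ondisk, 0, sizeof(*ondisk));	/* zeros go to disk */
	ondisk->di_gen = saved->di_gen;		/* generation survives */
	ondisk->di_freelink = saved->di_freelink; /* so does the chain */
}
#endif
/*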
*/ @@ -3933,6 +7891,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; + dp->di_freelink = inodedep->id_savedino1->di_freelink; return; } /* @@ -3940,32 +7899,40 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; - if (TAILQ_EMPTY(&inodedep->id_inoupdt)) + inodedep->id_savednlink = dp->di_nlink; + if (TAILQ_EMPTY(&inodedep->id_inoupdt) && + TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* + * Revert the link count to that of the first unwritten journal entry. + */ + inoref = TAILQ_FIRST(&inodedep->id_inoreflst); + if (inoref) + dp->di_nlink = inoref->if_nlink; + /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn - NDADDR, - dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -3981,14 +7948,14 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4012,8 +7979,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4030,7 +7997,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * postpone fsck, we are stuck with this argument. 
*/ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_lbn - NDADDR] = 0; + dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* @@ -4051,6 +8018,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp) struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; + struct inoref *inoref; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS @@ -4064,7 +8032,21 @@ initiate_write_inodeblock_ufs2(inodedep, bp) fs = inodedep->id_fs; dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); + /* + * If we're on the unlinked list but have not yet written our + * next pointer initialize it here. + */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if (inon) + dp->di_freelink = inon->id_ino; + else + dp->di_freelink = 0; + } + + /* + * If the bitmap is not yet written, then the allocated + * inode cannot be written to disk. */ @@ -4079,6 +8061,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp) *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; + dp->di_freelink = inodedep->id_savedino2->di_freelink; return; } /* @@ -4086,25 +8069,38 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; + inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && - TAILQ_EMPTY(&inodedep->id_extupdt)) + TAILQ_EMPTY(&inodedep->id_extupdt) && + TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* + * Revert the link count to that of the first unwritten journal entry. + * + * XXX What if it is canceled? Could entries after it be expired + * before we remove this? Thus leaving us with an incorrect link on + * disk with no journal entries to cover it? + */ + inoref = TAILQ_FIRST(&inodedep->id_inoreflst); + if (inoref) + dp->di_nlink = inoref->if_nlink; + + /* + * Set the ext data dependencies to busy.
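 *
 * Editor's sketch, not part of this change: the INVARIANTS bookkeeping
 * idiom used by the loops below.  Each dependency's block offset sets
 * a bit in 'deplist' so the rollback loops can assert that every
 * pointer they touch is covered by a dependency.  The helper is
 * hypothetical.
 */
#if 0
static int
toy_record_dep(long *deplist, int offset)
{
	if (*deplist & (1L << offset))
		return (-1);	/* duplicate offset: ordering bug */
	*deplist |= 1L << offset;
	return (0);
}
#endif
/*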
*/ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4120,12 +8116,12 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NXADDR; i++) { + dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4142,8 +8138,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; @@ -4154,24 +8150,24 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", - (intmax_t)adp->ad_lbn - NDADDR, - (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4187,14 +8183,14 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = 
TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); @@ -4218,8 +8214,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4236,16 +8232,363 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_lbn - NDADDR] = 0; + dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* + * Cancel an indirdep as a result of truncation. Release all of the + * children allocindirs and place their journal work on the appropriate + * list. + */ +static void +cancel_indirdep(indirdep, bp, inodedep, freeblks) + struct indirdep *indirdep; + struct buf *bp; + struct inodedep *inodedep; + struct freeblks *freeblks; +{ + struct allocindir *aip; + + /* + * None of the indirect pointers will ever be visible, + * so they can simply be tossed. GOINGAWAY ensures + * that allocated pointers will be saved in the buffer + * cache until they are freed. Note that they will + * only be able to be found by their physical address + * since the inode mapping the logical address will + * be gone. The save buffer used for the safe copy + * was allocated in setup_allocindir_phase2 using + * the physical address so it could be used for this + * purpose. Hence we swap the safe copy with the real + * copy, allowing the safe copy to be freed and holding + * on to the real copy for later use in indir_trunc. + */ + if (indirdep->ir_state & GOINGAWAY) + panic("cancel_indirdep: already gone"); + if (indirdep->ir_state & ONDEPLIST) { + indirdep->ir_state &= ~ONDEPLIST; + LIST_REMOVE(indirdep, ir_next); + } + indirdep->ir_state |= GOINGAWAY; + VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; + while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); + WORKLIST_REMOVE(&indirdep->ir_list); + WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); + indirdep->ir_savebp = NULL; +} + +/* + * Free an indirdep once it no longer has new pointers to track. 
+ */ +static void +free_indirdep(indirdep) + struct indirdep *indirdep; +{ + + KASSERT(LIST_EMPTY(&indirdep->ir_jwork), + ("free_indirdep: Journal work not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_completehd), + ("free_indirdep: Complete head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_writehd), + ("free_indirdep: write head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_donehd), + ("free_indirdep: done head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), + ("free_indirdep: deplist head not empty.")); + KASSERT(indirdep->ir_savebp == NULL, + ("free_indirdep: %p ir_savebp != NULL", indirdep)); + KASSERT((indirdep->ir_state & ONDEPLIST) == 0, + ("free_indirdep: %p still on deplist.", indirdep)); + if (indirdep->ir_state & ONWORKLIST) + WORKLIST_REMOVE(&indirdep->ir_list); + WORKITEM_FREE(indirdep, D_INDIRDEP); +} + +/* + * Called before a write to an indirdep. This routine is responsible for + * rolling back pointers to a safe state which includes only those + * allocindirs which have been completed. + */ +static void +initiate_write_indirdep(indirdep, bp) + struct indirdep *indirdep; + struct buf *bp; +{ + + if (indirdep->ir_state & GOINGAWAY) + panic("disk_io_initiation: indirdep gone"); + + /* + * If there are no remaining dependencies, this will be writing + * the real pointers. + */ + if (LIST_EMPTY(&indirdep->ir_deplisthd)) + return; + /* + * Replace up-to-date version with safe version. + */ + FREE_LOCK(&lk); + indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, + M_SOFTDEP_FLAGS); + ACQUIRE_LOCK(&lk); + indirdep->ir_state &= ~ATTACHED; + indirdep->ir_state |= UNDONE; + bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); + bcopy(indirdep->ir_savebp->b_data, bp->b_data, + bp->b_bcount); +} + +/* + * Called when an inode has been cleared in a cg bitmap. This finally + * eliminates any canceled jaddrefs + */ +void +softdep_setup_inofree(mp, bp, ino, wkhd) + struct mount *mp; + struct buf *bp; + ino_t ino; + struct workhead *wkhd; +{ + struct worklist *wk, *wkn; + struct inodedep *inodedep; + uint8_t *inosused; + struct cg *cgp; + struct fs *fs; + + ACQUIRE_LOCK(&lk); + fs = VFSTOUFS(mp)->um_fs; + cgp = (struct cg *)bp->b_data; + inosused = cg_inosused(cgp); + if (isset(inosused, ino % fs->fs_ipg)) + panic("softdep_setup_inofree: inode %d not freed.", ino); + if (inodedep_lookup(mp, ino, 0, &inodedep)) + panic("softdep_setup_inofree: ino %d has existing inodedep %p", + ino, inodedep); + if (wkhd) { + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JADDREF) + continue; + WORKLIST_REMOVE(wk); + /* + * We can free immediately even if the jaddref + * isn't attached in a background write as now + * the bitmaps are reconciled. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jaddref(WK_JADDREF(wk)); + } + jwork_move(&bp->b_dep, wkhd); + } + FREE_LOCK(&lk); +} + + +/* + * Called via ffs_blkfree() after a set of frags has been cleared from a cg + * map. Any dependencies waiting for the write to clear are added to the + * buf's list and any jnewblks that are being canceled are discarded + * immediately. 
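 *
 * Editor's sketch, not part of this change: the half-open overlap test
 * used by the SUJ_DEBUG check in softdep_setup_blkfree() below, over
 * plain integers.  [b0, b1) is the fragment range being freed and
 * [j0, j1) the range claimed by a pending jnewblk.
 */
#if 0
static int
toy_frag_ranges_overlap(long b0, long b1, long j0, long j1)
{
	return ((b0 >= j0 && b0 < j1) || (b1 > j0 && b1 <= j1));
}
#endif
/*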
+ */ +void +softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) + struct mount *mp; + struct buf *bp; + ufs2_daddr_t blkno; + int frags; + struct workhead *wkhd; +{ + struct jnewblk *jnewblk; + struct worklist *wk, *wkn; +#ifdef SUJ_DEBUG + struct bmsafemap *bmsafemap; + struct fs *fs; + uint8_t *blksfree; + struct cg *cgp; + ufs2_daddr_t jstart; + ufs2_daddr_t jend; + ufs2_daddr_t end; + long bno; + int i; +#endif + + ACQUIRE_LOCK(&lk); + /* + * Detach any jnewblks which have been canceled. They must linger + * until the bitmap is cleared again by ffs_blkfree() to prevent + * an unjournaled allocation from hitting the disk. + */ + if (wkhd) { + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JNEWBLK) + continue; + jnewblk = WK_JNEWBLK(wk); + KASSERT(jnewblk->jn_state & GOINGAWAY, + ("softdep_setup_blkfree: jnewblk not canceled.")); + WORKLIST_REMOVE(wk); +#ifdef SUJ_DEBUG + /* + * Assert that this block is free in the bitmap + * before we discard the jnewblk. + */ + fs = VFSTOUFS(mp)->um_fs; + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; + i < jnewblk->jn_frags; i++) { + if (isset(blksfree, bno + i)) + continue; + panic("softdep_setup_blkfree: not free"); + } +#endif + /* + * Even if it's not attached we can free immediately + * as the new bitmap is correct. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jnewblk(jnewblk); + } + /* + * The buf must be locked by the caller otherwise these could + * be added while it's being written and the write would + * complete them before they made it to disk. + */ + jwork_move(&bp->b_dep, wkhd); + } + +#ifdef SUJ_DEBUG + /* + * Assert that we are not freeing a block which has an outstanding + * allocation dependency. + */ + fs = VFSTOUFS(mp)->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); + end = blkno + frags; + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + /* + * Don't match against blocks that will be freed when the + * background write is done. + */ + if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == + (COMPLETE | DEPCOMPLETE)) + continue; + jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; + jend = jnewblk->jn_blkno + jnewblk->jn_frags; + if ((blkno >= jstart && blkno < jend) || + (end > jstart && end <= jend)) { + printf("state 0x%X %jd - %d %d dep %p\n", + jnewblk->jn_state, jnewblk->jn_blkno, + jnewblk->jn_oldfrags, jnewblk->jn_frags, + jnewblk->jn_newblk); + panic("softdep_setup_blkfree: " + "%jd-%jd(%d) overlaps with %jd-%jd", + blkno, end, frags, jstart, jend); + } + } +#endif + FREE_LOCK(&lk); +} + +static void +initiate_write_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; /* The cg block. */ +{ + struct jaddref *jaddref; + struct jnewblk *jnewblk; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + int cleared; + ino_t ino; + long bno; + int i; + + if (bmsafemap->sm_state & IOSTARTED) + panic("initiate_write_bmsafemap: Already started\n"); + bmsafemap->sm_state |= IOSTARTED; + /* + * Clear any inode allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { + ino = jaddref->ja_ino % fs->fs_ipg; + /* + * If this is a background copy the inode may not + * be marked used yet. 
+ */ + if (isset(inosused, ino)) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir--; + cgp->cg_cs.cs_nifree++; + clrbit(inosused, ino); + jaddref->ja_state &= ~ATTACHED; + jaddref->ja_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: inode %d " + "marked free", jaddref->ja_ino); + } + } + /* + * Clear any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + bno = dtogd(fs, jnewblk->jn_blkno); + cleared = 0; + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (isclr(blksfree, bno + i)) { + cleared = 1; + setbit(blksfree, bno + i); + } + } + /* + * We may not clear the block if it's a background + * copy. In that case there is no reason to detach + * it. + */ + if (cleared) { + jnewblk->jn_state &= ~ATTACHED; + jnewblk->jn_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: block %jd " + "marked free", jnewblk->jn_blkno); + } + } + /* + * Move allocation lists to the written lists so they can be + * cleared once the block write is complete. + */ + LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, + inodedep, id_deps); + LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, + newblk, nb_deps); +} + +/* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. + * */ static void softdep_disk_write_complete(bp) @@ -4254,12 +8597,7 @@ softdep_disk_write_complete(bp) struct worklist *wk; struct worklist *owk; struct workhead reattach; - struct newblk *newblk; - struct allocindir *aip; - struct allocdirect *adp; - struct indirdep *indirdep; - struct inodedep *inodedep; - struct bmsafemap *bmsafemap; + struct buf *sbp; /* * If an error occurred while doing the write, then the data @@ -4271,8 +8609,9 @@ softdep_disk_write_complete(bp) /* * This lock must not be released anywhere in this code segment. 
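+ * + * In outline, the completion loop below drains and dispatches the + * buffer's dependency list, guarding against an item that re-queues + * itself without making progress (the same item coming off the head + * twice in a row), roughly: + * + * owk = NULL; + * while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { + * WORKLIST_REMOVE(wk); + * if (wk == owk) + * panic("no progress"); + * owk = wk; + * switch (wk->wk_type) { ... } + * }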
*/ + sbp = NULL; + owk = NULL; ACQUIRE_LOCK(&lk); - owk = NULL; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); if (wk == owk) @@ -4291,33 +8630,8 @@ softdep_disk_write_complete(bp) continue; case D_BMSAFEMAP: - bmsafemap = WK_BMSAFEMAP(wk); - while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { - newblk->nb_state |= DEPCOMPLETE; - newblk->nb_bmsafemap = NULL; - LIST_REMOVE(newblk, nb_deps); - } - while ((adp = - LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - LIST_REMOVE(adp, ad_deps); - handle_allocdirect_partdone(adp); - } - while ((aip = - LIST_FIRST(&bmsafemap->sm_allocindirhd))) { - aip->ai_state |= DEPCOMPLETE; - aip->ai_buf = NULL; - LIST_REMOVE(aip, ai_deps); - handle_allocindir_partdone(aip); - } - while ((inodedep = - LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { - inodedep->id_state |= DEPCOMPLETE; - LIST_REMOVE(inodedep, id_deps); - inodedep->id_buf = NULL; - } - WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); continue; case D_MKDIR: @@ -4325,37 +8639,47 @@ softdep_disk_write_complete(bp) continue; case D_ALLOCDIRECT: - adp = WK_ALLOCDIRECT(wk); - adp->ad_state |= COMPLETE; - handle_allocdirect_partdone(adp); + wk->wk_state |= COMPLETE; + handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); continue; case D_ALLOCINDIR: - aip = WK_ALLOCINDIR(wk); - aip->ai_state |= COMPLETE; - handle_allocindir_partdone(aip); + wk->wk_state |= COMPLETE; + handle_allocindir_partdone(WK_ALLOCINDIR(wk)); continue; case D_INDIRDEP: - indirdep = WK_INDIRDEP(wk); - if (indirdep->ir_state & GOINGAWAY) - panic("disk_write_complete: indirdep gone"); - bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); - free(indirdep->ir_saveddata, M_INDIRDEP); - indirdep->ir_saveddata = 0; - indirdep->ir_state &= ~UNDONE; - indirdep->ir_state |= ATTACHED; - while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { - handle_allocindir_partdone(aip); - if (aip == LIST_FIRST(&indirdep->ir_donehd)) - panic("disk_write_complete: not gone"); - } - WORKLIST_INSERT(&reattach, wk); - if ((bp->b_flags & B_DELWRI) == 0) - stat_indir_blk_ptrs++; - bdirty(bp); + if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) + WORKLIST_INSERT(&reattach, wk); continue; + case D_FREEBLKS: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(wk, 1); + continue; + + case D_FREEWORK: + handle_written_freework(WK_FREEWORK(wk)); + break; + + case D_FREEDEP: + free_freedep(WK_FREEDEP(wk)); + continue; + + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + + case D_JSEG: + handle_written_jseg(WK_JSEG(wk), bp); + continue; + + case D_SBDEP: + if (handle_written_sbdep(WK_SBDEP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); + continue; + default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); @@ -4370,6 +8694,8 @@ softdep_disk_write_complete(bp) WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(&lk); + if (sbp) + brelse(sbp); } /* @@ -4378,18 +8704,17 @@ softdep_disk_write_complete(bp) * splbio interrupts blocked. */ static void -handle_allocdirect_partdone(adp) +handle_allocdirect_partdone(adp, wkhd) struct allocdirect *adp; /* the completed allocdirect */ + struct workhead *wkhd; /* Work to do when inode is written.
*/ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; - long bsize, delay; + long bsize; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - if (adp->ad_buf != NULL) - panic("handle_allocdirect_partdone: dangling dep"); /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode @@ -4439,25 +8764,27 @@ static void return; } /* - * If we have found the just finished dependency, then free + * If we have found the just finished dependency, then queue * it along with anything that follows it that is complete. - * If the inode still has a bitmap dependency, then it has - * never been written to disk, hence the on-disk inode cannot - * reference the old fragment so we can free it without delay. + * Since the pointer has not yet been written in the inode + * as the dependency prevents it, place the allocdirect on the + * bufwait list where it will be freed once the pointer is + * valid. */ - delay = (inodedep->id_state & DEPCOMPLETE); + if (wkhd == NULL) + wkhd = &inodedep->id_bufwait; for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - free_allocdirect(listhead, adp, delay); + TAILQ_REMOVE(listhead, adp, ad_next); + WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); } } /* - * Called from within softdep_disk_write_complete above. Note that - * this routine is always called from interrupt level with further - * splbio interrupts blocked. + * Called from within softdep_disk_write_complete above. This routine + * completes successfully written allocindirs. */ static void handle_allocindir_partdone(aip) @@ -4467,11 +8794,9 @@ handle_allocindir_partdone(aip) if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; - if (aip->ai_buf != NULL) - panic("handle_allocindir_partdone: dangling dependency"); indirdep = aip->ai_indirdep; + LIST_REMOVE(aip, ai_next); if (indirdep->ir_state & UNDONE) { - LIST_REMOVE(aip, ai_next); LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } @@ -4481,13 +8806,130 @@ handle_allocindir_partdone(aip) else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; - LIST_REMOVE(aip, ai_next); - if (aip->ai_freefrag != NULL) - add_to_worklist(&aip->ai_freefrag->ff_list); - WORKITEM_FREE(aip, D_ALLOCINDIR); + /* + * Await the pointer write before freeing the allocindir. + */ + LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); } /* + * Release segments held on a jwork list. + */ +static void +handle_jwork(wkhd) + struct workhead *wkhd; +{ + struct worklist *wk; + + while ((wk = LIST_FIRST(wkhd)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + default: + panic("handle_jwork: Unknown type %s\n", + TYPENAME(wk->wk_type)); + } + } +} + +/* + * Handle the bufwait list on an inode when it is safe to release items + * held there. This normally happens after an inode block is written but + * may be delayed and handled later if there are pending journal items that + * are not yet safe to be released.
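+ * + * Several of the cases below reduce to "release now, or park on the + * caller's list until the bitmap write makes it safe". With the + * simplified list types sketched earlier (not the kernel API), the + * shape is: + * + * static void + * release_or_park(struct whead *src, struct whead *later) + * { + * struct witem *w; + * + * while ((w = LIST_FIRST(src)) != NULL) { + * LIST_REMOVE(w, w_next); + * if (later != NULL) + * LIST_INSERT_HEAD(later, w, w_next); + * else + * free(w); + * } + * }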
+ */ +static struct freefile * +handle_bufwait(inodedep, refhd) + struct inodedep *inodedep; + struct workhead *refhd; +{ + struct jaddref *jaddref; + struct freefile *freefile; + struct worklist *wk; + + freefile = NULL; + while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_FREEFILE: + /* + * We defer adding freefile to the worklist + * until all other additions have been made to + * ensure that it will be done after all the + * old blocks have been freed. + */ + if (freefile != NULL) + panic("handle_bufwait: freefile"); + freefile = WK_FREEFILE(wk); + continue; + + case D_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); + continue; + + case D_DIRADD: + diradd_inode_written(WK_DIRADD(wk), inodedep); + continue; + + case D_FREEFRAG: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(wk, 0); + continue; + + case D_DIRREM: + wk->wk_state |= COMPLETE; + add_to_worklist(wk, 0); + continue; + + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + free_newblk(WK_NEWBLK(wk)); + continue; + + case D_JNEWBLK: + wk->wk_state |= COMPLETE; + free_jnewblk(WK_JNEWBLK(wk)); + continue; + + /* + * Save freed journal segments and add references on + * the supplied list which will delay their release + * until the cg bitmap is cleared on disk. + */ + case D_JSEGDEP: + if (refhd == NULL) + free_jsegdep(WK_JSEGDEP(wk)); + else + WORKLIST_INSERT(refhd, wk); + continue; + + case D_JADDREF: + jaddref = WK_JADDREF(wk); + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + /* + * Transfer any jaddrefs to the list to be freed with + * the bitmap if we're handling a removed file. + */ + if (refhd == NULL) { + wk->wk_state |= COMPLETE; + free_jaddref(jaddref); + } else + WORKLIST_INSERT(refhd, wk); + continue; + + default: + panic("handle_bufwait: Unknown type %p(%s)", + wk, TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + return (freefile); +} +/* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further @@ -4498,12 +8940,17 @@ handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { - struct worklist *wk, *filefree; + struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; + struct workhead wkhd; int hadchanges, fstype; + ino_t freelink; + LIST_INIT(&wkhd); + hadchanges = 0; + freefile = NULL; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; @@ -4511,12 +8958,30 @@ handle_written_inodeblock(inodedep, bp) fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp1->di_freelink; } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp2->di_freelink; } /* + * If we wrote a freelink pointer during the last write, record it + * here. If we did not, keep the buffer dirty until we do.
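+ * + * The test below distills to "does the freelink that reached the disk + * match our in-memory successor on the unlinked list"; with inon the + * next inodedep on that list, the write is only considered recorded + * when: + * + * (inon == NULL && freelink == 0) || + * (inon != NULL && inon->id_ino == freelink)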
+ */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if ((inon == NULL && freelink == 0) || + (inon && inon->id_ino == freelink)) { + if (inon) + inon->id_state |= UNLINKPREV; + inodedep->id_state |= UNLINKNEXT; + } else + hadchanges = 1; + } + /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until @@ -4524,6 +8989,7 @@ handle_written_inodeblock(inodedep, bp) * corresponding updates written to disk. */ if (inodedep->id_savedino1 != NULL) { + hadchanges = 1; if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else @@ -4533,6 +8999,13 @@ handle_written_inodeblock(inodedep, bp) if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); + /* + * If the inode is clear here and GOINGAWAY it will never + * be written. Process the bufwait and clear any pending + * work which may include the freefile. + */ + if (inodedep->id_state & GOINGAWAY) + goto bufwait; return (1); } inodedep->id_state |= COMPLETE; @@ -4540,50 +9013,49 @@ handle_written_inodeblock(inodedep, bp) * Roll forward anything that had to be rolled back before * the inode could be updated. */ - hadchanges = 0; for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { - if (adp->ad_lbn < NDADDR) { - if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", - (intmax_t)adp->ad_lbn, - dp1->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp1->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, - dp1->di_ib[adp->ad_lbn - NDADDR]); - dp1->di_ib[adp->ad_lbn - NDADDR] = + (intmax_t)adp->ad_offset - NDADDR, + dp1->di_ib[adp->ad_offset - NDADDR]); + dp1->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } else { - if (adp->ad_lbn < NDADDR) { - if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, + (intmax_t)adp->ad_offset - NDADDR, (intmax_t) - dp2->di_ib[adp->ad_lbn - NDADDR]); - dp2->di_ib[adp->ad_lbn - NDADDR] = + dp2->di_ib[adp->ad_offset - NDADDR]); + dp2->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } @@ -4595,13 +9067,13 @@ handle_written_inodeblock(inodedep, bp) nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) 
panic("handle_written_inodeblock: new entry"); - if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) + if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; @@ -4613,12 +9085,23 @@ handle_written_inodeblock(inodedep, bp) */ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); + if (inodedep->id_savednlink > LINK_MAX) + panic("handle_written_inodeblock: Invalid link count " + "%d for inodedep %p", inodedep->id_savednlink, inodedep); if (fstype == UFS1) { + if (dp1->di_nlink != inodedep->id_savednlink) { + dp1->di_nlink = inodedep->id_savednlink; + hadchanges = 1; + } if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { + if (dp2->di_nlink != inodedep->id_savednlink) { + dp2->di_nlink = inodedep->id_savednlink; + hadchanges = 1; + } if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; @@ -4630,6 +9113,7 @@ handle_written_inodeblock(inodedep, bp) } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; + inodedep->id_savednlink = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in @@ -4637,69 +9121,49 @@ handle_written_inodeblock(inodedep, bp) */ if (hadchanges) bdirty(bp); +bufwait: /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples - * before the old ones have been deleted. + * before the old ones have been deleted. Completely + * unlinked inodes are not processed until the unlinked + * inode list is written or the last reference is removed. */ - filefree = NULL; - while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { - WORKLIST_REMOVE(wk); - switch (wk->wk_type) { - - case D_FREEFILE: - /* - * We defer adding filefree to the worklist until - * all other additions have been made to ensure - * that it will be done after all the old blocks - * have been freed. 
- */ - if (filefree != NULL) - panic("handle_written_inodeblock: filefree"); - filefree = wk; - continue; - - case D_MKDIR: - handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); - continue; - - case D_DIRADD: - diradd_inode_written(WK_DIRADD(wk), inodedep); - continue; - - case D_FREEBLKS: - wk->wk_state |= COMPLETE; - if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE) - continue; - /* -- fall through -- */ - case D_FREEFRAG: - case D_DIRREM: - add_to_worklist(wk); - continue; - - case D_NEWDIRBLK: - free_newdirblk(WK_NEWDIRBLK(wk)); - continue; - - default: - panic("handle_written_inodeblock: Unknown type %s", - TYPENAME(wk->wk_type)); - /* NOTREACHED */ + if ((inodedep->id_state & UNLINKED) == 0) { + freefile = handle_bufwait(inodedep, NULL); + if (freefile && !LIST_EMPTY(&wkhd)) { + WORKLIST_INSERT(&wkhd, &freefile->fx_list); + freefile = NULL; + } } - if (filefree != NULL) { + /* + * Move rolled forward dependency completions to the bufwait list + * now that those that were already written have been processed. + */ + if (!LIST_EMPTY(&wkhd) && hadchanges == 0) + panic("handle_written_inodeblock: bufwait but no changes"); + jwork_move(&inodedep->id_bufwait, &wkhd); + + if (freefile != NULL) { + /* + * If the inode is goingaway it was never written. Fake up + * the state here so free_inodedep() can succeed. + */ + if (inodedep->id_state & GOINGAWAY) + inodedep->id_state |= COMPLETE | DEPCOMPLETE; if (free_inodedep(inodedep) == 0) - panic("handle_written_inodeblock: live inodedep"); - add_to_worklist(filefree); + panic("handle_written_inodeblock: live inodedep %p", + inodedep); + add_to_worklist(&freefile->fx_list, 0); return (0); } @@ -4707,12 +9171,101 @@ handle_written_inodeblock(inodedep, bp) * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || - (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && - TAILQ_FIRST(&inodedep->id_extupdt) == 0)) + (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && + TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && + TAILQ_FIRST(&inodedep->id_extupdt) == 0 && + LIST_FIRST(&inodedep->id_bufwait) == 0)) return (0); return (hadchanges); } +static int +handle_written_indirdep(indirdep, bp, bpp) + struct indirdep *indirdep; + struct buf *bp; + struct buf **bpp; +{ + struct allocindir *aip; + int chgs; + + if (indirdep->ir_state & GOINGAWAY) + panic("disk_write_complete: indirdep gone"); + chgs = 0; + /* + * If there were rollbacks revert them here. + */ + if (indirdep->ir_saveddata) { + bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); + free(indirdep->ir_saveddata, M_INDIRDEP); + indirdep->ir_saveddata = 0; + chgs = 1; + } + indirdep->ir_state &= ~UNDONE; + indirdep->ir_state |= ATTACHED; + /* + * Move allocindirs with written pointers to the completehd if + * the indirdep's pointer is not yet written. Otherwise + * free them here. + */ + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { + LIST_REMOVE(aip, ai_next); + if ((indirdep->ir_state & DEPCOMPLETE) == 0) { + LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, + ai_next); + continue; + } + free_newblk(&aip->ai_block); + } + /* + * Move allocindirs that have finished dependency processing from + * the done list to the write list after updating the pointers.
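+ * + * "Updating the pointers" is a plain array store into the saved copy + * of the indirect block, sized by filesystem type; for UFS2 it is the + * assignment already used in handle_allocindir_partdone(): + * + * ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = + * aip->ai_newblkno;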
+ */ + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { + handle_allocindir_partdone(aip); + if (aip == LIST_FIRST(&indirdep->ir_donehd)) + panic("disk_write_complete: not gone"); + chgs = 1; + } + /* + * If this indirdep has been detached from its newblk during + * I/O we need to keep this dep attached to the buffer so + * deallocate_dependencies can find it and properly resolve + * any outstanding dependencies. + */ + if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0) + chgs = 1; + if ((bp->b_flags & B_DELWRI) == 0) + stat_indir_blk_ptrs++; + /* + * If there were no changes we can discard the savedbp and detach + * ourselves from the buf. We are only carrying completed pointers + * in this case. + */ + if (chgs == 0) { + struct buf *sbp; + + sbp = indirdep->ir_savebp; + sbp->b_flags |= B_INVAL | B_NOCACHE; + indirdep->ir_savebp = NULL; + if (*bpp != NULL) + panic("handle_written_indirdep: bp already exists."); + *bpp = sbp; + } else + bdirty(bp); + /* + * If there are no fresh dependencies and none waiting on writes + * we can free the indirdep. + */ + if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) { + if (indirdep->ir_state & ONDEPLIST) + LIST_REMOVE(indirdep, ir_next); + free_indirdep(indirdep); + return (0); + } + + return (chgs); +} + /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. @@ -4722,50 +9275,200 @@ diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { - struct pagedep *pagedep; dap->da_state |= COMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); - } + complete_diradd(dap); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* - * Handle the completion of a mkdir dependency. + * Returns true if the bmsafemap will have rollbacks when written. Must + * only be called with lk and the buf lock on the cg held. */ +static int +bmsafemap_rollbacks(bmsafemap) + struct bmsafemap *bmsafemap; +{ + + return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | + !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); +} + +/* + * Complete a write to a bmsafemap structure. Roll forward any bitmap + * changes if it's not a background write. Set all written dependencies + * to DEPCOMPLETE and free the structure if possible. + */ +static int +handle_written_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; +{ + struct newblk *newblk; + struct inodedep *inodedep; + struct jaddref *jaddref, *jatmp; + struct jnewblk *jnewblk, *jntmp; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + ino_t ino; + long bno; + int chgs; + int i; + + if ((bmsafemap->sm_state & IOSTARTED) == 0) + panic("handle_written_bmsafemap: Not started\n"); + chgs = 0; + bmsafemap->sm_state &= ~IOSTARTED; + /* + * Restore unwritten inode allocation pending jaddref writes.
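+ * + * Each restore below is the exact inverse of the rollback made in + * initiate_write_bmsafemap(): + * + * rollback: clrbit(inosused, ino); cg_cs.cs_nifree++; -> UNDONE + * roll-forward: setbit(inosused, ino); cg_cs.cs_nifree--; -> ATTACHED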
+ */ + if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, + ja_bmdeps, jatmp) { + if ((jaddref->ja_state & UNDONE) == 0) + continue; + ino = jaddref->ja_ino % fs->fs_ipg; + if (isset(inosused, ino)) + panic("handle_written_bmsafemap: " + "re-allocated inode"); + if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir++; + cgp->cg_cs.cs_nifree--; + setbit(inosused, ino); + chgs = 1; + } + jaddref->ja_state &= ~UNDONE; + jaddref->ja_state |= ATTACHED; + free_jaddref(jaddref); + } + } + /* + * Restore any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, + jntmp) { + if ((jnewblk->jn_state & UNDONE) == 0) + continue; + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (bp->b_xflags & BX_BKGRDMARKER) + break; + if ((jnewblk->jn_state & NEWBLOCK) == 0 && + isclr(blksfree, bno + i)) + panic("handle_written_bmsafemap: " + "re-allocated fragment"); + clrbit(blksfree, bno + i); + chgs = 1; + } + jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); + jnewblk->jn_state |= ATTACHED; + free_jnewblk(jnewblk); + } + } + while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { + newblk->nb_state |= DEPCOMPLETE; + newblk->nb_state &= ~ONDEPLIST; + newblk->nb_bmsafemap = NULL; + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_list.wk_type == D_ALLOCDIRECT) + handle_allocdirect_partdone( + WK_ALLOCDIRECT(&newblk->nb_list), NULL); + else if (newblk->nb_list.wk_type == D_ALLOCINDIR) + handle_allocindir_partdone( + WK_ALLOCINDIR(&newblk->nb_list)); + else if (newblk->nb_list.wk_type != D_NEWBLK) + panic("handle_written_bmsafemap: Unexpected type: %s", + TYPENAME(newblk->nb_list.wk_type)); + } + while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { + inodedep->id_state |= DEPCOMPLETE; + inodedep->id_state &= ~ONDEPLIST; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_bmsafemap = NULL; + } + if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && + LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && + LIST_EMPTY(&bmsafemap->sm_newblkhd) && + LIST_EMPTY(&bmsafemap->sm_inodedephd)) { + if (chgs) + bdirty(bp); + LIST_REMOVE(bmsafemap, sm_hash); + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (0); + } + bdirty(bp); + return (1); +} + +/* + * Try to free a mkdir dependency. 
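+ * + * A mkdir gates its diradd on two writes, MKDIR_BODY (the "." and ".." + * block) and MKDIR_PARENT (the parent's bumped link count); the diradd + * becomes DEPCOMPLETE only once both bits have cleared, as done below: + * + * dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + * if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + * dap->da_state |= DEPCOMPLETE;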
+ */ static void -handle_written_mkdir(mkdir, type) +complete_mkdir(mkdir) struct mkdir *mkdir; - int type; { struct diradd *dap; - struct pagedep *pagedep; - if (mkdir->md_state != type) - panic("handle_written_mkdir: bad type"); + if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; - dap->da_state &= ~type; - if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + complete_diradd(dap); } - LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* + * Handle the completion of a mkdir dependency. + */ +static void +handle_written_mkdir(mkdir, type) + struct mkdir *mkdir; + int type; +{ + + if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) + panic("handle_written_mkdir: bad type"); + mkdir->md_state |= COMPLETE; + complete_mkdir(mkdir); +} + +static void +free_pagedep(pagedep) + struct pagedep *pagedep; +{ + int i; + + if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) + return; + for (i = 0; i < DAHASHSZ; i++) + if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) + return; + if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) + return; + if (!LIST_EMPTY(&pagedep->pd_dirremhd)) + return; + if (!LIST_EMPTY(&pagedep->pd_pendinghd)) + return; + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, D_PAGEDEP); +} + +/* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. @@ -4790,8 +9493,11 @@ handle_written_filepage(pagedep, bp) */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); + dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_written_filepage: Journal entries not written.")); + add_to_worklist(&dirrem->dm_list, 0); } /* * Free any directory additions that have been committed. @@ -4800,7 +9506,7 @@ handle_written_filepage(pagedep, bp) */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * Uncommitted directory entries must be restored. */ @@ -4845,7 +9551,8 @@ handle_written_filepage(pagedep, bp) * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. 
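+ * + * Whether here or in free_pagedep() above, a pagedep may be reclaimed + * only when nothing at all still hangs off it; as a boolean sketch: + * + * busy = (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) != 0 || + * !LIST_EMPTY(&pagedep->pd_jmvrefhd) || + * !LIST_EMPTY(&pagedep->pd_dirremhd) || + * !LIST_EMPTY(&pagedep->pd_pendinghd); + * for (i = 0; i < DAHASHSZ; i++) + * busy |= !LIST_EMPTY(&pagedep->pd_diraddhd[i]);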
*/ - if ((pagedep->pd_state & NEWBLOCK) == 0) { + if ((pagedep->pd_state & NEWBLOCK) == 0 && + LIST_EMPTY(&pagedep->pd_jmvrefhd)) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } @@ -4880,8 +9587,8 @@ softdep_load_inodeblock(ip) */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(&lk); - if (inodedep_lookup(UFSTOVFS(ip->i_ump), - ip->i_number, 0, &inodedep) == 0) { + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) { FREE_LOCK(&lk); return; } @@ -4908,6 +9615,7 @@ softdep_update_inodeblock(ip, bp, waitfor) int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; + struct inoref *inoref; struct worklist *wk; struct mount *mp; struct buf *ibp; @@ -4922,6 +9630,7 @@ softdep_update_inodeblock(ip, bp, waitfor) */ mp = UFSTOVFS(ip->i_ump); ACQUIRE_LOCK(&lk); +again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); if (ip->i_effnlink != ip->i_nlink) @@ -4931,6 +9640,19 @@ softdep_update_inodeblock(ip, bp, waitfor) if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* + * If we're flushing all dependencies we must also move any waiting + * for journal writes onto the bufwait list prior to I/O. + */ + if (waitfor) { + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto again; + } + } + } + /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ @@ -4945,10 +9667,12 @@ softdep_update_inodeblock(ip, bp, waitfor) */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), + NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), + NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk @@ -4971,11 +9695,11 @@ softdep_update_inodeblock(ip, bp, waitfor) return; } retry: - if ((inodedep->id_state & DEPCOMPLETE) != 0) { + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { FREE_LOCK(&lk); return; } - ibp = inodedep->id_buf; + ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, &lk, MNT_WAIT); if (ibp == NULL) { /* @@ -5007,13 +9731,13 @@ merge_inode_lists(newlisthead, oldlisthead) newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { - if (listadp->ad_lbn < newadp->ad_lbn) { + if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); - if (listadp->ad_lbn == newadp->ad_lbn) { + if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; @@ -5036,6 +9760,7 @@ softdep_fsync(vp) { struct inodedep *inodedep; struct pagedep *pagedep; + struct inoref *inoref; struct worklist *wk; struct diradd *dap; struct mount *mp; @@ -5052,17 +9777,24 @@ softdep_fsync(vp) fs = ip->i_fs; mp = vp->v_mount; ACQUIRE_LOCK(&lk); +restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return 
(0); } + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (!LIST_EMPTY(&inodedep->id_inowait) || - !LIST_EMPTY(&inodedep->id_bufwait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) - panic("softdep_fsync: pending ops"); + panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; @@ -5254,8 +9986,8 @@ int softdep_sync_metadata(struct vnode *vp) { struct pagedep *pagedep; - struct allocdirect *adp; struct allocindir *aip; + struct newblk *newblk; struct buf *bp, *nbp; struct worklist *wk; struct bufobj *bo; @@ -5319,27 +10051,15 @@ loop: switch (wk->wk_type) { case D_ALLOCDIRECT: - adp = WK_ALLOCDIRECT(wk); - if (adp->ad_state & DEPCOMPLETE) - continue; - nbp = adp->ad_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; + case D_ALLOCINDIR: + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + goto restart; } - ACQUIRE_LOCK(&lk); - continue; - - case D_ALLOCINDIR: - aip = WK_ALLOCINDIR(wk); - if (aip->ai_state & DEPCOMPLETE) + if (newblk->nb_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; @@ -5355,10 +10075,16 @@ loop: case D_INDIRDEP: restart: - LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { - if (aip->ai_state & DEPCOMPLETE) + LIST_FOREACH(aip, + &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { + newblk = (struct newblk *)aip; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + goto restart; + } + if (newblk->nb_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, MNT_WAIT); if (nbp == NULL) goto restart; @@ -5371,14 +10097,6 @@ loop: } continue; - case D_INODEDEP: - if ((error = flush_inodedep_deps(wk->wk_mp, - WK_INODEDEP(wk)->id_ino)) != 0) { - FREE_LOCK(&lk); - break; - } - continue; - case D_PAGEDEP: /* * We are trying to sync a directory that may @@ -5400,48 +10118,6 @@ loop: } continue; - case D_MKDIR: - /* - * This case should never happen if the vnode has - * been properly sync'ed. However, if this function - * is used at a place where the vnode has not yet - * been sync'ed, this dependency can show up. So, - * rather than panic, just flush it. - */ - nbp = WK_MKDIR(wk)->md_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; - } - ACQUIRE_LOCK(&lk); - continue; - - case D_BMSAFEMAP: - /* - * This case should never happen if the vnode has - * been properly sync'ed. However, if this function - * is used at a place where the vnode has not yet - * been sync'ed, this dependency can show up. So, - * rather than panic, just flush it. 
- */ - nbp = WK_BMSAFEMAP(wk)->sm_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; - } - ACQUIRE_LOCK(&lk); - continue; - default: panic("softdep_sync_metadata: Unknown type %s", TYPENAME(wk->wk_type)); @@ -5489,7 +10165,8 @@ loop: BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); - return (0); + return ffs_update(vp, 1); + /* return (0); */ } /* @@ -5502,6 +10179,7 @@ flush_inodedep_deps(mp, ino) ino_t ino; { struct inodedep *inodedep; + struct inoref *inoref; int error, waitfor; /* @@ -5522,8 +10200,16 @@ flush_inodedep_deps(mp, ino) return (error); FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); +restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || @@ -5555,13 +10241,19 @@ flush_deplist(listhead, waitfor, errorp) int *errorp; { struct allocdirect *adp; + struct newblk *newblk; struct buf *bp; mtx_assert(&lk, MA_OWNED); TAILQ_FOREACH(adp, listhead, ad_next) { - if (adp->ad_state & DEPCOMPLETE) + newblk = (struct newblk *)adp; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + return (1); + } + if (newblk->nb_state & DEPCOMPLETE) continue; - bp = adp->ad_buf; + bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) @@ -5582,6 +10274,100 @@ flush_deplist(listhead, waitfor, errorp) } /* + * Flush dependencies associated with an allocdirect block. + */ +static int +flush_newblk_dep(vp, mp, lbn) + struct vnode *vp; + struct mount *mp; + ufs_lbn_t lbn; +{ + struct newblk *newblk; + struct bufobj *bo; + struct inode *ip; + struct buf *bp; + ufs2_daddr_t blkno; + int error; + + error = 0; + bo = &vp->v_bufobj; + ip = VTOI(vp); + blkno = DIP(ip, i_db[lbn]); + if (blkno == 0) + panic("flush_newblk_dep: Missing block"); + ACQUIRE_LOCK(&lk); + /* + * Loop until all dependencies related to this block are satisfied. + * We must be careful to restart after each sleep in case a write + * completes some part of this process for us. + */ + for (;;) { + if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { + FREE_LOCK(&lk); + break; + } + if (newblk->nb_list.wk_type != D_ALLOCDIRECT) + panic("flush_newblk_deps: Bad newblk %p", newblk); + /* + * Flush the journal. + */ + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + continue; + } + /* + * Write the bitmap dependency. + */ + if ((newblk->nb_state & DEPCOMPLETE) == 0) { + bp = newblk->nb_bmsafemap->sm_buf; + bp = getdirtybuf(bp, &lk, MNT_WAIT); + if (bp == NULL) + continue; + FREE_LOCK(&lk); + error = bwrite(bp); + if (error) + break; + ACQUIRE_LOCK(&lk); + continue; + } + /* + * Write the buffer. 
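+ * + * That is: look the in-core buffer up, lock it, and either push it if + * dirty or release it if clean; with LK_SLEEPFAIL an ENOLCK return + * means we slept, so the outer loop must restart (locking elided in + * this sketch): + * + * bp = gbincore(bo, lbn); + * if (bp != NULL && (bp->b_flags & B_DELWRI)) { + * bremfree(bp); + * error = bwrite(bp); + * }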
+ */ + FREE_LOCK(&lk); + BO_LOCK(bo); + bp = gbincore(bo, lbn); + if (bp != NULL) { + error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | + LK_INTERLOCK, BO_MTX(bo)); + if (error == ENOLCK) { + ACQUIRE_LOCK(&lk); + continue; /* Slept, retry */ + } + if (error != 0) + break; /* Failed */ + if (bp->b_flags & B_DELWRI) { + bremfree(bp); + error = bwrite(bp); + if (error) + break; + } else + BUF_UNLOCK(bp); + } else + BO_UNLOCK(bo); + /* + * We have to wait for the direct pointers to + * point at the newdirblk before the dependency + * will go away. + */ + error = ffs_update(vp, MNT_WAIT); + if (error) + break; + ACQUIRE_LOCK(&lk); + } + return (error); +} + +/* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ @@ -5592,16 +10378,16 @@ flush_pagedep_deps(pvp, mp, diraddhdp) struct diraddhd *diraddhdp; { struct inodedep *inodedep; + struct inoref *inoref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; - struct bufobj *bo; int error = 0; struct buf *bp; ino_t inum; - struct worklist *wk; ump = VFSTOUFS(mp); +restart: while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry @@ -5609,7 +10395,7 @@ flush_pagedep_deps(pvp, mp, diraddhdp) */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(&lk); - if ((error = ffs_update(pvp, 1)) != 0) + if ((error = ffs_update(pvp, MNT_WAIT)) != 0) break; ACQUIRE_LOCK(&lk); /* @@ -5623,84 +10409,51 @@ flush_pagedep_deps(pvp, mp, diraddhdp) /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be - * committed in its parent. We do not want or need - * the full semantics of a synchronous ffs_syncvnode as - * that may end up here again, once for each directory - * level in the filesystem. Instead, we push the blocks - * and wait for them to clear. We have to fsync twice - * because the first call may choose to defer blocks - * that still have dependencies, but deferral will - * happen at most once. + * committed in its parent. */ inum = dap->da_newinum; + if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) + panic("flush_pagedep_deps: lost inode1"); + /* + * Wait for any pending journal adds to complete so we don't + * cause rollbacks while syncing. + */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(&lk); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; - if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) || - (error=ffs_syncvnode(vp, MNT_NOWAIT))) { - vput(vp); - break; - } - bo = &vp->v_bufobj; - BO_LOCK(bo); - drain_output(vp); + error = flush_newblk_dep(vp, mp, 0); /* - * If first block is still dirty with a D_MKDIR - * dependency then it needs to be written now. + * If we still have the dependency we might need to + * update the vnode to sync the new link count to + * disk. 
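+ * + * The overall ordering for committing the new name is thus: + * + * jwait(&inoref->if_list); 1) journal entries + * flush_newblk_dep(vp, mp, 0); 2) "." and ".." block + * ffs_update(vp, MNT_WAIT); 3) inode / link count + * + * and only then may the diradd move on to the pending list.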
*/ - for (;;) { - error = 0; - bp = gbincore(bo, 0); - if (bp == NULL) - break; /* First block not present */ - error = BUF_LOCK(bp, - LK_EXCLUSIVE | - LK_SLEEPFAIL | - LK_INTERLOCK, - BO_MTX(bo)); - BO_LOCK(bo); - if (error == ENOLCK) - continue; /* Slept, retry */ - if (error != 0) - break; /* Failed */ - if ((bp->b_flags & B_DELWRI) == 0) { - BUF_UNLOCK(bp); - break; /* Buffer not dirty */ - } - for (wk = LIST_FIRST(&bp->b_dep); - wk != NULL; - wk = LIST_NEXT(wk, wk_list)) - if (wk->wk_type == D_MKDIR) - break; - if (wk == NULL) - BUF_UNLOCK(bp); /* Dependency gone */ - else { - /* - * D_MKDIR dependency remains, - * must write buffer to stable - * storage. - */ - BO_UNLOCK(bo); - bremfree(bp); - error = bwrite(bp); - BO_LOCK(bo); - } - break; - } - BO_UNLOCK(bo); + if (error == 0 && dap == LIST_FIRST(diraddhdp)) + error = ffs_update(vp, MNT_WAIT); vput(vp); if (error != 0) - break; /* Flushing of first block failed */ + break; ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; - if (dap->da_state & MKDIR_BODY) - panic("flush_pagedep_deps: MKDIR_BODY"); + if (dap->da_state & MKDIR_BODY) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, + &inodedep); + panic("flush_pagedep_deps: MKDIR_BODY " + "inodedep %p dap %p vp %p", + inodedep, dap, vp); + } } /* * Flush the inode on which the directory entry depends. @@ -5719,8 +10472,8 @@ retry: * If the inode still has bitmap dependencies, * push them to disk. */ - if ((inodedep->id_state & DEPCOMPLETE) == 0) { - bp = inodedep->id_buf; + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { + bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, MNT_WAIT); if (bp == NULL) goto retry; @@ -5733,24 +10486,29 @@ retry: } /* * If the inode is still sitting in a buffer waiting - * to be written, push it to disk. + * to be written or waiting for the link count to be + * adjusted update it here to flush it to disk. */ - FREE_LOCK(&lk); - if ((error = bread(ump->um_devvp, - fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), - (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) { - brelse(bp); - break; + if (dap == LIST_FIRST(diraddhdp)) { + FREE_LOCK(&lk); + if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ))) + break; + error = ffs_update(vp, MNT_WAIT); + vput(vp); + if (error) + break; + ACQUIRE_LOCK(&lk); } - if ((error = bwrite(bp)) != 0) - break; - ACQUIRE_LOCK(&lk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ - if (dap == LIST_FIRST(diraddhdp)) - panic("flush_pagedep_deps: flush failed"); + if (dap == LIST_FIRST(diraddhdp)) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); + panic("flush_pagedep_deps: failed to flush " + "inodedep %p ino %d dap %p", inodedep, inum, dap); + } } if (error) ACQUIRE_LOCK(&lk); @@ -6100,10 +10858,13 @@ softdep_count_dependencies(bp, wantcount) int wantcount; { struct worklist *wk; + struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct indirdep *indirdep; + struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; + struct dirrem *dirrem; struct diradd *dap; int i, retval; @@ -6132,6 +10893,12 @@ softdep_count_dependencies(bp, wantcount) if (!wantcount) goto out; } + if (TAILQ_FIRST(&inodedep->id_inoreflst)) { + /* Add reference dependency. 
*/ + retval += 1; + if (!wantcount) + goto out; + } + continue; case D_INDIRDEP: @@ -6147,6 +10914,14 @@ softdep_count_dependencies(bp, wantcount) case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + if (LIST_FIRST(&dirrem->dm_jremrefhd)) { + /* Journal remove ref dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { @@ -6159,14 +10934,44 @@ softdep_count_dependencies(bp, wantcount) continue; case D_BMSAFEMAP: + bmsafemap = WK_BMSAFEMAP(wk); + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { + /* Add reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { + /* Allocate block dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) { + /* Freeblk journal dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: case D_ALLOCDIRECT: case D_ALLOCINDIR: case D_MKDIR: + case D_JSEG: + case D_SBDEP: /* never a dependency on these blocks */ continue; default: - panic("softdep_check_for_rollback: Unexpected type %s", + panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } @@ -6382,6 +11187,45 @@ softdep_error(func, error) #ifdef DDB +static void +inodedep_print(struct inodedep *inodedep, int verbose) +{ + db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d" + " saveino %p\n", + inodedep, inodedep->id_fs, inodedep->id_state, + (intmax_t)inodedep->id_ino, + (intmax_t)fsbtodb(inodedep->id_fs, + ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), + inodedep->id_nlinkdelta, inodedep->id_savednlink, + inodedep->id_savedino1); + + if (verbose == 0) + return; + + db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " + "mkdiradd %p\n", + LIST_FIRST(&inodedep->id_pendinghd), + LIST_FIRST(&inodedep->id_bufwait), + LIST_FIRST(&inodedep->id_inowait), + TAILQ_FIRST(&inodedep->id_inoreflst), + inodedep->id_mkdiradd); + db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", + TAILQ_FIRST(&inodedep->id_inoupdt), + TAILQ_FIRST(&inodedep->id_newinoupdt), + TAILQ_FIRST(&inodedep->id_extupdt), + TAILQ_FIRST(&inodedep->id_newextupdt)); +} + +DB_SHOW_COMMAND(inodedep, db_show_inodedep) +{ + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + inodedep_print((struct inodedep*)addr, 1); +} + DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) { struct inodedep_hashhead *inodedephd; @@ -6395,15 +11239,62 @@ DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) LIST_FOREACH(inodedep, inodedephd, id_hash) { if (fs != NULL && fs != inodedep->id_fs) continue; - db_printf("%p fs %p st %x ino %jd inoblk %jd\n", - inodedep, inodedep->id_fs, inodedep->id_state, - (intmax_t)inodedep->id_ino, - (intmax_t)fsbtodb(inodedep->id_fs, - ino_to_fsba(inodedep->id_fs, inodedep->id_ino))); + inodedep_print(inodedep, 0); } } } +DB_SHOW_COMMAND(worklist, db_show_worklist) +{ + struct worklist *wk; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + wk = (struct worklist *)addr; + db_printf("worklist: %p type %s state 0x%X\n", + wk, TYPENAME(wk->wk_type), wk->wk_state); +} + +DB_SHOW_COMMAND(workhead, db_show_workhead) +{ + struct workhead *wkhd; + struct worklist *wk; + int i; + + if (have_addr == 0) { + db_printf("Address required\n"); + return;
} + wkhd = (struct workhead *)addr; + wk = LIST_FIRST(wkhd); + for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) + db_printf("worklist: %p type %s state 0x%X\n", + wk, TYPENAME(wk->wk_type), wk->wk_state); + if (i == 100) + db_printf("workhead overflow\n"); +} + + +DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) +{ + struct jaddref *jaddref; + struct diradd *diradd; + struct mkdir *mkdir; + + LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { + diradd = mkdir->md_diradd; + db_printf("mkdir: %p state 0x%X dap %p state 0x%X", + mkdir, mkdir->md_state, diradd, diradd->da_state); + if ((jaddref = mkdir->md_jaddref) != NULL) + db_printf(" jaddref %p jaddref state 0x%X", + jaddref, jaddref->ja_state); + db_printf("\n"); + } +} + #endif /* DDB */ #endif /* SOFTUPDATES */ Index: sys/ufs/ffs/ffs_vnops.c =================================================================== --- sys/ufs/ffs/ffs_vnops.c (revision 202342) +++ sys/ufs/ffs/ffs_vnops.c (working copy) @@ -225,6 +225,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor) wait = (waitfor == MNT_WAIT); lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); bo = &vp->v_bufobj; + ip->i_flag &= ~IN_NEEDSYNC; /* * Flush all dirty buffers associated with a vnode. Index: sys/ufs/ffs/ffs_extern.h =================================================================== --- sys/ufs/ffs/ffs_extern.h (revision 202342) +++ sys/ufs/ffs/ffs_extern.h (working copy) @@ -47,6 +47,7 @@ struct ucred; struct vnode; struct vop_fsync_args; struct vop_reallocblks_args; +struct workhead; int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int, struct ucred *, ufs2_daddr_t *); @@ -56,20 +57,23 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_st struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, - ufs2_daddr_t, long, ino_t); + ufs2_daddr_t, long, ino_t, struct workhead *); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); void ffs_bdflush(struct bufobj *, struct buf *); int ffs_copyonwrite(struct vnode *, struct buf *); int ffs_flushfiles(struct mount *, int, struct thread *); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t, - int); + int, struct workhead *); int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); int ffs_mountroot(void); +void ffs_oldfscompat_write(struct fs *, struct ufsmount *); int ffs_reallocblks(struct vop_reallocblks_args *); int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, int, int, struct ucred *, struct buf **); @@ -103,12 +107,14 @@ extern struct vop_vector ffs_fifoops2; int softdep_check_suspend(struct mount *, struct vnode *, int, int, int, int); +int softdep_complete_trunc(struct vnode *, void *); void softdep_get_depcounts(struct mount *, int *, int *); void softdep_initialize(void); void softdep_uninitialize(void); int softdep_mount(struct vnode *, struct mount *, struct fs *, struct ucred *); -void softdep_move_dependencies(struct buf *, struct buf *);
+void softdep_unmount(struct mount *); +int softdep_move_dependencies(struct buf *, struct buf *); int softdep_flushworklist(struct mount *, int *, struct thread *); int softdep_flushfiles(struct mount *, int, struct thread *); void softdep_update_inodeblock(struct inode *, struct buf *, int); @@ -117,7 +123,8 @@ void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *); void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t); -void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t); +void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, + int, int); void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t, @@ -126,11 +133,18 @@ void softdep_setup_allocindir_meta(struct buf *, s struct buf *, int, ufs2_daddr_t); void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t, struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *); +void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int, + struct workhead *); +void softdep_setup_inofree(struct mount *, struct buf *, ino_t, + struct workhead *); +void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *); +void *softdep_setup_trunc(struct vnode *vp, off_t length, int flags); void softdep_fsync_mountdev(struct vnode *); int softdep_sync_metadata(struct vnode *); int softdep_process_worklist(struct mount *, int); int softdep_fsync(struct vnode *); int softdep_waitidle(struct mount *); +int softdep_prealloc(struct vnode *, int); int ffs_rdonly(struct inode *); Index: sys/ufs/ffs/ffs_alloc.c =================================================================== --- sys/ufs/ffs/ffs_alloc.c (revision 202342) +++ sys/ufs/ffs/ffs_alloc.c (working copy) @@ -94,23 +94,23 @@ __FBSDID("$FreeBSD$"); #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, int cg, ufs2_daddr_t bpref, - int size); + int size, int rsize); -static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int); +static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int, int); static ufs2_daddr_t - ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t); + ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif -static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int); -static void ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *, - ufs1_daddr_t, int); +static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int, + int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, int, ufs2_daddr_t, int, int); static void ffs_fserr(struct fs *, ino_t, char *); static ufs2_daddr_t ffs_hashalloc - (struct inode *, int, ufs2_daddr_t, int, allocfcn_t *); -static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int); + (struct inode *, int, ufs2_daddr_t, int, int, allocfcn_t *); +static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int, + int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); @@ -187,7 +187,7 @@ retry: cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); - bno = 
ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); if (ip->i_flag & IN_SPACECOUNTED) { @@ -385,16 +385,12 @@ retry: panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } - bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, - ip->i_number); - if (nsize < request) - ffs_blkfree(ump, fs, ip->i_devvp, - bno + numfrags(fs, nsize), - (long)(request - nsize), ip->i_number); + ip->i_number, NULL); delta = btodb(nsize - osize); if (ip->i_flag & IN_SPACECOUNTED) { UFS_LOCK(ump); @@ -483,6 +479,14 @@ ffs_reallocblks(ap) if (doreallocblks == 0) return (ENOSPC); + /* + * We can't wait in softdep prealloc as it may fsync and recurse + * here. Instead we simply fail to reallocate blocks if this + * rare condition arises. + */ + if (DOINGSOFTDEP(ap->a_vp)) + if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) + return (ENOSPC); if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1) return (ffs_reallocblks_ufs1(ap)); return (ffs_reallocblks_ufs2(ap)); @@ -583,7 +587,7 @@ ffs_reallocblks_ufs1(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -669,7 +673,7 @@ ffs_reallocblks_ufs1(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -791,7 +795,7 @@ ffs_reallocblks_ufs2(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -877,7 +881,7 @@ ffs_reallocblks_ufs2(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -964,7 +968,7 @@ ffs_valloc(pvp, mode, cred, vpp) if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } - ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; @@ -1273,11 +1277,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap) */ /*VARARGS5*/ static ufs2_daddr_t -ffs_hashalloc(ip, cg, pref, size, allocator) +ffs_hashalloc(ip, cg, pref, size, rsize, allocator) struct inode *ip; int cg; ufs2_daddr_t pref; - int size; /* size for data blocks, mode for inodes */ + int size; /* Search size for data blocks, mode for inodes */ + int rsize; /* Real allocated size. 
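+				   May be smaller than size: for example,
+				   the ffs_realloccg() call above searches
+				   with size == request but passes
+				   rsize == nsize, so only nsize frags are
+				   taken and the old tail no longer has to
+				   be freed separately.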
*/ allocfcn_t *allocator; { struct fs *fs; @@ -1293,7 +1298,7 @@ static ufs2_daddr_t /* * 1: preferred cylinder group */ - result = (*allocator)(ip, cg, pref, size); + result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* @@ -1303,7 +1308,7 @@ static ufs2_daddr_t cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } @@ -1314,7 +1319,7 @@ static ufs2_daddr_t */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; @@ -1396,7 +1401,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, + frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); @@ -1414,11 +1420,12 @@ fail: * and if it is, allocate it. */ static ufs2_daddr_t -ffs_alloccg(ip, cg, bpref, size) +ffs_alloccg(ip, cg, bpref, size, rsize) struct inode *ip; int cg; ufs2_daddr_t bpref; int size; + int rsize; { struct fs *fs; struct cg *cgp; @@ -1446,7 +1453,7 @@ static ufs2_daddr_t cgp->cg_old_time = cgp->cg_time = time_second; if (size == fs->fs_bsize) { UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); @@ -1470,21 +1477,14 @@ static ufs2_daddr_t if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); - bno = dtogd(fs, blkno); - for (i = frags; i < fs->fs_frag; i++) - setbit(blksfree, bno + i); - i = fs->fs_frag - frags; - cgp->cg_cs.cs_nffree += i; - fs->fs_cstotal.cs_nffree += i; - fs->fs_cs(fs, cg).cs_nffree += i; - fs->fs_fmod = 1; - cgp->cg_frsum[i]++; + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } + KASSERT(size == rsize, + ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; @@ -1502,7 +1502,7 @@ static ufs2_daddr_t ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); @@ -1524,10 +1524,11 @@ fail: * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t -ffs_alloccgblk(ip, bp, bpref) +ffs_alloccgblk(ip, bp, bpref, size) struct inode *ip; struct buf *bp; ufs2_daddr_t bpref; + int size; { struct fs *fs; struct cg *cgp; @@ -1535,6 +1536,7 @@ static ufs2_daddr_t ufs1_daddr_t bno; ufs2_daddr_t blkno; u_int8_t *blksfree; + int i; fs = ip->i_fs; ump = ip->i_ump; @@ -1562,16 +1564,32 @@ static ufs2_daddr_t gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); - ffs_clusteracct(ump, fs, cgp, blkno, -1); + ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; + /* + * If the caller didn't want the whole block free the frags here. 
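+	 * For example, with fs_frag == 8 and rsize covering three frags,
+	 * frags 3..7 go back into blksfree, the nffree counters rise by
+	 * five and cg_frsum[5] is bumped, matching the carving that
+	 * ffs_alloccg() used to do itself.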
+ */ + size = numfrags(fs, size); + if (size != fs->fs_frag) { + bno = dtogd(fs, blkno); + for (i = size; i < fs->fs_frag; i++) + setbit(blksfree, bno + i); + i = fs->fs_frag - size; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; + fs->fs_fmod = 1; + cgp->cg_frsum[i]++; + } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, + size, 0); UFS_LOCK(ump); return (blkno); } @@ -1584,11 +1602,12 @@ gotit: * take the first one that we find following bpref. */ static ufs2_daddr_t -ffs_clusteralloc(ip, cg, bpref, len) +ffs_clusteralloc(ip, cg, bpref, len, unused) struct inode *ip; int cg; ufs2_daddr_t bpref; int len; + int unused; { struct fs *fs; struct cg *cgp; @@ -1684,7 +1703,7 @@ static ufs2_daddr_t len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) - if (ffs_alloccgblk(ip, bp, bno + i) != bno + i) + if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); @@ -1708,11 +1727,12 @@ fail: * inode in the specified cylinder group. */ static ufs2_daddr_t -ffs_nodealloccg(ip, cg, ipref, mode) +ffs_nodealloccg(ip, cg, ipref, mode, unused) struct inode *ip; int cg; ufs2_daddr_t ipref; int mode; + int unused; { struct fs *fs; struct cg *cgp; @@ -1815,28 +1835,6 @@ gotit: } /* - * check if a block is free - */ -static int -ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0); - case 4: - return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); - case 2: - return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); - case 1: - return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); - default: - panic("ffs_isfreeblock"); - } - return (0); -} - -/* * Free a block or fragment. * * The specified block or fragment is placed back in the @@ -1844,13 +1842,14 @@ gotit: * block reassembly is checked. */ void -ffs_blkfree(ump, fs, devvp, bno, size, inum) +ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; + struct workhead *dephd; { struct cg *cgp; struct buf *bp; @@ -1917,7 +1916,7 @@ void panic("ffs_blkfree: freeing free block"); } ffs_setblock(fs, blksfree, fragno); - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1957,7 +1956,7 @@ void cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1966,6 +1965,9 @@ void fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, + numfrags(fs, size), dephd); bdwrite(bp); } @@ -2036,7 +2038,8 @@ ffs_vfree(pvp, ino, mode) return (0); } ip = VTOI(pvp); - return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode)); + return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode, + NULL)); } /* @@ -2044,12 +2047,13 @@ ffs_vfree(pvp, ino, mode) * The specified inode is placed back in the free map. 
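 * With journaling, callers may pass a workhead of dependencies that
 * softdep_setup_inofree() will attach to the cylinder group buffer;
 * callers with no such work pass NULL, e.g.:
 *
 *	error = ffs_freefile(ump, fs, devvp, ino, mode, NULL);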
*/ int -ffs_freefile(ump, fs, devvp, ino, mode) +ffs_freefile(ump, fs, devvp, ino, mode, wkhd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ino_t ino; int mode; + struct workhead *wkhd; { struct cg *cgp; struct buf *bp; @@ -2105,6 +2109,9 @@ int fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_inofree(UFSTOVFS(ump), bp, + ino + cg * fs->fs_ipg, wkhd); bdwrite(bp); return (0); } @@ -2218,101 +2225,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz) } /* - * Update the cluster map because of an allocation or free. - * - * Cnt == 1 means free; cnt == -1 means allocating. - */ -void -ffs_clusteracct(ump, fs, cgp, blkno, cnt) - struct ufsmount *ump; - struct fs *fs; - struct cg *cgp; - ufs1_daddr_t blkno; - int cnt; -{ - int32_t *sump; - int32_t *lp; - u_char *freemapp, *mapp; - int i, start, end, forw, back, map, bit; - - mtx_assert(UFS_MTX(ump), MA_OWNED); - - if (fs->fs_contigsumsize <= 0) - return; - freemapp = cg_clustersfree(cgp); - sump = cg_clustersum(cgp); - /* - * Allocate or clear the actual block. - */ - if (cnt > 0) - setbit(freemapp, blkno); - else - clrbit(freemapp, blkno); - /* - * Find the size of the cluster going forward. - */ - start = blkno + 1; - end = start + fs->fs_contigsumsize; - if (end >= cgp->cg_nclusterblks) - end = cgp->cg_nclusterblks; - mapp = &freemapp[start / NBBY]; - map = *mapp++; - bit = 1 << (start % NBBY); - for (i = start; i < end; i++) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != (NBBY - 1)) { - bit <<= 1; - } else { - map = *mapp++; - bit = 1; - } - } - forw = i - start; - /* - * Find the size of the cluster going backward. - */ - start = blkno - 1; - end = start - fs->fs_contigsumsize; - if (end < 0) - end = -1; - mapp = &freemapp[start / NBBY]; - map = *mapp--; - bit = 1 << (start % NBBY); - for (i = start; i > end; i--) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != 0) { - bit >>= 1; - } else { - map = *mapp--; - bit = 1 << (NBBY - 1); - } - } - back = start - i; - /* - * Account for old cluster and the possibly new forward and - * back clusters. - */ - i = back + forw + 1; - if (i > fs->fs_contigsumsize) - i = fs->fs_contigsumsize; - sump[i] += cnt; - if (back > 0) - sump[back] -= cnt; - if (forw > 0) - sump[forw] -= cnt; - /* - * Update cluster summary information. - */ - lp = &sump[fs->fs_contigsumsize]; - for (i = fs->fs_contigsumsize; i > 0; i--) - if (*lp-- > 0) - break; - fs->fs_maxcluster[cgp->cg_cgx] = i; -} - -/* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: @@ -2532,7 +2444,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) #endif /* DEBUG */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, - cmd.value, filetype))) + cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; @@ -2560,7 +2472,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, ROOTINO); + blksize * fs->fs_fsize, ROOTINO, NULL); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; Index: sys/ufs/ffs/softdep.h =================================================================== --- sys/ufs/ffs/softdep.h (revision 202342) +++ sys/ufs/ffs/softdep.h (working copy) @@ -94,22 +94,28 @@ * The ONWORKLIST flag shows whether the structure is currently linked * onto a worklist. 
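 * With journaling the flag set below no longer fits in 16 bits, so
 * wk_state becomes a 24 bit field in struct worklist and the constants
 * below are widened accordingly.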
*/ -#define ATTACHED 0x0001 -#define UNDONE 0x0002 -#define COMPLETE 0x0004 -#define DEPCOMPLETE 0x0008 -#define MKDIR_PARENT 0x0010 /* diradd & mkdir only */ -#define MKDIR_BODY 0x0020 /* diradd & mkdir only */ -#define RMDIR 0x0040 /* dirrem only */ -#define DIRCHG 0x0080 /* diradd & dirrem only */ -#define GOINGAWAY 0x0100 /* indirdep only */ -#define IOSTARTED 0x0200 /* inodedep & pagedep only */ -#define SPACECOUNTED 0x0400 /* inodedep only */ -#define NEWBLOCK 0x0800 /* pagedep only */ -#define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */ -#define UFS1FMT 0x2000 /* indirdep only */ -#define EXTDATA 0x4000 /* allocdirect only */ -#define ONWORKLIST 0x8000 +#define ATTACHED 0x000001 +#define UNDONE 0x000002 +#define COMPLETE 0x000004 +#define DEPCOMPLETE 0x000008 +#define MKDIR_PARENT 0x000010 /* diradd, mkdir, jaddref, jsegdep only */ +#define MKDIR_BODY 0x000020 /* diradd, mkdir, jaddref only */ +#define RMDIR 0x000040 /* dirrem only */ +#define DIRCHG 0x000080 /* diradd, dirrem only */ +#define GOINGAWAY 0x000100 /* indirdep, jremref only */ +#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */ +#define SPACECOUNTED 0x000400 /* inodedep only */ +#define NEWBLOCK 0x000800 /* pagedep, jaddref only */ +#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */ +#define UFS1FMT 0x002000 /* indirdep only */ +#define EXTDATA 0x004000 /* allocdirect only */ +#define ONWORKLIST 0x008000 +#define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */ +#define ONDEPLIST 0x020000 /* Structure is on a dependency list. */ +#define UNLINKED 0x040000 /* inodedep has been unlinked. */ +#define UNLINKNEXT 0x080000 /* inodedep has valid di_freelink */ +#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */ +#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV) #define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) @@ -135,25 +141,38 @@ * and the macros below changed to use it. 
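 * A dependency is identified by wk_type and then cast to its container
 * with the WK_*() macros, e.g. (illustrative only; handle_freework()
 * stands in for a real consumer):
 *
 *	if (wk->wk_type == D_FREEWORK)
 *		handle_freework(WK_FREEWORK(wk));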
*/ struct worklist { + LIST_ENTRY(worklist) wk_list; /* list of work requests */ struct mount *wk_mp; /* Mount we live in */ - LIST_ENTRY(worklist) wk_list; /* list of work requests */ - unsigned short wk_type; /* type of request */ - unsigned short wk_state; /* state flags */ + unsigned int wk_type:8, /* type of request */ + wk_state:24; /* state flags */ }; #define WK_DATA(wk) ((void *)(wk)) #define WK_PAGEDEP(wk) ((struct pagedep *)(wk)) #define WK_INODEDEP(wk) ((struct inodedep *)(wk)) #define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk)) +#define WK_NEWBLK(wk) ((struct newblk *)(wk)) #define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk)) #define WK_INDIRDEP(wk) ((struct indirdep *)(wk)) #define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk)) #define WK_FREEFRAG(wk) ((struct freefrag *)(wk)) #define WK_FREEBLKS(wk) ((struct freeblks *)(wk)) +#define WK_FREEWORK(wk) ((struct freework *)(wk)) #define WK_FREEFILE(wk) ((struct freefile *)(wk)) #define WK_DIRADD(wk) ((struct diradd *)(wk)) #define WK_MKDIR(wk) ((struct mkdir *)(wk)) #define WK_DIRREM(wk) ((struct dirrem *)(wk)) #define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk)) +#define WK_JADDREF(wk) ((struct jaddref *)(wk)) +#define WK_JREMREF(wk) ((struct jremref *)(wk)) +#define WK_JMVREF(wk) ((struct jmvref *)(wk)) +#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk)) +#define WK_JSEG(wk) ((struct jseg *)(wk)) +#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk)) +#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk)) +#define WK_FREEDEP(wk) ((struct freedep *)(wk)) +#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk)) +#define WK_SBDEP(wk) ((struct sbdep *)wk) +#define WK_JTRUNC(wk) ((struct jtrunc *)(wk)) /* * Various types of lists @@ -165,6 +184,15 @@ LIST_HEAD(inodedephd, inodedep); LIST_HEAD(allocindirhd, allocindir); LIST_HEAD(allocdirecthd, allocdirect); TAILQ_HEAD(allocdirectlst, allocdirect); +LIST_HEAD(indirdephd, indirdep); +LIST_HEAD(jaddrefhd, jaddref); +LIST_HEAD(jremrefhd, jremref); +LIST_HEAD(jmvrefhd, jmvref); +LIST_HEAD(jnewblkhd, jnewblk); +LIST_HEAD(jfreeblkhd, jfreeblk); +LIST_HEAD(freeworkhd, freework); +TAILQ_HEAD(jseglst, jseg); +TAILQ_HEAD(inoreflst, inoref); /* * The "pagedep" structure tracks the various dependencies related to @@ -192,9 +220,11 @@ struct pagedep { LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */ ino_t pd_ino; /* associated file */ ufs_lbn_t pd_lbn; /* block within file */ + struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */ struct dirremhd pd_dirremhd; /* dirrem's waiting for page */ struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */ struct diraddhd pd_pendinghd; /* directory entries awaiting write */ + struct jmvrefhd pd_jmvrefhd; /* Dependent journal writes. */ }; /* @@ -248,13 +278,18 @@ struct inodedep { struct worklist id_list; /* buffer holding inode block */ # define id_state id_list.wk_state /* inode dependency state */ LIST_ENTRY(inodedep) id_hash; /* hashed lookup */ + TAILQ_ENTRY(inodedep) id_unlinked; /* Unlinked but ref'd inodes */ struct fs *id_fs; /* associated filesystem */ ino_t id_ino; /* dependent inode */ nlink_t id_nlinkdelta; /* saved effective link count */ + nlink_t id_savednlink; /* Link saved during rollback */ LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */ - struct buf *id_buf; /* related bmsafemap (if pending) */ + struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */ + struct diradd *id_mkdiradd; /* diradd for a mkdir. */ + struct inoreflst id_inoreflst; /* Inode reference adjustments. 
*/ long id_savedextsize; /* ext size saved during rollback */ off_t id_savedsize; /* file size saved during rollback */ + struct dirremhd id_dirremhd; /* Removals pending. */ struct workhead id_pendinghd; /* entries awaiting directory write */ struct workhead id_bufwait; /* operations after inode written */ struct workhead id_inowait; /* operations waiting inode update */ @@ -271,23 +306,6 @@ struct inodedep { #define id_savedino2 id_un.idu_savedino2 /* - * A "newblk" structure is attached to a bmsafemap structure when a block - * or fragment is allocated from a cylinder group. Its state is set to - * DEPCOMPLETE when its cylinder group map is written. It is consumed by - * an associated allocdirect or allocindir allocation which will attach - * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag - * is not set (i.e., its cylinder group map has not been written). - */ -struct newblk { - LIST_ENTRY(newblk) nb_hash; /* hashed lookup */ - struct fs *nb_fs; /* associated filesystem */ - int nb_state; /* state of bitmap dependency */ - ufs2_daddr_t nb_newblkno; /* allocated block number */ - LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */ - struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */ -}; - -/* * A "bmsafemap" structure maintains a list of dependency structures * that depend on the update of a particular cylinder group map. * It has lists for newblks, allocdirects, allocindirs, and inodedeps. @@ -299,14 +317,44 @@ struct inodedep { */ struct bmsafemap { struct worklist sm_list; /* cylgrp buffer */ +# define sm_state sm_list.wk_state + int sm_cg; + LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */ struct buf *sm_buf; /* associated buffer */ struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */ + struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */ struct allocindirhd sm_allocindirhd; /* allocindir deps */ + struct allocindirhd sm_allocindirwr; /* writing allocindir deps */ struct inodedephd sm_inodedephd; /* inodedep deps */ + struct inodedephd sm_inodedepwr; /* writing inodedep deps */ struct newblkhd sm_newblkhd; /* newblk deps */ + struct newblkhd sm_newblkwr; /* writing newblk deps */ + struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */ + struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */ }; /* + * A "newblk" structure is attached to a bmsafemap structure when a block + * or fragment is allocated from a cylinder group. Its state is set to + * DEPCOMPLETE when its cylinder group map is written. It is converted to + * an allocdirect or allocindir allocation once the allocator calls the + * appropriate setup function. + */ +struct newblk { + struct worklist nb_list; +# define nb_state nb_list.wk_state + LIST_ENTRY(newblk) nb_hash; /* hashed lookup */ + LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblks */ + struct jnewblk *nb_jnewblk; /* New block journal entry. */ + struct bmsafemap *nb_bmsafemap;/* cylgrp dep (if pending) */ + struct freefrag *nb_freefrag; /* fragment to be freed (if any) */ + struct indirdephd nb_indirdeps; /* Children indirect blocks. */ + struct workhead nb_newdirblk; /* dir block to notify when written */ + struct workhead nb_jwork; /* Journal work pending. */ + ufs2_daddr_t nb_newblkno; /* new value of block pointer */ +}; + +/* * An "allocdirect" structure is attached to an "inodedep" when a new block * or fragment is allocated and pointed to by the inode described by * "inodedep". The worklist is linked to the buffer that holds the block. 
@@ -334,20 +382,18 @@ struct bmsafemap { * and inodedep->id_pendinghd lists. */ struct allocdirect { - struct worklist ad_list; /* buffer holding block */ -# define ad_state ad_list.wk_state /* block pointer state */ + struct newblk ad_block; /* Common block logic */ +# define ad_state ad_block.nb_list.wk_state /* block pointer state */ TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */ - ufs_lbn_t ad_lbn; /* block within file */ - ufs2_daddr_t ad_newblkno; /* new value of block pointer */ - ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ - long ad_newsize; /* size of new block */ - long ad_oldsize; /* size of old block */ - LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */ - struct buf *ad_buf; /* cylgrp buffer (if pending) */ struct inodedep *ad_inodedep; /* associated inodedep */ - struct freefrag *ad_freefrag; /* fragment to be freed (if any) */ - struct workhead ad_newdirblk; /* dir block to notify when written */ + ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ + int ad_offset; /* Pointer offset in parent. */ + long ad_newsize; /* size of new block */ + long ad_oldsize; /* size of old block */ }; +#define ad_newblkno ad_block.nb_newblkno +#define ad_freefrag ad_block.nb_freefrag +#define ad_newdirblk ad_block.nb_newdirblk /* * A single "indirdep" structure manages all allocation dependencies for @@ -369,10 +415,14 @@ struct allocdirect { struct indirdep { struct worklist ir_list; /* buffer holding indirect block */ # define ir_state ir_list.wk_state /* indirect block pointer state */ - caddr_t ir_saveddata; /* buffer cache contents */ + LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */ + caddr_t ir_saveddata; /* buffer cache contents */ struct buf *ir_savebp; /* buffer holding safe copy */ + struct allocindirhd ir_completehd; /* waiting for indirdep complete */ + struct allocindirhd ir_writehd; /* Waiting for the pointer write. */ struct allocindirhd ir_donehd; /* done waiting to update safecopy */ struct allocindirhd ir_deplisthd; /* allocindir deps for this block */ + struct workhead ir_jwork; /* Journal work pending. */ }; /* @@ -389,31 +439,39 @@ struct indirdep { * can then be freed as it is no longer applicable. */ struct allocindir { - struct worklist ai_list; /* buffer holding indirect block */ -# define ai_state ai_list.wk_state /* indirect block pointer state */ + struct newblk ai_block; /* Common block area */ +# define ai_state ai_block.nb_list.wk_state /* indirect pointer state */ LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */ - int ai_offset; /* pointer offset in indirect block */ - ufs2_daddr_t ai_newblkno; /* new block pointer value */ - ufs2_daddr_t ai_oldblkno; /* old block pointer value */ - struct freefrag *ai_freefrag; /* block to be freed when complete */ struct indirdep *ai_indirdep; /* address of associated indirdep */ - LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */ - struct buf *ai_buf; /* cylgrp buffer (if pending) */ + ufs2_daddr_t ai_oldblkno; /* old value of block pointer */ + int ai_offset; /* Pointer offset in parent. */ }; +#define ai_newblkno ai_block.nb_newblkno +#define ai_freefrag ai_block.nb_freefrag +#define ai_newdirblk ai_block.nb_newdirblk /* + * The allblk union is used to size the newblk structure on allocation so + * that it may be any one of three types. 
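+ * An allocation sized this way may later be converted in place, e.g.
+ * (illustrative sketch; M_NEWBLK and M_SOFTDEP_FLAGS are the softdep.c
+ * malloc conventions):
+ *
+ *	newblk = malloc(sizeof(union allblk), M_NEWBLK,
+ *	    M_SOFTDEP_FLAGS | M_ZERO);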
+ */
+union allblk {
+	struct allocindir	ab_allocindir;
+	struct allocdirect	ab_allocdirect;
+	struct newblk		ab_newblk;
+};
+
+/*
 * A "freefrag" structure is attached to an "inodedep" when a previously
 * allocated fragment is replaced with a larger fragment, rather than extended.
 * The "freefrag" structure is constructed and attached when the replacement
 * block is first allocated. It is processed after the inode claiming the
- * bigger block that replaces it has been written to disk. Note that the
- * ff_state field is is used to store the uid, so may lose data. However,
- * the uid is used only in printing an error message, so is not critical.
- * Keeping it in a short keeps the data structure down to 32 bytes.
+ * bigger block that replaces it has been written to disk.
 */
 struct freefrag {
 	struct worklist ff_list;	/* id_inowait or delayed worklist */
-#	define ff_state ff_list.wk_state /* owning user; should be uid_t */
+#	define ff_state ff_list.wk_state
+	struct jfreefrag *ff_jfreefrag;	/* Associated journal entry. */
+	struct workhead ff_jwork;	/* Journal work pending. */
 	ufs2_daddr_t ff_blkno;		/* fragment physical block number */
 	long ff_fragsize;		/* size of fragment being deleted */
 	ino_t ff_inum;			/* owning inode number */
@@ -423,23 +481,60 @@ struct freefrag {
 * A "freeblks" structure is attached to an "inodedep" when the
 * corresponding file's length is reduced to zero. It records all
 * the information needed to free the blocks of a file after its
- * zero'ed inode has been written to disk.
+ * zero'ed inode has been written to disk. The actual work is done
+ * by child freework structures which are responsible for individual
+ * inode pointers while freeblks is responsible for retiring the
+ * entire operation when it is complete and holding common members.
 */
 struct freeblks {
 	struct worklist fb_list;	/* id_inowait or delayed worklist */
#	define fb_state fb_list.wk_state /* inode and dirty block state */
+	struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */
+	struct workhead fb_freeworkhd;	/* Work items pending */
+	struct workhead fb_jwork;	/* Journal work pending */
 	ino_t fb_previousinum;	/* inode of previous owner of blocks */
 	uid_t fb_uid;		/* uid of previous owner of blocks */
 	struct vnode *fb_devvp;	/* filesystem device vnode */
-	long fb_oldextsize;	/* previous ext data size */
-	off_t fb_oldsize;	/* previous file size */
 	ufs2_daddr_t fb_chkcnt;	/* used to check cnt of blks released */
-	ufs2_daddr_t fb_dblks[NDADDR];	/* direct blk ptrs to deallocate */
-	ufs2_daddr_t fb_iblks[NIADDR];	/* indirect blk ptrs to deallocate */
-	ufs2_daddr_t fb_eblks[NXADDR];	/* indirect blk ptrs to deallocate */
+	int fb_ref;		/* Children outstanding. */
 };

 /*
+ * A "freework" structure handles the release of a tree of blocks or a single
+ * block.  Each indirect block in a tree is allocated its own freework
+ * structure so that the indirect block may be freed only when all of its
+ * children are freed.  In this way we enforce the rule that an allocated
+ * block must have a valid path to a root that is journaled.  Each child
+ * block acquires a reference and when the ref hits zero the parent ref
+ * is decremented.  If there is no parent the freeblks ref is decremented.
+ */
+struct freework {
+	struct worklist fw_list;
+#	define fw_state fw_list.wk_state
+	LIST_ENTRY(freework) fw_next;	/* Queue for freeblks. */
+	struct freeblks *fw_freeblks;	/* Root of operation. */
+	struct freework *fw_parent;	/* Parent indirect. */
+	ufs2_daddr_t fw_blkno;		/* Our block #.
*/ + ufs_lbn_t fw_lbn; /* Original lbn before free. */ + int fw_frags; /* Number of frags. */ + int fw_ref; /* Number of children out. */ + int fw_off; /* Current working position. */ + struct workhead fw_jwork; /* Journal work pending. */ +}; + +/* + * A "freedep" structure is allocated to track the completion of a bitmap + * write for a freework. One freedep may cover many freed blocks so long + * as they reside in the same cylinder group. When the cg is written + * the freedep decrements the ref on the freework which may permit it + * to be freed as well. + */ +struct freedep { + struct worklist fd_list; + struct freework *fd_freework; /* Parent freework. */ +}; + +/* * A "freefile" structure is attached to an inode when its * link count is reduced to zero. It marks the inode as free in * the cylinder group map after the zero'ed inode has been written @@ -450,6 +545,7 @@ struct freefile { mode_t fx_mode; /* mode of inode */ ino_t fx_oldinum; /* inum of the unlinked file */ struct vnode *fx_devvp; /* filesystem device vnode */ + struct workhead fx_jwork; /* journal work pending. */ }; /* @@ -482,12 +578,11 @@ struct freefile { * than zero. * * The overlaying of da_pagedep and da_previous is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. If a - * da_previous entry is present, the pointer to its pagedep is available - * in the associated dirrem entry. If the DIRCHG flag is set, the - * da_previous entry is valid; if not set the da_pagedep entry is valid. - * The DIRCHG flag never changes; it is set when the structure is created - * if appropriate and is never cleared. + * structure down. If a da_previous entry is present, the pointer to its + * pagedep is available in the associated dirrem entry. If the DIRCHG flag + * is set, the da_previous entry is valid; if not set the da_pagedep entry + * is valid. The DIRCHG flag never changes; it is set when the structure + * is created if appropriate and is never cleared. */ struct diradd { struct worklist da_list; /* id_inowait or id_pendinghd list */ @@ -499,6 +594,7 @@ struct diradd { struct dirrem *dau_previous; /* entry being replaced in dir change */ struct pagedep *dau_pagedep; /* pagedep dependency for addition */ } da_un; + struct workhead da_jwork; /* Journal work awaiting completion. */ }; #define da_previous da_un.dau_previous #define da_pagedep da_un.dau_pagedep @@ -525,12 +621,13 @@ struct diradd { * mkdir structures that reference it. The deletion would be faster if the * diradd structure were simply augmented to have two pointers that referenced * the associated mkdir's. However, this would increase the size of the diradd - * structure from 32 to 64-bits to speed a very infrequent operation. + * structure to speed a very infrequent operation. */ struct mkdir { struct worklist md_list; /* id_inowait or buffer holding dir */ # define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */ struct diradd *md_diradd; /* associated diradd */ + struct jaddref *md_jaddref; /* dependent jaddref. */ struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */ LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */ }; @@ -542,20 +639,19 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; * list of the pagedep for the directory page that contains the entry. * It is processed after the directory page with the deleted entry has * been written to disk. - * - * The overlaying of dm_pagedep and dm_dirinum is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. 
It works - * because they are never used concurrently. */ struct dirrem { struct worklist dm_list; /* delayed worklist */ # define dm_state dm_list.wk_state /* state of the old directory entry */ LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */ + LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */ + struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */ ino_t dm_oldinum; /* inum of the removed dir entry */ union { struct pagedep *dmu_pagedep; /* pagedep dependency for remove */ ino_t dmu_dirinum; /* parent inode number (for rmdir) */ } dm_un; + struct workhead dm_jwork; /* Journal work awaiting completion. */ }; #define dm_pagedep dm_un.dmu_pagedep #define dm_dirinum dm_un.dmu_dirinum @@ -577,9 +673,200 @@ struct dirrem { * blocks using a similar scheme with the allocindir structures. Rather * than adding this level of complexity, we simply write those newly * allocated indirect blocks synchronously as such allocations are rare. + * In the case of a new directory the . and .. links are tracked with + * a mkdir rather than a pagedep. In this case we track the mkdir + * so it can be released when it is written. A workhead is used + * to simplify canceling a mkdir that is removed by a subsequent dirrem. */ struct newdirblk { struct worklist db_list; /* id_inowait or pg_newdirblk */ # define db_state db_list.wk_state /* unused */ struct pagedep *db_pagedep; /* associated pagedep */ + struct workhead db_mkdir; }; + +/* + * The inoref structure holds the elements common to jaddref and jremref + * so they may easily be queued in-order on the inodedep. + */ +struct inoref { + struct worklist if_list; +# define if_state if_list.wk_state + TAILQ_ENTRY(inoref) if_deps; /* Links for inodedep. */ + struct jsegdep *if_jsegdep; + off_t if_diroff; /* Directory offset. */ + ino_t if_ino; /* Inode number. */ + ino_t if_parent; /* Parent inode number. */ + nlink_t if_nlink; /* nlink before addition. */ + uint16_t if_mode; /* File mode, needed for IFMT. */ +}; + +/* + * A "jaddref" structure tracks a new reference (link count) on an inode + * and prevents the link count increase and bitmap allocation until a + * journal entry can be written. Once the journal entry is written, + * the inode is put on the pendinghd of the bmsafemap and a diradd or + * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE + * flag is used to indicate that all of the required information for writing + * the journal entry is present. MKDIR_BODY and MKDIR_PARENT are used to + * differentiate . and .. links from regular file names. NEWBLOCK indicates + * a bitmap is still pending. If a new reference is canceled by a delete + * prior to writing the journal the jaddref write is canceled and the + * structure persists to prevent any disk-visible changes until it is + * ultimately released when the file is freed or the link is dropped again. + */ +struct jaddref { + struct inoref ja_ref; +# define ja_list ja_ref.if_list /* Journal pending or jseg entries. */ +# define ja_state ja_ref.if_list.wk_state + LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */ + union { + struct diradd *jau_diradd; /* Pending diradd. 
 */
+		struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */
+	} ja_un;
+};
+#define	ja_diradd	ja_un.jau_diradd
+#define	ja_mkdir	ja_un.jau_mkdir
+#define	ja_diroff	ja_ref.if_diroff
+#define	ja_ino		ja_ref.if_ino
+#define	ja_parent	ja_ref.if_parent
+#define	ja_mode		ja_ref.if_mode
+
+/*
+ * A "jremref" structure tracks a removed reference (unlink) on an
+ * inode and prevents the directory remove from proceeding until the
+ * journal entry is written.  Once the journal has been written the remove
+ * may proceed as normal.
+ */
+struct jremref {
+	struct inoref	jr_ref;
+#	define	jr_list jr_ref.if_list	/* Journal pending or jseg entries. */
+#	define	jr_state jr_ref.if_list.wk_state
+	LIST_ENTRY(jremref) jr_deps;	/* Links for pagedep. */
+	struct dirrem	*jr_dirrem;	/* Back pointer to dirrem. */
+};
+
+struct jmvref {
+	struct worklist	jm_list;
+	LIST_ENTRY(jmvref) jm_deps;
+	struct pagedep	*jm_pagedep;
+	ino_t		jm_parent;
+	ino_t		jm_ino;
+	off_t		jm_oldoff;
+	off_t		jm_newoff;
+};
+
+/*
+ * A "jnewblk" structure tracks a newly allocated block or fragment and
+ * prevents the direct or indirect block pointer as well as the cg bitmap
+ * from being written until it is logged.  After it is logged the jsegdep
+ * is attached to the allocdirect or allocindir until the operation is
+ * completed or reverted.  If the operation is reverted prior to the journal
+ * write the jnewblk structure is maintained to prevent the bitmaps from
+ * reaching the disk.  Ultimately the jnewblk structure will be passed
+ * to the free routine as the in memory cg is modified back to the free
+ * state at which time it can be released.
+ */
+struct jnewblk {
+	struct worklist	jn_list;
+#	define	jn_state jn_list.wk_state
+	struct jsegdep	*jn_jsegdep;
+	LIST_ENTRY(jnewblk) jn_deps;	/* All jnewblks on bmsafemap */
+	struct newblk	*jn_newblk;
+	ino_t		jn_ino;
+	ufs_lbn_t	jn_lbn;
+	ufs2_daddr_t	jn_blkno;
+	int		jn_oldfrags;
+	int		jn_frags;
+};
+
+/*
+ * A "jfreeblk" structure tracks the journal write for freeing a block
+ * or tree of blocks.  The block pointer must not be cleared in the inode
+ * or indirect prior to the jfreeblk being written.
+ */
+struct jfreeblk {
+	struct worklist	jf_list;
+#	define	jf_state jf_list.wk_state
+	struct jsegdep	*jf_jsegdep;
+	struct freeblks	*jf_freeblks;
+	LIST_ENTRY(jfreeblk) jf_deps;
+	ino_t		jf_ino;
+	ufs_lbn_t	jf_lbn;
+	ufs2_daddr_t	jf_blkno;
+	int		jf_frags;
+};
+
+/*
+ * A "jfreefrag" tracks the freeing of a single block when a fragment is
+ * extended or an indirect page is replaced.  It is not part of a larger
+ * freeblks operation.
+ */
+struct jfreefrag {
+	struct worklist	fr_list;
+#	define	fr_state fr_list.wk_state
+	struct jsegdep	*fr_jsegdep;
+	struct freefrag	*fr_freefrag;
+	ino_t		fr_ino;
+	ufs_lbn_t	fr_lbn;
+	ufs2_daddr_t	fr_blkno;
+	int		fr_frags;
+};
+
+/*
+ * A "jtrunc" journals the intent to truncate an inode to a non-zero
+ * value.  This is done synchronously prior to the synchronous partial
+ * truncation process.  The jsegdep is not released until the truncation
+ * is complete and the truncated inode is fsync'd.
+ */
+struct jtrunc {
+	struct worklist	jt_list;
+	struct jsegdep	*jt_jsegdep;
+	ino_t		jt_ino;
+	off_t		jt_size;
+	int		jt_extsize;
+};
+
+/*
+ * A "jsegdep" structure tracks a single reference to a written journal
+ * segment so the journal space can be reclaimed when all dependencies
+ * have been written.
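+ * The oldest outstanding jsegdep bounds jsr_oldest in later segment
+ * headers, which tells recovery where the live journal begins.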
+ */
+struct jsegdep {
+	struct worklist	jd_list;
+#	define	jd_state jd_list.wk_state
+	struct jseg	*jd_seg;
+};
+
+/*
+ * A "jseg" structure contains all of the journal records written in a
+ * single disk write.  jaddref and jremref structures are linked into
+ * js_entries so they may be completed when the write completes.  The
+ * js_deps array contains as many entries as there are ref counts to
+ * reduce the number of allocations required per journal write to one.
+ */
+struct jseg {
+	struct worklist	js_list;	/* b_deps link for journal */
+#	define	js_state js_list.wk_state
+	struct workhead	js_entries;	/* Entries awaiting write */
+	TAILQ_ENTRY(jseg) js_next;
+	struct jblocks	*js_jblocks;	/* Back pointer to block/seg list */
+	struct buf	*js_buf;	/* Buffer while unwritten */
+	uint64_t	js_seq;
+	int		js_size;	/* Allocated size in bytes */
+	int		js_cnt;		/* Total items allocated */
+	int		js_refs;	/* Count of items pending completion */
+};
+
+/*
+ * A 'sbdep' structure tracks the head of the free inode list and
+ * superblock writes.  This makes sure the superblock is always pointing at
+ * the first possible unlinked inode for the suj recovery process.  If a
+ * block write completes and we discover a new head is available the buf
+ * is dirtied and the dep is kept.
+ */
+struct sbdep {
+	struct worklist	sb_list;	/* b_dep linkage */
+	struct fs	*sb_fs;		/* Filesystem pointer within buf. */
+	struct ufsmount	*sb_ump;
+};
Index: sys/ufs/ffs/ffs_subr.c
===================================================================
--- sys/ufs/ffs/ffs_subr.c	(revision 202342)
+++ sys/ufs/ffs/ffs_subr.c	(working copy)
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
 #ifndef _KERNEL
 #include
 #include
-#include "fsck.h"
 #else
 #include
 #include
@@ -223,12 +222,43 @@ ffs_isblock(fs, cp, h)
 		mask = 0x01 << (h & 0x7);
 		return ((cp[h >> 3] & mask) == mask);
 	default:
+#ifdef _KERNEL
 		panic("ffs_isblock");
+#endif
+		break;
 	}
 	return (0);
 }

 /*
+ * check if a block is free
+ */
+int
+ffs_isfreeblock(fs, cp, h)
+	struct fs *fs;
+	u_char *cp;
+	ufs1_daddr_t h;
+{
+
+	switch ((int)fs->fs_frag) {
+	case 8:
+		return (cp[h] == 0);
+	case 4:
+		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+	case 2:
+		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+	case 1:
+		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+	default:
+#ifdef _KERNEL
+		panic("ffs_isfreeblock");
+#endif
+		break;
+	}
+	return (0);
+}
+
+/*
 * take a block out of the map
 */
 void
@@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h)
 		cp[h >> 3] &= ~(0x01 << (h & 0x7));
 		return;
 	default:
+#ifdef _KERNEL
 		panic("ffs_clrblock");
+#endif
+		break;
 	}
 }

@@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h)
 		cp[h >> 3] |= (0x01 << (h & 0x7));
 		return;
 	default:
+#ifdef _KERNEL
 		panic("ffs_setblock");
+#endif
+		break;
 	}
 }
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ */
+void
+ffs_clusteracct(fs, cgp, blkno, cnt)
+	struct fs *fs;
+	struct cg *cgp;
+	ufs1_daddr_t blkno;
+	int cnt;
+{
+	int32_t *sump;
+	int32_t *lp;
+	u_char *freemapp, *mapp;
+	int i, start, end, forw, back, map, bit;
+
+	if (fs->fs_contigsumsize <= 0)
+		return;
+	freemapp = cg_clustersfree(cgp);
+	sump = cg_clustersum(cgp);
+	/*
+	 * Allocate or clear the actual block.
+	 */
+	if (cnt > 0)
+		setbit(freemapp, blkno);
+	else
+		clrbit(freemapp, blkno);
+	/*
+	 * Find the size of the cluster going forward.
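+	 * The scan stops at the first allocated block or after
+	 * fs_contigsumsize bits, whichever comes first.  Freeing a block
+	 * with two free neighbors ahead and one behind, for instance,
+	 * joins a cluster of four: sump[4]++, sump[2]--, sump[1]--.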
+ */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if (end >= cgp->cg_nclusterblks) + end = cgp->cg_nclusterblks; + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + sump[i] += cnt; + if (back > 0) + sump[back] -= cnt; + if (forw > 0) + sump[forw] -= cnt; + /* + * Update cluster summary information. + */ + lp = &sump[fs->fs_contigsumsize]; + for (i = fs->fs_contigsumsize; i > 0; i--) + if (*lp-- > 0) + break; + fs->fs_maxcluster[cgp->cg_cgx] = i; +} Index: sys/ufs/ffs/ffs_balloc.c =================================================================== --- sys/ufs/ffs/ffs_balloc.c (revision 202342) +++ sys/ufs/ffs/ffs_balloc.c (working copy) @@ -120,6 +120,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse if (lbn < 0) return (EFBIG); + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment @@ -418,6 +420,8 @@ fail: * slow, running out of disk space is not expected to be a common * occurence. The error return from fsync is ignored as we already * have an error to return to the user. + * + * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; @@ -473,7 +477,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); } return (error); } @@ -515,6 +519,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse if (lbn < 0) return (EFBIG); + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); + /* * Check for allocating external data. */ @@ -930,6 +937,8 @@ fail: * slow, running out of disk space is not expected to be a common * occurence. The error return from fsync is ignored as we already * have an error to return to the user. + * + * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; @@ -985,7 +994,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); } return (error); } Index: sys/ufs/ffs/ffs_inode.c =================================================================== --- sys/ufs/ffs/ffs_inode.c (revision 202342) +++ sys/ufs/ffs/ffs_inode.c (working copy) @@ -92,15 +92,6 @@ ffs_update(vp, waitfor) fs = ip->i_fs; if (fs->fs_ronly) return (0); - /* - * Ensure that uid and gid are correct. This is a temporary - * fix until fsck has been changed to do the update. 
- */ - if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ - fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ - ip->i_din1->di_ouid = ip->i_uid; /* XXX */ - ip->i_din1->di_ogid = ip->i_gid; /* XXX */ - } /* XXX */ error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { @@ -160,6 +151,7 @@ ffs_truncate(vp, length, flags, cred, td) ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; ufs2_daddr_t count, blocksreleased = 0, datablocks; + void *cookie; struct bufobj *bo; struct fs *fs; struct buf *bp; @@ -173,11 +165,14 @@ ffs_truncate(vp, length, flags, cred, td) fs = ip->i_fs; ump = ip->i_ump; bo = &vp->v_bufobj; + cookie = NULL; ASSERT_VOP_LOCKED(vp, "ffs_truncate"); if (length < 0) return (EINVAL); + if (length > fs->fs_maxfilesize) + return (EFBIG); /* * Historically clients did not have to specify which data * they were truncating. So, if not specified, we assume @@ -212,6 +207,8 @@ ffs_truncate(vp, length, flags, cred, td) panic("ffs_truncate: partial trunc of extdata"); if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) return (error); + if (DOINGSUJ(vp)) + cookie = softdep_setup_trunc(vp, length, flags); osize = ip->i_din2->di_extsize; ip->i_din2->di_blocks -= extblocks; #ifdef QUOTA @@ -227,19 +224,19 @@ ffs_truncate(vp, length, flags, cred, td) } ip->i_flag |= IN_CHANGE; if ((error = ffs_update(vp, 1))) - return (error); + goto out; for (i = 0; i < NXADDR; i++) { if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i], - sblksize(fs, osize, i), ip->i_number); + sblksize(fs, osize, i), ip->i_number, NULL); } } } - if ((flags & IO_NORMAL) == 0) - return (0); - if (length > fs->fs_maxfilesize) - return (EFBIG); + if ((flags & IO_NORMAL) == 0) { + error = 0; + goto out; + } if (vp->v_type == VLNK && (ip->i_size < vp->v_mount->mnt_maxsymlinklen || datablocks == 0)) { @@ -253,24 +250,52 @@ ffs_truncate(vp, length, flags, cred, td) ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) softdep_setup_freeblocks(ip, length, IO_EXT); - return (ffs_update(vp, 1)); + error = ffs_update(vp, 1); + goto out; } if (ip->i_size == length) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) softdep_setup_freeblocks(ip, length, IO_EXT); - return (ffs_update(vp, 0)); + error = ffs_update(vp, 0); + goto out; } if (fs->fs_ronly) panic("ffs_truncate: read-only filesystem"); #ifdef QUOTA error = getinoquota(ip); if (error) - return (error); + goto out; #endif if ((ip->i_flags & SF_SNAPSHOT) != 0) ffs_snapremove(vp); vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + osize = ip->i_size; + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. + */ + if (osize < length) { + vnode_pager_setsize(vp, length); + flags |= BA_CLRBUF; + error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); + if (error) { + vnode_pager_setsize(vp, osize); + goto out; + } + ip->i_size = length; + DIP_SET(ip, i_size, length); + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + error = ffs_update(vp, 1); + goto out; + } if (DOINGSOFTDEP(vp)) { if (length > 0 || softdepslowdown) { /* @@ -283,11 +308,18 @@ ffs_truncate(vp, length, flags, cred, td) * so that it will have no data structures left. 
*/ if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) - return (error); + goto out; UFS_LOCK(ump); if (ip->i_flag & IN_SPACECOUNTED) fs->fs_pendingblocks -= datablocks; UFS_UNLOCK(ump); + /* + * We have to journal the truncation before we change + * any blocks so we don't leave the file partially + * truncated. + */ + if (DOINGSUJ(vp) && cookie == NULL) + cookie = softdep_setup_trunc(vp, length, flags); } else { #ifdef QUOTA (void) chkdq(ip, -datablocks, NOCRED, 0); @@ -301,35 +333,11 @@ ffs_truncate(vp, length, flags, cred, td) OFF_TO_IDX(lblktosize(fs, -extblocks))); vnode_pager_setsize(vp, 0); ip->i_flag |= IN_CHANGE | IN_UPDATE; - return (ffs_update(vp, 0)); + error = ffs_update(vp, 0); + goto out; } } - osize = ip->i_size; /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of osize is 0, length will be at least 1. - */ - if (osize < length) { - vnode_pager_setsize(vp, length); - flags |= BA_CLRBUF; - error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); - if (error) { - vnode_pager_setsize(vp, osize); - return (error); - } - ip->i_size = length; - DIP_SET(ip, i_size, length); - if (bp->b_bufsize == fs->fs_bsize) - bp->b_flags |= B_CLUSTEROK; - if (flags & IO_SYNC) - bwrite(bp); - else - bawrite(bp); - ip->i_flag |= IN_CHANGE | IN_UPDATE; - return (ffs_update(vp, 1)); - } - /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be @@ -345,9 +353,8 @@ ffs_truncate(vp, length, flags, cred, td) lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); - if (error) { - return (error); - } + if (error) + goto out; /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized @@ -359,7 +366,7 @@ ffs_truncate(vp, length, flags, cred, td) if (DOINGSOFTDEP(vp) && lbn < NDADDR && fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT)) != 0) - return (error); + goto out; ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); @@ -445,7 +452,7 @@ ffs_truncate(vp, length, flags, cred, td) if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ip->i_devvp, bn, - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); blocksreleased += nblocks; } } @@ -464,7 +471,8 @@ ffs_truncate(vp, length, flags, cred, td) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); - ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number); + ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number, + NULL); blocksreleased += btodb(bsize); } if (lastblock < 0) @@ -496,7 +504,7 @@ ffs_truncate(vp, length, flags, cred, td) */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ip->i_devvp, bn, - oldspace - newspace, ip->i_number); + oldspace - newspace, ip->i_number, NULL); blocksreleased += btodb(oldspace - newspace); } } @@ -528,7 +536,14 @@ done: #ifdef QUOTA (void) chkdq(ip, -blocksreleased, NOCRED, 0); #endif - return (allerror); + error = allerror; +out: + if (cookie) { + allerror = softdep_complete_trunc(vp, cookie); + if (allerror != 0 && error == 0) + error = allerror; + } + return (error); } /* @@ -638,7 +653,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp blocksreleased += blkcount; } ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); blocksreleased += nblocks; } Index: 
sys/ufs/ffs/fs.h =================================================================== --- sys/ufs/ffs/fs.h (revision 202342) +++ sys/ufs/ffs/fs.h (working copy) @@ -340,7 +340,10 @@ struct fs { int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */ - int32_t fs_sparecon32[26]; /* reserved for future constants */ + int32_t fs_sujournal; /* SUJ journal file */ + int32_t fs_sujfree; /* SUJ free list */ + ufs_time_t fs_mtime; /* Last mount or fsck time. */ + int32_t fs_sparecon32[22]; /* reserved for future constants */ int32_t fs_flags; /* see FS_ flags below */ int32_t fs_contigsumsize; /* size of cluster summary array */ int32_t fs_maxsymlinklen; /* max length of an internal symlink */ @@ -414,6 +417,7 @@ CTASSERT(sizeof(struct fs) == 1376); #define FS_GJOURNAL 0x0040 /* gjournaled file system */ #define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */ #define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */ +#define FS_SUJ 0x200 /* Filesystem using softupdate journal */ /* * Macros to access bits in the fs_active array. @@ -603,8 +607,32 @@ struct cg { ? (fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (size))))) - /* + * Indirect lbns are aligned on NDADDR addresses where single indirects + * are the negated address of the lowest lbn reachable, double indirects + * are this lbn - 1 and triple indirects are this lbn - 2. This yields + * an unusual bit order to determine level. + */ +static inline int +lbn_level(ufs_lbn_t lbn) +{ + if (lbn >= 0) + return 0; + switch (lbn & 0x3) { + case 0: + return (0); + case 1: + break; + case 2: + return (2); + case 3: + return (1); + default: + break; + } + return (-1); +} +/* * Number of inodes in a secondary storage block/fragment. */ #define INOPB(fs) ((fs)->fs_inopb) @@ -615,6 +643,89 @@ struct cg { */ #define NINDIR(fs) ((fs)->fs_nindir) +/* + * Softdep journal record format. + */ + +#define JOP_ADDREF 1 /* Add a reference to an inode. */ +#define JOP_REMREF 2 /* Remove a reference from an inode. */ +#define JOP_NEWBLK 3 /* Allocate a block. */ +#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */ +#define JOP_MVREF 5 /* Move a reference from one off to another. */ +#define JOP_TRUNC 6 /* Partial truncation record. */ + +#define JREC_SIZE 32 /* Record and segment header size. */ + +#define SUJ_MIN (1 * 1024 * 1024) /* Minimum journal size */ +#define SUJ_MAX (64 * SUJ_MIN) /* Maximum journal size */ + +/* + * Size of the segment record header. There is at most one for each disk + * block and at least one for each filesystem block in the journal. The + * segment header is followed by an array of records. 
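+ * At the common 512 byte disk block size this leaves room for the
+ * header plus fifteen more JREC_SIZE (32 byte) records per block.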
+ */ +struct jsegrec { + uint64_t jsr_seq; /* Our sequence number */ + uint64_t jsr_oldest; /* Oldest valid sequence number */ + uint32_t jsr_cnt; /* Count of valid records */ + uint32_t jsr_crc; /* 32bit crc of the valid space */ + ufs_time_t jsr_time; /* timestamp for mount instance */ +}; + +struct jrefrec { + uint32_t jr_op; + ino_t jr_ino; + ino_t jr_parent; + uint16_t jr_nlink; + uint16_t jr_mode; + off_t jr_diroff; + uint64_t jr_unused; +}; + +struct jmvrec { + uint32_t jm_op; + ino_t jm_ino; + ino_t jm_parent; + uint16_t jm_unused; + off_t jm_oldoff; + off_t jm_newoff; +}; + +struct jblkrec { + uint32_t jb_op; + uint32_t jb_ino; + ufs2_daddr_t jb_blkno; + ufs_lbn_t jb_lbn; + uint16_t jb_frags; + uint16_t jb_oldfrags; + uint32_t jb_unused; +}; + +struct jtrncrec { + uint32_t jt_op; + uint32_t jt_ino; + off_t jt_size; + uint32_t jt_extsize; + uint32_t jt_pad[3]; +}; + +union jrec { + struct jsegrec rec_jsegrec; + struct jrefrec rec_jrefrec; + struct jmvrec rec_jmvrec; + struct jblkrec rec_jblkrec; + struct jtrncrec rec_jtrncrec; +}; + +#ifdef CTASSERT +CTASSERT(sizeof(struct jsegrec) == JREC_SIZE); +CTASSERT(sizeof(struct jrefrec) == JREC_SIZE); +CTASSERT(sizeof(struct jmvrec) == JREC_SIZE); +CTASSERT(sizeof(struct jblkrec) == JREC_SIZE); +CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE); +CTASSERT(sizeof(union jrec) == JREC_SIZE); +#endif + extern int inside[], around[]; extern u_char *fragtbl[]; Index: sys/ufs/ffs/ffs_snapshot.c =================================================================== --- sys/ufs/ffs/ffs_snapshot.c (revision 202342) +++ sys/ufs/ffs/ffs_snapshot.c (working copy) @@ -582,7 +582,8 @@ loop: len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, - DIP(xp, i_db[loc]), len, xp->i_number); + DIP(xp, i_db[loc]), len, xp->i_number, + NULL); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } @@ -598,7 +599,7 @@ loop: DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, - xp->i_mode); + xp->i_mode, NULL); VOP_UNLOCK(xvp, 0); vdrop(xvp); if (error) { @@ -700,7 +701,7 @@ out1: copy_fs, vp, xp->i_number, - xp->i_mode); + xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; @@ -1220,7 +1221,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, ex *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); } return (0); } @@ -1500,7 +1501,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, ex *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); } return (0); } Index: sys/kern/vfs_bio.c =================================================================== --- sys/kern/vfs_bio.c (revision 202342) +++ sys/kern/vfs_bio.c (working copy) @@ -216,6 +216,14 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLA static int bd_request; /* + * Request for the buf daemon to write more buffers than is indicated by + * lodirtybuf. This may be necessary to push out excess dependencies or + * defragment the address space where a simple count of the number of dirty + * buffers is insufficient to characterize the demand for flushing them. + */ +static int bd_speedupreq; + +/* * This lock synchronizes access to bd_request. 
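 * It now guards bd_speedupreq as well.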
*/ static struct mtx bdlock; @@ -467,12 +475,20 @@ bd_wakeup(int dirtybuflevel) * bd_speedup - speedup the buffer cache flushing code */ -static __inline void bd_speedup(void) { + int needwake; - bd_wakeup(1); + mtx_lock(&bdlock); + needwake = 0; + if (bd_speedupreq == 0 || bd_request == 0) + needwake = 1; + bd_speedupreq = 1; + bd_request = 1; + if (needwake) + wakeup(&bd_request); + mtx_unlock(&bdlock); } /* @@ -2120,6 +2136,7 @@ buf_do_flush(struct vnode *vp) static void buf_daemon() { + int lodirtysave; /* * This process needs to be suspended prior to shutdown sync. @@ -2137,7 +2154,11 @@ buf_daemon() mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); - + lodirtysave = lodirtybuffers; + if (bd_speedupreq) { + lodirtybuffers = numdirtybuffers / 2; + bd_speedupreq = 0; + } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate @@ -2149,6 +2170,7 @@ buf_daemon() break; uio_yield(); } + lodirtybuffers = lodirtysave; /* * Only clear bd_request if we have reached our low water Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c (revision 202342) +++ sys/kern/vfs_subr.c (working copy) @@ -2833,6 +2833,7 @@ DB_SHOW_COMMAND(mount, db_show_mount) MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); + MNT_FLAG(MNT_SOFTDEP); #undef MNT_FLAG if (flags != 0) { if (buf[0] != '\0') Index: sys/sys/mount.h =================================================================== --- sys/sys/mount.h (revision 202342) +++ sys/sys/mount.h (working copy) @@ -240,6 +240,7 @@ void __mnt_vnode_markerfree(struct vnode #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ #define MNT_NFS4ACLS 0x00000010 +#define MNT_SUJ 0x00000080 /* softdep journaling */ /* * NFS export related mount flags. @@ -275,7 +276,8 @@ void __mnt_vnode_markerfree(struct vnode MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ - MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | MNT_NFS4ACLS) + MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ + MNT_NFS4ACLS | MNT_SUJ) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ Index: sys/sys/buf.h =================================================================== --- sys/sys/buf.h (revision 202342) +++ sys/sys/buf.h (working copy) @@ -493,6 +493,7 @@ int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); +void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **);
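
bd_speedup() is exported above so that subsystems other than the buf daemon
can ask for a more aggressive flush when dependencies pile up.  A minimal
sketch of such a consumer, assuming a hypothetical worklist_speedup() helper
that kicks the softdep work queue:

/*
 * Illustrative only: request that dirty buffers be pushed out faster
 * when journal or worklist space runs low.  worklist_speedup() is a
 * hypothetical helper, not part of this patch.
 */
static void
journal_space_low(void)
{

	worklist_speedup();
	bd_speedup();
}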