GENERIC HEAD from 2010-01-19 11:42:15 UTC, r202614M, vmcore.44

KDB: debugger backends: ddb
KDB: current backend: ddb
Copyright (c) 1992-2010 The FreeBSD Project.
Copyright (c) 1979, 1980, 1983, 1986, 1988, 1989, 1991, 1992, 1993, 1994
	The Regents of the University of California. All rights reserved.
FreeBSD is a registered trademark of The FreeBSD Foundation.
FreeBSD 9.0-CURRENT #0 r202614M: Tue Jan 19 13:59:33 CET 2010
    pho@crashbox.osted.lan:/usr/src/sys/i386/compile/PHO i386
WARNING: WITNESS option enabled, expect reduced performance.
WARNING: DIAGNOSTIC option enabled, expect reduced performance.
Timecounter "i8254" frequency 1193182 Hz quality 0
CPU: Intel(R) XEON(TM) CPU 1.80GHz (1799.80-MHz 686-class CPU)
  Origin = "GenuineIntel"  Id = 0xf24  Stepping = 4
  Features=0x3febfbff
real memory  = 1073741824 (1024 MB)
avail memory = 1031360512 (983 MB)
:
Trying to mount root from ufs:/dev/ad0s1a
WARNING: / was not properly dismounted
Enter full pathname of shell or RETURN for /bin/sh:
# fsck -y /
** /dev/ad0s1a
** Last Mounted on /
** Root file system
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
4408 files, 238909 used, 267578 free (602 frags, 33372 blocks, 0.1% fragmentation)

***** FILE SYSTEM MARKED CLEAN *****
# newfs -U /dev/ad0s1e > /dev/null
# tunefs -j enable /dev/ad0s1e
Using inode 4 in cg 0 for 33554432 byte journal
tunefs: soft updates journaling set
# mount /tmp
# umount /tmp
lock order reversal:
 1st 0xc44c046c ufs (ufs) @ kern/vfs_mount.c:1204
 2nd 0xc46c5b38 devfs (devfs) @ ufs/ffs/ffs_vfsops.c:1236
KDB: stack backtrace:
db_trace_self_wrapper(c0cabf9f,e69139c8,c08d8065,c08c8abb,c0caef50,...) at db_trace_self_wrapper+0x26
kdb_backtrace(c08c8abb,c0caef50,c413e1c0,c413e088,e6913a24,...) at kdb_backtrace+0x29
_witness_debugger(c0caef50,c46c5b38,c0c9d438,c413e088,c0cd1c0a,...) at _witness_debugger+0x25
witness_checkorder(c46c5b38,9,c0cd1c01,4d4,c46c5ba4,...) at witness_checkorder+0x839
__lockmgr_args(c46c5b38,80400,c46c5ba4,0,0,...) at __lockmgr_args+0x804
vop_stdlock(e6913b40,c0cd2acf,79,80400,c46c5ae0,...) at vop_stdlock+0x65
VOP_LOCK1_APV(c0d929a0,e6913b40,c46c515c,c0dd3fa0,c46c5ae0,...) at VOP_LOCK1_APV+0xb5
_vn_lock(c46c5ae0,80400,c0cd1c01,4d4,c46bbb00,...) at _vn_lock+0x78
ffs_flushfiles(c4764000,0,c4733900,e6913bc8,3,...) at ffs_flushfiles+0x11a
softdep_flushfiles(c4764000,0,c4733900,0,1,...) at softdep_flushfiles+0x2e
ffs_unmount(c4764000,8000000,c0cb58e4,4f9,80,...) at ffs_unmount+0x18f
dounmount(c4764000,8000000,c4733900,47e,2d9c374f,...) at dounmount+0x46d
unmount(c4733900,e6913cf8,8,c4733900,c0d96768,...) at unmount+0x2ff
syscall(e6913d38) at syscall+0x2b4
Xint0x80_syscall() at Xint0x80_syscall+0x20
--- syscall (22, FreeBSD ELF32, unmount), eip = 0x280dae8f, esp = 0xbfbfe64c, ebp = 0xbfbfe718 ---
# fsck -y
** /dev/ad0s1a
** Last Mounted on /
** Root file system
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
4408 files, 238909 used, 267578 free (602 frags, 33372 blocks, 0.1% fragmentation)

***** FILE SYSTEM IS CLEAN *****
** /dev/ad0s1f (NO WRITE)
** Last Mounted on /home
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
1962 files, 60070 used, 446417 free (513 frags, 55738 blocks, 0.1% fragmentation)
** /dev/ad0s1e
** Last Mounted on /tmp
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
3 files, 16394 used, 2012637 free (21 frags, 251577 blocks, 0.0% fragmentation)

***** FILE SYSTEM IS CLEAN *****
** /dev/ad0s1d
** Last Mounted on /usr
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
429322 files, 2188686 used, 2888393 free (50353 frags, 354755 blocks, 1.0% fragmentation)

***** FILE SYSTEM MARKED CLEAN *****
** /dev/ad0s1g
** Last Mounted on /var
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
394857 files, 7446128 used, 18445217 free (23449 frags, 2302721 blocks, 0.1% fragmentation)

***** FILE SYSTEM MARKED CLEAN *****
# umount /home
# fsck -y /home
** /dev/ad0s1f
** Last Mounted on /home
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
1962 files, 60070 used, 446417 free (513 frags, 55738 blocks, 0.1% fragmentation)

***** FILE SYSTEM MARKED CLEAN *****
# exit
Entropy harvesting: interrupts ethernet point_to_point kickstart.
Fast boot: skipping disk checks.
lock order reversal:
 1st 0xd81027c0 bufwait (bufwait) @ kern/vfs_bio.c:2581
 2nd 0xc46a5c00 dirhash (dirhash) @ ufs/ufs/ufs_dirhash.c:283
KDB: stack backtrace:
db_trace_self_wrapper(c0cabf9f,e699987c,c08d8065,c08c8abb,c0caef50,...) at db_trace_self_wrapper+0x26
kdb_backtrace(c08c8abb,c0caef50,c413af60,c413e228,e69998d8,...) at kdb_backtrace+0x29
_witness_debugger(c0caef50,c46a5c00,c0cd2952,c413e228,c0cd25e4,...) at _witness_debugger+0x25
witness_checkorder(c46a5c00,9,c0cd25db,11b,0,...) at witness_checkorder+0x839
_sx_xlock(c46a5c00,0,c0cd25db,11b,c479c910,...) at _sx_xlock+0x85
ufsdirhash_acquire(d8102760,e6999a1c,164,d87694ac,e69999a8,...) at ufsdirhash_acquire+0x48
ufsdirhash_add(c479c910,e6999a1c,4ac,e6999994,e6999998,...) at ufsdirhash_add+0x13
ufs_direnter(c482515c,c47a1d98,e6999a1c,e6999c00,d81051a0,...) at ufs_direnter+0x749
ufs_mkdir(e6999c28,c0ce8045,0,0,e6999b6c,...) at ufs_mkdir+0x993
VOP_MKDIR_APV(c0db95c0,e6999c28,e6999c00,e6999b6c,0,...) at VOP_MKDIR_APV+0xc5
kern_mkdirat(c4822240,ffffff9c,bfbfef5a,0,1ff,...) at kern_mkdirat+0x21b
kern_mkdir(c4822240,bfbfef5a,0,1ff,e6999d2c,...) at kern_mkdir+0x2e
mkdir(c4822240,e6999cf8,8,c0caf801,c0d973e0,...) at mkdir+0x29
syscall(e6999d38) at syscall+0x2b4
Xint0x80_syscall() at Xint0x80_syscall+0x20
--- syscall (136, FreeBSD ELF32, mkdir), eip = 0x2816b313, esp = 0xbfbfed6c, ebp = 0xbfbfee38 ---
fxp0: link state changed to UP
Starting Network: lo0 fxp0.
add net default: gateway 192.168.1.1
Additional ABI support: linux.
Starting mountd.
Configuring syscons: keymap blanktime.
Local package initialization:lock order reversal:
 1st 0xc47a01b4 ufs (ufs) @ kern/vfs_subr.c:2091
 2nd 0xd810f960 bufwait (bufwait) @ ufs/ffs/ffs_softdep.c:10915
 3rd 0xc48bac94 ufs (ufs) @ kern/vfs_subr.c:2091
KDB: stack backtrace:
db_trace_self_wrapper(c0cabf9f,e6a5f86c,c08d8065,c08c8abb,c0caef69,...) at db_trace_self_wrapper+0x26
kdb_backtrace(c08c8abb,c0caef69,c413af60,c413e1c0,e6a5f8c8,...) at kdb_backtrace+0x29
_witness_debugger(c0caef69,c48bac94,c0ca157a,c413e1c0,c0cb60ef,...) at _witness_debugger+0x25
witness_checkorder(c48bac94,9,c0cb60e6,82b,0,...) at witness_checkorder+0x839
__lockmgr_args(c48bac94,80100,c48bad00,0,0,...) at __lockmgr_args+0x804
ffs_lock(e6a5f9e8,c08d7e0b,c0cb55cd,80100,c48bac3c,...) at ffs_lock+0xa1
VOP_LOCK1_APV(c0db95c0,e6a5f9e8,109,c0dd3fa0,c48bac3c,...) at VOP_LOCK1_APV+0xb5
_vn_lock(c48bac3c,80100,c0cb60e6,82b,4,...) at _vn_lock+0x78
vget(c48bac3c,80100,c4a67480,50,0,...) at vget+0xbb
vfs_hash_get(c476387c,61c00,80000,c4a67480,e6a5fb38,...) at vfs_hash_get+0xed
ffs_vgetf(c476387c,61c00,80000,e6a5fb38,1,...) at ffs_vgetf+0x49
softdep_sync_metadata(c47a015c,0,c0cd215a,147,0,...) at softdep_sync_metadata+0x663
ffs_syncvnode(c47a015c,1,c4a67480,547,c0cb6b7d,...) at ffs_syncvnode+0x3e2
ffs_sync(c476387c,1,c0cb58e4,4f9,80,...) at ffs_sync+0x26f
dounmount(c476387c,8000000,c4a67480,47e,2d9c374f,...) at dounmount+0x44e
unmount(c4a67480,e6a5fcf8,8,c4a67480,c0d96768,...) at unmount+0x2ff
syscall(e6a5fd38) at syscall+0x2b4
Xint0x80_syscall() at Xint0x80_syscall+0x20
--- syscall (22, FreeBSD ELF32, unmount), eip = 0x280dae8f, esp = 0xbfbfe68c, ebp = 0xbfbfe758 ---
fsync: giving up on dirty
0xc46c5ae0: tag devfs, type VCHR
    usecount 1, writecount 0, refcount 16 mountedhere 0xc46bad00
    flags ()
    v_object 0xc471fbb0 ref 0 pages 50
     lock type devfs: EXCL by thread 0xc4a67480 (pid 985)
#0 0xc087abbe at __lockmgr_args+0xbfe
#1 0xc0916e15 at vop_stdlock+0x65
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc0ad88c5 at ffs_sync+0x3d5
#5 0xc091f3ce at dounmount+0x44e
#6 0xc091f97f at unmount+0x2ff
#7 0xc0bd5464 at syscall+0x2b4
#8 0xc0bb7790 at Xint0x80_syscall+0x20
	dev ad0s1e
umount: unmount of /tmp failed: Resource temporarily unavailable
** /dev/ad0s1e (NO WRITE)
** Last Mounted on /tmp
** Phase 1 - Check Blocks and Sizes
** Phase 2 - Check Pathnames
** Phase 3 - Check Connectivity
** Phase 4 - Check Reference Counts
** Phase 5 - Check Cyl groups
7 files, 16398 used, 2012633 free (49 frags, 251573 blocks, 0.0% fragmentation)
mount: /dev/ad0s1e : Operation not permitted
usage: kill [-s signal_name] pid ...
       kill -l [exit_status]
       kill -signal_name pid ...
       kill -signal_number pid ...
fsck -y /tmp
watchdogd.
Tue Jan 19 15:54:37 CET 2010
Jan 19 15:54:55 crashbox su: pho to root on /dev/pts/0
Stopping inetd.
Stopping moused.
Waiting for PIDS: 1016.
Shutting down local packages:.
Stopping cron.
Stopping sshd.
Stopping ntpd.
Stopping nfsd.
Stopping rpcbind.
Stopping devd.
Writing entropy file:.
Terminated .
Jan 19 15:55:00 crashbox syslogd: exiting on signal 15
Enter full pathname of shell or RETURN for /bin/sh:
# umount /tmp
fsync: giving up on dirty
0xc46c5ae0: tag devfs, type VCHR
    usecount 1, writecount 0, refcount 16 mountedhere 0xc46bad00
    flags ()
    v_object 0xc471fbb0 ref 0 pages 50
     lock type devfs: EXCL by thread 0xc475fb40 (pid 1333)
#0 0xc087abbe at __lockmgr_args+0xbfe
#1 0xc0916e15 at vop_stdlock+0x65
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc0ad88c5 at ffs_sync+0x3d5
#5 0xc091f3ce at dounmount+0x44e
#6 0xc091f97f at unmount+0x2ff
#7 0xc0bd5464 at syscall+0x2b4
#8 0xc0bb7790 at Xint0x80_syscall+0x20
	dev ad0s1e
umount: unmount of /tmp failed: Resource temporarily unavailable
# fstat /tmp
USER     CMD          PID   FD MOUNT      INUM MODE         SZ|DV R/W NAME
# mount
/dev/ad0s1a on / (ufs, local)
devfs on /dev (devfs, local, multilabel)
/dev/ad0s1f on /home (ufs, local)
/dev/ad0s1e on /tmp (ufs, NFS exported, local, union, soft-updates)
/dev/ad0s1d on /usr (ufs, local)
/dev/ad0s1g on /var (ufs, local)
# dumpfs /dev/ad0s1e | less
magic	19540119 (UFS2)	time	Tue Jan 19 15:54:35 2010
superblock location	65536	id	[ 4b55c6d0 2d9c374f ]
ncg	23	size	2097152	blocks	2029031
bsize	16384	shift	14	mask	0xffffc000
fsize	2048	shift	11	mask	0xfffff800
frag	8	shift	3	fsbtodb	2
minfree	8%	optim	time	symlinklen 120
maxbsize 16384	maxbpg	2048	maxcontig 8	contigsumsize 8
nbfree	251573	ndir	6	nifree	541687	nffree	49
bpg	11761	fpg	94088	ipg	23552	unrefs	0
nindir	2048	inopb	64	maxfilesize	140806241583103
sbsize	2048	cgsize	16384	csaddr	3000	cssize	2048
sblkno	40	cblkno	48	iblkno	56	dblkno	3000
cgrotor	0	fmod	0	ronly	0	clean	0
avgfpdir 64	avgfilesize 16384
flags	soft-updates unknown flags (0x200)
fsmnt	/tmp
volname		swuid	0
# ~KDB: enter: Line break on console
[thread pid 11 tid 100006 ]
Stopped at	kdb_enter+0x3a:	movl	$0,kdb_why
db> show mount
0xc46c0b50 /dev/ad0s1a on / (ufs)
0xc46c1000 devfs on /dev (devfs)
0xc4763b50 /dev/ad0s1f on /home (ufs)
0xc476387c /dev/ad0s1e on /tmp (ufs)
0xc47635a8 /dev/ad0s1d on /usr (ufs)
0xc47632d4 /dev/ad0s1g on /var (ufs)

More info: show mount <addr>
db> show mount 0xc476387c
0xc476387c /dev/ad0s1e on /tmp (ufs)
    mnt_flag = UNION, SOFTDEP, EXPORTED, DEFEXPORTED, LOCAL
    mnt_kern_flag = SOFTDEP, MPSAFE, LOOKUP_SHARED, 0x00000040
    mnt_opt = fstype, fspath, from, errmsg, rw, noro
    mnt_stat = { version=537068824 type=5 flags=0x0000000000201320
        bsize=2048 iosize=16384 blocks=2029031 bfree=2012633
        bavail=1850311 files=541694 ffree=541687 syncwrites=0
        asyncwrites=0 syncreads=0 asyncreads=0 namemax=255 owner=0
        fsid=[1263912656, 765212495] }
    mnt_cred = { uid=0 ruid=0 }
    mnt_ref = 8
    mnt_gen = 1
    mnt_nvnodelistsize = 8
    mnt_writeopcount = 0
    mnt_noasync = 1
    mnt_maxsymlinklen = 120
    mnt_iosize_max = 131072
    mnt_hashseed = 4211062285
    mnt_secondary_writes = 0
    mnt_secondary_accwrites = 21
    mnt_gjprovider = NULL

vnode 0xc47a02b8: tag ufs, type VREG
    usecount 0, writecount 0, refcount 1 mountedhere 0
    flags ()
     lock type ufs: UNLOCKED
#0 0xc087abbe at __lockmgr_args+0xbfe
#1 0xc0ad7a01 at ffs_vgetf+0x1e1
#2 0xc0ad7dce at ffs_vget+0x2e
#3 0xc0ac739b at softdep_mount+0xeb
#4 0xc0adb5f2 at ffs_mount+0x2452
#5 0xc09209c8 at vfs_donmount+0x1018
#6 0xc0922115 at nmount+0x75
#7 0xc0bd5464 at syscall+0x2b4
#8 0xc0bb7790 at Xint0x80_syscall+0x20
	ino 4, on dev ad0s1e

vnode 0xc47a015c: tag ufs, type VDIR
    usecount 0, writecount 0, refcount 2 mountedhere 0
    flags (VV_ROOT)
    v_object 0xc4823b28 ref 0 pages 1
     lock type ufs: UNLOCKED
#0 0xc087a552 at __lockmgr_args+0x592
#1 0xc0add191 at ffs_lock+0xa1
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc09278fb at vget+0xbb
#5 0xc091a7bd at vfs_hash_get+0xed
#6 0xc0ad7869 at ffs_vgetf+0x49
#7 0xc0ad7dce at ffs_vget+0x2e
#8 0xc0ae9888 at ufs_root+0x28
#9 0xc091bd51 at lookup+0x9a1
#10 0xc091c86f at namei+0x57f
#11 0xc092bf72 at kern_statat_vnhook+0x72
#12 0xc092c0cc at kern_statat+0x3c
#13 0xc092c216 at kern_stat+0x36
#14 0xc092c2bf at stat+0x2f
#15 0xc0bd5464 at syscall+0x2b4
#16 0xc0bb7790 at Xint0x80_syscall+0x20
	ino 2, on dev ad0s1e

vnode 0xc48bb15c: tag ufs, type VDIR
    usecount 0, writecount 0, refcount 1 mountedhere 0
    flags ()
     lock type ufs: UNLOCKED
#0 0xc087a552 at __lockmgr_args+0x592
#1 0xc0add191 at ffs_lock+0xa1
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc09278fb at vget+0xbb
#5 0xc091a7bd at vfs_hash_get+0xed
#6 0xc0ad7869 at ffs_vgetf+0x49
#7 0xc0ad7dce at ffs_vget+0x2e
#8 0xc0ae5411 at ufs_lookup_ino+0xaf1
#9 0xc0ae549a at ufs_lookup+0x2a
#10 0xc0beca85 at VOP_CACHEDLOOKUP_APV+0xc5
#11 0xc0914686 at vfs_cache_lookup+0xd6
#12 0xc0bef435 at VOP_LOOKUP_APV+0xe5
#13 0xc091ba2b at lookup+0x67b
#14 0xc091c86f at namei+0x57f
#15 0xc092bf72 at kern_statat_vnhook+0x72
#16 0xc092c0cc at kern_statat+0x3c
#17 0xc092c106 at kern_lstat+0x36
	ino 70656, on dev ad0s1e

vnode 0xc48bb000: tag ufs, type VDIR
    usecount 0, writecount 0, refcount 1 mountedhere 0
    flags ()
     lock type ufs: UNLOCKED
#0 0xc087a552 at __lockmgr_args+0x592
#1 0xc0add191 at ffs_lock+0xa1
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc09278fb at vget+0xbb
#5 0xc091a7bd at vfs_hash_get+0xed
#6 0xc0ad7869 at ffs_vgetf+0x49
#7 0xc0ad7dce at ffs_vget+0x2e
#8 0xc0ae5411 at ufs_lookup_ino+0xaf1
#9 0xc0ae549a at ufs_lookup+0x2a
#10 0xc0beca85 at VOP_CACHEDLOOKUP_APV+0xc5
#11 0xc0914686 at vfs_cache_lookup+0xd6
#12 0xc0bef435 at VOP_LOOKUP_APV+0xe5
#13 0xc091ba2b at lookup+0x67b
#14 0xc091c86f at namei+0x57f
#15 0xc092bf72 at kern_statat_vnhook+0x72
#16 0xc092c0cc at kern_statat+0x3c
#17 0xc092c106 at kern_lstat+0x36
	ino 494592, on dev ad0s1e

vnode 0xc48bad98: tag ufs, type VDIR
    usecount 0, writecount 0, refcount 1 mountedhere 0
    flags ()
     lock type ufs: UNLOCKED
#0 0xc087a552 at __lockmgr_args+0x592
#1 0xc0add191 at ffs_lock+0xa1
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc09278fb at vget+0xbb
#5 0xc091447b at cache_lookup+0x67b
#6 0xc091465d at vfs_cache_lookup+0xad
#7 0xc0bef435 at VOP_LOOKUP_APV+0xe5
#8 0xc091ba2b at lookup+0x67b
#9 0xc091c86f at namei+0x57f
#10 0xc092bdb6 at kern_pathconf+0x56
#11 0xc092beb1 at lpathconf+0x31
#12 0xc0bd5464 at syscall+0x2b4
#13 0xc0bb7790 at Xint0x80_syscall+0x20
	ino 23552, on dev ad0s1e

vnode 0xc48bac3c: tag ufs, type VDIR
    usecount 0, writecount 0, refcount 1 mountedhere 0
    flags ()
     lock type ufs: UNLOCKED
#0 0xc087a552 at __lockmgr_args+0x592
#1 0xc0add191 at ffs_lock+0xa1
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc09278fb at vget+0xbb
#5 0xc091a7bd at vfs_hash_get+0xed
#6 0xc0ad7869 at ffs_vgetf+0x49
#7 0xc0ad7dce at ffs_vget+0x2e
#8 0xc0ae5411 at ufs_lookup_ino+0xaf1
#9 0xc0ae549a at ufs_lookup+0x2a
#10 0xc0beca85 at VOP_CACHEDLOOKUP_APV+0xc5
#11 0xc0914686 at vfs_cache_lookup+0xd6
#12 0xc0bef435 at VOP_LOOKUP_APV+0xe5
#13 0xc091ba2b at lookup+0x67b
#14 0xc091c86f at namei+0x57f
#15 0xc092bf72 at kern_statat_vnhook+0x72
#16 0xc092c0cc at kern_statat+0x3c
#17 0xc092c106 at kern_lstat+0x36
	ino 400384, on dev ad0s1e

vnode 0xc4a7cd98: tag ufs, type VDIR
    usecount 0, writecount 0, refcount 0 mountedhere 0
    flags (VI_FREE)
     lock type ufs: UNLOCKED
#0 0xc087abbe at __lockmgr_args+0xbfe
#1 0xc0ad7a01 at ffs_vgetf+0x1e1
#2 0xc0ad7dce at ffs_vget+0x2e
#3 0xc0ae5411 at ufs_lookup_ino+0xaf1
#4 0xc0ae549a at ufs_lookup+0x2a
#5 0xc0beca85 at VOP_CACHEDLOOKUP_APV+0xc5
#6 0xc0914686 at vfs_cache_lookup+0xd6
#7 0xc0bef435 at VOP_LOOKUP_APV+0xe5
#8 0xc091ba2b at lookup+0x67b
#9 0xc091c86f at namei+0x57f
#10 0xc092bf72 at kern_statat_vnhook+0x72
#11 0xc092c0cc at kern_statat+0x3c
#12 0xc092c106 at kern_lstat+0x36
#13 0xc092c1af at lstat+0x2f
#14 0xc0bd5464 at syscall+0x2b4
#15 0xc0bb7790 at Xint0x80_syscall+0x20
	ino 3, on dev ad0s1e

vnode 0xc4cc8d98: tag syncer, type VNON
    usecount 1, writecount 0, refcount 1 mountedhere 0
    flags ()
     lock type syncer: UNLOCKED
#0 0xc087abbe at __lockmgr_args+0xbfe
#1 0xc0916e15 at vop_stdlock+0x65
#2 0xc0bee2b5 at VOP_LOCK1_APV+0xb5
#3 0xc09347f8 at _vn_lock+0x78
#4 0xc09292a2 at sync_vnode+0x142
#5 0xc0929613 at sched_sync+0x273
#6 0xc0867d58 at fork_exit+0xb8
#7 0xc0bb77a0 at fork_trampoline+0x8
db> run pho
db:0:pho> bt
Tracing pid 11 tid 100006 td 0xc417f480
kdb_enter(c0c51169,c0c919a2,0,c438a380,0,...) at kdb_enter+0x3a
uart_intr(c438a300,c417f480,c415d8d0,c4183100,4,...) at uart_intr+0x126
intr_event_handle(c4183100,c3f0ac34,0,1f4,c4425400,...) at intr_event_handle+0x5c
intr_execute_handlers(c415d8d0,c3f0ac34,0,c3f0ac74,c0bb7af4,...) at intr_execute_handlers+0x49
lapic_handle_intr(38,c3f0ac34) at lapic_handle_intr+0x4c
Xapic_isr1() at Xapic_isr1+0x34
--- interrupt, eip = 0xc0babf15, esp = 0xc3f0ac74, ebp = 0xc3f0ac74 ---
acpi_cpu_c1(1,0,0,c08b76a1,c0e1b3e0,...) at acpi_cpu_c1+0x5
acpi_cpu_idle(c3f0acb4,c0bc36cb,1,c3f0acf8,c08b799e,...) at acpi_cpu_idle+0x11c
cpu_idle_acpi(1,c3f0acf8,c08b799e,1,c3f0acd4,...) at cpu_idle_acpi+0x1b
cpu_idle(1,c3f0acd4,c0caa6ae,3b0,c417f480,...) at cpu_idle+0x1b
sched_idletd(0,c3f0ad38,c0ca3ea7,343,c417daa0,...) at sched_idletd+0x23e
fork_exit(c08b7760,0,c3f0ad38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f0ad70, ebp = 0 ---
db:0:bt> show allpcpu
Current CPU: 0

cpuid        = 0
dynamic pcpu = 0x650980
curthread    = 0xc417f480: pid 11 "idle: cpu0"
curpcb       = 0xc3f0ad90
fpcurthread  = none
idlethread   = 0xc417f480: pid 11 "idle: cpu0"
APIC ID      = 0
currentldt   = 0x50
spin locks held:

cpuid        = 1
dynamic pcpu = 0x310c980
curthread    = 0xc417f6c0: pid 11 "idle: cpu1"
curpcb       = 0xc3f07d90
fpcurthread  = none
idlethread   = 0xc417f6c0: pid 11 "idle: cpu1"
APIC ID      = 1
currentldt   = 0x50
spin locks held:

cpuid        = 2
dynamic pcpu = 0x310f980
curthread    = 0xc417f900: pid 11 "idle: cpu2"
curpcb       = 0xc3f04d90
fpcurthread  = none
idlethread   = 0xc417f900: pid 11 "idle: cpu2"
APIC ID      = 6
currentldt   = 0x50
spin locks held:

cpuid        = 3
dynamic pcpu = 0x3112980
curthread    = 0xc417fb40: pid 11 "idle: cpu3"
curpcb       = 0xc3f01d90
fpcurthread  = none
idlethread   = 0xc417fb40: pid 11 "idle: cpu3"
APIC ID      = 7
currentldt   = 0x50
spin locks held:

db:0:allpcpu> show alllocks
db:0:alllocks> show lockedvnods
Locked vnodes
db:0:lockedvnods> show mount
0xc46c0b50 /dev/ad0s1a on / (ufs)
0xc46c1000 devfs on /dev (devfs)
0xc4763b50 /dev/ad0s1f on /home (ufs)
0xc476387c /dev/ad0s1e on /tmp (ufs)
0xc47635a8 /dev/ad0s1d on /usr (ufs)
0xc47632d4 /dev/ad0s1g on /var (ufs)

More info: show mount <addr>
db:0:mount> ps
  pid  ppid  pgrp   uid   state   wmesg    wchan      cmd
 1332     1  1332     0  Ss+     ttyin    0xc4290a70 sh
   19     0     0     0  DL      flowclea 0xc0f75888 [flowcleaner]
   18     0     0     0  DL      sdflush  0xc0f810e0 [softdepflush]
   17     0     0     0  DL      syncer   0xc0f75698 [syncer]
   16     0     0     0  DL      vlruwt   0xc465dd48 [vnlru]
   15     0     0     0  DL      psleep   0xc0f753c8 [bufdaemon]
    9     0     0     0  DL      pgzero   0xc0f81f14 [pagezero]
    8     0     0     0  DL      psleep   0xc0f81b44 [vmdaemon]
    7     0     0     0  DL      psleep   0xc0f81b0c [pagedaemon]
    6     0     0     0  DL      -        0xc429063c [fdc0]
   14     0     0     0  DL      (threaded)          [usb]
 100034                  D       -        0xc43a3dac [usbus0]
 100033                  D       -        0xc43a3d7c [usbus0]
 100032                  D       -        0xc43a3d4c [usbus0]
 100031                  D       -        0xc43a3d1c [usbus0]
    5     0     0     0  DL      ccb_scan 0xc0dd5354 [xpt_thrd]
   13     0     0     0  DL      -        0xc0e08fc4 [yarrow]
    4     0     0     0  DL      -        0xc0e06d64 [g_down]
    3     0     0     0  DL      -        0xc0e06d60 [g_up]
    2     0     0     0  DL      -        0xc0e06d58 [g_event]
   12     0     0     0  WL      (threaded)          [intr]
 100042                  I                           [irq7: ppc0]
 100040                  I                           [swi0: uart uart]
 100039                  I                           [irq12: psm0]
 100038                  I                           [irq1: atkbd0]
 100037                  I                           [irq15: ata1]
 100036                  I                           [irq14: ata0]
 100035                  I                           [irq17: fxp0]
 100030                  I                           [irq16: uhci0]
 100028                  I                           [irq9: acpi0]
 100024                  I                           [swi2: cambio]
 100022                  I                           [swi6: task queue]
 100021                  I                           [swi6: Giant taskq]
 100019                  I                           [swi5: +]
 100012                  I                           [swi1: netisr 0]
 100011                  I                           [swi3: vm]
 100010                  I                           [swi4: clock]
 100009                  I                           [swi4: clock]
 100008                  I                           [swi4: clock]
 100007                  I                           [swi4: clock]
   11     0     0     0  RL      (threaded)          [idle]
 100006                  Run     CPU 0              [idle: cpu0]
 100005                  Run     CPU 1              [idle: cpu1]
 100004                  Run     CPU 2              [idle: cpu2]
 100003                  Run     CPU 3              [idle: cpu3]
    1     0     1     0  SLs     wait     0xc417dd48 [init]
   10     0     0     0  DL      audit_wo 0xc0f80900 [audit]
    0     0     0     0  DLs     (threaded)          [kernel]
 100029                  D       -        0xc4387340 [em0 taskq]
 100027                  D       -        0xc4344100 [acpi_task_2]
 100026                  D       -        0xc4344100 [acpi_task_1]
 100025                  D       -        0xc4344100 [acpi_task_0]
 100020                  D       -        0xc4344380 [thread taskq]
 100018                  D       -        0xc4344600 [kqueue taskq]
 100016                  D       -        0xc4164e00 [firmware taskq]
 100000                  D       sched    0xc0e06e40 [swapper]
db:0:ps> allt
Tracing command sh pid 1332 tid 100127 td 0xc49846c0
sched_switch(c49846c0,0,104,191,be823364,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c49846c0,0,c0cad0c9,1a0,0,...) at sleepq_switch+0x15f
sleepq_catch_signals(c088233a,c4290a04,0,c0ca723a,c49846c0,...) at sleepq_catch_signals+0xb7
sleepq_wait_sig(c4290a70,0,e6a4bb0c,101,0,...) at sleepq_wait_sig+0x17
_cv_wait_sig(c4290a70,c4290a04,c0cb1070,511,0,...) at _cv_wait_sig+0x240
tty_wait(c4290a00,c4290a70,3ff,e6a4bb87,c0da5520,...) at tty_wait+0x71
ttydisc_read(c4290a00,e6a4bc58,0,9f,0,...) at ttydisc_read+0xef
ttydev_read(c416c800,e6a4bc58,0,0,3ff,...) at ttydev_read+0xaa
devfs_read_f(c4768968,e6a4bc58,c4183380,0,c49846c0,...) at devfs_read_f+0x7e
dofileread(e6a4bc58,ffffffff,ffffffff,0,c4768968,...) at dofileread+0x96
kern_readv(c49846c0,0,e6a4bc58,e6a4bc78,1,...) at kern_readv+0x58
read(c49846c0,e6a4bcf8,c,c0c9079c,c0d96554,...) at read+0x4f
syscall(e6a4bd38) at syscall+0x2b4
Xint0x80_syscall() at Xint0x80_syscall+0x20
--- syscall (3, FreeBSD ELF32, read), eip = 0x281ebee3, esp = 0xbfbfedbc, ebp = 0xbfbfedf8 ---

Tracing command flowcleaner pid 19 tid 100050 td 0xc436b6c0
sched_switch(c436b6c0,0,104,191,45996f6a,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c436b6c0,0,c0cad0c9,283,c436b6c0,...) at sleepq_switch+0x15f
sleepq_timedwait(c0f75888,0,e4703cc4,1,0,...) at sleepq_timedwait+0x6b
_cv_timedwait(c0f75888,c0f75890,2710,3f0,0,...) at _cv_timedwait+0x250
flowtable_cleaner(0,e4703d38,c0ca3ea7,343,c465d550,...) at flowtable_cleaner+0x1bf
fork_exit(c093e040,0,e4703d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe4703d70, ebp = 0 ---

Tracing command softdepflush pid 18 tid 100049 td 0xc436b900
sched_switch(c436b900,0,104,191,97981186,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,44,...) at mi_switch+0x200
sleepq_switch(c436b900,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c0f810e0,44,c0cd1708,0,0,...) at sleepq_timedwait+0x6b
_sleep(c0f810e0,c0f81094,44,c0cd1708,3e8,...) at _sleep+0x339
softdep_flush(0,e4700d38,c0ca3ea7,343,c465d7f8,...) at softdep_flush+0x250
fork_exit(c0ad3ce0,0,e4700d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe4700d70, ebp = 0 ---

Tracing command syncer pid 17 tid 100048 td 0xc436bb40
sched_switch(c436bb40,0,104,191,98738a2a,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c436bb40,0,c0cad0c9,283,c436bb40,...) at sleepq_switch+0x15f
sleepq_timedwait(c0f75698,0,e46fdc88,1,0,...) at sleepq_timedwait+0x6b
_cv_timedwait(c0f75698,c0f75684,3e8,6d4,4e20,...) at _cv_timedwait+0x250
sched_sync(0,e46fdd38,c0ca3ea7,343,c465daa0,...) at sched_sync+0x502
fork_exit(c09293a0,0,e46fdd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46fdd70, ebp = 0 ---

Tracing command vnlru pid 16 tid 100047 td 0xc436bd80
sched_switch(c436bd80,0,104,191,845a6f46,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,50,...) at mi_switch+0x200
sleepq_switch(c436bd80,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c465dd48,50,c0cb70c7,0,0,...) at sleepq_timedwait+0x6b
_sleep(c465dd48,c0f75658,250,c0cb70c7,3e8,...) at _sleep+0x339
vnlru_proc(0,e46fad38,c0ca3ea7,343,c465dd48,...) at vnlru_proc+0xe7
fork_exit(c0929f70,0,e46fad38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46fad70, ebp = 0 ---

Tracing command bufdaemon pid 15 tid 100046 td 0xc441f000
sched_switch(c441f000,0,104,191,9bfddf12,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,44,...) at mi_switch+0x200
sleepq_switch(c441f000,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c0f753c8,44,c0cb4624,0,0,...) at sleepq_timedwait+0x6b
_sleep(c0f753c8,c0f753d0,44,c0cb4624,3e8,...) at _sleep+0x339
buf_daemon(0,e46f7d38,c0ca3ea7,343,c417e2a8,...) at buf_daemon+0x16e
fork_exit(c0911000,0,e46f7d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46f7d70, ebp = 0 ---

Tracing command pagezero pid 9 tid 100045 td 0xc441f240
sched_switch(c441f240,0,104,191,ed7503f0,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c441f240,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c0f81f14,0,c0cd7570,0,0,...) at sleepq_timedwait+0x6b
_sleep(c0f81f14,c0f81a00,0,c0cd7570,493e0,...) at _sleep+0x339
vm_pagezero(0,e46f4d38,c0ca3ea7,343,c417e550,...) at vm_pagezero+0xdc
fork_exit(c0b13700,0,e46f4d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46f4d70, ebp = 0 ---

Tracing command vmdaemon pid 8 tid 100044 td 0xc441f480
sched_switch(c441f480,0,104,191,360d9454,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,68,...) at mi_switch+0x200
sleepq_switch(c441f480,0,c0cad0c9,260,0,...) at sleepq_switch+0x15f
sleepq_wait(c0f81b44,68,c0cb4624,0,0,...) at sleepq_wait+0x63
_sleep(c0f81b44,c0f81b48,68,c0cb4624,0,...) at _sleep+0x36b
vm_daemon(0,e46f1d38,c0ca3ea7,343,c417e7f8,...) at vm_daemon+0x59
fork_exit(c0b0db90,0,e46f1d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46f1d70, ebp = 0 ---

Tracing command pagedaemon pid 7 tid 100043 td 0xc441f6c0
sched_switch(c441f6c0,0,104,191,5cd6e4fa,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,44,...) at mi_switch+0x200
sleepq_switch(c441f6c0,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c0f81b0c,44,c0cb4624,0,0,...) at sleepq_timedwait+0x6b
_sleep(c0f81b0c,c0f81a00,44,c0cb4624,1388,...) at _sleep+0x339
vm_pageout(0,e46eed38,c0ca3ea7,343,c417eaa0,...) at vm_pageout+0x2bb
fork_exit(c0b0ea30,0,e46eed38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46eed70, ebp = 0 ---

Tracing command fdc0 pid 6 tid 100041 td 0xc441fb40
sched_switch(c441fb40,0,104,191,84ff3bf0,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,4c,...) at mi_switch+0x200
sleepq_switch(c441fb40,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c429063c,4c,c0c9e2e9,0,0,...) at sleepq_timedwait+0x6b
_sleep(c429063c,c42906f0,4c,c0c9e2e9,3e8,...) at _sleep+0x339
fdc_thread(c4290600,e46e8d38,c0ca3ea7,343,c417ed48,...) at fdc_thread+0x27d
fork_exit(c0b90d40,c4290600,e46e8d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46e8d70, ebp = 0 ---

Tracing command usb pid 14 tid 100034 td 0xc43696c0
sched_switch(c43696c0,0,104,191,b870d188,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c43696c0,0,c0cad0c9,260,c43696c0,...) at sleepq_switch+0x15f
sleepq_wait(c43a3dac,0,c3f88cbc,1,0,...) at sleepq_wait+0x63
_cv_wait(c43a3dac,c43a3e4c,c0c96fd9,6c,c43a3db4,...) at _cv_wait+0x240
usb_process(c43a3da4,c3f88d38,c0ca3ea7,343,c4346000,...) at usb_process+0x193
fork_exit(c07c2850,c43a3da4,c3f88d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f88d70, ebp = 0 ---

Tracing command usb pid 14 tid 100033 td 0xc4369900
sched_switch(c4369900,0,104,191,99f2d9b2,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4369900,0,c0cad0c9,260,c4369900,...) at sleepq_switch+0x15f
sleepq_wait(c43a3d7c,0,c3f85cbc,1,0,...) at sleepq_wait+0x63
_cv_wait(c43a3d7c,c43a3e4c,c0c96fd9,6c,c43a3d84,...) at _cv_wait+0x240
usb_process(c43a3d74,c3f85d38,c0ca3ea7,343,c4346000,...) at usb_process+0x193
fork_exit(c07c2850,c43a3d74,c3f85d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f85d70, ebp = 0 ---

Tracing command usb pid 14 tid 100032 td 0xc4369b40
sched_switch(c4369b40,0,104,191,b7fb48c4,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4369b40,0,c0cad0c9,260,c4369b40,...) at sleepq_switch+0x15f
sleepq_wait(c43a3d4c,0,c3f82cbc,1,0,...) at sleepq_wait+0x63
_cv_wait(c43a3d4c,c43a3e4c,c0c96fd9,6c,c43a3d54,...) at _cv_wait+0x240
usb_process(c43a3d44,c3f82d38,c0ca3ea7,343,c4346000,...) at usb_process+0x193
fork_exit(c07c2850,c43a3d44,c3f82d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f82d70, ebp = 0 ---

Tracing command usb pid 14 tid 100031 td 0xc4369d80
sched_switch(c4369d80,0,104,191,b7fb1a0c,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4369d80,0,c0cad0c9,260,c4369d80,...) at sleepq_switch+0x15f
sleepq_wait(c43a3d1c,0,c3f7fcbc,1,0,...) at sleepq_wait+0x63
_cv_wait(c43a3d1c,c43a3e4c,c0c96fd9,6c,c43a3d24,...) at _cv_wait+0x240
usb_process(c43a3d14,c3f7fd38,c0ca3ea7,343,c4346000,...) at usb_process+0x193
fork_exit(c07c2850,c43a3d14,c3f7fd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f7fd70, ebp = 0 ---

Tracing command xpt_thrd pid 5 tid 100023 td 0xc4359000
sched_switch(c4359000,0,104,191,b7fad8ec,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,4c,...) at mi_switch+0x200
sleepq_switch(c4359000,0,c0cad0c9,260,0,...) at sleepq_switch+0x15f
sleepq_wait(c0dd5354,4c,c0c4295a,0,0,...) at sleepq_wait+0x63
_sleep(c0dd5354,c0dd536c,4c,c0c4295a,0,...) at _sleep+0x36b
xpt_scanner_thread(0,c3f40d38,c0ca3ea7,343,c43462a8,...) at xpt_scanner_thread+0x4a
fork_exit(c0484d30,0,c3f40d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f40d70, ebp = 0 ---

Tracing command yarrow pid 13 tid 100017 td 0xc4181240
sched_switch(c4181240,0,104,191,d0693dbc,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4181240,0,c0cad0c9,283,2,...) at sleepq_switch+0x15f
sleepq_timedwait(c0e08fc4,0,c0c9e2e9,2,0,...) at sleepq_timedwait+0x6b
_sleep(c0e08fc4,0,0,c0c9e2e9,64,...) at _sleep+0x339
pause(c0c9e2e9,64,c0c8acd4,111,0,...) at pause+0x47
random_kthread(0,c3f2ed38,c0ca3ea7,343,c4346550,...) at random_kthread+0x1ef
fork_exit(c0739360,0,c3f2ed38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f2ed70, ebp = 0 ---

Tracing command g_down pid 4 tid 100015 td 0xc41816c0
sched_switch(c41816c0,0,104,191,4fdd34d2,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,4c,...) at mi_switch+0x200
sleepq_switch(c41816c0,0,c0cad0c9,260,0,...) at sleepq_switch+0x15f
sleepq_wait(c0e06d64,4c,c0c9e2e9,0,0,...) at sleepq_wait+0x63
_sleep(c0e06d64,c0e06cc8,24c,c0c9e2e9,0,...) at _sleep+0x36b
g_io_schedule_down(c41816c0,0,c0c9f9f5,74,0,...) at g_io_schedule_down+0x56
g_down_procbody(0,c3f28d38,c0ca3ea7,343,c417d000,...) at g_down_procbody+0x8d
fork_exit(c082eda0,0,c3f28d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f28d70, ebp = 0 ---

Tracing command g_up pid 3 tid 100014 td 0xc4181900
sched_switch(c4181900,0,104,191,4fe735fa,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,4c,...) at mi_switch+0x200
sleepq_switch(c4181900,0,c0cad0c9,260,0,...) at sleepq_switch+0x15f
sleepq_wait(c0e06d60,4c,c0c9e2e9,0,0,...) at sleepq_wait+0x63
_sleep(c0e06d60,c0e06ce8,24c,c0c9e2e9,0,...) at _sleep+0x36b
g_io_schedule_up(c4181900,0,c0c9f9f5,5d,0,...) at g_io_schedule_up+0x11e
g_up_procbody(0,c3f25d38,c0ca3ea7,343,c417d2a8,...) at g_up_procbody+0x8d
fork_exit(c082ee30,0,c3f25d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f25d70, ebp = 0 ---

Tracing command g_event pid 2 tid 100013 td 0xc4181b40
sched_switch(c4181b40,0,104,191,d1607c42,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,4c,...) at mi_switch+0x200
sleepq_switch(c4181b40,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c0e06d58,4c,c0c9e2e9,0,0,...) at sleepq_timedwait+0x6b
_sleep(c0e06d58,0,4c,c0c9e2e9,64,...) at _sleep+0x339
g_event_procbody(0,c3f22d38,c0ca3ea7,343,c417d550,...) at g_event_procbody+0xcb
fork_exit(c082eec0,0,c3f22d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f22d70, ebp = 0 ---

Tracing command intr pid 12 tid 100042 td 0xc441f900
fork_trampoline() at fork_trampoline

Tracing command intr pid 12 tid 100040 td 0xc441fd80
sched_switch(c441fd80,0,109,191,be820fa0,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c44163f0,...) at mi_switch+0x200
ithread_loop(c441ab50,e46dbd38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c441ab50,e46dbd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46dbd70, ebp = 0 ---

Tracing command intr pid 12 tid 100039 td 0xc4420000
fork_trampoline() at fork_trampoline

Tracing command intr pid 12 tid 100038 td 0xc4359d80
sched_switch(c4359d80,0,109,191,b783f47c,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c41832f0,...) at mi_switch+0x200
ithread_loop(c441a000,e46d5d38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c441a000,e46d5d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46d5d70, ebp = 0 ---

Tracing command intr pid 12 tid 100037 td 0xc4369000
fork_trampoline() at fork_trampoline

Tracing command intr pid 12 tid 100036 td 0xc4369240
sched_switch(c4369240,0,109,191,4fe49fbe,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c4182b70,...) at mi_switch+0x200
ithread_loop(c4410830,e46ccd38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c4410830,e46ccd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46ccd70, ebp = 0 ---

Tracing command intr pid 12 tid 100035 td 0xc4369480
sched_switch(c4369480,0,109,191,e157248,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c41829f0,...) at mi_switch+0x200
ithread_loop(c43b4b20,e46c6d38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c43b4b20,e46c6d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xe46c6d70, ebp = 0 ---

Tracing command intr pid 12 tid 100030 td 0xc436b000
sched_switch(c436b000,0,109,191,b755bf34,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c4182a70,...) at mi_switch+0x200
ithread_loop(c4389840,c3f7cd38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c4389840,c3f7cd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f7cd70, ebp = 0 ---

Tracing command intr pid 12 tid 100028 td 0xc436b480
fork_trampoline() at fork_trampoline

Tracing command intr pid 12 tid 100024 td 0xc4206d80
fork_trampoline() at fork_trampoline

Tracing command intr pid 12 tid 100022 td 0xc4359240
sched_switch(c4359240,0,109,191,4308273e,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c41c5bf0,...) at mi_switch+0x200
ithread_loop(c4119950,c3f3dd38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c4119950,c3f3dd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f3dd70, ebp = 0 ---

Tracing command intr pid 12 tid 100021 td 0xc4359480
sched_switch(c4359480,0,109,191,5526675e,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c41c5c70,...) at mi_switch+0x200
ithread_loop(c4119960,c3f3ad38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c4119960,c3f3ad38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f3ad70, ebp = 0 ---

Tracing command intr pid 12 tid 100019 td 0xc4359900
fork_trampoline() at fork_trampoline

Tracing command intr pid 12 tid 100012 td 0xc4181d80
sched_switch(c4181d80,0,109,191,5566aaf8,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c4207d70,...) at mi_switch+0x200
ithread_loop(c417c0a0,c3f1fd38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c417c0a0,c3f1fd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f1fd70, ebp = 0 ---

Tracing command intr pid 12 tid 100011 td 0xc4206000
fork_trampoline() at fork_trampoline

Tracing command intr pid 12 tid 100010 td 0xc4206240
sched_switch(c4206240,0,109,191,4598496c,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c4207e70,...) at mi_switch+0x200
ithread_loop(c417c0c0,c3f19d38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c417c0c0,c3f19d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f19d70, ebp = 0 ---

Tracing command intr pid 12 tid 100009 td 0xc4206480
sched_switch(c4206480,0,109,191,9bfd9bd8,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c41836f0,...) at mi_switch+0x200
ithread_loop(c417c0d0,c3f16d38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c417c0d0,c3f16d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f16d70, ebp = 0 ---

Tracing command intr pid 12 tid 100008 td 0xc417f000
sched_switch(c417f000,0,109,191,d160413a,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c4183770,...) at mi_switch+0x200
ithread_loop(c417c0e0,c3f13d38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c417c0e0,c3f13d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f13d70, ebp = 0 ---

Tracing command intr pid 12 tid 100007 td 0xc417f240
sched_switch(c417f240,0,109,191,d2281fca,...) at sched_switch+0x406
mi_switch(109,0,c0ca4126,52d,c41837f0,...) at mi_switch+0x200
ithread_loop(c417c0f0,c3f10d38,c0ca3ea7,343,c417d7f8,...) at ithread_loop+0x1f6
fork_exit(c086ad40,c417c0f0,c3f10d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f10d70, ebp = 0 ---

Tracing command idle pid 11 tid 100006 td 0xc417f480
kdb_enter(c0c51169,c0c919a2,0,c438a380,0,...) at kdb_enter+0x3a
uart_intr(c438a300,c417f480,c415d8d0,c4183100,4,...) at uart_intr+0x126
intr_event_handle(c4183100,c3f0ac34,0,1f4,c4425400,...) at intr_event_handle+0x5c
intr_execute_handlers(c415d8d0,c3f0ac34,0,c3f0ac74,c0bb7af4,...) at intr_execute_handlers+0x49
lapic_handle_intr(38,c3f0ac34) at lapic_handle_intr+0x4c
Xapic_isr1() at Xapic_isr1+0x34
--- interrupt, eip = 0xc0babf15, esp = 0xc3f0ac74, ebp = 0xc3f0ac74 ---
acpi_cpu_c1(1,0,0,c08b76a1,c0e1b3e0,...) at acpi_cpu_c1+0x5
acpi_cpu_idle(c3f0acb4,c0bc36cb,1,c3f0acf8,c08b799e,...) at acpi_cpu_idle+0x11c
cpu_idle_acpi(1,c3f0acf8,c08b799e,1,c3f0acd4,...) at cpu_idle_acpi+0x1b
cpu_idle(1,c3f0acd4,c0caa6ae,3b0,c417f480,...) at cpu_idle+0x1b
sched_idletd(0,c3f0ad38,c0ca3ea7,343,c417daa0,...) at sched_idletd+0x23e
fork_exit(c08b7760,0,c3f0ad38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f0ad70, ebp = 0 ---

Tracing command idle pid 11 tid 100005 td 0xc417f6c0
cpustop_handler(2,c3f07c28,c0bd56e6,c0e091dc,c3f07bbc,...) at cpustop_handler+0x32
ipi_nmi_handler(c0e091dc,c3f07bbc,c0881f84,c0e091dc,c417daa0,...) at ipi_nmi_handler+0x2f
trap(c3f07c34) at trap+0x36
calltrap() at calltrap+0x6
--- trap 0x13, eip = 0xc0babf15, esp = 0xc3f07c74, ebp = 0xc3f07c74 ---
acpi_cpu_c1(1,0,1,c08b76a1,c0e1b3e0,...) at acpi_cpu_c1+0x5
acpi_cpu_idle(c3f07cb4,c0bc36cb,0,c3f07cf8,c08b799e,...) at acpi_cpu_idle+0x11c
cpu_idle_acpi(0,c3f07cf8,c08b799e,0,c3f07cd4,...) at cpu_idle_acpi+0x1b
cpu_idle(0,c3f07cd4,c0caa6ae,3b0,c417f6c0,...) at cpu_idle+0x1b
sched_idletd(0,c3f07d38,c0ca3ea7,343,c417daa0,...) at sched_idletd+0x23e
fork_exit(c08b7760,0,c3f07d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f07d70, ebp = 0 ---

Tracing command idle pid 11 tid 100004 td 0xc417f900
cpustop_handler(4,c3f04c28,c0bd56e6,c0e09218,c3f04bbc,...) at cpustop_handler+0x32
ipi_nmi_handler(c0e09218,c3f04bbc,c0881f84,c0e09218,c417daa0,...) at ipi_nmi_handler+0x2f
trap(c3f04c34) at trap+0x36
calltrap() at calltrap+0x6
--- trap 0x13, eip = 0xc0babf15, esp = 0xc3f04c74, ebp = 0xc3f04c74 ---
acpi_cpu_c1(1,0,2,c08b76a1,c0e1b3e0,...) at acpi_cpu_c1+0x5
acpi_cpu_idle(c3f04cb4,c0bc36cb,0,c3f04cf8,c08b799e,...) at acpi_cpu_idle+0x11c
cpu_idle_acpi(0,c3f04cf8,c08b799e,0,c3f04cd4,...) at cpu_idle_acpi+0x1b
cpu_idle(0,c3f04cd4,c0caa6ae,a09,c417f900,...) at cpu_idle+0x1b
sched_idletd(0,c3f04d38,c0ca3ea7,343,c417daa0,...) at sched_idletd+0x23e
fork_exit(c08b7760,0,c3f04d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f04d70, ebp = 0 ---

Tracing command idle pid 11 tid 100003 td 0xc417fb40
cpustop_handler(8,c3f01c28,c0bd56e6,c0e09254,c3f01bbc,...) at cpustop_handler+0x32
ipi_nmi_handler(c0e09254,c3f01bbc,c0881f84,c0e09254,c417daa0,...) at ipi_nmi_handler+0x2f
trap(c3f01c34) at trap+0x36
calltrap() at calltrap+0x6
--- trap 0x13, eip = 0xc0babf15, esp = 0xc3f01c74, ebp = 0xc3f01c74 ---
acpi_cpu_c1(1,0,3,c08b76a1,c0e1b3e0,...) at acpi_cpu_c1+0x5
acpi_cpu_idle(c3f01cb4,c0bc36cb,0,c3f01cf8,c08b799e,...) at acpi_cpu_idle+0x11c
cpu_idle_acpi(0,c3f01cf8,c08b799e,0,c3f01cd4,...) at cpu_idle_acpi+0x1b
cpu_idle(0,c3f01cd4,c0caa6ae,3b0,c417fb40,...) at cpu_idle+0x1b
sched_idletd(0,c3f01d38,c0ca3ea7,343,c417daa0,...) at sched_idletd+0x23e
fork_exit(c08b7760,0,c3f01d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f01d70, ebp = 0 ---

Tracing command init pid 1 tid 100002 td 0xc417fd80
sched_switch(c417fd80,0,104,191,99ce52a4,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,5c,...) at mi_switch+0x200
sleepq_switch(c417fd80,0,c0cad0c9,1a0,5c,...) at sleepq_switch+0x15f
sleepq_catch_signals(c0cad0c9,160,0,100,100,...) at sleepq_catch_signals+0xb7
sleepq_wait_sig(c417dd48,5c,c0caf8d6,100,0,...) at sleepq_wait_sig+0x17
_sleep(c417dd48,c417ddd0,15c,c0caf8d6,0,...) at _sleep+0x354
kern_wait(c417fd80,ffffffff,c3efdc74,2,0,...) at kern_wait+0xb76
wait4(c417fd80,c3efdcf8,10,c417fd80,c0d965c4,...) at wait4+0x3b
syscall(c3efdd38) at syscall+0x2b4
Xint0x80_syscall() at Xint0x80_syscall+0x20
--- syscall (7, FreeBSD ELF32, wait4), eip = 0x8054eaf, esp = 0xbfbfe86c, ebp = 0xbfbfe888 ---

Tracing command audit pid 10 tid 100001 td 0xc4181000
sched_switch(c4181000,0,104,191,b7f53114,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4181000,0,c0cad0c9,260,c4181000,...) at sleepq_switch+0x15f
sleepq_wait(c0f80900,0,c3efac9c,1,0,...) at sleepq_wait+0x63
_cv_wait(c0f80900,c0f808e4,c0ccd278,194,0,...) at _cv_wait+0x240
audit_worker(0,c3efad38,c0ca3ea7,343,c417e000,...) at audit_worker+0x84
fork_exit(c0a9aa90,0,c3efad38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3efad70, ebp = 0 ---

Tracing command kernel pid 0 tid 100029 td 0xc436b240
sched_switch(c436b240,0,104,191,b7552778,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c436b240,0,c0cad0c9,260,c436b240,...) at sleepq_switch+0x15f
sleepq_wait(c4387340,0,c0ca96a7,c0c9e2e9,0,...) at sleepq_wait+0x63
msleep_spin(c4387340,c4387358,c0c9e2e9,0,c0ca723a,...) at msleep_spin+0x21d
taskqueue_thread_loop(c438f5a0,c3f78d38,c0ca3ea7,343,c0e06e40,...) at taskqueue_thread_loop+0x94
fork_exit(c08d1150,c438f5a0,c3f78d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f78d70, ebp = 0 ---

Tracing command kernel pid 0 tid 100027 td 0xc42066c0
sched_switch(c42066c0,0,104,191,1c8e6ae0,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c42066c0,0,c0cad0c9,260,c42066c0,...) at sleepq_switch+0x15f
sleepq_wait(c4344100,0,c0ca96a7,c0c9e2e9,0,...) at sleepq_wait+0x63
msleep_spin(c4344100,c4344118,c0c9e2e9,0,c0ca723a,...) at msleep_spin+0x21d
taskqueue_thread_loop(c0dd81a0,c3f4cd38,c0ca3ea7,343,c0e06e40,...) at taskqueue_thread_loop+0x94
fork_exit(c08d1150,c0dd81a0,c3f4cd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f4cd70, ebp = 0 ---

Tracing command kernel pid 0 tid 100026 td 0xc4206900
sched_switch(c4206900,0,104,191,1c8e4c44,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4206900,0,c0cad0c9,260,c4206900,...) at sleepq_switch+0x15f
sleepq_wait(c4344100,0,c0ca96a7,c0c9e2e9,0,...) at sleepq_wait+0x63
msleep_spin(c4344100,c4344118,c0c9e2e9,0,c0ca723a,...) at msleep_spin+0x21d
taskqueue_thread_loop(c0dd81a0,c3f49d38,c0ca3ea7,343,c0e06e40,...) at taskqueue_thread_loop+0x94
fork_exit(c08d1150,c0dd81a0,c3f49d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f49d70, ebp = 0 ---

Tracing command kernel pid 0 tid 100025 td 0xc4206b40
sched_switch(c4206b40,0,104,191,1c8e2774,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4206b40,0,c0cad0c9,260,c4206b40,...) at sleepq_switch+0x15f
sleepq_wait(c4344100,0,c0ca96a7,c0c9e2e9,0,...) at sleepq_wait+0x63
msleep_spin(c4344100,c4344118,c0c9e2e9,0,c0ca723a,...) at msleep_spin+0x21d
taskqueue_thread_loop(c0dd81a0,c3f46d38,c0ca3ea7,343,c0e06e40,...) at taskqueue_thread_loop+0x94
fork_exit(c08d1150,c0dd81a0,c3f46d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f46d70, ebp = 0 ---

Tracing command kernel pid 0 tid 100020 td 0xc43596c0
sched_switch(c43596c0,0,104,191,328a8cc8,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c43596c0,0,c0cad0c9,260,0,...) at sleepq_switch+0x15f
sleepq_wait(c4344380,0,c0c9e2e9,0,0,...) at sleepq_wait+0x63
_sleep(c4344380,c4344398,0,c0c9e2e9,0,...) at _sleep+0x36b
taskqueue_thread_loop(c0e1b5c8,c3f37d38,c0ca3ea7,343,c0e06e40,...) at taskqueue_thread_loop+0xba
fork_exit(c08d1150,c0e1b5c8,c3f37d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f37d70, ebp = 0 ---

Tracing command kernel pid 0 tid 100018 td 0xc4359b40
sched_switch(c4359b40,0,104,191,1c7b7198,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4359b40,0,c0cad0c9,260,0,...) at sleepq_switch+0x15f
sleepq_wait(c4344600,0,c0c9e2e9,0,0,...) at sleepq_wait+0x63
_sleep(c4344600,c4344618,0,c0c9e2e9,0,...) at _sleep+0x36b
taskqueue_thread_loop(c0e076d8,c3f31d38,c0ca3ea7,343,c0e06e40,...) at taskqueue_thread_loop+0xba
fork_exit(c08d1150,c0e076d8,c3f31d38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f31d70, ebp = 0 ---

Tracing command kernel pid 0 tid 100016 td 0xc4181480
sched_switch(c4181480,0,104,191,5738d30a,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,0,...) at mi_switch+0x200
sleepq_switch(c4181480,0,c0cad0c9,260,0,...) at sleepq_switch+0x15f
sleepq_wait(c4164e00,0,c0c9e2e9,0,0,...) at sleepq_wait+0x63
_sleep(c4164e00,c4164e18,0,c0c9e2e9,0,...) at _sleep+0x36b
taskqueue_thread_loop(c0e1a060,c3f2bd38,c0ca3ea7,343,c0e06e40,...) at taskqueue_thread_loop+0xba
fork_exit(c08d1150,c0e1a060,c3f2bd38) at fork_exit+0xb8
fork_trampoline() at fork_trampoline+0x8
--- trap 0, eip = 0, esp = 0xc3f2bd70, ebp = 0 ---

Tracing command kernel pid 0 tid 100000 td 0xc0e070f0
sched_switch(c0e070f0,0,104,191,49f80ec0,...) at sched_switch+0x406
mi_switch(104,0,c0cad0c9,1eb,44,...) at mi_switch+0x200
sleepq_switch(c0e070f0,0,c0cad0c9,283,0,...) at sleepq_switch+0x15f
sleepq_timedwait(c0e06e40,44,c0caaf23,0,0,...) at sleepq_timedwait+0x6b
_sleep(c0e06e40,0,44,c0caaf23,2710,...) at _sleep+0x339
scheduler(0,141ec00,141ec00,141e000,1425000,...)
    at scheduler+0x23e
mi_startup() at mi_startup+0x96
begin() at begin+0x2c
db:0:allt> call doadump
Physical memory: 1007 MB
Dumping 67 MB: 52 36 20 4
Dump complete
= 0xf
db:0:doadump> reset

$ svn diff -x -p /usr/src/sys
Index: /usr/src/sys/ufs/ufs/ufs_dirhash.c
===================================================================
--- /usr/src/sys/ufs/ufs/ufs_dirhash.c	(revision 202614)
+++ /usr/src/sys/ufs/ufs/ufs_dirhash.c	(working copy)
@@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$");
 
 static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
 
-static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
-
 static int ufs_mindirhashsize = DIRBLKSIZ * 5;
 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, &ufs_mindirhashsize,
Index: /usr/src/sys/ufs/ufs/inode.h
===================================================================
--- /usr/src/sys/ufs/ufs/inode.h	(revision 202614)
+++ /usr/src/sys/ufs/ufs/inode.h	(working copy)
@@ -120,7 +120,7 @@ struct inode {
 #define	IN_CHANGE	0x0002		/* Inode change time update request. */
 #define	IN_UPDATE	0x0004		/* Modification time update request. */
 #define	IN_MODIFIED	0x0008		/* Inode has been modified. */
-#define	IN_RENAME	0x0010		/* Inode is being renamed. */
+#define	IN_NEEDSYNC	0x0010		/* Inode requires fsync. */
 #define	IN_LAZYMOD	0x0040		/* Modified, but don't write yet. */
 #define	IN_SPACECOUNTED	0x0080		/* Blocks to be freed in free count. */
 #define	IN_LAZYACCESS	0x0100		/* Process IN_ACCESS after the
@@ -175,6 +175,7 @@ struct indir {
 /* Determine if soft dependencies are being done */
 #define	DOINGSOFTDEP(vp)	((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
 #define	DOINGASYNC(vp)		((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
+#define	DOINGSUJ(vp)		((vp)->v_mount->mnt_flag & MNT_SUJ)
 
 /* This overlays the fid structure (see mount.h). */
 struct ufid {
Index: /usr/src/sys/ufs/ufs/dinode.h
===================================================================
--- /usr/src/sys/ufs/ufs/dinode.h	(revision 202614)
+++ /usr/src/sys/ufs/ufs/dinode.h	(working copy)
@@ -146,7 +146,8 @@ struct ufs2_dinode {
 	ufs2_daddr_t	di_db[NDADDR];	/* 112: Direct disk blocks. */
 	ufs2_daddr_t	di_ib[NIADDR];	/* 208: Indirect disk blocks. */
 	u_int64_t	di_modrev;	/* 232: i_modrev for NFSv4 */
-	int64_t		di_spare[2];	/* 240: Reserved; currently unused */
+	ino_t		di_freelink;	/* 240: SUJ: Next unlinked inode. */
+	uint32_t	di_spare[3];	/* 244: Reserved; currently unused */
 };
 
 /*
@@ -167,9 +168,7 @@ struct ufs2_dinode {
 struct ufs1_dinode {
 	u_int16_t	di_mode;	/*   0: IFMT, permissions; see below. */
 	int16_t		di_nlink;	/*   2: File link count. */
-	union {
-		u_int16_t oldids[2];	/*   4: Ffs: old user and group ids. */
-	} di_u;
+	ino_t		di_freelink;	/*   4: SUJ: Next unlinked inode. */
 	u_int64_t	di_size;	/*   8: File byte count. */
 	int32_t		di_atime;	/*  16: Last access time. */
 	int32_t		di_atimensec;	/*  20: Last access time. */
@@ -186,7 +185,5 @@ struct ufs1_dinode {
 	u_int32_t	di_gid;		/* 116: File group. */
 	u_int64_t	di_modrev;	/* 120: i_modrev for NFSv4 */
 };
 
-#define	di_ogid		di_u.oldids[1]
-#define	di_ouid		di_u.oldids[0]
 
 #endif /* _UFS_UFS_DINODE_H_ */
Index: /usr/src/sys/ufs/ufs/ufs_vnops.c
===================================================================
--- /usr/src/sys/ufs/ufs/ufs_vnops.c	(revision 202614)
+++ /usr/src/sys/ufs/ufs/ufs_vnops.c	(working copy)
@@ -114,6 +114,8 @@ static vop_close_t	ufsfifo_close;
 static vop_kqfilter_t	ufsfifo_kqfilter;
 static vop_pathconf_t	ufsfifo_pathconf;
 
+SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
+
 /*
  * A virgin directory (no blushing please).
  */
@@ -974,6 +976,9 @@ ufs_link(ap)
 		error = EXDEV;
 		goto out;
 	}
+	if (VTOI(tdvp)->i_effnlink < 2)
+		panic("ufs_link: Bad link count %d on parent",
+		    VTOI(tdvp)->i_effnlink);
 	ip = VTOI(vp);
 	if ((nlink_t)ip->i_nlink >= LINK_MAX) {
 		error = EMLINK;
@@ -988,11 +993,11 @@ ufs_link(ap)
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vp))
-		softdep_change_linkcnt(ip);
+		softdep_setup_link(VTOI(tdvp), ip);
 	error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp)));
 	if (!error) {
 		ufs_makedirentry(ip, cnp, &newdir);
-		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL);
+		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
 	}
 
 	if (error) {
@@ -1001,7 +1006,7 @@ ufs_link(ap)
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(vp))
-			softdep_change_linkcnt(ip);
+			softdep_revert_link(VTOI(tdvp), ip);
 	}
 out:
 	return (error);
@@ -1043,7 +1048,7 @@ ufs_whiteout(ap)
 		newdir.d_namlen = cnp->cn_namelen;
 		bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
 		newdir.d_type = DT_WHT;
-		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL);
+		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0);
 		break;
 
 	case DELETE:
@@ -1062,6 +1067,11 @@ ufs_whiteout(ap)
 	return (error);
 }
 
+static volatile int rename_restarts;
+SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD,
+    __DEVOLATILE(int *, &rename_restarts), 0,
+    "Times rename had to restart due to lock contention");
+
 /*
  * Rename system call.
  * 	rename("foo", "bar");
@@ -1101,111 +1111,183 @@ ufs_rename(ap)
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
+	struct vnode *nvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
-	struct inode *ip, *xp, *dp;
+	struct inode *fip, *tip, *tdp, *fdp;
 	struct direct newdir;
-	int doingdirectory = 0, oldparent = 0, newparent = 0;
+	off_t endoff;
+	int doingdirectory, newparent;
 	int error = 0, ioflag;
-	ino_t fvp_ino;
+	struct mount *mp;
+	ino_t ino;
 
 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ufs_rename: no name");
 #endif
+	endoff = 0;
+	mp = tdvp->v_mount;
+	VOP_UNLOCK(tdvp, 0);
+	if (tvp && tvp != tdvp)
+		VOP_UNLOCK(tvp, 0);
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
-abortit:
-		if (tdvp == tvp)
-			vrele(tdvp);
-		else
-			vput(tdvp);
-		if (tvp)
-			vput(tvp);
-		vrele(fdvp);
+		mp = NULL;
+		goto releout;
+	}
+	error = vfs_busy(mp, 0);
+	if (error) {
+		mp = NULL;
+		goto releout;
+	}
+relock:
+	/*
+	 * We need to acquire 2 to 4 locks depending on whether tvp is NULL
+	 * and fdvp and tdvp are the same directory.  Subsequently we need
+	 * to double-check all paths and in the directory rename case we
+	 * need to verify that we are not creating a directory loop.  To
+	 * handle this we acquire all but fdvp using non-blocking
+	 * acquisitions.  If we fail to acquire any lock in the path we will
+	 * drop all held locks, acquire the new lock in a blocking fashion,
+	 * and then release it and restart the rename.  This acquire/release
+	 * step ensures that we do not spin on a lock waiting for release.
+	 */
+	error = vn_lock(fdvp, LK_EXCLUSIVE);
+	if (error)
+		goto releout;
+	if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+		VOP_UNLOCK(fdvp, 0);
+		error = vn_lock(tdvp, LK_EXCLUSIVE);
+		if (error)
+			goto releout;
+		VOP_UNLOCK(tdvp, 0);
+		atomic_add_int(&rename_restarts, 1);
+		goto relock;
+	}
+	/*
+	 * Re-resolve fvp to be certain it still exists and fetch the
+	 * correct vnode.
	 */
+	error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
+	if (error) {
+		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		goto releout;
+	}
+	error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+	if (error) {
+		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		if (error != EBUSY)
+			goto releout;
+		error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+		if (error != 0)
+			goto releout;
+		VOP_UNLOCK(nvp, 0);
 		vrele(fvp);
-		return (error);
+		fvp = nvp;
+		atomic_add_int(&rename_restarts, 1);
+		goto relock;
 	}
-
+	vrele(fvp);
+	fvp = nvp;
+	/*
+	 * Re-resolve tvp and acquire the vnode lock if present.
+	 */
+	error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino);
+	if (error != 0 && error != EJUSTRETURN) {
+		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		VOP_UNLOCK(fvp, 0);
+		goto releout;
+	}
+	/*
+	 * If tvp disappeared we just carry on.
+	 */
+	if (error == EJUSTRETURN && tvp != NULL) {
+		vrele(tvp);
+		tvp = NULL;
+	}
+	/*
+	 * Get the tvp ino if the lookup succeeded.  We may have to restart
+	 * if the non-blocking acquire fails.
+	 */
+	if (error == 0) {
+		nvp = NULL;
+		error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+		if (tvp)
+			vrele(tvp);
+		tvp = nvp;
+		if (error) {
+			VOP_UNLOCK(fdvp, 0);
+			VOP_UNLOCK(tdvp, 0);
+			VOP_UNLOCK(fvp, 0);
+			if (error != EBUSY)
+				goto releout;
+			error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+			if (error != 0)
+				goto releout;
+			VOP_UNLOCK(nvp, 0);
+			atomic_add_int(&rename_restarts, 1);
+			goto relock;
+		}
+	}
+	fdp = VTOI(fdvp);
+	fip = VTOI(fvp);
+	tdp = VTOI(tdvp);
+	tip = NULL;
+	if (tvp)
+		tip = VTOI(tvp);
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
-		goto abortit;
+		goto unlockout;
 	}
-
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
-	 * not call us in that case.  Temporarily just warn if they do.
+	 * not call us in that case.  However, things could change after
+	 * we drop the locks above.
 	 */
 	if (fvp == tvp) {
-		printf("ufs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
-		goto abortit;
+		goto unlockout;
 	}
-
-	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
-		goto abortit;
-	dp = VTOI(fdvp);
-	ip = VTOI(fvp);
-	if (ip->i_nlink >= LINK_MAX) {
-		VOP_UNLOCK(fvp, 0);
+	doingdirectory = 0;
+	newparent = 0;
+	ino = fip->i_number;
+	if (fip->i_nlink >= LINK_MAX) {
 		error = EMLINK;
-		goto abortit;
+		goto unlockout;
 	}
-	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
-	    || (dp->i_flags & APPEND)) {
-		VOP_UNLOCK(fvp, 0);
+	if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
+	    || (fdp->i_flags & APPEND)) {
 		error = EPERM;
-		goto abortit;
+		goto unlockout;
 	}
-	if ((ip->i_mode & IFMT) == IFDIR) {
+	if ((fip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
-		    dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
-		    (ip->i_flag & IN_RENAME)) {
-			VOP_UNLOCK(fvp, 0);
+		    fdp == fip ||
+		    (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			error = EINVAL;
-			goto abortit;
+			goto unlockout;
 		}
-		ip->i_flag |= IN_RENAME;
-		oldparent = dp->i_number;
+		if (fdp->i_number != tdp->i_number)
+			newparent = tdp->i_number;
 		doingdirectory = 1;
 	}
-	vrele(fdvp);
-
-	/*
-	 * When the target exists, both the directory
-	 * and target vnodes are returned locked.
-	 */
-	dp = VTOI(tdvp);
-	xp = NULL;
-	if (tvp)
-		xp = VTOI(tvp);
-
-	/*
-	 * 1) Bump link count while we're moving stuff
-	 *    around.  If we crash somewhere before
-	 *    completing our work, the link count
-	 *    may be wrong, but correctable.
-	 */
-	ip->i_effnlink++;
-	ip->i_nlink++;
-	DIP_SET(ip, i_nlink, ip->i_nlink);
-	ip->i_flag |= IN_CHANGE;
-	if (DOINGSOFTDEP(fvp))
-		softdep_change_linkcnt(ip);
-	if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) |
-	    DOINGASYNC(fvp)))) != 0) {
-		VOP_UNLOCK(fvp, 0);
-		goto bad;
+	if (fvp->v_mountedhere != NULL || (tvp && tvp->v_mountedhere != NULL)) {
+		error = EXDEV;
+		goto unlockout;
 	}
 
	/*
@@ -1214,88 +1296,93 @@ ufs_rename(ap)
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
-	 * as to be able to change "..". We must repeat the call
-	 * to namei, as the parent directory is unlocked by the
-	 * call to checkpath().
+	 * as to be able to change "..".
 	 */
-	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
-	fvp_ino = ip->i_number;
-	VOP_UNLOCK(fvp, 0);
-	if (oldparent != dp->i_number)
-		newparent = dp->i_number;
 	if (doingdirectory && newparent) {
-		if (error)	/* write access check above */
-			goto bad;
-		if (xp != NULL)
-			vput(tvp);
-		error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred);
+		error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 		if (error)
-			goto out;
+			goto unlockout;
+		error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
+		    &ino);
+		/*
+		 * We encountered a lock that we have to wait for.  Unlock
+		 * everything else and VGET before restarting.
+		 */
+		if (ino) {
+			VOP_UNLOCK(fdvp, 0);
+			VOP_UNLOCK(fvp, 0);
+			VOP_UNLOCK(tdvp, 0);
+			if (tvp)
+				VOP_UNLOCK(tvp, 0);
+			error = VFS_VGET(mp, ino, LK_SHARED, &nvp);
+			if (error == 0)
+				vput(nvp);
+			atomic_add_int(&rename_restarts, 1);
+			goto relock;
+		}
+		if (error)
+			goto unlockout;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("ufs_rename: lost to startdir");
-		VREF(tdvp);
-		error = relookup(tdvp, &tvp, tcnp);
-		if (error)
-			goto out;
-		vrele(tdvp);
-		dp = VTOI(tdvp);
-		xp = NULL;
-		if (tvp)
-			xp = VTOI(tvp);
 	}
+	if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 ||
+	    tdp->i_effnlink == 0)
+		panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp);
+	/*
+	 * 1) Bump link count while we're moving stuff
+	 *    around.  If we crash somewhere before
+	 *    completing our work, the link count
+	 *    may be wrong, but correctable.
+	 */
+	fip->i_effnlink++;
+	fip->i_nlink++;
+	DIP_SET(fip, i_nlink, fip->i_nlink);
+	fip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(fvp))
+		softdep_setup_link(tdp, fip);
+	error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)));
+	if (error)
+		goto bad;
+
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
-	if (xp == NULL) {
-		if (dp->i_dev != ip->i_dev)
+	if (tip == NULL) {
+		if (tdp->i_dev != fip->i_dev)
 			panic("ufs_rename: EXDEV");
-		/*
-		 * Account for ".." in new directory.
-		 * When source and destination have the same
-		 * parent we don't fool with the link count.
-		 */
 		if (doingdirectory && newparent) {
-			if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+			/*
+			 * Account for ".." in new directory.
			 * When source and destination have the same
+			 * parent we don't adjust the link count.  The
+			 * actual link modification is completed when
+			 * .. is rewritten below.
+			 */
+			if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
 				error = EMLINK;
 				goto bad;
 			}
-			dp->i_effnlink++;
-			dp->i_nlink++;
-			DIP_SET(dp, i_nlink, dp->i_nlink);
-			dp->i_flag |= IN_CHANGE;
-			if (DOINGSOFTDEP(tdvp))
-				softdep_change_linkcnt(dp);
-			error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
-			    DOINGASYNC(tdvp)));
-			if (error)
-				goto bad;
 		}
-		ufs_makedirentry(ip, tcnp, &newdir);
-		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL);
-		if (error) {
-			if (doingdirectory && newparent) {
-				dp->i_effnlink--;
-				dp->i_nlink--;
-				DIP_SET(dp, i_nlink, dp->i_nlink);
-				dp->i_flag |= IN_CHANGE;
-				if (DOINGSOFTDEP(tdvp))
-					softdep_change_linkcnt(dp);
-				(void)UFS_UPDATE(tdvp, 1);
-			}
+		ufs_makedirentry(fip, tcnp, &newdir);
+		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1);
+		if (error)
 			goto bad;
-		}
-		vput(tdvp);
+		/* Setup tdvp for directory compaction if needed. */
+		if (tdp->i_count && tdp->i_endoff &&
+		    tdp->i_endoff < tdp->i_size)
+			endoff = tdp->i_endoff;
 	} else {
-		if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+		if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev)
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
-		if (xp->i_number == ip->i_number)
+		if (tip->i_number == fip->i_number)
 			panic("ufs_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the caller
@@ -1303,7 +1390,7 @@ ufs_rename(ap)
 		 * destination of the rename.  This implements append-only
 		 * directories.
 		 */
-		if ((dp->i_mode & S_ISTXT) &&
+		if ((tdp->i_mode & S_ISTXT) &&
 		    VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
 		    VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
 			error = EPERM;
@@ -1314,9 +1401,9 @@ ufs_rename(ap)
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
-		if ((xp->i_mode&IFMT) == IFDIR) {
-			if ((xp->i_effnlink > 2) ||
-			    !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
+		if ((tip->i_mode & IFMT) == IFDIR) {
+			if ((tip->i_effnlink > 2) ||
+			    !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
@@ -1329,21 +1416,31 @@ ufs_rename(ap)
 			error = EISDIR;
 			goto bad;
 		}
-		error = ufs_dirrewrite(dp, xp, ip->i_number,
-		    IFTODT(ip->i_mode),
-		    (doingdirectory && newparent) ? newparent : doingdirectory);
-		if (error)
-			goto bad;
 		if (doingdirectory) {
 			if (!newparent) {
-				dp->i_effnlink--;
+				tdp->i_effnlink--;
 				if (DOINGSOFTDEP(tdvp))
-					softdep_change_linkcnt(dp);
+					softdep_change_linkcnt(tdp);
 			}
-			xp->i_effnlink--;
+			tip->i_effnlink--;
 			if (DOINGSOFTDEP(tvp))
-				softdep_change_linkcnt(xp);
+				softdep_change_linkcnt(tip);
 		}
+		error = ufs_dirrewrite(tdp, tip, fip->i_number,
+		    IFTODT(fip->i_mode),
+		    (doingdirectory && newparent) ? newparent : doingdirectory);
+		if (error) {
+			if (doingdirectory) {
+				if (!newparent) {
+					tdp->i_effnlink++;
+					if (DOINGSOFTDEP(tdvp))
+						softdep_change_linkcnt(tdp);
+				}
+				tip->i_effnlink++;
+				if (DOINGSOFTDEP(tvp))
+					softdep_change_linkcnt(tip);
+			}
+		}
 		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
 			/*
 			 * Truncate inode.
The only stuff left in the directory @@ -1357,115 +1454,107 @@ ufs_rename(ap) * them now. */ if (!newparent) { - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; + tdp->i_nlink--; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; } - xp->i_nlink--; - DIP_SET(xp, i_nlink, xp->i_nlink); - xp->i_flag |= IN_CHANGE; + tip->i_nlink--; + DIP_SET(tip, i_nlink, tip->i_nlink); + tip->i_flag |= IN_CHANGE; ioflag = IO_NORMAL; if (!DOINGASYNC(tvp)) ioflag |= IO_SYNC; + /* Don't go to bad here as the new link exists. */ if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_thread)) != 0) - goto bad; + goto unlockout; } - vput(tdvp); - vput(tvp); - xp = NULL; } /* - * 3) Unlink the source. + * 3) Unlink the source. We have to resolve the path again to + * fixup the directory offset and count for ufs_dirremove. */ - fcnp->cn_flags &= ~MODMASK; - fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; - if ((fcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost from startdir"); - VREF(fdvp); - error = relookup(fdvp, &fvp, fcnp); - if (error == 0) - vrele(fdvp); - if (fvp != NULL) { - xp = VTOI(fvp); - dp = VTOI(fdvp); - } else { - /* - * From name has disappeared. IN_RENAME is not sufficient - * to protect against directory races due to timing windows, - * so we have to remove the panic. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. - */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - vrele(ap->a_fvp); - return (0); + if (fdvp == tdvp) { + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) + panic("ufs_rename: from entry went away!"); + if (ino != fip->i_number) + panic("ufs_rename: ino mismatch %d != %d\n", ino, + fip->i_number); } /* - * Ensure that the directory entry still exists and has not - * changed while the new name has been entered. If the source is - * a file then the entry may have been unlinked or renamed. In - * either case there is no further work to be done. If the source - * is a directory then it cannot have been rmdir'ed; the IN_RENAME - * flag ensures that it cannot be moved by another rename or removed - * by a rmdir. + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. */ - if (xp != ip) { + if (doingdirectory && newparent) { /* - * From name resolves to a different inode. IN_RENAME is - * not sufficient protection against timing window races - * so we can't panic here. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. + * If tip exists we simply use its link, otherwise we must + * add a new one. */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - } else { - /* - * If the source is a directory with a - * new parent, the link count of the old - * parent directory must be decremented - * and ".." set to point to the new parent. - */ - if (doingdirectory && newparent) { - xp->i_offset = mastertemplate.dot_reclen; - ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); - cache_purge(fdvp); + if (tip == NULL) { + tdp->i_effnlink++; + tdp->i_nlink++; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_setup_dotdot_link(tdp, fip); + error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | + DOINGASYNC(tdvp))); + /* Don't go to bad here as the new link exists. 
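* (The new name entered in step 2 already references fip, so taking the
* "bad:" path here would revert the step 1 link count bump and leave
* i_nlink below the number of directory entries pointing at the inode.)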
*/ + if (error) + goto unlockout; } - error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); - xp->i_flag &= ~IN_RENAME; + fip->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); + cache_purge(fdvp); } - if (dp) - vput(fdvp); - if (xp) - vput(fvp); - vrele(ap->a_fvp); + error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); + +unlockout: + vput(fdvp); + vput(fvp); + if (tvp) + vput(tvp); + /* + * If compaction or fsync was requested do it now that other locks + * are no longer needed. + */ + if (error == 0 && endoff != 0) { +#ifdef UFS_DIRHASH + if (tdp->i_dirhash != NULL) + ufsdirhash_dirtrunc(tdp, endoff); +#endif + UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred, + td); + } + if (error == 0 && tdp->i_flag & IN_NEEDSYNC) + error = VOP_FSYNC(tdvp, MNT_WAIT, td); + vput(tdvp); + if (mp) + vfs_unbusy(mp); return (error); bad: - if (xp) - vput(ITOV(xp)); - vput(ITOV(dp)); -out: - if (doingdirectory) - ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - ip->i_flag &= ~IN_RENAME; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - vput(fvp); - } else - vrele(fvp); + fip->i_effnlink--; + fip->i_nlink--; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_revert_link(tdp, fip); + goto unlockout; + +releout: + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp) + vrele(tvp); + if (mp) + vfs_unbusy(mp); + return (error); } @@ -1664,8 +1753,7 @@ ufs_mkdir(ap) ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); - if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); @@ -1681,8 +1769,8 @@ ufs_mkdir(ap) DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); + softdep_setup_mkdir(dp, ip); + error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); if (error) goto bad; #ifdef MAC @@ -1791,7 +1879,7 @@ ufs_mkdir(ap) else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); + error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0); bad: if (error == 0) { @@ -1807,8 +1895,6 @@ bad: dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. @@ -1818,7 +1904,8 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_mkdir(dp, ip); + vput(tvp); } out: @@ -1854,10 +1941,13 @@ ufs_rmdir(ap) * tries to remove a locally mounted on directory). 
*/ error = 0; - if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) { + if (ip->i_effnlink < 2) { error = EINVAL; goto out; } + if (dp->i_effnlink < 3) + panic("ufs_dirrem: Bad link count %d on parent", + dp->i_effnlink); if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; @@ -1881,18 +1971,14 @@ ufs_rmdir(ap) */ dp->i_effnlink--; ip->i_effnlink--; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_revert_rmdir(dp, ip); goto out; } cache_purge(dvp); @@ -2401,6 +2487,9 @@ ufs_makeinode(mode, dvp, vpp, cnp) if ((mode & IFMT) == 0) mode |= IFREG; + if (VTOI(dvp)->i_effnlink < 2) + panic("ufs_makeinode: Bad link count %d on parent", + VTOI(dvp)->i_effnlink); error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); @@ -2530,7 +2619,7 @@ ufs_makeinode(mode, dvp, vpp, cnp) ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) { ip->i_mode &= ~ISGID; @@ -2594,7 +2683,7 @@ ufs_makeinode(mode, dvp, vpp, cnp) } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); + error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0); if (error) goto bad; *vpp = tvp; @@ -2610,7 +2699,7 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_create(VTOI(dvp), ip); vput(tvp); return (error); } Index: /usr/src/sys/ufs/ufs/ufsmount.h =================================================================== --- /usr/src/sys/ufs/ufs/ufsmount.h (revision 202614) +++ /usr/src/sys/ufs/ufs/ufsmount.h (working copy) @@ -57,7 +57,11 @@ struct ucred; struct uio; struct vnode; struct ufs_extattr_per_mount; +struct jblocks; +struct inodedep; +TAILQ_HEAD(inodedeplst, inodedep); + /* This structure describes the UFS specific mount structure data. 
*/ struct ufsmount { struct mount *um_mountp; /* filesystem vfs structure */ @@ -75,6 +79,11 @@ struct ufsmount { long um_numindirdeps; /* outstanding indirdeps */ struct workhead softdep_workitem_pending; /* softdep work queue */ struct worklist *softdep_worklist_tail; /* Tail pointer for above */ + struct workhead softdep_journal_pending; /* journal work queue */ + struct worklist *softdep_journal_tail; /* Tail pointer for above */ + struct jblocks *softdep_jblocks; /* Journal block information */ + struct inodedeplst softdep_unlinked; /* Unlinked inodes */ + int softdep_on_journal; /* Items on the journal list */ int softdep_on_worklist; /* Items on the worklist */ int softdep_on_worklist_inprogress; /* Busy items on worklist */ int softdep_deps; /* Total dependency count */ Index: /usr/src/sys/ufs/ufs/ufs_lookup.c =================================================================== --- /usr/src/sys/ufs/ufs/ufs_lookup.c (revision 202614) +++ /usr/src/sys/ufs/ufs/ufs_lookup.c (working copy) @@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, /* true if old FS format...*/ #define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) -static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *, - ino_t *); - static int ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred, struct thread *td) @@ -189,11 +186,11 @@ ufs_lookup(ap) } */ *ap; { - return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); + return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } -static int -ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, +int +ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ @@ -524,6 +521,8 @@ notfound: return (ENOENT); found: + if (dd_ino != NULL) + *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* @@ -546,11 +545,6 @@ found: if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1); - if (dd_ino != NULL) { - *dd_ino = ino; - return (0); - } - /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. @@ -558,17 +552,6 @@ found: if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); - if ((error = VFS_VGET(vdp->v_mount, ino, - LK_EXCLUSIVE, &tdp)) != 0) - return (error); - - error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); - if (error) { - vput(tdp); - return (error); - } - - /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there @@ -585,6 +568,16 @@ found: dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; + if (dd_ino != NULL) + return (0); + if ((error = VFS_VGET(vdp->v_mount, ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); + if (error) { + vput(tdp); + return (error); + } if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; @@ -616,6 +609,8 @@ found: dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); + if (dd_ino != NULL) + return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); @@ -650,6 +645,8 @@ found: cnp->cn_flags |= SAVENAME; return (0); } + if (dd_ino != NULL) + return (0); /* * Step through the translation in the name. We do not `vput' the @@ -681,7 +678,7 @@ found: * to the inode we looked up before vdp lock was * dropped. 
*/ - error = ufs_lookup_(pdp, NULL, cnp, &ino1); + error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); @@ -825,12 +822,13 @@ ufs_makedirentry(ip, cnp, newdirp) * soft dependency code). */ int -ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) +ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename) struct vnode *dvp; struct vnode *tvp; struct direct *dirp; struct componentname *cnp; struct buf *newdirbp; + int isrename; { struct ucred *cr; struct thread *td; @@ -903,22 +901,28 @@ int blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, dp->i_offset, - dirp->d_ino, newdirbp, 1) == 0) { - bdwrite(bp); + dirp->d_ino, newdirbp, 1)) + dp->i_flag |= IN_NEEDSYNC; + if (newdirbp) + bdwrite(newdirbp); + bdwrite(bp); + if ((dp->i_flag & IN_NEEDSYNC) == 0) return (UFS_UPDATE(dvp, 0)); - } - /* We have just allocated a directory block in an - * indirect block. Rather than tracking when it gets - * claimed by the inode, we simply do a VOP_FSYNC - * now to ensure that it is there (in case the user - * does a future fsync). Note that we have to unlock - * the inode for the entry that we just entered, as - * the VOP_FSYNC may need to lock other inodes which - * can lead to deadlock if we also hold a lock on - * the newly entered node. + /* + * We have just allocated a directory block in an + * indirect block. We must prevent holes in the + * directory created if directory entries are + * written out of order. To accomplish this we + * fsync when we extend a directory into indirects. + * During rename it's not safe to drop the tvp lock + * so sync must be delayed until it is. + * + * This synchronous step could be removed if fsck and + * the kernel were taught to fill in sparse + * directories rather than panic. */ - if ((error = bwrite(bp))) - return (error); + if (isrename) + return (0); if (tvp != NULL) VOP_UNLOCK(tvp, 0); error = VOP_FSYNC(dvp, MNT_WAIT, td); @@ -1007,7 +1011,7 @@ int dp->i_offset + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) - softdep_change_directoryentry_offset(dp, dirbuf, + softdep_change_directoryentry_offset(bp, dp, dirbuf, (caddr_t)nep, (caddr_t)ep, dsize); else bcopy((caddr_t)nep, (caddr_t)ep, dsize); @@ -1059,6 +1063,8 @@ int (void) softdep_setup_directory_add(bp, dp, dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); + if (newdirbp != NULL) + bdwrite(newdirbp); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { @@ -1076,7 +1082,8 @@ int * lock other inodes which can lead to deadlock if we also hold a * lock on the newly entered node. */ - if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) { + if (isrename == 0 && error == 0 && + dp->i_endoff && dp->i_endoff < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp, 0); #ifdef UFS_DIRHASH @@ -1117,6 +1124,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir) dp = VTOI(dvp); + /* + * Adjust the link count early so softdep can block if necessary. + */ + if (ip) { + ip->i_effnlink--; + if (DOINGSOFTDEP(dvp)) { + softdep_setup_unlink(dp, ip); + } else { + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + } + } if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to WINO. @@ -1146,6 +1166,9 @@ ufs_dirremove(dvp, ip, flags, isrmdir) if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, rep, dp->i_offset); #endif + if (ip && rep->d_ino != ip->i_number) + panic("ufs_dirremove: ip %d does not match dirent ino %d\n", + ip->i_number, rep->d_ino); if (dp->i_count == 0) { /* * First entry in block: set d_ino to zero. 
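[Annotation: the ufs_dirremove() hunk above moves the i_effnlink decrement ahead of the buffer manipulation so softdep_setup_unlink() can sleep for journal space before any buffer lock is held; the following hunk gives ufs_dirrewrite() the same treatment. A compressed sketch of the two-counter scheme these hunks rely on (my reading of the convention, not patch code): i_effnlink changes immediately and is what the namespace sees, while i_nlink follows only when the change is safe to commit.

	#include <assert.h>

	struct linkcnt {
		int nlink;	/* on-disk value, committed lazily */
		int effnlink;	/* effective value, updated at once */
	};

	/* Remove one name; "journaled" selects the deferred path. */
	static void
	unlink_name(struct linkcnt *lc, int journaled)
	{
		assert(lc->effnlink > 0);
		lc->effnlink--;		/* namespace forgets the name now */
		if (!journaled)
			lc->nlink--;	/* else deferred until the journal
					   dependency clears */
	}

	/* Called when the softdep machinery retires the dependency. */
	static void
	unlink_commit(struct linkcnt *lc)
	{
		lc->nlink--;
		assert(lc->nlink >= lc->effnlink);
	}

End of annotation.]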
@@ -1164,31 +1187,20 @@ ufs_dirremove(dvp, ip, flags, isrmdir) dp->i_offset & ~(DIRBLKSIZ - 1)); #endif out: + error = 0; if (DOINGSOFTDEP(dvp)) { - if (ip) { - ip->i_effnlink--; - softdep_change_linkcnt(ip); + if (ip) softdep_setup_remove(bp, dp, ip, isrmdir); - } - if (softdep_slowdown(dvp)) { + if (softdep_slowdown(dvp)) error = bwrite(bp); - } else { + else bdwrite(bp); - error = 0; - } } else { - if (ip) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - } if (flags & DOWHITEOUT) error = bwrite(bp); - else if (DOINGASYNC(dvp) && dp->i_count != 0) { + else if (DOINGASYNC(dvp) && dp->i_count != 0) bdwrite(bp); - error = 0; - } else + else error = bwrite(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; @@ -1221,6 +1233,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) struct vnode *vdp = ITOV(dp); int error; + /* + * Drop the link before we lock the buf so softdep can block if + * necessary. + */ + oip->i_effnlink--; + if (DOINGSOFTDEP(vdp)) { + softdep_setup_unlink(dp, oip); + } else { + oip->i_nlink--; + DIP_SET(oip, i_nlink, oip->i_nlink); + oip->i_flag |= IN_CHANGE; + } + error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); if (error) return (error); @@ -1232,15 +1257,10 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; - oip->i_effnlink--; if (DOINGSOFTDEP(vdp)) { - softdep_change_linkcnt(oip); softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { - oip->i_nlink--; - DIP_SET(oip, i_nlink, oip->i_nlink); - oip->i_flag |= IN_CHANGE; if (DOINGASYNC(vdp)) { bdwrite(bp); error = 0; @@ -1355,25 +1375,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cre /* * Check if source directory is in the path of the target directory. - * Target is supplied locked, source is unlocked. - * The target is always vput before returning. */ int -ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred) +ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { - struct vnode *vp, *vp1; + struct mount *mp; + struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; - vp = ITOV(target); - if (target->i_number == source_ino) { - error = EEXIST; - goto out; - } + vp = tvp = ITOV(target); + mp = vp->v_mount; + *wait_ino = 0; + if (target->i_number == source_ino) + return (EEXIST); + if (target->i_number == parent_ino) + return (0); + if (target->i_number == ROOTINO) + return (0); error = 0; - if (target->i_number == ROOTINO) - goto out; - for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) @@ -1384,9 +1404,13 @@ int } if (dd_ino == ROOTINO) break; - error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1); - if (error != 0) + if (dd_ino == parent_ino) break; + error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1); + if (error != 0) { + *wait_ino = dd_ino; + break; + } /* Recheck that ".." still points to vp1 after relock of vp */ error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) { @@ -1398,14 +1422,14 @@ int vput(vp1); continue; } - vput(vp); + if (vp != tvp) + vput(vp); vp = vp1; } -out: if (error == ENOTDIR) - printf("checkpath: .. not a directory\n"); - if (vp != NULL) + panic("checkpath: .. 
not a directory\n"); + if (vp != tvp) vput(vp); return (error); } Index: /usr/src/sys/ufs/ufs/ufs_extern.h =================================================================== --- /usr/src/sys/ufs/ufs/ufs_extern.h (revision 202614) +++ /usr/src/sys/ufs/ufs/ufs_extern.h (working copy) @@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *); int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, struct buf *, int *, int *); int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **); -int ufs_checkpath(ino_t, struct inode *, struct ucred *); +int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *); void ufs_dirbad(struct inode *, doff_t, char *); int ufs_dirbadentry(struct vnode *, struct direct *, int); int ufs_dirempty(struct inode *, ino_t, struct ucred *); @@ -66,9 +66,11 @@ int ufs_extwrite(struct vop_write_args *); void ufs_makedirentry(struct inode *, struct componentname *, struct direct *); int ufs_direnter(struct vnode *, struct vnode *, struct direct *, - struct componentname *, struct buf *); + struct componentname *, struct buf *, int); int ufs_dirremove(struct vnode *, struct inode *, int, int); int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); +int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *, + ino_t *); int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); int ufs_inactive(struct vop_inactive_args *); int ufs_init(struct vfsconf *); @@ -81,19 +83,33 @@ vfs_root_t ufs_root; int ufs_uninit(struct vfsconf *); int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **); +#include +SYSCTL_DECL(_vfs_ufs); + /* * Soft update function prototypes. */ int softdep_setup_directory_add(struct buf *, struct inode *, off_t, ino_t, struct buf *, int); -void softdep_change_directoryentry_offset(struct inode *, caddr_t, - caddr_t, caddr_t, int); +void softdep_change_directoryentry_offset(struct buf *, struct inode *, + caddr_t, caddr_t, caddr_t, int); void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int); void softdep_setup_directory_change(struct buf *, struct inode *, struct inode *, ino_t, int); void softdep_change_linkcnt(struct inode *); void softdep_releasefile(struct inode *); int softdep_slowdown(struct vnode *); +void softdep_setup_create(struct inode *, struct inode *); +void softdep_setup_dotdot_link(struct inode *, struct inode *); +void softdep_setup_link(struct inode *, struct inode *); +void softdep_setup_mkdir(struct inode *, struct inode *); +void softdep_setup_rmdir(struct inode *, struct inode *); +void softdep_setup_unlink(struct inode *, struct inode *); +void softdep_revert_create(struct inode *, struct inode *); +void softdep_revert_dotdot_link(struct inode *, struct inode *); +void softdep_revert_link(struct inode *, struct inode *); +void softdep_revert_mkdir(struct inode *, struct inode *); +void softdep_revert_rmdir(struct inode *, struct inode *); /* * Flags to low-level allocation routines. 
The low 16-bits are reserved Index: /usr/src/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_vfsops.c (revision 202614) +++ /usr/src/sys/ufs/ffs/ffs_vfsops.c (working copy) @@ -79,7 +79,6 @@ static int ffs_reload(struct mount *, struct threa static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); -static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; @@ -331,6 +330,7 @@ ffs_mount(struct mount *mp) MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); + fs->fs_mtime = time_second; fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); @@ -898,6 +898,7 @@ ffs_mountfs(devvp, mp, td) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); + mp->mnt_stat.f_iosize = fs->fs_bsize; if( mp->mnt_flag & MNT_ROOTFS) { /* @@ -909,6 +910,7 @@ ffs_mountfs(devvp, mp, td) } if (ronly == 0) { + fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); @@ -939,7 +941,6 @@ ffs_mountfs(devvp, mp, td) * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ - mp->mnt_stat.f_iosize = fs->fs_bsize; (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ @@ -1039,7 +1040,7 @@ ffs_oldfscompat_read(fs, ump, sblockloc) * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ -static void +void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; @@ -1134,6 +1135,7 @@ ffs_unmount(mp, mntflags) fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); + softdep_unmount(mp); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); @@ -1575,16 +1577,6 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags) DIP_SET(ip, i_gen, ip->i_gen); } } - /* - * Ensure that uid and gid are correct. This is a temporary - * fix until fsck has been changed to do the update. - */ - if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ - fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ - ip->i_uid = ip->i_din1->di_ouid; /* XXX */ - ip->i_gid = ip->i_din1->di_ogid; /* XXX */ - } /* XXX */ - #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* @@ -1728,6 +1720,8 @@ ffs_sbupdate(mp, waitfor, suspended) } fs->fs_fmod = 0; fs->fs_time = time_second; + if (fs->fs_flags & FS_DOSOFTDEP) + softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (suspended) @@ -1869,9 +1863,6 @@ ffs_bufwrite(struct buf *bp) } BO_UNLOCK(bp->b_bufobj); - /* Mark the buffer clean */ - bundirty(bp); - /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the @@ -1912,9 +1903,16 @@ ffs_bufwrite(struct buf *bp) newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES - /* move over the dependencies */ - if (!LIST_EMPTY(&bp->b_dep)) - softdep_move_dependencies(bp, newbp); + /* + * Move over the dependencies. If there are rollbacks, + * leave the parent buffer dirtied as it will need to + * be written again. 
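* (A rollback means the background copy goes to disk with some updates
* reverted to their last safe values, so the original buffer still
* holds newer data and must not be marked clean.)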
+ */ + if (LIST_EMPTY(&bp->b_dep) || + softdep_move_dependencies(bp, newbp) == 0) + bundirty(bp); +#else + bundirty(bp); #endif /* @@ -1927,8 +1925,11 @@ ffs_bufwrite(struct buf *bp) */ bqrelse(bp); bp = newbp; - } + } else + /* Mark the buffer clean */ + bundirty(bp); + /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); Index: /usr/src/sys/ufs/ffs/ffs_softdep.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_softdep.c (revision 202614) +++ /usr/src/sys/ufs/ffs/ffs_softdep.c (working copy) @@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -130,10 +131,12 @@ softdep_setup_inomapdep(bp, ip, newinum) } void -softdep_setup_blkmapdep(bp, mp, newblkno) +softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; + int frags; + int oldfrags; { panic("softdep_setup_blkmapdep called"); @@ -403,31 +406,13 @@ softdep_get_depcounts(struct mount *mp, * These definitions need to be adapted to the system to which * this file is being ported. */ -/* - * malloc types defined for the softdep system. - */ -static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); -static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); -static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); -static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); -static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); -static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); -static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); -static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); -static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); -static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); -static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); -static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); -static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); -static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block"); -static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes"); #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) #define D_PAGEDEP 0 #define D_INODEDEP 1 -#define D_NEWBLK 2 -#define D_BMSAFEMAP 3 +#define D_BMSAFEMAP 2 +#define D_NEWBLK 3 #define D_ALLOCDIRECT 4 #define D_INDIRDEP 5 #define D_ALLOCINDIR 6 @@ -438,8 +423,66 @@ softdep_get_depcounts(struct mount *mp, #define D_MKDIR 11 #define D_DIRREM 12 #define D_NEWDIRBLK 13 -#define D_LAST D_NEWDIRBLK +#define D_FREEWORK 14 +#define D_FREEDEP 15 +#define D_JADDREF 16 +#define D_JREMREF 17 +#define D_JMVREF 18 +#define D_JNEWBLK 19 +#define D_JFREEBLK 20 +#define D_JFREEFRAG 21 +#define D_JSEG 22 +#define D_JSEGDEP 23 +#define D_SBDEP 24 +#define D_LAST D_SBDEP +unsigned long dep_current[D_LAST + 1]; +unsigned long dep_total[D_LAST + 1]; + + +SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, + "total dependencies allocated"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, + "current dependencies allocated"); + +#define SOFTDEP_TYPE(type, str, long) \ + static MALLOC_DEFINE(M_ ## type, #str, long); \ + SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ + &dep_total[D_ ## type], 0, ""); \ + 
SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ + &dep_current[D_ ## type], 0, ""); + +SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); +SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); +SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, + "Block or frag allocated from cyl group map"); +SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); +SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); +SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); +SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); +SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); +SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); +SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); +SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); +SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); +SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); +SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); +SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); +SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); +SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); +SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); +SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); +SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); +SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); +SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); +SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); +SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); +SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); + +static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); +static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); + /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX @@ -447,8 +490,8 @@ softdep_get_depcounts(struct mount *mp, static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, + M_BMSAFEMAP, M_NEWBLK, - M_BMSAFEMAP, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, @@ -458,7 +501,18 @@ static struct malloc_type *memtype[] = { M_DIRADD, M_MKDIR, M_DIRREM, - M_NEWDIRBLK + M_NEWDIRBLK, + M_FREEWORK, + M_FREEDEP, + M_JADDREF, + M_JREMREF, + M_JMVREF, + M_JNEWBLK, + M_JFREEBLK, + M_JFREEFRAG, + M_JSEG, + M_JSEGDEP, + M_SBDEP }; #define DtoM(type) (memtype[type]) @@ -467,17 +521,21 @@ static struct malloc_type *memtype[] = { * Names of malloc types. */ #define TYPENAME(type) \ - ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") + ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ +#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) +#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) + /* * Forward declarations. */ struct inodedep_hashhead; struct newblk_hashhead; struct pagedep_hashhead; +struct bmsafemap_hashhead; /* * Internal function prototypes. 
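[Annotation: the SOFTDEP_TYPE() macro above collapses the old block of per-type MALLOC_DEFINE() lines into one definition per dependency type that also registers debug.softdep.total.<name> and debug.softdep.current.<name> sysctls over the dep_total[]/dep_current[] arrays. The accounting half of that pattern, extracted into a standalone userland sketch (counters only; the SYSCTL plumbing is omitted):

	#include <stdio.h>

	enum { D_PAGEDEP, D_INODEDEP, D_TYPES };

	static unsigned long dep_total[D_TYPES];	/* ever allocated */
	static unsigned long dep_current[D_TYPES];	/* currently live */

	static const char *dep_name[D_TYPES] = {
		[D_PAGEDEP]  = "pagedep",	/* cf. MALLOC_DEFINE tags */
		[D_INODEDEP] = "inodedep",
	};

	/* workitem_alloc()/workitem_free() bump these under the lk mutex. */
	static void
	dep_alloc(int type)
	{
		dep_total[type]++;
		dep_current[type]++;
	}

	static void
	dep_free(int type)
	{
		dep_current[type]--;
	}

	int
	main(void)
	{
		dep_alloc(D_PAGEDEP);
		dep_free(D_PAGEDEP);
		printf("%s: %lu allocated, %lu live\n", dep_name[D_PAGEDEP],
		    dep_total[D_PAGEDEP], dep_current[D_PAGEDEP]);
		return (0);
	}

End of annotation.]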
@@ -487,59 +545,170 @@ static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct mtx *, int); static void clear_remove(struct thread *); static void clear_inodedeps(struct thread *); +static void unlinked_inodedep(struct mount *, struct inodedep *); +static void clear_unlinked_inodedep(struct inodedep *); +static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); +static void free_pagedep(struct pagedep *); +static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int handle_written_filepage(struct pagedep *, struct buf *); +static int handle_written_sbdep(struct sbdep *, struct buf *); +static void initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); +static int handle_written_indirdep(struct indirdep *, struct buf *, + struct buf**); static int handle_written_inodeblock(struct inodedep *, struct buf *); -static void handle_allocdirect_partdone(struct allocdirect *); +static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); +static void handle_written_jaddref(struct jaddref *, struct jseg *); +static void handle_written_jremref(struct jremref *, struct jseg *); +static void handle_written_jseg(struct jseg *, struct buf *); +static void handle_written_jnewblk(struct jnewblk *, struct jseg *); +static void handle_written_jfreeblk(struct jfreeblk *, struct jseg *); +static void handle_written_jfreefrag(struct jfreefrag *, struct jseg *); +static void complete_jseg(struct jseg *); +static void jseg_write(struct fs *, struct jblocks *, struct jseg *, + uint8_t *); +static void jaddref_write(struct jaddref *, uint8_t *); +static void jremref_write(struct jremref *, uint8_t *); +static void jmvref_write(struct jmvref *, uint8_t *); +static void jnewblk_write(struct jnewblk *, uint8_t *); +static void jfreeblk_write(struct jfreeblk *, uint8_t *); +static void jfreefrag_write(struct jfreefrag *, uint8_t *); +static inline void inoref_write(struct inoref *, struct jrefrec *); +static void handle_allocdirect_partdone(struct allocdirect *, + struct workhead *); +static void cancel_newblk(struct newblk *, struct workhead *); +static void indirdep_complete(struct indirdep *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); +static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); +static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static void handle_workitem_remove(struct dirrem *, struct vnode *); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); -static void free_diradd(struct diradd *); -static void free_allocindir(struct allocindir *, struct inodedep *); +static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *, + struct freeblks *); +static void free_indirdep(struct indirdep *); +static void free_diradd(struct diradd *, struct workhead *); +static void merge_diradd(struct inodedep *, struct diradd *); +static void 
complete_diradd(struct diradd *); +static struct diradd *diradd_lookup(struct pagedep *, int); +static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, + struct jremref *, struct jremref *); +static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, + struct jremref *); +static void cancel_allocindir(struct allocindir *, struct inodedep *, + struct freeblks *); +static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); -static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t, - ufs2_daddr_t *); -static void deallocate_dependencies(struct buf *, struct inodedep *); -static void free_allocdirect(struct allocdirectlst *, - struct allocdirect *, int); +static void free_jremref(struct jremref *); +static void free_jaddref(struct jaddref *); +static void free_jsegdep(struct jsegdep *); +static void free_jseg(struct jseg *); +static void free_jnewblk(struct jnewblk *); +static void free_jfreeblk(struct jfreeblk *); +static void free_jfreefrag(struct jfreefrag *); +static void free_freedep(struct freedep *); +static void journal_jremref(struct dirrem *, struct jremref *, + struct inodedep *); +static void cancel_jnewblk(struct jnewblk *, struct workhead *); +static int cancel_jaddref(struct jaddref *, struct inodedep *, + struct workhead *); +static void cancel_jfreefrag(struct jfreefrag *); +static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); +static int deallocate_dependencies(struct buf *, struct inodedep *, + struct freeblks *); +static void free_newblk(struct newblk *); +static void cancel_allocdirect(struct allocdirectlst *, + struct allocdirect *, struct freeblks *, int); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); +static void freework_freeblock(struct freework *); static void handle_workitem_freeblocks(struct freeblks *, int); +static void handle_complete_freeblocks(struct freeblks *); +static void handle_workitem_indirblk(struct freework *); +static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static void setup_allocindir_phase2(struct buf *, struct inode *, - struct allocindir *); + struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, - ufs2_daddr_t); + ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); -static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long); +static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, + ufs_lbn_t); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); -static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *); -static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t, - struct newblk **); -static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **); +static struct freefrag *allocindir_merge(struct allocindir *, + struct allocindir *); +static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, + struct bmsafemap **); +static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, + int cg); +static int newblk_find(struct newblk_hashhead *, struct mount *, 
ufs2_daddr_t, + int, struct newblk **); +static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); -static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); +static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int, + struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct mount *mp, int, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int process_worklist_item(struct mount *, int); -static void add_to_worklist(struct worklist *); +static void process_removes(struct vnode *); +static void jwork_move(struct workhead *, struct workhead *); +static void add_to_worklist(struct worklist *, int); +static void remove_from_worklist(struct worklist *); static void softdep_flush(void); static int softdep_speedup(void); +static void worklist_speedup(void); +static int journal_mount(struct mount *, struct fs *, struct ucred *); +static void journal_unmount(struct mount *); +static int journal_space(struct ufsmount *, int); +static void journal_suspend(struct ufsmount *); +static void softdep_prelink(struct vnode *, struct vnode *); +static void add_to_journal(struct worklist *); +static void remove_from_journal(struct worklist *); +static void softdep_process_journal(struct mount *, int); +static struct jremref *newjremref(struct dirrem *, struct inode *, + struct inode *ip, off_t, nlink_t); +static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, + uint16_t); +static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, + uint16_t); +static inline struct jsegdep *inoref_segattach(struct inoref *, struct jseg *); +static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); +static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, + ufs2_daddr_t, int); +static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, + ufs2_daddr_t, long, ufs_lbn_t); +static struct freework *newfreework(struct freeblks *, struct freework *, + ufs_lbn_t, ufs2_daddr_t, int, int); +static void jwait(struct worklist *wk); +static struct inodedep *inodedep_lookup_ip(struct inode *); +static int bmsafemap_rollbacks(struct bmsafemap *); +static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); +static void handle_jwork(struct workhead *); +static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, + struct mkdir **); +static struct jblocks *jblocks_create(void); +static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); +static void jblocks_free(struct jblocks *, struct mount *, int); +static void jblocks_destroy(struct jblocks *); +static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); /* * Exported softdep operations. 
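[Annotation: among the new statics declared just above, the jblocks_*() family (create, add, alloc, free, destroy) manages the pool of disk blocks backing the journal. The prototypes alone do not show how it works; purely as a mental model, and an assumption rather than the patch's implementation, one can picture a circular allocator over donated extents:

	/* Hypothetical, simplified stand-in for struct jblocks. */
	struct jext {
		long	start;		/* first block of a donated extent */
		int	blocks;		/* number of blocks in the extent */
	};

	struct jblocks_sketch {
		struct jext *exts;	/* filled by a jblocks_add() analogue */
		int	nexts;
		int	head;		/* extent the next allocation uses */
		int	off;		/* offset within that extent */
	};

	/* Hand out up to "want" contiguous blocks, wrapping circularly. */
	static long
	jblocks_sketch_alloc(struct jblocks_sketch *jb, int want, int *got)
	{
		struct jext *je = &jb->exts[jb->head];
		long blk = je->start + jb->off;

		*got = want;
		if (jb->off + want > je->blocks)
			*got = je->blocks - jb->off;	/* clip at extent end */
		jb->off += *got;
		if (jb->off == je->blocks) {		/* exhausted: wrap */
			jb->off = 0;
			jb->head = (jb->head + 1) % jb->nexts;
		}
		return (blk);
	}

End of annotation.]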
@@ -572,40 +741,128 @@ MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) +#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT +#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE + #else /* DEBUG */ -static void worklist_insert(struct workhead *, struct worklist *); -static void worklist_remove(struct worklist *); +static void worklist_insert(struct workhead *, struct worklist *, int); +static void worklist_remove(struct worklist *, int); -#define WORKLIST_INSERT(head, item) worklist_insert(head, item) -#define WORKLIST_REMOVE(item) worklist_remove(item) +#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) +#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) +#define WORKLIST_REMOVE(item) worklist_remove(item, 1) +#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) static void -worklist_insert(head, item) +worklist_insert(head, item, locked) struct workhead *head; struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if (item->wk_state & ONWORKLIST) - panic("worklist_insert: already on list"); + panic("worklist_insert: %p %s(0x%X) already on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void -worklist_remove(item) +worklist_remove(item, locked) struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if ((item->wk_state & ONWORKLIST) == 0) - panic("worklist_remove: not on list"); + panic("worklist_remove: %p %s(0x%X) not on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* + * Merge two jsegdeps keeping only the oldest one as newer references + * can't be discarded until after older references. + */ +static inline struct jsegdep * +jsegdep_merge(struct jsegdep *one, struct jsegdep *two) +{ + struct jsegdep *swp; + + if (two == NULL) + return (one); + + if (one->jd_seg->js_seq > two->jd_seg->js_seq) { + swp = one; + one = two; + two = swp; + } + WORKLIST_REMOVE(&two->jd_list); + free_jsegdep(two); + + return (one); +} + +/* + * If two freedeps are compatible free one to reduce list size. + */ +static inline struct freedep * +freedep_merge(struct freedep *one, struct freedep *two) +{ + if (two == NULL) + return (one); + + if (one->fd_freework == two->fd_freework) { + WORKLIST_REMOVE(&two->fd_list); + free_freedep(two); + } + return (one); +} + +/* + * Move journal work from one list to another. Duplicate freedeps and + * jsegdeps are coalesced to keep the lists as small as possible. 
+ */ +static void +jwork_move(dst, src) + struct workhead *dst; + struct workhead *src; +{ + struct freedep *freedep; + struct jsegdep *jsegdep; + struct worklist *wkn; + struct worklist *wk; + + KASSERT(dst != src, + ("jwork_move: dst == src")); + freedep = NULL; + jsegdep = NULL; + LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { + if (wk->wk_type == D_JSEGDEP) + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } + + mtx_assert(&lk, MA_OWNED); + while ((wk = LIST_FIRST(src)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(dst, wk); + if (wk->wk_type == D_JSEGDEP) { + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + continue; + } + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } +} + +/* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); @@ -623,13 +880,16 @@ workitem_free(item, type) #ifdef DEBUG if (item->wk_state & ONWORKLIST) - panic("workitem_free: still on list"); + panic("workitem_free: %s(0x%X) still on list", + TYPENAME(item->wk_type), item->wk_state); if (item->wk_type != type) - panic("workitem_free: type mismatch"); + panic("workitem_free: type mismatch %s != %s", + TYPENAME(item->wk_type), TYPENAME(type)); #endif ump = VFSTOUFS(item->wk_mp); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); + dep_current[type]--; free(item, DtoM(type)); } @@ -643,6 +903,8 @@ workitem_alloc(item, type, mp) item->wk_mp = mp; item->wk_state = 0; ACQUIRE_LOCK(&lk); + dep_current[type]++; + dep_total[type]++; VFSTOUFS(mp)->softdep_deps++; VFSTOUFS(mp)->softdep_accdeps++; FREE_LOCK(&lk); @@ -679,23 +941,38 @@ static int stat_inode_bitmap; /* bufs redirtied as static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ -SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); -SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); -SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); -/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */ +SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, + &max_softdeps, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, + &tickdelay, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, + &maxindirdeps, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, + &stat_worklist_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, + 
&stat_blk_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, + &stat_ino_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, + &stat_blk_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, + &stat_ino_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, + &stat_sync_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, + &stat_indir_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, + &stat_inode_bitmap, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, + &stat_direct_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, + &stat_dir_entry, 0, ""); SYSCTL_DECL(_vfs_ffs); +LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; +static u_long bmsafemap_hash; /* size of hash table - 1 */ + static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); @@ -770,16 +1047,22 @@ softdep_flush(void) } } -static int -softdep_speedup(void) +static void +worklist_speedup(void) { - mtx_assert(&lk, MA_OWNED); if (req_pending == 0) { req_pending = 1; wakeup(&req_pending); } +} +static int +softdep_speedup(void) +{ + + worklist_speedup(); + bd_speedup(); return speedup_syncer(); } @@ -791,15 +1074,17 @@ softdep_flush(void) * and does so in order from first to last. */ static void -add_to_worklist(wk) +add_to_worklist(wk, nodelay) struct worklist *wk; + int nodelay; { struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); ump = VFSTOUFS(wk->wk_mp); if (wk->wk_state & ONWORKLIST) - panic("add_to_worklist: already on list"); + panic("add_to_worklist: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (LIST_EMPTY(&ump->softdep_workitem_pending)) LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); @@ -807,9 +1092,33 @@ static void LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; ump->softdep_on_worklist += 1; + if (nodelay) + worklist_speedup(); } /* + * Remove the item to be processed. If we are removing the last + * item on the list, we need to recalculate the tail pointer. + */ +static void +remove_from_worklist(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + struct worklist *wkend; + + ump = VFSTOUFS(wk->wk_mp); + WORKLIST_REMOVE(wk); + if (wk == ump->softdep_worklist_tail) { + LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) + if (LIST_NEXT(wkend, wk_list) == NULL) + break; + ump->softdep_worklist_tail = wkend; + } + ump->softdep_on_worklist -= 1; +} + +/* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they @@ -838,8 +1147,9 @@ softdep_process_worklist(mp, full) ACQUIRE_LOCK(&lk); loopcount = 1; starttime = time_second; + softdep_process_journal(mp, full?MNT_WAIT:0); while (ump->softdep_on_worklist > 0) { - if ((cnt = process_worklist_item(mp, 0)) == -1) + if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1) break; else matchcnt += cnt; @@ -871,16 +1181,61 @@ softdep_process_worklist(mp, full) * second. Otherwise the other mountpoints may get * excessively backlogged. 
*/ - if (!full && starttime != time_second) { - matchcnt = -1; + if (!full && starttime != time_second) break; - } } FREE_LOCK(&lk); return (matchcnt); } /* + * Process all removes associated with a vnode if we are running out of + * journal space. Any other process which attempts to flush these will + * be unable as we have the vnodes locked. + */ +static void +process_removes(vp) + struct vnode *vp; +{ + struct inodedep *inodedep; + struct dirrem *dirrem; + struct mount *mp; + ino_t inum; + + mtx_assert(&lk, MA_OWNED); + + mp = vp->v_mount; + inum = VTOI(vp)->i_number; + for (;;) { + if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) + return; + LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) + if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == + (COMPLETE | ONWORKLIST)) + break; + if (dirrem == NULL) + return; + /* + * If another thread is trying to lock this vnode it will + * fail but we must wait for it to do so before we can + * proceed. + */ + if (dirrem->dm_state & INPROGRESS) { + dirrem->dm_state |= IOWAITING; + msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0); + continue; + } + remove_from_worklist(&dirrem->dm_list); + FREE_LOCK(&lk); + if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) + panic("process_removes: suspended filesystem"); + handle_workitem_remove(dirrem, vp); + vn_finished_secondary_write(mp); + ACQUIRE_LOCK(&lk); + } +} + +/* * Process one item on the worklist. */ static int @@ -888,7 +1243,7 @@ process_worklist_item(mp, flags) struct mount *mp; int flags; { - struct worklist *wk, *wkend; + struct worklist *wk, *wkXXX; struct ufsmount *ump; struct vnode *vp; int matchcnt = 0; @@ -908,11 +1263,14 @@ process_worklist_item(mp, flags) * inodes, we have to skip over any dirrem requests whose * vnodes are resident and locked. */ + vp = NULL; ump = VFSTOUFS(mp); - vp = NULL; LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { - if (wk->wk_state & INPROGRESS) + if (wk->wk_state & INPROGRESS) { + wkXXX = wk; continue; + } + wkXXX = wk; /* Record the last valid wk pointer. */ if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) break; wk->wk_state |= INPROGRESS; @@ -921,6 +1279,10 @@ process_worklist_item(mp, flags) ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum, LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); ACQUIRE_LOCK(&lk); + if (wk->wk_state & IOWAITING) { + wk->wk_state &= ~IOWAITING; + wakeup(wk); + } wk->wk_state &= ~INPROGRESS; ump->softdep_on_worklist_inprogress--; if (vp != NULL) @@ -928,21 +1290,7 @@ process_worklist_item(mp, flags) } if (wk == 0) return (-1); - /* - * Remove the item to be processed. If we are removing the last - * item on the list, we need to recalculate the tail pointer. - * As this happens rarely and usually when the list is short, - * we just run down the list to find it rather than tracking it - * in the above loop. 
- */ - WORKLIST_REMOVE(wk); - if (wk == ump->softdep_worklist_tail) { - LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) - if (LIST_NEXT(wkend, wk_list) == NULL) - break; - ump->softdep_worklist_tail = wkend; - } - ump->softdep_on_worklist -= 1; + remove_from_worklist(wk); FREE_LOCK(&lk); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); @@ -952,6 +1300,8 @@ process_worklist_item(mp, flags) case D_DIRREM: /* removal of a directory entry */ handle_workitem_remove(WK_DIRREM(wk), vp); + if (vp) + vput(vp); break; case D_FREEBLKS: @@ -969,6 +1319,11 @@ process_worklist_item(mp, flags) handle_workitem_freefile(WK_FREEFILE(wk)); break; + case D_FREEWORK: + /* Final block in an indirect was freed. */ + handle_workitem_indirblk(WK_FREEWORK(wk)); + break; + default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); @@ -982,19 +1337,22 @@ process_worklist_item(mp, flags) /* * Move dependencies from one buffer to another. */ -void +int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; + int dirty; - if (!LIST_EMPTY(&newbp->b_dep)) - panic("softdep_move_dependencies: need merge code"); - wktail = 0; + dirty = 0; + wktail = NULL; ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); + if (wk->wk_type == D_BMSAFEMAP && + bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) + dirty = 1; if (wktail == 0) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else @@ -1002,6 +1360,8 @@ softdep_move_dependencies(oldbp, newbp) wktail = wk; } FREE_LOCK(&lk); + + return (dirty); } /* @@ -1198,23 +1558,22 @@ pagedep_find(pagedephd, ino, lbn, mp, flags, paged * This routine must be called with splbio interrupts blocked. 
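*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * pagedep_lookup() below (and newblk_lookup() after it) use the standard
 * "search, unlock, allocate, relock, search again" idiom so the allocation
 * can sleep without holding the softdep lock; if another thread created the
 * entry while the lock was dropped, the fresh allocation is discarded.  A
 * userland pthread model of the idiom; the bucket, types and names here are
 * all invented and error handling is elided.  The quoted patch resumes after
 * the sketch.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry { int key; struct entry *next; };
static struct entry *table;		/* single bucket for brevity */
static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

static struct entry *
lookup(int key, int alloc)		/* called with lk held */
{
	struct entry *e, *new;

	for (e = table; e != NULL; e = e->next)
		if (e->key == key)
			return (e);
	if (!alloc)
		return (NULL);
	/* Drop the lock across the allocation, then re-check. */
	pthread_mutex_unlock(&lk);
	new = calloc(1, sizeof(*new));	/* may sleep in the kernel analogue */
	if (new == NULL)
		abort();		/* error handling elided */
	pthread_mutex_lock(&lk);
	for (e = table; e != NULL; e = e->next)
		if (e->key == key) {	/* lost the race; discard ours */
			free(new);
			return (e);
		}
	new->key = key;
	new->next = table;
	table = new;
	return (new);
}

int
main(void)
{
	pthread_mutex_lock(&lk);
	struct entry *e = lookup(42, 1);	/* allocates */
	e = lookup(42, 1);			/* finds the same entry */
	pthread_mutex_unlock(&lk);
	return (e == NULL);
}
/* [the quoted patch resumes]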
*/ static int -pagedep_lookup(ip, lbn, flags, pagedeppp) - struct inode *ip; +pagedep_lookup(mp, ino, lbn, flags, pagedeppp) + struct mount *mp; + ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; - struct mount *mp; int ret; int i; mtx_assert(&lk, MA_OWNED); - mp = ITOV(ip)->v_mount; - pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); + pagedephd = PAGEDEP_HASH(mp, ino, lbn); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp || (flags & DEPALLOC) == 0) return (ret); FREE_LOCK(&lk); @@ -1222,12 +1581,12 @@ static int M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(&lk); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp) { WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } - pagedep->pd_ino = ip->i_number; + pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); @@ -1314,10 +1673,13 @@ inodedep_lookup(mp, inum, flags, inodedeppp) inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; - inodedep->id_buf = NULL; + inodedep->id_bmsafemap = NULL; + inodedep->id_mkdiradd = NULL; + LIST_INIT(&inodedep->id_dirremhd); LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); + TAILQ_INIT(&inodedep->id_inoreflst); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); @@ -1336,17 +1698,29 @@ u_long newblk_hash; /* size of hash table - 1 */ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) static int -newblk_find(newblkhd, fs, newblkno, newblkpp) +newblk_find(newblkhd, mp, newblkno, flags, newblkpp) struct newblk_hashhead *newblkhd; - struct fs *fs; + struct mount *mp; ufs2_daddr_t newblkno; + int flags; struct newblk **newblkpp; { struct newblk *newblk; - LIST_FOREACH(newblk, newblkhd, nb_hash) - if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) - break; + LIST_FOREACH(newblk, newblkhd, nb_hash) { + if (newblkno != newblk->nb_newblkno) + continue; + if (mp != newblk->nb_list.wk_mp) + continue; + /* + * If we're creating a new dependency don't match those that + * have already been converted to allocdirects. This is for + * a frag extend. + */ + if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) + continue; + break; + } if (newblk) { *newblkpp = newblk; return (1); @@ -1361,8 +1735,8 @@ static int * Found or allocated entry is returned in newblkpp. 
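*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * The jblocks/jextent structures introduced just below keep the journal's
 * disk space as an array of extents: jblocks_add() coalesces runs that are
 * physically adjacent and doubles the array when it fills, and
 * jblocks_alloc() hands out a contiguous range, truncating the request at
 * an extent boundary so callers loop on short grants.  A standalone
 * userland model of both operations; all names are invented, a plain
 * int64_t stands in for ufs2_daddr_t, and error handling is elided.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ext { int64_t daddr; int blocks; };
struct extmap { struct ext *ext; int used; int avail; int head; int off; };

static void
extmap_add(struct extmap *m, int64_t daddr, int blocks)
{
	struct ext *e = &m->ext[m->used];

	if (e->daddr == 0) {			/* very first run */
		e->daddr = daddr;
		e->blocks = blocks;
	} else if (e->daddr + e->blocks == daddr) {
		e->blocks += blocks;		/* extend the last run */
	} else {
		if (++m->used == m->avail) {	/* grow: double the array */
			m->avail *= 2;
			m->ext = realloc(m->ext,
			    sizeof(*m->ext) * m->avail);
			memset(m->ext + m->used, 0,
			    sizeof(*m->ext) * (m->avail - m->used));
		}
		m->ext[m->used].daddr = daddr;
		m->ext[m->used].blocks = blocks;
	}
}

static int64_t
extmap_alloc(struct extmap *m, int blocks, int *granted)
{
	struct ext *e = &m->ext[m->head];
	int left = e->blocks - m->off;

	if (left == 0) {			/* wrap to the next extent */
		m->off = 0;
		if (++m->head > m->used)
			m->head = 0;
		e = &m->ext[m->head];
		left = e->blocks;
	}
	*granted = left < blocks ? left : blocks;
	m->off += *granted;
	return (e->daddr + m->off - *granted);
}

int
main(void)
{
	struct extmap m = { calloc(4, sizeof(struct ext)), 0, 4, 0, 0 };
	int got;

	extmap_add(&m, 100, 8);
	extmap_add(&m, 108, 8);		/* adjacent: coalesces to 100..115 */
	extmap_add(&m, 200, 4);		/* not adjacent: new extent */
	printf("%d extents\n", m.used + 1);		/* 2 */
	printf("blk %lld got %d\n",
	    (long long)extmap_alloc(&m, 20, &got), got);	/* 100, 16 */
	return (0);
}
/* [the quoted patch resumes]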
*/ static int -newblk_lookup(fs, newblkno, flags, newblkpp) - struct fs *fs; +newblk_lookup(mp, newblkno, flags, newblkpp) + struct mount *mp; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; @@ -1370,21 +1744,25 @@ static int struct newblk *newblk; struct newblk_hashhead *newblkhd; - newblkhd = NEWBLK_HASH(fs, newblkno); - if (newblk_find(newblkhd, fs, newblkno, newblkpp)) + newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); + if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(&lk); - newblk = malloc(sizeof(struct newblk), - M_NEWBLK, M_SOFTDEP_FLAGS); + newblk = malloc(sizeof(union allblk), M_NEWBLK, + M_SOFTDEP_FLAGS | M_ZERO); + workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); ACQUIRE_LOCK(&lk); - if (newblk_find(newblkhd, fs, newblkno, newblkpp)) { - free(newblk, M_NEWBLK); + if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { + WORKITEM_FREE(newblk, D_NEWBLK); return (1); } - newblk->nb_state = 0; - newblk->nb_fs = fs; + newblk->nb_freefrag = NULL; + LIST_INIT(&newblk->nb_indirdeps); + LIST_INIT(&newblk->nb_newdirblk); + LIST_INIT(&newblk->nb_jwork); + newblk->nb_state = ATTACHED; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; @@ -1401,10 +1779,10 @@ softdep_initialize() LIST_INIT(&mkdirlisthd); max_softdeps = desiredvnodes * 4; - pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, - &pagedep_hash); + pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); - newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); + newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); + bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; @@ -1428,6 +1806,7 @@ softdep_uninitialize() hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); + hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); } /* @@ -1457,9 +1836,17 @@ softdep_mount(devvp, mp, fs, cred) MNT_IUNLOCK(mp); ump = VFSTOUFS(mp); LIST_INIT(&ump->softdep_workitem_pending); + LIST_INIT(&ump->softdep_journal_pending); + TAILQ_INIT(&ump->softdep_unlinked); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; + if ((fs->fs_flags & FS_SUJ) && + (error = journal_mount(mp, fs, cred)) != 0) { + printf("fs->fs_flags 0x%X\n", fs->fs_flags); + printf("Failed to start journal: %d\n", error); + return (error); + } /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation @@ -1493,7 +1880,1872 @@ softdep_mount(devvp, mp, fs, cred) return (0); } +void +softdep_unmount(mp) + struct mount *mp; +{ + + if (mp->mnt_flag & MNT_SUJ) + journal_unmount(mp); +} + +struct jblocks { + struct jseglst jb_segs; /* TAILQ of current segments. */ + struct jseg *jb_writeseg; /* Next write to complete. */ + struct jextent *jb_extent; /* Extent array. */ + uint64_t jb_nextseq; /* Next sequence number. */ + uint64_t jb_oldestseq; /* Oldest active sequence number. */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ + int jb_blocks; /* Total disk blocks covered. */ + int jb_free; /* Total disk blocks free. 
*/ + int jb_min; /* Minimum free space. */ + int jb_low; /* Low on space. */ + int jb_age; /* Insertion time of oldest rec. */ + int jb_suspended; /* Did journal suspend writes? */ +}; + +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + + jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); + TAILQ_INIT(&jblocks->jb_segs); + jblocks->jb_avail = 10; + jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + + return (jblocks); +} + +static ufs2_daddr_t +jblocks_alloc(jblocks, bytes, actual) + struct jblocks *jblocks; + int bytes; + int *actual; +{ + ufs2_daddr_t daddr; + struct jextent *jext; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + jblocks->jb_head = 0; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + jblocks->jb_off += freecnt; + jblocks->jb_free -= freecnt; + + return (daddr); +} + +static void +jblocks_free(jblocks, mp, bytes) + struct jblocks *jblocks; + struct mount *mp; + int bytes; +{ + + jblocks->jb_free += bytes / DEV_BSIZE; + if (jblocks->jb_suspended) + worklist_speedup(); + wakeup(jblocks); +} + +static void +jblocks_destroy(jblocks) + struct jblocks *jblocks; +{ + + if (jblocks->jb_extent) + free(jblocks->jb_extent, M_JBLOCKS); + free(jblocks, M_JBLOCKS); +} + +static void +jblocks_add(jblocks, daddr, blocks) + struct jblocks *jblocks; + ufs2_daddr_t daddr; + int blocks; +{ + struct jextent *jext; + + jblocks->jb_blocks += blocks; + jblocks->jb_free += blocks; + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + memcpy(jext, jblocks->jb_extent, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent, M_JBLOCKS); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; +} + /* + * Open and verify the journal file. + */ +static int +journal_mount(mp, fs, cred) + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + struct jblocks *jblocks; + struct vnode *vp; + struct inode *ip; + ufs2_daddr_t blkno; + int bcount; + int error; + int i; + + mp->mnt_flag |= MNT_SUJ; + error = VFS_VGET(mp, fs->fs_sujournal, LK_EXCLUSIVE, &vp); + if (error) + return (error); + ip = VTOI(vp); + if (ip->i_size < SUJ_MIN) { + error = ENOSPC; + goto out; + } + bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. 
*/ + jblocks = jblocks_create(); + for (i = 0; i < bcount; i++) { + error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); + if (error) + break; + jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); + } + if (error) { + jblocks_destroy(jblocks); + goto out; + } + jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ + jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ + DIP_SET(ip, i_modrev, fs->fs_mtime); + ip->i_flags |= IN_MODIFIED; + ffs_update(vp, 1); + VFSTOUFS(mp)->softdep_jblocks = jblocks; +out: + vput(vp); + return (error); +} + +static void +journal_unmount(mp) + struct mount *mp; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + if (ump->softdep_jblocks) + jblocks_destroy(ump->softdep_jblocks); + ump->softdep_jblocks = NULL; +} + +/* + * Called when a journal record is ready to be written. Space is allocated + * and the journal entry is created when the journal is flushed to stable + * store. + */ +static void +add_to_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + ump = VFSTOUFS(wk->wk_mp); + if (wk->wk_state & ONWORKLIST) + panic("add_to_journal: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); + wk->wk_state |= ONWORKLIST | DEPCOMPLETE; + if (LIST_EMPTY(&ump->softdep_journal_pending)) { + ump->softdep_jblocks->jb_age = ticks; + LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); + } else + LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); + ump->softdep_journal_tail = wk; + ump->softdep_on_journal += 1; +} + +/* + * Remove an arbitrary item from the journal worklist and maintain the tail + * pointer. This happens when a new operation obviates the need to + * journal an old operation. + */ +static void +remove_from_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + ump = VFSTOUFS(wk->wk_mp); +#ifdef DEBUG /* XXX Expensive, temporary. */ + { + struct worklist *wkn; + + LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) + if (wkn == wk) + break; + if (wkn == NULL) + panic("remove_from_journal: %p is not in journal", wk); + } +#endif + /* + * We emulate a TAILQ to save space in most structures which do not + * require TAILQ semantics. Here we must update the tail position + * when removing the tail which is not the final entry. + */ + if (ump->softdep_journal_tail == wk) + ump->softdep_journal_tail = + (struct worklist *)wk->wk_list.le_prev; + + WORKLIST_REMOVE(wk); + ump->softdep_on_journal -= 1; +} + +static int +journal_space(ump, thresh) + struct ufsmount *ump; + int thresh; +{ + struct jblocks *jblocks; + int avail; + + jblocks = ump->softdep_jblocks; + avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; + avail = jblocks->jb_free - avail; + + return (avail > thresh); +} + +static void +journal_suspend(ump) + struct ufsmount *ump; +{ + struct jblocks *jblocks; + struct mount *mp; + + mp = UFSTOVFS(ump); + jblocks = ump->softdep_jblocks; + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { + mp->mnt_kern_flag |= MNTK_SUSPEND; + mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); + } + jblocks->jb_suspended = 1; + MNT_IUNLOCK(mp); +} + +/* + * Called before any allocation function to be certain that there is + * sufficient space in the journal prior to creating any new records. + * Since in the case of block allocation we may have multiple locked + * buffers at the time of the actual allocation we can not block + * when the journal records are created. 
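*/
/*
 * [Editor's note -- worked example, not part of the quoted patch.]
 * journal_space() above charges every not-yet-flushed record
 * (softdep_on_journal * JREC_SIZE bytes) against the free block count
 * before comparing with a threshold.  The arithmetic below assumes
 * JREC_SIZE is 32 bytes, which agrees with the "20% of 1MB = 6553
 * records" figure in the comment that follows (0.2 * 1048576 / 32 = 6553).
 */
#include <stdio.h>

#define DEV_BSIZE	512
#define JREC_SIZE	32	/* assumed on-disk record size */

static int
journal_space(int on_journal, int jb_free, int thresh)
{
	/* Blocks the pending records will consume once flushed. */
	int pending = (on_journal * JREC_SIZE) / DEV_BSIZE;

	return (jb_free - pending > thresh);
}

int
main(void)
{
	/* 2048 free blocks, 4096 pending records, low-water mark 683. */
	printf("%d\n", journal_space(4096, 2048, 683));	/* 1: space ok */
	return (0);
}
/* [the quoted patch resumes]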
Doing so would create a deadlock + * if any of these buffers needed to be flushed to reclaim space. Instead + * we require a sufficiently large amount of available space such that + * each thread in the system could have passed this allocation check and + * still have sufficient free space. With 20% of a minimum journal size + * of 1MB we have 6553 records available. + */ +int +softdep_prealloc(vp, waitok) + struct vnode *vp; + int waitok; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + + if (DOINGSUJ(vp) == 0) + return (0); + ump = VFSTOUFS(vp->v_mount); + jblocks = ump->softdep_jblocks; + ACQUIRE_LOCK(&lk); + if (journal_space(ump, jblocks->jb_low)) { + FREE_LOCK(&lk); + return (0); + } + FREE_LOCK(&lk); + if (waitok == MNT_NOWAIT) + return (ENOSPC); + /* + * Attempt to sync this vnode once to flush any journal + * work attached to it. + */ + ffs_syncvnode(vp, waitok); + ACQUIRE_LOCK(&lk); + process_removes(vp); + if (journal_space(ump, jblocks->jb_low) == 0) { + softdep_speedup(); + if (journal_space(ump, jblocks->jb_min) == 0) + journal_suspend(ump); + } + FREE_LOCK(&lk); + + return (0); +} + +static void +softdep_prelink(dvp, vp) + struct vnode *dvp; + struct vnode *vp; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + + ump = VFSTOUFS(dvp->v_mount); + jblocks = ump->softdep_jblocks; + mtx_assert(&lk, MA_OWNED); + if (journal_space(ump, jblocks->jb_low)) + return; + FREE_LOCK(&lk); + if (vp) + ffs_syncvnode(vp, MNT_NOWAIT); + ffs_syncvnode(dvp, MNT_WAIT); + ACQUIRE_LOCK(&lk); + /* Process vp before dvp as it may create .. removes. */ + if (vp) + process_removes(vp); + process_removes(dvp); + softdep_speedup(); + process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); + process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); + if (journal_space(ump, jblocks->jb_low) == 0) { + softdep_speedup(); + if (journal_space(ump, jblocks->jb_min) == 0) + journal_suspend(ump); + } +} + +static void +jseg_write(fs, jblocks, jseg, data) + struct fs *fs; + struct jblocks *jblocks; + struct jseg *jseg; + uint8_t *data; +{ + struct jsegrec *rec; + + rec = (struct jsegrec *)data; + rec->jsr_seq = jseg->js_seq; + rec->jsr_oldest = jblocks->jb_oldestseq; + rec->jsr_cnt = jseg->js_cnt; + rec->jsr_crc = 0; + rec->jsr_time = fs->fs_mtime; +} + +static inline void +inoref_write(inoref, rec) + struct inoref *inoref; + struct jrefrec *rec; +{ + rec->jr_ino = inoref->if_ino; + rec->jr_parent = inoref->if_parent; + rec->jr_nlink = inoref->if_nlink; + rec->jr_mode = inoref->if_mode; + rec->jr_diroff = inoref->if_diroff; +} + +static void +jaddref_write(jaddref, data) + struct jaddref *jaddref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_ADDREF; + inoref_write(&jaddref->ja_ref, rec); +} + +static void +jremref_write(jremref, data) + struct jremref *jremref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_REMREF; + inoref_write(&jremref->jr_ref, rec); +} + +static void +jmvref_write(jmvref, data) + struct jmvref *jmvref; + uint8_t *data; +{ + struct jmvrec *rec; + + rec = (struct jmvrec *)data; + rec->jm_op = JOP_MVREF; + rec->jm_ino = jmvref->jm_ino; + rec->jm_parent = jmvref->jm_parent; + rec->jm_oldoff = jmvref->jm_oldoff; + rec->jm_newoff = jmvref->jm_newoff; +} + +static void +jnewblk_write(jnewblk, data) + struct jnewblk *jnewblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_NEWBLK; + rec->jb_ino = jnewblk->jn_ino; + rec->jb_blkno = jnewblk->jn_blkno; + 
rec->jb_lbn = jnewblk->jn_lbn; + rec->jb_frags = jnewblk->jn_frags; + rec->jb_oldfrags = jnewblk->jn_oldfrags; +} + +static void +jfreeblk_write(jfreeblk, data) + struct jfreeblk *jfreeblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreeblk->jf_ino; + rec->jb_blkno = jfreeblk->jf_blkno; + rec->jb_lbn = jfreeblk->jf_lbn; + rec->jb_frags = jfreeblk->jf_frags; + rec->jb_oldfrags = 0; +} + +static void +jfreefrag_write(jfreefrag, data) + struct jfreefrag *jfreefrag; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreefrag->fr_ino; + rec->jb_blkno = jfreefrag->fr_blkno; + rec->jb_lbn = jfreefrag->fr_lbn; + rec->jb_frags = jfreefrag->fr_frags; + rec->jb_oldfrags = 0; +} + +/* + * Flush some journal records to disk. + */ +static void +softdep_process_journal(mp, flags) + struct mount *mp; + int flags; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + struct worklist *wk; + struct jseg *jseg; + struct buf *bp; + uint8_t *data; + struct fs *fs; + int segwritten; + int jrecmin; /* Minimum write size. */ + int jrecmax; /* Maximum write size. */ + int size; + int cnt; + + if ((mp->mnt_flag & MNT_SUJ) == 0) + return; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + jblocks = ump->softdep_jblocks; + /* + * We write anywhere between a disk block and fs block. The upper + * bound is picked to prevent buffer cache fragmentation and limit + * processing time per I/O. + */ + jrecmax = fs->fs_bsize / JREC_SIZE; + jrecmin = DEV_BSIZE / JREC_SIZE; + segwritten = 0; + while ((cnt = ump->softdep_on_journal) != 0) { + /* + * Create a new segment to hold as many as 'cnt' journal + * entries and add them to the segment. Notice cnt is + * off by one to account for the space required by the + * jsegrec. If we don't have a full block to log skip it + * unless we haven't written anything in 10 seconds. + */ + cnt++; + if (cnt < jrecmax) { + if (segwritten) + return; + if (flags != MNT_WAIT && + (ticks - jblocks->jb_age) > hz*10) + break; + } + /* + * Verify some free journal space. softdep_prealloc() should + * guarantee that we don't run out so this is indicative of + * a problem with the flow control. Try to recover + * gracefully in any event. + */ + while (jblocks->jb_free == 0) { + if (flags != MNT_WAIT) + break; + printf("softdep: Out of journal space!\n"); + softdep_speedup(); + msleep(jblocks, &lk, PRIBIO, "jblocks", 1); + } + FREE_LOCK(&lk); + jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); + workitem_alloc(&jseg->js_list, D_JSEG, mp); + LIST_INIT(&jseg->js_entries); + jseg->js_state = ATTACHED; + jseg->js_refs = 1; /* Self reference. */ + jseg->js_jblocks = jblocks; + size = roundup2(cnt * JREC_SIZE, DEV_BSIZE); + bp = geteblk(fs->fs_bsize, 0); + ACQUIRE_LOCK(&lk); + /* + * If there was a race while we were allocating the block + * and jseg the entry we care about was likely written. + * We bail out in both the WAIT and NOWAIT case and assume + * the caller will loop if the entry it cares about is + * not written. + */ + if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) { + bp->b_flags |= B_INVAL | B_NOCACHE; + WORKITEM_FREE(jseg, D_JSEG); + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); + break; + } + /* + * Calculate the disk block size required for the available + * records rounded to the min size. 
+ */ + cnt = ump->softdep_on_journal + 1; + if (cnt < jrecmax) + cnt = roundup2(cnt, jrecmin); + else + cnt = jrecmax; + size = cnt * JREC_SIZE; + /* + * Allocate a disk block for this journal data and account + * for truncation of the requested size if enough contiguous + * space was not available. + */ + bp->b_blkno = bp->b_lblkno = jblocks_alloc(jblocks, size, + &size); + bp->b_offset = bp->b_blkno * DEV_BSIZE; + bp->b_bcount = size; + bp->b_bufobj = &ump->um_devvp->v_bufobj; + bp->b_flags &= ~B_INVAL; + /* + * Initialize our jseg with as many as cnt - 1 records. + * Assign the next sequence number to it and link it + * in-order. + */ + cnt = MIN(ump->softdep_on_journal, (size / JREC_SIZE) - 1); + jseg->js_buf = bp; + jseg->js_cnt = cnt; + jseg->js_size = size; + jseg->js_seq = jblocks->jb_nextseq++; + if (TAILQ_EMPTY(&jblocks->jb_segs)) + jblocks->jb_oldestseq = jseg->js_seq; + TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); + if (jblocks->jb_writeseg == NULL) + jblocks->jb_writeseg = jseg; + /* + * Start filling in records from the pending list. + */ + data = bp->b_data; + jseg_write(fs, jblocks, jseg, data); + data += JREC_SIZE; + while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) + != NULL) { + remove_from_journal(wk); + wk->wk_state |= IOSTARTED; + WORKLIST_INSERT(&jseg->js_entries, wk); + switch (wk->wk_type) { + case D_JADDREF: + jaddref_write(WK_JADDREF(wk), data); + break; + case D_JREMREF: + jremref_write(WK_JREMREF(wk), data); + break; + case D_JMVREF: + jmvref_write(WK_JMVREF(wk), data); + break; + case D_JNEWBLK: + jnewblk_write(WK_JNEWBLK(wk), data); + break; + case D_JFREEBLK: + jfreeblk_write(WK_JFREEBLK(wk), data); + break; + case D_JFREEFRAG: + jfreefrag_write(WK_JFREEFRAG(wk), data); + break; + default: + panic("process_journal: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + data += JREC_SIZE; + if (--cnt == 0) + break; + } + /* + * Write this one buffer and continue. + */ +#if 1 + WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); + FREE_LOCK(&lk); + BO_LOCK(bp->b_bufobj); + bgetvp(ump->um_devvp, bp); + BO_UNLOCK(bp->b_bufobj); + /* XXX Could bawrite here. */ + bwrite(bp); + ACQUIRE_LOCK(&lk); +#else + /* This case simulates the write but does not log anything. */ + handle_written_jseg(jseg, bp); + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); +#endif + segwritten++; + } + /* + * If we've suspended the filesystem because we ran out of journal + * space either try to sync it here to make some progress or + * unsuspend it if we already have. + */ + if (flags == 0 && jblocks && jblocks->jb_suspended) { + if (journal_space(ump, jblocks->jb_min)) { + FREE_LOCK(&lk); + jblocks->jb_suspended = 0; + mp->mnt_susp_owner = curthread; + vfs_write_resume(mp); + ACQUIRE_LOCK(&lk); + return; + } + FREE_LOCK(&lk); + VFS_SYNC(mp, MNT_NOWAIT); + ffs_sbupdate(ump, MNT_WAIT, 0); + ACQUIRE_LOCK(&lk); + } +} + +/* + * Complete a jseg, allowing all dependencies awaiting journal writes + * to proceed. Each journal dependency also attaches a jsegdep to dependent + * structures so that the journal segment can be freed to reclaim space. 
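*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * handle_written_jseg() below refuses to process a segment until every
 * older segment is also on disk: completions that arrive out of order are
 * parked until jb_writeseg catches up, so entries are always completed in
 * sequence order.  A small standalone model of that ordering gate (the
 * segment array and names are invented):
 */
#include <stdio.h>

#define NSEGS 4
static int done[NSEGS];	/* set by (simulated) write completion */
static int writeseg;	/* index of the oldest unfinished segment */

static void
segment_written(int i)
{
	done[i] = 1;
	if (i != writeseg)	/* out of order: defer to the oldest */
		return;
	while (writeseg < NSEGS && done[writeseg]) {
		printf("completing seg %d\n", writeseg);
		writeseg++;	/* process its entries, then advance */
	}
}

int
main(void)
{
	segment_written(1);	/* deferred: seg 0 still outstanding */
	segment_written(0);	/* completes segs 0 and 1 in order */
	return (0);
}
/* [the quoted patch resumes]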
+ */ +static void +complete_jseg(jseg) + struct jseg *jseg; +{ + struct worklist *wk; + struct jmvref *jmvref; + int waiting; + int i; + + i = 0; + while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { + WORKLIST_REMOVE(wk); + waiting = wk->wk_state & IOWAITING; + wk->wk_state &= ~(IOSTARTED | IOWAITING); + wk->wk_state |= COMPLETE; + KASSERT(i < jseg->js_cnt, + ("handle_written_jseg: overflow %d >= %d", + i, jseg->js_cnt)); + jseg->js_refs++; /* Ref goes to the jsegdep below. */ + switch (wk->wk_type) { + case D_JADDREF: + handle_written_jaddref(WK_JADDREF(wk), jseg); + break; + case D_JREMREF: + handle_written_jremref(WK_JREMREF(wk), jseg); + break; + case D_JMVREF: + jseg->js_refs--; /* No jsegdep here. */ + jmvref = WK_JMVREF(wk); + LIST_REMOVE(jmvref, jm_deps); + free_pagedep(jmvref->jm_pagedep); + WORKITEM_FREE(jmvref, D_JMVREF); + break; + case D_JNEWBLK: + handle_written_jnewblk(WK_JNEWBLK(wk), jseg); + break; + case D_JFREEBLK: + handle_written_jfreeblk(WK_JFREEBLK(wk), jseg); + break; + case D_JFREEFRAG: + handle_written_jfreefrag(WK_JFREEFRAG(wk), jseg); + break; + default: + panic("handle_written_jseg: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + if (waiting) + wakeup(wk); + } + /* Release the self reference so the structure may be freed. */ + free_jseg(jseg); +} + +/* + * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg + * completions in order only. + */ +static void +handle_written_jseg(jseg, bp) + struct jseg *jseg; + struct buf *bp; +{ + struct jblocks *jblocks; + struct jseg *jsegn; + + if (jseg->js_refs == 0) + panic("handle_written_jseg: No self-reference on %p", jseg); + jseg->js_state |= DEPCOMPLETE; + /* + * We'll never need this buffer again, set flags so it will be + * discarded. + */ + bp->b_flags |= B_INVAL | B_NOCACHE; + jblocks = jseg->js_jblocks; + /* + * Don't allow out of order completions. If this isn't the first + * block wait for it to write before we're done. + */ + if (jseg != jblocks->jb_writeseg) + return; + /* Iterate through available jsegs processing their entries. */ + do { + jsegn = TAILQ_NEXT(jseg, js_next); + complete_jseg(jseg); + jseg = jsegn; + } while (jseg && jseg->js_state & DEPCOMPLETE); + jblocks->jb_writeseg = jseg; +} + +static inline struct jsegdep * +inoref_segattach(inoref, jseg) + struct inoref *inoref; + struct jseg *jseg; +{ + struct jsegdep *jsegdep; + + jsegdep = inoref->if_jsegdep; + inoref->if_jsegdep = NULL; + jsegdep->jd_seg = jseg; + + return (jsegdep); +} + +/* + * Called once a jremref has made it to stable store. The jremref is marked + * complete and we attempt to free it. Any pagedeps writes sleeping waiting + * for the jremref to complete will be awoken by free_jremref. + */ +static void +handle_written_jremref(jremref, jseg) + struct jremref *jremref; + struct jseg *jseg; +{ + struct inodedep *inodedep; + struct jsegdep *jsegdep; + struct dirrem *dirrem; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = inoref_segattach(&jremref->jr_ref, jseg); + /* + * Remove us from the inoref list. + */ + if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, + 0, &inodedep) == 0) + panic("handle_written_jremref: Lost inodedep"); + TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); + /* + * Complete the dirrem. 
+ */ + dirrem = jremref->jr_dirrem; + jremref->jr_dirrem = NULL; + LIST_REMOVE(jremref, jr_deps); + jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; + WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd) && + (dirrem->dm_state & COMPLETE) != 0) + add_to_worklist(&dirrem->dm_list, 0); + free_jremref(jremref); +} + +/* + * Called once a jaddref has made it to stable store. The dependency is + * marked complete and any dependent structures are added to the inode + * bufwait list to be completed as soon as it is written. If a bitmap write + * depends on this entry we move the inode into the inodedephd of the + * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. + */ +static void +handle_written_jaddref(jaddref, jseg) + struct jaddref *jaddref; + struct jseg *jseg; +{ + struct jsegdep *jsegdep; + struct inodedep *inodedep; + struct diradd *diradd; + struct mkdir *mkdir; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = inoref_segattach(&jaddref->ja_ref, jseg); + mkdir = NULL; + diradd = NULL; + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("handle_written_jaddref: Lost inodedep."); + if (jaddref->ja_diradd == NULL) + panic("handle_written_jaddref: No dependency"); + if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { + diradd = jaddref->ja_diradd; + WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); + } else if (jaddref->ja_state & MKDIR_PARENT) { + mkdir = jaddref->ja_mkdir; + WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); + } else if (jaddref->ja_state & MKDIR_BODY) + mkdir = jaddref->ja_mkdir; + else + panic("handle_written_jaddref: Unknown dependency %p", + jaddref->ja_diradd); + jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ + /* + * Remove us from the inode list. + */ + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); + /* + * The mkdir may be waiting on the jaddref to clear before freeing. + */ + if (mkdir) { + KASSERT(mkdir->md_list.wk_type == D_MKDIR, + ("handle_written_jaddref: Incorrect type for mkdir %s", + TYPENAME(mkdir->md_list.wk_type))); + mkdir->md_jaddref = NULL; + diradd = mkdir->md_diradd; + mkdir->md_state |= DEPCOMPLETE; + complete_mkdir(mkdir); + } + WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list); + if (jaddref->ja_state & NEWBLOCK) { + inodedep->id_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, + inodedep, id_deps); + } + free_jaddref(jaddref); +} + +/* + * Called once a jnewblk journal is written. The allocdirect or allocindir + * is placed in the bmsafemap to await notification of a written bitmap. + */ +static void +handle_written_jnewblk(jnewblk, jseg) + struct jnewblk *jnewblk; + struct jseg *jseg; +{ + struct bmsafemap *bmsafemap; + struct jsegdep *jsegdep; + struct newblk *newblk; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = jnewblk->jn_jsegdep; + jnewblk->jn_jsegdep = NULL; + jsegdep->jd_seg = jseg; + /* + * Add the written block to the bmsafemap so it can be notified when + * the bitmap is on disk. 
+ */ + newblk = jnewblk->jn_newblk; + jnewblk->jn_newblk = NULL; + if (newblk == NULL) + panic("handle_written_jnewblk: No dependency for the segdep."); + + newblk->nb_jnewblk = NULL; + bmsafemap = newblk->nb_bmsafemap; + WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list); + newblk->nb_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + free_jnewblk(jnewblk); +} + +/* + * Cancel a jfreefrag that won't be needed, probably due to colliding with + * an in-flight allocation that has not yet been committed. Divorce us + * from the freefrag and mark it DEPCOMPLETE so that it may be added + * to the worklist. + */ +static void +cancel_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + struct freefrag *freefrag; + + if (jfreefrag->fr_jsegdep) { + free_jsegdep(jfreefrag->fr_jsegdep); + jfreefrag->fr_jsegdep = NULL; + } + freefrag = jfreefrag->fr_freefrag; + jfreefrag->fr_freefrag = NULL; + freefrag->ff_jfreefrag = NULL; + free_jfreefrag(jfreefrag); + freefrag->ff_state |= DEPCOMPLETE; +} + +/* + * Free a jfreefrag when the parent freefrag is rendered obsolete. + */ +static void +free_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + + if (jfreefrag->fr_state & IOSTARTED) + WORKLIST_REMOVE(&jfreefrag->fr_list); + else if (jfreefrag->fr_state & ONWORKLIST) + remove_from_journal(&jfreefrag->fr_list); + if (jfreefrag->fr_freefrag != NULL) + panic("free_jfreefrag: Still attached to a freefrag."); + WORKITEM_FREE(jfreefrag, D_JFREEFRAG); +} + +/* + * Called when the journal write for a jfreefrag completes. The parent + * freefrag is added to the worklist if this completes its dependencies. + */ +static void +handle_written_jfreefrag(jfreefrag, jseg) + struct jfreefrag *jfreefrag; + struct jseg *jseg; +{ + struct jsegdep *jsegdep; + struct freefrag *freefrag; + + /* + * Attach the jsegdep to the jseg. + */ + jsegdep = jfreefrag->fr_jsegdep; + jfreefrag->fr_jsegdep = NULL; + jsegdep->jd_seg = jseg; + freefrag = jfreefrag->fr_freefrag; + if (freefrag == NULL) + panic("handle_written_jfreefrag: No freefrag."); + freefrag->ff_state |= DEPCOMPLETE; + freefrag->ff_jfreefrag = NULL; + WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); + jfreefrag->fr_freefrag = NULL; + free_jfreefrag(jfreefrag); +} + +/* + * Called when the journal write for a jfreeblk completes. The jfreeblk + * is removed from the freeblks list of pending journal writes and the + * jsegdep is moved to the freeblks jwork to be completed when all blocks + * have been reclaimed. + */ +static void +handle_written_jfreeblk(jfreeblk, jseg) + struct jfreeblk *jfreeblk; + struct jseg *jseg; +{ + struct freeblks *freeblks; + struct jsegdep *jsegdep; + + /* Attach the jsegdep to the jseg. */ + jsegdep = jfreeblk->jf_jsegdep; + jfreeblk->jf_jsegdep = NULL; + jsegdep->jd_seg = jseg; + freeblks = jfreeblk->jf_freeblks; + LIST_REMOVE(jfreeblk, jf_deps); + WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); + /* + * If the freeblks is all journaled, we can add it to the worklist. + */ + if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) && + (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) { + /* Remove from the b_dep that is waiting on this write. 
*/ + if (freeblks->fb_state & ONWORKLIST) + WORKLIST_REMOVE(&freeblks->fb_list); + add_to_worklist(&freeblks->fb_list, 1); + } + + free_jfreeblk(jfreeblk); +} + +static struct jsegdep * +newjsegdep(struct worklist *wk) +{ + struct jsegdep *jsegdep; + + jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); + jsegdep->jd_seg = NULL; + + return (jsegdep); +} + +static struct jmvref * +newjmvref(dp, ino, oldoff, newoff) + struct inode *dp; + ino_t ino; + off_t oldoff; + off_t newoff; +{ + struct jmvref *jmvref; + + jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); + jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; + jmvref->jm_parent = dp->i_number; + jmvref->jm_ino = ino; + jmvref->jm_oldoff = oldoff; + jmvref->jm_newoff = newoff; + + return (jmvref); +} + +/* + * Allocate a new jremref that tracks the removal of ip from dp with the + * directory entry offset of diroff. Mark the entry as ATTACHED and + * DEPCOMPLETE as we have all the information required for the journal write + * and the directory has already been removed from the buffer. The caller + * is responsible for linking the jremref into the pagedep and adding it + * to the journal to write. The MKDIR_PARENT flag is set if we're doing + * a DOTDOT addition so handle_workitem_remove() can properly assign + * the jsegdep when we're done. + */ +static struct jremref * +newjremref(dirrem, dp, ip, diroff, nlink) + struct dirrem *dirrem; + struct inode *dp; + struct inode *ip; + off_t diroff; + nlink_t nlink; +{ + struct jremref *jremref; + + jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); + jremref->jr_state = ATTACHED; + newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, + nlink, ip->i_mode); + jremref->jr_dirrem = dirrem; + + return (jremref); +} + +static inline void +newinoref(inoref, ino, parent, diroff, nlink, mode) + struct inoref *inoref; + ino_t ino; + ino_t parent; + off_t diroff; + nlink_t nlink; + uint16_t mode; +{ + + inoref->if_jsegdep = newjsegdep(&inoref->if_list); + inoref->if_diroff = diroff; + inoref->if_ino = ino; + inoref->if_parent = parent; + inoref->if_nlink = nlink; + inoref->if_mode = mode; +} + +/* + * Allocate a new jaddref to track the addition of ino to dp at diroff. The + * directory offset may not be known until later. The caller is responsible + * for adding the entry to the journal when this information is available. nlink + * should be the link count prior to the addition and mode is only required + * to have the correct FMT. + */ +static struct jaddref * +newjaddref(dp, ino, diroff, nlink, mode) + struct inode *dp; + ino_t ino; + off_t diroff; + int16_t nlink; + uint16_t mode; +{ + struct jaddref *jaddref; + + jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); + jaddref->ja_state = ATTACHED; + jaddref->ja_mkdir = NULL; + newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); + + return (jaddref); +} + +/* + * Create a new free dependency for a freework. The caller is responsible + * for adjusting the reference count when it has the lock held. The freedep + * will track an outstanding bitmap write that will ultimately clear the + * freework to continue. 
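*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * newfreedep()/free_freedep() just below pin a freework with one reference
 * per tracked buffer write; whoever drops the last reference queues the
 * freework.  The essence of that handoff, with the worklist reduced to a
 * printf.  In the patch the caller adjusts fw_ref while holding the lock;
 * for brevity the constructor does it here, and error handling is elided.
 */
#include <stdio.h>
#include <stdlib.h>

struct freework { int ref; };
struct freedep { struct freework *fw; };

static struct freedep *
newfreedep(struct freework *fw)
{
	struct freedep *fd = malloc(sizeof(*fd));

	fd->fw = fw;
	fw->ref++;
	return (fd);
}

static void
free_freedep(struct freedep *fd)
{
	if (--fd->fw->ref == 0)
		printf("last freedep gone: queue the freework\n");
	free(fd);
}

int
main(void)
{
	struct freework fw = { 0 };
	struct freedep *a = newfreedep(&fw), *b = newfreedep(&fw);

	free_freedep(a);
	free_freedep(b);	/* triggers the freework */
	return (0);
}
/* [the quoted patch resumes]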
+ */ +static struct freedep * +newfreedep(struct freework *freework) +{ + struct freedep *freedep; + + freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); + freedep->fd_freework = freework; + + return (freedep); +} + +/* + * Free a freedep structure once the buffer it is linked to is written. If + * this is the last reference to the freework schedule it for completion. + */ +static void +free_freedep(freedep) + struct freedep *freedep; +{ + + if (--freedep->fd_freework->fw_ref == 0) + add_to_worklist(&freedep->fd_freework->fw_list, 1); + WORKITEM_FREE(freedep, D_FREEDEP); +} + +/* + * Allocate a new freework structure that may be a level in an indirect + * when parent is not NULL or a top level block when it is. The top level + * freework structures are allocated without lk held and before the freeblks + * is visible outside of softdep_setup_freeblocks(). + */ +static struct freework * +newfreework(freeblks, parent, lbn, nb, frags, journal) + struct freeblks *freeblks; + struct freework *parent; + ufs_lbn_t lbn; + ufs2_daddr_t nb; + int frags; + int journal; +{ + struct freework *freework; + + freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); + workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); + freework->fw_freeblks = freeblks; + freework->fw_parent = parent; + freework->fw_lbn = lbn; + freework->fw_blkno = nb; + freework->fw_frags = frags; + freework->fw_ref = 0; + freework->fw_off = 0; + LIST_INIT(&freework->fw_jwork); + + if (parent == NULL) { + WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd, + &freework->fw_list); + freeblks->fb_ref++; + } + if (journal) + newjfreeblk(freeblks, lbn, nb, frags); + + return (freework); +} + +/* + * Allocate a new jfreeblk to journal top level block pointer when truncating + * a file. The caller must add this to the worklist when lk is held. + */ +static struct jfreeblk * +newjfreeblk(freeblks, lbn, blkno, frags) + struct freeblks *freeblks; + ufs_lbn_t lbn; + ufs2_daddr_t blkno; + int frags; +{ + struct jfreeblk *jfreeblk; + + jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp); + jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list); + jfreeblk->jf_state = ATTACHED | DEPCOMPLETE; + jfreeblk->jf_ino = freeblks->fb_previousinum; + jfreeblk->jf_lbn = lbn; + jfreeblk->jf_blkno = blkno; + jfreeblk->jf_frags = frags; + jfreeblk->jf_freeblks = freeblks; + LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps); + + return (jfreeblk); +} + +static void move_newblock_dep(struct jaddref *, struct inodedep *); +/* + * If we're canceling a new bitmap we have to search for another ref + * to move into the bmsafemap dep. This might be better expressed + * with another structure. 
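*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * When cancel_jaddref() below discards a link-add that was never
 * journaled, every reference operation queued after it recorded a link
 * count that assumed the add; those snapshots must be walked back so the
 * journal stays consistent with memory.  Modeling the inoreflst as a
 * simple array (all names invented):
 */
#include <stdio.h>

struct inoref { int nlink; };

static void
cancel_ref(struct inoref *refs, int idx, int nrefs, int journaled)
{
	int i;

	/* Only unjournaled cancels adjust the later nlink snapshots. */
	if (!journaled)
		for (i = idx + 1; i < nrefs; i++)
			refs[i].nlink--;
}

int
main(void)
{
	struct inoref refs[] = { {1}, {2}, {3} };	/* growing nlink */

	cancel_ref(refs, 0, 3, 0);	/* cancel the first, unjournaled */
	printf("%d %d\n", refs[1].nlink, refs[2].nlink);	/* 1 2 */
	return (0);
}
/* [the quoted patch resumes]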
+ */ +static void +move_newblock_dep(jaddref, inodedep) + struct jaddref *jaddref; + struct inodedep *inodedep; +{ + struct inoref *inoref; + struct jaddref *jaddrefn; + + jaddrefn = NULL; + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) { + if ((jaddref->ja_state & NEWBLOCK) && + inoref->if_list.wk_type == D_JADDREF) { + jaddrefn = (struct jaddref *)inoref; + break; + } + } + if (jaddrefn == NULL) + return; + if (inodedep == NULL) + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("move_newblock_dep: Lost inodedep"); + jaddrefn->ja_state &= ~(ATTACHED | UNDONE); + jaddrefn->ja_state |= jaddref->ja_state & + (ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state |= ATTACHED; + LIST_REMOVE(jaddref, ja_bmdeps); + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, + ja_bmdeps); +} + +/* + * Cancel a jaddref either before it has been written or while it is being + * written. This happens when a link is removed before the add reaches + * the disk. The jaddref dependency is kept linked into the bmsafemap + * and inode to prevent the link count or bitmap from reaching the disk + * until handle_workitem_remove() re-adjusts the counts and bitmaps as + * required. + * + * Returns 1 if the canceled addref requires journaling of the remove and + * 0 otherwise. + */ +static int +cancel_jaddref(jaddref, inodedep, wkhd) + struct jaddref *jaddref; + struct inodedep *inodedep; + struct workhead *wkhd; +{ + struct inoref *inoref; + int needsj; + + KASSERT((jaddref->ja_state & COMPLETE) == 0, + ("cancel_jaddref: Canceling complete jaddref")); + if (jaddref->ja_state & (IOSTARTED | COMPLETE)) + needsj = 1; + else + needsj = 0; + /* + * If we're not journaling this remove we must adjust the nlink of + * any reference operation that follows us so that it is consistent + * with the in-memory reference. + */ + if (needsj == 0) + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) + inoref->if_nlink--; + if (jaddref->ja_ref.if_jsegdep) { + free_jsegdep(jaddref->ja_ref.if_jsegdep); + jaddref->ja_ref.if_jsegdep = NULL; + } + if (jaddref->ja_state & NEWBLOCK) + move_newblock_dep(jaddref, inodedep); + if (jaddref->ja_state & IOWAITING) { + jaddref->ja_state &= ~IOWAITING; + wakeup(&jaddref->ja_list); + } + jaddref->ja_mkdir = NULL; + if (jaddref->ja_state & IOSTARTED) { + jaddref->ja_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jaddref->ja_list); + } else + remove_from_journal(&jaddref->ja_list); + jaddref->ja_state |= GOINGAWAY; + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jaddref->ja_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jaddref->ja_list); + + return (needsj); +} + +/* + * Attempt to free a jaddref structure when some work completes. This + * should only succeed once the entry is written and all dependencies have + * been notified. 
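*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * free_jaddref() below is called from several completion paths but only
 * acts once ALLCOMPLETE is reached, i.e. once every prerequisite bit is
 * set.  The same gating pattern appears throughout the patch (a block
 * allocation, for instance, needs both its journal record and the
 * cylinder-group bitmap on disk).  The pattern in miniature, with two
 * invented prerequisite bits standing in for the real state flags:
 */
#include <stdio.h>

#define JOURNAL_WRITTEN	0x01
#define BITMAP_WRITTEN	0x02
#define ALLCOMPLETE	(JOURNAL_WRITTEN | BITMAP_WRITTEN)

struct dep { int state; };

static void
dep_complete(struct dep *d, int bit)
{
	d->state |= bit;
	if ((d->state & ALLCOMPLETE) == ALLCOMPLETE)
		printf("dependency satisfied; work may proceed\n");
}

int
main(void)
{
	struct dep d = { 0 };

	dep_complete(&d, JOURNAL_WRITTEN);	/* journal record written... */
	dep_complete(&d, BITMAP_WRITTEN);	/* ...then the bitmap */
	return (0);
}
/* [the quoted patch resumes]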
+ */ +static void +free_jaddref(jaddref) + struct jaddref *jaddref; +{ + + if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + if (jaddref->ja_ref.if_jsegdep) + panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", + jaddref, jaddref->ja_state); + if (jaddref->ja_state & NEWBLOCK) + LIST_REMOVE(jaddref, ja_bmdeps); + if (jaddref->ja_state & (IOSTARTED | ONWORKLIST)) + panic("free_jaddref: Bad state %p(0x%X)", + jaddref, jaddref->ja_state); + if (jaddref->ja_mkdir != NULL) + panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); + WORKITEM_FREE(jaddref, D_JADDREF); +} + +/* + * Free a jremref structure once it has been written or discarded. + */ +static void +free_jremref(jremref) + struct jremref *jremref; +{ + + if (jremref->jr_ref.if_jsegdep) + free_jsegdep(jremref->jr_ref.if_jsegdep); + if (jremref->jr_state & IOSTARTED) + panic("free_jremref: IO still pending"); + WORKITEM_FREE(jremref, D_JREMREF); +} + +/* + * Free a jnewblk structure. + */ +static void +free_jnewblk(jnewblk) + struct jnewblk *jnewblk; +{ + + if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(jnewblk, jn_deps); + if (jnewblk->jn_newblk != NULL) + panic("free_jnewblk: Dependency still attached."); + WORKITEM_FREE(jnewblk, D_JNEWBLK); +} + +/* + * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk + * is kept linked into the bmsafemap until the free completes, thus + * preventing the modified state from ever reaching disk. The free + * routine must pass this structure via ffs_blkfree() to + * softdep_setup_freeblks() so there is no race in releasing the space. + */ +static void +cancel_jnewblk(jnewblk, wkhd) + struct jnewblk *jnewblk; + struct workhead *wkhd; +{ + + if (jnewblk->jn_jsegdep) { + free_jsegdep(jnewblk->jn_jsegdep); + jnewblk->jn_jsegdep = NULL; + } + if (jnewblk->jn_state & IOWAITING) { + jnewblk->jn_state &= ~IOWAITING; + wakeup(&jnewblk->jn_list); + } + jnewblk->jn_newblk = NULL; + jnewblk->jn_state |= GOINGAWAY; + if (jnewblk->jn_state & IOSTARTED) { + jnewblk->jn_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jnewblk->jn_list); + } else + remove_from_journal(&jnewblk->jn_list); + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jnewblk->jn_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jnewblk->jn_list); +} + +static void +free_jfreeblk(jfreeblk) + struct jfreeblk *jfreeblk; +{ + + WORKITEM_FREE(jfreeblk, D_JFREEBLK); +} + +/* + * Release one reference to a jseg and free it if the count reaches 0. This + * should eventually reclaim journal space as well. + */ +static void +free_jseg(jseg) + struct jseg *jseg; +{ + struct jblocks *jblocks; + + KASSERT(jseg->js_refs > 0, + ("free_jseg: Invalid refcnt %d", jseg->js_refs)); + if (--jseg->js_refs != 0) + return; + /* + * Free only those jsegs which have none allocated before them to + * preserve the journal space ordering. + */ + jblocks = jseg->js_jblocks; + while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { + jblocks->jb_oldestseq = jseg->js_seq; + if (jseg->js_refs != 0) + break; + TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); + jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); + KASSERT(LIST_EMPTY(&jseg->js_entries), + ("free_jseg: Freed jseg has valid entries.")); + WORKITEM_FREE(jseg, D_JSEG); + } +} + +/* + * Release a jsegdep and decrement the jseg count. 
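*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * free_jseg() above only reclaims segments from the head of the queue, so
 * journal space is released strictly in allocation order and jb_oldestseq
 * never moves backwards.  A standalone model of that reclaim loop, with
 * the segment queue reduced to an array:
 */
#include <stdio.h>

struct seg { int seq; int refs; };

int
main(void)
{
	struct seg segs[] = { {10, 0}, {11, 2}, {12, 0} };
	int head = 0, nsegs = 3, oldestseq = 10;

	/* Free only the prefix with no remaining references. */
	while (head < nsegs) {
		oldestseq = segs[head].seq;
		if (segs[head].refs != 0)
			break;
		printf("reclaim seg %d\n", segs[head].seq);
		head++;
	}
	/* Seq 12 stays allocated although unreferenced: ordering. */
	printf("oldest active seq: %d\n", oldestseq);	/* 11 */
	return (0);
}
/* [the quoted patch resumes]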
+ */ +static void +free_jsegdep(jsegdep) + struct jsegdep *jsegdep; +{ + + if (jsegdep->jd_seg) + free_jseg(jsegdep->jd_seg); + WORKITEM_FREE(jsegdep, D_JSEGDEP); +} + +/* + * Wait for a journal item to make it to disk. Initiate journal processing + * if required. + */ +static void +jwait(wk) + struct worklist *wk; +{ + + /* + * If IO has not started we process the journal. We can't mark the + * worklist item as IOWAITING because we drop the lock while + * processing the journal and the worklist entry may be freed after + * this point. The caller may call back in and re-issue the request. + */ + if ((wk->wk_state & IOSTARTED) == 0) { + softdep_process_journal(wk->wk_mp, MNT_WAIT); + return; + } + wk->wk_state |= IOWAITING; + msleep(wk, &lk, PRIBIO, "jwait", 0); +} + +/* + * Lookup an inodedep based on an inode pointer and set the nlinkdelta as + * appropriate. This is a convenience function to reduce duplicate code + * for the setup and revert functions below. + */ +static struct inodedep * +inodedep_lookup_ip(ip) + struct inode *ip; +{ + struct inodedep *inodedep; + + KASSERT(ip->i_nlink >= ip->i_effnlink, + ("inodedep_lookup_ip: bad delta")); + (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, + DEPALLOC, &inodedep); + inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; + + return (inodedep); +} + +/* + * Called prior to creating a new inode and linking it to a directory. The + * jaddref structure must already be allocated by softdep_setup_inomapdep + * and it is discovered here so we can initialize the mode and update + * nlinkdelta. + */ +void +softdep_setup_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(ip->i_nlink == 1, + ("softdep_setup_create: Invalid link count.")); + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_create: No addref structure present.")); + jaddref->ja_mode = ip->i_mode; + softdep_prelink(dvp, NULL); + } + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track the addition of a DOTDOT link when + * we are reparenting an inode as part of a rename. This jaddref will be + * found by softdep_setup_directory_change. Adjusts nlinkdelta for + * non-journaling softdep. + */ +void +softdep_setup_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + jaddref = NULL; + /* + * We don't set MKDIR_PARENT as this is not tied to a mkdir and + * is used as a normal link would be. + */ + if (DOINGSUJ(dvp)) + jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (jaddref) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + } + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track a new link to an inode. The directory + * offset is not known until softdep_setup_directory_add or + * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling + * softdep. 
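*/
/*
 * [Editor's note -- worked example, not part of the quoted patch.]
 * inodedep_lookup_ip() above captures nlinkdelta = i_nlink - i_effnlink,
 * the number of link removals that are effective in memory but not yet
 * committed on disk.  A one-line illustration of the invariant; the
 * assert mirrors the KASSERT in the patch.
 */
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	int i_nlink = 3;	/* links the on-disk inode still claims */
	int i_effnlink = 2;	/* links that remain after pending removes */

	assert(i_nlink >= i_effnlink);
	printf("nlinkdelta = %d\n", i_nlink - i_effnlink);	/* 1 */
	return (0);
}
/* [the quoted patch resumes]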
+ */ +void +softdep_setup_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + jaddref = NULL; + if (DOINGSUJ(dvp)) + jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, + ip->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (jaddref) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + } + FREE_LOCK(&lk); +} + +/* + * Called to create the jaddref structures to track . and .. references as + * well as lookup and further initialize the incomplete jaddref created + * by softdep_setup_inomapdep when the inode was allocated. Adjusts + * nlinkdelta for non-journaling softdep. + */ +void +softdep_setup_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *dotdotaddref; + struct jaddref *dotaddref; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + dotaddref = dotdotaddref = NULL; + if (DOINGSUJ(dvp)) { + dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, + ip->i_mode); + dotaddref->ja_state |= MKDIR_BODY; + dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + dotdotaddref->ja_state |= MKDIR_PARENT; + } + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL, + ("softdep_setup_mkdir: No addref structure present.")); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_setup_mkdir: bad parent %d", + jaddref->ja_parent)); + jaddref->ja_mode = ip->i_mode; + TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, + if_deps); + } + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, + &dotdotaddref->ja_ref, if_deps); + softdep_prelink(ITOV(dp), NULL); + } + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlinking a directory. + */ +void +softdep_setup_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlink. + */ +void +softdep_setup_unlink(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed non-directory + * creation. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_create: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed dotdot link + * creation. 
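*/
/*
 * [Editor's note -- illustrative sketch, not part of the quoted patch.]
 * The softdep_revert_* routines above and below share one pattern: the
 * reference created most recently by the matching setup routine sits at
 * the tail of id_inoreflst (TAILQ_LAST), so a failed operation cancels
 * exactly that ref after sanity-checking its parent.  A toy version with
 * the list as an array (names invented):
 */
#include <assert.h>
#include <stdio.h>

struct ref { int parent; };

int
main(void)
{
	struct ref refs[8];
	int nrefs = 0, dp_ino = 5;

	refs[nrefs++] = (struct ref){ dp_ino };	/* setup: newest at tail */
	/* ...the link fails; revert cancels the newest ref... */
	assert(refs[nrefs - 1].parent == dp_ino);	/* as the KASSERTs */
	nrefs--;
	printf("%d refs left\n", nrefs);
	return (0);
}
/* [the quoted patch resumes]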
Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_dotdot_link: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed link + * addition. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_link: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed mkdir + * attempt. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + dvp = ITOV(dp); + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dotdot addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_mkdir: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dot addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(&lk); +} + +/* + * Called to correct nlinkdelta after a failed rmdir. + */ +void +softdep_revert_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + FREE_LOCK(&lk); +} + +/* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem @@ -1536,22 +3788,44 @@ softdep_setup_inomapdep(bp, ip, newinum) { struct inodedep *inodedep; struct bmsafemap *bmsafemap; + struct jaddref *jaddref; + struct mount *mp; + struct fs *fs; + mp = UFSTOVFS(ip->i_ump); + fs = ip->i_ump->um_fs; + jaddref = NULL; + /* + * Allocate the journal reference add structure so that the bitmap + * can be dependent on it. + */ + if (mp->mnt_flag & MNT_SUJ) { + jaddref = newjaddref(ip, newinum, 0, 0, 0); + jaddref->ja_state |= NEWBLOCK; + } + + /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. 
 * Otherwise add it to the dependency list for the buffer holding
 * the cylinder group map from which it was allocated.
 */
	ACQUIRE_LOCK(&lk);
-	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
-	    &inodedep)))
-		panic("softdep_setup_inomapdep: dependency for new inode "
-		    "already exists");
-	inodedep->id_buf = bp;
+	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
+		panic("softdep_setup_inomapdep: dependency %p for new "
+		    "inode already exists", inodedep);
+	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
+	if (jaddref) {
+		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
+		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+		    if_deps);
+	} else {
+		inodedep->id_state |= ONDEPLIST;
+		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
+	}
+	inodedep->id_bmsafemap = bmsafemap;
	inodedep->id_state &= ~DEPCOMPLETE;
-	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
-	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

@@ -1560,29 +3834,98 @@ softdep_setup_inomapdep(bp, ip, newinum)
 * allocate block or fragment.
 */
void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct mount *mp;	/* filesystem doing allocation */
	ufs2_daddr_t newblkno;	/* number of newly allocated block */
+	int frags;		/* Number of fragments. */
+	int oldfrags;		/* Previous number of fragments for extend. */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;
+	struct jnewblk *jnewblk;
	struct fs *fs;

	fs = VFSTOUFS(mp)->um_fs;
+	jnewblk = NULL;
	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
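	 *
	 * (Sketch, our paraphrase of the code below.)  A new block either
	 * rides on its journal record or is tracked directly by the cg
	 * buffer's bmsafemap until the bitmap write completes:
	 *
	 *	if (jnewblk != NULL)
	 *		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk,
	 *		    jn_deps);
	 *	else {
	 *		newblk->nb_state |= ONDEPLIST;
	 *		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
	 *		    nb_deps);
	 *	}
	 *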
*/ + if (mp->mnt_flag & MNT_SUJ) { + jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); + jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); + jnewblk->jn_state = ATTACHED; + jnewblk->jn_blkno = newblkno; + jnewblk->jn_frags = frags; + jnewblk->jn_oldfrags = oldfrags; +#ifdef SUJ_DEBUG + { + struct cg *cgp; + uint8_t *blksfree; + long bno; + int i; + + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (isset(blksfree, bno + i)) + panic("softdep_setup_blkmapdep: " + "free fragment %d from %d-%d " + "state 0x%X dep %p", i, + jnewblk->jn_oldfrags, + jnewblk->jn_frags, + jnewblk->jn_state, + jnewblk->jn_newblk); + } + } +#endif + } ACQUIRE_LOCK(&lk); - if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) + if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); - newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp); - LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, + dtog(fs, newblkno)); + if (jnewblk) { + jnewblk->jn_newblk = newblk; + LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); + } else { + newblk->nb_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + } + newblk->nb_bmsafemap = bmsafemap; + newblk->nb_jnewblk = jnewblk; FREE_LOCK(&lk); } +#define BMSAFEMAP_HASH(fs, cg) \ + (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) + +static int +bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) + struct bmsafemap_hashhead *bmsafemaphd; + struct mount *mp; + int cg; + struct bmsafemap **bmsafemapp; +{ + struct bmsafemap *bmsafemap; + + LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) + if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) + break; + if (bmsafemap) { + *bmsafemapp = bmsafemap; + return (1); + } + *bmsafemapp = NULL; + + return (0); +} + /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when @@ -1590,27 +3933,43 @@ void * splbio interrupts blocked. 
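 *
 * (Aside, ours.)  The lookup below uses the familiar unlock,
 * allocate, relock, and recheck idiom, discarding the fresh
 * allocation when another thread won the race:
 *
 *	if (bmsafemap_find(hd, mp, cg, &bmsafemap) == 1)
 *		return (bmsafemap);
 *	FREE_LOCK(&lk);			(allocation may sleep)
 *	bmsafemap = malloc(sizeof(*bmsafemap), ...);
 *	ACQUIRE_LOCK(&lk);
 *	if (bmsafemap_find(hd, mp, cg, &collision) == 1) {
 *		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 *		return (collision);
 *	}
 *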
*/ static struct bmsafemap * -bmsafemap_lookup(mp, bp) +bmsafemap_lookup(mp, bp, cg) struct mount *mp; struct buf *bp; + int cg; { - struct bmsafemap *bmsafemap; + struct bmsafemap_hashhead *bmsafemaphd; + struct bmsafemap *bmsafemap, *collision; struct worklist *wk; + struct fs *fs; mtx_assert(&lk, MA_OWNED); - LIST_FOREACH(wk, &bp->b_dep, wk_list) - if (wk->wk_type == D_BMSAFEMAP) - return (WK_BMSAFEMAP(wk)); + if (bp) + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_BMSAFEMAP) + return (WK_BMSAFEMAP(wk)); + fs = VFSTOUFS(mp)->um_fs; + bmsafemaphd = BMSAFEMAP_HASH(fs, cg); + if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) + return (bmsafemap); FREE_LOCK(&lk); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); bmsafemap->sm_buf = bp; - LIST_INIT(&bmsafemap->sm_allocdirecthd); - LIST_INIT(&bmsafemap->sm_allocindirhd); LIST_INIT(&bmsafemap->sm_inodedephd); + LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); + LIST_INIT(&bmsafemap->sm_newblkwr); + LIST_INIT(&bmsafemap->sm_jaddrefhd); + LIST_INIT(&bmsafemap->sm_jnewblkhd); ACQUIRE_LOCK(&lk); + if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (collision); + } + bmsafemap->sm_cg = cg; + LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } @@ -1645,9 +4004,9 @@ static struct bmsafemap * * unreferenced fragments. */ void -softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) +softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ - ufs_lbn_t lbn; /* block pointer within inode */ + ufs_lbn_t off; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ @@ -1656,34 +4015,33 @@ void { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; - struct bmsafemap *bmsafemap; + struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; + struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; + ufs_lbn_t lbn; + lbn = bp->b_lblkno; mp = UFSTOVFS(ip->i_ump); - adp = malloc(sizeof(struct allocdirect), - M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); - adp->ad_lbn = lbn; - adp->ad_newblkno = newblkno; - adp->ad_oldblkno = oldblkno; - adp->ad_newsize = newsize; - adp->ad_oldsize = oldsize; - adp->ad_state = ATTACHED; - LIST_INIT(&adp->ad_newdirblk); - if (newblkno == oldblkno) - adp->ad_freefrag = NULL; + if (oldblkno && oldblkno != newblkno) + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else - adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + freefrag = NULL; ACQUIRE_LOCK(&lk); - if (lbn >= NDADDR) { + if (off >= NDADDR) { + if (lbn > 0) + panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", + lbn, off); /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { + if (off != lbn) + panic("softdep_setup_allocdirect: lbn %jd != off %jd", + lbn, off); /* * Allocating a direct block. * @@ -1692,26 +4050,39 @@ void * deletions. 
*/ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, off, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } - if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) + if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("softdep_setup_allocdirect: newblk already initialized")); + /* + * Convert the newblk to an allocdirect. + */ + newblk->nb_list.wk_type = D_ALLOCDIRECT; + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + + /* + * Finish initializing the journal. + */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); - + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; - WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); + + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the @@ -1726,24 +4097,25 @@ void */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(&lk); } @@ -1761,10 +4133,11 @@ allocdirect_merge(adphead, newadp, oldadp) struct freefrag *freefrag; struct newdirblk *newdirblk; + freefrag = NULL; mtx_assert(&lk, MA_OWNED); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || - newadp->ad_lbn >= NDADDR) + newadp->ad_offset >= NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, @@ -1779,7 +4152,7 @@ allocdirect_merge(adphead, newadp, oldadp) * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on - * the worklist when it is freed by free_allocdirect. It is + * the worklist when it is freed by free_newblk. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. 
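 *
 * (Our shorthand for the swap performed just below.)  The obligation
 * to free the old fragment is handed to whichever dependency will
 * complete last:
 *
 *	freefrag = newadp->ad_freefrag;
 *	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
 *		newadp->ad_freefrag = oldadp->ad_freefrag;
 *		oldadp->ad_freefrag = freefrag;
 *	}
 *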
@@ -1788,8 +4161,8 @@ allocdirect_merge(adphead, newadp, oldadp) * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ + freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { - freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } @@ -1804,32 +4177,118 @@ allocdirect_merge(adphead, newadp, oldadp) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); } - free_allocdirect(adphead, oldadp, 0); + TAILQ_REMOVE(adphead, oldadp, ad_next); + /* + * We need to move any journal dependencies over to the freefrag + * that releases this block if it exists. Otherwise we are + * extending an existing block and we'll wait until that is + * complete to release the journal space and extend the + * new journal to cover this old space as well. + */ + if (freefrag == NULL) { + struct jnewblk *jnewblk; + struct jnewblk *njnewblk; + + if (oldadp->ad_newblkno != newadp->ad_newblkno) + panic("allocdirect_merge: %jd != %jd", + oldadp->ad_newblkno, newadp->ad_newblkno); + jnewblk = oldadp->ad_block.nb_jnewblk; + cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork); + /* + * We have an unwritten jnewblk, we need to merge the + * frag bits with our own. The newer adp's journal can not + * be written prior to the old one so no need to check for + * it here. + */ + if (jnewblk) { + njnewblk = newadp->ad_block.nb_jnewblk; + if (njnewblk == NULL) + panic("allocdirect_merge: No jnewblk"); + if (jnewblk->jn_state & UNDONE) { + njnewblk->jn_state |= UNDONE | NEWBLOCK; + njnewblk->jn_state &= ~ATTACHED; + jnewblk->jn_state &= ~UNDONE; + } + njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; + WORKLIST_REMOVE(&jnewblk->jn_list); + jnewblk->jn_state |= ATTACHED | COMPLETE; + free_jnewblk(jnewblk); + } + } else { + /* + * We can skip journaling for this freefrag and just complete + * any pending journal work for the allocdirect that is being + * removed after the freefrag completes. + */ + if (freefrag->ff_jfreefrag) + cancel_jfreefrag(freefrag->ff_jfreefrag); + cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork); + } + free_newblk(&oldadp->ad_block); } - + /* - * Allocate a new freefrag structure if needed. + * Allocate a jfreefrag structure to journal a single block free. */ +static struct jfreefrag * +newjfreefrag(freefrag, ip, blkno, size, lbn) + struct freefrag *freefrag; + struct inode *ip; + ufs2_daddr_t blkno; + long size; + ufs_lbn_t lbn; +{ + struct jfreefrag *jfreefrag; + struct fs *fs; + + fs = ip->i_fs; + jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, + M_SOFTDEP_FLAGS); + workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); + jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); + jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; + jfreefrag->fr_ino = ip->i_number; + jfreefrag->fr_lbn = lbn; + jfreefrag->fr_blkno = blkno; + jfreefrag->fr_frags = numfrags(fs, size); + jfreefrag->fr_freefrag = freefrag; + + return (jfreefrag); +} + +/* + * Allocate a new freefrag structure. 
+ */
static struct freefrag *
-newfreefrag(ip, blkno, size)
+newfreefrag(ip, blkno, size, lbn)
	struct inode *ip;
	ufs2_daddr_t blkno;
	long size;
+	ufs_lbn_t lbn;
{
	struct freefrag *freefrag;
	struct fs *fs;

-	if (blkno == 0)
-		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	freefrag = malloc(sizeof(struct freefrag),
-	    M_FREEFRAG, M_SOFTDEP_FLAGS);
+	    M_FREEFRAG, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
+	freefrag->ff_state = ATTACHED;
+	LIST_INIT(&freefrag->ff_jwork);
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
+
+	if (fs->fs_flags & FS_SUJ) {
+		freefrag->ff_jfreefrag =
+		    newjfreefrag(freefrag, ip, blkno, size, lbn);
+	} else {
+		freefrag->ff_state |= DEPCOMPLETE;
+		freefrag->ff_jfreefrag = NULL;
+	}
+
	return (freefrag);
}

@@ -1842,9 +4301,17 @@ handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
+	struct workhead wkhd;

+	/*
+	 * It would be illegal to add new completion items to the
+	 * freefrag after it was scheduled to be done, so it must be
+	 * safe to modify the list head here.
+	 */
+	LIST_INIT(&wkhd);
+	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
-	    freefrag->ff_fragsize, freefrag->ff_inum);
+	    freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
	ACQUIRE_LOCK(&lk);
	WORKITEM_FREE(freefrag, D_FREEFRAG);
	FREE_LOCK(&lk);

@@ -1856,9 +4323,9 @@ handle_workitem_freefrag(freefrag)
 * See the description of softdep_setup_allocdirect above for details.
 */
void
-softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
+softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
-	ufs_lbn_t lbn;
+	ufs_lbn_t off;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
@@ -1867,50 +4334,55 @@ void
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
-	struct bmsafemap *bmsafemap;
+	struct freefrag *freefrag;
	struct inodedep *inodedep;
+	struct jnewblk *jnewblk;
	struct newblk *newblk;
	struct mount *mp;
+	ufs_lbn_t lbn;

+	if (off >= NXADDR)
+		panic("softdep_setup_allocext: lbn %lld > NXADDR",
+		    (long long)off);
+
+	lbn = bp->b_lblkno;
	mp = UFSTOVFS(ip->i_ump);
-	adp = malloc(sizeof(struct allocdirect),
-	    M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
-	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
-	adp->ad_lbn = lbn;
-	adp->ad_newblkno = newblkno;
-	adp->ad_oldblkno = oldblkno;
-	adp->ad_newsize = newsize;
-	adp->ad_oldsize = oldsize;
-	adp->ad_state = ATTACHED | EXTDATA;
-	LIST_INIT(&adp->ad_newdirblk);
-	if (newblkno == oldblkno)
-		adp->ad_freefrag = NULL;
+	if (oldblkno && oldblkno != newblkno)
+		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
	else
-		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
+		freefrag = NULL;
	ACQUIRE_LOCK(&lk);
-	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
+	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocext: lost block");
+	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+	    ("softdep_setup_allocext: newblk already initialized"));
+	/*
+	 * Convert the newblk to an allocdirect.
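+	 *
+	 * (Aside, ours.)  The conversion relies on struct allocdirect
+	 * embedding a struct newblk as its first member, so the record
+	 * is relabeled in place rather than reallocated:
+	 *
+	 *	newblk->nb_list.wk_type = D_ALLOCDIRECT;
+	 *	adp = (struct allocdirect *)newblk;
+	 *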
+ */ + newblk->nb_list.wk_type = D_ALLOCDIRECT; + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + adp->ad_state |= EXTDATA; + /* + * Finish initializing the journal. + */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; - if (newblk->nb_state == DEPCOMPLETE) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); - } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); - - WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); - if (lbn >= NXADDR) - panic("softdep_setup_allocext: lbn %lld > NXADDR", - (long long)lbn); + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the @@ -1925,23 +4397,23 @@ void */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } @@ -1975,22 +4447,39 @@ void * Allocate a new allocindir structure. 
*/ static struct allocindir * -newallocindir(ip, ptrno, newblkno, oldblkno) +newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ + ufs_lbn_t lbn; { + struct newblk *newblk; struct allocindir *aip; + struct freefrag *freefrag; + struct jnewblk *jnewblk; - aip = malloc(sizeof(struct allocindir), - M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump)); - aip->ai_state = ATTACHED; + if (oldblkno) + freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); + else + freefrag = NULL; + ACQUIRE_LOCK(&lk); + if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) + panic("new_allocindir: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("newallocindir: newblk already initialized")); + newblk->nb_list.wk_type = D_ALLOCINDIR; + newblk->nb_freefrag = freefrag; + aip = (struct allocindir *)newblk; aip->ai_offset = ptrno; - aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; - aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jfreefrag != NULL) + add_to_journal(&freefrag->ff_jfreefrag->fr_list); return (aip); } @@ -2008,22 +4497,28 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { + struct inodedep *inodedep; struct allocindir *aip; struct pagedep *pagedep; + struct mount *mp; + if (lbn != nbp->b_lblkno) + panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", + lbn, bp->b_lblkno); ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); - aip = newallocindir(ip, ptrno, newblkno, oldblkno); - ACQUIRE_LOCK(&lk); + mp = UFSTOVFS(ip->i_ump); + aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. 
*/ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); - WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } @@ -2039,38 +4534,68 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { + struct inodedep *inodedep; struct allocindir *aip; + ufs_lbn_t lbn; + lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); - aip = newallocindir(ip, ptrno, newblkno, 0); - ACQUIRE_LOCK(&lk); - WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + aip = newallocindir(ip, ptrno, newblkno, 0, lbn); + inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } +static void +indirdep_complete(indirdep) + struct indirdep *indirdep; +{ + struct allocindir *aip; + + LIST_REMOVE(indirdep, ir_next); + indirdep->ir_state &= ~ONDEPLIST; + + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + free_newblk(&aip->ai_block); + } + /* + * If this indirdep is not attached to a buf it was simply waiting + * on completion to clear completehd. free_indirdep() asserts + * that nothing is dangling. + */ + if ((indirdep->ir_state & ONWORKLIST) == 0) + free_indirdep(indirdep); +} + /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void -setup_allocindir_phase2(bp, ip, aip) +setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ + struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ + ufs_lbn_t lbn; /* Logical block number for this block. 
*/ { struct worklist *wk; + struct fs *fs; + struct newblk *newblk; struct indirdep *indirdep, *newindirdep; - struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; - struct newblk *newblk; + struct mount *mp; ufs2_daddr_t blkno; + mp = UFSTOVFS(ip->i_ump); + fs = ip->i_fs; mtx_assert(&lk, MA_OWNED); if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); - for (indirdep = NULL, newindirdep = NULL; ; ) { + for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; @@ -2079,49 +4604,41 @@ static void } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; + newindirdep = NULL; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); - newindirdep = NULL; + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, + &newblk)) { + indirdep->ir_state |= ONDEPLIST; + LIST_INSERT_HEAD(&newblk->nb_indirdeps, + indirdep, ir_next); + } else + indirdep->ir_state |= DEPCOMPLETE; } if (indirdep) { - if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, - &newblk) == 0) - panic("setup_allocindir: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { - aip->ai_state |= DEPCOMPLETE; - aip->ai_buf = NULL; - } else { - bmsafemap = newblk->nb_bmsafemap; - aip->ai_buf = bmsafemap->sm_buf; - LIST_REMOVE(newblk, nb_deps); - LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, - aip, ai_deps); - } - LIST_REMOVE(newblk, nb_hash); - free(newblk, M_NEWBLK); aip->ai_indirdep = indirdep; /* * Check to see if there is an existing dependency * for this block. If there is, merge the old - * dependency into the new one. + * dependency into the new one. This happens + * as a result of reallocblk only. */ if (aip->ai_oldblkno == 0) oldaip = NULL; else - LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) + LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, + ai_next) if (oldaip->ai_offset == aip->ai_offset) break; - freefrag = NULL; - if (oldaip != NULL) { - if (oldaip->ai_newblkno != aip->ai_oldblkno) - panic("setup_allocindir_phase2: blkno"); - aip->ai_oldblkno = oldaip->ai_oldblkno; - freefrag = aip->ai_freefrag; - aip->ai_freefrag = oldaip->ai_freefrag; - oldaip->ai_freefrag = NULL; - free_allocindir(oldaip, NULL); - } + if (oldaip != NULL) + freefrag = allocindir_merge(aip, oldaip); LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); + KASSERT(aip->ai_offset >= 0 && + aip->ai_offset < NINDIR(ip->i_ump->um_fs), + ("setup_allocindir_phase2: Bad offset %d", + aip->ai_offset)); + KASSERT(indirdep->ir_savebp != NULL, + ("setup_allocindir_phase2 NULL ir_savebp")); if (ip->i_ump->um_fstype == UFS1) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; @@ -2148,13 +4665,16 @@ static void } newindirdep = malloc(sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); - workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, - UFSTOVFS(ip->i_ump)); + workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; if (ip->i_ump->um_fstype == UFS1) newindirdep->ir_state |= UFS1FMT; + newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); + LIST_INIT(&newindirdep->ir_writehd); + LIST_INIT(&newindirdep->ir_completehd); + LIST_INIT(&newindirdep->ir_jwork); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); @@ -2169,6 +4689,51 @@ static void } /* + * Merge two allocindirs which refer to the same block. Move newblock + * dependencies and setup the freefrags appropriately. 
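+ *
+ * (Illustrative values, ours.)  A merge arises when reallocblk moves
+ * logical slot N from block A to block B before A's dependency has
+ * cleared, so only one rollback entry per slot survives:
+ *
+ *	old aip:  ai_offset = N, ai_newblkno = A, ai_oldblkno = 0
+ *	new aip:  ai_offset = N, ai_newblkno = B, ai_oldblkno = A
+ *	merged:   ai_offset = N, ai_newblkno = B, ai_oldblkno = 0
+ *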
+ */ +static struct freefrag * +allocindir_merge(aip, oldaip) + struct allocindir *aip; + struct allocindir *oldaip; +{ + struct newdirblk *newdirblk; + struct freefrag *freefrag; + struct worklist *wk; + + if (oldaip->ai_newblkno != aip->ai_oldblkno) + panic("allocindir_merge: blkno"); + aip->ai_oldblkno = oldaip->ai_oldblkno; + freefrag = aip->ai_freefrag; + aip->ai_freefrag = oldaip->ai_freefrag; + oldaip->ai_freefrag = NULL; + KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); + /* + * If we are tracking a new directory-block allocation, + * move it from the old allocindir to the new allocindir. + */ + if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { + newdirblk = WK_NEWDIRBLK(wk); + WORKLIST_REMOVE(&newdirblk->db_list); + if (!LIST_EMPTY(&oldaip->ai_newdirblk)) + panic("allocindir_merge: extra newdirblk"); + WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list); + } + /* + * We can skip journaling for this freefrag and just complete + * any pending journal work for the allocindir that is being + * removed after the freefrag completes. + */ + if (freefrag->ff_jfreefrag) + cancel_jfreefrag(freefrag->ff_jfreefrag); + LIST_REMOVE(oldaip, ai_next); + cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork); + free_newblk(&oldaip->ai_block); + + return (freefrag); +} + +/* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before @@ -2206,6 +4771,7 @@ softdep_setup_freeblocks(ip, length, flags) struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; + struct jfreeblk *jfreeblk; struct bufobj *bo; struct vnode *vp; struct buf *bp; @@ -2213,6 +4779,13 @@ softdep_setup_freeblocks(ip, length, flags) ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; + ufs2_daddr_t blkno; + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + long oldextsize; + long oldsize; + int frags; + int needj; fs = ip->i_fs; mp = UFSTOVFS(ip->i_ump); @@ -2221,32 +4794,53 @@ softdep_setup_freeblocks(ip, length, flags) freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); + LIST_INIT(&freeblks->fb_jfreeblkhd); + LIST_INIT(&freeblks->fb_jwork); freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; + freeblks->fb_chkcnt = 0; ACQUIRE_LOCK(&lk); + /* + * If we're truncating a removed file that will never be written + * we don't need to journal the block frees. The canceled journals + * for the allocations will suffice. 
+ */
+	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
+	    (fs->fs_flags & FS_SUJ) == 0)
+		needj = 0;
+	else
+		needj = 1;
	num_freeblkdep++;
	FREE_LOCK(&lk);
	extblocks = 0;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	datablocks = DIP(ip, i_blocks) - extblocks;
-	if ((flags & IO_NORMAL) == 0) {
-		freeblks->fb_oldsize = 0;
-		freeblks->fb_chkcnt = 0;
-	} else {
-		freeblks->fb_oldsize = ip->i_size;
+	if ((flags & IO_NORMAL) != 0) {
+		oldsize = ip->i_size;
		ip->i_size = 0;
		DIP_SET(ip, i_size, 0);
		freeblks->fb_chkcnt = datablocks;
		for (i = 0; i < NDADDR; i++) {
-			freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
+			blkno = DIP(ip, i_db[i]);
			DIP_SET(ip, i_db[i], 0);
+			if (blkno == 0)
+				continue;
+			frags = sblksize(fs, oldsize, i);
+			frags = numfrags(fs, frags);
+			newfreework(freeblks, NULL, i, blkno, frags, needj);
		}
-		for (i = 0; i < NIADDR; i++) {
-			freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
+		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
+		    i++, tmpval *= NINDIR(fs)) {
+			blkno = DIP(ip, i_ib[i]);
			DIP_SET(ip, i_ib[i], 0);
+			if (blkno)
+				newfreework(freeblks, NULL, -lbn - i, blkno,
+				    fs->fs_frag, needj);
+			lbn += tmpval;
		}
		/*
		 * If the file was removed, then the space being freed was
@@ -2259,17 +4853,23 @@ softdep_setup_freeblocks(ip, length, flags)
			UFS_UNLOCK(ip->i_ump);
		}
	}
-	if ((flags & IO_EXT) == 0) {
-		freeblks->fb_oldextsize = 0;
-	} else {
-		freeblks->fb_oldextsize = ip->i_din2->di_extsize;
+	if ((flags & IO_EXT) != 0) {
+		oldextsize = ip->i_din2->di_extsize;
		ip->i_din2->di_extsize = 0;
		freeblks->fb_chkcnt += extblocks;
		for (i = 0; i < NXADDR; i++) {
-			freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
+			blkno = ip->i_din2->di_extb[i];
			ip->i_din2->di_extb[i] = 0;
+			if (blkno == 0)
+				continue;
+			frags = sblksize(fs, oldextsize, i);
+			frags = numfrags(fs, frags);
+			newfreework(freeblks, NULL, -1 - i, blkno, frags,
+			    needj);
		}
	}
+	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
+		needj = 0;
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
@@ -2304,7 +4904,9 @@ softdep_setup_freeblocks(ip, length, flags)
	 */
	delay = (inodedep->id_state & DEPCOMPLETE);
	if (delay)
-		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
+		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
+	else if (needj)
+		freeblks->fb_state |= DEPCOMPLETE | COMPLETE;
	/*
	 * Because the file length has been truncated to zero, any
	 * pending block allocation dependency structures associated
@@ -2318,14 +4920,19 @@ softdep_setup_freeblocks(ip, length, flags)
		merge_inode_lists(&inodedep->id_newinoupdt,
		    &inodedep->id_inoupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
-			free_allocdirect(&inodedep->id_inoupdt, adp, delay);
+			cancel_allocdirect(&inodedep->id_inoupdt, adp,
+			    freeblks, delay);
	}
	if (flags & IO_EXT) {
		merge_inode_lists(&inodedep->id_newextupdt,
		    &inodedep->id_extupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
-			free_allocdirect(&inodedep->id_extupdt, adp, delay);
+			cancel_allocdirect(&inodedep->id_extupdt, adp,
+			    freeblks, delay);
	}
+	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
+		add_to_journal(&jfreeblk->jf_list);
+
	FREE_LOCK(&lk);
	bdwrite(bp);
	/*
@@ -2349,9 +4956,9 @@ restart:
		BO_UNLOCK(bo);
		ACQUIRE_LOCK(&lk);
		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
-		deallocate_dependencies(bp, inodedep);
+		if (deallocate_dependencies(bp, inodedep, freeblks))
+			bp->b_flags |=
B_INVAL | B_NOCACHE; FREE_LOCK(&lk); - bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); BO_LOCK(bo); goto restart; @@ -2361,7 +4968,7 @@ restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); - if(delay) { + if (delay) { freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk @@ -2371,16 +4978,16 @@ restart: * the request here than in the !delay case. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) - add_to_worklist(&freeblks->fb_list); + add_to_worklist(&freeblks->fb_list, 1); } FREE_LOCK(&lk); /* - * If the inode has never been written to disk (delay == 0), - * then we can process the freeblks now that we have deleted - * the dependencies. + * If the inode has never been written to disk (delay == 0) and + * we're not waiting on any journal writes, then we can process the + * freeblks now that we have deleted the dependencies. */ - if (!delay) + if (!delay && !needj) handle_workitem_freeblocks(freeblks, 0); } @@ -2389,19 +4996,23 @@ restart: * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's - * associated with related dependencies do not occur. + * associated with related dependencies do not occur. Returns 1 if + * all dependencies were cleared, 0 otherwise. */ -static void -deallocate_dependencies(bp, inodedep) +static int +deallocate_dependencies(bp, inodedep, freeblks) struct buf *bp; struct inodedep *inodedep; + struct freeblks *freeblks; { struct worklist *wk; struct indirdep *indirdep; + struct newdirblk *newdirblk; struct allocindir *aip; struct pagedep *pagedep; + struct jremref *jremref; + struct jmvref *jmvref; struct dirrem *dirrem; - struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); @@ -2410,47 +5021,24 @@ restart: case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); - /* - * None of the indirect pointers will ever be visible, - * so they can simply be tossed. GOINGAWAY ensures - * that allocated pointers will be saved in the buffer - * cache until they are freed. Note that they will - * only be able to be found by their physical address - * since the inode mapping the logical address will - * be gone. The save buffer used for the safe copy - * was allocated in setup_allocindir_phase2 using - * the physical address so it could be used for this - * purpose. Hence we swap the safe copy with the real - * copy, allowing the safe copy to be freed and holding - * on to the real copy for later use in indir_trunc. - */ - if (indirdep->ir_state & GOINGAWAY) - panic("deallocate_dependencies: already gone"); - indirdep->ir_state |= GOINGAWAY; - VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; - while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) - free_allocindir(aip, inodedep); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); - bcopy(bp->b_data, indirdep->ir_savebp->b_data, - bp->b_bcount); - WORKLIST_REMOVE(wk); - WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); + cancel_indirdep(indirdep, bp, inodedep, freeblks); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* - * None of the directory additions will ever be - * visible, so they can simply be tossed. + * There should be no directory add dependencies present + * as the directory could not be truncated until all + * children were removed. 
*/ + KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, + ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) - while ((dap = - LIST_FIRST(&pagedep->pd_diraddhd[i]))) - free_diradd(dap); - while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) - free_diradd(dap); + KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, + ("deallocate_dependencies: diraddhd != NULL")); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. @@ -2458,36 +5046,47 @@ restart: * can be dumped directly onto the work list. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + /* + * If there are any dirrems we wait for + * the journal write to complete and + * then restart the buf scan as the lock + * has been dropped. + */ + while ((jremref = + LIST_FIRST(&dirrem->dm_jremrefhd)) + != NULL) { + jwait(&jremref->jr_list); + return (0); + } LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == - ALLCOMPLETE) - add_to_worklist(&dirrem->dm_list); - else + ALLCOMPLETE) { + dirrem->dm_state |= COMPLETE; + add_to_worklist(&dirrem->dm_list, 0); + } else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } if ((pagedep->pd_state & NEWBLOCK) != 0) { - LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) - if (wk->wk_type == D_NEWDIRBLK && - WK_NEWDIRBLK(wk)->db_pagedep == - pagedep) - break; - if (wk != NULL) { - WORKLIST_REMOVE(wk); - free_newdirblk(WK_NEWDIRBLK(wk)); - } else - panic("deallocate_dependencies: " - "lost pagedep"); + newdirblk = pagedep->pd_newdirblk; + WORKLIST_REMOVE(&newdirblk->db_list); + free_newdirblk(newdirblk); } + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) + != NULL) { + jwait(&jmvref->jm_list); + return (0); + } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); continue; case D_ALLOCINDIR: - free_allocindir(WK_ALLOCINDIR(wk), inodedep); + aip = WK_ALLOCINDIR(wk); + cancel_allocindir(aip, inodedep, freeblks); continue; case D_ALLOCDIRECT: @@ -2502,46 +5101,155 @@ restart: /* NOTREACHED */ } } + + return (1); } /* - * Free an allocdirect. Generate a new freefrag work request if appropriate. - * This routine must be called with splbio interrupts blocked. + * An allocdirect is being canceled due to a truncate. We must make sure + * the journal entry is released in concert with the blkfree that releases + * the storage. Completed journal entries must not be released until the + * space is no longer pointed to by the inode or in the bitmap. */ static void -free_allocdirect(adphead, adp, delay) +cancel_allocdirect(adphead, adp, freeblks, delay) struct allocdirectlst *adphead; struct allocdirect *adp; + struct freeblks *freeblks; int delay; { + struct freework *freework; + struct newblk *newblk; + struct worklist *wk; + ufs_lbn_t lbn; + + TAILQ_REMOVE(adphead, adp, ad_next); + newblk = (struct newblk *)adp; + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the freework to be + * freed when freework_freeblock() is called. If the journal has + * been written we can simply reclaim the journal space when the + * freeblks work is complete. 
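+ *
+ * (Condensed sketch, ours, of the lookup below.)  A still-pending
+ * jnewblk is attached to the freework covering the same lbn;
+ * otherwise the journal space is reclaimed with the freeblks:
+ *
+ *	if (newblk->nb_jnewblk == NULL)
+ *		cancel_newblk(newblk, &freeblks->fb_jwork);
+ *	else {
+ *		(find the freework with fw_lbn == jn_lbn)
+ *		cancel_newblk(newblk, &freework->fw_jwork);
+ *	}
+ *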
+ */ + if (newblk->nb_jnewblk == NULL) { + cancel_newblk(newblk, &freeblks->fb_jwork); + goto found; + } + lbn = newblk->nb_jnewblk->jn_lbn; + /* + * Find the correct freework structure so it releases the canceled + * journal when the bitmap is cleared. This preserves rollback + * until the allocation is reverted. + */ + LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { + freework = WK_FREEWORK(wk); + if (freework->fw_lbn != lbn) + continue; + cancel_newblk(newblk, &freework->fw_jwork); + goto found; + } + panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn); +found: + if (delay) + WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, + &newblk->nb_list); + else + free_newblk(newblk); + return; +} + + +static void +cancel_newblk(newblk, wkhd) + struct newblk *newblk; + struct workhead *wkhd; +{ + struct indirdep *indirdep; + struct allocindir *aip; + + while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { + indirdep->ir_state &= ~ONDEPLIST; + LIST_REMOVE(indirdep, ir_next); + /* + * If an indirdep is not on the buf worklist we need to + * free it here as deallocate_dependencies() will never + * find it. These pointers were never visible on disk and + * can be discarded immediately. + */ + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + cancel_newblk(&aip->ai_block, wkhd); + free_newblk(&aip->ai_block); + } + /* + * If this indirdep is not attached to a buf it was simply + * waiting on completion to clear completehd. free_indirdep() + * asserts that nothing is dangling. + */ + if ((indirdep->ir_state & ONWORKLIST) == 0) + free_indirdep(indirdep); + } + if (newblk->nb_state & ONDEPLIST) { + newblk->nb_state &= ~ONDEPLIST; + LIST_REMOVE(newblk, nb_deps); + } + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + /* + * If the journal entry hasn't been written we hold onto the dep + * until it is safe to free along with the other journal work. + */ + if (newblk->nb_jnewblk != NULL) { + cancel_jnewblk(newblk->nb_jnewblk, wkhd); + newblk->nb_jnewblk = NULL; + } + if (!LIST_EMPTY(&newblk->nb_jwork)) + jwork_move(wkhd, &newblk->nb_jwork); +} + +/* + * Free a newblk. Generate a new freefrag work request if appropriate. + * This must be called after the inode pointer and any direct block pointers + * are valid or fully removed via truncate or frag extension. 
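+ *
+ * (Aside, ours.)  Note the two-flag gate used below before queueing
+ * the attached freefrag; it only fires once both the journal entry
+ * and the newblk itself are complete:
+ *
+ *	freefrag->ff_state |= COMPLETE;
+ *	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
+ *		add_to_worklist(&freefrag->ff_list, 0);
+ *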
+ */ +static void +free_newblk(newblk) + struct newblk *newblk; +{ + struct indirdep *indirdep; struct newdirblk *newdirblk; + struct freefrag *freefrag; struct worklist *wk; mtx_assert(&lk, MA_OWNED); - if ((adp->ad_state & DEPCOMPLETE) == 0) - LIST_REMOVE(adp, ad_deps); - TAILQ_REMOVE(adphead, adp, ad_next); - if ((adp->ad_state & COMPLETE) == 0) - WORKLIST_REMOVE(&adp->ad_list); - if (adp->ad_freefrag != NULL) { - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &adp->ad_freefrag->ff_list); - else - add_to_worklist(&adp->ad_freefrag->ff_list); + if (newblk->nb_state & ONDEPLIST) + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + LIST_REMOVE(newblk, nb_hash); + if ((freefrag = newblk->nb_freefrag) != NULL) { + freefrag->ff_state |= COMPLETE; + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); } - if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { + if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); - if (!LIST_EMPTY(&adp->ad_newdirblk)) - panic("free_allocdirect: extra newdirblk"); - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &newdirblk->db_list); - else - free_newdirblk(newdirblk); + if (!LIST_EMPTY(&newblk->nb_newdirblk)) + panic("free_newblk: extra newdirblk"); + free_newdirblk(newdirblk); } - WORKITEM_FREE(adp, D_ALLOCDIRECT); + while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { + indirdep->ir_state |= DEPCOMPLETE; + indirdep_complete(indirdep); + } + KASSERT(newblk->nb_jnewblk == NULL, + ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); + handle_jwork(&newblk->nb_jwork); + newblk->nb_list.wk_type = D_NEWBLK; + WORKITEM_FREE(newblk, D_NEWBLK); } /* @@ -2554,6 +5262,7 @@ free_newdirblk(newdirblk) { struct pagedep *pagedep; struct diradd *dap; + struct worklist *wk; int i; mtx_assert(&lk, MA_OWNED); @@ -2571,17 +5280,25 @@ free_newdirblk(newdirblk) pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * If no dependencies remain, the pagedep will be freed. */ for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) break; - if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { + if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 && + LIST_EMPTY(&pagedep->pd_jmvrefhd)) { + KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL, + ("free_newdirblk: Freeing non-free pagedep %p", pagedep)); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } + /* Should only ever be one item in the list. */ + while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { + WORKLIST_REMOVE(wk); + handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); + } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } @@ -2608,6 +5325,7 @@ softdep_freefile(pvp, ino, mode) freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; + LIST_INIT(&freefile->fx_jwork); if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendinginodes += 1; @@ -2618,11 +5336,29 @@ softdep_freefile(pvp, ino, mode) * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either - * case we can free the file immediately. + * case we can free the file immediately. 
If the journal was
 * canceled before being written, the inode will never make it to
 * disk and we must send the canceled journal entries to
 * ffs_freefile() to be cleared in conjunction with the bitmap.
 * Any blocks waiting on the inode to write can be safely freed
 * here as it will never be written.
 */
	ACQUIRE_LOCK(&lk);
-	if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
-	    check_inode_unwritten(inodedep)) {
+	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+	/*
+	 * Remove this inode from the unlinked list and set
+	 * GOINGAWAY as appropriate to indicate that this inode
+	 * will never be written.
+	 */
+	if (inodedep && inodedep->id_state & UNLINKED) {
+		clear_unlinked_inodedep(inodedep);
+		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+		if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0) {
+			inodedep->id_state |= GOINGAWAY;
+			handle_bufwait(inodedep, &freefile->fx_jwork);
+		}
+	}
+	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		handle_workitem_freefile(freefile);
		return;

@@ -2654,7 +5390,8 @@ check_inode_unwritten(inodedep)
{
	mtx_assert(&lk, MA_OWNED);
-	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
+
+	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
	    !LIST_EMPTY(&inodedep->id_inowait) ||
@@ -2662,9 +5399,9 @@
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
+	    inodedep->id_mkdiradd != NULL ||
	    inodedep->id_nlinkdelta != 0)
		return (0);
-
	/*
	 * Another process might be in initiate_write_inodeblock_ufs[12]
	 * trying to allocate memory without holding "Softdep Lock".
@@ -2673,9 +5410,11 @@ check_inode_unwritten(inodedep)
	    inodedep->id_savedino1 == NULL)
		return (0);

+	if (inodedep->id_state & ONDEPLIST)
+		LIST_REMOVE(inodedep, id_deps);
+	inodedep->id_state &= ~ONDEPLIST;
	inodedep->id_state |= ALLCOMPLETE;
-	LIST_REMOVE(inodedep, id_deps);
-	inodedep->id_buf = NULL;
+	inodedep->id_bmsafemap = NULL;
	if (inodedep->id_state & ONWORKLIST)
		WORKLIST_REMOVE(&inodedep->id_list);
	if (inodedep->id_savedino1 != NULL) {
@@ -2696,17 +5435,23 @@ free_inodedep(inodedep)
{
	mtx_assert(&lk, MA_OWNED);
-	if ((inodedep->id_state & ONWORKLIST) != 0 ||
+	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
+	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
	    !LIST_EMPTY(&inodedep->id_inowait) ||
+	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
-	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
+	    inodedep->id_mkdiradd != NULL ||
+	    inodedep->id_nlinkdelta != 0 ||
+	    inodedep->id_savedino1 != NULL)
		return (0);
+	if (inodedep->id_state & ONDEPLIST)
+		LIST_REMOVE(inodedep, id_deps);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, D_INODEDEP);
	num_inodedep -= 1;
@@ -2714,6 +5459,123 @@ free_inodedep(inodedep)
}

/*
+ * Free the block referenced by a freework structure. The parent freeblks
+ * structure is released and completed when the final cg bitmap reaches
+ * the disk. This routine may be freeing a jnewblk which never made it to
+ * disk in which case we do not have to wait as the operation is undone
+ * in memory immediately.
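+ *
+ * (Our gloss of the body below.)  The work list handed to
+ * ffs_blkfree() carries either the canceled jnewblk or the freework
+ * itself, so completion is signalled by the cg bitmap write:
+ *
+ *	if (!LIST_EMPTY(&freework->fw_jwork))
+ *		LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
+ *	else
+ *		WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
+ *	...
+ *	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
+ *	    bsize, freeblks->fb_previousinum, &wkhd);
+ *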
+ */ +static void +freework_freeblock(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct workhead wkhd; + struct fs *fs; + int complete; + int pending; + int bsize; + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + complete = 0; + LIST_INIT(&wkhd); + /* + * If we are canceling an existing jnewblk pass it to the free + * routine, otherwise pass the freeblk which will ultimately + * release the freeblks + */ + if (!LIST_EMPTY(&freework->fw_jwork)) { + LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list); + complete = 1; + } else + WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list); + bsize = lfragtosize(fs, freework->fw_frags); + pending = btodb(bsize); + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= pending; + FREE_LOCK(&lk); + /* + * extattr blocks don't show up in pending blocks. XXX why? + */ + if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) { + UFS_LOCK(ump); + fs->fs_pendingblocks -= pending; + UFS_UNLOCK(ump); + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, + bsize, freeblks->fb_previousinum, &wkhd); + if (complete == 0) + return; + /* + * The jnewblk will be discarded and the bits in the map never + * made it to disk. We can immediately free the freeblk. + */ + ACQUIRE_LOCK(&lk); + handle_written_freework(freework); + FREE_LOCK(&lk); +} + +/* + * Start, continue, or finish the process of freeing an indirect block tree. + * The free operation may be paused at any point with fw_off containing the + * offset to restart from. This enables us to implement some flow control + * for large truncates which may fan out and generate a huge number of + * dependencies. + */ +static void +handle_workitem_indirblk(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct fs *fs; + + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + if (freework->fw_off == NINDIR(fs)) + freework_freeblock(freework); + else + indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), + freework->fw_lbn); +} + +/* + * Called when a freework structure attached to a cg buf is written. The + * ref on either the parent or the freeblks structure is released and + * either may be added to the worklist if it is the final ref. + */ +static void +handle_written_freework(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct freework *parent; + + freeblks = freework->fw_freeblks; + parent = freework->fw_parent; + if (parent) { + if (--parent->fw_ref != 0) + parent = NULL; + freeblks = NULL; + } else if (--freeblks->fb_ref != 0) + freeblks = NULL; + WORKITEM_FREE(freework, D_FREEWORK); + /* + * Don't delay these block frees or it takes an intolerable amount + * of time to process truncates and free their journal entries. + */ + if (freeblks) + add_to_worklist(&freeblks->fb_list, 1); + if (parent) + add_to_worklist(&parent->fw_list, 1); +} + +/* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. 
As mentioned above, @@ -2726,99 +5588,79 @@ handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { + struct freework *freework; + struct worklist *wk; + + KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd), + ("handle_workitem_freeblocks: Journal entries not written.")); + if (LIST_EMPTY(&freeblks->fb_freeworkhd)) { + handle_complete_freeblocks(freeblks); + return; + } + freeblks->fb_ref++; + while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { + KASSERT(wk->wk_type == D_FREEWORK, + ("handle_workitem_freeblocks: Unknown type %s", + TYPENAME(wk->wk_type))); + WORKLIST_REMOVE_UNLOCKED(wk); + freework = WK_FREEWORK(wk); + if (freework->fw_lbn <= -NDADDR) + handle_workitem_indirblk(freework); + else + freework_freeblock(freework); + } + ACQUIRE_LOCK(&lk); + if (--freeblks->fb_ref != 0) + freeblks = NULL; + FREE_LOCK(&lk); + if (freeblks) + handle_complete_freeblocks(freeblks); +} + +/* + * Once all of the freework workitems are complete we can retire the + * freeblocks dependency and any journal work awaiting completion. This + * can not be called until all other dependencies are stable on disk. + */ +static void +handle_complete_freeblocks(freeblks) + struct freeblks *freeblks; +{ struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; - int i, nblocks, level, bsize; - ufs2_daddr_t bn, blocksreleased = 0; - int error, allerror = 0; - ufs_lbn_t baselbns[NIADDR], tmpval; - int fs_pendingblocks; + int flags; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; - fs_pendingblocks = 0; - tmpval = 1; - baselbns[0] = NDADDR; - for (i = 1; i < NIADDR; i++) { - tmpval *= NINDIR(fs); - baselbns[i] = baselbns[i - 1] + tmpval; - } - nblocks = btodb(fs->fs_bsize); - blocksreleased = 0; + flags = LK_NOWAIT; + /* - * Release all extended attribute blocks or frags. - */ - if (freeblks->fb_oldextsize > 0) { - for (i = (NXADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_eblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldextsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - blocksreleased += btodb(bsize); - } - } - /* - * Release all data blocks or frags. - */ - if (freeblks->fb_oldsize > 0) { - /* - * Indirect blocks first. - */ - for (level = (NIADDR - 1); level >= 0; level--) { - if ((bn = freeblks->fb_iblks[level]) == 0) - continue; - if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), - level, baselbns[level], &blocksreleased)) != 0) - allerror = error; - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, - fs->fs_bsize, freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - blocksreleased += nblocks; - } - /* - * All direct blocks or frags. - */ - for (i = (NDADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_dblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - fs_pendingblocks += btodb(bsize); - blocksreleased += btodb(bsize); - } - } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); - /* * If we still have not finished background cleanup, then check * to see if the block count needs to be adjusted. 
*/ - if (freeblks->fb_chkcnt != blocksreleased && - (fs->fs_flags & FS_UNCLEAN) != 0 && + if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 && ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, - (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) - == 0) { + (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) { ip = VTOI(vp); - DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \ - freeblks->fb_chkcnt - blocksreleased); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt); ip->i_flag |= IN_CHANGE; vput(vp); } #ifdef INVARIANTS - if (freeblks->fb_chkcnt != blocksreleased && + if (freeblks->fb_chkcnt != 0 && ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) printf("handle_workitem_freeblocks: block count\n"); - if (allerror) - softdep_error("handle_workitem_freeblks", allerror); #endif /* INVARIANTS */ ACQUIRE_LOCK(&lk); + /* + * All of the freeblock deps must be complete prior to this call + * so it's now safe to complete earlier outstanding journal entries. + */ + handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); num_freeblkdep--; FREE_LOCK(&lk); @@ -2830,29 +5672,39 @@ handle_workitem_freeblocks(freeblks, flags) * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ -static int -indir_trunc(freeblks, dbn, level, lbn, countp) - struct freeblks *freeblks; +static void +indir_trunc(freework, dbn, lbn) + struct freework *freework; ufs2_daddr_t dbn; - int level; ufs_lbn_t lbn; - ufs2_daddr_t *countp; { + struct workhead wkhd; + struct jnewblk *jnewblk; + struct freeblks *freeblks; struct buf *bp; struct fs *fs; + struct worklist *wkn; struct worklist *wk; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1 = 0; - ufs2_daddr_t nb, *bap2 = 0; + ufs2_daddr_t nb, nnb, *bap2 = 0; ufs_lbn_t lbnadd; int i, nblocks, ufs1fmt; - int error, allerror = 0; int fs_pendingblocks; + int freedeps; + int level; + int cnt; + LIST_INIT(&wkhd); + level = lbn_level(lbn); + if (level == -1) + panic("indir_trunc: Invalid lbn %jd\n", lbn); + freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; + freedeps = 0; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); @@ -2877,13 +5729,14 @@ handle_workitem_freeblocks(freeblks, flags) ACQUIRE_LOCK(&lk); if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { if (wk->wk_type != D_INDIRDEP || - (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || - (indirdep->ir_state & GOINGAWAY) == 0) - panic("indir_trunc: lost indirdep"); - WORKLIST_REMOVE(wk); - WORKITEM_FREE(indirdep, D_INDIRDEP); + (wk->wk_state & GOINGAWAY) == 0) + panic("indir_trunc: lost indirdep %p", wk); + indirdep = WK_INDIRDEP(wk); + LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list); + free_indirdep(indirdep); if (!LIST_EMPTY(&bp->b_dep)) - panic("indir_trunc: dangling dep"); + panic("indir_trunc: dangling dep %p", + LIST_FIRST(&bp->b_dep)); ump->um_numindirdeps -= 1; FREE_LOCK(&lk); } else { @@ -2892,11 +5745,10 @@ handle_workitem_freeblocks(freeblks, flags) brelse(bp); #endif FREE_LOCK(&lk); - error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, - NOCRED, &bp); - if (error) { + if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, + NOCRED, &bp) != 0) { brelse(bp); - return (error); + return; } } /* @@ -2909,57 +5761,245 @@ handle_workitem_freeblocks(freeblks, flags) ufs1fmt = 0; bap2 = (ufs2_daddr_t *)bp->b_data; } - nblocks = btodb(fs->fs_bsize); - for (i = NINDIR(fs) - 1; i >= 0; i--) { - if 
(ufs1fmt) + /* + * Reclaim indirect blocks which never made it to disk. + */ + cnt = 0; + LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) { + struct workhead freewk; + if (wk->wk_type != D_JNEWBLK) + continue; + WORKLIST_REMOVE_UNLOCKED(wk); + LIST_INIT(&freewk); + WORKLIST_INSERT_UNLOCKED(&freewk, wk); + jnewblk = WK_JNEWBLK(wk); + if (jnewblk->jn_lbn > 0) + i = (jnewblk->jn_lbn - -lbn) / lbnadd; + else + i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd; + KASSERT(i >= 0 && i < NINDIR(fs), + ("indir_trunc: Index out of range %d parent %jd lbn %jd", + i, lbn, jnewblk->jn_lbn)); + /* Clear the pointer so it isn't found below. */ + if (ufs1fmt) { nb = bap1[i]; - else + bap1[i] = 0; + } else { nb = bap2[i]; + bap2[i] = 0; + } + KASSERT(nb == jnewblk->jn_blkno, + ("indir_trunc: Block mismatch %jd != %jd", + nb, jnewblk->jn_blkno)); + ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno, + fs->fs_bsize, freeblks->fb_previousinum, &freewk); + cnt++; + } + ACQUIRE_LOCK(&lk); + freework->fw_ref += NINDIR(fs) + 1; + /* Any remaining journal work can be completed with freeblks. */ + jwork_move(&freeblks->fb_jwork, &wkhd); + FREE_LOCK(&lk); + nblocks = btodb(fs->fs_bsize); + if (ufs1fmt) + nb = bap1[0]; + else + nb = bap2[0]; + /* + * Reclaim on disk blocks. + */ + for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { + if (i != NINDIR(fs) - 1) { + if (ufs1fmt) + nnb = bap1[i+1]; + else + nnb = bap2[i+1]; + } else + nnb = 0; if (nb == 0) continue; + cnt++; if (level != 0) { - if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), - level - 1, lbn + (i * lbnadd), countp)) != 0) - allerror = error; + struct freework *nfreework; + ufs_lbn_t nlbn; + + nlbn = (lbn + 1) - (i * lbnadd); + nfreework = newfreework(freeblks, freework, nlbn, nb, + fs->fs_frag, 0); + freedeps++; + indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); + } else { + struct freedep *freedep; + + /* + * Attempt to aggregate freedep dependencies for + * all blocks being released to the same CG. + */ + LIST_INIT(&wkhd); + if (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb))) { + freedep = newfreedep(freework); + WORKLIST_INSERT_UNLOCKED(&wkhd, + &freedep->fd_list); + freedeps++; + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, + fs->fs_bsize, freeblks->fb_previousinum, &wkhd); + fs_pendingblocks += nblocks; } - ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, - freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - *countp += nblocks; } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); + ACQUIRE_LOCK(&lk); + freework->fw_off = i; + if (level == 0) + fs_pendingblocks = (nblocks * cnt); + freework->fw_ref += freedeps; + freework->fw_ref -= NINDIR(fs) + 1; + if (freework->fw_ref != 0) + freework = NULL; + FREE_LOCK(&lk); + if (fs_pendingblocks) { + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= fs_pendingblocks; + FREE_LOCK(&lk); + UFS_LOCK(ump); + fs->fs_pendingblocks -= fs_pendingblocks; + UFS_UNLOCK(ump); + } bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); - return (allerror); + if (freework) + handle_workitem_indirblk(freework); + return; } /* - * Free an allocindir. - * This routine must be called with splbio interrupts blocked. + * Cancel an allocindir when it is removed via truncation. 
*/ static void -free_allocindir(aip, inodedep) +cancel_allocindir(aip, inodedep, freeblks) struct allocindir *aip; struct inodedep *inodedep; + struct freeblks *freeblks; { - struct freefrag *freefrag; + struct newblk *newblk; - mtx_assert(&lk, MA_OWNED); - if ((aip->ai_state & DEPCOMPLETE) == 0) - LIST_REMOVE(aip, ai_deps); - if (aip->ai_state & ONWORKLIST) - WORKLIST_REMOVE(&aip->ai_list); + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the indirdep to be + * freed when indir_trunc() is called. If the journal has already + * been written we can simply reclaim the journal space when the + * freeblks work is complete. + */ LIST_REMOVE(aip, ai_next); - if ((freefrag = aip->ai_freefrag) != NULL) { + newblk = (struct newblk *)aip; + if (newblk->nb_jnewblk == NULL) + cancel_newblk(newblk, &freeblks->fb_jwork); + else + cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork); + if (inodedep && inodedep->id_state & DEPCOMPLETE) + WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list); + else + free_newblk(newblk); +} + +/* + * Create the mkdir dependencies for . and .. in a new directory. Link them + * in to a newdirblk so any subsequent additions are tracked properly. The + * caller is responsible for adding the mkdir1 dependency to the journal + * and updating id_mkdiradd. This function returns with lk held. + */ +static struct mkdir * +setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) + struct diradd *dap; + ino_t newinum; + ino_t dinum; + struct buf *newdirbp; + struct mkdir **mkdirp; +{ + struct newblk *newblk; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct newdirblk *newdirblk = 0; + struct mkdir *mkdir1, *mkdir2; + struct worklist *wk; + struct jaddref *jaddref; + struct mount *mp; + + mp = dap->da_list.wk_mp; + newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, + M_SOFTDEP_FLAGS); + workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); + mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); + mkdir1->md_state = ATTACHED | MKDIR_BODY; + mkdir1->md_diradd = dap; + mkdir1->md_jaddref = NULL; + mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); + mkdir2->md_state = ATTACHED | MKDIR_PARENT; + mkdir2->md_diradd = dap; + mkdir2->md_jaddref = NULL; + if ((mp->mnt_flag & MNT_SUJ) == 0) { + mkdir1->md_state |= DEPCOMPLETE; + mkdir2->md_state |= DEPCOMPLETE; + } + /* + * Dependency on "." and ".." being written to disk. + */ + mkdir1->md_buf = newdirbp; + ACQUIRE_LOCK(&lk); + LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); + /* + * We must link the pagedep, allocdirect, and newdirblk for + * the initial file page so the pointer to the new directory + * is not written until the directory contents are live and + * any subsequent additions are not marked live until the + * block is reachable via the inode. 
+ */ + if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0) + panic("setup_newdir: lost pagedep"); + LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) + if (wk->wk_type == D_ALLOCDIRECT) + break; + if (wk == NULL) + panic("setup_newdir: lost allocdirect"); + newblk = WK_NEWBLK(wk); + pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; + newdirblk->db_pagedep = pagedep; + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); + WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); + /* + * Look up the inodedep for the parent directory so that we + * can link mkdir2 into the pending dotdot jaddref or + * the inode write if there is none. If the inode is + * ALLCOMPLETE and no jaddref is present all dependencies have + * been satisfied and mkdir2 can be freed. + */ + inodedep_lookup(mp, dinum, 0, &inodedep); + if (mp->mnt_flag & MNT_SUJ) { if (inodedep == NULL) - add_to_worklist(&freefrag->ff_list); - else - WORKLIST_INSERT(&inodedep->id_bufwait, - &freefrag->ff_list); + panic("setup_newdir: Lost parent."); + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && + (jaddref->ja_state & MKDIR_PARENT), + ("setup_newdir: bad dotdot jaddref %p", jaddref)); + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + mkdir2->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir2; + } else if (inodedep == NULL || + (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + dap->da_state &= ~MKDIR_PARENT; + WORKITEM_FREE(mkdir2, D_MKDIR); + } else { + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } - WORKITEM_FREE(aip, D_ALLOCINDIR); + *mkdirp = mkdir2; + + return (mkdir1); } /* @@ -2998,12 +6038,14 @@ softdep_setup_directory_add(bp, dp, diroffset, new ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; - struct allocdirect *adp; + struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; struct mkdir *mkdir1, *mkdir2; + struct jaddref *jaddref; struct mount *mp; + int isindir; /* * Whiteouts have no dependencies. @@ -3013,6 +6055,8 @@ softdep_setup_directory_add(bp, dp, diroffset, new bdwrite(newdirbp); return (0); } + jaddref = NULL; + mkdir1 = mkdir2 = NULL; mp = UFSTOVFS(dp->i_ump); fs = dp->i_fs; lbn = lblkno(fs, diroffset); @@ -3023,111 +6067,123 @@ softdep_setup_directory_add(bp, dp, diroffset, new dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; - if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { + LIST_INIT(&dap->da_jwork); + isindir = bp->b_lblkno >= NDADDR; + if (isnewblk && + (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); } + /* + * If we're creating a new directory setup the dependencies and set + * the dap state to wait for them. Otherwise it's COMPLETE and + * we can move on. 
+ */ if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; - mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, - M_SOFTDEP_FLAGS); - workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); - mkdir1->md_state = MKDIR_BODY; - mkdir1->md_diradd = dap; - mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, - M_SOFTDEP_FLAGS); - workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); - mkdir2->md_state = MKDIR_PARENT; - mkdir2->md_diradd = dap; - /* - * Dependency on "." and ".." being written to disk. - */ - mkdir1->md_buf = newdirbp; - ACQUIRE_LOCK(&lk); - LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); - WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); - FREE_LOCK(&lk); - bdwrite(newdirbp); - /* - * Dependency on link count increase for parent directory - */ - ACQUIRE_LOCK(&lk); - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0 - || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { - dap->da_state &= ~MKDIR_PARENT; - WORKITEM_FREE(mkdir2, D_MKDIR); - } else { - LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); - WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); - } + mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, + &mkdir2); } /* * Link into parent directory pagedep to await its being written. */ - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); +#ifdef DEBUG + if (diradd_lookup(pagedep, offset) != NULL) + panic("softdep_setup_directory_add: %p already at off %d\n", + diradd_lookup(pagedep, offset), offset); +#endif dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); /* - * Link into its inodedep. Put it on the id_bufwait list if the inode - * is not yet written. If it is written, do the post-inode write - * processing to put it on the id_pendinghd list. + * If we're journaling, link the diradd into the jaddref so it + * may be completed after the journal entry is written. Otherwise, + * link the diradd into its inodedep. If the inode is not yet + * written place it on the bufwait list, otherwise do the post-inode + * write processing to put it on the id_pendinghd list. */ - (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); - if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) + if (mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_add: bad jaddref %p", jaddref)); + jaddref->ja_diroff = diroffset; + jaddref->ja_diradd = dap; + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); - if (isnewblk) { + /* + * Add the journal entries for . and .. links now that the primary + * link is written. 
+ */ + if (mkdir1 != NULL && mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, + inoreflst, if_deps); + KASSERT(jaddref != NULL && + jaddref->ja_ino == jaddref->ja_parent && + (jaddref->ja_state & MKDIR_BODY), + ("softdep_setup_directory_add: bad dot jaddref %p", + jaddref)); + mkdir1->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir1; /* - * Directories growing into indirect blocks are rare - * enough and the frequency of new block allocation - * in those cases even more rare, that we choose not - * to bother tracking them. Rather we simply force the - * new directory entry to disk. + * It is important that the dotdot journal entry + * is added prior to the dot entry since dot writes + * both the dot and dotdot links. These both must + * be added after the primary link for the journal + * to remain consistent. */ - if (lbn >= NDADDR) { - FREE_LOCK(&lk); - /* - * We only have a new allocation when at the - * beginning of a new block, not when we are - * expanding into an existing block. - */ - if (blkoff(fs, diroffset) == 0) - return (1); - return (0); - } + add_to_journal(&mkdir2->md_jaddref->ja_list); + add_to_journal(&jaddref->ja_list); + } + /* + * If we are adding a new directory remember this diradd so that if + * we rename it we can keep the dot and dotdot dependencies. If + * we are adding a new name for an inode that has a mkdiradd we + * must be in rename and we have to move the dot and dotdot + * dependencies to this new name. The old name is being orphaned + * soon. + */ + if (mkdir1 != NULL) { + if (inodedep->id_mkdiradd != NULL) + panic("softdep_setup_directory_add: Existing mkdir"); + inodedep->id_mkdiradd = dap; + } else if (inodedep->id_mkdiradd) + merge_diradd(inodedep, dap); + if (newdirblk) { /* - * We only have a new allocation when at the beginning - * of a new fragment, not when we are expanding into an - * existing fragment. Also, there is nothing to do if we - * are already tracking this block. + * There is nothing to do if we are already tracking + * this block. */ - if (fragoff(fs, diroffset) != 0) { - FREE_LOCK(&lk); - return (0); - } if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(&lk); return (0); } - /* - * Find our associated allocdirect and have it track us. - */ - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) - panic("softdep_setup_directory_add: lost inodedep"); - adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); - if (adp == NULL || adp->ad_lbn != lbn) + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) + == 0) panic("softdep_setup_directory_add: lost entry"); + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; - WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); + FREE_LOCK(&lk); + /* + * If we extended into an indirect signal direnter to sync. + */ + if (isindir) + return (1); + return (0); } FREE_LOCK(&lk); return (0); @@ -3141,7 +6197,8 @@ softdep_setup_directory_add(bp, dp, diroffset, new * occur while the move is in progress. */ void -softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) +softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) + struct buf *bp; /* Buffer holding directory block. 
*/ struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ @@ -3150,40 +6207,204 @@ void { int offset, oldoffset, newoffset; struct pagedep *pagedep; + struct jmvref *jmvref; struct diradd *dap; + struct direct *de; + struct mount *mp; ufs_lbn_t lbn; + int flags; - ACQUIRE_LOCK(&lk); + mp = UFSTOVFS(dp->i_ump); + de = (struct direct *)oldloc; + jmvref = NULL; + flags = 0; + /* + * Moves are always journaled as it would be too complex to + * determine if any affected adds or removes are present in the + * journal. + */ + if (mp->mnt_flag & MNT_SUJ) { + flags = DEPALLOC; + jmvref = newjmvref(dp, de->d_ino, + dp->i_offset + (oldloc - base), + dp->i_offset + (newloc - base)); + } lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) - goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); + ACQUIRE_LOCK(&lk); + if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) { + if (pagedep) + WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + goto done; + } + dap = diradd_lookup(pagedep, oldoffset); + if (dap) { + dap->da_offset = newoffset; + newoffset = DIRADDHASH(newoffset); + oldoffset = DIRADDHASH(oldoffset); + if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && + newoffset != oldoffset) { + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], + dap, da_pdlist); + } + } +done: + if (jmvref) { + jmvref->jm_pagedep = pagedep; + LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); + add_to_journal(&jmvref->jm_list); + } + bcopy(oldloc, newloc, entrysize); + FREE_LOCK(&lk); +} - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { - if (dap->da_offset != oldoffset) - continue; - dap->da_offset = newoffset; - if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) - break; +/* + * Move the mkdir dependencies and journal work from one diradd to another + * when renaming a directory. The new name must depend on the mkdir deps + * completing as the old name did. Directories can only have one valid link + * at a time so one must be canonical. + */ +static void +merge_diradd(inodedep, newdap) + struct inodedep *inodedep; + struct diradd *newdap; +{ + struct diradd *olddap; + struct mkdir *mkdir, *nextmd; + short state; + + olddap = inodedep->id_mkdiradd; + inodedep->id_mkdiradd = newdap; + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + newdap->da_state &= ~DEPCOMPLETE; + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { + nextmd = LIST_NEXT(mkdir, md_mkdirs); + if (mkdir->md_diradd != olddap) + continue; + mkdir->md_diradd = newdap; + state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); + newdap->da_state |= state; + olddap->da_state &= ~state; + if ((olddap->da_state & + (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; + } + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) + panic("merge_diradd: unfound ref"); + } + /* + * Any mkdir related journal items are not safe to be freed until + * the new name is stable. + */ + jwork_move(&newdap->da_jwork, &olddap->da_jwork); + olddap->da_state |= DEPCOMPLETE; + complete_diradd(olddap); +} + +/* + * Move the diradd to the pending list when all diradd dependencies are + * complete. 
+ */ +static void +complete_diradd(dap) + struct diradd *dap; +{ + struct pagedep *pagedep; + + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], - dap, da_pdlist); - break; + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } - if (dap == NULL) { +} - LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { - if (dap->da_offset == oldoffset) { - dap->da_offset = newoffset; - break; +/* + * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal + * add entries and conditonally journal the remove. + */ +static void +cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) + struct diradd *dap; + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct inoref *inoref; + struct mkdir *mkdir; + + /* + * If no remove references were allocated we're on a non-journaled + * filesystem and can skip the cancel step. + */ + if (jremref == NULL) { + free_diradd(dap, NULL); + return; + } + /* + * Cancel the primary name an free it if it does not require + * journaling. + */ + if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, + 0, &inodedep) != 0) { + /* Abort the addref that reference this diradd. */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if (inoref->if_list.wk_type != D_JADDREF) + continue; + jaddref = (struct jaddref *)inoref; + if (jaddref->ja_diradd != dap) + continue; + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(jremref); + jremref = NULL; } + break; } } -done: - bcopy(oldloc, newloc, entrysize); - FREE_LOCK(&lk); + /* + * Cancel subordinate names and free them if they do not require + * journaling. + */ + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { + if (mkdir->md_diradd != dap) + continue; + if ((jaddref = mkdir->md_jaddref) == NULL) + continue; + mkdir->md_jaddref = NULL; + if (mkdir->md_state & MKDIR_PARENT) { + if (cancel_jaddref(jaddref, NULL, + &dirrem->dm_jwork) == 0) { + free_jremref(dotdotremref); + dotdotremref = NULL; + } + } else { + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(dotremref); + dotremref = NULL; + } + } + } + } + + if (jremref) + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); + jwork_move(&dirrem->dm_jwork, &dap->da_jwork); + free_diradd(dap, &dirrem->dm_jwork); } /* @@ -3191,8 +6412,9 @@ void * with splbio interrupts blocked. 
*/ static void -free_diradd(dap) +free_diradd(dap, wkhd) struct diradd *dap; + struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; @@ -3200,32 +6422,48 @@ static void struct mkdir *mkdir, *nextmd; mtx_assert(&lk, MA_OWNED); - WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); + if (dap->da_state & ONWORKLIST) + WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + dirrem->dm_state |= COMPLETE; + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) - (void) free_inodedep(inodedep); + if (inodedep->id_mkdiradd == dap) + inodedep->id_mkdiradd = NULL; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; - dap->da_state &= ~mkdir->md_state; - WORKLIST_REMOVE(&mkdir->md_list); + dap->da_state &= + ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + if (mkdir->md_jaddref != NULL) + panic("free_diradd: Unexpected jaddref"); WORKITEM_FREE(mkdir, D_MKDIR); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } + if (inodedep) + free_inodedep(inodedep); + /* + * Free any journal segments waiting for the directory write. + */ + handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } @@ -3254,11 +6492,24 @@ softdep_setup_remove(bp, dp, ip, isrmdir) int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; + struct inodedep *inodedep; + int direct; /* - * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. + * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want + * newdirrem() to setup the full directory remove which requires + * isrmdir > 1. */ - dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); + dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem); + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. + */ + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) + panic("softdep_setup_remove: Lost inodedep."); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active @@ -3280,12 +6531,148 @@ softdep_setup_remove(bp, dp, ip, isrmdir) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; + direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(&lk); - handle_workitem_remove(dirrem, NULL); + if (direct) + handle_workitem_remove(dirrem, NULL); } } /* + * Check for an entry matching 'offset' on both the pd_dirraddhd list and the + * pd_pendinghd list of a pagedep. + */ +static struct diradd * +diradd_lookup(pagedep, offset) + struct pagedep *pagedep; + int offset; +{ + struct diradd *dap; + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) + if (dap->da_offset == offset) + return (dap); + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) + if (dap->da_offset == offset) + return (dap); + return (NULL); +} + +/* + * Search for a .. 
diradd dependency in a directory that is being removed. + * If the directory was renamed to a new parent we have a diradd rather + * than a mkdir for the .. entry. We need to cancel it now before + * it is found in truncate(). + */ +static struct jremref * +cancel_diradd_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct pagedep *pagedep; + struct diradd *dap; + struct worklist *wk; + + if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0, + &pagedep) == 0) + return (jremref); + dap = diradd_lookup(pagedep, DOTDOT_OFFSET); + if (dap == NULL) + return (jremref); + cancel_diradd(dap, dirrem, jremref, NULL, NULL); + /* + * Mark any journal work as belonging to the parent so it is freed + * with the .. reference. + */ + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + wk->wk_state |= MKDIR_PARENT; + return (NULL); +} + +/* + * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to + * replace it with a dirrem/diradd pair as a result of re-parenting a + * directory. This ensures that we don't simultaneously have a mkdir and + * a diradd for the same .. entry. + */ +static struct jremref * +cancel_mkdir_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct mkdir *mkdir; + struct diradd *dap; + + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) + panic("cancel_mkdir_dotdot: Lost inodedep"); + dap = inodedep->id_mkdiradd; + if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) + return (jremref); + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; + mkdir = LIST_NEXT(mkdir, md_mkdirs)) + if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) + break; + if (mkdir == NULL) + panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); + if ((jaddref = mkdir->md_jaddref) != NULL) { + mkdir->md_jaddref = NULL; + if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, + &inodedep) == 0) + panic("cancel_mkdir_dotdot: Lost parent inodedep"); + if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { + journal_jremref(dirrem, jremref, inodedep); + jremref = NULL; + } + } + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + mkdir->md_state |= ALLCOMPLETE; + complete_mkdir(mkdir); + return (jremref); +} + +static void +journal_jremref(dirrem, jremref, inodedep) + struct dirrem *dirrem; + struct jremref *jremref; + struct inodedep *inodedep; +{ + + if (inodedep == NULL) + if (inodedep_lookup(jremref->jr_list.wk_mp, + jremref->jr_ref.if_ino, 0, &inodedep) == 0) + panic("journal_jremref: Lost inodedep"); + LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); + add_to_journal(&jremref->jr_list); +} + +static void +dirrem_journal(dirrem, jremref, dotremref, dotdotremref) + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + + + if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, + &inodedep) == 0) + panic("dirrem_journal: Lost inodedep"); + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); +} + +/* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. 
*/ @@ -3303,12 +6690,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; + struct vnode *dvp; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); + dvp = ITOV(dp); /* * If we are over our limit, try to improve the situation. * Limiting the number of dirrem structures will also limit @@ -3321,34 +6713,75 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) FREE_LOCK(&lk); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); - workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount); + workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); + LIST_INIT(&dirrem->dm_jremrefhd); + LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; - + /* + * Allocate remove reference structures to track journal write + * dependencies. We will always have one for the link and + * when doing directories we will always have one more for dot. + * When renaming a directory we skip the dotdot link change so + * this is not needed. + */ + jremref = dotremref = dotdotremref = NULL; + if (DOINGSUJ(dvp)) { + if (isrmdir) { + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 2); + dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, + ip->i_effnlink + 1); + } else + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 1); + if (isrmdir > 1) { + dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, + dp->i_effnlink + 1); + dotdotremref->jr_state |= MKDIR_PARENT; + } + } ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* + * If we're renaming a .. link to a new directory, cancel any + * existing MKDIR_PARENT mkdir. If it has already been canceled + * the jremref is preserved for any potential diradd in this + * location. This can not coincide with a rmdir. + */ + if (dp->i_offset == DOTDOT_OFFSET) { + if (isrmdir) + panic("newdirrem: .. directory change during remove?"); + jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); + } + /* + * If we're removing a directory search for the .. dependency now and + * cancel it. Any pending journal work will be added to the dirrem + * to be completed when the workitem remove completes. + */ + if (isrmdir > 1) + dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); + /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can - * be de-allocated. Check for an entry on both the pd_dirraddhd - * list and the pd_pendinghd list. + * be de-allocated. */ - - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) - if (dap->da_offset == offset) - break; + dap = diradd_lookup(pagedep, offset); if (dap == NULL) { - - LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) - if (dap->da_offset == offset) - break; - if (dap == NULL) - return (dirrem); + /* + * Link the jremref structures into the dirrem so they are + * written prior to the pagedep. + */ + if (jremref) + dirrem_journal(dirrem, jremref, dotremref, + dotdotremref); + return (dirrem); } /* * Must be ATTACHED at this point. 
@@ -3373,7 +6806,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; - free_diradd(dap); + cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); +#ifdef SUJ_DEBUG + if (isrmdir == 0) { + struct worklist *wk; + + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) + panic("bad wk %p (0x%X)\n", wk, wk->wk_state); + } +#endif + return (dirrem); } @@ -3407,6 +6850,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; + struct jaddref *jaddref; struct mount *mp; offset = blkoff(dp->i_fs, dp->i_offset); @@ -3422,6 +6866,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; + LIST_INIT(&dap->da_jwork); } /* @@ -3454,11 +6899,21 @@ softdep_setup_directory_change(bp, dp, ip, newinum dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } FREE_LOCK(&lk); return; } + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. A valid nlinkdelta ensures that this lookup + * will not fail. + */ + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) + panic("softdep_setup_directory_change: Lost inodedep."); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active @@ -3483,15 +6938,29 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); } /* - * Link into its inodedep. Put it on the id_bufwait list if the inode + * Lookup the jaddref for this journal entry. We must finish + * initializing it and make the diradd write dependent on it. + * If we're not journaling Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ - if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 || - (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); + if (mp->mnt_flag & MNT_SUJ) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_change: bad jaddref %p", + jaddref)); + jaddref->ja_diroff = dp->i_offset; + jaddref->ja_diradd = dap; + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], + dap, da_pdlist); + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); @@ -3500,6 +6969,13 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } + /* + * If we're making a new name for a directory that has not been + * committed when need to move the dot and dotdot references to + * this new name. 
+ */ + if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) + merge_diradd(inodedep, dap); FREE_LOCK(&lk); } @@ -3516,8 +6992,7 @@ softdep_change_linkcnt(ip) struct inodedep *inodedep; ACQUIRE_LOCK(&lk); - (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, - DEPALLOC, &inodedep); + inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; @@ -3574,6 +7049,304 @@ softdep_releasefile(ip) } /* + * Attach a sbdep dependency to the superblock buf so that we can keep + * track of the head of the linked list of referenced but unlinked inodes. + */ +void +softdep_setup_sbupdate(ump, fs, bp) + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; +{ + struct sbdep *sbdep; + struct worklist *wk; + + if ((fs->fs_flags & FS_SUJ) == 0) + return; + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_SBDEP) + break; + if (wk != NULL) + return; + sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); + sbdep->sb_fs = fs; + sbdep->sb_ump = ump; + ACQUIRE_LOCK(&lk); + WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); + FREE_LOCK(&lk); +} + +/* + * Return the first unlinked inodedep which is ready to be the head of the + * list. The inodedep and all those after it must have valid next pointers. + */ +static struct inodedep * +first_unlinked_inodedep(ump) + struct ufsmount *ump; +{ + struct inodedep *inodedep; + struct inodedep *idp; + + for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); + inodedep; inodedep = idp) { + if ((inodedep->id_state & UNLINKNEXT) == 0) + return (NULL); + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) + break; + if ((inodedep->id_state & UNLINKPREV) == 0) + panic("first_unlinked_inodedep: prev != next"); + } + if (inodedep == NULL) + return (NULL); + + return (inodedep); +} + +/* + * Set the sujfree unlinked head pointer prior to writing a superblock. + */ +static void +initiate_write_sbdep(sbdep) + struct sbdep *sbdep; +{ + struct inodedep *inodedep; + struct fs *bpfs; + struct fs *fs; + + bpfs = sbdep->sb_fs; + fs = sbdep->sb_ump->um_fs; + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if (inodedep) { + fs->fs_sujfree = inodedep->id_ino; + inodedep->id_state |= UNLINKPREV; + } else + fs->fs_sujfree = 0; + bpfs->fs_sujfree = fs->fs_sujfree; +} + +/* + * After a superblock is written determine whether it must be written again + * due to a changing unlinked list head. + */ +static int +handle_written_sbdep(sbdep, bp) + struct sbdep *sbdep; + struct buf *bp; +{ + struct inodedep *inodedep; + struct mount *mp; + struct fs *fs; + + fs = sbdep->sb_fs; + mp = UFSTOVFS(sbdep->sb_ump); + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || + (inodedep == NULL && fs->fs_sujfree != 0)) { + bdirty(bp); + return (1); + } + WORKITEM_FREE(sbdep, D_SBDEP); + if (fs->fs_sujfree == 0) + return (0); + if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) + panic("handle_written_sbdep: lost inodedep"); + /* + * Now that we have a record of this indode in stable store we can + * discard any pending work. 
+ */ + for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { + if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) + panic("handle_written_sbdep: Bad inodedep %p (0x%X)", + inodedep, inodedep->id_state); + if (handle_bufwait(inodedep, NULL) != NULL) + panic("handle_written_sbdep: freefile on " + "unlinked inodedep"); + } + + return (0); +} + +/* + * Mark an inodedep has unlinked and insert it into the in-memory unlinked + * list. + */ +static void +unlinked_inodedep(mp, inodedep) + struct mount *mp; + struct inodedep *inodedep; +{ + struct ufsmount *ump; + + if ((mp->mnt_flag & MNT_SUJ) == 0) + return; + ump = VFSTOUFS(mp); + ump->um_fs->fs_fmod = 1; + inodedep->id_state |= UNLINKED; + TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); +} + +/* + * Remove an inodedep from the unlinked inodedep list. This may require + * disk writes if the inode has made it that far. + */ +static void +clear_unlinked_inodedep(inodedep) + struct inodedep *inodedep; +{ + struct ufsmount *ump; + struct inodedep *idp; + struct inodedep *idn; + struct fs *fs; + struct buf *bp; + ino_t ino; + ino_t nino; + ino_t pino; + int error; + + ump = VFSTOUFS(inodedep->id_list.wk_mp); + fs = ump->um_fs; + ino = inodedep->id_ino; + error = 0; + for (;;) { + /* + * If nothing has yet been written simply remove us from + * the in memory list and return. This is the most common + * case where handle_workitem_remove() loses the final + * reference. + */ + if ((inodedep->id_state & UNLINKLINKS) == 0) + break; + /* + * If we have a NEXT pointer and no PREV pointer we can simply + * clear NEXT's PREV and remove ourselves from the list. Be + * careful not to clear PREV if the superblock points at + * next as well. + */ + idn = TAILQ_NEXT(inodedep, id_unlinked); + if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { + if (idn && fs->fs_sujfree != idn->id_ino) + idn->id_state &= ~UNLINKPREV; + break; + } + /* + * Here we have an inodedep which is actually linked into + * the list. We must remove it by forcing a write to the + * link before us, whether it be the superblock or an inode. + * Unfortunately the list may change while we're waiting + * on the buf lock for either resource so we must loop until + * we lock. the right one. If both the superblock and an + * inode point to this inode we must clear the inode first + * followed by the superblock. + */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + pino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + pino = idp->id_ino; + FREE_LOCK(&lk); + if (pino == 0) + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + else + error = bread(ump->um_devvp, + fsbtodb(fs, ino_to_fsba(fs, pino)), + (int)fs->fs_bsize, NOCRED, &bp); + ACQUIRE_LOCK(&lk); + if (error) + break; + /* If the list has changed restart the loop. */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + nino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + nino = idp->id_ino; + if (nino != pino || + (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); + continue; + } + /* + * Remove us from the in memory list. After this we cannot + * access the inodedep. + */ + idn = TAILQ_NEXT(inodedep, id_unlinked); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + /* + * Determine the next inode number. 
+ */ + nino = 0; + if (idn) { + /* + * If next isn't on the list we can just clear prev's + * state and schedule it to be fixed later. No need + * to synchronously write if we're not in the real + * list. + */ + if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { + idp->id_state &= ~UNLINKNEXT; + if ((idp->id_state & ONWORKLIST) == 0) + WORKLIST_INSERT(&bp->b_dep, + &idp->id_list); + FREE_LOCK(&lk); + bawrite(bp); + ACQUIRE_LOCK(&lk); + return; + } + nino = idn->id_ino; + } + FREE_LOCK(&lk); + /* + * The predecessor's next pointer is manually updated here + * so that the NEXT flag is never cleared for an element + * that is in the list. + */ + if (pino == 0) { + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + } else if (fs->fs_magic == FS_UFS1_MAGIC) + ((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + else + ((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + /* + * If the bwrite fails we have no recourse to recover. The + * filesystem is corrupted already. + */ + bwrite(bp); + ACQUIRE_LOCK(&lk); + /* + * If the superblock pointer still needs to be cleared force + * a write here. + */ + if (fs->fs_sujfree == ino) { + FREE_LOCK(&lk); + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + bwrite(bp); + ACQUIRE_LOCK(&lk); + } + if (fs->fs_sujfree != ino) + return; + panic("clear_unlinked_inodedep: Failed to clear free head"); + } + if (inodedep->id_ino == fs->fs_sujfree) + panic("clear_unlinked_inodedep: Freeing head of free list"); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + return; +} + +/* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ @@ -3584,23 +7357,55 @@ handle_workitem_remove(dirrem, xp) { struct thread *td = curthread; struct inodedep *inodedep; + struct workhead dotdotwk; + struct worklist *wk; + struct ufsmount *ump; + struct mount *mp; struct vnode *vp; struct inode *ip; ino_t oldinum; int error; + if (dirrem->dm_state & ONWORKLIST) + panic("handle_workitem_remove: dirrem %p still on worklist", + dirrem); + oldinum = dirrem->dm_oldinum; + mp = dirrem->dm_list.wk_mp; + ump = VFSTOUFS(mp); if ((vp = xp) == NULL && - (error = ffs_vgetf(dirrem->dm_list.wk_mp, - dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { + (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ)) != 0) { softdep_error("handle_workitem_remove: vget", error); return; } ip = VTOI(vp); ACQUIRE_LOCK(&lk); - if ((inodedep_lookup(dirrem->dm_list.wk_mp, - dirrem->dm_oldinum, 0, &inodedep)) == 0) + if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); + if (dirrem->dm_state & ONDEPLIST) + LIST_REMOVE(dirrem, dm_inonext); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_workitem_remove: Journal entries not written.")); + /* + * Move all dependencies waiting on the remove to complete + * from the dirrem to the inode inowait list to be completed + * after the inode has been updated and written to disk. Any + * marked MKDIR_PARENT are saved to be completed when the .. ref + * is removed. 
+ */ + LIST_INIT(&dotdotwk); + while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { + WORKLIST_REMOVE(wk); + if (wk->wk_state & MKDIR_PARENT) { + wk->wk_state &= ~MKDIR_PARENT; + WORKLIST_INSERT(&dotdotwk, wk); + continue; + } + WORKLIST_INSERT(&inodedep->id_inowait, wk); + } + LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); + /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { @@ -3609,12 +7414,16 @@ handle_workitem_remove(dirrem, xp) ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; num_dirrem -= 1; + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: worklist not empty. %s", + TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); - vput(vp); - return; + goto out; } /* * Directory deletion. Decrement reference count for both the @@ -3628,6 +7437,8 @@ handle_workitem_remove(dirrem, xp) ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(&lk); if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0) @@ -3639,36 +7450,47 @@ handle_workitem_remove(dirrem, xp) * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: DIRCHG and worklist not empty.")); num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); - vput(vp); - return; + goto out; } + dirrem->dm_state = ONDEPLIST; + dirrem->dm_oldinum = dirrem->dm_dirinum; /* - * If the inodedep does not exist, then the zero'ed inode has - * been written to disk. If the allocated inode has never been - * written to disk, then the on-disk inode is zero'ed. In either - * case we can remove the file immediately. + * Place the dirrem on the parent's diremhd list. */ - dirrem->dm_state = 0; - oldinum = dirrem->dm_oldinum; - dirrem->dm_oldinum = dirrem->dm_dirinum; - if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum, - 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { + if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) + panic("handle_workitem_remove: lost dir inodedep"); + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); + /* + * If the allocated inode has never been written to disk, then + * the on-disk inode is zero'ed and we can remove the file + * immediately. When journaling if the inode has been marked + * unlinked and not DEPCOMPLETE we know it can never be written. 
+ */ + inodedep_lookup(mp, oldinum, 0, &inodedep); + if (inodedep == NULL || + (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || + check_inode_unwritten(inodedep)) { if (xp != NULL) - add_to_worklist(&dirrem->dm_list); + add_to_worklist(&dirrem->dm_list, 0); FREE_LOCK(&lk); - vput(vp); - if (xp == NULL) + if (xp == NULL) { + vput(vp); handle_workitem_remove(dirrem, NULL); + } return; } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); ip->i_flag |= IN_CHANGE; +out: ffs_update(vp, 0); - vput(vp); + if (xp == NULL) + vput(vp); } /* @@ -3689,6 +7511,7 @@ static void handle_workitem_freefile(freefile) struct freefile *freefile; { + struct workhead wkhd; struct fs *fs; struct inodedep *idp; struct ufsmount *ump; @@ -3701,13 +7524,15 @@ handle_workitem_freefile(freefile) error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(&lk); if (error) - panic("handle_workitem_freefile: inodedep survived"); + panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); + LIST_INIT(&wkhd); + LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, - freefile->fx_oldinum, freefile->fx_mode)) != 0) + freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefile, D_FREEFILE); @@ -3757,8 +7582,10 @@ softdep_disk_io_initiation(bp) { struct worklist *wk; struct worklist marker; - struct indirdep *indirdep; struct inodedep *inodedep; + struct freeblks *freeblks; + struct jfreeblk *jfreeblk; + struct newblk *newblk; /* * We only care about write operations. There should never @@ -3767,6 +7594,10 @@ softdep_disk_io_initiation(bp) if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("softdep_disk_io_initiation: Writing buffer with " + "background write in progress: %p", bp); + marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ @@ -3792,46 +7623,58 @@ softdep_disk_io_initiation(bp) continue; case D_INDIRDEP: - indirdep = WK_INDIRDEP(wk); - if (indirdep->ir_state & GOINGAWAY) - panic("disk_io_initiation: indirdep gone"); + initiate_write_indirdep(WK_INDIRDEP(wk), bp); + continue; + + case D_BMSAFEMAP: + initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); + continue; + + case D_JSEG: + WK_JSEG(wk)->js_buf = NULL; + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd); /* - * If there are no remaining dependencies, this - * will be writing the real pointers, so the - * dependency can be freed. + * We have to wait for the jfreeblks to be journaled + * before we can write an inodeblock with updated + * pointers. Be careful to arrange the marker so + * we revisit the jfreeblk if it's not removed by + * the first jwait(). 
*/ - if (LIST_EMPTY(&indirdep->ir_deplisthd)) { - struct buf *bp; - - bp = indirdep->ir_savebp; - bp->b_flags |= B_INVAL | B_NOCACHE; - /* inline expand WORKLIST_REMOVE(wk); */ - wk->wk_state &= ~ONWORKLIST; - LIST_REMOVE(wk, wk_list); - WORKITEM_FREE(indirdep, D_INDIRDEP); - FREE_LOCK(&lk); - brelse(bp); - ACQUIRE_LOCK(&lk); - continue; + if (jfreeblk != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&jfreeblk->jf_list); } + continue; + case D_ALLOCDIRECT: + case D_ALLOCINDIR: /* - * Replace up-to-date version with safe version. + * We have to wait for the jnewblk to be journaled + * before we can write to a block otherwise the + * contents may be confused with an earlier file + * at recovery time. Handle the marker as described + * above. */ - FREE_LOCK(&lk); - indirdep->ir_saveddata = malloc(bp->b_bcount, - M_INDIRDEP, M_SOFTDEP_FLAGS); - ACQUIRE_LOCK(&lk); - indirdep->ir_state &= ~ATTACHED; - indirdep->ir_state |= UNDONE; - bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); - bcopy(indirdep->ir_savebp->b_data, bp->b_data, - bp->b_bcount); + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&newblk->nb_jnewblk->jn_list); + } continue; + case D_SBDEP: + initiate_write_sbdep(WK_SBDEP(wk)); + continue; + case D_MKDIR: - case D_BMSAFEMAP: - case D_ALLOCDIRECT: - case D_ALLOCINDIR: + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: continue; default: @@ -3855,6 +7698,9 @@ initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { + struct jremref *jremref; + struct jmvref *jmvref; + struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; @@ -3869,6 +7715,18 @@ initiate_write_filepage(pagedep, bp) return; } pagedep->pd_state |= IOSTARTED; + /* + * Wait for all journal remove dependencies to hit the disk. + * We can not allow any potentially conflicting directory adds + * to be visible before removes and rollback is too difficult. + * lk may be dropped and re-acquired, however we hold the buf + * locked so the dependency can not go away. + */ + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) + while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) + jwait(&jremref->jr_list); + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) + jwait(&jmvref->jm_list); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) @@ -3905,6 +7763,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; + struct inoref *inoref; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS @@ -3918,7 +7777,21 @@ initiate_write_inodeblock_ufs1(inodedep, bp) fs = inodedep->id_fs; dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); + /* + * If we're on the unlinked list but have not yet written our + * next pointer initialize it here. + */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if (inon) + dp->di_freelink = inon->id_ino; + else + dp->di_freelink = 0; + } + /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. 
*/ @@ -3933,6 +7806,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; + dp->di_freelink = inodedep->id_savedino1->di_freelink; return; } /* @@ -3940,32 +7814,40 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; - if (TAILQ_EMPTY(&inodedep->id_inoupdt)) + inodedep->id_savednlink = dp->di_nlink; + if (TAILQ_EMPTY(&inodedep->id_inoupdt) && + TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* + * Revert the link count to that of the first unwritten journal entry. + */ + inoref = TAILQ_FIRST(&inodedep->id_inoreflst); + if (inoref) + dp->di_nlink = inoref->if_nlink; + /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn - NDADDR, - dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -3981,14 +7863,14 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4012,8 +7894,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4030,7 +7912,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * postpone fsck, we are stuck with this argument. 
 */
 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
-		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
+		dp->di_ib[adp->ad_offset - NDADDR] = 0;
 }
 
 /*
@@ -4051,6 +7933,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
 	struct allocdirect *adp, *lastadp;
 	struct ufs2_dinode *dp;
 	struct ufs2_dinode *sip;
+	struct inoref *inoref;
 	struct fs *fs;
 	ufs_lbn_t i;
 #ifdef INVARIANTS
@@ -4064,7 +7947,21 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
 	fs = inodedep->id_fs;
 	dp = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, inodedep->id_ino);
+	/*
+	 * If we're on the unlinked list but have not yet written our
+	 * next pointer initialize it here.
+	 */
+	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+		struct inodedep *inon;
+
+		inon = TAILQ_NEXT(inodedep, id_unlinked);
+		if (inon)
+			dp->di_freelink = inon->id_ino;
+		else
+			dp->di_freelink = 0;
+	}
+
 	/*
 	 * If the bitmap is not yet written, then the allocated
 	 * inode cannot be written to disk.
 	 */
@@ -4079,6 +7976,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
 		*inodedep->id_savedino2 = *dp;
 		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
 		dp->di_gen = inodedep->id_savedino2->di_gen;
+		dp->di_freelink = inodedep->id_savedino2->di_freelink;
 		return;
 	}
 	/*
@@ -4086,25 +7984,38 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = dp->di_extsize;
+	inodedep->id_savednlink = dp->di_nlink;
 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
-	    TAILQ_EMPTY(&inodedep->id_extupdt))
+	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
+	    TAILQ_EMPTY(&inodedep->id_inoreflst))
 		return;
 	/*
+	 * Revert the link count to that of the first unwritten journal entry.
+	 *
+	 * XXX What if it is canceled? Could entries after it be expired
+	 * before we remove this? Thus leaving us with an incorrect link on
+	 * disk with no journal entries to cover it?
+	 */
+	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
+	if (inoref)
+		dp->di_nlink = inoref->if_nlink;
+
+	/*
 	 * Set the ext data dependencies to busy.
*/ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4120,12 +8031,12 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NXADDR; i++) { + dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4142,8 +8053,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; @@ -4154,24 +8065,24 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", - (intmax_t)adp->ad_lbn - NDADDR, - (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4187,14 +8098,14 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = 
TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); @@ -4218,8 +8129,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4236,16 +8147,365 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_lbn - NDADDR] = 0; + dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* + * Cancel an indirdep as a result of truncation. Release all of the + * children allocindirs and place their journal work on the appropriate + * list. + */ +static void +cancel_indirdep(indirdep, bp, inodedep, freeblks) + struct indirdep *indirdep; + struct buf *bp; + struct inodedep *inodedep; + struct freeblks *freeblks; +{ + struct allocindir *aip; + + /* + * None of the indirect pointers will ever be visible, + * so they can simply be tossed. GOINGAWAY ensures + * that allocated pointers will be saved in the buffer + * cache until they are freed. Note that they will + * only be able to be found by their physical address + * since the inode mapping the logical address will + * be gone. The save buffer used for the safe copy + * was allocated in setup_allocindir_phase2 using + * the physical address so it could be used for this + * purpose. Hence we swap the safe copy with the real + * copy, allowing the safe copy to be freed and holding + * on to the real copy for later use in indir_trunc. + */ + if (indirdep->ir_state & GOINGAWAY) + panic("cancel_indirdep: already gone"); + if (indirdep->ir_state & ONDEPLIST) { + indirdep->ir_state &= ~ONDEPLIST; + LIST_REMOVE(indirdep, ir_next); + } + indirdep->ir_state |= GOINGAWAY; + VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; + while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) + cancel_allocindir(aip, inodedep, freeblks); + bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); + WORKLIST_REMOVE(&indirdep->ir_list); + WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); + indirdep->ir_savebp = NULL; +} + +/* + * Free an indirdep once it no longer has new pointers to track. 
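+ *
+ * A sketch of the caller-side test implied by the asserts below:
+ *
+ *	if ((indirdep->ir_state & ONDEPLIST) == 0 &&
+ *	    LIST_EMPTY(&indirdep->ir_deplisthd) &&
+ *	    LIST_EMPTY(&indirdep->ir_donehd) &&
+ *	    LIST_EMPTY(&indirdep->ir_writehd) &&
+ *	    LIST_EMPTY(&indirdep->ir_completehd) &&
+ *	    indirdep->ir_savebp == NULL)
+ *		free_indirdep(indirdep);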
+ */ +static void +free_indirdep(indirdep) + struct indirdep *indirdep; +{ + + KASSERT(LIST_EMPTY(&indirdep->ir_jwork), + ("free_indirdep: Journal work not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_completehd), + ("free_indirdep: Complete head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_writehd), + ("free_indirdep: write head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_donehd), + ("free_indirdep: done head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), + ("free_indirdep: deplist head not empty.")); + KASSERT(indirdep->ir_savebp == NULL, + ("free_indirdep: %p ir_savebp != NULL", indirdep)); + KASSERT((indirdep->ir_state & ONDEPLIST) == 0, + ("free_indirdep: %p still on deplist.", indirdep)); + if (indirdep->ir_state & ONWORKLIST) + WORKLIST_REMOVE(&indirdep->ir_list); + WORKITEM_FREE(indirdep, D_INDIRDEP); +} + +/* + * Called before a write to an indirdep. This routine is responsible for + * rolling back pointers to a safe state which includes only those + * allocindirs which have been completed. + */ +static void +initiate_write_indirdep(indirdep, bp) + struct indirdep *indirdep; + struct buf *bp; +{ + + if (indirdep->ir_state & GOINGAWAY) + panic("disk_io_initiation: indirdep gone"); + + /* + * If there are no remaining dependencies, this will be writing + * the real pointers. + */ + if (LIST_EMPTY(&indirdep->ir_deplisthd)) + return; + /* + * Replace up-to-date version with safe version. + */ + FREE_LOCK(&lk); + indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, + M_SOFTDEP_FLAGS); + ACQUIRE_LOCK(&lk); + indirdep->ir_state &= ~ATTACHED; + indirdep->ir_state |= UNDONE; + bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); + bcopy(indirdep->ir_savebp->b_data, bp->b_data, + bp->b_bcount); +} + +/* + * Called when an inode has been cleared in a cg bitmap. This finally + * eliminates any canceled jaddrefs + */ +void +softdep_setup_inofree(mp, bp, ino, wkhd) + struct mount *mp; + struct buf *bp; + ino_t ino; + struct workhead *wkhd; +{ + struct worklist *wk, *wkn; + struct bmsafemap *bmsafemap; + struct inodedep *inodedep; + uint8_t *inosused; + struct cg *cgp; + struct fs *fs; + + ACQUIRE_LOCK(&lk); + fs = VFSTOUFS(mp)->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, ino)); + cgp = (struct cg *)bp->b_data; + inosused = cg_inosused(cgp); + if (isset(inosused, ino % fs->fs_ipg)) + panic("softdep_setup_inofree: inode %d not freed.", ino); + if (inodedep_lookup(mp, ino, 0, &inodedep)) + panic("softdep_setup_inofree: ino %d has existing inodedep %p", + ino, inodedep); + if (wkhd) { /* XXX Temporary. */ + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JADDREF) + continue; + WORKLIST_REMOVE(wk); + /* + * We can free immediately even if the jaddref isn't attached + * in a background write as now the bitmaps are reconciled. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jaddref(WK_JADDREF(wk)); + } + jwork_move(&bp->b_dep, wkhd); + } + FREE_LOCK(&lk); +} + + +/* + * Called via ffs_blkfree() after a set of frags has been cleared from a cg + * map. Any dependencies waiting for the write to clear are added to the + * buf's list and any jnewblks that are being canceled are discarded + * immediately. 
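+ *
+ * The expected call pattern is roughly (a sketch; the caller holds
+ * the locked cg buffer and has already cleared the frags in
+ * cg_blksfree(), and the bdwrite() here is only illustrative):
+ *
+ *	clear the frags in the cg map;
+ *	softdep_setup_blkfree(mp, bp, blkno, frags, wkhd);
+ *	bdwrite(bp);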
+ */ +void +softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) + struct mount *mp; + struct buf *bp; + ufs2_daddr_t blkno; + int frags; + struct workhead *wkhd; +{ + struct bmsafemap *bmsafemap; + struct jnewblk *jnewblk; + struct worklist *wk, *wkn; + struct fs *fs; +#ifdef SUJ_DEBUG + uint8_t *blksfree; + struct cg *cgp; + ufs2_daddr_t jstart; + ufs2_daddr_t jend; + ufs2_daddr_t end; + long bno; + int i; +#endif + + if ((mp->mnt_flag & FS_SUJ) == 0) + return; + ACQUIRE_LOCK(&lk); + fs = VFSTOUFS(mp)->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); + /* + * Detach any jnewblks which have been canceled. They must linger + * until the bitmap is cleared again by ffs_blkfree() to prevent + * an unjournaled allocation from hitting the disk. + */ + if (wkhd) { + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JNEWBLK) + continue; + jnewblk = WK_JNEWBLK(wk); + KASSERT(jnewblk->jn_state & GOINGAWAY, + ("softdep_setup_blkfree: jnewblk not canceled.")); + WORKLIST_REMOVE(wk); +#ifdef SUJ_DEBUG + /* + * Assert that this block is free in the bitmap + * before we discard the jnewblk. + */ + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; + i < jnewblk->jn_frags; i++) + if (isclr(blksfree, bno + i) == 0) + continue; + panic("softdep_setup_blkfree: not free"); +#endif + /* + * Even if it's not attached we can free immediately + * as the new bitmap is correct. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jnewblk(jnewblk); + } + /* + * The buf must be locked by the caller otherwise these could + * be added while it's being written and the write would + * complete them before they made it to disk. + */ + jwork_move(&bp->b_dep, wkhd); + } + +#ifdef SUJ_DEBUG + /* + * Assert that we are not freeing a block which has an outstanding + * allocation dependency. + */ + fs = VFSTOUFS(mp)->um_fs; + end = blkno + frags; + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + /* + * Don't match against blocks that will be freed when the + * background write is done. + */ + if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == + (COMPLETE | DEPCOMPLETE)) + continue; + jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; + jend = jnewblk->jn_blkno + jnewblk->jn_frags; + if ((blkno >= jstart && blkno < jend) || + (end > jstart && end <= jend)) { + printf("state 0x%X %jd - %d %d dep %p\n", + jnewblk->jn_state, jnewblk->jn_blkno, + jnewblk->jn_oldfrags, jnewblk->jn_frags, + jnewblk->jn_newblk); + panic("softdep_setup_blkfree: " + "%jd-%jd(%d) overlaps with %jd-%jd", + blkno, end, frags, jstart, jend); + } + } +#endif + FREE_LOCK(&lk); +} + +static void +initiate_write_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; /* The cg block. */ +{ + struct jaddref *jaddref; + struct jnewblk *jnewblk; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + int cleared; + ino_t ino; + long bno; + int i; + + if (bmsafemap->sm_state & IOSTARTED) + panic("initiate_write_bmsafemap: Already started\n"); + bmsafemap->sm_state |= IOSTARTED; + /* + * Clear any inode allocations which are pending journal writes. 
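+	 * The clrbit() performed here is the inverse of the
+	 * setbit() that handle_written_bmsafemap() applies once
+	 * the jaddref has reached the journal, so the on-disk cg
+	 * never claims an inode whose journal record might still
+	 * be lost in a crash.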
+ */ + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { + ino = jaddref->ja_ino % fs->fs_ipg; + /* + * If this is a background copy the inode may not + * be marked used yet. + */ + if (isset(inosused, ino)) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir--; + cgp->cg_cs.cs_nifree++; + clrbit(inosused, ino); + jaddref->ja_state &= ~ATTACHED; + jaddref->ja_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: inode %d " + "marked free", jaddref->ja_ino); + } + } + /* + * Clear any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + bno = dtogd(fs, jnewblk->jn_blkno); + cleared = 0; + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (isclr(blksfree, bno + i)) { + cleared = 1; + setbit(blksfree, bno + i); + } + } + /* + * We may not clear the block if it's a background + * copy. In that case there is no reason to detach + * it. + */ + if (cleared) { + jnewblk->jn_state &= ~ATTACHED; + jnewblk->jn_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: block %jd " + "marked free", jnewblk->jn_blkno); + } + } + /* + * Move allocation lists to the written lists so they can be + * cleared once the block write is complete. + */ + LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, + inodedep, id_deps); + LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, + newblk, nb_deps); +} + +/* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. + * */ static void softdep_disk_write_complete(bp) @@ -4254,12 +8514,7 @@ softdep_disk_write_complete(bp) struct worklist *wk; struct worklist *owk; struct workhead reattach; - struct newblk *newblk; - struct allocindir *aip; - struct allocdirect *adp; - struct indirdep *indirdep; - struct inodedep *inodedep; - struct bmsafemap *bmsafemap; + struct buf *sbp; /* * If an error occurred while doing the write, then the data @@ -4271,8 +8526,9 @@ softdep_disk_write_complete(bp) /* * This lock must not be released anywhere in this code segment. 
 */
+	sbp = NULL;
+	owk = NULL;
 	ACQUIRE_LOCK(&lk);
-	owk = NULL;
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		if (wk == owk)
@@ -4291,33 +8547,8 @@
 			continue;
 
 		case D_BMSAFEMAP:
-			bmsafemap = WK_BMSAFEMAP(wk);
-			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
-				newblk->nb_state |= DEPCOMPLETE;
-				newblk->nb_bmsafemap = NULL;
-				LIST_REMOVE(newblk, nb_deps);
-			}
-			while ((adp =
-			    LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
-				adp->ad_state |= DEPCOMPLETE;
-				adp->ad_buf = NULL;
-				LIST_REMOVE(adp, ad_deps);
-				handle_allocdirect_partdone(adp);
-			}
-			while ((aip =
-			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
-				aip->ai_state |= DEPCOMPLETE;
-				aip->ai_buf = NULL;
-				LIST_REMOVE(aip, ai_deps);
-				handle_allocindir_partdone(aip);
-			}
-			while ((inodedep =
-			    LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
-				inodedep->id_state |= DEPCOMPLETE;
-				LIST_REMOVE(inodedep, id_deps);
-				inodedep->id_buf = NULL;
-			}
-			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
+				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_MKDIR:
@@ -4325,37 +8556,47 @@
 			continue;
 
 		case D_ALLOCDIRECT:
-			adp = WK_ALLOCDIRECT(wk);
-			adp->ad_state |= COMPLETE;
-			handle_allocdirect_partdone(adp);
+			wk->wk_state |= COMPLETE;
+			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
 			continue;
 
 		case D_ALLOCINDIR:
-			aip = WK_ALLOCINDIR(wk);
-			aip->ai_state |= COMPLETE;
-			handle_allocindir_partdone(aip);
+			wk->wk_state |= COMPLETE;
+			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
 			continue;
 
 		case D_INDIRDEP:
-			indirdep = WK_INDIRDEP(wk);
-			if (indirdep->ir_state & GOINGAWAY)
-				panic("disk_write_complete: indirdep gone");
-			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
-			free(indirdep->ir_saveddata, M_INDIRDEP);
-			indirdep->ir_saveddata = 0;
-			indirdep->ir_state &= ~UNDONE;
-			indirdep->ir_state |= ATTACHED;
-			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
-				handle_allocindir_partdone(aip);
-				if (aip == LIST_FIRST(&indirdep->ir_donehd))
-					panic("disk_write_complete: not gone");
-			}
-			WORKLIST_INSERT(&reattach, wk);
-			if ((bp->b_flags & B_DELWRI) == 0)
-				stat_indir_blk_ptrs++;
-			bdirty(bp);
+			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
+				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
+		case D_FREEBLKS:
+			wk->wk_state |= COMPLETE;
+			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
+				add_to_worklist(wk, 1);
+			continue;
+
+		case D_FREEWORK:
+			handle_written_freework(WK_FREEWORK(wk));
+			break;
+
+		case D_FREEDEP:
+			free_freedep(WK_FREEDEP(wk));
+			continue;
+
+		case D_JSEGDEP:
+			free_jsegdep(WK_JSEGDEP(wk));
+			continue;
+
+		case D_JSEG:
+			handle_written_jseg(WK_JSEG(wk), bp);
+			continue;
+
+		case D_SBDEP:
+			if (handle_written_sbdep(WK_SBDEP(wk), bp))
+				WORKLIST_INSERT(&reattach, wk);
+			continue;
+
 		default:
 			panic("handle_disk_write_complete: Unknown type %s",
 			    TYPENAME(wk->wk_type));
@@ -4370,6 +8611,8 @@
 		WORKLIST_INSERT(&bp->b_dep, wk);
 	}
 	FREE_LOCK(&lk);
+	if (sbp)
+		brelse(sbp);
 }
 
 /*
@@ -4378,18 +8621,17 @@
 * splbio interrupts blocked.
 */
static void
-handle_allocdirect_partdone(adp)
+handle_allocdirect_partdone(adp, wkhd)
 	struct allocdirect *adp;	/* the completed allocdirect */
+	struct workhead *wkhd;		/* Work to do when inode is written. */
{
 	struct allocdirectlst *listhead;
 	struct allocdirect *listadp;
 	struct inodedep *inodedep;
-	long bsize, delay;
+	long bsize;
 
 	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
-	if (adp->ad_buf != NULL)
-		panic("handle_allocdirect_partdone: dangling dep");
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
@@ -4439,25 +8681,27 @@ static void
 		return;
 	}
 	/*
-	 * If we have found the just finished dependency, then free
+	 * If we have found the just finished dependency, then queue
 	 * it along with anything that follows it that is complete.
-	 * If the inode still has a bitmap dependency, then it has
-	 * never been written to disk, hence the on-disk inode cannot
-	 * reference the old fragment so we can free it without delay.
+	 * Since the pointer has not yet been written in the inode
+	 * as the dependency prevents it, place the allocdirect on the
+	 * bufwait list where it will be freed once the pointer is
+	 * valid.
 	 */
-	delay = (inodedep->id_state & DEPCOMPLETE);
+	if (wkhd == NULL)
+		wkhd = &inodedep->id_bufwait;
 	for (; adp; adp = listadp) {
 		listadp = TAILQ_NEXT(adp, ad_next);
 		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 			return;
-		free_allocdirect(listhead, adp, delay);
+		TAILQ_REMOVE(listhead, adp, ad_next);
+		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
 	}
 }
 
 /*
- * Called from within softdep_disk_write_complete above. Note that
- * this routine is always called from interrupt level with further
- * splbio interrupts blocked.
+ * Called from within softdep_disk_write_complete above. This routine
+ * completes successfully written allocindirs.
 */
static void
handle_allocindir_partdone(aip)
@@ -4467,11 +8711,9 @@ handle_allocindir_partdone(aip)
 
 	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
-	if (aip->ai_buf != NULL)
-		panic("handle_allocindir_partdone: dangling dependency");
 	indirdep = aip->ai_indirdep;
+	LIST_REMOVE(aip, ai_next);
 	if (indirdep->ir_state & UNDONE) {
-		LIST_REMOVE(aip, ai_next);
 		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
 		return;
 	}
@@ -4481,13 +8723,130 @@ handle_allocindir_partdone(aip)
 	else
 		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 		    aip->ai_newblkno;
-	LIST_REMOVE(aip, ai_next);
-	if (aip->ai_freefrag != NULL)
-		add_to_worklist(&aip->ai_freefrag->ff_list);
-	WORKITEM_FREE(aip, D_ALLOCINDIR);
+	/*
+	 * Await the pointer write before freeing the allocindir.
+	 */
+	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
}
 
/*
+ * Release segments held on a jwork list.
+ */
+static void
+handle_jwork(wkhd)
+	struct workhead *wkhd;
+{
+	struct worklist *wk;
+
+	while ((wk = LIST_FIRST(wkhd)) != NULL) {
+		WORKLIST_REMOVE(wk);
+		switch (wk->wk_type) {
+		case D_JSEGDEP:
+			free_jsegdep(WK_JSEGDEP(wk));
+			continue;
+		default:
+			panic("handle_jwork: Unknown type %s\n",
+			    TYPENAME(wk->wk_type));
+		}
+	}
+}
+
+/*
+ * Handle the bufwait list on an inode when it is safe to release items
+ * held there. This normally happens after an inode block is written but
+ * may be delayed and handled later if there are pending journal items that
+ * are not yet safe to be released.
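+ *
+ * Expected usage, roughly (for a removed file the journal
+ * references are collected on refhd instead of freed):
+ *
+ *	freefile = handle_bufwait(inodedep, refhd);
+ *	if (freefile != NULL)
+ *		add_to_worklist(&freefile->fx_list, 0);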
+ */ +static struct freefile * +handle_bufwait(inodedep, refhd) + struct inodedep *inodedep; + struct workhead *refhd; +{ + struct jaddref *jaddref; + struct freefile *freefile; + struct worklist *wk; + + freefile = NULL; + while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_FREEFILE: + /* + * We defer adding freefile to the worklist + * until all other additions have been made to + * ensure that it will be done after all the + * old blocks have been freed. + */ + if (freefile != NULL) + panic("handle_bufwait: freefile"); + freefile = WK_FREEFILE(wk); + continue; + + case D_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); + continue; + + case D_DIRADD: + diradd_inode_written(WK_DIRADD(wk), inodedep); + continue; + + case D_FREEFRAG: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(wk, 0); + continue; + + case D_DIRREM: + wk->wk_state |= COMPLETE; + add_to_worklist(wk, 0); + continue; + + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + free_newblk(WK_NEWBLK(wk)); + continue; + + case D_JNEWBLK: + wk->wk_state |= COMPLETE; + free_jnewblk(WK_JNEWBLK(wk)); + continue; + + /* + * Save freed journal segments and add references on + * the supplied list which will delay their release + * until the cg bitmap is cleared on disk. + */ + case D_JSEGDEP: + if (refhd == NULL) + free_jsegdep(WK_JSEGDEP(wk)); + else + WORKLIST_INSERT(refhd, wk); + continue; + + case D_JADDREF: + jaddref = WK_JADDREF(wk); + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + /* + * Transfer any jaddrefs to the list to be freed with + * the bitmap if we're handling a removed file. + */ + if (refhd == NULL) { + wk->wk_state |= COMPLETE; + free_jaddref(jaddref); + } else + WORKLIST_INSERT(refhd, wk); + continue; + + default: + panic("handle_bufwait: Unknown type %p(%s)", + wk, TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + return (freefile); +} +/* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further @@ -4498,12 +8857,17 @@ handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { - struct worklist *wk, *filefree; + struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; + struct workhead wkhd; int hadchanges, fstype; + ino_t freelink; + LIST_INIT(&wkhd); + hadchanges = 0; + freefile = NULL; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; @@ -4511,12 +8875,30 @@ handle_written_inodeblock(inodedep, bp) fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp1->di_freelink; } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp2->di_freelink; } /* + * If we wrote a freelink pointer during the last write record it + * here. If we did not, keep the buffer dirty until we do. 
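+	 * The acceptance test below is, in effect:
+	 *
+	 *	inon = TAILQ_NEXT(inodedep, id_unlinked);
+	 *	freelink == (inon ? inon->id_ino : 0)
+	 *
+	 * and only a matching pointer may set UNLINKNEXT.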
+ */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if ((inon == NULL && freelink == 0) || + (inon && inon->id_ino == freelink)) { + if (inon) + inon->id_state |= UNLINKPREV; + inodedep->id_state |= UNLINKNEXT; + } else + hadchanges = 1; + } + /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until @@ -4524,6 +8906,7 @@ handle_written_inodeblock(inodedep, bp) * corresponding updates written to disk. */ if (inodedep->id_savedino1 != NULL) { + hadchanges = 1; if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else @@ -4533,6 +8916,13 @@ handle_written_inodeblock(inodedep, bp) if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); + /* + * If the inode is clear here and GOINGAWAY it will never + * be written. Process the bufwait and clear any pending + * work which may include the freefile. + */ + if (inodedep->id_state & GOINGAWAY) + goto bufwait; return (1); } inodedep->id_state |= COMPLETE; @@ -4540,50 +8930,49 @@ handle_written_inodeblock(inodedep, bp) * Roll forward anything that had to be rolled back before * the inode could be updated. */ - hadchanges = 0; for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { - if (adp->ad_lbn < NDADDR) { - if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", - (intmax_t)adp->ad_lbn, - dp1->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp1->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, - dp1->di_ib[adp->ad_lbn - NDADDR]); - dp1->di_ib[adp->ad_lbn - NDADDR] = + (intmax_t)adp->ad_offset - NDADDR, + dp1->di_ib[adp->ad_offset - NDADDR]); + dp1->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } else { - if (adp->ad_lbn < NDADDR) { - if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, + (intmax_t)adp->ad_offset - NDADDR, (intmax_t) - dp2->di_ib[adp->ad_lbn - NDADDR]); - dp2->di_ib[adp->ad_lbn - NDADDR] = + dp2->di_ib[adp->ad_offset - NDADDR]); + dp2->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } @@ -4595,13 +8984,13 @@ handle_written_inodeblock(inodedep, bp) nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) 
panic("handle_written_inodeblock: new entry"); - if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) + if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; @@ -4613,12 +9002,23 @@ handle_written_inodeblock(inodedep, bp) */ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); + if (inodedep->id_savednlink > LINK_MAX) + panic("handle_written_inodeblock: Invalid link count " + "%d for inodedep %p", inodedep->id_savednlink, inodedep); if (fstype == UFS1) { + if (dp1->di_nlink != inodedep->id_savednlink) { + dp1->di_nlink = inodedep->id_savednlink; + hadchanges = 1; + } if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { + if (dp2->di_nlink != inodedep->id_savednlink) { + dp2->di_nlink = inodedep->id_savednlink; + hadchanges = 1; + } if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; @@ -4630,6 +9030,7 @@ handle_written_inodeblock(inodedep, bp) } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; + inodedep->id_savednlink = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in @@ -4637,69 +9038,49 @@ handle_written_inodeblock(inodedep, bp) */ if (hadchanges) bdirty(bp); +bufwait: /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples - * before the old ones have been deleted. + * before the old ones have been deleted. Completely + * unlinked inodes are not processed until the unlinked + * inode list is written or the last reference is removed. */ - filefree = NULL; - while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { - WORKLIST_REMOVE(wk); - switch (wk->wk_type) { - - case D_FREEFILE: - /* - * We defer adding filefree to the worklist until - * all other additions have been made to ensure - * that it will be done after all the old blocks - * have been freed. 
-			 */
-			if (filefree != NULL)
-				panic("handle_written_inodeblock: filefree");
-			filefree = wk;
-			continue;
-
-		case D_MKDIR:
-			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
-			continue;
-
-		case D_DIRADD:
-			diradd_inode_written(WK_DIRADD(wk), inodedep);
-			continue;
-
-		case D_FREEBLKS:
-			wk->wk_state |= COMPLETE;
-			if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
-				continue;
-			/* -- fall through -- */
-		case D_FREEFRAG:
-		case D_DIRREM:
-			add_to_worklist(wk);
-			continue;
-
-		case D_NEWDIRBLK:
-			free_newdirblk(WK_NEWDIRBLK(wk));
-			continue;
-
-		default:
-			panic("handle_written_inodeblock: Unknown type %s",
-			    TYPENAME(wk->wk_type));
-			/* NOTREACHED */
+	if ((inodedep->id_state & UNLINKED) == 0) {
+		freefile = handle_bufwait(inodedep, NULL);
+		if (freefile && !LIST_EMPTY(&wkhd)) {
+			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
+			freefile = NULL;
 		}
 	}
-	if (filefree != NULL) {
+	/*
+	 * Move rolled forward dependency completions to the bufwait list
+	 * now that those that were already written have been processed.
+	 */
+	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
+		panic("handle_written_inodeblock: bufwait but no changes");
+	jwork_move(&inodedep->id_bufwait, &wkhd);
+
+	if (freefile != NULL) {
+		/*
+		 * If the inode is goingaway it was never written. Fake up
+		 * the state here so free_inodedep() can succeed.
+		 */
+		if (inodedep->id_state & GOINGAWAY)
+			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
 		if (free_inodedep(inodedep) == 0)
-			panic("handle_written_inodeblock: live inodedep");
-		add_to_worklist(filefree);
+			panic("handle_written_inodeblock: live inodedep %p",
+			    inodedep);
+		add_to_worklist(&freefile->fx_list, 0);
 		return (0);
 	}
 
@@ -4707,12 +9088,101 @@
 	 * If no outstanding dependencies, free it.
 	 */
 	if (free_inodedep(inodedep) ||
-	    (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
-	     TAILQ_FIRST(&inodedep->id_extupdt) == 0))
+	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
+	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
+	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
+	     LIST_FIRST(&inodedep->id_bufwait) == 0))
 		return (0);
 	return (hadchanges);
}
 
+static int
+handle_written_indirdep(indirdep, bp, bpp)
+	struct indirdep *indirdep;
+	struct buf *bp;
+	struct buf **bpp;
+{
+	struct allocindir *aip;
+	int chgs;
+
+	if (indirdep->ir_state & GOINGAWAY)
+		panic("handle_written_indirdep: indirdep gone");
+	chgs = 0;
+	/*
+	 * If there were rollbacks revert them here.
+	 */
+	if (indirdep->ir_saveddata) {
+		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
+		free(indirdep->ir_saveddata, M_INDIRDEP);
+		indirdep->ir_saveddata = 0;
+		chgs = 1;
+	}
+	indirdep->ir_state &= ~UNDONE;
+	indirdep->ir_state |= ATTACHED;
+	/*
+	 * Move allocindirs with written pointers to the completehd if
+	 * the indirdep's pointer is not yet written. Otherwise
+	 * free them here.
+	 */
+	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
+		LIST_REMOVE(aip, ai_next);
+		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
+			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
+			    ai_next);
+			continue;
+		}
+		free_newblk(&aip->ai_block);
+	}
+	/*
+	 * Move allocindirs that have finished dependency processing from
+	 * the done list to the write list after updating the pointers.
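+	 * The allocindir lists form a pipeline, roughly:
+	 *
+	 *	ir_deplisthd -> ir_donehd -> ir_writehd ->
+	 *	    ir_completehd -> free_newblk()
+	 *
+	 * advancing one stage per write of the indirect block.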
+	 */
+	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
+		handle_allocindir_partdone(aip);
+		if (aip == LIST_FIRST(&indirdep->ir_donehd))
+			panic("handle_written_indirdep: not gone");
+		chgs = 1;
+	}
+	/*
+	 * If this indirdep has been detached from its newblk during
+	 * I/O we need to keep this dep attached to the buffer so
+	 * deallocate_dependencies can find it and properly resolve
+	 * any outstanding dependencies.
+	 */
+	if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
+		chgs = 1;
+	if ((bp->b_flags & B_DELWRI) == 0)
+		stat_indir_blk_ptrs++;
+	/*
+	 * If there were no changes we can discard the savedbp and detach
+	 * ourselves from the buf. We are only carrying completed pointers
+	 * in this case.
+	 */
+	if (chgs == 0) {
+		struct buf *sbp;
+
+		sbp = indirdep->ir_savebp;
+		sbp->b_flags |= B_INVAL | B_NOCACHE;
+		indirdep->ir_savebp = NULL;
+		if (*bpp != NULL)
+			panic("handle_written_indirdep: bp already exists.");
+		*bpp = sbp;
+	} else
+		bdirty(bp);
+	/*
+	 * If there are no fresh dependencies and none waiting on writes
+	 * we can free the indirdep.
+	 */
+	if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
+		if (indirdep->ir_state & ONDEPLIST)
+			LIST_REMOVE(indirdep, ir_next);
+		free_indirdep(indirdep);
+		return (0);
+	}
+
+	return (chgs);
+}
+
 /*
 * Process a diradd entry after its dependent inode has been written.
 * This routine must be called with splbio interrupts blocked.
@@ -4722,50 +9192,200 @@ diradd_inode_written(dap, inodedep)
 	struct diradd *dap;
 	struct inodedep *inodedep;
{
-	struct pagedep *pagedep;
 
 	dap->da_state |= COMPLETE;
-	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
-		if (dap->da_state & DIRCHG)
-			pagedep = dap->da_previous->dm_pagedep;
-		else
-			pagedep = dap->da_pagedep;
-		LIST_REMOVE(dap, da_pdlist);
-		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
-	}
+	complete_diradd(dap);
 	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
 
/*
- * Handle the completion of a mkdir dependency.
+ * Returns true if the bmsafemap will have rollbacks when written. Must
+ * only be called with lk and the buf lock on the cg held.
 */
+static int
+bmsafemap_rollbacks(bmsafemap)
+	struct bmsafemap *bmsafemap;
+{
+
+	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
+	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
+}
+
+/*
+ * Complete a write to a bmsafemap structure. Roll forward any bitmap
+ * changes if it's not a background write. Set all written dependencies
+ * to DEPCOMPLETE and free the structure if possible.
+ */
+static int
+handle_written_bmsafemap(bmsafemap, bp)
+	struct bmsafemap *bmsafemap;
+	struct buf *bp;
+{
+	struct newblk *newblk;
+	struct inodedep *inodedep;
+	struct jaddref *jaddref, *jatmp;
+	struct jnewblk *jnewblk, *jntmp;
+	uint8_t *inosused;
+	uint8_t *blksfree;
+	struct cg *cgp;
+	struct fs *fs;
+	ino_t ino;
+	long bno;
+	int chgs;
+	int i;
+
+	if ((bmsafemap->sm_state & IOSTARTED) == 0)
+		panic("handle_written_bmsafemap: Not started\n");
+	chgs = 0;
+	bmsafemap->sm_state &= ~IOSTARTED;
+	/*
+	 * Restore unwritten inode allocation pending jaddref writes.
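+	 * This is the roll forward matching the rollback in
+	 * initiate_write_bmsafemap(), in effect:
+	 *
+	 *	setbit(inosused, ino);
+	 *	jaddref->ja_state |= ATTACHED;
+	 *	free_jaddref(jaddref);
+	 *
+	 * with chgs set so the buffer is redirtied and the restored
+	 * bits reach the disk.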
+ */ + if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, + ja_bmdeps, jatmp) { + if ((jaddref->ja_state & UNDONE) == 0) + continue; + ino = jaddref->ja_ino % fs->fs_ipg; + if (isset(inosused, ino)) + panic("handle_written_bmsafemap: " + "re-allocated inode"); + if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir++; + cgp->cg_cs.cs_nifree--; + setbit(inosused, ino); + chgs = 1; + } + jaddref->ja_state &= ~UNDONE; + jaddref->ja_state |= ATTACHED; + free_jaddref(jaddref); + } + } + /* + * Restore any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, + jntmp) { + if ((jnewblk->jn_state & UNDONE) == 0) + continue; + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (bp->b_xflags & BX_BKGRDMARKER) + break; + if ((jnewblk->jn_state & NEWBLOCK) == 0 && + isclr(blksfree, bno + i)) + panic("handle_written_bmsafemap: " + "re-allocated fragment"); + clrbit(blksfree, bno + i); + chgs = 1; + } + jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); + jnewblk->jn_state |= ATTACHED; + free_jnewblk(jnewblk); + } + } + while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { + newblk->nb_state |= DEPCOMPLETE; + newblk->nb_state &= ~ONDEPLIST; + newblk->nb_bmsafemap = NULL; + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_list.wk_type == D_ALLOCDIRECT) + handle_allocdirect_partdone( + WK_ALLOCDIRECT(&newblk->nb_list), NULL); + else if (newblk->nb_list.wk_type == D_ALLOCINDIR) + handle_allocindir_partdone( + WK_ALLOCINDIR(&newblk->nb_list)); + else if (newblk->nb_list.wk_type != D_NEWBLK) + panic("handle_written_bmsafemap: Unexpected type: %s", + TYPENAME(newblk->nb_list.wk_type)); + } + while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { + inodedep->id_state |= DEPCOMPLETE; + inodedep->id_state &= ~ONDEPLIST; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_bmsafemap = NULL; + } + if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && + LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && + LIST_EMPTY(&bmsafemap->sm_newblkhd) && + LIST_EMPTY(&bmsafemap->sm_inodedephd)) { + if (chgs) + bdirty(bp); + LIST_REMOVE(bmsafemap, sm_hash); + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (0); + } + bdirty(bp); + return (1); +} + +/* + * Try to free a mkdir dependency. 
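+ *
+ * Both halves must be complete first, i.e. (sketch):
+ *
+ *	(mkdir->md_state & ALLCOMPLETE) == ALLCOMPLETE
+ *
+ * where MKDIR_PARENT tracks the parent inode write and
+ * MKDIR_BODY the write of the new directory block.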
+ */ static void -handle_written_mkdir(mkdir, type) +complete_mkdir(mkdir) struct mkdir *mkdir; - int type; { struct diradd *dap; - struct pagedep *pagedep; - if (mkdir->md_state != type) - panic("handle_written_mkdir: bad type"); + if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; - dap->da_state &= ~type; - if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + complete_diradd(dap); } - LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* + * Handle the completion of a mkdir dependency. + */ +static void +handle_written_mkdir(mkdir, type) + struct mkdir *mkdir; + int type; +{ + + if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) + panic("handle_written_mkdir: bad type"); + mkdir->md_state |= COMPLETE; + complete_mkdir(mkdir); +} + +static void +free_pagedep(pagedep) + struct pagedep *pagedep; +{ + int i; + + if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) + return; + for (i = 0; i < DAHASHSZ; i++) + if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) + return; + if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) + return; + if (!LIST_EMPTY(&pagedep->pd_dirremhd)) + return; + if (!LIST_EMPTY(&pagedep->pd_pendinghd)) + return; + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, D_PAGEDEP); +} + +/* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. @@ -4790,8 +9410,11 @@ handle_written_filepage(pagedep, bp) */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); + dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_written_filepage: Journal entries not written.")); + add_to_worklist(&dirrem->dm_list, 0); } /* * Free any directory additions that have been committed. @@ -4800,7 +9423,7 @@ handle_written_filepage(pagedep, bp) */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * Uncommitted directory entries must be restored. */ @@ -4845,7 +9468,8 @@ handle_written_filepage(pagedep, bp) * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. 
*/ - if ((pagedep->pd_state & NEWBLOCK) == 0) { + if ((pagedep->pd_state & NEWBLOCK) == 0 && + LIST_EMPTY(&pagedep->pd_jmvrefhd)) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } @@ -4880,8 +9504,8 @@ softdep_load_inodeblock(ip) */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(&lk); - if (inodedep_lookup(UFSTOVFS(ip->i_ump), - ip->i_number, 0, &inodedep) == 0) { + if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, + &inodedep) == 0) { FREE_LOCK(&lk); return; } @@ -4908,6 +9532,7 @@ softdep_update_inodeblock(ip, bp, waitfor) int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; + struct inoref *inoref; struct worklist *wk; struct mount *mp; struct buf *ibp; @@ -4922,6 +9547,7 @@ softdep_update_inodeblock(ip, bp, waitfor) */ mp = UFSTOVFS(ip->i_ump); ACQUIRE_LOCK(&lk); +again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); if (ip->i_effnlink != ip->i_nlink) @@ -4931,6 +9557,19 @@ softdep_update_inodeblock(ip, bp, waitfor) if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* + * If we're flushing all dependencies we must also move any waiting + * for journal writes onto the bufwait list prior to I/O. + */ + if (waitfor) { + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto again; + } + } + } + /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ @@ -4945,10 +9584,12 @@ softdep_update_inodeblock(ip, bp, waitfor) */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), + NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), + NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk @@ -4971,11 +9612,11 @@ softdep_update_inodeblock(ip, bp, waitfor) return; } retry: - if ((inodedep->id_state & DEPCOMPLETE) != 0) { + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { FREE_LOCK(&lk); return; } - ibp = inodedep->id_buf; + ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, &lk, MNT_WAIT); if (ibp == NULL) { /* @@ -5007,13 +9648,13 @@ merge_inode_lists(newlisthead, oldlisthead) newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { - if (listadp->ad_lbn < newadp->ad_lbn) { + if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); - if (listadp->ad_lbn == newadp->ad_lbn) { + if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; @@ -5036,6 +9677,7 @@ softdep_fsync(vp) { struct inodedep *inodedep; struct pagedep *pagedep; + struct inoref *inoref; struct worklist *wk; struct diradd *dap; struct mount *mp; @@ -5052,17 +9694,24 @@ softdep_fsync(vp) fs = ip->i_fs; mp = vp->v_mount; ACQUIRE_LOCK(&lk); +restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return 
(0); } + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (!LIST_EMPTY(&inodedep->id_inowait) || - !LIST_EMPTY(&inodedep->id_bufwait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) - panic("softdep_fsync: pending ops"); + panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; @@ -5254,8 +9903,8 @@ int softdep_sync_metadata(struct vnode *vp) { struct pagedep *pagedep; - struct allocdirect *adp; struct allocindir *aip; + struct newblk *newblk; struct buf *bp, *nbp; struct worklist *wk; struct bufobj *bo; @@ -5319,27 +9968,15 @@ loop: switch (wk->wk_type) { case D_ALLOCDIRECT: - adp = WK_ALLOCDIRECT(wk); - if (adp->ad_state & DEPCOMPLETE) - continue; - nbp = adp->ad_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; + case D_ALLOCINDIR: + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + goto restart; } - ACQUIRE_LOCK(&lk); - continue; - - case D_ALLOCINDIR: - aip = WK_ALLOCINDIR(wk); - if (aip->ai_state & DEPCOMPLETE) + if (newblk->nb_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; @@ -5355,10 +9992,16 @@ loop: case D_INDIRDEP: restart: - LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { - if (aip->ai_state & DEPCOMPLETE) + LIST_FOREACH(aip, + &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { + newblk = (struct newblk *)aip; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + goto restart; + } + if (newblk->nb_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, MNT_WAIT); if (nbp == NULL) goto restart; @@ -5371,14 +10014,6 @@ loop: } continue; - case D_INODEDEP: - if ((error = flush_inodedep_deps(wk->wk_mp, - WK_INODEDEP(wk)->id_ino)) != 0) { - FREE_LOCK(&lk); - break; - } - continue; - case D_PAGEDEP: /* * We are trying to sync a directory that may @@ -5400,48 +10035,6 @@ loop: } continue; - case D_MKDIR: - /* - * This case should never happen if the vnode has - * been properly sync'ed. However, if this function - * is used at a place where the vnode has not yet - * been sync'ed, this dependency can show up. So, - * rather than panic, just flush it. - */ - nbp = WK_MKDIR(wk)->md_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; - } - ACQUIRE_LOCK(&lk); - continue; - - case D_BMSAFEMAP: - /* - * This case should never happen if the vnode has - * been properly sync'ed. However, if this function - * is used at a place where the vnode has not yet - * been sync'ed, this dependency can show up. So, - * rather than panic, just flush it. 
- */ - nbp = WK_BMSAFEMAP(wk)->sm_buf; - nbp = getdirtybuf(nbp, &lk, waitfor); - if (nbp == NULL) - continue; - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(nbp); - } else if ((error = bwrite(nbp)) != 0) { - break; - } - ACQUIRE_LOCK(&lk); - continue; - default: panic("softdep_sync_metadata: Unknown type %s", TYPENAME(wk->wk_type)); @@ -5489,7 +10082,8 @@ loop: BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); - return (0); + return ffs_update(vp, 1); + /* return (0); */ } /* @@ -5502,6 +10096,7 @@ flush_inodedep_deps(mp, ino) ino_t ino; { struct inodedep *inodedep; + struct inoref *inoref; int error, waitfor; /* @@ -5522,8 +10117,16 @@ flush_inodedep_deps(mp, ino) return (error); FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); +restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || @@ -5555,13 +10158,19 @@ flush_deplist(listhead, waitfor, errorp) int *errorp; { struct allocdirect *adp; + struct newblk *newblk; struct buf *bp; mtx_assert(&lk, MA_OWNED); TAILQ_FOREACH(adp, listhead, ad_next) { - if (adp->ad_state & DEPCOMPLETE) + newblk = (struct newblk *)adp; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + return (1); + } + if (newblk->nb_state & DEPCOMPLETE) continue; - bp = adp->ad_buf; + bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) @@ -5582,6 +10191,100 @@ flush_deplist(listhead, waitfor, errorp) } /* + * Flush dependencies associated with an allocdirect block. + */ +static int +flush_newblk_dep(vp, mp, lbn) + struct vnode *vp; + struct mount *mp; + ufs_lbn_t lbn; +{ + struct newblk *newblk; + struct bufobj *bo; + struct inode *ip; + struct buf *bp; + ufs2_daddr_t blkno; + int error; + + error = 0; + bo = &vp->v_bufobj; + ip = VTOI(vp); + blkno = DIP(ip, i_db[lbn]); + if (blkno == 0) + panic("flush_newblk_dep: Missing block"); + ACQUIRE_LOCK(&lk); + /* + * Loop until all dependencies related to this block are satisfied. + * We must be careful to restart after each sleep in case a write + * completes some part of this process for us. + */ + for (;;) { + if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { + FREE_LOCK(&lk); + break; + } + if (newblk->nb_list.wk_type != D_ALLOCDIRECT) + panic("flush_newblk_deps: Bad newblk %p", newblk); + /* + * Flush the journal. + */ + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list); + continue; + } + /* + * Write the bitmap dependency. + */ + if ((newblk->nb_state & DEPCOMPLETE) == 0) { + bp = newblk->nb_bmsafemap->sm_buf; + bp = getdirtybuf(bp, &lk, MNT_WAIT); + if (bp == NULL) + continue; + FREE_LOCK(&lk); + error = bwrite(bp); + if (error) + break; + ACQUIRE_LOCK(&lk); + continue; + } + /* + * Write the buffer. 
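+		 * This is the usual gbincore()/BUF_LOCK() pattern:
+		 * look the buffer up under the bufobj lock, then
+		 * take it with LK_SLEEPFAIL so that ENOLCK means we
+		 * slept and must restart the loop.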
+ */ + FREE_LOCK(&lk); + BO_LOCK(bo); + bp = gbincore(bo, lbn); + if (bp != NULL) { + error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | + LK_INTERLOCK, BO_MTX(bo)); + if (error == ENOLCK) { + ACQUIRE_LOCK(&lk); + continue; /* Slept, retry */ + } + if (error != 0) + break; /* Failed */ + if (bp->b_flags & B_DELWRI) { + bremfree(bp); + error = bwrite(bp); + if (error) + break; + } else + BUF_UNLOCK(bp); + } else + BO_UNLOCK(bo); + /* + * We have to wait for the direct pointers to + * point at the newdirblk before the dependency + * will go away. + */ + error = ffs_update(vp, MNT_WAIT); + if (error) + break; + ACQUIRE_LOCK(&lk); + } + return (error); +} + +/* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ @@ -5592,16 +10295,16 @@ flush_pagedep_deps(pvp, mp, diraddhdp) struct diraddhd *diraddhdp; { struct inodedep *inodedep; + struct inoref *inoref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; - struct bufobj *bo; int error = 0; struct buf *bp; ino_t inum; - struct worklist *wk; ump = VFSTOUFS(mp); +restart: while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry @@ -5609,7 +10312,7 @@ flush_pagedep_deps(pvp, mp, diraddhdp) */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(&lk); - if ((error = ffs_update(pvp, 1)) != 0) + if ((error = ffs_update(pvp, MNT_WAIT)) != 0) break; ACQUIRE_LOCK(&lk); /* @@ -5623,84 +10326,51 @@ flush_pagedep_deps(pvp, mp, diraddhdp) /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be - * committed in its parent. We do not want or need - * the full semantics of a synchronous ffs_syncvnode as - * that may end up here again, once for each directory - * level in the filesystem. Instead, we push the blocks - * and wait for them to clear. We have to fsync twice - * because the first call may choose to defer blocks - * that still have dependencies, but deferral will - * happen at most once. + * committed in its parent. */ inum = dap->da_newinum; + if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) + panic("flush_pagedep_deps: lost inode1"); + /* + * Wait for any pending journal adds to complete so we don't + * cause rollbacks while syncing. + */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list); + goto restart; + } + } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(&lk); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; - if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) || - (error=ffs_syncvnode(vp, MNT_NOWAIT))) { - vput(vp); - break; - } - bo = &vp->v_bufobj; - BO_LOCK(bo); - drain_output(vp); + error = flush_newblk_dep(vp, mp, 0); /* - * If first block is still dirty with a D_MKDIR - * dependency then it needs to be written now. + * If we still have the dependency we might need to + * update the vnode to sync the new link count to + * disk. 
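
flush_newblk_dep() above loops until every dependency on the block is gone, and after any step that may sleep (jwait, getdirtybuf, a BUF_LOCK that returns ENOLCK under LK_SLEEPFAIL) it re-enters the loop instead of trusting stale state. A compressed sketch of that retry discipline; next_obstacle() and the write helpers are stubs standing in for lock-protected kernel state, not real interfaces:

#include <errno.h>

enum obstacle { JOURNAL, BITMAP, DATA, DONE };

/* Stubs: the real code re-inspects the dependency under the softdep lock. */
static enum obstacle next_obstacle(void) { return (DONE); }
static void wait_journal(void) { }
static int write_bitmap(void) { return (0); }
static int write_data(void) { return (0); }

static int
flush_block_deps(void)
{
        int error;

        for (;;) {
                switch (next_obstacle()) {
                case JOURNAL:
                        wait_journal();         /* slept; state is stale */
                        continue;
                case BITMAP:
                        if ((error = write_bitmap()) != 0)
                                return (error);
                        continue;               /* re-evaluate from scratch */
                case DATA:
                        error = write_data();
                        if (error == EAGAIN)    /* like ENOLCK: we slept */
                                continue;
                        if (error != 0)
                                return (error);
                        continue;
                case DONE:
                        return (0);
                }
        }
}
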
*/ - for (;;) { - error = 0; - bp = gbincore(bo, 0); - if (bp == NULL) - break; /* First block not present */ - error = BUF_LOCK(bp, - LK_EXCLUSIVE | - LK_SLEEPFAIL | - LK_INTERLOCK, - BO_MTX(bo)); - BO_LOCK(bo); - if (error == ENOLCK) - continue; /* Slept, retry */ - if (error != 0) - break; /* Failed */ - if ((bp->b_flags & B_DELWRI) == 0) { - BUF_UNLOCK(bp); - break; /* Buffer not dirty */ - } - for (wk = LIST_FIRST(&bp->b_dep); - wk != NULL; - wk = LIST_NEXT(wk, wk_list)) - if (wk->wk_type == D_MKDIR) - break; - if (wk == NULL) - BUF_UNLOCK(bp); /* Dependency gone */ - else { - /* - * D_MKDIR dependency remains, - * must write buffer to stable - * storage. - */ - BO_UNLOCK(bo); - bremfree(bp); - error = bwrite(bp); - BO_LOCK(bo); - } - break; - } - BO_UNLOCK(bo); + if (error == 0 && dap == LIST_FIRST(diraddhdp)) + error = ffs_update(vp, MNT_WAIT); vput(vp); if (error != 0) - break; /* Flushing of first block failed */ + break; ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; - if (dap->da_state & MKDIR_BODY) - panic("flush_pagedep_deps: MKDIR_BODY"); + if (dap->da_state & MKDIR_BODY) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, + &inodedep); + panic("flush_pagedep_deps: MKDIR_BODY " + "inodedep %p dap %p vp %p", + inodedep, dap, vp); + } } /* * Flush the inode on which the directory entry depends. @@ -5719,8 +10389,8 @@ retry: * If the inode still has bitmap dependencies, * push them to disk. */ - if ((inodedep->id_state & DEPCOMPLETE) == 0) { - bp = inodedep->id_buf; + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { + bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, MNT_WAIT); if (bp == NULL) goto retry; @@ -5733,24 +10403,29 @@ retry: } /* * If the inode is still sitting in a buffer waiting - * to be written, push it to disk. + * to be written or waiting for the link count to be + * adjusted update it here to flush it to disk. */ - FREE_LOCK(&lk); - if ((error = bread(ump->um_devvp, - fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), - (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) { - brelse(bp); - break; + if (dap == LIST_FIRST(diraddhdp)) { + FREE_LOCK(&lk); + if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ))) + break; + error = ffs_update(vp, MNT_WAIT); + vput(vp); + if (error) + break; + ACQUIRE_LOCK(&lk); } - if ((error = bwrite(bp)) != 0) - break; - ACQUIRE_LOCK(&lk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ - if (dap == LIST_FIRST(diraddhdp)) - panic("flush_pagedep_deps: flush failed"); + if (dap == LIST_FIRST(diraddhdp)) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); + panic("flush_pagedep_deps: failed to flush " + "inodedep %p ino %d dap %p", inodedep, inum, dap); + } } if (error) ACQUIRE_LOCK(&lk); @@ -6100,10 +10775,13 @@ softdep_count_dependencies(bp, wantcount) int wantcount; { struct worklist *wk; + struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct indirdep *indirdep; + struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; + struct dirrem *dirrem; struct diradd *dap; int i, retval; @@ -6132,6 +10810,12 @@ softdep_count_dependencies(bp, wantcount) if (!wantcount) goto out; } + if (TAILQ_FIRST(&inodedep->id_inoreflst)) { + /* Add reference dependency. 
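
The rewritten flush_pagedep_deps() above enforces the mkdir ordering in three steps: write the parent's inode for the new link count (MKDIR_PARENT), flush the new directory's first block through flush_newblk_dep() (MKDIR_BODY), and only then allow the name to be committed in the parent's directory block. A skeletal restatement under hypothetical helper names; the stubs mark where the real calls (ffs_update(pvp, MNT_WAIT), flush_newblk_dep(vp, mp, 0)) would go:

static int write_parent_inode(void) { return (0); }    /* MKDIR_PARENT */
static int write_child_dotblock(void) { return (0); }  /* MKDIR_BODY */
static int commit_name_in_parent(void) { return (0); }

static int
commit_mkdir(void)
{
        int error;

        if ((error = write_parent_inode()) != 0)
                return (error);
        if ((error = write_child_dotblock()) != 0)
                return (error);
        return (commit_name_in_parent());
}
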
*/ + retval += 1; + if (!wantcount) + goto out; + } continue; case D_INDIRDEP: @@ -6147,6 +10831,14 @@ softdep_count_dependencies(bp, wantcount) case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + if (LIST_FIRST(&dirrem->dm_jremrefhd)) { + /* Journal remove ref dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { @@ -6159,14 +10851,44 @@ softdep_count_dependencies(bp, wantcount) continue; case D_BMSAFEMAP: + bmsafemap = WK_BMSAFEMAP(wk); + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { + /* Add reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { + /* Allocate block dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) { + /* Freeblk journal dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: case D_ALLOCDIRECT: case D_ALLOCINDIR: case D_MKDIR: + case D_JSEG: + case D_SBDEP: /* never a dependency on these blocks */ continue; default: - panic("softdep_check_for_rollback: Unexpected type %s", + panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } @@ -6382,6 +11104,45 @@ softdep_error(func, error) #ifdef DDB +static void +inodedep_print(struct inodedep *inodedep, int verbose) +{ + db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d" + " saveino %p\n", + inodedep, inodedep->id_fs, inodedep->id_state, + (intmax_t)inodedep->id_ino, + (intmax_t)fsbtodb(inodedep->id_fs, + ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), + inodedep->id_nlinkdelta, inodedep->id_savednlink, + inodedep->id_savedino1); + + if (verbose == 0) + return; + + db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " + "mkdiradd %p\n", + LIST_FIRST(&inodedep->id_pendinghd), + LIST_FIRST(&inodedep->id_bufwait), + LIST_FIRST(&inodedep->id_inowait), + TAILQ_FIRST(&inodedep->id_inoreflst), + inodedep->id_mkdiradd); + db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", + TAILQ_FIRST(&inodedep->id_inoupdt), + TAILQ_FIRST(&inodedep->id_newinoupdt), + TAILQ_FIRST(&inodedep->id_extupdt), + TAILQ_FIRST(&inodedep->id_newextupdt)); +} + +DB_SHOW_COMMAND(inodedep, db_show_inodedep) +{ + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + inodedep_print((struct inodedep*)addr, 1); +} + DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) { struct inodedep_hashhead *inodedephd; @@ -6395,15 +11156,62 @@ DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) LIST_FOREACH(inodedep, inodedephd, id_hash) { if (fs != NULL && fs != inodedep->id_fs) continue; - db_printf("%p fs %p st %x ino %jd inoblk %jd\n", - inodedep, inodedep->id_fs, inodedep->id_state, - (intmax_t)inodedep->id_ino, - (intmax_t)fsbtodb(inodedep->id_fs, - ino_to_fsba(inodedep->id_fs, inodedep->id_ino))); + inodedep_print(inodedep, 0); } } } +DB_SHOW_COMMAND(worklist, db_show_worklist) +{ + struct worklist *wk; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + wk = (struct worklist *)addr; + printf("worklist: %p type %s state 0x%X\n", + wk, TYPENAME(wk->wk_type), wk->wk_state); +} + +DB_SHOW_COMMAND(workhead, db_show_workhead) +{ + struct workhead *wkhd; + struct worklist *wk; + int i; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; 
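
A detail worth noting in db_show_workhead(), whose loop follows in the next hunk: the traversal is capped at 100 entries so a corrupted or cyclic worklist cannot wedge the debugger. The same defensive walk as a standalone toy (dump_bounded and struct item are invented names):

#include <stdio.h>
#include <sys/queue.h>

struct item { LIST_ENTRY(item) next; };
LIST_HEAD(itemhead, item);

static void
dump_bounded(struct itemhead *head)
{
        struct item *it;
        int i;

        it = LIST_FIRST(head);
        for (i = 0; i < 100 && it != NULL; i++, it = LIST_NEXT(it, next))
                printf("item %p\n", (void *)it);
        if (i == 100)
                printf("...truncated, list may be corrupt\n");
}
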
+ } + wkhd = (struct workhead *)addr; + wk = LIST_FIRST(wkhd); + for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) + db_printf("worklist: %p type %s state 0x%X", + wk, TYPENAME(wk->wk_type), wk->wk_state); + if (i == 100) + db_printf("workhead overflow"); + printf("\n"); +} + + +DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) +{ + struct jaddref *jaddref; + struct diradd *diradd; + struct mkdir *mkdir; + + LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { + diradd = mkdir->md_diradd; + db_printf("mkdir: %p state 0x%X dap %p state 0x%X", + mkdir, mkdir->md_state, diradd, diradd->da_state); + if ((jaddref = mkdir->md_jaddref) != NULL) + db_printf(" jaddref %p jaddref state 0x%X", + jaddref, jaddref->ja_state); + db_printf("\n"); + } +} + #endif /* DDB */ #endif /* SOFTUPDATES */ Index: /usr/src/sys/ufs/ffs/ffs_vnops.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_vnops.c (revision 202614) +++ /usr/src/sys/ufs/ffs/ffs_vnops.c (working copy) @@ -225,6 +225,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor) wait = (waitfor == MNT_WAIT); lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); bo = &vp->v_bufobj; + ip->i_flag &= ~IN_NEEDSYNC; /* * Flush all dirty buffers associated with a vnode. Index: /usr/src/sys/ufs/ffs/ffs_alloc.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_alloc.c (revision 202614) +++ /usr/src/sys/ufs/ffs/ffs_alloc.c (working copy) @@ -94,23 +94,23 @@ __FBSDID("$FreeBSD$"); #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, int cg, ufs2_daddr_t bpref, - int size); + int size, int rsize); -static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int); +static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int, int); static ufs2_daddr_t - ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t); + ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif -static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int); -static void ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *, - ufs1_daddr_t, int); +static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int, + int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, int, ufs2_daddr_t, int, int); static void ffs_fserr(struct fs *, ino_t, char *); static ufs2_daddr_t ffs_hashalloc - (struct inode *, int, ufs2_daddr_t, int, allocfcn_t *); -static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int); + (struct inode *, int, ufs2_daddr_t, int, int, allocfcn_t *); +static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int, + int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); @@ -187,7 +187,7 @@ retry: cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); - bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); if (ip->i_flag & IN_SPACECOUNTED) { @@ -385,16 +385,12 @@ retry: panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } - bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if 
(!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, - ip->i_number); - if (nsize < request) - ffs_blkfree(ump, fs, ip->i_devvp, - bno + numfrags(fs, nsize), - (long)(request - nsize), ip->i_number); + ip->i_number, NULL); delta = btodb(nsize - osize); if (ip->i_flag & IN_SPACECOUNTED) { UFS_LOCK(ump); @@ -483,6 +479,14 @@ ffs_reallocblks(ap) if (doreallocblks == 0) return (ENOSPC); + /* + * We can't wait in softdep prealloc as it may fsync and recurse + * here. Instead we simply fail to reallocate blocks if this + * rare condition arises. + */ + if (DOINGSOFTDEP(ap->a_vp)) + if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) + return (ENOSPC); if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1) return (ffs_reallocblks_ufs1(ap)); return (ffs_reallocblks_ufs2(ap)); @@ -583,7 +587,7 @@ ffs_reallocblks_ufs1(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -669,7 +673,7 @@ ffs_reallocblks_ufs1(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -791,7 +795,7 @@ ffs_reallocblks_ufs2(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -877,7 +881,7 @@ ffs_reallocblks_ufs2(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -964,7 +968,7 @@ ffs_valloc(pvp, mode, cred, vpp) if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } - ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; @@ -1273,11 +1277,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap) */ /*VARARGS5*/ static ufs2_daddr_t -ffs_hashalloc(ip, cg, pref, size, allocator) +ffs_hashalloc(ip, cg, pref, size, rsize, allocator) struct inode *ip; int cg; ufs2_daddr_t pref; - int size; /* size for data blocks, mode for inodes */ + int size; /* Search size for data blocks, mode for inodes */ + int rsize; /* Real allocated size. 
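
The ffs_reallocblks() guard above breaks a recursion: softdep_prealloc() may fsync, and fsync may reallocate blocks, so under softdep the code prefers failing the (optional) reallocation with ENOSPC over sleeping. The shape of that guard with a stand-in try_prealloc(); the constant 1 models MNT_NOWAIT:

#include <errno.h>

static int try_prealloc(int nowait) { (void)nowait; return (0); }  /* stub */

static int
reallocate_cluster(void)
{
        if (try_prealloc(1) != 0)       /* 1 models MNT_NOWAIT: never sleep */
                return (ENOSPC);        /* rare; caller keeps the old layout */
        /* ... proceed with the block reallocation ... */
        return (0);
}
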
*/ allocfcn_t *allocator; { struct fs *fs; @@ -1293,7 +1298,7 @@ static ufs2_daddr_t /* * 1: preferred cylinder group */ - result = (*allocator)(ip, cg, pref, size); + result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* @@ -1303,7 +1308,7 @@ static ufs2_daddr_t cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } @@ -1314,7 +1319,7 @@ static ufs2_daddr_t */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; @@ -1396,7 +1401,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, + frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); @@ -1414,11 +1420,12 @@ fail: * and if it is, allocate it. */ static ufs2_daddr_t -ffs_alloccg(ip, cg, bpref, size) +ffs_alloccg(ip, cg, bpref, size, rsize) struct inode *ip; int cg; ufs2_daddr_t bpref; int size; + int rsize; { struct fs *fs; struct cg *cgp; @@ -1446,7 +1453,7 @@ static ufs2_daddr_t cgp->cg_old_time = cgp->cg_time = time_second; if (size == fs->fs_bsize) { UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); @@ -1470,21 +1477,14 @@ static ufs2_daddr_t if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); - bno = dtogd(fs, blkno); - for (i = frags; i < fs->fs_frag; i++) - setbit(blksfree, bno + i); - i = fs->fs_frag - frags; - cgp->cg_cs.cs_nffree += i; - fs->fs_cstotal.cs_nffree += i; - fs->fs_cs(fs, cg).cs_nffree += i; - fs->fs_fmod = 1; - cgp->cg_frsum[i]++; + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } + KASSERT(size == rsize, + ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; @@ -1502,7 +1502,7 @@ static ufs2_daddr_t ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); @@ -1524,10 +1524,11 @@ fail: * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t -ffs_alloccgblk(ip, bp, bpref) +ffs_alloccgblk(ip, bp, bpref, size) struct inode *ip; struct buf *bp; ufs2_daddr_t bpref; + int size; { struct fs *fs; struct cg *cgp; @@ -1535,6 +1536,7 @@ static ufs2_daddr_t ufs1_daddr_t bno; ufs2_daddr_t blkno; u_int8_t *blksfree; + int i; fs = ip->i_fs; ump = ip->i_ump; @@ -1562,16 +1564,32 @@ static ufs2_daddr_t gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); - ffs_clusteracct(ump, fs, cgp, blkno, -1); + ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; + /* + * If the caller didn't want the whole block free the frags here. 
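
The rsize parameter threaded through ffs_hashalloc() and ffs_alloccg() above separates the size searched for from the size actually consumed; that is how ffs_realloccg() can hunt for a full `request` run while claiming only `nsize`, and the KASSERT pins the fragment path to size == rsize. A toy allocator showing the split (find_run() and mark_used() are stand-ins, not FFS functions):

/* Search with `size`, consume only `rsize`; the tail of the run stays free. */
static long find_run(int size) { (void)size; return (-1); }     /* stub */
static void mark_used(long bno, int rsize) { (void)bno; (void)rsize; }

static long
alloc_frags(int size, int rsize)
{
        long bno;

        if ((bno = find_run(size)) < 0)
                return (-1);
        mark_used(bno, rsize);
        return (bno);
}
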
+ */ + size = numfrags(fs, size); + if (size != fs->fs_frag) { + bno = dtogd(fs, blkno); + for (i = size; i < fs->fs_frag; i++) + setbit(blksfree, bno + i); + i = fs->fs_frag - size; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; + fs->fs_fmod = 1; + cgp->cg_frsum[i]++; + } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, + size, 0); UFS_LOCK(ump); return (blkno); } @@ -1584,11 +1602,12 @@ gotit: * take the first one that we find following bpref. */ static ufs2_daddr_t -ffs_clusteralloc(ip, cg, bpref, len) +ffs_clusteralloc(ip, cg, bpref, len, unused) struct inode *ip; int cg; ufs2_daddr_t bpref; int len; + int unused; { struct fs *fs; struct cg *cgp; @@ -1684,7 +1703,7 @@ static ufs2_daddr_t len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) - if (ffs_alloccgblk(ip, bp, bno + i) != bno + i) + if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); @@ -1708,11 +1727,12 @@ fail: * inode in the specified cylinder group. */ static ufs2_daddr_t -ffs_nodealloccg(ip, cg, ipref, mode) +ffs_nodealloccg(ip, cg, ipref, mode, unused) struct inode *ip; int cg; ufs2_daddr_t ipref; int mode; + int unused; { struct fs *fs; struct cg *cgp; @@ -1815,28 +1835,6 @@ gotit: } /* - * check if a block is free - */ -static int -ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0); - case 4: - return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); - case 2: - return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); - case 1: - return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); - default: - panic("ffs_isfreeblock"); - } - return (0); -} - -/* * Free a block or fragment. * * The specified block or fragment is placed back in the @@ -1844,13 +1842,14 @@ gotit: * block reassembly is checked. */ void -ffs_blkfree(ump, fs, devvp, bno, size, inum) +ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; + struct workhead *dephd; { struct cg *cgp; struct buf *bp; @@ -1917,7 +1916,7 @@ void panic("ffs_blkfree: freeing free block"); } ffs_setblock(fs, blksfree, fragno); - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1957,7 +1956,7 @@ void cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1966,6 +1965,9 @@ void fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, + numfrags(fs, size), dephd); bdwrite(bp); } @@ -2036,7 +2038,8 @@ ffs_vfree(pvp, ino, mode) return (0); } ip = VTOI(pvp); - return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode)); + return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode, + NULL)); } /* @@ -2044,12 +2047,13 @@ ffs_vfree(pvp, ino, mode) * The specified inode is placed back in the free map. 
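
The tail-freeing block just added to ffs_alloccgblk() is easy to check with small numbers: with eight fragments per block and a three-fragment rsize, fragments 3..7 go back to the free map, cs_nffree grows by five, and cg_frsum[5] gains one run. A self-contained model of that accounting, assuming a byte-per-fragment mask instead of the real bitmap:

#include <assert.h>
#include <string.h>

#define FRAG_PER_BLK 8

static void
alloc_partial(unsigned char *freemask, int *frsum, int *nffree, int rfrags)
{
        int i, tail;

        memset(freemask, 0, FRAG_PER_BLK);      /* whole block in use */
        for (i = rfrags; i < FRAG_PER_BLK; i++)
                freemask[i] = 1;                /* free the tail frags */
        tail = FRAG_PER_BLK - rfrags;
        *nffree += tail;
        frsum[tail]++;                          /* one run of `tail` frags */
}

int
main(void)
{
        unsigned char mask[FRAG_PER_BLK];
        int frsum[FRAG_PER_BLK + 1] = { 0 }, nffree = 0;

        alloc_partial(mask, frsum, &nffree, 3);
        assert(nffree == 5 && frsum[5] == 1);
        assert(mask[2] == 0 && mask[3] == 1);   /* frag 3 is first freed */
        return (0);
}
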
*/ int -ffs_freefile(ump, fs, devvp, ino, mode) +ffs_freefile(ump, fs, devvp, ino, mode, wkhd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ino_t ino; int mode; + struct workhead *wkhd; { struct cg *cgp; struct buf *bp; @@ -2105,6 +2109,9 @@ int fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_inofree(UFSTOVFS(ump), bp, + ino + cg * fs->fs_ipg, wkhd); bdwrite(bp); return (0); } @@ -2218,101 +2225,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz) } /* - * Update the cluster map because of an allocation or free. - * - * Cnt == 1 means free; cnt == -1 means allocating. - */ -void -ffs_clusteracct(ump, fs, cgp, blkno, cnt) - struct ufsmount *ump; - struct fs *fs; - struct cg *cgp; - ufs1_daddr_t blkno; - int cnt; -{ - int32_t *sump; - int32_t *lp; - u_char *freemapp, *mapp; - int i, start, end, forw, back, map, bit; - - mtx_assert(UFS_MTX(ump), MA_OWNED); - - if (fs->fs_contigsumsize <= 0) - return; - freemapp = cg_clustersfree(cgp); - sump = cg_clustersum(cgp); - /* - * Allocate or clear the actual block. - */ - if (cnt > 0) - setbit(freemapp, blkno); - else - clrbit(freemapp, blkno); - /* - * Find the size of the cluster going forward. - */ - start = blkno + 1; - end = start + fs->fs_contigsumsize; - if (end >= cgp->cg_nclusterblks) - end = cgp->cg_nclusterblks; - mapp = &freemapp[start / NBBY]; - map = *mapp++; - bit = 1 << (start % NBBY); - for (i = start; i < end; i++) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != (NBBY - 1)) { - bit <<= 1; - } else { - map = *mapp++; - bit = 1; - } - } - forw = i - start; - /* - * Find the size of the cluster going backward. - */ - start = blkno - 1; - end = start - fs->fs_contigsumsize; - if (end < 0) - end = -1; - mapp = &freemapp[start / NBBY]; - map = *mapp--; - bit = 1 << (start % NBBY); - for (i = start; i > end; i--) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != 0) { - bit >>= 1; - } else { - map = *mapp--; - bit = 1 << (NBBY - 1); - } - } - back = start - i; - /* - * Account for old cluster and the possibly new forward and - * back clusters. - */ - i = back + forw + 1; - if (i > fs->fs_contigsumsize) - i = fs->fs_contigsumsize; - sump[i] += cnt; - if (back > 0) - sump[back] -= cnt; - if (forw > 0) - sump[forw] -= cnt; - /* - * Update cluster summary information. - */ - lp = &sump[fs->fs_contigsumsize]; - for (i = fs->fs_contigsumsize; i > 0; i--) - if (*lp-- > 0) - break; - fs->fs_maxcluster[cgp->cg_cgx] = i; -} - -/* * Fserr prints the name of a filesystem with an error diagnostic. 
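
ffs_blkfree() and ffs_freefile() above now take an optional struct workhead so SUJ can hand journal work to the cylinder-group buffer at free time; a NULL argument, which is what the call sites visible in this patch pass, keeps the old unjournaled behaviour. The pattern in miniature, with toy types only:

#include <stddef.h>
#include <sys/queue.h>

struct workitem { LIST_ENTRY(workitem) w_next; };
LIST_HEAD(workhead_toy, workitem);

static void
blkfree_toy(long blkno, struct workhead_toy *dephd)
{
        (void)blkno;
        /* ... clear the bits for blkno in the cg free map ... */
        if (dephd != NULL) {
                /* journaled path: attach items to the cg buffer's deps */
        }
        /* dephd == NULL: legacy path, nothing extra to record */
}
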
* * The form of the error message is: @@ -2532,7 +2444,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) #endif /* DEBUG */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, - cmd.value, filetype))) + cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; @@ -2560,7 +2472,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, ROOTINO); + blksize * fs->fs_fsize, ROOTINO, NULL); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; Index: /usr/src/sys/ufs/ffs/ffs_extern.h =================================================================== --- /usr/src/sys/ufs/ffs/ffs_extern.h (revision 202614) +++ /usr/src/sys/ufs/ffs/ffs_extern.h (working copy) @@ -47,6 +47,7 @@ struct ucred; struct vnode; struct vop_fsync_args; struct vop_reallocblks_args; +struct workhead; int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int, struct ucred *, ufs2_daddr_t *); @@ -56,20 +57,23 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_st struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, - ufs2_daddr_t, long, ino_t); + ufs2_daddr_t, long, ino_t, struct workhead *); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); void ffs_bdflush(struct bufobj *, struct buf *); int ffs_copyonwrite(struct vnode *, struct buf *); int ffs_flushfiles(struct mount *, int, struct thread *); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t, - int); + int, struct workhead *); int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); int ffs_mountroot(void); +void ffs_oldfscompat_write(struct fs *, struct ufsmount *); int ffs_reallocblks(struct vop_reallocblks_args *); int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, int, int, struct ucred *, struct buf **); @@ -108,7 +112,8 @@ void softdep_initialize(void); void softdep_uninitialize(void); int softdep_mount(struct vnode *, struct mount *, struct fs *, struct ucred *); -void softdep_move_dependencies(struct buf *, struct buf *); +void softdep_unmount(struct mount *); +int softdep_move_dependencies(struct buf *, struct buf *); int softdep_flushworklist(struct mount *, int *, struct thread *); int softdep_flushfiles(struct mount *, int, struct thread *); void softdep_update_inodeblock(struct inode *, struct buf *, int); @@ -117,7 +122,8 @@ void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *); void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t); -void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t); +void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, + int, int); void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocext(struct inode *, ufs_lbn_t, 
ufs2_daddr_t, @@ -126,11 +132,17 @@ void softdep_setup_allocindir_meta(struct buf *, s struct buf *, int, ufs2_daddr_t); void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t, struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *); +void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int, + struct workhead *); +void softdep_setup_inofree(struct mount *, struct buf *, ino_t, + struct workhead *); +void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *); void softdep_fsync_mountdev(struct vnode *); int softdep_sync_metadata(struct vnode *); int softdep_process_worklist(struct mount *, int); int softdep_fsync(struct vnode *); int softdep_waitidle(struct mount *); +int softdep_prealloc(struct vnode *, int); int ffs_rdonly(struct inode *); Index: /usr/src/sys/ufs/ffs/ffs_subr.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_subr.c (revision 202614) +++ /usr/src/sys/ufs/ffs/ffs_subr.c (working copy) @@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$"); #ifndef _KERNEL #include #include -#include "fsck.h" #else #include #include @@ -223,12 +222,43 @@ ffs_isblock(fs, cp, h) mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: +#ifdef _KERNEL panic("ffs_isblock"); +#endif + break; } return (0); } /* + * check if a block is free + */ +int +ffs_isfreeblock(fs, cp, h) + struct fs *fs; + u_char *cp; + ufs1_daddr_t h; +{ + + switch ((int)fs->fs_frag) { + case 8: + return (cp[h] == 0); + case 4: + return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); + case 2: + return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); + case 1: + return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); + default: +#ifdef _KERNEL + panic("ffs_isfreeblock"); +#endif + break; + } + return (0); +} + +/* * take a block out of the map */ void @@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h) cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: +#ifdef _KERNEL panic("ffs_clrblock"); +#endif + break; } } @@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h) cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: +#ifdef _KERNEL panic("ffs_setblock"); +#endif + break; } } + +/* + * Update the cluster map because of an allocation or free. + * + * Cnt == 1 means free; cnt == -1 means allocating. + */ +void +ffs_clusteracct(fs, cgp, blkno, cnt) + struct fs *fs; + struct cg *cgp; + ufs1_daddr_t blkno; + int cnt; +{ + int32_t *sump; + int32_t *lp; + u_char *freemapp, *mapp; + int i, start, end, forw, back, map, bit; + + if (fs->fs_contigsumsize <= 0) + return; + freemapp = cg_clustersfree(cgp); + sump = cg_clustersum(cgp); + /* + * Allocate or clear the actual block. + */ + if (cnt > 0) + setbit(freemapp, blkno); + else + clrbit(freemapp, blkno); + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if (end >= cgp->cg_nclusterblks) + end = cgp->cg_nclusterblks; + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. 
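
The fragment-mask arithmetic in the relocated ffs_isfreeblock() above packs fs_frag fragments per block into 8/4/2/1 bits of each map byte; for fs_frag == 4 a block h owns the nibble selected by (h & 0x1) << 2. A tiny standalone check of just that indexing arithmetic (it asserts the predicate's value, not any claim about map polarity):

#include <assert.h>

int
main(void)
{
        unsigned char cp[1] = { 0xf0 };

        /* h = 0: low nibble is clear, predicate holds */
        assert((cp[0] & (0x0f << ((0 & 0x1) << 2))) == 0);
        /* h = 1: high nibble is 0xf0, predicate fails */
        assert((cp[0] & (0x0f << ((1 & 0x1) << 2))) != 0);
        return (0);
}
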
+ */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + sump[i] += cnt; + if (back > 0) + sump[back] -= cnt; + if (forw > 0) + sump[forw] -= cnt; + /* + * Update cluster summary information. + */ + lp = &sump[fs->fs_contigsumsize]; + for (i = fs->fs_contigsumsize; i > 0; i--) + if (*lp-- > 0) + break; + fs->fs_maxcluster[cgp->cg_cgx] = i; +} Index: /usr/src/sys/ufs/ffs/softdep.h =================================================================== --- /usr/src/sys/ufs/ffs/softdep.h (revision 202614) +++ /usr/src/sys/ufs/ffs/softdep.h (working copy) @@ -94,22 +94,28 @@ * The ONWORKLIST flag shows whether the structure is currently linked * onto a worklist. */ -#define ATTACHED 0x0001 -#define UNDONE 0x0002 -#define COMPLETE 0x0004 -#define DEPCOMPLETE 0x0008 -#define MKDIR_PARENT 0x0010 /* diradd & mkdir only */ -#define MKDIR_BODY 0x0020 /* diradd & mkdir only */ -#define RMDIR 0x0040 /* dirrem only */ -#define DIRCHG 0x0080 /* diradd & dirrem only */ -#define GOINGAWAY 0x0100 /* indirdep only */ -#define IOSTARTED 0x0200 /* inodedep & pagedep only */ -#define SPACECOUNTED 0x0400 /* inodedep only */ -#define NEWBLOCK 0x0800 /* pagedep only */ -#define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */ -#define UFS1FMT 0x2000 /* indirdep only */ -#define EXTDATA 0x4000 /* allocdirect only */ -#define ONWORKLIST 0x8000 +#define ATTACHED 0x000001 +#define UNDONE 0x000002 +#define COMPLETE 0x000004 +#define DEPCOMPLETE 0x000008 +#define MKDIR_PARENT 0x000010 /* diradd, mkdir, jaddref, jsegdep only */ +#define MKDIR_BODY 0x000020 /* diradd, mkdir, jaddref only */ +#define RMDIR 0x000040 /* dirrem only */ +#define DIRCHG 0x000080 /* diradd, dirrem only */ +#define GOINGAWAY 0x000100 /* indirdep, jremref only */ +#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */ +#define SPACECOUNTED 0x000400 /* inodedep only */ +#define NEWBLOCK 0x000800 /* pagedep, jaddref only */ +#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */ +#define UFS1FMT 0x002000 /* indirdep only */ +#define EXTDATA 0x004000 /* allocdirect only */ +#define ONWORKLIST 0x008000 +#define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */ +#define ONDEPLIST 0x020000 /* Structure is on a dependency list. */ +#define UNLINKED 0x040000 /* inodedep has been unlinked. */ +#define UNLINKNEXT 0x080000 /* inodedep has valid di_freelink */ +#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */ +#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV) #define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) @@ -135,25 +141,37 @@ * and the macros below changed to use it. 
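
ffs_clusteracct(), moved above into ffs_subr.c with its panics made kernel-only (presumably so userland tooling can compile the file), maintains a histogram sump[n] of free runs of length n: freeing a block merges its neighbours' runs into one longer run, allocating splits a run in two. The same bookkeeping over a simplified byte-per-block map; the real code walks the bitmap NBBY bits at a time and clamps scans to fs_contigsumsize:

static void
clusteracct_toy(char *freemap, int nblocks, int *sump, int maxrun,
    int blkno, int cnt /* 1 = free, -1 = allocate */)
{
        int i, forw, back;

        freemap[blkno] = (cnt > 0);
        for (i = blkno + 1, forw = 0; i < nblocks && freemap[i]; i++)
                forw++;                 /* run continuing forward */
        for (i = blkno - 1, back = 0; i >= 0 && freemap[i]; i--)
                back++;                 /* run continuing backward */
        i = back + forw + 1;
        if (i > maxrun)
                i = maxrun;
        sump[i] += cnt;                 /* run including this block */
        if (back > 0)
                sump[back] -= cnt;      /* former separate runs */
        if (forw > 0)
                sump[forw] -= cnt;
}
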
*/ struct worklist { + LIST_ENTRY(worklist) wk_list; /* list of work requests */ struct mount *wk_mp; /* Mount we live in */ - LIST_ENTRY(worklist) wk_list; /* list of work requests */ - unsigned short wk_type; /* type of request */ - unsigned short wk_state; /* state flags */ + unsigned int wk_type:8, /* type of request */ + wk_state:24; /* state flags */ }; #define WK_DATA(wk) ((void *)(wk)) #define WK_PAGEDEP(wk) ((struct pagedep *)(wk)) #define WK_INODEDEP(wk) ((struct inodedep *)(wk)) #define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk)) +#define WK_NEWBLK(wk) ((struct newblk *)(wk)) #define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk)) #define WK_INDIRDEP(wk) ((struct indirdep *)(wk)) #define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk)) #define WK_FREEFRAG(wk) ((struct freefrag *)(wk)) #define WK_FREEBLKS(wk) ((struct freeblks *)(wk)) +#define WK_FREEWORK(wk) ((struct freework *)(wk)) #define WK_FREEFILE(wk) ((struct freefile *)(wk)) #define WK_DIRADD(wk) ((struct diradd *)(wk)) #define WK_MKDIR(wk) ((struct mkdir *)(wk)) #define WK_DIRREM(wk) ((struct dirrem *)(wk)) #define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk)) +#define WK_JADDREF(wk) ((struct jaddref *)(wk)) +#define WK_JREMREF(wk) ((struct jremref *)(wk)) +#define WK_JMVREF(wk) ((struct jmvref *)(wk)) +#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk)) +#define WK_JSEG(wk) ((struct jseg *)(wk)) +#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk)) +#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk)) +#define WK_FREEDEP(wk) ((struct freedep *)(wk)) +#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk)) +#define WK_SBDEP(wk) ((struct sbdep *)wk) /* * Various types of lists @@ -165,6 +183,15 @@ LIST_HEAD(inodedephd, inodedep); LIST_HEAD(allocindirhd, allocindir); LIST_HEAD(allocdirecthd, allocdirect); TAILQ_HEAD(allocdirectlst, allocdirect); +LIST_HEAD(indirdephd, indirdep); +LIST_HEAD(jaddrefhd, jaddref); +LIST_HEAD(jremrefhd, jremref); +LIST_HEAD(jmvrefhd, jmvref); +LIST_HEAD(jnewblkhd, jnewblk); +LIST_HEAD(jfreeblkhd, jfreeblk); +LIST_HEAD(freeworkhd, freework); +TAILQ_HEAD(jseglst, jseg); +TAILQ_HEAD(inoreflst, inoref); /* * The "pagedep" structure tracks the various dependencies related to @@ -192,9 +219,11 @@ struct pagedep { LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */ ino_t pd_ino; /* associated file */ ufs_lbn_t pd_lbn; /* block within file */ + struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */ struct dirremhd pd_dirremhd; /* dirrem's waiting for page */ struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */ struct diraddhd pd_pendinghd; /* directory entries awaiting write */ + struct jmvrefhd pd_jmvrefhd; /* Dependent journal writes. */ }; /* @@ -248,13 +277,18 @@ struct inodedep { struct worklist id_list; /* buffer holding inode block */ # define id_state id_list.wk_state /* inode dependency state */ LIST_ENTRY(inodedep) id_hash; /* hashed lookup */ + TAILQ_ENTRY(inodedep) id_unlinked; /* Unlinked but ref'd inodes */ struct fs *id_fs; /* associated filesystem */ ino_t id_ino; /* dependent inode */ nlink_t id_nlinkdelta; /* saved effective link count */ + nlink_t id_savednlink; /* Link saved during rollback */ LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */ - struct buf *id_buf; /* related bmsafemap (if pending) */ + struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */ + struct diradd *id_mkdiradd; /* diradd for a mkdir. */ + struct inoreflst id_inoreflst; /* Inode reference adjustments. 
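
The struct worklist above is this code's poor-man's base class: every dependency structure places a worklist first, wk_type says which concrete type follows, and the WK_*() macros are bare casts. The new 8/24 bitfield split also makes room for state flags past 0x8000; UNLINKPREV above is 0x100000, which the old unsigned short wk_state could not hold. A minimal model of why the cast is safe, under toy names:

#include <assert.h>
#include <stdlib.h>

struct worklist_toy { int wk_type; };   /* must be the first member */
struct inodedep_toy { struct worklist_toy id_list; int id_ino; };

#define D_INODEDEP_TOY  1
#define WK_INODEDEP_TOY(wk) ((struct inodedep_toy *)(wk))

int
main(void)
{
        struct inodedep_toy *id = calloc(1, sizeof(*id));
        struct worklist_toy *wk = &id->id_list;

        wk->wk_type = D_INODEDEP_TOY;
        id->id_ino = 7;
        /* check the type tag, then downcast: same address as first member */
        assert(wk->wk_type == D_INODEDEP_TOY);
        assert(WK_INODEDEP_TOY(wk)->id_ino == 7);
        free(id);
        return (0);
}
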
*/ long id_savedextsize; /* ext size saved during rollback */ off_t id_savedsize; /* file size saved during rollback */ + struct dirremhd id_dirremhd; /* Removals pending. */ struct workhead id_pendinghd; /* entries awaiting directory write */ struct workhead id_bufwait; /* operations after inode written */ struct workhead id_inowait; /* operations waiting inode update */ @@ -271,23 +305,6 @@ struct inodedep { #define id_savedino2 id_un.idu_savedino2 /* - * A "newblk" structure is attached to a bmsafemap structure when a block - * or fragment is allocated from a cylinder group. Its state is set to - * DEPCOMPLETE when its cylinder group map is written. It is consumed by - * an associated allocdirect or allocindir allocation which will attach - * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag - * is not set (i.e., its cylinder group map has not been written). - */ -struct newblk { - LIST_ENTRY(newblk) nb_hash; /* hashed lookup */ - struct fs *nb_fs; /* associated filesystem */ - int nb_state; /* state of bitmap dependency */ - ufs2_daddr_t nb_newblkno; /* allocated block number */ - LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */ - struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */ -}; - -/* * A "bmsafemap" structure maintains a list of dependency structures * that depend on the update of a particular cylinder group map. * It has lists for newblks, allocdirects, allocindirs, and inodedeps. @@ -299,14 +316,44 @@ struct inodedep { */ struct bmsafemap { struct worklist sm_list; /* cylgrp buffer */ +# define sm_state sm_list.wk_state + int sm_cg; + LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */ struct buf *sm_buf; /* associated buffer */ struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */ + struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */ struct allocindirhd sm_allocindirhd; /* allocindir deps */ + struct allocindirhd sm_allocindirwr; /* writing allocindir deps */ struct inodedephd sm_inodedephd; /* inodedep deps */ + struct inodedephd sm_inodedepwr; /* writing inodedep deps */ struct newblkhd sm_newblkhd; /* newblk deps */ + struct newblkhd sm_newblkwr; /* writing newblk deps */ + struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */ + struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */ }; /* + * A "newblk" structure is attached to a bmsafemap structure when a block + * or fragment is allocated from a cylinder group. Its state is set to + * DEPCOMPLETE when its cylinder group map is written. It is converted to + * an allocdirect or allocindir allocation once the allocator calls the + * appropriate setup function. + */ +struct newblk { + struct worklist nb_list; +# define nb_state nb_list.wk_state + LIST_ENTRY(newblk) nb_hash; /* hashed lookup */ + LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblks */ + struct jnewblk *nb_jnewblk; /* New block journal entry. */ + struct bmsafemap *nb_bmsafemap;/* cylgrp dep (if pending) */ + struct freefrag *nb_freefrag; /* fragment to be freed (if any) */ + struct indirdephd nb_indirdeps; /* Children indirect blocks. */ + struct workhead nb_newdirblk; /* dir block to notify when written */ + struct workhead nb_jwork; /* Journal work pending. */ + ufs2_daddr_t nb_newblkno; /* new value of block pointer */ +}; + +/* * An "allocdirect" structure is attached to an "inodedep" when a new block * or fragment is allocated and pointed to by the inode described by * "inodedep". The worklist is linked to the buffer that holds the block. 
@@ -334,20 +381,18 @@ struct bmsafemap { * and inodedep->id_pendinghd lists. */ struct allocdirect { - struct worklist ad_list; /* buffer holding block */ -# define ad_state ad_list.wk_state /* block pointer state */ + struct newblk ad_block; /* Common block logic */ +# define ad_state ad_block.nb_list.wk_state /* block pointer state */ TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */ - ufs_lbn_t ad_lbn; /* block within file */ - ufs2_daddr_t ad_newblkno; /* new value of block pointer */ - ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ - long ad_newsize; /* size of new block */ - long ad_oldsize; /* size of old block */ - LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */ - struct buf *ad_buf; /* cylgrp buffer (if pending) */ struct inodedep *ad_inodedep; /* associated inodedep */ - struct freefrag *ad_freefrag; /* fragment to be freed (if any) */ - struct workhead ad_newdirblk; /* dir block to notify when written */ + ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ + int ad_offset; /* Pointer offset in parent. */ + long ad_newsize; /* size of new block */ + long ad_oldsize; /* size of old block */ }; +#define ad_newblkno ad_block.nb_newblkno +#define ad_freefrag ad_block.nb_freefrag +#define ad_newdirblk ad_block.nb_newdirblk /* * A single "indirdep" structure manages all allocation dependencies for @@ -369,10 +414,14 @@ struct allocdirect { struct indirdep { struct worklist ir_list; /* buffer holding indirect block */ # define ir_state ir_list.wk_state /* indirect block pointer state */ - caddr_t ir_saveddata; /* buffer cache contents */ + LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */ + caddr_t ir_saveddata; /* buffer cache contents */ struct buf *ir_savebp; /* buffer holding safe copy */ + struct allocindirhd ir_completehd; /* waiting for indirdep complete */ + struct allocindirhd ir_writehd; /* Waiting for the pointer write. */ struct allocindirhd ir_donehd; /* done waiting to update safecopy */ struct allocindirhd ir_deplisthd; /* allocindir deps for this block */ + struct workhead ir_jwork; /* Journal work pending. */ }; /* @@ -389,31 +438,39 @@ struct indirdep { * can then be freed as it is no longer applicable. */ struct allocindir { - struct worklist ai_list; /* buffer holding indirect block */ -# define ai_state ai_list.wk_state /* indirect block pointer state */ + struct newblk ai_block; /* Common block area */ +# define ai_state ai_block.nb_list.wk_state /* indirect pointer state */ LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */ - int ai_offset; /* pointer offset in indirect block */ - ufs2_daddr_t ai_newblkno; /* new block pointer value */ - ufs2_daddr_t ai_oldblkno; /* old block pointer value */ - struct freefrag *ai_freefrag; /* block to be freed when complete */ struct indirdep *ai_indirdep; /* address of associated indirdep */ - LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */ - struct buf *ai_buf; /* cylgrp buffer (if pending) */ + ufs2_daddr_t ai_oldblkno; /* old value of block pointer */ + int ai_offset; /* Pointer offset in parent. */ }; +#define ai_newblkno ai_block.nb_newblkno +#define ai_freefrag ai_block.nb_freefrag +#define ai_newdirblk ai_block.nb_newdirblk /* + * The allblk union is used to size the newblk structure on allocation so + * that it may be any one of three types. 
+ */ +union allblk { + struct allocindir ab_allocindir; + struct allocdirect ab_allocdirect; + struct newblk ab_newblk; +}; + +/* * A "freefrag" structure is attached to an "inodedep" when a previously * allocated fragment is replaced with a larger fragment, rather than extended. * The "freefrag" structure is constructed and attached when the replacement * block is first allocated. It is processed after the inode claiming the - * bigger block that replaces it has been written to disk. Note that the - * ff_state field is is used to store the uid, so may lose data. However, - * the uid is used only in printing an error message, so is not critical. - * Keeping it in a short keeps the data structure down to 32 bytes. + * bigger block that replaces it has been written to disk. */ struct freefrag { struct worklist ff_list; /* id_inowait or delayed worklist */ -# define ff_state ff_list.wk_state /* owning user; should be uid_t */ +# define ff_state ff_list.wk_state + struct jfreefrag *ff_jfreefrag; /* Associated journal entry. */ + struct workhead ff_jwork; /* Journal work pending. */ ufs2_daddr_t ff_blkno; /* fragment physical block number */ long ff_fragsize; /* size of fragment being deleted */ ino_t ff_inum; /* owning inode number */ @@ -423,23 +480,60 @@ struct freefrag { * A "freeblks" structure is attached to an "inodedep" when the * corresponding file's length is reduced to zero. It records all * the information needed to free the blocks of a file after its - * zero'ed inode has been written to disk. + * zero'ed inode has been written to disk. The actual work is done + * by child freework structures which are responsible for individual + * inode pointers while freeblks is responsible for retiring the + * entire operation when it is complete and holding common members. */ struct freeblks { struct worklist fb_list; /* id_inowait or delayed worklist */ # define fb_state fb_list.wk_state /* inode and dirty block state */ + struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */ + struct workhead fb_freeworkhd; /* Work items pending */ + struct workhead fb_jwork; /* Journal work pending */ ino_t fb_previousinum; /* inode of previous owner of blocks */ uid_t fb_uid; /* uid of previous owner of blocks */ struct vnode *fb_devvp; /* filesystem device vnode */ - long fb_oldextsize; /* previous ext data size */ - off_t fb_oldsize; /* previous file size */ ufs2_daddr_t fb_chkcnt; /* used to check cnt of blks released */ - ufs2_daddr_t fb_dblks[NDADDR]; /* direct blk ptrs to deallocate */ - ufs2_daddr_t fb_iblks[NIADDR]; /* indirect blk ptrs to deallocate */ - ufs2_daddr_t fb_eblks[NXADDR]; /* indirect blk ptrs to deallocate */ + int fb_ref; /* Children outstanding. */ }; /* + * A "freework" structure handles the release of a tree of blocks or a single + * block. Each indirect block in a tree is allocated its own freework + * structure so that the indrect block may be freed only when all of its + * children are freed. In this way we enforce the rule that an allocated + * block must have a valid path to a root that is journaled. Each child + * block acquires a reference and when the ref hits zero the parent ref + * is decremented. If there is no parent the freeblks ref is decremented. + */ +struct freework { + struct worklist fw_list; +# define fw_state fw_list.wk_state + LIST_ENTRY(freework) fw_next; /* Queue for freeblksk. */ + struct freeblks *fw_freeblks; /* Root of operation. */ + struct freework *fw_parent; /* Parent indirect. */ + ufs2_daddr_t fw_blkno; /* Our block #. 
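
union allblk above exists because a block dependency is born as a generic newblk and is only later promoted in place to an allocdirect or allocindir by the setup functions; sizing the allocation by the union makes that promotion safe. The idea with toy types (calloc's NULL return is ignored for brevity):

#include <stdlib.h>

struct newblk_toy { int nb_state; long nb_newblkno; };
struct allocdirect_toy { struct newblk_toy ad_block; int ad_offset; };
struct allocindir_toy { struct newblk_toy ai_block; int ai_offset; };

union allblk_toy {
        struct newblk_toy ab_newblk;
        struct allocdirect_toy ab_allocdirect;
        struct allocindir_toy ab_allocindir;
};

static struct newblk_toy *
newblk_alloc(void)
{
        union allblk_toy *ab;

        ab = calloc(1, sizeof(*ab));    /* big enough for any later identity */
        return (&ab->ab_newblk);        /* promotion later is just a cast */
}
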
*/ + ufs_lbn_t fw_lbn; /* Original lbn before free. */ + int fw_frags; /* Number of frags. */ + int fw_ref; /* Number of children out. */ + int fw_off; /* Current working position. */ + struct workhead fw_jwork; /* Journal work pending. */ +}; + +/* + * A "freedep" structure is allocated to track the completion of a bitmap + * write for a freework. One freedep may cover many freed blocks so long + * as they reside in the same cylinder group. When the cg is written + * the freedep decrements the ref on the freework which may permit it + * to be freed as well. + */ +struct freedep { + struct worklist fd_list; + struct freework *fd_freework; /* Parent freework. */ +}; + +/* * A "freefile" structure is attached to an inode when its * link count is reduced to zero. It marks the inode as free in * the cylinder group map after the zero'ed inode has been written @@ -450,6 +544,7 @@ struct freefile { mode_t fx_mode; /* mode of inode */ ino_t fx_oldinum; /* inum of the unlinked file */ struct vnode *fx_devvp; /* filesystem device vnode */ + struct workhead fx_jwork; /* journal work pending. */ }; /* @@ -482,12 +577,11 @@ struct freefile { * than zero. * * The overlaying of da_pagedep and da_previous is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. If a - * da_previous entry is present, the pointer to its pagedep is available - * in the associated dirrem entry. If the DIRCHG flag is set, the - * da_previous entry is valid; if not set the da_pagedep entry is valid. - * The DIRCHG flag never changes; it is set when the structure is created - * if appropriate and is never cleared. + * structure down. If a da_previous entry is present, the pointer to its + * pagedep is available in the associated dirrem entry. If the DIRCHG flag + * is set, the da_previous entry is valid; if not set the da_pagedep entry + * is valid. The DIRCHG flag never changes; it is set when the structure + * is created if appropriate and is never cleared. */ struct diradd { struct worklist da_list; /* id_inowait or id_pendinghd list */ @@ -499,6 +593,7 @@ struct diradd { struct dirrem *dau_previous; /* entry being replaced in dir change */ struct pagedep *dau_pagedep; /* pagedep dependency for addition */ } da_un; + struct workhead da_jwork; /* Journal work awaiting completion. */ }; #define da_previous da_un.dau_previous #define da_pagedep da_un.dau_pagedep @@ -525,12 +620,13 @@ struct diradd { * mkdir structures that reference it. The deletion would be faster if the * diradd structure were simply augmented to have two pointers that referenced * the associated mkdir's. However, this would increase the size of the diradd - * structure from 32 to 64-bits to speed a very infrequent operation. + * structure to speed a very infrequent operation. */ struct mkdir { struct worklist md_list; /* id_inowait or buffer holding dir */ # define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */ struct diradd *md_diradd; /* associated diradd */ + struct jaddref *md_jaddref; /* dependent jaddref. */ struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */ LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */ }; @@ -542,20 +638,19 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; * list of the pagedep for the directory page that contains the entry. * It is processed after the directory page with the deleted entry has * been written to disk. - * - * The overlaying of dm_pagedep and dm_dirinum is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. 
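
The freework/freedep comments above describe a reference-counted teardown: each child of an indirect block holds a reference, releasing the last child releases the parent, and so on up to the freeblks root. Reduced to a runnable toy (struct fw is an invented stand-in):

#include <stdio.h>
#include <stdlib.h>

struct fw {
        struct fw *parent;
        int ref;                /* children still outstanding */
};

static void
fw_release(struct fw *f)
{
        while (f != NULL && --f->ref == 0) {
                struct fw *p = f->parent;

                printf("node %p done\n", (void *)f);
                free(f);
                f = p;          /* cascade upward */
        }
}

int
main(void)
{
        struct fw *root = calloc(1, sizeof(*root));
        struct fw *c1 = calloc(1, sizeof(*c1)), *c2 = calloc(1, sizeof(*c2));

        root->ref = 2;
        c1->parent = c2->parent = root;
        c1->ref = c2->ref = 1;
        fw_release(c1);         /* root still has one child out */
        fw_release(c2);         /* last child: root completes too */
        return (0);
}
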
It works - * because they are never used concurrently. */ struct dirrem { struct worklist dm_list; /* delayed worklist */ # define dm_state dm_list.wk_state /* state of the old directory entry */ LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */ + LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */ + struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */ ino_t dm_oldinum; /* inum of the removed dir entry */ union { struct pagedep *dmu_pagedep; /* pagedep dependency for remove */ ino_t dmu_dirinum; /* parent inode number (for rmdir) */ } dm_un; + struct workhead dm_jwork; /* Journal work awaiting completion. */ }; #define dm_pagedep dm_un.dmu_pagedep #define dm_dirinum dm_un.dmu_dirinum @@ -577,9 +672,186 @@ struct dirrem { * blocks using a similar scheme with the allocindir structures. Rather * than adding this level of complexity, we simply write those newly * allocated indirect blocks synchronously as such allocations are rare. + * In the case of a new directory the . and .. links are tracked with + * a mkdir rather than a pagedep. In this case we track the mkdir + * so it can be released when it is written. A workhead is used + * to simplify canceling a mkdir that is removed by a subsequent dirrem. */ struct newdirblk { struct worklist db_list; /* id_inowait or pg_newdirblk */ # define db_state db_list.wk_state /* unused */ struct pagedep *db_pagedep; /* associated pagedep */ + struct workhead db_mkdir; }; + +/* + * The inoref structure holds the elements common to jaddref and jremref + * so they may easily be queued in-order on the inodedep. + */ +struct inoref { + struct worklist if_list; +# define if_state if_list.wk_state + TAILQ_ENTRY(inoref) if_deps; /* Links for inodedep. */ + struct jsegdep *if_jsegdep; + off_t if_diroff; /* Directory offset. */ + ino_t if_ino; /* Inode number. */ + ino_t if_parent; /* Parent inode number. */ + nlink_t if_nlink; /* nlink before addition. */ + uint16_t if_mode; /* File mode, needed for IFMT. */ +}; + +/* + * A "jaddref" structure tracks a new reference (link count) on an inode + * and prevents the link count increase and bitmap allocation until a + * journal entry can be written. Once the journal entry is written, + * the inode is put on the pendinghd of the bmsafemap and a diradd or + * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE + * flag is used to indicate that all of the required information for writing + * the journal entry is present. MKDIR_BODY and MKDIR_PARENT are used to + * differentiate . and .. links from regular file names. NEWBLOCK indicates + * a bitmap is still pending. If a new reference is canceled by a delete + * prior to writing the journal the jaddref write is canceled and the + * structure persists to prevent any disk-visible changes until it is + * ultimately released when the file is freed or the link is dropped again. + */ +struct jaddref { + struct inoref ja_ref; +# define ja_list ja_ref.if_list /* Journal pending or jseg entries. */ +# define ja_state ja_ref.if_list.wk_state + LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */ + union { + struct diradd *jau_diradd; /* Pending diradd. 
*/ + struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */ + } ja_un; +}; +#define ja_diradd ja_un.jau_diradd +#define ja_mkdir ja_un.jau_mkdir +#define ja_diroff ja_ref.if_diroff +#define ja_ino ja_ref.if_ino +#define ja_parent ja_ref.if_parent +#define ja_mode ja_ref.if_mode + +/* + * A "jremref" structure tracks a removed reference (unlink) on an + * inode and prevents the directory remove from proceeding until the + * journal entry is written. Once the journal has been written the remove + * may proceed as normal. + */ +struct jremref { + struct inoref jr_ref; +# define jr_list jr_ref.if_list /* Journal pending or jseg entries. */ +# define jr_state jr_ref.if_list.wk_state + LIST_ENTRY(jremref) jr_deps; /* Links for pagdep. */ + struct dirrem *jr_dirrem; /* Back pointer to dirrem. */ +}; + +struct jmvref { + struct worklist jm_list; + LIST_ENTRY(jmvref) jm_deps; + struct pagedep *jm_pagedep; + ino_t jm_parent; + ino_t jm_ino; + off_t jm_oldoff; + off_t jm_newoff; +}; + +/* + * A "jnewblk" structure tracks a newly allocated block or fragment and + * prevents the direct or indirect block pointer as well as the cg bitmap + * from being written until it is logged. After it is logged the jsegdep + * is attached to the allocdirect or allocindir until the operation is + * completed or reverted. If the operation is reverted prior to the journal + * write the jnewblk structure is maintained to prevent the bitmaps from + * reaching the disk. Ultimately the jnewblk structure will be passed + * to the free routine as the in memory cg is modified back to the free + * state at which time it can be released. + */ +struct jnewblk { + struct worklist jn_list; +# define jn_state jn_list.wk_state + struct jsegdep *jn_jsegdep; + LIST_ENTRY(jnewblk) jn_deps; /* All jnewblks on bmsafemap */ + struct newblk *jn_newblk; + ino_t jn_ino; + ufs_lbn_t jn_lbn; + ufs2_daddr_t jn_blkno; + int jn_oldfrags; + int jn_frags; +}; + +/* + * A "jfreeblk" structure tracks the journal write for freeing a block + * or tree of blocks. The block pointer must not be cleared in the inode + * or indirect prior to the jfreeblk being written. + */ +struct jfreeblk { + struct worklist jf_list; +# define jf_state jf_list.wk_state + struct jsegdep *jf_jsegdep; + struct freeblks *jf_freeblks; + LIST_ENTRY(jfreeblk) jf_deps; + ino_t jf_ino; + ufs_lbn_t jf_lbn; + ufs2_daddr_t jf_blkno; + int jf_frags; +}; + +/* + * A "jfreefrag" tracks the freeing of a single block when a fragment is + * extended or an indirect page is replaced. It is not part of a larger + * freeblks operation. + */ +struct jfreefrag { + struct worklist fr_list; +# define fr_state fr_list.wk_state + struct jsegdep *fr_jsegdep; + struct freefrag *fr_freefrag; + ino_t fr_ino; + ufs_lbn_t fr_lbn; + ufs2_daddr_t fr_blkno; + int fr_frags; +}; + +/* + * A "jsegdep" structure tracks a single reference to a written journal + * segment so the journal space can be reclaimed when all dependencies + * have been written. + */ +struct jsegdep { + struct worklist jd_list; +# define jd_state jd_list.wk_state + struct jseg *jd_seg; +}; + +/* + * A "jseg" structure contains all of the journal records written in a + * single disk write. jaddref and jremref structures are linked into + * js_entries so thay may be completed when the write completes. The + * js_deps array contains as many entries as there are ref counts to + * reduce the number of allocations required per journal write to one. 
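
The jseg described above (its definition opens the next hunk) batches every record of one journal disk write into a single segment; js_refs counts the records not yet retired, and the journal space becomes recyclable only when it drains to zero. That accounting in miniature, hypothetical names throughout:

struct jseg_toy {
        unsigned long js_seq;   /* ordering of segments on disk */
        int js_refs;            /* records not yet retired */
};

static int
jseg_rele(struct jseg_toy *seg)
{
        /* called as each dependent operation completes */
        if (--seg->js_refs == 0)
                return (1);     /* caller may reclaim the journal space */
        return (0);
}
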
+
+/*
+ * A "jseg" structure contains all of the journal records written in a
+ * single disk write.  jaddref and jremref structures are linked into
+ * js_entries so they may be completed when the write completes.  The
+ * js_deps array contains as many entries as there are ref counts to
+ * reduce the number of allocations required per journal write to one.
+ */
+struct jseg {
+	struct	worklist js_list;	/* b_deps link for journal */
+#	define	js_state js_list.wk_state
+	struct	workhead js_entries;	/* Entries awaiting write */
+	TAILQ_ENTRY(jseg) js_next;
+	struct	jblocks *js_jblocks;	/* Back pointer to block/seg list */
+	struct	buf *js_buf;		/* Buffer while unwritten */
+	uint64_t js_seq;
+	int	js_size;		/* Allocated size in bytes */
+	int	js_cnt;			/* Total items allocated */
+	int	js_refs;		/* Count of items pending completion */
+};
+
+/*
+ * A 'sbdep' structure tracks the head of the free inode list and
+ * superblock writes.  This makes sure the superblock is always pointing at
+ * the first possible unlinked inode for the suj recovery process.  If a
+ * block write completes and we discover a new head is available the buf
+ * is dirtied and the dep is kept.
+ */
+struct sbdep {
+	struct	worklist sb_list;	/* b_dep linkage */
+	struct	fs *sb_fs;		/* Filesystem pointer within buf. */
+	struct	ufsmount *sb_ump;
+};
Index: /usr/src/sys/ufs/ffs/ffs_balloc.c
===================================================================
--- /usr/src/sys/ufs/ffs/ffs_balloc.c	(revision 202614)
+++ /usr/src/sys/ufs/ffs/ffs_balloc.c	(working copy)
@@ -120,6 +120,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse
 	if (lbn < 0)
 		return (EFBIG);
+	if (DOINGSOFTDEP(vp))
+		softdep_prealloc(vp, MNT_WAIT);
 	/*
 	 * If the next write will extend the file into a new block,
 	 * and the file is currently composed of a fragment
@@ -418,6 +420,8 @@ fail:
 	 * slow, running out of disk space is not expected to be a common
 	 * occurence. The error return from fsync is ignored as we already
 	 * have an error to return to the user.
+	 *
+	 * XXX Still have to journal the free below
 	 */
 	(void) ffs_syncvnode(vp, MNT_WAIT);
 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -473,7 +477,7 @@ fail:
 	 */
 	for (blkp = allociblk; blkp < allocblk; blkp++) {
 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number);
+		    ip->i_number, NULL);
 	}
 	return (error);
 }
@@ -515,6 +519,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse
 	if (lbn < 0)
 		return (EFBIG);
+	if (DOINGSOFTDEP(vp))
+		softdep_prealloc(vp, MNT_WAIT);
+
 	/*
 	 * Check for allocating external data.
 	 */
@@ -930,6 +937,8 @@ fail:
 	 * slow, running out of disk space is not expected to be a common
 	 * occurence. The error return from fsync is ignored as we already
 	 * have an error to return to the user.
+	 *
+	 * XXX Still have to journal the free below
 	 */
 	(void) ffs_syncvnode(vp, MNT_WAIT);
 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -985,7 +994,7 @@ fail:
 	 */
 	for (blkp = allociblk; blkp < allocblk; blkp++) {
 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number);
+		    ip->i_number, NULL);
 	}
 	return (error);
 }
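The two ffs_balloc hunks above share one pattern: reserve journal resources before any allocation state is held, so the journal can be flushed for space without deadlocking against a half-finished allocation. A condensed view of that calling convention (sketch only; softdep_prealloc() itself is implemented elsewhere in the patch and not shown in this excerpt):

	/*
	 * Sketch of the pre-allocation hook's calling pattern at the top
	 * of a balloc entry point.
	 */
	static int
	balloc_entry_sketch(struct vnode *vp, ufs_lbn_t lbn)
	{
		if (lbn < 0)
			return (EFBIG);
		if (DOINGSOFTDEP(vp))
			softdep_prealloc(vp, MNT_WAIT); /* may block for journal space */
		/* ...existing allocation logic continues unchanged... */
		return (0);
	}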
Index: /usr/src/sys/ufs/ffs/ffs_inode.c
===================================================================
--- /usr/src/sys/ufs/ffs/ffs_inode.c	(revision 202614)
+++ /usr/src/sys/ufs/ffs/ffs_inode.c	(working copy)
@@ -92,15 +92,6 @@ ffs_update(vp, waitfor)
 	fs = ip->i_fs;
 	if (fs->fs_ronly)
 		return (0);
-	/*
-	 * Ensure that uid and gid are correct. This is a temporary
-	 * fix until fsck has been changed to do the update.
-	 */
-	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
-	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
-		ip->i_din1->di_ouid = ip->i_uid;	/* XXX */
-		ip->i_din1->di_ogid = ip->i_gid;	/* XXX */
-	}						/* XXX */
 	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		(int)fs->fs_bsize, NOCRED, &bp);
 	if (error) {
@@ -232,7 +223,7 @@ ffs_truncate(vp, length, flags, cred, td)
 			if (oldblks[i] == 0)
 				continue;
 			ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i],
-			    sblksize(fs, osize, i), ip->i_number);
+			    sblksize(fs, osize, i), ip->i_number, NULL);
 		}
 	}
 }
@@ -336,6 +327,8 @@ ffs_truncate(vp, length, flags, cred, td)
 	 * zero'ed in case it ever becomes accessible again because
 	 * of subsequent file growth. Directories however are not
 	 * zero'ed as they should grow back initialized to empty.
+	 *
+	 * XXX Still need to manually journal this.
 	 */
 	offset = blkoff(fs, length);
 	if (offset == 0) {
@@ -445,7 +438,7 @@ ffs_truncate(vp, length, flags, cred, td)
 		if (lastiblock[level] < 0) {
 			DIP_SET(ip, i_ib[level], 0);
 			ffs_blkfree(ump, fs, ip->i_devvp, bn,
-			    fs->fs_bsize, ip->i_number);
+			    fs->fs_bsize, ip->i_number, NULL);
 			blocksreleased += nblocks;
 		}
 	}
@@ -464,7 +457,8 @@ ffs_truncate(vp, length, flags, cred, td)
 			continue;
 		DIP_SET(ip, i_db[i], 0);
 		bsize = blksize(fs, ip, i);
-		ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number);
+		ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number,
+		    NULL);
 		blocksreleased += btodb(bsize);
 	}
 	if (lastblock < 0)
@@ -496,7 +490,7 @@ ffs_truncate(vp, length, flags, cred, td)
 			 */
 			bn += numfrags(fs, newspace);
 			ffs_blkfree(ump, fs, ip->i_devvp, bn,
-			    oldspace - newspace, ip->i_number);
+			    oldspace - newspace, ip->i_number, NULL);
 			blocksreleased += btodb(oldspace - newspace);
 		}
 	}
@@ -638,7 +632,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp
 			blocksreleased += blkcount;
 		}
 		ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize,
-		    ip->i_number);
+		    ip->i_number, NULL);
 		blocksreleased += nblocks;
 	}
Index: /usr/src/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- /usr/src/sys/ufs/ffs/ffs_snapshot.c	(revision 202614)
+++ /usr/src/sys/ufs/ffs/ffs_snapshot.c	(working copy)
@@ -582,7 +582,8 @@ loop:
 			len = fragroundup(fs, blkoff(fs, xp->i_size));
 			if (len != 0 && len < fs->fs_bsize) {
 				ffs_blkfree(ump, copy_fs, vp,
-				    DIP(xp, i_db[loc]), len, xp->i_number);
+				    DIP(xp, i_db[loc]), len, xp->i_number,
+				    NULL);
 				blkno = DIP(xp, i_db[loc]);
 				DIP_SET(xp, i_db[loc], 0);
 			}
@@ -598,7 +599,7 @@ loop:
 			DIP_SET(xp, i_db[loc], blkno);
 		if (!error)
 			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
-			    xp->i_mode);
+			    xp->i_mode, NULL);
 		VOP_UNLOCK(xvp, 0);
 		vdrop(xvp);
 		if (error) {
@@ -700,7 +701,7 @@ out1:
 				    copy_fs, vp, xp->i_number,
-				    xp->i_mode);
+				    xp->i_mode, NULL);
 		}
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
@@ -1220,7 +1221,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, ex
 			*ip->i_snapblklist++ = lblkno;
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
-		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
 	}
 	return (0);
 }
@@ -1500,7 +1501,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, ex
 			*ip->i_snapblklist++ = lblkno;
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
-		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
 	}
 	return (0);
 }
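Every ffs_blkfree() and ffs_freefile() call site in the three files above gains a trailing NULL. The callee is not shown in this excerpt; a reasonable reading is that the new final parameter carries pending journal dependencies for the freed block or inode, with NULL preserving the old unjournaled behaviour. A sketch of the calling convention under that assumption:

	/*
	 * Sketch: the extended ffs_blkfree() interface as used above.  The
	 * final argument is assumed to hand over journal work (for example
	 * a jnewblk to cancel); every call site in this patch passes NULL.
	 */
	static void
	blkfree_compat_sketch(struct ufsmount *ump, struct fs *fs,
	    struct inode *ip, ufs2_daddr_t blkno, long size)
	{
		ffs_blkfree(ump, fs, ip->i_devvp, blkno, size, ip->i_number,
		    NULL);
	}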
Index: /usr/src/sys/ufs/ffs/fs.h
===================================================================
--- /usr/src/sys/ufs/ffs/fs.h	(revision 202614)
+++ /usr/src/sys/ufs/ffs/fs.h	(working copy)
@@ -340,7 +340,10 @@ struct fs {
 	int32_t	 fs_avgfilesize;	/* expected average file size */
 	int32_t	 fs_avgfpdir;		/* expected # of files per directory */
 	int32_t	 fs_save_cgsize;	/* save real cg size to use fs_bsize */
-	int32_t	 fs_sparecon32[26];	/* reserved for future constants */
+	int32_t	 fs_sujournal;		/* SUJ journal file */
+	int32_t	 fs_sujfree;		/* SUJ free list */
+	ufs_time_t fs_mtime;		/* Last mount or fsck time. */
+	int32_t	 fs_sparecon32[22];	/* reserved for future constants */
 	int32_t	 fs_flags;		/* see FS_ flags below */
 	int32_t	 fs_contigsumsize;	/* size of cluster summary array */
 	int32_t	 fs_maxsymlinklen;	/* max length of an internal symlink */
@@ -414,6 +417,7 @@ CTASSERT(sizeof(struct fs) == 1376);
 #define FS_GJOURNAL	0x0040	/* gjournaled file system */
 #define FS_FLAGS_UPDATED 0x0080	/* flags have been moved to new location */
 #define FS_NFS4ACLS	0x0100	/* file system has NFSv4 ACLs enabled */
+#define FS_SUJ		0x200	/* Filesystem using softupdate journal */
 
 /*
  * Macros to access bits in the fs_active array.
@@ -603,8 +607,32 @@ struct cg {
 	? (fs)->fs_bsize \
 	: (fragroundup(fs, blkoff(fs, (size)))))
-
 /*
+ * Indirect lbns are aligned on NDADDR addresses where single indirects
+ * are the negated address of the lowest lbn reachable, double indirects
+ * are this lbn - 1 and triple indirects are this lbn - 2.  This yields
+ * an unusual bit order to determine level.
+ */
+static inline int
+lbn_level(ufs_lbn_t lbn)
+{
+	if (lbn >= 0)
+		return 0;
+	switch (lbn & 0x3) {
+	case 0:
+		return (0);
+	case 1:
+		break;
+	case 2:
+		return (2);
+	case 3:
+		return (1);
+	default:
+		break;
+	}
+	return (-1);
+}
+/*
  * Number of inodes in a secondary storage block/fragment.
  */
 #define	INOPB(fs)	((fs)->fs_inopb)
@@ -615,6 +643,78 @@ struct cg {
  */
 #define	NINDIR(fs)	((fs)->fs_nindir)
 
+/*
+ * Softdep journal record format.
+ */
+
+#define	JOP_ADDREF	1	/* Add a reference to an inode. */
+#define	JOP_REMREF	2	/* Remove a reference from an inode. */
+#define	JOP_NEWBLK	3	/* Allocate a block. */
+#define	JOP_FREEBLK	4	/* Free a block or a tree of blocks. */
+#define	JOP_MVREF	5	/* Move a reference from one off to another. */
+
+#define	JREC_SIZE	32	/* Record and segment header size. */
+
+#define	SUJ_MIN		(1 * 1024 * 1024)	/* Minimum journal size */
+#define	SUJ_MAX		(64 * SUJ_MIN)		/* Maximum journal size */
+
+/*
+ * Size of the segment record header.  There is at most one for each disk
+ * block and at least one for each filesystem block in the journal.  The
+ * segment header is followed by an array of records.
+ */
+struct jsegrec {
+	uint64_t	jsr_seq;	/* Our sequence number */
+	uint64_t	jsr_oldest;	/* Oldest valid sequence number */
+	uint32_t	jsr_cnt;	/* Count of valid records */
+	uint32_t	jsr_crc;	/* 32bit crc of the valid space */
+	ufs_time_t	jsr_time;	/* timestamp for mount instance */
+};
+
+struct jrefrec {
+	uint32_t	jr_op;
+	ino_t		jr_ino;
+	ino_t		jr_parent;
+	uint16_t	jr_nlink;
+	uint16_t	jr_mode;
+	off_t		jr_diroff;
+	uint64_t	jr_unused;
+};
+
+struct jmvrec {
+	uint32_t	jm_op;
+	ino_t		jm_ino;
+	ino_t		jm_parent;
+	uint16_t	jm_unused;
+	off_t		jm_oldoff;
+	off_t		jm_newoff;
+};
+
+struct jblkrec {
+	uint32_t	jb_op;
+	uint32_t	jb_ino;
+	ufs2_daddr_t	jb_blkno;
+	ufs_lbn_t	jb_lbn;
+	uint16_t	jb_frags;
+	uint16_t	jb_oldfrags;
+	uint32_t	jb_unused;
+};
+
+union jrec {
+	struct	jsegrec rec_jsegrec;
+	struct	jrefrec rec_jrefrec;
+	struct	jmvrec rec_jmvrec;
+	struct	jblkrec rec_jblkrec;
+};
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct jsegrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jrefrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jmvrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jblkrec) == JREC_SIZE);
+CTASSERT(sizeof(union jrec) == JREC_SIZE);
+#endif
+
 extern int inside[], around[];
 extern u_char *fragtbl[];
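A standalone sanity check of the lbn_level() encoding added to fs.h above. The concrete values assume NDADDR == 12 (as on FreeBSD UFS), so the first single indirect is lbn -12, its double indirect -13 and its triple -14; because NDADDR and NINDIR are both multiples of 4, every single indirect is 0 mod 4, every double is 3 mod 4 and every triple is 2 mod 4. This hypothetical userland test is not part of the patch:

	#include <assert.h>
	#include <stdint.h>

	typedef int64_t ufs_lbn_t;	/* matches the UFS2 definition */

	/* Copy of the patch's encoding for a userland check. */
	static inline int
	lbn_level(ufs_lbn_t lbn)
	{
		if (lbn >= 0)
			return (0);
		switch (lbn & 0x3) {
		case 0:
			return (0);	/* single indirect */
		case 2:
			return (2);	/* triple indirect */
		case 3:
			return (1);	/* double indirect */
		}
		return (-1);		/* case 1: not a valid indirect lbn */
	}

	int
	main(void)
	{
		assert(lbn_level(-12) == 0);	/* single: -NDADDR */
		assert(lbn_level(-13) == 1);	/* double: single - 1 */
		assert(lbn_level(-14) == 2);	/* triple: single - 2 */
		return (0);
	}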
Index: /usr/src/sys/kern/vfs_bio.c
===================================================================
--- /usr/src/sys/kern/vfs_bio.c	(revision 202614)
+++ /usr/src/sys/kern/vfs_bio.c	(working copy)
@@ -216,6 +216,14 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLA
 static int bd_request;
 
 /*
+ * Request for the buf daemon to write more buffers than is indicated by
+ * lodirtybuffers.  This may be necessary to push out excess dependencies or
+ * defragment the address space where a simple count of the number of dirty
+ * buffers is insufficient to characterize the demand for flushing them.
+ */
+static int bd_speedupreq;
+
+/*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx bdlock;
@@ -467,12 +475,20 @@ bd_wakeup(int dirtybuflevel)
  * bd_speedup - speedup the buffer cache flushing code
  */
-static __inline
 void
 bd_speedup(void)
 {
+	int needwake;
 
-	bd_wakeup(1);
+	mtx_lock(&bdlock);
+	needwake = 0;
+	if (bd_speedupreq == 0 || bd_request == 0)
+		needwake = 1;
+	bd_speedupreq = 1;
+	bd_request = 1;
+	if (needwake)
+		wakeup(&bd_request);
+	mtx_unlock(&bdlock);
 }
 
 /*
@@ -2120,6 +2136,7 @@ buf_do_flush(struct vnode *vp)
 static void
 buf_daemon()
 {
+	int lodirtysave;
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
@@ -2137,7 +2154,11 @@ buf_daemon()
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
-
+		lodirtysave = lodirtybuffers;
+		if (bd_speedupreq) {
+			lodirtybuffers = numdirtybuffers / 2;
+			bd_speedupreq = 0;
+		}
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
@@ -2149,6 +2170,7 @@ buf_daemon()
 				break;
 			uio_yield();
 		}
+		lodirtybuffers = lodirtysave;
 
 		/*
 		 * Only clear bd_request if we have reached our low water
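With the vfs_bio.c changes above, bd_speedup() is no longer a private inline but an exported entry point (declared in the buf.h hunk below), so a subsystem such as the softdep journal can ask the buf daemon to flush beyond its usual lodirtybuffers target. A sketch of a caller, with journal_space_check_sketch() and JSPACE_LOW as assumed placeholder names not found in the patch:

	#define	JSPACE_LOW	(4 * 1024 * 1024)	/* assumed low-water mark */

	/*
	 * Hypothetical consumer: when free journal space runs low, push the
	 * buf daemon to write dependencies out faster.  While bd_speedupreq
	 * is set the daemon temporarily targets numdirtybuffers / 2 instead
	 * of lodirtybuffers, as implemented above.
	 */
	static void
	journal_space_check_sketch(int journal_free_bytes)
	{
		if (journal_free_bytes < JSPACE_LOW)
			bd_speedup();
	}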
Index: /usr/src/sys/kern/vfs_subr.c
===================================================================
--- /usr/src/sys/kern/vfs_subr.c	(revision 202614)
+++ /usr/src/sys/kern/vfs_subr.c	(working copy)
@@ -2816,6 +2816,7 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	MNT_FLAG(MNT_FORCE);
 	MNT_FLAG(MNT_SNAPSHOT);
 	MNT_FLAG(MNT_BYFSID);
+	MNT_FLAG(MNT_SOFTDEP);
 #undef MNT_FLAG
 	if (flags != 0) {
 		if (buf[0] != '\0')
Index: /usr/src/sys/sys/mount.h
===================================================================
--- /usr/src/sys/sys/mount.h	(revision 202614)
+++ /usr/src/sys/sys/mount.h	(working copy)
@@ -240,6 +240,7 @@ void __mnt_vnode_markerfree(struct vnode
 #define	MNT_NOCLUSTERR	0x40000000	/* disable cluster read */
 #define	MNT_NOCLUSTERW	0x80000000	/* disable cluster write */
 #define	MNT_NFS4ACLS	0x00000010
+#define	MNT_SUJ		0x00000020	/* softdep journaling */
 
 /*
  * NFS export related mount flags.
@@ -275,7 +276,8 @@ void __mnt_vnode_markerfree(struct vnode
 			MNT_ROOTFS	| MNT_NOATIME	| MNT_NOCLUSTERR| \
 			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	| \
 			MNT_IGNORE	| MNT_EXPUBLIC	| MNT_NOSYMFOLLOW | \
-			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS | MNT_NFS4ACLS)
+			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS	| \
+			MNT_NFS4ACLS	| MNT_SUJ)
 
 /* Mask of flags that can be updated. */
 #define	MNT_UPDATEMASK	(MNT_NOSUID	| MNT_NOEXEC	| \
Index: /usr/src/sys/sys/buf.h
===================================================================
--- /usr/src/sys/sys/buf.h	(revision 202614)
+++ /usr/src/sys/sys/buf.h	(working copy)
@@ -493,6 +493,7 @@ int	bufwait(struct buf *);
 int	bufwrite(struct buf *);
 void	bufdone(struct buf *);
 void	bufdone_finish(struct buf *);
+void	bd_speedup(void);
 int	cluster_read(struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, struct buf **);
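Finally, a sketch of how the new on-disk FS_SUJ flag and in-core MNT_SUJ flag introduced above would likely be paired at mount time; the actual mount-path hunk is not in this excerpt, so this is an assumption about usage rather than patch code:

	/*
	 * Hypothetical mount-time pairing: a superblock marked FS_SUJ causes
	 * the mount point to advertise MNT_SUJ, which db_show_mount and
	 * MNT_VISFLAGMASK above already know how to display and preserve.
	 */
	static void
	suj_mount_flag_sketch(struct mount *mp, struct fs *fs)
	{
		if ((fs->fs_flags & FS_SUJ) != 0) {
			MNT_ILOCK(mp);
			mp->mnt_flag |= MNT_SUJ;
			MNT_IUNLOCK(mp);
		}
	}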