GENERIC HEAD from 2009-12-14 22:55:20 UTC, r200565M, vmcore.12 KDB: debugger backends: ddb KDB: current backend: ddb Copyright (c) 1992-2009 The FreeBSD Project. Copyright (c) 1979, 1980, 1983, 1986, 1988, 1989, 1991, 1992, 1993, 1994 The Regents of the University of California. All rights reserved. FreeBSD is a registered trademark of The FreeBSD Foundation. FreeBSD 9.0-CURRENT #0 r200565M: Thu Dec 17 23:12:32 CET 2009 pho@crashbox.osted.lan:/usr/src/sys/i386/compile/PHO i386 WARNING: WITNESS option enabled, expect reduced performance. WARNING: DIAGNOSTIC option enabled, expect reduced performance. Timecounter "i8254" frequency 1193182 Hz quality 0 CPU: Intel(R) XEON(TM) CPU 1.80GHz (1799.81-MHz 686-class CPU) Origin = "GenuineIntel" Id = 0xf24 Stepping = 4 Features=0x3febfbff real memory = 1073741824 (1024 MB) avail memory = 1031368704 (983 MB) : Trying to mount root from ufs:/dev/ad0s1a Entropy harvesting: interrupts ethernet point_to_point kickstart. /dev/ad0s1a: FILE SYSTEM CLEAN; SKIPPING CHECKS /dev/ad0s1a: clean, 269387 free (651 frags, 33592 blocks, 0.1% fragmentation) /dev/ad0s1f: FILE SYSTEM CLEAN; SKIPPING CHECKS /dev/ad0s1f: clean, 449949 free (501 frags, 56181 blocks, 0.1% fragmentation) /dev/ad0s1e: FILE SYSTEM CLEAN; SKIPPING CHECKS /dev/ad0s1d: FILE SYSTEM CLEAN; SKIPPING CHECKS /dev/ad0s1d: clean, 2844438 free (50238 frags, 349275 blocks, 1.0% fragmentation) /dev/ad0s1g: FILE SYSTEM CLEAN; SKIPPING CHECKS /dev/ad0s1g: clean, 21745933 free (23413 frags, 2715315 blocks, 0.1% fragmentation) lock order reversal: 1st 0xd85049e0 bufwait (bufwait) @ kern/vfs_bio.c:2559 2nd 0xc483a800 dirhash (dirhash) @ ufs/ufs/ufs_dirhash.c:283 KDB: stack backtrace: db_trace_self_wrapper(c0ca001d,e6cf9880,c08d0105,c08c0beb,c0ca2fc1,...) at db_trace_self_wrapper+0x26 kdb_backtrace(c08c0beb,c0ca2fc1,c4538030,c453b290,e6cf98dc,...) at kdb_backtrace+0x29 _witness_debugger(c0ca2fc1,c483a800,c0cc6214,c453b290,c0cc5eba,...) at _witness_debugger+0x25 witness_checkorder(c483a800,9,c0cc5eb1,11b,0,...) at witness_checkorder+0x839 _sx_xlock(c483a800,0,c0cc5eb1,11b,c4bfc488,...) at _sx_xlock+0x85 ufsdirhash_acquire(d8504980,e6cf9a1c,164,d8bbd4ac,e6cf99a8,...) at ufsdirhash_acquire+0x48 ufsdirhash_add(c4bfc488,e6cf9a1c,4ac,e6cf9994,e6cf9998,...) at ufsdirhash_add+0x13 ufs_direnter(c4b76d98,c4c01000,e6cf9a1c,e6cf9c00,d85073c0,...) at ufs_direnter+0x669 ufs_mkdir(e6cf9c28,c0cdb787,0,0,e6cf9b6c,...) at ufs_mkdir+0x981 VOP_MKDIR_APV(c0daad00,e6cf9c28,e6cf9c00,e6cf9b6c,0,...) at VOP_MKDIR_APV+0xc5 kern_mkdirat(c4b51b40,ffffff9c,bfbfef5a,0,1ff,...) at kern_mkdirat+0x21b kern_mkdir(c4b51b40,bfbfef5a,0,1ff,e6cf9d2c,...) at kern_mkdir+0x2e mkdir(c4b51b40,e6cf9cf8,8,c0ca3887,c0d88fe0,...) at mkdir+0x29 syscall(e6cf9d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (136, FreeBSD ELF32, mkdir), eip = 0x2816a5c3, esp = 0xbfbfed6c, ebp = 0xbfbfee38 --- fxp0: link state changed to UP Starting Network: lo0 fxp0. add net default: gateway 192.168.1.1 Additional ABI support: linux. Starting mountd. Configuring syscons: keymap blanktime. Local package initialization:lock order reversal: 1st 0xc4be146c ufs (ufs) @ kern/vfs_subr.c:2083 2nd 0xd8511b80 bufwait (bufwait) @ ufs/ffs/ffs_softdep.c:9723 3rd 0xc4daa46c ufs (ufs) @ kern/vfs_subr.c:2083 KDB: stack backtrace: db_trace_self_wrapper(c0ca001d,e6dfc85c,c08d0105,c08c0beb,c0ca2fda,...) at db_trace_self_wrapper+0x26 kdb_backtrace(c08c0beb,c0ca2fda,c4538030,c453b228,e6dfc8b8,...) at kdb_backtrace+0x29 _witness_debugger(c0ca2fda,c4daa46c,c0c957ae,c453b228,c0caa1b4,...) at _witness_debugger+0x25 witness_checkorder(c4daa46c,9,c0caa1ab,823,0,...) at witness_checkorder+0x839 __lockmgr_args(c4daa46c,80100,c4daa4d8,0,0,...) at __lockmgr_args+0x824 ffs_lock(e6dfc9d8,c08cfeab,c0ca9692,80100,c4daa414,...) at ffs_lock+0xa1 VOP_LOCK1_APV(c0daad00,e6dfc9d8,c4be8e24,c0dc55a0,c4daa414,...) at VOP_LOCK1_APV+0xb5 _vn_lock(c4daa414,80100,c0caa1ab,823,4,...) at _vn_lock+0x78 vget(c4daa414,80100,c4be8d80,50,0,...) at vget+0xbb vfs_hash_get(c4aab5a8,67800,80000,c4be8d80,e6dfcb38,...) at vfs_hash_get+0xed ffs_vgetf(c4aab5a8,67800,80000,e6dfcb38,1,...) at ffs_vgetf+0x49 softdep_sync_metadata(c4be1414,0,c0cc5afd,146,0,...) at softdep_sync_metadata+0x5f3 ffs_syncvnode(c4be1414,1,c4be8d80,51d,c4be12b8,...) at ffs_syncvnode+0x3e2 ffs_sync(c4aab5a8,1,c0ca99a9,4f9,80,...) at ffs_sync+0x26f dounmount(c4aab5a8,8000000,c4be8d80,47e,9d13dd6,...) at dounmount+0x44e unmount(c4be8d80,e6dfccf8,8,c4be8d80,c0d88368,...) at unmount+0x2ff syscall(e6dfcd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (22, FreeBSD ELF32, unmount), eip = 0x280da13f, esp = 0xbfbfe68c, ebp = 0xbfbfe758 --- lock order reversal: 1st 0xc4b4b880 ufs (ufs) @ kern/vfs_mount.c:1204 2nd 0xc4b631b4 devfs (devfs) @ ufs/ffs/ffs_softdep.c:1276 KDB: stack backtrace: db_trace_self_wrapper(c0ca001d,e6dfc9dc,c08d0105,c08c0beb,c0ca2fc1,...) at db_trace_self_wrapper+0x26 kdb_backtrace(c08c0beb,c0ca2fc1,c453b228,c453b0f0,e6dfca38,...) at kdb_backtrace+0x29 _witness_debugger(c0ca2fc1,c4b631b4,c0c916b0,c453b0f0,c0cc31ed,...) at _witness_debugger+0x25 witness_checkorder(c4b631b4,9,c0cc31e4,4fc,c4b63220,...) at witness_checkorder+0x839 __lockmgr_args(c4b631b4,80400,c4b63220,0,0,...) at __lockmgr_args+0x824 vop_stdlock(e6dfcb54,c0c9b36c,df,80400,c4b6315c,...) at vop_stdlock+0x65 VOP_LOCK1_APV(c0d845a0,e6dfcb54,0,c0dc55a0,c4b6315c,...) at VOP_LOCK1_APV+0xb5 _vn_lock(c4b6315c,80400,c0cc31e4,4fc,c4aab5a8,...) at _vn_lock+0x78 softdep_flushworklist(c4aab5a8,e6dfcc00,c4be8d80,52b,c4be12b8,...) at softdep_flushworklist+0x47 ffs_sync(c4aab5a8,1,c0ca99a9,4f9,80,...) at ffs_sync+0x2fd dounmount(c4aab5a8,8000000,c4be8d80,47e,9d13dd6,...) at dounmount+0x44e unmount(c4be8d80,e6dfccf8,8,c4be8d80,c0d88368,...) at unmount+0x2ff syscall(e6dfcd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (22, FreeBSD ELF32, unmount), eip = 0x280da13f, esp = 0xbfbfe68c, ebp = 0xbfbfe758 --- ** /dev/ad0s1e ** Last Mounted on /tmp ** Phase 1 - Check Blocks and Sizes ** Phase 2 - Check Pathnames ** Phase 3 - Check Connectivity ** Phase 4 - Check Reference Counts ** Phase 5 - Check Cyl groups 58 files, 32847 used, 1996184 free (264 frags, 249490 blocks, 0.0% fragmentation) ***** FILE SYSTEM IS CLEAN ***** usage: kill [-s signal_name] pid ... kill -l [exit_status] kill -signal_name pid ... kill -signal_number pid ... fsck -y /tmp watchdogd. Fri Dec 18 00:14:05 CET 2009 Dec 18 00:14:12 crashbox su: pho to root on /dev/pts/0 Expensive timeout(9) function: 0xc0607140(0xc468e000) 0.037995209 s Expensive timeout(9) function: 0xc08eb030(0) 0.061813287 s Expensive timeout(9) function: 0xc08c67e0(0xc4be8d80) 0.113359326 s panic: handle_allocdirect_partdone: lost dep cpuid = 0 KDB: enter: panic [thread pid 3 tid 100014 ] Stopped at kdb_enter+0x3a: movl $0,kdb_why db> run pho db:0:pho> bt Tracing pid 3 tid 100014 td 0xc457e900 kdb_enter(c0c9cbe4,c0c9cbe4,c0cc4152,c42ffb4c,0,...) at kdb_enter+0x3a panic(c0cc4152,0,0,c6c7b4d0,0,...) at panic+0x136 handle_allocdirect_partdone(8c26,0,16f88,0,c4538a58,...) at handle_allocdirect_partdone+0xce softdep_disk_write_complete(d875b1a0,0,c0cc5693,6d5,c4b63254,...) at softdep_disk_write_complete+0xd8b ffs_backgroundwritedone(d875b1a0,c1879000,d875b1a0,c4800558,c42ffc98,...) at ffs_backgroundwritedone+0xa3 bufdone(d875b1a0,c454e9c0) at bufdone+0x53 g_vfs_done(c4800558,0,c0ca7da5,c16,c4800558,...) at g_vfs_done+0x85 biodone(c4800558,c0df82e8,24c,c0c92561,0,...) at biodone+0xa5 g_io_schedule_up(c457e900,0,c0c93c29,5d,0,...) at g_io_schedule_up+0xc7 g_up_procbody(0,c42ffd38,c0c980db,343,c457a2a8,...) at g_up_procbody+0x8d fork_exit(c082a170,0,c42ffd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42ffd70, ebp = 0 --- db:0:bt> show allpcpu Current CPU: 0 cpuid = 0 dynamic pcpu = 0x65f380 curthread = 0xc457e900: pid 3 "g_up" curpcb = 0xc42ffd90 fpcurthread = none idlethread = 0xc457c480: pid 11 "idle: cpu0" APIC ID = 0 currentldt = 0x50 spin locks held: cpuid = 1 dynamic pcpu = 0x34f5380 curthread = 0xc5598d80: pid 4853 "swap" curpcb = 0xe6f9bd90 fpcurthread = none idlethread = 0xc457c6c0: pid 11 "idle: cpu1" APIC ID = 1 currentldt = 0x50 spin locks held: cpuid = 2 dynamic pcpu = 0x34f8380 curthread = 0xc457c900: pid 11 "idle: cpu2" curpcb = 0xc42ded90 fpcurthread = none idlethread = 0xc457c900: pid 11 "idle: cpu2" APIC ID = 6 currentldt = 0x50 spin locks held: cpuid = 3 dynamic pcpu = 0x34fb380 curthread = 0xc457cb40: pid 11 "idle: cpu3" curpcb = 0xc42dbd90 fpcurthread = none idlethread = 0xc457cb40: pid 11 "idle: cpu3" APIC ID = 7 currentldt = 0x50 spin locks held: db:0:allpcpu> show alllocks Process 4855 (swap) thread 0xc672a900 (100265) exclusive sleep mutex swapdev (swapdev) r = 0 (0xc0f72ba0) locked @ vm/swap_pager.c:767 exclusive sleep mutex swap_pager swhash (swap_pager swhash) r = 0 (0xc0f72bec) locked @ vm/swap_pager.c:2031 exclusive sleep mutex vm object (standard object) r = 0 (0xc8198220) locked @ vm/vm_fault.c:269 shared sx user map (user map) r = 0 (0xc4b0ab28) locked @ vm/vm_map.c:3532 Process 1139 (sshd) thread 0xc517fd80 (100164) exclusive sx so_rcv_sx (so_rcv_sx) r = 0 (0xc4db4228) locked @ kern/uipc_sockbuf.c:148 Process 1137 (sshd) thread 0xc5180240 (100162) exclusive sx so_rcv_sx (so_rcv_sx) r = 0 (0xc4dbd08c) locked @ kern/uipc_sockbuf.c:148 Process 1056 (sshd) thread 0xc4e11480 (100133) exclusive sx so_rcv_sx (so_rcv_sx) r = 0 (0xc4dae3c4) locked @ kern/uipc_sockbuf.c:148 Process 16 (syncer) thread 0xc4767d80 (100047) exclusive lockmgr bufwait (bufwait) r = 0 (0xd872d1e0) locked @ kern/vfs_bio.c:1835 exclusive lockmgr devfs (devfs) r = 0 (0xc4b631b4) locked @ kern/vfs_subr.c:1693 Process 3 (g_up) thread 0xc457e900 (100014) exclusive sleep mutex Softdep Lock (Softdep Lock) r = 0 (0xc0f72a2c) locked @ ufs/ffs/ffs_softdep.c:7410 db:0:alllocks> show lockedvnods Locked vnodes 0xc4b6315c: tag devfs, type VCHR usecount 1, writecount 0, refcount 125 mountedhere 0xc4a6fc00 flags () v_object 0xc4b215d8 ref 0 pages 470 lock type devfs: EXCL by thread 0xc4767d80 (pid 16) #0 0xc0875c9e at __lockmgr_args+0xc1e #1 0xc090ed75 at vop_stdlock+0x65 #2 0xc0be3c65 at VOP_LOCK1_APV+0xb5 #3 0xc092c7f8 at _vn_lock+0x78 #4 0xc0920c12 at sync_vnode+0x142 #5 0xc0920f83 at sched_sync+0x273 #6 0xc0863018 at fork_exit+0xb8 #7 0xc0bad180 at fork_trampoline+0x8 dev ad0s1e db:0:lockedvnods> show mount 0xc4aabb50 /dev/ad0s1a on / (ufs) 0xc4aac000 devfs on /dev (devfs) 0xc4aab87c /dev/ad0s1f on /home (ufs) 0xc4aab2d4 /dev/ad0s1d on /usr (ufs) 0xc4aab000 /dev/ad0s1g on /var (ufs) 0xc506d000 /dev/ad0s1e on /tmp (ufs) More info: show mount db:0:mount> ps pid ppid pgrp uid state wmesg wchan cmd 4857 4847 1075 1001 R+ swap 4856 4847 1075 1001 SL+ swread 0xc29731e8 swap 4855 4847 1075 1001 SL+ swread 0xc2982410 swap 4854 4847 1075 1001 SL+ swread 0xc2939a80 swap 4853 4847 1075 1001 R+ CPU 1 swap 4852 4847 1075 1001 SL+ swread 0xc21abc80 swap 4851 4847 1075 1001 SL+ swread 0xc24d8db8 swap 4850 4847 1075 1001 SL+ swread 0xc1ad0410 swap 4849 4847 1075 1001 SL+ swread 0xc1e62d80 swap 4848 4847 1075 1001 SL+ swread 0xc1b9a088 swap 4847 4842 1075 1001 S+ wait 0xc5ba0d48 swap 4842 1083 1075 1001 S+ nanslp 0xc0dfa744 swap 1152 1145 1152 1001 Ss+ select 0xc5981564 top 1145 1139 1139 1001 S select 0xc5296664 sshd 1144 1143 1144 1001 Ss kqread 0xc5765d00 tail 1143 1137 1137 1001 S select 0xc52abae4 sshd 1139 945 1139 0 Ss sbwait 0xc4db4254 sshd 1137 945 1137 0 Ss sbwait 0xc4dbd0b8 sshd 1083 1082 1075 1001 S+ wait 0xc4e062a8 run 1082 1081 1075 1001 S+ wait 0xc4db9d48 run 1081 1075 1075 1001 S+ nanslp 0xc0dfa744 run 1075 1059 1075 1001 S+ wait 0xc4ad47f8 sh 1059 1058 1059 1001 Ss+ wait 0xc4e052a8 bash 1058 1056 1056 1001 S select 0xc4a9a0e4 sshd 1056 945 1056 0 Ss sbwait 0xc4dae3f0 sshd 1055 1 1055 0 Ss+ ttyin 0xc483a070 getty 1054 1 1054 0 Ss+ ttyin 0xc483a270 getty 1053 1 1053 0 Ss+ ttyin 0xc468ee70 getty 1052 1 1052 0 Ss+ ttyin 0xc468ec70 getty 1051 1 1051 0 Ss+ ttyin 0xc4820870 getty 1050 1 1050 0 Ss+ ttyin 0xc4820470 getty 1049 1 1049 0 Ss+ ttyin 0xc468ea70 getty 1048 1 1048 0 Ss+ ttyin 0xc468e670 getty 1026 1 1026 0 Ss select 0xc4da4ca4 inetd 1002 1 1002 0 Ss select 0xc4a3b1e4 moused 984 1 984 0 Ss nanslp 0xc0dfa744 watchdogd 962 1 962 0 Ss nanslp 0xc0dfa744 cron 956 1 956 25 Ss pause 0xc4dba300 sendmail 952 1 952 0 Ss select 0xc4a9a124 sendmail 945 1 945 0 Ss select 0xc4da5de4 sshd 908 1 908 0 Ss select 0xc4836564 ntpd 809 808 808 0 S (threaded) nfsd 100104 S rpcsvc 0xc4da4810 nfsd: service 100103 S rpcsvc 0xc4da4850 nfsd: service 100102 S rpcsvc 0xc4da4890 nfsd: service 100077 S rpcsvc 0xc4a6ec10 nfsd: master 808 1 808 0 Ss select 0xc4a3bea4 nfsd 806 1 806 0 Ss select 0xc4a6ec64 mountd 711 1 711 0 Ss select 0xc48350e4 rpcbind 691 1 691 0 Ss select 0xc4a9a7e4 syslogd 561 1 561 0 Ss select 0xc4834be4 devd 19 0 0 0 SL flowclea 0xc0f67288 [flowcleaner] 18 0 0 0 SL sdflush 0xc0f72a80 [softdepflush] 17 0 0 0 SL vlruwt 0xc4a51aa0 [vnlru] 16 0 0 0 SL biowr 0xd872d180 [syncer] 15 0 0 0 SL psleep 0xc0f66dc8 [bufdaemon] 9 0 0 0 SL pgzero 0xc0f73894 [pagezero] 8 0 0 0 SL psleep 0xc0f734c4 [vmdaemon] 7 0 0 0 SL psleep 0xc0f7348c [pagedaemon] 6 0 0 0 SL - 0xc468e83c [fdc0] 14 0 0 0 SL (threaded) [usb] 100034 D - 0xc479edac [usbus0] 100033 D - 0xc479ed7c [usbus0] 100032 D - 0xc479ed4c [usbus0] 100031 D - 0xc479ed1c [usbus0] 5 0 0 0 SL ccb_scan 0xc0dc6954 [xpt_thrd] 13 0 0 0 SL - 0xc0dfa5a4 [yarrow] 4 0 0 0 SL - 0xc0df8364 [g_down] 3 0 0 0 RL CPU 0 [g_up] 2 0 0 0 SL - 0xc0df8358 [g_event] 12 0 0 0 WL (threaded) [intr] 100042 I [irq7: ppc0] 100040 I [swi0: uart uart] 100039 I [irq12: psm0] 100038 I [irq1: atkbd0] 100037 I [irq15: ata1] 100036 I [irq14: ata0] 100035 I [irq17: fxp0] 100030 I [irq16: uhci0] 100028 I [irq9: acpi0] 100024 I [swi2: cambio] 100022 I [swi6: task queue] 100021 I [swi6: Giant taskq] 100019 I [swi5: +] 100012 I [swi4: clock] 100011 I [swi4: clock] 100010 I [swi4: clock] 100009 I [swi4: clock] 100008 I [swi3: vm] 100007 I [swi1: netisr 0] 11 0 0 0 RL (threaded) [idle] 100006 CanRun [idle: cpu0] 100005 CanRun [idle: cpu1] 100004 Run CPU 2 [idle: cpu2] 100003 Run CPU 3 [idle: cpu3] 1 0 1 0 SLs wait 0xc457ad48 [init] 10 0 0 0 SL audit_wo 0xc0f722c0 [audit] 0 0 0 0 SLs (threaded) [kernel] 100029 D - 0xc4782340 [em0 taskq] 100027 D - 0xc4749100 [acpi_task_2] 100026 D - 0xc4749100 [acpi_task_1] 100025 D - 0xc4749100 [acpi_task_0] 100020 D - 0xc4749380 [thread taskq] 100018 D - 0xc4749600 [kqueue taskq] 100016 D - 0xc4561dc0 [firmware taskq] 100000 D sched 0xc0df8440 [swapper] 4843 1083 1075 1001 Z+ mkdir db:0:ps> allt Tracing command swap pid 4857 tid 100142 td 0xc515d6c0 sched_switch(c515d6c0,0,207,18c,4bf4e740,...) at sched_switch+0x406 mi_switch(207,0,c0ca1979,d6,1,...) at mi_switch+0x200 ast(e6e6fd38) at ast+0x2b3 doreti_ast() at doreti_ast+0x17 Tracing command swap pid 4856 tid 100224 td 0xc5598b40 sched_switch(c5598b40,0,104,191,49027db4,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c5598b40,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c29731e8,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c29731e8,c5741d48,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c5741d48,e6f9ebb8,6,0,e6f9ec3c,...) at swap_pager_getpages+0x44d vm_fault(c4b0a2b8,28ada000,2,0,28ada000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c5110000,...) at trap_pfault+0x10d trap(e6f9ed38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4855 tid 100265 td 0xc672a900 sched_switch(c672a900,0,104,191,49d40f58,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c672a900,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c2982410,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c2982410,c8198220,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c8198220,e7035bb8,1,0,e7035c3c,...) at swap_pager_getpages+0x44d vm_fault(c4b0aae0,28a9d000,2,0,28a9d000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c6734d48,...) at trap_pfault+0x10d trap(e7035d38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4854 tid 100093 td 0xc4be4b40 sched_switch(c4be4b40,0,104,191,46abd380,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c4be4b40,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c2939a80,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c2939a80,c7251dd0,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c7251dd0,e6da9bb8,8,0,e6da9c3c,...) at swap_pager_getpages+0x44d vm_fault(c56460e8,289f8000,2,0,289f8000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c4be2aa0,...) at trap_pfault+0x10d trap(e6da9d38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4853 tid 100223 td 0xc5598d80 cpustop_handler(2,e6f9bd2c,c0bcb0b6,e6f9bcbc,c087d094,...) at cpustop_handler+0x32 ipi_nmi_handler(e6f9bcbc,c087d094,c0dfa7bc,4,c51102a8,...) at ipi_nmi_handler+0x2f trap(e6f9bd38) at trap+0x36 calltrap() at calltrap+0x6 --- trap 0x13, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4852 tid 100252 td 0xc47f9480 sched_switch(c47f9480,0,104,191,47634290,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c47f9480,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c21abc80,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c21abc80,c9649ee0,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c9649ee0,e7001bb8,8,0,e7001c3c,...) at swap_pager_getpages+0x44d vm_fault(c5ba3488,289f0000,2,0,289f0000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c6736d48,...) at trap_pfault+0x10d trap(e7001d38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4851 tid 100120 td 0xc4dc2900 sched_switch(c4dc2900,0,104,191,487f3fd0,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c4dc2900,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c24d8db8,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c24d8db8,c5a7f6e8,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c5a7f6e8,e6e28bb8,8,0,e6e28c3c,...) at swap_pager_getpages+0x44d vm_fault(c4ad1828,28b08000,2,0,28b08000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c4e072a8,...) at trap_pfault+0x10d trap(e6e28d38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4850 tid 100377 td 0xc7396900 sched_switch(c7396900,0,104,191,492d42f0,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c7396900,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c1ad0410,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c1ad0410,c8804110,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c8804110,e71cabb8,8,0,e71cac3c,...) at swap_pager_getpages+0x44d vm_fault(c5646570,28b08000,2,0,28b08000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c533ed48,...) at trap_pfault+0x10d trap(e71cad38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4849 tid 100202 td 0xc4e10480 sched_switch(c4e10480,0,104,191,46302a1c,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c4e10480,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c1e62d80,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c1e62d80,c81984c8,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c81984c8,e6f56bb8,8,0,e6f56c3c,...) at swap_pager_getpages+0x44d vm_fault(c56462b8,28a00000,2,0,28a00000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c5620d48,...) at trap_pfault+0x10d trap(e6f56d38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4848 tid 100264 td 0xc672ab40 sched_switch(c672ab40,0,104,191,45f38548,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,40,...) at mi_switch+0x200 sleepq_switch(c672ab40,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c1b9a088,40,c0cc73e0,0,0,...) at sleepq_timedwait+0x6b _sleep(c1b9a088,c5f8db28,40,c0cc73e0,4e20,...) at _sleep+0x339 swap_pager_getpages(c5f8db28,e7032bb8,8,0,e7032c3c,...) at swap_pager_getpages+0x44d vm_fault(c5643d98,28a78000,2,0,28a78000,...) at vm_fault+0xf00 trap_pfault(5,0,c0cd8944,2f0,c6735000,...) at trap_pfault+0x10d trap(e7032d38) at trap+0x2d0 calltrap() at calltrap+0x6 --- trap 0xc, eip = 0x8049518, esp = 0xbfbfe830, ebp = 0xbfbfe868 --- Tracing command swap pid 4847 tid 100315 td 0xc6728d80 sched_switch(c6728d80,0,104,191,718ddd4a,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c6728d80,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(c0ca113a,160,0,100,100,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c5ba0d48,5c,c0ca395c,100,0,...) at sleepq_wait_sig+0x17 _sleep(c5ba0d48,c5ba0dd0,15c,c0ca395c,0,...) at _sleep+0x354 kern_wait(c6728d80,12f0,e70e0c74,0,0,...) at kern_wait+0xb76 wait4(c6728d80,e70e0cf8,10,c6728d80,c0d881c4,...) at wait4+0x3b syscall(e70e0d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (7, FreeBSD ELF32, wait4), eip = 0x2810107b, esp = 0xbfbfe86c, ebp = 0xbfbfe888 --- Tracing command swap pid 4842 tid 100327 td 0xc5baa480 sched_switch(c5baa480,0,104,191,df327418,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c5baa480,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(3e9,c08c67e0,c5baa480,0,100,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c0dfa744,5c,c0c9de91,100,0,...) at sleepq_timedwait_sig+0x1a _sleep(c0dfa744,0,15c,c0c9de91,3e9,...) at _sleep+0x31e kern_nanosleep(c5baa480,e7108c64,e7108c6c,1,0,...) at kern_nanosleep+0xc1 nanosleep(c5baa480,e7108cf8,8,c0ca3a99,c0d89b40,...) at nanosleep+0x6f syscall(e7108d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (240, FreeBSD ELF32, nanosleep), eip = 0x2816ead7, esp = 0xbfbfe88c, ebp = 0xbfbfe8b8 --- Tracing command top pid 1152 tid 100172 td 0xc55a2000 sched_switch(c55a2000,0,104,191,b1ef420,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c55a2000,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(e6ee1a4c,c087d44a,c5981550,0,c55a2000,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c5981564,0,e6ee1a7c,101,0,...) at sleepq_timedwait_sig+0x1a _cv_timedwait_sig(c5981564,c5981550,3e9,603,c4b26ce8,...) at _cv_timedwait_sig+0x250 seltdwait(e6ee1c28,e6ee1c30,c4afce80,c55a2000,204b3635,...) at seltdwait+0x8a kern_select(c55a2000,2,bfbfebec,0,0,e6ee1c70,20,1,0) at kern_select+0x4f4 select(c55a2000,e6ee1cf8,14,c0c84e80,c0d88b2c,...) at select+0x66 syscall(e6ee1d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (93, FreeBSD ELF32, select), eip = 0x281e8033, esp = 0xbfbfeb8c, ebp = 0xbfbfece8 --- Tracing command sshd pid 1145 tid 100168 td 0xc517f6c0 sched_switch(c517f6c0,0,104,191,b3ee2b4,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c517f6c0,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(c087d44a,c5296650,0,c0c9b36c,c517f6c0,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c5296664,0,e6ed3a7c,101,0,...) at sleepq_wait_sig+0x17 _cv_wait_sig(c5296664,c5296650,c0ca33c2,603,c4b264d0,...) at _cv_wait_sig+0x240 seltdwait(c4b264d0,58,c5040780,c517f6c0,200246,...) at seltdwait+0xa2 kern_select(c517f6c0,a,286030b8,286030dc,0,0,20,0,28100c70) at kern_select+0x4f4 select(c517f6c0,e6ed3cf8,14,c0c84e80,c0d88b2c,...) at select+0x66 syscall(e6ed3d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (93, FreeBSD ELF32, select), eip = 0x283ce033, esp = 0xbfbfde5c, ebp = 0xbfbfdea8 --- Tracing command tail pid 1144 tid 100087 td 0xc4c42d80 sched_switch(c4c42d80,0,104,191,9be5ec0,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,58,...) at mi_switch+0x200 sleepq_switch(c4c42d80,0,c0ca113a,1a0,58,...) at sleepq_switch+0x15f sleepq_catch_signals(3e9,c08c67e0,c4c42d80,2,100,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c5765d00,58,c0c97d25,100,0,...) at sleepq_timedwait_sig+0x1a _sleep(c5765d00,c5765d00,158,c0c97d25,3e9,...) at _sleep+0x31e kern_kevent(c4c42d80,4,0,1,e6d7fc58,...) at kern_kevent+0x364 kevent(c4c42d80,e6d7fcf8,18,c0c3c109,c0d8a8b4,...) at kevent+0x19b syscall(e6d7fd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (363, FreeBSD ELF32, kevent), eip = 0x2815fcbb, esp = 0xbfbfeb5c, ebp = 0xbfbfec18 --- Tracing command sshd pid 1143 tid 100130 td 0xc4e11b40 Tracing command sshd pid 1139 tid 100164 td 0xc517fd80 Tracing command sshd pid 1137 tid 100162 td 0xc5180240 Tracing command run pid 1083 tid 100126 td 0xc4c43b40 sched_switch(c4c43b40,0,104,191,726f39f6,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c4c43b40,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(c0ca113a,160,0,100,100,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c4e062a8,5c,c0ca395c,100,0,...) at sleepq_wait_sig+0x17 _sleep(c4e062a8,c4e06330,15c,c0ca395c,0,...) at _sleep+0x354 kern_wait(c4c43b40,12ea,e6e3fc74,0,0,...) at kern_wait+0xb76 wait4(c4c43b40,e6e3fcf8,10,c4c43b40,c0d881c4,...) at wait4+0x3b syscall(e6e3fd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (7, FreeBSD ELF32, wait4), eip = 0x2810007b, esp = 0xbfbfe38c, ebp = 0xbfbfe3a8 --- Tracing command run pid 1082 tid 100111 td 0xc4dc3d80 sched_switch(c4dc3d80,0,104,191,687a5996,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c4dc3d80,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(c0ca113a,160,0,100,100,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c4db9d48,5c,c0ca395c,100,0,...) at sleepq_wait_sig+0x17 _sleep(c4db9d48,c4db9dd0,15c,c0ca395c,0,...) at _sleep+0x354 kern_wait(c4dc3d80,43b,e6e0dc74,0,0,...) at kern_wait+0xb76 wait4(c4dc3d80,e6e0dcf8,10,c4dc3d80,c0d881c4,...) at wait4+0x3b syscall(e6e0dd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (7, FreeBSD ELF32, wait4), eip = 0x2810007b, esp = 0xbfbfe83c, ebp = 0xbfbfe858 --- Tracing command run pid 1081 tid 100125 td 0xc4c43d80 sched_switch(c4c43d80,0,104,191,e5827d68,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c4c43d80,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(3e9,c08c67e0,c4c43d80,0,100,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c0dfa744,5c,c0c9de91,100,0,...) at sleepq_timedwait_sig+0x1a _sleep(c0dfa744,0,15c,c0c9de91,3e9,...) at _sleep+0x31e kern_nanosleep(c4c43d80,e6e3bc64,e6e3bc6c,1,0,...) at kern_nanosleep+0xc1 nanosleep(c4c43d80,e6e3bcf8,8,c0ca3a99,c0d89b40,...) at nanosleep+0x6f syscall(e6e3bd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (240, FreeBSD ELF32, nanosleep), eip = 0x2816dad7, esp = 0xbfbfe85c, ebp = 0xbfbfe888 --- Tracing command sh pid 1075 tid 100051 td 0xc4ad5d80 sched_switch(c4ad5d80,0,104,191,62497470,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c4ad5d80,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(c0ca113a,160,0,100,100,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c4ad47f8,5c,c0ca395c,100,0,...) at sleepq_wait_sig+0x17 _sleep(c4ad47f8,c4ad4880,15c,c0ca395c,0,...) at _sleep+0x354 kern_wait(c4ad5d80,ffffffff,e6ccdc74,2,0,...) at kern_wait+0xb76 wait4(c4ad5d80,e6ccdcf8,10,c0ca393b,c0d881c4,...) at wait4+0x3b syscall(e6ccdd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (7, FreeBSD ELF32, wait4), eip = 0x2815e07b, esp = 0xbfbfe82c, ebp = 0xbfbfe848 --- Tracing command bash pid 1059 tid 100132 td 0xc4e116c0 sched_switch(c4e116c0,0,104,191,59ab0ee4,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c4e116c0,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(c0ca113a,160,0,100,100,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c4e052a8,5c,c0ca395c,100,0,...) at sleepq_wait_sig+0x17 _sleep(c4e052a8,c4e05330,15c,c0ca395c,0,...) at _sleep+0x354 kern_wait(c4e116c0,ffffffff,e6e51c74,6,0,...) at kern_wait+0xb76 wait4(c4e116c0,e6e51cf8,10,c0ca384b,c0d881c4,...) at wait4+0x3b syscall(e6e51d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (7, FreeBSD ELF32, wait4), eip = 0x282be07b, esp = 0xbfbfe9cc, ebp = 0xbfbfe9e8 --- Tracing command sshd pid 1058 tid 100063 td 0xc4b51900 Tracing command sshd pid 1056 tid 100133 td 0xc4e11480 Tracing command getty pid 1055 tid 100099 td 0xc4be4000 Tracing command getty pid 1054 tid 100098 td 0xc4be4240 Tracing command getty pid 1053 tid 100081 td 0xc4c436c0 Tracing command getty pid 1052 tid 100062 td 0xc4b51b40 Tracing command getty pid 1051 tid 100113 td 0xc4dc3900 Tracing command getty pid 1050 tid 100112 td 0xc4dc3b40 Tracing command getty pid 1049 tid 100095 td 0xc4be46c0 Tracing command getty pid 1048 tid 100090 td 0xc4c426c0 Tracing command inetd pid 1026 tid 100106 td 0xc4c42000 Tracing command moused pid 1002 tid 100058 td 0xc4b08000 Tracing command watchdogd pid 984 tid 100122 td 0xc4dc2480 sched_switch(c4dc2480,0,104,191,b5351f2c,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c4dc2480,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(ea61,c08c67e0,c4dc2480,0,100,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c0dfa744,5c,c0c9de91,100,0,...) at sleepq_timedwait_sig+0x1a _sleep(c0dfa744,0,15c,c0c9de91,ea61,...) at _sleep+0x31e kern_nanosleep(c4dc2480,e6e2ec64,e6e2ec6c,3c,0,...) at kern_nanosleep+0xc1 nanosleep(c4dc2480,e6e2ecf8,8,c0cb4dc0,c0d89b40,...) at nanosleep+0x6f syscall(e6e2ed38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (240, FreeBSD ELF32, nanosleep), eip = 0x28185ad7, esp = 0xbfbfecec, ebp = 0xbfbfed18 --- Tracing command cron pid 962 tid 100108 td 0xc4be8b40 sched_switch(c4be8b40,0,104,191,f4507942,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c4be8b40,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(e679,c08c67e0,c4be8b40,2,100,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c0dfa744,5c,c0c9de91,100,0,...) at sleepq_timedwait_sig+0x1a _sleep(c0dfa744,0,15c,c0c9de91,e679,...) at _sleep+0x31e kern_nanosleep(c4be8b40,e6e02c64,e6e02c6c,3b,0,...) at kern_nanosleep+0xc1 nanosleep(c4be8b40,e6e02cf8,8,c0ca3a99,c0d89b40,...) at nanosleep+0x6f syscall(e6e02d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (240, FreeBSD ELF32, nanosleep), eip = 0x28178ad7, esp = 0xbfbfec8c, ebp = 0xbfbfecb8 --- Tracing command sendmail pid 956 tid 100109 td 0xc4be8900 sched_switch(c4be8900,0,104,191,ac6c383c,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,68,...) at mi_switch+0x200 sleepq_switch(c4be8900,0,c0ca113a,1a0,68,...) at sleepq_switch+0x15f sleepq_catch_signals(c0ca113a,160,0,100,100,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c4dba300,68,c0c509c3,100,0,...) at sleepq_wait_sig+0x17 _sleep(c4dba300,c4dba330,168,c0c509c3,0,...) at _sleep+0x354 kern_sigsuspend(c4be8900,0,0,0,0,...) at kern_sigsuspend+0xae sigsuspend(c4be8900,e6e07cf8,4,c0ca384b,c0d8a64c,...) at sigsuspend+0x4d syscall(e6e07d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (4, FreeBSD ELF32, write), eip = 0x28332efb, esp = 0xbfbfcf8c, ebp = 0xbfbfcfb8 --- Tracing command sendmail pid 952 tid 100064 td 0xc4ad5b40 sched_switch(c4ad5b40,0,104,191,ed03c57e,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4ad5b40,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(e6d01a4c,c087d44a,c4a9a110,0,c4ad5b40,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c4a9a124,0,e6d01a7c,101,0,...) at sleepq_timedwait_sig+0x1a _cv_timedwait_sig(c4a9a124,c4a9a110,1389,603,c4cccb28,...) at _cv_timedwait_sig+0x250 seltdwait(e6d01c28,e6d01c30,c4a6b200,c4ad5b40,c188b014,...) at seltdwait+0x8a kern_select(c4ad5b40,5,bfbfc510,0,0,e6d01c70,20,5,0) at kern_select+0x4f4 select(c4ad5b40,e6d01cf8,14,c0ca3d6f,c0d88b2c,...) at select+0x66 syscall(e6d01d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (93, FreeBSD ELF32, select), eip = 0x283d7033, esp = 0xbfbfc47c, ebp = 0xbfbfcfa8 --- Tracing command sshd pid 945 tid 100096 td 0xc4be4480 Tracing command ntpd pid 908 tid 100115 td 0xc4dc3480 sched_switch(c4dc3480,0,104,191,42cadf50,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4dc3480,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(c087d44a,c4836550,0,c0c9b36c,c4dc3480,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c4836564,0,e6e19a7c,101,0,...) at sleepq_wait_sig+0x17 _cv_wait_sig(c4836564,c4836550,c0ca33c2,603,c4c5ba10,...) at _cv_wait_sig+0x240 seltdwait(c4c5ba10,58,c4578100,c4dc3480,0,...) at seltdwait+0xa2 kern_select(c4dc3480,1c,bfbfed28,0,0,0,20,e6e19c98,246) at kern_select+0x4f4 select(c4dc3480,e6e19cf8,14,c4dc3480,c0d88b2c,...) at select+0x66 syscall(e6e19d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (93, FreeBSD ELF32, select), eip = 0x28353033, esp = 0xbfbfecfc, ebp = 0xbfbfedc8 --- Tracing command nfsd pid 809 tid 100104 td 0xc4b53480 sched_switch(c4b53480,0,104,191,9c10a6ea,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4b53480,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(e6df2bf8,c087d44a,c48ff180,0,c4b53480,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c4da4810,0,e6df2c28,101,0,...) at sleepq_timedwait_sig+0x1a _cv_timedwait_sig(c4da4810,c48ff180,1388,3af,5a5a5a5a,...) at _cv_timedwait_sig+0x250 svc_run_internal(e6df2d24,c0863018,c48ff180,e6df2d38,c0c980db,...) at svc_run_internal+0x356 svc_thread_start(c48ff180,e6df2d38,c0c980db,343,c4c3f7f8,...) at svc_thread_start+0x10 fork_exit(c0a82c40,c48ff180,e6df2d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0x2e, eip = 0xc, esp = 0x33, ebp = 0 --- Tracing command nfsd pid 809 tid 100103 td 0xc4b536c0 sched_switch(c4b536c0,0,104,191,9c2d4cb6,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4b536c0,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(e6defbf8,c087d44a,c48ff180,0,c4b536c0,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c4da4850,0,e6defc28,101,0,...) at sleepq_timedwait_sig+0x1a _cv_timedwait_sig(c4da4850,c48ff180,1388,3af,5a5a5a5a,...) at _cv_timedwait_sig+0x250 svc_run_internal(e6defd24,c0863018,c48ff180,e6defd38,c0c980db,...) at svc_run_internal+0x356 svc_thread_start(c48ff180,e6defd38,c0c980db,343,c4c3f7f8,...) at svc_thread_start+0x10 fork_exit(c0a82c40,c48ff180,e6defd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0x2e, eip = 0xc, esp = 0x33, ebp = 0 --- Tracing command nfsd pid 809 tid 100102 td 0xc4b53900 sched_switch(c4b53900,0,104,191,911383b0,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4b53900,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(e6decbf8,c087d44a,c48ff180,0,c4b53900,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c4da4890,0,e6decc28,101,0,...) at sleepq_timedwait_sig+0x1a _cv_timedwait_sig(c4da4890,c48ff180,1388,3af,5a5a5a5a,...) at _cv_timedwait_sig+0x250 svc_run_internal(e6decd24,c0863018,c48ff180,e6decd38,c0c980db,...) at svc_run_internal+0x356 svc_thread_start(c48ff180,e6decd38,c0c980db,343,c4c3f7f8,...) at svc_thread_start+0x10 fork_exit(c0a82c40,c48ff180,e6decd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0x2e, eip = 0xc, esp = 0x33, ebp = 0 --- Tracing command nfsd pid 809 tid 100077 td 0xc481b6c0 sched_switch(c481b6c0,0,104,191,9d76d8e6,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c481b6c0,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(e6d5dae8,c087d44a,c48ff180,0,c481b6c0,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c4a6ec10,0,e6d5db18,101,0,...) at sleepq_timedwait_sig+0x1a _cv_timedwait_sig(c4a6ec10,c48ff180,1388,3af,e6d5db60,...) at _cv_timedwait_sig+0x250 svc_run_internal(c481b824,14,c0cc042d,c0cbeee1,e6d5dc3c,...) at svc_run_internal+0x356 svc_run(c48ff180,0,c0cbf1e5,1fd,0,...) at svc_run+0x7f nfssvc_nfsd(bfbfe8b0,e6d5dc3c,c,c4578100,e6d5dc50,...) at nfssvc_nfsd+0xad nfssvc_nfsserver(c481b6c0,e6d5dcf8,bfbfe8b0,c481b6c0,c4c3f7f8,...) at nfssvc_nfsserver+0x24f nfssvc(c481b6c0,e6d5dcf8,8,c0ca3d6f,c0d891f4,...) at nfssvc+0x83 syscall(e6d5dd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (155, FreeBSD ELF32, nfssvc), eip = 0x280daadb, esp = 0xbfbfe86c, ebp = 0xbfbfead8 --- Tracing command nfsd pid 808 tid 100089 td 0xc4c42900 Tracing command mountd pid 806 tid 100091 td 0xc4c42480 Tracing command rpcbind pid 711 tid 100069 td 0xc4ad5480 sched_switch(c4ad5480,0,104,191,7e5b2b28,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4ad5480,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(e6d1caa8,c087d44a,c48350d0,0,c4ad5480,...) at sleepq_catch_signals+0xb7 sleepq_timedwait_sig(c48350e4,0,e6d1cad8,101,0,...) at sleepq_timedwait_sig+0x1a _cv_timedwait_sig(c48350e4,c48350d0,7531,603,e6d1cb8c,...) at _cv_timedwait_sig+0x250 seltdwait(e6d1cc5c,e6d1cc64,511,c4ad5480,e6d1cb5c,...) at seltdwait+0x8a poll(c4ad5480,e6d1ccf8,c,c4ad5480,c0d897dc,...) at poll+0x300 syscall(e6d1cd38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (209, FreeBSD ELF32, poll), eip = 0x2813f01f, esp = 0xbfbfcc0c, ebp = 0xbfbfedd8 --- Tracing command syslogd pid 691 tid 100084 td 0xc4c43240 sched_switch(c4c43240,0,104,191,347e637a,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4c43240,0,c0ca113a,1a0,0,...) at sleepq_switch+0x15f sleepq_catch_signals(c087d44a,c4a9a7d0,0,c0c9b36c,c4c43240,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c4a9a7e4,0,e6d75a7c,101,0,...) at sleepq_wait_sig+0x17 _cv_wait_sig(c4a9a7e4,c4a9a7d0,c0ca33c2,603,c4ccc038,...) at _cv_wait_sig+0x240 seltdwait(c4ccc038,58,c4578100,c4c43240,1dd,...) at seltdwait+0xa2 kern_select(c4c43240,9,282290ac,0,0,0,20,0,281a5498) at kern_select+0x4f4 select(c4c43240,e6d75cf8,14,c0ca384b,c0d88b2c,...) at select+0x66 syscall(e6d75d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (93, FreeBSD ELF32, select), eip = 0x28191033, esp = 0xbfbfe2ac, ebp = 0xbfbfee18 --- Tracing command devd pid 561 tid 100066 td 0xc4b516c0 Tracing command flowcleaner pid 19 tid 100050 td 0xc47676c0 sched_switch(c47676c0,0,104,191,d14b656a,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c47676c0,0,c0ca113a,283,c47676c0,...) at sleepq_switch+0x15f sleepq_timedwait(c0f67288,0,e4addcc4,1,0,...) at sleepq_timedwait+0x6b _cv_timedwait(c0f67288,c0f67290,2710,3f0,0,...) at _cv_timedwait+0x250 flowtable_cleaner(0,e4addd38,c0c980db,343,c4a51550,...) at flowtable_cleaner+0x1bf fork_exit(c0936040,0,e4addd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4addd70, ebp = 0 --- Tracing command softdepflush pid 18 tid 100049 td 0xc4767900 sched_switch(c4767900,0,104,191,f8d7bb14,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,44,...) at mi_switch+0x200 sleepq_switch(c4767900,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c0f72a80,44,c0cc5263,0,0,...) at sleepq_timedwait+0x6b _sleep(c0f72a80,c0f72a2c,44,c0cc5263,3e8,...) at _sleep+0x339 softdep_flush(0,e4adad38,c0c980db,343,c4a517f8,...) at softdep_flush+0x244 fork_exit(c0acaac0,0,e4adad38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4adad70, ebp = 0 --- Tracing command vnlru pid 17 tid 100048 td 0xc4767b40 sched_switch(c4767b40,0,104,191,140fc2a4,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,50,...) at mi_switch+0x200 sleepq_switch(c4767b40,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c4a51aa0,50,c0cab17b,0,0,...) at sleepq_timedwait+0x6b _sleep(c4a51aa0,c0f67054,250,c0cab17b,3e8,...) at _sleep+0x339 vnlru_proc(0,e4ad7d38,c0c980db,343,c4a51aa0,...) at vnlru_proc+0xe7 fork_exit(c09218e0,0,e4ad7d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4ad7d70, ebp = 0 --- Tracing command syncer pid 16 tid 100047 td 0xc4767d80 sched_switch(c4767d80,0,104,191,45e0e7c4,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,4c,...) at mi_switch+0x200 sleepq_switch(c4767d80,0,c0ca113a,260,0,...) at sleepq_switch+0x15f sleepq_wait(d872d180,4c,c0ca80ad,0,0,...) at sleepq_wait+0x63 _sleep(d872d180,c454e704,4c,c0ca80ad,0,...) at _sleep+0x36b bwait(d872d180,4c,c0ca80ad,d872d180,e4ad4904,...) at bwait+0x6f bufwait(d872d180,d872d180,df,d872d180,d872d180,...) at bufwait+0x48 bufwrite(d872d180,0,c0cc5693,726,c4aef800) at bufwrite+0x165 ffs_bufwrite(d872d180,0,c0cc31e4,8d5,c08d0d46,...) at ffs_bufwrite+0x290 softdep_process_journal(19d5,0,d85ac7a0,d85ac7a0,c4b6315c,...) at softdep_process_journal+0x6a8 jwait(c0f72a2c,0,c0cc31e4,19d5,ffffffff,...) at jwait+0x20 softdep_disk_io_initiation(d85ac7a0,375,d85ac7a0,4,0,...) at softdep_disk_io_initiation+0x1758 ffs_geom_strategy(c4b63254,d85ac7a0,375,d85ac7a0,d85ac7a0,...) at ffs_geom_strategy+0x13f bufwrite(d85ac7a0,0,c0cc5693,726,268) at bufwrite+0x159 ffs_bufwrite(d85ac7a0,c4767d80,e4ad4bc0,246,c4539d08,...) at ffs_bufwrite+0x290 vfs_bio_awrite(d85ac7a0,0,c0ca9501,268,0,...) at vfs_bio_awrite+0x318 vop_stdfsync(e4ad4c7c,e4ad4c34,0,c0d845a0,e4ad4c7c,...) at vop_stdfsync+0x13b devfs_fsync(e4ad4c7c,c0cdaf32,c4b63220,e4ad4c7c,c4b6315c,...) at devfs_fsync+0x7c VOP_FSYNC_APV(c0d845a0,e4ad4c7c,c0caa1ab,69d,c4767d80,...) at VOP_FSYNC_APV+0xc5 sync_vnode(c0f67094,c0f67080,3e8,6cc,4e20,...) at sync_vnode+0x16b sched_sync(0,e4ad4d38,c0c980db,343,c4a51d48,...) at sched_sync+0x273 fork_exit(c0920d10,0,e4ad4d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4ad4d70, ebp = 0 --- Tracing command bufdaemon pid 15 tid 100046 td 0xc481a000 sched_switch(c481a000,0,104,191,46e11dcc,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,44,...) at mi_switch+0x200 sleepq_switch(c481a000,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c0f66dc8,44,c0ca86e9,0,0,...) at sleepq_timedwait+0x6b _sleep(c0f66dc8,c0f66dcc,44,c0ca86e9,3e8,...) at _sleep+0x339 buf_daemon(0,e4ad1d38,c0c980db,343,c457b2a8,...) at buf_daemon+0x138 fork_exit(c0908f20,0,e4ad1d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4ad1d70, ebp = 0 --- Tracing command pagezero pid 9 tid 100045 td 0xc481a240 sched_switch(c481a240,0,104,191,f6e9201c,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c481a240,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c0f73894,0,c0ccadd6,0,0,...) at sleepq_timedwait+0x6b _sleep(c0f73894,c0f73380,0,c0ccadd6,493e0,...) at _sleep+0x339 vm_pagezero(0,e4aced38,c0c980db,343,c457b550,...) at vm_pagezero+0xdc fork_exit(c0b091a0,0,e4aced38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4aced70, ebp = 0 --- Tracing command vmdaemon pid 8 tid 100044 td 0xc481a480 sched_switch(c481a480,0,104,191,97159da,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,68,...) at mi_switch+0x200 sleepq_switch(c481a480,0,c0ca113a,260,0,...) at sleepq_switch+0x15f sleepq_wait(c0f734c4,68,c0ca86e9,0,0,...) at sleepq_wait+0x63 _sleep(c0f734c4,c0f734c8,68,c0ca86e9,0,...) at _sleep+0x36b vm_daemon(0,e4acbd38,c0c980db,343,c457b7f8,...) at vm_daemon+0x59 fork_exit(c0b03640,0,e4acbd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4acbd70, ebp = 0 --- Tracing command pagedaemon pid 7 tid 100043 td 0xc481a6c0 sched_switch(c481a6c0,0,104,191,d79f855a,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,44,...) at mi_switch+0x200 sleepq_switch(c481a6c0,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c0f7348c,44,c0ca86e9,0,0,...) at sleepq_timedwait+0x6b _sleep(c0f7348c,c0f73380,44,c0ca86e9,1388,...) at _sleep+0x339 vm_pageout(0,e4ac8d38,c0c980db,343,c457baa0,...) at vm_pageout+0x2bb fork_exit(c0b044e0,0,e4ac8d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4ac8d70, ebp = 0 --- Tracing command fdc0 pid 6 tid 100041 td 0xc481ab40 sched_switch(c481ab40,0,104,191,463b57ec,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,4c,...) at mi_switch+0x200 sleepq_switch(c481ab40,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c468e83c,4c,c0c92561,0,0,...) at sleepq_timedwait+0x6b _sleep(c468e83c,c468e8f0,4c,c0c92561,3e8,...) at _sleep+0x339 fdc_thread(c468e800,e4ac2d38,c0c980db,343,c457bd48,...) at fdc_thread+0x27d fork_exit(c0b86720,c468e800,e4ac2d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4ac2d70, ebp = 0 --- Tracing command usb pid 14 tid 100034 td 0xc47656c0 sched_switch(c47656c0,0,104,191,7bc3c508,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c47656c0,0,c0ca113a,260,c47656c0,...) at sleepq_switch+0x15f sleepq_wait(c479edac,0,c4362cbc,1,0,...) at sleepq_wait+0x63 _cv_wait(c479edac,c479ee4c,c0c8b5d3,6c,c479edb4,...) at _cv_wait+0x240 usb_process(c479eda4,c4362d38,c0c980db,343,c474b000,...) at usb_process+0x193 fork_exit(c07bf6e0,c479eda4,c4362d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4362d70, ebp = 0 --- Tracing command usb pid 14 tid 100033 td 0xc4765900 sched_switch(c4765900,0,104,191,9a6d7088,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4765900,0,c0ca113a,260,c4765900,...) at sleepq_switch+0x15f sleepq_wait(c479ed7c,0,c435fcbc,1,0,...) at sleepq_wait+0x63 _cv_wait(c479ed7c,c479ee4c,c0c8b5d3,6c,c479ed84,...) at _cv_wait+0x240 usb_process(c479ed74,c435fd38,c0c980db,343,c474b000,...) at usb_process+0x193 fork_exit(c07bf6e0,c479ed74,c435fd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc435fd70, ebp = 0 --- Tracing command usb pid 14 tid 100032 td 0xc4765b40 sched_switch(c4765b40,0,104,191,7b4c6eac,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4765b40,0,c0ca113a,260,c4765b40,...) at sleepq_switch+0x15f sleepq_wait(c479ed4c,0,c435ccbc,1,0,...) at sleepq_wait+0x63 _cv_wait(c479ed4c,c479ee4c,c0c8b5d3,6c,c479ed54,...) at _cv_wait+0x240 usb_process(c479ed44,c435cd38,c0c980db,343,c474b000,...) at usb_process+0x193 fork_exit(c07bf6e0,c479ed44,c435cd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc435cd70, ebp = 0 --- Tracing command usb pid 14 tid 100031 td 0xc4765d80 sched_switch(c4765d80,0,104,191,7b4c3e6c,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4765d80,0,c0ca113a,260,c4765d80,...) at sleepq_switch+0x15f sleepq_wait(c479ed1c,0,c4359cbc,1,0,...) at sleepq_wait+0x63 _cv_wait(c479ed1c,c479ee4c,c0c8b5d3,6c,c479ed24,...) at _cv_wait+0x240 usb_process(c479ed14,c4359d38,c0c980db,343,c474b000,...) at usb_process+0x193 fork_exit(c07bf6e0,c479ed14,c4359d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4359d70, ebp = 0 --- Tracing command xpt_thrd pid 5 tid 100023 td 0xc4756000 sched_switch(c4756000,0,104,191,7b4c028c,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,4c,...) at mi_switch+0x200 sleepq_switch(c4756000,0,c0ca113a,260,0,...) at sleepq_switch+0x15f sleepq_wait(c0dc6954,4c,c0c372ba,0,0,...) at sleepq_wait+0x63 _sleep(c0dc6954,c0dc696c,4c,c0c372ba,0,...) at _sleep+0x36b xpt_scanner_thread(0,c431ad38,c0c980db,343,c474b2a8,...) at xpt_scanner_thread+0x4a fork_exit(c0484a60,0,c431ad38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc431ad70, ebp = 0 --- Tracing command yarrow pid 13 tid 100017 td 0xc457e240 sched_switch(c457e240,0,104,191,47e439a8,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c457e240,0,c0ca113a,283,2,...) at sleepq_switch+0x15f sleepq_timedwait(c0dfa5a4,0,c0c92561,2,0,...) at sleepq_timedwait+0x6b _sleep(c0dfa5a4,0,0,c0c92561,64,...) at _sleep+0x339 pause(c0c92561,64,c0c7f55e,111,0,...) at pause+0x47 random_kthread(0,c4308d38,c0c980db,343,c474b550,...) at random_kthread+0x1ef fork_exit(c0738120,0,c4308d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4308d70, ebp = 0 --- Tracing command g_down pid 4 tid 100015 td 0xc457e6c0 sched_switch(c457e6c0,0,104,191,49d99af4,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,4c,...) at mi_switch+0x200 sleepq_switch(c457e6c0,0,c0ca113a,260,0,...) at sleepq_switch+0x15f sleepq_wait(c0df8364,4c,c0c92561,0,0,...) at sleepq_wait+0x63 _sleep(c0df8364,c0df82c8,24c,c0c92561,0,...) at _sleep+0x36b g_io_schedule_down(c457e6c0,0,c0c93c29,74,0,...) at g_io_schedule_down+0x56 g_down_procbody(0,c4302d38,c0c980db,343,c457a000,...) at g_down_procbody+0x8d fork_exit(c082a0e0,0,c4302d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4302d70, ebp = 0 --- Tracing command g_up pid 3 tid 100014 td 0xc457e900 kdb_enter(c0c9cbe4,c0c9cbe4,c0cc4152,c42ffb4c,0,...) at kdb_enter+0x3a panic(c0cc4152,0,0,c6c7b4d0,0,...) at panic+0x136 handle_allocdirect_partdone(8c26,0,16f88,0,c4538a58,...) at handle_allocdirect_partdone+0xce softdep_disk_write_complete(d875b1a0,0,c0cc5693,6d5,c4b63254,...) at softdep_disk_write_complete+0xd8b ffs_backgroundwritedone(d875b1a0,c1879000,d875b1a0,c4800558,c42ffc98,...) at ffs_backgroundwritedone+0xa3 bufdone(d875b1a0,c454e9c0) at bufdone+0x53 g_vfs_done(c4800558,0,c0ca7da5,c16,c4800558,...) at g_vfs_done+0x85 biodone(c4800558,c0df82e8,24c,c0c92561,0,...) at biodone+0xa5 g_io_schedule_up(c457e900,0,c0c93c29,5d,0,...) at g_io_schedule_up+0xc7 g_up_procbody(0,c42ffd38,c0c980db,343,c457a2a8,...) at g_up_procbody+0x8d fork_exit(c082a170,0,c42ffd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42ffd70, ebp = 0 --- Tracing command g_event pid 2 tid 100013 td 0xc457eb40 sched_switch(c457eb40,0,104,191,45f9fd18,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,4c,...) at mi_switch+0x200 sleepq_switch(c457eb40,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c0df8358,4c,c0c92561,0,0,...) at sleepq_timedwait+0x6b _sleep(c0df8358,0,4c,c0c92561,64,...) at _sleep+0x339 g_event_procbody(0,c42fcd38,c0c980db,343,c457a550,...) at g_event_procbody+0xcb fork_exit(c082a200,0,c42fcd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42fcd70, ebp = 0 --- Tracing command intr pid 12 tid 100042 td 0xc481a900 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100040 td 0xc481ad80 sched_switch(c481ad80,0,109,191,a84daa7c,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c47acef0,...) at mi_switch+0x200 ithread_loop(c4815b20,e4ab5d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c4815b20,e4ab5d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4ab5d70, ebp = 0 --- Tracing command intr pid 12 tid 100039 td 0xc481b000 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100038 td 0xc4756d80 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100037 td 0xc4765000 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100036 td 0xc4765240 sched_switch(c4765240,0,109,191,49e31c08,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c1370,...) at mi_switch+0x200 ithread_loop(c480d820,e4aa6d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c480d820,e4aa6d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4aa6d70, ebp = 0 --- Tracing command intr pid 12 tid 100035 td 0xc4765480 sched_switch(c4765480,0,109,191,1618b30c,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c11f0,...) at mi_switch+0x200 ithread_loop(c47afb10,e4aa0d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c47afb10,e4aa0d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xe4aa0d70, ebp = 0 --- Tracing command intr pid 12 tid 100030 td 0xc4767000 sched_switch(c4767000,0,109,191,7aa6c568,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c1270,...) at mi_switch+0x200 ithread_loop(c4784830,c4356d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c4784830,c4356d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4356d70, ebp = 0 --- Tracing command intr pid 12 tid 100028 td 0xc4767480 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100024 td 0xc45c3d80 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100022 td 0xc4756240 sched_switch(c4756240,0,109,191,ff34985a,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c4714b70,...) at mi_switch+0x200 ithread_loop(c4516940,c4317d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c4516940,c4317d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4317d70, ebp = 0 --- Tracing command intr pid 12 tid 100021 td 0xc4756480 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100019 td 0xc4756900 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100012 td 0xc457ed80 sched_switch(c457ed80,0,109,191,46df1004,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c1c70,...) at mi_switch+0x200 ithread_loop(c4579090,c42f9d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c4579090,c42f9d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42f9d70, ebp = 0 --- Tracing command intr pid 12 tid 100011 td 0xc45c3000 sched_switch(c45c3000,0,109,191,47deb260,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c1cf0,...) at mi_switch+0x200 ithread_loop(c45790a0,c42f6d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c45790a0,c42f6d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42f6d70, ebp = 0 --- Tracing command intr pid 12 tid 100010 td 0xc45c3240 sched_switch(c45c3240,0,109,191,f8d4be50,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c1d70,...) at mi_switch+0x200 ithread_loop(c45790b0,c42f3d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c45790b0,c42f3d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42f3d70, ebp = 0 --- Tracing command intr pid 12 tid 100009 td 0xc45c3480 sched_switch(c45c3480,0,109,191,49447504,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c1df0,...) at mi_switch+0x200 ithread_loop(c45790c0,c42f0d38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c45790c0,c42f0d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42f0d70, ebp = 0 --- Tracing command intr pid 12 tid 100008 td 0xc457c000 fork_trampoline() at fork_trampoline Tracing command intr pid 12 tid 100007 td 0xc457c240 sched_switch(c457c240,0,109,191,2afdfff2,...) at sched_switch+0x406 mi_switch(109,0,c0c9835a,52d,c45c1ef0,...) at mi_switch+0x200 ithread_loop(c45790e0,c42ead38,c0c980db,343,c457a7f8,...) at ithread_loop+0x1f6 fork_exit(c0866000,c45790e0,c42ead38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42ead70, ebp = 0 --- Tracing command idle pid 11 tid 100006 td 0xc457c480 sched_switch(c457c480,0,60c,18c,49aebf70,...) at sched_switch+0x406 mi_switch(60c,0,c0c9e76d,815,0,...) at mi_switch+0x200 sched_preempt(c457c480,0,1f4,c42e4c74,c0bad7fe,...) at sched_preempt+0x9f ipi_bitmap_handler(c0df0008,28,c0c90028,c4820218,c4820200,...) at ipi_bitmap_handler+0x34 Xipi_intr_bitmap_handler() at Xipi_intr_bitmap_handler+0x2e --- interrupt, eip = 0xc0ba18f5, esp = 0xc42e4c74, ebp = 0xc42e4c74 --- acpi_cpu_c1(1,1,0,c08af9c1,c0e0cde0,...) at acpi_cpu_c1+0x5 acpi_cpu_idle(c42e4cb4,c0bb904b,1,c42e4cf8,c08afcbe,...) at acpi_cpu_idle+0x11c cpu_idle_acpi(1,c42e4cf8,c08afcbe,1,c42e4cd4,...) at cpu_idle_acpi+0x1b cpu_idle(1,c42e4cd4,c0c9e76d,3b0,c457c480,...) at cpu_idle+0x1b sched_idletd(0,c42e4d38,c0c980db,343,c457aaa0,...) at sched_idletd+0x23e fork_exit(c08afa80,0,c42e4d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42e4d70, ebp = 0 --- Tracing command idle pid 11 tid 100005 td 0xc457c6c0 sched_switch(c457c6c0,0,108,18c,d7b6a0d4,...) at sched_switch+0x406 mi_switch(108,0,c0c9e76d,3ae,c457c6c0,...) at mi_switch+0x200 sched_idletd(0,c42e1d38,c0c980db,343,c457aaa0,...) at sched_idletd+0x19b fork_exit(c08afa80,0,c42e1d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42e1d70, ebp = 0 --- Tracing command idle pid 11 tid 100004 td 0xc457c900 cpustop_handler(4,c42dec28,c0bcb0b6,c42debb8,c087d094,...) at cpustop_handler+0x32 ipi_nmi_handler(c42debb8,c087d094,c0dfa7f8,4,c457aaa0,...) at ipi_nmi_handler+0x2f trap(c42dec34) at trap+0x36 calltrap() at calltrap+0x6 --- trap 0x13, eip = 0xc0ba18f5, esp = 0xc42dec74, ebp = 0xc42dec74 --- acpi_cpu_c1(1,2,2,c08af9c1,c0e0cde0,...) at acpi_cpu_c1+0x5 acpi_cpu_idle(c42decb4,c0bb904b,0,c42decf8,c08afcbe,...) at acpi_cpu_idle+0x11c cpu_idle_acpi(0,c42decf8,c08afcbe,0,c42decd4,...) at cpu_idle_acpi+0x1b cpu_idle(0,c42decd4,c0c9e76d,a09,c457c900,...) at cpu_idle+0x1b sched_idletd(0,c42ded38,c0c980db,343,c457aaa0,...) at sched_idletd+0x23e fork_exit(c08afa80,0,c42ded38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42ded70, ebp = 0 --- Tracing command idle pid 11 tid 100003 td 0xc457cb40 cpustop_handler(8,c42dbc54,c0bcb0b6,c457cb40,c42dbbd8,...) at cpustop_handler+0x32 ipi_nmi_handler(c457cb40,c42dbbd8,46,c0dfa834,c457aaa0,...) at ipi_nmi_handler+0x2f trap(c42dbc60) at trap+0x36 calltrap() at calltrap+0x6 --- trap 0x13, eip = 0xc087d08f, esp = 0xc42dbca0, ebp = 0xc42dbcb4 --- _mtx_unlock_spin_flags(c0dff8c0,0,c0c9e76d,a09,c457cb40,...) at _mtx_unlock_spin_flags+0xef sched_idletd(0,c42dbd38,c0c980db,343,c457aaa0,...) at sched_idletd+0x2aa fork_exit(c08afa80,0,c42dbd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42dbd70, ebp = 0 --- Tracing command init pid 1 tid 100002 td 0xc457cd80 sched_switch(c457cd80,0,104,191,4d2dcf64,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,5c,...) at mi_switch+0x200 sleepq_switch(c457cd80,0,c0ca113a,1a0,5c,...) at sleepq_switch+0x15f sleepq_catch_signals(c0ca113a,160,0,100,100,...) at sleepq_catch_signals+0xb7 sleepq_wait_sig(c457ad48,5c,c0ca395c,100,0,...) at sleepq_wait_sig+0x17 _sleep(c457ad48,c457add0,15c,c0ca395c,0,...) at _sleep+0x354 kern_wait(c457cd80,ffffffff,c42d7c74,0,0,...) at kern_wait+0xb76 wait4(c457cd80,c42d7cf8,10,c0ca378c,c0d881c4,...) at wait4+0x3b syscall(c42d7d38) at syscall+0x2b4 Xint0x80_syscall() at Xint0x80_syscall+0x20 --- syscall (7, FreeBSD ELF32, wait4), eip = 0x8054def, esp = 0xbfbfe90c, ebp = 0xbfbfe928 --- Tracing command audit pid 10 tid 100001 td 0xc457e000 sched_switch(c457e000,0,104,191,7b4400cc,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c457e000,0,c0ca113a,260,c457e000,...) at sleepq_switch+0x15f sleepq_wait(c0f722c0,0,c42d4c9c,1,0,...) at sleepq_wait+0x63 _cv_wait(c0f722c0,c0f722a4,c0cc1189,194,0,...) at _cv_wait+0x240 audit_worker(0,c42d4d38,c0c980db,343,c457b000,...) at audit_worker+0x84 fork_exit(c0a92d60,0,c42d4d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc42d4d70, ebp = 0 --- Tracing command kernel pid 0 tid 100029 td 0xc4767240 sched_switch(c4767240,0,104,191,7aa62da0,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4767240,0,c0ca113a,260,c4767240,...) at sleepq_switch+0x15f sleepq_wait(c4782340,0,c0c9d7d9,c0c92561,0,...) at sleepq_wait+0x63 msleep_spin(c4782340,c4782358,c0c92561,0,c0c9b36c,...) at msleep_spin+0x21d taskqueue_thread_loop(c478a5a0,c4352d38,c0c980db,343,c0df8440,...) at taskqueue_thread_loop+0x94 fork_exit(c08c9200,c478a5a0,c4352d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4352d70, ebp = 0 --- Tracing command kernel pid 0 tid 100027 td 0xc45c36c0 sched_switch(c45c36c0,0,104,191,dfe81a04,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c45c36c0,0,c0ca113a,260,c45c36c0,...) at sleepq_switch+0x15f sleepq_wait(c4749100,0,c0c9d7d9,c0c92561,0,...) at sleepq_wait+0x63 msleep_spin(c4749100,c4749118,c0c92561,0,c0c9b36c,...) at msleep_spin+0x21d taskqueue_thread_loop(c0dc97a0,c4326d38,c0c980db,343,c0df8440,...) at taskqueue_thread_loop+0x94 fork_exit(c08c9200,c0dc97a0,c4326d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4326d70, ebp = 0 --- Tracing command kernel pid 0 tid 100026 td 0xc45c3900 sched_switch(c45c3900,0,104,191,dfe7f768,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c45c3900,0,c0ca113a,260,c45c3900,...) at sleepq_switch+0x15f sleepq_wait(c4749100,0,c0c9d7d9,c0c92561,0,...) at sleepq_wait+0x63 msleep_spin(c4749100,c4749118,c0c92561,0,c0c9b36c,...) at msleep_spin+0x21d taskqueue_thread_loop(c0dc97a0,c4323d38,c0c980db,343,c0df8440,...) at taskqueue_thread_loop+0x94 fork_exit(c08c9200,c0dc97a0,c4323d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4323d70, ebp = 0 --- Tracing command kernel pid 0 tid 100025 td 0xc45c3b40 sched_switch(c45c3b40,0,104,191,dfe7d2f0,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c45c3b40,0,c0ca113a,260,c45c3b40,...) at sleepq_switch+0x15f sleepq_wait(c4749100,0,c0c9d7d9,c0c92561,0,...) at sleepq_wait+0x63 msleep_spin(c4749100,c4749118,c0c92561,0,c0c9b36c,...) at msleep_spin+0x21d taskqueue_thread_loop(c0dc97a0,c4320d38,c0c980db,343,c0df8440,...) at taskqueue_thread_loop+0x94 fork_exit(c08c9200,c0dc97a0,c4320d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4320d70, ebp = 0 --- Tracing command kernel pid 0 tid 100020 td 0xc47566c0 sched_switch(c47566c0,0,104,191,f5ecaf38,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c47566c0,0,c0ca113a,260,0,...) at sleepq_switch+0x15f sleepq_wait(c4749380,0,c0c92561,0,0,...) at sleepq_wait+0x63 _sleep(c4749380,c4749398,0,c0c92561,0,...) at _sleep+0x36b taskqueue_thread_loop(c0e0cfc8,c4311d38,c0c980db,343,c0df8440,...) at taskqueue_thread_loop+0xba fork_exit(c08c9200,c0e0cfc8,c4311d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4311d70, ebp = 0 --- Tracing command kernel pid 0 tid 100018 td 0xc4756b40 sched_switch(c4756b40,0,104,191,dfd47500,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c4756b40,0,c0ca113a,260,0,...) at sleepq_switch+0x15f sleepq_wait(c4749600,0,c0c92561,0,0,...) at sleepq_wait+0x63 _sleep(c4749600,c4749618,0,c0c92561,0,...) at _sleep+0x36b taskqueue_thread_loop(c0df8cd0,c430bd38,c0c980db,343,c0df8440,...) at taskqueue_thread_loop+0xba fork_exit(c08c9200,c0df8cd0,c430bd38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc430bd70, ebp = 0 --- Tracing command kernel pid 0 tid 100016 td 0xc457e480 sched_switch(c457e480,0,104,191,1662ba82,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,0,...) at mi_switch+0x200 sleepq_switch(c457e480,0,c0ca113a,260,0,...) at sleepq_switch+0x15f sleepq_wait(c4561dc0,0,c0c92561,0,0,...) at sleepq_wait+0x63 _sleep(c4561dc0,c4561dd8,0,c0c92561,0,...) at _sleep+0x36b taskqueue_thread_loop(c0e0ba60,c4305d38,c0c980db,343,c0df8440,...) at taskqueue_thread_loop+0xba fork_exit(c08c9200,c0e0ba60,c4305d38) at fork_exit+0xb8 fork_trampoline() at fork_trampoline+0x8 --- trap 0, eip = 0, esp = 0xc4305d70, ebp = 0 --- Tracing command kernel pid 0 tid 100000 td 0xc0df86f0 sched_switch(c0df86f0,0,104,191,d25f99fa,...) at sched_switch+0x406 mi_switch(104,0,c0ca113a,1eb,44,...) at mi_switch+0x200 sleepq_switch(c0df86f0,0,c0ca113a,283,0,...) at sleepq_switch+0x15f sleepq_timedwait(c0df8440,44,c0c9efe2,0,0,...) at sleepq_timedwait+0x6b _sleep(c0df8440,0,44,c0c9efe2,2710,...) at _sleep+0x339 scheduler(0,141ec00,141ec00,141e000,1425000,...) at scheduler+0x23e mi_startup() at mi_startup+0x96 begin() at begin+0x2c db:0:allt> call doadump Physical memory: 1007 MB Dumping 160 MB: 145 129 113 97 81 65 49 33 17 1 Dump complete = 0xf db:0:doadump> reset (kgdb) bt #0 doadump () at pcpu.h:246 #1 0xc04d0989 in db_fncall (dummy1=0xc08bc9ba, dummy2=0x0, dummy3=0xffffffff, dummy4=0xc42ff7ec "") at ../../../ddb/db_command.c:548 #2 0xc04d0dbf in db_command (last_cmdp=0xc0dc797c, cmd_table=0x0, dopager=0x0) at ../../../ddb/db_command.c:445 #3 0xc04d0e74 in db_command_script (command=0xc0dc8888 "call doadump") at ../../../ddb/db_command.c:516 #4 0xc04d4ff0 in db_script_exec (scriptname=0xc0dc81e0 "doadump", warnifnotfound=Variable "warnifnotfound" is not available. ) at ../../../ddb/db_script.c:302 #5 0xc04d5081 in db_run_cmd (addr=0x1, have_addr=0x0, count=0xc0fc71e0, modif=0xc42ff924 "") at ../../../ddb/db_script.c:375 #6 0xc04d0d81 in db_command (last_cmdp=0xc0dc797c, cmd_table=0x0, dopager=0x1) at ../../../ddb/db_command.c:445 #7 0xc04d0eda in db_command_loop () at ../../../ddb/db_command.c:498 #8 0xc04d2d7d in db_trap (type=0x3, code=0x0) at ../../../ddb/db_main.c:229 #9 0xc08bc836 in kdb_trap (type=0x3, code=0x0, tf=0xc42ffacc) at ../../../kern/subr_kdb.c:535 #10 0xc0bcb6ab in trap (frame=0xc42ffacc) at ../../../i386/i386/trap.c:690 #11 0xc0bad10b in calltrap () at ../../../i386/i386/exception.s:165 #12 0xc08bc9ba in kdb_enter (why=0xc0c9cbe4 "panic", msg=0xc0c9cbe4 "panic") at cpufunc.h:71 #13 0xc088ce16 in panic (fmt=0xc0cc4152 "handle_allocdirect_partdone: lost dep") at ../../../kern/kern_shutdown.c:562 #14 0xc0ac391e in handle_allocdirect_partdone (adp=0xc6c7ba80, wkhd=0x0) at ../../../ufs/ffs/ffs_softdep.c:7555 #15 0xc0ac5b7b in softdep_disk_write_complete (bp=0xd875b1a0) at ../../../ufs/ffs/ffs_softdep.c:8126 #16 0xc0acdc73 in ffs_backgroundwritedone (bp=0xd875b1a0) at buf.h:411 #17 0xc0907cc3 in bufdone (bp=0xd875b1a0) at ../../../kern/vfs_bio.c:3255 #18 0xc082ece5 in g_vfs_done (bip=0xc4800558) at ../../../geom/geom_vfs.c:97 #19 0xc09042f5 in biodone (bp=0xc4800558) at ../../../kern/vfs_bio.c:3096 #20 0xc0829a47 in g_io_schedule_up (tp=0xc457e900) at ../../../geom/geom_io.c:669 #21 0xc082a1fd in g_up_procbody () at ../../../geom/geom_kern.c:95 #22 0xc0863018 in fork_exit (callout=0xc082a170 , arg=0x0, frame=0xc42ffd38) at ../../../kern/kern_fork.c:843 #23 0xc0bad180 in fork_trampoline () at ../../../i386/i386/exception.s:270 (kgdb) f 14 #14 0xc0ac391e in handle_allocdirect_partdone (adp=0xc6c7ba80, wkhd=0x0) at ../../../ufs/ffs/ffs_softdep.c:7555 7555 panic("handle_allocdirect_partdone: lost dep"); (kgdb) l 7550 TAILQ_FOREACH(listadp, listhead, ad_next) 7551 /* found our block */ 7552 if (listadp == adp) 7553 break; 7554 if (listadp == NULL) 7555 panic("handle_allocdirect_partdone: lost dep"); 7556 #endif /* DEBUG */ 7557 return; 7558 } 7559 /* (kgdb) info loc listhead = (struct allocdirectlst *) 0xc6c7b4d0 listadp = Variable "listadp" is not available. (kgdb) p *listhead $1 = {tqh_first = 0x0, tqh_last = 0xc6c7b4d0} (kgdb) p listhead->tqh_last $3 = (struct allocdirect **) 0xc6c7b4d0 (kgdb) p *listhead->tqh_last $4 = (struct allocdirect *) 0x0 (kgdb) $ svn diff -x -p /usr/src/sys Index: /usr/src/sys/ufs/ufs/ufs_dirhash.c =================================================================== --- /usr/src/sys/ufs/ufs/ufs_dirhash.c (revision 200565) +++ /usr/src/sys/ufs/ufs/ufs_dirhash.c (working copy) @@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$"); static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables"); -static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); - static int ufs_mindirhashsize = DIRBLKSIZ * 5; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, &ufs_mindirhashsize, Index: /usr/src/sys/ufs/ufs/ufs_vnops.c =================================================================== --- /usr/src/sys/ufs/ufs/ufs_vnops.c (revision 200565) +++ /usr/src/sys/ufs/ufs/ufs_vnops.c (working copy) @@ -114,6 +114,8 @@ static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; static vop_pathconf_t ufsfifo_pathconf; +SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); + /* * A virgin directory (no blushing please). */ @@ -902,6 +904,9 @@ ufs_link(ap) error = EXDEV; goto out; } + if (VTOI(tdvp)->i_effnlink < 2) + panic("ufs_link: Bad link count %d on parent", + VTOI(tdvp)->i_effnlink); ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; @@ -916,7 +921,7 @@ ufs_link(ap) DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) - softdep_change_linkcnt(ip); + softdep_setup_link(VTOI(tdvp), ip); error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); @@ -929,7 +934,7 @@ ufs_link(ap) DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) - softdep_change_linkcnt(ip); + softdep_revert_link(VTOI(tdvp), ip); } out: return (error); @@ -990,6 +995,11 @@ ufs_whiteout(ap) return (error); } +static volatile int rename_restarts; +SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, + __DEVOLATILE(int *, &rename_restarts), 0, + "Times rename had to restart due to lock contention"); + /* * Rename system call. * rename("foo", "bar"); @@ -1029,14 +1039,16 @@ ufs_rename(ap) struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; + struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; - struct inode *ip, *xp, *dp; + struct inode *ip, *xp, *tdp, *fdp; struct direct newdir; - int doingdirectory = 0, oldparent = 0, newparent = 0; + int doingdirectory, newparent; int error = 0, ioflag; - ino_t fvp_ino; + struct mount *mp; + ino_t ino; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || @@ -1049,7 +1061,6 @@ ufs_rename(ap) if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; -abortit: if (tdvp == tvp) vrele(tdvp); else @@ -1060,63 +1071,202 @@ ufs_rename(ap) vrele(fvp); return (error); } - + mp = tdvp->v_mount; + VOP_UNLOCK(tdvp, 0); + if (tvp && tvp != tdvp) + VOP_UNLOCK(tvp, 0); + error = vfs_busy(mp, 0); + if (error) { + mp = NULL; + goto releout; + } +relock: + /* + * We need to acquire 2 to 4 locks depending on whether tvp is NULL + * and fdvp and tdvp are the same directory. Subsequently we need + * to double-check all paths and in the directory rename case we + * need to verify that we are not creating a directory loop. To + * handle this we acquire fdvp and fvp in order using blocking + * locks followed by non-blocking acquisitions for all remaining + * locks. If we fail to acquire any lock in the path we will + * drop all held locks, acquire the new lock in a blocking fashion, + * and then release it and restart the rename. This acquire/release + * step ensures that we do not spin on a lock waiting for release. + */ + error = vn_lock(fdvp, LK_EXCLUSIVE); + if (error) + goto releout; + error = vn_lock(fvp, LK_EXCLUSIVE); + if (error) { + VOP_UNLOCK(fdvp, 0); + goto releout; + } + if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(fvp, 0); + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto releout; + VOP_UNLOCK(tdvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + /* + * If vn_lock fails due to VI_DOOMED being set on fdvp, fvp, or tdvp + * it is a fatal condition. If it occurs on tvp we carry on as if + * it was removed and ufs_lookup_ino() below will resolve the + * condition that cleared it. + */ + if (tvp && vn_lock(tvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + error = vn_lock(tvp, LK_EXCLUSIVE); + if (error) + tvp = NULL; + else + VOP_UNLOCK(tvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + fdp = VTOI(fdvp); + ip = VTOI(fvp); + tdp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + /* + * Re-resolve fvp to be certain it still exists at the same inode + * number. If the lookup fails abort the rename. If the inode + * number has changed we need to VGET it and then restart the + * whole lock operation. + */ + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) + goto unlockout; + if (ino != ip->i_number) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + vput(fvp); + if (tvp) + VOP_UNLOCK(tvp, 0); + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &fvp); + if (error) + goto releout; + VOP_UNLOCK(fvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + /* + * Re-lookup to now that all of the locks are held. + */ + error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); + if (error != 0 && error != EJUSTRETURN) + goto unlockout; + /* + * If tvp disappeared we just need to restart. + */ + if (error == EJUSTRETURN && tvp != NULL) { + vput(tvp); + tvp = NULL; + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + /* + * If tvp changed out from under us we need to drop the old one, + * VGET the new one, and restart. + */ + if (error == 0 && (tvp == NULL || xp->i_number != ino)) { + if (tvp != NULL) + vput(tvp); + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &tvp); + if (error) + tvp = NULL; + else + VOP_UNLOCK(tvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; - goto abortit; + goto unlockout; } - /* * Renaming a file to itself has no effect. The upper layers should - * not call us in that case. Temporarily just warn if they do. + * not call us in that case. */ - if (fvp == tvp) { - printf("ufs_rename: fvp == tvp (can't happen)\n"); - error = 0; - goto abortit; - } - - if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) - goto abortit; - dp = VTOI(fdvp); - ip = VTOI(fvp); + if (fvp == tvp) + panic("ufs_rename: fvp == tvp (can't happen)\n"); + doingdirectory = 0; + newparent = 0; + ino = ip->i_number; if (ip->i_nlink >= LINK_MAX) { - VOP_UNLOCK(fvp, 0); error = EMLINK; - goto abortit; + goto unlockout; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) - || (dp->i_flags & APPEND)) { - VOP_UNLOCK(fvp, 0); + || (fdp->i_flags & APPEND)) { error = EPERM; - goto abortit; + goto unlockout; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || - dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || - (ip->i_flag & IN_RENAME)) { - VOP_UNLOCK(fvp, 0); + fdp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; - goto abortit; + goto unlockout; } - ip->i_flag |= IN_RENAME; - oldparent = dp->i_number; + if (fdp->i_number != tdp->i_number) + newparent = tdp->i_number; doingdirectory = 1; } - vrele(fdvp); /* - * When the target exists, both the directory - * and target vnodes are returned locked. + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory hierarchy above the target, as this would + * orphan everything below the source directory. Also + * the user must have write permission in the source so + * as to be able to change "..". */ - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); + if (doingdirectory && newparent) { + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); + if (error) + goto unlockout; + error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, + &ino); + /* + * We encountered a lock that we have to wait for. Unlock + * everything else and VGET before restarting. + */ + if (ino) { + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(fvp, 0); + if (tvp) + VOP_UNLOCK(tvp, 0); + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error == 0) + vput(nvp); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + if (error) + goto unlockout; + if ((tcnp->cn_flags & SAVESTART) == 0) + panic("ufs_rename: lost to startdir"); + } + if (ip->i_effnlink == 0 || fdp->i_effnlink == 0 || tdp->i_effnlink == 0) + panic("Bad effnlink ip %p, fdp %p, tdp %p", ip, fdp, tdp); /* * 1) Bump link count while we're moving stuff @@ -1129,49 +1279,12 @@ ufs_rename(ap) DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | - DOINGASYNC(fvp)))) != 0) { - VOP_UNLOCK(fvp, 0); + softdep_setup_link(tdp, ip); + error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp))); + if (error) goto bad; - } /* - * If ".." must be changed (ie the directory gets a new - * parent) then the source directory must not be in the - * directory hierarchy above the target, as this would - * orphan everything below the source directory. Also - * the user must have write permission in the source so - * as to be able to change "..". We must repeat the call - * to namei, as the parent directory is unlocked by the - * call to checkpath(). - */ - error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); - fvp_ino = ip->i_number; - VOP_UNLOCK(fvp, 0); - if (oldparent != dp->i_number) - newparent = dp->i_number; - if (doingdirectory && newparent) { - if (error) /* write access check above */ - goto bad; - if (xp != NULL) - vput(tvp); - error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred); - if (error) - goto out; - if ((tcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost to startdir"); - VREF(tdvp); - error = relookup(tdvp, &tvp, tcnp); - if (error) - goto out; - vrele(tdvp); - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); - } - /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory @@ -1179,46 +1292,27 @@ ufs_rename(ap) * expunge the original entry's existence. */ if (xp == NULL) { - if (dp->i_dev != ip->i_dev) + if (tdp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); - /* - * Account for ".." in new directory. - * When source and destination have the same - * parent we don't fool with the link count. - */ if (doingdirectory && newparent) { - if ((nlink_t)dp->i_nlink >= LINK_MAX) { + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't adjust the link count. The + * actual link modification is completed when + * .. is rewritten below. + */ + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } - dp->i_effnlink++; - dp->i_nlink++; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | - DOINGASYNC(tdvp))); - if (error) - goto bad; } ufs_makedirentry(ip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); - if (error) { - if (doingdirectory && newparent) { - dp->i_effnlink--; - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - (void)UFS_UPDATE(tdvp, 1); - } + if (error) goto bad; - } - vput(tdvp); } else { - if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + if (xp->i_dev != tdp->i_dev || xp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). @@ -1231,7 +1325,7 @@ ufs_rename(ap) * destination of the rename. This implements append-only * directories. */ - if ((dp->i_mode & S_ISTXT) && + if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; @@ -1242,9 +1336,9 @@ ufs_rename(ap) * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ - if ((xp->i_mode&IFMT) == IFDIR) { + if ((xp->i_mode & IFMT) == IFDIR) { if ((xp->i_effnlink > 2) || - !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { + !ufs_dirempty(xp, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } @@ -1257,16 +1351,16 @@ ufs_rename(ap) error = EISDIR; goto bad; } - error = ufs_dirrewrite(dp, xp, ip->i_number, + error = ufs_dirrewrite(tdp, xp, ip->i_number, IFTODT(ip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) goto bad; if (doingdirectory) { if (!newparent) { - dp->i_effnlink--; + tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); + softdep_change_linkcnt(tdp); } xp->i_effnlink--; if (DOINGSOFTDEP(tvp)) @@ -1285,9 +1379,9 @@ ufs_rename(ap) * them now. */ if (!newparent) { - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; + tdp->i_nlink--; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; } xp->i_nlink--; DIP_SET(xp, i_nlink, xp->i_nlink); @@ -1295,105 +1389,80 @@ ufs_rename(ap) ioflag = IO_NORMAL; if (!DOINGASYNC(tvp)) ioflag |= IO_SYNC; + /* Don't go to bad here as the new link exists. */ if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_thread)) != 0) - goto bad; + goto unlockout; } - vput(tdvp); - vput(tvp); - xp = NULL; } /* - * 3) Unlink the source. + * 3) Unlink the source. We have to resolve the path again to + * fixup the directory offset and count for ufs_dirremove. */ - fcnp->cn_flags &= ~MODMASK; - fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; - if ((fcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost from startdir"); - VREF(fdvp); - error = relookup(fdvp, &fvp, fcnp); - if (error == 0) - vrele(fdvp); - if (fvp != NULL) { - xp = VTOI(fvp); - dp = VTOI(fdvp); - } else { - /* - * From name has disappeared. IN_RENAME is not sufficient - * to protect against directory races due to timing windows, - * so we have to remove the panic. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. - */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - vrele(ap->a_fvp); - return (0); + if (fdvp == tdvp) { + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) + panic("ufs_rename: from entry went away!"); } /* - * Ensure that the directory entry still exists and has not - * changed while the new name has been entered. If the source is - * a file then the entry may have been unlinked or renamed. In - * either case there is no further work to be done. If the source - * is a directory then it cannot have been rmdir'ed; the IN_RENAME - * flag ensures that it cannot be moved by another rename or removed - * by a rmdir. + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. */ - if (xp != ip) { + if (doingdirectory && newparent) { /* - * From name resolves to a different inode. IN_RENAME is - * not sufficient protection against timing window races - * so we can't panic here. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. + * If xp exists we simply use its link, otherwise we must + * add a new one. */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - } else { - /* - * If the source is a directory with a - * new parent, the link count of the old - * parent directory must be decremented - * and ".." set to point to the new parent. - */ - if (doingdirectory && newparent) { - xp->i_offset = mastertemplate.dot_reclen; - ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); - cache_purge(fdvp); + if (xp == NULL) { + tdp->i_effnlink++; + tdp->i_nlink++; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_setup_dotdot_link(tdp, ip); + error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | + DOINGASYNC(tdvp))); + /* Don't go to bad here as the new link exists. */ + if (error) + goto unlockout; } - error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); - xp->i_flag &= ~IN_RENAME; + ip->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(ip, fdp, newparent, DT_DIR, 0); + cache_purge(fdvp); } - if (dp) - vput(fdvp); - if (xp) - vput(fvp); - vrele(ap->a_fvp); + error = ufs_dirremove(fdvp, ip, fcnp->cn_flags, 0); + +unlockout: + vput(tdvp); + vput(fdvp); + vput(fvp); + if (tvp) + vput(tvp); + if (mp) + vfs_unbusy(mp); return (error); bad: - if (xp) - vput(ITOV(xp)); - vput(ITOV(dp)); -out: - if (doingdirectory) - ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - ip->i_flag &= ~IN_RENAME; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - vput(fvp); - } else - vrele(fvp); + ip->i_effnlink--; + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_revert_link(tdp, ip); + goto unlockout; + +releout: + vrele(tdvp); + if (tvp) + vrele(tvp); + vrele(fdvp); + vrele(fvp); + if (mp) + vfs_unbusy(mp); + return (error); } @@ -1565,8 +1634,7 @@ ufs_mkdir(ap) ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); - if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); @@ -1582,8 +1650,8 @@ ufs_mkdir(ap) DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); + softdep_setup_mkdir(dp, ip); + error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); if (error) goto bad; #ifdef MAC @@ -1701,8 +1769,6 @@ bad: dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(dvp)) - softdep_change_linkcnt(dp); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. @@ -1712,7 +1778,8 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_mkdir(dp, ip); + vput(tvp); } out: @@ -1748,10 +1815,13 @@ ufs_rmdir(ap) * tries to remove a locally mounted on directory). */ error = 0; - if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) { + if (ip->i_effnlink < 2) { error = EINVAL; goto out; } + if (dp->i_effnlink < 3) + panic("ufs_dirrem: Bad link count %d on parent", + dp->i_effnlink); if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; @@ -1775,18 +1845,14 @@ ufs_rmdir(ap) */ dp->i_effnlink--; ip->i_effnlink--; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; - if (DOINGSOFTDEP(vp)) { - softdep_change_linkcnt(dp); - softdep_change_linkcnt(ip); - } + if (DOINGSOFTDEP(vp)) + softdep_revert_rmdir(dp, ip); goto out; } cache_purge(dvp); @@ -2282,6 +2348,9 @@ ufs_makeinode(mode, dvp, vpp, cnp) if ((mode & IFMT) == 0) mode |= IFREG; + if (VTOI(dvp)->i_effnlink < 2) + panic("ufs_makeinode: Bad link count %d on parent", + VTOI(dvp)->i_effnlink); error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); @@ -2411,7 +2480,7 @@ ufs_makeinode(mode, dvp, vpp, cnp) ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) { ip->i_mode &= ~ISGID; @@ -2484,7 +2553,7 @@ bad: DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(ip); + softdep_revert_create(VTOI(dvp), ip); vput(tvp); return (error); } Index: /usr/src/sys/ufs/ufs/ufsmount.h =================================================================== --- /usr/src/sys/ufs/ufs/ufsmount.h (revision 200565) +++ /usr/src/sys/ufs/ufs/ufsmount.h (working copy) @@ -57,6 +57,7 @@ struct ucred; struct uio; struct vnode; struct ufs_extattr_per_mount; +struct jblocks; /* This structure describes the UFS specific mount structure data. */ struct ufsmount { @@ -75,6 +76,10 @@ struct ufsmount { long um_numindirdeps; /* outstanding indirdeps */ struct workhead softdep_workitem_pending; /* softdep work queue */ struct worklist *softdep_worklist_tail; /* Tail pointer for above */ + struct workhead softdep_journal_pending; /* journal work queue */ + struct worklist *softdep_journal_tail; /* Tail pointer for above */ + struct jblocks *softdep_jblocks; /* Journal block information */ + int softdep_on_journal; /* Items on the journal list */ int softdep_on_worklist; /* Items on the worklist */ int softdep_on_worklist_inprogress; /* Busy items on worklist */ int softdep_deps; /* Total dependency count */ Index: /usr/src/sys/ufs/ufs/ufs_lookup.c =================================================================== --- /usr/src/sys/ufs/ufs/ufs_lookup.c (revision 200565) +++ /usr/src/sys/ufs/ufs/ufs_lookup.c (working copy) @@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, /* true if old FS format...*/ #define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) -static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *, - ino_t *); - /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. @@ -134,11 +131,11 @@ ufs_lookup(ap) } */ *ap; { - return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); + return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } -static int -ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, +int +ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ @@ -464,6 +461,8 @@ notfound: return (ENOENT); found: + if (dd_ino != NULL) + *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* @@ -486,11 +485,6 @@ found: if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1); - if (dd_ino != NULL) { - *dd_ino = ino; - return (0); - } - /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. @@ -520,6 +514,8 @@ found: dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; + if (dd_ino != NULL) + return (0); if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; @@ -560,6 +556,8 @@ found: dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); + if (dd_ino != NULL) + return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); @@ -567,6 +565,8 @@ found: cnp->cn_flags |= SAVENAME; return (0); } + if (dd_ino != NULL) + return (0); /* * Step through the translation in the name. We do not `vput' the @@ -598,7 +598,7 @@ found: * to the inode we looked up before vdp lock was * dropped. */ - error = ufs_lookup_(pdp, NULL, cnp, &ino1); + error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); @@ -819,29 +819,12 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } - if (softdep_setup_directory_add(bp, dp, dp->i_offset, - dirp->d_ino, newdirbp, 1) == 0) { - bdwrite(bp); - return (UFS_UPDATE(dvp, 0)); - } - /* We have just allocated a directory block in an - * indirect block. Rather than tracking when it gets - * claimed by the inode, we simply do a VOP_FSYNC - * now to ensure that it is there (in case the user - * does a future fsync). Note that we have to unlock - * the inode for the entry that we just entered, as - * the VOP_FSYNC may need to lock other inodes which - * can lead to deadlock if we also hold a lock on - * the newly entered node. - */ - if ((error = bwrite(bp))) - return (error); - if (tvp != NULL) - VOP_UNLOCK(tvp, 0); - error = VOP_FSYNC(dvp, MNT_WAIT, td); - if (tvp != NULL) - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); - return (error); + softdep_setup_directory_add(bp, dp, dp->i_offset, + dirp->d_ino, newdirbp, 1); + if (newdirbp) + bdwrite(newdirbp); + bdwrite(bp); + return (UFS_UPDATE(dvp, 0)); } if (DOINGASYNC(dvp)) { bdwrite(bp); @@ -976,6 +959,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) (void) softdep_setup_directory_add(bp, dp, dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); + if (newdirbp) + bdwrite(newdirbp); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { @@ -1084,7 +1069,7 @@ out: if (DOINGSOFTDEP(dvp)) { if (ip) { ip->i_effnlink--; - softdep_change_linkcnt(ip); + softdep_setup_unlink(dp, ip); softdep_setup_remove(bp, dp, ip, isrmdir); } if (softdep_slowdown(dvp)) { @@ -1146,7 +1131,7 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) ep->d_type = newtype; oip->i_effnlink--; if (DOINGSOFTDEP(vdp)) { - softdep_change_linkcnt(oip); + softdep_setup_unlink(dp, oip); softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { @@ -1267,25 +1252,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cre /* * Check if source directory is in the path of the target directory. - * Target is supplied locked, source is unlocked. - * The target is always vput before returning. */ int -ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred) +ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { - struct vnode *vp, *vp1; + struct mount *mp; + struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; - vp = ITOV(target); - if (target->i_number == source_ino) { - error = EEXIST; - goto out; - } - error = 0; + vp = tvp = ITOV(target); + mp = vp->v_mount; + *wait_ino = 0; + if (target->i_number == source_ino) + return (EEXIST); + if (target->i_number == parent_ino) + return (0); if (target->i_number == ROOTINO) - goto out; - + return (0); + error = 0; for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) @@ -1296,9 +1281,13 @@ int } if (dd_ino == ROOTINO) break; - error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1); - if (error != 0) + if (dd_ino == parent_ino) break; + error = VFS_VGET(mp, dd_ino, LK_EXCLUSIVE | LK_NOWAIT, &vp1); + if (error != 0) { + *wait_ino = dd_ino; + break; + } /* Recheck that ".." still points to vp1 after relock of vp */ error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) { @@ -1310,14 +1299,14 @@ int vput(vp1); continue; } - vput(vp); + if (vp != tvp) + vput(vp); vp = vp1; } -out: if (error == ENOTDIR) - printf("checkpath: .. not a directory\n"); - if (vp != NULL) + panic("checkpath: .. not a directory\n"); + if (vp != tvp) vput(vp); return (error); } Index: /usr/src/sys/ufs/ufs/ufs_extern.h =================================================================== --- /usr/src/sys/ufs/ufs/ufs_extern.h (revision 200565) +++ /usr/src/sys/ufs/ufs/ufs_extern.h (working copy) @@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *); int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, struct buf *, int *, int *); int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **); -int ufs_checkpath(ino_t, struct inode *, struct ucred *); +int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *); void ufs_dirbad(struct inode *, doff_t, char *); int ufs_dirbadentry(struct vnode *, struct direct *, int); int ufs_dirempty(struct inode *, ino_t, struct ucred *); @@ -69,6 +69,8 @@ int ufs_direnter(struct vnode *, struct vnode *, struct componentname *, struct buf *); int ufs_dirremove(struct vnode *, struct inode *, int, int); int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); +int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *, + ino_t *); int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); int ufs_inactive(struct vop_inactive_args *); int ufs_init(struct vfsconf *); @@ -81,6 +83,9 @@ vfs_root_t ufs_root; int ufs_uninit(struct vfsconf *); int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **); +#include +SYSCTL_DECL(_vfs_ufs); + /* * Soft update function prototypes. */ @@ -94,6 +99,17 @@ void softdep_setup_directory_change(struct buf *, void softdep_change_linkcnt(struct inode *); void softdep_releasefile(struct inode *); int softdep_slowdown(struct vnode *); +void softdep_setup_create(struct inode *, struct inode *); +void softdep_setup_dotdot_link(struct inode *, struct inode *); +void softdep_setup_link(struct inode *, struct inode *); +void softdep_setup_mkdir(struct inode *, struct inode *); +void softdep_setup_rmdir(struct inode *, struct inode *); +void softdep_setup_unlink(struct inode *, struct inode *); +void softdep_revert_create(struct inode *, struct inode *); +void softdep_revert_dotdot_link(struct inode *, struct inode *); +void softdep_revert_link(struct inode *, struct inode *); +void softdep_revert_mkdir(struct inode *, struct inode *); +void softdep_revert_rmdir(struct inode *, struct inode *); /* * Flags to low-level allocation routines. The low 16-bits are reserved Index: /usr/src/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_vfsops.c (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_vfsops.c (working copy) @@ -858,6 +858,7 @@ ffs_mountfs(devvp, mp, td) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); + mp->mnt_stat.f_iosize = fs->fs_bsize; if( mp->mnt_flag & MNT_ROOTFS) { /* @@ -899,7 +900,6 @@ ffs_mountfs(devvp, mp, td) * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ - mp->mnt_stat.f_iosize = fs->fs_bsize; (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ @@ -1829,9 +1829,6 @@ ffs_bufwrite(struct buf *bp) } BO_UNLOCK(bp->b_bufobj); - /* Mark the buffer clean */ - bundirty(bp); - /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the @@ -1872,9 +1869,16 @@ ffs_bufwrite(struct buf *bp) newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES - /* move over the dependencies */ - if (!LIST_EMPTY(&bp->b_dep)) - softdep_move_dependencies(bp, newbp); + /* + * Move over the dependencies. If there are rollbacks, + * leave the parent buffer dirtied as it will need to + * be written again. + */ + if (LIST_EMPTY(&bp->b_dep) || + softdep_move_dependencies(bp, newbp) == 0) + bundirty(bp); +#else + bundirty(bp); #endif /* @@ -1887,8 +1891,11 @@ ffs_bufwrite(struct buf *bp) */ bqrelse(bp); bp = newbp; - } + } else + /* Mark the buffer clean */ + bundirty(bp); + /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); Index: /usr/src/sys/ufs/ffs/ffs_softdep.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_softdep.c (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_softdep.c (working copy) @@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -130,10 +131,12 @@ softdep_setup_inomapdep(bp, ip, newinum) } void -softdep_setup_blkmapdep(bp, mp, newblkno) +softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; + int frags; + int oldfrags; { panic("softdep_setup_blkmapdep called"); @@ -403,24 +406,6 @@ softdep_get_depcounts(struct mount *mp, * These definitions need to be adapted to the system to which * this file is being ported. */ -/* - * malloc types defined for the softdep system. - */ -static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); -static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); -static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); -static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); -static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); -static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); -static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); -static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); -static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); -static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); -static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); -static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); -static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); -static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block"); -static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes"); #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) @@ -438,8 +423,62 @@ softdep_get_depcounts(struct mount *mp, #define D_MKDIR 11 #define D_DIRREM 12 #define D_NEWDIRBLK 13 -#define D_LAST D_NEWDIRBLK +#define D_FREEWORK 14 +#define D_FREEDEP 15 +#define D_JADDREF 16 +#define D_JREMREF 17 +#define D_JNEWBLK 18 +#define D_JFREEBLK 19 +#define D_JFREEFRAG 20 +#define D_JSEG 21 +#define D_JSEGDEP 22 +#define D_LAST D_JSEGDEP +unsigned long dep_current[D_LAST + 1]; +unsigned long dep_total[D_LAST + 1]; + + +SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, + "total dependencies allocated"); +SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, + "current dependencies allocated"); + +#define SOFTDEP_TYPE(type, str, long) \ + static MALLOC_DEFINE(M_ ## type, #str, long); \ + SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ + &dep_total[D_ ## type], 0, ""); \ + SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ + &dep_current[D_ ## type], 0, ""); + +SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); +SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); +SOFTDEP_TYPE(NEWBLK, newblk, "New block allocation"); +SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, + "Block or frag allocated from cyl group map"); +SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); +SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); +SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); +SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); +SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); +SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); +SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); +SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); +SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); +SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); +SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); +SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); +SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); +SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); +SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); +SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); +SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); +SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); +SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); + +static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); +static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); + /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX @@ -458,7 +497,16 @@ static struct malloc_type *memtype[] = { M_DIRADD, M_MKDIR, M_DIRREM, - M_NEWDIRBLK + M_NEWDIRBLK, + M_FREEWORK, + M_FREEDEP, + M_JADDREF, + M_JREMREF, + M_JNEWBLK, + M_JFREEBLK, + M_JFREEFRAG, + M_JSEG, + M_JSEGDEP }; #define DtoM(type) (memtype[type]) @@ -467,17 +515,21 @@ static struct malloc_type *memtype[] = { * Names of malloc types. */ #define TYPENAME(type) \ - ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") + ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ +#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) +#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) + /* * Forward declarations. */ struct inodedep_hashhead; struct newblk_hashhead; struct pagedep_hashhead; +struct bmsafemap_hashhead; /* * Internal function prototypes. @@ -493,53 +545,128 @@ static int flush_inodedep_deps(struct mount *, ino static int flush_deplist(struct allocdirectlst *, int, int *); static int handle_written_filepage(struct pagedep *, struct buf *); static void diradd_inode_written(struct diradd *, struct inodedep *); +static int handle_written_indirdep(struct indirdep *, struct buf *, + struct buf**); static int handle_written_inodeblock(struct inodedep *, struct buf *); -static void handle_allocdirect_partdone(struct allocdirect *); +static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); +static void handle_written_jaddref(struct jaddref *, struct jsegdep *); +static void handle_written_jremref(struct jremref *, struct jsegdep *); +static void handle_written_jseg(struct jseg *, struct buf *); +static void handle_written_jnewblk(struct jnewblk *, struct jsegdep *); +static void handle_written_jfreeblk(struct jfreeblk *, struct jsegdep *); +static void handle_written_jfreefrag(struct jfreefrag *, struct jsegdep *); +static void jseg_write(struct jblocks *, struct jseg *, uint8_t *); +static void jaddref_write(struct jaddref *, uint8_t *); +static void jremref_write(struct jremref *, uint8_t *); +static void jnewblk_write(struct jnewblk *, uint8_t *); +static void jfreeblk_write(struct jfreeblk *, uint8_t *); +static void jfreefrag_write(struct jfreefrag *, uint8_t *); +static void handle_allocdirect_partdone(struct allocdirect *, + struct workhead *); +static void allocdirect_jwork(struct allocdirect *, struct workhead *, + short type); +static void handle_workitem_indirdep(struct indirdep *); +static void indirdep_attach(struct indirdep *, struct buf *); +static void indirdep_complete(struct indirdep *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); +static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); +static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static void handle_workitem_remove(struct dirrem *, struct vnode *); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); -static void free_diradd(struct diradd *); -static void free_allocindir(struct allocindir *, struct inodedep *); +static void free_indirdep(struct indirdep *); +static void free_diradd(struct diradd *, struct workhead *); +static void merge_diradd(struct inodedep *, struct diradd *); +static void complete_diradd(struct diradd *); +static struct diradd *diradd_lookup(struct pagedep *, int); +static void cancel_diradd_dotdot(struct inode *, struct dirrem *); +static void free_allocindir(struct allocindir *, struct inodedep *, + struct workhead *, short); +static void cancel_allocindir(struct allocindir *, struct inodedep *, + struct freeblks *); +static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); -static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t, - ufs2_daddr_t *); -static void deallocate_dependencies(struct buf *, struct inodedep *); -static void free_allocdirect(struct allocdirectlst *, - struct allocdirect *, int); +static void free_jremref(struct jremref *); +static void free_jaddref(struct jaddref *); +static void free_jsegdep(struct jsegdep *); +static void free_jseg(struct jseg *); +static void free_jnewblk(struct jnewblk *); +static void free_jfreeblk(struct jfreeblk *); +static void free_jfreefrag(struct jfreefrag *); +static void free_freedep(struct freedep *); +static void cancel_jnewblk(struct jnewblk *, struct workhead *); +static void cancel_jaddref(struct jaddref *, struct inodedep *, + struct workhead *); +static void cancel_jfreefrag(struct jfreefrag *); +static int indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); +static int deallocate_dependencies(struct buf *, struct inodedep *, + struct freeblks *); +static void free_allocdirect(struct allocdirect *); +static void cancel_allocdirect(struct allocdirectlst *, + struct allocdirect *, struct freeblks *, int); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); +static void freework_freeblock(struct freework *); +static void freework_freeindir(struct freework *); static void handle_workitem_freeblocks(struct freeblks *, int); +static void handle_complete_freeblocks(struct freeblks *); +static void handle_workitem_indirblk(struct freework *); +static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static void setup_allocindir_phase2(struct buf *, struct inode *, - struct allocindir *); + struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, - ufs2_daddr_t); + ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); -static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long); +static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, + ufs_lbn_t); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); -static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *); +static struct freefrag *allocindir_merge(struct allocindir *, + struct allocindir *); +static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, + struct bmsafemap **); +static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, + int cg); static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t, struct newblk **); static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); -static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); +static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int, + struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct mount *mp, int, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int process_worklist_item(struct mount *, int); +static void jwork_move(short, short, struct workhead *, struct workhead *); static void add_to_worklist(struct worklist *); static void softdep_flush(void); static int softdep_speedup(void); +static int journal_mount(struct mount *, struct fs *, struct ucred *); +static void add_to_journal(struct worklist *); +static void remove_from_journal(struct worklist *); +static void softdep_process_journal(struct mount *, int); +static struct jremref *newjremref(struct dirrem *, struct inode *, + struct inode *ip, off_t); +static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, + uint16_t); +static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, + ufs2_daddr_t, int); +static struct freework *newfreework(struct freeblks *, struct freework *, + ufs_lbn_t, ufs2_daddr_t, int, int); +static void jwait(struct worklist *wk); +static struct inodedep *inodedep_lookup_ip(struct inode *); +static int bmsafemap_rollbacks(struct bmsafemap *); +static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); +static void handle_jwork(struct workhead *); /* * Exported softdep operations. @@ -572,40 +699,134 @@ MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) +#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT +#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE + #else /* DEBUG */ -static void worklist_insert(struct workhead *, struct worklist *); -static void worklist_remove(struct worklist *); +static void worklist_insert(struct workhead *, struct worklist *, int); +static void worklist_remove(struct worklist *, int); -#define WORKLIST_INSERT(head, item) worklist_insert(head, item) -#define WORKLIST_REMOVE(item) worklist_remove(item) +#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) +#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) +#define WORKLIST_REMOVE(item) worklist_remove(item, 1) +#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) static void -worklist_insert(head, item) +worklist_insert(head, item, locked) struct workhead *head; struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if (item->wk_state & ONWORKLIST) - panic("worklist_insert: already on list"); + panic("worklist_insert: %p %s(0x%X) already on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void -worklist_remove(item) +worklist_remove(item, locked) struct worklist *item; + int locked; { - mtx_assert(&lk, MA_OWNED); + if (locked) + mtx_assert(&lk, MA_OWNED); if ((item->wk_state & ONWORKLIST) == 0) - panic("worklist_remove: not on list"); + panic("worklist_remove: %p %s(0x%X) not on list", + item, TYPENAME(item->wk_type), item->wk_state); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* + * Merge two jsegdeps keeping only the oldest one as newer references + * can't be discarded until after older references. + */ +static inline struct jsegdep * +jsegdep_merge(struct jsegdep *one, struct jsegdep *two) +{ + struct jsegdep *swp; + + if (two == NULL) + return (one); + + if (one->jd_seg->js_seq > two->jd_seg->js_seq) { + swp = one; + one = two; + two = swp; + } + WORKLIST_REMOVE(&two->jd_list); + free_jsegdep(two); + + return (one); +} + +/* + * If two freedeps are compatible free one to reduce list size. + */ +static inline struct freedep * +freedep_merge(struct freedep *one, struct freedep *two) +{ + if (two == NULL) + return (one); + + if (one->fd_freework == two->fd_freework) { + WORKLIST_REMOVE(&two->fd_list); + free_freedep(two); + } + return (one); +} + +/* + * Move journal work from one list to another. Duplicate freedeps and + * jsegdeps are coalesced to keep the lists as small as possible. + */ +static void +jwork_move(type, line, dst, src) + short type; + short line; + struct workhead *dst; + struct workhead *src; +{ + struct freedep *freedep; + struct jsegdep *jsegdep; + struct worklist *wkn; + struct worklist *wk; + + KASSERT(dst != src, + ("jwork_move: dst == src")); + freedep = NULL; + jsegdep = NULL; + LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { + if (wk->wk_type == D_JSEGDEP) + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } + + mtx_assert(&lk, MA_OWNED); + while ((wk = LIST_FIRST(src)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(dst, wk); + if (wk->wk_type == D_JSEGDEP) { + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); +#ifdef DEBUG + jsegdep->jd_type = type; + jsegdep->jd_line = line; +#endif + continue; + } + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } +} + +/* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); @@ -623,13 +844,16 @@ workitem_free(item, type) #ifdef DEBUG if (item->wk_state & ONWORKLIST) - panic("workitem_free: still on list"); + panic("workitem_free: %s(0x%X) still on list", + TYPENAME(item->wk_type), item->wk_state); if (item->wk_type != type) - panic("workitem_free: type mismatch"); + panic("workitem_free: type mismatch %s != %s", + TYPENAME(item->wk_type), TYPENAME(type)); #endif ump = VFSTOUFS(item->wk_mp); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); + dep_current[type]--; free(item, DtoM(type)); } @@ -643,6 +867,8 @@ workitem_alloc(item, type, mp) item->wk_mp = mp; item->wk_state = 0; ACQUIRE_LOCK(&lk); + dep_current[type]++; + dep_total[type]++; VFSTOUFS(mp)->softdep_deps++; VFSTOUFS(mp)->softdep_accdeps++; FREE_LOCK(&lk); @@ -696,6 +922,9 @@ SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW SYSCTL_DECL(_vfs_ffs); +LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; +static u_long bmsafemap_hash; /* size of hash table - 1 */ + static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); @@ -779,7 +1008,7 @@ softdep_speedup(void) req_pending = 1; wakeup(&req_pending); } - + /* XXX Don't we really want to speedup the buf daemon? */ return speedup_syncer(); } @@ -799,7 +1028,8 @@ add_to_worklist(wk) mtx_assert(&lk, MA_OWNED); ump = VFSTOUFS(wk->wk_mp); if (wk->wk_state & ONWORKLIST) - panic("add_to_worklist: already on list"); + panic("add_to_worklist: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (LIST_EMPTY(&ump->softdep_workitem_pending)) LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); @@ -838,6 +1068,7 @@ softdep_process_worklist(mp, full) ACQUIRE_LOCK(&lk); loopcount = 1; starttime = time_second; + softdep_process_journal(mp, 0); while (ump->softdep_on_worklist > 0) { if ((cnt = process_worklist_item(mp, 0)) == -1) break; @@ -969,6 +1200,15 @@ process_worklist_item(mp, flags) handle_workitem_freefile(WK_FREEFILE(wk)); break; + case D_FREEWORK: + /* Final block in an indirect was freed. */ + handle_workitem_indirblk(WK_FREEWORK(wk)); + break; + + case D_INDIRDEP: + handle_workitem_indirdep(WK_INDIRDEP(wk)); + break; + default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); @@ -982,19 +1222,22 @@ process_worklist_item(mp, flags) /* * Move dependencies from one buffer to another. */ -void +int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; + int dirty; - if (!LIST_EMPTY(&newbp->b_dep)) - panic("softdep_move_dependencies: need merge code"); - wktail = 0; + dirty = 0; + wktail = NULL; ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); + if (wk->wk_type == D_BMSAFEMAP && + bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) + dirty = 1; if (wktail == 0) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else @@ -1002,6 +1245,8 @@ softdep_move_dependencies(oldbp, newbp) wktail = wk; } FREE_LOCK(&lk); + + return (dirty); } /* @@ -1198,23 +1443,22 @@ pagedep_find(pagedephd, ino, lbn, mp, flags, paged * This routine must be called with splbio interrupts blocked. */ static int -pagedep_lookup(ip, lbn, flags, pagedeppp) - struct inode *ip; +pagedep_lookup(mp, ino, lbn, flags, pagedeppp) + struct mount *mp; + ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; - struct mount *mp; int ret; int i; mtx_assert(&lk, MA_OWNED); - mp = ITOV(ip)->v_mount; - pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); + pagedephd = PAGEDEP_HASH(mp, ino, lbn); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp || (flags & DEPALLOC) == 0) return (ret); FREE_LOCK(&lk); @@ -1222,12 +1466,12 @@ static int M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(&lk); - ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); + ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); if (*pagedeppp) { WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } - pagedep->pd_ino = ip->i_number; + pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); @@ -1314,10 +1558,12 @@ inodedep_lookup(mp, inum, flags, inodedeppp) inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; - inodedep->id_buf = NULL; + inodedep->id_bmsafemap = NULL; + inodedep->id_mkdiradd = NULL; LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); + LIST_INIT(&inodedep->id_jaddrefhd); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); @@ -1405,6 +1651,7 @@ softdep_initialize() &pagedep_hash); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); + bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; @@ -1428,6 +1675,7 @@ softdep_uninitialize() hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); + hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); } /* @@ -1457,9 +1705,14 @@ softdep_mount(devvp, mp, fs, cred) MNT_IUNLOCK(mp); ump = VFSTOUFS(mp); LIST_INIT(&ump->softdep_workitem_pending); + LIST_INIT(&ump->softdep_journal_pending); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; + if ((error = journal_mount(mp, fs, cred)) != 0) { + printf("Failed to start journal: %d\n", error); + return (error); + } /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation @@ -1493,7 +1746,1492 @@ softdep_mount(devvp, mp, fs, cred) return (0); } +struct jblocks { + struct jseglst jb_segs; /* TAILQ of current segments. */ + struct jextent *jb_extent; /* Extent array. */ + uint64_t jb_nextseq; /* Next sequence number. */ + uint64_t jb_oldestseq; /* Oldest active sequence number. */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ + int jb_blocks; /* Total disk blocks covered. */ + int jb_free; /* Total disk blocks free. */ +}; +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + + jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); + TAILQ_INIT(&jblocks->jb_segs); + jblocks->jb_avail = 10; + jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + + return (jblocks); +} + +static ufs2_daddr_t +jblocks_alloc(struct jblocks *jblocks, int bytes, int *actual) +{ + ufs2_daddr_t daddr; + struct jextent *jext; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + jblocks->jb_head = 0; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + jblocks->jb_off += freecnt; + jblocks->jb_free -= freecnt; + + return (daddr); +} + +static void +jblocks_free(struct jblocks *jblocks, int bytes) +{ + + if (jblocks->jb_free == 0) + wakeup(jblocks); + jblocks->jb_free += bytes / DEV_BSIZE; +} + +static void +jblocks_destroy(struct jblocks *jblocks) +{ + + if (jblocks->jb_extent) + free(jblocks->jb_extent, M_JBLOCKS); + free(jblocks, M_JBLOCKS); +} + +static void +jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks) +{ + struct jextent *jext; + + jblocks->jb_blocks += blocks; + jblocks->jb_free += blocks; + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + memcpy(jext, jblocks->jb_extent, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent, M_JBLOCKS); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; +} + +static int +journal_mount(mp, fs, cred) + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + struct jblocks *jblocks; + struct vnode *vp; + struct inode *ip; + ufs2_daddr_t blkno; + int bcount; + int error; + int i; + + error = VFS_VGET(mp, fs->fs_sujournal, LK_EXCLUSIVE, &vp); + if (error) + return (error); + ip = VTOI(vp); + if (ip->i_size < 1 * 1024 * 1024) { + error = ENOSPC; + goto out; + } + bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ + jblocks = jblocks_create(); + for (i = 0; i < bcount; i++) { + error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); + if (error) + break; + jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); + } + if (error) + jblocks_destroy(jblocks); + else + VFSTOUFS(mp)->softdep_jblocks = jblocks; +out: + vput(vp); + return (error); +} + /* + * Called when a journal record is ready to be written. Space is allocated + * and the journal entry is created when the journal is flushed to stable + * store. + */ +static void +add_to_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + if ((wk->wk_state & DEPCOMPLETE) == 0) + panic("add_to_journal: Invalid wk state 0x%X", wk->wk_state); + ump = VFSTOUFS(wk->wk_mp); + if (wk->wk_state & ONWORKLIST) + panic("add_to_journal: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); + wk->wk_state |= ONWORKLIST; + if (LIST_EMPTY(&ump->softdep_journal_pending)) + LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); + else + LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); + ump->softdep_journal_tail = wk; + ump->softdep_on_journal += 1; +} + +/* + * Remove an arbitrary item for the journal worklist maintain the tail + * pointer. This happens when a new operation obviates the need to + * journal an old operation. + */ +static void +remove_from_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + mtx_assert(&lk, MA_OWNED); + ump = VFSTOUFS(wk->wk_mp); +#ifdef DEBUG /* XXX Expensive, temporary. */ + { + struct worklist *wkn; + + LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) + if (wkn == wk) + break; + if (wkn == NULL) + panic("remove_from_journal: %p is not in journal", wk); + } +#endif + /* + * We emulate a TAILQ to save space in most structures which do not + * require TAILQ semantics. Here we must update the tail position + * when removing the tail which is not the final entry. + */ + if (ump->softdep_journal_tail == wk) + ump->softdep_journal_tail = + (struct worklist *)wk->wk_list.le_prev; + + WORKLIST_REMOVE(wk); + ump->softdep_on_journal -= 1; +} + +#if 0 +#ifdef DDB +DB_SHOW_COMMAND(softdepsegs, db_show_softdepsegs) +{ + struct jsegdep *jsegdep; + struct jseg *jseg; + int i; + + TAILQ_FOREACH(jseg, &allsegs, js_next) { + db_printf("%p (%ld): %d refs\n", + jseg, jseg->js_seq, jseg->js_refs); + if (jseg->js_refs == 0) + continue; + for (i = 0; i < jseg->js_cnt; i++) { + jsegdep = &jseg->js_deps[i]; + if ((jsegdep->jd_state & ATTACHED) == 0) + continue; + db_printf("\tseg %p allocated to %s @ %d\n", + jsegdep, TYPENAME(jsegdep->jd_type), + jsegdep->jd_line); + } + } +} + +#endif +#endif + +static void +jseg_write(jblocks, jseg, data) + struct jblocks *jblocks; + struct jseg *jseg; + uint8_t *data; +{ + struct jsegrec *rec; + + rec = (struct jsegrec *)data; + rec->jsr_seq = jseg->js_seq; + rec->jsr_oldest = jblocks->jb_oldestseq; + rec->jsr_cnt = jseg->js_cnt; + rec->jsr_crc = 0; +} + +static void +jaddref_write(jaddref, data) + struct jaddref *jaddref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_ADDREF; + rec->jr_ino = jaddref->ja_ino; + rec->jr_parent = jaddref->ja_parent; + rec->jr_nlink = jaddref->ja_nlink; + rec->jr_mode = jaddref->ja_mode; + rec->jr_diroff = jaddref->ja_diroff; +} + +static void +jremref_write(jremref, data) + struct jremref *jremref; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_REMREF; + rec->jr_ino = jremref->jr_ino; + rec->jr_parent = jremref->jr_parent; + rec->jr_nlink = jremref->jr_nlink; + rec->jr_mode = jremref->jr_mode; + rec->jr_diroff = jremref->jr_diroff; +} + +static void +jnewblk_write(jnewblk, data) + struct jnewblk *jnewblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_NEWBLK; + rec->jb_ino = jnewblk->jn_ino; + rec->jb_blkno = jnewblk->jn_blkno; + rec->jb_lbn = jnewblk->jn_lbn; + rec->jb_frags = jnewblk->jn_frags; + rec->jb_oldfrags = jnewblk->jn_oldfrags; +} + +static void +jfreeblk_write(jfreeblk, data) + struct jfreeblk *jfreeblk; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreeblk->jf_ino; + rec->jb_blkno = jfreeblk->jf_blkno; + rec->jb_lbn = jfreeblk->jf_lbn; + rec->jb_frags = jfreeblk->jf_frags; + rec->jb_oldfrags = 0; +} + +static void +jfreefrag_write(jfreefrag, data) + struct jfreefrag *jfreefrag; + uint8_t *data; +{ + struct jblkrec *rec; + + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreefrag->fr_ino; + rec->jb_blkno = jfreefrag->fr_blkno; + rec->jb_lbn = jfreefrag->fr_lbn; + rec->jb_frags = jfreefrag->fr_frags; + rec->jb_oldfrags = 0; +} + +/* + * Flush some journal records to disk. + */ +static void +softdep_process_journal(mp, flags) + struct mount *mp; + int flags; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + struct worklist *wk; + struct jseg *jseg; + struct buf *bp; + uint8_t *data; + struct fs *fs; + int segwritten; + int jrecpb; /* records per fs block. */ + int jrecpf; /* records per fs frag. */ + int size; + int cnt; + int i; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + jblocks = ump->softdep_jblocks; + jrecpb = fs->fs_bsize / JREC_SIZE; + jrecpf = fs->fs_fsize / JREC_SIZE; + segwritten = 0; + while ((cnt = ump->softdep_on_journal) != 0) { + /* + * Create a new segment to hold as many as 'cnt' journal + * entries and add them to the segment. Notice cnt is + * off by one to account for the space required by the + * jsegrec. If we don't have a full block to log skip it + * unless flags == MNT_WAIT and we haven't written anything. + */ + cnt++; + if (cnt < jrecpb && (flags != MNT_WAIT || segwritten != 0)) + return; + if (cnt < jrecpb) + cnt = roundup2(cnt, jrecpf); + else + cnt = jrecpb; + /* + * If there is no space try to clean up some entries. + */ + while (jblocks->jb_free == 0) { + if (flags != MNT_WAIT) + return; + /* + * Items in the worklist should release journal + * space when they free storage. Process them + * until we have enough space to proceed. If + * we can't process an item we need to flush more + * bufs and wait. + */ +#if 0 /* + * XXX Currently this deadlocks because bufs are + * held locked while waiting on journal writes in + * truncate and softdep_update_inodeblock(). + */ + printf("softdep: Out of journal space!\n"); + if (process_worklist_item(mp, LK_NOWAIT) != -1) + continue; +#endif + printf("softdep: Out of journal space, no progress."); + softdep_speedup(); + msleep(jblocks, &lk, PRIBIO, "jblocks", 0); + } + FREE_LOCK(&lk); + cnt--; /* Temporarily forget the segment record. */ + size = sizeof(*jseg) + (sizeof(struct jsegdep) * cnt); + jseg = malloc(size, M_JSEG, M_SOFTDEP_FLAGS); + workitem_alloc(&jseg->js_list, D_JSEG, mp); + LIST_INIT(&jseg->js_entries); + jseg->js_refs = 1; /* Self reference. */ + jseg->js_jblocks = jblocks; + for (i = 0; i < cnt; i++) { + jseg->js_deps[i].jd_list.wk_mp = mp; + jseg->js_deps[i].jd_list.wk_type = D_JSEGDEP; + jseg->js_deps[i].jd_list.wk_state = 0; + jseg->js_deps[i].jd_seg = jseg; + } + size = fragroundup(fs, (cnt+1) * JREC_SIZE); + bp = geteblk(size, 0); + ACQUIRE_LOCK(&lk); + /* + * If there was a race while we were allocating the block + * and jseg the entry we care about was likely written. + * We bail out in both the WAIT and NOWAIT case and assume + * the caller will loop if the entry it cares about is + * not written. + */ + if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) { + bp->b_flags |= B_INVAL | B_NOCACHE; + WORKITEM_FREE(jseg, D_JSEG); + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); + return; + } + /* + * XXX If we didn't preallocate jsegdeps we could go up to + * the size available in the block. + */ + cnt = MIN(ump->softdep_on_journal, cnt); + size = fragroundup(fs, (cnt + 1) * JREC_SIZE); + /* + * Allocate a disk block for this journal data and account + * for truncation of the requested size if enough contiguous + * space was not available. + */ + bp->b_blkno = bp->b_lblkno = jblocks_alloc(jblocks, size, + &size); + bp->b_offset = bp->b_blkno * DEV_BSIZE; + bp->b_bcount = size; + bp->b_bufobj = &ump->um_devvp->v_bufobj; + bp->b_flags &= ~B_INVAL; + /* + * Initialize our jseg with as many as cnt - 1 records. + * Assign the next sequence number to it and link it + * in-order. + */ + cnt = MIN(ump->softdep_on_journal, (size / JREC_SIZE) - 1); + jseg->js_buf = bp; + jseg->js_cnt = cnt; + jseg->js_size = size; + jseg->js_seq = jblocks->jb_nextseq++; + if (TAILQ_EMPTY(&jblocks->jb_segs)) + jblocks->jb_oldestseq = jseg->js_seq; + TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); + /* + * Start filling in records from the pending list. + */ + data = bp->b_data; + jseg_write(jblocks, jseg, data); + data += JREC_SIZE; + while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) + != NULL) { + remove_from_journal(wk); + wk->wk_state |= IOSTARTED; + WORKLIST_INSERT(&jseg->js_entries, wk); + switch (wk->wk_type) { + case D_JADDREF: + jaddref_write(WK_JADDREF(wk), data); + break; + case D_JREMREF: + jremref_write(WK_JREMREF(wk), data); + break; + case D_JNEWBLK: + jnewblk_write(WK_JNEWBLK(wk), data); + break; + case D_JFREEBLK: + jfreeblk_write(WK_JFREEBLK(wk), data); + break; + case D_JFREEFRAG: + jfreefrag_write(WK_JFREEFRAG(wk), data); + break; + default: + panic("process_journal: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + data += JREC_SIZE; + if (--cnt == 0) + break; + } + /* + * Write this one buffer and continue. + */ +#if 1 + WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); + FREE_LOCK(&lk); + BO_LOCK(bp->b_bufobj); + bgetvp(ump->um_devvp, bp); + BO_UNLOCK(bp->b_bufobj); + /* + * XXX bawrite? Completion ordering? We may permit writing + * the journal out of order but then we must complete + * those segments in order by delaying handle_written_jseg + * until all predecessors have completed. + */ + bwrite(bp); + ACQUIRE_LOCK(&lk); +#else + handle_written_jseg(jseg, bp); + FREE_LOCK(&lk); + brelse(bp); + ACQUIRE_LOCK(&lk); +#endif + segwritten++; + } +} + +/* + * Complete a jseg write, allowing all dependencies awaiting journal writes + * to proceed. Each journal dependency also attaches a jsegdep to dependent + * structures so that the journal segment can be freed to reclaim space. + */ +static void +handle_written_jseg(jseg, bp) + struct jseg *jseg; + struct buf *bp; +{ + struct worklist *wk; + struct jsegdep *jsegdep; + int waiting; + int i; + + if (jseg->js_refs == 0) + panic("handle_written_jseg: No self-reference on %p", jseg); + /* + * We'll never need this buffer again, set flags so it will be + * discarded. + */ + bp->b_flags |= B_INVAL | B_NOCACHE; + i = 0; + while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { + WORKLIST_REMOVE(wk); + waiting = wk->wk_state & IOWAITING; + wk->wk_state &= ~(IOSTARTED | IOWAITING); + wk->wk_state |= COMPLETE; + KASSERT(i < jseg->js_cnt, + ("handle_written_jseg: overflow %d >= %d", + i, jseg->js_cnt)); + jsegdep = &jseg->js_deps[i++]; +#ifdef DEBUG + if (jsegdep->jd_state != 0) + panic("Invalid jsegdep: %p jseg %p", jsegdep, jseg); + jsegdep->jd_state = ATTACHED; + jsegdep->jd_type = wk->wk_type; + jsegdep->jd_line = __LINE__; +#endif + jseg->js_refs++; + switch (wk->wk_type) { + case D_JADDREF: + handle_written_jaddref(WK_JADDREF(wk), jsegdep); + break; + case D_JREMREF: + handle_written_jremref(WK_JREMREF(wk), jsegdep); + break; + case D_JNEWBLK: + handle_written_jnewblk(WK_JNEWBLK(wk), jsegdep); + break; + case D_JFREEBLK: + handle_written_jfreeblk(WK_JFREEBLK(wk), jsegdep); + break; + case D_JFREEFRAG: + handle_written_jfreefrag(WK_JFREEFRAG(wk), jsegdep); + break; + default: + panic("handle_written_jseg: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + if (waiting) + wakeup(wk); + } + /* Reset the cnt for the real number of jsegdeps handed out. */ + jseg->js_cnt = i; + /* Release the self reference so the structure may be freed. */ + free_jseg(jseg); +} + +/* + * Called once a jremref has made it to stable store. The jremref is marked + * complete and we attempt to free it. Any pagedeps writes sleeping waiting + * for the jremref to complete will be awoken by free_jremref. + */ +static void +handle_written_jremref(jremref, jsegdep) + struct jremref *jremref; + struct jsegdep *jsegdep; +{ + struct dirrem *dirrem; + + dirrem = jremref->jr_dirrem; + jremref->jr_dirrem = NULL; + LIST_REMOVE(jremref, jr_deps); + jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; + WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd) && + (dirrem->dm_state & COMPLETE) != 0) + add_to_worklist(&dirrem->dm_list); + free_jremref(jremref); +} + +/* + * Called once a jaddref has made it to stable store. The dependency is + * marked complete and any dependent structures are added to the inode + * bufwait list to be completed as soon as it is written. If a bitmap + * write depends on this entry we move the inode into the inodedephd + * of the bmsafemap dependency and attempt to remove the jaddref from + * the bmsafemap. + */ +static void +handle_written_jaddref(jaddref, jsegdep) + struct jaddref *jaddref; + struct jsegdep *jsegdep; +{ + struct inodedep *inodedep; + struct diradd *diradd; + struct mkdir *mkdir; + + mkdir = NULL; + diradd = NULL; + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("handle_written_jaddref: Lost inodedep."); + if (jaddref->ja_diradd == NULL) + panic("handle_written_jaddref: No dependency"); + if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { + diradd = jaddref->ja_diradd; + WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); + } else if (jaddref->ja_state & MKDIR_PARENT) { + mkdir = jaddref->ja_mkdir; + WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); + } else if (jaddref->ja_state & MKDIR_BODY) + mkdir = jaddref->ja_mkdir; + else + panic("handle_written_jaddref: Unknown dependency %p", + jaddref->ja_diradd); + jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ + /* + * The mkdir may be waiting on the jaddref to clear before freeing. + */ + if (mkdir) { + KASSERT(mkdir->md_list.wk_type == D_MKDIR, + ("handle_written_jaddref: Incorrect type for mkdir %s", + TYPENAME(mkdir->md_list.wk_type))); + mkdir->md_jaddref = NULL; + diradd = mkdir->md_diradd; + mkdir->md_state |= DEPCOMPLETE; + complete_mkdir(mkdir); + } + WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list); + if (jaddref->ja_state & NEWBLOCK) { + LIST_REMOVE(jaddref, ja_inodeps); + jaddref->ja_state &= ~ONDEPLIST; + inodedep->id_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, + inodedep, id_deps); + } + free_jaddref(jaddref); +} + +/* + * Called once a jnewblk journal is written. The allocdirect or allocindir + * is placed in the bmsafemap to await notification of a written bitmap. + */ +static void +handle_written_jnewblk(jnewblk, jsegdep) + struct jnewblk *jnewblk; + struct jsegdep *jsegdep; +{ + struct bmsafemap *bmsafemap; + struct allocdirect *adp; + struct allocindir *aip; + + if (jnewblk->jn_dep == NULL) + panic("handle_written_jnewblk: No dependency for the segdep."); + jsegdep->jd_type = jnewblk->jn_dep->wk_type; + if (jnewblk->jn_dep->wk_type == D_ALLOCDIRECT) { + adp = WK_ALLOCDIRECT(jnewblk->jn_dep); + adp->ad_jnewblk = NULL; + WORKLIST_INSERT(&adp->ad_jwork, &jsegdep->jd_list); + bmsafemap = adp->ad_bmsafemap; + adp->ad_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); + } else if (jnewblk->jn_dep->wk_type == D_ALLOCINDIR) { + aip = WK_ALLOCINDIR(jnewblk->jn_dep); + aip->ai_jnewblk = NULL; + WORKLIST_INSERT(&aip->ai_jwork, &jsegdep->jd_list); + bmsafemap = aip->ai_bmsafemap; + aip->ai_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, aip, ai_deps); + } else + panic("handle_written_jnewblk: Unexpected type %s", + TYPENAME(jnewblk->jn_dep->wk_type)); + jnewblk->jn_dep = NULL; + free_jnewblk(jnewblk); +} + +static void +cancel_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + struct freefrag *freefrag; + + freefrag = jfreefrag->fr_freefrag; + jfreefrag->fr_freefrag = NULL; + freefrag->ff_jfreefrag = NULL; + free_jfreefrag(jfreefrag); + freefrag->ff_state |= DEPCOMPLETE; +} + +/* + * Free a jfreefrag when the parent freefrag is rendered obsolete. + */ +static void +free_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + + if (jfreefrag->fr_state & IOSTARTED) + WORKLIST_REMOVE(&jfreefrag->fr_list); + else if (jfreefrag->fr_state & ONWORKLIST) + remove_from_journal(&jfreefrag->fr_list); + if (jfreefrag->fr_freefrag != NULL) + panic("free_jfreefrag: Still attached to a freefrag."); + WORKITEM_FREE(jfreefrag, D_JFREEFRAG); +} + +/* + * Called when the journal write for a jfreefrag completes. The parent + * freefrag is added to the worklist if this completes its dependencies. + */ +static void +handle_written_jfreefrag(jfreefrag, jsegdep) + struct jfreefrag *jfreefrag; + struct jsegdep *jsegdep; +{ + struct freefrag *freefrag; + + freefrag = jfreefrag->fr_freefrag; + if (freefrag == NULL) + panic("handle_written_jfreefrag: No freefrag."); + freefrag->ff_state |= DEPCOMPLETE; + freefrag->ff_jfreefrag = NULL; + WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list); + jfreefrag->fr_freefrag = NULL; + free_jfreefrag(jfreefrag); +} + +/* + * Called when the journal write for a jfreeblk completes. The jfreeblk + * is removed from the freeblks list of pending journal writes and the + * jsegdep is moved to the freeblks jwork to be completed when all blocks + * have been reclaimed. + */ +static void +handle_written_jfreeblk(jfreeblk, jsegdep) + struct jfreeblk *jfreeblk; + struct jsegdep *jsegdep; +{ + struct freeblks *freeblks; + + freeblks = jfreeblk->jf_freeblks; + LIST_REMOVE(jfreeblk, jf_deps); + WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); + + /* + * If the freeblks is all journaled, we can add it to the worklist. + */ + if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) && + (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) { + /* Remove from the b_dep that is waiting on this write. */ + if (freeblks->fb_state & ONWORKLIST) + WORKLIST_REMOVE(&freeblks->fb_list); + add_to_worklist(&freeblks->fb_list); + } + + free_jfreeblk(jfreeblk); +} + +/* + * Allocate a new jremref that tracks the removal of ip from dp with the + * directory entry offset of diroff. Mark the entry as ATTACHED and + * DEPCOMPLETE as we have all the information required for the journal write + * and the directory has already been removed from the buffer. The caller + * is responsible for linking the jremref into the pagedep and adding it + * to the journal to write. The MKDIR_PARENT flag is set if we're doing + * a DOTDOT addition so handle_workitem_remove() can properly assign + * the jsegdep when we're done. + */ +static struct jremref * +newjremref(dirrem, dp, ip, diroff) + struct dirrem *dirrem; + struct inode *dp; + struct inode *ip; + off_t diroff; +{ + struct jremref *jremref; + + jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); + jremref->jr_state = ATTACHED | DEPCOMPLETE; + jremref->jr_dirrem = dirrem; + jremref->jr_diroff = diroff; + jremref->jr_ino = ip->i_number; + jremref->jr_parent = dp->i_number; + jremref->jr_mode = ip->i_mode; + jremref->jr_nlink = ip->i_nlink; + + return (jremref); +} + +/* + * Allocate a new jaddref to track the addition of ino to dp at diroff. The + * directory offset may not be known until later. The caller is responsible + * adding the entry to the journal when this information is available. nlink + * should be the link count prior to the addition and mode is only required + * to have the correct FMT. + */ +static struct jaddref * +newjaddref(dp, ino, diroff, nlink, mode) + struct inode *dp; + ino_t ino; + off_t diroff; + int16_t nlink; + uint16_t mode; +{ + struct jaddref *jaddref; + + jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); + jaddref->ja_mkdir = NULL; + jaddref->ja_state = ATTACHED | DEPCOMPLETE | ONDEPLIST; + jaddref->ja_diroff = diroff; + jaddref->ja_ino = ino; + jaddref->ja_parent = dp->i_number; + jaddref->ja_mode = mode; + jaddref->ja_nlink = nlink; + + return (jaddref); +} + +/* + * Create a new free dependency for a freework. The caller is responsible + * for adjusting the reference count when it has the lock held. The freedep + * will track an outstanding bitmap write that will ultimately clear the + * freework to continue. + */ +static struct freedep * +newfreedep(struct freework *freework) +{ + struct freedep *freedep; + + freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); + freedep->fd_freework = freework; + + return (freedep); +} + +/* + * Free a freedep structure once the buffer it is linked to is written. If + * this is the last reference to the freework schedule it for completion. + */ +static void +free_freedep(freedep) + struct freedep *freedep; +{ + + if (--freedep->fd_freework->fw_ref == 0) + add_to_worklist(&freedep->fd_freework->fw_list); + WORKITEM_FREE(freedep, D_FREEDEP); +} + +/* + * Allocate a new freework structure that may be a level in an indirect + * when parent is not NULL or a top level block when it is. The top level + * freework structures are allocated without lk held and before the freeblks + * is visible outside of softdep_setup_freeblocks(). + */ +static struct freework * +newfreework(freeblks, parent, lbn, nb, frags, journal) + struct freeblks *freeblks; + struct freework *parent; + ufs_lbn_t lbn; + ufs2_daddr_t nb; + int frags; + int journal; +{ + struct freework *freework; + + freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); + workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); + freework->fw_freeblks = freeblks; + freework->fw_parent = parent; + freework->fw_lbn = lbn; + freework->fw_blkno = nb; + freework->fw_frags = frags; + freework->fw_ref = 0; + LIST_INIT(&freework->fw_jwork); + + if (parent == NULL) { + WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd, + &freework->fw_list); + freeblks->fb_ref++; + } + if (journal) + newjfreeblk(freeblks, lbn, nb, frags); + + return (freework); +} + +/* + * Allocate a new jfreeblk to journal top level block pointer when truncating + * a file. The caller must add this to the worklist when lk is held. + */ +static struct jfreeblk * +newjfreeblk(freeblks, lbn, blkno, frags) + struct freeblks *freeblks; + ufs_lbn_t lbn; + ufs2_daddr_t blkno; + int frags; +{ + struct jfreeblk *jfreeblk; + + jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp); + jfreeblk->jf_state = ATTACHED | DEPCOMPLETE; + jfreeblk->jf_ino = freeblks->fb_previousinum; + jfreeblk->jf_lbn = lbn; + jfreeblk->jf_blkno = blkno; + jfreeblk->jf_frags = frags; + jfreeblk->jf_freeblks = freeblks; + LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps); + + return (jfreeblk); +} + +/* + * Cancel a jaddref either before it has been written or while it is being + * written. This happens when a link is removed before the add reaches + * the disk. The jaddref dependency is kept linked into the bmsafemap + * and inode to prevent the link count or bitmap from reaching the disk + * until handle_workitem_remove() re-adjusts the counts and bitmaps as + * required. + */ +static void +cancel_jaddref(jaddref, inodedep, wkhd) + struct jaddref *jaddref; + struct inodedep *inodedep; + struct workhead *wkhd; +{ + struct jaddref *jaddrefn; + + /* + * If we're canceling a new bitmap we have to search for another ref + * to move into the bmsafemap dep. This might be better expressed + * with another structure. + */ + if (jaddref->ja_state & NEWBLOCK && inodedep != NULL) { + LIST_FOREACH(jaddrefn, &inodedep->id_jaddrefhd, ja_inodeps) { + if (jaddrefn == jaddref) + continue; + if ((jaddrefn->ja_state & GOINGAWAY) != 0) + continue; + jaddrefn->ja_state &= ~(ATTACHED | UNDONE); + jaddrefn->ja_state |= + jaddref->ja_state & (ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state |= ATTACHED; + LIST_REMOVE(jaddref, ja_bmdeps); + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, + jaddrefn, ja_bmdeps); + } + } + if (jaddref->ja_state & IOWAITING) { + jaddref->ja_state &= ~IOWAITING; + wakeup(&jaddref->ja_list); + } + jaddref->ja_state |= GOINGAWAY; + jaddref->ja_mkdir = NULL; + if (jaddref->ja_state & IOSTARTED) { + jaddref->ja_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jaddref->ja_list); + } else + remove_from_journal(&jaddref->ja_list); + if (wkhd == NULL) + return; + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jaddref->ja_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jaddref->ja_list); +} + +/* + * Attempt to free a jaddref structure when some work completes. This + * should only succeed once the entry is written and all dependencies have + * been notified. + */ +static void +free_jaddref(jaddref) + struct jaddref *jaddref; +{ + + if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + if (jaddref->ja_state & NEWBLOCK) + LIST_REMOVE(jaddref, ja_bmdeps); + if (jaddref->ja_state & ONDEPLIST) + LIST_REMOVE(jaddref, ja_inodeps); + if (jaddref->ja_state & (IOSTARTED | ONWORKLIST)) + panic("free_jaddref: Bad state %p(0x%X)", + jaddref, jaddref->ja_state); + if (jaddref->ja_mkdir != NULL) + panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); + WORKITEM_FREE(jaddref, D_JADDREF); +} + +/* + * Free a jremref structure once all dependencies are complete. + */ +static void +free_jremref(jremref) + struct jremref *jremref; +{ + + if ((jremref->jr_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + /* If we were never written remove ourselves from the worklist. */ + if (jremref->jr_state & IOSTARTED) + panic("free_jremref: IO still pending"); + WORKITEM_FREE(jremref, D_JREMREF); +} + +/* + * Free a jnewblk structure. + */ +static void +free_jnewblk(jnewblk) + struct jnewblk *jnewblk; +{ + + if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(jnewblk, jn_deps); + if (jnewblk->jn_dep != NULL) + panic("free_jnewblk: Dependency still attached."); + WORKITEM_FREE(jnewblk, D_JNEWBLK); +} + +/* + * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk + * is kept linked into the bmsafemap until the free completes, thus + * preventing the modified state from ever reaching disk. The free + * routine must pass this structure via ffs_blkfree() to + * softdep_setup_freeblks() so there is no race in releasing the space. + */ +static void +cancel_jnewblk(jnewblk, wkhd) + struct jnewblk *jnewblk; + struct workhead *wkhd; +{ + + if (jnewblk->jn_state & IOWAITING) { + jnewblk->jn_state &= ~IOWAITING; + wakeup(&jnewblk->jn_list); + } + jnewblk->jn_dep = NULL; + jnewblk->jn_state |= GOINGAWAY; + if (jnewblk->jn_state & IOSTARTED) { + jnewblk->jn_state &= ~IOSTARTED; + WORKLIST_REMOVE(&jnewblk->jn_list); + } else + remove_from_journal(&jnewblk->jn_list); + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jnewblk->jn_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jnewblk->jn_list); +} + +static void +free_jfreeblk(jfreeblk) + struct jfreeblk *jfreeblk; +{ + + WORKITEM_FREE(jfreeblk, D_JFREEBLK); +} + +/* + * Release one reference to a jseg and free it if the count reaches 0. This + * should eventually reclaim journal space as well. + */ +static void +free_jseg(jseg) + struct jseg *jseg; +{ + struct jblocks *jblocks; + struct jsegdep *jsegdep; + + + KASSERT(jseg->js_refs > 0, + ("free_jseg: Invalid refcnt %d", jseg->js_refs)); + if (--jseg->js_refs != 0) + return; +#ifdef DEBUG + { + int errors; + int i; + errors = 0; + for (i = 0; i < jseg->js_cnt; i++) { + jsegdep = &jseg->js_deps[i]; + if (jsegdep->jd_state != 0) { + printf("free_jseg: %p(0x%X) allocated " + "to %s:%d\n", jsegdep, jsegdep->jd_state, + TYPENAME(jsegdep->jd_type), + jsegdep->jd_line); + errors = 1; + } + } + if (errors) + panic("errors"); + } +#endif + jblocks = jseg->js_jblocks; + /* + * Free only those jsegs which have none allocated before them to + * preserve the journal space ordering. + */ + while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { + jblocks->jb_oldestseq = jseg->js_seq; + if (jseg->js_refs != 0) + break; + TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); + jblocks_free(jblocks, jseg->js_size); + KASSERT(LIST_EMPTY(&jseg->js_entries), + ("free_jseg: Freed jseg has valid entries.")); + WORKITEM_FREE(jseg, D_JSEG); + } +} + +/* + * Release a jsegdep and decrement the jseg count. + */ +static void +free_jsegdep(jsegdep) + struct jsegdep *jsegdep; +{ + +#ifdef DEBUG + if (jsegdep->jd_state != ATTACHED) + panic("free_jsegdep: Illegal state 0x%X.", jsegdep->jd_state); + jsegdep->jd_state &= ~ATTACHED; +#endif + free_jseg(jsegdep->jd_seg); +} + +/* + * Wait for a journal item to make it to disk. Initiate journal processing + * if required. + */ +static void +jwait(wk) + struct worklist *wk; +{ + + /* + * If IO has not started we process the journal. We can't mark the + * worklist item as IOWAITING because we drop the lock while + * processing the journal and the worklist entry may be freed after + * this point. The caller may call back in and re-issue the request. + */ + if ((wk->wk_state & IOSTARTED) == 0) { + softdep_process_journal(wk->wk_mp, MNT_WAIT); + return; + } + wk->wk_state |= IOWAITING; + msleep(wk, &lk, PRIBIO, "jwait", 0); +} + +/* + * Lookup an inodedep based on an inode pointer and set the nlinkdelta as + * appropriate. This is a convenience function to reduce duplicate code + * for the setup and revert functions below. + */ +static struct inodedep * +inodedep_lookup_ip(ip) + struct inode *ip; +{ + struct inodedep *inodedep; + + KASSERT(ip->i_nlink >= ip->i_effnlink, + ("inodedep_lookup_ip: bad delta")); + (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, + DEPALLOC, &inodedep); + inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; + + return (inodedep); +} + +/* + * Called prior to creating a new inode and linking it to a directory. The + * jaddref structure must already be allocated by softdep_setup_inomapdep + * and it is discovered here so we can initialize the mode and update + * nlinkdelta. + */ +void +softdep_setup_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + + KASSERT(ip->i_nlink == 1, + ("softdep_setup_create: Invalid link count.")); + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + KASSERT(jaddref != NULL && + jaddref->ja_parent == dp->i_number && jaddref->ja_nlink == 0, + ("softdep_setup_create: No addref structure present.")); + jaddref->ja_mode = ip->i_mode; + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track the addition of a DOTDOT link when + * we are reparenting an inode as part of a rename. This jaddref will be + * found by softdep_setup_directory_change. + */ +void +softdep_setup_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + + /* + * We don't set MKDIR_PARENT as this is not tied to a mkdir and + * is used as a normal link would be. + */ + jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_nlink - 1, + dp->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + LIST_INSERT_HEAD(&inodedep->id_jaddrefhd, jaddref, ja_inodeps); + FREE_LOCK(&lk); +} + +/* + * Create a jaddref structure to track a new link to an inode. The directory + * offset is not known until softdep_setup_directory_add or + * softdep_setup_directory_change. + */ +void +softdep_setup_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + + jaddref = newjaddref(dp, ip->i_number, 0, ip->i_nlink - 1, ip->i_mode); + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + LIST_INSERT_HEAD(&inodedep->id_jaddrefhd, jaddref, ja_inodeps); + FREE_LOCK(&lk); +} + +/* + * Called to create the jaddref structures to track . and .. references as + * well as lookup and further initialize the incomplete jaddref created + * by softdep_setup_inomapdep when the inode was allocated.A + */ +void +softdep_setup_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *dotdotaddref; + struct jaddref *dotaddref; + struct jaddref *jaddref; + + dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, ip->i_mode); + dotaddref->ja_state |= MKDIR_BODY; + dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_nlink - 1, dp->i_mode); + dotdotaddref->ja_state |= MKDIR_PARENT; + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + KASSERT(jaddref != NULL, + ("softdep_setup_mkdir: No addref structure present.")); + KASSERT(jaddref->ja_parent == dp->i_number && jaddref->ja_nlink == 0, + ("softdep_setup_mkdir: bad parent/link %d/%d", + jaddref->ja_parent, jaddref->ja_nlink)); + jaddref->ja_mode = ip->i_mode; + LIST_INSERT_AFTER(jaddref, dotaddref, ja_inodeps); + inodedep = inodedep_lookup_ip(dp); + LIST_INSERT_HEAD(&inodedep->id_jaddrefhd, dotdotaddref, ja_inodeps); + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlinking a directory. + */ +void +softdep_setup_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + FREE_LOCK(&lk); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlink. + */ +void +softdep_setup_unlink(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed non-directory + * creation. + */ +void +softdep_revert_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + jaddref->ja_state |= COMPLETE | DEPCOMPLETE; + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_create: addref parent mismatch")); + free_jaddref(jaddref); + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed dotdot link + * creation. + */ +void +softdep_revert_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + jaddref->ja_state |= COMPLETE | DEPCOMPLETE; + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_dotdot_link: addref parent mismatch")); + free_jaddref(jaddref); + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed link + * addition. + */ +void +softdep_revert_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(ip); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + jaddref->ja_state |= COMPLETE | DEPCOMPLETE; + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_link: addref parent mismatch")); + free_jaddref(jaddref); + FREE_LOCK(&lk); +} + +/* + * Called to release the journal structures created by a failed mkdir + * attempt. + */ +void +softdep_revert_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + + ACQUIRE_LOCK(&lk); + inodedep = inodedep_lookup_ip(dp); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + jaddref->ja_state |= COMPLETE | DEPCOMPLETE; + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dotdot addref parent mismatch")); + free_jaddref(jaddref); + inodedep = inodedep_lookup_ip(ip); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + jaddref->ja_state |= COMPLETE | DEPCOMPLETE; + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_mkdir: addref parent mismatch")); + free_jaddref(jaddref); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + jaddref->ja_state |= COMPLETE | DEPCOMPLETE; + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dot addref parent mismatch")); + free_jaddref(jaddref); + FREE_LOCK(&lk); +} + +/* + * Called to correct nlinkdelta after a failed rmdir. + */ +void +softdep_revert_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + FREE_LOCK(&lk); +} + +/* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem @@ -1536,8 +3274,16 @@ softdep_setup_inomapdep(bp, ip, newinum) { struct inodedep *inodedep; struct bmsafemap *bmsafemap; + struct jaddref *jaddref; /* + * Allocate the journal reference add structure so that the bitmap + * can be dependent on it. + */ + jaddref = newjaddref(ip, newinum, 0, 0, 0); + jaddref->ja_state |= NEWBLOCK; + + /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding @@ -1546,12 +3292,14 @@ softdep_setup_inomapdep(bp, ip, newinum) ACQUIRE_LOCK(&lk); if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY, &inodedep))) - panic("softdep_setup_inomapdep: dependency for new inode " - "already exists"); - inodedep->id_buf = bp; + panic("softdep_setup_inomapdep: dependency %p for new" + "inode already exists", inodedep); + bmsafemap = bmsafemap_lookup(UFSTOVFS(ip->i_ump), bp, + ino_to_cg(ip->i_ump->um_fs, newinum)); + LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); + inodedep->id_bmsafemap = bmsafemap; inodedep->id_state &= ~DEPCOMPLETE; - bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp); - LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); + LIST_INSERT_HEAD(&inodedep->id_jaddrefhd, jaddref, ja_inodeps); FREE_LOCK(&lk); } @@ -1560,13 +3308,16 @@ softdep_setup_inomapdep(bp, ip, newinum) * allocate block or fragment. */ void -softdep_setup_blkmapdep(bp, mp, newblkno) +softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; /* buffer for cylgroup block with block map */ struct mount *mp; /* filesystem doing allocation */ ufs2_daddr_t newblkno; /* number of newly allocated block */ + int frags; /* Number of fragments. */ + int oldfrags; /* Previous number of fragments for extend. */ { struct newblk *newblk; struct bmsafemap *bmsafemap; + struct jnewblk *jnewblk; struct fs *fs; fs = VFSTOUFS(mp)->um_fs; @@ -1575,14 +3326,70 @@ void * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ + jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); + jnewblk->jn_state = ATTACHED; + jnewblk->jn_dep = NULL; + jnewblk->jn_blkno = newblkno; + jnewblk->jn_frags = frags; + jnewblk->jn_oldfrags = oldfrags; +#ifdef DEBUG + { + struct cg *cgp; + uint8_t *blksfree; + long bno; + int i; + + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { + if (isset(blksfree, bno + i)) + panic("softdep_setup_blkmapdep: " + "free fragment %d from %d-%d " + "state 0x%X dep %p", i, + jnewblk->jn_oldfrags, + jnewblk->jn_frags, + jnewblk->jn_state, + jnewblk->jn_dep); + } + } +#endif ACQUIRE_LOCK(&lk); if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); - newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp); - LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, + dtog(fs, newblkno)); + jnewblk->jn_newblk = newblk; + newblk->nb_jnewblk = jnewblk; + LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); FREE_LOCK(&lk); } +#define BMSAFEMAP_HASH(fs, cg) \ + (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) + +static int +bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) + struct bmsafemap_hashhead *bmsafemaphd; + struct mount *mp; + int cg; + struct bmsafemap **bmsafemapp; +{ + struct bmsafemap *bmsafemap; + + LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) + if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) + break; + if (bmsafemap) { + *bmsafemapp = bmsafemap; + return (1); + } + *bmsafemapp = NULL; + + return (0); +} + /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when @@ -1590,27 +3397,47 @@ void * splbio interrupts blocked. */ static struct bmsafemap * -bmsafemap_lookup(mp, bp) +bmsafemap_lookup(mp, bp, cg) struct mount *mp; struct buf *bp; + int cg; { - struct bmsafemap *bmsafemap; + struct bmsafemap_hashhead *bmsafemaphd; + struct bmsafemap *bmsafemap, *collision; struct worklist *wk; + struct fs *fs; mtx_assert(&lk, MA_OWNED); - LIST_FOREACH(wk, &bp->b_dep, wk_list) - if (wk->wk_type == D_BMSAFEMAP) - return (WK_BMSAFEMAP(wk)); + if (bp) + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_BMSAFEMAP) + return (WK_BMSAFEMAP(wk)); + fs = VFSTOUFS(mp)->um_fs; + bmsafemaphd = BMSAFEMAP_HASH(fs, cg); + if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) + return (bmsafemap); FREE_LOCK(&lk); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_allocdirecthd); + LIST_INIT(&bmsafemap->sm_allocdirectwr); LIST_INIT(&bmsafemap->sm_allocindirhd); + LIST_INIT(&bmsafemap->sm_allocindirwr); LIST_INIT(&bmsafemap->sm_inodedephd); + LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); + LIST_INIT(&bmsafemap->sm_newblkwr); + LIST_INIT(&bmsafemap->sm_jaddrefhd); + LIST_INIT(&bmsafemap->sm_jnewblkhd); ACQUIRE_LOCK(&lk); + if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (collision); + } + bmsafemap->sm_cg = cg; + LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } @@ -1645,8 +3472,10 @@ static struct bmsafemap * * unreferenced fragments. */ void -softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) +softdep_setup_allocdirect(ip, off, lbn, newblkno, oldblkno, newsize, oldsize, + bp) struct inode *ip; /* inode to which block is being added */ + int off; /* Offset from start of di_db. */ ufs_lbn_t lbn; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ @@ -1657,8 +3486,10 @@ void struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; + struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; + struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; @@ -1666,20 +3497,23 @@ void adp = malloc(sizeof(struct allocdirect), M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); - adp->ad_lbn = lbn; + adp->ad_offset = off; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED; + LIST_INIT(&adp->ad_indirdeps); LIST_INIT(&adp->ad_newdirblk); + LIST_INIT(&adp->ad_jwork); + adp->ad_jnewblk = NULL; if (newblkno == oldblkno) adp->ad_freefrag = NULL; else - adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); ACQUIRE_LOCK(&lk); - if (lbn >= NDADDR) { + if (off >= NDADDR) { /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); @@ -1692,18 +3526,29 @@ void * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, off, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_dep = &adp->ad_list; + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + jnewblk->jn_state |= DEPCOMPLETE; + add_to_journal(&jnewblk->jn_list); + bmsafemap = newblk->nb_bmsafemap; + adp->ad_bmsafemap = bmsafemap; + adp->ad_jnewblk = jnewblk; + } else if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; + adp->ad_bmsafemap = NULL; } else { bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; + adp->ad_bmsafemap = bmsafemap; LIST_REMOVE(newblk, nb_deps); + adp->ad_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); @@ -1726,24 +3571,50 @@ void */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); - FREE_LOCK(&lk); - return; + goto out; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); +out: + /* + * If we have a freefrag and the inode has never been written we + * can simply free it here. Otherwise add the freefrag to the + * journal. + */ + if ((freefrag = adp->ad_freefrag) != NULL) { + struct jfreefrag *jfreefrag; + + jfreefrag = freefrag->ff_jfreefrag; +#if 0 + /* + * XXX We have to wait until we merge with the other inode + * list to see eliminate that dependency. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + if (jfreefrag) + cancel_jfreefrag(jfreefrag); + /* XXX Could be handle_workitem_freefrag ? */ + add_to_worklist(&freefrag->ff_list); + adp->ad_oldblkno = 0; + adp->ad_freefrag = NULL; + } else +#endif + if (jfreefrag && (jfreefrag->fr_state & ONWORKLIST) == 0) + add_to_journal(&jfreefrag->fr_list); + } FREE_LOCK(&lk); } @@ -1761,10 +3632,11 @@ allocdirect_merge(adphead, newadp, oldadp) struct freefrag *freefrag; struct newdirblk *newdirblk; + freefrag = NULL; mtx_assert(&lk, MA_OWNED); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || - newadp->ad_lbn >= NDADDR) + newadp->ad_offset >= NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, @@ -1788,8 +3660,8 @@ allocdirect_merge(adphead, newadp, oldadp) * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ + freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { - freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } @@ -1804,18 +3676,66 @@ allocdirect_merge(adphead, newadp, oldadp) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); } - free_allocdirect(adphead, oldadp, 0); + TAILQ_REMOVE(adphead, oldadp, ad_next); + /* + * We need to move any journal dependencies over to the freefrag + * that releases this block if it exists. Otherwise we are + * extending an existing block and we'll wait until that is + * complete to release the journal space and extend the + * new journal to cover this old space as well. + */ + if (freefrag == NULL) { + struct jnewblk *jnewblk; + struct jnewblk *njnewblk; + + if (oldadp->ad_newblkno != newadp->ad_newblkno) + panic("allocdirect_merge: %jd != %jd", + oldadp->ad_newblkno, newadp->ad_newblkno); + jnewblk = oldadp->ad_jnewblk; + allocdirect_jwork(oldadp, &newadp->ad_jwork, D_ALLOCDIRECT); + free_allocdirect(oldadp); + /* + * We have an unwritten jnewblk, we need to merge the + * frag bits with our own. The newer adp's journal can not + * be written prior to the old one so no need to check for + * it here. + */ + if (jnewblk) { + njnewblk = newadp->ad_jnewblk; + if (jnewblk->jn_state & UNDONE) { + njnewblk->jn_state |= UNDONE | NEWBLOCK; + njnewblk->jn_state &= ~ATTACHED; + jnewblk->jn_state &= ~UNDONE; + } + njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; + WORKLIST_REMOVE(&jnewblk->jn_list); + jnewblk->jn_state |= ATTACHED | COMPLETE; + free_jnewblk(jnewblk); + } + } else { + /* + * We can skip journaling for this freefrag and just complete + * any pending journal work for the allocdirect that is being + * removed after the freefrag completes. + */ + if (freefrag->ff_jfreefrag) + cancel_jfreefrag(freefrag->ff_jfreefrag); + allocdirect_jwork(oldadp, &freefrag->ff_jwork, D_FREEFRAG); + free_allocdirect(oldadp); + } } /* * Allocate a new freefrag structure if needed. */ static struct freefrag * -newfreefrag(ip, blkno, size) +newfreefrag(ip, blkno, size, lbn) struct inode *ip; ufs2_daddr_t blkno; long size; + ufs_lbn_t lbn; { + struct jfreefrag *jfreefrag; struct freefrag *freefrag; struct fs *fs; @@ -1825,11 +3745,26 @@ static struct freefrag * if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); freefrag = malloc(sizeof(struct freefrag), - M_FREEFRAG, M_SOFTDEP_FLAGS); + M_FREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); + freefrag->ff_state = ATTACHED; + LIST_INIT(&freefrag->ff_jwork); freefrag->ff_inum = ip->i_number; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; + + jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, + M_SOFTDEP_FLAGS); + workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); + jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; + jfreefrag->fr_ino = ip->i_number; + jfreefrag->fr_lbn = lbn; + jfreefrag->fr_blkno = blkno; + jfreefrag->fr_frags = numfrags(fs, size); + + freefrag->ff_jfreefrag = jfreefrag; + jfreefrag->fr_freefrag = freefrag; + return (freefrag); } @@ -1842,9 +3777,17 @@ handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); + struct workhead wkhd; + /* + * It would be illegal to add new completion items to the + * freefrag after it was schedule to be done so it must be + * safe to modify the list head here. + */ + LIST_INIT(&wkhd); + LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, - freefrag->ff_fragsize, freefrag->ff_inum); + freefrag->ff_fragsize, freefrag->ff_inum, &wkhd); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(&lk); @@ -1868,7 +3811,9 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; + struct freefrag *freefrag; struct inodedep *inodedep; + struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; @@ -1876,17 +3821,20 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno adp = malloc(sizeof(struct allocdirect), M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); - adp->ad_lbn = lbn; + adp->ad_offset = lbn; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED | EXTDATA; + LIST_INIT(&adp->ad_indirdeps); LIST_INIT(&adp->ad_newdirblk); + LIST_INIT(&adp->ad_jwork); + adp->ad_jnewblk = NULL; if (newblkno == oldblkno) adp->ad_freefrag = NULL; else - adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize, -lbn); ACQUIRE_LOCK(&lk); if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) @@ -1895,13 +3843,24 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; - if (newblk->nb_state == DEPCOMPLETE) { + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_dep = &adp->ad_list; + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = -1 - lbn; /* Negative lbns for ext. */ + jnewblk->jn_state |= DEPCOMPLETE; + add_to_journal(&jnewblk->jn_list); + bmsafemap = newblk->nb_bmsafemap; + adp->ad_bmsafemap = bmsafemap; + adp->ad_jnewblk = jnewblk; + /* XXX Only for the !journaling case. */ + } else if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; + adp->ad_bmsafemap = NULL; } else { bmsafemap = newblk->nb_bmsafemap; - adp->ad_buf = bmsafemap->sm_buf; + adp->ad_bmsafemap = bmsafemap; LIST_REMOVE(newblk, nb_deps); + adp->ad_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); @@ -1925,24 +3884,50 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); - if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + if (oldadp == NULL || oldadp->ad_offset <= lbn) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); - if (oldadp != NULL && oldadp->ad_lbn == lbn) + if (oldadp != NULL && oldadp->ad_offset == lbn) allocdirect_merge(adphead, adp, oldadp); - FREE_LOCK(&lk); - return; + goto out; } TAILQ_FOREACH(oldadp, adphead, ad_next) { - if (oldadp->ad_lbn >= lbn) + if (oldadp->ad_offset >= lbn) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); - if (oldadp->ad_lbn == lbn) + if (oldadp->ad_offset == lbn) allocdirect_merge(adphead, adp, oldadp); +out: + /* + * If we have a freefrag and the inode has never been written we + * can simply free it here. Otherwise add the freefrag to the + * journal. + */ + if ((freefrag = adp->ad_freefrag) != NULL) { + struct jfreefrag *jfreefrag; + + jfreefrag = freefrag->ff_jfreefrag; +#if 0 + /* + * XXX We have to wait until we merge with the other inode + * list to see eliminate that dependency. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + if (jfreefrag) + cancel_jfreefrag(jfreefrag); + /* XXX Could be handle_workitem_freefrag ? */ + add_to_worklist(&freefrag->ff_list); + adp->ad_oldblkno = 0; + adp->ad_freefrag = NULL; + } else +#endif + if (jfreefrag && (jfreefrag->fr_state & ONWORKLIST) == 0) + add_to_journal(&jfreefrag->fr_list); + } FREE_LOCK(&lk); } @@ -1975,11 +3960,12 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno * Allocate a new allocindir structure. */ static struct allocindir * -newallocindir(ip, ptrno, newblkno, oldblkno) +newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ + ufs_lbn_t lbn; { struct allocindir *aip; @@ -1990,7 +3976,11 @@ static struct allocindir * aip->ai_offset = ptrno; aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; - aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); + aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); + LIST_INIT(&aip->ai_indirdeps); + LIST_INIT(&aip->ai_newdirblk); + aip->ai_jnewblk = NULL; + LIST_INIT(&aip->ai_jwork); return (aip); } @@ -1999,31 +3989,36 @@ static struct allocindir * * to a newly allocated file page. */ void -softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) +softdep_setup_allocindir_page(ip, bp, indir, lvl, lbn, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ - ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ - int ptrno; /* offset of pointer in indirect block */ + struct indir *indir; /* Indirect block path. */ + int lvl; /* Indirect block level for parent. */ + ufs_lbn_t lbn; /* Logical block number of this block. */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { + struct inodedep *inodedep; struct allocindir *aip; struct pagedep *pagedep; + struct mount *mp; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); - aip = newallocindir(ip, ptrno, newblkno, oldblkno); + aip = newallocindir(ip, indir[lvl].in_off, newblkno, oldblkno, lbn); + mp = UFSTOVFS(ip->i_ump); ACQUIRE_LOCK(&lk); + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && - pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } @@ -2032,45 +4027,118 @@ void * newly allocated indirect block. */ void -softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) +softdep_setup_allocindir_meta(nbp, ip, bp, indir, lvl, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ - int ptrno; /* offset of pointer in indirect block */ + struct indir *indir; /* Indirect block path. */ + int lvl; /* Indirect block level this block. */ ufs2_daddr_t newblkno; /* disk block number being added */ { + struct inodedep *inodedep; struct allocindir *aip; + ufs_lbn_t lbn; + lbn = indir[lvl].in_lbn; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); - aip = newallocindir(ip, ptrno, newblkno, 0); + aip = newallocindir(ip, indir[lvl - 1].in_off, newblkno, 0, lbn); ACQUIRE_LOCK(&lk); + (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, + &inodedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); - setup_allocindir_phase2(bp, ip, aip); + setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(&lk); } +static void +handle_workitem_indirdep(indirdep) + struct indirdep *indirdep; +{ + struct buf *sbp; + + sbp = indirdep->ir_savebp; + ACQUIRE_LOCK(&lk); + free_indirdep(indirdep); + FREE_LOCK(&lk); + sbp->b_flags |= B_INVAL | B_NOCACHE; + brelse(sbp); +} + +static void +indirdep_complete(indirdep) + struct indirdep *indirdep; +{ + struct allocindir *aip; + + LIST_REMOVE(indirdep, ir_next); + indirdep->ir_state &= ~ONDEPLIST; + + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) + free_allocindir(aip, NULL, NULL, 0); + if ((indirdep->ir_state & ATTACHED) && + LIST_EMPTY(&indirdep->ir_deplisthd) && + LIST_EMPTY(&indirdep->ir_donehd) && + LIST_EMPTY(&indirdep->ir_writehd)) { + if (indirdep->ir_state & ONWORKLIST) + WORKLIST_REMOVE(&indirdep->ir_list); + add_to_worklist(&indirdep->ir_list); + } +} + +static void +indirdep_attach(indirdep, bp) + struct indirdep *indirdep; + struct buf *bp; +{ + struct allocdirect *adp; + struct allocindir *aip; + struct worklist *wk; + + WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); + + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + if (wk->wk_type == D_ALLOCDIRECT) { + adp = WK_ALLOCDIRECT(wk); + indirdep->ir_state |= ONDEPLIST; + LIST_INSERT_HEAD(&adp->ad_indirdeps, indirdep, ir_next); + return; + } + if (wk->wk_type == D_ALLOCINDIR) { + aip = WK_ALLOCINDIR(wk); + indirdep->ir_state |= ONDEPLIST; + LIST_INSERT_HEAD(&aip->ai_indirdeps, indirdep, ir_next); + return; + } + } + indirdep->ir_state |= DEPCOMPLETE; + return; +} + /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void -setup_allocindir_phase2(bp, ip, aip) +setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ + struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ + ufs_lbn_t lbn; /* Logical block number for this block. */ { struct worklist *wk; struct indirdep *indirdep, *newindirdep; struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; + struct jnewblk *jnewblk; struct newblk *newblk; ufs2_daddr_t blkno; mtx_assert(&lk, MA_OWNED); if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); - for (indirdep = NULL, newindirdep = NULL; ; ) { + for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; @@ -2079,20 +4147,30 @@ static void } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; - WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); newindirdep = NULL; + indirdep_attach(indirdep, bp); } if (indirdep) { if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, &newblk) == 0) panic("setup_allocindir: lost block"); - if (newblk->nb_state == DEPCOMPLETE) { + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_dep = &aip->ai_list; + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + jnewblk->jn_state |= DEPCOMPLETE; + add_to_journal(&jnewblk->jn_list); + bmsafemap = newblk->nb_bmsafemap; + aip->ai_bmsafemap = bmsafemap; + aip->ai_jnewblk = jnewblk; + } else if (newblk->nb_state == DEPCOMPLETE) { aip->ai_state |= DEPCOMPLETE; - aip->ai_buf = NULL; + aip->ai_bmsafemap = NULL; } else { bmsafemap = newblk->nb_bmsafemap; - aip->ai_buf = bmsafemap->sm_buf; + aip->ai_bmsafemap = bmsafemap; LIST_REMOVE(newblk, nb_deps); + aip->ai_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, aip, ai_deps); } @@ -2102,7 +4180,8 @@ static void /* * Check to see if there is an existing dependency * for this block. If there is, merge the old - * dependency into the new one. + * dependency into the new one. This happens + * as a result of reallocblk only. */ if (aip->ai_oldblkno == 0) oldaip = NULL; @@ -2111,17 +4190,18 @@ static void LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) if (oldaip->ai_offset == aip->ai_offset) break; - freefrag = NULL; - if (oldaip != NULL) { - if (oldaip->ai_newblkno != aip->ai_oldblkno) - panic("setup_allocindir_phase2: blkno"); - aip->ai_oldblkno = oldaip->ai_oldblkno; - freefrag = aip->ai_freefrag; - aip->ai_freefrag = oldaip->ai_freefrag; - oldaip->ai_freefrag = NULL; - free_allocindir(oldaip, NULL); - } + if (oldaip != NULL) + freefrag = allocindir_merge(aip, oldaip); + else if (aip->ai_freefrag) + add_to_journal( + &aip->ai_freefrag->ff_jfreefrag->fr_list); LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); + KASSERT(aip->ai_offset >= 0 && + aip->ai_offset < NINDIR(ip->i_ump->um_fs), + ("setup_allocindir_phase2: Bad offset %d", + aip->ai_offset)); + KASSERT(indirdep->ir_savebp != NULL, + ("setup_allocindir_phase2 NULL ir_savebp")); if (ip->i_ump->um_fstype == UFS1) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; @@ -2153,8 +4233,12 @@ static void newindirdep->ir_state = ATTACHED; if (ip->i_ump->um_fstype == UFS1) newindirdep->ir_state |= UFS1FMT; + newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); + LIST_INIT(&newindirdep->ir_writehd); + LIST_INIT(&newindirdep->ir_completehd); + LIST_INIT(&newindirdep->ir_jwork); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); @@ -2168,6 +4252,33 @@ static void } } +static struct freefrag * +allocindir_merge(aip, oldaip) + struct allocindir *aip; + struct allocindir *oldaip; +{ + struct freefrag *freefrag; + + if (oldaip->ai_newblkno != aip->ai_oldblkno) + panic("allocindir_merge: blkno"); + aip->ai_oldblkno = oldaip->ai_oldblkno; + freefrag = aip->ai_freefrag; + aip->ai_freefrag = oldaip->ai_freefrag; + oldaip->ai_freefrag = NULL; + KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); + KASSERT(LIST_EMPTY(&oldaip->ai_indirdeps), + ("setup_allocindir_phase2: Merged allocindir has children")); + /* + * We can skip journaling for this freefrag and just complete + * any pending journal work for the allocindir that is being + * removed after the freefrag completes. + */ + cancel_jfreefrag(freefrag->ff_jfreefrag); + free_allocindir(oldaip, NULL, &freefrag->ff_jwork, D_FREEFRAG); + + return (freefrag); +} + /* * Block de-allocation dependencies. * @@ -2206,6 +4317,7 @@ softdep_setup_freeblocks(ip, length, flags) struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; + struct jfreeblk *jfreeblk; struct bufobj *bo; struct vnode *vp; struct buf *bp; @@ -2213,6 +4325,13 @@ softdep_setup_freeblocks(ip, length, flags) ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; + ufs2_daddr_t blkno; + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + long oldextsize; + long oldsize; + int frags; + int needj; fs = ip->i_fs; mp = UFSTOVFS(ip->i_ump); @@ -2221,32 +4340,52 @@ softdep_setup_freeblocks(ip, length, flags) freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); + LIST_INIT(&freeblks->fb_jfreeblkhd); + LIST_INIT(&freeblks->fb_jwork); freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; + freeblks->fb_chkcnt = 0; ACQUIRE_LOCK(&lk); + /* + * If we're truncating a removed file that will never be written + * we don't need to journal the block frees. The canceled journals + * for the allocations will suffice. + */ + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + if ((inodedep->id_state & DEPCOMPLETE) == 0 && ip->i_nlink == 0) + needj = 0; + else + needj = 1; num_freeblkdep++; FREE_LOCK(&lk); extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks = DIP(ip, i_blocks) - extblocks; - if ((flags & IO_NORMAL) == 0) { - freeblks->fb_oldsize = 0; - freeblks->fb_chkcnt = 0; - } else { - freeblks->fb_oldsize = ip->i_size; + if ((flags & IO_NORMAL) != 0) { + oldsize = ip->i_size; ip->i_size = 0; DIP_SET(ip, i_size, 0); freeblks->fb_chkcnt = datablocks; for (i = 0; i < NDADDR; i++) { - freeblks->fb_dblks[i] = DIP(ip, i_db[i]); + blkno = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], 0); + if (blkno == 0) + continue; + frags = sblksize(fs, oldsize, i); + frags = numfrags(fs, frags); + newfreework(freeblks, NULL, i, blkno, frags, needj); } - for (i = 0; i < NIADDR; i++) { - freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; + i++, tmpval *= NINDIR(fs)) { + blkno = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], 0); + if (blkno) + newfreework(freeblks, NULL, -lbn - i, blkno, + fs->fs_frag, needj); + lbn += tmpval; } /* * If the file was removed, then the space being freed was @@ -2259,17 +4398,23 @@ softdep_setup_freeblocks(ip, length, flags) UFS_UNLOCK(ip->i_ump); } } - if ((flags & IO_EXT) == 0) { - freeblks->fb_oldextsize = 0; - } else { - freeblks->fb_oldextsize = ip->i_din2->di_extsize; + if ((flags & IO_EXT) != 0) { + oldextsize = ip->i_din2->di_extsize; ip->i_din2->di_extsize = 0; freeblks->fb_chkcnt += extblocks; for (i = 0; i < NXADDR; i++) { - freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; + blkno = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; + if (blkno == 0) + continue; + frags = sblksize(fs, oldextsize, i); + frags = numfrags(fs, frags); + newfreework(freeblks, NULL, -1 - i, blkno, frags, + needj); } } + if (LIST_EMPTY(&freeblks->fb_jfreeblkhd)) + needj = 0; DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); /* * Push the zero'ed inode to to its disk buffer so that we are free @@ -2304,7 +4449,9 @@ softdep_setup_freeblocks(ip, length, flags) */ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) - WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); + WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); + else if (needj) + freeblks->fb_state |= DEPCOMPLETE | COMPLETE; /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated @@ -2318,14 +4465,19 @@ softdep_setup_freeblocks(ip, length, flags) merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) - free_allocdirect(&inodedep->id_inoupdt, adp, delay); + cancel_allocdirect(&inodedep->id_inoupdt, adp, + freeblks, delay); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) - free_allocdirect(&inodedep->id_extupdt, adp, delay); + cancel_allocdirect(&inodedep->id_extupdt, adp, + freeblks, delay); } + LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps) + add_to_journal(&jfreeblk->jf_list); + FREE_LOCK(&lk); bdwrite(bp); /* @@ -2349,9 +4501,9 @@ restart: BO_UNLOCK(bo); ACQUIRE_LOCK(&lk); (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); - deallocate_dependencies(bp, inodedep); + if (deallocate_dependencies(bp, inodedep, freeblks)) + bp->b_flags |= B_INVAL | B_NOCACHE; FREE_LOCK(&lk); - bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); BO_LOCK(bo); goto restart; @@ -2361,7 +4513,7 @@ restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); - if(delay) { + if (delay) { freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk @@ -2376,11 +4528,11 @@ restart: FREE_LOCK(&lk); /* - * If the inode has never been written to disk (delay == 0), - * then we can process the freeblks now that we have deleted - * the dependencies. + * If the inode has never been written to disk (delay == 0) and + * we're not waiting on any journal writes, then we can process the + * freeblks now that we have deleted the dependencies. */ - if (!delay) + if (!delay && !needj) handle_workitem_freeblocks(freeblks, 0); } @@ -2389,19 +4541,22 @@ restart: * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's - * associated with related dependencies do not occur. + * associated with related dependencies do not occur. Returns 1 if + * all dependencies were cleared, 0 otherwise. */ -static void -deallocate_dependencies(bp, inodedep) +static int +deallocate_dependencies(bp, inodedep, freeblks) struct buf *bp; struct inodedep *inodedep; + struct freeblks *freeblks; { struct worklist *wk; struct indirdep *indirdep; + struct newdirblk *newdirblk; struct allocindir *aip; struct pagedep *pagedep; + struct jremref *jremref; struct dirrem *dirrem; - struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); @@ -2424,15 +4579,15 @@ restart: * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ + if (bp->b_lblkno >= 0 || + bp->b_blkno != indirdep->ir_savebp->b_lblkno) + panic("deallocate_dependencies: not indir"); if (indirdep->ir_state & GOINGAWAY) panic("deallocate_dependencies: already gone"); indirdep->ir_state |= GOINGAWAY; VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) - free_allocindir(aip, inodedep); - if (bp->b_lblkno >= 0 || - bp->b_blkno != indirdep->ir_savebp->b_lblkno) - panic("deallocate_dependencies: not indir"); + cancel_allocindir(aip, inodedep, freeblks); bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); WORKLIST_REMOVE(wk); @@ -2442,15 +4597,15 @@ restart: case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* - * None of the directory additions will ever be - * visible, so they can simply be tossed. + * There should be no directory add dependencies present + * as the directory could not be truncated until all + * children were removed. */ + KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, + ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) - while ((dap = - LIST_FIRST(&pagedep->pd_diraddhd[i]))) - free_diradd(dap); - while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) - free_diradd(dap); + KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, + ("deallocate_dependencies: diraddhd != NULL")); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. @@ -2458,28 +4613,33 @@ restart: * can be dumped directly onto the work list. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + /* + * If there are any dirrems we wait for + * the journal write to complete and + * then restart the buf scan as the lock + * has been dropped. + */ + while ((jremref = + LIST_FIRST(&dirrem->dm_jremrefhd)) + != NULL) { + jwait(&jremref->jr_list); + return (0); + } LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == - ALLCOMPLETE) + ALLCOMPLETE) { + dirrem->dm_state |= COMPLETE; add_to_worklist(&dirrem->dm_list); - else + } else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } if ((pagedep->pd_state & NEWBLOCK) != 0) { - LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) - if (wk->wk_type == D_NEWDIRBLK && - WK_NEWDIRBLK(wk)->db_pagedep == - pagedep) - break; - if (wk != NULL) { - WORKLIST_REMOVE(wk); - free_newdirblk(WK_NEWDIRBLK(wk)); - } else - panic("deallocate_dependencies: " - "lost pagedep"); + newdirblk = pagedep->pd_newdirblk; + WORKLIST_REMOVE(&newdirblk->db_list); + free_newdirblk(newdirblk); } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); @@ -2487,7 +4647,8 @@ restart: continue; case D_ALLOCINDIR: - free_allocindir(WK_ALLOCINDIR(wk), inodedep); + aip = WK_ALLOCINDIR(wk); + cancel_allocindir(aip, inodedep, freeblks); continue; case D_ALLOCDIRECT: @@ -2502,45 +4663,126 @@ restart: /* NOTREACHED */ } } + + return (1); } /* - * Free an allocdirect. Generate a new freefrag work request if appropriate. - * This routine must be called with splbio interrupts blocked. + * An allocdirect is being canceled due to a truncate. We must make sure + * the journal entry is released in concert with the blkfree that releases + * the storage. Completed journal entries must not be released until the + * space is no longer pointed to by the inode or in the bitmap. */ static void -free_allocdirect(adphead, adp, delay) +cancel_allocdirect(adphead, adp, freeblks, delay) struct allocdirectlst *adphead; struct allocdirect *adp; + struct freeblks *freeblks; int delay; { + struct freework *freework; + struct indirdep *indirdep; + struct worklist *wk; + ufs_lbn_t lbn; + + TAILQ_REMOVE(adphead, adp, ad_next); + while ((indirdep = LIST_FIRST(&adp->ad_indirdeps)) != NULL) { + LIST_REMOVE(indirdep, ir_next); + indirdep->ir_state &= ~ONDEPLIST; + } + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the freework to be + * freed when freework_freeblock() is called. If the journal has + * been written we can simply reclaim the journal space when the + * freeblks work is complete. + */ + if (adp->ad_jnewblk == NULL) { + allocdirect_jwork(adp, &freeblks->fb_jwork, D_FREEBLKS); + goto found; + } + lbn = adp->ad_jnewblk->jn_lbn; + /* + * Find the correct freework structure so it releases the canceled + * journal when the bitmap is cleared. This preserves rollback + * until the allocation is reverted. + */ + LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { + freework = WK_FREEWORK(wk); + if (freework->fw_lbn != lbn) + continue; + allocdirect_jwork(adp, &freework->fw_jwork, D_FREEWORK); + goto found; + } + panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn); +found: + if (delay) { + if (adp->ad_state & ONWORKLIST) + WORKLIST_REMOVE(&adp->ad_list); + WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &adp->ad_list); + } else + free_allocdirect(adp); + return; +} + + +static void +allocdirect_jwork(adp, wkhd, type) + struct allocdirect *adp; + struct workhead *wkhd; + short type; +{ + /* + * If the journal entry hasn't been written we hold onto the dep + * until it is safe to free along with the other journal work. + */ + if (adp->ad_jnewblk != NULL) { + cancel_jnewblk(adp->ad_jnewblk, wkhd); + adp->ad_jnewblk = NULL; + } + if (!LIST_EMPTY(&adp->ad_jwork)) + jwork_move(type, __LINE__, wkhd, &adp->ad_jwork); +} + +/* + * Free an allocdirect. Generate a new freefrag work request if appropriate. + * This must be called after the inode pointer is valid or removed via + * truncate or frag extension. + */ +static void +free_allocdirect(adp) + struct allocdirect *adp; +{ + struct indirdep *indirdep; struct newdirblk *newdirblk; + struct freefrag *freefrag; struct worklist *wk; mtx_assert(&lk, MA_OWNED); - if ((adp->ad_state & DEPCOMPLETE) == 0) + if (adp->ad_state & ONDEPLIST) LIST_REMOVE(adp, ad_deps); - TAILQ_REMOVE(adphead, adp, ad_next); - if ((adp->ad_state & COMPLETE) == 0) + if (adp->ad_state & ONWORKLIST) WORKLIST_REMOVE(&adp->ad_list); - if (adp->ad_freefrag != NULL) { - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &adp->ad_freefrag->ff_list); - else - add_to_worklist(&adp->ad_freefrag->ff_list); + if ((freefrag = adp->ad_freefrag) != NULL) { + freefrag->ff_state |= COMPLETE; + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list); } if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); if (!LIST_EMPTY(&adp->ad_newdirblk)) panic("free_allocdirect: extra newdirblk"); - if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, - &newdirblk->db_list); - else - free_newdirblk(newdirblk); + free_newdirblk(newdirblk); } + while ((indirdep = LIST_FIRST(&adp->ad_indirdeps)) != NULL) { + indirdep->ir_state |= DEPCOMPLETE; + indirdep_complete(indirdep); + } + KASSERT(adp->ad_jnewblk == NULL, + ("free_allocdirect; jnewblk %p still attached", adp->ad_jnewblk)); + handle_jwork(&adp->ad_jwork); WORKITEM_FREE(adp, D_ALLOCDIRECT); } @@ -2554,6 +4796,7 @@ free_newdirblk(newdirblk) { struct pagedep *pagedep; struct diradd *dap; + struct worklist *wk; int i; mtx_assert(&lk, MA_OWNED); @@ -2571,7 +4814,7 @@ free_newdirblk(newdirblk) pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * If no dependencies remain, the pagedep will be freed. */ @@ -2579,9 +4822,16 @@ free_newdirblk(newdirblk) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) break; if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { + KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL, + ("free_newdirblk: Freeing non-free pagedep %p", pagedep)); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } + /* Should only ever be one item in the list. */ + while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { + WORKLIST_REMOVE(wk); + handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); + } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } @@ -2608,6 +4858,7 @@ softdep_freefile(pvp, ino, mode) freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; + LIST_INIT(&freefile->fx_jwork); if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendinginodes += 1; @@ -2618,11 +4869,19 @@ softdep_freefile(pvp, ino, mode) * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either - * case we can free the file immediately. + * case we can free the file immediately. If the journal was + * canceled before being written the inode will never make it to + * disk and we must send the canceled journal entrys to + * ffs_freefile() to be cleared in conjunction with the bitmap. + * Any blocks waiting on the inode to write can be safely freed + * here as it will never been written. */ ACQUIRE_LOCK(&lk); - if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 || - check_inode_unwritten(inodedep)) { + inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); + if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0 && + !LIST_EMPTY(&inodedep->id_jaddrefhd)) + handle_bufwait(inodedep, &freefile->fx_jwork); + if (inodedep == NULL || check_inode_unwritten(inodedep)) { FREE_LOCK(&lk); handle_workitem_freefile(freefile); return; @@ -2654,6 +4913,19 @@ check_inode_unwritten(inodedep) { mtx_assert(&lk, MA_OWNED); + /* + * The inode is unwritten but we have some canceled jaddrefs still, + * the inode will never be written but it is not yet safe to be + * freed either. Return 1 so callers don't place items on the + * bufwait/inowait lists that will never be written. + * + * XXX This breaks if the jaddrefhd is not canceled but is UNDONE + * and complete. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0 && + !LIST_EMPTY(&inodedep->id_jaddrefhd)) + return (1); + if ((inodedep->id_state & DEPCOMPLETE) != 0 || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || @@ -2662,9 +4934,9 @@ check_inode_unwritten(inodedep) !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || + inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0) return (0); - /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". @@ -2674,8 +4946,9 @@ check_inode_unwritten(inodedep) return (0); inodedep->id_state |= ALLCOMPLETE; - LIST_REMOVE(inodedep, id_deps); - inodedep->id_buf = NULL; + if (inodedep->id_state & ONDEPLIST) + LIST_REMOVE(inodedep, id_deps); + inodedep->id_bmsafemap = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { @@ -2701,11 +4974,14 @@ free_inodedep(inodedep) !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || + !LIST_EMPTY(&inodedep->id_jaddrefhd) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || - inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) + inodedep->id_mkdiradd != NULL || + inodedep->id_nlinkdelta != 0 || + inodedep->id_savedino1 != NULL) return (0); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); @@ -2714,6 +4990,124 @@ free_inodedep(inodedep) } /* + * Free the block referenced by a freework structure. The parent freeblks + * structure is released and completed when the final cg bitmap reaches + * the disk. This routine may be freeing a jnewblk which never made it to + * disk in which case we do not have to wait as the operation is undone + * in memory immediately. + */ +static void +freework_freeblock(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct workhead wkhd; + struct fs *fs; + int complete; + int pending; + int bsize; + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + complete = 0; + LIST_INIT(&wkhd); + /* + * If we are canceling an existing jnewblk pass it to the free + * routine, otherwise pass the freeblk which will ultimately + * release the freeblks + */ + if (!LIST_EMPTY(&freework->fw_jwork)) { + LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list); + complete = 1; + } else + WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list); + bsize = lfragtosize(fs, freework->fw_frags); + pending = btodb(bsize); + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= pending; + FREE_LOCK(&lk); + /* + * extattr blocks don't show up in pending blocks. XXX why? + */ + if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) { + UFS_LOCK(ump); + fs->fs_pendingblocks -= pending; + UFS_UNLOCK(ump); + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, + bsize, freeblks->fb_previousinum, &wkhd); + if (complete == 0) + return; + /* + * The jnewblk will be discarded and the bits in the map never + * made it to disk. We can immediately free the freeblk. + */ + ACQUIRE_LOCK(&lk); + handle_written_freework(freework); + FREE_LOCK(&lk); +} + +/* + * Start the process of freeing an indirect block tree. + */ +static void +freework_freeindir(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct fs *fs; + + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), + freework->fw_lbn); +} + +/* + * Called when the last oustanding block owned by an indirect is freed. + * It is now safe to free the indirect block itself. + */ +static void +handle_workitem_indirblk(freework) + struct freework *freework; +{ + + freework_freeblock(freework); +} + +/* + * Called when a freework structure attached to a cg buf is written. The + * ref on either the parent or the freeblks structure is released and + * either may be added to the worklist if it is the final ref. + */ +static void +handle_written_freework(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct freework *parent; + + freeblks = freework->fw_freeblks; + parent = freework->fw_parent; + if (parent) { + if (--parent->fw_ref != 0) + parent = NULL; + freeblks = NULL; + } else if (--freeblks->fb_ref != 0) + freeblks = NULL; + WORKITEM_FREE(freework, D_FREEWORK); + if (freeblks) + add_to_worklist(&freeblks->fb_list); + if (parent) + add_to_worklist(&parent->fw_list); +} + +/* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, @@ -2726,99 +5120,79 @@ handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { + struct freework *freework; + struct worklist *wk; + + KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd), + ("handle_workitem_freeblocks: Journal entries not written.")); + if (LIST_EMPTY(&freeblks->fb_freeworkhd)) { + handle_complete_freeblocks(freeblks); + return; + } + freeblks->fb_ref++; + while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { + KASSERT(wk->wk_type == D_FREEWORK, + ("handle_workitem_freeblocks: Unknown type %s", + TYPENAME(wk->wk_type))); + WORKLIST_REMOVE_UNLOCKED(wk); + freework = WK_FREEWORK(wk); + if (freework->fw_lbn <= -NDADDR) + freework_freeindir(freework); + else + freework_freeblock(freework); + } + ACQUIRE_LOCK(&lk); + if (--freeblks->fb_ref != 0) + freeblks = NULL; + FREE_LOCK(&lk); + if (freeblks) + handle_complete_freeblocks(freeblks); +} + +/* + * Once all of the freework workitems are complete we can retire the + * freeblocks dependency and any journal work awaiting completion. This + * can not be called until all other dependencies are stable on disk. + */ +static void +handle_complete_freeblocks(freeblks) + struct freeblks *freeblks; +{ struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; - int i, nblocks, level, bsize; - ufs2_daddr_t bn, blocksreleased = 0; - int error, allerror = 0; - ufs_lbn_t baselbns[NIADDR], tmpval; - int fs_pendingblocks; + int flags; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; - fs_pendingblocks = 0; - tmpval = 1; - baselbns[0] = NDADDR; - for (i = 1; i < NIADDR; i++) { - tmpval *= NINDIR(fs); - baselbns[i] = baselbns[i - 1] + tmpval; - } - nblocks = btodb(fs->fs_bsize); - blocksreleased = 0; + flags = LK_NOWAIT; + /* - * Release all extended attribute blocks or frags. - */ - if (freeblks->fb_oldextsize > 0) { - for (i = (NXADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_eblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldextsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - blocksreleased += btodb(bsize); - } - } - /* - * Release all data blocks or frags. - */ - if (freeblks->fb_oldsize > 0) { - /* - * Indirect blocks first. - */ - for (level = (NIADDR - 1); level >= 0; level--) { - if ((bn = freeblks->fb_iblks[level]) == 0) - continue; - if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), - level, baselbns[level], &blocksreleased)) != 0) - allerror = error; - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, - fs->fs_bsize, freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - blocksreleased += nblocks; - } - /* - * All direct blocks or frags. - */ - for (i = (NDADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_dblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldsize, i); - ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - fs_pendingblocks += btodb(bsize); - blocksreleased += btodb(bsize); - } - } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); - /* * If we still have not finished background cleanup, then check * to see if the block count needs to be adjusted. */ - if (freeblks->fb_chkcnt != blocksreleased && - (fs->fs_flags & FS_UNCLEAN) != 0 && + if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 && ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, - (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) - == 0) { + (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) { ip = VTOI(vp); - DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \ - freeblks->fb_chkcnt - blocksreleased); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt); ip->i_flag |= IN_CHANGE; vput(vp); } #ifdef INVARIANTS - if (freeblks->fb_chkcnt != blocksreleased && + if (freeblks->fb_chkcnt != 0 && ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) printf("handle_workitem_freeblocks: block count\n"); - if (allerror) - softdep_error("handle_workitem_freeblks", allerror); #endif /* INVARIANTS */ ACQUIRE_LOCK(&lk); + /* + * All of the freeblock deps must be complete prior to this call + * so it's now safe to complete earlier outstanding journal entries. + */ + handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); num_freeblkdep--; FREE_LOCK(&lk); @@ -2831,28 +5205,39 @@ handle_workitem_freeblocks(freeblks, flags) * blocks. */ static int -indir_trunc(freeblks, dbn, level, lbn, countp) - struct freeblks *freeblks; +indir_trunc(freework, dbn, lbn) + struct freework *freework; ufs2_daddr_t dbn; - int level; ufs_lbn_t lbn; - ufs2_daddr_t *countp; { + struct workhead wkhd; + struct jnewblk *jnewblk; + struct freeblks *freeblks; struct buf *bp; struct fs *fs; + struct worklist *wkn; struct worklist *wk; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1 = 0; - ufs2_daddr_t nb, *bap2 = 0; + ufs2_daddr_t nb, nnb, *bap2 = 0; ufs_lbn_t lbnadd; int i, nblocks, ufs1fmt; int error, allerror = 0; int fs_pendingblocks; + int freedeps; + int level; + int cnt; + LIST_INIT(&wkhd); + level = lbn_level(lbn); + if (level == -1) + panic("indir_trunc: Invalid lbn %jd\n", lbn); + freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; + freedeps = 0; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); @@ -2880,8 +5265,8 @@ static int (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: lost indirdep"); - WORKLIST_REMOVE(wk); - WORKITEM_FREE(indirdep, D_INDIRDEP); + LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list); + free_indirdep(indirdep); if (!LIST_EMPTY(&bp->b_dep)) panic("indir_trunc: dangling dep"); ump->um_numindirdeps -= 1; @@ -2909,56 +5294,196 @@ static int ufs1fmt = 0; bap2 = (ufs2_daddr_t *)bp->b_data; } - nblocks = btodb(fs->fs_bsize); - for (i = NINDIR(fs) - 1; i >= 0; i--) { - if (ufs1fmt) + /* + * Reclaim indirect blocks which never made it to disk. + */ + cnt = 0; + LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) { + struct workhead freewk; + if (wk->wk_type != D_JNEWBLK) + continue; + WORKLIST_REMOVE_UNLOCKED(wk); + LIST_INIT(&freewk); + WORKLIST_INSERT_UNLOCKED(&freewk, wk); + jnewblk = WK_JNEWBLK(wk); + if (jnewblk->jn_lbn > 0) + i = (jnewblk->jn_lbn - -lbn) / lbnadd; + else + i = (jnewblk->jn_lbn - lbn) / lbnadd; + KASSERT(i >= 0 && i < NINDIR(fs), + ("indir_trunc: Index out of range %d parent %jd lbn %jd", + i, lbn, jnewblk->jn_lbn)); + /* Clear the pointer so it isn't found below. */ + if (ufs1fmt) { nb = bap1[i]; - else + bap1[i] = 0; + } else { nb = bap2[i]; + bap2[i] = 0; + } + KASSERT(nb == jnewblk->jn_blkno, + ("indir_trunc: Block mismatch %jd != %jd", + nb, jnewblk->jn_blkno)); + ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno, + fs->fs_bsize, freeblks->fb_previousinum, &freewk); + cnt++; + } + ACQUIRE_LOCK(&lk); + freework->fw_ref += NINDIR(fs) + 1; + /* Any remaining journal work can be completed with freeblks. */ + jwork_move(D_FREEBLKS, __LINE__, &freeblks->fb_jwork, &wkhd); + FREE_LOCK(&lk); + nblocks = btodb(fs->fs_bsize); + if (ufs1fmt) + nb = bap1[NINDIR(fs) - 1]; + else + nb = bap2[NINDIR(fs) - 1]; + /* + * Reclaim on disk blocks. + */ + for (i = NINDIR(fs) - 1; i >= 0; i--, nb = nnb) { + if (i != 0) { + if (ufs1fmt) + nnb = bap1[i-1]; + else + nnb = bap2[i-1]; + } else + nnb = 0; if (nb == 0) continue; + cnt++; if (level != 0) { - if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), - level - 1, lbn + (i * lbnadd), countp)) != 0) + struct freework *nfreework; + ufs_lbn_t nlbn; + + nlbn = (lbn + 1) - (i * lbnadd); + nfreework = newfreework(freeblks, freework, nlbn, nb, + fs->fs_frag, 0); + freedeps++; + if ((error = indir_trunc(nfreework, fsbtodb(fs, nb), + nlbn)) != 0) allerror = error; + } else { + struct freedep *freedep; + + /* + * Attempt to aggregate freedep dependencies for + * all blocks being released to the same CG. + */ + LIST_INIT(&wkhd); + if (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb))) { + freedep = newfreedep(freework); + WORKLIST_INSERT_UNLOCKED(&wkhd, + &freedep->fd_list); + freedeps++; + } + ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, + fs->fs_bsize, freeblks->fb_previousinum, &wkhd); + fs_pendingblocks += nblocks; } - ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, - freeblks->fb_previousinum); - fs_pendingblocks += nblocks; - *countp += nblocks; } - UFS_LOCK(ump); - fs->fs_pendingblocks -= fs_pendingblocks; - UFS_UNLOCK(ump); + ACQUIRE_LOCK(&lk); + if (level == 0) + fs_pendingblocks = (nblocks * cnt); + freework->fw_ref += freedeps; + freework->fw_ref -= NINDIR(fs) + 1; + if (freework->fw_ref != 0) + freework = NULL; + FREE_LOCK(&lk); + if (fs_pendingblocks) { + ACQUIRE_LOCK(&lk); + freeblks->fb_chkcnt -= fs_pendingblocks; + FREE_LOCK(&lk); + UFS_LOCK(ump); + fs->fs_pendingblocks -= fs_pendingblocks; + UFS_UNLOCK(ump); + } bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); + if (freework) + handle_workitem_indirblk(freework); return (allerror); } /* - * Free an allocindir. - * This routine must be called with splbio interrupts blocked. + * Cancel an allocindir when it is removed via truncation. */ static void -free_allocindir(aip, inodedep) +cancel_allocindir(aip, inodedep, freeblks) struct allocindir *aip; struct inodedep *inodedep; + struct freeblks *freeblks; { + struct indirdep *indirdep; + + while ((indirdep = LIST_FIRST(&aip->ai_indirdeps)) != NULL) { + indirdep->ir_state &= ~ONDEPLIST; + LIST_REMOVE(indirdep, ir_next); + } + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_freeblk that reclaims the space. We accomplish + * this by linking the journal dependency into the indirdep to be + * freed when indir_trunc() is called. If the journal has already + * been written we can simply reclaim the journal space when the + * freeblks work is complete. + */ + if (aip->ai_jnewblk == NULL) + free_allocindir(aip, inodedep, &freeblks->fb_jwork, + D_FREEBLKS); + else + free_allocindir(aip, inodedep, &aip->ai_indirdep->ir_jwork, + D_ALLOCINDIR); +} + +/* + * Free an allocindir when it is made redundant by another allocation or + * a truncation or when it is successfully written. + */ +static void +free_allocindir(aip, inodedep, wkhd, type) + struct allocindir *aip; + struct inodedep *inodedep; + struct workhead *wkhd; + short type; +{ + struct newdirblk *newdirblk; + struct indirdep *indirdep; struct freefrag *freefrag; + struct worklist *wk; mtx_assert(&lk, MA_OWNED); - if ((aip->ai_state & DEPCOMPLETE) == 0) + if (aip->ai_jnewblk != NULL) + cancel_jnewblk(aip->ai_jnewblk, wkhd); + if (aip->ai_state & ONDEPLIST) LIST_REMOVE(aip, ai_deps); if (aip->ai_state & ONWORKLIST) WORKLIST_REMOVE(&aip->ai_list); LIST_REMOVE(aip, ai_next); if ((freefrag = aip->ai_freefrag) != NULL) { - if (inodedep == NULL) - add_to_worklist(&freefrag->ff_list); - else + if (inodedep == NULL) { + freefrag->ff_state |= COMPLETE; + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list); + } else WORKLIST_INSERT(&inodedep->id_bufwait, &freefrag->ff_list); } + while ((indirdep = LIST_FIRST(&aip->ai_indirdeps)) != NULL) { + indirdep->ir_state |= DEPCOMPLETE; + indirdep_complete(indirdep); + } + if ((wk = LIST_FIRST(&aip->ai_newdirblk)) != NULL) { + newdirblk = WK_NEWDIRBLK(wk); + WORKLIST_REMOVE(&newdirblk->db_list); + if (!LIST_EMPTY(&aip->ai_newdirblk)) + panic("free_allocindir: extra newdirblk"); + free_newdirblk(newdirblk); + } + if (wkhd) + jwork_move(type, __LINE__, wkhd, &aip->ai_jwork); + else + handle_jwork(&aip->ai_jwork); WORKITEM_FREE(aip, D_ALLOCINDIR); } @@ -2998,11 +5523,15 @@ softdep_setup_directory_add(bp, dp, diroffset, new ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; + struct allocindir *aip; struct allocdirect *adp; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; + struct newdirblk *mknewdirblk = 0; struct mkdir *mkdir1, *mkdir2; + struct worklist *wk; + struct jaddref *jaddref; struct mount *mp; /* @@ -3013,6 +5542,7 @@ softdep_setup_directory_add(bp, dp, diroffset, new bdwrite(newdirbp); return (0); } + mkdir1 = NULL; mp = UFSTOVFS(dp->i_ump); fs = dp->i_fs; lbn = lblkno(fs, diroffset); @@ -3023,25 +5553,31 @@ softdep_setup_directory_add(bp, dp, diroffset, new dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; - if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { + LIST_INIT(&dap->da_jwork); + if (isnewblk && fragoff(fs, diroffset) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); } if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { + mknewdirblk = malloc(sizeof(struct newdirblk), + M_NEWDIRBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&mknewdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&mknewdirblk->db_mkdir); dap->da_state |= MKDIR_BODY | MKDIR_PARENT; mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); - mkdir1->md_state = MKDIR_BODY; + mkdir1->md_state = ATTACHED | MKDIR_BODY; mkdir1->md_diradd = dap; mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); - mkdir2->md_state = MKDIR_PARENT; + mkdir2->md_state = ATTACHED | MKDIR_PARENT; mkdir2->md_diradd = dap; /* * Dependency on "." and ".." being written to disk. @@ -3049,85 +5585,134 @@ softdep_setup_directory_add(bp, dp, diroffset, new mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); - WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); - FREE_LOCK(&lk); - bdwrite(newdirbp); /* - * Dependency on link count increase for parent directory + * We must link the pagedep, allocdirect, and newdirblk for + * the initial file page so the pointer to the new directory + * is not written until the directory contents are live and + * any subsequent additions are not marked live until the + * block is reachable via the inode. */ - ACQUIRE_LOCK(&lk); - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0 - || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { - dap->da_state &= ~MKDIR_PARENT; - WORKITEM_FREE(mkdir2, D_MKDIR); - } else { - LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); - WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); - } + if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0) + panic("softdep_setup_directory_add: " + "lost mkdir pagedep"); + LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) + if (wk->wk_type == D_ALLOCDIRECT) + break; + if (wk == NULL) + panic("softdep_setup_directory_add: lost mkdir adp"); + adp = WK_ALLOCDIRECT(wk); + pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = mknewdirblk; + mknewdirblk->db_pagedep = pagedep; + WORKLIST_INSERT(&adp->ad_newdirblk, &mknewdirblk->db_list); + WORKLIST_INSERT(&mknewdirblk->db_mkdir, &mkdir1->md_list); + /* + * Look up the inodedep for the parent directory so that we + * can link mkdir2 into the pending dotdot jaddref or + * the inode write if there is none. If the inode is + * ALLCOMPLETE and no jaddref is present all dependencies have + * been satisfied and mkdir2 can be freed. + */ + if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) + panic("softdep_setup_directory_add: lost parent"); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && + (jaddref->ja_state & MKDIR_PARENT), + ("softdep_setup_directory_add: bad dotdot jaddref %p", + jaddref)); + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + mkdir2->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir2; + /* + * It is important that this journal entry is added prior + * to the dot entry since it writes both the dot and dotdot + * links. This entry must be visible to the recovery + * operation for it to correctly adjust the parent's link. + */ + add_to_journal(&jaddref->ja_list); } /* * Link into parent directory pagedep to await its being written. */ - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); /* - * Link into its inodedep. Put it on the id_bufwait list if the inode - * is not yet written. If it is written, do the post-inode write - * processing to put it on the id_pendinghd list. + * Link the diradd into the jaddref so it may be completed after + * the journal entry is written. The directory offset was not + * known until now so it must still exist as the first element + * of the jaddrefhd queue. */ - (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); - if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) - diradd_inode_written(dap, inodedep); - else - WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); - if (isnewblk) { + if (inodedep_lookup(mp, newinum, 0, &inodedep) == 0) + panic("softdep_setup_directory_add: Lost inodedep"); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_add: bad jaddref %p", jaddref)); + jaddref->ja_diroff = diroffset; + jaddref->ja_diradd = dap; + add_to_journal(&jaddref->ja_list); + /* + * If we are adding a new directory remember this diradd so that if + * we rename it we can keep the dot and dotdot dependencies. If + * we are adding a new name for an inode that has a mkdiradd we + * must be in rename and we have to move the dot and dotdot + * dependencies to this new name. The old name is being orphaned + * soon. + */ + if (mkdir1 != NULL) { + if (inodedep->id_mkdiradd != NULL) + panic("softdep_setup_directory_add: Existing mkdir"); + inodedep->id_mkdiradd = dap; + jaddref = LIST_NEXT(jaddref, ja_inodeps); + KASSERT(jaddref != NULL && + jaddref->ja_ino == jaddref->ja_parent && + (jaddref->ja_state & MKDIR_BODY), + ("softdep_setup_directory_add: bad dot jaddref %p", + jaddref)); + mkdir1->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir1; + add_to_journal(&jaddref->ja_list); + } else if (inodedep->id_mkdiradd) + merge_diradd(inodedep, dap); + if (newdirblk) { /* - * Directories growing into indirect blocks are rare - * enough and the frequency of new block allocation - * in those cases even more rare, that we choose not - * to bother tracking them. Rather we simply force the - * new directory entry to disk. + * There is nothing to do if we are already tracking + * this block. */ - if (lbn >= NDADDR) { - FREE_LOCK(&lk); - /* - * We only have a new allocation when at the - * beginning of a new block, not when we are - * expanding into an existing block. - */ - if (blkoff(fs, diroffset) == 0) - return (1); - return (0); - } - /* - * We only have a new allocation when at the beginning - * of a new fragment, not when we are expanding into an - * existing fragment. Also, there is nothing to do if we - * are already tracking this block. - */ - if (fragoff(fs, diroffset) != 0) { - FREE_LOCK(&lk); - return (0); - } if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(&lk); return (0); } /* - * Find our associated allocdirect and have it track us. + * Find our associated alloc{direct/indir} and have it + * track us. */ - if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) - panic("softdep_setup_directory_add: lost inodedep"); - adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); - if (adp == NULL || adp->ad_lbn != lbn) + adp = NULL; + aip = NULL; + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + if (wk->wk_type == D_ALLOCDIRECT) { + adp = WK_ALLOCDIRECT(wk); + break; + } + if (wk->wk_type == D_ALLOCINDIR) { + aip = WK_ALLOCINDIR(wk); + break; + } + } + if (adp == NULL && aip == NULL) panic("softdep_setup_directory_add: lost entry"); + wk = &newdirblk->db_list; + if (adp) + WORKLIST_INSERT(&adp->ad_newdirblk, wk); + else + WORKLIST_INSERT(&aip->ai_newdirblk, wk); pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; - WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); } FREE_LOCK(&lk); return (0); @@ -3156,12 +5741,14 @@ softdep_change_directoryentry_offset(dp, base, old ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) + if (pagedep_lookup(UFSTOVFS(dp->i_ump), + dp->i_number, lbn, 0, &pagedep) == 0) goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], + da_pdlist) { if (dap->da_offset != oldoffset) continue; dap->da_offset = newoffset; @@ -3184,48 +5771,139 @@ softdep_change_directoryentry_offset(dp, base, old done: bcopy(oldloc, newloc, entrysize); FREE_LOCK(&lk); + /* XXX Make a remove and add record, add to the pagedep. */ } /* + * Move the mkdir dependencies and journal work from one diradd to another + * when renaming a directory. The new name must depend on the mkdir deps + * completing as the old name did. Directories can only have one valid link + * at a time so one must be canonical. + */ +static void +merge_diradd(inodedep, newdap) + struct inodedep *inodedep; + struct diradd *newdap; +{ + struct diradd *olddap; + struct mkdir *mkdir, *nextmd; + short state; + + olddap = inodedep->id_mkdiradd; + inodedep->id_mkdiradd = newdap; + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + newdap->da_state &= ~DEPCOMPLETE; + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { + nextmd = LIST_NEXT(mkdir, md_mkdirs); + if (mkdir->md_diradd != olddap) + continue; + mkdir->md_diradd = newdap; + state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); + newdap->da_state |= state; + olddap->da_state &= ~state; + if ((olddap->da_state & + (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; + } + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) + panic("merge_diradd: unfound ref"); + } + /* + * Any mkdir related journal items are not safe to be freed until + * the new name is stable. + */ + jwork_move(D_DIRADD, __LINE__, &newdap->da_jwork, &olddap->da_jwork); + olddap->da_state |= DEPCOMPLETE; + complete_diradd(olddap); +} + +/* + * Move the diradd to the pending list when all diradd dependencies are + * complete. + */ +static void +complete_diradd(dap) + struct diradd *dap; +{ + struct pagedep *pagedep; + + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + } +} + +/* * Free a diradd dependency structure. This routine must be called - * with splbio interrupts blocked. + * with splbio interrupts blocked. If wkhd is NULL we should not find + * any pending jaddrefs and only jsegdeps to be retired. */ static void -free_diradd(dap) +free_diradd(dap, wkhd) struct diradd *dap; + struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; + struct jaddref *jaddref; mtx_assert(&lk, MA_OWNED); - WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); + if (dap->da_state & ONWORKLIST) + WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + dirrem->dm_state |= COMPLETE; + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, - 0, &inodedep) != 0) + 0, &inodedep) != 0) { + /* Abort the addref that reference this diradd. */ + LIST_FOREACH(jaddref, &inodedep->id_jaddrefhd, ja_inodeps) + if (jaddref->ja_diradd == dap) { + cancel_jaddref(jaddref, inodedep, wkhd); + break; + } + if (inodedep->id_mkdiradd == dap) + inodedep->id_mkdiradd = NULL; (void) free_inodedep(inodedep); + } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; - dap->da_state &= ~mkdir->md_state; - WORKLIST_REMOVE(&mkdir->md_list); + dap->da_state &= + ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + if ((jaddref = mkdir->md_jaddref) != NULL) + cancel_jaddref(jaddref, NULL, wkhd); WORKITEM_FREE(mkdir, D_MKDIR); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } + if (wkhd) + jwork_move(D_DIRREM, __LINE__, wkhd, &dap->da_jwork); + /* + * Free any journal segments waiting for the directory write. + */ + handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } @@ -3254,11 +5932,14 @@ softdep_setup_remove(bp, dp, ip, isrmdir) int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; + int direct; /* - * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. + * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want + * newdirrem() to setup the full directory remove which requires + * isrmdir > 1. */ - dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); + dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem); /* * If the COMPLETE flag is clear, then there were no active @@ -3280,12 +5961,64 @@ softdep_setup_remove(bp, dp, ip, isrmdir) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; + direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(&lk); - handle_workitem_remove(dirrem, NULL); + if (direct) + handle_workitem_remove(dirrem, NULL); } } /* + * Check for an entry matching 'offset' on both the pd_dirraddhd list and the + * pd_pendinghd list of a pagedep. + */ +static struct diradd * +diradd_lookup(pagedep, offset) + struct pagedep *pagedep; + int offset; +{ + struct diradd *dap; + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) + if (dap->da_offset == offset) + return (dap); + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) + if (dap->da_offset == offset) + return (dap); + return (NULL); +} + +/* + * Search for a .. diradd dependency in a directory that is being removed. + * If the directory was renamed to a new parent we have a diradd rather + * than a mkdir for the .. entry. We need to cancel it now before + * it is found in truncate(). + */ +static void +cancel_diradd_dotdot(ip, dirrem) + struct inode *ip; + struct dirrem *dirrem; +{ + struct pagedep *pagedep; + struct diradd *dap; + struct worklist *wk; + + if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0, + &pagedep) == 0) + return; + dap = diradd_lookup(pagedep, DOTDOT_OFFSET); + if (dap == NULL) + return; + free_diradd(dap, &dirrem->dm_jwork); + /* + * Mark any journal work as belonging to the parent so it is freed + * with the .. reference. + */ + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + wk->wk_state |= MKDIR_PARENT; +} + +/* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ @@ -3303,6 +6036,9 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; /* * Whiteouts have no deletion dependencies. @@ -3322,33 +6058,67 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount); + LIST_INIT(&dirrem->dm_jremrefhd); + LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; - + /* + * Allocate remove reference structures to track journal write + * dependencies. We will always have one for the link and + * when doing directories we will always have one more for dot. + * When renaming a directory we skip the dotdot link change so + * this is not needed. + */ + dotremref = dotdotremref = NULL; + jremref = newjremref(dirrem, dp, ip, dp->i_offset); + if (isrmdir) + dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET); + if (isrmdir > 1) { + dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET); + dotdotremref->jr_state |= MKDIR_PARENT; + } ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); - if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC, + &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* + * If we're removing a .. link search for the dependency now and + * cancel it. Any pending journal work will be added to the dirrem + * to be completed when the workitem remove completes. + */ + if (isrmdir > 1) + cancel_diradd_dotdot(ip, dirrem); + /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can - * be de-allocated. Check for an entry on both the pd_dirraddhd - * list and the pd_pendinghd list. + * be de-allocated. */ - - LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) - if (dap->da_offset == offset) - break; + dap = diradd_lookup(pagedep, offset); if (dap == NULL) { - - LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) - if (dap->da_offset == offset) - break; - if (dap == NULL) - return (dirrem); + /* + * Link the jremref structures into the dirrem so they are + * written prior to the pagedep. + */ + if (jremref) { + LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, + jr_deps); + add_to_journal(&jremref->jr_list); + if (dotremref) { + LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, + dotremref, jr_deps); + add_to_journal(&dotremref->jr_list); + } + if (dotdotremref) { + LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, + dotdotremref, jr_deps); + add_to_journal(&dotdotremref->jr_list); + } + } + return (dirrem); } /* * Must be ATTACHED at this point. @@ -3359,6 +6129,20 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) panic("newdirrem: inum %d should be %d", ip->i_number, dap->da_newinum); /* + * If we've found a diradd in memory it still has valid journal + * entries to complete. Rather than writing new journal entries + * we complete the segdeps only after the removal is complete. The + * recovery operation will simply find an incomplete add. If the + * addref journal is not written the jaddrefs will stay linked into + * the inodedep and bmsafemap preventing the writes of the new + * data until they are ultimately freed. + */ + WORKITEM_FREE(jremref, D_JREMREF); + if (dotremref) + WORKITEM_FREE(dotremref, D_JREMREF); + if (dotdotremref) + WORKITEM_FREE(dotdotremref, D_JREMREF); + /* * If we are deleting a changed name that never made it to disk, * then return the dirrem describing the previous inode (which * represents the inode currently referenced from this entry on disk). @@ -3373,7 +6157,18 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; - free_diradd(dap); + free_diradd(dap, &dirrem->dm_jwork); +#ifdef DEBUG + /* XXX Temporary. */ + if (isrmdir == 0) { + struct worklist *wk; + + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) + panic("bad wk %p (0x%X)\n", wk, wk->wk_state); + } +#endif + return (dirrem); } @@ -3407,6 +6202,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; + struct jaddref *jaddref; struct mount *mp; offset = blkoff(dp->i_fs, dp->i_offset); @@ -3422,6 +6218,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; + LIST_INIT(&dap->da_jwork); } /* @@ -3454,7 +6251,8 @@ softdep_setup_directory_change(bp, dp, ip, newinum dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); return; @@ -3483,23 +6281,30 @@ softdep_setup_directory_change(bp, dp, ip, newinum dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list); } /* - * Link into its inodedep. Put it on the id_bufwait list if the inode - * is not yet written. If it is written, do the post-inode write - * processing to put it on the id_pendinghd list. + * Lookup the jaddref for this journal entry. We must finish + * initializing it and make the diradd write dependent on it. */ - if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 || - (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { - dap->da_state |= COMPLETE; - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); - WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); - } else { - LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], - dap, da_pdlist); - WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); - } + if (inodedep_lookup(mp, newinum, 0, &inodedep) == 0) + panic("softdep_setup_directory_change: Lost inodedep."); + jaddref = LIST_FIRST(&inodedep->id_jaddrefhd); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_change: bad jaddref %p", jaddref)); + jaddref->ja_diroff = dp->i_offset; + jaddref->ja_diradd = dap; + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, + da_pdlist); + add_to_journal(&jaddref->ja_list); + /* + * If we're making a new name for a directory that has not been + * committed when need to move the dot and dotdot references to + * this new name. + */ + if (inodedep->id_mkdiradd) + merge_diradd(inodedep, dap); FREE_LOCK(&lk); } @@ -3584,6 +6389,8 @@ handle_workitem_remove(dirrem, xp) { struct thread *td = curthread; struct inodedep *inodedep; + struct workhead dotdotwk; + struct worklist *wk; struct vnode *vp; struct inode *ip; ino_t oldinum; @@ -3600,7 +6407,27 @@ handle_workitem_remove(dirrem, xp) if ((inodedep_lookup(dirrem->dm_list.wk_mp, dirrem->dm_oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_workitem_remove: Journal entries not written.")); /* + * Move all dependencies waiting on the remove to complete + * from the dirrem to the inode inowait list to be completed + * after the inode has been updated and written to disk. Any + * marked MKDIR_PARENT are saved to be completed when the .. ref + * is removed. + */ + LIST_INIT(&dotdotwk); + while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { + WORKLIST_REMOVE(wk); + if (wk->wk_state & MKDIR_PARENT) { + wk->wk_state &= ~MKDIR_PARENT; + WORKLIST_INSERT(&dotdotwk, wk); + continue; + } + WORKLIST_INSERT(&inodedep->id_inowait, wk); + } + LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); + /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { @@ -3611,6 +6438,9 @@ handle_workitem_remove(dirrem, xp) panic("handle_workitem_remove: bad file delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; num_dirrem -= 1; + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: worklist not empty. %s", + TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); vput(vp); @@ -3639,6 +6469,8 @@ handle_workitem_remove(dirrem, xp) * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: DIRCHG and worklist not empty.")); num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); @@ -3689,6 +6521,7 @@ static void handle_workitem_freefile(freefile) struct freefile *freefile; { + struct workhead wkhd; struct fs *fs; struct inodedep *idp; struct ufsmount *ump; @@ -3701,13 +6534,15 @@ handle_workitem_freefile(freefile) error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(&lk); if (error) - panic("handle_workitem_freefile: inodedep survived"); + panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); + LIST_INIT(&wkhd); + LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, - freefile->fx_oldinum, freefile->fx_mode)) != 0) + freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefile, D_FREEFILE); @@ -3757,8 +6592,9 @@ softdep_disk_io_initiation(bp) { struct worklist *wk; struct worklist marker; - struct indirdep *indirdep; struct inodedep *inodedep; + struct freeblks *freeblks; + struct jfreeblk *jfreeblk; /* * We only care about write operations. There should never @@ -3767,6 +6603,10 @@ softdep_disk_io_initiation(bp) if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("softdep_disk_io_initiation: Writing buffer with " + "background write in progress: %p", bp); + marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ @@ -3792,46 +6632,40 @@ softdep_disk_io_initiation(bp) continue; case D_INDIRDEP: - indirdep = WK_INDIRDEP(wk); - if (indirdep->ir_state & GOINGAWAY) - panic("disk_io_initiation: indirdep gone"); + initiate_write_indirdep(WK_INDIRDEP(wk), bp); + continue; + + case D_BMSAFEMAP: + initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); + continue; + + case D_JSEG: + WK_JSEG(wk)->js_buf = NULL; + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd); /* - * If there are no remaining dependencies, this - * will be writing the real pointers, so the - * dependency can be freed. + * We have to wait for the jfreeblks to be journaled + * before we can write an inodeblock with updated + * pointers. Be careful to arrange the marker so + * we revisit the jfreeblk if it's not removed by + * the first jwait(). */ - if (LIST_EMPTY(&indirdep->ir_deplisthd)) { - struct buf *bp; - - bp = indirdep->ir_savebp; - bp->b_flags |= B_INVAL | B_NOCACHE; - /* inline expand WORKLIST_REMOVE(wk); */ - wk->wk_state &= ~ONWORKLIST; - LIST_REMOVE(wk, wk_list); - WORKITEM_FREE(indirdep, D_INDIRDEP); - FREE_LOCK(&lk); - brelse(bp); - ACQUIRE_LOCK(&lk); - continue; + if (jfreeblk != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&jfreeblk->jf_list); } - /* - * Replace up-to-date version with safe version. - */ - FREE_LOCK(&lk); - indirdep->ir_saveddata = malloc(bp->b_bcount, - M_INDIRDEP, M_SOFTDEP_FLAGS); - ACQUIRE_LOCK(&lk); - indirdep->ir_state &= ~ATTACHED; - indirdep->ir_state |= UNDONE; - bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); - bcopy(indirdep->ir_savebp->b_data, bp->b_data, - bp->b_bcount); continue; case D_MKDIR: - case D_BMSAFEMAP: case D_ALLOCDIRECT: case D_ALLOCINDIR: + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: continue; default: @@ -3855,6 +6689,8 @@ initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { + struct jremref *jremref; + struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; @@ -3869,6 +6705,16 @@ initiate_write_filepage(pagedep, bp) return; } pagedep->pd_state |= IOSTARTED; + /* + * Wait for all journal remove dependencies to hit the disk. + * We can not allow any potentially conflicting directory adds + * to be visible before removes and rollback is too difficult. + * lk may be dropped and re-acquired, however we hold the buf + * locked so the dependency can not go away. + */ + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) + while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) + jwait(&jremref->jr_list); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) @@ -3905,6 +6751,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; + struct jaddref *jaddref; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS @@ -3940,32 +6787,41 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; - if (TAILQ_EMPTY(&inodedep->id_inoupdt)) + if (TAILQ_EMPTY(&inodedep->id_inoupdt) && + LIST_EMPTY(&inodedep->id_jaddrefhd)) return; /* + * Revert the link count for every jaddref present. + */ + LIST_FOREACH(jaddref, &inodedep->id_jaddrefhd, ja_inodeps) { + dp->di_nlink--; + jaddref->ja_state &= ~ATTACHED; + jaddref->ja_state |= UNDONE; + } + /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn - NDADDR, - dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -3981,14 +6837,14 @@ initiate_write_inodeblock_ufs1(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4012,8 +6868,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4030,7 +6886,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_lbn - NDADDR] = 0; + dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* @@ -4051,6 +6907,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp) struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; + struct jaddref *jaddref; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS @@ -4087,24 +6944,33 @@ initiate_write_inodeblock_ufs2(inodedep, bp) inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && - TAILQ_EMPTY(&inodedep->id_extupdt)) + TAILQ_EMPTY(&inodedep->id_extupdt) && + LIST_EMPTY(&inodedep->id_jaddrefhd)) return; /* + * Revert the link count for every jaddref present. + */ + LIST_FOREACH(jaddref, &inodedep->id_jaddrefhd, ja_inodeps) { + dp->di_nlink--; + jaddref->ja_state &= ~ATTACHED; + jaddref->ja_state |= UNDONE; + } + /* * Set the ext data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4120,12 +6986,12 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NXADDR; i++) { + dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); @@ -4142,8 +7008,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; @@ -4154,24 +7020,24 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS - if (deplist != 0 && prevlbn >= adp->ad_lbn) + if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); - prevlbn = adp->ad_lbn; - if (adp->ad_lbn < NDADDR && - dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_lbn, - (intmax_t)dp->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_lbn >= NDADDR && - dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", - (intmax_t)adp->ad_lbn - NDADDR, - (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR], + (intmax_t)adp->ad_offset - NDADDR, + (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); - deplist |= 1 << adp->ad_lbn; + deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); @@ -4187,14 +7053,14 @@ initiate_write_inodeblock_ufs2(inodedep, bp) */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_lbn >= NDADDR) + if (adp->ad_offset >= NDADDR) break; - dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; - dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; - for (i = adp->ad_lbn + 1; i < NDADDR; i++) { + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); @@ -4218,8 +7084,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * we already checked for fragments in the loop above. */ if (lastadp != NULL && - dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { - for (i = lastadp->ad_lbn; i >= 0; i--) + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; @@ -4236,16 +7102,289 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_lbn - NDADDR] = 0; + dp->di_ib[adp->ad_offset - NDADDR] = 0; } +static void +free_indirdep(indirdep) + struct indirdep *indirdep; +{ + + KASSERT(LIST_EMPTY(&indirdep->ir_jwork), + ("free_indirdep: Journal work not empty.")); + if (indirdep->ir_state & ONWORKLIST) + WORKLIST_REMOVE(&indirdep->ir_list); + WORKITEM_FREE(indirdep, D_INDIRDEP); +} + +static void +initiate_write_indirdep(indirdep, bp) + struct indirdep *indirdep; + struct buf *bp; +{ + + if (indirdep->ir_state & GOINGAWAY) + panic("disk_io_initiation: indirdep gone"); + + /* + * If there are no remaining dependencies, this will be writing + * the real pointers. + */ + if (LIST_EMPTY(&indirdep->ir_deplisthd)) + return; + /* + * Replace up-to-date version with safe version. + */ + FREE_LOCK(&lk); + indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, + M_SOFTDEP_FLAGS); + ACQUIRE_LOCK(&lk); + indirdep->ir_state &= ~ATTACHED; + indirdep->ir_state |= UNDONE; + bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); + bcopy(indirdep->ir_savebp->b_data, bp->b_data, + bp->b_bcount); +} + /* + * Called when an inode has been cleared in a cg bitmap. This finally + * eliminates any canceled jaddrefs + */ +void +softdep_setup_inofree(mp, bp, ino, wkhd) + struct mount *mp; + struct buf *bp; + ino_t ino; + struct workhead *wkhd; +{ + struct worklist *wk, *wkn; + struct bmsafemap *bmsafemap; + struct inodedep *inodedep; + uint8_t *inosused; + struct cg *cgp; + struct fs *fs; + + ACQUIRE_LOCK(&lk); + fs = VFSTOUFS(mp)->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, ino)); + cgp = (struct cg *)bp->b_data; + inosused = cg_inosused(cgp); + if (isset(inosused, ino % fs->fs_ipg)) + panic("softdep_setup_inofree: inode %d not freed.", ino); + if (inodedep_lookup(mp, ino, 0, &inodedep)) + panic("softdep_setup_inofree: ino %d has existing inodedep %p", + ino, inodedep); + if (wkhd) { /* XXX Temporary. */ + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JADDREF) + continue; + WORKLIST_REMOVE(wk); + /* + * We can free immediately even if the jaddref isn't attached + * in a background write as now the bitmaps are reconciled. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jaddref(WK_JADDREF(wk)); + } + } + FREE_LOCK(&lk); +} + + +/* + * Called via ffs_blkfree() after a set of frags has been cleared from a cg + * map. Any dependencies waiting for the write to clear are added to the + * buf's list and any jnewblks that are being canceled are discarded + * immediately. + */ +void +softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) + struct mount *mp; + struct buf *bp; + ufs2_daddr_t blkno; + int frags; + struct workhead *wkhd; +{ + struct bmsafemap *bmsafemap; + struct jnewblk *jnewblk; + struct worklist *wk, *wkn; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + ufs2_daddr_t jstart; + ufs2_daddr_t jend; + ufs2_daddr_t end; + long bno; + int i; + + ACQUIRE_LOCK(&lk); + fs = VFSTOUFS(mp)->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); + + /* + * Detach any jnewblks which have been canceled. They must linger + * until the bitmap is cleared again by ffs_blkfree() to prevent + * an unjournaled allocation from hitting the disk. + */ + if (wkhd) { /* XXX Should be temporary. */ + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JNEWBLK) + continue; + jnewblk = WK_JNEWBLK(wk); + KASSERT(jnewblk->jn_state & GOINGAWAY, + ("softdep_setup_blkfree: Freed jnewblk not going away.")); + WORKLIST_REMOVE(wk); + /* + * Assert that this block is free in the bitmap before we + * discard the jnewblk. + */ + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) + if (isclr(blksfree, bno + i)) + panic("softdep_setup_blkfree: %jd - %jd(%jd) not free", + blkno, jnewblk->jn_blkno, jnewblk->jn_lbn); + /* + * Even if it's not attached we can free immediately as the + * new bitmap is correct. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jnewblk(jnewblk); + } + /* + * The buf must be locked by the caller otherwise these could be + * added while it's being written and the write would complete + * them before they made it to disk. + */ + jwork_move(D_BMSAFEMAP, __LINE__, &bp->b_dep, wkhd); + } + /* + * Assert that we are not freeing a block which has an outstanding + * allocation dependency. + */ + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + end = blkno + frags; + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + /* + * Don't match against blocks that will be freed when the + * background write is done. + */ + if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == + (COMPLETE | DEPCOMPLETE)) + continue; + jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; + jend = jnewblk->jn_blkno + jnewblk->jn_frags; + if ((blkno >= jstart && blkno < jend) || + (end > jstart && end <= jend)) { + printf("state 0x%X %jd - %d %d dep %p\n", + jnewblk->jn_state, jnewblk->jn_blkno, + jnewblk->jn_oldfrags, jnewblk->jn_frags, + jnewblk->jn_dep); + panic("softdep_setup_blkfree: " + "%jd-%jd(%d) overlaps with %jd-%jd", + blkno, end, frags, jstart, jend); + } + } + FREE_LOCK(&lk); +} + +static void +initiate_write_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; /* The cg block. */ +{ + struct jaddref *jaddref; + struct jnewblk *jnewblk; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + int cleared; + ino_t ino; + long bno; + int i; + + if (bmsafemap->sm_state & IOSTARTED) + panic("initiate_write_bmsafemap: Already started\n"); + bmsafemap->sm_state |= IOSTARTED; + /* + * Clear any inode allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { + ino = jaddref->ja_ino % fs->fs_ipg; + /* + * If this is a background copy the inode may not + * be marked used yet. + */ + if (isset(inosused, ino)) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir--; + cgp->cg_cs.cs_nifree++; + clrbit(inosused, ino); + jaddref->ja_state &= ~ATTACHED; + jaddref->ja_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: inode %d " + "marked free", jaddref->ja_ino); + } + } + /* + * Clear any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + bno = dtogd(fs, jnewblk->jn_blkno); + cleared = 0; + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (isclr(blksfree, bno + i)) { + cleared = 1; + setbit(blksfree, bno + i); + } + } + /* + * We may not clear the block if it's a background + * copy. In that case there is no reason to detach + * it. + */ + if (cleared) { + jnewblk->jn_state &= ~ATTACHED; + jnewblk->jn_state |= UNDONE; + } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) + panic("initiate_write_bmsafemap: block %jd " + "marked free", jnewblk->jn_blkno); + } + } + /* + * Move allocation lists to the written lists so they can be + * cleared once the block write is complete. + */ + LIST_SWAP(&bmsafemap->sm_allocdirecthd, &bmsafemap->sm_allocdirectwr, + allocdirect, ad_deps); + LIST_SWAP(&bmsafemap->sm_allocindirhd, &bmsafemap->sm_allocindirwr, + allocindir, ai_deps); + LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, + inodedep, id_deps); + LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, + newblk, nb_deps); +} + +/* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. + * */ static void softdep_disk_write_complete(bp) @@ -4254,12 +7393,7 @@ softdep_disk_write_complete(bp) struct worklist *wk; struct worklist *owk; struct workhead reattach; - struct newblk *newblk; - struct allocindir *aip; - struct allocdirect *adp; - struct indirdep *indirdep; - struct inodedep *inodedep; - struct bmsafemap *bmsafemap; + struct buf *sbp; /* * If an error occurred while doing the write, then the data @@ -4271,8 +7405,9 @@ softdep_disk_write_complete(bp) /* * This lock must not be released anywhere in this code segment. */ + sbp = NULL; + owk = NULL; ACQUIRE_LOCK(&lk); - owk = NULL; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); if (wk == owk) @@ -4291,33 +7426,8 @@ softdep_disk_write_complete(bp) continue; case D_BMSAFEMAP: - bmsafemap = WK_BMSAFEMAP(wk); - while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { - newblk->nb_state |= DEPCOMPLETE; - newblk->nb_bmsafemap = NULL; - LIST_REMOVE(newblk, nb_deps); - } - while ((adp = - LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { - adp->ad_state |= DEPCOMPLETE; - adp->ad_buf = NULL; - LIST_REMOVE(adp, ad_deps); - handle_allocdirect_partdone(adp); - } - while ((aip = - LIST_FIRST(&bmsafemap->sm_allocindirhd))) { - aip->ai_state |= DEPCOMPLETE; - aip->ai_buf = NULL; - LIST_REMOVE(aip, ai_deps); - handle_allocindir_partdone(aip); - } - while ((inodedep = - LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { - inodedep->id_state |= DEPCOMPLETE; - LIST_REMOVE(inodedep, id_deps); - inodedep->id_buf = NULL; - } - WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); continue; case D_MKDIR: @@ -4325,37 +7435,42 @@ softdep_disk_write_complete(bp) continue; case D_ALLOCDIRECT: - adp = WK_ALLOCDIRECT(wk); - adp->ad_state |= COMPLETE; - handle_allocdirect_partdone(adp); + wk->wk_state |= COMPLETE; + handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); continue; case D_ALLOCINDIR: - aip = WK_ALLOCINDIR(wk); - aip->ai_state |= COMPLETE; - handle_allocindir_partdone(aip); + wk->wk_state |= COMPLETE; + handle_allocindir_partdone(WK_ALLOCINDIR(wk)); continue; case D_INDIRDEP: - indirdep = WK_INDIRDEP(wk); - if (indirdep->ir_state & GOINGAWAY) - panic("disk_write_complete: indirdep gone"); - bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); - free(indirdep->ir_saveddata, M_INDIRDEP); - indirdep->ir_saveddata = 0; - indirdep->ir_state &= ~UNDONE; - indirdep->ir_state |= ATTACHED; - while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { - handle_allocindir_partdone(aip); - if (aip == LIST_FIRST(&indirdep->ir_donehd)) - panic("disk_write_complete: not gone"); - } - WORKLIST_INSERT(&reattach, wk); - if ((bp->b_flags & B_DELWRI) == 0) - stat_indir_blk_ptrs++; - bdirty(bp); + if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) + WORKLIST_INSERT(&reattach, wk); continue; + case D_FREEBLKS: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(wk); + continue; + + case D_FREEWORK: + handle_written_freework(WK_FREEWORK(wk)); + break; + + case D_FREEDEP: + free_freedep(WK_FREEDEP(wk)); + continue; + + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + + case D_JSEG: + handle_written_jseg(WK_JSEG(wk), bp); + continue; + default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); @@ -4370,6 +7485,8 @@ softdep_disk_write_complete(bp) WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(&lk); + if (sbp) + brelse(sbp); } /* @@ -4378,17 +7495,18 @@ softdep_disk_write_complete(bp) * splbio interrupts blocked. */ static void -handle_allocdirect_partdone(adp) +handle_allocdirect_partdone(adp, wkhd) struct allocdirect *adp; /* the completed allocdirect */ + struct workhead *wkhd; /* Work to do when inode is writtne. */ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; - long bsize, delay; + long bsize; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - if (adp->ad_buf != NULL) + if (adp->ad_bmsafemap != NULL) panic("handle_allocdirect_partdone: dangling dep"); /* * The on-disk inode cannot claim to be any larger than the last @@ -4441,23 +7559,27 @@ static void /* * If we have found the just finished dependency, then free * it along with anything that follows it that is complete. - * If the inode still has a bitmap dependency, then it has - * never been written to disk, hence the on-disk inode cannot - * reference the old fragment so we can free it without delay. + * If the inode is not ALLCOMPLETE the pointer may not yet + * be written. Place the allocdirect on the bufwait to + * be freed when we're sure it is reachable on disk. */ - delay = (inodedep->id_state & DEPCOMPLETE); + if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE) + wkhd = &inodedep->id_bufwait; for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - free_allocdirect(listhead, adp, delay); + TAILQ_REMOVE(listhead, adp, ad_next); + if (wkhd) + WORKLIST_INSERT(wkhd, &adp->ad_list); + else + free_allocdirect(adp); } } /* - * Called from within softdep_disk_write_complete above. Note that - * this routine is always called from interrupt level with further - * splbio interrupts blocked. + * Called from within softdep_disk_write_complete above. This routine + * completes successfully written allocindirs. */ static void handle_allocindir_partdone(aip) @@ -4467,11 +7589,11 @@ handle_allocindir_partdone(aip) if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; - if (aip->ai_buf != NULL) + if (aip->ai_bmsafemap != NULL) panic("handle_allocindir_partdone: dangling dependency"); indirdep = aip->ai_indirdep; + LIST_REMOVE(aip, ai_next); if (indirdep->ir_state & UNDONE) { - LIST_REMOVE(aip, ai_next); LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } @@ -4481,12 +7603,119 @@ handle_allocindir_partdone(aip) else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; - LIST_REMOVE(aip, ai_next); - if (aip->ai_freefrag != NULL) - add_to_worklist(&aip->ai_freefrag->ff_list); - WORKITEM_FREE(aip, D_ALLOCINDIR); + /* + * Await the pointer write. + */ + LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); } +static void +handle_jwork(wkhd) + struct workhead *wkhd; +{ + struct worklist *wk; + + while ((wk = LIST_FIRST(wkhd)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + default: + panic("handle_jwork: Unknown type %s\n", + TYPENAME(wk->wk_type)); + } + } +} + +static struct freefile * +handle_bufwait(inodedep, refhd) + struct inodedep *inodedep; + struct workhead *refhd; +{ + struct jaddref *jaddref; + struct freefile *freefile; + struct worklist *wk; + + freefile = NULL; + while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_FREEFILE: + /* + * We defer adding freefile to the worklist + * until all other additions have been made to + * ensure that it will be done after all the + * old blocks have been freed. + */ + if (freefile != NULL) + panic("handle_bufwait: freefile"); + freefile = WK_FREEFILE(wk); + continue; + + case D_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); + continue; + + case D_DIRADD: + diradd_inode_written(WK_DIRADD(wk), inodedep); + continue; + + case D_FREEFRAG: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE) + continue; + add_to_worklist(wk); + continue; + + case D_DIRREM: + wk->wk_state |= COMPLETE; + add_to_worklist(wk); + continue; + + case D_ALLOCDIRECT: + free_allocdirect(WK_ALLOCDIRECT(wk)); + continue; + + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + + case D_JADDREF: + jaddref = WK_JADDREF(wk); + /* + * We have to remove this journal entry from the + * inode's list as soon as it's written so the + * inodedep can be freed. + */ + if (jaddref->ja_state & ONDEPLIST) { + jaddref->ja_state &= ~ONDEPLIST; + LIST_REMOVE(jaddref, ja_inodeps); + } + /* + * Transfer any jaddrefs to the list to be freed with + * the bitmap if we're handling a removed file. + */ + if (refhd == NULL) { + wk->wk_state |= COMPLETE; + free_jaddref(jaddref); + } else + WORKLIST_INSERT(refhd, wk); + continue; + + case D_JNEWBLK: + wk->wk_state |= COMPLETE; + free_jnewblk(WK_JNEWBLK(wk)); + continue; + + default: + panic("handle_bufwait: Unknown type %p(%s)", + wk, TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + return (freefile); +} /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note @@ -4498,10 +7727,12 @@ handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { - struct worklist *wk, *filefree; + struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; + struct jaddref *jaddref, *tmp; + struct workhead wkhd; int hadchanges, fstype; if ((inodedep->id_state & IOSTARTED) == 0) @@ -4541,49 +7772,64 @@ handle_written_inodeblock(inodedep, bp) * the inode could be updated. */ hadchanges = 0; + /* + * Restore the link count for every jaddref present. + */ + LIST_FOREACH_SAFE(jaddref, &inodedep->id_jaddrefhd, ja_inodeps, tmp) { + hadchanges = 1; + if ((jaddref->ja_state & UNDONE) == 0) + continue; + if (fstype == UFS1) + dp1->di_nlink++; + else + dp2->di_nlink++; + jaddref->ja_state &= ~UNDONE; + jaddref->ja_state |= ATTACHED; + free_jaddref(jaddref); + } for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { - if (adp->ad_lbn < NDADDR) { - if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", - (intmax_t)adp->ad_lbn, - dp1->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, + dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp1->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, - dp1->di_ib[adp->ad_lbn - NDADDR]); - dp1->di_ib[adp->ad_lbn - NDADDR] = + (intmax_t)adp->ad_offset - NDADDR, + dp1->di_ib[adp->ad_offset - NDADDR]); + dp1->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } else { - if (adp->ad_lbn < NDADDR) { - if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) + if (adp->ad_offset < NDADDR) { + if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_db[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_db[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) + if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_lbn - NDADDR, + (intmax_t)adp->ad_offset - NDADDR, (intmax_t) - dp2->di_ib[adp->ad_lbn - NDADDR]); - dp2->di_ib[adp->ad_lbn - NDADDR] = + dp2->di_ib[adp->ad_offset - NDADDR]); + dp2->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } @@ -4595,13 +7841,13 @@ handle_written_inodeblock(inodedep, bp) nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); - if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) + if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", - (intmax_t)adp->ad_lbn, "mismatch", - (intmax_t)dp2->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); - dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; + dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; @@ -4640,10 +7886,11 @@ handle_written_inodeblock(inodedep, bp) /* * Process any allocdirects that completed during the update. */ + LIST_INIT(&wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) - handle_allocdirect_partdone(adp); + handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode @@ -4651,55 +7898,24 @@ handle_written_inodeblock(inodedep, bp) * avoid creation of new triples * before the old ones have been deleted. */ - filefree = NULL; - while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { - WORKLIST_REMOVE(wk); - switch (wk->wk_type) { + freefile = handle_bufwait(inodedep, NULL); + if (freefile && !LIST_EMPTY(&wkhd)) { + WORKLIST_INSERT(&wkhd, &freefile->fx_list); + freefile = NULL; + } + /* + * Move rolled forward dependency completions to the bufwait list + * now that those that were already written have been processed. + */ + if (!LIST_EMPTY(&wkhd) && hadchanges == 0) + panic("handle_written_inodeblock: bufwait but no changes"); + jwork_move(D_INODEDEP, __LINE__, &inodedep->id_bufwait, &wkhd); - case D_FREEFILE: - /* - * We defer adding filefree to the worklist until - * all other additions have been made to ensure - * that it will be done after all the old blocks - * have been freed. - */ - if (filefree != NULL) - panic("handle_written_inodeblock: filefree"); - filefree = wk; - continue; - - case D_MKDIR: - handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); - continue; - - case D_DIRADD: - diradd_inode_written(WK_DIRADD(wk), inodedep); - continue; - - case D_FREEBLKS: - wk->wk_state |= COMPLETE; - if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE) - continue; - /* -- fall through -- */ - case D_FREEFRAG: - case D_DIRREM: - add_to_worklist(wk); - continue; - - case D_NEWDIRBLK: - free_newdirblk(WK_NEWDIRBLK(wk)); - continue; - - default: - panic("handle_written_inodeblock: Unknown type %s", - TYPENAME(wk->wk_type)); - /* NOTREACHED */ - } - } - if (filefree != NULL) { + if (freefile != NULL) { if (free_inodedep(inodedep) == 0) - panic("handle_written_inodeblock: live inodedep"); - add_to_worklist(filefree); + panic("handle_written_inodeblock: live inodedep %p", + inodedep); + add_to_worklist(&freefile->fx_list); return (0); } @@ -4707,12 +7923,84 @@ handle_written_inodeblock(inodedep, bp) * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || - (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && - TAILQ_FIRST(&inodedep->id_extupdt) == 0)) + (LIST_FIRST(&inodedep->id_jaddrefhd) == 0 && + TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && + TAILQ_FIRST(&inodedep->id_extupdt) == 0 && + LIST_FIRST(&inodedep->id_bufwait) == 0)) return (0); return (hadchanges); } +static int +handle_written_indirdep(indirdep, bp, bpp) + struct indirdep *indirdep; + struct buf *bp; + struct buf **bpp; +{ + struct allocindir *aip; + int chgs; + + if (indirdep->ir_state & GOINGAWAY) + panic("disk_write_complete: indirdep gone"); + chgs = 0; + /* + * If there were rollbacks revert them here. + */ + if (indirdep->ir_saveddata) { + bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); + free(indirdep->ir_saveddata, M_INDIRDEP); + indirdep->ir_saveddata = 0; + chgs = 1; + } + indirdep->ir_state &= ~UNDONE; + indirdep->ir_state |= ATTACHED; + /* + * Move allocindirs with written pointers to the completehd if + * the the indirdep's pointer is not yet written. Otherwise + * free them here. + */ + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { + if ((indirdep->ir_state & DEPCOMPLETE) == 0) { + LIST_REMOVE(aip, ai_next); + LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, + ai_next); + continue; + } + free_allocindir(aip, NULL, NULL, 0); + } + /* + * Move allocindirs that have finished dependency processing from + * the done list to the write list after updating the pointers. + */ + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { + handle_allocindir_partdone(aip); + if (aip == LIST_FIRST(&indirdep->ir_donehd)) + panic("disk_write_complete: not gone"); + chgs = 1; + } + /* + * If there are no fresh dependencies and none waiting on writes + * we can free the indirdep. The caller is responsble for + * releasing sbp when it is safe to do so. + */ + if ((indirdep->ir_state & DEPCOMPLETE) && + LIST_EMPTY(&indirdep->ir_deplisthd) && + LIST_EMPTY(&indirdep->ir_writehd)) { + struct buf *sbp; + sbp = indirdep->ir_savebp; + sbp->b_flags |= B_INVAL | B_NOCACHE; + if (indirdep->ir_state & ONDEPLIST) + LIST_REMOVE(indirdep, ir_next); + free_indirdep(indirdep); + *bpp = sbp; + } + if ((bp->b_flags & B_DELWRI) == 0) + stat_indir_blk_ptrs++; + if (chgs) + bdirty(bp); + return (chgs); +} + /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. @@ -4722,50 +8010,187 @@ diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { - struct pagedep *pagedep; dap->da_state |= COMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); - } + complete_diradd(dap); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* - * Handle the completion of a mkdir dependency. + * Returns true if the bmsafemap will have rollbacks when written. Must + * only be called with lk and the buf lock on the cg held. */ +static int +bmsafemap_rollbacks(bmsafemap) + struct bmsafemap *bmsafemap; +{ + + return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | + !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); +} + +/* + * Complete a write to a bmsafemap structure. Roll forward any bitmap + * changes if it's not a background write. Set all written dependencies + * to DEPCOMPLETE and free the structure if possible. + */ +static int +handle_written_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; +{ + struct newblk *newblk; + struct allocindir *aip; + struct allocdirect *adp; + struct inodedep *inodedep; + struct jaddref *jaddref, *jatmp; + struct jnewblk *jnewblk, *jntmp; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + ino_t ino; + long bno; + int chgs; + int i; + + if ((bmsafemap->sm_state & IOSTARTED) == 0) + panic("initiate_write_bmsafemap: Not started\n"); + chgs = 0; + bmsafemap->sm_state &= ~IOSTARTED; + /* + * Restore unwritten inode allocation pending jaddref writes. + */ + if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, + ja_bmdeps, jatmp) { + if ((jaddref->ja_state & UNDONE) == 0) + continue; + ino = jaddref->ja_ino % fs->fs_ipg; + if (isset(inosused, ino)) + panic("handle_written_bmsafemap: " + "re-allocated inode"); + if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir++; + cgp->cg_cs.cs_nifree--; + setbit(inosused, ino); + chgs = 1; + } + jaddref->ja_state &= ~UNDONE; + jaddref->ja_state |= ATTACHED; + free_jaddref(jaddref); + } + } + /* + * Restore any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, + jntmp) { + if ((jnewblk->jn_state & UNDONE) == 0) + continue; + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (bp->b_xflags & BX_BKGRDMARKER) + break; + if ((jnewblk->jn_state & NEWBLOCK) == 0 && + isclr(blksfree, bno + i)) + panic("handle_written_bmsafemap: " + "re-allocated fragment"); + clrbit(blksfree, bno + i); + chgs = 1; + } + jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); + jnewblk->jn_state |= ATTACHED; + free_jnewblk(jnewblk); + } + } + while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { + newblk->nb_state |= DEPCOMPLETE; + newblk->nb_bmsafemap = NULL; + LIST_REMOVE(newblk, nb_deps); + } + while ((adp = LIST_FIRST(&bmsafemap->sm_allocdirectwr))) { + adp->ad_state |= DEPCOMPLETE; + adp->ad_state &= ~ONDEPLIST; + adp->ad_bmsafemap = NULL; + LIST_REMOVE(adp, ad_deps); + handle_allocdirect_partdone(adp, NULL); + } + while ((aip = LIST_FIRST(&bmsafemap->sm_allocindirwr))) { + aip->ai_state |= DEPCOMPLETE; + aip->ai_state &= ~ONDEPLIST; + aip->ai_bmsafemap = NULL; + LIST_REMOVE(aip, ai_deps); + handle_allocindir_partdone(aip); + } + while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { + inodedep->id_state |= DEPCOMPLETE; + inodedep->id_state &= ~ONDEPLIST; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_bmsafemap = NULL; + } + if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && + LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && + LIST_EMPTY(&bmsafemap->sm_newblkhd) && + LIST_EMPTY(&bmsafemap->sm_allocdirecthd) && + LIST_EMPTY(&bmsafemap->sm_allocindirhd) && + LIST_EMPTY(&bmsafemap->sm_inodedephd)) { + if (chgs) + bdirty(bp); + LIST_REMOVE(bmsafemap, sm_hash); + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (0); + } + bdirty(bp); + return (1); +} + +/* + * Try to free a mkdir dependency. + */ static void -handle_written_mkdir(mkdir, type) +complete_mkdir(mkdir) struct mkdir *mkdir; - int type; { struct diradd *dap; - struct pagedep *pagedep; - if (mkdir->md_state != type) - panic("handle_written_mkdir: bad type"); + if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; - dap->da_state &= ~type; - if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + complete_diradd(dap); } - LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* + * Handle the completion of a mkdir dependency. + */ +static void +handle_written_mkdir(mkdir, type) + struct mkdir *mkdir; + int type; +{ + + if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) + panic("handle_written_mkdir: bad type"); + mkdir->md_state |= COMPLETE; + complete_mkdir(mkdir); +} + +/* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. @@ -4790,7 +8215,10 @@ handle_written_filepage(pagedep, bp) */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); + dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_written_filepage: Journal entries not written.")); add_to_worklist(&dirrem->dm_list); } /* @@ -4800,7 +8228,7 @@ handle_written_filepage(pagedep, bp) */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + free_diradd(dap, NULL); /* * Uncommitted directory entries must be restored. */ @@ -4908,6 +8336,7 @@ softdep_update_inodeblock(ip, bp, waitfor) int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; + struct jaddref *jaddref; struct worklist *wk; struct mount *mp; struct buf *ibp; @@ -4922,6 +8351,7 @@ softdep_update_inodeblock(ip, bp, waitfor) */ mp = UFSTOVFS(ip->i_ump); ACQUIRE_LOCK(&lk); +again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); if (ip->i_effnlink != ip->i_nlink) @@ -4931,6 +8361,17 @@ softdep_update_inodeblock(ip, bp, waitfor) if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* + * If we're flushing all dependencies we must also move any waiting + * for journal writes onto the bufwait list prior to I/O. + */ + if (waitfor) + LIST_FOREACH(jaddref, &inodedep->id_jaddrefhd, ja_inodeps) { + if (jaddref->ja_state & (GOINGAWAY | COMPLETE)) + continue; + jwait(&jaddref->ja_list); + goto again; + } + /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ @@ -4945,10 +8386,12 @@ softdep_update_inodeblock(ip, bp, waitfor) */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), + NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) - handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), + NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk @@ -4975,7 +8418,7 @@ retry: FREE_LOCK(&lk); return; } - ibp = inodedep->id_buf; + ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, &lk, MNT_WAIT); if (ibp == NULL) { /* @@ -5007,13 +8450,13 @@ merge_inode_lists(newlisthead, oldlisthead) newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { - if (listadp->ad_lbn < newadp->ad_lbn) { + if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); - if (listadp->ad_lbn == newadp->ad_lbn) { + if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; @@ -5057,12 +8500,11 @@ softdep_fsync(vp) return (0); } if (!LIST_EMPTY(&inodedep->id_inowait) || - !LIST_EMPTY(&inodedep->id_bufwait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) - panic("softdep_fsync: pending ops"); + panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; @@ -5320,9 +8762,11 @@ loop: case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); + if (adp->ad_jnewblk != NULL) + jwait(&adp->ad_jnewblk->jn_list); if (adp->ad_state & DEPCOMPLETE) continue; - nbp = adp->ad_buf; + nbp = adp->ad_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; @@ -5337,9 +8781,11 @@ loop: case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); + if (aip->ai_jnewblk != NULL) + jwait(&aip->ai_jnewblk->jn_list); if (aip->ai_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = aip->ai_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; @@ -5356,9 +8802,13 @@ loop: restart: LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { + if (aip->ai_jnewblk != NULL) { + jwait(&aip->ai_jnewblk->jn_list); + goto restart; + } if (aip->ai_state & DEPCOMPLETE) continue; - nbp = aip->ai_buf; + nbp = aip->ai_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, &lk, MNT_WAIT); if (nbp == NULL) goto restart; @@ -5489,7 +8939,8 @@ loop: BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); - return (0); + return ffs_update(vp, 1); + /* return (0); */ } /* @@ -5502,7 +8953,9 @@ flush_inodedep_deps(mp, ino) ino_t ino; { struct inodedep *inodedep; + struct jaddref *jaddref; int error, waitfor; + int loops = 0; /* * This work is done in two passes. The first pass grabs most @@ -5522,8 +8975,17 @@ flush_inodedep_deps(mp, ino) return (error); FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); +restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); + LIST_FOREACH(jaddref, &inodedep->id_jaddrefhd, ja_inodeps) { + if (jaddref->ja_state & (GOINGAWAY | COMPLETE)) + continue; + if (++loops > 20) + panic("stuck jaddref: %p\n", jaddref); + jwait(&jaddref->ja_list); + goto restart; + } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || @@ -5559,9 +9021,11 @@ flush_deplist(listhead, waitfor, errorp) mtx_assert(&lk, MA_OWNED); TAILQ_FOREACH(adp, listhead, ad_next) { + if (adp->ad_jnewblk != NULL) + jwait(&adp->ad_jnewblk->jn_list); if (adp->ad_state & DEPCOMPLETE) continue; - bp = adp->ad_buf; + bp = adp->ad_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) @@ -5592,6 +9056,7 @@ flush_pagedep_deps(pvp, mp, diraddhdp) struct diraddhd *diraddhdp; { struct inodedep *inodedep; + struct jaddref *jaddref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; @@ -5690,6 +9155,13 @@ flush_pagedep_deps(pvp, mp, diraddhdp) break; } BO_UNLOCK(bo); + /* + * We have to wait for the direct pointers to point + * at the newdirblk before the dependency will go + * away. + */ + if (dap == LIST_FIRST(diraddhdp)) + ffs_update(vp, 1); vput(vp); if (error != 0) break; /* Flushing of first block failed */ @@ -5716,11 +9188,20 @@ retry: if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode"); /* + * Wait for any pending journal adds to complete. + */ + LIST_FOREACH(jaddref, &inodedep->id_jaddrefhd, ja_inodeps) { + if (jaddref->ja_state & (GOINGAWAY | COMPLETE)) + continue; + jwait(&jaddref->ja_list); + goto retry; + } + /* * If the inode still has bitmap dependencies, * push them to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { - bp = inodedep->id_buf; + bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, &lk, MNT_WAIT); if (bp == NULL) goto retry; @@ -5749,8 +9230,11 @@ retry: * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ - if (dap == LIST_FIRST(diraddhdp)) - panic("flush_pagedep_deps: flush failed"); + if (dap == LIST_FIRST(diraddhdp)) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); + panic("flush_pagedep_deps: failed to flush " + "inodep %p ino %d dap %p", inodedep, inum, dap); + } } if (error) ACQUIRE_LOCK(&lk); @@ -6100,10 +9584,13 @@ softdep_count_dependencies(bp, wantcount) int wantcount; { struct worklist *wk; + struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct indirdep *indirdep; + struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; + struct dirrem *dirrem; struct diradd *dap; int i, retval; @@ -6132,6 +9619,12 @@ softdep_count_dependencies(bp, wantcount) if (!wantcount) goto out; } + if (LIST_FIRST(&inodedep->id_jaddrefhd)) { + /* Add reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } continue; case D_INDIRDEP: @@ -6147,6 +9640,14 @@ softdep_count_dependencies(bp, wantcount) case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + if (LIST_FIRST(&dirrem->dm_jremrefhd)) { + /* Journal remove ref dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { @@ -6159,14 +9660,43 @@ softdep_count_dependencies(bp, wantcount) continue; case D_BMSAFEMAP: + bmsafemap = WK_BMSAFEMAP(wk); + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { + /* Add reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { + /* Allocate block dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) { + /* Freeblk journal dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: case D_ALLOCDIRECT: case D_ALLOCINDIR: case D_MKDIR: + case D_JSEG: /* never a dependency on these blocks */ continue; default: - panic("softdep_check_for_rollback: Unexpected type %s", + panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } @@ -6382,6 +9912,43 @@ softdep_error(func, error) #ifdef DDB +static void +inodedep_print(struct inodedep *inodedep, int verbose) +{ + db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d saveino %p\n", + inodedep, inodedep->id_fs, inodedep->id_state, + (intmax_t)inodedep->id_ino, + (intmax_t)fsbtodb(inodedep->id_fs, + ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), + inodedep->id_nlinkdelta, inodedep->id_savedino1); + + if (verbose == 0) + return; + + db_printf("\tpendinghd %p, bufwait %p, inowait %p, jaddrefhd %p, " + "mkdiradd %p\n", + LIST_FIRST(&inodedep->id_pendinghd), + LIST_FIRST(&inodedep->id_bufwait), + LIST_FIRST(&inodedep->id_inowait), + LIST_FIRST(&inodedep->id_jaddrefhd), + inodedep->id_mkdiradd); + db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", + TAILQ_FIRST(&inodedep->id_inoupdt), + TAILQ_FIRST(&inodedep->id_newinoupdt), + TAILQ_FIRST(&inodedep->id_extupdt), + TAILQ_FIRST(&inodedep->id_newextupdt)); +} + +DB_SHOW_COMMAND(inodedep, db_show_inodedep) +{ + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + inodedep_print((struct inodedep*)addr, 1); +} + DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) { struct inodedep_hashhead *inodedephd; @@ -6395,15 +9962,62 @@ DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) LIST_FOREACH(inodedep, inodedephd, id_hash) { if (fs != NULL && fs != inodedep->id_fs) continue; - db_printf("%p fs %p st %x ino %jd inoblk %jd\n", - inodedep, inodedep->id_fs, inodedep->id_state, - (intmax_t)inodedep->id_ino, - (intmax_t)fsbtodb(inodedep->id_fs, - ino_to_fsba(inodedep->id_fs, inodedep->id_ino))); + inodedep_print(inodedep, 0); } } } +DB_SHOW_COMMAND(worklist, db_show_worklist) +{ + struct worklist *wk; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + wk = (struct worklist *)addr; + printf("worklist: %p type %s state 0x%X\n", + wk, TYPENAME(wk->wk_type), wk->wk_state); +} + +DB_SHOW_COMMAND(workhead, db_show_workhead) +{ + struct workhead *wkhd; + struct worklist *wk; + int i; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + wkhd = (struct workhead *)addr; + wk = LIST_FIRST(wkhd); + for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) + db_printf("worklist: %p type %s state 0x%X", + wk, TYPENAME(wk->wk_type), wk->wk_state); + if (i == 100) + db_printf("workhead overflow"); + printf("\n"); +} + + +DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) +{ + struct jaddref *jaddref; + struct diradd *diradd; + struct mkdir *mkdir; + + LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { + diradd = mkdir->md_diradd; + db_printf("mkdir: %p state 0x%X dap %p state 0x%X", + mkdir, mkdir->md_state, diradd, diradd->da_state); + if ((jaddref = mkdir->md_jaddref) != NULL) + db_printf(" jaddref %p jaddref state 0x%X", + jaddref, jaddref->ja_state); + db_printf("\n"); + } +} + #endif /* DDB */ #endif /* SOFTUPDATES */ Index: /usr/src/sys/ufs/ffs/ffs_alloc.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_alloc.c (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_alloc.c (working copy) @@ -89,23 +89,23 @@ __FBSDID("$FreeBSD$"); #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, int cg, ufs2_daddr_t bpref, - int size); + int size, int rsize); -static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int); +static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int, int); static ufs2_daddr_t - ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t); + ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif -static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int); -static void ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *, - ufs1_daddr_t, int); +static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int, + int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, int, ufs2_daddr_t, int, int); static void ffs_fserr(struct fs *, ino_t, char *); static ufs2_daddr_t ffs_hashalloc - (struct inode *, int, ufs2_daddr_t, int, allocfcn_t *); -static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int); + (struct inode *, int, ufs2_daddr_t, int, int, allocfcn_t *); +static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int, + int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); @@ -182,7 +182,7 @@ retry: cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); - bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); if (ip->i_flag & IN_SPACECOUNTED) { @@ -380,16 +380,12 @@ retry: panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } - bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, - ip->i_number); - if (nsize < request) - ffs_blkfree(ump, fs, ip->i_devvp, - bno + numfrags(fs, nsize), - (long)(request - nsize), ip->i_number); + ip->i_number, NULL); delta = btodb(nsize - osize); if (ip->i_flag & IN_SPACECOUNTED) { UFS_LOCK(ump); @@ -493,7 +489,7 @@ ffs_reallocblks_ufs1(ap) struct fs *fs; struct inode *ip; struct vnode *vp; - struct buf *sbp, *ebp; + struct buf *sbp, *ebp, *bp; ufs1_daddr_t *bap, *sbap, *ebap = 0; struct cluster_save *buflist; struct ufsmount *ump; @@ -501,7 +497,7 @@ ffs_reallocblks_ufs1(ap) ufs1_daddr_t soff, newblk, blkno; ufs2_daddr_t pref; struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; - int i, len, start_lvl, end_lvl, ssize; + int i, len, start_lvl, end_lvl, ssize, lvl; vp = ap->a_vp; ip = VTOI(vp); @@ -578,7 +574,7 @@ ffs_reallocblks_ufs1(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -594,11 +590,17 @@ ffs_reallocblks_ufs1(ap) printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif + idp = start_ap; + bp = sbp; blkno = newblk; + lvl = start_lvl - 1; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; - soff = -i; + idp = end_ap; + bp = ebp; + lvl = end_lvl - 1; + idp[lvl].in_off = 0; } #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -614,12 +616,13 @@ ffs_reallocblks_ufs1(ap) if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din1->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, - blkno, *bap, fs->fs_bsize, fs->fs_bsize, - buflist->bs_children[i]); + start_lbn + i, blkno, *bap, fs->fs_bsize, + fs->fs_bsize, buflist->bs_children[i]); else - softdep_setup_allocindir_page(ip, start_lbn + i, - i < ssize ? sbp : ebp, soff + i, blkno, - *bap, buflist->bs_children[i]); + softdep_setup_allocindir_page(ip, bp, idp, + lvl, start_lbn + i, blkno, *bap, + buflist->bs_children[i]); + idp[lvl].in_off++; } *bap++ = blkno; } @@ -664,7 +667,7 @@ ffs_reallocblks_ufs1(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -702,14 +705,14 @@ ffs_reallocblks_ufs2(ap) struct fs *fs; struct inode *ip; struct vnode *vp; - struct buf *sbp, *ebp; + struct buf *sbp, *ebp, *bp; ufs2_daddr_t *bap, *sbap, *ebap = 0; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs2_daddr_t soff, newblk, blkno, pref; struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; - int i, len, start_lvl, end_lvl, ssize; + int i, len, start_lvl, end_lvl, ssize, lvl; vp = ap->a_vp; ip = VTOI(vp); @@ -786,7 +789,7 @@ ffs_reallocblks_ufs2(ap) * Search the block map looking for an allocation of the desired size. */ if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, ffs_clusteralloc)) == 0) { + len, len, ffs_clusteralloc)) == 0) { UFS_UNLOCK(ump); goto fail; } @@ -802,11 +805,17 @@ ffs_reallocblks_ufs2(ap) printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif + idp = start_ap; + bp = sbp; blkno = newblk; + lvl = start_lvl - 1; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; - soff = -i; + idp = end_ap; + bp = ebp; + lvl = end_lvl - 1; + idp[lvl].in_off = 0; } #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -822,12 +831,13 @@ ffs_reallocblks_ufs2(ap) if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din2->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, - blkno, *bap, fs->fs_bsize, fs->fs_bsize, - buflist->bs_children[i]); + start_lbn + i, blkno, *bap, fs->fs_bsize, + fs->fs_bsize, buflist->bs_children[i]); else - softdep_setup_allocindir_page(ip, start_lbn + i, - i < ssize ? sbp : ebp, soff + i, blkno, - *bap, buflist->bs_children[i]); + softdep_setup_allocindir_page(ip, bp, idp, + lvl, start_lbn + i, blkno, *bap, + buflist->bs_children[i]); + idp[lvl].in_off++; } *bap++ = blkno; } @@ -872,7 +882,7 @@ ffs_reallocblks_ufs2(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -959,7 +969,7 @@ ffs_valloc(pvp, mode, cred, vpp) if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } - ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; @@ -1268,11 +1278,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap) */ /*VARARGS5*/ static ufs2_daddr_t -ffs_hashalloc(ip, cg, pref, size, allocator) +ffs_hashalloc(ip, cg, pref, size, rsize, allocator) struct inode *ip; int cg; ufs2_daddr_t pref; - int size; /* size for data blocks, mode for inodes */ + int size; /* Search size for data blocks, mode for inodes */ + int rsize; /* Real allocated size. */ allocfcn_t *allocator; { struct fs *fs; @@ -1288,7 +1299,7 @@ static ufs2_daddr_t /* * 1: preferred cylinder group */ - result = (*allocator)(ip, cg, pref, size); + result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* @@ -1298,7 +1309,7 @@ static ufs2_daddr_t cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } @@ -1309,7 +1320,7 @@ static ufs2_daddr_t */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; @@ -1391,7 +1402,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, + frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); @@ -1409,11 +1421,12 @@ fail: * and if it is, allocate it. */ static ufs2_daddr_t -ffs_alloccg(ip, cg, bpref, size) +ffs_alloccg(ip, cg, bpref, size, rsize) struct inode *ip; int cg; ufs2_daddr_t bpref; int size; + int rsize; { struct fs *fs; struct cg *cgp; @@ -1441,7 +1454,7 @@ static ufs2_daddr_t cgp->cg_old_time = cgp->cg_time = time_second; if (size == fs->fs_bsize) { UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); @@ -1465,21 +1478,14 @@ static ufs2_daddr_t if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); - blkno = ffs_alloccgblk(ip, bp, bpref); - bno = dtogd(fs, blkno); - for (i = frags; i < fs->fs_frag; i++) - setbit(blksfree, bno + i); - i = fs->fs_frag - frags; - cgp->cg_cs.cs_nffree += i; - fs->fs_cstotal.cs_nffree += i; - fs->fs_cs(fs, cg).cs_nffree += i; - fs->fs_fmod = 1; - cgp->cg_frsum[i]++; + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } + KASSERT(size == rsize, + ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; @@ -1497,7 +1503,7 @@ static ufs2_daddr_t ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); @@ -1519,10 +1525,11 @@ fail: * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t -ffs_alloccgblk(ip, bp, bpref) +ffs_alloccgblk(ip, bp, bpref, size) struct inode *ip; struct buf *bp; ufs2_daddr_t bpref; + int size; { struct fs *fs; struct cg *cgp; @@ -1530,6 +1537,7 @@ static ufs2_daddr_t ufs1_daddr_t bno; ufs2_daddr_t blkno; u_int8_t *blksfree; + int i; fs = ip->i_fs; ump = ip->i_ump; @@ -1557,16 +1565,32 @@ static ufs2_daddr_t gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); - ffs_clusteracct(ump, fs, cgp, blkno, -1); + ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; + /* + * If the caller didn't want the whole block free the frags here. + */ + size = numfrags(fs, size); + if (size != fs->fs_frag) { + bno = dtogd(fs, blkno); + for (i = size; i < fs->fs_frag; i++) + setbit(blksfree, bno + i); + i = fs->fs_frag - size; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; + fs->fs_fmod = 1; + cgp->cg_frsum[i]++; + } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, + size, 0); UFS_LOCK(ump); return (blkno); } @@ -1579,11 +1603,12 @@ gotit: * take the first one that we find following bpref. */ static ufs2_daddr_t -ffs_clusteralloc(ip, cg, bpref, len) +ffs_clusteralloc(ip, cg, bpref, len, unused) struct inode *ip; int cg; ufs2_daddr_t bpref; int len; + int unused; { struct fs *fs; struct cg *cgp; @@ -1679,7 +1704,7 @@ static ufs2_daddr_t len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) - if (ffs_alloccgblk(ip, bp, bno + i) != bno + i) + if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); @@ -1703,11 +1728,12 @@ fail: * inode in the specified cylinder group. */ static ufs2_daddr_t -ffs_nodealloccg(ip, cg, ipref, mode) +ffs_nodealloccg(ip, cg, ipref, mode, unused) struct inode *ip; int cg; ufs2_daddr_t ipref; int mode; + int unused; { struct fs *fs; struct cg *cgp; @@ -1810,28 +1836,6 @@ gotit: } /* - * check if a block is free - */ -static int -ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0); - case 4: - return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); - case 2: - return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); - case 1: - return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); - default: - panic("ffs_isfreeblock"); - } - return (0); -} - -/* * Free a block or fragment. * * The specified block or fragment is placed back in the @@ -1839,13 +1843,14 @@ gotit: * block reassembly is checked. */ void -ffs_blkfree(ump, fs, devvp, bno, size, inum) +ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; + struct workhead *dephd; { struct cg *cgp; struct buf *bp; @@ -1912,7 +1917,7 @@ void panic("ffs_blkfree: freeing free block"); } ffs_setblock(fs, blksfree, fragno); - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1952,7 +1957,7 @@ void cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; - ffs_clusteracct(ump, fs, cgp, fragno, 1); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -1961,6 +1966,9 @@ void fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, + numfrags(fs, size), dephd); bdwrite(bp); } @@ -2031,7 +2039,8 @@ ffs_vfree(pvp, ino, mode) return (0); } ip = VTOI(pvp); - return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode)); + return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode, + NULL)); } /* @@ -2039,12 +2048,13 @@ ffs_vfree(pvp, ino, mode) * The specified inode is placed back in the free map. */ int -ffs_freefile(ump, fs, devvp, ino, mode) +ffs_freefile(ump, fs, devvp, ino, mode, wkhd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ino_t ino; int mode; + struct workhead *wkhd; { struct cg *cgp; struct buf *bp; @@ -2100,6 +2110,9 @@ int fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); + if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP) + softdep_setup_inofree(UFSTOVFS(ump), bp, + ino + cg * fs->fs_ipg, wkhd); bdwrite(bp); return (0); } @@ -2213,101 +2226,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz) } /* - * Update the cluster map because of an allocation or free. - * - * Cnt == 1 means free; cnt == -1 means allocating. - */ -void -ffs_clusteracct(ump, fs, cgp, blkno, cnt) - struct ufsmount *ump; - struct fs *fs; - struct cg *cgp; - ufs1_daddr_t blkno; - int cnt; -{ - int32_t *sump; - int32_t *lp; - u_char *freemapp, *mapp; - int i, start, end, forw, back, map, bit; - - mtx_assert(UFS_MTX(ump), MA_OWNED); - - if (fs->fs_contigsumsize <= 0) - return; - freemapp = cg_clustersfree(cgp); - sump = cg_clustersum(cgp); - /* - * Allocate or clear the actual block. - */ - if (cnt > 0) - setbit(freemapp, blkno); - else - clrbit(freemapp, blkno); - /* - * Find the size of the cluster going forward. - */ - start = blkno + 1; - end = start + fs->fs_contigsumsize; - if (end >= cgp->cg_nclusterblks) - end = cgp->cg_nclusterblks; - mapp = &freemapp[start / NBBY]; - map = *mapp++; - bit = 1 << (start % NBBY); - for (i = start; i < end; i++) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != (NBBY - 1)) { - bit <<= 1; - } else { - map = *mapp++; - bit = 1; - } - } - forw = i - start; - /* - * Find the size of the cluster going backward. - */ - start = blkno - 1; - end = start - fs->fs_contigsumsize; - if (end < 0) - end = -1; - mapp = &freemapp[start / NBBY]; - map = *mapp--; - bit = 1 << (start % NBBY); - for (i = start; i > end; i--) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != 0) { - bit >>= 1; - } else { - map = *mapp--; - bit = 1 << (NBBY - 1); - } - } - back = start - i; - /* - * Account for old cluster and the possibly new forward and - * back clusters. - */ - i = back + forw + 1; - if (i > fs->fs_contigsumsize) - i = fs->fs_contigsumsize; - sump[i] += cnt; - if (back > 0) - sump[back] -= cnt; - if (forw > 0) - sump[forw] -= cnt; - /* - * Update cluster summary information. - */ - lp = &sump[fs->fs_contigsumsize]; - for (i = fs->fs_contigsumsize; i > 0; i--) - if (*lp-- > 0) - break; - fs->fs_maxcluster[cgp->cg_cgx] = i; -} - -/* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: @@ -2505,7 +2423,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) #endif /* DEBUG */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, - cmd.value, filetype))) + cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; @@ -2533,7 +2451,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, ROOTINO); + blksize * fs->fs_fsize, ROOTINO, NULL); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; Index: /usr/src/sys/ufs/ffs/ffs_extern.h =================================================================== --- /usr/src/sys/ufs/ffs/ffs_extern.h (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_extern.h (working copy) @@ -56,18 +56,20 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_st struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, - ufs2_daddr_t, long, ino_t); + ufs2_daddr_t, long, ino_t, struct workhead *); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); void ffs_bdflush(struct bufobj *, struct buf *); int ffs_copyonwrite(struct vnode *, struct buf *); int ffs_flushfiles(struct mount *, int, struct thread *); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t, - int); + int, struct workhead *); int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); int ffs_mountroot(void); int ffs_reallocblks(struct vop_reallocblks_args *); @@ -108,7 +110,7 @@ void softdep_initialize(void); void softdep_uninitialize(void); int softdep_mount(struct vnode *, struct mount *, struct fs *, struct ucred *); -void softdep_move_dependencies(struct buf *, struct buf *); +int softdep_move_dependencies(struct buf *, struct buf *); int softdep_flushworklist(struct mount *, int *, struct thread *); int softdep_flushfiles(struct mount *, int, struct thread *); void softdep_update_inodeblock(struct inode *, struct buf *, int); @@ -117,15 +119,21 @@ void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *); void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t); -void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t); -void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t, +void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, + int, int); +void softdep_setup_allocdirect(struct inode *, int, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocindir_meta(struct buf *, struct inode *, - struct buf *, int, ufs2_daddr_t); -void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t, - struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *); + struct buf *, struct indir *, int, ufs2_daddr_t); +void softdep_setup_allocindir_page(struct inode *, struct buf *, + struct indir *, int, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, + struct buf *); +void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int, + struct workhead *); +void softdep_setup_inofree(struct mount *, struct buf *, ino_t, + struct workhead *); void softdep_fsync_mountdev(struct vnode *); int softdep_sync_metadata(struct vnode *); int softdep_process_worklist(struct mount *, int); Index: /usr/src/sys/ufs/ffs/ffs_subr.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_subr.c (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_subr.c (working copy) @@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$"); #ifndef _KERNEL #include #include -#include "fsck.h" #else #include #include @@ -223,12 +222,43 @@ ffs_isblock(fs, cp, h) mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: +#ifdef _KERNEL panic("ffs_isblock"); +#endif + break; } return (0); } /* + * check if a block is free + */ +int +ffs_isfreeblock(fs, cp, h) + struct fs *fs; + u_char *cp; + ufs1_daddr_t h; +{ + + switch ((int)fs->fs_frag) { + case 8: + return (cp[h] == 0); + case 4: + return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); + case 2: + return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); + case 1: + return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); + default: +#ifdef _KERNEL + panic("ffs_isfreeblock"); +#endif + break; + } + return (0); +} + +/* * take a block out of the map */ void @@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h) cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: +#ifdef _KERNEL panic("ffs_clrblock"); +#endif + break; } } @@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h) cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: +#ifdef _KERNEL panic("ffs_setblock"); +#endif + break; } } + +/* + * Update the cluster map because of an allocation or free. + * + * Cnt == 1 means free; cnt == -1 means allocating. + */ +void +ffs_clusteracct(fs, cgp, blkno, cnt) + struct fs *fs; + struct cg *cgp; + ufs1_daddr_t blkno; + int cnt; +{ + int32_t *sump; + int32_t *lp; + u_char *freemapp, *mapp; + int i, start, end, forw, back, map, bit; + + if (fs->fs_contigsumsize <= 0) + return; + freemapp = cg_clustersfree(cgp); + sump = cg_clustersum(cgp); + /* + * Allocate or clear the actual block. + */ + if (cnt > 0) + setbit(freemapp, blkno); + else + clrbit(freemapp, blkno); + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if (end >= cgp->cg_nclusterblks) + end = cgp->cg_nclusterblks; + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + sump[i] += cnt; + if (back > 0) + sump[back] -= cnt; + if (forw > 0) + sump[forw] -= cnt; + /* + * Update cluster summary information. + */ + lp = &sump[fs->fs_contigsumsize]; + for (i = fs->fs_contigsumsize; i > 0; i--) + if (*lp-- > 0) + break; + fs->fs_maxcluster[cgp->cg_cgx] = i; +} Index: /usr/src/sys/ufs/ffs/softdep.h =================================================================== --- /usr/src/sys/ufs/ffs/softdep.h (revision 200565) +++ /usr/src/sys/ufs/ffs/softdep.h (working copy) @@ -98,18 +98,20 @@ #define UNDONE 0x0002 #define COMPLETE 0x0004 #define DEPCOMPLETE 0x0008 -#define MKDIR_PARENT 0x0010 /* diradd & mkdir only */ -#define MKDIR_BODY 0x0020 /* diradd & mkdir only */ +#define MKDIR_PARENT 0x0010 /* diradd, mkdir, jaddref, jsegdep only */ +#define MKDIR_BODY 0x0020 /* diradd, mkdir, jaddref only */ #define RMDIR 0x0040 /* dirrem only */ -#define DIRCHG 0x0080 /* diradd & dirrem only */ -#define GOINGAWAY 0x0100 /* indirdep only */ -#define IOSTARTED 0x0200 /* inodedep & pagedep only */ +#define DIRCHG 0x0080 /* diradd, dirrem only */ +#define GOINGAWAY 0x0100 /* indirdep, jremref only */ +#define IOSTARTED 0x0200 /* inodedep, pagedep, bmsafemap only */ #define SPACECOUNTED 0x0400 /* inodedep only */ -#define NEWBLOCK 0x0800 /* pagedep only */ +#define NEWBLOCK 0x0800 /* pagedep, jaddref only */ #define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */ #define UFS1FMT 0x2000 /* indirdep only */ #define EXTDATA 0x4000 /* allocdirect only */ #define ONWORKLIST 0x8000 +#define IOWAITING 0x10000 /* Thread is waiting for IO to complete. */ +#define ONDEPLIST 0x20000 /* Structure is on a dependency list. */ #define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) @@ -135,10 +137,10 @@ * and the macros below changed to use it. */ struct worklist { + LIST_ENTRY(worklist) wk_list; /* list of work requests */ struct mount *wk_mp; /* Mount we live in */ - LIST_ENTRY(worklist) wk_list; /* list of work requests */ - unsigned short wk_type; /* type of request */ - unsigned short wk_state; /* state flags */ + unsigned int wk_type:8, /* type of request */ + wk_state:24; /* state flags */ }; #define WK_DATA(wk) ((void *)(wk)) #define WK_PAGEDEP(wk) ((struct pagedep *)(wk)) @@ -149,11 +151,20 @@ struct worklist { #define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk)) #define WK_FREEFRAG(wk) ((struct freefrag *)(wk)) #define WK_FREEBLKS(wk) ((struct freeblks *)(wk)) +#define WK_FREEWORK(wk) ((struct freework *)(wk)) #define WK_FREEFILE(wk) ((struct freefile *)(wk)) #define WK_DIRADD(wk) ((struct diradd *)(wk)) #define WK_MKDIR(wk) ((struct mkdir *)(wk)) #define WK_DIRREM(wk) ((struct dirrem *)(wk)) #define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk)) +#define WK_JADDREF(wk) ((struct jaddref *)(wk)) +#define WK_JREMREF(wk) ((struct jremref *)(wk)) +#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk)) +#define WK_JSEG(wk) ((struct jseg *)(wk)) +#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk)) +#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk)) +#define WK_FREEDEP(wk) ((struct freedep *)(wk)) +#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk)) /* * Various types of lists @@ -165,6 +176,13 @@ LIST_HEAD(inodedephd, inodedep); LIST_HEAD(allocindirhd, allocindir); LIST_HEAD(allocdirecthd, allocdirect); TAILQ_HEAD(allocdirectlst, allocdirect); +LIST_HEAD(indirdephd, indirdep); +LIST_HEAD(jremrefhd, jremref); +LIST_HEAD(jaddrefhd, jaddref); +LIST_HEAD(jnewblkhd, jnewblk); +LIST_HEAD(jfreeblkhd, jfreeblk); +LIST_HEAD(freeworkhd, freework); +TAILQ_HEAD(jseglst, jseg); /* * The "pagedep" structure tracks the various dependencies related to @@ -192,6 +210,7 @@ struct pagedep { LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */ ino_t pd_ino; /* associated file */ ufs_lbn_t pd_lbn; /* block within file */ + struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */ struct dirremhd pd_dirremhd; /* dirrem's waiting for page */ struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */ struct diraddhd pd_pendinghd; /* directory entries awaiting write */ @@ -252,7 +271,9 @@ struct inodedep { ino_t id_ino; /* dependent inode */ nlink_t id_nlinkdelta; /* saved effective link count */ LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */ - struct buf *id_buf; /* related bmsafemap (if pending) */ + struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */ + struct diradd *id_mkdiradd; /* diradd for a mkdir. */ + struct jaddrefhd id_jaddrefhd; /* Journal add refs pending. */ long id_savedextsize; /* ext size saved during rollback */ off_t id_savedsize; /* file size saved during rollback */ struct workhead id_pendinghd; /* entries awaiting directory write */ @@ -283,8 +304,10 @@ struct newblk { struct fs *nb_fs; /* associated filesystem */ int nb_state; /* state of bitmap dependency */ ufs2_daddr_t nb_newblkno; /* allocated block number */ + /* XXX nb_deps not needed with journaling. */ LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */ struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */ + struct jnewblk *nb_jnewblk; /* Journal entry. */ }; /* @@ -299,11 +322,21 @@ struct newblk { */ struct bmsafemap { struct worklist sm_list; /* cylgrp buffer */ +# define sm_state sm_list.wk_state + int sm_cg; + LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */ struct buf *sm_buf; /* associated buffer */ struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */ + struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */ struct allocindirhd sm_allocindirhd; /* allocindir deps */ + struct allocindirhd sm_allocindirwr; /* writing allocindir deps */ struct inodedephd sm_inodedephd; /* inodedep deps */ - struct newblkhd sm_newblkhd; /* newblk deps */ + struct inodedephd sm_inodedepwr; /* writing inodedep deps */ + /* XXX newblk* not needed with journaling. */ + struct newblkhd sm_newblkhd; /* newblk deps, unused with journal */ + struct newblkhd sm_newblkwr; /* writing newblk deps */ + struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */ + struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */ }; /* @@ -337,16 +370,19 @@ struct allocdirect { struct worklist ad_list; /* buffer holding block */ # define ad_state ad_list.wk_state /* block pointer state */ TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */ - ufs_lbn_t ad_lbn; /* block within file */ + int ad_offset; /* inode disk address offset */ ufs2_daddr_t ad_newblkno; /* new value of block pointer */ ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ long ad_newsize; /* size of new block */ long ad_oldsize; /* size of old block */ LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */ - struct buf *ad_buf; /* cylgrp buffer (if pending) */ + struct bmsafemap *ad_bmsafemap;/* cylgrp dep (if pending) */ struct inodedep *ad_inodedep; /* associated inodedep */ struct freefrag *ad_freefrag; /* fragment to be freed (if any) */ + struct indirdephd ad_indirdeps; /* Children indirect blocks. */ struct workhead ad_newdirblk; /* dir block to notify when written */ + struct jnewblk *ad_jnewblk; /* New block journal entry. */ + struct workhead ad_jwork; /* Journal work pending. */ }; /* @@ -369,10 +405,14 @@ struct allocdirect { struct indirdep { struct worklist ir_list; /* buffer holding indirect block */ # define ir_state ir_list.wk_state /* indirect block pointer state */ - caddr_t ir_saveddata; /* buffer cache contents */ + LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */ + caddr_t ir_saveddata; /* buffer cache contents */ struct buf *ir_savebp; /* buffer holding safe copy */ + struct allocindirhd ir_completehd; /* waiting for indirdep complete */ + struct allocindirhd ir_writehd; /* Waiting for the pointer write. */ struct allocindirhd ir_donehd; /* done waiting to update safecopy */ struct allocindirhd ir_deplisthd; /* allocindir deps for this block */ + struct workhead ir_jwork; /* Journal work pending. */ }; /* @@ -398,7 +438,11 @@ struct allocindir { struct freefrag *ai_freefrag; /* block to be freed when complete */ struct indirdep *ai_indirdep; /* address of associated indirdep */ LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */ - struct buf *ai_buf; /* cylgrp buffer (if pending) */ + struct bmsafemap *ai_bmsafemap;/* cylgrp dep (if pending) */ + struct indirdephd ai_indirdeps;/* Children indirect blocks. */ + struct workhead ai_newdirblk; /* dir block to notify when written */ + struct jnewblk *ai_jnewblk; /* New block journal entry. */ + struct workhead ai_jwork; /* Journal work pending. */ }; /* @@ -406,14 +450,13 @@ struct allocindir { * allocated fragment is replaced with a larger fragment, rather than extended. * The "freefrag" structure is constructed and attached when the replacement * block is first allocated. It is processed after the inode claiming the - * bigger block that replaces it has been written to disk. Note that the - * ff_state field is is used to store the uid, so may lose data. However, - * the uid is used only in printing an error message, so is not critical. - * Keeping it in a short keeps the data structure down to 32 bytes. + * bigger block that replaces it has been written to disk. */ struct freefrag { struct worklist ff_list; /* id_inowait or delayed worklist */ -# define ff_state ff_list.wk_state /* owning user; should be uid_t */ +# define ff_state ff_list.wk_state + struct jfreefrag *ff_jfreefrag; /* Associated journal entry. */ + struct workhead ff_jwork; /* Journal work pending. */ ufs2_daddr_t ff_blkno; /* fragment physical block number */ long ff_fragsize; /* size of fragment being deleted */ ino_t ff_inum; /* owning inode number */ @@ -423,23 +466,59 @@ struct freefrag { * A "freeblks" structure is attached to an "inodedep" when the * corresponding file's length is reduced to zero. It records all * the information needed to free the blocks of a file after its - * zero'ed inode has been written to disk. + * zero'ed inode has been written to disk. The actual work is done + * by child freework structures which are responsible for individual + * inode pointers while freeblks is responsible for retiring the + * entire operation when it is complete and holding common members. */ struct freeblks { struct worklist fb_list; /* id_inowait or delayed worklist */ # define fb_state fb_list.wk_state /* inode and dirty block state */ + struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */ + struct workhead fb_freeworkhd; /* Work items pending */ + struct workhead fb_jwork; /* Journal work pending */ ino_t fb_previousinum; /* inode of previous owner of blocks */ uid_t fb_uid; /* uid of previous owner of blocks */ struct vnode *fb_devvp; /* filesystem device vnode */ - long fb_oldextsize; /* previous ext data size */ - off_t fb_oldsize; /* previous file size */ ufs2_daddr_t fb_chkcnt; /* used to check cnt of blks released */ - ufs2_daddr_t fb_dblks[NDADDR]; /* direct blk ptrs to deallocate */ - ufs2_daddr_t fb_iblks[NIADDR]; /* indirect blk ptrs to deallocate */ - ufs2_daddr_t fb_eblks[NXADDR]; /* indirect blk ptrs to deallocate */ + int fb_ref; /* Children outstanding. */ }; /* + * A "freework" structure handles the release of a tree of blocks or a single + * block. Each indirect block in a tree is allocated its own freework + * structure so that the indrect block may be freed only when all of its + * children are freed. In this way we enforce the rule that an allocated + * block must have a valid path to a root that is journaled. Each child + * block acquires a reference and when the ref hits zero the parent ref + * is decremented. If there is no parent the freeblks ref is decremented. + */ +struct freework { + struct worklist fw_list; +# define fw_state fw_list.wk_state + LIST_ENTRY(freework) fw_next; /* Queue for freeblksk. */ + struct freeblks *fw_freeblks; /* Root of operation. */ + struct freework *fw_parent; /* Parent indirect. */ + ufs2_daddr_t fw_blkno; /* Our block #. */ + ufs_lbn_t fw_lbn; /* Original lbn before free. */ + int fw_frags; /* Number of frags. */ + int fw_ref; /* Number of children out. */ + struct workhead fw_jwork; /* Journal work pending. */ +}; + +/* + * A "freedep" structure is allocated to track the completion of a bitmap + * write for a freework. One freedep may cover many freed blocks so long + * as they reside in the same cylinder group. When the cg is written + * the freedep decrements the ref on the freework which may permit it + * to be freed as well. + */ +struct freedep { + struct worklist fd_list; + struct freework *fd_freework; /* Parent freework. */ +}; + +/* * A "freefile" structure is attached to an inode when its * link count is reduced to zero. It marks the inode as free in * the cylinder group map after the zero'ed inode has been written @@ -450,6 +529,7 @@ struct freefile { mode_t fx_mode; /* mode of inode */ ino_t fx_oldinum; /* inum of the unlinked file */ struct vnode *fx_devvp; /* filesystem device vnode */ + struct workhead fx_jwork; /* journal work pending. */ }; /* @@ -482,12 +562,11 @@ struct freefile { * than zero. * * The overlaying of da_pagedep and da_previous is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. If a - * da_previous entry is present, the pointer to its pagedep is available - * in the associated dirrem entry. If the DIRCHG flag is set, the - * da_previous entry is valid; if not set the da_pagedep entry is valid. - * The DIRCHG flag never changes; it is set when the structure is created - * if appropriate and is never cleared. + * structure down. If a da_previous entry is present, the pointer to its + * pagedep is available in the associated dirrem entry. If the DIRCHG flag + * is set, the da_previous entry is valid; if not set the da_pagedep entry + * is valid. The DIRCHG flag never changes; it is set when the structure + * is created if appropriate and is never cleared. */ struct diradd { struct worklist da_list; /* id_inowait or id_pendinghd list */ @@ -499,6 +578,7 @@ struct diradd { struct dirrem *dau_previous; /* entry being replaced in dir change */ struct pagedep *dau_pagedep; /* pagedep dependency for addition */ } da_un; + struct workhead da_jwork; /* Journal work awaiting completion. */ }; #define da_previous da_un.dau_previous #define da_pagedep da_un.dau_pagedep @@ -525,12 +605,13 @@ struct diradd { * mkdir structures that reference it. The deletion would be faster if the * diradd structure were simply augmented to have two pointers that referenced * the associated mkdir's. However, this would increase the size of the diradd - * structure from 32 to 64-bits to speed a very infrequent operation. + * structure to speed a very infrequent operation. */ struct mkdir { struct worklist md_list; /* id_inowait or buffer holding dir */ # define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */ struct diradd *md_diradd; /* associated diradd */ + struct jaddref *md_jaddref; /* dependent jaddref. */ struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */ LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */ }; @@ -542,20 +623,18 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; * list of the pagedep for the directory page that contains the entry. * It is processed after the directory page with the deleted entry has * been written to disk. - * - * The overlaying of dm_pagedep and dm_dirinum is done to keep the - * structure down to 32 bytes in size on a 32-bit machine. It works - * because they are never used concurrently. */ struct dirrem { struct worklist dm_list; /* delayed worklist */ # define dm_state dm_list.wk_state /* state of the old directory entry */ LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */ + struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */ ino_t dm_oldinum; /* inum of the removed dir entry */ union { struct pagedep *dmu_pagedep; /* pagedep dependency for remove */ ino_t dmu_dirinum; /* parent inode number (for rmdir) */ } dm_un; + struct workhead dm_jwork; /* Journal work awaiting completion. */ }; #define dm_pagedep dm_un.dmu_pagedep #define dm_dirinum dm_un.dmu_dirinum @@ -577,9 +656,156 @@ struct dirrem { * blocks using a similar scheme with the allocindir structures. Rather * than adding this level of complexity, we simply write those newly * allocated indirect blocks synchronously as such allocations are rare. + * In the case of a new directory the . and .. links are tracked with + * a mkdir rather than a pagedep. In this case we track the mkdir + * so it can be released when it is written. A workhead is used + * to simplify canceling a mkdir that is removed by a subsequent dirrem. */ struct newdirblk { struct worklist db_list; /* id_inowait or pg_newdirblk */ # define db_state db_list.wk_state /* unused */ struct pagedep *db_pagedep; /* associated pagedep */ + struct workhead db_mkdir; }; + +/* + * A "jaddref" structure tracks a new reference (link count) on an inode + * and prevents the link count increase and bitmap allocation until a + * journal entry can be written. Once the journal entry is written, + * the inode is put on the pendinghd of the bmsafemap and a diradd or + * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE + * flag is used to indicate that all of the required information for writing + * the journal entry is present. MKDIR_BODY and MKDIR_PARENT are used to + * differentiate . and .. links from regular file names. NEWBLOCK indicates + * a bitmap is still pending. If a new reference is canceled by a delete + * prior to writing the journal the jaddref write is canceled and the + * structure persists to prevent any disk-visible changes until it is + * ultimately released when the file is freed or the link is dropped again. + */ +struct jaddref { + struct worklist ja_list; /* Journal pending or jseg entries. */ +# define ja_state ja_list.wk_state + union { + struct diradd *jau_diradd; /* Pending diradd. */ + struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */ + } ja_un; + LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */ + LIST_ENTRY(jaddref) ja_inodeps; /* Links for inodedep. */ + off_t ja_diroff; /* Directory offset. */ + ino_t ja_ino; /* Inode number. */ + ino_t ja_parent; /* Parent inode number. */ + int16_t ja_nlink; /* nlink before addition. */ + uint16_t ja_mode; /* File mode, needed for IFMT. */ +}; +#define ja_diradd ja_un.jau_diradd +#define ja_mkdir ja_un.jau_mkdir + +/* + * A "jremref" structure tracks a removed reference (unlink) on an + * inode and prevents the directory remove from proceeding until the + * journal entry is written. Once the journal has been written the remove + * may proceed as normal. + */ +struct jremref { + struct worklist jr_list; /* Journal pending or jseg entries. */ +# define jr_state jr_list.wk_state + struct dirrem *jr_dirrem; /* Back pointer to dirrem. */ + LIST_ENTRY(jremref) jr_deps; /* Links for pagdep. */ + off_t jr_diroff; /* Directory offset. */ + ino_t jr_ino; /* Inode number. */ + ino_t jr_parent; /* Parent inode number. */ + int16_t jr_nlink; /* nlink before the removal. */ + uint16_t jr_mode; /* File mode, needed for IFMT. */ +}; + +/* + * A "jnewblk" structure tracks a newly allocated block or fragment and + * prevents the direct or indirect block pointer as well as the cg bitmap + * from being written until it is logged. After it is logged the jsegdep + * is attached to the allocdirect or allocindir until the operation is + * completed or reverted. If the operation is reverted prior to the journal + * write the jnewblk structure is maintained to prevent the bitmaps from + * reaching the disk. Ultimately the jnewblk structure will be passed + * to the free routine as the in memory cg is modified back to the free + * state at which time it can be released. + */ +struct jnewblk { + struct worklist jn_list; +# define jn_state jn_list.wk_state + LIST_ENTRY(jnewblk) jn_deps; /* All jnewblks on bmsafemap */ + union { + struct newblk *jnu_newblk; /* newblk if not DEPCOMPLETE */ + struct worklist *jnu_dep; /* allocdirect or allocindir */ + } jn_un; + ino_t jn_ino; + ufs_lbn_t jn_lbn; + ufs2_daddr_t jn_blkno; + int jn_oldfrags; + int jn_frags; +}; +#define jn_newblk jn_un.jnu_newblk +#define jn_dep jn_un.jnu_dep + +/* + * A "jfreeblk" structure tracks the journal write for freeing a block + * or tree of blocks. The block pointer must not be cleared in the inode + * or indirect prior to the jfreeblk being written. + */ +struct jfreeblk { + struct worklist jf_list; +# define jf_state jf_list.wk_state + struct freeblks *jf_freeblks; + LIST_ENTRY(jfreeblk) jf_deps; + ino_t jf_ino; + ufs_lbn_t jf_lbn; + ufs2_daddr_t jf_blkno; + int jf_frags; +}; + +/* + * A "jfreefrag" tracks the freeing of a single block when a fragment is + * extended or an indirect page is replaced. It is not part of a larger + * freeblks operation. + */ +struct jfreefrag { + struct worklist fr_list; +# define fr_state fr_list.wk_state + struct freefrag *fr_freefrag; + ino_t fr_ino; + ufs_lbn_t fr_lbn; + ufs2_daddr_t fr_blkno; + int fr_frags; +}; + +/* + * A "jsegdep" structure tracks a single reference to a written journal + * segment so the journal space can be reclaimed when all dependencies + * have been written. + */ +struct jsegdep { + struct worklist jd_list; +# define jd_state jd_list.wk_state + struct jseg *jd_seg; + short jd_type; + short jd_line; +}; + +/* + * A "jseg" structure contains all of the journal records written in a + * single disk write. jaddref and jremref structures are linked into + * js_entries so thay may be completed when the write completes. The + * js_deps array contains as many entries as there are ref counts to + * reduce the number of allocations required per journal write to one. + */ +struct jseg { + struct worklist js_list; /* b_deps link for journal */ + struct workhead js_entries; /* Entries awaiting write */ + TAILQ_ENTRY(jseg) js_next; + struct jblocks *js_jblocks; /* Back pointer to block/seg list */ + struct buf *js_buf; /* Buffer while unwritten */ + uint64_t js_seq; + int js_size; /* Allocated size in bytes */ + int js_cnt; /* Total items allocated */ + int js_refs; /* Count of items pending completion */ + struct jsegdep js_deps[0]; /* Dependencies for completion */ +}; Index: /usr/src/sys/ufs/ffs/ffs_balloc.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_balloc.c (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_balloc.c (working copy) @@ -138,7 +138,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse if (error) return (error); if (DOINGSOFTDEP(vp)) - softdep_setup_allocdirect(ip, nb, + softdep_setup_allocdirect(ip, nb, nb, dbtofsb(fs, bp->b_blkno), dp->di_db[nb], fs->fs_bsize, osize, bp); ip->i_size = smalllblktosize(fs, nb + 1); @@ -190,7 +190,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse if (error) return (error); if (DOINGSOFTDEP(vp)) - softdep_setup_allocdirect(ip, lbn, + softdep_setup_allocdirect(ip, lbn, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } @@ -210,7 +210,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) - softdep_setup_allocdirect(ip, lbn, newb, 0, + softdep_setup_allocdirect(ip, lbn, lbn, newb, 0, nsize, 0, bp); } dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); @@ -255,7 +255,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, - newb, 0, fs->fs_bsize, 0, bp); + indirs[0].in_lbn, newb, 0, fs->fs_bsize, 0, bp); bdwrite(bp); } else { /* @@ -305,8 +305,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) { - softdep_setup_allocindir_meta(nbp, ip, bp, - indirs[i - 1].in_off, nb); + softdep_setup_allocindir_meta(nbp, ip, bp, indirs, i, + nb); bdwrite(nbp); } else { /* @@ -361,8 +361,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffse if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) - softdep_setup_allocindir_page(ip, lbn, bp, - indirs[i].in_off, nb, 0, nbp); + softdep_setup_allocindir_page(ip, bp, indirs, i, lbn, + nb, 0, nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use @@ -418,6 +418,8 @@ fail: * slow, running out of disk space is not expected to be a common * occurence. The error return from fsync is ignored as we already * have an error to return to the user. + * + * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; @@ -473,7 +475,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); } return (error); } @@ -643,7 +645,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse if (error) return (error); if (DOINGSOFTDEP(vp)) - softdep_setup_allocdirect(ip, nb, + softdep_setup_allocdirect(ip, nb, nb, dbtofsb(fs, bp->b_blkno), dp->di_db[nb], fs->fs_bsize, osize, bp); @@ -696,7 +698,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse if (error) return (error); if (DOINGSOFTDEP(vp)) - softdep_setup_allocdirect(ip, lbn, + softdep_setup_allocdirect(ip, lbn, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } @@ -716,7 +718,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) - softdep_setup_allocdirect(ip, lbn, newb, 0, + softdep_setup_allocdirect(ip, lbn, lbn, newb, 0, nsize, 0, bp); } dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); @@ -761,7 +763,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, - newb, 0, fs->fs_bsize, 0, bp); + indirs[0].in_lbn, newb, 0, fs->fs_bsize, 0, bp); bdwrite(bp); } else { /* @@ -811,8 +813,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) { - softdep_setup_allocindir_meta(nbp, ip, bp, - indirs[i - 1].in_off, nb); + softdep_setup_allocindir_meta(nbp, ip, bp, indirs, i, + nb); bdwrite(nbp); } else { /* @@ -867,8 +869,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffse if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) - softdep_setup_allocindir_page(ip, lbn, bp, - indirs[i].in_off, nb, 0, nbp); + softdep_setup_allocindir_page(ip, bp, indirs, i, lbn, + nb, 0, nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use @@ -930,6 +932,8 @@ fail: * slow, running out of disk space is not expected to be a common * occurence. The error return from fsync is ignored as we already * have an error to return to the user. + * + * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; @@ -985,7 +989,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); } return (error); } Index: /usr/src/sys/ufs/ffs/ffs_inode.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_inode.c (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_inode.c (working copy) @@ -232,7 +232,7 @@ ffs_truncate(vp, length, flags, cred, td) if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i], - sblksize(fs, osize, i), ip->i_number); + sblksize(fs, osize, i), ip->i_number, NULL); } } } @@ -336,6 +336,8 @@ ffs_truncate(vp, length, flags, cred, td) * zero'ed in case it ever becomes accessible again because * of subsequent file growth. Directories however are not * zero'ed as they should grow back initialized to empty. + * + * XXX Still need to manually journal this. */ offset = blkoff(fs, length); if (offset == 0) { @@ -445,7 +447,7 @@ ffs_truncate(vp, length, flags, cred, td) if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ip->i_devvp, bn, - fs->fs_bsize, ip->i_number); + fs->fs_bsize, ip->i_number, NULL); blocksreleased += nblocks; } } @@ -464,7 +466,8 @@ ffs_truncate(vp, length, flags, cred, td) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); - ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number); + ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number, + NULL); blocksreleased += btodb(bsize); } if (lastblock < 0) @@ -496,7 +499,7 @@ ffs_truncate(vp, length, flags, cred, td) */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ip->i_devvp, bn, - oldspace - newspace, ip->i_number); + oldspace - newspace, ip->i_number, NULL); blocksreleased += btodb(oldspace - newspace); } } @@ -638,7 +641,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp blocksreleased += blkcount; } ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize, - ip->i_number); + ip->i_number, NULL); blocksreleased += nblocks; } Index: /usr/src/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- /usr/src/sys/ufs/ffs/ffs_snapshot.c (revision 200565) +++ /usr/src/sys/ufs/ffs/ffs_snapshot.c (working copy) @@ -582,7 +582,8 @@ loop: len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, - DIP(xp, i_db[loc]), len, xp->i_number); + DIP(xp, i_db[loc]), len, xp->i_number, + NULL); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } @@ -598,7 +599,7 @@ loop: DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, - xp->i_mode); + xp->i_mode, NULL); VOP_UNLOCK(xvp, 0); vdrop(xvp); if (error) { @@ -700,7 +701,7 @@ out1: copy_fs, vp, xp->i_number, - xp->i_mode); + xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; @@ -1220,7 +1221,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, ex *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); } return (0); } @@ -1500,7 +1501,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, ex *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); } return (0); } Index: /usr/src/sys/ufs/ffs/fs.h =================================================================== --- /usr/src/sys/ufs/ffs/fs.h (revision 200565) +++ /usr/src/sys/ufs/ffs/fs.h (working copy) @@ -337,7 +337,9 @@ struct fs { int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */ - int32_t fs_sparecon32[26]; /* reserved for future constants */ + int32_t fs_sujournal; /* SUJ journal file */ + int32_t fs_sujfree; /* SUJ free list */ + int32_t fs_sparecon32[24]; /* reserved for future constants */ int32_t fs_flags; /* see FS_ flags below */ int32_t fs_contigsumsize; /* size of cluster summary array */ int32_t fs_maxsymlinklen; /* max length of an internal symlink */ @@ -409,6 +411,7 @@ CTASSERT(sizeof(struct fs) == 1376); #define FS_MULTILABEL 0x20 /* file system is MAC multi-label */ #define FS_GJOURNAL 0x40 /* gjournaled file system */ #define FS_FLAGS_UPDATED 0x80 /* flags have been moved to new location */ +#define FS_SUJ 0x100 /* Filesystem using softupdate journal */ /* * Macros to access bits in the fs_active array. @@ -598,8 +601,32 @@ struct cg { ? (fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (size))))) - /* + * Indirect lbns are aligned on NDADDR addresses where single indirects + * are the negated address of the lowest lbn reachable, double indirects + * are this lbn - 1 and triple indirects are this lbn - 2. This yields + * an unusual bit order to determine level. + */ +static inline int +lbn_level(ufs_lbn_t lbn) +{ + if (lbn >= 0) + return 0; + switch (lbn & 0x3) { + case 0: + return (0); + case 1: + break; + case 2: + return (2); + case 3: + return (1); + default: + break; + } + return (-1); +} +/* * Number of inodes in a secondary storage block/fragment. */ #define INOPB(fs) ((fs)->fs_inopb) @@ -610,6 +637,66 @@ struct cg { */ #define NINDIR(fs) ((fs)->fs_nindir) +/* + * Softdep journal record format. + */ + +#define JOP_ADDREF 1 /* Add a reference to an inode. */ +#define JOP_REMREF 2 /* Remove a reference from an inode. */ +#define JOP_NEWBLK 3 /* Allocate a block. */ +#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */ + +#define JREC_SIZE 32 /* Record and segment header size. */ + +#define SUJ_MIN (1 * 1024 * 1024) /* Minimum journal size */ +#define SUJ_MAX (128 * SUJ_MIN) /* Maximum journal size */ + +/* + * Size of the segment record header. There is at most one for each disk + * block and at least one for each filesystem block in the journal. The + * segment header is followed by an array of records. + */ +struct jsegrec { + uint64_t jsr_seq; /* Our sequence number */ + uint64_t jsr_oldest; /* Oldest valid sequence number */ + uint32_t jsr_cnt; /* Count of valid records */ + uint32_t jsr_crc; /* 32bit crc of the valid space */ + uint64_t jsr_unused; +}; + +struct jrefrec { + uint32_t jr_op; + ino_t jr_ino; + ino_t jr_parent; + int16_t jr_nlink; + uint16_t jr_mode; + off_t jr_diroff; + uint64_t jr_unused; +}; + +struct jblkrec { + uint32_t jb_op; + uint32_t jb_ino; + ufs2_daddr_t jb_blkno; + ufs_lbn_t jb_lbn; + uint16_t jb_frags; + uint16_t jb_oldfrags; + uint32_t jb_unused; +}; + +union jrec { + struct jsegrec rec_jsegrec; + struct jrefrec rec_jrefrec; + struct jblkrec rec_jblkrec; +}; + +#ifdef CTASSERT +CTASSERT(sizeof(struct jsegrec) == JREC_SIZE); +CTASSERT(sizeof(struct jrefrec) == JREC_SIZE); +CTASSERT(sizeof(struct jblkrec) == JREC_SIZE); +CTASSERT(sizeof(union jrec) == JREC_SIZE); +#endif + extern int inside[], around[]; extern u_char *fragtbl[]; -- Test scenario: crashbox# ./journal.sh + umount /tmp + newfs -U /dev/ad0s1e + tunefs -j enable /dev/ad0s1e Using inode 4 in cg 0 for 67108864 byte journal tunefs: soft updates journaling set + mount /tmp + chmod 777 /tmp crashbox# exit exit $ cd stress2 $ cat j.cfg # $FreeBSD: projects/stress2/vfs.cfg 187224 2009-01-14 16:03:10Z pho $ # Stress Test Suite Configuration # Default values . ./default.cfg export TESTPROGS="testcases/swap/swap testcases/mkdir/mkdir" $ ./run.sh j.cfg