The Problem
This is an analysis of the panics seen after unclean shutdowns and panics;
they are caused by the filesystem being left in an unclean state. It is
quite a common and widespread problem, as a quick Google search will
reveal.
Those of us who dabble in stress testing have also learned the hard way
that running a background fsck is not always a good idea.
The Root of the Problem.
By using a small test program, a modified version of newfs(8), a few
kernel modifications and some small test scripts I will try to prove that
the bad_dir panics are caused by mounting an unclean filesystem and that
the UFS2 filesystem code does not handle this gracefully.
The scope of these tests is limited to small UFS2 filesystems without
snapshots. The goal is to get a few small corrupted filesystems suitable
for filesystem debugging.
Test Setup.
In order to provoke these panics I have devised a small test scenario
where I flip random bits in a filesystem and then mount it. To prove the
validity of this strategy the OS must not panic if the filesystem can be
mounted without any problems after fsck has marked it as clean and sane. This
has been achieved and verified by running this test scenario for more than
24 hours:
#!/bin/sh
# Path of the scratch image file that each test round copies, fuzzes and removes.
D=/tmp/diskimage
# tst: one test round against a scratch copy of the disk image.
# Creates and populates a filesystem on /dev/md0c, then repeatedly runs the
# fuzz tool on the backing file, runs fsck, and tries to mount and read it.
tst() {
# Start from a fresh copy of the reference image.
rm -f $D
cp diskimage.ok $D
# Attach the image file as md unit 0 and write a label to it.
mdconfig -a -t vnode -f $D -u 0
bsdlabel -w md0 auto
# Create the filesystem with the locally built newfs; its output is discarded.
../newfs/newfs -b 8192 -f 1024 -U /dev/md0c > /dev/null 2>&1
# Populate the new filesystem with a directory tree, then unmount it.
mount /dev/md0c /mnt
cp -r /usr/include/ufs /mnt
umount /mnt
# At most 20 fuzz/fsck/mount cycles on the same image.
for i in `jot 20`; do
# Corrupt the backing file directly.
# NOTE(review): -n 50 is presumably the number of bits flipped -- confirm against fuzz.
../fuzz/fuzz -n 50 $D
# Run fsck up to three times. Each further pass only runs if the previous one
# printed a line starting with an upper-case letter; if the third pass still
# does, abandon this round.
if fsck -f -y /dev/md0c 2>&1 | egrep "^[A-Z]" > /dev/null; then
if fsck -f -y /dev/md0c 2>&1 | egrep "^[A-Z]" > /dev/null; then
if fsck -f -y /dev/md0c 2>&1 | egrep "^[A-Z]" > /dev/null; then
break # fsck is giving up
fi
fi
fi
sync;sync;sync
# Mount the filesystem and read from every path found under /mnt
# (at most 3 blocks of 1m each); all dd output is discarded.
if mount /dev/md0c /mnt; then
find /mnt -exec dd if={} of=/dev/null bs=1m count=3 \; > /dev/null 2>&1
umount /mnt
else
# The mount failed: report the iteration and end this round.
echo "Giving up at loop $i"
break
fi
done
# Detach md unit 0 and remove the scratch image.
mdconfig -d -u 0
rm -f $D
}
# Run test rounds forever, printing the current time before each one.
while true; do
date '+%T'
tst
done
The strategy of flipping random bits has its merits as it is not based on
any preconceived notion of what the problem may actually be. There is
however one major drawback: The filesystem super block seems highly
fragile, so I had to add a checksum test to ffs_mountfs() in order to weed
out bad super blocks and concentrate on corrupted metadata. A phase II of
this project should probably verify the usefulness of adding checksums to
the super block.
This is a diff of the kernel modifications for implementing super block
checksums:
Index: sys/ufs/ffs/ffs_vfsops.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.307
diff -u -r1.307 ffs_vfsops.c
--- sys/ufs/ffs/ffs_vfsops.c 31 Mar 2006 03:54:20 -0000 1.307
+++ sys/ufs/ffs/ffs_vfsops.c 18 Apr 2006 14:57:48 -0000
@@ -121,6 +121,8 @@
"snapshot", "suid", "suiddir", "symfollow", "sync",
"update", "union", NULL };
+#include <ufs/ffs/cksum.c>
+
static int
ffs_mount(struct mount *mp, struct thread *td)
{
@@ -642,7 +644,8 @@
goto out;
fs = (struct fs *)bp->b_data;
sblockloc = sblock_try[i];
- if ((fs->fs_magic == FS_UFS1_MAGIC ||
+ if ((compare_sb_cksum(fs) == 0) &&
+ (fs->fs_magic == FS_UFS1_MAGIC ||
(fs->fs_magic == FS_UFS2_MAGIC &&
(fs->fs_sblockloc == sblockloc ||
(fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) &&
@@ -1555,6 +1558,7 @@
fs->fs_fmod = 0;
fs->fs_time = time_second;
bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
+ update_sb_cksum((struct fs *)bp->b_data);
ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
if (suspended)
bp->b_flags |= B_VALIDSUSPWRT;
Index: sys/ufs/ffs/ffs_alloc.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.139
diff -u -r1.139 ffs_alloc.c
--- sys/ufs/ffs/ffs_alloc.c 2 Mar 2006 05:50:23 -0000 1.139
+++ sys/ufs/ffs/ffs_alloc.c 18 Apr 2006 14:58:06 -0000
@@ -109,6 +109,8 @@
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
+extern void update_sb_cksum(struct fs *);
+
/*
* Allocate a block in the filesystem.
*
@@ -816,6 +818,7 @@
* synchronous write only when it has been cleared.
*/
if (sbap != &ip->i_din2->di_db[0]) {
+ update_sb_cksum(fs);
if (doasyncfree)
bdwrite(sbp);
else
--- /dev/null Tue Apr 18 16:58:09 2006
+++ sys/ufs/ffs/cksum.c Mon Apr 17 12:36:53 2006
@@ -0,0 +1,52 @@
+#include <sys/md5.h>
+
+/*
+ * Superblock checksum
+ */
+#define fs_cksum fs_sparecon32
+#define use_fs_cksum fs_sparecon32[4]
+static u_char labelsumMD5[16];
+
+static void sb_cksum(struct fs *);
+int compare_sb_cksum(struct fs *);
+void update_sb_cksum(struct fs *);
+
+static void
+sb_cksum(struct fs *b)
+{
+ u_char old[16];
+ MD5_CTX md5sum;
+
+ bcopy(b->fs_cksum, old, sizeof old);
+ bzero(b->fs_cksum, sizeof labelsumMD5);
+ MD5Init(&md5sum);
+ MD5Update(&md5sum, b, sizeof *b);
+ MD5Final(labelsumMD5, &md5sum);
+ bcopy(old, b->fs_cksum, sizeof old);
+}
+
+int
+compare_sb_cksum(struct fs *b)
+{
+ int n;
+
+ if (b->use_fs_cksum == 0)
+ return (0);
+
+ sb_cksum(b);
+ n = bcmp(b->fs_cksum, labelsumMD5, sizeof labelsumMD5);
+ if (n != 0)
+ printf("Superblock checksum error!\n");
+ return (n);
+}
+
+void
+update_sb_cksum(struct fs *b)
+{
+ if (b->use_fs_cksum == 0)
+ return;
+
+ sb_cksum(b);
+ bcopy(labelsumMD5, b->fs_cksum, sizeof labelsumMD5);
+// printf("update_sb_cksum called\n");
+}
The Tests
The test script without the fsck runs
has produced the following filesystems that
cause a panic at mount:
$ ls -alsrt *.gz
10 -rw-r--r-- 1 pho pho 8401 17 Apr 18:39 diskimage.baddir.gz
8 -rw-r--r-- 1 pho pho 6925 22 Apr 06:44 diskimage.kmem_malloc.gz
240 -rw-r--r-- 1 pho pho 226548 22 Apr 11:54 diskimage.VOP_STRATEGY_failed.gz
368 -rw-r--r-- 1 pho pho 352620 23 Apr 07:37 diskimage.dirhash_bad_offset.gz
$
The most frequent problem is by far the bad_dir panic.
By mounting the file system read-only the other problem has surfaced.
Conclusion.
It seems that the UFS2 filesystem code is not robust enough to
handle mounting of unclean filesystems, so in order to run background
fsck successfully these problems should probably be addressed.