Updated dirpref algorithm for UFS on Darwin/OSX.

Notes on this revision of the patch:
ffs_dirpref() now seeds maxcontigdirs from cgsize / dirsize (capped at
255, the limit of the u_int8_t per-cg counter) before clamping it against
fs_ipg / fs_avgfpdir; previously the variable was read uninitialized.
ffs_mountfs() now advances `space' past the fs_maxcluster array before
carving out fs_contigdirs, so the two arrays no longer overlap when
fs_contigsumsize > 0.
Whitespace in context lines was reconstructed; apply with `patch -l' if
the hunks do not match exactly.

Index: bsd/ufs/ffs/ffs_alloc.c
===================================================================
RCS file: /cvs/Darwin/src/notlive/xnu/bsd/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.1.1.5
diff -u -r1.1.1.5 ffs_alloc.c
--- bsd/ufs/ffs/ffs_alloc.c	2 Apr 2002 01:15:59 -0000	1.1.1.5
+++ bsd/ufs/ffs/ffs_alloc.c	1 Aug 2002 06:54:00 -0000
@@ -86,7 +86,7 @@
 static ufs_daddr_t ffs_alloccgblk __P((struct fs *, struct cg *, ufs_daddr_t));
 static ufs_daddr_t ffs_clusteralloc
 	    __P((struct inode *, int, ufs_daddr_t, int));
-static ino_t	ffs_dirpref __P((struct fs *));
+static ino_t	ffs_dirpref __P((struct inode *));
 static ufs_daddr_t ffs_fragextend __P((struct inode *, int, long, int, int));
 static void	ffs_fserr __P((struct fs *, u_int, char *));
 static u_long	ffs_hashalloc
@@ -389,12 +389,23 @@
 		goto noinodes;
 
 	if ((mode & IFMT) == IFDIR)
-		ipref = ffs_dirpref(fs);
+		ipref = ffs_dirpref(pip);
 	else
 		ipref = pip->i_number;
 	if (ipref >= fs->fs_ncg * fs->fs_ipg)
 		ipref = 0;
 	cg = ino_to_cg(fs, ipref);
+	/*
+	 * Track the number of dirs created one after another
+	 * in a cg without intervening files.
+	 */
+	if ((mode & IFMT) == IFDIR) {
+		if (fs->fs_contigdirs[cg] < 255)
+			fs->fs_contigdirs[cg]++;
+	} else {
+		if (fs->fs_contigdirs[cg] > 0)
+			fs->fs_contigdirs[cg]--;
+	}
 	ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg);
 	if (ino == 0)
 		goto noinodes;
@@ -429,28 +440,111 @@
 }
 
 /*
- * Find a cylinder to place a directory.
+ * Find a cylinder group to place a directory.
  *
- * The policy implemented by this algorithm is to select from
- * among those cylinder groups with above the average number of
- * free inodes, the one with the smallest number of directories.
+ * The policy implemented by this algorithm is to allocate a
+ * directory inode in the same cylinder group as its parent
+ * directory, but also to reserve space for its files' inodes
+ * and data. Restrict the number of directories which may be
+ * allocated one after another in the same cylinder group
+ * without intervening allocation of files.
  */
 static ino_t
-ffs_dirpref(fs)
-	register struct fs *fs;
+ffs_dirpref(pip)
+	struct inode *pip;
 {
-	int cg, minndir, mincg, avgifree;
+	register struct fs *fs;
+	int cg, prefcg, dirsize, cgsize;
+	int avgifree, avgbfree, avgndir, curdirsize;
+	int minifree, minbfree, maxndir;
+	int mincg, minndir;
+	int maxcontigdirs;
 
+	fs = pip->i_fs;
 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
-	minndir = fs->fs_ipg;
-	mincg = 0;
-	for (cg = 0; cg < fs->fs_ncg; cg++)
-		if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
-		    fs->fs_cs(fs, cg).cs_nifree >= avgifree) {
-			mincg = cg;
-			minndir = fs->fs_cs(fs, cg).cs_ndir;
+	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
+	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
+
+	/*
+	 * Force allocation in another cg if creating a first level dir.
+	 */
+	if (ITOV(pip)->v_flag & VROOT) {
+#ifdef __APPLE__
+		prefcg = random() % fs->fs_ncg;
+#else
+		prefcg = arc4random() % fs->fs_ncg;
+#endif
+		mincg = prefcg;
+		minndir = fs->fs_ipg;
+		for (cg = prefcg; cg < fs->fs_ncg; cg++)
+			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
+			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
+			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+				mincg = cg;
+				minndir = fs->fs_cs(fs, cg).cs_ndir;
+			}
+		for (cg = 0; cg < prefcg; cg++)
+			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
+			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
+			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+				mincg = cg;
+				minndir = fs->fs_cs(fs, cg).cs_ndir;
+			}
+		return ((ino_t)(fs->fs_ipg * mincg));
+	}
+
+	/*
+	 * Compute various limits which are used for
+	 * optimal allocation of a directory inode.
+	 */
+	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
+	minifree = avgifree - fs->fs_ipg / 4;
+	if (minifree < 0)
+		minifree = 0;
+	minbfree = avgbfree - fs->fs_fpg / fs->fs_frag / 4;
+	if (minbfree < 0)
+		minbfree = 0;
+	cgsize = fs->fs_fsize * fs->fs_fpg;
+	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
+	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
+	if (dirsize < curdirsize)
+		dirsize = curdirsize;
+	maxcontigdirs = min(cgsize / dirsize, 255);
+	maxcontigdirs = min(maxcontigdirs,
+	    fs->fs_ipg / fs->fs_avgfpdir);
+	if (maxcontigdirs == 0)
+		maxcontigdirs = 1;
+
+	/*
+	 * Limit number of dirs in one cg and reserve space for
+	 * regular files, but only if we have no deficit in
+	 * inodes or space.
+	 */
+	prefcg = ino_to_cg(fs, pip->i_number);
+	for (cg = prefcg; cg < fs->fs_ncg; cg++)
+		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
+		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
+		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
+			if (fs->fs_contigdirs[cg] < maxcontigdirs)
+				return ((ino_t)(fs->fs_ipg * cg));
+		}
+	for (cg = 0; cg < prefcg; cg++)
+		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
+		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
+		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
+			if (fs->fs_contigdirs[cg] < maxcontigdirs)
+				return ((ino_t)(fs->fs_ipg * cg));
 		}
-	return ((ino_t)(fs->fs_ipg * mincg));
+	/*
+	 * This is a backstop when we have deficit in space.
+	 */
+	for (cg = prefcg; cg < fs->fs_ncg; cg++)
+		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
+			return ((ino_t)(fs->fs_ipg * cg));
+	for (cg = 0; cg < prefcg; cg++)
+		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
+			break;
+	return ((ino_t)(fs->fs_ipg * cg));
 }
 
 /*
Index: bsd/ufs/ffs/ffs_vfsops.c
===================================================================
RCS file: /cvs/Darwin/src/notlive/xnu/bsd/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.1.1.16
diff -u -r1.1.1.16 ffs_vfsops.c
--- bsd/ufs/ffs/ffs_vfsops.c	12 Jul 2002 08:07:31 -0000	1.1.1.16
+++ bsd/ufs/ffs/ffs_vfsops.c	1 Aug 2002 06:54:01 -0000
@@ -687,6 +687,7 @@
 	blks = howmany(size, fs->fs_fsize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
+	size += fs->fs_ncg * sizeof(u_int8_t);
 	space = _MALLOC((u_long)size, M_UFSMNT, M_WAITOK);
 	fs->fs_csp = space;
 	for (i = 0; i < blks; i += fs->fs_frag) {
@@ -712,6 +713,17 @@
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
+		space = lp;
 	}
+	size = fs->fs_ncg * sizeof(u_int8_t);
+	fs->fs_contigdirs = (u_int8_t *)space;
+	space = (u_int8_t *)space + size;
+	bzero(fs->fs_contigdirs, size);
+	/* XXX Compatibility for old filesystems */
+	if (fs->fs_avgfilesize <= 0)
+		fs->fs_avgfilesize = AVFILESIZ;
+	if (fs->fs_avgfpdir <= 0)
+		fs->fs_avgfpdir = AFPDIR;
+	/* XXX End of compatibility */
 	mp->mnt_data = (qaddr_t)ump;
 	mp->mnt_stat.f_fsid.val[0] = (long)dev;
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
Index: bsd/ufs/ffs/fs.h
===================================================================
RCS file: /cvs/Darwin/src/notlive/xnu/bsd/ufs/ffs/fs.h,v
retrieving revision 1.1.1.5
diff -u -r1.1.1.5 fs.h
--- bsd/ufs/ffs/fs.h	12 Jul 2002 03:29:33 -0000	1.1.1.5
+++ bsd/ufs/ffs/fs.h	1 Aug 2002 06:54:01 -0000
@@ -137,16 +137,17 @@
  * computed as cylinder groups are inspected.
  * There is a 128-byte region in the superblock reserved for in-core
  * pointers to summary information. Originally this included an array
- * of pointers to blocks of struct csum; now there are just two
+ * of pointers to blocks of struct csum; now there are just three
  * pointers and the remaining space is padded with fs_ocsp[].
  *
  * NOCSPTRS determines the size of this padding. One pointer (fs_csp)
  * is taken away to point to a contiguous array of struct csum for
  * all cylinder groups; a second (fs_maxcluster) points to an array
- * of cluster sizes that is computed as cylinder groups are inspected.
+ * of cluster sizes that is computed as cylinder groups are inspected,
+ * and the third points to an array that tracks the creation of new
+ * directories.
  */
-#define	NOCSPTRS	((128 / sizeof(void *)) - 2)
-
+#define	NOCSPTRS	((128 / sizeof(void *)) - 3)
 
 /*
  * A summary of contiguous blocks of various sizes is maintained
@@ -171,6 +172,17 @@
 #define MINFREE		5
 #define DEFAULTOPT	FS_OPTTIME
 
+/* Grigoriy Orlov has done some extensive work to fine
+ * tune the layout preferences for directories within a filesystem.
+ * His algorithm can be tuned by adjusting the following parameters
+ * which tell the system the average file size and the average number
+ * of files per directory. These defaults are well selected for typical
+ * filesystems, but may need to be tuned for odd cases like filesystems
+ * being used for squid caches or news spools.
+ */
+#define AVFILESIZ	16384
+#define AFPDIR		64
+
 /*
  * Per cylinder group information; summarized in blocks allocated
  * from first cylinder group data blocks. These blocks have to be
@@ -257,11 +269,14 @@
 /* these fields retain the current block allocation info */
 	int32_t	 fs_cgrotor;		/* last cg searched */
 	void	*fs_ocsp[NOCSPTRS];	/* list of fs_cs info buffers */
+	u_int8_t *fs_contigdirs;	/* # of contiguously allocated dirs */
 	struct csum *fs_csp;		/* list of fs_cs info buffers */
 	int32_t	*fs_maxcluster;		/* max cluster in each cyl group */
 	int32_t	 fs_cpc;		/* cyl per cycle in postbl */
 	int16_t	 fs_opostbl[16][8];	/* old rotation block list head */
-	int32_t	 fs_sparecon[50];	/* reserved for future constants */
+	int32_t	 fs_avgfilesize;	/* expected average file size */
+	int32_t	 fs_avgfpdir;		/* expected # of files per directory */
+	int32_t	 fs_sparecon[48];	/* reserved for future constants */
 	int32_t	 fs_contigsumsize;	/* size of cluster summary array */
 	int32_t	 fs_maxsymlinklen;	/* max length of an internal symlink */
 	int32_t	 fs_inodefmt;		/* format of on-disk inodes */