commit 73a98ba582b37f854271ec280a48e2b928c69aca Author: Gleb Kurtsou Date: Thu Jun 21 01:31:46 2012 +0300 tmpfs: Replace directory entry linked list with RB-Tree To speed up entry lookup during a READDIR call, use the name hash value as the directory entry offset (cookie). Fall back to incremental cookies in case of hash collisions (duplicate-cookies). Keep a sorted per-directory list of duplicate-cookie entries to facilitate cookie number allocation. Don't fail if the previous VOP_READDIR() offset is no longer valid; start with the next dirent instead. Other file systems handle it similarly. Work around race-prone updates of the tn_readdir_last[pn] fields. Add tmpfs_dir_destroy() to free all dirents. Set NFS cookies in tmpfs_dir_getdents(). Return EJUSTRETURN from tmpfs_dir_getdents() instead of a hard-coded -1. Mark directory traversal routines static as they are no longer used outside of tmpfs_subr.c diff --git a/sys/fs/tmpfs/tmpfs.h b/sys/fs/tmpfs/tmpfs.h index 1c6d714..99bdfc3 100644 --- a/sys/fs/tmpfs/tmpfs.h +++ b/sys/fs/tmpfs/tmpfs.h @@ -49,6 +49,7 @@ /* --------------------------------------------------------------------- */ #include #include +#include #include #include @@ -60,104 +61,81 @@ MALLOC_DECLARE(M_TMPFSNAME); /* * Internal representation of a tmpfs directory entry. */ + +LIST_HEAD(tmpfs_dir_duphead, tmpfs_dirent); + struct tmpfs_dirent { - TAILQ_ENTRY(tmpfs_dirent) td_entries; + /* + * Depending on td_cookie flag entry can be of 3 types: + * - regular -- no hash collisions, stored in RB-Tree + * - duphead -- synthetic linked list head for dup entries + * - dup -- stored in linked list instead of RB-Tree + */ + union { + /* regular and duphead entry types */ + RB_ENTRY(tmpfs_dirent) td_entries; - /* Length of the name stored in this directory entry. This avoids - * the need to recalculate it every time the name is used. 
*/ - uint16_t td_namelen; + /* dup entry type */ + struct { + LIST_ENTRY(tmpfs_dirent) entries; + LIST_ENTRY(tmpfs_dirent) index_entries; + } td_dup; + } uh; - /* The name of the entry, allocated from a string pool. This - * string is not required to be zero-terminated; therefore, the - * td_namelen field must always be used when accessing its value. */ - char * td_name; + uint32_t td_cookie; + uint32_t td_hash; + u_int td_namelen; /* Pointer to the node this entry refers to. In case this field * is NULL, the node is a whiteout. */ struct tmpfs_node * td_node; + + union { + /* + * The name of the entry, allocated from a string pool. This + * string is not required to be zero-terminated. + */ + char * td_name; /* regular, dup */ + struct tmpfs_dir_duphead td_duphead; /* duphead */ + } ud; }; -/* A directory in tmpfs holds a sorted list of directory entries, which in +/* A directory in tmpfs holds a list of directory entries, which in * turn point to other files (which can be directories themselves). * - * In tmpfs, this list is managed by a tail queue, whose head is defined by + * In tmpfs, this list is managed by a RB-Tree, whose head is defined by * the struct tmpfs_dir type. * - * It is imporant to notice that directories do not have entries for . and + * It is important to notice that directories do not have entries for . and * .. as other file systems do. These can be generated when requested * based on information available by other means, such as the pointer to * the node itself in the former case or the pointer to the parent directory * in the latter case. This is done to simplify tmpfs's code and, more * importantly, to remove redundancy. */ -TAILQ_HEAD(tmpfs_dir, tmpfs_dirent); +RB_HEAD(tmpfs_dir, tmpfs_dirent); /* Each entry in a directory has a cookie that identifies it. Cookies * supersede offsets within directories because, given how tmpfs stores - * directories in memory, there is no such thing as an offset. 
(Emulating - * a real offset could be very difficult.) - * + * directories in memory, there is no such thing as an offset. + * * The '.', '..' and the end of directory markers have fixed cookies which * cannot collide with the cookies generated by other entries. The cookies - * fot the other entries are generated based on the memory address on which - * stores their information is stored. - * - * Ideally, using the entry's memory pointer as the cookie would be enough - * to represent it and it wouldn't cause collisions in any system. - * Unfortunately, this results in "offsets" with very large values which - * later raise problems in the Linux compatibility layer (and maybe in other - * places) as described in PR kern/32034. Hence we need to workaround this - * with a rather ugly hack. - * - * Linux 32-bit binaries, unless built with _FILE_OFFSET_BITS=64, have off_t - * set to 'long', which is a 32-bit *signed* long integer. Regardless of - * the macro value, GLIBC (2.3 at least) always uses the getdents64 - * system call (when calling readdir) which internally returns off64_t - * offsets. In order to make 32-bit binaries work, *GLIBC* converts the - * 64-bit values returned by the kernel to 32-bit ones and aborts with - * EOVERFLOW if the conversion results in values that won't fit in 32-bit - * integers (which it assumes is because the directory is extremely large). - * This wouldn't cause problems if we were dealing with unsigned integers, - * but as we have signed integers, this check fails due to sign expansion. + * for the other entries are generated based on the file name hash value or + * unique number in case of name hash collision. * - * For example, consider that the kernel returns the 0xc1234567 cookie to - * userspace in a off64_t integer. 
Later on, GLIBC casts this value to - * off_t (remember, signed) with code similar to: - * system call returns the offset in kernel_value; - * off_t casted_value = kernel_value; - * if (sizeof(off_t) != sizeof(off64_t) && - * kernel_value != casted_value) - * error! - * In this case, casted_value still has 0xc1234567, but when it is compared - * for equality against kernel_value, it is promoted to a 64-bit integer and - * becomes 0xffffffffc1234567, which is different than 0x00000000c1234567. - * Then, GLIBC assumes this is because the directory is very large. - * - * Given that all the above happens in user-space, we have no control over - * it; therefore we must workaround the issue here. We do this by - * truncating the pointer value to a 32-bit integer and hope that there - * won't be collisions. In fact, this will not cause any problems in - * 32-bit platforms but some might arise in 64-bit machines (I'm not sure - * if they can happen at all in practice). - * - * XXX A nicer solution shall be attempted. */ -#ifdef _KERNEL -#define TMPFS_DIRCOOKIE_DOT 0 -#define TMPFS_DIRCOOKIE_DOTDOT 1 -#define TMPFS_DIRCOOKIE_EOF 2 -static __inline -off_t -tmpfs_dircookie(struct tmpfs_dirent *de) -{ - off_t cookie; - - cookie = ((off_t)(uintptr_t)de >> 1) & 0x7FFFFFFF; - MPASS(cookie != TMPFS_DIRCOOKIE_DOT); - MPASS(cookie != TMPFS_DIRCOOKIE_DOTDOT); - MPASS(cookie != TMPFS_DIRCOOKIE_EOF); + * To preserve compatibility cookies are limited to 31 bits. 
+ */ - return cookie; -} -#endif +#define TMPFS_DIRCOOKIE_DOT 0 +#define TMPFS_DIRCOOKIE_DOTDOT 1 +#define TMPFS_DIRCOOKIE_EOF 2 +#define TMPFS_DIRCOOKIE_MASK ((off_t)0x3fffffffU) +#define TMPFS_DIRCOOKIE_MIN ((off_t)0x00000004U) +#define TMPFS_DIRCOOKIE_DUP ((off_t)0x40000000U) +#define TMPFS_DIRCOOKIE_DUPHEAD ((off_t)0x80000000U) +#define TMPFS_DIRCOOKIE_DUP_MIN TMPFS_DIRCOOKIE_DUP +#define TMPFS_DIRCOOKIE_DUP_MAX \ + (TMPFS_DIRCOOKIE_DUP | TMPFS_DIRCOOKIE_MASK) /* --------------------------------------------------------------------- */ @@ -243,29 +221,31 @@ struct tmpfs_node { dev_t tn_rdev; /* Valid when tn_type == VDIR. */ - struct tn_dir{ + struct tn_dir { /* Pointer to the parent directory. The root * directory has a pointer to itself in this field; * this property identifies the root node. */ struct tmpfs_node * tn_parent; - /* Head of a tail-queue that links the contents of - * the directory together. See above for a - * description of its contents. */ + /* Head of a tree that links the contents of + * the directory together. */ struct tmpfs_dir tn_dirhead; + /* Head of a list that contains fake directory entry + * heads, i.e. entries with the TMPFS_DIRCOOKIE_DUPHEAD + * flag set. */ + struct tmpfs_dir_duphead tn_dupindex; + /* Number and pointer of the first directory entry * returned by the readdir operation if it were * called again to continue reading data from the * same directory as before. This is used to speed * up reads of long directories, assuming that no * more than one read is in progress at a given time. - * Otherwise, these values are discarded and a linear - * scan is performed from the beginning up to the - * point where readdir starts returning values. */ + * Otherwise, these values are discarded. */ off_t tn_readdir_lastn; struct tmpfs_dirent * tn_readdir_lastp; - }tn_dir; + } tn_dir; /* Valid when tn_type == VLNK. */ /* The link's target, allocated from a string pool. 
*/ @@ -419,9 +399,9 @@ int tmpfs_alloc_node(struct tmpfs_mount *, enum vtype, char *, dev_t, struct tmpfs_node **); void tmpfs_free_node(struct tmpfs_mount *, struct tmpfs_node *); int tmpfs_alloc_dirent(struct tmpfs_mount *, struct tmpfs_node *, - const char *, uint16_t, struct tmpfs_dirent **); -void tmpfs_free_dirent(struct tmpfs_mount *, struct tmpfs_dirent *, - boolean_t); + const char *, u_int, struct tmpfs_dirent **); +void tmpfs_free_dirent(struct tmpfs_mount *, struct tmpfs_dirent *); +void tmpfs_dirent_init(struct tmpfs_dirent *, const char *, u_int); int tmpfs_alloc_vp(struct mount *, struct tmpfs_node *, int, struct vnode **); void tmpfs_free_vp(struct vnode *); @@ -429,13 +409,12 @@ int tmpfs_alloc_file(struct vnode *, struct vnode **, struct vattr *, struct componentname *, char *); void tmpfs_dir_attach(struct vnode *, struct tmpfs_dirent *); void tmpfs_dir_detach(struct vnode *, struct tmpfs_dirent *); +void tmpfs_dir_destroy(struct tmpfs_mount *, struct tmpfs_node *); struct tmpfs_dirent * tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, struct componentname *cnp); -int tmpfs_dir_getdotdent(struct tmpfs_node *, struct uio *); -int tmpfs_dir_getdotdotdent(struct tmpfs_node *, struct uio *); -struct tmpfs_dirent * tmpfs_dir_lookupbycookie(struct tmpfs_node *, off_t); -int tmpfs_dir_getdents(struct tmpfs_node *, struct uio *, off_t *); +int tmpfs_dir_getdents(struct tmpfs_node *, struct uio *, int, + u_long *, int *); int tmpfs_dir_whiteout_add(struct vnode *, struct componentname *); void tmpfs_dir_whiteout_remove(struct vnode *, struct componentname *); int tmpfs_reg_resize(struct vnode *, off_t, boolean_t); @@ -467,8 +446,8 @@ int tmpfs_truncate(struct vnode *, off_t); * with a length of 'len'. 
*/ #define TMPFS_DIRENT_MATCHES(de, name, len) \ - (de->td_namelen == (uint16_t)len && \ - bcmp((de)->td_name, (name), (de)->td_namelen) == 0) + (de->td_namelen == len && \ + bcmp((de)->ud.td_name, (name), (de)->td_namelen) == 0) /* --------------------------------------------------------------------- */ @@ -476,11 +455,10 @@ int tmpfs_truncate(struct vnode *, off_t); * Ensures that the node pointed by 'node' is a directory and that its * contents are consistent with respect to directories. */ -#define TMPFS_VALIDATE_DIR(node) \ - MPASS((node)->tn_type == VDIR); \ - MPASS((node)->tn_size % sizeof(struct tmpfs_dirent) == 0); \ - MPASS((node)->tn_dir.tn_readdir_lastp == NULL || \ - tmpfs_dircookie((node)->tn_dir.tn_readdir_lastp) == (node)->tn_dir.tn_readdir_lastn); +#define TMPFS_VALIDATE_DIR(node) do { \ + MPASS((node)->tn_type == VDIR); \ + MPASS((node)->tn_size % sizeof(struct tmpfs_dirent) == 0); \ +} while (0) /* --------------------------------------------------------------------- */ diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c index 9395824..ce37d98 100644 --- a/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include @@ -57,6 +58,20 @@ __FBSDID("$FreeBSD$"); #include #include +#if 0 +#ifdef TMPFS_DEBUG_DIRCOOKIE_DUP +#undef TMPFS_DIRCOOKIE_DUP_MAX +#define TMPFS_DIRCOOKIE_DUP_MAX (TMPFS_DIRCOOKIE_DUP | 0xf) +#endif +#endif + +struct tmpfs_dir_cursor { + struct tmpfs_dirent *tdc_tree, *tdc_current; +}; + +static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, + struct tmpfs_dirent *b); + SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW, 0, "tmpfs file system"); static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; @@ -86,6 +101,8 @@ SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, CTLTYPE_LONG|CTLFLAG_RW, &tmpfs_pages_reserved, 0, sysctl_mem_reserved, "L", "Amount of available memory and swap below which tmpfs growth stops"); 
+RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); + size_t tmpfs_mem_avail(void) { @@ -187,7 +204,8 @@ tmpfs_alloc_node(struct tmpfs_mount *tmp, enum vtype type, break; case VDIR: - TAILQ_INIT(&nnode->tn_dir.tn_dirhead); + RB_INIT(&nnode->tn_dir.tn_dirhead); + LIST_INIT(&nnode->tn_dir.tn_dupindex); MPASS(parent != nnode); MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; @@ -308,6 +326,49 @@ tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) /* --------------------------------------------------------------------- */ +static __inline uint32_t +tmpfs_dirent_hash(const char *name, u_int len) +{ + uint32_t hash; + + hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; +#ifdef TMPFS_DEBUG_DIRCOOKIE_DUP + hash &= 0xf; +#endif + if (hash < TMPFS_DIRCOOKIE_MIN) + hash += TMPFS_DIRCOOKIE_MIN; + + return (hash); +} + +static __inline off_t +tmpfs_dirent_cookie(struct tmpfs_dirent *de) +{ + MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); + + return (de->td_cookie); +} + +static __inline boolean_t +tmpfs_dirent_dup(struct tmpfs_dirent *de) +{ + return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); +} + +static __inline boolean_t +tmpfs_dirent_duphead(struct tmpfs_dirent *de) +{ + return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); +} + +void +tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) +{ + de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); + memcpy(de->ud.td_name, name, namelen); + de->td_namelen = namelen; +} + /* * Allocates a new directory entry for the node node with a name of name. * The new directory entry is returned in *de. 
@@ -319,17 +380,17 @@ tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) */ int tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, - const char *name, uint16_t len, struct tmpfs_dirent **de) + const char *name, u_int len, struct tmpfs_dirent **de) { struct tmpfs_dirent *nde; - nde = (struct tmpfs_dirent *)uma_zalloc( - tmp->tm_dirent_pool, M_WAITOK); - nde->td_name = malloc(len, M_TMPFSNAME, M_WAITOK); - nde->td_namelen = len; - memcpy(nde->td_name, name, len); - + nde = uma_zalloc(tmp->tm_dirent_pool, M_WAITOK); nde->td_node = node; + if (name != NULL) { + nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); + tmpfs_dirent_init(nde, name, len); + } else + nde->td_namelen = 0; if (node != NULL) node->tn_links++; @@ -350,20 +411,17 @@ tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, * directory entry, as it may already have been released from the outside. */ void -tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de, - boolean_t node_exists) +tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) { - if (node_exists) { - struct tmpfs_node *node; + struct tmpfs_node *node; - node = de->td_node; - if (node != NULL) { - MPASS(node->tn_links > 0); - node->tn_links--; - } + node = de->td_node; + if (node != NULL) { + MPASS(node->tn_links > 0); + node->tn_links--; } - - free(de->td_name, M_TMPFSNAME); + if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) + free(de->ud.td_name, M_TMPFSNAME); uma_zfree(tmp->tm_dirent_pool, de); } @@ -585,7 +643,7 @@ tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, /* Allocate a vnode for the new file. 
*/ error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); if (error != 0) { - tmpfs_free_dirent(tmp, de, TRUE); + tmpfs_free_dirent(tmp, de); tmpfs_free_node(tmp, node); goto out; } @@ -604,6 +662,215 @@ out: /* --------------------------------------------------------------------- */ +static struct tmpfs_dirent * +tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) +{ + struct tmpfs_dirent *de; + + de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); + dc->tdc_tree = de; + if (de != NULL && tmpfs_dirent_duphead(de)) + de = LIST_FIRST(&de->ud.td_duphead); + dc->tdc_current = de; + + return (dc->tdc_current); +} + +static struct tmpfs_dirent * +tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) +{ + struct tmpfs_dirent *de; + + MPASS(dc->tdc_tree != NULL); + if (tmpfs_dirent_dup(dc->tdc_current)) { + dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); + if (dc->tdc_current != NULL) + return (dc->tdc_current); + } + dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, + &dnode->tn_dir.tn_dirhead, dc->tdc_tree); + if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { + dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); + MPASS(dc->tdc_current != NULL); + } + + return (dc->tdc_current); +} + +/* Lookup directory entry in RB-Tree. Function may return duphead entry. */ +static struct tmpfs_dirent * +tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) +{ + struct tmpfs_dirent *de, dekey; + + dekey.td_hash = hash; + de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); + return (de); +} + +/* Lookup directory entry by cookie, initialize directory cursor accordingly. 
*/ +static struct tmpfs_dirent * +tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, + struct tmpfs_dir_cursor *dc) +{ + struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; + struct tmpfs_dirent *de, dekey; + + MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); + + if (cookie == node->tn_dir.tn_readdir_lastn && + (de = node->tn_dir.tn_readdir_lastp) != NULL) { + /* Protect against possible race, tn_readdir_last[pn] + * may be updated with only shared vnode lock held. */ + if (cookie == tmpfs_dirent_cookie(de)) + goto out; + } + + if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { + LIST_FOREACH(de, &node->tn_dir.tn_dupindex, + uh.td_dup.index_entries) { + MPASS(tmpfs_dirent_dup(de)); + if (de->td_cookie == cookie) + goto out; + /* dupindex list is sorted. */ + if (de->td_cookie < cookie) { + de = NULL; + goto out; + } + } + MPASS(de == NULL); + goto out; + } + + MPASS((cookie & TMPFS_DIRCOOKIE_MASK) == cookie); + dekey.td_hash = cookie; + /* Recover if direntry for cookie was removed */ + de = RB_NFIND(tmpfs_dir, dirhead, &dekey); + dc->tdc_tree = de; + dc->tdc_current = de; + if (de != NULL && tmpfs_dirent_duphead(de)) { + dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); + MPASS(dc->tdc_current != NULL); + } + return (dc->tdc_current); + +out: + dc->tdc_tree = de; + dc->tdc_current = de; + if (de != NULL && tmpfs_dirent_dup(de)) + dc->tdc_tree = tmpfs_dir_xlookup_hash(node, + de->td_hash); + return (dc->tdc_current); +} + +/* + * Looks for a directory entry in the directory represented by node. + * 'cnp' describes the name of the entry to look for. Note that the . + * and .. components are not allowed as they do not physically exist + * within directories. + * + * Returns a pointer to the entry when found, otherwise NULL. 
+ */ +struct tmpfs_dirent * +tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, + struct componentname *cnp) +{ + struct tmpfs_dir_duphead *duphead; + struct tmpfs_dirent *de; + uint32_t hash; + + MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); + MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && + cnp->cn_nameptr[1] == '.'))); + TMPFS_VALIDATE_DIR(node); + + hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); + de = tmpfs_dir_xlookup_hash(node, hash); + if (de != NULL && tmpfs_dirent_duphead(de)) { + duphead = &de->ud.td_duphead; + LIST_FOREACH(de, duphead, uh.td_dup.entries) { + if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, + cnp->cn_namelen)) + break; + } + } else if (de != NULL) { + if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, + cnp->cn_namelen)) + de = NULL; + } + if (de != NULL && f != NULL && de->td_node != f) + de = NULL; + + return (de); +} + +/* + * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex + * list, allocate new cookie value. + */ +static void +tmpfs_dir_attach_dup(struct tmpfs_node *dnode, + struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) +{ + struct tmpfs_dir_duphead *dupindex; + struct tmpfs_dirent *de, *pde; + + dupindex = &dnode->tn_dir.tn_dupindex; + de = LIST_FIRST(dupindex); + if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { + if (de == NULL) + nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; + else + nde->td_cookie = de->td_cookie + 1; + MPASS(tmpfs_dirent_dup(nde)); + LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); + LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); + return; + } + + /* + * Cookie numbers are near exhaustion. Scan dupindex list for unused + * numbers. dupindex list is sorted in descending order. Keep it so + * after inserting nde. 
+ */ + while (1) { + pde = de; + de = LIST_NEXT(de, uh.td_dup.index_entries); + if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { + /* + * The last element of the index doesn't have the minimal + * cookie value, so use it. + */ + nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; + LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); + LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); + return; + } else if (de == NULL) { + /* + * We are so lucky to have 2^30 hash duplicates in a single + * directory :) Return the largest possible cookie value. + * It should be fine except for possible issues with + * VOP_READDIR restart. + */ + nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; + LIST_INSERT_HEAD(dupindex, nde, + uh.td_dup.index_entries); + LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); + return; + } + if (de->td_cookie + 1 == pde->td_cookie || + de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) + continue; /* No hole or invalid cookie. */ + nde->td_cookie = de->td_cookie + 1; + MPASS(tmpfs_dirent_dup(nde)); + MPASS(pde->td_cookie > nde->td_cookie); + MPASS(nde->td_cookie > de->td_cookie); + LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); + LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); + return; + }; +} + /* * Attaches the directory entry de to the directory represented by vp. 
* Note that this does not change the link count of the node pointed by @@ -613,10 +880,38 @@ void tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_node *dnode; + struct tmpfs_dirent *xde, *nde; ASSERT_VOP_ELOCKED(vp, __func__); + MPASS(de->td_namelen > 0); + MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); + MPASS(de->td_cookie == de->td_hash); + dnode = VP_TO_TMPFS_DIR(vp); - TAILQ_INSERT_TAIL(&dnode->tn_dir.tn_dirhead, de, td_entries); + dnode->tn_dir.tn_readdir_lastn = 0; + dnode->tn_dir.tn_readdir_lastp = NULL; + + MPASS(!tmpfs_dirent_dup(de)); + xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); + if (xde != NULL && tmpfs_dirent_duphead(xde)) + tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); + else if (xde != NULL) { + /* + * Allocate new duphead. Swap xde with duphead to avoid + * adding/removing elements with the same hash. + */ + MPASS(!tmpfs_dirent_dup(xde)); + tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, + &nde); + /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. 
*/ + memcpy(nde, xde, sizeof(*xde)); + xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; + LIST_INIT(&xde->ud.td_duphead); + xde->td_namelen = 0; + xde->td_node = NULL; + tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); + tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); + } dnode->tn_size += sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; @@ -632,58 +927,61 @@ tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) void tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) { + struct tmpfs_mount *tmp; + struct tmpfs_dir *head; struct tmpfs_node *dnode; + struct tmpfs_dirent *xde; ASSERT_VOP_ELOCKED(vp, __func__); - dnode = VP_TO_TMPFS_DIR(vp); - if (dnode->tn_dir.tn_readdir_lastp == de) { - dnode->tn_dir.tn_readdir_lastn = 0; - dnode->tn_dir.tn_readdir_lastp = NULL; - } + dnode = VP_TO_TMPFS_DIR(vp); + head = &dnode->tn_dir.tn_dirhead; + dnode->tn_dir.tn_readdir_lastn = 0; + dnode->tn_dir.tn_readdir_lastp = NULL; + + if (tmpfs_dirent_dup(de)) { + /* Remove duphead if de was last entry. */ + if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { + xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); + MPASS(tmpfs_dirent_duphead(xde)); + } else + xde = NULL; + LIST_REMOVE(de, uh.td_dup.entries); + LIST_REMOVE(de, uh.td_dup.index_entries); + if (xde != NULL) { + if (LIST_EMPTY(&xde->ud.td_duphead)) { + RB_REMOVE(tmpfs_dir, head, xde); + tmp = VFS_TO_TMPFS(vp->v_mount); + MPASS(xde->td_node == NULL); + tmpfs_free_dirent(tmp, xde); + } + } + } else + RB_REMOVE(tmpfs_dir, head, de); - TAILQ_REMOVE(&dnode->tn_dir.tn_dirhead, de, td_entries); dnode->tn_size -= sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; } -/* --------------------------------------------------------------------- */ - -/* - * Looks for a directory entry in the directory represented by node. - * 'cnp' describes the name of the entry to look for. Note that the . 
- * and .. components are not allowed as they do not physically exist - * within directories. - * - * Returns a pointer to the entry when found, otherwise NULL. - */ -struct tmpfs_dirent * -tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, - struct componentname *cnp) +void +tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) { - boolean_t found; - struct tmpfs_dirent *de; - - MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); - MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && - cnp->cn_nameptr[1] == '.'))); - TMPFS_VALIDATE_DIR(node); - - found = 0; - TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { - if (f != NULL && de->td_node != f) - continue; - MPASS(cnp->cn_namelen < 0xffff); - if (de->td_namelen == (uint16_t)cnp->cn_namelen && - bcmp(de->td_name, cnp->cn_nameptr, de->td_namelen) == 0) { - found = 1; - break; + struct tmpfs_dirent *de, *dde, *nde; + + RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { + RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); + /* Node may already be destroyed. */ + de->td_node = NULL; + if (tmpfs_dirent_duphead(de)) { + while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { + LIST_REMOVE(dde, uh.td_dup.entries); + dde->td_node = NULL; + tmpfs_free_dirent(tmp, dde); + } } + tmpfs_free_dirent(tmp, de); } - node->tn_status |= TMPFS_NODE_ACCESSED; - - return found ? de : NULL; } /* --------------------------------------------------------------------- */ @@ -695,7 +993,7 @@ tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, * hold the directory entry or an appropriate error code if another * error happens. 
*/ -int +static int tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio) { int error; @@ -712,12 +1010,9 @@ tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio) dent.d_reclen = GENERIC_DIRSIZ(&dent); if (dent.d_reclen > uio->uio_resid) - error = -1; - else { + error = EJUSTRETURN; + else error = uiomove(&dent, dent.d_reclen, uio); - if (error == 0) - uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT; - } node->tn_status |= TMPFS_NODE_ACCESSED; @@ -733,7 +1028,7 @@ tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio) * hold the directory entry or an appropriate error code if another * error happens. */ -int +static int tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio) { int error; @@ -762,19 +1057,9 @@ tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio) dent.d_reclen = GENERIC_DIRSIZ(&dent); if (dent.d_reclen > uio->uio_resid) - error = -1; - else { + error = EJUSTRETURN; + else error = uiomove(&dent, dent.d_reclen, uio); - if (error == 0) { - struct tmpfs_dirent *de; - - de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); - if (de == NULL) - uio->uio_offset = TMPFS_DIRCOOKIE_EOF; - else - uio->uio_offset = tmpfs_dircookie(de); - } - } node->tn_status |= TMPFS_NODE_ACCESSED; @@ -784,30 +1069,6 @@ tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio) /* --------------------------------------------------------------------- */ /* - * Lookup a directory entry by its associated cookie. - */ -struct tmpfs_dirent * -tmpfs_dir_lookupbycookie(struct tmpfs_node *node, off_t cookie) -{ - struct tmpfs_dirent *de; - - if (cookie == node->tn_dir.tn_readdir_lastn && - node->tn_dir.tn_readdir_lastp != NULL) { - return node->tn_dir.tn_readdir_lastp; - } - - TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { - if (tmpfs_dircookie(de) == cookie) { - break; - } - } - - return de; -} - -/* --------------------------------------------------------------------- */ - -/* * Helper function for tmpfs_readdir. 
Returns as much directory entries * as can fit in the uio space. The read starts at uio->uio_offset. * The function returns 0 on success, -1 if there was not enough space @@ -815,27 +1076,47 @@ tmpfs_dir_lookupbycookie(struct tmpfs_node *node, off_t cookie) * error code if another error happens. */ int -tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp) +tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, int cnt, + u_long *cookies, int *ncookies) { - int error; - off_t startcookie; + struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de; + off_t off; + int error; TMPFS_VALIDATE_DIR(node); - /* Locate the first directory entry we have to return. We have cached - * the last readdir in the node, so use those values if appropriate. - * Otherwise do a linear scan to find the requested entry. */ - startcookie = uio->uio_offset; - MPASS(startcookie != TMPFS_DIRCOOKIE_DOT); - MPASS(startcookie != TMPFS_DIRCOOKIE_DOTDOT); - if (startcookie == TMPFS_DIRCOOKIE_EOF) { - return 0; - } else { - de = tmpfs_dir_lookupbycookie(node, startcookie); - } - if (de == NULL) { - return EINVAL; + off = 0; + switch (uio->uio_offset) { + case TMPFS_DIRCOOKIE_DOT: + error = tmpfs_dir_getdotdent(node, uio); + if (error != 0) + return (error); + uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT; + if (cnt != 0) + cookies[(*ncookies)++] = off = uio->uio_offset; + case TMPFS_DIRCOOKIE_DOTDOT: + error = tmpfs_dir_getdotdotdent(node, uio); + if (error != 0) + return (error); + de = tmpfs_dir_first(node, &dc); + if (de == NULL) + uio->uio_offset = TMPFS_DIRCOOKIE_EOF; + else + uio->uio_offset = tmpfs_dirent_cookie(de); + if (cnt != 0) + cookies[(*ncookies)++] = off = uio->uio_offset; + if (de == NULL) + return (0); + break; + case TMPFS_DIRCOOKIE_EOF: + return (0); + default: + de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); + if (de == NULL) + return (EINVAL); + if (cnt != 0) + off = tmpfs_dirent_cookie(de); } /* Read as much entries as possible; i.e., until we reach 
the end of @@ -886,14 +1167,14 @@ tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp) } d.d_namlen = de->td_namelen; MPASS(de->td_namelen < sizeof(d.d_name)); - (void)memcpy(d.d_name, de->td_name, de->td_namelen); + (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); d.d_name[de->td_namelen] = '\0'; d.d_reclen = GENERIC_DIRSIZ(&d); /* Stop reading if the directory entry we are treating is * bigger than the amount of data that can be returned. */ if (d.d_reclen > uio->uio_resid) { - error = -1; + error = EJUSTRETURN; break; } @@ -901,21 +1182,30 @@ tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp) * advance pointers. */ error = uiomove(&d, d.d_reclen, uio); if (error == 0) { - (*cntp)++; - de = TAILQ_NEXT(de, td_entries); + de = tmpfs_dir_next(node, &dc); + if (cnt != 0) { + if (de == NULL) + off = TMPFS_DIRCOOKIE_EOF; + else + off = tmpfs_dirent_cookie(de); + MPASS(*ncookies < cnt); + cookies[(*ncookies)++] = off; + } } } while (error == 0 && uio->uio_resid > 0 && de != NULL); /* Update the offset and cache. 
*/ - if (de == NULL) { - uio->uio_offset = TMPFS_DIRCOOKIE_EOF; - node->tn_dir.tn_readdir_lastn = 0; - node->tn_dir.tn_readdir_lastp = NULL; - } else { - node->tn_dir.tn_readdir_lastn = uio->uio_offset = tmpfs_dircookie(de); - node->tn_dir.tn_readdir_lastp = de; + if (cnt == 0) { + if (de == NULL) + off = TMPFS_DIRCOOKIE_EOF; + else + off = tmpfs_dirent_cookie(de); } + uio->uio_offset = off; + node->tn_dir.tn_readdir_lastn = off; + node->tn_dir.tn_readdir_lastp = de; + node->tn_status |= TMPFS_NODE_ACCESSED; return error; } @@ -942,7 +1232,7 @@ tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); MPASS(de != NULL && de->td_node == NULL); tmpfs_dir_detach(dvp, de); - tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de, TRUE); + tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); } /* --------------------------------------------------------------------- */ @@ -1435,3 +1725,15 @@ out: return error; } + +static __inline int +tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) +{ + if (a->td_hash > b->td_hash) + return (1); + else if (a->td_hash < b->td_hash) + return (-1); + return (0); +} + +RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); diff --git a/sys/fs/tmpfs/tmpfs_vfsops.c b/sys/fs/tmpfs/tmpfs_vfsops.c index a08dafa..ef373b7 100644 --- a/sys/fs/tmpfs/tmpfs_vfsops.c +++ b/sys/fs/tmpfs/tmpfs_vfsops.c @@ -294,19 +294,8 @@ tmpfs_unmount(struct mount *mp, int mntflags) while (node != NULL) { struct tmpfs_node *next; - if (node->tn_type == VDIR) { - struct tmpfs_dirent *de; - - de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); - while (de != NULL) { - struct tmpfs_dirent *nde; - - nde = TAILQ_NEXT(de, td_entries); - tmpfs_free_dirent(tmp, de, FALSE); - de = nde; - node->tn_size -= sizeof(struct tmpfs_dirent); - } - } + if (node->tn_type == VDIR) + tmpfs_dir_destroy(tmp, node); next = LIST_NEXT(node, tn_entries); tmpfs_free_node(tmp, node); diff --git 
a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index 09780c8..8888c3f 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -846,7 +846,7 @@ tmpfs_remove(struct vop_remove_args *v) /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ - tmpfs_free_dirent(tmp, de, TRUE); + tmpfs_free_dirent(tmp, de); node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED; error = 0; @@ -1262,26 +1262,25 @@ tmpfs_rename(struct vop_rename_args *v) fdnode->tn_links--; TMPFS_NODE_UNLOCK(fdnode); } - - /* Do the move: just remove the entry from the source directory - * and insert it into the target one. */ - tmpfs_dir_detach(fdvp, de); - if (fcnp->cn_flags & DOWHITEOUT) - tmpfs_dir_whiteout_add(fdvp, fcnp); - if (tcnp->cn_flags & ISWHITEOUT) - tmpfs_dir_whiteout_remove(tdvp, tcnp); - tmpfs_dir_attach(tdvp, de); } + /* Do the move: just remove the entry from the source directory + * and insert it into the target one. */ + tmpfs_dir_detach(fdvp, de); + + if (fcnp->cn_flags & DOWHITEOUT) + tmpfs_dir_whiteout_add(fdvp, fcnp); + if (tcnp->cn_flags & ISWHITEOUT) + tmpfs_dir_whiteout_remove(tdvp, tcnp); + /* If the name has changed, we need to make it effective by changing * it in the directory entry. */ if (newname != NULL) { MPASS(tcnp->cn_namelen <= MAXNAMLEN); - free(de->td_name, M_TMPFSNAME); - de->td_namelen = (uint16_t)tcnp->cn_namelen; - memcpy(newname, tcnp->cn_nameptr, tcnp->cn_namelen); - de->td_name = newname; + free(de->ud.td_name, M_TMPFSNAME); + de->ud.td_name = newname; + tmpfs_dirent_init(de, tcnp->cn_nameptr, tcnp->cn_namelen); fnode->tn_status |= TMPFS_NODE_CHANGED; tdnode->tn_status |= TMPFS_NODE_MODIFIED; @@ -1290,15 +1289,20 @@ tmpfs_rename(struct vop_rename_args *v) /* If we are overwriting an entry, we have to remove the old one * from the target directory. 
*/ if (tvp != NULL) { + struct tmpfs_dirent *tde; + /* Remove the old entry from the target directory. */ - de = tmpfs_dir_lookup(tdnode, tnode, tcnp); - tmpfs_dir_detach(tdvp, de); + tde = tmpfs_dir_lookup(tdnode, tnode, tcnp); + tmpfs_dir_detach(tdvp, tde); /* Free the directory entry we just deleted. Note that the * node referred by it will not be removed until the vnode is * really reclaimed. */ - tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), de, TRUE); + tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde); } + + tmpfs_dir_attach(tdvp, de); + cache_purge(fvp); if (tvp != NULL) cache_purge(tvp); @@ -1426,7 +1430,7 @@ tmpfs_rmdir(struct vop_rmdir_args *v) /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ - tmpfs_free_dirent(tmp, de, TRUE); + tmpfs_free_dirent(tmp, de); /* Release the deleted vnode (will destroy the node, notify * interested parties and clean it from the cache). */ @@ -1472,8 +1476,8 @@ tmpfs_readdir(struct vop_readdir_args *v) int *ncookies = v->a_ncookies; int error; - off_t startoff; - off_t cnt = 0; + ssize_t startresid; + int cnt = 0; struct tmpfs_node *node; /* This operation only makes sense on directory nodes. 
*/ @@ -1482,69 +1486,29 @@ tmpfs_readdir(struct vop_readdir_args *v) node = VP_TO_TMPFS_DIR(vp); - startoff = uio->uio_offset; + startresid = uio->uio_resid; - if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT) { - error = tmpfs_dir_getdotdent(node, uio); - if (error != 0) - goto outok; - cnt++; - } - - if (uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT) { - error = tmpfs_dir_getdotdotdent(node, uio); - if (error != 0) - goto outok; - cnt++; + if (cookies != NULL && ncookies != NULL) { + cnt = howmany(node->tn_size, sizeof(struct tmpfs_dirent)) + 2; + *cookies = malloc(cnt * sizeof(**cookies), M_TEMP, M_WAITOK); + *ncookies = 0; } - error = tmpfs_dir_getdents(node, uio, &cnt); + if (cnt == 0) + error = tmpfs_dir_getdents(node, uio, 0, NULL, NULL); + else + error = tmpfs_dir_getdents(node, uio, cnt, *cookies, ncookies); -outok: - MPASS(error >= -1); + if (error == EJUSTRETURN) + error = (uio->uio_resid != startresid) ? 0 : EINVAL; - if (error == -1) - error = (cnt != 0) ? 0 : EINVAL; + if (error != 0 && cnt != 0) + free(*cookies, M_TEMP); if (eofflag != NULL) *eofflag = (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF); - /* Update NFS-related variables. */ - if (error == 0 && cookies != NULL && ncookies != NULL) { - off_t i; - off_t off = startoff; - struct tmpfs_dirent *de = NULL; - - *ncookies = cnt; - *cookies = malloc(cnt * sizeof(off_t), M_TEMP, M_WAITOK); - - for (i = 0; i < cnt; i++) { - MPASS(off != TMPFS_DIRCOOKIE_EOF); - if (off == TMPFS_DIRCOOKIE_DOT) { - off = TMPFS_DIRCOOKIE_DOTDOT; - } else { - if (off == TMPFS_DIRCOOKIE_DOTDOT) { - de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); - } else if (de != NULL) { - de = TAILQ_NEXT(de, td_entries); - } else { - de = tmpfs_dir_lookupbycookie(node, - off); - MPASS(de != NULL); - de = TAILQ_NEXT(de, td_entries); - } - if (de == NULL) - off = TMPFS_DIRCOOKIE_EOF; - else - off = tmpfs_dircookie(de); - } - - (*cookies)[i] = off; - } - MPASS(uio->uio_offset == off); - } - return error; }