Index: kern/vfs_syscalls.c =================================================================== --- kern/vfs_syscalls.c (revision 191918) +++ kern/vfs_syscalls.c (working copy) @@ -4232,22 +4232,13 @@ int error; struct file *fp; + error = 0; fp = NULL; - if (fdp == NULL) + if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) error = EBADF; - else { - FILEDESC_SLOCK(fdp); - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) - error = EBADF; - else if (fp->f_vnode == NULL) { - fp = NULL; - error = EINVAL; - } else { - fhold(fp); - error = 0; - } - FILEDESC_SUNLOCK(fdp); + else if (fp->f_vnode == NULL) { + error = EINVAL; + fdrop(fp, curthread); } *fpp = fp; return (error); Index: kern/sys_generic.c =================================================================== --- kern/sys_generic.c (revision 191918) +++ kern/sys_generic.c (working copy) @@ -988,7 +988,6 @@ fdp = td->td_proc->p_fd; stp = td->td_sel; n = 0; - FILEDESC_SLOCK(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (int)(uintptr_t)sfp->sf_cookie; si = sfp->sf_si; @@ -996,17 +995,15 @@ /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; - if ((fp = fget_locked(fdp, fd)) == NULL) { - FILEDESC_SUNLOCK(fdp); + if ((fp = fget_unlocked(fdp, fd)) == NULL) return (EBADF); - } idx = fd / NFDBITS; bit = (fd_mask)1 << (fd % NFDBITS); ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); + fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } - FILEDESC_SUNLOCK(fdp); stp->st_flags = 0; td->td_retval[0] = n; return (0); @@ -1030,7 +1027,6 @@ fdp = td->td_proc->p_fd; n = 0; - FILEDESC_SLOCK(fdp); for (idx = 0, fd = 0; fd < nfd; idx++) { end = imin(fd + NFDBITS, nfd); for (bit = 1; fd < end; bit <<= 1, fd++) { @@ -1038,18 +1034,16 @@ flags = selflags(ibits, idx, bit); if (flags == 0) continue; - if ((fp = fget_locked(fdp, fd)) == NULL) { - FILEDESC_SUNLOCK(fdp); + if ((fp = fget_unlocked(fdp, fd)) == NULL) return (EBADF); - } selfdalloc(td, (void *)(uintptr_t)fd); ev = fo_poll(fp, flags, td->td_ucred, td); + fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } } - FILEDESC_SUNLOCK(fdp); td->td_retval[0] = n; return (0); } Index: kern/kern_descrip.c =================================================================== --- kern/kern_descrip.c (revision 191918) +++ kern/kern_descrip.c (working copy) @@ -125,12 +125,24 @@ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* + * Storage to hold unused ofiles that need to be reclaimed. + */ +struct freetable { + struct file **ft_table; + SLIST_ENTRY(freetable) ft_next; +}; + +/* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* + * ofiles which need to be reclaimed on free. + */ + SLIST_HEAD(,freetable) fd_free; + /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ @@ -1268,7 +1280,10 @@ static void fdgrowtable(struct filedesc *fdp, int nfd) { + struct filedesc0 *fdp0; + struct freetable *fo; struct file **ntable; + struct file **otable; char *nfileflags; int nnfiles, onfiles; NDSLOTTYPE *nmap; @@ -1287,7 +1302,7 @@ /* allocate a new table and (if required) new bitmaps */ FILEDESC_XUNLOCK(fdp); - ntable = malloc(nnfiles * OFILESIZE, + ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); nfileflags = (char *)&ntable[nnfiles]; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) @@ -1311,10 +1326,20 @@ } bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); bcopy(fdp->fd_ofileflags, nfileflags, onfiles); - if (onfiles > NDFILE) - free(fdp->fd_ofiles, M_FILEDESC); + otable = fdp->fd_ofiles; + fdp->fd_ofileflags = nfileflags; fdp->fd_ofiles = ntable; - fdp->fd_ofileflags = nfileflags; + /* + * We must preserve ofiles until the process exits because we can't + * be certain that no threads have references to the old table via + * _fget(). + */ + if (onfiles > NDFILE) { + fo = (struct freetable *)&otable[onfiles]; + fdp0 = (struct filedesc0 *)fdp; + fo->ft_table = otable; + SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next); + } if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) @@ -1512,6 +1537,8 @@ static void fddrop(struct filedesc *fdp) { + struct filedesc0 *fdp0; + struct freetable *ft; int i; mtx_lock(&fdesc_mtx); @@ -1521,6 +1548,11 @@ return; FILEDESC_LOCK_DESTROY(fdp); + fdp0 = (struct filedesc0 *)fdp; + while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) { + SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next); + free(ft->ft_table, M_FILEDESC); + } free(fdp, M_FILEDESC); } @@ -2022,7 +2054,39 @@ atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } +struct file * +fget_unlocked(struct filedesc *fdp, int fd) +{ + struct file *fp; + u_int count; + if (fd < 0 || fd >= fdp->fd_nfiles) + return (NULL); + /* + * Fetch the descriptor locklessly. We avoid fdrop() races by + * never raising a refcount above 0. To accomplish this we have + * to use a cmpset loop rather than an atomic_add. The descriptor + * must be re-verified once we acquire a reference to be certain + * that the identity is still correct and we did not lose a race + * due to preemption. + */ + for (;;) { + fp = fdp->fd_ofiles[fd]; + if (fp == NULL) + break; + count = fp->f_count; + if (count == 0) + continue; + if (atomic_cmpset_int(&fp->f_count, count, count + 1) != 1) + continue; + if (fp == ((struct file *volatile*)fdp->fd_ofiles)[fd]) + break; + fdrop(fp, curthread); + } + + return (fp); +} + /* * Extract the file pointer associated with the specified descriptor for the * current user process. @@ -2030,16 +2094,11 @@ * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * - * If 'hold' is set (non-zero) the file's refcount will be bumped on return. - * It should be dropped with fdrop(). If it is not set, then the refcount - * will not be bumped however the thread's filedesc struct will be returned - * locked (for fgetsock). - * * If an error occured the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is set and zero is returned. */ static __inline int -_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) +_fget(struct thread *td, int fd, struct file **fpp, int flags) { struct filedesc *fdp; struct file *fp; @@ -2047,29 +2106,22 @@ *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); - FILEDESC_SLOCK(fdp); - if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { - FILEDESC_SUNLOCK(fdp); + if ((fp = fget_unlocked(fdp, fd)) == NULL) return (EBADF); + if (fp->f_ops == &badfileops) { + fdrop(fp, td); + return (EBADF); } - /* * FREAD and FWRITE failure return EBADF as per POSIX. * * Only one flag, or 0, may be specified. */ - if (flags == FREAD && (fp->f_flag & FREAD) == 0) { - FILEDESC_SUNLOCK(fdp); + if ((flags == FREAD && (fp->f_flag & FREAD) == 0) || + (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) { + fdrop(fp, td); return (EBADF); } - if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { - FILEDESC_SUNLOCK(fdp); - return (EBADF); - } - if (hold) { - fhold(fp); - FILEDESC_SUNLOCK(fdp); - } *fpp = fp; return (0); } @@ -2078,21 +2130,21 @@ fget(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, 0, 1)); + return(_fget(td, fd, fpp, 0)); } int fget_read(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, FREAD, 1)); + return(_fget(td, fd, fpp, FREAD)); } int fget_write(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, FWRITE, 1)); + return(_fget(td, fd, fpp, FWRITE)); } /* @@ -2109,7 +2161,7 @@ int error; *vpp = NULL; - if ((error = _fget(td, fd, &fp, flags, 0)) != 0) + if ((error = _fget(td, fd, &fp, flags)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; @@ -2117,7 +2169,8 @@ *vpp = fp->f_vnode; vref(*vpp); } - FILEDESC_SUNLOCK(td->td_proc->p_fd); + fdrop(fp, td); + return (error); } @@ -2164,7 +2217,7 @@ *spp = NULL; if (fflagp != NULL) *fflagp = 0; - if ((error = _fget(td, fd, &fp, 0, 0)) != 0) + if ((error = _fget(td, fd, &fp, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; @@ -2176,7 +2229,8 @@ soref(*spp); SOCK_UNLOCK(*spp); } - FILEDESC_SUNLOCK(td->td_proc->p_fd); + fdrop(fp, td); + return (error); } @@ -3147,7 +3201,7 @@ { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } Index: sys/filedesc.h =================================================================== --- sys/filedesc.h (revision 191918) +++ sys/filedesc.h (working copy) @@ -128,6 +128,7 @@ int getvnode(struct filedesc *fdp, int fd, struct file **fpp); void mountcheckdirs(struct vnode *olddp, struct vnode *newdp); void setugidsafety(struct thread *td); +struct file *fget_unlocked(struct filedesc *fdp, int fd); static __inline struct file * fget_locked(struct filedesc *fdp, int fd)