Index: kern/kern_event.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_event.c,v retrieving revision 1.111 diff -u -r1.111 kern_event.c --- kern/kern_event.c 28 May 2007 17:15:05 -0000 1.111 +++ kern/kern_event.c 4 Jul 2007 01:07:48 -0000 @@ -1400,7 +1400,8 @@ revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); - kq->kq_state |= KQ_SEL; + if (SEL_WAITING(&kq->kq_sel)) + kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); @@ -1486,8 +1487,9 @@ } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { - kq->kq_state &= ~KQ_SEL; selwakeuppri(&kq->kq_sel, PSOCK); + if (!SEL_WAITING(&kq->kq_sel)) + kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); @@ -1522,8 +1524,9 @@ wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { - kq->kq_state &= ~KQ_SEL; selwakeuppri(&kq->kq_sel, PSOCK); + if (!SEL_WAITING(&kq->kq_sel)) + kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); Index: kern/kern_thread.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_thread.c,v retrieving revision 1.250 diff -u -r1.250 kern_thread.c --- kern/kern_thread.c 12 Jun 2007 19:49:39 -0000 1.250 +++ kern/kern_thread.c 4 Jul 2007 01:07:48 -0000 @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -204,6 +205,7 @@ sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); vm_thread_dispose(td); + seltdfini(td); } /* Index: kern/sys_generic.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_generic.c,v retrieving revision 1.157 diff -u -r1.157 sys_generic.c --- kern/sys_generic.c 5 Jun 2007 00:00:54 -0000 1.157 +++ kern/sys_generic.c 4 Jul 2007 01:07:48 -0000 @@ -68,18 +68,47 @@ #ifdef KTRACE #include #endif +#include static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 
MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollscan(struct thread *, struct pollfd *, u_int); +static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); +static int selrescan(struct thread *, fd_mask **, fd_mask **); +static void selfdalloc(struct thread *, void *); +static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); +static void seltdinit(struct thread *); +static int seltdwait(struct seltd *, int); +static void seltdclear(struct thread *); + +/* + * One seltd per-thread allocated on demand as needed. + * + * t - protected by st_mtx + * k - Only accessed by curthread or read-only + */ +struct seltd { + struct selfdlist st_selq; /* (k) List of selfds. */ + struct selfd *st_free1; /* (k) free fd for read set. */ + struct selfd *st_free2; /* (k) free fd for write set. */ + void *st_cookie; /* (k) temp holds fd. */ + struct mtx st_mtx; /* Protects struct seltd */ + struct cv st_wait; /* (t) Wait channel. */ + int st_flags; /* (t) SELTD_ flags. */ +}; + +#define SELTD_PENDING 0x0001 /* We have pending events. */ +#define SELTD_RESCAN 0x0002 /* Doing a rescan. */ + +static uma_zone_t selfd_zone; #ifndef _SYS_SYSPROTO_H_ struct read_args { @@ -601,14 +630,6 @@ return (error); } -/* - * sellock and selwait are initialized in selectinit() via SYSINIT. 
- */ -struct mtx sellock; -struct cv selwait; -u_int nselcoll; /* Select collisions since boot */ -SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); - #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; @@ -650,7 +671,7 @@ fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv, rtv, ttv; int error, timo; - u_int ncoll, nbufbytes, ncpbytes, nfdbits; + u_int nbufbytes, ncpbytes, nfdbits; if (nd < 0) return (EINVAL); @@ -695,7 +716,7 @@ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpbytes); \ if (error != 0) \ - goto done_nosellock; \ + goto done; \ } \ } while (0) getbits(fd_in, 0); @@ -709,7 +730,7 @@ atv = *tvp; if (itimerfix(&atv)) { error = EINVAL; - goto done_nosellock; + goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); @@ -718,58 +739,31 @@ atv.tv_usec = 0; } timo = 0; - TAILQ_INIT(&td->td_selq); - mtx_lock(&sellock); -retry: - ncoll = nselcoll; - thread_lock(td); - td->td_flags |= TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - - error = selscan(td, ibits, obits, nd); - mtx_lock(&sellock); - if (error || td->td_retval[0]) - goto done; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) - goto done; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz(&ttv); - } - - /* - * An event of interest may occur while we do not hold - * sellock, so check TDF_SELECT and the number of - * collisions and rescan the file descriptors if - * necessary. - */ - thread_lock(td); - if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - thread_unlock(td); - goto retry; + seltdinit(td); + /* Iterate until the timeout expires or descriptors become ready. 
*/ + for (;;) { + error = selscan(td, ibits, obits, nd); + if (error || td->td_retval[0] != 0) + break; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + break; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + error = seltdwait(td->td_sel, timo); + if (error) + break; + error = selrescan(td, ibits, obits); + if (error || td->td_retval[0] != 0) + break; } - thread_unlock(td); - - if (timo > 0) - error = cv_timedwait_sig(&selwait, &sellock, timo); - else - error = cv_wait_sig(&selwait, &sellock); - - if (error == 0) - goto retry; + seltdclear(td); done: - clear_selinfo_list(td); - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - -done_nosellock: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; @@ -792,6 +786,60 @@ return (error); } +/* + * Traverse the list of fds attached to this thread's seltd and check for + * completion. + */ +static int +selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) +{ + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + struct selinfo *si; + struct file *fp; + int msk, fd; + int n = 0; + /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ + static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; + struct filedesc *fdp = td->td_proc->p_fd; + + stp = td->td_sel; + FILEDESC_SLOCK(fdp); + TAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { + fd = (int)(uintptr_t)sfp->sf_cookie; + si = sfp->sf_si; + selfdfree(stp, sfp); + /* If the selinfo wasn't cleared the event didn't fire. 
*/ + if (si != NULL) + continue; + if ((fp = fget_locked(fdp, fd)) == NULL) { + FILEDESC_SUNLOCK(fdp); + return (EBADF); + } + for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; + if ((ibits[msk][fd/NFDBITS] & + ((fd_mask) 1 << (fd % NFDBITS))) == 0) + continue; + if (fo_poll(fp, flag[msk], td->td_ucred, td)) { + obits[msk][(fd)/NFDBITS] |= + ((fd_mask)1 << ((fd) % NFDBITS)); + n++; + } + } + } + FILEDESC_SUNLOCK(fdp); + stp->st_flags = 0; + td->td_retval[0] = n; + return (0); +} + +/* + * Perform the initial filedescriptor scan and register ourselves with + * each selinfo. + */ static int selscan(td, ibits, obits, nfd) struct thread *td; @@ -820,6 +868,7 @@ FILEDESC_SUNLOCK(fdp); return (EBADF); } + selfdalloc(td, (void *)(uintptr_t)fd); if (fo_poll(fp, flag[msk], td->td_ucred, td)) { obits[msk][(fd)/NFDBITS] |= @@ -850,7 +899,7 @@ struct pollfd smallbits[32]; struct timeval atv, rtv, ttv; int error = 0, timo; - u_int ncoll, nfds; + u_int nfds; size_t ni; nfds = uap->nfds; @@ -877,13 +926,13 @@ bits = smallbits; error = copyin(uap->fds, bits, ni); if (error) - goto done_nosellock; + goto done; if (uap->timeout != INFTIM) { atv.tv_sec = uap->timeout / 1000; atv.tv_usec = (uap->timeout % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; - goto done_nosellock; + goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); @@ -892,56 +941,31 @@ atv.tv_usec = 0; } timo = 0; - TAILQ_INIT(&td->td_selq); - mtx_lock(&sellock); -retry: - ncoll = nselcoll; - thread_lock(td); - td->td_flags |= TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - - error = pollscan(td, bits, nfds); - mtx_lock(&sellock); - if (error || td->td_retval[0]) - goto done; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) - goto done; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? 
- 24 * 60 * 60 * hz : tvtohz(&ttv); - } - /* - * An event of interest may occur while we do not hold - * sellock, so check TDF_SELECT and the number of collisions - * and rescan the file descriptors if necessary. - */ - thread_lock(td); - if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - thread_unlock(td); - goto retry; + seltdinit(td); + /* Iterate until the timeout expires or descriptors become ready. */ + for (;;) { + error = pollscan(td, bits, nfds); + if (error || td->td_retval[0] != 0) + break; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + break; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + error = seltdwait(td->td_sel, timo); + if (error) + break; + error = pollrescan(td); + if (error || td->td_retval[0] != 0) + break; } - thread_unlock(td); - - if (timo > 0) - error = cv_timedwait_sig(&selwait, &sellock, timo); - else - error = cv_wait_sig(&selwait, &sellock); - - if (error == 0) - goto retry; + seltdclear(td); done: - clear_selinfo_list(td); - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - -done_nosellock: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; @@ -960,12 +984,56 @@ } static int +pollrescan(struct thread *td) +{ + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + struct selinfo *si; + struct filedesc *fdp; + struct file *fp; + struct pollfd *fd; + int n; + + n = 0; + fdp = td->td_proc->p_fd; + stp = td->td_sel; + FILEDESC_SLOCK(fdp); + TAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { + fd = (struct pollfd *)sfp->sf_cookie; + si = sfp->sf_si; + selfdfree(stp, sfp); + /* If the selinfo wasn't cleared the event didn't fire. 
*/ + if (si != NULL) + continue; + fp = fdp->fd_ofiles[fd->fd]; + if (fp == NULL) { + fd->revents = POLLNVAL; + n++; + continue; + } + /* + * Note: backend also returns POLLHUP and + * POLLERR if appropriate. + */ + fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); + if (fd->revents != 0) + n++; + } + FILEDESC_SUNLOCK(fdp); + stp->st_flags = 0; + td->td_retval[0] = n; + return (0); +} + + +static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { - register struct filedesc *fdp = td->td_proc->p_fd; + struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; @@ -987,6 +1055,7 @@ * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ + selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); if (fds->revents != 0) @@ -1020,23 +1089,36 @@ } /* - * Remove the references to the thread from all of the objects we were - * polling. - * - * This code assumes that the underlying owner of the selinfo structure will - * hold sellock before it changes it, and that it will unlink itself from our - * list if it goes away. + * Preallocate two selfds associated with 'cookie'. Some fo_poll routines + * have two select sets, one for read and another for write. 
*/ -void -clear_selinfo_list(td) - struct thread *td; +static void +selfdalloc(struct thread *td, void *cookie) { - struct selinfo *si; + struct seltd *stp; - mtx_assert(&sellock, MA_OWNED); - TAILQ_FOREACH(si, &td->td_selq, si_thrlist) - si->si_thread = NULL; - TAILQ_INIT(&td->td_selq); + stp = td->td_sel; + stp->st_cookie = cookie; + if (stp->st_free1 == NULL) + stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); + stp->st_free1->sf_flags = SELFD_MALLOC; + if (stp->st_free2 == NULL) + stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); + stp->st_free2->sf_flags = SELFD_MALLOC; +} + +static void +selfdfree(struct seltd *stp, struct selfd *sfp) +{ + TAILQ_REMOVE(&stp->st_selq, sfp, sf_link); + mtx_lock(sfp->sf_mtx); + if (sfp->sf_si) + TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); + mtx_unlock(sfp->sf_mtx); + if (sfp->sf_flags & SELFD_MALLOC) + uma_zfree(selfd_zone, sfp); + else + atomic_store_rel_ptr((volatile uintptr_t *)&sfp->sf_td, 0); } /* @@ -1047,26 +1129,43 @@ struct thread *selector; struct selinfo *sip; { + struct selfd *sfp; + struct seltd *stp; + struct mtx *mtx; - mtx_lock(&sellock); + stp = selector->td_sel; /* - * If the selinfo's thread pointer is NULL then take ownership of it. - * - * If the thread pointer is not NULL and it points to another - * thread, then we have a collision. - * - * If the thread pointer is not NULL and points back to us then leave - * it alone as we've already added pointed it at us and added it to - * our list. + * Don't record when doing a rescan. */ - if (sip->si_thread == NULL) { - sip->si_thread = selector; - TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); - } else if (sip->si_thread != selector) { - sip->si_flags |= SI_COLL; + if (stp->st_flags & SELTD_RESCAN) + return; + mtx = mtx_pool_find(mtxpool_sleep, sip); + mtx_lock(mtx); + if (sip->si_mtx == NULL) { + sip->si_mtx = mtx; + TAILQ_INIT(&sip->si_tdlist); } - - mtx_unlock(&sellock); + /* + * Grab one of the preallocated descriptors. 
+ */ + if (sip->si_fd.sf_td == NULL) + sfp = &sip->si_fd; + else if ((sfp = stp->st_free1) != NULL) + stp->st_free1 = NULL; + else if ((sfp = stp->st_free2) != NULL) + stp->st_free2 = NULL; + else + panic("selrecord: No free selfd on selq"); + sfp->sf_cookie = stp->st_cookie; + sfp->sf_si = sip; + sfp->sf_td = stp; + sfp->sf_mtx = sip->si_mtx; + TAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); + /* + * Add this thread to the list of selfds listening on this selinfo. + */ + TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); + mtx_unlock(sip->si_mtx); } /* Wake up a selecting thread. */ @@ -1083,7 +1182,7 @@ struct selinfo *sip; int pri; { - doselwakeup(sip, pri); + doselwakeup(sip, /*pri*/ -1); } /* @@ -1094,36 +1193,116 @@ struct selinfo *sip; int pri; { - struct thread *td; + struct selfd *sfp; + struct selfd *sfn; + struct seltd *stp; - mtx_lock(&sellock); - td = sip->si_thread; - if ((sip->si_flags & SI_COLL) != 0) { - nselcoll++; - sip->si_flags &= ~SI_COLL; - cv_broadcastpri(&selwait, pri); - } - if (td == NULL) { - mtx_unlock(&sellock); + /* If it's not initialized there can't be any waiters. */ + if (sip->si_mtx == NULL) return; + /* + * Locking the selinfo locks all selfds associated with it. + */ + mtx_lock(sip->si_mtx); + TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { + /* + * Once we remove this sfp from the list and clear the + * sf_si seltdclear will know to ignore this si. 
+	 */
+		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
+		sfp->sf_si = NULL;
+		stp = sfp->sf_td;
+		mtx_lock(&stp->st_mtx);
+		stp->st_flags |= SELTD_PENDING;
+		cv_broadcastpri(&stp->st_wait, pri);
+		mtx_unlock(&stp->st_mtx);
+	}
+	mtx_unlock(sip->si_mtx);
+}
+
+static void
+seltdinit(struct thread *td)
+{
+	struct seltd *stp;
+
+	if ((stp = td->td_sel) != NULL)
+		goto out;
+	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
+	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
+	cv_init(&stp->st_wait, "select");
+out:
+	stp->st_flags = 0;
+	TAILQ_INIT(&stp->st_selq);
+}
+
+static int
+seltdwait(struct seltd *stp, int timo)
+{
+	int error;
+
+	/*
+	 * An event of interest may occur while we do not hold the seltd
+	 * locked so check the pending flag before we sleep.
+	 */
+	mtx_lock(&stp->st_mtx);
+	/*
+	 * Any further calls to selrecord will be a rescan.
+	 */
+	stp->st_flags |= SELTD_RESCAN;
+	if (stp->st_flags & SELTD_PENDING) {
+		mtx_unlock(&stp->st_mtx);
+		return (0);
 	}
-	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
-	sip->si_thread = NULL;
-	thread_lock(td);
-	td->td_flags &= ~TDF_SELECT;
-	thread_unlock(td);
-	sleepq_remove(td, &selwait);
-	mtx_unlock(&sellock);
+	if (timo > 0)
+		error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
+	else
+		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
+	mtx_unlock(&stp->st_mtx);
+
+	return (error);
+}
+
+void
+seltdfini(struct thread *td)
+{
+	struct seltd *stp;
+
+	stp = td->td_sel;
+	if (stp == NULL)
+		return;
+	if (stp->st_free1)
+		uma_zfree(selfd_zone, stp->st_free1);
+	stp->st_free1 = NULL;
+	if (stp->st_free2)
+		uma_zfree(selfd_zone, stp->st_free2);
+	stp->st_free2 = NULL;
+	td->td_sel = NULL;
+	free(stp, M_SELECT);
+}
+
+/*
+ * Remove the references to the thread from all of the objects we were
+ * polling.
+ */ +static void +seltdclear(struct thread *td) +{ + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + + stp = td->td_sel; + TAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) + selfdfree(stp, sfp); + stp->st_flags = 0; } static void selectinit(void *); -SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) +SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); -/* ARGSUSED*/ static void -selectinit(dummy) - void *dummy; +selectinit(void *dummy __unused) { - cv_init(&selwait, "select"); - mtx_init(&sellock, "sellck", NULL, MTX_DEF); + selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); } Index: kern/sys_pipe.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_pipe.c,v retrieving revision 1.191 diff -u -r1.191 sys_pipe.c --- kern/sys_pipe.c 27 May 2007 17:33:10 -0000 1.191 +++ kern/sys_pipe.c 4 Jul 2007 01:07:48 -0000 @@ -524,8 +524,9 @@ PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { - cpipe->pipe_state &= ~PIPE_SEL; selwakeuppri(&cpipe->pipe_sel, PSOCK); + if (!SEL_WAITING(&cpipe->pipe_sel)) + cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); @@ -1350,12 +1351,14 @@ if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); - rpipe->pipe_state |= PIPE_SEL; + if (SEL_WAITING(&rpipe->pipe_sel)) + rpipe->pipe_state |= PIPE_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); - wpipe->pipe_state |= PIPE_SEL; + if (SEL_WAITING(&wpipe->pipe_sel)) + wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC Index: kern/uipc_sockbuf.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/uipc_sockbuf.c,v retrieving revision 1.171 diff -u -r1.171 uipc_sockbuf.c --- kern/uipc_sockbuf.c 31 May 2007 11:51:22 -0000 1.171 +++ 
kern/uipc_sockbuf.c 4 Jul 2007 01:07:48 -0000 @@ -176,7 +176,8 @@ SOCKBUF_LOCK_ASSERT(sb); selwakeuppri(&sb->sb_sel, PSOCK); - sb->sb_flags &= ~SB_SEL; + if (!SEL_WAITING(&sb->sb_sel)) + sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_cc); Index: kern/uipc_socket.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.302 diff -u -r1.302 uipc_socket.c --- kern/uipc_socket.c 4 Jun 2007 18:25:07 -0000 1.302 +++ kern/uipc_socket.c 4 Jul 2007 01:07:48 -0000 @@ -2489,12 +2489,14 @@ (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(td, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_SEL; + if (SEL_WAITING(&so->so_rcv.sb_sel)) + so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_SEL; + if (SEL_WAITING(&so->so_snd.sb_sel)) + so->so_snd.sb_flags |= SB_SEL; } } Index: sys/proc.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.486 diff -u -r1.486 proc.h --- sys/proc.h 12 Jun 2007 20:22:06 -0000 1.486 +++ sys/proc.h 4 Jul 2007 01:07:48 -0000 @@ -141,7 +141,6 @@ * m - Giant * n - not locked, lazy * o - ktrace lock - * p - select lock (sellock) * q - td_contested lock * r - p_peers lock * t - thread lock @@ -209,7 +208,7 @@ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ - TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ + struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ @@ -321,7 +320,7 @@ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. 
 */
 #define	TDF_TIMEOUT	0x00000010 /* Timing out during sleep. */
 #define	TDF_IDLETD	0x00000020 /* This is a per-CPU idle thread. */
-#define	TDF_SELECT	0x00000040 /* Selecting; wakeup/waiting danger. */
+#define	TDF_UNUSEDx40	0x00000040 /* --available-- */
 #define	TDF_SLEEPABORT	0x00000080 /* sleepq_abort was called. */
 #define	TDF_UNUSEDx100	0x00000100 /* --available-- */
 #define	TDF_UBORROWING	0x00000200 /* Thread is borrowing user pri. */
Index: sys/selinfo.h
===================================================================
RCS file: /usr/home/ncvs/src/sys/sys/selinfo.h,v
retrieving revision 1.18
diff -u -r1.18 selinfo.h
--- sys/selinfo.h	15 Aug 2004 06:24:42 -0000	1.18
+++ sys/selinfo.h	4 Jul 2007 01:07:48 -0000
@@ -36,25 +36,41 @@
 #include <sys/event.h>		/* for struct klist */
 
 /*
+ * One selfd allocated per-thread per-file-descriptor.
+ *	f - protected by sf_mtx
+ *	k - Only accessed by curthread or read-only
+ */
+struct selfd {
+	TAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
+	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
+	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
+	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
+	struct seltd		*sf_td;		/* (k) owning seltd. */
+	void			*sf_cookie;	/* (k) fd or pollfd. */
+	int			sf_flags;	/* (k) SELFD_ flags. */
+};
+#define	SELFD_MALLOC	0x0001		/* Was malloced. */
+
+TAILQ_HEAD(selfdlist, selfd);
+
+/*
  * Used to maintain information about processes that wish to be
  * notified when I/O becomes possible.
  */
 struct selinfo {
-	TAILQ_ENTRY(selinfo) si_thrlist;	/* list hung off of thread */
-	struct thread	*si_thread;	/* thread waiting */
-	struct knlist	si_note;	/* kernel note list */
-	short	si_flags;	/* see below */
+	struct selfdlist	si_tdlist;	/* List of sleeping threads. */
+	struct knlist		si_note;	/* kernel note list */
+	struct mtx		*si_mtx;	/* Lock for tdlist. */
+	struct selfd		si_fd;		/* Avoid malloc.
*/ }; -#define SI_COLL 0x0001 /* collision occurred */ -#define SEL_WAITING(si) \ - ((si)->si_thread != NULL || ((si)->si_flags & SI_COLL) != 0) +#define SEL_WAITING(si) (!TAILQ_EMPTY(&(si)->si_tdlist)) #ifdef _KERNEL -void clear_selinfo_list(struct thread *td); void selrecord(struct thread *selector, struct selinfo *sip); void selwakeup(struct selinfo *sip); void selwakeuppri(struct selinfo *sip, int pri); +void seltdfini(struct thread *td); #endif #endif /* !_SYS_SELINFO_H_ */ Index: sys/systm.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/systm.h,v retrieving revision 1.258 diff -u -r1.258 systm.h --- sys/systm.h 12 Jun 2007 00:12:01 -0000 1.258 +++ sys/systm.h 4 Jul 2007 01:07:48 -0000 @@ -57,10 +57,6 @@ extern int nswap; /* size of swap space */ -extern u_int nselcoll; /* select collisions since boot */ -extern struct mtx sellock; /* select lock variable */ -extern struct cv selwait; /* select conditional variable */ - extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */