Index: sys/kern/kern_event.c =================================================================== --- sys/kern/kern_event.c (revision 258883) +++ sys/kern/kern_event.c (working copy) @@ -290,6 +290,7 @@ { &fs_filtops }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops }, /* EVFILT_USER */ + { &null_filtops }, /* EVFILT_SENDFILE */ }; /* Index: sys/kern/uipc_syscalls.c =================================================================== --- sys/kern/uipc_syscalls.c (revision 258883) +++ sys/kern/uipc_syscalls.c (working copy) @@ -80,6 +80,9 @@ #include #endif +#include +#include +#include #include #include @@ -121,6 +124,10 @@ counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; +static int filt_sfsync_attach(struct knote *kn); +static void filt_sfsync_detach(struct knote *kn); +static int filt_sfsync(struct knote *kn, long hint); + /* * sendfile(2)-related variables and associated sysctls */ @@ -130,7 +137,28 @@ SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW, &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks"); +#ifdef SFSYNC_DEBUG +static int sf_sync_debug = 0; +SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW, + &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle"); +#define SFSYNC_DPRINTF(s, ...) \ + do { \ + if (sf_sync_debug) \ + printf((s), ##__VA_ARGS__); \ + } while (0) +#else +#define SFSYNC_DPRINTF(c, ...) 
+#endif +static uma_zone_t zone_sfsync; + +static struct filterops sendfile_filtops = { + .f_isfd = 0, + .f_attach = filt_sfsync_attach, + .f_detach = filt_sfsync_detach, + .f_event = filt_sfsync, +}; + static void sfstat_init(const void *unused) { @@ -140,6 +168,22 @@ } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); +static void +sf_sync_init(const void *unused) +{ + zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync), + NULL, NULL, +#ifdef INVARIANTS + trash_init, trash_fini, +#else + NULL, NULL, +#endif + UMA_ALIGN_CACHE, + 0); + kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops); +} +SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL); + static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { @@ -1845,7 +1889,124 @@ return (error); } +static int +filt_sfsync_attach(struct knote *kn) +{ + struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata; + struct knlist *knl = &sfs->klist; + + SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs); + + /* + * Validate that we actually received this via the kernel API. + */ + if ((kn->kn_flags & EV_FLAG1) == 0) + return (EPERM); + + kn->kn_ptr.p_v = sfs; + kn->kn_flags &= ~EV_FLAG1; + + knl->kl_lock(knl->kl_lockarg); + /* + * If we're in the "freeing" state, + * don't allow the add. That way we don't + * end up racing with some other thread that + * is trying to finish some setup. + */ + if (sfs->state == SF_STATE_FREEING) { + knl->kl_unlock(knl->kl_lockarg); + return (EINVAL); + } + knlist_add(&sfs->klist, kn, 1); + knl->kl_unlock(knl->kl_lockarg); + + return (0); +} + /* + * Called when a knote is being detached. 
+ */ +static void +filt_sfsync_detach(struct knote *kn) +{ + struct knlist *knl; + struct sendfile_sync *sfs; + int do_free = 0; + + sfs = kn->kn_ptr.p_v; + knl = &sfs->klist; + + SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs); + + knl->kl_lock(knl->kl_lockarg); + if (!knlist_empty(knl)) + knlist_remove(knl, kn, 1); + + /* + * If the list is empty _AND_ the refcount is 0 + * _AND_ we've finished the setup phase and now + * we're in the running phase, we can free the + * underlying sendfile_sync. + * + * But we shouldn't do it before finishing the + * underlying divorce from the knote. + * + * So, we have the sfsync lock held; transition + * it to "freeing", then unlock, then free + * normally. + */ + if (knlist_empty(knl)) { + if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) { + SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, " + "count==0, empty list: time to free!\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + sf_sync_set_state(sfs, SF_STATE_FREEING, 1); + do_free = 1; + } + } + knl->kl_unlock(knl->kl_lockarg); + + /* + * Only call free if we're the one who has transitioned things + * to free. Otherwise we could race with another thread that + * is currently tearing things down. + */ + if (do_free == 1) { + SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs, + __FILE__, + __LINE__); + sf_sync_free(sfs); + } +} + +static int +filt_sfsync(struct knote *kn, long hint) +{ + struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v; +// struct knlist *knl = &sfs->klist; + int ret; + + SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs); + + /* + * XXX these locks are owned when kqueue_register() is running? + * + * XXX add a lock assertion here? 
+ */ +// knl->kl_lock(knl->kl_lockarg); + ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED); +// knl->kl_unlock(knl->kl_lockarg); + + return (ret); +} + + +/* * Detach mapped page and release resources back to the system. */ int @@ -1870,12 +2031,28 @@ sfs = addr; sf_sync_deref(sfs); } + /* + * sfs may be invalid at this point, don't use it! + */ return (EXT_FREE_OK); } +/* + * Called to remove a reference to a sf_sync object. + * + * This is generally done during the mbuf free path to signify + * that one of the mbufs in the transaction has been completed. + * + * If we're doing SF_SYNC and the refcount is zero then we'll wake + * up any waiters. + * + * IF we're doing SF_KQUEUE and the refcount is zero then we'll + * fire off the knote. + */ void sf_sync_deref(struct sendfile_sync *sfs) { + int do_free = 0; if (sfs == NULL) return; @@ -1882,9 +2059,69 @@ mtx_lock(&sfs->mtx); KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0")); - if (--sfs->count == 0) - cv_signal(&sfs->cv); + sfs->count --; + + /* + * Only fire off the wakeup / kqueue notification if + * we are in the running state. + */ + if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) { + if (sfs->flags & SF_SYNC) + cv_signal(&sfs->cv); + + if (sfs->flags & SF_KQUEUE) { + SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + KNOTE_LOCKED(&sfs->klist, 1); + } + + /* + * If we're not waiting around for a sync, + * check if the knote list is empty. + * If it is, we transition to free. + * + * XXX I think it's about time I added some state + * or flag that says whether we're supposed to be + * waiting around until we've done a signal. + * + * XXX Ie, the reason that I don't free it here + * is because the caller will free the last reference, + * not us. That should be codified in some flag + * that indicates "self-free" rather than checking + * for SF_SYNC all the time. 
+ */ + if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) { + SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, " + "count==0, empty list: time to free!\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + sf_sync_set_state(sfs, SF_STATE_FREEING, 1); + do_free = 1; + } + + } mtx_unlock(&sfs->mtx); + + /* + * Attempt to do a free here. + * + * We do this outside of the lock because it may destroy the + * lock in question as it frees things. We can optimise this + * later. + * + * XXX yes, we should make it a requirement to hold the + * lock across sf_sync_free(). + */ + if (do_free == 1) { + SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + sf_sync_free(sfs); + } } /* @@ -1898,11 +2135,15 @@ { struct sendfile_sync *sfs; - sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); + sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); sfs->flags = flags; + sfs->state = SF_STATE_SETUP; + knlist_init_mtx(&sfs->klist, &sfs->mtx); + SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags); + return (sfs); } @@ -1931,13 +2172,49 @@ if (sfs == NULL) return; - mtx_lock(&sfs->mtx); + KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!", + __func__, + sfs)); + + /* + * If we're not requested to wait during the syscall, + * don't bother waiting. + */ + if ((sfs->flags & SF_SYNC) == 0) + goto out; + + /* + * This is a bit suboptimal and confusing, so bear with me. + * + * Ideally sf_sync_syscall_wait() will wait until + * all pending mbuf transmit operations are done. + * This means that when sendfile becomes async, it'll + * run in the background and will transition from + * RUNNING to COMPLETED when it's finished acquiring + * new things to send. Then, when the mbufs finish + * sending, COMPLETED + sfs->count == 0 is enough to + * know that no further work is being done. 
+ * + * So, we will sleep on both RUNNING and COMPLETED. + * It's up to the (in progress) async sendfile loop + * to transition the sf_sync from RUNNING to + * COMPLETED so the wakeup above will actually + * do the cv_signal() call. + */ + if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING) + goto out; + if (sfs->count != 0) cv_wait(&sfs->cv, &sfs->mtx); KASSERT(sfs->count == 0, ("sendfile sync still busy")); - mtx_unlock(&sfs->mtx); + +out: + return; } +/* + * Free an sf_sync if it's appropriate to. + */ void sf_sync_free(struct sendfile_sync *sfs) { @@ -1945,18 +2222,157 @@ if (sfs == NULL) return; + SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x " + "count=%d\n", + __func__, + (long long) curthread->td_tid, + sfs, + sfs->state, + sfs->flags, + sfs->count); + + mtx_lock(&sfs->mtx); + /* - * XXX we should ensure that nothing else has this - * locked before freeing. + * We keep the sf_sync around if the state is active, + * we are doing kqueue notification and we have active + * knotes. + * + * If the caller wants to free us right this second it + * should transition this to the freeing state. + * + * So, complain loudly if they break this rule. */ - mtx_lock(&sfs->mtx); + if (sfs->state != SF_STATE_FREEING) { + printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + mtx_unlock(&sfs->mtx); + return; + } + KASSERT(sfs->count == 0, ("sendfile sync still busy")); cv_destroy(&sfs->cv); + /* + * This doesn't call knlist_detach() on each knote; it just frees + * the entire list. + */ + knlist_delete(&sfs->klist, curthread, 1); mtx_destroy(&sfs->mtx); - free(sfs, M_TEMP); + SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + uma_zfree(zone_sfsync, sfs); } /* + * Setup a sf_sync to post a kqueue notification when things are complete. 
+ */ +int +sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq) +{ + struct kevent kev; + int error; + + sfs->flags |= SF_KQUEUE; + + /* Check the flags are valid */ + if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) + return (EINVAL); + + SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n", + __func__, + sfs, + sfkq->kq_fd, + sfkq->kq_flags, + (void *) sfkq->kq_ident, + (void *) sfkq->kq_udata); + + /* Setup and register a knote on the given kqfd. */ + kev.ident = (uintptr_t) sfkq->kq_ident; + kev.filter = EVFILT_SENDFILE; + kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags; + kev.data = (intptr_t) sfs; + kev.udata = sfkq->kq_udata; + + error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1); + if (error != 0) { + SFSYNC_DPRINTF("%s: returned %d\n", __func__, error); + } + return (error); +} + +void +sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state, int islocked) +{ + sendfile_sync_state_t old_state; + + if (! islocked) + mtx_lock(&sfs->mtx); + + /* + * Update our current state. + */ + old_state = sfs->state; + sfs->state = state; + SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs, + old_state, + state); + + /* + * If we're transitioning from RUNNING to COMPLETED and the count is zero, + * then post the knote. The caller may have completed the send before + * we updated the state to COMPLETED and we need to make sure this is + * communicated. + */ + if (old_state == SF_STATE_RUNNING + && state == SF_STATE_COMPLETED + && sfs->count == 0 + && sfs->flags & SF_KQUEUE) { + SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + KNOTE_LOCKED(&sfs->klist, 1); + } + + if (! islocked) + mtx_unlock(&sfs->mtx); +} + +/* + * Set the retval/errno for the given transaction. 
+ * + * This will eventually/ideally be used when the KNOTE is fired off + * to signify the completion of this transaction. + * + * The sfsync lock should be held before entering this function. + */ +void +sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno) +{ + + KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!", + __func__, + sfs)); + + SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs, + xerrno, + (intmax_t) retval); + + sfs->retval = retval; + sfs->xerrno = xerrno; +} + +/* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, @@ -1978,6 +2394,7 @@ do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; + struct sf_hdtr_kq hdtr_kq; struct uio *hdr_uio, *trl_uio; struct file *fp; cap_rights_t rights; @@ -1984,6 +2401,8 @@ int error; off_t sbytes; struct sendfile_sync *sfs; + int do_kqueue = 0; + int do_free = 0; /* * File offset must be positive. If it goes beyond EOF @@ -1995,6 +2414,10 @@ hdr_uio = trl_uio = NULL; sfs = NULL; + /* + * XXX TODO: handle copying in the extra kqueue data and + * somehow using it as appropriate. + */ if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error != 0) @@ -2008,7 +2431,19 @@ error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error != 0) goto out; + } + /* + * If SF_KQUEUE is set, then we need to also copy in + * the kqueue data after the normal hdtr set and set do_kqueue=1. + */ + if (uap->flags & SF_KQUEUE) { + error = copyin(((char *) uap->hdtr) + sizeof(hdtr), + &hdtr_kq, + sizeof(hdtr_kq)); + if (error != 0) + goto out; + do_kqueue = 1; } } @@ -2024,20 +2459,121 @@ } /* + * IF SF_KQUEUE is set but we haven't copied in anything for + * kqueue data, error out. 
+ */ + if (uap->flags & SF_KQUEUE && do_kqueue == 0) { + SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__); + error = EINVAL; + goto out; + } + + /* * If we need to wait for completion, initialise the sfsync * state here. */ - if (uap->flags & SF_SYNC) - sfs = sf_sync_alloc(uap->flags & SF_SYNC); + if (uap->flags & (SF_SYNC | SF_KQUEUE)) + sfs = sf_sync_alloc(uap->flags & (SF_SYNC | SF_KQUEUE)); + if (uap->flags & SF_KQUEUE) { + error = sf_sync_kqueue_setup(sfs, &hdtr_kq); + if (error) { + SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n", + __func__, + (unsigned long long) curthread->td_tid, + sfs); + sf_sync_set_state(sfs, SF_STATE_FREEING, 0); + sf_sync_free(sfs); + goto out; + } + } + + /* + * Do the sendfile call. + * + * If this fails, it'll free the mbuf chain which will free up the + * sendfile_sync references. + */ error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, sfs, td); /* - * If appropriate, do the wait and free here. + * If the sendfile call succeeded, transition the sf_sync state + * to RUNNING, then COMPLETED. + * + * If the sendfile call failed, then the sendfile call may have + * actually sent some data first - so we check to see whether + * any data was sent. If some data was queued (ie, count > 0) + * then we can't call free; we have to wait until the partial + * transaction completes before we continue along. + * + * This has the side effect of firing off the knote + * if the refcount has hit zero by the time we get here. */ if (sfs != NULL) { + mtx_lock(&sfs->mtx); + if (error == 0 || sfs->count > 0) { + /* + * When it's time to do async sendfile, the transition + * to RUNNING signifies that we're actually actively + * adding and completing mbufs.
When the last disk + * buffer is read (ie, when we're not doing any + * further read IO and all subsequent stuff is mbuf + * transmissions) we'll transition to COMPLETED + * and when the final mbuf is freed, the completion + * will be signaled. + */ + sf_sync_set_state(sfs, SF_STATE_RUNNING, 1); + + /* + * Set the retval before we signal completed. + * If we do it the other way around then transitioning to + * COMPLETED may post the knote before you set the return + * status! + * + * XXX for now, errno is always 0, as we don't post + * knotes if sendfile failed. Maybe that'll change later. + */ + sf_sync_set_retval(sfs, sbytes, error); + + /* + * And now transition to completed, which will kick off + * the knote if required. + */ + sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1); + } else { + /* + * Error isn't zero, sfs_count is zero, so we + * won't have some other thing to wake things up. + * Thus free. + */ + sf_sync_set_state(sfs, SF_STATE_FREEING, 1); + do_free = 1; + } + + /* + * Next - wait if appropriate. + */ sf_sync_syscall_wait(sfs); + + /* + * If we're not doing kqueue notifications, we can + * transition this immediately to the freeing state. + */ + if ((sfs->flags & SF_KQUEUE) == 0) { + sf_sync_set_state(sfs, SF_STATE_FREEING, 1); + do_free = 1; + } + + mtx_unlock(&sfs->mtx); + } + + /* + * If do_free is set, free here. + * + * If we're doing no-kqueue notification and it's just sleep notification, + * we also do free; it's the only chance we have. 
+ */ + if (sfs != NULL && do_free == 1) { sf_sync_free(sfs); } Index: sys/sys/event.h =================================================================== --- sys/sys/event.h (revision 258883) +++ sys/sys/event.h (working copy) @@ -42,7 +42,8 @@ #define EVFILT_FS (-9) /* filesystem events */ #define EVFILT_LIO (-10) /* attached to lio requests */ #define EVFILT_USER (-11) /* User events */ -#define EVFILT_SYSCOUNT 11 +#define EVFILT_SENDFILE (-12) /* attached to sendfile requests */ +#define EVFILT_SYSCOUNT 12 #define EV_SET(kevp_, a, b, c, d, e, f) do { \ struct kevent *kevp = (kevp_); \ @@ -212,7 +213,8 @@ struct file *p_fp; /* file data pointer */ struct proc *p_proc; /* proc pointer */ struct aiocblist *p_aio; /* AIO job pointer */ - struct aioliojob *p_lio; /* LIO job pointer */ + struct aioliojob *p_lio; /* LIO job pointer */ + void *p_v; /* generic other pointer */ } kn_ptr; struct filterops *kn_fop; void *kn_hook; Index: sys/sys/sf_sync.h =================================================================== --- sys/sys/sf_sync.h (revision 258883) +++ sys/sys/sf_sync.h (working copy) @@ -29,17 +29,36 @@ #ifndef _SYS_SF_SYNC_H_ #define _SYS_SF_SYNC_H_ +typedef enum { + SF_STATE_NONE, + SF_STATE_SETUP, + SF_STATE_RUNNING, + SF_STATE_COMPLETED, + SF_STATE_FREEING +} sendfile_sync_state_t; + struct sendfile_sync { - uint32_t flags; struct mtx mtx; struct cv cv; - unsigned count; + struct knlist klist; + uint32_t flags; + uint32_t count; + int32_t xerrno; /* Completion errno, if retval < 0 */ + off_t retval; /* Completion retval (eg written bytes) */ + sendfile_sync_state_t state; }; +/* XXX pollution */ +struct sf_hdtr_kq; + extern struct sendfile_sync * sf_sync_alloc(uint32_t flags); extern void sf_sync_syscall_wait(struct sendfile_sync *); extern void sf_sync_free(struct sendfile_sync *); +extern void sf_sync_try_free(struct sendfile_sync *); extern void sf_sync_ref(struct sendfile_sync *); extern void sf_sync_deref(struct sendfile_sync *); +extern int 
sf_sync_kqueue_setup(struct sendfile_sync *, struct sf_hdtr_kq *); +extern void sf_sync_set_state(struct sendfile_sync *, sendfile_sync_state_t, int); +extern void sf_sync_set_retval(struct sendfile_sync *, off_t, int); #endif /* !_SYS_SF_BUF_H_ */ Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h (revision 258883) +++ sys/sys/socket.h (working copy) @@ -577,11 +577,27 @@ }; /* + * sendfile(2) kqueue information + */ +struct sf_hdtr_kq { + int kq_fd; /* kq fd to post completion events on */ + uint32_t kq_flags; /* extra flags to pass in */ + void *kq_udata; /* user data pointer */ + uintptr_t kq_ident; /* ident (from userland?) */ +}; + +struct sf_hdtr_all { + struct sf_hdtr hdtr; + struct sf_hdtr_kq kq; +}; + +/* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 #define SF_MNOWAIT 0x00000002 #define SF_SYNC 0x00000004 +#define SF_KQUEUE 0x00000008 #ifdef _KERNEL #define SFK_COMPAT 0x00000001