Index: sys/kern/kern_timeout.c
===================================================================
--- sys/kern/kern_timeout.c	(revision 238497)
+++ sys/kern/kern_timeout.c	(working copy)
@@ -117,6 +117,7 @@ struct callout_cpu {
 	struct callout_tailq	cc_expireq;
 	struct callout_list	cc_callfree;
 	struct callout		*cc_next;
+	struct callout		*cc_next_direct;
 	struct callout		*cc_curr;
 	struct bintime		cc_firstevent;
 	struct bintime		cc_lastscan;
@@ -145,6 +146,10 @@ struct callout_cpu cc_cpu;
 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
 #define	C_PRECISION	0x2
 
+#ifndef MAX_SOFTCLOCK_STEPS
+#define	MAX_SOFTCLOCK_STEPS	100	/* Maximum allowed value of steps. */
+#endif /* MAX_SOFTCLOCK_STEPS */
+
 #define	FREQ2BT(freq, bt)						\
 {									\
 	(bt)->sec = 0;							\
@@ -348,151 +353,15 @@ get_bucket(struct bintime *bt)
 	return callout_hash(bt) & callwheelmask;
 }
 
-void
-callout_process(struct bintime *now)
+static inline void
+callout_stats(int depth, int mpcalls, int lockcalls, int gcalls)
 {
-	struct bintime max, min, next, tmp_max, tmp_min;
-	struct callout *tmp;
-	struct callout_cpu *cc;
-	struct callout_tailq *sc;
-	int cpu, first, future, last, need_softclock;
-
-	/*
-	 * Process callouts at a very low cpu priority, so we don't keep the
-	 * relatively high clock interrupt priority any longer than necessary.
-	 */
-	need_softclock = 0;
-	cc = CC_SELF();
-	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	cpu = curcpu;
-	first = callout_hash(&cc->cc_lastscan);
-	last = callout_hash(now);
-	/*
-	 * Check if we wrapped around the entire wheel from the last scan.
-	 * In case, we need to scan entirely the wheel for pending callouts.
-	 */
-	last = (last - first >= callwheelsize) ? (first - 1) & callwheelmask :
-	    last & callwheelmask;
-	first &= callwheelmask;
-	for (;;) {
-		sc = &cc->cc_callwheel[first];
-		TAILQ_FOREACH(tmp, sc, c_links.tqe) {
-			next = tmp->c_time;
-			bintime_sub(&next, &tmp->c_precision);
-			if (bintime_cmp(&next, now, <=)) {
-				/*
-				 * Consumer told us the callout may be run
-				 * directly from hardware interrupt context.
-				 */
-				if (tmp->c_flags & CALLOUT_DIRECT) {
-					tmp->c_func(tmp->c_arg);
-					TAILQ_REMOVE(sc, tmp, c_links.tqe);
-					tmp->c_flags &= ~CALLOUT_PENDING;
-				} else {
-					TAILQ_INSERT_TAIL(&cc->cc_expireq,
-					    tmp, c_staiter);
-					TAILQ_REMOVE(sc, tmp, c_links.tqe);
-					tmp->c_flags |= CALLOUT_PROCESSED;
-					need_softclock = 1;
-				}
-			}
-		}
-		if (first == last)
-			break;
-		first = (first + 1) & callwheelmask;
-	}
-	future = (last + hz / 4) & callwheelmask;
-	max.sec = min.sec = TIME_T_MAX;
-	max.frac = min.frac = UINT64_MAX;
-	/*
-	 * Look for the first bucket in the future that contains some event,
-	 * up to some point, so that we can look for aggregation.
-	 */
-	for (;;) {
-		sc = &cc->cc_callwheel[last];
-		TAILQ_FOREACH(tmp, sc, c_links.tqe) {
-			tmp_max = tmp_min = tmp->c_time;
-			bintime_add(&tmp_max, &tmp->c_precision);
-			bintime_sub(&tmp_min, &tmp->c_precision);
-			/*
-			 * This is the fist event we're going to process or
-			 * event maximal time is less than present minimal.
-			 * In both cases, take it.
-			 */
-			if (bintime_cmp(&tmp_max, &min, <)) {
-				max = tmp_max;
-				min = tmp_min;
-				continue;
-			}
-			/*
-			 * Event minimal time is bigger than present maximal
-			 * time, so it cannot be aggregated.
-			 */
-			if (bintime_cmp(&tmp_min, &max, >))
-				continue;
-			/*
-			 * If neither of the two previous happened, just take
-			 * the intersection of events.
-			 */
-			min = (bintime_cmp(&tmp_min, &min, >)) ? tmp_min : min;
-			max = (bintime_cmp(&tmp_max, &max, >)) ? tmp_max : max;
-		}
-		if (last == future || max.sec != TIME_T_MAX)
-			break;
-		last = (last + 1) & callwheelmask;
-	}
-	if (max.sec == TIME_T_MAX) {
-		next.sec = 0;
-		next.frac = (uint64_t)1 << (64 - 2);
-		bintime_add(&next, now);
-	} else {
-		/*
-		 * Now that we found something to aggregate, schedule an
-		 * interrupt in the middle of the previously calculated range.
-		 */
-		bintime_add(&max, &min);
-		next = max;
-		next.frac >>= 1;
-		if (next.sec & 1)
-			next.frac |= ((uint64_t)1 << 63);
-		next.sec >>= 1;
-	}
-	cc->cc_firstevent = next;
-	if (callout_new_inserted != NULL)
-		(*callout_new_inserted)(cpu, next);
-	cc->cc_lastscan = *now;
-	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	/*
-	 * swi_sched acquires the thread lock, so we don't want to call it
-	 * with cc_lock held; incorrect locking order.
-	 */
-	if (need_softclock) {
-		swi_sched(cc->cc_cookie, 0);
-	}
-}
-
-static struct callout_cpu *
-callout_lock(struct callout *c)
-{
-	struct callout_cpu *cc;
-	int cpu;
-
-	for (;;) {
-		cpu = c->c_cpu;
-#ifdef SMP
-		if (cpu == CPUBLOCK) {
-			while (c->c_cpu == CPUBLOCK)
-				cpu_spinwait();
-			continue;
-		}
+#ifdef CALLOUT_PROFILING
+	avg_depth += (depth * 1000 - avg_depth) >> 8;
+	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
+	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
+	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
 #endif
-		cc = CC_CPU(cpu);
-		CC_LOCK(cc);
-		if (cpu == c->c_cpu)
-			break;
-		CC_UNLOCK(cc);
-	}
-	return (cc);
 }
 
 static void
@@ -564,8 +433,10 @@ static void
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
-	if (cc->cc_next == c) 
+	if (cc->cc_next == c)
 		cc->cc_next = TAILQ_NEXT(c, c_staiter);
+	if (cc->cc_next_direct == c)
+		cc->cc_next_direct = TAILQ_NEXT(c, c_links.tqe);
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
 		c->c_func = NULL;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
@@ -574,13 +445,13 @@ callout_cc_del(struct callout *c, struct callout_c
 
 static struct callout *
 softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
-    int *lockcalls, int *gcalls)
+    int *lockcalls, int *gcalls, int direct)
 {
 	void (*c_func)(void *);
 	void *c_arg;
 	struct lock_class *class;
 	struct lock_object *c_lock;
-	int c_flags, sharedlock;
+	int c_direct, c_flags, sharedlock;
 #ifdef SMP
 	struct callout_cpu *new_cc;
 	void (*new_func)(void *);
@@ -594,8 +465,10 @@ softclock_call_cc(struct callout *c, struct callou
 	static uint64_t maxdt = 36893488147419102LL;	/* 2 msec */
 	static timeout_t *lastfunc;
 #endif
-
-	cc->cc_next = TAILQ_NEXT(c, c_staiter);
+	if (direct)
+		cc->cc_next_direct = TAILQ_NEXT(c, c_links.tqe);
+	else
+		cc->cc_next = TAILQ_NEXT(c, c_staiter);
 	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
 	sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
 	c_lock = c->c_lock;
@@ -617,11 +490,11 @@ softclock_call_cc(struct callout *c, struct callou
 		 */
 		if (cc->cc_cancel) {
 			class->lc_unlock(c_lock);
-			goto skip; 
+			goto skip;
 		}
 		/* The callout cannot be stopped now. */
 		cc->cc_cancel = 1;
-
+#ifdef CALLOUT_PROFILING
 		if (c_lock == &Giant.lock_object) {
 			(*gcalls)++;
 			CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
@@ -631,10 +504,13 @@ softclock_call_cc(struct callout *c, struct callou
 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
 			    c, c_func, c_arg);
 		}
+#endif
 	} else {
+#ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
 		CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p",
 		    c, c_func, c_arg);
+#endif
 	}
 #ifdef DIAGNOSTIC
 	binuptime(&bt1);
@@ -722,8 +598,9 @@ skip:
 	 * is not easy.
 	 */
 	new_cc = callout_cpu_switch(c, cc, new_cpu);
+	c_direct = c->c_flags & CALLOUT_DIRECT;
 	callout_cc_add(c, new_cc, new_time, new_func, new_arg,
-	    new_cpu, 0);
+	    new_cpu, c_direct);
 	CC_UNLOCK(new_cc);
 	CC_LOCK(cc);
 #else
@@ -733,9 +610,164 @@ skip:
 #ifdef SMP
 nextc:
 #endif
-	return (cc->cc_next);
+	if (direct)
+		return (cc->cc_next_direct);
+	else
+		return (cc->cc_next);
 }
 
+void
+callout_process(struct bintime *now)
+{
+	struct bintime max, min, next, tmp_max, tmp_min;
+	struct callout *tmp, *tmpn;
+	struct callout_cpu *cc;
+	struct callout_tailq *sc;
+	int cpu, depth, first, future, gcalls, last, lockcalls, mpcalls,
+	    need_softclock;
+
+	/*
+	 * Process callouts at a very low cpu priority, so we don't keep the
+	 * relatively high clock interrupt priority any longer than necessary.
+	 */
+	depth = mpcalls = lockcalls = gcalls = need_softclock = 0;
+	cc = CC_SELF();
+	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
+	cpu = curcpu;
+	first = callout_hash(&cc->cc_lastscan);
+	last = callout_hash(now);
+	/*
+	 * Check if we wrapped around the entire wheel since the last scan.
+	 * In that case, we need to scan the whole wheel for pending callouts.
+	 */
+	last = (last - first >= callwheelsize) ? (first - 1) & callwheelmask :
+	    last & callwheelmask;
+	first &= callwheelmask;
+	for (;;) {
+		sc = &cc->cc_callwheel[first];
+		tmp = TAILQ_FIRST(sc);
+		while (tmp != NULL) {
+			next = tmp->c_time;
+			bintime_sub(&next, &tmp->c_precision);
+			if (bintime_cmp(&next, now, <=)) {
+				/*
+				 * Consumer told us the callout may be run
+				 * directly from hardware interrupt context.
+				 */
+				if (tmp->c_flags & CALLOUT_DIRECT) {
+					++depth;
+					TAILQ_REMOVE(sc, tmp, c_links.tqe);
+					tmp = softclock_call_cc(tmp, cc,
+					    &mpcalls, &lockcalls,
+					    &gcalls, 1);
+				} else {
+					tmpn = TAILQ_NEXT(tmp, c_links.tqe);
+					TAILQ_INSERT_TAIL(&cc->cc_expireq,
+					    tmp, c_staiter);
+					TAILQ_REMOVE(sc, tmp, c_links.tqe);
+					tmp->c_flags |= CALLOUT_PROCESSED;
+					need_softclock = 1;
+					tmp = tmpn;
+				}
+			} else
+				tmp = TAILQ_NEXT(tmp, c_links.tqe);
+		}
+		if (first == last)
+			break;
+		first = (first + 1) & callwheelmask;
+	}
+	future = (last + hz / 4) & callwheelmask;
+	max.sec = min.sec = TIME_T_MAX;
+	max.frac = min.frac = UINT64_MAX;
+	/*
+	 * Look for the first bucket in the future that contains some event,
+	 * up to hz / 4 buckets ahead, so that we can look for aggregation.
+	 */
+	for (;;) {
+		sc = &cc->cc_callwheel[last];
+		TAILQ_FOREACH(tmp, sc, c_links.tqe) {
+			tmp_max = tmp_min = tmp->c_time;
+			bintime_add(&tmp_max, &tmp->c_precision);
+			bintime_sub(&tmp_min, &tmp->c_precision);
+			/*
+			 * This is the first event we're going to process or
+			 * the event's maximal time is less than the present
+			 * minimal time.  In both cases, take it.
+			 */
+			if (bintime_cmp(&tmp_max, &min, <)) {
+				max = tmp_max;
+				min = tmp_min;
+				continue;
+			}
+			/*
+			 * The event's minimal time is bigger than the present
+			 * maximal time, so it cannot be aggregated.
+			 */
+			if (bintime_cmp(&tmp_min, &max, >))
+				continue;
+			/*
+			 * If neither of the two previous cases happened, just
+			 * take the intersection of the events.
+			 */
+			min = (bintime_cmp(&tmp_min, &min, >)) ? tmp_min : min;
+			max = (bintime_cmp(&tmp_max, &max, >)) ? tmp_max : max;
+		}
+		if (last == future || max.sec != TIME_T_MAX)
+			break;
+		last = (last + 1) & callwheelmask;
+	}
+	if (max.sec == TIME_T_MAX) {
+		next.sec = 0;
+		next.frac = (uint64_t)1 << (64 - 2);
+		bintime_add(&next, now);
+	} else {
+		/*
+		 * Now that we found something to aggregate, schedule an
+		 * interrupt in the middle of the previously calculated range.
+		 */
+		bintime_add(&max, &min);
+		next = max;
+		next.frac >>= 1;
+		if (next.sec & 1)
+			next.frac |= ((uint64_t)1 << 63);
+		next.sec >>= 1;
+	}
+	cc->cc_firstevent = next;
+	if (callout_new_inserted != NULL)
+		(*callout_new_inserted)(cpu, next);
+	cc->cc_lastscan = *now;
+	callout_stats(depth, mpcalls, lockcalls, gcalls);
+	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+	/*
+	 * swi_sched acquires the thread lock, so we don't want to call it
+	 * with cc_lock held; incorrect locking order.
+	 */
+	if (need_softclock) {
+		swi_sched(cc->cc_cookie, 0);
+	}
+}
+
+static struct callout_cpu *
+callout_lock(struct callout *c)
+{
+	struct callout_cpu *cc;
+	int cpu;
+
+	for (;;) {
+		cpu = c->c_cpu;
+#ifdef SMP
+		if (cpu == CPUBLOCK) {
+			while (c->c_cpu == CPUBLOCK)
+				cpu_spinwait();
+			continue;
+		}
+#endif
+		cc = CC_CPU(cpu);
+		CC_LOCK(cc);
+		if (cpu == c->c_cpu)
+			break;
+		CC_UNLOCK(cc);
+	}
+	return (cc);
+}
+
 /*
  * The callout mechanism is based on the work of Adam M. Costello and
  * George Varghese, published in a technical report entitled "Redesigning
@@ -758,14 +790,12 @@ softclock(void *arg)
 	struct callout_cpu *cc;
 	struct callout *c;
 	int steps;	/* #steps since we last allowed interrupts */
+	int depth;
 	int mpcalls;
 	int lockcalls;
 	int gcalls;
 
-#ifndef MAX_SOFTCLOCK_STEPS
-#define MAX_SOFTCLOCK_STEPS	100 /* Maximum allowed value of steps. */
-#endif /* MAX_SOFTCLOCK_STEPS */
-
+	depth = 0;
 	mpcalls = 0;
 	lockcalls = 0;
 	gcalls = 0;
@@ -775,6 +805,7 @@ softclock(void *arg)
 
 	c = TAILQ_FIRST(&cc->cc_expireq);
 	while (c != NULL) {
+		++depth;
 		++steps;
 		if (steps >= MAX_SOFTCLOCK_STEPS) {
 			cc->cc_next = c;
@@ -787,15 +818,10 @@ softclock(void *arg)
 		} else {
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter);
 			c = softclock_call_cc(c, cc, &mpcalls,
-			    &lockcalls, &gcalls);
+			    &lockcalls, &gcalls, 0);
 		}
 	}
-#ifdef CALLOUT_PROFILING
-	avg_depth += (depth * 1000 - avg_depth) >> 8;
-	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
-	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
-	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
-#endif
+	callout_stats(depth, mpcalls, lockcalls, gcalls);
 	cc->cc_next = NULL;
 	CC_UNLOCK(cc);
 }
@@ -929,16 +955,22 @@ _callout_reset_on(struct callout *c, struct bintim
 		}
 	}
 	if (c->c_flags & CALLOUT_PENDING) {
 		if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
 			if (cc->cc_next == c)
 				cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
+			if (cc->cc_next_direct == c)
+				cc->cc_next_direct = TAILQ_NEXT(c,
+				    c_links.tqe);
 			bucket = get_bucket(&c->c_time);
 			TAILQ_REMOVE(&cc->cc_callwheel[bucket], c,
 			    c_links.tqe);
 		} else {
 			if (cc->cc_next == c)
 				cc->cc_next = TAILQ_NEXT(c, c_staiter);
			TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter);
 		}
 		cancelled = 1;
 		c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
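
The midpoint scheduling in callout_process() above -- bintime_add(&max, &min) followed by a one-bit right shift that carries the low bit of .sec into the top bit of .frac -- computes (min + max) / 2 on the sec.frac fixed-point bintime without losing the bit that overflows out of the fraction. The following stand-alone user-space sketch is not part of the patch: struct bintime and bintime_add() are pared-down copies of the kernel definitions, and bintime_mid() is a name invented here purely for illustration.

/*
 * Sketch of the (min + max) / 2 bintime computation used by callout_process()
 * to place the next hardware interrupt in the middle of the aggregated range.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct bintime {
	time_t   sec;		/* whole seconds */
	uint64_t frac;		/* fraction of a second, in units of 2^-64 s */
};

static void
bintime_add(struct bintime *bt, const struct bintime *bt2)
{
	uint64_t u;

	u = bt->frac;
	bt->frac += bt2->frac;
	if (u > bt->frac)	/* the fraction wrapped: carry into .sec */
		bt->sec++;
	bt->sec += bt2->sec;
}

/* Midpoint of [a, b], using the same shift-with-carry scheme as the patch. */
static struct bintime
bintime_mid(struct bintime a, struct bintime b)
{
	struct bintime next;

	bintime_add(&a, &b);	/* a = a + b, effectively 128-bit wide */
	next = a;
	next.frac >>= 1;
	if (next.sec & 1)	/* move the bit shifted out of .sec into .frac */
		next.frac |= (uint64_t)1 << 63;
	next.sec >>= 1;
	return (next);
}

int
main(void)
{
	/* 3.75 s and 6.25 s: the midpoint should be exactly 5.0 s. */
	struct bintime lo = { 3, (uint64_t)3 << 62 };	/* 3 + 0.75 */
	struct bintime hi = { 6, (uint64_t)1 << 62 };	/* 6 + 0.25 */
	struct bintime mid = bintime_mid(lo, hi);

	printf("mid = %lld + %.6f s\n", (long long)mid.sec,
	    (double)mid.frac / 18446744073709551616.0);	/* frac / 2^64 */
	return (0);
}

For the 3.75 s / 6.25 s pair this should print "mid = 5 + 0.000000 s"; the carry of the odd .sec bit into the top of .frac is what keeps the kernel computation exact when min.sec + max.sec is odd.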