diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 5d26093..27c3380 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -259,6 +259,8 @@ options SX_NOINLINE # SMP Debugging Options: # +# CALLOUT_PROFILING enables rudimentary profiling of the callwheel data +# structure used as backend in callout(9). # PREEMPTION allows the threads that are in the kernel to be preempted by # higher priority [interrupt] threads. It helps with interactivity # and allows interrupt threads to run sooner rather than waiting. @@ -297,6 +299,9 @@ options LOCK_PROFILING options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" +# Profiling for the callout(9) backend. +options CALLOUT_PROFILING + # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING diff --git a/sys/conf/options b/sys/conf/options index ab5d153..75d0c97 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -68,6 +68,7 @@ TEXTDUMP_VERBOSE opt_ddb.h ADAPTIVE_LOCKMGRS ALQ AUDIT opt_global.h +CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 9d62c58..c34970e 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -425,6 +425,7 @@ initclocks(dummy) void hardclock_cpu(int usermode) { + struct bintime now; struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; @@ -459,7 +460,8 @@ hardclock_cpu(int usermode) if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); + binuptime(&now); + callout_process(&now); } /* @@ -549,7 +551,6 @@ hardclock_cnt(int cnt, int usermode) if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); /* We are in charge to handle this tick duty. */ if (newticks > 0) { /* Dangerous and no need to call these things concurrently. */ diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c index 6fd40a8..d6ac052 100644 --- a/sys/kern/kern_clocksource.c +++ b/sys/kern/kern_clocksource.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -71,9 +72,8 @@ static int round_freq(struct eventtimer *et, int freq); static void getnextcpuevent(struct bintime *event, int idle); static void getnextevent(struct bintime *event); static int handleevents(struct bintime *now, int fake); -#ifdef SMP -static void cpu_new_callout(int cpu, int ticks); -#endif +static void cpu_new_callout(int cpu, struct bintime bt, + struct bintime bt_opt); static struct mtx et_hw_mtx; @@ -95,7 +95,6 @@ static struct mtx et_hw_mtx; static struct eventtimer *timer = NULL; static struct bintime timerperiod; /* Timer period for periodic mode. */ -static struct bintime hardperiod; /* hardclock() events period. */ static struct bintime statperiod; /* statclock() events period. */ static struct bintime profperiod; /* profclock() events period. */ static struct bintime nexttick; /* Next global timer tick time. */ @@ -135,6 +134,8 @@ struct pcpu_state { struct bintime nexthard; /* Next hardlock() event. */ struct bintime nextstat; /* Next statclock() event. */ struct bintime nextprof; /* Next profclock() event. */ + struct bintime nextcall; /* Next callout event. */ + struct bintime nextcallopt; /* Next optional callout event. */ #ifdef KDTRACE_HOOKS struct bintime nextcyc; /* Next OpenSolaris cyclics event. 
*/ #endif @@ -143,15 +144,7 @@ struct pcpu_state { }; static DPCPU_DEFINE(struct pcpu_state, timerstate); - -#define FREQ2BT(freq, bt) \ -{ \ - (bt)->sec = 0; \ - (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ -} -#define BT2FREQ(bt) \ - (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ - ((bt)->frac >> 1)) +DPCPU_DEFINE(struct bintime, hardclocktime); /* * Timer broadcast IPI handler. @@ -180,7 +173,7 @@ hardclockintr(void) static int handleevents(struct bintime *now, int fake) { - struct bintime t; + struct bintime t, *hct; struct trapframe *frame; struct pcpu_state *state; uintfptr_t pc; @@ -205,10 +198,13 @@ handleevents(struct bintime *now, int fake) runs = 0; while (bintime_cmp(now, &state->nexthard, >=)) { - bintime_addx(&state->nexthard, hardperiod.frac); + bintime_addx(&state->nexthard, tick_bt.frac); runs++; } if (runs) { + hct = DPCPU_PTR(hardclocktime); + *hct = state->nexthard; + bintime_sub(hct,&tick_bt); if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 && bintime_cmp(&state->nexthard, &nexthard, >)) nexthard = state->nexthard; @@ -238,6 +234,12 @@ handleevents(struct bintime *now, int fake) } } else state->nextprof = state->nextstat; + if (bintime_cmp(now, &state->nextcallopt, >=) && + (state->nextcallopt.sec != -1)) { + state->nextcall.sec = -1; + state->nextcallopt.sec = -1; + callout_process(now); + } #ifdef KDTRACE_HOOKS if (fake == 0 && cyclic_clock_func != NULL && @@ -271,22 +273,26 @@ getnextcpuevent(struct bintime *event, int idle) { struct bintime tmp; struct pcpu_state *state; - int skip; + int hardfreq; state = DPCPU_PTR(timerstate); - /* Handle hardclock() events. */ + /* Handle hardclock() events, skipping some if CPU is idle. */ *event = state->nexthard; if (idle || (!activetick && !profiling && (timer->et_flags & ET_FLAGS_PERCPU) == 0)) { - skip = idle ? 4 : (stathz / 2); - if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > skip) - skip = tc_min_ticktock_freq; - skip = callout_tickstofirst(hz / skip) - 1; - CTR2(KTR_SPARE2, "skip at %d: %d", curcpu, skip); - tmp = hardperiod; - bintime_mul(&tmp, skip); - bintime_add(event, &tmp); + hardfreq = 2; + if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > hardfreq) + hardfreq = tc_min_ticktock_freq; + if (hz > hardfreq) { + tmp = tick_bt; + bintime_mul(&tmp, hz / hardfreq - 1); + bintime_add(event, &tmp); + } } + /* Handle callout events. */ + if (state->nextcall.sec != -1 && + bintime_cmp(event, &state->nextcall, >)) + *event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ if (bintime_cmp(event, &state->nextstat, >)) *event = state->nextstat; @@ -629,10 +635,10 @@ cpu_initclocks_bsp(void) #ifdef KDTRACE_HOOKS state->nextcyc.sec = -1; #endif + state->nextcall.sec = -1; + state->nextcallopt.sec = -1; } -#ifdef SMP callout_new_inserted = cpu_new_callout; -#endif periodic = want_periodic; /* Grab requested timer or the best of present. 
*/ if (timername[0]) @@ -696,7 +702,7 @@ cpu_initclocks_bsp(void) profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; - FREQ2BT(hz, &hardperiod); + FREQ2BT(hz, &tick_bt); FREQ2BT(stathz, &statperiod); FREQ2BT(profhz, &profperiod); ET_LOCK(); @@ -856,52 +862,57 @@ clocksource_cyc_set(const struct bintime *t) } #endif -#ifdef SMP static void -cpu_new_callout(int cpu, int ticks) +cpu_new_callout(int cpu, struct bintime bt, struct bintime bt_opt) { - struct bintime tmp; + struct bintime now; struct pcpu_state *state; - CTR3(KTR_SPARE2, "new co at %d: on %d in %d", - curcpu, cpu, ticks); + CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x", + curcpu, cpu, (int)(bt_opt.sec), (u_int)(bt_opt.frac >> 32), + (int)(bt.sec), (u_int)(bt.frac >> 32)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); - if (state->idle == 0 || busy) { + + /* + * If there is callout time already set earlier -- do nothing. + * This check may appear redundant because we check already in + * callout_process() but this double check guarantees we're safe + * with respect to race conditions between interrupts execution + * and scheduling. + */ + state->nextcallopt = bt_opt; + if (state->nextcall.sec != -1 && + bintime_cmp(&bt, &state->nextcall, >=)) { ET_HW_UNLOCK(state); return; } - /* - * If timer is periodic - just update next event time for target CPU. - * If timer is global - there is chance it is already programmed. - */ - if (periodic || (timer->et_flags & ET_FLAGS_PERCPU) == 0) { - tmp = hardperiod; - bintime_mul(&tmp, ticks - 1); - bintime_add(&tmp, &state->nexthard); - if (bintime_cmp(&tmp, &state->nextevent, <)) - state->nextevent = tmp; - if (periodic || - bintime_cmp(&state->nextevent, &nexttick, >=)) { - ET_HW_UNLOCK(state); - return; - } + state->nextcall = bt; + /* If there is some other event set earlier -- do nothing. */ + if (bintime_cmp(&state->nextcall, &state->nextevent, >=)) { + ET_HW_UNLOCK(state); + return; } - /* - * Otherwise we have to wake that CPU up, as we can't get present - * bintime to reprogram global timer from here. If timer is per-CPU, - * we by definition can't do it from here. - */ - ET_HW_UNLOCK(state); - if (timer->et_flags & ET_FLAGS_PERCPU) { - state->handle = 1; - ipi_cpu(cpu, IPI_HARDCLOCK); - } else { - if (!cpu_idle_wakeup(cpu)) - ipi_cpu(cpu, IPI_AST); + state->nextevent = state->nextcall; + /* If timer is periodic -- there is nothing to reprogram. */ + if (periodic) { + ET_HW_UNLOCK(state); + return; } -} + /* If timer is global or of the current CPU -- reprogram it. */ + if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { + binuptime(&now); + loadtimer(&now, 0); + ET_HW_UNLOCK(state); + return; + } + /* Otherwise make other CPU to reprogram it. */ + state->handle = 1; + ET_HW_UNLOCK(state); +#ifdef SMP + ipi_cpu(cpu, IPI_HARDCLOCK); #endif +} /* * Report or change the active event timers hardware. 
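To make the bintime arithmetic these kern_clock.c/kern_clocksource.c hunks lean on easier to follow, here is a minimal user-space sketch of struct bintime and the FREQ2BT()/bintime_addx()/bintime_cmp() operations used above. The type and helpers mirror their sys/time.h counterparts; the do/while wrapper on FREQ2BT() and the demo in main() are additions of this note, not part of the patch.

```c
/*
 * Stand-alone sketch of the 64.64 fixed-point bintime arithmetic that the
 * hunks above rely on (hardperiod -> tick_bt, nexthard accumulation, and
 * the nextcall/nextevent comparisons).  Illustration only.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct bintime {
	time_t		sec;	/* whole seconds */
	uint64_t	frac;	/* fractional seconds, 1 unit = 2^-64 s */
};

/* 1/freq seconds as a bintime: frac ~= 2^64 / freq, computed as (2^63/freq)*2. */
#define	FREQ2BT(freq, bt) do {						\
	(bt)->sec = 0;							\
	(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1;	\
} while (0)

/* Add a fraction, carrying into the seconds field on overflow. */
static inline void
bintime_addx(struct bintime *bt, uint64_t x)
{
	uint64_t u = bt->frac;

	bt->frac += x;
	if (u > bt->frac)
		bt->sec++;
}

#define	bintime_cmp(a, b, cmp)						\
	(((a)->sec == (b)->sec) ?					\
	    ((a)->frac cmp (b)->frac) : ((a)->sec cmp (b)->sec))

int
main(void)
{
	struct bintime tick_bt, nexthard = { 0, 0 }, one = { 1, 0 };
	int hz = 1000, i;

	/* tick_bt is what handleevents() now advances nexthard by. */
	FREQ2BT(hz, &tick_bt);
	printf("tick_bt = %jd.%016jx (~%.6f s)\n", (intmax_t)tick_bt.sec,
	    (uintmax_t)tick_bt.frac,
	    (double)tick_bt.frac / 18446744073709551616.0);

	/* hz ticks accumulate to one second minus FREQ2BT()'s round-down. */
	for (i = 0; i < hz; i++)
		bintime_addx(&nexthard, tick_bt.frac);
	printf("after %d ticks: %jd.%016jx, which is %s one second\n", hz,
	    (intmax_t)nexthard.sec, (uintmax_t)nexthard.frac,
	    bintime_cmp(&nexthard, &one, >=) ? ">=" : "still <");
	return (0);
}
```

Because one fraction unit is 2^-64 s, the round-down in FREQ2BT() costs less than 2^-63 s per tick, so the shortfall the example prints after a full second of ticks stays far below a nanosecond.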
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index 6e1f486..834c2a7 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -22,6 +22,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #ifdef FFCLOCK #include #include @@ -119,6 +120,21 @@ static int timestepwarnings; SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, ×tepwarnings, 0, "Log time steps"); +struct bintime bt_timethreshold; +struct bintime bt_tickthreshold; +sbintime_t sbt_timethreshold; +sbintime_t sbt_tickthreshold; +struct bintime tc_tick_bt; +sbintime_t tc_tick_sbt; +int tc_precexp; +int tc_timepercentage = TC_DEFAULTPERC; +TUNABLE_INT("kern.timecounter.alloweddeviation", &tc_timepercentage); +static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, + sysctl_kern_timecounter_adjprecision, "I", + "Allowed time interval deviation in percents"); + static void tc_windup(void); static void cpu_tick_calibrate(int); @@ -1746,10 +1762,47 @@ tc_ticktock(int cnt) tc_windup(); } +static void __inline +tc_adjprecision(void) +{ + int t; + + if (tc_timepercentage > 0) { + t = (99 + tc_timepercentage) / tc_timepercentage; + tc_precexp = fls(t + (t >> 1)) - 1; + FREQ2BT(hz / tc_tick, &bt_timethreshold); + FREQ2BT(hz, &bt_tickthreshold); + bintime_shift(&bt_timethreshold, tc_precexp); + bintime_shift(&bt_tickthreshold, tc_precexp); + } else { + tc_precexp = 31; + bt_timethreshold.sec = INT_MAX; + bt_timethreshold.frac = ~(uint64_t)0; + bt_tickthreshold = bt_timethreshold; + } + sbt_timethreshold = bintime2sbintime(bt_timethreshold); + sbt_tickthreshold = bintime2sbintime(bt_tickthreshold); +} + +static int +sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = tc_timepercentage; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + tc_timepercentage = val; + tc_adjprecision(); + return (0); +} + static void inittimecounter(void *dummy) { u_int p; + int tick_rate; /* * Set the initial timeout to @@ -1763,6 +1816,12 @@ inittimecounter(void *dummy) tc_tick = (hz + 500) / 1000; else tc_tick = 1; + tc_adjprecision(); + FREQ2BT(hz, &tick_bt); + tick_sbt = bintime2sbintime(tick_bt); + tick_rate = hz / tc_tick; + FREQ2BT(tick_rate, &tc_tick_bt); + tc_tick_sbt = bintime2sbintime(tc_tick_bt); p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index 80933fa..bcbcfd1 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -37,13 +37,16 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_callout_profiling.h" #include "opt_kdtrace.h" +#if defined(__arm__) +#include "opt_timer.h" +#endif #include #include #include #include -#include #include #include #include @@ -55,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef SMP #include @@ -68,6 +72,7 @@ SDT_PROBE_DEFINE(callout_execute, kernel, , callout_end, callout-end); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0, "struct callout *"); +#ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. 
Units = 1/1000"); @@ -80,6 +85,19 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); +static int avg_depth_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, + "Average number of direct callouts examined per callout_process call. " + "Units = 1/1000"); +static int avg_lockcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, + &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " + "callout_process call. Units = 1/1000"); +static int avg_mpcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, + 0, "Average number of MP direct callouts made per callout_process call. " + "Units = 1/1000"); +#endif /* * TODO: * allocate more timeout table slots when table overflows. @@ -87,58 +105,62 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, int callwheelsize, callwheelmask; /* - * The callout cpu migration entity represents informations necessary for - * describing the migrating callout to the new callout cpu. + * The callout cpu exec entities represent informations necessary for + * describing the state of callouts currently running on the CPU and the ones + * necessary for migrating callouts to the new callout cpu. In particular, + * the first entry of the array cc_exec_entity holds informations for callout + * running in SWI thread context, while the second one holds informations + * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ -struct cc_mig_ent { +struct cc_exec { + struct callout *cc_next; + struct callout *cc_curr; #ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - int ce_migration_ticks; + void (*ce_migration_func)(void *); + void *ce_migration_arg; + int ce_migration_cpu; + sbintime_t ce_migration_time; #endif + int cc_cancel; + int cc_waiting; }; - + /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. - * In particular: - * cc_ticks is incremented once per tick in callout_cpu(). - * It tracks the global 'ticks' but in a way that the individual - * threads should not worry about races in the order in which - * hardclock() and hardclock_cpu() run on the various CPUs. - * cc_softclock is advanced in callout_cpu() to point to the - * first entry in cc_callwheel that may need handling. In turn, - * a softclock() is scheduled so it can serve the various entries i - * such that cc_softclock <= i <= cc_ticks . - * XXX maybe cc_softclock and cc_ticks should be volatile ? - * - * cc_ticks is also used in callout_reset_cpu() to determine - * when the callout should be served. 
*/ struct callout_cpu { struct mtx_padalign cc_lock; - struct cc_mig_ent cc_migrating_entity; + struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; struct callout_tailq *cc_callwheel; + struct callout_tailq cc_expireq; struct callout_list cc_callfree; - struct callout *cc_next; - struct callout *cc_curr; + sbintime_t cc_firstevent; + sbintime_t cc_lastscan; void *cc_cookie; - int cc_ticks; - int cc_softticks; - int cc_cancel; - int cc_waiting; - int cc_firsttick; }; +#define cc_exec_curr cc_exec_entity[0].cc_curr +#define cc_exec_next cc_exec_entity[0].cc_next +#define cc_exec_cancel cc_exec_entity[0].cc_cancel +#define cc_exec_waiting cc_exec_entity[0].cc_waiting +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr +#define cc_exec_next_dir cc_exec_entity[1].cc_next +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting + #ifdef SMP -#define cc_migration_func cc_migrating_entity.ce_migration_func -#define cc_migration_arg cc_migrating_entity.ce_migration_arg -#define cc_migration_cpu cc_migrating_entity.ce_migration_cpu -#define cc_migration_ticks cc_migrating_entity.ce_migration_ticks +#define cc_migration_func cc_exec_entity[0].ce_migration_func +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu +#define cc_migration_time cc_exec_entity[0].ce_migration_time +#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu +#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU @@ -153,40 +175,51 @@ struct callout_cpu cc_cpu; #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) +#define TIME_T_MAX \ + (sizeof(time_t) == (sizeof(int64_t)) ? INT64_MAX : INT32_MAX) + static int timeout_cpu; -void (*callout_new_inserted)(int cpu, int ticks) = NULL; +void (*callout_new_inserted)(int cpu, struct bintime bt, + struct bintime bt_opt) = NULL; +static void +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, + int *lockcalls, int *gcalls, int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is curr_callout. - * If curr_callout is non-NULL, threads waiting in + * cc_curr - If a callout is in progress, it is cc_curr. + * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and c_lock held + * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after - * c_lock is successfully acquired. + * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when - * curr_callout is non-NULL. + * cc_curr is non-NULL. */ /* * Resets the migration entity tied to a specific callout cpu. 
*/ static void -cc_cme_cleanup(struct callout_cpu *cc) +cc_cme_cleanup(struct callout_cpu *cc, int direct) { + cc->cc_exec_entity[direct].cc_curr = NULL; + cc->cc_exec_entity[direct].cc_next = NULL; + cc->cc_exec_entity[direct].cc_cancel = 0; + cc->cc_exec_entity[direct].cc_waiting = 0; #ifdef SMP - cc->cc_migration_cpu = CPUBLOCK; - cc->cc_migration_ticks = 0; - cc->cc_migration_func = NULL; - cc->cc_migration_arg = NULL; + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; + cc->cc_exec_entity[direct].ce_migration_time = 0; + cc->cc_exec_entity[direct].ce_migration_func = NULL; + cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } @@ -194,18 +227,18 @@ cc_cme_cleanup(struct callout_cpu *cc) * Checks if migration is requested by a specific callout cpu. */ static int -cc_cme_migrating(struct callout_cpu *cc) +cc_cme_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP - return (cc->cc_migration_cpu != CPUBLOCK); + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif } /* - * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization + * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization * * This code is called very early in the kernel initialization sequence, * and may be called more then once. @@ -242,7 +275,9 @@ callout_cpu_init(struct callout_cpu *cc) for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&cc->cc_callwheel[i]); } - cc_cme_cleanup(cc); + TAILQ_INIT(&cc->cc_expireq); + for (i = 0; i < 2; i++) + cc_cme_cleanup(cc, i); if (cc->cc_callout == NULL) return; for (i = 0; i < ncallout; i++) { @@ -330,28 +365,149 @@ start_softclock(void *dummy) SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); +#define CC_HASH_SHIFT 10 + +static inline int +callout_hash(sbintime_t sbt) +{ + + return (int)(sbt >> (32 - CC_HASH_SHIFT)); +} + +static inline int +callout_get_bucket(sbintime_t sbt) +{ + + return callout_hash(sbt) & callwheelmask; +} + void -callout_tick(void) +callout_process(struct bintime *now) { + struct callout *tmp, *tmpn; struct callout_cpu *cc; - int need_softclock; - int bucket; + struct callout_tailq *sc; + uint64_t lookahead; + sbintime_t first, last, max, now_sbt, tmp_max; + int depth_dir, exit_allowed, exit_wanted, firstb, lastb, lockcalls_dir, + max, mpcalls_dir, need_softclock, nowb; - /* - * Process callouts at a very low cpu priority, so we don't keep the - * relatively high clock interrupt priority any longer than necessary. - */ need_softclock = 0; + depth_dir = 0; + mpcalls_dir = 0; + lockcalls_dir = 0; cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - cc->cc_firsttick = cc->cc_ticks = ticks; - for (; (cc->cc_softticks - cc->cc_ticks) <= 0; cc->cc_softticks++) { - bucket = cc->cc_softticks & callwheelmask; - if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) { - need_softclock = 1; - break; + + /* Compute the buckets of the last scan and present times. */ + firstb = callout_hash(cc->cc_lastscan); + now_sbt = bintime2sbintime(*now); + cc->cc_lastscan = now_sbt; + nowb = callout_hash(now_sbt); + + /* Compute the last bucket and minimum time of the bucket after it. 
*/ + if (nowb == firstb) + lookahead = (SBT_1S / 16); + else if (nowb - firstb == 1) + lookahead = (SBT_1S / 8); + else + lookahead = (SBT_1S / 2); + first = last = now_sbt; + first += (lookahead / 2); + last += lookahead; + last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); + lastb = callout_hash(last) - 1; + max = last; + + /* + * Check if we wrapped around the entire wheel from the last scan. + * In case, we need to scan entirely the wheel for pending callouts. + */ + if (lastb - firstb >= callwheelsize) + lastb = firstb - 1; + if (nowb - firstb >= callwheelsize) + nowb = firstb - 1; + nowb &= callwheelmask; + lastb &= callwheelmask; + firstb &= callwheelmask; + + /* Iterate callwheel from firstb to nowb and then up to lastb. */ + exit_allowed = exit_wanted = 0; + for (;;) { + sc = &cc->cc_callwheel[firstb]; + tmp = TAILQ_FIRST(sc); + while (tmp != NULL) { + /* Run the callout if present time within allowed. */ + if (tmp->c_time <= now_sbt) { + /* + * Consumer told us the callout may be run + * directly from hardware interrupt context. + */ + if (tmp->c_flags & CALLOUT_DIRECT) { + ++depth_dir; + cc->cc_exec_next_dir = + TAILQ_NEXT(tmp, c_links.tqe); + TAILQ_REMOVE(sc, tmp, c_links.tqe); + softclock_call_cc(tmp, cc, + &mpcalls_dir, &lockcalls_dir, + NULL, 1); + tmp = cc->cc_exec_next_dir; + } else { + tmpn = TAILQ_NEXT(tmp, c_links.tqe); + TAILQ_REMOVE(sc, tmp, c_links.tqe); + TAILQ_INSERT_TAIL(&cc->cc_expireq, + tmp, c_links.tqe); + tmp->c_flags |= CALLOUT_PROCESSED; + need_softclock = 1; + tmp = tmpn; + } + continue; + } + /* Skip events from distant future. */ + if (tmp->c_time >= max) + goto next; + /* + * Event minimal time is bigger than present maximal + * time, so it cannot be aggregated. + */ + if (tmp->c_time > last) { + exit_wanted = 1; + goto next; + } + /* Update first and last time, respecting this event. */ + if (tmp->c_time < first) + first = tmp->c_time; + tmp_max = tmp->c_time; + tmp_max += tmp->c_precision; + if (tmp_max < last) + last = tmp_max; +next: + tmp = TAILQ_NEXT(tmp, c_links.tqe); } + /* Stop if we looked far enough into the future. */ + if (firstb == lastb) + break; + /* + * Stop if we looked after present time and found + * some event we can't execute at now. + */ + if (firstb == nowb) + exit_allowed = 1; + if (exit_allowed && exit_wanted) + break; + /* Proceed with the next bucket. 
*/ + firstb = (firstb + 1) & callwheelmask; } + cc->cc_exec_next_dir = NULL; + if (callout_new_inserted != NULL) + (*callout_new_inserted)(curcpu, sbintime2bintime(last), + sbintime2bintime(first)); + cc->cc_firstevent = last; +#ifdef CALLOUT_PROFILING + avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; + avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; + avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; +#endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it @@ -361,33 +517,6 @@ callout_tick(void) swi_sched(cc->cc_cookie, 0); } -int -callout_tickstofirst(int limit) -{ - struct callout_cpu *cc; - struct callout *c; - struct callout_tailq *sc; - int curticks; - int skip = 1; - - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - curticks = cc->cc_ticks; - while( skip < ncallout && skip < limit ) { - sc = &cc->cc_callwheel[ (curticks+skip) & callwheelmask ]; - /* search scanning ticks */ - TAILQ_FOREACH( c, sc, c_links.tqe ){ - if (c->c_time - curticks <= ncallout) - goto out; - } - skip++; - } -out: - cc->cc_firsttick = curticks + skip; - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); - return (skip); -} - static struct callout_cpu * callout_lock(struct callout *c) { @@ -413,25 +542,39 @@ callout_lock(struct callout *c) } static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, int to_ticks, - void (*func)(void *), void *arg, int cpu) +callout_cc_add(struct callout *c, struct callout_cpu *cc, + sbintime_t sbt, sbintime_t precision, void (*func)(void *), + void *arg, int cpu, int flags) { + sbintime_t last; + int bucket; CC_LOCK_ASSERT(cc); - - if (to_ticks <= 0) - to_ticks = 1; + if (sbt < cc->cc_lastscan) + sbt = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + if (flags & C_DIRECT_EXEC) + c->c_flags |= CALLOUT_DIRECT; + c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; - c->c_time = ticks + to_ticks; - TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask], - c, c_links.tqe); - if ((c->c_time - cc->cc_firsttick) < 0 && - callout_new_inserted != NULL) { - cc->cc_firsttick = c->c_time; - (*callout_new_inserted)(cpu, - to_ticks + (ticks - cc->cc_ticks)); + c->c_time = sbt; + c->c_precision = precision; + bucket = callout_get_bucket(c->c_time); + CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", + c, (int)(c->c_precision >> 32), + (u_int)(c->c_precision & 0xffffffff)); + TAILQ_INSERT_TAIL(&cc->cc_callwheel[bucket], c, c_links.tqe); + /* + * Inform the eventtimers(4) subsystem there's a new callout + * that has been inserted, but only if really required. 
+ */ + last = c->c_time + c->c_precision; + if (callout_new_inserted != NULL && ((last < cc->cc_firstevent) || + (cc->cc_firstevent == 0))) { + cc->cc_firstevent = last; + (*callout_new_inserted)(cpu, sbintime2bintime(last), + sbintime2bintime(c->c_time)); } } @@ -447,7 +590,7 @@ callout_cc_del(struct callout *c, struct callout_cpu *cc) static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, - int *lockcalls, int *gcalls) + int *lockcalls, int *gcalls, int direct) { void (*c_func)(void *); void *c_arg; @@ -458,7 +601,8 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; - int new_cpu, new_ticks; + sbintime_t new_time; + int flags, new_cpu; #endif #ifdef DIAGNOSTIC struct bintime bt1, bt2; @@ -480,8 +624,8 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; - cc->cc_curr = c; - cc->cc_cancel = 0; + cc->cc_exec_entity[direct].cc_curr = c; + cc->cc_exec_entity[direct].cc_cancel = 0; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, sharedlock); @@ -489,14 +633,18 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, * The callout may have been cancelled * while we switched locks. */ - if (cc->cc_cancel) { + if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ - cc->cc_cancel = 1; - - if (c_lock == &Giant.lock_object) { + cc->cc_exec_entity[direct].cc_cancel = 1; + /* + * In case we're processing a direct callout we + * can't hold giant because holding a sleep mutex + * from hardware interrupt context is not allowed. + */ + if ((c_lock == &Giant.lock_object) && gcalls != NULL) { (*gcalls)++; CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); @@ -513,11 +661,13 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, #ifdef DIAGNOSTIC binuptime(&bt1); #endif - THREAD_NO_SLEEPING(); + if (!direct) + THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0); c_func(c_arg); SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0); - THREAD_SLEEPING_OK(); + if (!direct) + THREAD_SLEEPING_OK(); #ifdef DIAGNOSTIC binuptime(&bt2); bintime_sub(&bt2, &bt1); @@ -537,17 +687,17 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, class->lc_unlock(c_lock); skip: CC_LOCK(cc); - KASSERT(cc->cc_curr == c, ("mishandled cc_curr")); - cc->cc_curr = NULL; - if (cc->cc_waiting) { + KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); + cc->cc_exec_entity[direct].cc_curr = NULL; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. 
*/ - if (cc_cme_migrating(cc)) { - cc_cme_cleanup(cc); + if (cc_cme_migrating(cc, direct)) { + cc_cme_cleanup(cc, direct); /* * It should be assert here that the callout is not @@ -555,11 +705,11 @@ skip: */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } - cc->cc_waiting = 0; + cc->cc_exec_entity[direct].cc_waiting = 0; CC_UNLOCK(cc); - wakeup(&cc->cc_waiting); + wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); - } else if (cc_cme_migrating(cc)) { + } else if (cc_cme_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP @@ -567,11 +717,11 @@ skip: * If the callout was scheduled for * migration just perform it now. */ - new_cpu = cc->cc_migration_cpu; - new_ticks = cc->cc_migration_ticks; - new_func = cc->cc_migration_func; - new_arg = cc->cc_migration_arg; - cc_cme_cleanup(cc); + new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; + new_time = cc->cc_exec_entity[direct].ce_migration_time; + new_func = cc->cc_exec_entity[direct].ce_migration_func; + new_arg = cc->cc_exec_entity[direct].ce_migration_arg; + cc_cme_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed @@ -589,8 +739,9 @@ skip: c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); - callout_cc_add(c, new_cc, new_ticks, new_func, new_arg, - new_cpu); + flags = (direct) ? C_DIRECT_EXEC : 0; + callout_cc_add(c, new_cc, new_time, c->c_precision, new_func, + new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else @@ -633,63 +784,25 @@ softclock(void *arg) { struct callout_cpu *cc; struct callout *c; - struct callout_tailq *bucket; - int curticks; - int steps; /* #steps since we last allowed interrupts */ - int depth; - int mpcalls; - int lockcalls; - int gcalls; - -#ifndef MAX_SOFTCLOCK_STEPS -#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ -#endif /* MAX_SOFTCLOCK_STEPS */ + int depth, gcalls, lockcalls, mpcalls; + depth = 0; mpcalls = 0; lockcalls = 0; gcalls = 0; - depth = 0; - steps = 0; cc = (struct callout_cpu *)arg; CC_LOCK(cc); - while (cc->cc_softticks - 1 != cc->cc_ticks) { - /* - * cc_softticks may be modified by hard clock, so cache - * it while we work on a given bucket. - */ - curticks = cc->cc_softticks; - cc->cc_softticks++; - bucket = &cc->cc_callwheel[curticks & callwheelmask]; - c = TAILQ_FIRST(bucket); - while (c != NULL) { - depth++; - if (c->c_time != curticks) { - c = TAILQ_NEXT(c, c_links.tqe); - ++steps; - if (steps >= MAX_SOFTCLOCK_STEPS) { - cc->cc_next = c; - /* Give interrupts a chance. */ - CC_UNLOCK(cc); - ; /* nothing */ - CC_LOCK(cc); - c = cc->cc_next; - steps = 0; - } - } else { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(bucket, c, c_links.tqe); - softclock_call_cc(c, cc, &mpcalls, - &lockcalls, &gcalls); - steps = 0; - c = cc->cc_next; - } - } + while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); + softclock_call_cc(c, cc, &mpcalls, &lockcalls, &gcalls, 0); + ++depth; } +#ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; - cc->cc_next = NULL; +#endif CC_UNLOCK(cc); } @@ -704,7 +817,7 @@ softclock(void *arg) * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. 
This - * implementation differs from that one in that although an + * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. @@ -762,6 +875,10 @@ callout_handle_init(struct callout_handle *handle) handle->callout = NULL; } +#ifndef NO_EVENTTIMERS +DPCPU_DECLARE(struct bintime, hardclocktime); +#endif + /* * New interface; clients allocate their own callout structures. * @@ -779,28 +896,66 @@ callout_handle_init(struct callout_handle *handle) * callout_deactivate() - marks the callout as having been serviced */ int -callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), - void *arg, int cpu) +callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, + void (*ftn)(void *), void *arg, int cpu, int flags) { + sbintime_t to_sbt, pr; struct callout_cpu *cc; - int cancelled = 0; + int bucket, cancelled, direct; + cancelled = 0; + if (flags & C_ABSOLUTE) { + to_sbt = sbt; + } else { + if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) + sbt = tick_sbt; + if ((flags & C_HARDCLOCK) || +#ifdef NO_EVENTTIMERS + sbt >= sbt_timethreshold) { + getsbinuptime(&to_sbt); + /* Add safety belt for the case of hz > 1000. */ + to_sbt += (tc_tick_sbt - tick_sbt); +#else + sbt >= sbt_tickthreshold) { + /* + * Obtain the time of the last hardclock() call on + * this CPU directly from the kern_clocksource.c. + * This value is per-CPU, but it is equal for all + * active ones. + */ + spinlock_enter(); + to_sbt = bintime2sbintime(DPCPU_GET(hardclocktime)); + spinlock_exit(); +#endif + if ((flags & C_HARDCLOCK) == 0) + to_sbt += tick_sbt; + } else + sbinuptime(&to_sbt); + to_sbt += sbt; + pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : + sbt >> C_PRELGET(flags)); + if (pr > precision) + precision = pr; + } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; + direct = c->c_flags & CALLOUT_DIRECT; cc = callout_lock(c); - if (cc->cc_curr == c) { + if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ - if (c->c_lock != NULL && !cc->cc_cancel) - cancelled = cc->cc_cancel = 1; - if (cc->cc_waiting) { + if (c->c_lock != NULL && + !cc->cc_exec_entity[direct].cc_cancel) + cancelled = + cc->cc_exec_entity[direct].cc_cancel = 1; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. @@ -813,12 +968,15 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } } if (c->c_flags & CALLOUT_PENDING) { - if (cc->cc_next == c) { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - } - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); - + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = TAILQ_NEXT(c, + c_links.tqe); + bucket = callout_get_bucket(c->c_time); + TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, + c_links.tqe); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } @@ -830,15 +988,17 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), * to a more appropriate moment. 
*/ if (c->c_cpu != cpu) { - if (cc->cc_curr == c) { - cc->cc_migration_cpu = cpu; - cc->cc_migration_ticks = to_ticks; - cc->cc_migration_func = ftn; - cc->cc_migration_arg = arg; + if (cc->cc_exec_entity[direct].cc_curr == c) { + cc->cc_exec_entity[direct].ce_migration_cpu = cpu; + cc->cc_exec_entity[direct].ce_migration_time + = to_sbt; + cc->cc_exec_entity[direct].ce_migration_func = ftn; + cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; - CTR5(KTR_CALLOUT, - "migration of %p func %p arg %p in %d to %u deferred", - c, c->c_func, c->c_arg, to_ticks, cpu); + CTR6(KTR_CALLOUT, + "migration of %p func %p arg %p in %d.%08x to %u deferred", + c, c->c_func, c->c_arg, (int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } @@ -846,9 +1006,10 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } #endif - callout_cc_add(c, cc, to_ticks, ftn, arg, cpu); - CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d", - cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks); + callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); + CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", + cancelled ? "re" : "", c, c->c_func, c->c_arg,(int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); @@ -876,7 +1037,7 @@ _callout_stop_safe(c, safe) { struct callout_cpu *cc, *old_cc; struct lock_class *class; - int use_lock, sq_locked; + int bucket, direct, sq_locked, use_lock; /* * Some old subsystems don't hold Giant while running a callout_stop(), @@ -892,7 +1053,7 @@ _callout_stop_safe(c, safe) } } else use_lock = 0; - + direct = c->c_flags & CALLOUT_DIRECT; sq_locked = 0; old_cc = NULL; again: @@ -906,7 +1067,7 @@ again: if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_waiting); + sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; @@ -927,12 +1088,13 @@ again: * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ - if (cc->cc_curr != c) { + if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release( + &cc->cc_exec_entity[direct].cc_waiting); return (0); } @@ -943,8 +1105,7 @@ again: * just wait for the current invocation to * finish. */ - while (cc->cc_curr == c) { - + while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid @@ -964,7 +1125,8 @@ again: */ if (!sq_locked) { CC_UNLOCK(cc); - sleepq_lock(&cc->cc_waiting); + sleepq_lock( + &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; @@ -976,13 +1138,16 @@ again: * will be packed up, just let softclock() * take care of it. */ - cc->cc_waiting = 1; + cc->cc_exec_entity[direct].cc_waiting = 1; DROP_GIANT(); CC_UNLOCK(cc); - sleepq_add(&cc->cc_waiting, + sleepq_add( + &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); - sleepq_wait(&cc->cc_waiting, 0); + sleepq_wait( + &cc->cc_exec_entity[direct].cc_waiting, + 0); sq_locked = 0; old_cc = NULL; @@ -990,7 +1155,8 @@ again: PICKUP_GIANT(); CC_LOCK(cc); } - } else if (use_lock && !cc->cc_cancel) { + } else if (use_lock && + !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. 
Cancel the callout @@ -998,10 +1164,10 @@ again: * lock, the callout will be skipped in * softclock(). */ - cc->cc_cancel = 1; + cc->cc_exec_entity[direct].cc_cancel = 1; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - KASSERT(!cc_cme_migrating(cc), + KASSERT(!cc_cme_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); @@ -1020,16 +1186,19 @@ again: return (0); } if (sq_locked) - sleepq_release(&cc->cc_waiting); - + sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if (cc->cc_next == c) - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe); + bucket = callout_get_bucket(c->c_time); + TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, + c_links.tqe); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); callout_cc_del(c, cc); CC_UNLOCK(cc); diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index f36c769..8b8897c 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -81,8 +81,10 @@ __FBSDID("$FreeBSD$"); static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS); -int hz; -int tick; +int hz; /* system clock's frequency */ +int tick; /* usec per tick (1000000 / hz) */ +struct bintime tick_bt; /* bintime per tick (1s / hz) */ +sbintime_t tick_sbt; int maxusers; /* base tunable */ int maxproc; /* maximum # of processes */ int maxprocperuid; /* max # of procs per user */ @@ -221,6 +223,8 @@ init_param1(void) if (hz == -1) hz = vm_guest > VM_GUEST_NO ? 
HZ_VM : HZ; tick = 1000000 / hz; + FREQ2BT(hz, &tick_bt); + tick_sbt = bintime2sbintime(tick_bt); #ifdef VM_SWZONE_SIZE_MAX maxswzone = VM_SWZONE_SIZE_MAX; diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 48444c1..e29b3b3 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -719,20 +719,24 @@ tcp_timer_active(struct tcpcb *tp, int timer_type) #define ticks_to_msecs(t) (1000*(t) / hz) void -tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) +tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, + struct xtcp_timer *xtimer) { - bzero(xtimer, sizeof(struct xtcp_timer)); + sbintime_t now; + + bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; + getsbinuptime(&now); if (callout_active(&timer->tt_delack)) - xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks); + xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) - xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks); + xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) - xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks); + xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) - xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks); + xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) - xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks); + xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } diff --git a/sys/ofed/include/linux/timer.h b/sys/ofed/include/linux/timer.h index ed4ed4a..21fe432 100644 --- a/sys/ofed/include/linux/timer.h +++ b/sys/ofed/include/linux/timer.h @@ -38,10 +38,9 @@ struct timer_list { struct callout timer_callout; void (*function)(unsigned long); unsigned long data; + int expires; }; -#define expires timer_callout.c_time - static inline void _timer_fn(void *context) { @@ -65,13 +64,16 @@ do { \ callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE); \ } while (0) -#define mod_timer(timer, expire) \ - callout_reset(&(timer)->timer_callout, (expire) - jiffies, \ - _timer_fn, (timer)) +#define mod_timer(timer, exp) \ +do { \ + (timer)->expires = exp; \ + callout_reset(&(timer)->timer_callout, (exp) - jiffies, \ + _timer_fn, (timer)); \ +} while (0) #define add_timer(timer) \ callout_reset(&(timer)->timer_callout, \ - (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer)) + (timer)->expires - jiffies, _timer_fn, (timer)) #define del_timer(timer) callout_stop(&(timer)->timer_callout) #define del_timer_sync(timer) callout_drain(&(timer)->timer_callout) diff --git a/sys/sys/_callout.h b/sys/sys/_callout.h index b8c3ce9..d4fee35 100644 --- a/sys/sys/_callout.h +++ b/sys/sys/_callout.h @@ -39,6 +39,7 @@ #define _SYS__CALLOUT_H #include +#include struct lock_object; @@ -50,7 +51,8 @@ struct callout { SLIST_ENTRY(callout) sle; TAILQ_ENTRY(callout) tqe; } c_links; - int c_time; /* ticks to the event */ + sbintime_t c_time; /* ticks to the event */ + sbintime_t c_precision; /* delta allowed wrt opt */ void *c_arg; /* function argument */ void (*c_func)(void *); /* function to call */ struct lock_object *c_lock; /* lock to handle */ diff --git a/sys/sys/callout.h b/sys/sys/callout.h index 95b9a32..835f881 100644 --- a/sys/sys/callout.h +++ b/sys/sys/callout.h @@ -47,6 +47,16 @@ #define CALLOUT_RETURNUNLOCKED 0x0010 
/* handler returns with mtx unlocked */ #define CALLOUT_SHAREDLOCK 0x0020 /* callout lock held in shared mode */ #define CALLOUT_DFRMIGRATION 0x0040 /* callout in deferred migration mode */ +#define CALLOUT_PROCESSED 0x0080 /* callout in wheel or processing list? */ +#define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */ + +#define C_DIRECT_EXEC 0x0001 /* direct execution of callout */ +#define C_PRELBITS 7 +#define C_PRELRANGE ((1 << C_PRELBITS) - 1) +#define C_PREL(x) (((x) + 1) << 1) +#define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1) +#define C_HARDCLOCK 0x0100 /* align to hardclock() calls */ +#define C_ABSOLUTE 0x0200 /* event time is absolute. */ struct callout_handle { struct callout *callout; @@ -67,7 +77,15 @@ void _callout_init_lock(struct callout *, struct lock_object *, int); _callout_init_lock((c), ((rw) != NULL) ? &(rw)->lock_object : \ NULL, (flags)) #define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) -int callout_reset_on(struct callout *, int, void (*)(void *), void *, int); +int callout_reset_sbt_on(struct callout *, sbintime_t, sbintime_t, + void (*)(void *), void *, int, int); +#define callout_reset_sbt(c, sbt, pr, fn, arg, flags) \ + callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), (c)->c_cpu, flags) +#define callout_reset_sbt_curcpu(c, sbt, pr, fn, arg, flags) \ + callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), PCPU_GET(cpuid), flags) +#define callout_reset_on(c, to_ticks, fn, arg, cpu) \ + callout_reset_sbt_on((c), (tick_sbt * (to_ticks)), 0, (fn), (arg), \ + (cpu), C_HARDCLOCK) #define callout_reset(c, on_tick, fn, arg) \ callout_reset_on((c), (on_tick), (fn), (arg), (c)->c_cpu) #define callout_reset_curcpu(c, on_tick, fn, arg) \ @@ -78,9 +96,9 @@ int callout_schedule_on(struct callout *, int, int); callout_schedule_on((c), (on_tick), PCPU_GET(cpuid)) #define callout_stop(c) _callout_stop_safe(c, 0) int _callout_stop_safe(struct callout *, int); -void callout_tick(void); -int callout_tickstofirst(int limit); -extern void (*callout_new_inserted)(int cpu, int ticks); +void callout_process(struct bintime *); +extern void (*callout_new_inserted)(int cpu, struct bintime bt, + struct bintime); #endif diff --git a/sys/sys/time.h b/sys/sys/time.h index 80878c0..b1f68de 100644 --- a/sys/sys/time.h +++ b/sys/sys/time.h @@ -102,6 +102,21 @@ bintime_mul(struct bintime *bt, u_int x) bt->frac = (p2 << 32) | (p1 & 0xffffffffull); } +static __inline void +bintime_shift(struct bintime *bt, int exp) +{ + + if (exp > 0) { + bt->sec <<= exp; + bt->sec |= bt->frac >> (64 - exp); + bt->frac <<= exp; + } else if (exp < 0) { + bt->frac >>= -exp; + bt->frac |= (uint64_t)bt->sec << (64 + exp); + bt->sec >>= -exp; + } +} + #define bintime_clear(a) ((a)->sec = (a)->frac = 0) #define bintime_isset(a) ((a)->sec || (a)->frac) #define bintime_cmp(a, b, cmp) \ @@ -109,6 +124,13 @@ bintime_mul(struct bintime *bt, u_int x) ((a)->frac cmp (b)->frac) : \ ((a)->sec cmp (b)->sec)) +#ifdef _KERNEL + +extern struct bintime tick_bt; +extern sbintime_t tick_sbt; + +#endif /* KERNEL */ + /*- * Background information: * @@ -290,7 +312,15 @@ void resettodr(void); extern volatile time_t time_second; extern volatile time_t time_uptime; extern struct bintime boottimebin; +extern struct bintime tc_tick_bt; +extern sbintime_t tc_tick_sbt; extern struct timeval boottime; +extern int tc_precexp; +extern int tc_timepercentage; +extern struct bintime bt_timethreshold; +extern struct bintime bt_tickthreshold; +extern sbintime_t sbt_timethreshold; +extern sbintime_t sbt_tickthreshold; 
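The C_* flags introduced above pack a relative precision straight into the flags argument: C_PREL(x) stores x + 1 in bits 1..7, C_PRELGET() recovers x, and a flags word with no C_PREL bits decodes to -1 so that callout_reset_sbt_on() (see the kern_timeout.c hunk) falls back to the global tc_precexp. The following is a hedged, stand-alone sketch of that encode/decode and of the precision selection; sbintime_t is modelled here as a 32.32 fixed-point int64_t, and the tc_precexp value of 4 is the one the kern_tc.c code derives from the default 5% tolerance.

```c
/*
 * Stand-alone check of the C_PREL()/C_PRELGET() encoding added above and of
 * the default-precision fallback performed by callout_reset_sbt_on().
 * Macro bodies are copied from the hunk above; everything else is
 * illustrative scaffolding.
 */
#include <stdint.h>
#include <stdio.h>

typedef int64_t sbintime_t;		/* 32.32 fixed-point seconds */
#define	SBT_1S	((sbintime_t)1 << 32)
#define	SBT_1MS	(SBT_1S / 1000)

#define	C_DIRECT_EXEC	0x0001
#define	C_PRELBITS	7
#define	C_PRELRANGE	((1 << C_PRELBITS) - 1)
#define	C_PREL(x)	(((x) + 1) << 1)
#define	C_PRELGET(x)	(int)((((x) >> 1) & C_PRELRANGE) - 1)
#define	C_HARDCLOCK	0x0100
#define	C_ABSOLUTE	0x0200

/* Same selection callout_reset_sbt_on() makes for relative timeouts. */
static sbintime_t
precision_for(sbintime_t sbt, int flags, int tc_precexp)
{

	return ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
	    sbt >> C_PRELGET(flags));
}

int
main(void)
{
	sbintime_t sbt = 50 * SBT_1MS;	/* a 50 ms timeout */
	int tc_precexp = 4;		/* derived from the 5% default */

	printf("C_PREL(2) decodes to %d\n", C_PRELGET(C_PREL(2)));
	printf("plain C_HARDCLOCK decodes to %d (use default)\n",
	    C_PRELGET(C_HARDCLOCK));
	printf("default slack on 50 ms:   %jd ms\n",
	    (intmax_t)(precision_for(sbt, 0, tc_precexp) / SBT_1MS));
	printf("C_PREL(2) slack on 50 ms: %jd ms\n",
	    (intmax_t)(precision_for(sbt, C_PREL(2), tc_precexp) / SBT_1MS));
	return (0);
}
```

With the compatibility macro above, an unchanged callout_reset(c, hz / 10, fn, arg) keeps working (it expands to tick_sbt * to_ticks with C_HARDCLOCK), while a converted consumer can request an explicit window, e.g. callout_reset_sbt(&sc_callout, 50 * SBT_1MS, 5 * SBT_1MS, fn, sc, 0) — sc_callout and sc being hypothetical driver state, not something from this patch.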
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
@@ -337,6 +367,23 @@ int	ratecheck(struct timeval *, const struct timeval *);
 void	timevaladd(struct timeval *t1, const struct timeval *t2);
 void	timevalsub(struct timeval *t1, const struct timeval *t2);
 int	tvtohz(struct timeval *tv);
+
+#define	TC_DEFAULTPERC		5
+
+#define	BT2FREQ(bt)							\
+	(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) /		\
+	    ((bt)->frac >> 1))
+
+#define	FREQ2BT(freq, bt)						\
+{									\
+	(bt)->sec = 0;							\
+	(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1;	\
+}
+
+#define	TIMESEL(sbt, sbt2)						\
+	(((sbt2) >= sbt_timethreshold) ?				\
+	    (getsbinuptime(sbt), 1) : (sbinuptime(sbt), 0))
+
 #else /* !_KERNEL */
 #include
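The TC_DEFAULTPERC, BT2FREQ() and FREQ2BT() definitions above feed tc_adjprecision() in the kern_tc.c hunk, which turns the kern.timecounter.alloweddeviation percentage into the power-of-two exponent tc_precexp used as the default callout precision. Here is a short sketch of just that arithmetic, with a local stand-in for the kernel's fls(9); the percentage table in main() is illustrative only.

```c
/*
 * Sketch of the tc_adjprecision() computation added in kern_tc.c:
 * map an allowed deviation in percent to the exponent tc_precexp, so the
 * default precision becomes (interval >> tc_precexp).
 */
#include <stdio.h>

/* Find-last-set, 1-based, mirroring fls(9); returns 0 for a zero argument. */
static int
fls(int mask)
{
	int bit;

	for (bit = 0; mask != 0; bit++)
		mask >>= 1;
	return (bit);
}

static int
precexp_for(int percentage)
{
	int t;

	if (percentage <= 0)
		return (31);	/* deviation disabled: effectively no slack */
	t = (99 + percentage) / percentage;	/* roughly ceil(100 / pct) */
	return (fls(t + (t >> 1)) - 1);
}

int
main(void)
{
	int percs[] = { 1, 2, 5, 10, 25 };
	int i, e;

	for (i = 0; i < 5; i++) {
		e = precexp_for(percs[i]);
		printf("%2d%% allowed -> tc_precexp %d -> interval/%d\n",
		    percs[i], e, 1 << e);
	}
	return (0);
}
```

With the default of 5 percent the computation lands on tc_precexp = 4, i.e. callouts that carry no explicit precision may by default be coalesced within a window of interval/16 (6.25%) of their requested time.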