diff -urN -x -p head-davide/sys/amd64/amd64/machdep.c calloutng/sys/amd64/amd64/machdep.c --- head-davide/sys/amd64/amd64/machdep.c 2012-12-07 07:27:31.000000000 +0100 +++ calloutng/sys/amd64/amd64/machdep.c 2012-11-12 12:27:29.000000000 +0100 @@ -658,7 +658,7 @@ halt(); } -void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */ +void (*cpu_idle_hook)(int) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ TUNABLE_INT("machdep.idle_mwait", &idle_mwait); @@ -670,7 +670,7 @@ #define STATE_SLEEPING 0x2 static void -cpu_idle_acpi(int busy) +cpu_idle_acpi(int us) { int *state; @@ -682,14 +682,14 @@ if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) - cpu_idle_hook(); + cpu_idle_hook(us); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } static void -cpu_idle_hlt(int busy) +cpu_idle_hlt(int us) { int *state; @@ -730,7 +730,7 @@ #define MWAIT_C4 0x30 static void -cpu_idle_mwait(int busy) +cpu_idle_mwait(int us) { int *state; @@ -753,7 +753,7 @@ } static void -cpu_idle_spin(int busy) +cpu_idle_spin(int us) { int *state; int i; @@ -808,6 +808,7 @@ cpu_idle(int busy) { uint64_t msr; + int us = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); @@ -825,7 +826,7 @@ /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); - cpu_idleclock(); + us = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ @@ -836,7 +837,7 @@ } /* Call main idle method. */ - cpu_idle_fn(busy); + cpu_idle_fn(us); /* Switch timers mack into active mode. */ if (!busy) { diff -urN -x -p head-davide/sys/conf/NOTES calloutng/sys/conf/NOTES --- head-davide/sys/conf/NOTES 2012-12-07 07:27:34.000000000 +0100 +++ calloutng/sys/conf/NOTES 2012-12-11 09:28:02.000000000 +0100 @@ -259,6 +259,8 @@ # SMP Debugging Options: # +# CALLOUT_PROFILING enables rudimentary profiling of the callwheel data +# structure used as backend in callout(9). # PREEMPTION allows the threads that are in the kernel to be preempted by # higher priority [interrupt] threads. It helps with interactivity # and allows interrupt threads to run sooner rather than waiting. @@ -297,6 +299,9 @@ options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" +# Profiling for the callout(9) backend. +options CALLOUT_PROFILING + # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING diff -urN -x -p head-davide/sys/conf/options calloutng/sys/conf/options --- head-davide/sys/conf/options 2012-12-07 07:27:34.000000000 +0100 +++ calloutng/sys/conf/options 2012-11-06 09:39:20.000000000 +0100 @@ -68,6 +68,7 @@ ADAPTIVE_LOCKMGRS ALQ AUDIT opt_global.h +CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h diff -urN -x -p head-davide/sys/dev/acpica/acpi_cpu.c calloutng/sys/dev/acpica/acpi_cpu.c --- head-davide/sys/dev/acpica/acpi_cpu.c 2012-12-07 07:27:11.000000000 +0100 +++ calloutng/sys/dev/acpica/acpi_cpu.c 2012-12-11 09:28:02.000000000 +0100 @@ -168,7 +168,7 @@ static void acpi_cpu_startup(void *arg); static void acpi_cpu_startup_cx(struct acpi_cpu_softc *sc); static void acpi_cpu_cx_list(struct acpi_cpu_softc *sc); -static void acpi_cpu_idle(void); +static void acpi_cpu_idle(int us); static void acpi_cpu_notify(ACPI_HANDLE h, UINT32 notify, void *context); static int acpi_cpu_quirks(void); static int acpi_cpu_usage_sysctl(SYSCTL_HANDLER_ARGS); @@ -954,7 +954,7 @@ * interrupts are re-enabled. 
*/ static void -acpi_cpu_idle() +acpi_cpu_idle(int us) { struct acpi_cpu_softc *sc; struct acpi_cx *cx_next; @@ -980,13 +980,14 @@ } /* Find the lowest state that has small enough latency. */ + us = min(us, sc->cpu_prev_sleep); cx_next_idx = 0; if (cpu_disable_deep_sleep) i = min(sc->cpu_cx_lowest, sc->cpu_non_c3); else i = sc->cpu_cx_lowest; for (; i >= 0; i--) { - if (sc->cpu_cx_states[i].trans_lat * 3 <= sc->cpu_prev_sleep) { + if (sc->cpu_cx_states[i].trans_lat * 3 <= us) { cx_next_idx = i; break; } diff -urN -x -p head-davide/sys/i386/i386/machdep.c calloutng/sys/i386/i386/machdep.c --- head-davide/sys/i386/i386/machdep.c 2012-12-07 07:27:24.000000000 +0100 +++ calloutng/sys/i386/i386/machdep.c 2012-11-12 12:27:30.000000000 +0100 @@ -1220,7 +1220,7 @@ int scheduler_running; static void -cpu_idle_hlt(int busy) +cpu_idle_hlt(int us) { scheduler_running = 1; @@ -1241,7 +1241,7 @@ #endif -void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */ +void (*cpu_idle_hook)(int) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ TUNABLE_INT("machdep.idle_mwait", &idle_mwait); @@ -1253,7 +1253,7 @@ #define STATE_SLEEPING 0x2 static void -cpu_idle_acpi(int busy) +cpu_idle_acpi(int us) { int *state; @@ -1265,7 +1265,7 @@ if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) - cpu_idle_hook(); + cpu_idle_hook(us); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; @@ -1273,7 +1273,7 @@ #ifndef XEN static void -cpu_idle_hlt(int busy) +cpu_idle_hlt(int us) { int *state; @@ -1315,7 +1315,7 @@ #define MWAIT_C4 0x30 static void -cpu_idle_mwait(int busy) +cpu_idle_mwait(int us) { int *state; @@ -1338,7 +1338,7 @@ } static void -cpu_idle_spin(int busy) +cpu_idle_spin(int us) { int *state; int i; @@ -1399,6 +1399,7 @@ #ifndef XEN uint64_t msr; #endif + int us = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); @@ -1418,7 +1419,7 @@ /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); - cpu_idleclock(); + us = cpu_idleclock(); } #ifndef XEN @@ -1431,7 +1432,7 @@ #endif /* Call main idle method. */ - cpu_idle_fn(busy); + cpu_idle_fn(us); /* Switch timers mack into active mode. */ if (!busy) { diff -urN -x -p head-davide/sys/ia64/ia64/machdep.c calloutng/sys/ia64/ia64/machdep.c --- head-davide/sys/ia64/ia64/machdep.c 2012-08-03 20:48:53.000000000 +0200 +++ calloutng/sys/ia64/ia64/machdep.c 2012-11-12 12:27:30.000000000 +0100 @@ -155,7 +155,7 @@ struct msgbuf *msgbufp = NULL; /* Other subsystems (e.g., ACPI) can hook this later. */ -void (*cpu_idle_hook)(void) = NULL; +void (*cpu_idle_hook)(int) = NULL; struct kva_md_info kmi; @@ -392,10 +392,11 @@ cpu_idle(int busy) { register_t ie; + int us = -1; if (!busy) { critical_enter(); - cpu_idleclock(); + us = cpu_idleclock(); } ie = intr_disable(); @@ -404,7 +405,7 @@ if (sched_runnable()) ia64_enable_intr(); else if (cpu_idle_hook != NULL) { - (*cpu_idle_hook)(); + (*cpu_idle_hook)(us); /* The hook must enable interrupts! 
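[Editor's note: a minimal sketch, not part of the patch, of the C-state selection rule used by acpi_cpu_idle() above: a state is eligible only while three times its transition latency fits in the expected idle period (the hint passed down from cpu_idle(), capped by the measured previous sleep). The helper name and array are hypothetical.]

/* Hypothetical helper, not from the patch. */
static int
pick_cx_state(const int *trans_lat, int deepest, int expected_us)
{
	int i;

	/* Deepest allowed state whose wakeup cost fits the idle period. */
	for (i = deepest; i > 0; i--)
		if (trans_lat[i] * 3 <= expected_us)
			break;
	return (i);
}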
*/ } else { ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0); diff -urN -x -p head-davide/sys/kern/kern_clock.c calloutng/sys/kern/kern_clock.c --- head-davide/sys/kern/kern_clock.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_clock.c 2012-08-08 00:53:23.000000000 +0200 @@ -425,6 +425,7 @@ void hardclock_cpu(int usermode) { + struct bintime now; struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; @@ -459,7 +460,8 @@ if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); + binuptime(&now); + callout_process(&now); } /* @@ -549,7 +551,6 @@ if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); /* We are in charge to handle this tick duty. */ if (newticks > 0) { /* Dangerous and no need to call these things concurrently. */ diff -urN -x -p head-davide/sys/kern/kern_clocksource.c calloutng/sys/kern/kern_clocksource.c --- head-davide/sys/kern/kern_clocksource.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/kern_clocksource.c 2012-11-12 13:09:40.000000000 +0100 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include @@ -71,9 +73,8 @@ static void getnextcpuevent(struct bintime *event, int idle); static void getnextevent(struct bintime *event); static int handleevents(struct bintime *now, int fake); -#ifdef SMP -static void cpu_new_callout(int cpu, int ticks); -#endif +static void cpu_new_callout(int cpu, struct bintime bt, + struct bintime bt_opt); static struct mtx et_hw_mtx; @@ -135,6 +136,8 @@ struct bintime nexthard; /* Next hardlock() event. */ struct bintime nextstat; /* Next statclock() event. */ struct bintime nextprof; /* Next profclock() event. */ + struct bintime nextcall; /* Next callout event. */ + struct bintime nextcallopt; /* Next optional callout event. */ #ifdef KDTRACE_HOOKS struct bintime nextcyc; /* Next OpenSolaris cyclics event. */ #endif @@ -144,15 +147,6 @@ static DPCPU_DEFINE(struct pcpu_state, timerstate); -#define FREQ2BT(freq, bt) \ -{ \ - (bt)->sec = 0; \ - (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ -} -#define BT2FREQ(bt) \ - (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ - ((bt)->frac >> 1)) - /* * Timer broadcast IPI handler. */ @@ -238,6 +232,12 @@ } } else state->nextprof = state->nextstat; + if (bintime_cmp(now, &state->nextcallopt, >=) && + (state->nextcallopt.sec != -1)) { + state->nextcall.sec = -1; + state->nextcallopt.sec = -1; + callout_process(now); + } #ifdef KDTRACE_HOOKS if (fake == 0 && cyclic_clock_func != NULL && @@ -269,24 +269,28 @@ static void getnextcpuevent(struct bintime *event, int idle) { - struct bintime tmp; struct pcpu_state *state; - int skip; - + struct bintime tmp; + int hardfreq; + state = DPCPU_PTR(timerstate); - /* Handle hardclock() events. */ + /* Handle hardclock() events, skipping some is CPU is idle. */ *event = state->nexthard; if (idle || (!activetick && !profiling && (timer->et_flags & ET_FLAGS_PERCPU) == 0)) { - skip = idle ? 4 : (stathz / 2); - if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > skip) - skip = tc_min_ticktock_freq; - skip = callout_tickstofirst(hz / skip) - 1; - CTR2(KTR_SPARE2, "skip at %d: %d", curcpu, skip); - tmp = hardperiod; - bintime_mul(&tmp, skip); - bintime_add(event, &tmp); + hardfreq = idle ? 
2 : (stathz / 2); + if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > hardfreq) + hardfreq = tc_min_ticktock_freq; + if (hz > hardfreq) { + tmp = hardperiod; + bintime_mul(&tmp, hz / hardfreq - 1); + bintime_add(event, &tmp); + } } + /* Handle callout events. */ + if (state->nextcall.sec != -1 && + bintime_cmp(event, &state->nextcall, >)) + *event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ if (bintime_cmp(event, &state->nextstat, >)) *event = state->nextstat; @@ -627,10 +631,10 @@ #ifdef KDTRACE_HOOKS state->nextcyc.sec = -1; #endif + state->nextcall.sec = -1; + state->nextcallopt.sec = -1; } -#ifdef SMP callout_new_inserted = cpu_new_callout; -#endif periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) @@ -759,7 +763,7 @@ /* * Switch to idle mode (all ticks handled). */ -void +int cpu_idleclock(void) { struct bintime now, t; @@ -771,7 +775,7 @@ || curcpu == CPU_FIRST() #endif ) - return; + return (-1); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; @@ -787,6 +791,9 @@ if (!periodic) loadtimer(&now, 0); ET_HW_UNLOCK(state); + bintime_sub(&t, &now); + return (t.sec > (INT_MAX >> 20) ? INT_MAX : + ((t.sec < 0) ? 0 : ((t.sec << 20) + (t.frac >> 44)))); } /* @@ -854,52 +861,57 @@ } #endif -#ifdef SMP static void -cpu_new_callout(int cpu, int ticks) +cpu_new_callout(int cpu, struct bintime bt, struct bintime bt_opt) { - struct bintime tmp; + struct bintime now; struct pcpu_state *state; - CTR3(KTR_SPARE2, "new co at %d: on %d in %d", - curcpu, cpu, ticks); + CTR5(KTR_SPARE2, "new co at %d: on %d at %d.%08x%08x", + curcpu, cpu, (int)(bt.sec), (u_int)(bt.frac >> 32), + (u_int)(bt.frac & 0xffffffff)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); - if (state->idle == 0 || busy) { + + /* + * If there is callout time already set earlier -- do nothing. + * This check may appear redundant because we check already in + * callout_process() but this double check guarantees we're safe + * with respect to race conditions between interrupts execution + * and scheduling. + */ + state->nextcallopt = bt_opt; + if (state->nextcall.sec != -1 && + bintime_cmp(&bt, &state->nextcall, >=)) { ET_HW_UNLOCK(state); return; } - /* - * If timer is periodic - just update next event time for target CPU. - * If timer is global - there is chance it is already programmed. - */ - if (periodic || (timer->et_flags & ET_FLAGS_PERCPU) == 0) { - tmp = hardperiod; - bintime_mul(&tmp, ticks - 1); - bintime_add(&tmp, &state->nexthard); - if (bintime_cmp(&tmp, &state->nextevent, <)) - state->nextevent = tmp; - if (periodic || - bintime_cmp(&state->nextevent, &nexttick, >=)) { - ET_HW_UNLOCK(state); - return; - } + state->nextcall = bt; + /* If there is some some other event set earlier -- do nothing. */ + if (bintime_cmp(&state->nextcall, &state->nextevent, >=)) { + ET_HW_UNLOCK(state); + return; } - /* - * Otherwise we have to wake that CPU up, as we can't get present - * bintime to reprogram global timer from here. If timer is per-CPU, - * we by definition can't do it from here. - */ - ET_HW_UNLOCK(state); - if (timer->et_flags & ET_FLAGS_PERCPU) { - state->handle = 1; - ipi_cpu(cpu, IPI_HARDCLOCK); - } else { - if (!cpu_idle_wakeup(cpu)) - ipi_cpu(cpu, IPI_AST); + state->nextevent = state->nextcall; + /* If timer is periodic -- there is nothing to reprogram. */ + if (periodic) { + ET_HW_UNLOCK(state); + return; } -} + /* If timer is global or of the current CPU -- reprogram it. 
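[Editor's note: a minimal sketch, not part of the patch, mirroring the value that cpu_idleclock() above now returns: the time until the next event encoded in 2^-20 second units (about 0.95 us each), so the idle handlers can treat it as a rough microsecond count, with -1 still meaning "no estimate". The helper name is hypothetical.]

static int
bintime_to_idle_hint(const struct bintime *bt)
{

	if (bt->sec < 0)
		return (0);		/* event already due */
	if (bt->sec > (INT_MAX >> 20))
		return (INT_MAX);	/* clamp very long sleeps */
	return ((int)((bt->sec << 20) + (bt->frac >> 44)));
}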
*/ + if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { + binuptime(&now); + loadtimer(&now, 0); + ET_HW_UNLOCK(state); + return; + } + /* Otherwise make other CPU to reprogram it. */ + state->handle = 1; + ET_HW_UNLOCK(state); +#ifdef SMP + ipi_cpu(cpu, IPI_HARDCLOCK); #endif +} /* * Report or change the active event timers hardware. diff -urN -x -p head-davide/sys/kern/kern_condvar.c calloutng/sys/kern/kern_condvar.c --- head-davide/sys/kern/kern_condvar.c 2012-09-30 17:50:32.000000000 +0200 +++ calloutng/sys/kern/kern_condvar.c 2012-12-11 09:30:13.000000000 +0100 @@ -270,12 +270,12 @@ } /* - * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the - * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout - * expires. + * Wait on a condition variable. Returns 0 if the process was resumed by + * cv_signal or cv_broadcast, EWOULDBLOCK if the timeout expires. */ int -_cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo) +_cv_timedwait(struct cv *cvp, struct lock_object *lock, struct bintime *bt, + struct bintime *precision, int timo, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; @@ -311,7 +311,10 @@ DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); - sleepq_set_timeout(cvp, timo); + if (bt == NULL) + sleepq_set_timeout_flags(cvp, timo, flags); + else + sleepq_set_timeout_bt(cvp, bt, precision); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); @@ -336,13 +339,14 @@ } /* - * Wait on a condition variable for at most timo/hz seconds, allowing - * interruption by signals. Returns 0 if the thread was resumed by cv_signal - * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if - * a signal was caught. + * Wait on a condition variable allowing interruption by signals. + * Returns 0 if the thread was resumed by cv_signal or cv_broadcast, + * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR + * or ERESTART if a signal was caught. */ int -_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo) +_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, + struct bintime *bt, struct bintime *precision, int timo, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; @@ -379,7 +383,10 @@ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); - sleepq_set_timeout(cvp, timo); + if (bt == NULL) + sleepq_set_timeout_flags(cvp, timo, flags); + else + sleepq_set_timeout_bt(cvp, bt, precision); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); diff -urN -x -p head-davide/sys/kern/kern_event.c calloutng/sys/kern/kern_event.c --- head-davide/sys/kern/kern_event.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_event.c 2012-12-11 09:30:13.000000000 +0100 @@ -517,25 +517,26 @@ * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. 
*/ -static int -timertoticks(intptr_t data) +static struct bintime +timer2bintime(intptr_t data) { - struct timeval tv; - int tticks; - - tv.tv_sec = data / 1000; - tv.tv_usec = (data % 1000) * 1000; - tticks = tvtohz(&tv); + struct bintime bt, pbt; - return tticks; + getbinuptime(&pbt); + bt.sec = data / 1000; + bt.frac = (data % 1000) * (uint64_t)1844674407309000LL; + bintime_add(&bt, &pbt); + return bt; } static void filt_timerexpire(void *knx) { - struct knote *kn = knx; + struct bintime bt; struct callout *calloutp; + struct knote *kn; + kn = knx; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ @@ -547,9 +548,10 @@ * when we're delayed. */ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) { + bt = timer2bintime(kn->kn_sdata); calloutp = (struct callout *)kn->kn_hook; - callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1, - filt_timerexpire, kn); + callout_reset_bt_on(calloutp, &bt, NULL, filt_timerexpire, kn, + PCPU_GET(cpuid), 0); } } @@ -559,6 +561,7 @@ static int filt_timerattach(struct knote *kn) { + struct bintime bt; struct callout *calloutp; atomic_add_int(&kq_ncallouts, 1); @@ -573,8 +576,9 @@ calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, CALLOUT_MPSAFE); kn->kn_hook = calloutp; - callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata), - filt_timerexpire, kn); + bt = timer2bintime(kn->kn_sdata); + callout_reset_bt_on(calloutp, &bt, NULL, filt_timerexpire, kn, + PCPU_GET(cpuid), 0); return (0); } diff -urN -x -p head-davide/sys/kern/kern_synch.c calloutng/sys/kern/kern_synch.c --- head-davide/sys/kern/kern_synch.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_synch.c 2012-12-11 09:30:13.000000000 +0100 @@ -146,12 +146,13 @@ */ int _sleep(void *ident, struct lock_object *lock, int priority, - const char *wmesg, int timo) + const char *wmesg, int timo, struct bintime *bt, + struct bintime *precision, int flags) { struct thread *td; struct proc *p; struct lock_class *class; - int catch, flags, lock_state, pri, rval; + int catch, sleepq_flags, lock_state, pri, rval; WITNESS_SAVE_DECL(lock_witness); td = curthread; @@ -162,7 +163,7 @@ #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); - KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL, + KASSERT(timo != 0 || bt != NULL || mtx_owned(&Giant) || lock != NULL, ("sleeping without a lock")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); @@ -199,13 +200,13 @@ sleepq_remove(td, td->td_wchan); if (ident == &pause_wchan) - flags = SLEEPQ_PAUSE; + sleepq_flags = SLEEPQ_PAUSE; else - flags = SLEEPQ_SLEEP; + sleepq_flags = SLEEPQ_SLEEP; if (catch) - flags |= SLEEPQ_INTERRUPTIBLE; + sleepq_flags |= SLEEPQ_INTERRUPTIBLE; if (priority & PBDRY) - flags |= SLEEPQ_STOP_ON_BDRY; + sleepq_flags |= SLEEPQ_STOP_ON_BDRY; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", @@ -231,18 +232,20 @@ * stopped, then td will no longer be on a sleep queue upon * return from cursig(). 
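[Editor's note: a minimal sketch, not part of the patch, of the conversion pattern that timer2bintime() above and the later select()/poll() changes rely on: turn a relative timeout into an absolute uptime deadline by adding it to getbinuptime(). One millisecond is 2^64/1000 in bintime fraction units, computed here as (1 << 63) / 500 to avoid 64-bit overflow; the function name is hypothetical.]

static void
ms_to_deadline(int ms, struct bintime *bt)
{
	struct bintime now;

	bt->sec = ms / 1000;
	/* (1 << 63) / 500 == 2^64 / 1000: one millisecond of fraction. */
	bt->frac = (uint64_t)(ms % 1000) * (((uint64_t)1 << 63) / 500);
	getbinuptime(&now);
	bintime_add(bt, &now);
}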
*/ - sleepq_add(ident, lock, wmesg, flags, 0); - if (timo) - sleepq_set_timeout(ident, timo); + sleepq_add(ident, lock, wmesg, sleepq_flags, 0); + if (bt) + sleepq_set_timeout_bt(ident, bt, precision); + else if (timo) + sleepq_set_timeout_flags(ident, timo, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } - if (timo && catch) + if ((timo != 0 || bt != NULL) && catch) rval = sleepq_timedwait_sig(ident, pri); - else if (timo) + else if (timo != 0 || bt != NULL) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); diff -urN -x -p head-davide/sys/kern/kern_tc.c calloutng/sys/kern/kern_tc.c --- head-davide/sys/kern/kern_tc.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_tc.c 2012-12-13 11:57:45.000000000 +0100 @@ -119,6 +119,19 @@ SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, ×tepwarnings, 0, "Log time steps"); +struct bintime bt_timethreshold; +struct bintime halftick_bt; +struct bintime tick_bt; +int tc_timeexp; +int tc_timepercentage = TC_DEFAULTPERC; +TUNABLE_INT("kern.timecounter.allowdeviation", &tc_timepercentage); +int tc_timethreshold; +static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_kern_timecounter, OID_AUTO, tc_timepercentage, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, + sysctl_kern_timecounter_adjprecision, "I", + "Allowed deviation from absolute value"); + static void tc_windup(void); static void cpu_tick_calibrate(int); @@ -275,7 +288,7 @@ do { th = timehands; gen = th->th_generation; - bintime2timeval(&th->th_offset, tvp); + Bintime2timeval(&th->th_offset, tvp); } while (gen == 0 || gen != th->th_generation); } @@ -1705,10 +1718,39 @@ tc_windup(); } +static void __inline +tc_adjprecision(void) +{ + struct timespec ts; + int tick_rate; + + tick_rate = hz / tc_tick; + tc_timethreshold = (1000000000 / (tick_rate * tc_timepercentage)) * 100; + tc_timeexp = fls(roundup2(100 / tc_timepercentage, 2)); + ts.tv_sec = tc_timethreshold / 1000000000; + ts.tv_nsec = tc_timethreshold % 1000000000; + timespec2bintime(&ts, &bt_timethreshold); +} + +static int +sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = tc_timepercentage; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + tc_timepercentage = val; + tc_adjprecision(); + return (0); +} + static void inittimecounter(void *dummy) { u_int p; + int tick_rate; /* * Set the initial timeout to @@ -1722,6 +1764,11 @@ tc_tick = (hz + 500) / 1000; else tc_tick = 1; + tc_adjprecision(); + tick_rate = hz / tc_tick; + FREQ2BT(tick_rate, &tick_bt); + halftick_bt = tick_bt; + bintime_divpow2(&halftick_bt, 1); p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); diff -urN -x -p head-davide/sys/kern/kern_time.c calloutng/sys/kern/kern_time.c --- head-davide/sys/kern/kern_time.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/kern_time.c 2012-12-13 13:57:07.000000000 +0100 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -481,38 +482,39 @@ int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt) { - struct timespec ts, ts2, ts3; - struct timeval tv; + struct timespec ts; + struct bintime bt, btt, bt_prec, tmp; int error; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) return (EINVAL); if 
(rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0)) return (0); - getnanouptime(&ts); - timespecadd(&ts, rqt); - TIMESPEC_TO_TIMEVAL(&tv, rqt); - for (;;) { - error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", - tvtohz(&tv)); - getnanouptime(&ts2); - if (error != EWOULDBLOCK) { - if (error == ERESTART) - error = EINTR; - if (rmt != NULL) { - timespecsub(&ts, &ts2); - if (ts.tv_sec < 0) - timespecclear(&ts); - *rmt = ts; - } - return (error); + timespec2bintime(rqt, &tmp); + bt_prec = tmp; + bintime_divpow2(&bt_prec, tc_timeexp); + if (TIMESEL(&bt, &tmp)) + bintime_add(&bt, &tick_bt); + bintime_add(&bt, &tmp); + bintime_add(&bt, &bt_prec); + error = tsleep_bt(&nanowait, PWAIT | PCATCH, "nanslp", &bt, &bt_prec); + TIMESEL(&btt, &tmp); + if (error != EWOULDBLOCK) { + if (error == ERESTART) + error = EINTR; + if (rmt != NULL) { + tmp = bt; + bintime_sub(&tmp, &btt); + bintime2timespec(&tmp, &ts); + if (ts.tv_sec < 0) + timespecclear(&ts); + *rmt = ts; } - if (timespeccmp(&ts2, &ts, >=)) + if (bintime_cmp(&btt, &bt, >=)) return (0); - ts3 = ts; - timespecsub(&ts3, &ts2); - TIMESPEC_TO_TIMEVAL(&tv, &ts3); + return (error); } + return (0); } #ifndef _SYS_SYSPROTO_H_ diff -urN -x -p head-davide/sys/kern/kern_timeout.c calloutng/sys/kern/kern_timeout.c --- head-davide/sys/kern/kern_timeout.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/kern_timeout.c 2012-12-13 13:57:07.000000000 +0100 @@ -37,13 +37,13 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_callout_profiling.h" #include "opt_kdtrace.h" #include #include #include #include -#include #include #include #include @@ -55,6 +55,7 @@ #include #include #include +#include #ifdef SMP #include @@ -68,6 +69,7 @@ SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0, "struct callout *"); +#ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); @@ -80,6 +82,19 @@ static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); +static int avg_depth_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, + "Average number of direct callouts examined per callout_process call. " + "Units = 1/1000"); +static int avg_lockcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, + &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " + "callout_process call. Units = 1/1000"); +static int avg_mpcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, + 0, "Average number of MP direct callouts made per callout_process call. " + "Units = 1/1000"); +#endif /* * TODO: * allocate more timeout table slots when table overflows. @@ -87,58 +102,62 @@ int callwheelsize, callwheelmask; /* - * The callout cpu migration entity represents informations necessary for - * describing the migrating callout to the new callout cpu. + * The callout cpu exec entities represent informations necessary for + * describing the state of callouts currently running on the CPU and the ones + * necessary for migrating callouts to the new callout cpu. In particular, + * the first entry of the array cc_exec_entity holds informations for callout + * running in SWI thread context, while the second one holds informations + * for callout running directly from hardware interrupt context. 
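[Editor's note: a hypothetical worked example, not part of the patch, of how the kern.timecounter.allowdeviation tunable above becomes a per-request precision. If tc_timepercentage were 5, tc_adjprecision() computes tc_timeexp = fls(roundup2(100 / 5, 2)) = fls(20) = 5, and kern_nanosleep() then grants a slop of interval / 2^5, about 3% of the requested interval and always at or below the configured percentage.]

/* Hypothetical helper showing the kern_nanosleep() precision step. */
static void
sleep_precision(const struct bintime *interval, struct bintime *prec)
{

	*prec = *interval;
	bintime_divpow2(prec, tc_timeexp);
}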
* The cached informations are very important for deferring migration when * the migrating callout is already running. */ -struct cc_mig_ent { +struct cc_exec { + struct callout *cc_next; + struct callout *cc_curr; #ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - int ce_migration_ticks; + void (*ce_migration_func)(void *); + void *ce_migration_arg; + int ce_migration_cpu; + struct bintime ce_migration_time; #endif + int cc_cancel; + int cc_waiting; }; /* - * There is one struct callout_cpu per cpu, holding all relevant + * There is one struct callou_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. - * In particular: - * cc_ticks is incremented once per tick in callout_cpu(). - * It tracks the global 'ticks' but in a way that the individual - * threads should not worry about races in the order in which - * hardclock() and hardclock_cpu() run on the various CPUs. - * cc_softclock is advanced in callout_cpu() to point to the - * first entry in cc_callwheel that may need handling. In turn, - * a softclock() is scheduled so it can serve the various entries i - * such that cc_softclock <= i <= cc_ticks . - * XXX maybe cc_softclock and cc_ticks should be volatile ? - * - * cc_ticks is also used in callout_reset_cpu() to determine - * when the callout should be served. */ struct callout_cpu { struct mtx_padalign cc_lock; - struct cc_mig_ent cc_migrating_entity; + struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; struct callout_tailq *cc_callwheel; + struct callout_tailq cc_expireq; struct callout_list cc_callfree; - struct callout *cc_next; - struct callout *cc_curr; + struct bintime cc_firstevent; + struct bintime cc_lastscan; void *cc_cookie; - int cc_ticks; - int cc_softticks; - int cc_cancel; - int cc_waiting; - int cc_firsttick; }; +#define cc_exec_curr cc_exec_entity[0].cc_curr +#define cc_exec_next cc_exec_entity[0].cc_next +#define cc_exec_cancel cc_exec_entity[0].cc_cancel +#define cc_exec_waiting cc_exec_entity[0].cc_waiting +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr +#define cc_exec_next_dir cc_exec_entity[1].cc_next +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting + #ifdef SMP -#define cc_migration_func cc_migrating_entity.ce_migration_func -#define cc_migration_arg cc_migrating_entity.ce_migration_arg -#define cc_migration_cpu cc_migrating_entity.ce_migration_cpu -#define cc_migration_ticks cc_migrating_entity.ce_migration_ticks +#define cc_migration_func cc_exec_entity[0].ce_migration_func +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu +#define cc_migration_time cc_exec_entity[0].ce_migration_time +#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu +#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU @@ -153,40 +172,51 @@ #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) +#define TIME_T_MAX \ + (sizeof(time_t) == (sizeof(int64_t)) ? 
INT64_MAX : INT32_MAX) + static int timeout_cpu; -void (*callout_new_inserted)(int cpu, int ticks) = NULL; +void (*callout_new_inserted)(int cpu, struct bintime bt, + struct bintime bt_opt) = NULL; +static void +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, + int *lockcalls, int *gcalls, int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is curr_callout. - * If curr_callout is non-NULL, threads waiting in + * cc_curr - If a callout is in progress, it is cc_curr. + * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and c_lock held + * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after - * c_lock is successfully acquired. + * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when - * curr_callout is non-NULL. + * cc_curr is non-NULL. */ /* * Resets the migration entity tied to a specific callout cpu. */ static void -cc_cme_cleanup(struct callout_cpu *cc) +cc_cme_cleanup(struct callout_cpu *cc, int direct) { - + + cc->cc_exec_entity[direct].cc_curr = NULL; + cc->cc_exec_entity[direct].cc_next = NULL; + cc->cc_exec_entity[direct].cc_cancel = 0; + cc->cc_exec_entity[direct].cc_waiting = 0; #ifdef SMP - cc->cc_migration_cpu = CPUBLOCK; - cc->cc_migration_ticks = 0; - cc->cc_migration_func = NULL; - cc->cc_migration_arg = NULL; + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; + bintime_clear(&cc->cc_exec_entity[direct].ce_migration_time); + cc->cc_exec_entity[direct].ce_migration_func = NULL; + cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } @@ -194,18 +224,19 @@ * Checks if migration is requested by a specific callout cpu. */ static int -cc_cme_migrating(struct callout_cpu *cc) +cc_cme_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP - return (cc->cc_migration_cpu != CPUBLOCK); + + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif } /* - * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization + * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization * * This code is called very early in the kernel initialization sequence, * and may be called more then once. 
@@ -242,7 +273,9 @@ for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&cc->cc_callwheel[i]); } - cc_cme_cleanup(cc); + TAILQ_INIT(&cc->cc_expireq); + for (i = 0; i < 2; i++) + cc_cme_cleanup(cc, i); if (cc->cc_callout == NULL) return; for (i = 0; i < ncallout; i++) { @@ -330,28 +363,175 @@ SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); +#define CC_HASH_SHIFT 10 + +static inline int +callout_hash(struct bintime *bt) +{ + + return (int) ((bt->sec << CC_HASH_SHIFT) + + (bt->frac >> (64 - CC_HASH_SHIFT))); +} + +static inline int +get_bucket(struct bintime *bt) +{ + + return callout_hash(bt) & callwheelmask; +} + void -callout_tick(void) +callout_process(struct bintime *now) { + struct bintime max, min, next, next_opt, tmp_max, tmp_min; + struct callout *tmp; struct callout_cpu *cc; - int need_softclock; - int bucket; + struct callout_tailq *sc; + int cpu, depth_dir, firstb, mpcalls_dir, lastb, nowb, lockcalls_dir, + need_softclock, exit_allowed, exit_wanted; - /* - * Process callouts at a very low cpu priority, so we don't keep the - * relatively high clock interrupt priority any longer than necessary. - */ need_softclock = 0; + depth_dir = 0; + mpcalls_dir = 0; + lockcalls_dir = 0; cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - cc->cc_firsttick = cc->cc_ticks = ticks; - for (; (cc->cc_softticks - cc->cc_ticks) <= 0; cc->cc_softticks++) { - bucket = cc->cc_softticks & callwheelmask; - if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) { - need_softclock = 1; - break; + cpu = curcpu; + + /* Compute the buckets of the last scan and present times. */ + firstb = callout_hash(&cc->cc_lastscan); + nowb = callout_hash(now); + + /* Compute the last bucket and minimum time of the bucket after it. */ + next = next_opt = *now; + bintime_addx(&next, (uint64_t)3 << (64 - 2)); /* 0.75s */ + next.frac &= (0xffffffffffffffffLLU << (64 - CC_HASH_SHIFT)); + bintime_addx(&next_opt, (uint64_t)3 << (64 - 3)); /* 0.37s */ + lastb = callout_hash(&next) - 1; + + /* + * Check if we wrapped around the entire wheel from the last scan. + * In case, we need to scan entirely the wheel for pending callouts. + */ + if (lastb - firstb >= callwheelsize) + lastb = firstb - 1; + if (nowb - firstb >= callwheelsize) + nowb = firstb - 1; + nowb &= callwheelmask; + lastb &= callwheelmask; + firstb &= callwheelmask; + + /* Iterate callwheel from firstb to nowb and then up to lastb. */ + min.sec = TIME_T_MAX; + min.frac = UINT64_MAX; + max = next; + exit_allowed = 0; + for (;;) { + exit_wanted = 0; + sc = &cc->cc_callwheel[firstb]; + tmp = TAILQ_FIRST(sc); + while (tmp != NULL) { + /* Compute allowed time range for the event */ + tmp_max = tmp_min = tmp->c_time; + if (bintime_isset(&tmp->c_precision)) { + bintime_add(&tmp_max, &tmp->c_precision); + bintime_sub(&tmp_min, &tmp->c_precision); + } + /* Run the callout if present time within allowed. */ + if (bintime_cmp(&tmp_min, now, <=)) { + /* + * Consumer told us the callout may be run + * directly from hardware interrupt context. + */ + if (tmp->c_flags & CALLOUT_DIRECT) { + ++depth_dir; + cc->cc_exec_next_dir = + TAILQ_NEXT(tmp, c_links.tqe); + TAILQ_REMOVE(sc, tmp, c_links.tqe); + softclock_call_cc(tmp, cc, + &mpcalls_dir, &lockcalls_dir, + NULL, 1); + tmp = cc->cc_exec_next_dir; + } else { + TAILQ_INSERT_TAIL(&cc->cc_expireq, + tmp, c_staiter); + TAILQ_REMOVE(sc, tmp, c_links.tqe); + tmp->c_flags |= CALLOUT_PROCESSED; + need_softclock = 1; + tmp = TAILQ_NEXT(tmp, c_links.tqe); + } + continue; + } + /* Skip events from distant future. 
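[Editor's note: a hypothetical worked example, not part of the patch, of callout_hash() above: it concatenates the seconds with the top CC_HASH_SHIFT bits of the fraction, so consecutive buckets are 2^-10 s (about 0.98 ms) apart before get_bucket() masks the result with callwheelmask.]

static int
callout_hash_example(void)
{
	struct bintime bt;

	bt.sec = 2;
	bt.frac = (uint64_t)1 << 63;	/* 2.5 seconds of uptime */
	/* (2 << 10) + (2^63 >> 54) == 2048 + 512 == 2560 */
	return ((int)((bt.sec << 10) + (bt.frac >> (64 - 10))));
}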
*/ + if (bintime_cmp(&tmp_min, &next, >=)) + goto next; + /* + * This is the fist event we're going to process or + * event maximal time is less than present minimal. + * In both cases, take it. + */ + if (bintime_cmp(&tmp_max, &min, <)) { + max = tmp_max; + min = tmp_min; + goto next; + } + /* + * Event minimal time is bigger than present maximal + * time, so it cannot be aggregated. + */ + if (bintime_cmp(&tmp_min, &max, >)) { + exit_wanted = 1; + goto next; + } + /* + * If neither of the two previous happened, just take + * the intersection of events. + */ + min = (bintime_cmp(&tmp_min, &min, >)) ? tmp_min : min; + max = (bintime_cmp(&tmp_max, &max, <)) ? tmp_max : max; +next: + tmp = TAILQ_NEXT(tmp, c_links.tqe); } + /* Stop if we looked far enough into the future. */ + if (firstb == lastb) + break; + /* + * Stop if we looked after present time and found + * some event we can't execute at now. + */ + if (firstb == nowb) + exit_allowed = 1; + if (exit_allowed && exit_wanted) + break; + /* Proceed with the next bucket. */ + firstb = (firstb + 1) & callwheelmask; + } + cc->cc_exec_next_dir = NULL; + if (min.sec != TIME_T_MAX) { + /* + * Now that we found something to aggregate, schedule an + * interrupt in the middle of the previously calculated range. + */ + if (bintime_cmp(&max, &min, !=)) { + bintime_add(&max, &min); + next = max; + next.frac >>= 1; + if (next.sec & 1) + next.frac |= ((uint64_t)1 << 63); + next.sec >>= 1; + } else + next = max; + next_opt = min; } + if (callout_new_inserted != NULL) + (*callout_new_inserted)(cpu, next, next_opt); + cc->cc_firstevent = next; + cc->cc_lastscan = *now; +#ifdef CALLOUT_PROFILING + avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; + avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; + avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; +#endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it @@ -361,33 +541,6 @@ swi_sched(cc->cc_cookie, 0); } -int -callout_tickstofirst(int limit) -{ - struct callout_cpu *cc; - struct callout *c; - struct callout_tailq *sc; - int curticks; - int skip = 1; - - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - curticks = cc->cc_ticks; - while( skip < ncallout && skip < limit ) { - sc = &cc->cc_callwheel[ (curticks+skip) & callwheelmask ]; - /* search scanning ticks */ - TAILQ_FOREACH( c, sc, c_links.tqe ){ - if (c->c_time - curticks <= ncallout) - goto out; - } - skip++; - } -out: - cc->cc_firsttick = curticks + skip; - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); - return (skip); -} - static struct callout_cpu * callout_lock(struct callout *c) { @@ -413,25 +566,42 @@ } static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, int to_ticks, - void (*func)(void *), void *arg, int cpu) +callout_cc_add(struct callout *c, struct callout_cpu *cc, + struct bintime to_bintime, struct bintime precision, void (*func)(void *), + void *arg, int cpu, int flags) { + struct bintime bt; + int bucket; CC_LOCK_ASSERT(cc); - - if (to_ticks <= 0) - to_ticks = 1; + if (bintime_cmp(&to_bintime, &cc->cc_lastscan, <)) + to_bintime = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + if (flags & C_DIRECT_EXEC) + c->c_flags |= CALLOUT_DIRECT; + c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; - c->c_time = ticks + to_ticks; - TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask], - c, c_links.tqe); - if ((c->c_time - cc->cc_firsttick) < 0 && - 
callout_new_inserted != NULL) { - cc->cc_firsttick = c->c_time; - (*callout_new_inserted)(cpu, - to_ticks + (ticks - cc->cc_ticks)); + c->c_time = to_bintime; + c->c_precision = precision; + CTR4(KTR_CALLOUT, "precision set for %p: %d.%08x%08x", + c, c->c_precision.sec, (u_int) (c->c_precision.frac >> 32), + (u_int) (c->c_precision.frac & 0xffffffff)); + bucket = get_bucket(&c->c_time); + TAILQ_INSERT_TAIL(&cc->cc_callwheel[bucket], c, c_links.tqe); + /* + * Inform the eventtimers(4) subsystem there's a new callout + * that has been inserted, but only if really required. + */ + bt = c->c_time; + bintime_add(&bt, &c->c_precision); + if (callout_new_inserted != NULL && + (bintime_cmp(&bt, &cc->cc_firstevent, <) || + !bintime_isset(&cc->cc_firstevent))) { + cc->cc_firstevent = c->c_time; + bt = c->c_time; + bintime_sub(&bt, &c->c_precision); + (*callout_new_inserted)(cpu, c->c_time, bt); } } @@ -447,7 +617,7 @@ static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, - int *lockcalls, int *gcalls) + int *lockcalls, int *gcalls, int direct) { void (*c_func)(void *); void *c_arg; @@ -458,7 +628,8 @@ struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; - int new_cpu, new_ticks; + int flags, new_cpu; + struct bintime new_time; #endif #ifdef DIAGNOSTIC struct bintime bt1, bt2; @@ -480,8 +651,8 @@ c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; - cc->cc_curr = c; - cc->cc_cancel = 0; + cc->cc_exec_entity[direct].cc_curr = c; + cc->cc_exec_entity[direct].cc_cancel = 0; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, sharedlock); @@ -489,14 +660,18 @@ * The callout may have been cancelled * while we switched locks. */ - if (cc->cc_cancel) { + if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ - cc->cc_cancel = 1; - - if (c_lock == &Giant.lock_object) { + cc->cc_exec_entity[direct].cc_cancel = 1; + /* + * In case we're processing a direct callout we + * can't hold giant because holding a sleep mutex + * from hardware interrupt context is not allowed. + */ + if ((c_lock == &Giant.lock_object) && gcalls != NULL) { (*gcalls)++; CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); @@ -513,11 +688,13 @@ #ifdef DIAGNOSTIC binuptime(&bt1); #endif - THREAD_NO_SLEEPING(); + if (!direct) + THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0); c_func(c_arg); SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0); - THREAD_SLEEPING_OK(); + if (!direct) + THREAD_SLEEPING_OK(); #ifdef DIAGNOSTIC binuptime(&bt2); bintime_sub(&bt2, &bt1); @@ -537,17 +714,17 @@ class->lc_unlock(c_lock); skip: CC_LOCK(cc); - KASSERT(cc->cc_curr == c, ("mishandled cc_curr")); - cc->cc_curr = NULL; - if (cc->cc_waiting) { + KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); + cc->cc_exec_entity[direct].cc_curr = NULL; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. 
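[Editor's note: a minimal sketch, not part of the patch, of the bintime halving used when callout_process() above schedules the interrupt at the midpoint of the aggregated [min, max] window: shift the fraction right and carry the low bit of the seconds into its top bit. Valid for the non-negative uptimes used here; the helper name is hypothetical.]

static struct bintime
bintime_half(struct bintime bt)
{

	bt.frac >>= 1;
	if (bt.sec & 1)
		bt.frac |= ((uint64_t)1 << 63);
	bt.sec >>= 1;
	return (bt);
}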
*/ - if (cc_cme_migrating(cc)) { - cc_cme_cleanup(cc); + if (cc_cme_migrating(cc, direct)) { + cc_cme_cleanup(cc, direct); /* * It should be assert here that the callout is not @@ -555,11 +732,11 @@ */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } - cc->cc_waiting = 0; + cc->cc_exec_entity[direct].cc_waiting = 0; CC_UNLOCK(cc); - wakeup(&cc->cc_waiting); + wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); - } else if (cc_cme_migrating(cc)) { + } else if (cc_cme_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP @@ -567,11 +744,11 @@ * If the callout was scheduled for * migration just perform it now. */ - new_cpu = cc->cc_migration_cpu; - new_ticks = cc->cc_migration_ticks; - new_func = cc->cc_migration_func; - new_arg = cc->cc_migration_arg; - cc_cme_cleanup(cc); + new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; + new_time = cc->cc_exec_entity[direct].ce_migration_time; + new_func = cc->cc_exec_entity[direct].ce_migration_func; + new_arg = cc->cc_exec_entity[direct].ce_migration_arg; + cc_cme_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed @@ -589,8 +766,9 @@ c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); - callout_cc_add(c, new_cc, new_ticks, new_func, new_arg, - new_cpu); + flags = (direct) ? C_DIRECT_EXEC : 0; + callout_cc_add(c, new_cc, new_time, c->c_precision, new_func, + new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else @@ -613,7 +791,7 @@ } /* - * The callout mechanism is based on the work of Adam M. Costello and + * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures @@ -633,63 +811,29 @@ { struct callout_cpu *cc; struct callout *c; - struct callout_tailq *bucket; - int curticks; - int steps; /* #steps since we last allowed interrupts */ - int depth; - int mpcalls; - int lockcalls; - int gcalls; - -#ifndef MAX_SOFTCLOCK_STEPS -#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ -#endif /* MAX_SOFTCLOCK_STEPS */ + int depth, gcalls, lockcalls, mpcalls; + depth = 0; mpcalls = 0; lockcalls = 0; gcalls = 0; - depth = 0; - steps = 0; cc = (struct callout_cpu *)arg; CC_LOCK(cc); - while (cc->cc_softticks - 1 != cc->cc_ticks) { - /* - * cc_softticks may be modified by hard clock, so cache - * it while we work on a given bucket. - */ - curticks = cc->cc_softticks; - cc->cc_softticks++; - bucket = &cc->cc_callwheel[curticks & callwheelmask]; - c = TAILQ_FIRST(bucket); - while (c != NULL) { - depth++; - if (c->c_time != curticks) { - c = TAILQ_NEXT(c, c_links.tqe); - ++steps; - if (steps >= MAX_SOFTCLOCK_STEPS) { - cc->cc_next = c; - /* Give interrupts a chance. 
*/ - CC_UNLOCK(cc); - ; /* nothing */ - CC_LOCK(cc); - c = cc->cc_next; - steps = 0; - } - } else { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(bucket, c, c_links.tqe); - softclock_call_cc(c, cc, &mpcalls, - &lockcalls, &gcalls); - steps = 0; - c = cc->cc_next; - } - } + c = TAILQ_FIRST(&cc->cc_expireq); + while (c != NULL) { + ++depth; + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter); + TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter); + softclock_call_cc(c, cc, &mpcalls, &lockcalls, &gcalls, 0); + c = cc->cc_exec_next; } + cc->cc_exec_next = NULL; +#ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; - cc->cc_next = NULL; +#endif CC_UNLOCK(cc); } @@ -704,7 +848,7 @@ * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This - * implementation differs from that one in that although an + * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. @@ -779,28 +923,52 @@ * callout_deactivate() - marks the callout as having been serviced */ int -callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), - void *arg, int cpu) +_callout_reset_on(struct callout *c, struct bintime *bt, + struct bintime *precision, int to_ticks, void (*ftn)(void *), + void *arg, int cpu, int flags) { + struct bintime now, to_bt, pr; struct callout_cpu *cc; - int cancelled = 0; + int bucket, cancelled, direct; + cancelled = 0; + if (bt == NULL) { + pr = to_bt = tick_bt; + getbinuptime(&now); + if (to_ticks > 1) + bintime_mul(&to_bt, to_ticks); + bintime_add(&to_bt, &now); + to_ticks >>= C_PRELGET(flags); + if (to_ticks == 0) + pr = halftick_bt; + else + bintime_mul(&pr, to_ticks); + } else { + to_bt = *bt; + if (precision != NULL) + pr = *precision; + else + bintime_clear(&pr); + } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; + direct = c->c_flags & CALLOUT_DIRECT; cc = callout_lock(c); - if (cc->cc_curr == c) { + if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ - if (c->c_lock != NULL && !cc->cc_cancel) - cancelled = cc->cc_cancel = 1; - if (cc->cc_waiting) { + if (c->c_lock != NULL && + !cc->cc_exec_entity[direct].cc_cancel) + cancelled = + cc->cc_exec_entity[direct].cc_cancel = 1; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. 
@@ -813,12 +981,18 @@ } } if (c->c_flags & CALLOUT_PENDING) { - if (cc->cc_next == c) { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = TAILQ_NEXT(c, + c_links.tqe); + bucket = get_bucket(&c->c_time); + TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, + c_links.tqe); + } else { + if (cc->cc_exec_next == c) + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter); + TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter); } - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); - cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } @@ -830,15 +1004,17 @@ * to a more appropriate moment. */ if (c->c_cpu != cpu) { - if (cc->cc_curr == c) { - cc->cc_migration_cpu = cpu; - cc->cc_migration_ticks = to_ticks; - cc->cc_migration_func = ftn; - cc->cc_migration_arg = arg; + if (cc->cc_exec_entity[direct].cc_curr == c) { + cc->cc_exec_entity[direct].ce_migration_cpu = cpu; + cc->cc_exec_entity[direct].ce_migration_time + = to_bt; + cc->cc_exec_entity[direct].ce_migration_func = ftn; + cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; - CTR5(KTR_CALLOUT, - "migration of %p func %p arg %p in %d to %u deferred", - c, c->c_func, c->c_arg, to_ticks, cpu); + CTR6(KTR_CALLOUT, + "migration of %p func %p arg %p in %d.%08x to %u deferred", + c, c->c_func, c->c_arg, (int)(to_bt.sec), + (u_int)(to_bt.frac >> 32), cpu); CC_UNLOCK(cc); return (cancelled); } @@ -846,9 +1022,10 @@ } #endif - callout_cc_add(c, cc, to_ticks, ftn, arg, cpu); - CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d", - cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks); + callout_cc_add(c, cc, to_bt, pr, ftn, arg, cpu, flags); + CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", + cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_bt.sec), + (u_int)(to_bt.frac >> 32)); CC_UNLOCK(cc); return (cancelled); @@ -876,7 +1053,7 @@ { struct callout_cpu *cc, *old_cc; struct lock_class *class; - int use_lock, sq_locked; + int bucket, direct, sq_locked, use_lock; /* * Some old subsystems don't hold Giant while running a callout_stop(), @@ -892,7 +1069,7 @@ } } else use_lock = 0; - + direct = c->c_flags & CALLOUT_DIRECT; sq_locked = 0; old_cc = NULL; again: @@ -906,7 +1083,7 @@ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_waiting); + sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; @@ -927,12 +1104,13 @@ * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ - if (cc->cc_curr != c) { + if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release( + &cc->cc_exec_entity[direct].cc_waiting); return (0); } @@ -943,8 +1121,7 @@ * just wait for the current invocation to * finish. */ - while (cc->cc_curr == c) { - + while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid @@ -964,7 +1141,8 @@ */ if (!sq_locked) { CC_UNLOCK(cc); - sleepq_lock(&cc->cc_waiting); + sleepq_lock( + &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; @@ -976,13 +1154,16 @@ * will be packed up, just let softclock() * take care of it. 
*/ - cc->cc_waiting = 1; + cc->cc_exec_entity[direct].cc_waiting = 1; DROP_GIANT(); CC_UNLOCK(cc); - sleepq_add(&cc->cc_waiting, + sleepq_add( + &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); - sleepq_wait(&cc->cc_waiting, 0); + sleepq_wait( + &cc->cc_exec_entity[direct].cc_waiting, + 0); sq_locked = 0; old_cc = NULL; @@ -990,7 +1171,8 @@ PICKUP_GIANT(); CC_LOCK(cc); } - } else if (use_lock && !cc->cc_cancel) { + } else if (use_lock && + !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout @@ -998,10 +1180,10 @@ * lock, the callout will be skipped in * softclock(). */ - cc->cc_cancel = 1; + cc->cc_exec_entity[direct].cc_cancel = 1; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - KASSERT(!cc_cme_migrating(cc), + KASSERT(!cc_cme_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); @@ -1020,16 +1202,22 @@ return (0); } if (sq_locked) - sleepq_release(&cc->cc_waiting); - + sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if (cc->cc_next == c) - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe); + bucket = get_bucket(&c->c_time); + TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, + c_links.tqe); + } else { + if (cc->cc_exec_next == c) + cc->cc_exec_next = TAILQ_NEXT(c, c_links.tqe); + TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter); + } callout_cc_del(c, cc); CC_UNLOCK(cc); diff -urN -x -p head-davide/sys/kern/subr_sleepqueue.c calloutng/sys/kern/subr_sleepqueue.c --- head-davide/sys/kern/subr_sleepqueue.c 2012-09-30 17:50:32.000000000 +0200 +++ calloutng/sys/kern/subr_sleepqueue.c 2012-12-11 09:30:13.000000000 +0100 @@ -362,9 +362,11 @@ * Sets a timeout that will remove the current thread from the specified * sleep queue after timo ticks if the thread has not already been awakened. 
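[Editor's note: a minimal sketch, not part of the patch, of consumer-side use of the reworked interface, following the pattern of sleepq_set_timeout and the kqueue timer filter in this patch: an absolute bintime deadline, an optional precision, and C_DIRECT_EXEC to request execution from hardware interrupt context. The callout, handler, and values are hypothetical.]

static struct callout example_co;	/* hypothetical; callout_init()ed elsewhere */

static void
example_handler(void *arg)
{
}

static void
example_arm(void)
{
	struct bintime bt, pr;

	getbinuptime(&bt);
	bt.sec += 1;			/* absolute deadline ~1 s from now */
	bintime_clear(&pr);
	pr.frac = (uint64_t)1 << 54;	/* allow ~1 ms of slop */
	callout_reset_bt_on(&example_co, &bt, &pr, example_handler, NULL,
	    PCPU_GET(cpuid), C_DIRECT_EXEC);
}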
*/ -void -sleepq_set_timeout(void *wchan, int timo) +void +_sleepq_set_timeout(void *wchan, struct bintime *bt, struct bintime *precision, + int timo, int flags) { + struct sleepqueue_chain *sc; struct thread *td; @@ -374,7 +376,12 @@ MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_sleepqueue == NULL); MPASS(wchan != NULL); - callout_reset_curcpu(&td->td_slpcallout, timo, sleepq_timeout, td); + if (bt == NULL) + callout_reset_flags_on(&td->td_slpcallout, timo, + sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); + else + callout_reset_bt_on(&td->td_slpcallout, bt, precision, + sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); } /* diff -urN -x -p head-davide/sys/kern/sys_generic.c calloutng/sys/kern/sys_generic.c --- head-davide/sys/kern/sys_generic.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/sys_generic.c 2012-12-13 13:57:07.000000000 +0100 @@ -102,7 +102,8 @@ off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); -static int seltdwait(struct thread *, int); +static int seltdwait(struct thread *, struct bintime *, struct bintime *, + int); static void seltdclear(struct thread *); /* @@ -902,10 +903,12 @@ */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; - struct timeval atv, rtv, ttv; - int error, lf, ndu, timo; + struct bintime abt, precision, rbt; + struct timeval atv; + int error, lf, ndu; u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; + timevalclear(&atv); if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; @@ -996,33 +999,37 @@ if (tvp != NULL) { atv = *tvp; - if (itimerfix(&atv)) { + if (atv.tv_sec < 0 || atv.tv_usec < 0 || + atv.tv_usec >= 1000000) { error = EINVAL; goto done; } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); + timeval2bintime(&atv, &abt); + precision = abt; + bintime_divpow2(&precision, tc_timeexp); + if (TIMESEL(&rbt, &abt)) + bintime_add(&abt, &tick_bt); + bintime_add(&abt, &rbt); + bintime_add(&abt, &precision); } else { - atv.tv_sec = 0; - atv.tv_usec = 0; + abt.sec = 0; + abt.frac = 0; } - timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) + if (abt.sec || abt.frac) { + TIMESEL(&rbt, &abt); + if (bintime_cmp(&rbt, &abt, >=)) break; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? 
diff -urN -x -p head-davide/sys/kern/sys_generic.c calloutng/sys/kern/sys_generic.c
--- head-davide/sys/kern/sys_generic.c	2012-12-07 07:27:02.000000000 +0100
+++ calloutng/sys/kern/sys_generic.c	2012-12-13 13:57:07.000000000 +0100
@@ -102,7 +102,8 @@
 	    off_t, int);
 static void	doselwakeup(struct selinfo *, int);
 static void	seltdinit(struct thread *);
-static int	seltdwait(struct thread *, int);
+static int	seltdwait(struct thread *, struct bintime *, struct bintime *,
+		    int);
 static void	seltdclear(struct thread *);
 
 /*
@@ -902,10 +903,12 @@
 	 */
 	fd_mask s_selbits[howmany(2048, NFDBITS)];
 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
-	struct timeval atv, rtv, ttv;
-	int error, lf, ndu, timo;
+	struct bintime abt, precision, rbt;
+	struct timeval atv;
+	int error, lf, ndu;
 	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
 
+	timevalclear(&atv);
 	if (nd < 0)
 		return (EINVAL);
 	fdp = td->td_proc->p_fd;
@@ -996,33 +999,37 @@
 	if (tvp != NULL) {
 		atv = *tvp;
-		if (itimerfix(&atv)) {
+		if (atv.tv_sec < 0 || atv.tv_usec < 0 ||
+		    atv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
+		timeval2bintime(&atv, &abt);
+		precision = abt;
+		bintime_divpow2(&precision, tc_timeexp);
+		if (TIMESEL(&rbt, &abt))
+			bintime_add(&abt, &tick_bt);
+		bintime_add(&abt, &rbt);
+		bintime_add(&abt, &precision);
 	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
+		abt.sec = 0;
+		abt.frac = 0;
 	}
-	timo = 0;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = selscan(td, ibits, obits, nd);
 		if (error || td->td_retval[0] != 0)
 			break;
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=))
+		if (abt.sec || abt.frac) {
+			TIMESEL(&rbt, &abt);
+			if (bintime_cmp(&rbt, &abt, >=))
 				break;
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
+			error = seltdwait(td, &abt, &precision, 0);
+		}
+		else {
+			error = seltdwait(td, NULL, NULL, 0);
 		}
-		error = seltdwait(td, timo);
 		if (error)
 			break;
 		error = selrescan(td, ibits, obits);
@@ -1254,11 +1261,13 @@
 {
 	struct pollfd *bits;
 	struct pollfd smallbits[32];
-	struct timeval atv, rtv, ttv;
-	int error, timo;
+	struct bintime abt, precision, rbt;
+	struct timeval atv;
+	int error;
 	u_int nfds;
 	size_t ni;
 
+	timevalclear(&atv);
 	nfds = uap->nfds;
 	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
 		return (EINVAL);
@@ -1273,33 +1282,36 @@
 	if (uap->timeout != INFTIM) {
 		atv.tv_sec = uap->timeout / 1000;
 		atv.tv_usec = (uap->timeout % 1000) * 1000;
-		if (itimerfix(&atv)) {
+		if (atv.tv_sec < 0 || atv.tv_usec < 0 ||
+		    atv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
+		timeval2bintime(&atv, &abt);
+		precision = abt;
+		bintime_divpow2(&precision, tc_timeexp);
+		if (TIMESEL(&rbt, &abt))
+			bintime_add(&abt, &tick_bt);
+		bintime_add(&abt, &rbt);
+		bintime_add(&abt, &precision);
 	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
+		abt.sec = 0;
+		abt.frac = 0;
 	}
-	timo = 0;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = pollscan(td, bits, nfds);
 		if (error || td->td_retval[0] != 0)
 			break;
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=))
+		if (abt.sec || abt.frac) {
+			TIMESEL(&rbt, &abt);
+			if (bintime_cmp(&rbt, &abt, >=))
 				break;
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
+			error = seltdwait(td, &abt, &precision, 0);
+		} else {
+			error = seltdwait(td, NULL, NULL, 0);
 		}
-		error = seltdwait(td, timo);
 		if (error)
 			break;
 		error = pollrescan(td);
@@ -1641,7 +1653,8 @@
 }
 
 static int
-seltdwait(struct thread *td, int timo)
+seltdwait(struct thread *td, struct bintime *bt, struct bintime *precision,
+    int timo)
 {
 	struct seltd *stp;
 	int error;
@@ -1660,9 +1673,12 @@
 		mtx_unlock(&stp->st_mtx);
 		return (0);
 	}
-	if (timo > 0)
+	if (bt == NULL && timo > 0)
 		error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
-	else
+	else if (bt != NULL)
+		error = cv_timedwait_sig_bt(&stp->st_wait, &stp->st_mtx,
+		    bt, precision);
+	else
 		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
 	mtx_unlock(&stp->st_mtx);
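The same deadline computation appears twice above, once in select() and once in poll(). Written out once as a hypothetical helper it reads as below; the function name is invented, but the body just restates the pattern from the hunks above: the requested interval becomes an absolute uptime deadline, the allowed precision is the interval divided by 2^tc_timeexp, and TIMESEL() returns 1 when the cheaper getbinuptime() clock was considered good enough, in which case one tick is added to cover its granularity.

/* Hypothetical helper; restates the select()/poll() pattern above. */
static void
timo_to_deadline(struct timeval *tv, struct bintime *abt, struct bintime *pr)
{
	struct bintime rbt;

	timeval2bintime(tv, abt);		/* relative interval */
	*pr = *abt;
	bintime_divpow2(pr, tc_timeexp);	/* tolerance = interval / 2^tc_timeexp */
	if (TIMESEL(&rbt, abt))			/* coarse clock used: add a tick */
		bintime_add(abt, &tick_bt);
	bintime_add(abt, &rbt);			/* make the deadline absolute */
	bintime_add(abt, pr);			/* bias the deadline by the tolerance */
}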
diff -urN -x -p head-davide/sys/netinet/tcp_timer.c calloutng/sys/netinet/tcp_timer.c
--- head-davide/sys/netinet/tcp_timer.c	2012-12-07 07:27:04.000000000 +0100
+++ calloutng/sys/netinet/tcp_timer.c	2012-12-11 09:27:56.000000000 +0100
@@ -712,21 +712,39 @@
 
 #define	ticks_to_msecs(t)	(1000*(t) / hz)
 
+static int
+delta_bintime_in_msecs(struct bintime bt, struct bintime now)
+{
+	bintime_sub(&bt, &now);
+	return (((uint64_t)1000 * (uint64_t)(bt.frac >> 32)) >> 32) +
+	    (bt.sec * 1000);
+}
+
 void
-tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer)
+tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
+    struct xtcp_timer *xtimer)
 {
-	bzero(xtimer, sizeof(struct xtcp_timer));
+	struct bintime bt, now;
+
+	bzero(xtimer, sizeof(*xtimer));
 	if (timer == NULL)
 		return;
-	if (callout_active(&timer->tt_delack))
-		xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks);
-	if (callout_active(&timer->tt_rexmt))
-		xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks);
-	if (callout_active(&timer->tt_persist))
-		xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks);
-	if (callout_active(&timer->tt_keep))
-		xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks);
-	if (callout_active(&timer->tt_2msl))
-		xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks);
+	bintime_clear(&bt);
+	getbinuptime(&now);
+	if (callout_active(&timer->tt_delack))
+		xtimer->tt_delack = delta_bintime_in_msecs(
+		    timer->tt_delack.c_time, now);
+	if (callout_active(&timer->tt_rexmt))
+		xtimer->tt_rexmt = delta_bintime_in_msecs(
+		    timer->tt_rexmt.c_time, now);
+	if (callout_active(&timer->tt_persist))
+		xtimer->tt_persist = delta_bintime_in_msecs(
+		    timer->tt_persist.c_time, now);
+	if (callout_active(&timer->tt_keep))
+		xtimer->tt_keep = delta_bintime_in_msecs(
+		    timer->tt_keep.c_time, now);
+	if (callout_active(&timer->tt_2msl))
+		xtimer->tt_2msl = delta_bintime_in_msecs(
+		    timer->tt_2msl.c_time, now);
 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
 }
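delta_bintime_in_msecs() above leans on the fixed-point layout of struct bintime: frac is a 64-bit binary fraction of one second, so frac >> 32 expresses that fraction in 1/2^32 units, and multiplying by 1000 and shifting right by 32 again yields milliseconds, with each whole second contributing another 1000. A standalone restatement of the conversion (the name is hypothetical, the arithmetic is the same):

/* Hypothetical standalone version of the conversion used above. */
static int
bintime_to_msecs(struct bintime bt)
{

	/* (frac / 2^64) * 1000 in two 32-bit steps, plus whole seconds. */
	return ((((uint64_t)1000 * (uint64_t)(bt.frac >> 32)) >> 32) +
	    (bt.sec * 1000));
}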
diff -urN -x -p head-davide/sys/ofed/include/linux/timer.h calloutng/sys/ofed/include/linux/timer.h
--- head-davide/sys/ofed/include/linux/timer.h	2012-08-03 20:49:21.000000000 +0200
+++ calloutng/sys/ofed/include/linux/timer.h	2012-12-11 09:30:13.000000000 +0100
@@ -38,10 +38,9 @@
 	struct callout	timer_callout;
 	void		(*function)(unsigned long);
 	unsigned long	data;
+	int		expires;
 };
 
-#define	expires	timer_callout.c_time
-
 static inline void
 _timer_fn(void *context)
 {
@@ -65,13 +64,16 @@
 	callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE);	\
 } while (0)
 
-#define	mod_timer(timer, expire)				\
-	callout_reset(&(timer)->timer_callout, (expire) - jiffies,	\
-	    _timer_fn, (timer))
+#define	mod_timer(timer, exp)					\
+do {								\
+	(timer)->expires = (exp);				\
+	callout_reset(&(timer)->timer_callout, (exp) - jiffies,	\
+	    _timer_fn, (timer));				\
+} while (0)
 
 #define	add_timer(timer)					\
 	callout_reset(&(timer)->timer_callout,			\
-	    (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer))
+	    (timer)->expires - jiffies, _timer_fn, (timer))
 
 #define	del_timer(timer)	callout_stop(&(timer)->timer_callout)
 #define	del_timer_sync(timer)	callout_drain(&(timer)->timer_callout)
diff -urN -x -p head-davide/sys/pc98/pc98/machdep.c calloutng/sys/pc98/pc98/machdep.c
--- head-davide/sys/pc98/pc98/machdep.c	2012-12-07 07:27:21.000000000 +0100
+++ calloutng/sys/pc98/pc98/machdep.c	2012-11-26 12:16:35.000000000 +0100
@@ -1145,7 +1145,7 @@
 #define	STATE_SLEEPING	0x2
 
 static void
-cpu_idle_hlt(int busy)
+cpu_idle_hlt(int us)
 {
 	int *state;
 
@@ -1186,7 +1186,7 @@
 #define	MWAIT_C4	0x30
 
 static void
-cpu_idle_mwait(int busy)
+cpu_idle_mwait(int us)
 {
 	int *state;
 
@@ -1209,7 +1209,7 @@
 }
 
 static void
-cpu_idle_spin(int busy)
+cpu_idle_spin(int us)
 {
 	int *state;
 	int i;
@@ -1234,6 +1234,7 @@
 void
 cpu_idle(int busy)
 {
+	int us = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
@@ -1251,11 +1252,11 @@
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
-		cpu_idleclock();
+		us = cpu_idleclock();
 	}
 
 	/* Call main idle method. */
-	cpu_idle_fn(busy);
+	cpu_idle_fn(us);
 
 	/* Switch timers mack into active mode. */
 	if (!busy) {
diff -urN -x -p head-davide/sys/powerpc/powerpc/cpu.c calloutng/sys/powerpc/powerpc/cpu.c
--- head-davide/sys/powerpc/powerpc/cpu.c	2012-08-03 20:48:33.000000000 +0200
+++ calloutng/sys/powerpc/powerpc/cpu.c	2012-11-12 12:27:29.000000000 +0100
@@ -79,9 +79,9 @@
 static void	cpu_booke_setup(int cpuid, uint16_t vers);
 
 int powerpc_pow_enabled;
-void (*cpu_idle_hook)(void) = NULL;
-static void	cpu_idle_60x(void);
-static void	cpu_idle_booke(void);
+void (*cpu_idle_hook)(int) = NULL;
+static void	cpu_idle_60x(int);
+static void	cpu_idle_booke(int);
 
 struct cputab {
 	const char	*name;
@@ -516,6 +516,7 @@
 void
 cpu_idle(int busy)
 {
+	int us = -1;
 
 #ifdef INVARIANTS
 	if ((mfmsr() & PSL_EE) != PSL_EE) {
@@ -531,9 +532,9 @@
 	if (cpu_idle_hook != NULL) {
 		if (!busy) {
 			critical_enter();
-			cpu_idleclock();
+			us = cpu_idleclock();
 		}
-		cpu_idle_hook();
+		cpu_idle_hook(us);
 		if (!busy) {
 			cpu_activeclock();
 			critical_exit();
@@ -551,7 +552,7 @@
 }
 
 static void
-cpu_idle_60x(void)
+cpu_idle_60x(int us)
 {
 	register_t msr;
 	uint16_t vers;
@@ -586,7 +587,7 @@
 }
 
 static void
-cpu_idle_booke(void)
+cpu_idle_booke(int us)
 {
 	register_t msr;
diff -urN -x -p head-davide/sys/powerpc/ps3/platform_ps3.c calloutng/sys/powerpc/ps3/platform_ps3.c
--- head-davide/sys/powerpc/ps3/platform_ps3.c	2012-08-03 20:48:33.000000000 +0200
+++ calloutng/sys/powerpc/ps3/platform_ps3.c	2012-12-11 09:28:07.000000000 +0100
@@ -70,7 +70,7 @@
 static struct cpu_group *ps3_smp_topo(platform_t);
 #endif
 static void ps3_reset(platform_t);
-static void ps3_cpu_idle(void);
+static void ps3_cpu_idle(int);
 
 static platform_method_t ps3_methods[] = {
 	PLATFORMMETHOD(platform_probe,		ps3_probe),
@@ -245,7 +245,7 @@
 }
 
 static void
-ps3_cpu_idle(void)
+ps3_cpu_idle(int us)
 {
 	lv1_pause(0);
 }
diff -urN -x -p head-davide/sys/powerpc/wii/platform_wii.c calloutng/sys/powerpc/wii/platform_wii.c
--- head-davide/sys/powerpc/wii/platform_wii.c	2012-12-07 07:27:34.000000000 +0100
+++ calloutng/sys/powerpc/wii/platform_wii.c	2012-12-11 09:28:07.000000000 +0100
@@ -60,7 +60,7 @@
 		    int *, struct mem_region **, int *);
 static unsigned long wii_timebase_freq(platform_t, struct cpuref *cpuref);
 static void wii_reset(platform_t);
-static void wii_cpu_idle(void);
+static void wii_cpu_idle(int);
 
 static platform_method_t wii_methods[] = {
 	PLATFORMMETHOD(platform_probe,		wii_probe),
@@ -155,6 +155,6 @@
 }
 
 static void
-wii_cpu_idle(void)
+wii_cpu_idle(int us)
 {
 }
diff -urN -x -p head-davide/sys/sys/_callout.h calloutng/sys/sys/_callout.h
--- head-davide/sys/sys/_callout.h	2012-08-03 20:51:25.000000000 +0200
+++ calloutng/sys/sys/_callout.h	2012-11-06 09:39:17.000000000 +0100
@@ -39,6 +39,7 @@
 #define _SYS__CALLOUT_H
 
 #include <sys/queue.h>
+#include <sys/time.h>
 
 struct lock_object;
 
@@ -50,7 +51,9 @@
 		SLIST_ENTRY(callout) sle;
 		TAILQ_ENTRY(callout) tqe;
 	} c_links;
-	int	c_time;				/* ticks to the event */
+	TAILQ_ENTRY(callout) c_staiter;
+	struct bintime c_time;			/* absolute time of the event */
+	struct bintime c_precision;		/* delta allowed wrt opt */
 	void	*c_arg;				/* function argument */
 	void	(*c_func)(void *);		/* function to call */
 	struct lock_object *c_lock;		/* lock to handle */
diff -urN -x -p head-davide/sys/sys/callout.h calloutng/sys/sys/callout.h
--- head-davide/sys/sys/callout.h	2012-08-03 20:51:21.000000000 +0200
+++ calloutng/sys/sys/callout.h	2012-12-11 09:30:13.000000000 +0100
@@ -47,6 +47,14 @@
 #define	CALLOUT_RETURNUNLOCKED	0x0010 /* handler returns with mtx unlocked */
 #define	CALLOUT_SHAREDLOCK	0x0020 /* callout lock held in shared mode */
 #define	CALLOUT_DFRMIGRATION	0x0040 /* callout in deferred migration mode */
+#define	CALLOUT_PROCESSED	0x0080 /* callout in wheel or processing list? */
+#define	CALLOUT_DIRECT		0x0100 /* allow exec from hw int context */
+
+#define	C_DIRECT_EXEC		0x0001 /* direct execution of callout */
+#define	C_PRELBITS		7
+#define	C_PRELRANGE		((1 << C_PRELBITS) - 1)
+#define	C_PRELSET(x)		((x) << 1)
+#define	C_PRELGET(x)		(((x) >> 1) & C_PRELRANGE)
 
 struct callout_handle {
 	struct callout *callout;
@@ -67,7 +75,16 @@
 	_callout_init_lock((c), ((rw) != NULL) ? &(rw)->lock_object :	\
 	    NULL, (flags))
 #define	callout_pending(c)	((c)->c_flags & CALLOUT_PENDING)
-int	callout_reset_on(struct callout *, int, void (*)(void *), void *, int);
+int	_callout_reset_on(struct callout *, struct bintime *,
+	    struct bintime *, int, void (*)(void *), void *, int, int);
+#define	callout_reset_on(c, to_ticks, fn, arg, cpu)			\
+	_callout_reset_on((c), NULL, NULL, (to_ticks), (fn), (arg),	\
+	    (cpu), 0)
+#define	callout_reset_flags_on(c, to_ticks, fn, arg, cpu, flags)	\
+	_callout_reset_on((c), NULL, NULL, (to_ticks), (fn), (arg), (cpu), \
+	    (flags))
+#define	callout_reset_bt_on(c, bt, pr, fn, arg, cpu, flags)		\
+	_callout_reset_on((c), (bt), (pr), 0, (fn), (arg), (cpu), (flags))
 #define	callout_reset(c, on_tick, fn, arg)				\
 	callout_reset_on((c), (on_tick), (fn), (arg), (c)->c_cpu)
 #define	callout_reset_curcpu(c, on_tick, fn, arg)			\
@@ -78,9 +95,9 @@
 	callout_schedule_on((c), (on_tick), PCPU_GET(cpuid))
 #define	callout_stop(c)		_callout_stop_safe(c, 0)
 int	_callout_stop_safe(struct callout *, int);
-void	callout_tick(void);
-int	callout_tickstofirst(int limit);
-extern void (*callout_new_inserted)(int cpu, int ticks);
+void	callout_process(struct bintime *);
+extern void (*callout_new_inserted)(int cpu, struct bintime bt,
+	    struct bintime);
 
 #endif
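The new callout_reset_bt_on() wrapper takes an absolute bintime deadline plus an allowed precision, and C_DIRECT_EXEC asks for the handler to run directly from hard-interrupt context. Below is a minimal sketch of a self-rearming caller; the softc, its fields, the 100-microsecond period and the 5-microsecond slop are invented for illustration, and the deadline is an absolute binuptime value, matching how the sleepqueue code earlier in this patch passes it.

/* Illustrative sketch only; "example_softc" is not from the patch. */
struct example_softc {
	struct callout	sc_callout;
};

static void
example_tick(void *arg)
{
	struct example_softc *sc = arg;
	struct bintime bt, pr;

	binuptime(&bt);
	FREQ2BT(10000, &pr);		/* 1/10000 s == 100 us period */
	bintime_add(&bt, &pr);		/* absolute deadline of the next tick */
	FREQ2BT(200000, &pr);		/* 1/200000 s == 5 us of allowed slop */
	callout_reset_bt_on(&sc->sc_callout, &bt, &pr, example_tick, sc,
	    PCPU_GET(cpuid), C_DIRECT_EXEC);
}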
diff -urN -x -p head-davide/sys/sys/condvar.h calloutng/sys/sys/condvar.h
--- head-davide/sys/sys/condvar.h	2012-08-03 20:51:21.000000000 +0200
+++ calloutng/sys/sys/condvar.h	2012-12-11 09:30:13.000000000 +0100
@@ -55,8 +55,12 @@
 void	_cv_wait(struct cv *cvp, struct lock_object *lock);
 void	_cv_wait_unlock(struct cv *cvp, struct lock_object *lock);
 int	_cv_wait_sig(struct cv *cvp, struct lock_object *lock);
-int	_cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo);
-int	_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo);
+int	_cv_timedwait(struct cv *cvp, struct lock_object *lock,
+	    struct bintime *bt, struct bintime *precision, int timo,
+	    int flags);
+int	_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock,
+	    struct bintime *bt, struct bintime *precision, int timo,
+	    int flags);
 
 void	cv_signal(struct cv *cvp);
 void	cv_broadcastpri(struct cv *cvp, int pri);
@@ -68,9 +72,23 @@
 #define	cv_wait_sig(cvp, lock)						\
 	_cv_wait_sig((cvp), &(lock)->lock_object)
 #define	cv_timedwait(cvp, lock, timo)					\
-	_cv_timedwait((cvp), &(lock)->lock_object, (timo))
+	_cv_timedwait((cvp), &(lock)->lock_object, NULL, NULL,		\
+	    (timo), 0)
+#define	cv_timedwait_bt(cvp, lock, bt, pr)				\
+	_cv_timedwait((cvp), &(lock)->lock_object, (bt),		\
+	    (pr), 0, 0)
+#define	cv_timedwait_sig_bt(cvp, lock, bt, pr)				\
+	_cv_timedwait_sig((cvp), &(lock)->lock_object, (bt), (pr), 0,	\
+	    0)
+#define	cv_timedwait_flags(cvp, lock, timo, flags)			\
+	_cv_timedwait((cvp), &(lock)->lock_object, NULL, NULL, (timo),	\
+	    (flags))
 #define	cv_timedwait_sig(cvp, lock, timo)				\
-	_cv_timedwait_sig((cvp), &(lock)->lock_object, (timo))
+	_cv_timedwait_sig((cvp), &(lock)->lock_object, NULL, NULL,	\
+	    (timo), 0)
+#define	cv_timedwait_sig_flags(cvp, lock, timo, flags)			\
+	_cv_timedwait_sig((cvp), &(lock)->lock_object, NULL, NULL,	\
+	    (timo), (flags))
 
 #define	cv_broadcast(cvp)	cv_broadcastpri(cvp, 0)
diff -urN -x -p head-davide/sys/sys/mutex.h calloutng/sys/sys/mutex.h
--- head-davide/sys/sys/mutex.h	2012-12-07 07:27:32.000000000 +0100
+++ calloutng/sys/sys/mutex.h	2012-12-11 09:30:13.000000000 +0100
@@ -376,7 +376,8 @@
 	mtx_assert_((m), (what), __FILE__, __LINE__)
 
 #define	mtx_sleep(chan, mtx, pri, wmesg, timo)				\
-	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
 
 #define	mtx_initialized(m)	lock_initalized(&(m)->lock_object)
diff -urN -x -p head-davide/sys/sys/proc.h calloutng/sys/sys/proc.h
--- head-davide/sys/sys/proc.h	2012-12-07 07:27:32.000000000 +0100
+++ calloutng/sys/sys/proc.h	2012-11-26 12:16:36.000000000 +0100
@@ -907,7 +907,7 @@
 void	tidhash_remove(struct thread *);
 void	cpu_idle(int);
 int	cpu_idle_wakeup(int);
-extern	void (*cpu_idle_hook)(void);	/* Hook to machdep CPU idler. */
+extern	void (*cpu_idle_hook)(int);	/* Hook to machdep CPU idler. */
 void	cpu_switch(struct thread *, struct thread *, struct mtx *);
 void	cpu_throw(struct thread *, struct thread *) __dead2;
 void	unsleep(struct thread *);
diff -urN -x -p head-davide/sys/sys/rwlock.h calloutng/sys/sys/rwlock.h
--- head-davide/sys/sys/rwlock.h	2012-12-07 07:27:32.000000000 +0100
+++ calloutng/sys/sys/rwlock.h	2012-12-11 09:30:13.000000000 +0100
@@ -211,7 +211,8 @@
 	rw_runlock(rw);							\
 } while (0)
 #define	rw_sleep(chan, rw, pri, wmesg, timo)				\
-	_sleep((chan), &(rw)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(rw)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
 
 #define	rw_initialized(rw)	lock_initalized(&(rw)->lock_object)
diff -urN -x -p head-davide/sys/sys/sleepqueue.h calloutng/sys/sys/sleepqueue.h
--- head-davide/sys/sys/sleepqueue.h	2012-08-03 20:51:27.000000000 +0200
+++ calloutng/sys/sys/sleepqueue.h	2012-12-11 09:30:13.000000000 +0100
@@ -108,7 +108,14 @@
 void	sleepq_release(void *wchan);
 void	sleepq_remove(struct thread *td, void *wchan);
 int	sleepq_signal(void *wchan, int flags, int pri, int queue);
-void	sleepq_set_timeout(void *wchan, int timo);
+void	_sleepq_set_timeout(void *wchan, struct bintime *bt,
+	    struct bintime *precision, int timo, int flags);
+#define	sleepq_set_timeout(wchan, timo)					\
+	_sleepq_set_timeout((wchan), NULL, NULL, (timo), 0)
+#define	sleepq_set_timeout_flags(wchan, timo, flags)			\
+	_sleepq_set_timeout((wchan), NULL, NULL, (timo), (flags))
+#define	sleepq_set_timeout_bt(wchan, bt, precision)			\
+	_sleepq_set_timeout((wchan), (bt), (precision), 0, 0)
 u_int	sleepq_sleepcnt(void *wchan, int queue);
 int	sleepq_timedwait(void *wchan, int pri);
 int	sleepq_timedwait_sig(void *wchan, int pri);
diff -urN -x -p head-davide/sys/sys/sx.h calloutng/sys/sys/sx.h
--- head-davide/sys/sys/sx.h	2012-08-03 20:51:21.000000000 +0200
+++ calloutng/sys/sys/sx.h	2012-12-11 09:30:13.000000000 +0100
@@ -275,7 +275,8 @@
 #define	sx_unlock(sx)	sx_unlock_((sx), LOCK_FILE, LOCK_LINE)
 
 #define	sx_sleep(chan, sx, pri, wmesg, timo)				\
-	_sleep((chan), &(sx)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(sx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
 
 /*
  * Options passed to sx_init_flags().
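On top of the plumbing above, a condition-variable consumer can now sleep against an absolute bintime deadline; cv_timedwait_sig_bt() expands to _cv_timedwait_sig() with the tick argument zeroed. The fragment below is a sketch only: the mutex/condvar pair, the 10-millisecond interval and the roughly 3% tolerance are assumptions made up for illustration.

/* Illustrative sketch only; the lock, cv and interval are assumptions. */
static int
example_wait_10ms(struct mtx *mtx, struct cv *cv)
{
	struct bintime bt, pr;

	binuptime(&bt);
	FREQ2BT(100, &pr);		/* 1/100 s == 10 ms, relative */
	bintime_add(&bt, &pr);		/* absolute uptime deadline */
	bintime_divpow2(&pr, 5);	/* tolerate interval/32, about 3% */
	mtx_assert(mtx, MA_OWNED);
	return (cv_timedwait_sig_bt(cv, mtx, &bt, &pr));
}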
diff -urN -x -p head-davide/sys/sys/systm.h calloutng/sys/sys/systm.h
--- head-davide/sys/sys/systm.h	2012-12-13 14:16:04.000000000 +0100
+++ calloutng/sys/sys/systm.h	2012-12-12 04:30:59.000000000 +0100
@@ -266,7 +266,7 @@
 void	stopprofclock(struct proc *);
 void	cpu_startprofclock(void);
 void	cpu_stopprofclock(void);
-void	cpu_idleclock(void);
+int	cpu_idleclock(void);
 void	cpu_activeclock(void);
 extern int	cpu_can_deep_sleep;
 extern int	cpu_disable_deep_sleep;
@@ -345,14 +345,24 @@
  * less often.
 */
 int	_sleep(void *chan, struct lock_object *lock, int pri, const char *wmesg,
-	    int timo) __nonnull(1);
+	    int timo, struct bintime *bt, struct bintime *precision,
+	    int flags) __nonnull(1);
 #define	msleep(chan, mtx, pri, wmesg, timo)				\
-	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
+#define	msleep_flags(chan, mtx, pri, wmesg, timo, flags)		\
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, (flags))
+#define	msleep_bt(chan, mtx, pri, wmesg, bt, pr)			\
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), 0, (bt),	\
+	    (pr), 0)
 int	msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int timo)
 	    __nonnull(1);
 int	pause(const char *wmesg, int timo);
 #define	tsleep(chan, pri, wmesg, timo)					\
-	_sleep((chan), NULL, (pri), (wmesg), (timo))
+	_sleep((chan), NULL, (pri), (wmesg), (timo), NULL, NULL, 0)
+#define	tsleep_bt(chan, pri, wmesg, bt, pr)				\
+	_sleep((chan), NULL, (pri), (wmesg), 0, (bt), (pr), 0)
 void	wakeup(void *chan) __nonnull(1);
 void	wakeup_one(void *chan) __nonnull(1);
diff -urN -x -p head-davide/sys/sys/time.h calloutng/sys/sys/time.h
--- head-davide/sys/sys/time.h	2012-09-30 17:51:01.000000000 +0200
+++ calloutng/sys/sys/time.h	2012-12-13 13:57:07.000000000 +0100
@@ -102,6 +102,15 @@
 	bt->frac = (p2 << 32) | (p1 & 0xffffffffull);
 }
 
+static __inline void
+bintime_divpow2(struct bintime *bt, u_int exp)
+{
+
+	bt->frac >>= exp;
+	bt->frac |= (uint64_t)bt->sec << (64 - exp);
+	bt->sec >>= exp;
+}
+
 #define	bintime_clear(a)	((a)->sec = (a)->frac = 0)
 #define	bintime_isset(a)	((a)->sec || (a)->frac)
 #define	bintime_cmp(a, b, cmp)						\
@@ -290,7 +299,13 @@
 extern time_t	time_second;
 extern time_t	time_uptime;
 extern struct bintime boottimebin;
+extern struct bintime halftick_bt;
+extern struct bintime tick_bt;
 extern struct timeval boottime;
+extern int tc_timeexp;
+extern int tc_timepercentage;
+extern int tc_timethreshold;
+extern struct bintime bt_timethreshold;
 
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
@@ -337,6 +352,23 @@
 void	timevaladd(struct timeval *t1, const struct timeval *t2);
 void	timevalsub(struct timeval *t1, const struct timeval *t2);
 int	tvtohz(struct timeval *tv);
+
+#define	TC_DEFAULTPERC	5
+
+#define	BT2FREQ(bt)							\
+	(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) /		\
+	    ((bt)->frac >> 1))
+
+#define	FREQ2BT(freq, bt)						\
+{									\
+	(bt)->sec = 0;							\
+	(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1;	\
+}
+
+#define	TIMESEL(bt, bt2)						\
+	((bintime_cmp((bt2), (&bt_timethreshold), >=)) ?		\
+	    (getbinuptime(bt), 1) : (binuptime(bt), 0))
+
 #else /* !_KERNEL */
 #include
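Putting the systm.h and time.h pieces together, an msleep() caller that cares about wakeup accuracy can switch to msleep_bt() with an explicit deadline and tolerance. The fragment below is a sketch only; the wait channel, mutex, wmesg string and the half-millisecond/50-microsecond numbers are invented for illustration.

/* Illustrative sketch only; not code from the patch. */
static int
example_msleep_500us(void *chan, struct mtx *mtx)
{
	struct bintime bt, pr;

	binuptime(&bt);
	FREQ2BT(2000, &pr);		/* 1/2000 s == 500 us, relative */
	bintime_add(&bt, &pr);		/* absolute uptime deadline */
	FREQ2BT(20000, &pr);		/* 50 us of allowed deviation */
	return (msleep_bt(chan, mtx, PRIBIO, "exampl", &bt, &pr));
}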