diff -urN -x -p head-davide/sys/amd64/amd64/machdep.c calloutng/sys/amd64/amd64/machdep.c --- head-davide/sys/amd64/amd64/machdep.c 2012-12-07 07:27:31.000000000 +0100 +++ calloutng/sys/amd64/amd64/machdep.c 2012-11-12 12:27:29.000000000 +0100 @@ -658,7 +658,7 @@ halt(); } -void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */ +void (*cpu_idle_hook)(int) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ TUNABLE_INT("machdep.idle_mwait", &idle_mwait); @@ -670,7 +670,7 @@ #define STATE_SLEEPING 0x2 static void -cpu_idle_acpi(int busy) +cpu_idle_acpi(int us) { int *state; @@ -682,14 +682,14 @@ if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) - cpu_idle_hook(); + cpu_idle_hook(us); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } static void -cpu_idle_hlt(int busy) +cpu_idle_hlt(int us) { int *state; @@ -730,7 +730,7 @@ #define MWAIT_C4 0x30 static void -cpu_idle_mwait(int busy) +cpu_idle_mwait(int us) { int *state; @@ -753,7 +753,7 @@ } static void -cpu_idle_spin(int busy) +cpu_idle_spin(int us) { int *state; int i; @@ -808,6 +808,7 @@ cpu_idle(int busy) { uint64_t msr; + int us = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); @@ -825,7 +826,7 @@ /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); - cpu_idleclock(); + us = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ @@ -836,7 +837,7 @@ } /* Call main idle method. */ - cpu_idle_fn(busy); + cpu_idle_fn(us); /* Switch timers mack into active mode. */ if (!busy) { diff -urN -x -p head-davide/sys/conf/NOTES calloutng/sys/conf/NOTES --- head-davide/sys/conf/NOTES 2012-12-07 07:27:34.000000000 +0100 +++ calloutng/sys/conf/NOTES 2012-12-11 09:28:02.000000000 +0100 @@ -259,6 +259,8 @@ # SMP Debugging Options: # +# CALLOUT_PROFILING enables rudimentary profiling of the callwheel data +# structure used as backend in callout(9). # PREEMPTION allows the threads that are in the kernel to be preempted by # higher priority [interrupt] threads. It helps with interactivity # and allows interrupt threads to run sooner rather than waiting. @@ -297,6 +299,9 @@ options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" +# Profiling for the callout(9) backend. +options CALLOUT_PROFILING + # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING diff -urN -x -p head-davide/sys/conf/options calloutng/sys/conf/options --- head-davide/sys/conf/options 2012-12-07 07:27:34.000000000 +0100 +++ calloutng/sys/conf/options 2012-11-06 09:39:20.000000000 +0100 @@ -68,6 +68,7 @@ ADAPTIVE_LOCKMGRS ALQ AUDIT opt_global.h +CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h diff -urN -x -p head-davide/sys/dev/acpica/acpi_cpu.c calloutng/sys/dev/acpica/acpi_cpu.c --- head-davide/sys/dev/acpica/acpi_cpu.c 2012-12-07 07:27:11.000000000 +0100 +++ calloutng/sys/dev/acpica/acpi_cpu.c 2012-12-11 09:28:02.000000000 +0100 @@ -168,7 +168,7 @@ static void acpi_cpu_startup(void *arg); static void acpi_cpu_startup_cx(struct acpi_cpu_softc *sc); static void acpi_cpu_cx_list(struct acpi_cpu_softc *sc); -static void acpi_cpu_idle(void); +static void acpi_cpu_idle(int us); static void acpi_cpu_notify(ACPI_HANDLE h, UINT32 notify, void *context); static int acpi_cpu_quirks(void); static int acpi_cpu_usage_sysctl(SYSCTL_HANDLER_ARGS); @@ -954,7 +954,7 @@ * interrupts are re-enabled. 
*/ static void -acpi_cpu_idle() +acpi_cpu_idle(int us) { struct acpi_cpu_softc *sc; struct acpi_cx *cx_next; @@ -980,13 +980,14 @@ } /* Find the lowest state that has small enough latency. */ + us = min(us, sc->cpu_prev_sleep); cx_next_idx = 0; if (cpu_disable_deep_sleep) i = min(sc->cpu_cx_lowest, sc->cpu_non_c3); else i = sc->cpu_cx_lowest; for (; i >= 0; i--) { - if (sc->cpu_cx_states[i].trans_lat * 3 <= sc->cpu_prev_sleep) { + if (sc->cpu_cx_states[i].trans_lat * 3 <= us) { cx_next_idx = i; break; } diff -urN -x -p head-davide/sys/i386/i386/machdep.c calloutng/sys/i386/i386/machdep.c --- head-davide/sys/i386/i386/machdep.c 2012-12-07 07:27:24.000000000 +0100 +++ calloutng/sys/i386/i386/machdep.c 2012-11-12 12:27:30.000000000 +0100 @@ -1220,7 +1220,7 @@ int scheduler_running; static void -cpu_idle_hlt(int busy) +cpu_idle_hlt(int us) { scheduler_running = 1; @@ -1241,7 +1241,7 @@ #endif -void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */ +void (*cpu_idle_hook)(int) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ TUNABLE_INT("machdep.idle_mwait", &idle_mwait); @@ -1253,7 +1253,7 @@ #define STATE_SLEEPING 0x2 static void -cpu_idle_acpi(int busy) +cpu_idle_acpi(int us) { int *state; @@ -1265,7 +1265,7 @@ if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) - cpu_idle_hook(); + cpu_idle_hook(us); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; @@ -1273,7 +1273,7 @@ #ifndef XEN static void -cpu_idle_hlt(int busy) +cpu_idle_hlt(int us) { int *state; @@ -1315,7 +1315,7 @@ #define MWAIT_C4 0x30 static void -cpu_idle_mwait(int busy) +cpu_idle_mwait(int us) { int *state; @@ -1338,7 +1338,7 @@ } static void -cpu_idle_spin(int busy) +cpu_idle_spin(int us) { int *state; int i; @@ -1399,6 +1399,7 @@ #ifndef XEN uint64_t msr; #endif + int us = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); @@ -1418,7 +1419,7 @@ /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); - cpu_idleclock(); + us = cpu_idleclock(); } #ifndef XEN @@ -1431,7 +1432,7 @@ #endif /* Call main idle method. */ - cpu_idle_fn(busy); + cpu_idle_fn(us); /* Switch timers mack into active mode. */ if (!busy) { diff -urN -x -p head-davide/sys/ia64/ia64/machdep.c calloutng/sys/ia64/ia64/machdep.c --- head-davide/sys/ia64/ia64/machdep.c 2012-08-03 20:48:53.000000000 +0200 +++ calloutng/sys/ia64/ia64/machdep.c 2012-11-12 12:27:30.000000000 +0100 @@ -155,7 +155,7 @@ struct msgbuf *msgbufp = NULL; /* Other subsystems (e.g., ACPI) can hook this later. */ -void (*cpu_idle_hook)(void) = NULL; +void (*cpu_idle_hook)(int) = NULL; struct kva_md_info kmi; @@ -392,10 +392,11 @@ cpu_idle(int busy) { register_t ie; + int us = -1; if (!busy) { critical_enter(); - cpu_idleclock(); + us = cpu_idleclock(); } ie = intr_disable(); @@ -404,7 +405,7 @@ if (sched_runnable()) ia64_enable_intr(); else if (cpu_idle_hook != NULL) { - (*cpu_idle_hook)(); + (*cpu_idle_hook)(us); /* The hook must enable interrupts! 
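[Editor's note: a minimal sketch, not part of the patch, of the C-state selection rule used by acpi_cpu_idle() above: a state is eligible only while three times its transition latency fits in the expected idle period (the hint passed down from cpu_idle(), capped by the measured previous sleep). The helper name and array are hypothetical.]

/* Hypothetical helper, not from the patch. */
static int
pick_cx_state(const int *trans_lat, int deepest, int expected_us)
{
	int i;

	/* Deepest allowed state whose wakeup cost fits the idle period. */
	for (i = deepest; i > 0; i--)
		if (trans_lat[i] * 3 <= expected_us)
			break;
	return (i);
}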
*/ } else { ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0); diff -urN -x -p head-davide/sys/kern/kern_clock.c calloutng/sys/kern/kern_clock.c --- head-davide/sys/kern/kern_clock.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_clock.c 2012-08-08 00:53:23.000000000 +0200 @@ -425,6 +425,7 @@ void hardclock_cpu(int usermode) { + struct bintime now; struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; @@ -459,7 +460,8 @@ if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); + binuptime(&now); + callout_process(&now); } /* @@ -549,7 +551,6 @@ if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); /* We are in charge to handle this tick duty. */ if (newticks > 0) { /* Dangerous and no need to call these things concurrently. */ diff -urN -x -p head-davide/sys/kern/kern_clocksource.c calloutng/sys/kern/kern_clocksource.c --- head-davide/sys/kern/kern_clocksource.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/kern_clocksource.c 2012-11-12 13:09:40.000000000 +0100 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include @@ -71,9 +73,8 @@ static void getnextcpuevent(struct bintime *event, int idle); static void getnextevent(struct bintime *event); static int handleevents(struct bintime *now, int fake); -#ifdef SMP -static void cpu_new_callout(int cpu, int ticks); -#endif +static void cpu_new_callout(int cpu, struct bintime bt, + struct bintime bt_opt); static struct mtx et_hw_mtx; @@ -135,6 +136,8 @@ struct bintime nexthard; /* Next hardlock() event. */ struct bintime nextstat; /* Next statclock() event. */ struct bintime nextprof; /* Next profclock() event. */ + struct bintime nextcall; /* Next callout event. */ + struct bintime nextcallopt; /* Next optional callout event. */ #ifdef KDTRACE_HOOKS struct bintime nextcyc; /* Next OpenSolaris cyclics event. */ #endif @@ -144,15 +147,6 @@ static DPCPU_DEFINE(struct pcpu_state, timerstate); -#define FREQ2BT(freq, bt) \ -{ \ - (bt)->sec = 0; \ - (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ -} -#define BT2FREQ(bt) \ - (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ - ((bt)->frac >> 1)) - /* * Timer broadcast IPI handler. */ @@ -238,6 +232,12 @@ } } else state->nextprof = state->nextstat; + if (bintime_cmp(now, &state->nextcallopt, >=) && + (state->nextcallopt.sec != -1)) { + state->nextcall.sec = -1; + state->nextcallopt.sec = -1; + callout_process(now); + } #ifdef KDTRACE_HOOKS if (fake == 0 && cyclic_clock_func != NULL && @@ -269,24 +269,28 @@ static void getnextcpuevent(struct bintime *event, int idle) { - struct bintime tmp; struct pcpu_state *state; - int skip; - + struct bintime tmp; + int hardfreq; + state = DPCPU_PTR(timerstate); - /* Handle hardclock() events. */ + /* Handle hardclock() events, skipping some is CPU is idle. */ *event = state->nexthard; if (idle || (!activetick && !profiling && (timer->et_flags & ET_FLAGS_PERCPU) == 0)) { - skip = idle ? 4 : (stathz / 2); - if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > skip) - skip = tc_min_ticktock_freq; - skip = callout_tickstofirst(hz / skip) - 1; - CTR2(KTR_SPARE2, "skip at %d: %d", curcpu, skip); - tmp = hardperiod; - bintime_mul(&tmp, skip); - bintime_add(event, &tmp); + hardfreq = idle ? 
2 : (stathz / 2); + if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > hardfreq) + hardfreq = tc_min_ticktock_freq; + if (hz > hardfreq) { + tmp = hardperiod; + bintime_mul(&tmp, hz / hardfreq - 1); + bintime_add(event, &tmp); + } } + /* Handle callout events. */ + if (state->nextcall.sec != -1 && + bintime_cmp(event, &state->nextcall, >)) + *event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ if (bintime_cmp(event, &state->nextstat, >)) *event = state->nextstat; @@ -627,10 +631,10 @@ #ifdef KDTRACE_HOOKS state->nextcyc.sec = -1; #endif + state->nextcall.sec = -1; + state->nextcallopt.sec = -1; } -#ifdef SMP callout_new_inserted = cpu_new_callout; -#endif periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) @@ -759,7 +763,7 @@ /* * Switch to idle mode (all ticks handled). */ -void +int cpu_idleclock(void) { struct bintime now, t; @@ -771,7 +775,7 @@ || curcpu == CPU_FIRST() #endif ) - return; + return (-1); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; @@ -787,6 +791,9 @@ if (!periodic) loadtimer(&now, 0); ET_HW_UNLOCK(state); + bintime_sub(&t, &now); + return (t.sec > (INT_MAX >> 20) ? INT_MAX : + ((t.sec < 0) ? 0 : ((t.sec << 20) + (t.frac >> 44)))); } /* @@ -854,52 +861,57 @@ } #endif -#ifdef SMP static void -cpu_new_callout(int cpu, int ticks) +cpu_new_callout(int cpu, struct bintime bt, struct bintime bt_opt) { - struct bintime tmp; + struct bintime now; struct pcpu_state *state; - CTR3(KTR_SPARE2, "new co at %d: on %d in %d", - curcpu, cpu, ticks); + CTR5(KTR_SPARE2, "new co at %d: on %d at %d.%08x%08x", + curcpu, cpu, (int)(bt.sec), (u_int)(bt.frac >> 32), + (u_int)(bt.frac & 0xffffffff)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); - if (state->idle == 0 || busy) { + + /* + * If there is callout time already set earlier -- do nothing. + * This check may appear redundant because we check already in + * callout_process() but this double check guarantees we're safe + * with respect to race conditions between interrupts execution + * and scheduling. + */ + state->nextcallopt = bt_opt; + if (state->nextcall.sec != -1 && + bintime_cmp(&bt, &state->nextcall, >=)) { ET_HW_UNLOCK(state); return; } - /* - * If timer is periodic - just update next event time for target CPU. - * If timer is global - there is chance it is already programmed. - */ - if (periodic || (timer->et_flags & ET_FLAGS_PERCPU) == 0) { - tmp = hardperiod; - bintime_mul(&tmp, ticks - 1); - bintime_add(&tmp, &state->nexthard); - if (bintime_cmp(&tmp, &state->nextevent, <)) - state->nextevent = tmp; - if (periodic || - bintime_cmp(&state->nextevent, &nexttick, >=)) { - ET_HW_UNLOCK(state); - return; - } + state->nextcall = bt; + /* If there is some some other event set earlier -- do nothing. */ + if (bintime_cmp(&state->nextcall, &state->nextevent, >=)) { + ET_HW_UNLOCK(state); + return; } - /* - * Otherwise we have to wake that CPU up, as we can't get present - * bintime to reprogram global timer from here. If timer is per-CPU, - * we by definition can't do it from here. - */ - ET_HW_UNLOCK(state); - if (timer->et_flags & ET_FLAGS_PERCPU) { - state->handle = 1; - ipi_cpu(cpu, IPI_HARDCLOCK); - } else { - if (!cpu_idle_wakeup(cpu)) - ipi_cpu(cpu, IPI_AST); + state->nextevent = state->nextcall; + /* If timer is periodic -- there is nothing to reprogram. */ + if (periodic) { + ET_HW_UNLOCK(state); + return; } -} + /* If timer is global or of the current CPU -- reprogram it. 
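[Editor's note: a minimal sketch, not part of the patch, mirroring the value that cpu_idleclock() above now returns: the time until the next event encoded in 2^-20 second units (about 0.95 us each), so the idle handlers can treat it as a rough microsecond count, with -1 still meaning "no estimate". The helper name is hypothetical.]

static int
bintime_to_idle_hint(const struct bintime *bt)
{

	if (bt->sec < 0)
		return (0);		/* event already due */
	if (bt->sec > (INT_MAX >> 20))
		return (INT_MAX);	/* clamp very long sleeps */
	return ((int)((bt->sec << 20) + (bt->frac >> 44)));
}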
*/ + if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { + binuptime(&now); + loadtimer(&now, 0); + ET_HW_UNLOCK(state); + return; + } + /* Otherwise make other CPU to reprogram it. */ + state->handle = 1; + ET_HW_UNLOCK(state); +#ifdef SMP + ipi_cpu(cpu, IPI_HARDCLOCK); #endif +} /* * Report or change the active event timers hardware. diff -urN -x -p head-davide/sys/kern/kern_condvar.c calloutng/sys/kern/kern_condvar.c --- head-davide/sys/kern/kern_condvar.c 2012-09-30 17:50:32.000000000 +0200 +++ calloutng/sys/kern/kern_condvar.c 2012-12-11 09:30:13.000000000 +0100 @@ -270,12 +270,12 @@ } /* - * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the - * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout - * expires. + * Wait on a condition variable. Returns 0 if the process was resumed by + * cv_signal or cv_broadcast, EWOULDBLOCK if the timeout expires. */ int -_cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo) +_cv_timedwait(struct cv *cvp, struct lock_object *lock, struct bintime *bt, + struct bintime *precision, int timo, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; @@ -311,7 +311,10 @@ DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); - sleepq_set_timeout(cvp, timo); + if (bt == NULL) + sleepq_set_timeout_flags(cvp, timo, flags); + else + sleepq_set_timeout_bt(cvp, bt, precision); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); @@ -336,13 +339,14 @@ } /* - * Wait on a condition variable for at most timo/hz seconds, allowing - * interruption by signals. Returns 0 if the thread was resumed by cv_signal - * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if - * a signal was caught. + * Wait on a condition variable allowing interruption by signals. + * Returns 0 if the thread was resumed by cv_signal or cv_broadcast, + * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR + * or ERESTART if a signal was caught. */ int -_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo) +_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, + struct bintime *bt, struct bintime *precision, int timo, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; @@ -379,7 +383,10 @@ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); - sleepq_set_timeout(cvp, timo); + if (bt == NULL) + sleepq_set_timeout_flags(cvp, timo, flags); + else + sleepq_set_timeout_bt(cvp, bt, precision); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); diff -urN -x -p head-davide/sys/kern/kern_event.c calloutng/sys/kern/kern_event.c --- head-davide/sys/kern/kern_event.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_event.c 2012-12-11 09:30:13.000000000 +0100 @@ -517,25 +517,26 @@ * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. 
*/ -static int -timertoticks(intptr_t data) +static struct bintime +timer2bintime(intptr_t data) { - struct timeval tv; - int tticks; - - tv.tv_sec = data / 1000; - tv.tv_usec = (data % 1000) * 1000; - tticks = tvtohz(&tv); + struct bintime bt, pbt; - return tticks; + getbinuptime(&pbt); + bt.sec = data / 1000; + bt.frac = (data % 1000) * (uint64_t)1844674407309000LL; + bintime_add(&bt, &pbt); + return bt; } static void filt_timerexpire(void *knx) { - struct knote *kn = knx; + struct bintime bt; struct callout *calloutp; + struct knote *kn; + kn = knx; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ @@ -547,9 +548,10 @@ * when we're delayed. */ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) { + bt = timer2bintime(kn->kn_sdata); calloutp = (struct callout *)kn->kn_hook; - callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1, - filt_timerexpire, kn); + callout_reset_bt_on(calloutp, &bt, NULL, filt_timerexpire, kn, + PCPU_GET(cpuid), 0); } } @@ -559,6 +561,7 @@ static int filt_timerattach(struct knote *kn) { + struct bintime bt; struct callout *calloutp; atomic_add_int(&kq_ncallouts, 1); @@ -573,8 +576,9 @@ calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, CALLOUT_MPSAFE); kn->kn_hook = calloutp; - callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata), - filt_timerexpire, kn); + bt = timer2bintime(kn->kn_sdata); + callout_reset_bt_on(calloutp, &bt, NULL, filt_timerexpire, kn, + PCPU_GET(cpuid), 0); return (0); } diff -urN -x -p head-davide/sys/kern/kern_synch.c calloutng/sys/kern/kern_synch.c --- head-davide/sys/kern/kern_synch.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_synch.c 2012-12-11 09:30:13.000000000 +0100 @@ -146,12 +146,13 @@ */ int _sleep(void *ident, struct lock_object *lock, int priority, - const char *wmesg, int timo) + const char *wmesg, int timo, struct bintime *bt, + struct bintime *precision, int flags) { struct thread *td; struct proc *p; struct lock_class *class; - int catch, flags, lock_state, pri, rval; + int catch, sleepq_flags, lock_state, pri, rval; WITNESS_SAVE_DECL(lock_witness); td = curthread; @@ -162,7 +163,7 @@ #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); - KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL, + KASSERT(timo != 0 || bt != NULL || mtx_owned(&Giant) || lock != NULL, ("sleeping without a lock")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); @@ -199,13 +200,13 @@ sleepq_remove(td, td->td_wchan); if (ident == &pause_wchan) - flags = SLEEPQ_PAUSE; + sleepq_flags = SLEEPQ_PAUSE; else - flags = SLEEPQ_SLEEP; + sleepq_flags = SLEEPQ_SLEEP; if (catch) - flags |= SLEEPQ_INTERRUPTIBLE; + sleepq_flags |= SLEEPQ_INTERRUPTIBLE; if (priority & PBDRY) - flags |= SLEEPQ_STOP_ON_BDRY; + sleepq_flags |= SLEEPQ_STOP_ON_BDRY; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", @@ -231,18 +232,20 @@ * stopped, then td will no longer be on a sleep queue upon * return from cursig(). 
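[Editor's note: a minimal sketch, not part of the patch, of the conversion pattern that timer2bintime() above and the later select()/poll() changes rely on: turn a relative timeout into an absolute uptime deadline by adding it to getbinuptime(). One millisecond is 2^64/1000 in bintime fraction units, computed here as (1 << 63) / 500 to avoid 64-bit overflow; the function name is hypothetical.]

static void
ms_to_deadline(int ms, struct bintime *bt)
{
	struct bintime now;

	bt->sec = ms / 1000;
	/* (1 << 63) / 500 == 2^64 / 1000: one millisecond of fraction. */
	bt->frac = (uint64_t)(ms % 1000) * (((uint64_t)1 << 63) / 500);
	getbinuptime(&now);
	bintime_add(bt, &now);
}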
*/ - sleepq_add(ident, lock, wmesg, flags, 0); - if (timo) - sleepq_set_timeout(ident, timo); + sleepq_add(ident, lock, wmesg, sleepq_flags, 0); + if (bt) + sleepq_set_timeout_bt(ident, bt, precision); + else if (timo) + sleepq_set_timeout_flags(ident, timo, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } - if (timo && catch) + if ((timo != 0 || bt != NULL) && catch) rval = sleepq_timedwait_sig(ident, pri); - else if (timo) + else if (timo != 0 || bt != NULL) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); diff -urN -x -p head-davide/sys/kern/kern_tc.c calloutng/sys/kern/kern_tc.c --- head-davide/sys/kern/kern_tc.c 2012-08-03 20:48:45.000000000 +0200 +++ calloutng/sys/kern/kern_tc.c 2012-12-13 11:57:45.000000000 +0100 @@ -119,6 +119,19 @@ SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, ×tepwarnings, 0, "Log time steps"); +struct bintime bt_timethreshold; +struct bintime halftick_bt; +struct bintime tick_bt; +int tc_timeexp; +int tc_timepercentage = TC_DEFAULTPERC; +TUNABLE_INT("kern.timecounter.allowdeviation", &tc_timepercentage); +int tc_timethreshold; +static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_kern_timecounter, OID_AUTO, tc_timepercentage, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, + sysctl_kern_timecounter_adjprecision, "I", + "Allowed deviation from absolute value"); + static void tc_windup(void); static void cpu_tick_calibrate(int); @@ -275,7 +288,7 @@ do { th = timehands; gen = th->th_generation; - bintime2timeval(&th->th_offset, tvp); + Bintime2timeval(&th->th_offset, tvp); } while (gen == 0 || gen != th->th_generation); } @@ -1705,10 +1718,39 @@ tc_windup(); } +static void __inline +tc_adjprecision(void) +{ + struct timespec ts; + int tick_rate; + + tick_rate = hz / tc_tick; + tc_timethreshold = (1000000000 / (tick_rate * tc_timepercentage)) * 100; + tc_timeexp = fls(roundup2(100 / tc_timepercentage, 2)); + ts.tv_sec = tc_timethreshold / 1000000000; + ts.tv_nsec = tc_timethreshold % 1000000000; + timespec2bintime(&ts, &bt_timethreshold); +} + +static int +sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = tc_timepercentage; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + tc_timepercentage = val; + tc_adjprecision(); + return (0); +} + static void inittimecounter(void *dummy) { u_int p; + int tick_rate; /* * Set the initial timeout to @@ -1722,6 +1764,11 @@ tc_tick = (hz + 500) / 1000; else tc_tick = 1; + tc_adjprecision(); + tick_rate = hz / tc_tick; + FREQ2BT(tick_rate, &tick_bt); + halftick_bt = tick_bt; + bintime_divpow2(&halftick_bt, 1); p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); diff -urN -x -p head-davide/sys/kern/kern_time.c calloutng/sys/kern/kern_time.c --- head-davide/sys/kern/kern_time.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/kern_time.c 2012-12-13 13:57:07.000000000 +0100 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -481,38 +482,39 @@ int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt) { - struct timespec ts, ts2, ts3; - struct timeval tv; + struct timespec ts; + struct bintime bt, btt, bt_prec, tmp; int error; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) return (EINVAL); if 
(rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0)) return (0); - getnanouptime(&ts); - timespecadd(&ts, rqt); - TIMESPEC_TO_TIMEVAL(&tv, rqt); - for (;;) { - error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", - tvtohz(&tv)); - getnanouptime(&ts2); - if (error != EWOULDBLOCK) { - if (error == ERESTART) - error = EINTR; - if (rmt != NULL) { - timespecsub(&ts, &ts2); - if (ts.tv_sec < 0) - timespecclear(&ts); - *rmt = ts; - } - return (error); + timespec2bintime(rqt, &tmp); + bt_prec = tmp; + bintime_divpow2(&bt_prec, tc_timeexp); + if (TIMESEL(&bt, &tmp)) + bintime_add(&bt, &tick_bt); + bintime_add(&bt, &tmp); + bintime_add(&bt, &bt_prec); + error = tsleep_bt(&nanowait, PWAIT | PCATCH, "nanslp", &bt, &bt_prec); + TIMESEL(&btt, &tmp); + if (error != EWOULDBLOCK) { + if (error == ERESTART) + error = EINTR; + if (rmt != NULL) { + tmp = bt; + bintime_sub(&tmp, &btt); + bintime2timespec(&tmp, &ts); + if (ts.tv_sec < 0) + timespecclear(&ts); + *rmt = ts; } - if (timespeccmp(&ts2, &ts, >=)) + if (bintime_cmp(&btt, &bt, >=)) return (0); - ts3 = ts; - timespecsub(&ts3, &ts2); - TIMESPEC_TO_TIMEVAL(&tv, &ts3); + return (error); } + return (0); } #ifndef _SYS_SYSPROTO_H_ diff -urN -x -p head-davide/sys/kern/kern_timeout.c calloutng/sys/kern/kern_timeout.c --- head-davide/sys/kern/kern_timeout.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/kern_timeout.c 2012-12-13 13:57:07.000000000 +0100 @@ -37,13 +37,13 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_callout_profiling.h" #include "opt_kdtrace.h" #include #include #include #include -#include #include #include #include @@ -55,6 +55,7 @@ #include #include #include +#include #ifdef SMP #include @@ -68,6 +69,7 @@ SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0, "struct callout *"); +#ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); @@ -80,6 +82,19 @@ static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); +static int avg_depth_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, + "Average number of direct callouts examined per callout_process call. " + "Units = 1/1000"); +static int avg_lockcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, + &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " + "callout_process call. Units = 1/1000"); +static int avg_mpcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, + 0, "Average number of MP direct callouts made per callout_process call. " + "Units = 1/1000"); +#endif /* * TODO: * allocate more timeout table slots when table overflows. @@ -87,58 +102,62 @@ int callwheelsize, callwheelmask; /* - * The callout cpu migration entity represents informations necessary for - * describing the migrating callout to the new callout cpu. + * The callout cpu exec entities represent informations necessary for + * describing the state of callouts currently running on the CPU and the ones + * necessary for migrating callouts to the new callout cpu. In particular, + * the first entry of the array cc_exec_entity holds informations for callout + * running in SWI thread context, while the second one holds informations + * for callout running directly from hardware interrupt context. 
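[Editor's note: a hypothetical worked example, not part of the patch, of how the kern.timecounter.allowdeviation tunable above becomes a per-request precision. If tc_timepercentage were 5, tc_adjprecision() computes tc_timeexp = fls(roundup2(100 / 5, 2)) = fls(20) = 5, and kern_nanosleep() then grants a slop of interval / 2^5, about 3% of the requested interval and always at or below the configured percentage.]

/* Hypothetical helper showing the kern_nanosleep() precision step. */
static void
sleep_precision(const struct bintime *interval, struct bintime *prec)
{

	*prec = *interval;
	bintime_divpow2(prec, tc_timeexp);
}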
* The cached informations are very important for deferring migration when * the migrating callout is already running. */ -struct cc_mig_ent { +struct cc_exec { + struct callout *cc_next; + struct callout *cc_curr; #ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - int ce_migration_ticks; + void (*ce_migration_func)(void *); + void *ce_migration_arg; + int ce_migration_cpu; + struct bintime ce_migration_time; #endif + int cc_cancel; + int cc_waiting; }; /* - * There is one struct callout_cpu per cpu, holding all relevant + * There is one struct callou_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. - * In particular: - * cc_ticks is incremented once per tick in callout_cpu(). - * It tracks the global 'ticks' but in a way that the individual - * threads should not worry about races in the order in which - * hardclock() and hardclock_cpu() run on the various CPUs. - * cc_softclock is advanced in callout_cpu() to point to the - * first entry in cc_callwheel that may need handling. In turn, - * a softclock() is scheduled so it can serve the various entries i - * such that cc_softclock <= i <= cc_ticks . - * XXX maybe cc_softclock and cc_ticks should be volatile ? - * - * cc_ticks is also used in callout_reset_cpu() to determine - * when the callout should be served. */ struct callout_cpu { struct mtx_padalign cc_lock; - struct cc_mig_ent cc_migrating_entity; + struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; struct callout_tailq *cc_callwheel; + struct callout_tailq cc_expireq; struct callout_list cc_callfree; - struct callout *cc_next; - struct callout *cc_curr; + struct bintime cc_firstevent; + struct bintime cc_lastscan; void *cc_cookie; - int cc_ticks; - int cc_softticks; - int cc_cancel; - int cc_waiting; - int cc_firsttick; }; +#define cc_exec_curr cc_exec_entity[0].cc_curr +#define cc_exec_next cc_exec_entity[0].cc_next +#define cc_exec_cancel cc_exec_entity[0].cc_cancel +#define cc_exec_waiting cc_exec_entity[0].cc_waiting +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr +#define cc_exec_next_dir cc_exec_entity[1].cc_next +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting + #ifdef SMP -#define cc_migration_func cc_migrating_entity.ce_migration_func -#define cc_migration_arg cc_migrating_entity.ce_migration_arg -#define cc_migration_cpu cc_migrating_entity.ce_migration_cpu -#define cc_migration_ticks cc_migrating_entity.ce_migration_ticks +#define cc_migration_func cc_exec_entity[0].ce_migration_func +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu +#define cc_migration_time cc_exec_entity[0].ce_migration_time +#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu +#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU @@ -153,40 +172,51 @@ #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) +#define TIME_T_MAX \ + (sizeof(time_t) == (sizeof(int64_t)) ? 
INT64_MAX : INT32_MAX) + static int timeout_cpu; -void (*callout_new_inserted)(int cpu, int ticks) = NULL; +void (*callout_new_inserted)(int cpu, struct bintime bt, + struct bintime bt_opt) = NULL; +static void +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, + int *lockcalls, int *gcalls, int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is curr_callout. - * If curr_callout is non-NULL, threads waiting in + * cc_curr - If a callout is in progress, it is cc_curr. + * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and c_lock held + * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after - * c_lock is successfully acquired. + * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when - * curr_callout is non-NULL. + * cc_curr is non-NULL. */ /* * Resets the migration entity tied to a specific callout cpu. */ static void -cc_cme_cleanup(struct callout_cpu *cc) +cc_cme_cleanup(struct callout_cpu *cc, int direct) { - + + cc->cc_exec_entity[direct].cc_curr = NULL; + cc->cc_exec_entity[direct].cc_next = NULL; + cc->cc_exec_entity[direct].cc_cancel = 0; + cc->cc_exec_entity[direct].cc_waiting = 0; #ifdef SMP - cc->cc_migration_cpu = CPUBLOCK; - cc->cc_migration_ticks = 0; - cc->cc_migration_func = NULL; - cc->cc_migration_arg = NULL; + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; + bintime_clear(&cc->cc_exec_entity[direct].ce_migration_time); + cc->cc_exec_entity[direct].ce_migration_func = NULL; + cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } @@ -194,18 +224,19 @@ * Checks if migration is requested by a specific callout cpu. */ static int -cc_cme_migrating(struct callout_cpu *cc) +cc_cme_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP - return (cc->cc_migration_cpu != CPUBLOCK); + + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif } /* - * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization + * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization * * This code is called very early in the kernel initialization sequence, * and may be called more then once. 
@@ -242,7 +273,9 @@ for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&cc->cc_callwheel[i]); } - cc_cme_cleanup(cc); + TAILQ_INIT(&cc->cc_expireq); + for (i = 0; i < 2; i++) + cc_cme_cleanup(cc, i); if (cc->cc_callout == NULL) return; for (i = 0; i < ncallout; i++) { @@ -330,28 +363,175 @@ SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); +#define CC_HASH_SHIFT 10 + +static inline int +callout_hash(struct bintime *bt) +{ + + return (int) ((bt->sec << CC_HASH_SHIFT) + + (bt->frac >> (64 - CC_HASH_SHIFT))); +} + +static inline int +get_bucket(struct bintime *bt) +{ + + return callout_hash(bt) & callwheelmask; +} + void -callout_tick(void) +callout_process(struct bintime *now) { + struct bintime max, min, next, next_opt, tmp_max, tmp_min; + struct callout *tmp; struct callout_cpu *cc; - int need_softclock; - int bucket; + struct callout_tailq *sc; + int cpu, depth_dir, firstb, mpcalls_dir, lastb, nowb, lockcalls_dir, + need_softclock, exit_allowed, exit_wanted; - /* - * Process callouts at a very low cpu priority, so we don't keep the - * relatively high clock interrupt priority any longer than necessary. - */ need_softclock = 0; + depth_dir = 0; + mpcalls_dir = 0; + lockcalls_dir = 0; cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - cc->cc_firsttick = cc->cc_ticks = ticks; - for (; (cc->cc_softticks - cc->cc_ticks) <= 0; cc->cc_softticks++) { - bucket = cc->cc_softticks & callwheelmask; - if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) { - need_softclock = 1; - break; + cpu = curcpu; + + /* Compute the buckets of the last scan and present times. */ + firstb = callout_hash(&cc->cc_lastscan); + nowb = callout_hash(now); + + /* Compute the last bucket and minimum time of the bucket after it. */ + next = next_opt = *now; + bintime_addx(&next, (uint64_t)3 << (64 - 2)); /* 0.75s */ + next.frac &= (0xffffffffffffffffLLU << (64 - CC_HASH_SHIFT)); + bintime_addx(&next_opt, (uint64_t)3 << (64 - 3)); /* 0.37s */ + lastb = callout_hash(&next) - 1; + + /* + * Check if we wrapped around the entire wheel from the last scan. + * In case, we need to scan entirely the wheel for pending callouts. + */ + if (lastb - firstb >= callwheelsize) + lastb = firstb - 1; + if (nowb - firstb >= callwheelsize) + nowb = firstb - 1; + nowb &= callwheelmask; + lastb &= callwheelmask; + firstb &= callwheelmask; + + /* Iterate callwheel from firstb to nowb and then up to lastb. */ + min.sec = TIME_T_MAX; + min.frac = UINT64_MAX; + max = next; + exit_allowed = 0; + for (;;) { + exit_wanted = 0; + sc = &cc->cc_callwheel[firstb]; + tmp = TAILQ_FIRST(sc); + while (tmp != NULL) { + /* Compute allowed time range for the event */ + tmp_max = tmp_min = tmp->c_time; + if (bintime_isset(&tmp->c_precision)) { + bintime_add(&tmp_max, &tmp->c_precision); + bintime_sub(&tmp_min, &tmp->c_precision); + } + /* Run the callout if present time within allowed. */ + if (bintime_cmp(&tmp_min, now, <=)) { + /* + * Consumer told us the callout may be run + * directly from hardware interrupt context. + */ + if (tmp->c_flags & CALLOUT_DIRECT) { + ++depth_dir; + cc->cc_exec_next_dir = + TAILQ_NEXT(tmp, c_links.tqe); + TAILQ_REMOVE(sc, tmp, c_links.tqe); + softclock_call_cc(tmp, cc, + &mpcalls_dir, &lockcalls_dir, + NULL, 1); + tmp = cc->cc_exec_next_dir; + } else { + TAILQ_INSERT_TAIL(&cc->cc_expireq, + tmp, c_staiter); + TAILQ_REMOVE(sc, tmp, c_links.tqe); + tmp->c_flags |= CALLOUT_PROCESSED; + need_softclock = 1; + tmp = TAILQ_NEXT(tmp, c_links.tqe); + } + continue; + } + /* Skip events from distant future. 
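[Editor's note: a hypothetical worked example, not part of the patch, of callout_hash() above: it concatenates the seconds with the top CC_HASH_SHIFT bits of the fraction, so consecutive buckets are 2^-10 s (about 0.98 ms) apart before get_bucket() masks the result with callwheelmask.]

static int
callout_hash_example(void)
{
	struct bintime bt;

	bt.sec = 2;
	bt.frac = (uint64_t)1 << 63;	/* 2.5 seconds of uptime */
	/* (2 << 10) + (2^63 >> 54) == 2048 + 512 == 2560 */
	return ((int)((bt.sec << 10) + (bt.frac >> (64 - 10))));
}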
*/ + if (bintime_cmp(&tmp_min, &next, >=)) + goto next; + /* + * This is the fist event we're going to process or + * event maximal time is less than present minimal. + * In both cases, take it. + */ + if (bintime_cmp(&tmp_max, &min, <)) { + max = tmp_max; + min = tmp_min; + goto next; + } + /* + * Event minimal time is bigger than present maximal + * time, so it cannot be aggregated. + */ + if (bintime_cmp(&tmp_min, &max, >)) { + exit_wanted = 1; + goto next; + } + /* + * If neither of the two previous happened, just take + * the intersection of events. + */ + min = (bintime_cmp(&tmp_min, &min, >)) ? tmp_min : min; + max = (bintime_cmp(&tmp_max, &max, <)) ? tmp_max : max; +next: + tmp = TAILQ_NEXT(tmp, c_links.tqe); } + /* Stop if we looked far enough into the future. */ + if (firstb == lastb) + break; + /* + * Stop if we looked after present time and found + * some event we can't execute at now. + */ + if (firstb == nowb) + exit_allowed = 1; + if (exit_allowed && exit_wanted) + break; + /* Proceed with the next bucket. */ + firstb = (firstb + 1) & callwheelmask; + } + cc->cc_exec_next_dir = NULL; + if (min.sec != TIME_T_MAX) { + /* + * Now that we found something to aggregate, schedule an + * interrupt in the middle of the previously calculated range. + */ + if (bintime_cmp(&max, &min, !=)) { + bintime_add(&max, &min); + next = max; + next.frac >>= 1; + if (next.sec & 1) + next.frac |= ((uint64_t)1 << 63); + next.sec >>= 1; + } else + next = max; + next_opt = min; } + if (callout_new_inserted != NULL) + (*callout_new_inserted)(cpu, next, next_opt); + cc->cc_firstevent = next; + cc->cc_lastscan = *now; +#ifdef CALLOUT_PROFILING + avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; + avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; + avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; +#endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it @@ -361,33 +541,6 @@ swi_sched(cc->cc_cookie, 0); } -int -callout_tickstofirst(int limit) -{ - struct callout_cpu *cc; - struct callout *c; - struct callout_tailq *sc; - int curticks; - int skip = 1; - - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - curticks = cc->cc_ticks; - while( skip < ncallout && skip < limit ) { - sc = &cc->cc_callwheel[ (curticks+skip) & callwheelmask ]; - /* search scanning ticks */ - TAILQ_FOREACH( c, sc, c_links.tqe ){ - if (c->c_time - curticks <= ncallout) - goto out; - } - skip++; - } -out: - cc->cc_firsttick = curticks + skip; - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); - return (skip); -} - static struct callout_cpu * callout_lock(struct callout *c) { @@ -413,25 +566,42 @@ } static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, int to_ticks, - void (*func)(void *), void *arg, int cpu) +callout_cc_add(struct callout *c, struct callout_cpu *cc, + struct bintime to_bintime, struct bintime precision, void (*func)(void *), + void *arg, int cpu, int flags) { + struct bintime bt; + int bucket; CC_LOCK_ASSERT(cc); - - if (to_ticks <= 0) - to_ticks = 1; + if (bintime_cmp(&to_bintime, &cc->cc_lastscan, <)) + to_bintime = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + if (flags & C_DIRECT_EXEC) + c->c_flags |= CALLOUT_DIRECT; + c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; - c->c_time = ticks + to_ticks; - TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask], - c, c_links.tqe); - if ((c->c_time - cc->cc_firsttick) < 0 && - 
callout_new_inserted != NULL) { - cc->cc_firsttick = c->c_time; - (*callout_new_inserted)(cpu, - to_ticks + (ticks - cc->cc_ticks)); + c->c_time = to_bintime; + c->c_precision = precision; + CTR4(KTR_CALLOUT, "precision set for %p: %d.%08x%08x", + c, c->c_precision.sec, (u_int) (c->c_precision.frac >> 32), + (u_int) (c->c_precision.frac & 0xffffffff)); + bucket = get_bucket(&c->c_time); + TAILQ_INSERT_TAIL(&cc->cc_callwheel[bucket], c, c_links.tqe); + /* + * Inform the eventtimers(4) subsystem there's a new callout + * that has been inserted, but only if really required. + */ + bt = c->c_time; + bintime_add(&bt, &c->c_precision); + if (callout_new_inserted != NULL && + (bintime_cmp(&bt, &cc->cc_firstevent, <) || + !bintime_isset(&cc->cc_firstevent))) { + cc->cc_firstevent = c->c_time; + bt = c->c_time; + bintime_sub(&bt, &c->c_precision); + (*callout_new_inserted)(cpu, c->c_time, bt); } } @@ -447,7 +617,7 @@ static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, - int *lockcalls, int *gcalls) + int *lockcalls, int *gcalls, int direct) { void (*c_func)(void *); void *c_arg; @@ -458,7 +628,8 @@ struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; - int new_cpu, new_ticks; + int flags, new_cpu; + struct bintime new_time; #endif #ifdef DIAGNOSTIC struct bintime bt1, bt2; @@ -480,8 +651,8 @@ c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; - cc->cc_curr = c; - cc->cc_cancel = 0; + cc->cc_exec_entity[direct].cc_curr = c; + cc->cc_exec_entity[direct].cc_cancel = 0; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, sharedlock); @@ -489,14 +660,18 @@ * The callout may have been cancelled * while we switched locks. */ - if (cc->cc_cancel) { + if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ - cc->cc_cancel = 1; - - if (c_lock == &Giant.lock_object) { + cc->cc_exec_entity[direct].cc_cancel = 1; + /* + * In case we're processing a direct callout we + * can't hold giant because holding a sleep mutex + * from hardware interrupt context is not allowed. + */ + if ((c_lock == &Giant.lock_object) && gcalls != NULL) { (*gcalls)++; CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); @@ -513,11 +688,13 @@ #ifdef DIAGNOSTIC binuptime(&bt1); #endif - THREAD_NO_SLEEPING(); + if (!direct) + THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0); c_func(c_arg); SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0); - THREAD_SLEEPING_OK(); + if (!direct) + THREAD_SLEEPING_OK(); #ifdef DIAGNOSTIC binuptime(&bt2); bintime_sub(&bt2, &bt1); @@ -537,17 +714,17 @@ class->lc_unlock(c_lock); skip: CC_LOCK(cc); - KASSERT(cc->cc_curr == c, ("mishandled cc_curr")); - cc->cc_curr = NULL; - if (cc->cc_waiting) { + KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); + cc->cc_exec_entity[direct].cc_curr = NULL; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. 
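[Editor's note: a minimal sketch, not part of the patch, of the bintime halving used when callout_process() above schedules the interrupt at the midpoint of the aggregated [min, max] window: shift the fraction right and carry the low bit of the seconds into its top bit. Valid for the non-negative uptimes used here; the helper name is hypothetical.]

static struct bintime
bintime_half(struct bintime bt)
{

	bt.frac >>= 1;
	if (bt.sec & 1)
		bt.frac |= ((uint64_t)1 << 63);
	bt.sec >>= 1;
	return (bt);
}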
*/ - if (cc_cme_migrating(cc)) { - cc_cme_cleanup(cc); + if (cc_cme_migrating(cc, direct)) { + cc_cme_cleanup(cc, direct); /* * It should be assert here that the callout is not @@ -555,11 +732,11 @@ */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } - cc->cc_waiting = 0; + cc->cc_exec_entity[direct].cc_waiting = 0; CC_UNLOCK(cc); - wakeup(&cc->cc_waiting); + wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); - } else if (cc_cme_migrating(cc)) { + } else if (cc_cme_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP @@ -567,11 +744,11 @@ * If the callout was scheduled for * migration just perform it now. */ - new_cpu = cc->cc_migration_cpu; - new_ticks = cc->cc_migration_ticks; - new_func = cc->cc_migration_func; - new_arg = cc->cc_migration_arg; - cc_cme_cleanup(cc); + new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; + new_time = cc->cc_exec_entity[direct].ce_migration_time; + new_func = cc->cc_exec_entity[direct].ce_migration_func; + new_arg = cc->cc_exec_entity[direct].ce_migration_arg; + cc_cme_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed @@ -589,8 +766,9 @@ c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); - callout_cc_add(c, new_cc, new_ticks, new_func, new_arg, - new_cpu); + flags = (direct) ? C_DIRECT_EXEC : 0; + callout_cc_add(c, new_cc, new_time, c->c_precision, new_func, + new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else @@ -613,7 +791,7 @@ } /* - * The callout mechanism is based on the work of Adam M. Costello and + * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures @@ -633,63 +811,29 @@ { struct callout_cpu *cc; struct callout *c; - struct callout_tailq *bucket; - int curticks; - int steps; /* #steps since we last allowed interrupts */ - int depth; - int mpcalls; - int lockcalls; - int gcalls; - -#ifndef MAX_SOFTCLOCK_STEPS -#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ -#endif /* MAX_SOFTCLOCK_STEPS */ + int depth, gcalls, lockcalls, mpcalls; + depth = 0; mpcalls = 0; lockcalls = 0; gcalls = 0; - depth = 0; - steps = 0; cc = (struct callout_cpu *)arg; CC_LOCK(cc); - while (cc->cc_softticks - 1 != cc->cc_ticks) { - /* - * cc_softticks may be modified by hard clock, so cache - * it while we work on a given bucket. - */ - curticks = cc->cc_softticks; - cc->cc_softticks++; - bucket = &cc->cc_callwheel[curticks & callwheelmask]; - c = TAILQ_FIRST(bucket); - while (c != NULL) { - depth++; - if (c->c_time != curticks) { - c = TAILQ_NEXT(c, c_links.tqe); - ++steps; - if (steps >= MAX_SOFTCLOCK_STEPS) { - cc->cc_next = c; - /* Give interrupts a chance. 
*/ - CC_UNLOCK(cc); - ; /* nothing */ - CC_LOCK(cc); - c = cc->cc_next; - steps = 0; - } - } else { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(bucket, c, c_links.tqe); - softclock_call_cc(c, cc, &mpcalls, - &lockcalls, &gcalls); - steps = 0; - c = cc->cc_next; - } - } + c = TAILQ_FIRST(&cc->cc_expireq); + while (c != NULL) { + ++depth; + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter); + TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter); + softclock_call_cc(c, cc, &mpcalls, &lockcalls, &gcalls, 0); + c = cc->cc_exec_next; } + cc->cc_exec_next = NULL; +#ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; - cc->cc_next = NULL; +#endif CC_UNLOCK(cc); } @@ -704,7 +848,7 @@ * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This - * implementation differs from that one in that although an + * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. @@ -779,28 +923,52 @@ * callout_deactivate() - marks the callout as having been serviced */ int -callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), - void *arg, int cpu) +_callout_reset_on(struct callout *c, struct bintime *bt, + struct bintime *precision, int to_ticks, void (*ftn)(void *), + void *arg, int cpu, int flags) { + struct bintime now, to_bt, pr; struct callout_cpu *cc; - int cancelled = 0; + int bucket, cancelled, direct; + cancelled = 0; + if (bt == NULL) { + pr = to_bt = tick_bt; + getbinuptime(&now); + if (to_ticks > 1) + bintime_mul(&to_bt, to_ticks); + bintime_add(&to_bt, &now); + to_ticks >>= C_PRELGET(flags); + if (to_ticks == 0) + pr = halftick_bt; + else + bintime_mul(&pr, to_ticks); + } else { + to_bt = *bt; + if (precision != NULL) + pr = *precision; + else + bintime_clear(&pr); + } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; + direct = c->c_flags & CALLOUT_DIRECT; cc = callout_lock(c); - if (cc->cc_curr == c) { + if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ - if (c->c_lock != NULL && !cc->cc_cancel) - cancelled = cc->cc_cancel = 1; - if (cc->cc_waiting) { + if (c->c_lock != NULL && + !cc->cc_exec_entity[direct].cc_cancel) + cancelled = + cc->cc_exec_entity[direct].cc_cancel = 1; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. 
@@ -813,12 +981,18 @@ } } if (c->c_flags & CALLOUT_PENDING) { - if (cc->cc_next == c) { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = TAILQ_NEXT(c, + c_links.tqe); + bucket = get_bucket(&c->c_time); + TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, + c_links.tqe); + } else { + if (cc->cc_exec_next == c) + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter); + TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter); } - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); - cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } @@ -830,15 +1004,17 @@ * to a more appropriate moment. */ if (c->c_cpu != cpu) { - if (cc->cc_curr == c) { - cc->cc_migration_cpu = cpu; - cc->cc_migration_ticks = to_ticks; - cc->cc_migration_func = ftn; - cc->cc_migration_arg = arg; + if (cc->cc_exec_entity[direct].cc_curr == c) { + cc->cc_exec_entity[direct].ce_migration_cpu = cpu; + cc->cc_exec_entity[direct].ce_migration_time + = to_bt; + cc->cc_exec_entity[direct].ce_migration_func = ftn; + cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; - CTR5(KTR_CALLOUT, - "migration of %p func %p arg %p in %d to %u deferred", - c, c->c_func, c->c_arg, to_ticks, cpu); + CTR6(KTR_CALLOUT, + "migration of %p func %p arg %p in %d.%08x to %u deferred", + c, c->c_func, c->c_arg, (int)(to_bt.sec), + (u_int)(to_bt.frac >> 32), cpu); CC_UNLOCK(cc); return (cancelled); } @@ -846,9 +1022,10 @@ } #endif - callout_cc_add(c, cc, to_ticks, ftn, arg, cpu); - CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d", - cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks); + callout_cc_add(c, cc, to_bt, pr, ftn, arg, cpu, flags); + CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", + cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_bt.sec), + (u_int)(to_bt.frac >> 32)); CC_UNLOCK(cc); return (cancelled); @@ -876,7 +1053,7 @@ { struct callout_cpu *cc, *old_cc; struct lock_class *class; - int use_lock, sq_locked; + int bucket, direct, sq_locked, use_lock; /* * Some old subsystems don't hold Giant while running a callout_stop(), @@ -892,7 +1069,7 @@ } } else use_lock = 0; - + direct = c->c_flags & CALLOUT_DIRECT; sq_locked = 0; old_cc = NULL; again: @@ -906,7 +1083,7 @@ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_waiting); + sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; @@ -927,12 +1104,13 @@ * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ - if (cc->cc_curr != c) { + if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release( + &cc->cc_exec_entity[direct].cc_waiting); return (0); } @@ -943,8 +1121,7 @@ * just wait for the current invocation to * finish. */ - while (cc->cc_curr == c) { - + while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid @@ -964,7 +1141,8 @@ */ if (!sq_locked) { CC_UNLOCK(cc); - sleepq_lock(&cc->cc_waiting); + sleepq_lock( + &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; @@ -976,13 +1154,16 @@ * will be packed up, just let softclock() * take care of it. 
*/ - cc->cc_waiting = 1; + cc->cc_exec_entity[direct].cc_waiting = 1; DROP_GIANT(); CC_UNLOCK(cc); - sleepq_add(&cc->cc_waiting, + sleepq_add( + &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); - sleepq_wait(&cc->cc_waiting, 0); + sleepq_wait( + &cc->cc_exec_entity[direct].cc_waiting, + 0); sq_locked = 0; old_cc = NULL; @@ -990,7 +1171,8 @@ PICKUP_GIANT(); CC_LOCK(cc); } - } else if (use_lock && !cc->cc_cancel) { + } else if (use_lock && + !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout @@ -998,10 +1180,10 @@ * lock, the callout will be skipped in * softclock(). */ - cc->cc_cancel = 1; + cc->cc_exec_entity[direct].cc_cancel = 1; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - KASSERT(!cc_cme_migrating(cc), + KASSERT(!cc_cme_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); @@ -1020,16 +1202,22 @@ return (0); } if (sq_locked) - sleepq_release(&cc->cc_waiting); - + sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if (cc->cc_next == c) - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe); + bucket = get_bucket(&c->c_time); + TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, + c_links.tqe); + } else { + if (cc->cc_exec_next == c) + cc->cc_exec_next = TAILQ_NEXT(c, c_links.tqe); + TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter); + } callout_cc_del(c, cc); CC_UNLOCK(cc); diff -urN -x -p head-davide/sys/kern/subr_sleepqueue.c calloutng/sys/kern/subr_sleepqueue.c --- head-davide/sys/kern/subr_sleepqueue.c 2012-09-30 17:50:32.000000000 +0200 +++ calloutng/sys/kern/subr_sleepqueue.c 2012-12-11 09:30:13.000000000 +0100 @@ -362,9 +362,11 @@ * Sets a timeout that will remove the current thread from the specified * sleep queue after timo ticks if the thread has not already been awakened. 
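[Editor's note: a minimal sketch, not part of the patch, of consumer-side use of the reworked interface, following the pattern of sleepq_set_timeout and the kqueue timer filter in this patch: an absolute bintime deadline, an optional precision, and C_DIRECT_EXEC to request execution from hardware interrupt context. The callout, handler, and values are hypothetical.]

static struct callout example_co;	/* hypothetical; callout_init()ed elsewhere */

static void
example_handler(void *arg)
{
}

static void
example_arm(void)
{
	struct bintime bt, pr;

	getbinuptime(&bt);
	bt.sec += 1;			/* absolute deadline ~1 s from now */
	bintime_clear(&pr);
	pr.frac = (uint64_t)1 << 54;	/* allow ~1 ms of slop */
	callout_reset_bt_on(&example_co, &bt, &pr, example_handler, NULL,
	    PCPU_GET(cpuid), C_DIRECT_EXEC);
}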
*/ -void -sleepq_set_timeout(void *wchan, int timo) +void +_sleepq_set_timeout(void *wchan, struct bintime *bt, struct bintime *precision, + int timo, int flags) { + struct sleepqueue_chain *sc; struct thread *td; @@ -374,7 +376,12 @@ MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_sleepqueue == NULL); MPASS(wchan != NULL); - callout_reset_curcpu(&td->td_slpcallout, timo, sleepq_timeout, td); + if (bt == NULL) + callout_reset_flags_on(&td->td_slpcallout, timo, + sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); + else + callout_reset_bt_on(&td->td_slpcallout, bt, precision, + sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); } /* diff -urN -x -p head-davide/sys/kern/sys_generic.c calloutng/sys/kern/sys_generic.c --- head-davide/sys/kern/sys_generic.c 2012-12-07 07:27:02.000000000 +0100 +++ calloutng/sys/kern/sys_generic.c 2012-12-13 13:57:07.000000000 +0100 @@ -102,7 +102,8 @@ off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); -static int seltdwait(struct thread *, int); +static int seltdwait(struct thread *, struct bintime *, struct bintime *, + int); static void seltdclear(struct thread *); /* @@ -902,10 +903,12 @@ */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; - struct timeval atv, rtv, ttv; - int error, lf, ndu, timo; + struct bintime abt, precision, rbt; + struct timeval atv; + int error, lf, ndu; u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; + timevalclear(&atv); if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; @@ -996,33 +999,37 @@ if (tvp != NULL) { atv = *tvp; - if (itimerfix(&atv)) { + if (atv.tv_sec < 0 || atv.tv_usec < 0 || + atv.tv_usec >= 1000000) { error = EINVAL; goto done; } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); + timeval2bintime(&atv, &abt); + precision = abt; + bintime_divpow2(&precision, tc_timeexp); + if (TIMESEL(&rbt, &abt)) + bintime_add(&abt, &tick_bt); + bintime_add(&abt, &rbt); + bintime_add(&abt, &precision); } else { - atv.tv_sec = 0; - atv.tv_usec = 0; + abt.sec = 0; + abt.frac = 0; } - timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) + if (abt.sec || abt.frac) { + TIMESEL(&rbt, &abt); + if (bintime_cmp(&rbt, &abt, >=)) break; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? 
diff -urN -x -p head-davide/sys/kern/sys_generic.c calloutng/sys/kern/sys_generic.c
--- head-davide/sys/kern/sys_generic.c	2012-12-07 07:27:02.000000000 +0100
+++ calloutng/sys/kern/sys_generic.c	2012-12-13 13:57:07.000000000 +0100
@@ -102,7 +102,8 @@
 	    off_t, int);
 static void	doselwakeup(struct selinfo *, int);
 static void	seltdinit(struct thread *);
-static int	seltdwait(struct thread *, int);
+static int	seltdwait(struct thread *, struct bintime *, struct bintime *,
+		    int);
 static void	seltdclear(struct thread *);
 
 /*
@@ -902,10 +903,12 @@
 	 */
 	fd_mask s_selbits[howmany(2048, NFDBITS)];
 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
-	struct timeval atv, rtv, ttv;
-	int error, lf, ndu, timo;
+	struct bintime abt, precision, rbt;
+	struct timeval atv;
+	int error, lf, ndu;
 	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
 
+	timevalclear(&atv);
 	if (nd < 0)
 		return (EINVAL);
 	fdp = td->td_proc->p_fd;
@@ -996,33 +999,37 @@
 	if (tvp != NULL) {
 		atv = *tvp;
-		if (itimerfix(&atv)) {
+		if (atv.tv_sec < 0 || atv.tv_usec < 0 ||
+		    atv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
+		timeval2bintime(&atv, &abt);
+		precision = abt;
+		bintime_divpow2(&precision, tc_timeexp);
+		if (TIMESEL(&rbt, &abt))
+			bintime_add(&abt, &tick_bt);
+		bintime_add(&abt, &rbt);
+		bintime_add(&abt, &precision);
 	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
+		abt.sec = 0;
+		abt.frac = 0;
 	}
-	timo = 0;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = selscan(td, ibits, obits, nd);
 		if (error || td->td_retval[0] != 0)
 			break;
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=))
+		if (abt.sec || abt.frac) {
+			TIMESEL(&rbt, &abt);
+			if (bintime_cmp(&rbt, &abt, >=))
 				break;
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
+			error = seltdwait(td, &abt, &precision, 0);
+		}
+		else {
+			error = seltdwait(td, NULL, NULL, 0);
 		}
-		error = seltdwait(td, timo);
 		if (error)
 			break;
 		error = selrescan(td, ibits, obits);
@@ -1254,11 +1261,13 @@
 {
 	struct pollfd *bits;
 	struct pollfd smallbits[32];
-	struct timeval atv, rtv, ttv;
-	int error, timo;
+	struct bintime abt, precision, rbt;
+	struct timeval atv;
+	int error;
 	u_int nfds;
 	size_t ni;
 
+	timevalclear(&atv);
 	nfds = uap->nfds;
 	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
 		return (EINVAL);
@@ -1273,33 +1282,36 @@
 	if (uap->timeout != INFTIM) {
 		atv.tv_sec = uap->timeout / 1000;
 		atv.tv_usec = (uap->timeout % 1000) * 1000;
-		if (itimerfix(&atv)) {
+		if (atv.tv_sec < 0 || atv.tv_usec < 0 ||
+		    atv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
+		timeval2bintime(&atv, &abt);
+		precision = abt;
+		bintime_divpow2(&precision, tc_timeexp);
+		if (TIMESEL(&rbt, &abt))
+			bintime_add(&abt, &tick_bt);
+		bintime_add(&abt, &rbt);
+		bintime_add(&abt, &precision);
 	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
+		abt.sec = 0;
+		abt.frac = 0;
 	}
-	timo = 0;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = pollscan(td, bits, nfds);
 		if (error || td->td_retval[0] != 0)
 			break;
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=))
+		if (abt.sec || abt.frac) {
+			TIMESEL(&rbt, &abt);
+			if (bintime_cmp(&rbt, &abt, >=))
 				break;
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
+			error = seltdwait(td, &abt, &precision, 0);
+		} else {
+			error = seltdwait(td, NULL, NULL, 0);
 		}
-		error = seltdwait(td, timo);
 		if (error)
 			break;
 		error = pollrescan(td);
@@ -1641,7 +1653,8 @@
 }
 
 static int
-seltdwait(struct thread *td, int timo)
+seltdwait(struct thread *td, struct bintime *bt, struct bintime *precision,
+    int timo)
 {
 	struct seltd *stp;
 	int error;
@@ -1660,9 +1673,12 @@
 		mtx_unlock(&stp->st_mtx);
 		return (0);
 	}
-	if (timo > 0)
+	if (bt == NULL && timo > 0)
 		error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
-	else
+	else if (bt != NULL)
+		error = cv_timedwait_sig_bt(&stp->st_wait, &stp->st_mtx,
+		    bt, precision);
+	else
 		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
 	mtx_unlock(&stp->st_mtx);
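The same deadline computation appears twice above, once in select() and once in poll(). Written out once as a hypothetical helper it reads as below; the function name is invented, but the body just restates the pattern from the hunks above: the requested interval becomes an absolute uptime deadline, the allowed precision is the interval divided by 2^tc_timeexp, and TIMESEL() returns 1 when the cheaper getbinuptime() clock was considered good enough, in which case one tick is added to cover its granularity.

/* Hypothetical helper; restates the select()/poll() pattern above. */
static void
timo_to_deadline(struct timeval *tv, struct bintime *abt, struct bintime *pr)
{
	struct bintime rbt;

	timeval2bintime(tv, abt);		/* relative interval */
	*pr = *abt;
	bintime_divpow2(pr, tc_timeexp);	/* tolerance = interval / 2^tc_timeexp */
	if (TIMESEL(&rbt, abt))			/* coarse clock used: add a tick */
		bintime_add(abt, &tick_bt);
	bintime_add(abt, &rbt);			/* make the deadline absolute */
	bintime_add(abt, pr);			/* bias the deadline by the tolerance */
}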
diff -urN -x -p head-davide/sys/netinet/tcp_timer.c calloutng/sys/netinet/tcp_timer.c
--- head-davide/sys/netinet/tcp_timer.c	2012-12-07 07:27:04.000000000 +0100
+++ calloutng/sys/netinet/tcp_timer.c	2012-12-11 09:27:56.000000000 +0100
@@ -712,21 +712,39 @@
 
 #define	ticks_to_msecs(t)	(1000*(t) / hz)
 
+static int
+delta_bintime_in_msecs(struct bintime bt, struct bintime now)
+{
+	bintime_sub(&bt, &now);
+	return (((uint64_t)1000 * (uint64_t)(bt.frac >> 32)) >> 32) +
+	    (bt.sec * 1000);
+}
+
 void
-tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer)
+tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
+    struct xtcp_timer *xtimer)
 {
-	bzero(xtimer, sizeof(struct xtcp_timer));
+	struct bintime bt, now;
+
+	bzero(xtimer, sizeof(*xtimer));
 	if (timer == NULL)
 		return;
-	if (callout_active(&timer->tt_delack))
-		xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks);
-	if (callout_active(&timer->tt_rexmt))
-		xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks);
-	if (callout_active(&timer->tt_persist))
-		xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks);
-	if (callout_active(&timer->tt_keep))
-		xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks);
-	if (callout_active(&timer->tt_2msl))
-		xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks);
+	bintime_clear(&bt);
+	getbinuptime(&now);
+	if (callout_active(&timer->tt_delack))
+		xtimer->tt_delack = delta_bintime_in_msecs(
+		    timer->tt_delack.c_time, now);
+	if (callout_active(&timer->tt_rexmt))
+		xtimer->tt_rexmt = delta_bintime_in_msecs(
+		    timer->tt_rexmt.c_time, now);
+	if (callout_active(&timer->tt_persist))
+		xtimer->tt_persist = delta_bintime_in_msecs(
+		    timer->tt_persist.c_time, now);
+	if (callout_active(&timer->tt_keep))
+		xtimer->tt_keep = delta_bintime_in_msecs(
+		    timer->tt_keep.c_time, now);
+	if (callout_active(&timer->tt_2msl))
+		xtimer->tt_2msl = delta_bintime_in_msecs(
+		    timer->tt_2msl.c_time, now);
 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
 }
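delta_bintime_in_msecs() above leans on the fixed-point layout of struct bintime: frac is a 64-bit binary fraction of one second, so frac >> 32 expresses that fraction in 1/2^32 units, and multiplying by 1000 and shifting right by 32 again yields milliseconds, with each whole second contributing another 1000. A standalone restatement of the conversion (the name is hypothetical, the arithmetic is the same):

/* Hypothetical standalone version of the conversion used above. */
static int
bintime_to_msecs(struct bintime bt)
{

	/* (frac / 2^64) * 1000 in two 32-bit steps, plus whole seconds. */
	return ((((uint64_t)1000 * (uint64_t)(bt.frac >> 32)) >> 32) +
	    (bt.sec * 1000));
}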
diff -urN -x -p head-davide/sys/ofed/include/linux/timer.h calloutng/sys/ofed/include/linux/timer.h
--- head-davide/sys/ofed/include/linux/timer.h	2012-08-03 20:49:21.000000000 +0200
+++ calloutng/sys/ofed/include/linux/timer.h	2012-12-11 09:30:13.000000000 +0100
@@ -38,10 +38,9 @@
 	struct callout	timer_callout;
 	void		(*function)(unsigned long);
 	unsigned long	data;
+	int		expires;
 };
 
-#define	expires	timer_callout.c_time
-
 static inline void
 _timer_fn(void *context)
 {
@@ -65,13 +64,16 @@
 	callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE);	\
 } while (0)
 
-#define	mod_timer(timer, expire)				\
-	callout_reset(&(timer)->timer_callout, (expire) - jiffies,	\
-	    _timer_fn, (timer))
+#define	mod_timer(timer, exp)					\
+do {								\
+	(timer)->expires = (exp);				\
+	callout_reset(&(timer)->timer_callout, (exp) - jiffies,	\
+	    _timer_fn, (timer));				\
+} while (0)
 
 #define	add_timer(timer)					\
 	callout_reset(&(timer)->timer_callout,			\
-	    (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer))
+	    (timer)->expires - jiffies, _timer_fn, (timer))
 
 #define	del_timer(timer)	callout_stop(&(timer)->timer_callout)
 #define	del_timer_sync(timer)	callout_drain(&(timer)->timer_callout)
diff -urN -x -p head-davide/sys/pc98/pc98/machdep.c calloutng/sys/pc98/pc98/machdep.c
--- head-davide/sys/pc98/pc98/machdep.c	2012-12-07 07:27:21.000000000 +0100
+++ calloutng/sys/pc98/pc98/machdep.c	2012-11-26 12:16:35.000000000 +0100
@@ -1145,7 +1145,7 @@
 #define	STATE_SLEEPING	0x2
 
 static void
-cpu_idle_hlt(int busy)
+cpu_idle_hlt(int us)
 {
 	int *state;
 
@@ -1186,7 +1186,7 @@
 #define	MWAIT_C4	0x30
 
 static void
-cpu_idle_mwait(int busy)
+cpu_idle_mwait(int us)
 {
 	int *state;
 
@@ -1209,7 +1209,7 @@
 }
 
 static void
-cpu_idle_spin(int busy)
+cpu_idle_spin(int us)
 {
 	int *state;
 	int i;
@@ -1234,6 +1234,7 @@
 void
 cpu_idle(int busy)
 {
+	int us = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
@@ -1251,11 +1252,11 @@
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
-		cpu_idleclock();
+		us = cpu_idleclock();
 	}
 
 	/* Call main idle method. */
-	cpu_idle_fn(busy);
+	cpu_idle_fn(us);
 
 	/* Switch timers mack into active mode. */
 	if (!busy) {
diff -urN -x -p head-davide/sys/powerpc/powerpc/cpu.c calloutng/sys/powerpc/powerpc/cpu.c
--- head-davide/sys/powerpc/powerpc/cpu.c	2012-08-03 20:48:33.000000000 +0200
+++ calloutng/sys/powerpc/powerpc/cpu.c	2012-11-12 12:27:29.000000000 +0100
@@ -79,9 +79,9 @@
 static void	cpu_booke_setup(int cpuid, uint16_t vers);
 
 int powerpc_pow_enabled;
-void (*cpu_idle_hook)(void) = NULL;
-static void	cpu_idle_60x(void);
-static void	cpu_idle_booke(void);
+void (*cpu_idle_hook)(int) = NULL;
+static void	cpu_idle_60x(int);
+static void	cpu_idle_booke(int);
 
 struct cputab {
 	const char	*name;
@@ -516,6 +516,7 @@
 void
 cpu_idle(int busy)
 {
+	int us = -1;
 
 #ifdef INVARIANTS
 	if ((mfmsr() & PSL_EE) != PSL_EE) {
@@ -531,9 +532,9 @@
 	if (cpu_idle_hook != NULL) {
 		if (!busy) {
 			critical_enter();
-			cpu_idleclock();
+			us = cpu_idleclock();
 		}
-		cpu_idle_hook();
+		cpu_idle_hook(us);
 		if (!busy) {
 			cpu_activeclock();
 			critical_exit();
@@ -551,7 +552,7 @@
 }
 
 static void
-cpu_idle_60x(void)
+cpu_idle_60x(int us)
 {
 	register_t msr;
 	uint16_t vers;
@@ -586,7 +587,7 @@
 }
 
 static void
-cpu_idle_booke(void)
+cpu_idle_booke(int us)
 {
 	register_t msr;
diff -urN -x -p head-davide/sys/powerpc/ps3/platform_ps3.c calloutng/sys/powerpc/ps3/platform_ps3.c
--- head-davide/sys/powerpc/ps3/platform_ps3.c	2012-08-03 20:48:33.000000000 +0200
+++ calloutng/sys/powerpc/ps3/platform_ps3.c	2012-12-11 09:28:07.000000000 +0100
@@ -70,7 +70,7 @@
 static struct cpu_group *ps3_smp_topo(platform_t);
 #endif
 static void ps3_reset(platform_t);
-static void ps3_cpu_idle(void);
+static void ps3_cpu_idle(int);
 
 static platform_method_t ps3_methods[] = {
 	PLATFORMMETHOD(platform_probe,		ps3_probe),
@@ -245,7 +245,7 @@
 }
 
 static void
-ps3_cpu_idle(void)
+ps3_cpu_idle(int us)
 {
 	lv1_pause(0);
 }
diff -urN -x -p head-davide/sys/powerpc/wii/platform_wii.c calloutng/sys/powerpc/wii/platform_wii.c
--- head-davide/sys/powerpc/wii/platform_wii.c	2012-12-07 07:27:34.000000000 +0100
+++ calloutng/sys/powerpc/wii/platform_wii.c	2012-12-11 09:28:07.000000000 +0100
@@ -60,7 +60,7 @@
 		    int *, struct mem_region **, int *);
 static unsigned long wii_timebase_freq(platform_t, struct cpuref *cpuref);
 static void wii_reset(platform_t);
-static void wii_cpu_idle(void);
+static void wii_cpu_idle(int);
 
 static platform_method_t wii_methods[] = {
 	PLATFORMMETHOD(platform_probe,		wii_probe),
@@ -155,6 +155,6 @@
 }
 
 static void
-wii_cpu_idle(void)
+wii_cpu_idle(int us)
 {
 }
diff -urN -x -p head-davide/sys/sys/_callout.h calloutng/sys/sys/_callout.h
--- head-davide/sys/sys/_callout.h	2012-08-03 20:51:25.000000000 +0200
+++ calloutng/sys/sys/_callout.h	2012-11-06 09:39:17.000000000 +0100
@@ -39,6 +39,7 @@
 #define _SYS__CALLOUT_H
 
 #include <sys/queue.h>
+#include <sys/time.h>
 
 struct lock_object;
 
@@ -50,7 +51,9 @@
 		SLIST_ENTRY(callout) sle;
 		TAILQ_ENTRY(callout) tqe;
 	} c_links;
-	int	c_time;				/* ticks to the event */
+	TAILQ_ENTRY(callout) c_staiter;
+	struct bintime c_time;			/* absolute time of the event */
+	struct bintime c_precision;		/* delta allowed wrt opt */
 	void	*c_arg;				/* function argument */
 	void	(*c_func)(void *);		/* function to call */
 	struct lock_object *c_lock;		/* lock to handle */
diff -urN -x -p head-davide/sys/sys/callout.h calloutng/sys/sys/callout.h
--- head-davide/sys/sys/callout.h	2012-08-03 20:51:21.000000000 +0200
+++ calloutng/sys/sys/callout.h	2012-12-11 09:30:13.000000000 +0100
@@ -47,6 +47,14 @@
 #define	CALLOUT_RETURNUNLOCKED	0x0010 /* handler returns with mtx unlocked */
 #define	CALLOUT_SHAREDLOCK	0x0020 /* callout lock held in shared mode */
 #define	CALLOUT_DFRMIGRATION	0x0040 /* callout in deferred migration mode */
+#define	CALLOUT_PROCESSED	0x0080 /* callout in wheel or processing list? */
+#define	CALLOUT_DIRECT		0x0100 /* allow exec from hw int context */
+
+#define	C_DIRECT_EXEC		0x0001 /* direct execution of callout */
+#define	C_PRELBITS		7
+#define	C_PRELRANGE		((1 << C_PRELBITS) - 1)
+#define	C_PRELSET(x)		((x) << 1)
+#define	C_PRELGET(x)		(((x) >> 1) & C_PRELRANGE)
 
 struct callout_handle {
 	struct callout *callout;
@@ -67,7 +75,16 @@
 	_callout_init_lock((c), ((rw) != NULL) ? &(rw)->lock_object :	\
 	    NULL, (flags))
 #define	callout_pending(c)	((c)->c_flags & CALLOUT_PENDING)
-int	callout_reset_on(struct callout *, int, void (*)(void *), void *, int);
+int	_callout_reset_on(struct callout *, struct bintime *,
+	    struct bintime *, int, void (*)(void *), void *, int, int);
+#define	callout_reset_on(c, to_ticks, fn, arg, cpu)			\
+	_callout_reset_on((c), NULL, NULL, (to_ticks), (fn), (arg),	\
+	    (cpu), 0)
+#define	callout_reset_flags_on(c, to_ticks, fn, arg, cpu, flags)	\
+	_callout_reset_on((c), NULL, NULL, (to_ticks), (fn), (arg), (cpu), \
+	    (flags))
+#define	callout_reset_bt_on(c, bt, pr, fn, arg, cpu, flags)		\
+	_callout_reset_on((c), (bt), (pr), 0, (fn), (arg), (cpu), (flags))
 #define	callout_reset(c, on_tick, fn, arg)				\
 	callout_reset_on((c), (on_tick), (fn), (arg), (c)->c_cpu)
 #define	callout_reset_curcpu(c, on_tick, fn, arg)			\
@@ -78,9 +95,9 @@
 	callout_schedule_on((c), (on_tick), PCPU_GET(cpuid))
 #define	callout_stop(c)		_callout_stop_safe(c, 0)
 int	_callout_stop_safe(struct callout *, int);
-void	callout_tick(void);
-int	callout_tickstofirst(int limit);
-extern void (*callout_new_inserted)(int cpu, int ticks);
+void	callout_process(struct bintime *);
+extern void (*callout_new_inserted)(int cpu, struct bintime bt,
+	    struct bintime);
 
 #endif
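The new callout_reset_bt_on() wrapper takes an absolute bintime deadline plus an allowed precision, and C_DIRECT_EXEC asks for the handler to run directly from hard-interrupt context. Below is a minimal sketch of a self-rearming caller; the softc, its fields, the 100-microsecond period and the 5-microsecond slop are invented for illustration, and the deadline is an absolute binuptime value, matching how the sleepqueue code earlier in this patch passes it.

/* Illustrative sketch only; "example_softc" is not from the patch. */
struct example_softc {
	struct callout	sc_callout;
};

static void
example_tick(void *arg)
{
	struct example_softc *sc = arg;
	struct bintime bt, pr;

	binuptime(&bt);
	FREQ2BT(10000, &pr);		/* 1/10000 s == 100 us period */
	bintime_add(&bt, &pr);		/* absolute deadline of the next tick */
	FREQ2BT(200000, &pr);		/* 1/200000 s == 5 us of allowed slop */
	callout_reset_bt_on(&sc->sc_callout, &bt, &pr, example_tick, sc,
	    PCPU_GET(cpuid), C_DIRECT_EXEC);
}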
diff -urN -x -p head-davide/sys/sys/condvar.h calloutng/sys/sys/condvar.h
--- head-davide/sys/sys/condvar.h	2012-08-03 20:51:21.000000000 +0200
+++ calloutng/sys/sys/condvar.h	2012-12-11 09:30:13.000000000 +0100
@@ -55,8 +55,12 @@
 void	_cv_wait(struct cv *cvp, struct lock_object *lock);
 void	_cv_wait_unlock(struct cv *cvp, struct lock_object *lock);
 int	_cv_wait_sig(struct cv *cvp, struct lock_object *lock);
-int	_cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo);
-int	_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo);
+int	_cv_timedwait(struct cv *cvp, struct lock_object *lock,
+	    struct bintime *bt, struct bintime *precision, int timo,
+	    int flags);
+int	_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock,
+	    struct bintime *bt, struct bintime *precision, int timo,
+	    int flags);
 
 void	cv_signal(struct cv *cvp);
 void	cv_broadcastpri(struct cv *cvp, int pri);
@@ -68,9 +72,23 @@
 #define	cv_wait_sig(cvp, lock)						\
 	_cv_wait_sig((cvp), &(lock)->lock_object)
 #define	cv_timedwait(cvp, lock, timo)					\
-	_cv_timedwait((cvp), &(lock)->lock_object, (timo))
+	_cv_timedwait((cvp), &(lock)->lock_object, NULL, NULL,		\
+	    (timo), 0)
+#define	cv_timedwait_bt(cvp, lock, bt, pr)				\
+	_cv_timedwait((cvp), &(lock)->lock_object, (bt),		\
+	    (pr), 0, 0)
+#define	cv_timedwait_sig_bt(cvp, lock, bt, pr)				\
+	_cv_timedwait_sig((cvp), &(lock)->lock_object, (bt), (pr), 0,	\
+	    0)
+#define	cv_timedwait_flags(cvp, lock, timo, flags)			\
+	_cv_timedwait((cvp), &(lock)->lock_object, NULL, NULL, (timo),	\
+	    (flags))
 #define	cv_timedwait_sig(cvp, lock, timo)				\
-	_cv_timedwait_sig((cvp), &(lock)->lock_object, (timo))
+	_cv_timedwait_sig((cvp), &(lock)->lock_object, NULL, NULL,	\
+	    (timo), 0)
+#define	cv_timedwait_sig_flags(cvp, lock, timo, flags)			\
+	_cv_timedwait_sig((cvp), &(lock)->lock_object, NULL, NULL,	\
+	    (timo), (flags))
 
 #define	cv_broadcast(cvp)	cv_broadcastpri(cvp, 0)
diff -urN -x -p head-davide/sys/sys/mutex.h calloutng/sys/sys/mutex.h
--- head-davide/sys/sys/mutex.h	2012-12-07 07:27:32.000000000 +0100
+++ calloutng/sys/sys/mutex.h	2012-12-11 09:30:13.000000000 +0100
@@ -376,7 +376,8 @@
 	mtx_assert_((m), (what), __FILE__, __LINE__)
 
 #define	mtx_sleep(chan, mtx, pri, wmesg, timo)				\
-	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
 
 #define	mtx_initialized(m)	lock_initalized(&(m)->lock_object)
diff -urN -x -p head-davide/sys/sys/proc.h calloutng/sys/sys/proc.h
--- head-davide/sys/sys/proc.h	2012-12-07 07:27:32.000000000 +0100
+++ calloutng/sys/sys/proc.h	2012-11-26 12:16:36.000000000 +0100
@@ -907,7 +907,7 @@
 void	tidhash_remove(struct thread *);
 void	cpu_idle(int);
 int	cpu_idle_wakeup(int);
-extern	void (*cpu_idle_hook)(void);	/* Hook to machdep CPU idler. */
+extern	void (*cpu_idle_hook)(int);	/* Hook to machdep CPU idler. */
 void	cpu_switch(struct thread *, struct thread *, struct mtx *);
 void	cpu_throw(struct thread *, struct thread *) __dead2;
 void	unsleep(struct thread *);
diff -urN -x -p head-davide/sys/sys/rwlock.h calloutng/sys/sys/rwlock.h
--- head-davide/sys/sys/rwlock.h	2012-12-07 07:27:32.000000000 +0100
+++ calloutng/sys/sys/rwlock.h	2012-12-11 09:30:13.000000000 +0100
@@ -211,7 +211,8 @@
 	rw_runlock(rw);							\
 } while (0)
 #define	rw_sleep(chan, rw, pri, wmesg, timo)				\
-	_sleep((chan), &(rw)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(rw)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
 
 #define	rw_initialized(rw)	lock_initalized(&(rw)->lock_object)
diff -urN -x -p head-davide/sys/sys/sleepqueue.h calloutng/sys/sys/sleepqueue.h
--- head-davide/sys/sys/sleepqueue.h	2012-08-03 20:51:27.000000000 +0200
+++ calloutng/sys/sys/sleepqueue.h	2012-12-11 09:30:13.000000000 +0100
@@ -108,7 +108,14 @@
 void	sleepq_release(void *wchan);
 void	sleepq_remove(struct thread *td, void *wchan);
 int	sleepq_signal(void *wchan, int flags, int pri, int queue);
-void	sleepq_set_timeout(void *wchan, int timo);
+void	_sleepq_set_timeout(void *wchan, struct bintime *bt,
+	    struct bintime *precision, int timo, int flags);
+#define	sleepq_set_timeout(wchan, timo)					\
+	_sleepq_set_timeout((wchan), NULL, NULL, (timo), 0)
+#define	sleepq_set_timeout_flags(wchan, timo, flags)			\
+	_sleepq_set_timeout((wchan), NULL, NULL, (timo), (flags))
+#define	sleepq_set_timeout_bt(wchan, bt, precision)			\
+	_sleepq_set_timeout((wchan), (bt), (precision), 0, 0)
 u_int	sleepq_sleepcnt(void *wchan, int queue);
 int	sleepq_timedwait(void *wchan, int pri);
 int	sleepq_timedwait_sig(void *wchan, int pri);
diff -urN -x -p head-davide/sys/sys/sx.h calloutng/sys/sys/sx.h
--- head-davide/sys/sys/sx.h	2012-08-03 20:51:21.000000000 +0200
+++ calloutng/sys/sys/sx.h	2012-12-11 09:30:13.000000000 +0100
@@ -275,7 +275,8 @@
 #define	sx_unlock(sx)	sx_unlock_((sx), LOCK_FILE, LOCK_LINE)
 
 #define	sx_sleep(chan, sx, pri, wmesg, timo)				\
-	_sleep((chan), &(sx)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(sx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
 
 /*
  * Options passed to sx_init_flags().
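On top of the plumbing above, a condition-variable consumer can now sleep against an absolute bintime deadline; cv_timedwait_sig_bt() expands to _cv_timedwait_sig() with the tick argument zeroed. The fragment below is a sketch only: the mutex/condvar pair, the 10-millisecond interval and the roughly 3% tolerance are assumptions made up for illustration.

/* Illustrative sketch only; the lock, cv and interval are assumptions. */
static int
example_wait_10ms(struct mtx *mtx, struct cv *cv)
{
	struct bintime bt, pr;

	binuptime(&bt);
	FREQ2BT(100, &pr);		/* 1/100 s == 10 ms, relative */
	bintime_add(&bt, &pr);		/* absolute uptime deadline */
	bintime_divpow2(&pr, 5);	/* tolerate interval/32, about 3% */
	mtx_assert(mtx, MA_OWNED);
	return (cv_timedwait_sig_bt(cv, mtx, &bt, &pr));
}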
diff -urN -x -p head-davide/sys/sys/systm.h calloutng/sys/sys/systm.h
--- head-davide/sys/sys/systm.h	2012-12-13 14:16:04.000000000 +0100
+++ calloutng/sys/sys/systm.h	2012-12-12 04:30:59.000000000 +0100
@@ -266,7 +266,7 @@
 void	stopprofclock(struct proc *);
 void	cpu_startprofclock(void);
 void	cpu_stopprofclock(void);
-void	cpu_idleclock(void);
+int	cpu_idleclock(void);
 void	cpu_activeclock(void);
 extern int	cpu_can_deep_sleep;
 extern int	cpu_disable_deep_sleep;
@@ -345,14 +345,24 @@
  * less often.
 */
 int	_sleep(void *chan, struct lock_object *lock, int pri, const char *wmesg,
-	    int timo) __nonnull(1);
+	    int timo, struct bintime *bt, struct bintime *precision,
+	    int flags) __nonnull(1);
 #define	msleep(chan, mtx, pri, wmesg, timo)				\
-	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo))
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, 0)
+#define	msleep_flags(chan, mtx, pri, wmesg, timo, flags)		\
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo),	\
+	    NULL, NULL, (flags))
+#define	msleep_bt(chan, mtx, pri, wmesg, bt, pr)			\
+	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), 0, (bt),	\
+	    (pr), 0)
 int	msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int timo)
 	    __nonnull(1);
 int	pause(const char *wmesg, int timo);
 #define	tsleep(chan, pri, wmesg, timo)					\
-	_sleep((chan), NULL, (pri), (wmesg), (timo))
+	_sleep((chan), NULL, (pri), (wmesg), (timo), NULL, NULL, 0)
+#define	tsleep_bt(chan, pri, wmesg, bt, pr)				\
+	_sleep((chan), NULL, (pri), (wmesg), 0, (bt), (pr), 0)
 void	wakeup(void *chan) __nonnull(1);
 void	wakeup_one(void *chan) __nonnull(1);
diff -urN -x -p head-davide/sys/sys/time.h calloutng/sys/sys/time.h
--- head-davide/sys/sys/time.h	2012-09-30 17:51:01.000000000 +0200
+++ calloutng/sys/sys/time.h	2012-12-13 13:57:07.000000000 +0100
@@ -102,6 +102,15 @@
 	bt->frac = (p2 << 32) | (p1 & 0xffffffffull);
 }
 
+static __inline void
+bintime_divpow2(struct bintime *bt, u_int exp)
+{
+
+	bt->frac >>= exp;
+	bt->frac |= (uint64_t)bt->sec << (64 - exp);
+	bt->sec >>= exp;
+}
+
 #define	bintime_clear(a)	((a)->sec = (a)->frac = 0)
 #define	bintime_isset(a)	((a)->sec || (a)->frac)
 #define	bintime_cmp(a, b, cmp)						\
@@ -290,7 +299,13 @@
 extern time_t	time_second;
 extern time_t	time_uptime;
 extern struct bintime boottimebin;
+extern struct bintime halftick_bt;
+extern struct bintime tick_bt;
 extern struct timeval boottime;
+extern int tc_timeexp;
+extern int tc_timepercentage;
+extern int tc_timethreshold;
+extern struct bintime bt_timethreshold;
 
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
@@ -337,6 +352,23 @@
 void	timevaladd(struct timeval *t1, const struct timeval *t2);
 void	timevalsub(struct timeval *t1, const struct timeval *t2);
 int	tvtohz(struct timeval *tv);
+
+#define	TC_DEFAULTPERC	5
+
+#define	BT2FREQ(bt)							\
+	(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) /		\
+	    ((bt)->frac >> 1))
+
+#define	FREQ2BT(freq, bt)						\
+{									\
+	(bt)->sec = 0;							\
+	(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1;	\
+}
+
+#define	TIMESEL(bt, bt2)						\
+	((bintime_cmp((bt2), (&bt_timethreshold), >=)) ?		\
+	    (getbinuptime(bt), 1) : (binuptime(bt), 0))
+
 #else /* !_KERNEL */
 #include
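Putting the systm.h and time.h pieces together, an msleep() caller that cares about wakeup accuracy can switch to msleep_bt() with an explicit deadline and tolerance. The fragment below is a sketch only; the wait channel, mutex, wmesg string and the half-millisecond/50-microsecond numbers are invented for illustration.

/* Illustrative sketch only; not code from the patch. */
static int
example_msleep_500us(void *chan, struct mtx *mtx)
{
	struct bintime bt, pr;

	binuptime(&bt);
	FREQ2BT(2000, &pr);		/* 1/2000 s == 500 us, relative */
	bintime_add(&bt, &pr);		/* absolute uptime deadline */
	FREQ2BT(20000, &pr);		/* 50 us of allowed deviation */
	return (msleep_bt(chan, mtx, PRIBIO, "exampl", &bt, &pr));
}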