Index: sys/kern/kern_timeout.c
===================================================================
--- sys/kern/kern_timeout.c	(revision 238497)
+++ sys/kern/kern_timeout.c	(working copy)
@@ -117,6 +117,7 @@ struct callout_cpu {
 	struct callout_tailq	cc_expireq;
 	struct callout_list	cc_callfree;
 	struct callout		*cc_next;
+	struct callout		*cc_next_direct;
 	struct callout		*cc_curr;
 	struct bintime		cc_firstevent;
 	struct bintime		cc_lastscan;
@@ -145,6 +146,10 @@ struct callout_cpu cc_cpu;
 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
 #define	C_PRECISION	0x2
 
+#ifndef MAX_SOFTCLOCK_STEPS
+#define	MAX_SOFTCLOCK_STEPS	100	/* Maximum allowed value of steps. */
+#endif /* MAX_SOFTCLOCK_STEPS */
+
 #define	FREQ2BT(freq, bt)						\
 {									\
 	(bt)->sec = 0;							\
@@ -348,151 +353,15 @@ get_bucket(struct bintime *bt)
 	return callout_hash(bt) & callwheelmask;
 }
 
-void
-callout_process(struct bintime *now)
+static inline void
+callout_stats(int depth, int mpcalls, int lockcalls, int gcalls)
 {
-	struct bintime max, min, next, tmp_max, tmp_min;
-	struct callout *tmp;
-	struct callout_cpu *cc;
-	struct callout_tailq *sc;
-	int cpu, first, future, last, need_softclock;
-
-	/*
-	 * Process callouts at a very low cpu priority, so we don't keep the
-	 * relatively high clock interrupt priority any longer than necessary.
-	 */
-	need_softclock = 0;
-	cc = CC_SELF();
-	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	cpu = curcpu;
-	first = callout_hash(&cc->cc_lastscan);
-	last = callout_hash(now);
-	/*
-	 * Check if we wrapped around the entire wheel from the last scan.
-	 * In case, we need to scan entirely the wheel for pending callouts.
-	 */
-	last = (last - first >= callwheelsize) ? (first - 1) & callwheelmask :
-	    last & callwheelmask;
-	first &= callwheelmask;
-	for (;;) {
-		sc = &cc->cc_callwheel[first];
-		TAILQ_FOREACH(tmp, sc, c_links.tqe) {
-			next = tmp->c_time;
-			bintime_sub(&next, &tmp->c_precision);
-			if (bintime_cmp(&next, now, <=)) {
-				/*
-				 * Consumer told us the callout may be run
-				 * directly from hardware interrupt context.
-				 */
-				if (tmp->c_flags & CALLOUT_DIRECT) {
-					tmp->c_func(tmp->c_arg);
-					TAILQ_REMOVE(sc, tmp, c_links.tqe);
-					tmp->c_flags &= ~CALLOUT_PENDING;
-				} else {
-					TAILQ_INSERT_TAIL(&cc->cc_expireq,
-					    tmp, c_staiter);
-					TAILQ_REMOVE(sc, tmp, c_links.tqe);
-					tmp->c_flags |= CALLOUT_PROCESSED;
-					need_softclock = 1;
-				}
-			}
-		}
-		if (first == last)
-			break;
-		first = (first + 1) & callwheelmask;
-	}
-	future = (last + hz / 4) & callwheelmask;
-	max.sec = min.sec = TIME_T_MAX;
-	max.frac = min.frac = UINT64_MAX;
-	/*
-	 * Look for the first bucket in the future that contains some event,
-	 * up to some point, so that we can look for aggregation.
-	 */
-	for (;;) {
-		sc = &cc->cc_callwheel[last];
-		TAILQ_FOREACH(tmp, sc, c_links.tqe) {
-			tmp_max = tmp_min = tmp->c_time;
-			bintime_add(&tmp_max, &tmp->c_precision);
-			bintime_sub(&tmp_min, &tmp->c_precision);
-			/*
-			 * This is the fist event we're going to process or
-			 * event maximal time is less than present minimal.
-			 * In both cases, take it.
-			 */
-			if (bintime_cmp(&tmp_max, &min, <)) {
-				max = tmp_max;
-				min = tmp_min;
-				continue;
-			}
-			/*
-			 * Event minimal time is bigger than present maximal
-			 * time, so it cannot be aggregated.
-			 */
-			if (bintime_cmp(&tmp_min, &max, >))
-				continue;
-			/*
-			 * If neither of the two previous happened, just take
-			 * the intersection of events.
-			 */
-			min = (bintime_cmp(&tmp_min, &min, >)) ? tmp_min : min;
-			max = (bintime_cmp(&tmp_max, &max, >)) ? tmp_max : max;
-		}
-		if (last == future || max.sec != TIME_T_MAX)
-			break;
-		last = (last + 1) & callwheelmask;
-	}
-	if (max.sec == TIME_T_MAX) {
-		next.sec = 0;
-		next.frac = (uint64_t)1 << (64 - 2);
-		bintime_add(&next, now);
-	} else {
-		/*
-		 * Now that we found something to aggregate, schedule an
-		 * interrupt in the middle of the previously calculated range.
-		 */
-		bintime_add(&max, &min);
-		next = max;
-		next.frac >>= 1;
-		if (next.sec & 1)
-			next.frac |= ((uint64_t)1 << 63);
-		next.sec >>= 1;
-	}
-	cc->cc_firstevent = next;
-	if (callout_new_inserted != NULL)
-		(*callout_new_inserted)(cpu, next);
-	cc->cc_lastscan = *now;
-	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	/*
-	 * swi_sched acquires the thread lock, so we don't want to call it
-	 * with cc_lock held; incorrect locking order.
-	 */
-	if (need_softclock) {
-		swi_sched(cc->cc_cookie, 0);
-	}
-}
-
-static struct callout_cpu *
-callout_lock(struct callout *c)
-{
-	struct callout_cpu *cc;
-	int cpu;
-
-	for (;;) {
-		cpu = c->c_cpu;
-#ifdef SMP
-		if (cpu == CPUBLOCK) {
-			while (c->c_cpu == CPUBLOCK)
-				cpu_spinwait();
-			continue;
-		}
+#ifdef CALLOUT_PROFILING
+	avg_depth += (depth * 1000 - avg_depth) >> 8;
+	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
+	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
+	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
 #endif
-		cc = CC_CPU(cpu);
-		CC_LOCK(cc);
-		if (cpu == c->c_cpu)
-			break;
-		CC_UNLOCK(cc);
-	}
-	return (cc);
 }
 
 static void
@@ -564,8 +433,10 @@ static void
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
-	if (cc->cc_next == c) 
+	if (cc->cc_next == c)
 		cc->cc_next = TAILQ_NEXT(c, c_staiter);
+	if (cc->cc_next_direct == c)
+		cc->cc_next_direct = TAILQ_NEXT(c, c_links.tqe);
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
 		c->c_func = NULL;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
@@ -574,13 +445,13 @@ callout_cc_del(struct callout *c, struct callout_c
 
 static struct callout *
 softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
-    int *lockcalls, int *gcalls)
+    int *lockcalls, int *gcalls, int direct)
 {
 	void (*c_func)(void *);
 	void *c_arg;
 	struct lock_class *class;
 	struct lock_object *c_lock;
-	int c_flags, sharedlock;
+	int c_direct, c_flags, sharedlock;
 #ifdef SMP
 	struct callout_cpu *new_cc;
 	void (*new_func)(void *);
@@ -594,8 +465,10 @@ softclock_call_cc(struct callout *c, struct callou
 	static uint64_t maxdt = 36893488147419102LL;	/* 2 msec */
 	static timeout_t *lastfunc;
 #endif
-
-	cc->cc_next = TAILQ_NEXT(c, c_staiter);
+	if (direct)
+		cc->cc_next_direct = TAILQ_NEXT(c, c_links.tqe);
+	else
+		cc->cc_next = TAILQ_NEXT(c, c_staiter);
 	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
 	sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
 	c_lock = c->c_lock;
@@ -617,11 +490,11 @@ softclock_call_cc(struct callout *c, struct callou
 		 */
 		if (cc->cc_cancel) {
 			class->lc_unlock(c_lock);
-			goto skip; 
+			goto skip;
 		}
 		/* The callout cannot be stopped now. */
 		cc->cc_cancel = 1;
-
+#ifdef CALLOUT_PROFILING
 		if (c_lock == &Giant.lock_object) {
 			(*gcalls)++;
 			CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
@@ -631,10 +504,13 @@ softclock_call_cc(struct callout *c, struct callou
 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
 			    c, c_func, c_arg);
 		}
+#endif
 	} else {
+#ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
 		CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p",
 		    c, c_func, c_arg);
+#endif
 	}
 #ifdef DIAGNOSTIC
 	binuptime(&bt1);
@@ -722,8 +598,9 @@ skip:
 	 * is not easy.
 	 */
 	new_cc = callout_cpu_switch(c, cc, new_cpu);
+	c_direct = c->c_flags & CALLOUT_DIRECT;
 	callout_cc_add(c, new_cc, new_time, new_func, new_arg,
-	    new_cpu, 0);
+	    new_cpu, c_direct);
 	CC_UNLOCK(new_cc);
 	CC_LOCK(cc);
 #else
@@ -733,9 +610,164 @@ skip:
 #ifdef SMP
 nextc:
 #endif
-	return (cc->cc_next);
+	if (direct)
+		return (cc->cc_next_direct);
+	else
+		return (cc->cc_next);
 }
 
+void
+callout_process(struct bintime *now)
+{
+	struct bintime max, min, next, tmp_max, tmp_min;
+	struct callout *tmp, *tmpn;
+	struct callout_cpu *cc;
+	struct callout_tailq *sc;
+	int cpu, depth, first, future, gcalls, last, lockcalls, mpcalls,
+	    need_softclock;
+
+	/*
+	 * Process callouts at a very low cpu priority, so we don't keep the
+	 * relatively high clock interrupt priority any longer than necessary.
+	 */
+	depth = mpcalls = lockcalls = gcalls = need_softclock = 0;
+	cc = CC_SELF();
+	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
+	cpu = curcpu;
+	first = callout_hash(&cc->cc_lastscan);
+	last = callout_hash(now);
+	/*
+	 * Check if we wrapped around the entire wheel since the last scan.
+	 * In that case, we need to scan the whole wheel for pending callouts.
+	 */
+	last = (last - first >= callwheelsize) ? (first - 1) & callwheelmask :
+	    last & callwheelmask;
+	first &= callwheelmask;
+	for (;;) {
+		sc = &cc->cc_callwheel[first];
+		tmp = TAILQ_FIRST(sc);
+		while (tmp != NULL) {
+			next = tmp->c_time;
+			bintime_sub(&next, &tmp->c_precision);
+			if (bintime_cmp(&next, now, <=)) {
+				/*
+				 * Consumer told us the callout may be run
+				 * directly from hardware interrupt context.
+				 */
+				if (tmp->c_flags & CALLOUT_DIRECT) {
+					++depth;
+					TAILQ_REMOVE(sc, tmp, c_links.tqe);
+					tmp = softclock_call_cc(tmp, cc,
+					    &mpcalls, &lockcalls,
+					    &gcalls, 1);
+				} else {
+					tmpn = TAILQ_NEXT(tmp, c_links.tqe);
+					TAILQ_INSERT_TAIL(&cc->cc_expireq,
+					    tmp, c_staiter);
+					TAILQ_REMOVE(sc, tmp, c_links.tqe);
+					tmp->c_flags |= CALLOUT_PROCESSED;
+					need_softclock = 1;
+					tmp = tmpn;
+				}
+			} else
+				tmp = TAILQ_NEXT(tmp, c_links.tqe);
+		}
+		if (first == last)
+			break;
+		first = (first + 1) & callwheelmask;
+	}
+	future = (last + hz / 4) & callwheelmask;
+	max.sec = min.sec = TIME_T_MAX;
+	max.frac = min.frac = UINT64_MAX;
+	/*
+	 * Look for the first bucket in the future that contains some event,
+	 * up to hz / 4 buckets ahead, so that we can look for aggregation.
+	 */
+	for (;;) {
+		sc = &cc->cc_callwheel[last];
+		TAILQ_FOREACH(tmp, sc, c_links.tqe) {
+			tmp_max = tmp_min = tmp->c_time;
+			bintime_add(&tmp_max, &tmp->c_precision);
+			bintime_sub(&tmp_min, &tmp->c_precision);
+			/*
+			 * This is the first event we're going to process or
+			 * the event's maximal time is less than the present
+			 * minimal time.  In both cases, take it.
+			 */
+			if (bintime_cmp(&tmp_max, &min, <)) {
+				max = tmp_max;
+				min = tmp_min;
+				continue;
+			}
+			/*
+			 * The event's minimal time is bigger than the present
+			 * maximal time, so it cannot be aggregated.
+			 */
+			if (bintime_cmp(&tmp_min, &max, >))
+				continue;
+			/*
+			 * If neither of the two previous cases happened, just
+			 * take the intersection of the events.
+			 */
+			min = (bintime_cmp(&tmp_min, &min, >)) ? tmp_min : min;
+			max = (bintime_cmp(&tmp_max, &max, >)) ? tmp_max : max;
+		}
+		if (last == future || max.sec != TIME_T_MAX)
+			break;
+		last = (last + 1) & callwheelmask;
+	}
+	if (max.sec == TIME_T_MAX) {
+		next.sec = 0;
+		next.frac = (uint64_t)1 << (64 - 2);
+		bintime_add(&next, now);
+	} else {
+		/*
+		 * Now that we found something to aggregate, schedule an
+		 * interrupt in the middle of the previously calculated range.
+		 */
+		bintime_add(&max, &min);
+		next = max;
+		next.frac >>= 1;
+		if (next.sec & 1)
+			next.frac |= ((uint64_t)1 << 63);
+		next.sec >>= 1;
+	}
+	cc->cc_firstevent = next;
+	if (callout_new_inserted != NULL)
+		(*callout_new_inserted)(cpu, next);
+	cc->cc_lastscan = *now;
+	callout_stats(depth, mpcalls, lockcalls, gcalls);
+	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+	/*
+	 * swi_sched acquires the thread lock, so we don't want to call it
+	 * with cc_lock held; incorrect locking order.
+	 */
+	if (need_softclock) {
+		swi_sched(cc->cc_cookie, 0);
+	}
+}
+
+static struct callout_cpu *
+callout_lock(struct callout *c)
+{
+	struct callout_cpu *cc;
+	int cpu;
+
+	for (;;) {
+		cpu = c->c_cpu;
+#ifdef SMP
+		if (cpu == CPUBLOCK) {
+			while (c->c_cpu == CPUBLOCK)
+				cpu_spinwait();
+			continue;
+		}
+#endif
+		cc = CC_CPU(cpu);
+		CC_LOCK(cc);
+		if (cpu == c->c_cpu)
+			break;
+		CC_UNLOCK(cc);
+	}
+	return (cc);
+}
+
 /*
  * The callout mechanism is based on the work of Adam M. Costello and
  * George Varghese, published in a technical report entitled "Redesigning
@@ -758,14 +790,12 @@ softclock(void *arg)
 	struct callout_cpu *cc;
 	struct callout *c;
 	int steps;	/* #steps since we last allowed interrupts */
+	int depth;
 	int mpcalls;
 	int lockcalls;
 	int gcalls;
 
-#ifndef MAX_SOFTCLOCK_STEPS
-#define MAX_SOFTCLOCK_STEPS	100 /* Maximum allowed value of steps. */
-#endif /* MAX_SOFTCLOCK_STEPS */
-
+	depth = 0;
 	mpcalls = 0;
 	lockcalls = 0;
 	gcalls = 0;
@@ -775,6 +805,7 @@ softclock(void *arg)
 
 	c = TAILQ_FIRST(&cc->cc_expireq);
 	while (c != NULL) {
+		++depth;
 		++steps;
 		if (steps >= MAX_SOFTCLOCK_STEPS) {
 			cc->cc_next = c;
@@ -787,15 +818,10 @@ softclock(void *arg)
 		} else {
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter);
 			c = softclock_call_cc(c, cc, &mpcalls,
-			    &lockcalls, &gcalls);
+			    &lockcalls, &gcalls, 0);
 		}
 	}
-#ifdef CALLOUT_PROFILING
-	avg_depth += (depth * 1000 - avg_depth) >> 8;
-	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
-	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
-	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
-#endif
+	callout_stats(depth, mpcalls, lockcalls, gcalls);
 	cc->cc_next = NULL;
 	CC_UNLOCK(cc);
 }
@@ -929,16 +955,22 @@ _callout_reset_on(struct callout *c, struct bintim
 		}
 	}
 	if (c->c_flags & CALLOUT_PENDING) {
 		if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
 			if (cc->cc_next == c)
 				cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
+			if (cc->cc_next_direct == c)
+				cc->cc_next_direct = TAILQ_NEXT(c,
+				    c_links.tqe);
 			bucket = get_bucket(&c->c_time);
 			TAILQ_REMOVE(&cc->cc_callwheel[bucket], c,
 			    c_links.tqe);
 		} else {
 			if (cc->cc_next == c)
 				cc->cc_next = TAILQ_NEXT(c, c_staiter);
			TAILQ_REMOVE(&cc->cc_expireq, c, c_staiter);
 		}
 		cancelled = 1;
 		c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
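
The midpoint scheduling in callout_process() above -- bintime_add(&max, &min) followed by a one-bit right shift that carries the low bit of .sec into the top bit of .frac -- computes (min + max) / 2 on the sec.frac fixed-point bintime without losing the bit that overflows out of the fraction. The following stand-alone user-space sketch is not part of the patch: struct bintime and bintime_add() are pared-down copies of the kernel definitions, and bintime_mid() is a name invented here purely for illustration.

/*
 * Sketch of the (min + max) / 2 bintime computation used by callout_process()
 * to place the next hardware interrupt in the middle of the aggregated range.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct bintime {
	time_t   sec;		/* whole seconds */
	uint64_t frac;		/* fraction of a second, in units of 2^-64 s */
};

static void
bintime_add(struct bintime *bt, const struct bintime *bt2)
{
	uint64_t u;

	u = bt->frac;
	bt->frac += bt2->frac;
	if (u > bt->frac)	/* the fraction wrapped: carry into .sec */
		bt->sec++;
	bt->sec += bt2->sec;
}

/* Midpoint of [a, b], using the same shift-with-carry scheme as the patch. */
static struct bintime
bintime_mid(struct bintime a, struct bintime b)
{
	struct bintime next;

	bintime_add(&a, &b);	/* a = a + b, effectively 128-bit wide */
	next = a;
	next.frac >>= 1;
	if (next.sec & 1)	/* move the bit shifted out of .sec into .frac */
		next.frac |= (uint64_t)1 << 63;
	next.sec >>= 1;
	return (next);
}

int
main(void)
{
	/* 3.75 s and 6.25 s: the midpoint should be exactly 5.0 s. */
	struct bintime lo = { 3, (uint64_t)3 << 62 };	/* 3 + 0.75 */
	struct bintime hi = { 6, (uint64_t)1 << 62 };	/* 6 + 0.25 */
	struct bintime mid = bintime_mid(lo, hi);

	printf("mid = %lld + %.6f s\n", (long long)mid.sec,
	    (double)mid.frac / 18446744073709551616.0);	/* frac / 2^64 */
	return (0);
}

For the 3.75 s / 6.25 s pair this should print "mid = 5 + 0.000000 s"; the carry of the odd .sec bit into the top of .frac is what keeps the kernel computation exact when min.sec + max.sec is odd.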