Index: sys/smp.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/smp.h,v
retrieving revision 1.86
diff -u -r1.86 smp.h
--- sys/smp.h	8 Nov 2007 14:47:55 -0000	1.86
+++ sys/smp.h	16 Feb 2008 00:13:42 -0000
@@ -32,12 +32,30 @@
  */
 
 struct cpu_group {
-	cpumask_t cg_mask;		/* Mask of cpus in this group. */
-	int	cg_count;		/* Count of cpus in this group. */
-	int	cg_children;		/* Number of children groups. */
-	struct cpu_group *cg_child;	/* Optional child group. */
+	struct cpu_group *cg_parent;	/* Our parent group. */
+	struct cpu_group *cg_child;	/* Optional children groups. */
+	cpumask_t	cg_mask;	/* Mask of cpus in this group. */
+	int8_t		cg_count;	/* Count of cpus in this group. */
+	int8_t		cg_children;	/* Number of children groups. */
+	int8_t		cg_level;	/* Shared cache level. */
+	int8_t		cg_flags;	/* Traversal modifiers. */
 };
 
+/*
+ * Defines common resources for CPUs in the group.  The highest level
+ * resource should be used when multiple are shared.
+ */
+#define	CG_SHARE_NONE	0
+#define	CG_SHARE_L1	1
+#define	CG_SHARE_L2	2
+#define	CG_SHARE_L3	3
+
+/*
+ * Behavior modifiers for load balancing and affinity.
+ */
+#define	CG_FLAG_HTT	0x01		/* Schedule the alternate core last. */
+#define	CG_FLAG_THREAD	0x02		/* New age htt, less crippled. */
+
 struct cpu_top {
 	int	ct_count;		/* Count of groups. */
 	struct cpu_group *ct_group;	/* Array of pointers to cpu groups. */
Index: kern/sched_ule.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.223
diff -u -r1.223 sched_ule.c
--- kern/sched_ule.c	23 Jan 2008 03:10:18 -0000	1.223
+++ kern/sched_ule.c	16 Feb 2008 00:13:42 -0000
@@ -190,47 +190,28 @@
  * locking in sched_pickcpu();
  */
 struct tdq {
-	struct mtx	*tdq_lock;		/* Pointer to group lock. */
+	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
+	struct mtx	tdq_lock;		/* run queue lock. */
 	struct runq	tdq_realtime;		/* real-time run queue. */
 	struct runq	tdq_timeshare;		/* timeshare run queue. */
 	struct runq	tdq_idle;		/* Queue of IDLE threads. */
 	int		tdq_load;		/* Aggregate load. */
+	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 	u_char		tdq_idx;		/* Current insert index. */
 	u_char		tdq_ridx;		/* Current removal index. */
 #ifdef SMP
 	u_char		tdq_lowpri;		/* Lowest priority thread. */
 	int		tdq_transferable;	/* Transferable thread count. */
-	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
-	struct tdq_group *tdq_group;		/* Our processor group. */
-#else
-	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 #endif
+	char		tdq_name[sizeof("sched lock") + 6];
 } __aligned(64);
 
 
 #ifdef SMP
-/*
- * tdq groups are groups of processors which can cheaply share threads.  When
- * one processor in the group goes idle it will check the runqs of the other
- * processors in its group prior to halting and waiting for an interrupt.
- * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA.
- * In a numa environment we'd want an idle bitmap per group and a two tiered
- * load balancer.
- */
-struct tdq_group {
-	struct mtx	tdg_lock;	/* Protects all fields below. */
-	int		tdg_cpus;	/* Count of CPUs in this tdq group. */
-	cpumask_t 	tdg_cpumask;	/* Mask of cpus in this group. */
-	cpumask_t 	tdg_idlemask;	/* Idle cpus in this group. */
-	cpumask_t 	tdg_mask;	/* Bit mask for first cpu. */
-	int		tdg_load;	/* Total load of this group. */
-	int	tdg_transferable;	/* Transferable load of this group. */
-	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
-	char		tdg_name[16];	/* lock name. */
-} __aligned(64);
+struct cpu_group *cpu_top;
 
-#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 300))
-#define	SCHED_AFFINITY(ts)	((ts)->ts_rltick > ticks - affinity)
+#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
+#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
 
 /*
  * Run-time tunables.
@@ -240,6 +221,8 @@
 static int pick_pri = 1;
 static int affinity;
 static int tryself = 1;
+static int lowpri_userret = 0;
+static int oldtryself = 0;
 static int steal_htt = 1;
 static int steal_idle = 1;
 static int steal_thresh = 2;
@@ -248,19 +231,13 @@
 /*
  * One thread queue per processor.
  */
-static volatile cpumask_t tdq_idle;
-static int tdg_maxid;
 static struct tdq	tdq_cpu[MAXCPU];
-static struct tdq_group tdq_groups[MAXCPU];
 static struct tdq	*balance_tdq;
-static int balance_group_ticks;
 static int balance_ticks;
 
 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
 #define	TDQ_ID(x)	((int)((x) - tdq_cpu))
-#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
-#define	TDG_ID(x)	((int)((x) - tdq_groups))
 #else	/* !SMP */
 static struct tdq	tdq_cpu;
 static struct mtx	tdq_lock;
@@ -274,7 +251,7 @@
 #define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
 #define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
-#define	TDQ_LOCKPTR(t)		((t)->tdq_lock)
+#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
 
 static void sched_priority(struct thread *);
 static void sched_thread_priority(struct thread *, u_char);
@@ -301,8 +278,6 @@
 static struct td_sched *runq_steal(struct runq *);
 static int sched_pickcpu(struct td_sched *, int);
 static void sched_balance(void);
-static void sched_balance_groups(void);
-static void sched_balance_group(struct tdq_group *);
 static void sched_balance_pair(struct tdq *, struct tdq *);
 static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
 static inline struct mtx *thread_block_switch(struct thread *);
@@ -356,7 +331,8 @@
 	tdq = TDQ_CPU(cpu);
 
 	printf("tdq %d:\n", TDQ_ID(tdq));
-	printf("\tlockptr         %p\n", TDQ_LOCKPTR(tdq));
+	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
+	printf("\tLock name:      %s\n", tdq->tdq_name);
 	printf("\tload:           %d\n", tdq->tdq_load);
 	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
@@ -369,8 +345,6 @@
 #ifdef SMP
 	printf("\tload transferable: %d\n", tdq->tdq_transferable);
 	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
-	printf("\tgroup:             %d\n", TDG_ID(tdq->tdq_group));
-	printf("\tLock name:         %s\n", tdq->tdq_group->tdg_name);
 #endif
 }
 
@@ -388,7 +362,6 @@
 #ifdef SMP
 	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
 		tdq->tdq_transferable++;
-		tdq->tdq_group->tdg_transferable++;
 		ts->ts_flags |= TSF_XFERABLE;
 	}
 #endif
@@ -434,7 +407,6 @@
 #ifdef SMP
 	if (ts->ts_flags & TSF_XFERABLE) {
 		tdq->tdq_transferable--;
-		tdq->tdq_group->tdg_transferable--;
 		ts->ts_flags &= ~TSF_XFERABLE;
 	}
 #endif
@@ -470,11 +442,7 @@
 	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
 	if (class != PRI_ITHD &&
 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
-#ifdef SMP
-		tdq->tdq_group->tdg_load++;
-#else
 		tdq->tdq_sysload++;
-#endif
 }
 
 /*
@@ -491,11 +459,7 @@
 	class = PRI_BASE(ts->ts_thread->td_pri_class);
 	if (class != PRI_ITHD &&
 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
-#ifdef SMP
-		tdq->tdq_group->tdg_load--;
-#else
 		tdq->tdq_sysload--;
-#endif
 	KASSERT(tdq->tdq_load != 0,
 	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
 	tdq->tdq_load--;
@@ -504,111 +468,237 @@
 }
 
 #ifdef SMP
+struct cpu_search {
+	u_int	cs_load;	/* Load of the best match so far. */
+	u_int	cs_cpu;		/* Cpu id of the best match so far. */
+	int	cs_limit;	/* Low: lowpri must exceed it; high: min load. */
+};
+
+#define	CPU_SEARCH_LOWEST	0x1
+#define	CPU_SEARCH_HIGHEST	0x2
+#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
+
+#define	CPUMASK_FOREACH(cpu, mask)	/* Visits each set bit. */	\
+	for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++)	\
+		if (((mask) & ((cpumask_t)1 << (cpu))) != 0) /* no int shift */
+
+static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high, const int match);	/* Linkage matches the body. */
+int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high);
+
+/*
+ * Test one cpu against the search results selected by match.  match is
+ * constant in each instantiation, so the unused half should compile away.
+ */
+static __inline int
+cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
+    const int match)
+{
+	struct tdq *q;
+
+	q = TDQ_CPU(cpu);
+	/* Lowest: less loaded than the best so far and above the pri limit. */
+	if ((match & CPU_SEARCH_LOWEST) != 0 &&
+	    q->tdq_load < low->cs_load && q->tdq_lowpri > low->cs_limit) {
+		low->cs_cpu = cpu;
+		low->cs_load = q->tdq_load;
+	}
+	/* Highest: meets the minimum, beats the best, and can give work. */
+	if ((match & CPU_SEARCH_HIGHEST) != 0 &&
+	    q->tdq_load >= high->cs_limit && q->tdq_load > high->cs_load &&
+	    q->tdq_transferable) {
+		high->cs_cpu = cpu;
+		high->cs_load = q->tdq_load;
+	}
+	return (q->tdq_load);
+}
+
 /*
- * sched_balance is a simple CPU load balancing algorithm.  It operates by
- * finding the least loaded and most loaded cpu and equalizing their load
- * by migrating some processes.
- *
- * Dealing only with two CPUs at a time has two advantages.  Firstly, most
- * installations will only have 2 cpus.  Secondly, load balancing too much at
- * once can have an unpleasant effect on the system.  The scheduler rarely has
- * enough information to make perfect decisions.  So this algorithm chooses
- * simplicity and more gradual effects on load in larger systems.
+ * Search the tree of cpu_groups for the lowest or highest loaded cpu
+ * according to the match argument.  This routine actually compares the
+ * load on all paths through the tree and finds the least loaded cpu on
+ * the least loaded path, which may differ from the least loaded cpu in
+ * the system.  This balances work among caches and busses.
  *
+ * This inline is instantiated in three forms below using constants for the
+ * match argument.  It is reduced to the minimum set for each case.  It is
+ * also recursive to the depth of the tree.
+ */
+static inline int
+cpu_search(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high, const int match)
+{
+	int total;
+
+	total = 0;
+	if (cg->cg_children) {
+		struct cpu_search lgroup;
+		struct cpu_search hgroup;
+		struct cpu_group *child;
+		u_int lload;
+		int hload;
+		int load;
+		int i;
+
+		lload = -1;	/* u_int: starts at the largest value. */
+		hload = -1;	/* Below any non-negative load. */
+		for (i = 0; i < cg->cg_children; i++) {
+			child = &cg->cg_child[i];
+			if (match & CPU_SEARCH_LOWEST) {
+				lgroup.cs_cpu = -1;
+				lgroup.cs_load = -1;
+				lgroup.cs_limit = low->cs_limit;
+			}
+			if (match & CPU_SEARCH_HIGHEST) {
+				hgroup.cs_cpu = -1;
+				hgroup.cs_load = 0;
+				hgroup.cs_limit = high->cs_limit;
+			}
+			switch (match) {	/* Constant per instantiation. */
+			case CPU_SEARCH_LOWEST:
+				load = cpu_search_lowest(child, &lgroup);
+				break;
+			case CPU_SEARCH_HIGHEST:
+				load = cpu_search_highest(child, &hgroup);
+				break;
+			case CPU_SEARCH_BOTH:
+				load = cpu_search_both(child, &lgroup, &hgroup);
+				break;
+			}
+			total += load;	/* Group load is the sum of its children. */
+			if (match & CPU_SEARCH_LOWEST && load < lload) {
+				*low = lgroup;	/* May carry cs_cpu == -1. */
+				lload = load;
+			}
+			if (match & CPU_SEARCH_HIGHEST && load > hload) {
+				hload = load;
+				*high = hgroup;	/* May carry cs_cpu == -1. */
+			}
+		}
+	} else {
+		int cpu;
+
+		CPUMASK_FOREACH(cpu, cg->cg_mask)	/* Leaf: scan its cpus. */
+			total += cpu_compare(cpu, low, high, match);
+	}
+	return (total);	/* Summed tdq_load of every cpu under cg. */
+}
+
+/*
+ * cpu_search instantiations must pass constants to maintain the inline
+ * optimization.
  */
-static void
-sched_balance()
+int
+cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
 {
-	struct tdq_group *high;
-	struct tdq_group *low;
-	struct tdq_group *tdg;
-	struct tdq *tdq;
-	int cnt;
-	int i;
+	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
+}
 
-	/*
-	 * Select a random time between .5 * balance_interval and
-	 * 1.5 * balance_interval.
-	 */
-	balance_ticks = max(balance_interval / 2, 1);
-	balance_ticks += random() % balance_interval;
-	if (smp_started == 0 || rebalance == 0)
-		return;
-	tdq = TDQ_SELF();
-	TDQ_UNLOCK(tdq);
-	low = high = NULL;
-	i = random() % (tdg_maxid + 1);
-	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
-		tdg = TDQ_GROUP(i);
-		/*
-		 * Find the CPU with the highest load that has some
-		 * threads to transfer.
-		 */
-		if ((high == NULL || tdg->tdg_load > high->tdg_load)
-		    && tdg->tdg_transferable)
-			high = tdg;
-		if (low == NULL || tdg->tdg_load < low->tdg_load)
-			low = tdg;
-		if (++i > tdg_maxid)
-			i = 0;
-	}
-	if (low != NULL && high != NULL && high != low)
-		sched_balance_pair(LIST_FIRST(&high->tdg_members),
-		    LIST_FIRST(&low->tdg_members));
-	TDQ_LOCK(tdq);
+int
+cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
+{
+	return (cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST));
+}
+
+int
+cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high)
+{
+	return (cpu_search(cg, low, high, CPU_SEARCH_BOTH));
+}
+
+/*
+ * Return the least loaded cpu on the least loaded path below cg whose
+ * lowpri is greater than pri, or -1 if there is none.  Passing a pri of
+ * -1 accepts a cpu at any priority.
+ */
+static inline int
+sched_lowest(struct cpu_group *cg, int pri)
+{
+	struct cpu_search best;
+
+	best.cs_limit = pri;
+	best.cs_load = -1;
+	best.cs_cpu = -1;
+	cpu_search_lowest(cg, &best);
+	return (best.cs_cpu);
+}
+
+/*
+ * Highest loaded cpu on the highest loaded path with load >= minload, or -1.
+ */
+static inline int
+sched_highest(struct cpu_group *cg, int minload)
+{
+	struct cpu_search best;
+
+	best.cs_limit = minload;
+	best.cs_load = 0;
+	best.cs_cpu = -1;
+	cpu_search_highest(cg, &best);
+	return (best.cs_cpu);
 }
 
 /*
- * Balance load between CPUs in a group.  Will only migrate within the group.
+ * Simultaneously find the highest and lowest loaded cpu reachable via
+ * cg.
  */
+static inline void
+sched_both(struct cpu_group *cg, int *lowcpu, int *highcpu)
+{
+	struct cpu_search lo, hi;
+
+	/* Lowest: any priority qualifies; start from the largest load. */
+	lo.cs_limit = -1;
+	lo.cs_load = -1;
+	lo.cs_cpu = -1;
+	/* Highest: no minimum load; start from a load of zero. */
+	hi.cs_limit = -1;
+	hi.cs_load = 0;
+	hi.cs_cpu = -1;
+	cpu_search_both(cg, &lo, &hi);
+	*lowcpu = lo.cs_cpu;
+	*highcpu = hi.cs_cpu;
+}
+
 static void
-sched_balance_groups()
+sched_balance_group(struct cpu_group *cg)
 {
-	struct tdq *tdq;
+	int high;
+	int low;
 	int i;
 
+	sched_both(cg, &low, &high);
+	if (low != high && low != -1 && high != -1)
+		sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low));
+
+	for (i = 0; i < cg->cg_children; i++)
+		sched_balance_group(&cg->cg_child[i]);
+}
+
+static void
+sched_balance()
+{
+	struct tdq *tdq;
+
 	/*
 	 * Select a random time between .5 * balance_interval and
 	 * 1.5 * balance_interval.
 	 */
-	balance_group_ticks = max(balance_interval / 2, 1);
-	balance_group_ticks += random() % balance_interval;
+	balance_ticks = max(balance_interval / 2, 1);
+	balance_ticks += random() % balance_interval;
 	if (smp_started == 0 || rebalance == 0)
 		return;
 	tdq = TDQ_SELF();
 	TDQ_UNLOCK(tdq);
-	for (i = 0; i <= tdg_maxid; i++)
-		sched_balance_group(TDQ_GROUP(i));
+	sched_balance_group(cpu_top);
 	TDQ_LOCK(tdq);
 }
 
 /*
- * Finds the greatest imbalance between two tdqs in a group.
- */
-static void
-sched_balance_group(struct tdq_group *tdg)
-{
-	struct tdq *tdq;
-	struct tdq *high;
-	struct tdq *low;
-	int load;
-
-	if (tdg->tdg_transferable == 0)
-		return;
-	low = NULL;
-	high = NULL;
-	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
-		load = tdq->tdq_load;
-		if (high == NULL || load > high->tdq_load)
-			high = tdq;
-		if (low == NULL || load < low->tdq_load)
-			low = tdq;
-	}
-	if (high != NULL && low != NULL && high != low)
-		sched_balance_pair(high, low);
-}
-
-/*
  * Lock two thread queues using their address to maintain lock order.
  */
 static void
@@ -647,20 +737,9 @@
 	int i;
 
 	tdq_lock_pair(high, low);
-	/*
-	 * If we're transfering within a group we have to use this specific
-	 * tdq's transferable count, otherwise we can steal from other members
-	 * of the group.
-	 */
-	if (high->tdq_group == low->tdq_group) {
-		transferable = high->tdq_transferable;
-		high_load = high->tdq_load;
-		low_load = low->tdq_load;
-	} else {
-		transferable = high->tdq_group->tdg_transferable;
-		high_load = high->tdq_group->tdg_load;
-		low_load = low->tdq_group->tdg_load;
-	}
+	transferable = high->tdq_transferable;
+	high_load = high->tdq_load;
+	low_load = low->tdq_load;
 	/*
 	 * Determine what the imbalance is and then adjust that to how many
 	 * threads we actually have to give up (transferable).
@@ -700,20 +779,7 @@
 	tdq = from;
 	cpu = TDQ_ID(to);
 	ts = tdq_steal(tdq);
-	if (ts == NULL) {
-		struct tdq_group *tdg;
-
-		tdg = tdq->tdq_group;
-		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
-			if (tdq == from || tdq->tdq_transferable == 0)
-				continue;
-			ts = tdq_steal(tdq);
-			break;
-		}
-		if (ts == NULL)
-			return;
-	}
-	if (tdq == to)
+	if (ts == NULL)
 		return;
 	td = ts->ts_thread;
 	/*
@@ -736,72 +802,42 @@
 static int
 tdq_idled(struct tdq *tdq)
 {
-	struct tdq_group *tdg;
+	struct cpu_group *cg;
 	struct tdq *steal;
-	int highload;
-	int highcpu;
 	int cpu;
 
 	if (smp_started == 0 || steal_idle == 0)
 		return (1);
-	/* We don't want to be preempted while we're iterating over tdqs */
+	/* We don't want to be preempted while we're iterating. */
 	spinlock_enter();
-	tdg = tdq->tdq_group;
-	/*
-	 * If we're in a cpu group, try and steal threads from another cpu in
-	 * the group before idling.  In a HTT group all cpus share the same
-	 * run-queue lock, however, we still need a recursive lock to
-	 * call tdq_move().
-	 */
-	if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
-		TDQ_LOCK(tdq);
-		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
-			if (steal == tdq || steal->tdq_transferable == 0)
-				continue;
-			TDQ_LOCK(steal);
-			goto steal;
-		}
-		TDQ_UNLOCK(tdq);
-	}
-	/*
-	 * Find the least loaded CPU with a transferable thread and attempt
-	 * to steal it.  We make a lockless pass and then verify that the
-	 * thread is still available after locking.
-	 */
-	for (;;) {
-		highcpu = 0;
-		highload = 0;
-		for (cpu = 0; cpu <= mp_maxid; cpu++) {
-			if (CPU_ABSENT(cpu))
-				continue;
-			steal = TDQ_CPU(cpu);
-			if (steal->tdq_transferable == 0)
-				continue;
-			if (steal->tdq_load < highload)
-				continue;
-			highload = steal->tdq_load;
-			highcpu = cpu;
-		}
-		if (highload < steal_thresh)
-			break;
-		steal = TDQ_CPU(highcpu);
+	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent) {
+		cpu = sched_highest(cg, steal_thresh);
+		if (cpu == -1)
+			continue;
+		steal = TDQ_CPU(cpu);
 		if (steal == tdq)
-			break;
+			continue;
 		tdq_lock_pair(tdq, steal);
-		if (steal->tdq_load >= steal_thresh && steal->tdq_transferable)
-			goto steal;
-		tdq_unlock_pair(tdq, steal);
+		if (steal->tdq_load < steal_thresh || 
+		    steal->tdq_transferable == 0) {
+			tdq_unlock_pair(tdq, steal);
+			continue;
+		}
+		spinlock_exit();
+		/*
+		 * If we got a thread while we had interrupts disabled
+		 * don't steal one here.
+		 */
+		if (tdq->tdq_load == 0)
+			tdq_move(steal, tdq);
+		TDQ_UNLOCK(steal);
+		mi_switch(SW_VOL, NULL);
+		thread_unlock(curthread);
+
+		return (0);
 	}
 	spinlock_exit();
 	return (1);
-steal:
-	spinlock_exit();
-	tdq_move(steal, tdq);
-	TDQ_UNLOCK(steal);
-	mi_switch(SW_VOL, NULL);
-	thread_unlock(curthread);
-
-	return (0);
 }
 
 /*
@@ -850,6 +886,28 @@
 }
 
 /*
+ * Set lowpri to its exact value by searching the run-queue and
+ * evaluating curthread.  curthread may be passed as an optimization.
+ */
+static void
+tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
+{
+	struct td_sched *next;
+	int pri;
+
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	/* With no thread supplied, look up the one running on tdq's cpu. */
+	if (ctd == NULL)
+		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
+	next = tdq_choose(tdq);
+	pri = ctd->td_priority;
+	/* Keep the numerically smaller of the queued and running values. */
+	if (next != NULL && next->ts_thread->td_priority <= pri)
+		pri = next->ts_thread->td_priority;
+	tdq->tdq_lowpri = pri;
+}
+
+/*
  * Steals load from a timeshare queue.  Honors the rotating queue head
  * index.
  */
@@ -981,155 +1039,51 @@
 	return (tdq);
 }
 
-/*
- * Find the thread queue running the lowest priority thread.
- */
-static int
-tdq_lowestpri(void)
-{
-	struct tdq *tdq;
-	int lowpri;
-	int lowcpu;
-	int lowload;
-	int load;
-	int cpu;
-	int pri;
-
-	lowload = 0;
-	lowpri = lowcpu = 0;
-	for (cpu = 0; cpu <= mp_maxid; cpu++) {
-		if (CPU_ABSENT(cpu))
-			continue;
-		tdq = TDQ_CPU(cpu);
-		pri = tdq->tdq_lowpri;
-		load = TDQ_CPU(cpu)->tdq_load;
-		CTR4(KTR_ULE,
-		    "cpu %d pri %d lowcpu %d lowpri %d",
-		    cpu, pri, lowcpu, lowpri);
-		if (pri < lowpri)
-			continue;
-		if (lowpri && lowpri == pri && load > lowload)
-			continue;
-		lowpri = pri;
-		lowcpu = cpu;
-		lowload = load;
-	}
-
-	return (lowcpu);
-}
-
-/*
- * Find the thread queue with the least load.
- */
-static int
-tdq_lowestload(void)
-{
-	struct tdq *tdq;
-	int lowload;
-	int lowpri;
-	int lowcpu;
-	int load;
-	int cpu;
-	int pri;
-
-	lowcpu = 0;
-	lowload = TDQ_CPU(0)->tdq_load;
-	lowpri = TDQ_CPU(0)->tdq_lowpri;
-	for (cpu = 1; cpu <= mp_maxid; cpu++) {
-		if (CPU_ABSENT(cpu))
-			continue;
-		tdq = TDQ_CPU(cpu);
-		load = tdq->tdq_load;
-		pri = tdq->tdq_lowpri;
-		CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d",
-		    cpu, load, lowcpu, lowload);
-		if (load > lowload)
-			continue;
-		if (load == lowload && pri < lowpri)
-			continue;
-		lowcpu = cpu;
-		lowload = load;
-		lowpri = pri;
-	}
-
-	return (lowcpu);
-}
-
-/*
- * Pick the destination cpu for sched_add().  Respects affinity and makes
- * a determination based on load or priority of available processors.
- */
 static int
 sched_pickcpu(struct td_sched *ts, int flags)
 {
+	struct cpu_group *cg;
 	struct tdq *tdq;
 	int self;
 	int pri;
 	int cpu;
 
-	cpu = self = PCPU_GET(cpuid);
+	self = PCPU_GET(cpuid);
 	if (smp_started == 0)
 		return (self);
 	/*
 	 * Don't migrate a running thread from sched_switch().
 	 */
-	if (flags & SRQ_OURSELF) {
-		CTR1(KTR_ULE, "YIELDING %d",
-		    curthread->td_priority);
+	if (flags & SRQ_OURSELF)
 		return (self);
-	}
 	pri = ts->ts_thread->td_priority;
-	cpu = ts->ts_cpu;
-	/*
-	 * Regardless of affinity, if the last cpu is idle send it there.
-	 */
-	tdq = TDQ_CPU(cpu);
-	if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
-		CTR5(KTR_ULE,
-		    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
-		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
-		    tdq->tdq_lowpri);
+	tdq = TDQ_CPU(ts->ts_cpu);
+	if (tdq->tdq_lowpri > PRI_MIN_IDLE)
 		return (ts->ts_cpu);
-	}
-	/*
-	 * If we have affinity, try to place it on the cpu we last ran on.
-	 */
-	if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
-		CTR5(KTR_ULE,
-		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
-		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
-		    tdq->tdq_lowpri);
+	if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri)
 		return (ts->ts_cpu);
-	}
-	/*
-	 * Look for an idle group.
-	 */
-	CTR1(KTR_ULE, "tdq_idle %X", tdq_idle);
-	cpu = ffs(tdq_idle);
-	if (cpu)
-		return (--cpu);
-	/*
-	 * If there are no idle cores see if we can run the thread locally.
-	 * This may improve locality among sleepers and wakers when there
-	 * is shared data.
-	 */
-	if (tryself && pri < TDQ_CPU(self)->tdq_lowpri) {
-		CTR1(KTR_ULE, "tryself %d",
-		    curthread->td_priority);
-		return (self);
-	}
-	/*
- 	 * Now search for the cpu running the lowest priority thread with
-	 * the least load.
-	 */
-	if (pick_pri)
-		cpu = tdq_lowestpri();
-	else
-		cpu = tdq_lowestload();
+	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
+		if (SCHED_AFFINITY(ts, cg->cg_level))
+			break;
+	cpu = -1;
+	if (cg)
+		cpu = sched_lowest(cg, pri);
+	if (cpu == -1)
+		cpu = sched_lowest(cpu_top, -1);
+	if (tryself) {
+		cg = TDQ_SELF()->tdq_cg;
+		if (0 && cg->cg_level)
+			self = sched_lowest(cg, -1);
+		if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+		    TDQ_CPU(self)->tdq_lowpri > pri)
+			cpu = self;
+	}
+	if (oldtryself && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+	    curthread->td_priority > pri)
+		cpu = self;
 	return (cpu);
 }
-
-#endif	/* SMP */
+#endif
 
 /*
  * Pick the highest priority task we have and return it.
@@ -1174,84 +1128,39 @@
 	runq_init(&tdq->tdq_realtime);
 	runq_init(&tdq->tdq_timeshare);
 	runq_init(&tdq->tdq_idle);
-	tdq->tdq_load = 0;
-}
-
-#ifdef SMP
-static void
-tdg_setup(struct tdq_group *tdg)
-{
-	if (bootverbose)
-		printf("ULE: setup cpu group %d\n", TDG_ID(tdg));
-	snprintf(tdg->tdg_name, sizeof(tdg->tdg_name),
-	    "sched lock %d", (int)TDG_ID(tdg));
-	mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock",
+	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
+	    "sched lock %d", (int)TDQ_ID(tdq));
+	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
 	    MTX_SPIN | MTX_RECURSE);
-	LIST_INIT(&tdg->tdg_members);
-	tdg->tdg_load = 0;
-	tdg->tdg_transferable = 0;
-	tdg->tdg_cpus = 0;
-	tdg->tdg_mask = 0;
-	tdg->tdg_cpumask = 0;
-	tdg->tdg_idlemask = 0;
 }
 
-static void
-tdg_add(struct tdq_group *tdg, struct tdq *tdq)
-{
-	if (tdg->tdg_mask == 0)
-		tdg->tdg_mask |= 1 << TDQ_ID(tdq);
-	tdg->tdg_cpumask |= 1 << TDQ_ID(tdq);
-	tdg->tdg_cpus++;
-	tdq->tdq_group = tdg;
-	tdq->tdq_lock = &tdg->tdg_lock;
-	LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
-	if (bootverbose)
-		printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n",
-		    TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask);
-}
-
-static void
-sched_setup_topology(void)
+#ifdef SMP
+static struct cpu_group *
+sched_find_group(int cpu)
 {
-	struct tdq_group *tdg;
 	struct cpu_group *cg;
-	int balance_groups;
-	struct tdq *tdq;
+	cpumask_t mask;
+	int children;
 	int i;
-	int j;
 
-	topology = 1;
-	balance_groups = 0;
-	for (i = 0; i < smp_topology->ct_count; i++) {
-		cg = &smp_topology->ct_group[i];
-		tdg = &tdq_groups[i];
-		/*
-		 * Initialize the group.
-		 */
-		tdg_setup(tdg);
-		/*
-		 * Find all of the group members and add them.
-		 */
-		for (j = 0; j < MAXCPU; j++) { 
-			if ((cg->cg_mask & (1 << j)) != 0) {
-				tdq = TDQ_CPU(j);
-				tdq_setup(tdq);
-				tdg_add(tdg, tdq);
-			}
-		}
-		if (tdg->tdg_cpus > 1)
-			balance_groups = 1;
+	mask = (1 << cpu);
+	cg = cpu_top;
+	for (;;) {
+		if ((cg->cg_mask & mask) == 0)
+			return (NULL);
+		if (cg->cg_children == 0)
+			return (cg);
+		children = cg->cg_children;
+		for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
+			if ((cg->cg_mask & mask) != 0)
+				break;
 	}
-	tdg_maxid = smp_topology->ct_count - 1;
-	if (balance_groups)
-		sched_balance_groups();
+	panic("How did I get here?");
 }
 
 static void
 sched_setup_smp(void)
 {
-	struct tdq_group *tdg;
 	struct tdq *tdq;
 	int cpus;
 	int i;
@@ -1259,37 +1168,149 @@
 	for (cpus = 0, i = 0; i < MAXCPU; i++) {
 		if (CPU_ABSENT(i))
 			continue;
-		tdq = &tdq_cpu[i];
-		tdg = &tdq_groups[i];
-		/*
-		 * Setup a tdq group with one member.
-		 */
-		tdg_setup(tdg);
+		tdq = TDQ_CPU(i);
 		tdq_setup(tdq);
-		tdg_add(tdg, tdq);
+		tdq->tdq_cg = sched_find_group(i);
 		cpus++;
 	}
-	tdg_maxid = cpus - 1;
 }
 
-/*
- * Fake a topology with one group containing all CPUs.
- */
-static void
-sched_fake_topo(void)
+struct cpu_group *cpu_topo_none(void);
+struct cpu_group *cpu_topo_1level(int l1share, int l1count, int l1flags);
+struct cpu_group *cpu_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags);
+
+struct cpu_group *
+cpu_topo_none(void)
+{
+	static struct cpu_group top;	/* Flat topology: one group. */
+
+	top.cg_parent = NULL;
+	top.cg_child = NULL;
+	top.cg_children = 0;
+	top.cg_level = CG_SHARE_NONE;
+	top.cg_flags = 0;
+	/* Real cpu mask: ids may be sparse, and 1 << mp_ncpus can overflow. */
+	top.cg_mask = all_cpus;
+	top.cg_count = mp_ncpus;
+	return (&top);
+}
+
+struct cpu_group *
+cpu_topo_1level(int l1share, int l1count, int l1flags)
 {
-#ifdef SCHED_FAKE_TOPOLOGY
-	static struct cpu_top top;
-	static struct cpu_group group;
+	static struct cpu_group group[16];
+	struct cpu_group *top;
+	struct cpu_group *l1g;
+	int l1, l1cpus, l1cpumask;
+
+	top = &group[0];
+	l1g = &group[1];
+	l1cpus = mp_ncpus / l1count;
+	l1cpumask = (1 << l1cpus) - 1;
+	top->cg_parent = NULL;
+	top->cg_child = l1g;
+	top->cg_mask = (1 << mp_ncpus) - 1;
+	top->cg_count = mp_ncpus;
+	top->cg_children = l1count;
+	top->cg_level = CG_SHARE_NONE;
+	top->cg_flags = 0;
+	for (l1 = 0; l1 < l1count; l1++, l1g++, l1cpumask <<= l1cpus) {
+		l1g->cg_parent = top;
+		l1g->cg_child = NULL;
+		l1g->cg_mask = l1cpumask;
+		l1g->cg_count = l1cpus;
+		l1g->cg_children = 0;
+		l1g->cg_level = l1share;
+		l1g->cg_flags = l1flags;
+	}
+
+	return (top);
+}
+
+struct cpu_group *
+cpu_topo_2level(int l2share, int l2count, int l1share, int l1count,
+    int l1flags)
+{
+	static struct cpu_group group[16];
+	struct cpu_group *top;
+	struct cpu_group *l1g;
+	struct cpu_group *l2g;
+	int l1, l2, l1cpus, l2cpus, l1cpumask, l2cpumask;
+
+	top = &group[0];
+	l2g = &group[1];
+	l1g = &group[1 + l2count];
+	l2cpus = mp_ncpus / l2count;
+	l1cpus = l2cpus / l1count;
+	l2cpumask = (1 << l2cpus) - 1;
+	l1cpumask = (1 << l1cpus) - 1;
+	top->cg_parent = NULL;
+	top->cg_child = l2g;
+	top->cg_mask = (1 << mp_ncpus) - 1;
+	top->cg_count = mp_ncpus;
+	top->cg_children = l2count;
+	top->cg_level = CG_SHARE_NONE;
+	top->cg_flags = 0;
+	for (l2 = 0; l2 < l2count; l2++, l2g++, l2cpumask <<= l2cpus) {
+		l2g->cg_parent = top;
+		l2g->cg_child = l1g;
+		l2g->cg_mask = l2cpumask;
+		l2g->cg_count = l2cpus;
+		l2g->cg_children = l1count;
+		l2g->cg_level = l2share;
+		l2g->cg_flags = 0;
+		for (l1 = 0; l1 < l1count; l1++, l1g++, l1cpumask <<= l1cpus) {
+			l1g->cg_parent = l2g;
+			l1g->cg_child = NULL;
+			l1g->cg_mask = l1cpumask;
+			l1g->cg_count = l1cpus;
+			l1g->cg_children = 0;
+			l1g->cg_level = l1share;
+			l1g->cg_flags = l1flags;
+		}
+	}
 
-	top.ct_count = 1;
-	top.ct_group = &group;
-	group.cg_mask = all_cpus;
-	group.cg_count = mp_ncpus;
-	group.cg_children = 0;
-	smp_topology = &top;
-#endif
+	return (top);
+}
+
+static struct cpu_group *
+sched_build_topo(void)
+{
+	/*
+	 * Dual core with no sharing.  NOTE(review): the only live case.
+	 */
+	if (1)	/* Hard-wired on; every case below is dead code. */
+		return cpu_topo_1level(CG_SHARE_NONE, 2, 0);
+	/*
+	 * Dual core with shared L2.
+	 */
+	if (0)
+		return cpu_topo_1level(CG_SHARE_L2, 2, 0);
+	/*
+	 * quad core barcelona: shared l3 among each package, private l2.
+	 */
+	if (0)
+		return cpu_topo_1level(CG_SHARE_L3, 4, 0);
+	/*
+	 * Intel quad core.  2 dualcore parts on each package share l2.
+	 * Package shares only system bus.
+	 */
+	if (0)
+		return cpu_topo_2level(CG_SHARE_NONE, 4, CG_SHARE_L2, 2, 0);
+	/*
+	 * Single-core 2xHTT
+	 */
+	if (0)
+		return cpu_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
+	/*
+	 * sun4v 4 cores with a shared l3 cache 8 threads sharing cache.
+	 */
+	if (0)	/* NOTE(review): needs 37 groups; cpu_topo_2level has 16. */
+		return cpu_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
+		    CG_FLAG_THREAD);
+	return (cpu_topo_none());	/* Not reached while a case is forced. */
 }
+
 #endif
 
 /*
@@ -1303,21 +1324,12 @@
 
 	tdq = TDQ_SELF();
 #ifdef SMP
-	sched_fake_topo();
-	/*
-	 * Setup tdqs based on a topology configuration or vanilla SMP based
-	 * on mp_maxid.
-	 */
-	if (smp_topology == NULL)
-		sched_setup_smp();
-	else 
-		sched_setup_topology();
+	cpu_top = sched_build_topo();
+	sched_setup_smp();
 	balance_tdq = tdq;
 	sched_balance();
 #else
 	tdq_setup(tdq);
-	mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE);
-	tdq->tdq_lock = &tdq_lock;
 #endif
 	/*
 	 * To avoid divide-by-zero, we set realstathz a dummy value
@@ -1331,6 +1343,7 @@
 	TDQ_LOCK(tdq);
 	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
 	tdq_load_add(tdq, &td_sched0);
+	tdq->tdq_lowpri = thread0.td_priority;
 	TDQ_UNLOCK(tdq);
 }
 
@@ -1369,7 +1382,7 @@
 	 * prevents excess thrashing on large machines and excess idle on
 	 * smaller machines.
 	 */
-	steal_thresh = min(ffs(mp_ncpus) - 1, 4);
+	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
 	affinity = SCHED_AFFINITY_DEFAULT;
 #endif
 }
@@ -1620,12 +1633,15 @@
 #ifdef SMP
 	} else if (TD_IS_RUNNING(td)) {
 		struct tdq *tdq;
+		int oldpri;
 
 		tdq = TDQ_CPU(ts->ts_cpu);
-		if (prio < tdq->tdq_lowpri ||
-		   (td->td_priority == tdq->tdq_lowpri && tdq->tdq_load <= 1))
-			tdq->tdq_lowpri = prio;
+		oldpri = td->td_priority;
 		td->td_priority = prio;
+		if (prio < tdq->tdq_lowpri)
+			tdq->tdq_lowpri = prio;
+		else if (tdq->tdq_lowpri == oldpri)
+			tdq_setlowpri(tdq, td);
 #endif
 	} else
 		td->td_priority = prio;
@@ -2063,15 +2079,11 @@
 		struct tdq *tdq;
 
 		tdq = TDQ_CPU(td->td_sched->ts_cpu);
-		if (THREAD_CAN_MIGRATE(td)) {
+		if (THREAD_CAN_MIGRATE(td))
 			tdq->tdq_transferable--;
-			tdq->tdq_group->tdg_transferable--;
-		}
 		td->td_pri_class = class;
-		if (THREAD_CAN_MIGRATE(td)) {
+		if (THREAD_CAN_MIGRATE(td))
 			tdq->tdq_transferable++;
-			tdq->tdq_group->tdg_transferable++;
-		}
 	}
 #endif
 	td->td_pri_class = class;
@@ -2149,6 +2161,8 @@
 		thread_lock(td);
 		td->td_priority = td->td_user_pri;
 		td->td_base_pri = td->td_user_pri;
+		if (lowpri_userret)
+			tdq_setlowpri(TDQ_SELF(), td);
 		thread_unlock(td);
         }
 }
@@ -2172,8 +2186,6 @@
 	if (balance_tdq == tdq) {
 		if (balance_ticks && --balance_ticks == 0)
 			sched_balance();
-		if (balance_group_ticks && --balance_group_ticks == 0)
-			sched_balance_groups();
 	}
 #endif
 	/*
@@ -2261,11 +2273,7 @@
 struct thread *
 sched_choose(void)
 {
-#ifdef SMP
-	struct tdq_group *tdg;
-#endif
 	struct td_sched *ts;
-	struct thread *td;
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
@@ -2275,20 +2283,7 @@
 		tdq_runq_rem(tdq, ts);
 		return (ts->ts_thread);
 	}
-	td = PCPU_GET(idlethread);
-#ifdef SMP
-	/*
-	 * We only set the idled bit when all of the cpus in the group are
-	 * idle.  Otherwise we could get into a situation where a thread bounces
-	 * back and forth between two idle cores on seperate physical CPUs.
-	 */
-	tdg = tdq->tdq_group;
-	tdg->tdg_idlemask |= PCPU_GET(cpumask);
-	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
-		atomic_set_int(&tdq_idle, tdg->tdg_mask);
-	tdq->tdq_lowpri = td->td_priority;
-#endif
-	return (td);
+	return (PCPU_GET(idlethread));
 }
 
 /*
@@ -2305,7 +2300,7 @@
 	ctd = curthread;
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
-	if (td->td_priority < ctd->td_priority)
+	if (td->td_priority < cpri)
 		curthread->td_flags |= TDF_NEEDRESCHED;
 	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
 		return;
@@ -2329,9 +2324,6 @@
 {
 	struct td_sched *ts;
 	int class;
-#ifdef SMP
-	int cpumask;
-#endif
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
@@ -2356,25 +2348,6 @@
 	else
 		ts->ts_runq = &tdq->tdq_idle;
 #ifdef SMP
-	cpumask = 1 << ts->ts_cpu;
-	/*
-	 * If we had been idle, clear our bit in the group and potentially
-	 * the global bitmap.
-	 */
-	if ((class != PRI_IDLE && class != PRI_ITHD) &&
-	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
-		/*
-		 * Check to see if our group is unidling, and if so, remove it
-		 * from the global idle mask.
-		 */
-		if (tdq->tdq_group->tdg_idlemask ==
-		    tdq->tdq_group->tdg_cpumask)
-			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
-		/*
-		 * Now remove ourselves from the group specific idle mask.
-		 */
-		tdq->tdq_group->tdg_idlemask &= ~cpumask;
-	}
 	if (td->td_priority < tdq->tdq_lowpri)
 		tdq->tdq_lowpri = td->td_priority;
 #endif
@@ -2462,6 +2435,10 @@
 	tdq_runq_rem(tdq, ts);
 	tdq_load_rem(tdq, ts);
 	TD_SET_CAN_RUN(td);
+#ifdef SMP
+	if (td->td_priority == tdq->tdq_lowpri)
+		tdq_setlowpri(tdq, NULL);
+#endif
 }
 
 /*
@@ -2563,8 +2540,8 @@
 	int i;
 
 	total = 0;
-	for (i = 0; i <= tdg_maxid; i++)
-		total += TDQ_GROUP(i)->tdg_load;
+	for (i = 0; i <= mp_maxid; i++)
+		total += TDQ_CPU(i)->tdq_sysload;
 	return (total);
 #else
 	return (TDQ_SELF()->tdq_sysload);
@@ -2658,6 +2635,9 @@
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
 	lock_profile_obtain_lock_success(
 	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+#ifdef SMP
+	tdq->tdq_lowpri = td->td_priority;
+#endif
 }
 
 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
@@ -2676,6 +2656,8 @@
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
 SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, userret, CTLFLAG_RW, &lowpri_userret, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, oldtryself, CTLFLAG_RW, &oldtryself, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
     "Enables the long-term load balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,