Index: lib/libc/sys/Symbol.map =================================================================== RCS file: /home/ncvs/src/lib/libc/sys/Symbol.map,v retrieving revision 1.9 diff -u -p -r1.9 Symbol.map --- lib/libc/sys/Symbol.map 22 Aug 2007 01:56:35 -0000 1.9 +++ lib/libc/sys/Symbol.map 2 Mar 2008 02:57:58 -0000 @@ -66,6 +66,11 @@ FBSD_1.0 { clock_settime; close; connect; + cpuset; + cpuset_setid; + cpuset_getid; + cpuset_setaffinity; + cpuset_getaffinity; dup; dup2; eaccess; @@ -450,6 +455,16 @@ FBSDprivate_1.0 { __sys_close; _connect; __sys_connect; + __cpuset; + __sys_cpuset; + __cpuset_setid; + __sys_cpuset_setid; + __cpuset_getid; + __sys_cpuset_getid; + __cpuset_setaffinity; + __sys_cpuset_setaffinity; + __cpuset_getaffinity; + __sys_cpuset_getaffinity; _dup; __sys_dup; _dup2; Index: sys/amd64/amd64/identcpu.c =================================================================== RCS file: /home/ncvs/src/sys/amd64/amd64/identcpu.c,v retrieving revision 1.157 diff -u -p -r1.157 identcpu.c --- sys/amd64/amd64/identcpu.c 2 Feb 2008 23:17:27 -0000 1.157 +++ sys/amd64/amd64/identcpu.c 2 Mar 2008 02:58:05 -0000 @@ -97,6 +97,10 @@ static struct { { "Sledgehammer", CPUCLASS_K8 }, /* CPU_SLEDGEHAMMER */ }; +int cpu_cores; +int cpu_logical; + + extern int pq_l2size; extern int pq_l2nways; @@ -360,11 +364,13 @@ printcpuinfo(void) if ((regs[0] & 0x1f) != 0) cmp = ((regs[0] >> 26) & 0x3f) + 1; } + cpu_cores = cmp; + cpu_logical = htt / cmp; if (cmp > 1) printf("\n Cores per package: %d", cmp); if ((htt / cmp) > 1) printf("\n Logical CPUs per core: %d", - htt / cmp); + cpu_logical); } } /* Avoid ugly blank lines: only print newline when we have to. */ Index: sys/amd64/amd64/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/amd64/amd64/mp_machdep.c,v retrieving revision 1.287 diff -u -p -r1.287 mp_machdep.c --- sys/amd64/amd64/mp_machdep.c 2 Aug 2007 21:17:58 -0000 1.287 +++ sys/amd64/amd64/mp_machdep.c 2 Mar 2008 02:58:05 -0000 @@ -83,12 +83,6 @@ extern int nkpt; extern struct pcpu __pcpu[]; -/* - * CPU topology map datastructures for HTT. - */ -static struct cpu_group mp_groups[MAXCPU]; -static struct cpu_top mp_top; - /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; @@ -182,40 +176,38 @@ mem_range_AP_init(void) mem_range_softc.mr_op->initAP(&mem_range_softc); } -void -mp_topology(void) +struct cpu_group * +cpu_topo(void) { - struct cpu_group *group; - int apic_id; - int groups; - int cpu; - - /* Build the smp_topology map. */ - /* Nothing to do if there is no HTT support. */ - if (hyperthreading_cpus <= 1) - return; - group = &mp_groups[0]; - groups = 1; - for (cpu = 0, apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) { - if (!cpu_info[apic_id].cpu_present) - continue; - /* - * If the current group has members and we're not a logical - * cpu, create a new group. - */ - if (group->cg_count != 0 && - (apic_id % hyperthreading_cpus) == 0) { - group++; - groups++; - } - group->cg_count++; - group->cg_mask |= 1 << cpu; - cpu++; + if (cpu_cores == 0) + cpu_cores = 1; + if (cpu_logical == 0) + cpu_logical = 1; + if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { + printf("WARNING: Non-uniform processors.\n"); + printf("WARNING: Using suboptimal topology.\n"); + return (smp_topo_none()); } - - mp_top.ct_count = groups; - mp_top.ct_group = mp_groups; - smp_topology = &mp_top; + /* + * No multi-core or hyper-threaded. 
+ */ + if (cpu_logical * cpu_cores == 1) + return (smp_topo_none()); + /* + * Only HTT no multi-core. + */ + if (cpu_logical > 1 && cpu_cores == 1) + return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); + /* + * Only multi-core no HTT. + */ + if (cpu_cores > 1 && cpu_logical == 1) + return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); + /* + * Both HTT and multi-core. + */ + return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, + CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); } /* @@ -409,9 +401,6 @@ cpu_mp_start(void) } set_interrupt_apic_ids(); - - /* Last, setup the cpu topology now that we have probed CPUs */ - mp_topology(); } Index: sys/amd64/include/smp.h =================================================================== RCS file: /home/ncvs/src/sys/amd64/include/smp.h,v retrieving revision 1.91 diff -u -p -r1.91 smp.h --- sys/amd64/include/smp.h 20 Sep 2007 20:38:43 -0000 1.91 +++ sys/amd64/include/smp.h 2 Mar 2008 02:58:05 -0000 @@ -36,6 +36,10 @@ extern int boot_cpu_id; extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; +/* global data in identcpu.c */ +extern int cpu_cores; +extern int cpu_logical; + /* IPI handlers */ inthand_t IDTVEC(invltlb), /* TLB shootdowns - global */ @@ -57,7 +61,6 @@ void ipi_self(u_int ipi); void ipi_bitmap_handler(struct trapframe frame); u_int mp_bootaddress(u_int); int mp_grab_cpu_hlt(void); -void mp_topology(void); void smp_cache_flush(void); void smp_invlpg(vm_offset_t addr); void smp_masked_invlpg(u_int mask, vm_offset_t addr); Index: sys/conf/files =================================================================== RCS file: /home/ncvs/src/sys/conf/files,v retrieving revision 1.1269 diff -u -p -r1.1269 files --- sys/conf/files 20 Feb 2008 07:50:13 -0000 1.1269 +++ sys/conf/files 2 Mar 2008 02:58:06 -0000 @@ -1426,6 +1426,7 @@ kern/kern_clock.c standard kern/kern_condvar.c standard kern/kern_conf.c standard kern/kern_cpu.c standard +kern/kern_cpuset.c standard kern/kern_context.c standard kern/kern_descrip.c standard kern/kern_environment.c standard Index: sys/i386/i386/identcpu.c =================================================================== RCS file: /home/ncvs/src/sys/i386/i386/identcpu.c,v retrieving revision 1.180 diff -u -p -r1.180 identcpu.c --- sys/i386/i386/identcpu.c 29 May 2007 19:39:18 -0000 1.180 +++ sys/i386/i386/identcpu.c 2 Mar 2008 02:58:09 -0000 @@ -141,6 +141,9 @@ static struct { { "Pentium 4", CPUCLASS_686 }, /* CPU_P4 */ }; +int cpu_cores; +int cpu_logical; + #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif @@ -874,11 +877,13 @@ via_common: if ((regs[0] & 0x1f) != 0) cmp = ((regs[0] >> 26) & 0x3f) + 1; } + cpu_cores = cmp; + cpu_logical = htt / cmp; if (cmp > 1) printf("\n Cores per package: %d", cmp); if ((htt / cmp) > 1) printf("\n Logical CPUs per core: %d", - htt / cmp); + cpu_logical); } } else if (strcmp(cpu_vendor, "CyrixInstead") == 0) { printf(" DIR=0x%04x", cyrix_did); Index: sys/i386/i386/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/i386/i386/mp_machdep.c,v retrieving revision 1.282 diff -u -p -r1.282 mp_machdep.c --- sys/i386/i386/mp_machdep.c 13 Nov 2007 23:00:24 -0000 1.282 +++ sys/i386/i386/mp_machdep.c 2 Mar 2008 02:58:09 -0000 @@ -135,12 +135,6 @@ extern int nkpt; extern struct pcpu __pcpu[]; -/* - * CPU topology map datastructures for HTT. - */ -static struct cpu_group mp_groups[MAXCPU]; -static struct cpu_top mp_top; - /* AP uses this during bootstrap. 
Do not staticize. */ char *bootSTK; static int bootAP; @@ -238,40 +232,38 @@ mem_range_AP_init(void) mem_range_softc.mr_op->initAP(&mem_range_softc); } -void -mp_topology(void) +struct cpu_group * +cpu_topo(void) { - struct cpu_group *group; - int apic_id; - int groups; - int cpu; - - /* Build the smp_topology map. */ - /* Nothing to do if there is no HTT support. */ - if (hyperthreading_cpus <= 1) - return; - group = &mp_groups[0]; - groups = 1; - for (cpu = 0, apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) { - if (!cpu_info[apic_id].cpu_present) - continue; - /* - * If the current group has members and we're not a logical - * cpu, create a new group. - */ - if (group->cg_count != 0 && - (apic_id % hyperthreading_cpus) == 0) { - group++; - groups++; - } - group->cg_count++; - group->cg_mask |= 1 << cpu; - cpu++; + if (cpu_cores == 0) + cpu_cores = 1; + if (cpu_logical == 0) + cpu_logical = 1; + if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { + printf("WARNING: Non-uniform processors.\n"); + printf("WARNING: Using suboptimal topology.\n"); + return (smp_topo_none()); } - - mp_top.ct_count = groups; - mp_top.ct_group = mp_groups; - smp_topology = &mp_top; + /* + * No multi-core or hyper-threaded. + */ + if (cpu_logical * cpu_cores == 1) + return (smp_topo_none()); + /* + * Only HTT no multi-core. + */ + if (cpu_logical > 1 && cpu_cores == 1) + return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); + /* + * Only multi-core no HTT. + */ + if (cpu_cores > 1 && cpu_logical == 1) + return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); + /* + * Both HTT and multi-core. + */ + return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, + CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); } @@ -459,9 +451,6 @@ cpu_mp_start(void) } set_interrupt_apic_ids(); - - /* Last, setup the cpu topology now that we have probed CPUs */ - mp_topology(); } Index: sys/i386/include/smp.h =================================================================== RCS file: /home/ncvs/src/sys/i386/include/smp.h,v retrieving revision 1.90 diff -u -p -r1.90 smp.h --- sys/i386/include/smp.h 20 Sep 2007 20:38:43 -0000 1.90 +++ sys/i386/include/smp.h 2 Mar 2008 02:58:10 -0000 @@ -45,6 +45,10 @@ extern u_long *ipi_rendezvous_counts[MAX extern u_long *ipi_lazypmap_counts[MAXCPU]; #endif +/* global data in identcpu.c */ +extern int cpu_cores; +extern int cpu_logical; + /* IPI handlers */ inthand_t IDTVEC(invltlb), /* TLB shootdowns - global */ @@ -67,7 +71,6 @@ void ipi_self(u_int ipi); void ipi_bitmap_handler(struct trapframe frame); u_int mp_bootaddress(u_int); int mp_grab_cpu_hlt(void); -void mp_topology(void); void smp_cache_flush(void); void smp_invlpg(vm_offset_t addr); void smp_masked_invlpg(u_int mask, vm_offset_t addr); Index: sys/ia64/ia64/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/ia64/ia64/mp_machdep.c,v retrieving revision 1.67 diff -u -p -r1.67 mp_machdep.c --- sys/ia64/ia64/mp_machdep.c 6 Aug 2007 05:15:57 -0000 1.67 +++ sys/ia64/ia64/mp_machdep.c 2 Mar 2008 02:58:10 -0000 @@ -84,6 +84,13 @@ volatile int ap_spin; static void cpu_mp_unleash(void *); +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + void ia64_ap_startup(void) { Index: sys/kern/init_main.c =================================================================== RCS file: /home/ncvs/src/sys/kern/init_main.c,v retrieving revision 1.290 diff -u -p -r1.290 init_main.c --- sys/kern/init_main.c 10 Jan 2008 22:11:20 -0000 1.290 +++ sys/kern/init_main.c 2 Mar 2008 02:58:10 -0000 @@ 
-73,6 +73,7 @@ __FBSDID("$FreeBSD: src/sys/kern/init_ma #include #include #include +#include #include @@ -430,6 +431,7 @@ proc0_init(void *dummy __unused) td->td_base_pri = PUSER; td->td_oncpu = 0; td->td_flags = TDF_INMEM|TDP_KTHREAD; + td->td_cpuset = cpuset_thread0(); p->p_peers = 0; p->p_leader = p; Index: sys/kern/init_sysent.c =================================================================== RCS file: /home/ncvs/src/sys/kern/init_sysent.c,v retrieving revision 1.233 diff -u -p -r1.233 init_sysent.c --- sys/kern/init_sysent.c 12 Feb 2008 20:11:54 -0000 1.233 +++ sys/kern/init_sysent.c 2 Mar 2008 02:58:10 -0000 @@ -2,7 +2,7 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/init_sysent.c,v 1.233 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -513,4 +513,9 @@ struct sysent sysent[] = { { AS(thr_kill2_args), (sy_call_t *)thr_kill2, AUE_KILL, NULL, 0, 0 }, /* 481 = thr_kill2 */ { AS(shm_open_args), (sy_call_t *)shm_open, AUE_SHMOPEN, NULL, 0, 0 }, /* 482 = shm_open */ { AS(shm_unlink_args), (sy_call_t *)shm_unlink, AUE_SHMUNLINK, NULL, 0, 0 }, /* 483 = shm_unlink */ + { AS(cpuset_args), (sy_call_t *)cpuset, AUE_NULL, NULL, 0, 0 }, /* 484 = cpuset */ + { AS(cpuset_setid_args), (sy_call_t *)cpuset_setid, AUE_NULL, NULL, 0, 0 }, /* 485 = cpuset_setid */ + { AS(cpuset_getid_args), (sy_call_t *)cpuset_getid, AUE_NULL, NULL, 0, 0 }, /* 486 = cpuset_getid */ + { AS(cpuset_getaffinity_args), (sy_call_t *)cpuset_getaffinity, AUE_NULL, NULL, 0, 0 }, /* 487 = cpuset_getaffinity */ + { AS(cpuset_setaffinity_args), (sy_call_t *)cpuset_setaffinity, AUE_NULL, NULL, 0, 0 }, /* 488 = cpuset_setaffinity */ }; Index: sys/kern/kern_cpuset.c =================================================================== RCS file: sys/kern/kern_cpuset.c diff -N sys/kern/kern_cpuset.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/kern/kern_cpuset.c 2 Mar 2008 02:58:10 -0000 @@ -0,0 +1,907 @@ +/*- + * Copyright (c) 2008, Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * cpusets provide a mechanism for creating and manipulating sets of + * processors for the purpose of constraining the scheduling of threads to + * specific processors. + * + * Each process belongs to an identified set, by default this is set 1. Each + * thread may further restrict the cpus it may run on to a subset of this + * named set. This creates an anonymous set which other threads and processes + * may not join by number. + * + * The named set is referred to herein as the 'base' set to avoid ambiguity. + * This set is usually a child of a 'root' set while the anonymous set may + * simply be referred to as a mask. In the syscall api these are referred to + * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. + * + * Threads inherit their set from their creator whether it be anonymous or + * not. This means that anonymous sets are immutable because they may be + * shared. To modify an anonymous set a new set is created with the desired + * mask and the same parent as the existing anonymous set. This gives the + * illusion of each thread having a private mask. + * + * Via the syscall apis a user may ask to retrieve or modify the root, base, + * or mask that is discovered via a pid, tid, or setid. Modifying a set + * modifies all numbered and anonymous child sets to comply with the new mask. + * Modifying a pid or tid's mask applies only to that tid but must still + * exist within the assigned parent set. + * + * A thread may not be assigned to a group separate from other threads in + * the process. This is to remove ambiguity when the setid is queried with + * a pid argument. There is no other technical limitation. + * + * This somewhat complex arrangement is intended to make it easy for + * applications to query available processors and bind their threads to + * specific processors while also allowing administrators to dynamically + * reprovision by changing sets which apply to groups of processes. + * + * A simple application should not concern itself with sets at all and + * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id + * meaning 'curthread'. It may query available cpus for that tid with a + * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). + */ +static uma_zone_t cpuset_zone; +static struct mtx cpuset_lock; +static struct setlist cpuset_ids; +struct cpuset *cpuset_zero; +static struct unrhdr *cpuset_unr; + +/* + * Acquire a reference to a cpuset, all pointers must be tracked with refs. + */ +struct cpuset * +cpuset_ref(struct cpuset *set) +{ + + refcount_acquire(&set->cs_ref); + return (set); +} + +/* + * Release a reference in a context where it is safe to allocate. + */ +void +cpuset_rel(struct cpuset *set) +{ + cpusetid_t id; + + if (refcount_release(&set->cs_ref) == 0) + return; + mtx_lock_spin(&cpuset_lock); + LIST_REMOVE(set, cs_siblings); + id = set->cs_id; + if (id != CPUSET_INVALID) + LIST_REMOVE(set, cs_link); + mtx_unlock_spin(&cpuset_lock); + cpuset_rel(set->cs_parent); + uma_zfree(cpuset_zone, set); + if (id != CPUSET_INVALID) + free_unr(cpuset_unr, id); +} + +/* + * Deferred release must be used when in a context that is not safe to + * allocate/free. This places any unreferenced sets on the list 'head'.
+ */ +static void +cpuset_rel_defer(struct setlist *head, struct cpuset *set) +{ + + if (refcount_release(&set->cs_ref) == 0) + return; + mtx_lock_spin(&cpuset_lock); + LIST_REMOVE(set, cs_siblings); + if (set->cs_id != CPUSET_INVALID) + LIST_REMOVE(set, cs_link); + LIST_INSERT_HEAD(head, set, cs_link); + mtx_unlock_spin(&cpuset_lock); +} + +/* + * Complete a deferred release. Removes the set from the list provided to + * cpuset_rel_defer. + */ +static void +cpuset_rel_complete(struct cpuset *set) +{ + LIST_REMOVE(set, cs_link); + cpuset_rel(set->cs_parent); + uma_zfree(cpuset_zone, set); +} + +/* + * Find a set based on an id. Returns it with a ref. + */ +static struct cpuset * +cpuset_lookup(cpusetid_t setid) +{ + struct cpuset *set; + + if (setid == CPUSET_INVALID) + return (NULL); + mtx_lock_spin(&cpuset_lock); + LIST_FOREACH(set, &cpuset_ids, cs_link) + if (set->cs_id == setid) + break; + if (set) + cpuset_ref(set); + mtx_unlock_spin(&cpuset_lock); + return (set); +} + +/* + * Create a set in the space provided in 'set' with the provided parameters. + * The set is returned with a single ref. May return EDEADLK if the set + * will have no valid cpu based on restrictions from the parent. + */ +static int +_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask, + cpusetid_t id) +{ + int error; + + error = 0; + CPU_COPY(mask, &set->cs_mask); + LIST_INIT(&set->cs_children); + refcount_init(&set->cs_ref, 1); + set->cs_flags = 0; + mtx_lock_spin(&cpuset_lock); + CPU_AND(mask, &parent->cs_mask); + if (!CPU_EMPTY(mask)) { + set->cs_id = id; + set->cs_parent = cpuset_ref(parent); + LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); + if (set->cs_id != CPUSET_INVALID) + LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); + } else + error = EDEADLK; + mtx_unlock_spin(&cpuset_lock); + + return (error); +} + +/* + * Create a new non-anonymous set with the requested parent and mask. May + * return failures if the mask is invalid or a new number can not be + * allocated. + */ +static int +cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask) +{ + struct cpuset *set; + cpusetid_t id; + int error; + + id = alloc_unr(cpuset_unr); + if (id == -1) + return (ENFILE); + *setp = set = uma_zalloc(cpuset_zone, M_WAITOK); + error = _cpuset_create(set, parent, mask, id); + if (error == 0) + return (0); + free_unr(cpuset_unr, id); + uma_zfree(cpuset_zone, set); + + return (error); +} + +/* + * Recursively check for errors that would occur from applying mask to + * the tree of sets starting at 'set'. Checks for sets that would become + * empty as well as RDONLY flags. + */ +static int +cpuset_testupdate(struct cpuset *set, cpuset_t *mask) +{ + struct cpuset *nset; + cpuset_t newmask; + int error; + + mtx_assert(&cpuset_lock, MA_OWNED); + if (set->cs_flags & CPU_SET_RDONLY) + return (EPERM); + error = 0; + CPU_COPY(&set->cs_mask, &newmask); + CPU_AND(&newmask, mask); + if (CPU_EMPTY(&newmask)) + return (EDEADLK); + LIST_FOREACH(nset, &set->cs_children, cs_siblings) + if ((error = cpuset_testupdate(nset, &newmask)) != 0) + break; + return (error); +} + +/* + * Applies the mask 'mask' without checking for empty sets or permissions. + */ +static void +cpuset_update(struct cpuset *set, cpuset_t *mask) +{ + struct cpuset *nset; + + mtx_assert(&cpuset_lock, MA_OWNED); + CPU_AND(&set->cs_mask, mask); + LIST_FOREACH(nset, &set->cs_children, cs_siblings) + cpuset_update(nset, &set->cs_mask); + + return; +} + +/* + * Modify the set 'set' to use a copy of the mask provided. 
Apply this new + * mask to restrict all children in the tree. Checks for validity before + * applying the changes. + */ +static int +cpuset_modify(struct cpuset *set, cpuset_t *mask) +{ + int error; + + error = suser(curthread); + if (error) + return (error); + mtx_lock_spin(&cpuset_lock); + error = cpuset_testupdate(set, mask); + if (error) + goto out; + cpuset_update(set, mask); + CPU_COPY(mask, &set->cs_mask); +out: + mtx_unlock_spin(&cpuset_lock); + + return (error); +} + +/* + * Walks up the tree from 'set' to find the root. Returns the root + * referenced. + */ +static struct cpuset * +cpuset_root(struct cpuset *set) +{ + + mtx_lock_spin(&cpuset_lock); + for (; set->cs_parent != NULL; set = set->cs_parent) + if (set->cs_flags & CPU_SET_ROOT) + break; + cpuset_ref(set); + mtx_unlock_spin(&cpuset_lock); + + return (set); +} + +/* + * Find the first non-anonymous set starting from 'set'. Returns this set + * referenced. May return the passed in set with an extra ref if it is + * not anonymous. + */ +static struct cpuset * +cpuset_base(struct cpuset *set) +{ + + mtx_lock_spin(&cpuset_lock); + if (set->cs_id == CPUSET_INVALID) + set = set->cs_parent; + cpuset_ref(set); + mtx_unlock_spin(&cpuset_lock); + + return (set); +} + +/* + * Resolve the 'which' parameter of several cpuset apis. + * + * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also + * checks for permission via p_cansched(). + * + * For WHICH_SET returns a valid set with a new reference. + * + * -1 may be supplied for any argument to mean the current proc/thread or + * the base set of the current thread. May fail with ESRCH/EPERM. + */ +static int +cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, + struct cpuset **setp) +{ + struct cpuset *set; + struct thread *td; + struct proc *p; + int error; + + *pp = p = NULL; + *tdp = td = NULL; + *setp = set = NULL; + switch (which) { + case CPU_WHICH_PID: + if (id == -1) { + PROC_LOCK(curproc); + p = curproc; + break; + } + if ((p = pfind(id)) == NULL) + return (ESRCH); + break; + case CPU_WHICH_TID: + if (id == -1) { + PROC_LOCK(curproc); + p = curproc; + td = curthread; + break; + } + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + PROC_SLOCK(p); + FOREACH_THREAD_IN_PROC(p, td) + if (td->td_tid == id) + break; + PROC_SUNLOCK(p); + if (td != NULL) + break; + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + if (td == NULL) + return (ESRCH); + break; + case CPU_WHICH_CPUSET: + if (id == -1) { + thread_lock(curthread); + set = cpuset_base(curthread->td_cpuset); + thread_unlock(curthread); + } else + set = cpuset_lookup(id); + if (set) { + *setp = set; + return (0); + } + return (ESRCH); + default: + return (EINVAL); + } + error = p_cansched(curthread, p); + if (error) { + PROC_UNLOCK(p); + return (error); + } + if (td == NULL) + td = FIRST_THREAD_IN_PROC(p); + *pp = p; + *tdp = td; + return (0); +} + +/* + * Create an anonymous set with the provided mask in the space provided by + * 'fset'. If the passed in set is anonymous we use its parent otherwise + * the new set is a child of 'set'. + */ +static int +cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask) +{ + struct cpuset *parent; + + if (set->cs_id == CPUSET_INVALID) + parent = set->cs_parent; + else + parent = set; + return (_cpuset_create(fset, parent, mask, CPUSET_INVALID)); +} + +/* + * Handle two cases for replacing the base set or mask of an entire process. + * + * 1) Set is non-null and mask is null. 
This reparents all anonymous sets + * to the provided set and replaces all non-anonymous td_cpusets with the + * provided set. + * 2) Mask is non-null and set is null. This replaces or creates anonymous + * sets for every thread with the existing base as a parent. + * + * This is overly complicated because we can't allocate while holding a + * spinlock and spinlocks must be held while changing and examining thread + * state. + */ +static int +cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) +{ + struct setlist freelist; + struct setlist droplist; + struct cpuset *nset; + struct thread *td; + struct proc *p; + int threads; + int nfree; + int error; + /* + * The algorithm requires two passes due to locking considerations. + * + * 1) Lookup the process and acquire the locks in the required order. + * 2) If enough cpusets have not been allocated release the locks and + * allocate them. Loop. + */ + LIST_INIT(&freelist); + LIST_INIT(&droplist); + nfree = 0; + for (;;) { + error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); + if (error) + goto out; + PROC_SLOCK(p); + if (nfree >= p->p_numthreads) + break; + threads = p->p_numthreads; + PROC_SUNLOCK(p); + PROC_UNLOCK(p); + for (; nfree < threads; nfree++) { + nset = uma_zalloc(cpuset_zone, M_WAITOK); + LIST_INSERT_HEAD(&freelist, nset, cs_link); + } + } + PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + /* + * Now that the appropriate locks are held and we have enough cpusets, + * replace each thread's cpuset while using deferred release. We + * must do this because the PROC_SLOCK has to be held while traversing + * the thread list and this limits the type of operations allowed. + */ + error = 0; + FOREACH_THREAD_IN_PROC(p, td) { + struct cpuset *tdset; + thread_lock(td); + /* + * If we presently have an anonymous set or are applying a + * mask we must create an anonymous shadow set. That is + * either parented to our existing base or the supplied set. + * + * If we have a base set with no anonymous shadow we simply + * replace it outright. + */ + tdset = td->td_cpuset; + if (tdset->cs_id == CPUSET_INVALID || mask) { + nset = LIST_FIRST(&freelist); + LIST_REMOVE(nset, cs_link); + if (mask) + error = cpuset_shadow(tdset, nset, mask); + else + error = _cpuset_create(nset, set, + &tdset->cs_mask, CPUSET_INVALID); + if (error) { + LIST_INSERT_HEAD(&freelist, nset, cs_link); + thread_unlock(td); + break; + } + } else + nset = cpuset_ref(set); + cpuset_rel_defer(&droplist, tdset); + td->td_cpuset = nset; + sched_affinity(td); + thread_unlock(td); + } + PROC_SUNLOCK(p); + PROC_UNLOCK(p); +out: + while ((nset = LIST_FIRST(&droplist)) != NULL) + cpuset_rel_complete(nset); + while ((nset = LIST_FIRST(&freelist)) != NULL) { + LIST_REMOVE(nset, cs_link); + uma_zfree(cpuset_zone, nset); + } + return (error); +} + +/* + * Apply an anonymous mask to a single thread. + */ +static int +cpuset_setthread(lwpid_t id, cpuset_t *mask) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *td; + struct proc *p; + int error; + + nset = uma_zalloc(cpuset_zone, M_WAITOK); + error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &nset); + if (error) + goto out; + thread_lock(td); + set = td->td_cpuset; + error = cpuset_shadow(set, nset, mask); + if (error == 0) { + cpuset_rel(td->td_cpuset); + td->td_cpuset = nset; + sched_affinity(td); + nset = NULL; + } + thread_unlock(td); + PROC_UNLOCK(p); +out: + if (nset) + uma_zfree(cpuset_zone, nset); + return (error); +} + +/* + * Creates the cpuset for thread0. 
We make two sets: + * + * 0 - The root set which should represent all valid processors in the + * system. It is initially created with a mask of all processors + * because we don't know what processors are valid until cpuset_init() + * runs. This set is immutable. + * 1 - The default set which all processes are a member of until changed. + * This allows an administrator to move all threads off of given cpus to + * dedicate them to high priority tasks or save power etc. + */ +struct cpuset * +cpuset_thread0(void) +{ + struct cpuset *set; + int error; + + cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); + /* + * Create the root system set for the whole machine. Doesn't use + * cpuset_create() due to NULL parent. + */ + set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); + set->cs_mask.__bits[0] = -1; + LIST_INIT(&set->cs_children); + LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); + set->cs_ref = 1; + set->cs_flags = CPU_SET_ROOT; + cpuset_zero = set; + /* + * Now derive a default, modifiable set from that to give out. + */ + set = uma_zalloc(cpuset_zone, M_WAITOK); + error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1); + KASSERT(error == 0, ("Error creating default set: %d\n", error)); + /* + * Initialize the unit allocator. 0 and 1 are allocated above. + */ + cpuset_unr = new_unrhdr(2, INT_MAX, NULL); + + return (set); +} + +/* + * This is called once the final set of system cpus is known. Modifies + * the root set and all children and mark the root readonly. + */ +static void +cpuset_init(void *arg) +{ + cpuset_t mask; + + CPU_ZERO(&mask); +#ifdef SMP + mask.__bits[0] = all_cpus; +#else + mask.__bits[0] = 1; +#endif + if (cpuset_modify(cpuset_zero, &mask)) + panic("Can't set initial cpuset mask.\n"); + cpuset_zero->cs_flags |= CPU_SET_RDONLY; +} +SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_args { + cpusetid_t *setid; +}; +#endif +int +cpuset(struct thread *td, struct cpuset_args *uap) +{ + struct cpuset *root; + struct cpuset *set; + int error; + + thread_lock(td); + root = cpuset_root(td->td_cpuset); + thread_unlock(td); + error = cpuset_create(&set, root, &root->cs_mask); + cpuset_rel(root); + if (error) + return (error); + error = cpuset_setproc(-1, set, NULL); + if (error == 0) + error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); + cpuset_rel(set); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_setid_args { + cpuwhich_t which; + id_t id; + cpusetid_t setid; +}; +#endif +int +cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) +{ + struct cpuset *set; + int error; + + /* + * Presently we only support per-process sets. 
+ */ + if (uap->which != CPU_WHICH_PID) + return (EINVAL); + set = cpuset_lookup(uap->setid); + if (set == NULL) + return (ESRCH); + error = cpuset_setproc(uap->id, set, NULL); + cpuset_rel(set); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_getid_args { + cpulevel_t level; + cpuwhich_t which; + id_t id; + cpusetid_t *setid; +#endif +int +cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *ttd; + struct proc *p; + cpusetid_t id; + int error; + + if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET) + return (EINVAL); + error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); + if (error) + return (error); + switch (uap->which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_base(ttd->td_cpuset); + thread_unlock(ttd); + PROC_UNLOCK(p); + break; + case CPU_WHICH_CPUSET: + break; + } + switch (uap->level) { + case CPU_LEVEL_ROOT: + nset = cpuset_root(set); + cpuset_rel(set); + set = nset; + break; + case CPU_LEVEL_CPUSET: + break; + case CPU_LEVEL_WHICH: + break; + } + id = set->cs_id; + cpuset_rel(set); + if (error == 0) + error = copyout(&id, uap->setid, sizeof(id)); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_getaffinity_args { + cpulevel_t level; + cpuwhich_t which; + int id; + int cpusetsize; + long *mask; +}; +#endif +int +cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) +{ + struct thread *ttd; + struct cpuset *nset; + struct cpuset *set; + struct proc *p; + cpuset_t *mask; + int error; + int size; + + if (uap->cpusetsize < CPU_SETSIZE || uap->cpusetsize > CPU_MAXSIZE) + return (ERANGE); + size = uap->cpusetsize / NBBY; + mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); + if (error) + goto out; + error = 0; + switch (uap->level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + switch (uap->which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + break; + case CPU_WHICH_CPUSET: + break; + } + if (uap->level == CPU_LEVEL_ROOT) + nset = cpuset_root(set); + else + nset = cpuset_base(set); + CPU_COPY(&nset->cs_mask, mask); + cpuset_rel(nset); + break; + case CPU_LEVEL_WHICH: + switch (uap->which) { + case CPU_WHICH_TID: + thread_lock(ttd); + CPU_COPY(&ttd->td_cpuset->cs_mask, mask); + thread_unlock(ttd); + break; + case CPU_WHICH_PID: + PROC_SLOCK(p); + FOREACH_THREAD_IN_PROC(p, ttd) { + thread_lock(ttd); + CPU_OR(mask, &ttd->td_cpuset->cs_mask); + thread_unlock(ttd); + } + PROC_SUNLOCK(p); + break; + case CPU_WHICH_CPUSET: + CPU_COPY(&set->cs_mask, mask); + break; + } + break; + default: + error = EINVAL; + break; + } + if (set) + cpuset_rel(set); + if (p) + PROC_UNLOCK(p); + if (error == 0) + error = copyout(mask, uap->mask, size); +out: + free(mask, M_TEMP); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_setaffinity_args { + cpulevel_t level; + cpuwhich_t which; + int id; + int cpusetsize; + long * mask; +}; +#endif +int +cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *ttd; + struct proc *p; + cpuset_t *mask; + int error; + + if (uap->cpusetsize < CPU_SETSIZE || uap->cpusetsize > CPU_MAXSIZE) + return (ERANGE); + mask = malloc(uap->cpusetsize / NBBY, M_TEMP, M_WAITOK | M_ZERO); + error = copyin(uap->mask, mask, uap->cpusetsize / NBBY); + if (error) + goto 
out; + switch (uap->level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); + if (error) + break; + switch (uap->which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + break; + case CPU_WHICH_CPUSET: + break; + } + if (uap->level == CPU_LEVEL_ROOT) + nset = cpuset_root(set); + else + nset = cpuset_base(set); + error = cpuset_modify(nset, mask); + cpuset_rel(nset); + cpuset_rel(set); + break; + case CPU_LEVEL_WHICH: + switch (uap->which) { + case CPU_WHICH_TID: + error = cpuset_setthread(uap->id, mask); + break; + case CPU_WHICH_PID: + error = cpuset_setproc(uap->id, NULL, mask); + break; + case CPU_WHICH_CPUSET: + error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p, + &ttd, &set); + if (error == 0) { + error = cpuset_modify(set, mask); + cpuset_rel(set); + } + break; + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } +out: + free(mask, M_TEMP); + return (error); +} Index: sys/kern/kern_thread.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_thread.c,v retrieving revision 1.265 diff -u -p -r1.265 kern_thread.c --- sys/kern/kern_thread.c 22 Dec 2007 04:56:48 -0000 1.265 +++ sys/kern/kern_thread.c 2 Mar 2008 02:58:10 -0000 @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_th #include #include #include +#include #include @@ -342,7 +343,8 @@ thread_alloc(void) void thread_free(struct thread *td) { - + cpuset_rel(td->td_cpuset); + td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_altkstack != 0) vm_thread_dispose_altkstack(td); @@ -527,6 +529,8 @@ thread_wait(struct proc *p) /* Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads) sched_relinquish(curthread); + cpuset_rel(td->td_cpuset); + td->td_cpuset = NULL; cpu_thread_clean(td); crfree(td->td_ucred); thread_reap(); /* check for zombie threads etc. */ Index: sys/kern/sched_ule.c =================================================================== RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v retrieving revision 1.223 diff -u -p -r1.223 sched_ule.c --- sys/kern/sched_ule.c 23 Jan 2008 03:10:18 -0000 1.223 +++ sys/kern/sched_ule.c 2 Mar 2008 02:58:10 -0000 @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u #include #include #include +#include #ifdef KTRACE #include #include @@ -95,9 +96,7 @@ struct td_sched { int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ -#ifdef SMP int ts_rltick; /* Real last tick, for affinity. */ -#endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ @@ -105,6 +104,10 @@ struct td_sched { static struct td_sched td_sched0; +#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) +#define THREAD_CAN_SCHED(td, cpu) \ + CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) + /* * Cpu percentage computation macros and defines. * @@ -183,6 +186,7 @@ static int preempt_thresh = PRI_MIN_KERN #else static int preempt_thresh = 0; #endif +static int lowpri_userret = 1; /* * tdq - per processor runqs and statistics. All fields are protected by the @@ -190,47 +194,26 @@ static int preempt_thresh = 0; * locking in sched_pickcpu(); */ struct tdq { - struct mtx *tdq_lock; /* Pointer to group lock. */ + struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ + struct mtx tdq_lock; /* run queue lock. 
*/ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ int tdq_load; /* Aggregate load. */ + int tdq_sysload; /* For loadavg, !ITHD load. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ -#ifdef SMP u_char tdq_lowpri; /* Lowest priority thread. */ int tdq_transferable; /* Transferable thread count. */ - LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ - struct tdq_group *tdq_group; /* Our processor group. */ -#else - int tdq_sysload; /* For loadavg, !ITHD load. */ -#endif + char tdq_name[sizeof("sched lock") + 6]; } __aligned(64); #ifdef SMP -/* - * tdq groups are groups of processors which can cheaply share threads. When - * one processor in the group goes idle it will check the runqs of the other - * processors in its group prior to halting and waiting for an interrupt. - * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. - * In a numa environment we'd want an idle bitmap per group and a two tiered - * load balancer. - */ -struct tdq_group { - struct mtx tdg_lock; /* Protects all fields below. */ - int tdg_cpus; /* Count of CPUs in this tdq group. */ - cpumask_t tdg_cpumask; /* Mask of cpus in this group. */ - cpumask_t tdg_idlemask; /* Idle cpus in this group. */ - cpumask_t tdg_mask; /* Bit mask for first cpu. */ - int tdg_load; /* Total load of this group. */ - int tdg_transferable; /* Transferable load of this group. */ - LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ - char tdg_name[16]; /* lock name. */ -} __aligned(64); +struct cpu_group *cpu_top; -#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300)) -#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) +#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) +#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. @@ -240,6 +223,7 @@ static int balance_interval = 128; /* De static int pick_pri = 1; static int affinity; static int tryself = 1; +static int oldtryself = 0; static int steal_htt = 1; static int steal_idle = 1; static int steal_thresh = 2; @@ -248,22 +232,15 @@ static int topology = 0; /* * One thread queue per processor. 
*/ -static volatile cpumask_t tdq_idle; -static int tdg_maxid; static struct tdq tdq_cpu[MAXCPU]; -static struct tdq_group tdq_groups[MAXCPU]; static struct tdq *balance_tdq; -static int balance_group_ticks; static int balance_ticks; #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) #define TDQ_ID(x) ((int)((x) - tdq_cpu)) -#define TDQ_GROUP(x) (&tdq_groups[(x)]) -#define TDG_ID(x) ((int)((x) - tdq_groups)) #else /* !SMP */ static struct tdq tdq_cpu; -static struct mtx tdq_lock; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) @@ -274,7 +251,7 @@ static struct mtx tdq_lock; #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) -#define TDQ_LOCKPTR(t) ((t)->tdq_lock) +#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); @@ -294,22 +271,18 @@ void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP -static void tdq_move(struct tdq *, struct tdq *); +static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct td_sched *); -static struct td_sched *tdq_steal(struct tdq *); -static struct td_sched *runq_steal(struct runq *); +static struct td_sched *tdq_steal(struct tdq *, int); +static struct td_sched *runq_steal(struct runq *, int); static int sched_pickcpu(struct td_sched *, int); static void sched_balance(void); -static void sched_balance_groups(void); -static void sched_balance_group(struct tdq_group *); -static void sched_balance_pair(struct tdq *, struct tdq *); +static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct td_sched *, int, int); static inline struct mtx *thread_block_switch(struct thread *); static inline void thread_unblock_switch(struct thread *, struct mtx *); static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); - -#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #endif static void sched_setup(void *dummy); @@ -356,7 +329,8 @@ tdq_print(int cpu) tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); - printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); + printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); @@ -366,12 +340,8 @@ tdq_print(int cpu) runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); -#ifdef SMP printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); - printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group)); - printf("\tLock name: %s\n", tdq->tdq_group->tdg_name); -#endif } #define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) @@ -385,13 +355,10 @@ tdq_runq_add(struct tdq *tdq, struct td_ { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); -#ifdef SMP if (THREAD_CAN_MIGRATE(ts->ts_thread)) { tdq->tdq_transferable++; - tdq->tdq_group->tdg_transferable++; ts->ts_flags |= TSF_XFERABLE; } -#endif if (ts->ts_runq == &tdq->tdq_timeshare) { u_char pri; @@ -431,13 +398,10 @@ tdq_runq_rem(struct tdq *tdq, struct td_ TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", 
ts->ts_thread)); -#ifdef SMP if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; - tdq->tdq_group->tdg_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } -#endif if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); @@ -470,11 +434,7 @@ tdq_load_add(struct tdq *tdq, struct td_ CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) -#ifdef SMP - tdq->tdq_group->tdg_load++; -#else tdq->tdq_sysload++; -#endif } /* @@ -491,11 +451,7 @@ tdq_load_rem(struct tdq *tdq, struct td_ class = PRI_BASE(ts->ts_thread->td_pri_class); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) -#ifdef SMP - tdq->tdq_group->tdg_load--; -#else tdq->tdq_sysload--; -#endif KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; @@ -503,112 +459,282 @@ tdq_load_rem(struct tdq *tdq, struct td_ ts->ts_runq = NULL; } +/* + * Set lowpri to its exact value by searching the run-queue and + * evaluating curthread. curthread may be passed as an optimization. + */ +static void +tdq_setlowpri(struct tdq *tdq, struct thread *ctd) +{ + struct td_sched *ts; + struct thread *td; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + if (ctd == NULL) + ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; + ts = tdq_choose(tdq); + if (ts) + td = ts->ts_thread; + if (ts == NULL || td->td_priority > ctd->td_priority) + tdq->tdq_lowpri = ctd->td_priority; + else + tdq->tdq_lowpri = td->td_priority; +} + #ifdef SMP +struct cpu_search { + cpumask_t cs_mask; /* Mask of valid cpus. */ + u_int cs_load; + u_int cs_cpu; + int cs_limit; /* Min priority for low min load for high. */ +}; + +#define CPU_SEARCH_LOWEST 0x1 +#define CPU_SEARCH_HIGHEST 0x2 +#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) + +#define CPUMASK_FOREACH(cpu, mask) \ + for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++) \ + if ((mask) & 1 << (cpu)) + +__inline int cpu_search(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high, const int match); +int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low); +int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high); +int cpu_search_both(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high); + +/* + * This routine compares according to the match argument and should be + * reduced in actual instantiations via constant propagation and dead code + * elimination. + */ +static __inline int +cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high, + const int match) +{ + struct tdq *tdq; + + tdq = TDQ_CPU(cpu); + if (match & CPU_SEARCH_LOWEST) + if (low->cs_mask & (1 << cpu) && + tdq->tdq_load < low->cs_load && + tdq->tdq_lowpri > low->cs_limit) { + low->cs_cpu = cpu; + low->cs_load = tdq->tdq_load; + } + if (match & CPU_SEARCH_HIGHEST) + if (high->cs_mask & (1 << cpu) && + tdq->tdq_load >= high->cs_limit && + tdq->tdq_load > high->cs_load && + tdq->tdq_transferable) { + high->cs_cpu = cpu; + high->cs_load = tdq->tdq_load; + } + return (tdq->tdq_load); +} + /* - * sched_balance is a simple CPU load balancing algorithm. It operates by - * finding the least loaded and most loaded cpu and equalizing their load - * by migrating some processes. - * - * Dealing only with two CPUs at a time has two advantages. Firstly, most - * installations will only have 2 cpus. 
Secondly, load balancing too much at - * once can have an unpleasant effect on the system. The scheduler rarely has - * enough information to make perfect decisions. So this algorithm chooses - * simplicity and more gradual effects on load in larger systems. + * Search the tree of cpu_groups for the lowest or highest loaded cpu + * according to the match argument. This routine actually compares the + * load on all paths through the tree and finds the least loaded cpu on + * the least loaded path, which may differ from the least loaded cpu in + * the system. This balances work among caches and busses. * + * This inline is instantiated in three forms below using constants for the + * match argument. It is reduced to the minimum set for each case. It is + * also recursive to the depth of the tree. + */ +static inline int +cpu_search(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high, const int match) +{ + int total; + + total = 0; + if (cg->cg_children) { + struct cpu_search lgroup; + struct cpu_search hgroup; + struct cpu_group *child; + u_int lload; + int hload; + int load; + int i; + + lload = -1; + hload = -1; + for (i = 0; i < cg->cg_children; i++) { + child = &cg->cg_child[i]; + if (match & CPU_SEARCH_LOWEST) { + lgroup = *low; + lgroup.cs_load = -1; + } + if (match & CPU_SEARCH_HIGHEST) { + hgroup = *high; + lgroup.cs_load = 0; + } + switch (match) { + case CPU_SEARCH_LOWEST: + load = cpu_search_lowest(child, &lgroup); + break; + case CPU_SEARCH_HIGHEST: + load = cpu_search_highest(child, &hgroup); + break; + case CPU_SEARCH_BOTH: + load = cpu_search_both(child, &lgroup, &hgroup); + break; + } + total += load; + if (match & CPU_SEARCH_LOWEST) + if (load < lload || low->cs_cpu == -1) { + *low = lgroup; + lload = load; + } + if (match & CPU_SEARCH_HIGHEST) + if (load > hload || high->cs_cpu == -1) { + hload = load; + *high = hgroup; + } + } + } else { + int cpu; + + CPUMASK_FOREACH(cpu, cg->cg_mask) + total += cpu_compare(cpu, low, high, match); + } + return (total); +} + +/* + * cpu_search instantiations must pass constants to maintain the inline + * optimization. + */ +int +cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low) +{ + return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); +} + +int +cpu_search_highest(struct cpu_group *cg, struct cpu_search *high) +{ + return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); +} + +int +cpu_search_both(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high) +{ + return cpu_search(cg, low, high, CPU_SEARCH_BOTH); +} + +/* + * Find the cpu with the least load via the least loaded path that has a + * lowpri greater than pri pri. A pri of -1 indicates any priority is + * acceptable. + */ +static inline int +sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri) +{ + struct cpu_search low; + + low.cs_cpu = -1; + low.cs_load = -1; + low.cs_mask = mask; + low.cs_limit = pri; + cpu_search_lowest(cg, &low); + return low.cs_cpu; +} + +/* + * Find the cpu with the highest load via the highest loaded path. + */ +static inline int +sched_highest(struct cpu_group *cg, cpumask_t mask, int minload) +{ + struct cpu_search high; + + high.cs_cpu = -1; + high.cs_load = 0; + high.cs_mask = mask; + high.cs_limit = minload; + cpu_search_highest(cg, &high); + return high.cs_cpu; +} + +/* + * Simultaneously find the highest and lowest loaded cpu reachable via + * cg. 
*/ +static inline void +sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu) +{ + struct cpu_search high; + struct cpu_search low; + + low.cs_cpu = -1; + low.cs_limit = -1; + low.cs_load = -1; + low.cs_mask = mask; + high.cs_load = 0; + high.cs_cpu = -1; + high.cs_limit = -1; + high.cs_mask = mask; + cpu_search_both(cg, &low, &high); + *lowcpu = low.cs_cpu; + *highcpu = high.cs_cpu; + return; +} + static void -sched_balance() +sched_balance_group(struct cpu_group *cg) { - struct tdq_group *high; - struct tdq_group *low; - struct tdq_group *tdg; - struct tdq *tdq; - int cnt; + cpumask_t mask; + int high; + int low; int i; - /* - * Select a random time between .5 * balance_interval and - * 1.5 * balance_interval. - */ - balance_ticks = max(balance_interval / 2, 1); - balance_ticks += random() % balance_interval; - if (smp_started == 0 || rebalance == 0) - return; - tdq = TDQ_SELF(); - TDQ_UNLOCK(tdq); - low = high = NULL; - i = random() % (tdg_maxid + 1); - for (cnt = 0; cnt <= tdg_maxid; cnt++) { - tdg = TDQ_GROUP(i); + mask = -1; + for (;;) { + sched_both(cg, mask, &low, &high); + if (low == high || low == -1 || high == -1) + break; + if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) + break; /* - * Find the CPU with the highest load that has some - * threads to transfer. - */ - if ((high == NULL || tdg->tdg_load > high->tdg_load) - && tdg->tdg_transferable) - high = tdg; - if (low == NULL || tdg->tdg_load < low->tdg_load) - low = tdg; - if (++i > tdg_maxid) - i = 0; - } - if (low != NULL && high != NULL && high != low) - sched_balance_pair(LIST_FIRST(&high->tdg_members), - LIST_FIRST(&low->tdg_members)); - TDQ_LOCK(tdq); + * If we failed to move any threads determine which cpu + * to kick out of the set and try again. + */ + if (TDQ_CPU(high)->tdq_transferable == 0) + mask &= ~(1 << high); + else + mask &= ~(1 << low); + } + + for (i = 0; i < cg->cg_children; i++) + sched_balance_group(&cg->cg_child[i]); } -/* - * Balance load between CPUs in a group. Will only migrate within the group. - */ static void -sched_balance_groups() +sched_balance() { struct tdq *tdq; - int i; /* * Select a random time between .5 * balance_interval and * 1.5 * balance_interval. */ - balance_group_ticks = max(balance_interval / 2, 1); - balance_group_ticks += random() % balance_interval; + balance_ticks = max(balance_interval / 2, 1); + balance_ticks += random() % balance_interval; if (smp_started == 0 || rebalance == 0) return; tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); - for (i = 0; i <= tdg_maxid; i++) - sched_balance_group(TDQ_GROUP(i)); + sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* - * Finds the greatest imbalance between two tdqs in a group. - */ -static void -sched_balance_group(struct tdq_group *tdg) -{ - struct tdq *tdq; - struct tdq *high; - struct tdq *low; - int load; - - if (tdg->tdg_transferable == 0) - return; - low = NULL; - high = NULL; - LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { - load = tdq->tdq_load; - if (high == NULL || load > high->tdq_load) - high = tdq; - if (low == NULL || load < low->tdq_load) - low = tdq; - } - if (high != NULL && low != NULL && high != low) - sched_balance_pair(high, low); -} - -/* * Lock two thread queues using their address to maintain lock order. */ static void @@ -636,31 +762,22 @@ tdq_unlock_pair(struct tdq *one, struct /* * Transfer load between two imbalanced thread queues. 
*/ -static void +static int sched_balance_pair(struct tdq *high, struct tdq *low) { int transferable; int high_load; int low_load; + int moved; int move; int diff; int i; tdq_lock_pair(high, low); - /* - * If we're transfering within a group we have to use this specific - * tdq's transferable count, otherwise we can steal from other members - * of the group. - */ - if (high->tdq_group == low->tdq_group) { - transferable = high->tdq_transferable; - high_load = high->tdq_load; - low_load = low->tdq_load; - } else { - transferable = high->tdq_group->tdg_transferable; - high_load = high->tdq_group->tdg_load; - low_load = low->tdq_group->tdg_load; - } + transferable = high->tdq_transferable; + high_load = high->tdq_load; + low_load = low->tdq_load; + moved = 0; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). @@ -672,7 +789,7 @@ sched_balance_pair(struct tdq *high, str move++; move = min(move, transferable); for (i = 0; i < move; i++) - tdq_move(high, low); + moved += tdq_move(high, low); /* * IPI the target cpu to force it to reschedule with the new * workload. @@ -680,13 +797,13 @@ sched_balance_pair(struct tdq *high, str ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT); } tdq_unlock_pair(high, low); - return; + return (moved); } /* * Move a thread from one thread queue to another. */ -static void +static int tdq_move(struct tdq *from, struct tdq *to) { struct td_sched *ts; @@ -699,22 +816,9 @@ tdq_move(struct tdq *from, struct tdq *t tdq = from; cpu = TDQ_ID(to); - ts = tdq_steal(tdq); - if (ts == NULL) { - struct tdq_group *tdg; - - tdg = tdq->tdq_group; - LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { - if (tdq == from || tdq->tdq_transferable == 0) - continue; - ts = tdq_steal(tdq); - break; - } - if (ts == NULL) - return; - } - if (tdq == to) - return; + ts = tdq_steal(tdq, cpu); + if (ts == NULL) + return (0); td = ts->ts_thread; /* * Although the run queue is locked the thread may be blocked. Lock @@ -727,6 +831,7 @@ tdq_move(struct tdq *from, struct tdq *t ts->ts_cpu = cpu; td->td_lock = TDQ_LOCKPTR(to); tdq_add(to, td, SRQ_YIELDING); + return (1); } /* @@ -736,72 +841,54 @@ tdq_move(struct tdq *from, struct tdq *t static int tdq_idled(struct tdq *tdq) { - struct tdq_group *tdg; + struct cpu_group *cg; struct tdq *steal; - int highload; - int highcpu; + cpumask_t mask; + int thresh; int cpu; if (smp_started == 0 || steal_idle == 0) return (1); - /* We don't want to be preempted while we're iterating over tdqs */ + mask = -1; + mask &= ~PCPU_GET(cpumask); + /* We don't want to be preempted while we're iterating. */ spinlock_enter(); - tdg = tdq->tdq_group; - /* - * If we're in a cpu group, try and steal threads from another cpu in - * the group before idling. In a HTT group all cpus share the same - * run-queue lock, however, we still need a recursive lock to - * call tdq_move(). - */ - if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) { - TDQ_LOCK(tdq); - LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { - if (steal == tdq || steal->tdq_transferable == 0) - continue; - TDQ_LOCK(steal); - goto steal; - } - TDQ_UNLOCK(tdq); - } - /* - * Find the least loaded CPU with a transferable thread and attempt - * to steal it. We make a lockless pass and then verify that the - * thread is still available after locking. 
- */ - for (;;) { - highcpu = 0; - highload = 0; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - steal = TDQ_CPU(cpu); - if (steal->tdq_transferable == 0) - continue; - if (steal->tdq_load < highload) - continue; - highload = steal->tdq_load; - highcpu = cpu; + for (cg = tdq->tdq_cg; cg != NULL; ) { + if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0) + thresh = steal_thresh; + else + thresh = 1; + cpu = sched_highest(cg, mask, thresh); + if (cpu == -1) { + cg = cg->cg_parent; + continue; } - if (highload < steal_thresh) - break; - steal = TDQ_CPU(highcpu); - if (steal == tdq) - break; + steal = TDQ_CPU(cpu); + mask &= ~(1 << cpu); tdq_lock_pair(tdq, steal); - if (steal->tdq_load >= steal_thresh && steal->tdq_transferable) - goto steal; - tdq_unlock_pair(tdq, steal); + if (steal->tdq_load < thresh || steal->tdq_transferable == 0) { + tdq_unlock_pair(tdq, steal); + continue; + } + /* + * If a thread was added while interrupts were disabled don't + * steal one here. If we fail to acquire one due to affinity + * restrictions loop again with this cpu removed from the + * set. + */ + if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) { + tdq_unlock_pair(tdq, steal); + continue; + } + spinlock_exit(); + TDQ_UNLOCK(steal); + mi_switch(SW_VOL, NULL); + thread_unlock(curthread); + + return (0); } spinlock_exit(); return (1); -steal: - spinlock_exit(); - tdq_move(steal, tdq); - TDQ_UNLOCK(steal); - mi_switch(SW_VOL, NULL); - thread_unlock(curthread); - - return (0); } /* @@ -854,7 +941,7 @@ sendipi: * index. */ static struct td_sched * -runq_steal_from(struct runq *rq, u_char start) +runq_steal_from(struct runq *rq, int cpu, u_char start) { struct td_sched *ts; struct rqbits *rqb; @@ -883,7 +970,8 @@ again: pri += (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(ts, rqh, ts_procq) { - if (first && THREAD_CAN_MIGRATE(ts->ts_thread)) + if (first && THREAD_CAN_MIGRATE(ts->ts_thread) && + THREAD_CAN_SCHED(ts->ts_thread, cpu)) return (ts); first = 1; } @@ -900,7 +988,7 @@ again: * Steals load from a standard linear queue. */ static struct td_sched * -runq_steal(struct runq *rq) +runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; @@ -917,7 +1005,8 @@ runq_steal(struct runq *rq) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(ts, rqh, ts_procq) - if (THREAD_CAN_MIGRATE(ts->ts_thread)) + if (THREAD_CAN_MIGRATE(ts->ts_thread) && + THREAD_CAN_SCHED(ts->ts_thread, cpu)) return (ts); } } @@ -928,16 +1017,17 @@ runq_steal(struct runq *rq) * Attempt to steal a thread in priority order from a thread queue. */ static struct td_sched * -tdq_steal(struct tdq *tdq) +tdq_steal(struct tdq *tdq, int cpu) { struct td_sched *ts; TDQ_LOCK_ASSERT(tdq, MA_OWNED); - if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL) + if ((ts = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (ts); - if ((ts = runq_steal_from(&tdq->tdq_timeshare, tdq->tdq_ridx)) != NULL) + if ((ts = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) + != NULL) return (ts); - return (runq_steal(&tdq->tdq_idle)); + return (runq_steal(&tdq->tdq_idle, cpu)); } /* @@ -981,155 +1071,74 @@ sched_setcpu(struct td_sched *ts, int cp return (tdq); } -/* - * Find the thread queue running the lowest priority thread. 
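A worked example of the new stealing order, assuming a two-package machine built from dual-core, 2-way HTT parts: an idle logical CPU first asks sched_highest() about its own CG_FLAG_HTT group, where a single transferable thread is worth taking (thresh = 1); only if that fails does it climb to the package-level group and finally to the machine-wide group, where the donor must be holding at least steal_thresh runnable threads, so cheap nearby migrations are preferred over cross-package ones. Each candidate is dropped from the search mask once tried, so a CPU that fails the re-check under tdq_lock_pair() is not picked again.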
- */ -static int -tdq_lowestpri(void) -{ - struct tdq *tdq; - int lowpri; - int lowcpu; - int lowload; - int load; - int cpu; - int pri; - - lowload = 0; - lowpri = lowcpu = 0; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - tdq = TDQ_CPU(cpu); - pri = tdq->tdq_lowpri; - load = TDQ_CPU(cpu)->tdq_load; - CTR4(KTR_ULE, - "cpu %d pri %d lowcpu %d lowpri %d", - cpu, pri, lowcpu, lowpri); - if (pri < lowpri) - continue; - if (lowpri && lowpri == pri && load > lowload) - continue; - lowpri = pri; - lowcpu = cpu; - lowload = load; - } - - return (lowcpu); -} - -/* - * Find the thread queue with the least load. - */ -static int -tdq_lowestload(void) -{ - struct tdq *tdq; - int lowload; - int lowpri; - int lowcpu; - int load; - int cpu; - int pri; - - lowcpu = 0; - lowload = TDQ_CPU(0)->tdq_load; - lowpri = TDQ_CPU(0)->tdq_lowpri; - for (cpu = 1; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - tdq = TDQ_CPU(cpu); - load = tdq->tdq_load; - pri = tdq->tdq_lowpri; - CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d", - cpu, load, lowcpu, lowload); - if (load > lowload) - continue; - if (load == lowload && pri < lowpri) - continue; - lowcpu = cpu; - lowload = load; - lowpri = pri; - } - - return (lowcpu); -} - -/* - * Pick the destination cpu for sched_add(). Respects affinity and makes - * a determination based on load or priority of available processors. - */ static int sched_pickcpu(struct td_sched *ts, int flags) { + struct cpu_group *cg; + struct thread *td; struct tdq *tdq; + cpumask_t mask; int self; int pri; int cpu; - cpu = self = PCPU_GET(cpuid); + self = PCPU_GET(cpuid); + td = ts->ts_thread; if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ - if (flags & SRQ_OURSELF) { - CTR1(KTR_ULE, "YIELDING %d", - curthread->td_priority); - return (self); - } - pri = ts->ts_thread->td_priority; - cpu = ts->ts_cpu; - /* - * Regardless of affinity, if the last cpu is idle send it there. - */ - tdq = TDQ_CPU(cpu); - if (tdq->tdq_lowpri > PRI_MIN_IDLE) { - CTR5(KTR_ULE, - "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d", - ts->ts_cpu, ts->ts_rltick, ticks, pri, - tdq->tdq_lowpri); + if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); - } /* - * If we have affinity, try to place it on the cpu we last ran on. + * Prefer to run interrupt threads on the processors that generate + * the interrupt. */ - if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) { - CTR5(KTR_ULE, - "affinity for %d, ltick %d ticks %d pri %d curthread %d", - ts->ts_cpu, ts->ts_rltick, ticks, pri, - tdq->tdq_lowpri); - return (ts->ts_cpu); - } + if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && + curthread->td_intr_nesting_level) + ts->ts_cpu = self; /* - * Look for an idle group. + * If the thread can run on the last cpu and the affinity has not + * expired or it is idle run it there. */ - CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); - cpu = ffs(tdq_idle); - if (cpu) - return (--cpu); - /* - * If there are no idle cores see if we can run the thread locally. - * This may improve locality among sleepers and wakers when there - * is shared data. 
- */ - if (tryself && pri < TDQ_CPU(self)->tdq_lowpri) { - CTR1(KTR_ULE, "tryself %d", - curthread->td_priority); - return (self); + pri = td->td_priority; + tdq = TDQ_CPU(ts->ts_cpu); + if (THREAD_CAN_SCHED(td, ts->ts_cpu)) { + if (tdq->tdq_lowpri > PRI_MIN_IDLE) + return (ts->ts_cpu); + if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) + return (ts->ts_cpu); } /* - * Now search for the cpu running the lowest priority thread with - * the least load. - */ - if (pick_pri) - cpu = tdq_lowestpri(); - else - cpu = tdq_lowestload(); + * Search for the highest level in the tree that still has affinity. + */ + cg = NULL; + for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent) + if (SCHED_AFFINITY(ts, cg->cg_level)) + break; + cpu = -1; + mask = td->td_cpuset->cs_mask.__bits[0]; + if (cg) + cpu = sched_lowest(cg, mask, pri); + if (cpu == -1) + cpu = sched_lowest(cpu_top, mask, -1); + /* + * Compare the lowest loaded cpu to current cpu. + */ + if (THREAD_CAN_SCHED(td, self) && + TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) { + if (tryself && TDQ_CPU(self)->tdq_lowpri > pri) + cpu = self; + else if (oldtryself && curthread->td_priority > pri) + cpu = self; + } + if (cpu == -1) { + panic("cpu == -1, mask 0x%X cpu top %p", mask, cpu_top); + } return (cpu); } - -#endif /* SMP */ +#endif /* * Pick the highest priority task we have and return it. @@ -1174,121 +1183,31 @@ tdq_setup(struct tdq *tdq) runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); - tdq->tdq_load = 0; -} - -#ifdef SMP -static void -tdg_setup(struct tdq_group *tdg) -{ - if (bootverbose) - printf("ULE: setup cpu group %d\n", TDG_ID(tdg)); - snprintf(tdg->tdg_name, sizeof(tdg->tdg_name), - "sched lock %d", (int)TDG_ID(tdg)); - mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock", + snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), + "sched lock %d", (int)TDQ_ID(tdq)); + mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN | MTX_RECURSE); - LIST_INIT(&tdg->tdg_members); - tdg->tdg_load = 0; - tdg->tdg_transferable = 0; - tdg->tdg_cpus = 0; - tdg->tdg_mask = 0; - tdg->tdg_cpumask = 0; - tdg->tdg_idlemask = 0; -} - -static void -tdg_add(struct tdq_group *tdg, struct tdq *tdq) -{ - if (tdg->tdg_mask == 0) - tdg->tdg_mask |= 1 << TDQ_ID(tdq); - tdg->tdg_cpumask |= 1 << TDQ_ID(tdq); - tdg->tdg_cpus++; - tdq->tdq_group = tdg; - tdq->tdq_lock = &tdg->tdg_lock; - LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings); - if (bootverbose) - printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n", - TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask); -} - -static void -sched_setup_topology(void) -{ - struct tdq_group *tdg; - struct cpu_group *cg; - int balance_groups; - struct tdq *tdq; - int i; - int j; - - topology = 1; - balance_groups = 0; - for (i = 0; i < smp_topology->ct_count; i++) { - cg = &smp_topology->ct_group[i]; - tdg = &tdq_groups[i]; - /* - * Initialize the group. - */ - tdg_setup(tdg); - /* - * Find all of the group members and add them. 
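sched_pickcpu() now consults the topology in two ways: SCHED_AFFINITY() is evaluated against a cache-sharing level (CG_SHARE_L2 for the thread's last CPU, then cg_level while walking up the tdq_cg chain), and sched_lowest() is restricted to the thread's cpuset mask; note that only __bits[0] of the set is consulted here, matching the single-word cpumask_t the topology code works with. The macro itself is defined outside this section; a purely hypothetical reading of its two-argument form, suggested by the ts_rltick bookkeeping kept in sched_switch() further down, would be something like:

/*
 * Hypothetical sketch only, not the patch's literal definition: the more
 * cache two CPUs share, the longer the thread's last run keeps it "warm".
 */
#define SCHED_AFFINITY(ts, t)   ((ts)->ts_rltick > ticks - ((t) * affinity))

Treat the exact expression as an assumption; the point is simply that affinity is allowed to decay more slowly at higher sharing levels.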
- */ - for (j = 0; j < MAXCPU; j++) { - if ((cg->cg_mask & (1 << j)) != 0) { - tdq = TDQ_CPU(j); - tdq_setup(tdq); - tdg_add(tdg, tdq); - } - } - if (tdg->tdg_cpus > 1) - balance_groups = 1; - } - tdg_maxid = smp_topology->ct_count - 1; - if (balance_groups) - sched_balance_groups(); } +#ifdef SMP static void sched_setup_smp(void) { - struct tdq_group *tdg; struct tdq *tdq; - int cpus; int i; - for (cpus = 0, i = 0; i < MAXCPU; i++) { + cpu_top = smp_topo(); + for (i = 0; i < MAXCPU; i++) { if (CPU_ABSENT(i)) continue; - tdq = &tdq_cpu[i]; - tdg = &tdq_groups[i]; - /* - * Setup a tdq group with one member. - */ - tdg_setup(tdg); + tdq = TDQ_CPU(i); tdq_setup(tdq); - tdg_add(tdg, tdq); - cpus++; + tdq->tdq_cg = smp_topo_find(cpu_top, i); + if (tdq->tdq_cg == NULL) + panic("Can't find cpu group for %d\n", i); } - tdg_maxid = cpus - 1; -} - -/* - * Fake a topology with one group containing all CPUs. - */ -static void -sched_fake_topo(void) -{ -#ifdef SCHED_FAKE_TOPOLOGY - static struct cpu_top top; - static struct cpu_group group; - - top.ct_count = 1; - top.ct_group = &group; - group.cg_mask = all_cpus; - group.cg_count = mp_ncpus; - group.cg_children = 0; - smp_topology = ⊤ -#endif + balance_tdq = TDQ_SELF(); + sched_balance(); } #endif @@ -1303,21 +1222,9 @@ sched_setup(void *dummy) tdq = TDQ_SELF(); #ifdef SMP - sched_fake_topo(); - /* - * Setup tdqs based on a topology configuration or vanilla SMP based - * on mp_maxid. - */ - if (smp_topology == NULL) - sched_setup_smp(); - else - sched_setup_topology(); - balance_tdq = tdq; - sched_balance(); + sched_setup_smp(); #else tdq_setup(tdq); - mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE); - tdq->tdq_lock = &tdq_lock; #endif /* * To avoid divide-by-zero, we set realstathz a dummy value @@ -1331,6 +1238,7 @@ sched_setup(void *dummy) TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); tdq_load_add(tdq, &td_sched0); + tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } @@ -1369,7 +1277,7 @@ sched_initticks(void *dummy) * prevents excess thrashing on large machines and excess idle on * smaller machines. */ - steal_thresh = min(ffs(mp_ncpus) - 1, 4); + steal_thresh = min(ffs(mp_ncpus) - 1, 3); affinity = SCHED_AFFINITY_DEFAULT; #endif } @@ -1617,16 +1525,17 @@ sched_thread_priority(struct thread *td, sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING); -#ifdef SMP } else if (TD_IS_RUNNING(td)) { struct tdq *tdq; + int oldpri; tdq = TDQ_CPU(ts->ts_cpu); - if (prio < tdq->tdq_lowpri || - (td->td_priority == tdq->tdq_lowpri && tdq->tdq_load <= 1)) - tdq->tdq_lowpri = prio; + oldpri = td->td_priority; td->td_priority = prio; -#endif + if (prio < tdq->tdq_lowpri) + tdq->tdq_lowpri = prio; + else if (tdq->tdq_lowpri == oldpri) + tdq_setlowpri(tdq, td); } else td->td_priority = prio; } @@ -1843,9 +1752,7 @@ sched_switch(struct thread *td, struct t tdq = TDQ_CPU(cpuid); ts = td->td_sched; mtx = td->td_lock; -#ifdef SMP ts->ts_rltick = ticks; -#endif td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; @@ -1913,12 +1820,12 @@ sched_switch(struct thread *td, struct t } else thread_unblock_switch(td, mtx); /* - * Assert that all went well and return. + * We should always get here with the lowest priority td possible. */ -#ifdef SMP - /* We should always get here with the lowest priority td possible */ tdq->tdq_lowpri = td->td_priority; -#endif + /* + * Assert that all went well and return. 
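With the cap lowered from four to three, steal_thresh = min(ffs(mp_ncpus) - 1, 3) works out to 0, 1, 2 and 3 on 1-, 2-, 4- and 8-or-more-CPU (power-of-two) configurations, so a large machine now starts pulling work from a remote queue once it holds three runnable threads rather than four. Note also that ts_rltick is maintained unconditionally now, so the affinity window behaves the same on UP kernels.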
+ */ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; @@ -2022,6 +1929,7 @@ sched_fork_thread(struct thread *td, str THREAD_LOCK_ASSERT(td, MA_OWNED); sched_newthread(child); child->td_lock = TDQ_LOCKPTR(TDQ_SELF()); + child->td_cpuset = cpuset_ref(td->td_cpuset); ts = td->td_sched; ts2 = child->td_sched; ts2->ts_cpu = ts->ts_cpu; @@ -2052,8 +1960,6 @@ sched_class(struct thread *td, int class THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; - -#ifdef SMP /* * On SMP if we're on the RUNQ we must adjust the transferable * count because could be changing to or from an interrupt @@ -2063,17 +1969,12 @@ sched_class(struct thread *td, int class struct tdq *tdq; tdq = TDQ_CPU(td->td_sched->ts_cpu); - if (THREAD_CAN_MIGRATE(td)) { + if (THREAD_CAN_MIGRATE(td)) tdq->tdq_transferable--; - tdq->tdq_group->tdg_transferable--; - } td->td_pri_class = class; - if (THREAD_CAN_MIGRATE(td)) { + if (THREAD_CAN_MIGRATE(td)) tdq->tdq_transferable++; - tdq->tdq_group->tdg_transferable++; - } } -#endif td->td_pri_class = class; } @@ -2149,6 +2050,8 @@ sched_userret(struct thread *td) thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; + if (lowpri_userret) + tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } } @@ -2172,8 +2075,6 @@ sched_clock(struct thread *td) if (balance_tdq == tdq) { if (balance_ticks && --balance_ticks == 0) sched_balance(); - if (balance_group_ticks && --balance_group_ticks == 0) - sched_balance_groups(); } #endif /* @@ -2261,11 +2162,7 @@ out: struct thread * sched_choose(void) { -#ifdef SMP - struct tdq_group *tdg; -#endif struct td_sched *ts; - struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); @@ -2275,20 +2172,7 @@ sched_choose(void) tdq_runq_rem(tdq, ts); return (ts->ts_thread); } - td = PCPU_GET(idlethread); -#ifdef SMP - /* - * We only set the idled bit when all of the cpus in the group are - * idle. Otherwise we could get into a situation where a thread bounces - * back and forth between two idle cores on seperate physical CPUs. - */ - tdg = tdq->tdq_group; - tdg->tdg_idlemask |= PCPU_GET(cpumask); - if (tdg->tdg_idlemask == tdg->tdg_cpumask) - atomic_set_int(&tdq_idle, tdg->tdg_mask); - tdq->tdq_lowpri = td->td_priority; -#endif - return (td); + return (PCPU_GET(idlethread)); } /* @@ -2305,7 +2189,7 @@ sched_setpreempt(struct thread *td) ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; - if (td->td_priority < ctd->td_priority) + if (td->td_priority < cpri) curthread->td_flags |= TDF_NEEDRESCHED; if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; @@ -2329,9 +2213,6 @@ tdq_add(struct tdq *tdq, struct thread * { struct td_sched *ts; int class; -#ifdef SMP - int cpumask; -#endif TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), @@ -2355,29 +2236,8 @@ tdq_add(struct tdq *tdq, struct thread * ts->ts_runq = &tdq->tdq_timeshare; else ts->ts_runq = &tdq->tdq_idle; -#ifdef SMP - cpumask = 1 << ts->ts_cpu; - /* - * If we had been idle, clear our bit in the group and potentially - * the global bitmap. - */ - if ((class != PRI_IDLE && class != PRI_ITHD) && - (tdq->tdq_group->tdg_idlemask & cpumask) != 0) { - /* - * Check to see if our group is unidling, and if so, remove it - * from the global idle mask. - */ - if (tdq->tdq_group->tdg_idlemask == - tdq->tdq_group->tdg_cpumask) - atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask); - /* - * Now remove ourselves from the group specific idle mask. 
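sched_fork_thread() now shares the parent's set by reference instead of copying a mask; the cpuset_ref() it calls lives in the new kern_cpuset.c, which is not shown in this section. A minimal sketch of what the helper declared in sys/cpuset.h presumably does, assuming the refcount(9) primitives (hypothetical, the real file may well differ):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/cpuset.h>

/*
 * Hypothetical sketch: cs_ref is documented in sys/cpuset.h as accessed
 * with atomics, so taking a reference is just an atomic bump and the
 * forked thread ends up pointing at the same struct cpuset.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

        refcount_acquire(&set->cs_ref);
        return (set);
}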
- */ - tdq->tdq_group->tdg_idlemask &= ~cpumask; - } if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; -#endif tdq_runq_add(tdq, ts, flags); tdq_load_add(tdq, ts); } @@ -2412,13 +2272,7 @@ sched_add(struct thread *td, int flags) * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ - if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td) && - curthread->td_intr_nesting_level) - ts->ts_cpu = cpuid; - if (!THREAD_CAN_MIGRATE(td)) - cpu = ts->ts_cpu; - else - cpu = sched_pickcpu(ts, flags); + cpu = sched_pickcpu(ts, flags); tdq = sched_setcpu(ts, cpu, flags); tdq_add(tdq, td, flags); if (cpu != cpuid) { @@ -2462,6 +2316,8 @@ sched_rem(struct thread *td) tdq_runq_rem(tdq, ts); tdq_load_rem(tdq, ts); TD_SET_CAN_RUN(td); + if (td->td_priority == tdq->tdq_lowpri) + tdq_setlowpri(tdq, NULL); } /* @@ -2505,14 +2361,12 @@ sched_bind(struct thread *td, int cpu) if (ts->ts_flags & TSF_BOUND) sched_unbind(td); ts->ts_flags |= TSF_BOUND; -#ifdef SMP sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL, NULL); -#endif } /* @@ -2528,9 +2382,7 @@ sched_unbind(struct thread *td) if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; -#ifdef SMP sched_unpin(); -#endif } int @@ -2540,6 +2392,34 @@ sched_is_bound(struct thread *td) return (td->td_sched->ts_flags & TSF_BOUND); } +void +sched_affinity(struct thread *td) +{ +#ifdef SMP + struct td_sched *ts; + int cpu; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + if (THREAD_CAN_SCHED(td, ts->ts_cpu)) + return; + if (!TD_IS_RUNNING(td)) + return; + td->td_flags |= TDF_NEEDRESCHED; + if (!THREAD_CAN_MIGRATE(td)) + return; + /* + * Assign the new cpu and force a switch before returning to + * userspace. If the target thread is not running locally send + * an ipi to force the issue. + */ + cpu = ts->ts_cpu; + ts->ts_cpu = sched_pickcpu(ts, 0); + if (cpu != PCPU_GET(cpuid)) + ipi_selected(1 << cpu, IPI_PREEMPT); +#endif +} + /* * Basic yield call. 
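sched_affinity() is the scheduler's half of the new interface: it asserts the thread lock, does nothing while the thread may still run where it is, and for a running thread whose current CPU has become forbidden it flags a resched, picks a new ts_cpu and IPIs the old one. The other half, the kern_cpuset.c code that rewrites cs_mask under the cpuset lock and then notifies each affected thread, is not part of this section; the expected calling pattern is presumably just (hypothetical fragment, per the THREAD_LOCK_ASSERT above):

        thread_lock(td);
        sched_affinity(td);
        thread_unlock(td);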
*/ @@ -2563,8 +2443,8 @@ sched_load(void) int i; total = 0; - for (i = 0; i <= tdg_maxid; i++) - total += TDQ_GROUP(i)->tdg_load; + for (i = 0; i <= mp_maxid; i++) + total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); @@ -2658,6 +2538,7 @@ sched_fork_exit(struct thread *td) TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); + tdq->tdq_lowpri = td->td_priority; } static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, @@ -2676,6 +2557,8 @@ SYSCTL_INT(_kern_sched, OID_AUTO, pick_p SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, userret, CTLFLAG_RW, &lowpri_userret, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, oldtryself, CTLFLAG_RW, &oldtryself, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, Index: sys/kern/subr_smp.c =================================================================== RCS file: /home/ncvs/src/sys/kern/subr_smp.c,v retrieving revision 1.203 diff -u -p -r1.203 subr_smp.c --- sys/kern/subr_smp.c 2 Jan 2008 17:09:15 -0000 1.203 +++ sys/kern/subr_smp.c 2 Mar 2008 02:58:10 -0000 @@ -68,7 +68,6 @@ int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; -struct cpu_top *smp_topology; volatile int smp_started; u_int mp_maxid; @@ -90,6 +89,11 @@ int smp_cpus = 1; /* how many cpu's runn SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD, &smp_cpus, 0, "Number of CPUs online"); +int smp_topology = 0; /* Which topology we're using. */ +SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0, + "Topology override setting; 0 is default provided by hardware."); +TUNABLE_INT("kern.smp.topology", &smp_topology); + #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; @@ -385,22 +389,177 @@ smp_rendezvous(void (* setup_func)(void /* release lock */ mtx_unlock_spin(&smp_ipi_mtx); } -#else /* !SMP */ -/* - * Provide dummy SMP support for UP kernels. Modules that need to use SMP - * APIs will still work using this dummy support. - */ -static void -mp_setvariables_for_up(void *dummy) +static struct cpu_group group[MAXCPU]; + +struct cpu_group * +smp_topo(void) { - mp_ncpus = 1; - mp_maxid = PCPU_GET(cpuid); - all_cpus = PCPU_GET(cpumask); - KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); + struct cpu_group *top; + + /* + * Check for a fake topology request for debugging purposes. + */ + switch (smp_topology) { + case 1: + /* Dual core with no sharing. */ + top = smp_topo_1level(CG_SHARE_NONE, 2, 0); + break; + case 3: + /* Dual core with shared L2. */ + top = smp_topo_1level(CG_SHARE_L2, 2, 0); + break; + case 4: + /* quad core, shared l3 among each package, private l2. */ + top = smp_topo_1level(CG_SHARE_L3, 4, 0); + break; + case 5: + /* quad core, 2 dualcore parts on each package share l2. */ + top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); + break; + case 6: + /* Single-core 2xHTT */ + top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); + break; + case 7: + /* quad core with a shared l3, 8 threads sharing L2. 
*/ + top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, + CG_FLAG_THREAD); + break; + default: + /* Default, ask the system what it wants. */ + top = cpu_topo(); + break; + } + /* + * Verify the returned topology. + */ + if (top->cg_count != mp_ncpus) + panic("Built bad topology at %p. CPU count %d != %d", + top, top->cg_count, mp_ncpus); + if (top->cg_mask != all_cpus) + panic("Built bad topology at %p. CPU mask 0x%X != 0x%X", + top, top->cg_mask, all_cpus); + return (top); +} + +struct cpu_group * +smp_topo_none(void) +{ + struct cpu_group *top; + + top = &group[0]; + top->cg_parent = NULL; + top->cg_child = NULL; + top->cg_mask = (1 << mp_ncpus) - 1; + top->cg_count = mp_ncpus; + top->cg_children = 0; + top->cg_level = CG_SHARE_NONE; + top->cg_flags = 0; + + return (top); +} + +static int +smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, + int count, int flags, int start) +{ + cpumask_t mask; + int i; + + for (mask = 0, i = 0; i < count; i++, start++) + mask |= (1 << start); + child->cg_parent = parent; + child->cg_child = NULL; + child->cg_children = 0; + child->cg_level = share; + child->cg_count = count; + child->cg_flags = flags; + child->cg_mask = mask; + parent->cg_children++; + for (; parent != NULL; parent = parent->cg_parent) { + if ((parent->cg_mask & child->cg_mask) != 0) + panic("Duplicate children in %p. mask 0x%X child 0x%X", + parent, parent->cg_mask, child->cg_mask); + parent->cg_mask |= child->cg_mask; + parent->cg_count += child->cg_count; + } + + return (start); } -SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, - mp_setvariables_for_up, NULL) + +struct cpu_group * +smp_topo_1level(int share, int count, int flags) +{ + struct cpu_group *child; + struct cpu_group *top; + int packages; + int cpu; + int i; + + cpu = 0; + top = &group[0]; + packages = mp_ncpus / count; + top->cg_child = child = &group[1]; + top->cg_level = CG_SHARE_NONE; + for (i = 0; i < packages; i++, child++) + cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); + return (top); +} + +struct cpu_group * +smp_topo_2level(int l2share, int l2count, int l1share, int l1count, + int l1flags) +{ + struct cpu_group *top; + struct cpu_group *l1g; + struct cpu_group *l2g; + int cpu; + int i; + int j; + + cpu = 0; + top = &group[0]; + l2g = &group[1]; + top->cg_child = l2g; + top->cg_level = CG_SHARE_NONE; + top->cg_children = mp_ncpus / (l2count * l1count); + l1g = l2g + top->cg_children; + for (i = 0; i < top->cg_children; i++, l2g++) { + l2g->cg_parent = top; + l2g->cg_child = l1g; + l2g->cg_level = l2share; + for (j = 0; j < l2count; j++, l1g++) + cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, + l1flags, cpu); + } + return (top); +} + + +struct cpu_group * +smp_topo_find(struct cpu_group *top, int cpu) +{ + struct cpu_group *cg; + cpumask_t mask; + int children; + int i; + + mask = (1 << cpu); + cg = top; + for (;;) { + if ((cg->cg_mask & mask) == 0) + return (NULL); + if (cg->cg_children == 0) + return (cg); + children = cg->cg_children; + for (i = 0, cg = cg->cg_child; i < children; cg++, i++) + if ((cg->cg_mask & mask) != 0) + break; + } + return (NULL); +} +#else /* !SMP */ void smp_rendezvous(void (*setup_func)(void *), @@ -416,4 +575,19 @@ smp_rendezvous(void (*setup_func)(void * if (teardown_func != NULL) teardown_func(arg); } + +/* + * Provide dummy SMP support for UP kernels. Modules that need to use SMP + * APIs will still work using this dummy support. 
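The fake topologies above are selected by the new kern.smp.topology loader tunable (the sysctl is CTLFLAG_RD, so it cannot be flipped at run time), which makes the shapes testable on hardware that does not have them. For instance, kern.smp.topology=5 in /boot/loader.conf asks for smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0), which on an assumed 8-CPU machine builds:

root      (cpus 0-7, CG_SHARE_NONE)
  package (cpus 0-3, CG_SHARE_NONE)
    l2     (cpus 0-1, CG_SHARE_L2)
    l2     (cpus 2-3, CG_SHARE_L2)
  package (cpus 4-7, CG_SHARE_NONE)
    l2     (cpus 4-5, CG_SHARE_L2)
    l2     (cpus 6-7, CG_SHARE_L2)

smp_topo() then sanity-checks that the root's cg_count and cg_mask cover every CPU before the tree is handed to the scheduler.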
+ */ +static void +mp_setvariables_for_up(void *dummy) +{ + mp_ncpus = 1; + mp_maxid = PCPU_GET(cpuid); + all_cpus = PCPU_GET(cpumask); + KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); +} +SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, + mp_setvariables_for_up, NULL) #endif /* SMP */ Index: sys/kern/syscalls.c =================================================================== RCS file: /home/ncvs/src/sys/kern/syscalls.c,v retrieving revision 1.217 diff -u -p -r1.217 syscalls.c --- sys/kern/syscalls.c 12 Feb 2008 20:11:54 -0000 1.217 +++ sys/kern/syscalls.c 2 Mar 2008 02:58:10 -0000 @@ -2,7 +2,7 @@ * System call names. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/syscalls.c,v 1.217 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -491,4 +491,9 @@ const char *syscallnames[] = { "thr_kill2", /* 481 = thr_kill2 */ "shm_open", /* 482 = shm_open */ "shm_unlink", /* 483 = shm_unlink */ + "cpuset", /* 484 = cpuset */ + "cpuset_setid", /* 485 = cpuset_setid */ + "cpuset_getid", /* 486 = cpuset_getid */ + "cpuset_getaffinity", /* 487 = cpuset_getaffinity */ + "cpuset_setaffinity", /* 488 = cpuset_setaffinity */ }; Index: sys/kern/syscalls.master =================================================================== RCS file: /home/ncvs/src/sys/kern/syscalls.master,v retrieving revision 1.237 diff -u -p -r1.237 syscalls.master --- sys/kern/syscalls.master 12 Feb 2008 20:09:04 -0000 1.237 +++ sys/kern/syscalls.master 2 Mar 2008 02:58:10 -0000 @@ -850,5 +850,18 @@ 482 AUE_SHMOPEN STD { int shm_open(const char *path, int flags, \ mode_t mode); } 483 AUE_SHMUNLINK STD { int shm_unlink(const char *path); } +484 AUE_NULL STD { int cpuset(cpusetid_t *setid); } +485 AUE_NULL STD { int cpuset_setid(cpuwhich_t which, id_t id, \ + cpusetid_t setid); } +486 AUE_NULL STD { int cpuset_getid(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + cpusetid_t *setid); } +487 AUE_NULL STD { int cpuset_getaffinity(cpulevel_t level, \ + cpuwhich_t which, id_t id, int cpusetsize, \ + long *mask); } +488 AUE_NULL STD { int cpuset_setaffinity(cpulevel_t level, \ + cpuwhich_t which, id_t id, int cpusetsize, \ + long *mask); } + ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/systrace_args.c =================================================================== RCS file: /home/ncvs/src/sys/kern/systrace_args.c,v retrieving revision 1.17 diff -u -p -r1.17 systrace_args.c --- sys/kern/systrace_args.c 12 Feb 2008 20:11:54 -0000 1.17 +++ sys/kern/systrace_args.c 2 Mar 2008 02:58:11 -0000 @@ -2,7 +2,7 @@ * System call argument to DTrace register array converstion. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/systrace_args.c,v 1.17 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * This file is part of the DTrace syscall provider. 
*/ @@ -2887,6 +2887,54 @@ systrace_args(int sysnum, void *params, *n_args = 1; break; } + /* cpuset */ + case 484: { + struct cpuset_args *p = params; + uarg[0] = (intptr_t) p->setid; /* cpusetid_t * */ + *n_args = 1; + break; + } + /* cpuset_setid */ + case 485: { + struct cpuset_setid_args *p = params; + iarg[0] = p->which; /* cpuwhich_t */ + iarg[1] = p->id; /* id_t */ + iarg[2] = p->setid; /* cpusetid_t */ + *n_args = 3; + break; + } + /* cpuset_getid */ + case 486: { + struct cpuset_getid_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + uarg[3] = (intptr_t) p->setid; /* cpusetid_t * */ + *n_args = 4; + break; + } + /* cpuset_getaffinity */ + case 487: { + struct cpuset_getaffinity_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + iarg[3] = p->cpusetsize; /* int */ + uarg[4] = (intptr_t) p->mask; /* long * */ + *n_args = 5; + break; + } + /* cpuset_setaffinity */ + case 488: { + struct cpuset_setaffinity_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + iarg[3] = p->cpusetsize; /* int */ + uarg[4] = (intptr_t) p->mask; /* long * */ + *n_args = 5; + break; + } default: *n_args = 0; break; Index: sys/powerpc/powerpc/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/powerpc/powerpc/mp_machdep.c,v retrieving revision 1.13 diff -u -p -r1.13 mp_machdep.c --- sys/powerpc/powerpc/mp_machdep.c 16 May 2006 14:32:17 -0000 1.13 +++ sys/powerpc/powerpc/mp_machdep.c 2 Mar 2008 02:58:16 -0000 @@ -45,6 +45,13 @@ int boot_cpu_id; +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + void cpu_mp_setmaxid(void) { Index: sys/sparc64/sparc64/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/sparc64/sparc64/mp_machdep.c,v retrieving revision 1.36 diff -u -p -r1.36 mp_machdep.c --- sys/sparc64/sparc64/mp_machdep.c 16 Jun 2007 23:26:00 -0000 1.36 +++ sys/sparc64/sparc64/mp_machdep.c 2 Mar 2008 02:58:16 -0000 @@ -189,6 +189,13 @@ cpu_mp_probe(void) return (mp_maxid > 0); } +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + static void sun4u_startcpu(phandle_t cpu, void *func, u_long arg) { Index: sys/sun4v/sun4v/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/sun4v/sun4v/mp_machdep.c,v retrieving revision 1.8 diff -u -p -r1.8 mp_machdep.c --- sys/sun4v/sun4v/mp_machdep.c 6 Jul 2007 00:41:53 -0000 1.8 +++ sys/sun4v/sun4v/mp_machdep.c 2 Mar 2008 02:58:16 -0000 @@ -241,6 +241,13 @@ cpu_mp_probe(void) return (mp_maxid > 0); } +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + static int start_ap_bycpuid(int cpuid, void *func, u_long arg) { Index: sys/sys/_types.h =================================================================== RCS file: /home/ncvs/src/sys/sys/_types.h,v retrieving revision 1.23 diff -u -p -r1.23 _types.h --- sys/sys/_types.h 1 Mar 2006 06:29:34 -0000 1.23 +++ sys/sys/_types.h 2 Mar 2008 02:58:16 -0000 @@ -61,6 +61,9 @@ typedef struct __timer *__timer_t; /* ti typedef struct __mq *__mqd_t; /* mq_open()... */ typedef __uint32_t __uid_t; typedef unsigned int __useconds_t; /* microseconds (unsigned) */ +typedef int __cpuwhich_t; /* which parameter for cpuset. */ +typedef int __cpulevel_t; /* level parameter for cpuset. 
*/ +typedef int __cpusetid_t; /* cpuset identifier. */ /* * Unusual type definitions. Index: sys/sys/cpuset.h =================================================================== RCS file: sys/sys/cpuset.h diff -N sys/sys/cpuset.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/sys/cpuset.h 2 Mar 2008 02:58:16 -0000 @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 2008, Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_CPUSET_H_ +#define _SYS_CPUSET_H_ + +#ifdef _KERNEL +#define CPU_SETSIZE MAXCPU +#endif + +#define CPU_MAXSIZE 128 + +#ifndef CPU_SETSIZE +#define CPU_SETSIZE CPU_MAXSIZE +#endif + +#define _NCPUBITS (sizeof(long) * NBBY) /* bits per mask */ +#define _NCPUWORDS howmany(CPU_SETSIZE, _NCPUBITS) + +typedef struct _cpuset { + long __bits[howmany(CPU_SETSIZE, _NCPUBITS)]; +} cpuset_t; + +#define __cpuset_mask(n) ((long)1 << ((n) % _NCPUBITS)) +#define CPU_CLR(n, p) ((p)->__bits[(n)/_NCPUBITS] &= ~__cpuset_mask(n)) +#define CPU_COPY(f, t) (void)(*(t) = *(f)) +#define CPU_ISSET(n, p) (((p)->__bits[(n)/_NCPUBITS] & __cpuset_mask(n)) != 0) +#define CPU_SET(n, p) ((p)->__bits[(n)/_NCPUBITS] |= __cpuset_mask(n)) +#define CPU_ZERO(p) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (p)->__bits[__i] = 0; \ +} while (0) + +#define CPU_EMPTY(p) __extension__ ({ \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + if ((p)->__bits[__i]) \ + break; \ + __i == _NCPUWORDS; \ +}) + +#define CPU_OR(d, s) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (d)->__bits[__i] |= (s)->__bits[__i]; \ +} while (0) + +#define CPU_AND(d, s) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (d)->__bits[__i] &= (s)->__bits[__i]; \ +} while (0) + +#define CPU_NAND(d, s) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (d)->__bits[__i] &= ~(s)->__bits[__i]; \ +} while (0) + +/* + * Valid cpulevel_t values. + */ +#define CPU_LEVEL_ROOT 1 /* All system cpus. */ +#define CPU_LEVEL_CPUSET 2 /* Available cpus for which. */ +#define CPU_LEVEL_WHICH 3 /* Actual mask/id for which. */ + +/* + * Valid cpuwhich_t values. + */ +#define CPU_WHICH_TID 1 /* Specifies a thread id. */ +#define CPU_WHICH_PID 2 /* Specifies a process id. 
*/ +#define CPU_WHICH_CPUSET 3 /* Specifies a set id. */ + +/* + * Reserved cpuset identifiers. + */ +#define CPUSET_INVALID -1 +#define CPUSET_DEFAULT 0 + +#ifdef _KERNEL +LIST_HEAD(setlist, cpuset); + +/* + * cpusets encapsulate cpu binding information for one or more threads. + * + * a - Accessed with atomics. + * s - Set at creation, never modified. Only a ref required to read. + * c - Locked internally by a cpuset lock. + * + * The bitmask is only modified while holding the cpuset lock. It may be + * read while only a reference is held but the consumer must be prepared + * to deal with inconsistent results. + */ +struct cpuset { + cpuset_t cs_mask; /* bitmask of valid cpus. */ + volatile u_int cs_ref; /* (a) Reference count. */ + int cs_flags; /* (s) Flags from below. */ + cpusetid_t cs_id; /* (s) Id or INVALID. */ + struct cpuset *cs_parent; /* (s) Pointer to our parent. */ + LIST_ENTRY(cpuset) cs_link; /* (c) All identified sets. */ + LIST_ENTRY(cpuset) cs_siblings; /* (c) Sibling set link. */ + struct setlist cs_children; /* (c) List of children. */ +}; + +#define CPU_SET_ROOT 0x0001 /* Set is a root set. */ +#define CPU_SET_RDONLY 0x0002 /* No modification allowed. */ + +struct cpuset *cpuset_thread0(void); +struct cpuset *cpuset_ref(struct cpuset *); +void cpuset_rel(struct cpuset *); +#else +__BEGIN_DECLS +int cpuset(cpusetid_t *); +int cpuset_setid(cpuwhich_t, id_t, cpusetid_t); +int cpuset_getid(cpulevel_t, cpuwhich_t, id_t, cpusetid_t *); +int cpuset_getaffinity(cpulevel_t, cpuwhich_t, id_t, int, cpuset_t *); +int cpuset_setaffinity(cpulevel_t, cpuwhich_t, id_t, int, cpuset_t *); +__END_DECLS +#endif +#endif /* !_SYS_CPUSET_H_ */ Index: sys/sys/proc.h =================================================================== RCS file: /home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.503 diff -u -p -r1.503 proc.h --- sys/sys/proc.h 7 Feb 2008 06:55:38 -0000 1.503 +++ sys/sys/proc.h 2 Mar 2008 02:58:16 -0000 @@ -163,6 +163,7 @@ struct thread; struct trapframe; struct turnstile; struct mqueue_notifier; +struct cpuset; /* * Here we define the two structures used for process information. @@ -208,7 +209,7 @@ struct thread { /* The two queues below should someday be merged. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ - + struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ Index: sys/sys/sched.h =================================================================== RCS file: /home/ncvs/src/sys/sys/sched.h,v retrieving revision 1.33 diff -u -p -r1.33 sched.h --- sys/sys/sched.h 12 Jun 2007 19:49:39 -0000 1.33 +++ sys/sys/sched.h 2 Mar 2008 02:58:16 -0000 @@ -32,7 +32,7 @@ */ /*- - * Copyright (c) 2002, Jeffrey Roberson + * Copyright (c) 2002-2008, Jeffrey Roberson * All rights reserved. 
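Putting the userland side of this header to work: with the default CPU_MAXSIZE of 128 and 64-bit longs the mask is two words, so for example CPU_SET(70, &mask) sets bit 6 of __bits[1]. A hedged example program against the declarations above (using an id of -1 for the calling process and passing CPU_SETSIZE as the cpusetsize argument, both following usr.bin/cpuset below):

#include <sys/param.h>
#include <sys/cpuset.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        cpuset_t mask;
        int cpu;

        /* Restrict the calling process to CPUs 0 and 1. */
        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        CPU_SET(1, &mask);
        if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
            CPU_SETSIZE, &mask) != 0)
                err(1, "cpuset_setaffinity");

        /* Read the mask back and print the CPUs it contains. */
        CPU_ZERO(&mask);
        if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
            CPU_SETSIZE, &mask) != 0)
                err(1, "cpuset_getaffinity");
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &mask))
                        printf("%d ", cpu);
        printf("\n");
        return (0);
}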
* * Redistribution and use in source and binary forms, with or without @@ -129,6 +129,7 @@ static __inline void sched_pin(void); void sched_unbind(struct thread *td); static __inline void sched_unpin(void); int sched_is_bound(struct thread *td); +void sched_affinity(struct thread *td); /* * These procedures tell the process data structure allocation code how @@ -175,6 +176,7 @@ extern long switch_needresched; void schedinit(void); void sched_newproc(struct proc *p, struct thread *td); void sched_newthread(struct thread *td); + #endif /* _KERNEL */ /* POSIX 1003.1b Process Scheduling */ Index: sys/sys/smp.h =================================================================== RCS file: /home/ncvs/src/sys/sys/smp.h,v retrieving revision 1.86 diff -u -p -r1.86 smp.h --- sys/sys/smp.h 8 Nov 2007 14:47:55 -0000 1.86 +++ sys/sys/smp.h 2 Mar 2008 02:58:16 -0000 @@ -32,18 +32,40 @@ */ struct cpu_group { - cpumask_t cg_mask; /* Mask of cpus in this group. */ - int cg_count; /* Count of cpus in this group. */ - int cg_children; /* Number of children groups. */ - struct cpu_group *cg_child; /* Optional child group. */ + struct cpu_group *cg_parent; /* Our parent group. */ + struct cpu_group *cg_child; /* Optional children groups. */ + cpumask_t cg_mask; /* Mask of cpus in this group. */ + int8_t cg_count; /* Count of cpus in this group. */ + int8_t cg_children; /* Number of children groups. */ + int8_t cg_level; /* Shared cache level. */ + int8_t cg_flags; /* Traversal modifiers. */ }; -struct cpu_top { - int ct_count; /* Count of groups. */ - struct cpu_group *ct_group; /* Array of pointers to cpu groups. */ -}; +/* + * Defines common resources for CPUs in the group. The highest level + * resource should be used when multiple are shared. + */ +#define CG_SHARE_NONE 0 +#define CG_SHARE_L1 1 +#define CG_SHARE_L2 2 +#define CG_SHARE_L3 3 + +/* + * Behavior modifiers for load balancing and affinity. + */ +#define CG_FLAG_HTT 0x01 /* Schedule the alternate core last. */ +#define CG_FLAG_THREAD 0x02 /* New age htt, less crippled. */ + +/* + * Convenience routines for building topologies. + */ +struct cpu_group *smp_topo(void); +struct cpu_group *smp_topo_none(void); +struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); +struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, + int l1count, int l1flags); +struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu); -extern struct cpu_top *smp_topology; extern void (*cpustop_restartfunc)(void); extern int smp_active; extern int smp_cpus; @@ -90,6 +112,7 @@ extern cpumask_t all_cpus; */ struct thread; +struct cpu_group *cpu_topo(void); void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); Index: sys/sys/syscall.h =================================================================== RCS file: /home/ncvs/src/sys/sys/syscall.h,v retrieving revision 1.214 diff -u -p -r1.214 syscall.h --- sys/sys/syscall.h 12 Feb 2008 20:11:53 -0000 1.214 +++ sys/sys/syscall.h 2 Mar 2008 02:58:16 -0000 @@ -2,7 +2,7 @@ * System call numbers. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/sys/syscall.h,v 1.214 2008/02/12 20:11:53 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -403,4 +403,9 @@ #define SYS_thr_kill2 481 #define SYS_shm_open 482 #define SYS_shm_unlink 483 -#define SYS_MAXSYSCALL 484 +#define SYS_cpuset 484 +#define SYS_cpuset_setid 485 +#define SYS_cpuset_getid 486 +#define SYS_cpuset_getaffinity 487 +#define SYS_cpuset_setaffinity 488 +#define SYS_MAXSYSCALL 489 Index: sys/sys/syscall.mk =================================================================== RCS file: /home/ncvs/src/sys/sys/syscall.mk,v retrieving revision 1.169 diff -u -p -r1.169 syscall.mk --- sys/sys/syscall.mk 12 Feb 2008 20:11:53 -0000 1.169 +++ sys/sys/syscall.mk 2 Mar 2008 02:58:16 -0000 @@ -1,6 +1,6 @@ # FreeBSD system call names. # DO NOT EDIT-- this file is automatically generated. -# $FreeBSD: src/sys/sys/syscall.mk,v 1.169 2008/02/12 20:11:53 ru Exp $ +# $FreeBSD$ # created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp MIASM = \ syscall.o \ @@ -351,4 +351,9 @@ MIASM = \ ftruncate.o \ thr_kill2.o \ shm_open.o \ - shm_unlink.o + shm_unlink.o \ + cpuset.o \ + cpuset_setid.o \ + cpuset_getid.o \ + cpuset_getaffinity.o \ + cpuset_setaffinity.o Index: sys/sys/sysproto.h =================================================================== RCS file: /home/ncvs/src/sys/sys/sysproto.h,v retrieving revision 1.218 diff -u -p -r1.218 sysproto.h --- sys/sys/sysproto.h 12 Feb 2008 20:11:54 -0000 1.218 +++ sys/sys/sysproto.h 2 Mar 2008 02:58:16 -0000 @@ -2,7 +2,7 @@ * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/sys/sysproto.h,v 1.218 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -1528,6 +1528,34 @@ struct shm_open_args { struct shm_unlink_args { char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; }; +struct cpuset_args { + char setid_l_[PADL_(cpusetid_t *)]; cpusetid_t * setid; char setid_r_[PADR_(cpusetid_t *)]; +}; +struct cpuset_setid_args { + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char setid_l_[PADL_(cpusetid_t)]; cpusetid_t setid; char setid_r_[PADR_(cpusetid_t)]; +}; +struct cpuset_getid_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char setid_l_[PADL_(cpusetid_t *)]; cpusetid_t * setid; char setid_r_[PADR_(cpusetid_t *)]; +}; +struct cpuset_getaffinity_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char cpusetsize_l_[PADL_(int)]; int cpusetsize; char cpusetsize_r_[PADR_(int)]; + char mask_l_[PADL_(long *)]; long * mask; char mask_r_[PADR_(long *)]; +}; +struct cpuset_setaffinity_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char cpusetsize_l_[PADL_(int)]; int cpusetsize; char 
cpusetsize_r_[PADR_(int)]; + char mask_l_[PADL_(long *)]; long * mask; char mask_r_[PADR_(long *)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_exit(struct thread *, struct sys_exit_args *); int fork(struct thread *, struct fork_args *); @@ -1869,6 +1897,11 @@ int ftruncate(struct thread *, struct ft int thr_kill2(struct thread *, struct thr_kill2_args *); int shm_open(struct thread *, struct shm_open_args *); int shm_unlink(struct thread *, struct shm_unlink_args *); +int cpuset(struct thread *, struct cpuset_args *); +int cpuset_setid(struct thread *, struct cpuset_setid_args *); +int cpuset_getid(struct thread *, struct cpuset_getid_args *); +int cpuset_getaffinity(struct thread *, struct cpuset_getaffinity_args *); +int cpuset_setaffinity(struct thread *, struct cpuset_setaffinity_args *); #ifdef COMPAT_43 @@ -2435,6 +2468,11 @@ int freebsd4_sigreturn(struct thread *, #define SYS_AUE_thr_kill2 AUE_KILL #define SYS_AUE_shm_open AUE_SHMOPEN #define SYS_AUE_shm_unlink AUE_SHMUNLINK +#define SYS_AUE_cpuset AUE_NULL +#define SYS_AUE_cpuset_setid AUE_NULL +#define SYS_AUE_cpuset_getid AUE_NULL +#define SYS_AUE_cpuset_getaffinity AUE_NULL +#define SYS_AUE_cpuset_setaffinity AUE_NULL #undef PAD_ #undef PADL_ Index: sys/sys/types.h =================================================================== RCS file: /home/ncvs/src/sys/sys/types.h,v retrieving revision 1.97 diff -u -p -r1.97 types.h --- sys/sys/types.h 28 Nov 2007 21:54:46 -0000 1.97 +++ sys/sys/types.h 2 Mar 2008 02:58:16 -0000 @@ -124,6 +124,10 @@ typedef __blksize_t blksize_t; #define _BLKSIZE_T_DECLARED #endif +typedef __cpuwhich_t cpuwhich_t; +typedef __cpulevel_t cpulevel_t; +typedef __cpusetid_t cpusetid_t; + #ifndef _BLKCNT_T_DECLARED typedef __blkcnt_t blkcnt_t; #define _BLKCNT_T_DECLARED Index: usr.bin/Makefile =================================================================== RCS file: /home/ncvs/src/usr.bin/Makefile,v retrieving revision 1.306 diff -u -p -r1.306 Makefile --- usr.bin/Makefile 20 Dec 2007 16:40:25 -0000 1.306 +++ usr.bin/Makefile 2 Mar 2008 02:58:20 -0000 @@ -42,6 +42,7 @@ SUBDIR= alias \ comm \ compile_et \ compress \ + cpuset \ csplit \ ${_csup} \ ctags \ Index: usr.bin/cpuset/Makefile =================================================================== RCS file: usr.bin/cpuset/Makefile diff -N usr.bin/cpuset/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/cpuset/Makefile 2 Mar 2008 02:58:20 -0000 @@ -0,0 +1,7 @@ +# $FreeBSD$ + +PROG= cpuset +NO_MAN= true +WARNS?= 1 + +.include Index: usr.bin/cpuset/cpuset.c =================================================================== RCS file: usr.bin/cpuset/cpuset.c diff -N usr.bin/cpuset/cpuset.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/cpuset/cpuset.c 2 Mar 2008 02:58:20 -0000 @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2007, 2008 Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +int cflag; +int gflag; +int iflag; +int lflag; +int pflag; +int rflag; +int sflag; +int tflag; +id_t id; +cpulevel_t level; +cpuwhich_t which; + +void usage(void); + +void printset(cpuset_t *mask); + +void +parselist(char *list, cpuset_t *mask) +{ + enum { NONE, NUM, DASH } state; + int lastnum; + int curnum; + char *l; + + state = NONE; + for (l = list; *l != '\0';) { + if (isdigit(*l)) { + curnum = atoi(l); + if (curnum > CPU_SETSIZE) + errx(EXIT_FAILURE, + "Only %d cpus supported", CPU_SETSIZE); + while (isdigit(*l)) + l++; + switch (state) { + case NONE: + lastnum = curnum; + state = NUM; + break; + case DASH: + for (; lastnum <= curnum; lastnum++) + CPU_SET(lastnum, mask); + state = NONE; + break; + case NUM: + default: + goto parserr; + } + continue; + } + switch (*l) { + case ',': + switch (state) { + case NONE: + break; + case NUM: + CPU_SET(curnum, mask); + state = NONE; + break; + case DASH: + goto parserr; + break; + } + break; + case '-': + if (state != NUM) + goto parserr; + state = DASH; + break; + default: + goto parserr; + } + l++; + } + switch (state) { + case NONE: + break; + case NUM: + CPU_SET(curnum, mask); + break; + case DASH: + goto parserr; + } + return; +parserr: + errx(EXIT_FAILURE, "Malformed cpu list %s", list); +} + +void +printset(cpuset_t *mask) +{ + int once; + int cpu; + + for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) { + if (CPU_ISSET(cpu, mask)) { + if (once == 0) { + printf("%d", cpu); + once = 1; + } else + printf(", %d", cpu); + } + } + printf("\n"); +} + +char *whichnames[] = { NULL, "tid", "pid", "cpuset" }; +char *levelnames[] = { NULL, " root", " cpuset", "" }; + +void +printaffinity(void) +{ + cpuset_t mask; + + if (cpuset_getaffinity(level, which, id, CPU_SETSIZE, + &mask) != 0) + err(EXIT_FAILURE, "getaffinity"); + printf("%s %d%s mask: ", whichnames[which], id, levelnames[level]); + printset(&mask); + exit(EXIT_SUCCESS); +} + +void +printsetid(void) +{ + cpusetid_t setid; + + /* + * Only LEVEL_WHICH && WHICH_CPUSET has a numbered id. 
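parselist() implements the usual comma and range syntax: "2" selects a single CPU, "0,2" selects CPUs 0 and 2, "0-3" expands to CPUs 0 through 3, and the forms combine as in "0-3,8"; anything else, including a trailing dash, lands on the "Malformed cpu list" error. printset() prints the result back in the same form, as a comma-separated list of the CPUs that are set.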
+ */ + if (level == CPU_LEVEL_WHICH && !sflag) + level = CPU_LEVEL_CPUSET; + if (cpuset_getid(level, which, id, &setid)) + err(errno, "getid"); + printf("%s %d%s id: %d\n", whichnames[which], id, + levelnames[level], setid); +} + +int +main(int argc, char *argv[]) +{ + cpusetid_t setid; + cpuset_t mask; + lwpid_t tid; + pid_t pid; + int ch; + + CPU_ZERO(&mask); + level = CPU_LEVEL_WHICH; + which = CPU_WHICH_PID; + id = -1; + while ((ch = getopt(argc, argv, "cgil:p:rs:t:")) != -1) { + switch (ch) { + case 'c': + if (rflag) + usage(); + cflag = 1; + level = CPU_LEVEL_CPUSET; + break; + case 'g': + gflag = 1; + break; + case 'i': + iflag = 1; + break; + case 'l': + lflag = 1; + parselist(optarg, &mask); + break; + case 'p': + pflag = 1; + which = CPU_WHICH_PID; + id = pid = atoi(optarg); + break; + case 'r': + if (cflag) + usage(); + level = CPU_LEVEL_ROOT; + rflag = 1; + break; + case 's': + sflag = 1; + which = CPU_WHICH_CPUSET; + id = setid = atoi(optarg); + break; + case 't': + tflag = 1; + which = CPU_WHICH_TID; + id = tid = atoi(optarg); + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + if (gflag) { + if (argc || lflag) + usage(); + /* Only one identity specifier. */ + if (sflag + pflag + tflag > 1) + usage(); + if (iflag) + printsetid(); + else + printaffinity(); + exit(EXIT_SUCCESS); + } + /* + * The user wants to run a command with a set and possibly cpumask. + */ + if (argc) { + if (pflag | rflag | tflag || cflag) + usage(); + if (sflag && iflag) + usage(); + if (sflag) { + if (cpuset_setid(CPU_WHICH_PID, -1, setid)) + err(argc, "setid"); + which = CPU_WHICH_PID; + level = CPU_LEVEL_WHICH; + } + if (iflag) { + if (cpuset(&setid)) + err(argc, "newid"); + which = CPU_WHICH_CPUSET; + level = CPU_LEVEL_WHICH; + } + if (lflag) { + if (cpuset_setaffinity(level, which, -1, + CPU_SETSIZE, &mask) != 0) + err(EXIT_FAILURE, "setaffinity"); + } + errno = 0; + execvp(*argv, argv); + err(errno == ENOENT ? 127 : 126, "%s", *argv); + } + /* + * We're modifying something that presently exists. + */ + if (iflag) + usage(); + if (!lflag && (cflag || rflag)) + usage(); + if (!lflag && !sflag) + usage(); + /* You can only set a mask on a thread. */ + if (tflag && (sflag || pflag)) + usage(); + if (pflag && sflag) { + if (cpuset_setid(CPU_WHICH_PID, pid, setid)) + err(EXIT_FAILURE, "setid"); + /* + * If the user specifies a set and a list we want the mask + * to effect the pid and not the set. + */ + which = CPU_WHICH_PID; + id = pid; + } + if (lflag) { + if (cpuset_setaffinity(level, which, id, CPU_SETSIZE, + &mask) != 0) + err(EXIT_FAILURE, "setaffinity"); + } + + exit(EXIT_SUCCESS); +} + +void +usage(void) +{ + + fprintf(stderr, + "usage: cpuset [-l cpu list] [-i | -s setid] cmd ...\n"); + fprintf(stderr, + " cpuset [-l cpu list] [-s setid] -p pid\n"); + fprintf(stderr, + " cpuset [-cr] [-l cpu list] [-p pid | -t tid | -s setid]\n"); + fprintf(stderr, + " cpuset [-cgir] [-p pid | -t tid | -s setid]\n"); + exit(1); +}
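Taken together, the tool as posted supports invocations such as "cpuset -l 0-3 make" (bind the current process to CPUs 0-3, then exec make), "cpuset -g -p 1234" (print the mask currently applied to pid 1234) and "cpuset -l 1 -t 100123" (re-pin an existing thread by tid, the tid value here being only an example), while -s, -i, -c and -r select which set or level the operation applies to, as laid out in usage().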