Index: lib/libc/sys/Symbol.map =================================================================== RCS file: /home/ncvs/src/lib/libc/sys/Symbol.map,v retrieving revision 1.9 diff -u -p -r1.9 Symbol.map --- lib/libc/sys/Symbol.map 22 Aug 2007 01:56:35 -0000 1.9 +++ lib/libc/sys/Symbol.map 2 Mar 2008 02:57:58 -0000 @@ -66,6 +66,11 @@ FBSD_1.0 { clock_settime; close; connect; + cpuset; + cpuset_setid; + cpuset_getid; + cpuset_setaffinity; + cpuset_getaffinity; dup; dup2; eaccess; @@ -450,6 +455,16 @@ FBSDprivate_1.0 { __sys_close; _connect; __sys_connect; + __cpuset; + __sys_cpuset; + __cpuset_setid; + __sys_cpuset_setid; + __cpuset_getid; + __sys_cpuset_getid; + __cpuset_setaffinity; + __sys_cpuset_setaffinity; + __cpuset_getaffinity; + __sys_cpuset_getaffinity; _dup; __sys_dup; _dup2; Index: sys/amd64/amd64/identcpu.c =================================================================== RCS file: /home/ncvs/src/sys/amd64/amd64/identcpu.c,v retrieving revision 1.157 diff -u -p -r1.157 identcpu.c --- sys/amd64/amd64/identcpu.c 2 Feb 2008 23:17:27 -0000 1.157 +++ sys/amd64/amd64/identcpu.c 2 Mar 2008 02:58:05 -0000 @@ -97,6 +97,10 @@ static struct { { "Sledgehammer", CPUCLASS_K8 }, /* CPU_SLEDGEHAMMER */ }; +int cpu_cores; +int cpu_logical; + + extern int pq_l2size; extern int pq_l2nways; @@ -360,11 +364,13 @@ printcpuinfo(void) if ((regs[0] & 0x1f) != 0) cmp = ((regs[0] >> 26) & 0x3f) + 1; } + cpu_cores = cmp; + cpu_logical = htt / cmp; if (cmp > 1) printf("\n Cores per package: %d", cmp); if ((htt / cmp) > 1) printf("\n Logical CPUs per core: %d", - htt / cmp); + cpu_logical); } } /* Avoid ugly blank lines: only print newline when we have to. */ Index: sys/amd64/amd64/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/amd64/amd64/mp_machdep.c,v retrieving revision 1.287 diff -u -p -r1.287 mp_machdep.c --- sys/amd64/amd64/mp_machdep.c 2 Aug 2007 21:17:58 -0000 1.287 +++ sys/amd64/amd64/mp_machdep.c 2 Mar 2008 02:58:05 -0000 @@ -83,12 +83,6 @@ extern int nkpt; extern struct pcpu __pcpu[]; -/* - * CPU topology map datastructures for HTT. - */ -static struct cpu_group mp_groups[MAXCPU]; -static struct cpu_top mp_top; - /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; @@ -182,40 +176,38 @@ mem_range_AP_init(void) mem_range_softc.mr_op->initAP(&mem_range_softc); } -void -mp_topology(void) +struct cpu_group * +cpu_topo(void) { - struct cpu_group *group; - int apic_id; - int groups; - int cpu; - - /* Build the smp_topology map. */ - /* Nothing to do if there is no HTT support. */ - if (hyperthreading_cpus <= 1) - return; - group = &mp_groups[0]; - groups = 1; - for (cpu = 0, apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) { - if (!cpu_info[apic_id].cpu_present) - continue; - /* - * If the current group has members and we're not a logical - * cpu, create a new group. - */ - if (group->cg_count != 0 && - (apic_id % hyperthreading_cpus) == 0) { - group++; - groups++; - } - group->cg_count++; - group->cg_mask |= 1 << cpu; - cpu++; + if (cpu_cores == 0) + cpu_cores = 1; + if (cpu_logical == 0) + cpu_logical = 1; + if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { + printf("WARNING: Non-uniform processors.\n"); + printf("WARNING: Using suboptimal topology.\n"); + return (smp_topo_none()); } - - mp_top.ct_count = groups; - mp_top.ct_group = mp_groups; - smp_topology = &mp_top; + /* + * No multi-core or hyper-threaded. 
+ */ + if (cpu_logical * cpu_cores == 1) + return (smp_topo_none()); + /* + * Only HTT no multi-core. + */ + if (cpu_logical > 1 && cpu_cores == 1) + return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); + /* + * Only multi-core no HTT. + */ + if (cpu_cores > 1 && cpu_logical == 1) + return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); + /* + * Both HTT and multi-core. + */ + return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, + CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); } /* @@ -409,9 +401,6 @@ cpu_mp_start(void) } set_interrupt_apic_ids(); - - /* Last, setup the cpu topology now that we have probed CPUs */ - mp_topology(); } Index: sys/amd64/include/smp.h =================================================================== RCS file: /home/ncvs/src/sys/amd64/include/smp.h,v retrieving revision 1.91 diff -u -p -r1.91 smp.h --- sys/amd64/include/smp.h 20 Sep 2007 20:38:43 -0000 1.91 +++ sys/amd64/include/smp.h 2 Mar 2008 02:58:05 -0000 @@ -36,6 +36,10 @@ extern int boot_cpu_id; extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; +/* global data in identcpu.c */ +extern int cpu_cores; +extern int cpu_logical; + /* IPI handlers */ inthand_t IDTVEC(invltlb), /* TLB shootdowns - global */ @@ -57,7 +61,6 @@ void ipi_self(u_int ipi); void ipi_bitmap_handler(struct trapframe frame); u_int mp_bootaddress(u_int); int mp_grab_cpu_hlt(void); -void mp_topology(void); void smp_cache_flush(void); void smp_invlpg(vm_offset_t addr); void smp_masked_invlpg(u_int mask, vm_offset_t addr); Index: sys/conf/files =================================================================== RCS file: /home/ncvs/src/sys/conf/files,v retrieving revision 1.1269 diff -u -p -r1.1269 files --- sys/conf/files 20 Feb 2008 07:50:13 -0000 1.1269 +++ sys/conf/files 2 Mar 2008 02:58:06 -0000 @@ -1426,6 +1426,7 @@ kern/kern_clock.c standard kern/kern_condvar.c standard kern/kern_conf.c standard kern/kern_cpu.c standard +kern/kern_cpuset.c standard kern/kern_context.c standard kern/kern_descrip.c standard kern/kern_environment.c standard Index: sys/i386/i386/identcpu.c =================================================================== RCS file: /home/ncvs/src/sys/i386/i386/identcpu.c,v retrieving revision 1.180 diff -u -p -r1.180 identcpu.c --- sys/i386/i386/identcpu.c 29 May 2007 19:39:18 -0000 1.180 +++ sys/i386/i386/identcpu.c 2 Mar 2008 02:58:09 -0000 @@ -141,6 +141,9 @@ static struct { { "Pentium 4", CPUCLASS_686 }, /* CPU_P4 */ }; +int cpu_cores; +int cpu_logical; + #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif @@ -874,11 +877,13 @@ via_common: if ((regs[0] & 0x1f) != 0) cmp = ((regs[0] >> 26) & 0x3f) + 1; } + cpu_cores = cmp; + cpu_logical = htt / cmp; if (cmp > 1) printf("\n Cores per package: %d", cmp); if ((htt / cmp) > 1) printf("\n Logical CPUs per core: %d", - htt / cmp); + cpu_logical); } } else if (strcmp(cpu_vendor, "CyrixInstead") == 0) { printf(" DIR=0x%04x", cyrix_did); Index: sys/i386/i386/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/i386/i386/mp_machdep.c,v retrieving revision 1.282 diff -u -p -r1.282 mp_machdep.c --- sys/i386/i386/mp_machdep.c 13 Nov 2007 23:00:24 -0000 1.282 +++ sys/i386/i386/mp_machdep.c 2 Mar 2008 02:58:09 -0000 @@ -135,12 +135,6 @@ extern int nkpt; extern struct pcpu __pcpu[]; -/* - * CPU topology map datastructures for HTT. - */ -static struct cpu_group mp_groups[MAXCPU]; -static struct cpu_top mp_top; - /* AP uses this during bootstrap. 
Do not staticize. */ char *bootSTK; static int bootAP; @@ -238,40 +232,38 @@ mem_range_AP_init(void) mem_range_softc.mr_op->initAP(&mem_range_softc); } -void -mp_topology(void) +struct cpu_group * +cpu_topo(void) { - struct cpu_group *group; - int apic_id; - int groups; - int cpu; - - /* Build the smp_topology map. */ - /* Nothing to do if there is no HTT support. */ - if (hyperthreading_cpus <= 1) - return; - group = &mp_groups[0]; - groups = 1; - for (cpu = 0, apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) { - if (!cpu_info[apic_id].cpu_present) - continue; - /* - * If the current group has members and we're not a logical - * cpu, create a new group. - */ - if (group->cg_count != 0 && - (apic_id % hyperthreading_cpus) == 0) { - group++; - groups++; - } - group->cg_count++; - group->cg_mask |= 1 << cpu; - cpu++; + if (cpu_cores == 0) + cpu_cores = 1; + if (cpu_logical == 0) + cpu_logical = 1; + if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { + printf("WARNING: Non-uniform processors.\n"); + printf("WARNING: Using suboptimal topology.\n"); + return (smp_topo_none()); } - - mp_top.ct_count = groups; - mp_top.ct_group = mp_groups; - smp_topology = &mp_top; + /* + * No multi-core or hyper-threaded. + */ + if (cpu_logical * cpu_cores == 1) + return (smp_topo_none()); + /* + * Only HTT no multi-core. + */ + if (cpu_logical > 1 && cpu_cores == 1) + return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); + /* + * Only multi-core no HTT. + */ + if (cpu_cores > 1 && cpu_logical == 1) + return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); + /* + * Both HTT and multi-core. + */ + return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, + CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); } @@ -459,9 +451,6 @@ cpu_mp_start(void) } set_interrupt_apic_ids(); - - /* Last, setup the cpu topology now that we have probed CPUs */ - mp_topology(); } Index: sys/i386/include/smp.h =================================================================== RCS file: /home/ncvs/src/sys/i386/include/smp.h,v retrieving revision 1.90 diff -u -p -r1.90 smp.h --- sys/i386/include/smp.h 20 Sep 2007 20:38:43 -0000 1.90 +++ sys/i386/include/smp.h 2 Mar 2008 02:58:10 -0000 @@ -45,6 +45,10 @@ extern u_long *ipi_rendezvous_counts[MAX extern u_long *ipi_lazypmap_counts[MAXCPU]; #endif +/* global data in identcpu.c */ +extern int cpu_cores; +extern int cpu_logical; + /* IPI handlers */ inthand_t IDTVEC(invltlb), /* TLB shootdowns - global */ @@ -67,7 +71,6 @@ void ipi_self(u_int ipi); void ipi_bitmap_handler(struct trapframe frame); u_int mp_bootaddress(u_int); int mp_grab_cpu_hlt(void); -void mp_topology(void); void smp_cache_flush(void); void smp_invlpg(vm_offset_t addr); void smp_masked_invlpg(u_int mask, vm_offset_t addr); Index: sys/ia64/ia64/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/ia64/ia64/mp_machdep.c,v retrieving revision 1.67 diff -u -p -r1.67 mp_machdep.c --- sys/ia64/ia64/mp_machdep.c 6 Aug 2007 05:15:57 -0000 1.67 +++ sys/ia64/ia64/mp_machdep.c 2 Mar 2008 02:58:10 -0000 @@ -84,6 +84,13 @@ volatile int ap_spin; static void cpu_mp_unleash(void *); +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + void ia64_ap_startup(void) { Index: sys/kern/init_main.c =================================================================== RCS file: /home/ncvs/src/sys/kern/init_main.c,v retrieving revision 1.290 diff -u -p -r1.290 init_main.c --- sys/kern/init_main.c 10 Jan 2008 22:11:20 -0000 1.290 +++ sys/kern/init_main.c 2 Mar 2008 02:58:10 -0000 @@ 
-73,6 +73,7 @@ __FBSDID("$FreeBSD: src/sys/kern/init_ma #include #include #include +#include #include @@ -430,6 +431,7 @@ proc0_init(void *dummy __unused) td->td_base_pri = PUSER; td->td_oncpu = 0; td->td_flags = TDF_INMEM|TDP_KTHREAD; + td->td_cpuset = cpuset_thread0(); p->p_peers = 0; p->p_leader = p; Index: sys/kern/init_sysent.c =================================================================== RCS file: /home/ncvs/src/sys/kern/init_sysent.c,v retrieving revision 1.233 diff -u -p -r1.233 init_sysent.c --- sys/kern/init_sysent.c 12 Feb 2008 20:11:54 -0000 1.233 +++ sys/kern/init_sysent.c 2 Mar 2008 02:58:10 -0000 @@ -2,7 +2,7 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/init_sysent.c,v 1.233 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -513,4 +513,9 @@ struct sysent sysent[] = { { AS(thr_kill2_args), (sy_call_t *)thr_kill2, AUE_KILL, NULL, 0, 0 }, /* 481 = thr_kill2 */ { AS(shm_open_args), (sy_call_t *)shm_open, AUE_SHMOPEN, NULL, 0, 0 }, /* 482 = shm_open */ { AS(shm_unlink_args), (sy_call_t *)shm_unlink, AUE_SHMUNLINK, NULL, 0, 0 }, /* 483 = shm_unlink */ + { AS(cpuset_args), (sy_call_t *)cpuset, AUE_NULL, NULL, 0, 0 }, /* 484 = cpuset */ + { AS(cpuset_setid_args), (sy_call_t *)cpuset_setid, AUE_NULL, NULL, 0, 0 }, /* 485 = cpuset_setid */ + { AS(cpuset_getid_args), (sy_call_t *)cpuset_getid, AUE_NULL, NULL, 0, 0 }, /* 486 = cpuset_getid */ + { AS(cpuset_getaffinity_args), (sy_call_t *)cpuset_getaffinity, AUE_NULL, NULL, 0, 0 }, /* 487 = cpuset_getaffinity */ + { AS(cpuset_setaffinity_args), (sy_call_t *)cpuset_setaffinity, AUE_NULL, NULL, 0, 0 }, /* 488 = cpuset_setaffinity */ }; Index: sys/kern/kern_cpuset.c =================================================================== RCS file: sys/kern/kern_cpuset.c diff -N sys/kern/kern_cpuset.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/kern/kern_cpuset.c 2 Mar 2008 02:58:10 -0000 @@ -0,0 +1,907 @@ +/*- + * Copyright (c) 2008, Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * cpusets provide a mechanism for creating and manipulating sets of + * processors for the purpose of constraining the scheduling of threads to + * specific processors. + * + * Each process belongs to an identified set, by default this is set 1. Each + * thread may further restrict the cpus it may run on to a subset of this + * named set. This creates an anonymous set which other threads and processes + * may not join by number. + * + * The named set is referred to herein as the 'base' set to avoid ambiguity. + * This set is usually a child of a 'root' set while the anonymous set may + * simply be referred to as a mask. In the syscall api these are referred to + * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. + * + * Threads inherit their set from their creator whether it be anonymous or + * not. This means that anonymous sets are immutable because they may be + * shared. To modify an anonymous set a new set is created with the desired + * mask and the same parent as the existing anonymous set. This gives the + * illusion of each thread having a private mask. + * + * Via the syscall apis a user may ask to retrieve or modify the root, base, + * or mask that is discovered via a pid, tid, or setid. Modifying a set + * modifies all numbered and anonymous child sets to comply with the new mask. + * Modifying a pid or tid's mask applies only to that tid but must still + * exist within the assigned parent set. + * + * A thread may not be assigned to a group separate from other threads in + * the process. This is to remove ambiguity when the setid is queried with + * a pid argument. There is no other technical limitation. + * + * This somewhat complex arrangement is intended to make it easy for + * applications to query available processors and bind their threads to + * specific processors while also allowing administrators to dynamically + * reprovision by changing sets which apply to groups of processes. + * + * A simple application should not concern itself with sets at all and + * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id + * meaning 'curthread'. It may query available cpus for that tid with a + * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). + */ +static uma_zone_t cpuset_zone; +static struct mtx cpuset_lock; +static struct setlist cpuset_ids; +struct cpuset *cpuset_zero; +static struct unrhdr *cpuset_unr; + +/* + * Acquire a reference to a cpuset, all pointers must be tracked with refs. + */ +struct cpuset * +cpuset_ref(struct cpuset *set) +{ + + refcount_acquire(&set->cs_ref); + return (set); +} + +/* + * Release a reference in a context where it is safe to allocate. + */ +void +cpuset_rel(struct cpuset *set) +{ + cpusetid_t id; + + if (refcount_release(&set->cs_ref) == 0) + return; + mtx_lock_spin(&cpuset_lock); + LIST_REMOVE(set, cs_siblings); + id = set->cs_id; + if (id != CPUSET_INVALID) + LIST_REMOVE(set, cs_link); + mtx_unlock_spin(&cpuset_lock); + cpuset_rel(set->cs_parent); + uma_zfree(cpuset_zone, set); + if (id != CPUSET_INVALID) + free_unr(cpuset_unr, id); +} + +/* + * Deferred release must be used when in a context that is not safe to + * allocate/free. This places any unreferenced sets on the list 'head'.
+ */ +static void +cpuset_rel_defer(struct setlist *head, struct cpuset *set) +{ + + if (refcount_release(&set->cs_ref) == 0) + return; + mtx_lock_spin(&cpuset_lock); + LIST_REMOVE(set, cs_siblings); + if (set->cs_id != CPUSET_INVALID) + LIST_REMOVE(set, cs_link); + LIST_INSERT_HEAD(head, set, cs_link); + mtx_unlock_spin(&cpuset_lock); +} + +/* + * Complete a deferred release. Removes the set from the list provided to + * cpuset_rel_defer. + */ +static void +cpuset_rel_complete(struct cpuset *set) +{ + LIST_REMOVE(set, cs_link); + cpuset_rel(set->cs_parent); + uma_zfree(cpuset_zone, set); +} + +/* + * Find a set based on an id. Returns it with a ref. + */ +static struct cpuset * +cpuset_lookup(cpusetid_t setid) +{ + struct cpuset *set; + + if (setid == CPUSET_INVALID) + return (NULL); + mtx_lock_spin(&cpuset_lock); + LIST_FOREACH(set, &cpuset_ids, cs_link) + if (set->cs_id == setid) + break; + if (set) + cpuset_ref(set); + mtx_unlock_spin(&cpuset_lock); + return (set); +} + +/* + * Create a set in the space provided in 'set' with the provided parameters. + * The set is returned with a single ref. May return EDEADLK if the set + * will have no valid cpu based on restrictions from the parent. + */ +static int +_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask, + cpusetid_t id) +{ + int error; + + error = 0; + CPU_COPY(mask, &set->cs_mask); + LIST_INIT(&set->cs_children); + refcount_init(&set->cs_ref, 1); + set->cs_flags = 0; + mtx_lock_spin(&cpuset_lock); + CPU_AND(mask, &parent->cs_mask); + if (!CPU_EMPTY(mask)) { + set->cs_id = id; + set->cs_parent = cpuset_ref(parent); + LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); + if (set->cs_id != CPUSET_INVALID) + LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); + } else + error = EDEADLK; + mtx_unlock_spin(&cpuset_lock); + + return (error); +} + +/* + * Create a new non-anonymous set with the requested parent and mask. May + * return failures if the mask is invalid or a new number can not be + * allocated. + */ +static int +cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask) +{ + struct cpuset *set; + cpusetid_t id; + int error; + + id = alloc_unr(cpuset_unr); + if (id == -1) + return (ENFILE); + *setp = set = uma_zalloc(cpuset_zone, M_WAITOK); + error = _cpuset_create(set, parent, mask, id); + if (error == 0) + return (0); + free_unr(cpuset_unr, id); + uma_zfree(cpuset_zone, set); + + return (error); +} + +/* + * Recursively check for errors that would occur from applying mask to + * the tree of sets starting at 'set'. Checks for sets that would become + * empty as well as RDONLY flags. + */ +static int +cpuset_testupdate(struct cpuset *set, cpuset_t *mask) +{ + struct cpuset *nset; + cpuset_t newmask; + int error; + + mtx_assert(&cpuset_lock, MA_OWNED); + if (set->cs_flags & CPU_SET_RDONLY) + return (EPERM); + error = 0; + CPU_COPY(&set->cs_mask, &newmask); + CPU_AND(&newmask, mask); + if (CPU_EMPTY(&newmask)) + return (EDEADLK); + LIST_FOREACH(nset, &set->cs_children, cs_siblings) + if ((error = cpuset_testupdate(nset, &newmask)) != 0) + break; + return (error); +} + +/* + * Applies the mask 'mask' without checking for empty sets or permissions. + */ +static void +cpuset_update(struct cpuset *set, cpuset_t *mask) +{ + struct cpuset *nset; + + mtx_assert(&cpuset_lock, MA_OWNED); + CPU_AND(&set->cs_mask, mask); + LIST_FOREACH(nset, &set->cs_children, cs_siblings) + cpuset_update(nset, &set->cs_mask); + + return; +} + +/* + * Modify the set 'set' to use a copy of the mask provided. 
Apply this new + * mask to restrict all children in the tree. Checks for validity before + * applying the changes. + */ +static int +cpuset_modify(struct cpuset *set, cpuset_t *mask) +{ + int error; + + error = suser(curthread); + if (error) + return (error); + mtx_lock_spin(&cpuset_lock); + error = cpuset_testupdate(set, mask); + if (error) + goto out; + cpuset_update(set, mask); + CPU_COPY(mask, &set->cs_mask); +out: + mtx_unlock_spin(&cpuset_lock); + + return (error); +} + +/* + * Walks up the tree from 'set' to find the root. Returns the root + * referenced. + */ +static struct cpuset * +cpuset_root(struct cpuset *set) +{ + + mtx_lock_spin(&cpuset_lock); + for (; set->cs_parent != NULL; set = set->cs_parent) + if (set->cs_flags & CPU_SET_ROOT) + break; + cpuset_ref(set); + mtx_unlock_spin(&cpuset_lock); + + return (set); +} + +/* + * Find the first non-anonymous set starting from 'set'. Returns this set + * referenced. May return the passed in set with an extra ref if it is + * not anonymous. + */ +static struct cpuset * +cpuset_base(struct cpuset *set) +{ + + mtx_lock_spin(&cpuset_lock); + if (set->cs_id == CPUSET_INVALID) + set = set->cs_parent; + cpuset_ref(set); + mtx_unlock_spin(&cpuset_lock); + + return (set); +} + +/* + * Resolve the 'which' parameter of several cpuset apis. + * + * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also + * checks for permission via p_cansched(). + * + * For WHICH_SET returns a valid set with a new reference. + * + * -1 may be supplied for any argument to mean the current proc/thread or + * the base set of the current thread. May fail with ESRCH/EPERM. + */ +static int +cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, + struct cpuset **setp) +{ + struct cpuset *set; + struct thread *td; + struct proc *p; + int error; + + *pp = p = NULL; + *tdp = td = NULL; + *setp = set = NULL; + switch (which) { + case CPU_WHICH_PID: + if (id == -1) { + PROC_LOCK(curproc); + p = curproc; + break; + } + if ((p = pfind(id)) == NULL) + return (ESRCH); + break; + case CPU_WHICH_TID: + if (id == -1) { + PROC_LOCK(curproc); + p = curproc; + td = curthread; + break; + } + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + PROC_SLOCK(p); + FOREACH_THREAD_IN_PROC(p, td) + if (td->td_tid == id) + break; + PROC_SUNLOCK(p); + if (td != NULL) + break; + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + if (td == NULL) + return (ESRCH); + break; + case CPU_WHICH_CPUSET: + if (id == -1) { + thread_lock(curthread); + set = cpuset_base(curthread->td_cpuset); + thread_unlock(curthread); + } else + set = cpuset_lookup(id); + if (set) { + *setp = set; + return (0); + } + return (ESRCH); + default: + return (EINVAL); + } + error = p_cansched(curthread, p); + if (error) { + PROC_UNLOCK(p); + return (error); + } + if (td == NULL) + td = FIRST_THREAD_IN_PROC(p); + *pp = p; + *tdp = td; + return (0); +} + +/* + * Create an anonymous set with the provided mask in the space provided by + * 'fset'. If the passed in set is anonymous we use its parent otherwise + * the new set is a child of 'set'. + */ +static int +cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask) +{ + struct cpuset *parent; + + if (set->cs_id == CPUSET_INVALID) + parent = set->cs_parent; + else + parent = set; + return (_cpuset_create(fset, parent, mask, CPUSET_INVALID)); +} + +/* + * Handle two cases for replacing the base set or mask of an entire process. + * + * 1) Set is non-null and mask is null. 
This reparents all anonymous sets + * to the provided set and replaces all non-anonymous td_cpusets with the + * provided set. + * 2) Mask is non-null and set is null. This replaces or creates anonymous + * sets for every thread with the existing base as a parent. + * + * This is overly complicated because we can't allocate while holding a + * spinlock and spinlocks must be held while changing and examining thread + * state. + */ +static int +cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) +{ + struct setlist freelist; + struct setlist droplist; + struct cpuset *nset; + struct thread *td; + struct proc *p; + int threads; + int nfree; + int error; + /* + * The algorithm requires two passes due to locking considerations. + * + * 1) Lookup the process and acquire the locks in the required order. + * 2) If enough cpusets have not been allocated release the locks and + * allocate them. Loop. + */ + LIST_INIT(&freelist); + LIST_INIT(&droplist); + nfree = 0; + for (;;) { + error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); + if (error) + goto out; + PROC_SLOCK(p); + if (nfree >= p->p_numthreads) + break; + threads = p->p_numthreads; + PROC_SUNLOCK(p); + PROC_UNLOCK(p); + for (; nfree < threads; nfree++) { + nset = uma_zalloc(cpuset_zone, M_WAITOK); + LIST_INSERT_HEAD(&freelist, nset, cs_link); + } + } + PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + /* + * Now that the appropriate locks are held and we have enough cpusets, + * replace each thread's cpuset while using deferred release. We + * must do this because the PROC_SLOCK has to be held while traversing + * the thread list and this limits the type of operations allowed. + */ + error = 0; + FOREACH_THREAD_IN_PROC(p, td) { + struct cpuset *tdset; + thread_lock(td); + /* + * If we presently have an anonymous set or are applying a + * mask we must create an anonymous shadow set. That is + * either parented to our existing base or the supplied set. + * + * If we have a base set with no anonymous shadow we simply + * replace it outright. + */ + tdset = td->td_cpuset; + if (tdset->cs_id == CPUSET_INVALID || mask) { + nset = LIST_FIRST(&freelist); + LIST_REMOVE(nset, cs_link); + if (mask) + error = cpuset_shadow(tdset, nset, mask); + else + error = _cpuset_create(nset, set, + &tdset->cs_mask, CPUSET_INVALID); + if (error) { + LIST_INSERT_HEAD(&freelist, nset, cs_link); + thread_unlock(td); + break; + } + } else + nset = cpuset_ref(set); + cpuset_rel_defer(&droplist, tdset); + td->td_cpuset = nset; + sched_affinity(td); + thread_unlock(td); + } + PROC_SUNLOCK(p); + PROC_UNLOCK(p); +out: + while ((nset = LIST_FIRST(&droplist)) != NULL) + cpuset_rel_complete(nset); + while ((nset = LIST_FIRST(&freelist)) != NULL) { + LIST_REMOVE(nset, cs_link); + uma_zfree(cpuset_zone, nset); + } + return (error); +} + +/* + * Apply an anonymous mask to a single thread. + */ +static int +cpuset_setthread(lwpid_t id, cpuset_t *mask) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *td; + struct proc *p; + int error; + + nset = uma_zalloc(cpuset_zone, M_WAITOK); + error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &nset); + if (error) + goto out; + thread_lock(td); + set = td->td_cpuset; + error = cpuset_shadow(set, nset, mask); + if (error == 0) { + cpuset_rel(td->td_cpuset); + td->td_cpuset = nset; + sched_affinity(td); + nset = NULL; + } + thread_unlock(td); + PROC_UNLOCK(p); +out: + if (nset) + uma_zfree(cpuset_zone, nset); + return (error); +} + +/* + * Creates the cpuset for thread0. 
We make two sets: + * + * 0 - The root set which should represent all valid processors in the + * system. It is initially created with a mask of all processors + * because we don't know what processors are valid until cpuset_init() + * runs. This set is immutable. + * 1 - The default set which all processes are a member of until changed. + * This allows an administrator to move all threads off of given cpus to + * dedicate them to high priority tasks or save power etc. + */ +struct cpuset * +cpuset_thread0(void) +{ + struct cpuset *set; + int error; + + cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); + /* + * Create the root system set for the whole machine. Doesn't use + * cpuset_create() due to NULL parent. + */ + set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); + set->cs_mask.__bits[0] = -1; + LIST_INIT(&set->cs_children); + LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); + set->cs_ref = 1; + set->cs_flags = CPU_SET_ROOT; + cpuset_zero = set; + /* + * Now derive a default, modifiable set from that to give out. + */ + set = uma_zalloc(cpuset_zone, M_WAITOK); + error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1); + KASSERT(error == 0, ("Error creating default set: %d\n", error)); + /* + * Initialize the unit allocator. 0 and 1 are allocated above. + */ + cpuset_unr = new_unrhdr(2, INT_MAX, NULL); + + return (set); +} + +/* + * This is called once the final set of system cpus is known. Modifies + * the root set and all children and mark the root readonly. + */ +static void +cpuset_init(void *arg) +{ + cpuset_t mask; + + CPU_ZERO(&mask); +#ifdef SMP + mask.__bits[0] = all_cpus; +#else + mask.__bits[0] = 1; +#endif + if (cpuset_modify(cpuset_zero, &mask)) + panic("Can't set initial cpuset mask.\n"); + cpuset_zero->cs_flags |= CPU_SET_RDONLY; +} +SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_args { + cpusetid_t *setid; +}; +#endif +int +cpuset(struct thread *td, struct cpuset_args *uap) +{ + struct cpuset *root; + struct cpuset *set; + int error; + + thread_lock(td); + root = cpuset_root(td->td_cpuset); + thread_unlock(td); + error = cpuset_create(&set, root, &root->cs_mask); + cpuset_rel(root); + if (error) + return (error); + error = cpuset_setproc(-1, set, NULL); + if (error == 0) + error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); + cpuset_rel(set); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_setid_args { + cpuwhich_t which; + id_t id; + cpusetid_t setid; +}; +#endif +int +cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) +{ + struct cpuset *set; + int error; + + /* + * Presently we only support per-process sets. 
+ */ + if (uap->which != CPU_WHICH_PID) + return (EINVAL); + set = cpuset_lookup(uap->setid); + if (set == NULL) + return (ESRCH); + error = cpuset_setproc(uap->id, set, NULL); + cpuset_rel(set); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_getid_args { + cpulevel_t level; + cpuwhich_t which; + id_t id; + cpusetid_t *setid; +#endif +int +cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *ttd; + struct proc *p; + cpusetid_t id; + int error; + + if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET) + return (EINVAL); + error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); + if (error) + return (error); + switch (uap->which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_base(ttd->td_cpuset); + thread_unlock(ttd); + PROC_UNLOCK(p); + break; + case CPU_WHICH_CPUSET: + break; + } + switch (uap->level) { + case CPU_LEVEL_ROOT: + nset = cpuset_root(set); + cpuset_rel(set); + set = nset; + break; + case CPU_LEVEL_CPUSET: + break; + case CPU_LEVEL_WHICH: + break; + } + id = set->cs_id; + cpuset_rel(set); + if (error == 0) + error = copyout(&id, uap->setid, sizeof(id)); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_getaffinity_args { + cpulevel_t level; + cpuwhich_t which; + int id; + int cpusetsize; + long *mask; +}; +#endif +int +cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) +{ + struct thread *ttd; + struct cpuset *nset; + struct cpuset *set; + struct proc *p; + cpuset_t *mask; + int error; + int size; + + if (uap->cpusetsize < CPU_SETSIZE || uap->cpusetsize > CPU_MAXSIZE) + return (ERANGE); + size = uap->cpusetsize / NBBY; + mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); + if (error) + goto out; + error = 0; + switch (uap->level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + switch (uap->which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + break; + case CPU_WHICH_CPUSET: + break; + } + if (uap->level == CPU_LEVEL_ROOT) + nset = cpuset_root(set); + else + nset = cpuset_base(set); + CPU_COPY(&nset->cs_mask, mask); + cpuset_rel(nset); + break; + case CPU_LEVEL_WHICH: + switch (uap->which) { + case CPU_WHICH_TID: + thread_lock(ttd); + CPU_COPY(&ttd->td_cpuset->cs_mask, mask); + thread_unlock(ttd); + break; + case CPU_WHICH_PID: + PROC_SLOCK(p); + FOREACH_THREAD_IN_PROC(p, ttd) { + thread_lock(ttd); + CPU_OR(mask, &ttd->td_cpuset->cs_mask); + thread_unlock(ttd); + } + PROC_SUNLOCK(p); + break; + case CPU_WHICH_CPUSET: + CPU_COPY(&set->cs_mask, mask); + break; + } + break; + default: + error = EINVAL; + break; + } + if (set) + cpuset_rel(set); + if (p) + PROC_UNLOCK(p); + if (error == 0) + error = copyout(mask, uap->mask, size); +out: + free(mask, M_TEMP); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_setaffinity_args { + cpulevel_t level; + cpuwhich_t which; + int id; + int cpusetsize; + long * mask; +}; +#endif +int +cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *ttd; + struct proc *p; + cpuset_t *mask; + int error; + + if (uap->cpusetsize < CPU_SETSIZE || uap->cpusetsize > CPU_MAXSIZE) + return (ERANGE); + mask = malloc(uap->cpusetsize / NBBY, M_TEMP, M_WAITOK | M_ZERO); + error = copyin(uap->mask, mask, uap->cpusetsize / NBBY); + if (error) + goto 
out; + switch (uap->level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); + if (error) + break; + switch (uap->which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + break; + case CPU_WHICH_CPUSET: + break; + } + if (uap->level == CPU_LEVEL_ROOT) + nset = cpuset_root(set); + else + nset = cpuset_base(set); + error = cpuset_modify(nset, mask); + cpuset_rel(nset); + cpuset_rel(set); + break; + case CPU_LEVEL_WHICH: + switch (uap->which) { + case CPU_WHICH_TID: + error = cpuset_setthread(uap->id, mask); + break; + case CPU_WHICH_PID: + error = cpuset_setproc(uap->id, NULL, mask); + break; + case CPU_WHICH_CPUSET: + error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p, + &ttd, &set); + if (error == 0) { + error = cpuset_modify(set, mask); + cpuset_rel(set); + } + break; + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } +out: + free(mask, M_TEMP); + return (error); +} Index: sys/kern/kern_thread.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_thread.c,v retrieving revision 1.265 diff -u -p -r1.265 kern_thread.c --- sys/kern/kern_thread.c 22 Dec 2007 04:56:48 -0000 1.265 +++ sys/kern/kern_thread.c 2 Mar 2008 02:58:10 -0000 @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_th #include #include #include +#include #include @@ -342,7 +343,8 @@ thread_alloc(void) void thread_free(struct thread *td) { - + cpuset_rel(td->td_cpuset); + td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_altkstack != 0) vm_thread_dispose_altkstack(td); @@ -527,6 +529,8 @@ thread_wait(struct proc *p) /* Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads) sched_relinquish(curthread); + cpuset_rel(td->td_cpuset); + td->td_cpuset = NULL; cpu_thread_clean(td); crfree(td->td_ucred); thread_reap(); /* check for zombie threads etc. */ Index: sys/kern/sched_ule.c =================================================================== RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v retrieving revision 1.223 diff -u -p -r1.223 sched_ule.c --- sys/kern/sched_ule.c 23 Jan 2008 03:10:18 -0000 1.223 +++ sys/kern/sched_ule.c 2 Mar 2008 02:58:10 -0000 @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u #include #include #include +#include #ifdef KTRACE #include #include @@ -95,9 +96,7 @@ struct td_sched { int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ -#ifdef SMP int ts_rltick; /* Real last tick, for affinity. */ -#endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ @@ -105,6 +104,10 @@ struct td_sched { static struct td_sched td_sched0; +#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) +#define THREAD_CAN_SCHED(td, cpu) \ + CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) + /* * Cpu percentage computation macros and defines. * @@ -183,6 +186,7 @@ static int preempt_thresh = PRI_MIN_KERN #else static int preempt_thresh = 0; #endif +static int lowpri_userret = 1; /* * tdq - per processor runqs and statistics. All fields are protected by the @@ -190,47 +194,26 @@ static int preempt_thresh = 0; * locking in sched_pickcpu(); */ struct tdq { - struct mtx *tdq_lock; /* Pointer to group lock. */ + struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ + struct mtx tdq_lock; /* run queue lock. 
*/ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ int tdq_load; /* Aggregate load. */ + int tdq_sysload; /* For loadavg, !ITHD load. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ -#ifdef SMP u_char tdq_lowpri; /* Lowest priority thread. */ int tdq_transferable; /* Transferable thread count. */ - LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ - struct tdq_group *tdq_group; /* Our processor group. */ -#else - int tdq_sysload; /* For loadavg, !ITHD load. */ -#endif + char tdq_name[sizeof("sched lock") + 6]; } __aligned(64); #ifdef SMP -/* - * tdq groups are groups of processors which can cheaply share threads. When - * one processor in the group goes idle it will check the runqs of the other - * processors in its group prior to halting and waiting for an interrupt. - * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. - * In a numa environment we'd want an idle bitmap per group and a two tiered - * load balancer. - */ -struct tdq_group { - struct mtx tdg_lock; /* Protects all fields below. */ - int tdg_cpus; /* Count of CPUs in this tdq group. */ - cpumask_t tdg_cpumask; /* Mask of cpus in this group. */ - cpumask_t tdg_idlemask; /* Idle cpus in this group. */ - cpumask_t tdg_mask; /* Bit mask for first cpu. */ - int tdg_load; /* Total load of this group. */ - int tdg_transferable; /* Transferable load of this group. */ - LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ - char tdg_name[16]; /* lock name. */ -} __aligned(64); +struct cpu_group *cpu_top; -#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300)) -#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) +#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) +#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. @@ -240,6 +223,7 @@ static int balance_interval = 128; /* De static int pick_pri = 1; static int affinity; static int tryself = 1; +static int oldtryself = 0; static int steal_htt = 1; static int steal_idle = 1; static int steal_thresh = 2; @@ -248,22 +232,15 @@ static int topology = 0; /* * One thread queue per processor. 
*/ -static volatile cpumask_t tdq_idle; -static int tdg_maxid; static struct tdq tdq_cpu[MAXCPU]; -static struct tdq_group tdq_groups[MAXCPU]; static struct tdq *balance_tdq; -static int balance_group_ticks; static int balance_ticks; #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) #define TDQ_ID(x) ((int)((x) - tdq_cpu)) -#define TDQ_GROUP(x) (&tdq_groups[(x)]) -#define TDG_ID(x) ((int)((x) - tdq_groups)) #else /* !SMP */ static struct tdq tdq_cpu; -static struct mtx tdq_lock; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) @@ -274,7 +251,7 @@ static struct mtx tdq_lock; #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) -#define TDQ_LOCKPTR(t) ((t)->tdq_lock) +#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); @@ -294,22 +271,18 @@ void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP -static void tdq_move(struct tdq *, struct tdq *); +static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct td_sched *); -static struct td_sched *tdq_steal(struct tdq *); -static struct td_sched *runq_steal(struct runq *); +static struct td_sched *tdq_steal(struct tdq *, int); +static struct td_sched *runq_steal(struct runq *, int); static int sched_pickcpu(struct td_sched *, int); static void sched_balance(void); -static void sched_balance_groups(void); -static void sched_balance_group(struct tdq_group *); -static void sched_balance_pair(struct tdq *, struct tdq *); +static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct td_sched *, int, int); static inline struct mtx *thread_block_switch(struct thread *); static inline void thread_unblock_switch(struct thread *, struct mtx *); static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); - -#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #endif static void sched_setup(void *dummy); @@ -356,7 +329,8 @@ tdq_print(int cpu) tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); - printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); + printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); @@ -366,12 +340,8 @@ tdq_print(int cpu) runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); -#ifdef SMP printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); - printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group)); - printf("\tLock name: %s\n", tdq->tdq_group->tdg_name); -#endif } #define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) @@ -385,13 +355,10 @@ tdq_runq_add(struct tdq *tdq, struct td_ { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); -#ifdef SMP if (THREAD_CAN_MIGRATE(ts->ts_thread)) { tdq->tdq_transferable++; - tdq->tdq_group->tdg_transferable++; ts->ts_flags |= TSF_XFERABLE; } -#endif if (ts->ts_runq == &tdq->tdq_timeshare) { u_char pri; @@ -431,13 +398,10 @@ tdq_runq_rem(struct tdq *tdq, struct td_ TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", 
ts->ts_thread)); -#ifdef SMP if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; - tdq->tdq_group->tdg_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } -#endif if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); @@ -470,11 +434,7 @@ tdq_load_add(struct tdq *tdq, struct td_ CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) -#ifdef SMP - tdq->tdq_group->tdg_load++; -#else tdq->tdq_sysload++; -#endif } /* @@ -491,11 +451,7 @@ tdq_load_rem(struct tdq *tdq, struct td_ class = PRI_BASE(ts->ts_thread->td_pri_class); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) -#ifdef SMP - tdq->tdq_group->tdg_load--; -#else tdq->tdq_sysload--; -#endif KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; @@ -503,112 +459,282 @@ tdq_load_rem(struct tdq *tdq, struct td_ ts->ts_runq = NULL; } +/* + * Set lowpri to its exact value by searching the run-queue and + * evaluating curthread. curthread may be passed as an optimization. + */ +static void +tdq_setlowpri(struct tdq *tdq, struct thread *ctd) +{ + struct td_sched *ts; + struct thread *td; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + if (ctd == NULL) + ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; + ts = tdq_choose(tdq); + if (ts) + td = ts->ts_thread; + if (ts == NULL || td->td_priority > ctd->td_priority) + tdq->tdq_lowpri = ctd->td_priority; + else + tdq->tdq_lowpri = td->td_priority; +} + #ifdef SMP +struct cpu_search { + cpumask_t cs_mask; /* Mask of valid cpus. */ + u_int cs_load; + u_int cs_cpu; + int cs_limit; /* Min priority for low min load for high. */ +}; + +#define CPU_SEARCH_LOWEST 0x1 +#define CPU_SEARCH_HIGHEST 0x2 +#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) + +#define CPUMASK_FOREACH(cpu, mask) \ + for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++) \ + if ((mask) & 1 << (cpu)) + +__inline int cpu_search(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high, const int match); +int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low); +int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high); +int cpu_search_both(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high); + +/* + * This routine compares according to the match argument and should be + * reduced in actual instantiations via constant propagation and dead code + * elimination. + */ +static __inline int +cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high, + const int match) +{ + struct tdq *tdq; + + tdq = TDQ_CPU(cpu); + if (match & CPU_SEARCH_LOWEST) + if (low->cs_mask & (1 << cpu) && + tdq->tdq_load < low->cs_load && + tdq->tdq_lowpri > low->cs_limit) { + low->cs_cpu = cpu; + low->cs_load = tdq->tdq_load; + } + if (match & CPU_SEARCH_HIGHEST) + if (high->cs_mask & (1 << cpu) && + tdq->tdq_load >= high->cs_limit && + tdq->tdq_load > high->cs_load && + tdq->tdq_transferable) { + high->cs_cpu = cpu; + high->cs_load = tdq->tdq_load; + } + return (tdq->tdq_load); +} + /* - * sched_balance is a simple CPU load balancing algorithm. It operates by - * finding the least loaded and most loaded cpu and equalizing their load - * by migrating some processes. - * - * Dealing only with two CPUs at a time has two advantages. Firstly, most - * installations will only have 2 cpus. 
Secondly, load balancing too much at - * once can have an unpleasant effect on the system. The scheduler rarely has - * enough information to make perfect decisions. So this algorithm chooses - * simplicity and more gradual effects on load in larger systems. + * Search the tree of cpu_groups for the lowest or highest loaded cpu + * according to the match argument. This routine actually compares the + * load on all paths through the tree and finds the least loaded cpu on + * the least loaded path, which may differ from the least loaded cpu in + * the system. This balances work among caches and busses. * + * This inline is instantiated in three forms below using constants for the + * match argument. It is reduced to the minimum set for each case. It is + * also recursive to the depth of the tree. + */ +static inline int +cpu_search(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high, const int match) +{ + int total; + + total = 0; + if (cg->cg_children) { + struct cpu_search lgroup; + struct cpu_search hgroup; + struct cpu_group *child; + u_int lload; + int hload; + int load; + int i; + + lload = -1; + hload = -1; + for (i = 0; i < cg->cg_children; i++) { + child = &cg->cg_child[i]; + if (match & CPU_SEARCH_LOWEST) { + lgroup = *low; + lgroup.cs_load = -1; + } + if (match & CPU_SEARCH_HIGHEST) { + hgroup = *high; + lgroup.cs_load = 0; + } + switch (match) { + case CPU_SEARCH_LOWEST: + load = cpu_search_lowest(child, &lgroup); + break; + case CPU_SEARCH_HIGHEST: + load = cpu_search_highest(child, &hgroup); + break; + case CPU_SEARCH_BOTH: + load = cpu_search_both(child, &lgroup, &hgroup); + break; + } + total += load; + if (match & CPU_SEARCH_LOWEST) + if (load < lload || low->cs_cpu == -1) { + *low = lgroup; + lload = load; + } + if (match & CPU_SEARCH_HIGHEST) + if (load > hload || high->cs_cpu == -1) { + hload = load; + *high = hgroup; + } + } + } else { + int cpu; + + CPUMASK_FOREACH(cpu, cg->cg_mask) + total += cpu_compare(cpu, low, high, match); + } + return (total); +} + +/* + * cpu_search instantiations must pass constants to maintain the inline + * optimization. + */ +int +cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low) +{ + return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); +} + +int +cpu_search_highest(struct cpu_group *cg, struct cpu_search *high) +{ + return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); +} + +int +cpu_search_both(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high) +{ + return cpu_search(cg, low, high, CPU_SEARCH_BOTH); +} + +/* + * Find the cpu with the least load via the least loaded path that has a + * lowpri greater than pri pri. A pri of -1 indicates any priority is + * acceptable. + */ +static inline int +sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri) +{ + struct cpu_search low; + + low.cs_cpu = -1; + low.cs_load = -1; + low.cs_mask = mask; + low.cs_limit = pri; + cpu_search_lowest(cg, &low); + return low.cs_cpu; +} + +/* + * Find the cpu with the highest load via the highest loaded path. + */ +static inline int +sched_highest(struct cpu_group *cg, cpumask_t mask, int minload) +{ + struct cpu_search high; + + high.cs_cpu = -1; + high.cs_load = 0; + high.cs_mask = mask; + high.cs_limit = minload; + cpu_search_highest(cg, &high); + return high.cs_cpu; +} + +/* + * Simultaneously find the highest and lowest loaded cpu reachable via + * cg. 
*/ +static inline void +sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu) +{ + struct cpu_search high; + struct cpu_search low; + + low.cs_cpu = -1; + low.cs_limit = -1; + low.cs_load = -1; + low.cs_mask = mask; + high.cs_load = 0; + high.cs_cpu = -1; + high.cs_limit = -1; + high.cs_mask = mask; + cpu_search_both(cg, &low, &high); + *lowcpu = low.cs_cpu; + *highcpu = high.cs_cpu; + return; +} + static void -sched_balance() +sched_balance_group(struct cpu_group *cg) { - struct tdq_group *high; - struct tdq_group *low; - struct tdq_group *tdg; - struct tdq *tdq; - int cnt; + cpumask_t mask; + int high; + int low; int i; - /* - * Select a random time between .5 * balance_interval and - * 1.5 * balance_interval. - */ - balance_ticks = max(balance_interval / 2, 1); - balance_ticks += random() % balance_interval; - if (smp_started == 0 || rebalance == 0) - return; - tdq = TDQ_SELF(); - TDQ_UNLOCK(tdq); - low = high = NULL; - i = random() % (tdg_maxid + 1); - for (cnt = 0; cnt <= tdg_maxid; cnt++) { - tdg = TDQ_GROUP(i); + mask = -1; + for (;;) { + sched_both(cg, mask, &low, &high); + if (low == high || low == -1 || high == -1) + break; + if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) + break; /* - * Find the CPU with the highest load that has some - * threads to transfer. - */ - if ((high == NULL || tdg->tdg_load > high->tdg_load) - && tdg->tdg_transferable) - high = tdg; - if (low == NULL || tdg->tdg_load < low->tdg_load) - low = tdg; - if (++i > tdg_maxid) - i = 0; - } - if (low != NULL && high != NULL && high != low) - sched_balance_pair(LIST_FIRST(&high->tdg_members), - LIST_FIRST(&low->tdg_members)); - TDQ_LOCK(tdq); + * If we failed to move any threads determine which cpu + * to kick out of the set and try again. + */ + if (TDQ_CPU(high)->tdq_transferable == 0) + mask &= ~(1 << high); + else + mask &= ~(1 << low); + } + + for (i = 0; i < cg->cg_children; i++) + sched_balance_group(&cg->cg_child[i]); } -/* - * Balance load between CPUs in a group. Will only migrate within the group. - */ static void -sched_balance_groups() +sched_balance() { struct tdq *tdq; - int i; /* * Select a random time between .5 * balance_interval and * 1.5 * balance_interval. */ - balance_group_ticks = max(balance_interval / 2, 1); - balance_group_ticks += random() % balance_interval; + balance_ticks = max(balance_interval / 2, 1); + balance_ticks += random() % balance_interval; if (smp_started == 0 || rebalance == 0) return; tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); - for (i = 0; i <= tdg_maxid; i++) - sched_balance_group(TDQ_GROUP(i)); + sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* - * Finds the greatest imbalance between two tdqs in a group. - */ -static void -sched_balance_group(struct tdq_group *tdg) -{ - struct tdq *tdq; - struct tdq *high; - struct tdq *low; - int load; - - if (tdg->tdg_transferable == 0) - return; - low = NULL; - high = NULL; - LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { - load = tdq->tdq_load; - if (high == NULL || load > high->tdq_load) - high = tdq; - if (low == NULL || load < low->tdq_load) - low = tdq; - } - if (high != NULL && low != NULL && high != low) - sched_balance_pair(high, low); -} - -/* * Lock two thread queues using their address to maintain lock order. */ static void @@ -636,31 +762,22 @@ tdq_unlock_pair(struct tdq *one, struct /* * Transfer load between two imbalanced thread queues. 
*/ -static void +static int sched_balance_pair(struct tdq *high, struct tdq *low) { int transferable; int high_load; int low_load; + int moved; int move; int diff; int i; tdq_lock_pair(high, low); - /* - * If we're transfering within a group we have to use this specific - * tdq's transferable count, otherwise we can steal from other members - * of the group. - */ - if (high->tdq_group == low->tdq_group) { - transferable = high->tdq_transferable; - high_load = high->tdq_load; - low_load = low->tdq_load; - } else { - transferable = high->tdq_group->tdg_transferable; - high_load = high->tdq_group->tdg_load; - low_load = low->tdq_group->tdg_load; - } + transferable = high->tdq_transferable; + high_load = high->tdq_load; + low_load = low->tdq_load; + moved = 0; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). @@ -672,7 +789,7 @@ sched_balance_pair(struct tdq *high, str move++; move = min(move, transferable); for (i = 0; i < move; i++) - tdq_move(high, low); + moved += tdq_move(high, low); /* * IPI the target cpu to force it to reschedule with the new * workload. @@ -680,13 +797,13 @@ sched_balance_pair(struct tdq *high, str ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT); } tdq_unlock_pair(high, low); - return; + return (moved); } /* * Move a thread from one thread queue to another. */ -static void +static int tdq_move(struct tdq *from, struct tdq *to) { struct td_sched *ts; @@ -699,22 +816,9 @@ tdq_move(struct tdq *from, struct tdq *t tdq = from; cpu = TDQ_ID(to); - ts = tdq_steal(tdq); - if (ts == NULL) { - struct tdq_group *tdg; - - tdg = tdq->tdq_group; - LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { - if (tdq == from || tdq->tdq_transferable == 0) - continue; - ts = tdq_steal(tdq); - break; - } - if (ts == NULL) - return; - } - if (tdq == to) - return; + ts = tdq_steal(tdq, cpu); + if (ts == NULL) + return (0); td = ts->ts_thread; /* * Although the run queue is locked the thread may be blocked. Lock @@ -727,6 +831,7 @@ tdq_move(struct tdq *from, struct tdq *t ts->ts_cpu = cpu; td->td_lock = TDQ_LOCKPTR(to); tdq_add(to, td, SRQ_YIELDING); + return (1); } /* @@ -736,72 +841,54 @@ tdq_move(struct tdq *from, struct tdq *t static int tdq_idled(struct tdq *tdq) { - struct tdq_group *tdg; + struct cpu_group *cg; struct tdq *steal; - int highload; - int highcpu; + cpumask_t mask; + int thresh; int cpu; if (smp_started == 0 || steal_idle == 0) return (1); - /* We don't want to be preempted while we're iterating over tdqs */ + mask = -1; + mask &= ~PCPU_GET(cpumask); + /* We don't want to be preempted while we're iterating. */ spinlock_enter(); - tdg = tdq->tdq_group; - /* - * If we're in a cpu group, try and steal threads from another cpu in - * the group before idling. In a HTT group all cpus share the same - * run-queue lock, however, we still need a recursive lock to - * call tdq_move(). - */ - if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) { - TDQ_LOCK(tdq); - LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { - if (steal == tdq || steal->tdq_transferable == 0) - continue; - TDQ_LOCK(steal); - goto steal; - } - TDQ_UNLOCK(tdq); - } - /* - * Find the least loaded CPU with a transferable thread and attempt - * to steal it. We make a lockless pass and then verify that the - * thread is still available after locking. 
- */ - for (;;) { - highcpu = 0; - highload = 0; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - steal = TDQ_CPU(cpu); - if (steal->tdq_transferable == 0) - continue; - if (steal->tdq_load < highload) - continue; - highload = steal->tdq_load; - highcpu = cpu; + for (cg = tdq->tdq_cg; cg != NULL; ) { + if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0) + thresh = steal_thresh; + else + thresh = 1; + cpu = sched_highest(cg, mask, thresh); + if (cpu == -1) { + cg = cg->cg_parent; + continue; } - if (highload < steal_thresh) - break; - steal = TDQ_CPU(highcpu); - if (steal == tdq) - break; + steal = TDQ_CPU(cpu); + mask &= ~(1 << cpu); tdq_lock_pair(tdq, steal); - if (steal->tdq_load >= steal_thresh && steal->tdq_transferable) - goto steal; - tdq_unlock_pair(tdq, steal); + if (steal->tdq_load < thresh || steal->tdq_transferable == 0) { + tdq_unlock_pair(tdq, steal); + continue; + } + /* + * If a thread was added while interrupts were disabled don't + * steal one here. If we fail to acquire one due to affinity + * restrictions loop again with this cpu removed from the + * set. + */ + if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) { + tdq_unlock_pair(tdq, steal); + continue; + } + spinlock_exit(); + TDQ_UNLOCK(steal); + mi_switch(SW_VOL, NULL); + thread_unlock(curthread); + + return (0); } spinlock_exit(); return (1); -steal: - spinlock_exit(); - tdq_move(steal, tdq); - TDQ_UNLOCK(steal); - mi_switch(SW_VOL, NULL); - thread_unlock(curthread); - - return (0); } /* @@ -854,7 +941,7 @@ sendipi: * index. */ static struct td_sched * -runq_steal_from(struct runq *rq, u_char start) +runq_steal_from(struct runq *rq, int cpu, u_char start) { struct td_sched *ts; struct rqbits *rqb; @@ -883,7 +970,8 @@ again: pri += (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(ts, rqh, ts_procq) { - if (first && THREAD_CAN_MIGRATE(ts->ts_thread)) + if (first && THREAD_CAN_MIGRATE(ts->ts_thread) && + THREAD_CAN_SCHED(ts->ts_thread, cpu)) return (ts); first = 1; } @@ -900,7 +988,7 @@ again: * Steals load from a standard linear queue. */ static struct td_sched * -runq_steal(struct runq *rq) +runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; @@ -917,7 +1005,8 @@ runq_steal(struct runq *rq) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(ts, rqh, ts_procq) - if (THREAD_CAN_MIGRATE(ts->ts_thread)) + if (THREAD_CAN_MIGRATE(ts->ts_thread) && + THREAD_CAN_SCHED(ts->ts_thread, cpu)) return (ts); } } @@ -928,16 +1017,17 @@ runq_steal(struct runq *rq) * Attempt to steal a thread in priority order from a thread queue. */ static struct td_sched * -tdq_steal(struct tdq *tdq) +tdq_steal(struct tdq *tdq, int cpu) { struct td_sched *ts; TDQ_LOCK_ASSERT(tdq, MA_OWNED); - if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL) + if ((ts = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (ts); - if ((ts = runq_steal_from(&tdq->tdq_timeshare, tdq->tdq_ridx)) != NULL) + if ((ts = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) + != NULL) return (ts); - return (runq_steal(&tdq->tdq_idle)); + return (runq_steal(&tdq->tdq_idle, cpu)); } /* @@ -981,155 +1071,74 @@ sched_setcpu(struct td_sched *ts, int cp return (tdq); } -/* - * Find the thread queue running the lowest priority thread. 
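A worked example of the new stealing order, assuming a two-package machine built from dual-core, 2-way HTT parts: an idle logical CPU first asks sched_highest() about its own CG_FLAG_HTT group, where a single transferable thread is worth taking (thresh = 1); only if that fails does it climb to the package-level group and finally to the machine-wide group, where the donor must be holding at least steal_thresh runnable threads, so cheap nearby migrations are preferred over cross-package ones. Each candidate is dropped from the search mask once tried, so a CPU that fails the re-check under tdq_lock_pair() is not picked again.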
- */ -static int -tdq_lowestpri(void) -{ - struct tdq *tdq; - int lowpri; - int lowcpu; - int lowload; - int load; - int cpu; - int pri; - - lowload = 0; - lowpri = lowcpu = 0; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - tdq = TDQ_CPU(cpu); - pri = tdq->tdq_lowpri; - load = TDQ_CPU(cpu)->tdq_load; - CTR4(KTR_ULE, - "cpu %d pri %d lowcpu %d lowpri %d", - cpu, pri, lowcpu, lowpri); - if (pri < lowpri) - continue; - if (lowpri && lowpri == pri && load > lowload) - continue; - lowpri = pri; - lowcpu = cpu; - lowload = load; - } - - return (lowcpu); -} - -/* - * Find the thread queue with the least load. - */ -static int -tdq_lowestload(void) -{ - struct tdq *tdq; - int lowload; - int lowpri; - int lowcpu; - int load; - int cpu; - int pri; - - lowcpu = 0; - lowload = TDQ_CPU(0)->tdq_load; - lowpri = TDQ_CPU(0)->tdq_lowpri; - for (cpu = 1; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - tdq = TDQ_CPU(cpu); - load = tdq->tdq_load; - pri = tdq->tdq_lowpri; - CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d", - cpu, load, lowcpu, lowload); - if (load > lowload) - continue; - if (load == lowload && pri < lowpri) - continue; - lowcpu = cpu; - lowload = load; - lowpri = pri; - } - - return (lowcpu); -} - -/* - * Pick the destination cpu for sched_add(). Respects affinity and makes - * a determination based on load or priority of available processors. - */ static int sched_pickcpu(struct td_sched *ts, int flags) { + struct cpu_group *cg; + struct thread *td; struct tdq *tdq; + cpumask_t mask; int self; int pri; int cpu; - cpu = self = PCPU_GET(cpuid); + self = PCPU_GET(cpuid); + td = ts->ts_thread; if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ - if (flags & SRQ_OURSELF) { - CTR1(KTR_ULE, "YIELDING %d", - curthread->td_priority); - return (self); - } - pri = ts->ts_thread->td_priority; - cpu = ts->ts_cpu; - /* - * Regardless of affinity, if the last cpu is idle send it there. - */ - tdq = TDQ_CPU(cpu); - if (tdq->tdq_lowpri > PRI_MIN_IDLE) { - CTR5(KTR_ULE, - "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d", - ts->ts_cpu, ts->ts_rltick, ticks, pri, - tdq->tdq_lowpri); + if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); - } /* - * If we have affinity, try to place it on the cpu we last ran on. + * Prefer to run interrupt threads on the processors that generate + * the interrupt. */ - if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) { - CTR5(KTR_ULE, - "affinity for %d, ltick %d ticks %d pri %d curthread %d", - ts->ts_cpu, ts->ts_rltick, ticks, pri, - tdq->tdq_lowpri); - return (ts->ts_cpu); - } + if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && + curthread->td_intr_nesting_level) + ts->ts_cpu = self; /* - * Look for an idle group. + * If the thread can run on the last cpu and the affinity has not + * expired or it is idle run it there. */ - CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); - cpu = ffs(tdq_idle); - if (cpu) - return (--cpu); - /* - * If there are no idle cores see if we can run the thread locally. - * This may improve locality among sleepers and wakers when there - * is shared data. 
- */ - if (tryself && pri < TDQ_CPU(self)->tdq_lowpri) { - CTR1(KTR_ULE, "tryself %d", - curthread->td_priority); - return (self); + pri = td->td_priority; + tdq = TDQ_CPU(ts->ts_cpu); + if (THREAD_CAN_SCHED(td, ts->ts_cpu)) { + if (tdq->tdq_lowpri > PRI_MIN_IDLE) + return (ts->ts_cpu); + if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) + return (ts->ts_cpu); } /* - * Now search for the cpu running the lowest priority thread with - * the least load. - */ - if (pick_pri) - cpu = tdq_lowestpri(); - else - cpu = tdq_lowestload(); + * Search for the highest level in the tree that still has affinity. + */ + cg = NULL; + for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent) + if (SCHED_AFFINITY(ts, cg->cg_level)) + break; + cpu = -1; + mask = td->td_cpuset->cs_mask.__bits[0]; + if (cg) + cpu = sched_lowest(cg, mask, pri); + if (cpu == -1) + cpu = sched_lowest(cpu_top, mask, -1); + /* + * Compare the lowest loaded cpu to current cpu. + */ + if (THREAD_CAN_SCHED(td, self) && + TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) { + if (tryself && TDQ_CPU(self)->tdq_lowpri > pri) + cpu = self; + else if (oldtryself && curthread->td_priority > pri) + cpu = self; + } + if (cpu == -1) { + panic("cpu == -1, mask 0x%X cpu top %p", mask, cpu_top); + } return (cpu); } - -#endif /* SMP */ +#endif /* * Pick the highest priority task we have and return it. @@ -1174,121 +1183,31 @@ tdq_setup(struct tdq *tdq) runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); - tdq->tdq_load = 0; -} - -#ifdef SMP -static void -tdg_setup(struct tdq_group *tdg) -{ - if (bootverbose) - printf("ULE: setup cpu group %d\n", TDG_ID(tdg)); - snprintf(tdg->tdg_name, sizeof(tdg->tdg_name), - "sched lock %d", (int)TDG_ID(tdg)); - mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock", + snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), + "sched lock %d", (int)TDQ_ID(tdq)); + mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN | MTX_RECURSE); - LIST_INIT(&tdg->tdg_members); - tdg->tdg_load = 0; - tdg->tdg_transferable = 0; - tdg->tdg_cpus = 0; - tdg->tdg_mask = 0; - tdg->tdg_cpumask = 0; - tdg->tdg_idlemask = 0; -} - -static void -tdg_add(struct tdq_group *tdg, struct tdq *tdq) -{ - if (tdg->tdg_mask == 0) - tdg->tdg_mask |= 1 << TDQ_ID(tdq); - tdg->tdg_cpumask |= 1 << TDQ_ID(tdq); - tdg->tdg_cpus++; - tdq->tdq_group = tdg; - tdq->tdq_lock = &tdg->tdg_lock; - LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings); - if (bootverbose) - printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n", - TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask); -} - -static void -sched_setup_topology(void) -{ - struct tdq_group *tdg; - struct cpu_group *cg; - int balance_groups; - struct tdq *tdq; - int i; - int j; - - topology = 1; - balance_groups = 0; - for (i = 0; i < smp_topology->ct_count; i++) { - cg = &smp_topology->ct_group[i]; - tdg = &tdq_groups[i]; - /* - * Initialize the group. - */ - tdg_setup(tdg); - /* - * Find all of the group members and add them. 
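sched_pickcpu() now consults the topology in two ways: SCHED_AFFINITY() is evaluated against a cache-sharing level (CG_SHARE_L2 for the thread's last CPU, then cg_level while walking up the tdq_cg chain), and sched_lowest() is restricted to the thread's cpuset mask; note that only __bits[0] of the set is consulted here, matching the single-word cpumask_t the topology code works with. The macro itself is defined outside this section; a purely hypothetical reading of its two-argument form, suggested by the ts_rltick bookkeeping kept in sched_switch() further down, would be something like:

/*
 * Hypothetical sketch only, not the patch's literal definition: the more
 * cache two CPUs share, the longer the thread's last run keeps it "warm".
 */
#define SCHED_AFFINITY(ts, t)   ((ts)->ts_rltick > ticks - ((t) * affinity))

Treat the exact expression as an assumption; the point is simply that affinity is allowed to decay more slowly at higher sharing levels.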
- */ - for (j = 0; j < MAXCPU; j++) { - if ((cg->cg_mask & (1 << j)) != 0) { - tdq = TDQ_CPU(j); - tdq_setup(tdq); - tdg_add(tdg, tdq); - } - } - if (tdg->tdg_cpus > 1) - balance_groups = 1; - } - tdg_maxid = smp_topology->ct_count - 1; - if (balance_groups) - sched_balance_groups(); } +#ifdef SMP static void sched_setup_smp(void) { - struct tdq_group *tdg; struct tdq *tdq; - int cpus; int i; - for (cpus = 0, i = 0; i < MAXCPU; i++) { + cpu_top = smp_topo(); + for (i = 0; i < MAXCPU; i++) { if (CPU_ABSENT(i)) continue; - tdq = &tdq_cpu[i]; - tdg = &tdq_groups[i]; - /* - * Setup a tdq group with one member. - */ - tdg_setup(tdg); + tdq = TDQ_CPU(i); tdq_setup(tdq); - tdg_add(tdg, tdq); - cpus++; + tdq->tdq_cg = smp_topo_find(cpu_top, i); + if (tdq->tdq_cg == NULL) + panic("Can't find cpu group for %d\n", i); } - tdg_maxid = cpus - 1; -} - -/* - * Fake a topology with one group containing all CPUs. - */ -static void -sched_fake_topo(void) -{ -#ifdef SCHED_FAKE_TOPOLOGY - static struct cpu_top top; - static struct cpu_group group; - - top.ct_count = 1; - top.ct_group = &group; - group.cg_mask = all_cpus; - group.cg_count = mp_ncpus; - group.cg_children = 0; - smp_topology = ⊤ -#endif + balance_tdq = TDQ_SELF(); + sched_balance(); } #endif @@ -1303,21 +1222,9 @@ sched_setup(void *dummy) tdq = TDQ_SELF(); #ifdef SMP - sched_fake_topo(); - /* - * Setup tdqs based on a topology configuration or vanilla SMP based - * on mp_maxid. - */ - if (smp_topology == NULL) - sched_setup_smp(); - else - sched_setup_topology(); - balance_tdq = tdq; - sched_balance(); + sched_setup_smp(); #else tdq_setup(tdq); - mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE); - tdq->tdq_lock = &tdq_lock; #endif /* * To avoid divide-by-zero, we set realstathz a dummy value @@ -1331,6 +1238,7 @@ sched_setup(void *dummy) TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); tdq_load_add(tdq, &td_sched0); + tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } @@ -1369,7 +1277,7 @@ sched_initticks(void *dummy) * prevents excess thrashing on large machines and excess idle on * smaller machines. */ - steal_thresh = min(ffs(mp_ncpus) - 1, 4); + steal_thresh = min(ffs(mp_ncpus) - 1, 3); affinity = SCHED_AFFINITY_DEFAULT; #endif } @@ -1617,16 +1525,17 @@ sched_thread_priority(struct thread *td, sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING); -#ifdef SMP } else if (TD_IS_RUNNING(td)) { struct tdq *tdq; + int oldpri; tdq = TDQ_CPU(ts->ts_cpu); - if (prio < tdq->tdq_lowpri || - (td->td_priority == tdq->tdq_lowpri && tdq->tdq_load <= 1)) - tdq->tdq_lowpri = prio; + oldpri = td->td_priority; td->td_priority = prio; -#endif + if (prio < tdq->tdq_lowpri) + tdq->tdq_lowpri = prio; + else if (tdq->tdq_lowpri == oldpri) + tdq_setlowpri(tdq, td); } else td->td_priority = prio; } @@ -1843,9 +1752,7 @@ sched_switch(struct thread *td, struct t tdq = TDQ_CPU(cpuid); ts = td->td_sched; mtx = td->td_lock; -#ifdef SMP ts->ts_rltick = ticks; -#endif td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; @@ -1913,12 +1820,12 @@ sched_switch(struct thread *td, struct t } else thread_unblock_switch(td, mtx); /* - * Assert that all went well and return. + * We should always get here with the lowest priority td possible. */ -#ifdef SMP - /* We should always get here with the lowest priority td possible */ tdq->tdq_lowpri = td->td_priority; -#endif + /* + * Assert that all went well and return. 
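With the cap lowered from four to three, steal_thresh = min(ffs(mp_ncpus) - 1, 3) works out to 0, 1, 2 and 3 on 1-, 2-, 4- and 8-or-more-CPU (power-of-two) configurations, so a large machine now starts pulling work from a remote queue once it holds three runnable threads rather than four. Note also that ts_rltick is maintained unconditionally now, so the affinity window behaves the same on UP kernels.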
+ */ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; @@ -2022,6 +1929,7 @@ sched_fork_thread(struct thread *td, str THREAD_LOCK_ASSERT(td, MA_OWNED); sched_newthread(child); child->td_lock = TDQ_LOCKPTR(TDQ_SELF()); + child->td_cpuset = cpuset_ref(td->td_cpuset); ts = td->td_sched; ts2 = child->td_sched; ts2->ts_cpu = ts->ts_cpu; @@ -2052,8 +1960,6 @@ sched_class(struct thread *td, int class THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; - -#ifdef SMP /* * On SMP if we're on the RUNQ we must adjust the transferable * count because could be changing to or from an interrupt @@ -2063,17 +1969,12 @@ sched_class(struct thread *td, int class struct tdq *tdq; tdq = TDQ_CPU(td->td_sched->ts_cpu); - if (THREAD_CAN_MIGRATE(td)) { + if (THREAD_CAN_MIGRATE(td)) tdq->tdq_transferable--; - tdq->tdq_group->tdg_transferable--; - } td->td_pri_class = class; - if (THREAD_CAN_MIGRATE(td)) { + if (THREAD_CAN_MIGRATE(td)) tdq->tdq_transferable++; - tdq->tdq_group->tdg_transferable++; - } } -#endif td->td_pri_class = class; } @@ -2149,6 +2050,8 @@ sched_userret(struct thread *td) thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; + if (lowpri_userret) + tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } } @@ -2172,8 +2075,6 @@ sched_clock(struct thread *td) if (balance_tdq == tdq) { if (balance_ticks && --balance_ticks == 0) sched_balance(); - if (balance_group_ticks && --balance_group_ticks == 0) - sched_balance_groups(); } #endif /* @@ -2261,11 +2162,7 @@ out: struct thread * sched_choose(void) { -#ifdef SMP - struct tdq_group *tdg; -#endif struct td_sched *ts; - struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); @@ -2275,20 +2172,7 @@ sched_choose(void) tdq_runq_rem(tdq, ts); return (ts->ts_thread); } - td = PCPU_GET(idlethread); -#ifdef SMP - /* - * We only set the idled bit when all of the cpus in the group are - * idle. Otherwise we could get into a situation where a thread bounces - * back and forth between two idle cores on seperate physical CPUs. - */ - tdg = tdq->tdq_group; - tdg->tdg_idlemask |= PCPU_GET(cpumask); - if (tdg->tdg_idlemask == tdg->tdg_cpumask) - atomic_set_int(&tdq_idle, tdg->tdg_mask); - tdq->tdq_lowpri = td->td_priority; -#endif - return (td); + return (PCPU_GET(idlethread)); } /* @@ -2305,7 +2189,7 @@ sched_setpreempt(struct thread *td) ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; - if (td->td_priority < ctd->td_priority) + if (td->td_priority < cpri) curthread->td_flags |= TDF_NEEDRESCHED; if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; @@ -2329,9 +2213,6 @@ tdq_add(struct tdq *tdq, struct thread * { struct td_sched *ts; int class; -#ifdef SMP - int cpumask; -#endif TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), @@ -2355,29 +2236,8 @@ tdq_add(struct tdq *tdq, struct thread * ts->ts_runq = &tdq->tdq_timeshare; else ts->ts_runq = &tdq->tdq_idle; -#ifdef SMP - cpumask = 1 << ts->ts_cpu; - /* - * If we had been idle, clear our bit in the group and potentially - * the global bitmap. - */ - if ((class != PRI_IDLE && class != PRI_ITHD) && - (tdq->tdq_group->tdg_idlemask & cpumask) != 0) { - /* - * Check to see if our group is unidling, and if so, remove it - * from the global idle mask. - */ - if (tdq->tdq_group->tdg_idlemask == - tdq->tdq_group->tdg_cpumask) - atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask); - /* - * Now remove ourselves from the group specific idle mask. 
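sched_fork_thread() now shares the parent's set by reference instead of copying a mask; the cpuset_ref() it calls lives in the new kern_cpuset.c, which is not shown in this section. A minimal sketch of what the helper declared in sys/cpuset.h presumably does, assuming the refcount(9) primitives (hypothetical, the real file may well differ):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/cpuset.h>

/*
 * Hypothetical sketch: cs_ref is documented in sys/cpuset.h as accessed
 * with atomics, so taking a reference is just an atomic bump and the
 * forked thread ends up pointing at the same struct cpuset.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

        refcount_acquire(&set->cs_ref);
        return (set);
}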
- */ - tdq->tdq_group->tdg_idlemask &= ~cpumask; - } if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; -#endif tdq_runq_add(tdq, ts, flags); tdq_load_add(tdq, ts); } @@ -2412,13 +2272,7 @@ sched_add(struct thread *td, int flags) * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ - if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td) && - curthread->td_intr_nesting_level) - ts->ts_cpu = cpuid; - if (!THREAD_CAN_MIGRATE(td)) - cpu = ts->ts_cpu; - else - cpu = sched_pickcpu(ts, flags); + cpu = sched_pickcpu(ts, flags); tdq = sched_setcpu(ts, cpu, flags); tdq_add(tdq, td, flags); if (cpu != cpuid) { @@ -2462,6 +2316,8 @@ sched_rem(struct thread *td) tdq_runq_rem(tdq, ts); tdq_load_rem(tdq, ts); TD_SET_CAN_RUN(td); + if (td->td_priority == tdq->tdq_lowpri) + tdq_setlowpri(tdq, NULL); } /* @@ -2505,14 +2361,12 @@ sched_bind(struct thread *td, int cpu) if (ts->ts_flags & TSF_BOUND) sched_unbind(td); ts->ts_flags |= TSF_BOUND; -#ifdef SMP sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL, NULL); -#endif } /* @@ -2528,9 +2382,7 @@ sched_unbind(struct thread *td) if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; -#ifdef SMP sched_unpin(); -#endif } int @@ -2540,6 +2392,34 @@ sched_is_bound(struct thread *td) return (td->td_sched->ts_flags & TSF_BOUND); } +void +sched_affinity(struct thread *td) +{ +#ifdef SMP + struct td_sched *ts; + int cpu; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + if (THREAD_CAN_SCHED(td, ts->ts_cpu)) + return; + if (!TD_IS_RUNNING(td)) + return; + td->td_flags |= TDF_NEEDRESCHED; + if (!THREAD_CAN_MIGRATE(td)) + return; + /* + * Assign the new cpu and force a switch before returning to + * userspace. If the target thread is not running locally send + * an ipi to force the issue. + */ + cpu = ts->ts_cpu; + ts->ts_cpu = sched_pickcpu(ts, 0); + if (cpu != PCPU_GET(cpuid)) + ipi_selected(1 << cpu, IPI_PREEMPT); +#endif +} + /* * Basic yield call. 
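sched_affinity() is the scheduler's half of the new interface: it asserts the thread lock, does nothing while the thread may still run where it is, and for a running thread whose current CPU has become forbidden it flags a resched, picks a new ts_cpu and IPIs the old one. The other half, the kern_cpuset.c code that rewrites cs_mask under the cpuset lock and then notifies each affected thread, is not part of this section; the expected calling pattern is presumably just (hypothetical fragment, per the THREAD_LOCK_ASSERT above):

        thread_lock(td);
        sched_affinity(td);
        thread_unlock(td);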
*/ @@ -2563,8 +2443,8 @@ sched_load(void) int i; total = 0; - for (i = 0; i <= tdg_maxid; i++) - total += TDQ_GROUP(i)->tdg_load; + for (i = 0; i <= mp_maxid; i++) + total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); @@ -2658,6 +2538,7 @@ sched_fork_exit(struct thread *td) TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); + tdq->tdq_lowpri = td->td_priority; } static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, @@ -2676,6 +2557,8 @@ SYSCTL_INT(_kern_sched, OID_AUTO, pick_p SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, userret, CTLFLAG_RW, &lowpri_userret, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, oldtryself, CTLFLAG_RW, &oldtryself, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, Index: sys/kern/subr_smp.c =================================================================== RCS file: /home/ncvs/src/sys/kern/subr_smp.c,v retrieving revision 1.203 diff -u -p -r1.203 subr_smp.c --- sys/kern/subr_smp.c 2 Jan 2008 17:09:15 -0000 1.203 +++ sys/kern/subr_smp.c 2 Mar 2008 02:58:10 -0000 @@ -68,7 +68,6 @@ int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; -struct cpu_top *smp_topology; volatile int smp_started; u_int mp_maxid; @@ -90,6 +89,11 @@ int smp_cpus = 1; /* how many cpu's runn SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD, &smp_cpus, 0, "Number of CPUs online"); +int smp_topology = 0; /* Which topology we're using. */ +SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0, + "Topology override setting; 0 is default provided by hardware."); +TUNABLE_INT("kern.smp.topology", &smp_topology); + #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; @@ -385,22 +389,177 @@ smp_rendezvous(void (* setup_func)(void /* release lock */ mtx_unlock_spin(&smp_ipi_mtx); } -#else /* !SMP */ -/* - * Provide dummy SMP support for UP kernels. Modules that need to use SMP - * APIs will still work using this dummy support. - */ -static void -mp_setvariables_for_up(void *dummy) +static struct cpu_group group[MAXCPU]; + +struct cpu_group * +smp_topo(void) { - mp_ncpus = 1; - mp_maxid = PCPU_GET(cpuid); - all_cpus = PCPU_GET(cpumask); - KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); + struct cpu_group *top; + + /* + * Check for a fake topology request for debugging purposes. + */ + switch (smp_topology) { + case 1: + /* Dual core with no sharing. */ + top = smp_topo_1level(CG_SHARE_NONE, 2, 0); + break; + case 3: + /* Dual core with shared L2. */ + top = smp_topo_1level(CG_SHARE_L2, 2, 0); + break; + case 4: + /* quad core, shared l3 among each package, private l2. */ + top = smp_topo_1level(CG_SHARE_L3, 4, 0); + break; + case 5: + /* quad core, 2 dualcore parts on each package share l2. */ + top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); + break; + case 6: + /* Single-core 2xHTT */ + top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); + break; + case 7: + /* quad core with a shared l3, 8 threads sharing L2. 
*/ + top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, + CG_FLAG_THREAD); + break; + default: + /* Default, ask the system what it wants. */ + top = cpu_topo(); + break; + } + /* + * Verify the returned topology. + */ + if (top->cg_count != mp_ncpus) + panic("Built bad topology at %p. CPU count %d != %d", + top, top->cg_count, mp_ncpus); + if (top->cg_mask != all_cpus) + panic("Built bad topology at %p. CPU mask 0x%X != 0x%X", + top, top->cg_mask, all_cpus); + return (top); +} + +struct cpu_group * +smp_topo_none(void) +{ + struct cpu_group *top; + + top = &group[0]; + top->cg_parent = NULL; + top->cg_child = NULL; + top->cg_mask = (1 << mp_ncpus) - 1; + top->cg_count = mp_ncpus; + top->cg_children = 0; + top->cg_level = CG_SHARE_NONE; + top->cg_flags = 0; + + return (top); +} + +static int +smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, + int count, int flags, int start) +{ + cpumask_t mask; + int i; + + for (mask = 0, i = 0; i < count; i++, start++) + mask |= (1 << start); + child->cg_parent = parent; + child->cg_child = NULL; + child->cg_children = 0; + child->cg_level = share; + child->cg_count = count; + child->cg_flags = flags; + child->cg_mask = mask; + parent->cg_children++; + for (; parent != NULL; parent = parent->cg_parent) { + if ((parent->cg_mask & child->cg_mask) != 0) + panic("Duplicate children in %p. mask 0x%X child 0x%X", + parent, parent->cg_mask, child->cg_mask); + parent->cg_mask |= child->cg_mask; + parent->cg_count += child->cg_count; + } + + return (start); } -SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, - mp_setvariables_for_up, NULL) + +struct cpu_group * +smp_topo_1level(int share, int count, int flags) +{ + struct cpu_group *child; + struct cpu_group *top; + int packages; + int cpu; + int i; + + cpu = 0; + top = &group[0]; + packages = mp_ncpus / count; + top->cg_child = child = &group[1]; + top->cg_level = CG_SHARE_NONE; + for (i = 0; i < packages; i++, child++) + cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); + return (top); +} + +struct cpu_group * +smp_topo_2level(int l2share, int l2count, int l1share, int l1count, + int l1flags) +{ + struct cpu_group *top; + struct cpu_group *l1g; + struct cpu_group *l2g; + int cpu; + int i; + int j; + + cpu = 0; + top = &group[0]; + l2g = &group[1]; + top->cg_child = l2g; + top->cg_level = CG_SHARE_NONE; + top->cg_children = mp_ncpus / (l2count * l1count); + l1g = l2g + top->cg_children; + for (i = 0; i < top->cg_children; i++, l2g++) { + l2g->cg_parent = top; + l2g->cg_child = l1g; + l2g->cg_level = l2share; + for (j = 0; j < l2count; j++, l1g++) + cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, + l1flags, cpu); + } + return (top); +} + + +struct cpu_group * +smp_topo_find(struct cpu_group *top, int cpu) +{ + struct cpu_group *cg; + cpumask_t mask; + int children; + int i; + + mask = (1 << cpu); + cg = top; + for (;;) { + if ((cg->cg_mask & mask) == 0) + return (NULL); + if (cg->cg_children == 0) + return (cg); + children = cg->cg_children; + for (i = 0, cg = cg->cg_child; i < children; cg++, i++) + if ((cg->cg_mask & mask) != 0) + break; + } + return (NULL); +} +#else /* !SMP */ void smp_rendezvous(void (*setup_func)(void *), @@ -416,4 +575,19 @@ smp_rendezvous(void (*setup_func)(void * if (teardown_func != NULL) teardown_func(arg); } + +/* + * Provide dummy SMP support for UP kernels. Modules that need to use SMP + * APIs will still work using this dummy support. 
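The fake topologies above are selected by the new kern.smp.topology loader tunable (the sysctl is CTLFLAG_RD, so it cannot be flipped at run time), which makes the shapes testable on hardware that does not have them. For instance, kern.smp.topology=5 in /boot/loader.conf asks for smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0), which on an assumed 8-CPU machine builds:

root      (cpus 0-7, CG_SHARE_NONE)
  package (cpus 0-3, CG_SHARE_NONE)
    l2     (cpus 0-1, CG_SHARE_L2)
    l2     (cpus 2-3, CG_SHARE_L2)
  package (cpus 4-7, CG_SHARE_NONE)
    l2     (cpus 4-5, CG_SHARE_L2)
    l2     (cpus 6-7, CG_SHARE_L2)

smp_topo() then sanity-checks that the root's cg_count and cg_mask cover every CPU before the tree is handed to the scheduler.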
+ */ +static void +mp_setvariables_for_up(void *dummy) +{ + mp_ncpus = 1; + mp_maxid = PCPU_GET(cpuid); + all_cpus = PCPU_GET(cpumask); + KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); +} +SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, + mp_setvariables_for_up, NULL) #endif /* SMP */ Index: sys/kern/syscalls.c =================================================================== RCS file: /home/ncvs/src/sys/kern/syscalls.c,v retrieving revision 1.217 diff -u -p -r1.217 syscalls.c --- sys/kern/syscalls.c 12 Feb 2008 20:11:54 -0000 1.217 +++ sys/kern/syscalls.c 2 Mar 2008 02:58:10 -0000 @@ -2,7 +2,7 @@ * System call names. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/syscalls.c,v 1.217 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -491,4 +491,9 @@ const char *syscallnames[] = { "thr_kill2", /* 481 = thr_kill2 */ "shm_open", /* 482 = shm_open */ "shm_unlink", /* 483 = shm_unlink */ + "cpuset", /* 484 = cpuset */ + "cpuset_setid", /* 485 = cpuset_setid */ + "cpuset_getid", /* 486 = cpuset_getid */ + "cpuset_getaffinity", /* 487 = cpuset_getaffinity */ + "cpuset_setaffinity", /* 488 = cpuset_setaffinity */ }; Index: sys/kern/syscalls.master =================================================================== RCS file: /home/ncvs/src/sys/kern/syscalls.master,v retrieving revision 1.237 diff -u -p -r1.237 syscalls.master --- sys/kern/syscalls.master 12 Feb 2008 20:09:04 -0000 1.237 +++ sys/kern/syscalls.master 2 Mar 2008 02:58:10 -0000 @@ -850,5 +850,18 @@ 482 AUE_SHMOPEN STD { int shm_open(const char *path, int flags, \ mode_t mode); } 483 AUE_SHMUNLINK STD { int shm_unlink(const char *path); } +484 AUE_NULL STD { int cpuset(cpusetid_t *setid); } +485 AUE_NULL STD { int cpuset_setid(cpuwhich_t which, id_t id, \ + cpusetid_t setid); } +486 AUE_NULL STD { int cpuset_getid(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + cpusetid_t *setid); } +487 AUE_NULL STD { int cpuset_getaffinity(cpulevel_t level, \ + cpuwhich_t which, id_t id, int cpusetsize, \ + long *mask); } +488 AUE_NULL STD { int cpuset_setaffinity(cpulevel_t level, \ + cpuwhich_t which, id_t id, int cpusetsize, \ + long *mask); } + ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/systrace_args.c =================================================================== RCS file: /home/ncvs/src/sys/kern/systrace_args.c,v retrieving revision 1.17 diff -u -p -r1.17 systrace_args.c --- sys/kern/systrace_args.c 12 Feb 2008 20:11:54 -0000 1.17 +++ sys/kern/systrace_args.c 2 Mar 2008 02:58:11 -0000 @@ -2,7 +2,7 @@ * System call argument to DTrace register array converstion. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/systrace_args.c,v 1.17 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * This file is part of the DTrace syscall provider. 
*/ @@ -2887,6 +2887,54 @@ systrace_args(int sysnum, void *params, *n_args = 1; break; } + /* cpuset */ + case 484: { + struct cpuset_args *p = params; + uarg[0] = (intptr_t) p->setid; /* cpusetid_t * */ + *n_args = 1; + break; + } + /* cpuset_setid */ + case 485: { + struct cpuset_setid_args *p = params; + iarg[0] = p->which; /* cpuwhich_t */ + iarg[1] = p->id; /* id_t */ + iarg[2] = p->setid; /* cpusetid_t */ + *n_args = 3; + break; + } + /* cpuset_getid */ + case 486: { + struct cpuset_getid_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + uarg[3] = (intptr_t) p->setid; /* cpusetid_t * */ + *n_args = 4; + break; + } + /* cpuset_getaffinity */ + case 487: { + struct cpuset_getaffinity_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + iarg[3] = p->cpusetsize; /* int */ + uarg[4] = (intptr_t) p->mask; /* long * */ + *n_args = 5; + break; + } + /* cpuset_setaffinity */ + case 488: { + struct cpuset_setaffinity_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + iarg[3] = p->cpusetsize; /* int */ + uarg[4] = (intptr_t) p->mask; /* long * */ + *n_args = 5; + break; + } default: *n_args = 0; break; Index: sys/powerpc/powerpc/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/powerpc/powerpc/mp_machdep.c,v retrieving revision 1.13 diff -u -p -r1.13 mp_machdep.c --- sys/powerpc/powerpc/mp_machdep.c 16 May 2006 14:32:17 -0000 1.13 +++ sys/powerpc/powerpc/mp_machdep.c 2 Mar 2008 02:58:16 -0000 @@ -45,6 +45,13 @@ int boot_cpu_id; +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + void cpu_mp_setmaxid(void) { Index: sys/sparc64/sparc64/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/sparc64/sparc64/mp_machdep.c,v retrieving revision 1.36 diff -u -p -r1.36 mp_machdep.c --- sys/sparc64/sparc64/mp_machdep.c 16 Jun 2007 23:26:00 -0000 1.36 +++ sys/sparc64/sparc64/mp_machdep.c 2 Mar 2008 02:58:16 -0000 @@ -189,6 +189,13 @@ cpu_mp_probe(void) return (mp_maxid > 0); } +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + static void sun4u_startcpu(phandle_t cpu, void *func, u_long arg) { Index: sys/sun4v/sun4v/mp_machdep.c =================================================================== RCS file: /home/ncvs/src/sys/sun4v/sun4v/mp_machdep.c,v retrieving revision 1.8 diff -u -p -r1.8 mp_machdep.c --- sys/sun4v/sun4v/mp_machdep.c 6 Jul 2007 00:41:53 -0000 1.8 +++ sys/sun4v/sun4v/mp_machdep.c 2 Mar 2008 02:58:16 -0000 @@ -241,6 +241,13 @@ cpu_mp_probe(void) return (mp_maxid > 0); } +struct cpu_group * +cpu_topo(void) +{ + + return smp_topo_none(); +} + static int start_ap_bycpuid(int cpuid, void *func, u_long arg) { Index: sys/sys/_types.h =================================================================== RCS file: /home/ncvs/src/sys/sys/_types.h,v retrieving revision 1.23 diff -u -p -r1.23 _types.h --- sys/sys/_types.h 1 Mar 2006 06:29:34 -0000 1.23 +++ sys/sys/_types.h 2 Mar 2008 02:58:16 -0000 @@ -61,6 +61,9 @@ typedef struct __timer *__timer_t; /* ti typedef struct __mq *__mqd_t; /* mq_open()... */ typedef __uint32_t __uid_t; typedef unsigned int __useconds_t; /* microseconds (unsigned) */ +typedef int __cpuwhich_t; /* which parameter for cpuset. */ +typedef int __cpulevel_t; /* level parameter for cpuset. 
*/ +typedef int __cpusetid_t; /* cpuset identifier. */ /* * Unusual type definitions. Index: sys/sys/cpuset.h =================================================================== RCS file: sys/sys/cpuset.h diff -N sys/sys/cpuset.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/sys/cpuset.h 2 Mar 2008 02:58:16 -0000 @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 2008, Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_CPUSET_H_ +#define _SYS_CPUSET_H_ + +#ifdef _KERNEL +#define CPU_SETSIZE MAXCPU +#endif + +#define CPU_MAXSIZE 128 + +#ifndef CPU_SETSIZE +#define CPU_SETSIZE CPU_MAXSIZE +#endif + +#define _NCPUBITS (sizeof(long) * NBBY) /* bits per mask */ +#define _NCPUWORDS howmany(CPU_SETSIZE, _NCPUBITS) + +typedef struct _cpuset { + long __bits[howmany(CPU_SETSIZE, _NCPUBITS)]; +} cpuset_t; + +#define __cpuset_mask(n) ((long)1 << ((n) % _NCPUBITS)) +#define CPU_CLR(n, p) ((p)->__bits[(n)/_NCPUBITS] &= ~__cpuset_mask(n)) +#define CPU_COPY(f, t) (void)(*(t) = *(f)) +#define CPU_ISSET(n, p) (((p)->__bits[(n)/_NCPUBITS] & __cpuset_mask(n)) != 0) +#define CPU_SET(n, p) ((p)->__bits[(n)/_NCPUBITS] |= __cpuset_mask(n)) +#define CPU_ZERO(p) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (p)->__bits[__i] = 0; \ +} while (0) + +#define CPU_EMPTY(p) __extension__ ({ \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + if ((p)->__bits[__i]) \ + break; \ + __i == _NCPUWORDS; \ +}) + +#define CPU_OR(d, s) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (d)->__bits[__i] |= (s)->__bits[__i]; \ +} while (0) + +#define CPU_AND(d, s) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (d)->__bits[__i] &= (s)->__bits[__i]; \ +} while (0) + +#define CPU_NAND(d, s) do { \ + __size_t __i; \ + for (__i = 0; __i < _NCPUWORDS; __i++) \ + (d)->__bits[__i] &= ~(s)->__bits[__i]; \ +} while (0) + +/* + * Valid cpulevel_t values. + */ +#define CPU_LEVEL_ROOT 1 /* All system cpus. */ +#define CPU_LEVEL_CPUSET 2 /* Available cpus for which. */ +#define CPU_LEVEL_WHICH 3 /* Actual mask/id for which. */ + +/* + * Valid cpuwhich_t values. + */ +#define CPU_WHICH_TID 1 /* Specifies a thread id. */ +#define CPU_WHICH_PID 2 /* Specifies a process id. 
*/ +#define CPU_WHICH_CPUSET 3 /* Specifies a set id. */ + +/* + * Reserved cpuset identifiers. + */ +#define CPUSET_INVALID -1 +#define CPUSET_DEFAULT 0 + +#ifdef _KERNEL +LIST_HEAD(setlist, cpuset); + +/* + * cpusets encapsulate cpu binding information for one or more threads. + * + * a - Accessed with atomics. + * s - Set at creation, never modified. Only a ref required to read. + * c - Locked internally by a cpuset lock. + * + * The bitmask is only modified while holding the cpuset lock. It may be + * read while only a reference is held but the consumer must be prepared + * to deal with inconsistent results. + */ +struct cpuset { + cpuset_t cs_mask; /* bitmask of valid cpus. */ + volatile u_int cs_ref; /* (a) Reference count. */ + int cs_flags; /* (s) Flags from below. */ + cpusetid_t cs_id; /* (s) Id or INVALID. */ + struct cpuset *cs_parent; /* (s) Pointer to our parent. */ + LIST_ENTRY(cpuset) cs_link; /* (c) All identified sets. */ + LIST_ENTRY(cpuset) cs_siblings; /* (c) Sibling set link. */ + struct setlist cs_children; /* (c) List of children. */ +}; + +#define CPU_SET_ROOT 0x0001 /* Set is a root set. */ +#define CPU_SET_RDONLY 0x0002 /* No modification allowed. */ + +struct cpuset *cpuset_thread0(void); +struct cpuset *cpuset_ref(struct cpuset *); +void cpuset_rel(struct cpuset *); +#else +__BEGIN_DECLS +int cpuset(cpusetid_t *); +int cpuset_setid(cpuwhich_t, id_t, cpusetid_t); +int cpuset_getid(cpulevel_t, cpuwhich_t, id_t, cpusetid_t *); +int cpuset_getaffinity(cpulevel_t, cpuwhich_t, id_t, int, cpuset_t *); +int cpuset_setaffinity(cpulevel_t, cpuwhich_t, id_t, int, cpuset_t *); +__END_DECLS +#endif +#endif /* !_SYS_CPUSET_H_ */ Index: sys/sys/proc.h =================================================================== RCS file: /home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.503 diff -u -p -r1.503 proc.h --- sys/sys/proc.h 7 Feb 2008 06:55:38 -0000 1.503 +++ sys/sys/proc.h 2 Mar 2008 02:58:16 -0000 @@ -163,6 +163,7 @@ struct thread; struct trapframe; struct turnstile; struct mqueue_notifier; +struct cpuset; /* * Here we define the two structures used for process information. @@ -208,7 +209,7 @@ struct thread { /* The two queues below should someday be merged. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ - + struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ Index: sys/sys/sched.h =================================================================== RCS file: /home/ncvs/src/sys/sys/sched.h,v retrieving revision 1.33 diff -u -p -r1.33 sched.h --- sys/sys/sched.h 12 Jun 2007 19:49:39 -0000 1.33 +++ sys/sys/sched.h 2 Mar 2008 02:58:16 -0000 @@ -32,7 +32,7 @@ */ /*- - * Copyright (c) 2002, Jeffrey Roberson + * Copyright (c) 2002-2008, Jeffrey Roberson * All rights reserved. 
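Putting the userland side of this header to work: with the default CPU_MAXSIZE of 128 and 64-bit longs the mask is two words, so for example CPU_SET(70, &mask) sets bit 6 of __bits[1]. A hedged example program against the declarations above (using an id of -1 for the calling process and passing CPU_SETSIZE as the cpusetsize argument, both following usr.bin/cpuset below):

#include <sys/param.h>
#include <sys/cpuset.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        cpuset_t mask;
        int cpu;

        /* Restrict the calling process to CPUs 0 and 1. */
        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        CPU_SET(1, &mask);
        if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
            CPU_SETSIZE, &mask) != 0)
                err(1, "cpuset_setaffinity");

        /* Read the mask back and print the CPUs it contains. */
        CPU_ZERO(&mask);
        if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
            CPU_SETSIZE, &mask) != 0)
                err(1, "cpuset_getaffinity");
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &mask))
                        printf("%d ", cpu);
        printf("\n");
        return (0);
}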
* * Redistribution and use in source and binary forms, with or without @@ -129,6 +129,7 @@ static __inline void sched_pin(void); void sched_unbind(struct thread *td); static __inline void sched_unpin(void); int sched_is_bound(struct thread *td); +void sched_affinity(struct thread *td); /* * These procedures tell the process data structure allocation code how @@ -175,6 +176,7 @@ extern long switch_needresched; void schedinit(void); void sched_newproc(struct proc *p, struct thread *td); void sched_newthread(struct thread *td); + #endif /* _KERNEL */ /* POSIX 1003.1b Process Scheduling */ Index: sys/sys/smp.h =================================================================== RCS file: /home/ncvs/src/sys/sys/smp.h,v retrieving revision 1.86 diff -u -p -r1.86 smp.h --- sys/sys/smp.h 8 Nov 2007 14:47:55 -0000 1.86 +++ sys/sys/smp.h 2 Mar 2008 02:58:16 -0000 @@ -32,18 +32,40 @@ */ struct cpu_group { - cpumask_t cg_mask; /* Mask of cpus in this group. */ - int cg_count; /* Count of cpus in this group. */ - int cg_children; /* Number of children groups. */ - struct cpu_group *cg_child; /* Optional child group. */ + struct cpu_group *cg_parent; /* Our parent group. */ + struct cpu_group *cg_child; /* Optional children groups. */ + cpumask_t cg_mask; /* Mask of cpus in this group. */ + int8_t cg_count; /* Count of cpus in this group. */ + int8_t cg_children; /* Number of children groups. */ + int8_t cg_level; /* Shared cache level. */ + int8_t cg_flags; /* Traversal modifiers. */ }; -struct cpu_top { - int ct_count; /* Count of groups. */ - struct cpu_group *ct_group; /* Array of pointers to cpu groups. */ -}; +/* + * Defines common resources for CPUs in the group. The highest level + * resource should be used when multiple are shared. + */ +#define CG_SHARE_NONE 0 +#define CG_SHARE_L1 1 +#define CG_SHARE_L2 2 +#define CG_SHARE_L3 3 + +/* + * Behavior modifiers for load balancing and affinity. + */ +#define CG_FLAG_HTT 0x01 /* Schedule the alternate core last. */ +#define CG_FLAG_THREAD 0x02 /* New age htt, less crippled. */ + +/* + * Convenience routines for building topologies. + */ +struct cpu_group *smp_topo(void); +struct cpu_group *smp_topo_none(void); +struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); +struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, + int l1count, int l1flags); +struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu); -extern struct cpu_top *smp_topology; extern void (*cpustop_restartfunc)(void); extern int smp_active; extern int smp_cpus; @@ -90,6 +112,7 @@ extern cpumask_t all_cpus; */ struct thread; +struct cpu_group *cpu_topo(void); void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); Index: sys/sys/syscall.h =================================================================== RCS file: /home/ncvs/src/sys/sys/syscall.h,v retrieving revision 1.214 diff -u -p -r1.214 syscall.h --- sys/sys/syscall.h 12 Feb 2008 20:11:53 -0000 1.214 +++ sys/sys/syscall.h 2 Mar 2008 02:58:16 -0000 @@ -2,7 +2,7 @@ * System call numbers. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/sys/syscall.h,v 1.214 2008/02/12 20:11:53 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -403,4 +403,9 @@ #define SYS_thr_kill2 481 #define SYS_shm_open 482 #define SYS_shm_unlink 483 -#define SYS_MAXSYSCALL 484 +#define SYS_cpuset 484 +#define SYS_cpuset_setid 485 +#define SYS_cpuset_getid 486 +#define SYS_cpuset_getaffinity 487 +#define SYS_cpuset_setaffinity 488 +#define SYS_MAXSYSCALL 489 Index: sys/sys/syscall.mk =================================================================== RCS file: /home/ncvs/src/sys/sys/syscall.mk,v retrieving revision 1.169 diff -u -p -r1.169 syscall.mk --- sys/sys/syscall.mk 12 Feb 2008 20:11:53 -0000 1.169 +++ sys/sys/syscall.mk 2 Mar 2008 02:58:16 -0000 @@ -1,6 +1,6 @@ # FreeBSD system call names. # DO NOT EDIT-- this file is automatically generated. -# $FreeBSD: src/sys/sys/syscall.mk,v 1.169 2008/02/12 20:11:53 ru Exp $ +# $FreeBSD$ # created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp MIASM = \ syscall.o \ @@ -351,4 +351,9 @@ MIASM = \ ftruncate.o \ thr_kill2.o \ shm_open.o \ - shm_unlink.o + shm_unlink.o \ + cpuset.o \ + cpuset_setid.o \ + cpuset_getid.o \ + cpuset_getaffinity.o \ + cpuset_setaffinity.o Index: sys/sys/sysproto.h =================================================================== RCS file: /home/ncvs/src/sys/sys/sysproto.h,v retrieving revision 1.218 diff -u -p -r1.218 sysproto.h --- sys/sys/sysproto.h 12 Feb 2008 20:11:54 -0000 1.218 +++ sys/sys/sysproto.h 2 Mar 2008 02:58:16 -0000 @@ -2,7 +2,7 @@ * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/sys/sysproto.h,v 1.218 2008/02/12 20:11:54 ru Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.237 2008/02/12 20:09:04 ru Exp */ @@ -1528,6 +1528,34 @@ struct shm_open_args { struct shm_unlink_args { char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; }; +struct cpuset_args { + char setid_l_[PADL_(cpusetid_t *)]; cpusetid_t * setid; char setid_r_[PADR_(cpusetid_t *)]; +}; +struct cpuset_setid_args { + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char setid_l_[PADL_(cpusetid_t)]; cpusetid_t setid; char setid_r_[PADR_(cpusetid_t)]; +}; +struct cpuset_getid_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char setid_l_[PADL_(cpusetid_t *)]; cpusetid_t * setid; char setid_r_[PADR_(cpusetid_t *)]; +}; +struct cpuset_getaffinity_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char cpusetsize_l_[PADL_(int)]; int cpusetsize; char cpusetsize_r_[PADR_(int)]; + char mask_l_[PADL_(long *)]; long * mask; char mask_r_[PADR_(long *)]; +}; +struct cpuset_setaffinity_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char cpusetsize_l_[PADL_(int)]; int cpusetsize; char 
cpusetsize_r_[PADR_(int)]; + char mask_l_[PADL_(long *)]; long * mask; char mask_r_[PADR_(long *)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_exit(struct thread *, struct sys_exit_args *); int fork(struct thread *, struct fork_args *); @@ -1869,6 +1897,11 @@ int ftruncate(struct thread *, struct ft int thr_kill2(struct thread *, struct thr_kill2_args *); int shm_open(struct thread *, struct shm_open_args *); int shm_unlink(struct thread *, struct shm_unlink_args *); +int cpuset(struct thread *, struct cpuset_args *); +int cpuset_setid(struct thread *, struct cpuset_setid_args *); +int cpuset_getid(struct thread *, struct cpuset_getid_args *); +int cpuset_getaffinity(struct thread *, struct cpuset_getaffinity_args *); +int cpuset_setaffinity(struct thread *, struct cpuset_setaffinity_args *); #ifdef COMPAT_43 @@ -2435,6 +2468,11 @@ int freebsd4_sigreturn(struct thread *, #define SYS_AUE_thr_kill2 AUE_KILL #define SYS_AUE_shm_open AUE_SHMOPEN #define SYS_AUE_shm_unlink AUE_SHMUNLINK +#define SYS_AUE_cpuset AUE_NULL +#define SYS_AUE_cpuset_setid AUE_NULL +#define SYS_AUE_cpuset_getid AUE_NULL +#define SYS_AUE_cpuset_getaffinity AUE_NULL +#define SYS_AUE_cpuset_setaffinity AUE_NULL #undef PAD_ #undef PADL_ Index: sys/sys/types.h =================================================================== RCS file: /home/ncvs/src/sys/sys/types.h,v retrieving revision 1.97 diff -u -p -r1.97 types.h --- sys/sys/types.h 28 Nov 2007 21:54:46 -0000 1.97 +++ sys/sys/types.h 2 Mar 2008 02:58:16 -0000 @@ -124,6 +124,10 @@ typedef __blksize_t blksize_t; #define _BLKSIZE_T_DECLARED #endif +typedef __cpuwhich_t cpuwhich_t; +typedef __cpulevel_t cpulevel_t; +typedef __cpusetid_t cpusetid_t; + #ifndef _BLKCNT_T_DECLARED typedef __blkcnt_t blkcnt_t; #define _BLKCNT_T_DECLARED Index: usr.bin/Makefile =================================================================== RCS file: /home/ncvs/src/usr.bin/Makefile,v retrieving revision 1.306 diff -u -p -r1.306 Makefile --- usr.bin/Makefile 20 Dec 2007 16:40:25 -0000 1.306 +++ usr.bin/Makefile 2 Mar 2008 02:58:20 -0000 @@ -42,6 +42,7 @@ SUBDIR= alias \ comm \ compile_et \ compress \ + cpuset \ csplit \ ${_csup} \ ctags \ Index: usr.bin/cpuset/Makefile =================================================================== RCS file: usr.bin/cpuset/Makefile diff -N usr.bin/cpuset/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/cpuset/Makefile 2 Mar 2008 02:58:20 -0000 @@ -0,0 +1,7 @@ +# $FreeBSD$ + +PROG= cpuset +NO_MAN= true +WARNS?= 1 + +.include Index: usr.bin/cpuset/cpuset.c =================================================================== RCS file: usr.bin/cpuset/cpuset.c diff -N usr.bin/cpuset/cpuset.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/cpuset/cpuset.c 2 Mar 2008 02:58:20 -0000 @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2007, 2008 Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +int cflag; +int gflag; +int iflag; +int lflag; +int pflag; +int rflag; +int sflag; +int tflag; +id_t id; +cpulevel_t level; +cpuwhich_t which; + +void usage(void); + +void printset(cpuset_t *mask); + +void +parselist(char *list, cpuset_t *mask) +{ + enum { NONE, NUM, DASH } state; + int lastnum; + int curnum; + char *l; + + state = NONE; + for (l = list; *l != '\0';) { + if (isdigit(*l)) { + curnum = atoi(l); + if (curnum > CPU_SETSIZE) + errx(EXIT_FAILURE, + "Only %d cpus supported", CPU_SETSIZE); + while (isdigit(*l)) + l++; + switch (state) { + case NONE: + lastnum = curnum; + state = NUM; + break; + case DASH: + for (; lastnum <= curnum; lastnum++) + CPU_SET(lastnum, mask); + state = NONE; + break; + case NUM: + default: + goto parserr; + } + continue; + } + switch (*l) { + case ',': + switch (state) { + case NONE: + break; + case NUM: + CPU_SET(curnum, mask); + state = NONE; + break; + case DASH: + goto parserr; + break; + } + break; + case '-': + if (state != NUM) + goto parserr; + state = DASH; + break; + default: + goto parserr; + } + l++; + } + switch (state) { + case NONE: + break; + case NUM: + CPU_SET(curnum, mask); + break; + case DASH: + goto parserr; + } + return; +parserr: + errx(EXIT_FAILURE, "Malformed cpu list %s", list); +} + +void +printset(cpuset_t *mask) +{ + int once; + int cpu; + + for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) { + if (CPU_ISSET(cpu, mask)) { + if (once == 0) { + printf("%d", cpu); + once = 1; + } else + printf(", %d", cpu); + } + } + printf("\n"); +} + +char *whichnames[] = { NULL, "tid", "pid", "cpuset" }; +char *levelnames[] = { NULL, " root", " cpuset", "" }; + +void +printaffinity(void) +{ + cpuset_t mask; + + if (cpuset_getaffinity(level, which, id, CPU_SETSIZE, + &mask) != 0) + err(EXIT_FAILURE, "getaffinity"); + printf("%s %d%s mask: ", whichnames[which], id, levelnames[level]); + printset(&mask); + exit(EXIT_SUCCESS); +} + +void +printsetid(void) +{ + cpusetid_t setid; + + /* + * Only LEVEL_WHICH && WHICH_CPUSET has a numbered id. 
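parselist() implements the usual comma and range syntax: "2" selects a single CPU, "0,2" selects CPUs 0 and 2, "0-3" expands to CPUs 0 through 3, and the forms combine as in "0-3,8"; anything else, including a trailing dash, lands on the "Malformed cpu list" error. printset() prints the result back in the same form, as a comma-separated list of the CPUs that are set.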
+ */ + if (level == CPU_LEVEL_WHICH && !sflag) + level = CPU_LEVEL_CPUSET; + if (cpuset_getid(level, which, id, &setid)) + err(errno, "getid"); + printf("%s %d%s id: %d\n", whichnames[which], id, + levelnames[level], setid); +} + +int +main(int argc, char *argv[]) +{ + cpusetid_t setid; + cpuset_t mask; + lwpid_t tid; + pid_t pid; + int ch; + + CPU_ZERO(&mask); + level = CPU_LEVEL_WHICH; + which = CPU_WHICH_PID; + id = -1; + while ((ch = getopt(argc, argv, "cgil:p:rs:t:")) != -1) { + switch (ch) { + case 'c': + if (rflag) + usage(); + cflag = 1; + level = CPU_LEVEL_CPUSET; + break; + case 'g': + gflag = 1; + break; + case 'i': + iflag = 1; + break; + case 'l': + lflag = 1; + parselist(optarg, &mask); + break; + case 'p': + pflag = 1; + which = CPU_WHICH_PID; + id = pid = atoi(optarg); + break; + case 'r': + if (cflag) + usage(); + level = CPU_LEVEL_ROOT; + rflag = 1; + break; + case 's': + sflag = 1; + which = CPU_WHICH_CPUSET; + id = setid = atoi(optarg); + break; + case 't': + tflag = 1; + which = CPU_WHICH_TID; + id = tid = atoi(optarg); + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + if (gflag) { + if (argc || lflag) + usage(); + /* Only one identity specifier. */ + if (sflag + pflag + tflag > 1) + usage(); + if (iflag) + printsetid(); + else + printaffinity(); + exit(EXIT_SUCCESS); + } + /* + * The user wants to run a command with a set and possibly cpumask. + */ + if (argc) { + if (pflag | rflag | tflag || cflag) + usage(); + if (sflag && iflag) + usage(); + if (sflag) { + if (cpuset_setid(CPU_WHICH_PID, -1, setid)) + err(argc, "setid"); + which = CPU_WHICH_PID; + level = CPU_LEVEL_WHICH; + } + if (iflag) { + if (cpuset(&setid)) + err(argc, "newid"); + which = CPU_WHICH_CPUSET; + level = CPU_LEVEL_WHICH; + } + if (lflag) { + if (cpuset_setaffinity(level, which, -1, + CPU_SETSIZE, &mask) != 0) + err(EXIT_FAILURE, "setaffinity"); + } + errno = 0; + execvp(*argv, argv); + err(errno == ENOENT ? 127 : 126, "%s", *argv); + } + /* + * We're modifying something that presently exists. + */ + if (iflag) + usage(); + if (!lflag && (cflag || rflag)) + usage(); + if (!lflag && !sflag) + usage(); + /* You can only set a mask on a thread. */ + if (tflag && (sflag || pflag)) + usage(); + if (pflag && sflag) { + if (cpuset_setid(CPU_WHICH_PID, pid, setid)) + err(EXIT_FAILURE, "setid"); + /* + * If the user specifies a set and a list we want the mask + * to effect the pid and not the set. + */ + which = CPU_WHICH_PID; + id = pid; + } + if (lflag) { + if (cpuset_setaffinity(level, which, id, CPU_SETSIZE, + &mask) != 0) + err(EXIT_FAILURE, "setaffinity"); + } + + exit(EXIT_SUCCESS); +} + +void +usage(void) +{ + + fprintf(stderr, + "usage: cpuset [-l cpu list] [-i | -s setid] cmd ...\n"); + fprintf(stderr, + " cpuset [-l cpu list] [-s setid] -p pid\n"); + fprintf(stderr, + " cpuset [-cr] [-l cpu list] [-p pid | -t tid | -s setid]\n"); + fprintf(stderr, + " cpuset [-cgir] [-p pid | -t tid | -s setid]\n"); + exit(1); +}
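Taken together, the tool as posted supports invocations such as "cpuset -l 0-3 make" (bind the current process to CPUs 0-3, then exec make), "cpuset -g -p 1234" (print the mask currently applied to pid 1234) and "cpuset -l 1 -t 100123" (re-pin an existing thread by tid, the tid value here being only an example), while -s, -i, -c and -r select which set or level the operation applies to, as laid out in usage().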