Index: sys/smp.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/smp.h,v
retrieving revision 1.86
diff -u -r1.86 smp.h
--- sys/smp.h	8 Nov 2007 14:47:55 -0000	1.86
+++ sys/smp.h	16 Feb 2008 00:13:42 -0000
@@ -32,12 +32,30 @@
  */
 
 struct cpu_group {
-	cpumask_t cg_mask;		/* Mask of cpus in this group. */
-	int	cg_count;		/* Count of cpus in this group. */
-	int	cg_children;		/* Number of children groups. */
-	struct cpu_group *cg_child;	/* Optional child group. */
+	struct cpu_group *cg_parent;	/* Our parent group. */
+	struct cpu_group *cg_child;	/* Optional children groups. */
+	cpumask_t	cg_mask;	/* Mask of cpus in this group. */
+	int8_t		cg_count;	/* Count of cpus in this group. */
+	int8_t		cg_children;	/* Number of children groups. */
+	int8_t		cg_level;	/* Shared cache level, CG_SHARE_*. */
+	int8_t		cg_flags;	/* Traversal modifiers, CG_FLAG_*. */
 };
 
+/*
+ * Defines common resources for CPUs in the group. The highest level
+ * resource should be used when multiple are shared.
+ */
+#define	CG_SHARE_NONE	0
+#define	CG_SHARE_L1	1
+#define	CG_SHARE_L2	2
+#define	CG_SHARE_L3	3
+
+/*
+ * Behavior modifiers for load balancing and affinity.
+ */
+#define	CG_FLAG_HTT	0x01		/* Schedule the alternate core last. */
+#define	CG_FLAG_THREAD	0x02		/* New age htt, less crippled. */
+
 struct cpu_top {
 	int	ct_count;		/* Count of groups. */
 	struct cpu_group *ct_group;	/* Array of pointers to cpu groups. */
Index: kern/sched_ule.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.223
diff -u -r1.223 sched_ule.c
--- kern/sched_ule.c	23 Jan 2008 03:10:18 -0000	1.223
+++ kern/sched_ule.c	16 Feb 2008 00:13:42 -0000
@@ -190,47 +190,28 @@
  * locking in sched_pickcpu();
  */
 struct tdq {
-	struct mtx	*tdq_lock;		/* Pointer to group lock. */
+	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
+	struct mtx	tdq_lock;		/* Run queue lock. */
 	struct runq	tdq_realtime;		/* real-time run queue. */
 	struct runq	tdq_timeshare;		/* timeshare run queue. */
 	struct runq	tdq_idle;		/* Queue of IDLE threads. */
 	int		tdq_load;		/* Aggregate load. */
+	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 	u_char		tdq_idx;		/* Current insert index. */
 	u_char		tdq_ridx;		/* Current removal index. */
 #ifdef SMP
 	u_char		tdq_lowpri;		/* Lowest priority thread. */
 	int		tdq_transferable;	/* Transferable thread count. */
-	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
-	struct tdq_group *tdq_group;		/* Our processor group. */
-#else
-	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 #endif
+	char		tdq_name[sizeof("sched lock") + 6];
 } __aligned(64);
 
 
 #ifdef SMP
-/*
- * tdq groups are groups of processors which can cheaply share threads. When
- * one processor in the group goes idle it will check the runqs of the other
- * processors in its group prior to halting and waiting for an interrupt.
- * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA.
- * In a numa environment we'd want an idle bitmap per group and a two tiered
- * load balancer.
- */
-struct tdq_group {
-	struct mtx	tdg_lock;	/* Protects all fields below. */
-	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
-	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
-	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
-	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
-	int	tdg_load;		/* Total load of this group. */
-	int	tdg_transferable;	/* Transferable load of this group. */
-	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
-	char	tdg_name[16];		/* lock name. */
-} __aligned(64);
+struct cpu_group *cpu_top;
 
-#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 300))
-#define	SCHED_AFFINITY(ts)	((ts)->ts_rltick > ticks - affinity)
+#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
+#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
 
 /*
  * Run-time tunables.
@@ -240,6 +221,8 @@
 static int pick_pri = 1;
 static int affinity;
 static int tryself = 1;
+static int lowpri_userret = 0;
+static int oldtryself = 0;
 static int steal_htt = 1;
 static int steal_idle = 1;
 static int steal_thresh = 2;
@@ -248,19 +231,13 @@
 /*
  * One thread queue per processor.
  */
-static volatile cpumask_t tdq_idle;
-static int tdg_maxid;
 static struct tdq	tdq_cpu[MAXCPU];
-static struct tdq_group tdq_groups[MAXCPU];
 static struct tdq	*balance_tdq;
-static int balance_group_ticks;
 static int balance_ticks;
 
 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
 #define	TDQ_ID(x)	((int)((x) - tdq_cpu))
-#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
-#define	TDG_ID(x)	((int)((x) - tdq_groups))
 #else	/* !SMP */
 static struct tdq	tdq_cpu;
 static struct mtx	tdq_lock;
@@ -274,7 +251,7 @@
 #define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
 #define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
-#define	TDQ_LOCKPTR(t)		((t)->tdq_lock)
+#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
 
 static void sched_priority(struct thread *);
 static void sched_thread_priority(struct thread *, u_char);
@@ -301,8 +278,6 @@
 static struct td_sched *runq_steal(struct runq *);
 static int sched_pickcpu(struct td_sched *, int);
 static void sched_balance(void);
-static void sched_balance_groups(void);
-static void sched_balance_group(struct tdq_group *);
 static void sched_balance_pair(struct tdq *, struct tdq *);
 static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
 static inline struct mtx *thread_block_switch(struct thread *);
@@ -356,7 +331,8 @@
 	tdq = TDQ_CPU(cpu);
 
 	printf("tdq %d:\n", TDQ_ID(tdq));
-	printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+	printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
+	printf("\tLock name: %s\n", tdq->tdq_name);
 	printf("\tload: %d\n", tdq->tdq_load);
 	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
@@ -369,8 +345,6 @@
 #ifdef SMP
 	printf("\tload transferable: %d\n", tdq->tdq_transferable);
 	printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
-	printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group));
-	printf("\tLock name: %s\n", tdq->tdq_group->tdg_name);
 #endif
 }
 
@@ -388,7 +362,6 @@
 #ifdef SMP
 	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
 		tdq->tdq_transferable++;
-		tdq->tdq_group->tdg_transferable++;
 		ts->ts_flags |= TSF_XFERABLE;
 	}
 #endif
@@ -434,7 +407,6 @@
 #ifdef SMP
 	if (ts->ts_flags & TSF_XFERABLE) {
 		tdq->tdq_transferable--;
-		tdq->tdq_group->tdg_transferable--;
 		ts->ts_flags &= ~TSF_XFERABLE;
 	}
 #endif
@@ -470,11 +442,7 @@
 	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
 	if (class != PRI_ITHD &&
 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
-#ifdef SMP
-		tdq->tdq_group->tdg_load++;
-#else
 		tdq->tdq_sysload++;
-#endif
 }
 
 /*
@@ -491,11 +459,7 @@
 	class = PRI_BASE(ts->ts_thread->td_pri_class);
 	if (class != PRI_ITHD &&
 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
-#ifdef SMP
-		tdq->tdq_group->tdg_load--;
-#else
 		tdq->tdq_sysload--;
-#endif
 	KASSERT(tdq->tdq_load != 0,
 	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
 	tdq->tdq_load--;
@@ -504,111 +468,237 @@
 }
 
 #ifdef SMP
+struct cpu_search {
+	u_int	cs_load;	/* Load of cs_cpu, or the search sentinel. */
+	u_int	cs_cpu;		/* Selected cpu, -1 if none was found. */
+	int	cs_limit;	/* Min priority for low, min load for high. */
+};
+
+#define	CPU_SEARCH_LOWEST	0x1
+#define	CPU_SEARCH_HIGHEST	0x2
+#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
+
+#define	CPUMASK_FOREACH(cpu, mask)				\
+	for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++)	\
+		if ((mask) & ((cpumask_t)1 << (cpu)))
+
+static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high, const int match);
+int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high);
+
+/*
+ * This routine compares according to the match argument and should be
+ * reduced in actual instantiations via constant propagation and dead code
+ * elimination.
+ */
+static __inline int
+cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
+    const int match)
+{
+	struct tdq *tdq;
+
+	tdq = TDQ_CPU(cpu);
+	if (match & CPU_SEARCH_LOWEST)
+		if (tdq->tdq_load < low->cs_load &&
+		    tdq->tdq_lowpri > low->cs_limit) {
+			low->cs_cpu = cpu;
+			low->cs_load = tdq->tdq_load;
+		}
+	if (match & CPU_SEARCH_HIGHEST)
+		if (tdq->tdq_load >= high->cs_limit &&
+		    tdq->tdq_load > high->cs_load && tdq->tdq_transferable) {
+			high->cs_cpu = cpu;
+			high->cs_load = tdq->tdq_load;
+		}
+	return (tdq->tdq_load);
+}
+
 /*
- * sched_balance is a simple CPU load balancing algorithm. It operates by
- * finding the least loaded and most loaded cpu and equalizing their load
- * by migrating some processes.
- *
- * Dealing only with two CPUs at a time has two advantages. Firstly, most
- * installations will only have 2 cpus. Secondly, load balancing too much at
- * once can have an unpleasant effect on the system. The scheduler rarely has
- * enough information to make perfect decisions. So this algorithm chooses
- * simplicity and more gradual effects on load in larger systems.
+ * Search the tree of cpu_groups for the lowest or highest loaded cpu
+ * according to the match argument. This routine actually compares the
+ * load on all paths through the tree and finds the least loaded cpu on
+ * the least loaded path, which may differ from the least loaded cpu in
+ * the system. This balances work among caches and busses.
  *
+ * This inline is instantiated in three forms below using constants for the
+ * match argument. It is reduced to the minimum set for each case. It is
+ * also recursive to the depth of the tree.
+ */
+static inline int
+cpu_search(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high, const int match)
+{
+	int total;
+
+	total = 0;
+	if (cg->cg_children) {
+		struct cpu_search lgroup;
+		struct cpu_search hgroup;
+		struct cpu_group *child;
+		u_int lload;
+		int hload;
+		int load;
+		int i;
+
+		lload = -1;
+		hload = -1;
+		for (i = 0; i < cg->cg_children; i++) {
+			child = &cg->cg_child[i];
+			if (match & CPU_SEARCH_LOWEST) {
+				lgroup.cs_cpu = -1;
+				lgroup.cs_load = -1;
+				lgroup.cs_limit = low->cs_limit;
+			}
+			if (match & CPU_SEARCH_HIGHEST) {
+				hgroup.cs_cpu = -1;
+				hgroup.cs_load = 0;
+				hgroup.cs_limit = high->cs_limit;
+			}
+			switch (match) {
+			case CPU_SEARCH_LOWEST:
+				load = cpu_search_lowest(child, &lgroup);
+				break;
+			case CPU_SEARCH_HIGHEST:
+				load = cpu_search_highest(child, &hgroup);
+				break;
+			case CPU_SEARCH_BOTH:
+				load = cpu_search_both(child, &lgroup, &hgroup);
+				break;
+			}
+			total += load;
+			if (match & CPU_SEARCH_LOWEST && load < lload) {
+				*low = lgroup;
+				lload = load;
+			}
+			if (match & CPU_SEARCH_HIGHEST && load > hload) {
+				hload = load;
+				*high = hgroup;
+			}
+		}
+	} else {
+		int cpu;
+
+		CPUMASK_FOREACH(cpu, cg->cg_mask)
+			total += cpu_compare(cpu, low, high, match);
+	}
+	return (total);
+}
+
+/*
+ * cpu_search instantiations must pass constants to maintain the inline
+ * optimization.
  */
-static void
-sched_balance()
+int
+cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
 {
-	struct tdq_group *high;
-	struct tdq_group *low;
-	struct tdq_group *tdg;
-	struct tdq *tdq;
-	int cnt;
-	int i;
+	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
+}
 
-	/*
-	 * Select a random time between .5 * balance_interval and
-	 * 1.5 * balance_interval.
-	 */
-	balance_ticks = max(balance_interval / 2, 1);
-	balance_ticks += random() % balance_interval;
-	if (smp_started == 0 || rebalance == 0)
-		return;
-	tdq = TDQ_SELF();
-	TDQ_UNLOCK(tdq);
-	low = high = NULL;
-	i = random() % (tdg_maxid + 1);
-	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
-		tdg = TDQ_GROUP(i);
-		/*
-		 * Find the CPU with the highest load that has some
-		 * threads to transfer.
-		 */
-		if ((high == NULL || tdg->tdg_load > high->tdg_load)
-		    && tdg->tdg_transferable)
-			high = tdg;
-		if (low == NULL || tdg->tdg_load < low->tdg_load)
-			low = tdg;
-		if (++i > tdg_maxid)
-			i = 0;
-	}
-	if (low != NULL && high != NULL && high != low)
-		sched_balance_pair(LIST_FIRST(&high->tdg_members),
-		    LIST_FIRST(&low->tdg_members));
-	TDQ_LOCK(tdq);
+int
+cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
+{
+	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
+}
+
+int
+cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+    struct cpu_search *high)
+{
+	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
+}
+
+/*
+ * Find the cpu with the least load via the least loaded path that has a
+ * lowpri greater than pri. A pri of -1 indicates any priority is
+ * acceptable.
+ */
+static inline int
+sched_lowest(struct cpu_group *cg, int pri)
+{
+	struct cpu_search low;
+
+	low.cs_cpu = -1;
+	low.cs_limit = pri;
+	low.cs_load = -1;
+	cpu_search_lowest(cg, &low);
+	return low.cs_cpu;
+}
+
+/*
+ * Find the cpu with the highest load via the highest loaded path.
+ */
+static inline int
+sched_highest(struct cpu_group *cg, int minload)
+{
+	struct cpu_search high;
+
+	high.cs_cpu = -1;
+	high.cs_load = 0;
+	high.cs_limit = minload;
+	cpu_search_highest(cg, &high);
+	return high.cs_cpu;
 }
 
 /*
- * Balance load between CPUs in a group. Will only migrate within the group.
+ * Simultaneously find the highest and lowest loaded cpu reachable via
+ * cg.
  */
+static inline void
+sched_both(struct cpu_group *cg, int *lowcpu, int *highcpu)
+{
+	struct cpu_search high;
+	struct cpu_search low;
+
+	low.cs_cpu = -1;
+	low.cs_limit = -1;
+	low.cs_load = -1;
+	high.cs_load = 0;
+	high.cs_cpu = -1;
+	high.cs_limit = -1;
+	cpu_search_both(cg, &low, &high);
+	*lowcpu = low.cs_cpu;
+	*highcpu = high.cs_cpu;
+	return;
+}
+
 static void
-sched_balance_groups()
+sched_balance_group(struct cpu_group *cg)
 {
-	struct tdq *tdq;
+	int high;
+	int low;
 	int i;
 
+	sched_both(cg, &low, &high);
+	if (low != high && low != -1 && high != -1)
+		sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low));
+
+	for (i = 0; i < cg->cg_children; i++)
+		sched_balance_group(&cg->cg_child[i]);
+}
+
+static void
+sched_balance()
+{
+	struct tdq *tdq;
+
 	/*
 	 * Select a random time between .5 * balance_interval and
 	 * 1.5 * balance_interval.
 	 */
-	balance_group_ticks = max(balance_interval / 2, 1);
-	balance_group_ticks += random() % balance_interval;
+	balance_ticks = max(balance_interval / 2, 1);
+	balance_ticks += random() % balance_interval;
 	if (smp_started == 0 || rebalance == 0)
 		return;
 	tdq = TDQ_SELF();
 	TDQ_UNLOCK(tdq);
-	for (i = 0; i <= tdg_maxid; i++)
-		sched_balance_group(TDQ_GROUP(i));
+	sched_balance_group(cpu_top);
 	TDQ_LOCK(tdq);
 }
 
 /*
- * Finds the greatest imbalance between two tdqs in a group.
- */
-static void
-sched_balance_group(struct tdq_group *tdg)
-{
-	struct tdq *tdq;
-	struct tdq *high;
-	struct tdq *low;
-	int load;
-
-	if (tdg->tdg_transferable == 0)
-		return;
-	low = NULL;
-	high = NULL;
-	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
-		load = tdq->tdq_load;
-		if (high == NULL || load > high->tdq_load)
-			high = tdq;
-		if (low == NULL || load < low->tdq_load)
-			low = tdq;
-	}
-	if (high != NULL && low != NULL && high != low)
-		sched_balance_pair(high, low);
-}
-
-/*
  * Lock two thread queues using their address to maintain lock order.
  */
 static void
@@ -647,20 +737,9 @@
 	int i;
 
 	tdq_lock_pair(high, low);
-	/*
-	 * If we're transfering within a group we have to use this specific
-	 * tdq's transferable count, otherwise we can steal from other members
-	 * of the group.
-	 */
-	if (high->tdq_group == low->tdq_group) {
-		transferable = high->tdq_transferable;
-		high_load = high->tdq_load;
-		low_load = low->tdq_load;
-	} else {
-		transferable = high->tdq_group->tdg_transferable;
-		high_load = high->tdq_group->tdg_load;
-		low_load = low->tdq_group->tdg_load;
-	}
+	transferable = high->tdq_transferable;
+	high_load = high->tdq_load;
+	low_load = low->tdq_load;
 	/*
 	 * Determine what the imbalance is and then adjust that to how many
 	 * threads we actually have to give up (transferable).
@@ -700,20 +779,7 @@
 	tdq = from;
 	cpu = TDQ_ID(to);
 	ts = tdq_steal(tdq);
-	if (ts == NULL) {
-		struct tdq_group *tdg;
-
-		tdg = tdq->tdq_group;
-		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
-			if (tdq == from || tdq->tdq_transferable == 0)
-				continue;
-			ts = tdq_steal(tdq);
-			break;
-		}
-		if (ts == NULL)
-			return;
-	}
-	if (tdq == to)
+	if (ts == NULL)
 		return;
 	td = ts->ts_thread;
 	/*
@@ -736,72 +802,42 @@
 static int
 tdq_idled(struct tdq *tdq)
 {
-	struct tdq_group *tdg;
+	struct cpu_group *cg;
 	struct tdq *steal;
-	int highload;
-	int highcpu;
 	int cpu;
 
 	if (smp_started == 0 || steal_idle == 0)
 		return (1);
-	/* We don't want to be preempted while we're iterating over tdqs */
+	/* We don't want to be preempted while we're iterating. */
 	spinlock_enter();
-	tdg = tdq->tdq_group;
-	/*
-	 * If we're in a cpu group, try and steal threads from another cpu in
-	 * the group before idling. In a HTT group all cpus share the same
-	 * run-queue lock, however, we still need a recursive lock to
-	 * call tdq_move().
-	 */
-	if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
-		TDQ_LOCK(tdq);
-		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
-			if (steal == tdq || steal->tdq_transferable == 0)
-				continue;
-			TDQ_LOCK(steal);
-			goto steal;
-		}
-		TDQ_UNLOCK(tdq);
-	}
-	/*
-	 * Find the least loaded CPU with a transferable thread and attempt
-	 * to steal it. We make a lockless pass and then verify that the
-	 * thread is still available after locking.
-	 */
-	for (;;) {
-		highcpu = 0;
-		highload = 0;
-		for (cpu = 0; cpu <= mp_maxid; cpu++) {
-			if (CPU_ABSENT(cpu))
-				continue;
-			steal = TDQ_CPU(cpu);
-			if (steal->tdq_transferable == 0)
-				continue;
-			if (steal->tdq_load < highload)
-				continue;
-			highload = steal->tdq_load;
-			highcpu = cpu;
-		}
-		if (highload < steal_thresh)
-			break;
-		steal = TDQ_CPU(highcpu);
+	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent) {
+		cpu = sched_highest(cg, steal_thresh);
+		if (cpu == -1)
+			continue;
+		steal = TDQ_CPU(cpu);
 		if (steal == tdq)
-			break;
+			continue;
 		tdq_lock_pair(tdq, steal);
-		if (steal->tdq_load >= steal_thresh && steal->tdq_transferable)
-			goto steal;
-		tdq_unlock_pair(tdq, steal);
+		if (steal->tdq_load < steal_thresh ||
+		    steal->tdq_transferable == 0) {
+			tdq_unlock_pair(tdq, steal);
+			continue;
+		}
+		spinlock_exit();
+		/*
+		 * If we got a thread while we had interrupts disabled
+		 * don't steal one here.
+		 */
+		if (tdq->tdq_load == 0)
+			tdq_move(steal, tdq);
+		TDQ_UNLOCK(steal);
+		mi_switch(SW_VOL, NULL);
+		thread_unlock(curthread);
+
+		return (0);
 	}
 	spinlock_exit();
 	return (1);
-steal:
-	spinlock_exit();
-	tdq_move(steal, tdq);
-	TDQ_UNLOCK(steal);
-	mi_switch(SW_VOL, NULL);
-	thread_unlock(curthread);
-
-	return (0);
 }
 
 /*
@@ -850,6 +886,28 @@
 }
 
 /*
+ * Set lowpri to its exact value by searching the run-queue and
+ * evaluating curthread. curthread may be passed as an optimization.
+ */
+static void
+tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
+{
+	struct td_sched *ts;
+	struct thread *td;
+
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	if (ctd == NULL)
+		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
+	ts = tdq_choose(tdq);
+	if (ts)
+		td = ts->ts_thread;
+	if (ts == NULL || td->td_priority > ctd->td_priority)
+		tdq->tdq_lowpri = ctd->td_priority;
+	else
+		tdq->tdq_lowpri = td->td_priority;
+}
+
+/*
  * Steals load from a timeshare queue. Honors the rotating queue head
  * index.
  */
@@ -981,155 +1039,51 @@
 	return (tdq);
 }
 
-/*
- * Find the thread queue running the lowest priority thread.
- */
-static int
-tdq_lowestpri(void)
-{
-	struct tdq *tdq;
-	int lowpri;
-	int lowcpu;
-	int lowload;
-	int load;
-	int cpu;
-	int pri;
-
-	lowload = 0;
-	lowpri = lowcpu = 0;
-	for (cpu = 0; cpu <= mp_maxid; cpu++) {
-		if (CPU_ABSENT(cpu))
-			continue;
-		tdq = TDQ_CPU(cpu);
-		pri = tdq->tdq_lowpri;
-		load = TDQ_CPU(cpu)->tdq_load;
-		CTR4(KTR_ULE,
-		    "cpu %d pri %d lowcpu %d lowpri %d",
-		    cpu, pri, lowcpu, lowpri);
-		if (pri < lowpri)
-			continue;
-		if (lowpri && lowpri == pri && load > lowload)
-			continue;
-		lowpri = pri;
-		lowcpu = cpu;
-		lowload = load;
-	}
-
-	return (lowcpu);
-}
-
-/*
- * Find the thread queue with the least load.
- */
-static int
-tdq_lowestload(void)
-{
-	struct tdq *tdq;
-	int lowload;
-	int lowpri;
-	int lowcpu;
-	int load;
-	int cpu;
-	int pri;
-
-	lowcpu = 0;
-	lowload = TDQ_CPU(0)->tdq_load;
-	lowpri = TDQ_CPU(0)->tdq_lowpri;
-	for (cpu = 1; cpu <= mp_maxid; cpu++) {
-		if (CPU_ABSENT(cpu))
-			continue;
-		tdq = TDQ_CPU(cpu);
-		load = tdq->tdq_load;
-		pri = tdq->tdq_lowpri;
-		CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d",
-		    cpu, load, lowcpu, lowload);
-		if (load > lowload)
-			continue;
-		if (load == lowload && pri < lowpri)
-			continue;
-		lowcpu = cpu;
-		lowload = load;
-		lowpri = pri;
-	}
-
-	return (lowcpu);
-}
-
-/*
- * Pick the destination cpu for sched_add(). Respects affinity and makes
- * a determination based on load or priority of available processors.
- */
 static int
 sched_pickcpu(struct td_sched *ts, int flags)
 {
+	struct cpu_group *cg;
 	struct tdq *tdq;
 	int self;
 	int pri;
 	int cpu;
 
-	cpu = self = PCPU_GET(cpuid);
+	self = PCPU_GET(cpuid);
 	if (smp_started == 0)
 		return (self);
 	/*
 	 * Don't migrate a running thread from sched_switch().
 	 */
-	if (flags & SRQ_OURSELF) {
-		CTR1(KTR_ULE, "YIELDING %d",
-		    curthread->td_priority);
+	if (flags & SRQ_OURSELF)
 		return (self);
-	}
 	pri = ts->ts_thread->td_priority;
-	cpu = ts->ts_cpu;
-	/*
-	 * Regardless of affinity, if the last cpu is idle send it there.
-	 */
-	tdq = TDQ_CPU(cpu);
-	if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
-		CTR5(KTR_ULE,
-		    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
-		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
-		    tdq->tdq_lowpri);
+	tdq = TDQ_CPU(ts->ts_cpu);
+	if (tdq->tdq_lowpri > PRI_MIN_IDLE)
 		return (ts->ts_cpu);
-	}
-	/*
-	 * If we have affinity, try to place it on the cpu we last ran on.
-	 */
-	if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
-		CTR5(KTR_ULE,
-		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
-		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
-		    tdq->tdq_lowpri);
+	if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri)
 		return (ts->ts_cpu);
-	}
-	/*
-	 * Look for an idle group.
-	 */
-	CTR1(KTR_ULE, "tdq_idle %X", tdq_idle);
-	cpu = ffs(tdq_idle);
-	if (cpu)
-		return (--cpu);
-	/*
-	 * If there are no idle cores see if we can run the thread locally.
-	 * This may improve locality among sleepers and wakers when there
-	 * is shared data.
-	 */
-	if (tryself && pri < TDQ_CPU(self)->tdq_lowpri) {
-		CTR1(KTR_ULE, "tryself %d",
-		    curthread->td_priority);
-		return (self);
-	}
-	/*
-	 * Now search for the cpu running the lowest priority thread with
-	 * the least load.
-	 */
-	if (pick_pri)
-		cpu = tdq_lowestpri();
-	else
-		cpu = tdq_lowestload();
+	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
+		if (SCHED_AFFINITY(ts, cg->cg_level))
+			break;
+	cpu = -1;
+	if (cg)
+		cpu = sched_lowest(cg, pri);
+	if (cpu == -1)
+		cpu = sched_lowest(cpu_top, -1);
+	if (tryself) {
+		cg = TDQ_SELF()->tdq_cg;
+		if (0 && cg->cg_level)
+			self = sched_lowest(cg, -1);
+		if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+		    TDQ_CPU(self)->tdq_lowpri > pri)
+			cpu = self;
+	}
+	if (oldtryself && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+	    curthread->td_priority > pri)
+		cpu = self;
 	return (cpu);
 }
-
 #endif /* SMP */
 
 /*
  * Pick the highest priority task we have and return it.
@@ -1174,84 +1128,43 @@
 	runq_init(&tdq->tdq_realtime);
 	runq_init(&tdq->tdq_timeshare);
 	runq_init(&tdq->tdq_idle);
-	tdq->tdq_load = 0;
-}
-
-#ifdef SMP
-static void
-tdg_setup(struct tdq_group *tdg)
-{
-	if (bootverbose)
-		printf("ULE: setup cpu group %d\n", TDG_ID(tdg));
-	snprintf(tdg->tdg_name, sizeof(tdg->tdg_name),
-	    "sched lock %d", (int)TDG_ID(tdg));
-	mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock",
+	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
+	    "sched lock %d", (int)TDQ_ID(tdq));
+	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
 	    MTX_SPIN | MTX_RECURSE);
-	LIST_INIT(&tdg->tdg_members);
-	tdg->tdg_load = 0;
-	tdg->tdg_transferable = 0;
-	tdg->tdg_cpus = 0;
-	tdg->tdg_mask = 0;
-	tdg->tdg_cpumask = 0;
-	tdg->tdg_idlemask = 0;
 }
 
-static void
-tdg_add(struct tdq_group *tdg, struct tdq *tdq)
-{
-	if (tdg->tdg_mask == 0)
-		tdg->tdg_mask |= 1 << TDQ_ID(tdq);
-	tdg->tdg_cpumask |= 1 << TDQ_ID(tdq);
-	tdg->tdg_cpus++;
-	tdq->tdq_group = tdg;
-	tdq->tdq_lock = &tdg->tdg_lock;
-	LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
-	if (bootverbose)
-		printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n",
-		    TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask);
-}
-
-static void
-sched_setup_topology(void)
+#ifdef SMP
+/*
+ * Walk down from cpu_top to the childless group whose mask contains cpu.
+ * Returns NULL when a group on the way down does not contain the cpu.
+ */
+static struct cpu_group *
+sched_find_group(int cpu)
 {
-	struct tdq_group *tdg;
 	struct cpu_group *cg;
-	int balance_groups;
-	struct tdq *tdq;
+	cpumask_t mask;
+	int children;
 	int i;
-	int j;
 
-	topology = 1;
-	balance_groups = 0;
-	for (i = 0; i < smp_topology->ct_count; i++) {
-		cg = &smp_topology->ct_group[i];
-		tdg = &tdq_groups[i];
-		/*
-		 * Initialize the group.
-		 */
-		tdg_setup(tdg);
-		/*
-		 * Find all of the group members and add them.
-		 */
-		for (j = 0; j < MAXCPU; j++) {
-			if ((cg->cg_mask & (1 << j)) != 0) {
-				tdq = TDQ_CPU(j);
-				tdq_setup(tdq);
-				tdg_add(tdg, tdq);
-			}
-		}
-		if (tdg->tdg_cpus > 1)
-			balance_groups = 1;
+	mask = (cpumask_t)1 << cpu;
+	cg = cpu_top;
+	while ((cg->cg_mask & mask) != 0) {
+		if (cg->cg_children == 0)
+			return (cg);
+		children = cg->cg_children;
+		for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
+			if ((cg->cg_mask & mask) != 0)
+				break;
+		if (i == children)
+			break;
 	}
-	tdg_maxid = smp_topology->ct_count - 1;
-	if (balance_groups)
-		sched_balance_groups();
+	return (NULL);
 }
 
 static void
 sched_setup_smp(void)
 {
-	struct tdq_group *tdg;
 	struct tdq *tdq;
 	int cpus;
 	int i;
@@ -1259,37 +1172,149 @@
 	for (cpus = 0, i = 0; i < MAXCPU; i++) {
 		if (CPU_ABSENT(i))
 			continue;
-		tdq = &tdq_cpu[i];
-		tdg = &tdq_groups[i];
-		/*
-		 * Setup a tdq group with one member.
-		 */
-		tdg_setup(tdg);
+		tdq = TDQ_CPU(i);
 		tdq_setup(tdq);
-		tdg_add(tdg, tdq);
+		tdq->tdq_cg = sched_find_group(i);
 		cpus++;
 	}
-	tdg_maxid = cpus - 1;
 }
 
-/*
- * Fake a topology with one group containing all CPUs.
- */
-static void
-sched_fake_topo(void)
+struct cpu_group *cpu_topo_none(void);
+struct cpu_group *cpu_topo_1level(int l1share, int l1count, int l1flags);
+struct cpu_group *cpu_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags);
+
+struct cpu_group *
+cpu_topo_none(void)
+{
+	static struct cpu_group top;
+
+	top.cg_parent = NULL;
+	top.cg_child = NULL;
+	top.cg_mask = all_cpus;
+	top.cg_count = mp_ncpus;
+	top.cg_children = 0;
+	top.cg_level = CG_SHARE_NONE;
+	top.cg_flags = 0;
+
+	return (&top);
+}
+
+struct cpu_group *
+cpu_topo_1level(int l1share, int l1count, int l1flags)
 {
-#ifdef SCHED_FAKE_TOPOLOGY
-	static struct cpu_top top;
-	static struct cpu_group group;
+	static struct cpu_group group[1 + MAXCPU];
+	struct cpu_group *top;
+	struct cpu_group *l1g;
+	int l1, l1cpus, l1cpumask;
+
+	top = &group[0];
+	l1g = &group[1];
+	l1cpus = mp_ncpus / l1count;
+	l1cpumask = (1 << l1cpus) - 1;
+	top->cg_parent = NULL;
+	top->cg_child = l1g;
+	top->cg_mask = all_cpus;
+	top->cg_count = mp_ncpus;
+	top->cg_children = l1count;
+	top->cg_level = CG_SHARE_NONE;
+	top->cg_flags = 0;
+	for (l1 = 0; l1 < l1count; l1++, l1g++, l1cpumask <<= l1cpus) {
+		l1g->cg_parent = top;
+		l1g->cg_child = NULL;
+		l1g->cg_mask = l1cpumask;
+		l1g->cg_count = l1cpus;
+		l1g->cg_children = 0;
+		l1g->cg_level = l1share;
+		l1g->cg_flags = l1flags;
+	}
+
+	return (top);
+}
+
+struct cpu_group *
+cpu_topo_2level(int l2share, int l2count, int l1share, int l1count,
+    int l1flags)
+{
+	static struct cpu_group group[1 + 2 * MAXCPU];
+	struct cpu_group *top;
+	struct cpu_group *l1g;
+	struct cpu_group *l2g;
+	int l1, l2, l1cpus, l2cpus, l1cpumask, l2cpumask;
+
+	top = &group[0];
+	l2g = &group[1];
+	l1g = &group[1 + l2count];
+	l2cpus = mp_ncpus / l2count;
+	l1cpus = l2cpus / l1count;
+	l2cpumask = (1 << l2cpus) - 1;
+	l1cpumask = (1 << l1cpus) - 1;
+	top->cg_parent = NULL;
+	top->cg_child = l2g;
+	top->cg_mask = all_cpus;
+	top->cg_count = mp_ncpus;
+	top->cg_children = l2count;
+	top->cg_level = CG_SHARE_NONE;
+	top->cg_flags = 0;
+	for (l2 = 0; l2 < l2count; l2++, l2g++, l2cpumask <<= l2cpus) {
+		l2g->cg_parent = top;
+		l2g->cg_child = l1g;
+		l2g->cg_mask = l2cpumask;
+		l2g->cg_count = l2cpus;
+		l2g->cg_children = l1count;
+		l2g->cg_level = l2share;
+		l2g->cg_flags = 0;
+		for (l1 = 0; l1 < l1count; l1++, l1g++, l1cpumask <<= l1cpus) {
+			l1g->cg_parent = l2g;
+			l1g->cg_child = NULL;
+			l1g->cg_mask = l1cpumask;
+			l1g->cg_count = l1cpus;
+			l1g->cg_children = 0;
+			l1g->cg_level = l1share;
+			l1g->cg_flags = l1flags;
+		}
+	}
 
-	top.ct_count = 1;
-	top.ct_group = &group;
-	group.cg_mask = all_cpus;
-	group.cg_count = mp_ncpus;
-	group.cg_children = 0;
-	smp_topology = &top;
-#endif
+	return (top);
+}
+
+static struct cpu_group *
+sched_build_topo(void)
+{
+	/*
+	 * Dual core with no sharing.
+	 */
+	if (1)
+		return cpu_topo_1level(CG_SHARE_NONE, 2, 0);
+	/*
+	 * Dual core with shared L2.
+	 */
+	if (0)
+		return cpu_topo_1level(CG_SHARE_L2, 2, 0);
+	/*
+	 * quad core barcelona: shared l3 among each package, private l2.
+	 */
+	if (0)
+		return cpu_topo_1level(CG_SHARE_L3, 4, 0);
+	/*
+	 * Intel quad core. 2 dualcore parts on each package share l2.
+	 * Package shares only system bus.
+	 */
+	if (0)
+		return cpu_topo_2level(CG_SHARE_NONE, 4, CG_SHARE_L2, 2, 0);
+	/*
+	 * Single-core 2xHTT
+	 */
+	if (0)
+		return cpu_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
+	/*
+	 * sun4v 4 cores with a shared l3 cache 8 threads sharing cache.
+	 */
+	if (0)
+		return cpu_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
+		    CG_FLAG_THREAD);
+	return (cpu_topo_none());
 }
+
 #endif
 
 /*
@@ -1303,21 +1328,12 @@
 
 	tdq = TDQ_SELF();
 #ifdef SMP
-	sched_fake_topo();
-	/*
-	 * Setup tdqs based on a topology configuration or vanilla SMP based
-	 * on mp_maxid.
-	 */
-	if (smp_topology == NULL)
-		sched_setup_smp();
-	else
-		sched_setup_topology();
+	cpu_top = sched_build_topo();
+	sched_setup_smp();
 	balance_tdq = tdq;
 	sched_balance();
 #else
 	tdq_setup(tdq);
-	mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE);
-	tdq->tdq_lock = &tdq_lock;
 #endif
 	/*
 	 * To avoid divide-by-zero, we set realstathz a dummy value
@@ -1331,6 +1347,9 @@
 	TDQ_LOCK(tdq);
 	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
 	tdq_load_add(tdq, &td_sched0);
+#ifdef SMP
+	tdq->tdq_lowpri = thread0.td_priority;
+#endif
 	TDQ_UNLOCK(tdq);
 }
 
@@ -1369,7 +1388,7 @@
 	 * prevents excess thrashing on large machines and excess idle on
 	 * smaller machines.
 	 */
-	steal_thresh = min(ffs(mp_ncpus) - 1, 4);
+	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
 	affinity = SCHED_AFFINITY_DEFAULT;
 #endif
 }
@@ -1620,12 +1639,15 @@
 #ifdef SMP
 	} else if (TD_IS_RUNNING(td)) {
 		struct tdq *tdq;
+		int oldpri;
 
 		tdq = TDQ_CPU(ts->ts_cpu);
-		if (prio < tdq->tdq_lowpri ||
-		    (td->td_priority == tdq->tdq_lowpri && tdq->tdq_load <= 1))
-			tdq->tdq_lowpri = prio;
+		oldpri = td->td_priority;
 		td->td_priority = prio;
+		if (prio < tdq->tdq_lowpri)
+			tdq->tdq_lowpri = prio;
+		else if (tdq->tdq_lowpri == oldpri)
+			tdq_setlowpri(tdq, td);
 #endif
 	} else
 		td->td_priority = prio;
@@ -2063,15 +2085,11 @@
 		struct tdq *tdq;
 
 		tdq = TDQ_CPU(td->td_sched->ts_cpu);
-		if (THREAD_CAN_MIGRATE(td)) {
+		if (THREAD_CAN_MIGRATE(td))
 			tdq->tdq_transferable--;
-			tdq->tdq_group->tdg_transferable--;
-		}
 		td->td_pri_class = class;
-		if (THREAD_CAN_MIGRATE(td)) {
+		if (THREAD_CAN_MIGRATE(td))
 			tdq->tdq_transferable++;
-			tdq->tdq_group->tdg_transferable++;
-		}
 	}
 #endif
 	td->td_pri_class = class;
@@ -2149,6 +2167,10 @@
 		thread_lock(td);
 		td->td_priority = td->td_user_pri;
 		td->td_base_pri = td->td_user_pri;
+#ifdef SMP
+		if (lowpri_userret)
+			tdq_setlowpri(TDQ_SELF(), td);
+#endif
 		thread_unlock(td);
 	}
 }
@@ -2172,8 +2194,6 @@
 	if (balance_tdq == tdq) {
 		if (balance_ticks && --balance_ticks == 0)
 			sched_balance();
-		if (balance_group_ticks && --balance_group_ticks == 0)
-			sched_balance_groups();
 	}
 #endif
 	/*
@@ -2261,11 +2281,7 @@
 struct thread *
 sched_choose(void)
 {
-#ifdef SMP
-	struct tdq_group *tdg;
-#endif
 	struct td_sched *ts;
-	struct thread *td;
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
@@ -2275,20 +2291,7 @@
 		tdq_runq_rem(tdq, ts);
 		return (ts->ts_thread);
 	}
-	td = PCPU_GET(idlethread);
-#ifdef SMP
-	/*
-	 * We only set the idled bit when all of the cpus in the group are
-	 * idle. Otherwise we could get into a situation where a thread bounces
-	 * back and forth between two idle cores on seperate physical CPUs.
-	 */
-	tdg = tdq->tdq_group;
-	tdg->tdg_idlemask |= PCPU_GET(cpumask);
-	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
-		atomic_set_int(&tdq_idle, tdg->tdg_mask);
-	tdq->tdq_lowpri = td->td_priority;
-#endif
-	return (td);
+	return (PCPU_GET(idlethread));
 }
 
 /*
@@ -2305,7 +2308,7 @@
 	ctd = curthread;
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
-	if (td->td_priority < ctd->td_priority)
+	if (td->td_priority < cpri)
 		curthread->td_flags |= TDF_NEEDRESCHED;
 	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
 		return;
@@ -2329,9 +2332,6 @@
 {
 	struct td_sched *ts;
 	int class;
-#ifdef SMP
-	int cpumask;
-#endif
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
@@ -2356,25 +2356,6 @@
 	else
 		ts->ts_runq = &tdq->tdq_idle;
 #ifdef SMP
-	cpumask = 1 << ts->ts_cpu;
-	/*
-	 * If we had been idle, clear our bit in the group and potentially
-	 * the global bitmap.
-	 */
-	if ((class != PRI_IDLE && class != PRI_ITHD) &&
-	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
-		/*
-		 * Check to see if our group is unidling, and if so, remove it
-		 * from the global idle mask.
-		 */
-		if (tdq->tdq_group->tdg_idlemask ==
-		    tdq->tdq_group->tdg_cpumask)
-			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
-		/*
-		 * Now remove ourselves from the group specific idle mask.
-		 */
-		tdq->tdq_group->tdg_idlemask &= ~cpumask;
-	}
 	if (td->td_priority < tdq->tdq_lowpri)
 		tdq->tdq_lowpri = td->td_priority;
 #endif
@@ -2462,6 +2443,10 @@
 	tdq_runq_rem(tdq, ts);
 	tdq_load_rem(tdq, ts);
 	TD_SET_CAN_RUN(td);
+#ifdef SMP
+	if (td->td_priority == tdq->tdq_lowpri)
+		tdq_setlowpri(tdq, NULL);
+#endif
 }
 
 /*
@@ -2563,8 +2548,8 @@
 	int i;
 
 	total = 0;
-	for (i = 0; i <= tdg_maxid; i++)
-		total += TDQ_GROUP(i)->tdg_load;
+	for (i = 0; i <= mp_maxid; i++)
+		total += TDQ_CPU(i)->tdq_sysload;
 	return (total);
 #else
 	return (TDQ_SELF()->tdq_sysload);
@@ -2658,6 +2643,9 @@
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
 	lock_profile_obtain_lock_success(
 	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+#ifdef SMP
+	tdq->tdq_lowpri = td->td_priority;
+#endif
 }
 
 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
@@ -2676,6 +2664,8 @@
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
 SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, userret, CTLFLAG_RW, &lowpri_userret, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, oldtryself, CTLFLAG_RW, &oldtryself, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
     "Enables the long-term load balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,