Index: sched_ule.c
===================================================================
RCS file: /usr/home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.202
diff -u -r1.202 sched_ule.c
--- sched_ule.c	19 Jul 2007 20:03:15 -0000	1.202
+++ sched_ule.c	3 Aug 2007 15:24:46 -0000
@@ -183,7 +183,7 @@
  * locking in sched_pickcpu();
  */
 struct tdq {
-	struct mtx	tdq_lock;	/* Protects all fields below. */
+	struct mtx	*tdq_lock;	/* Pointer to group lock. */
 	struct runq	tdq_realtime;	/* real-time run queue. */
 	struct runq	tdq_timeshare;	/* timeshare run queue. */
 	struct runq	tdq_idle;	/* Queue of IDLE threads. */
@@ -198,7 +198,6 @@
 #else
 	int		tdq_sysload;	/* For loadavg, !ITHD load. */
 #endif
-	char		tdq_name[16];	/* lock name. */
 } __aligned(64);
 
@@ -212,13 +211,15 @@
  * load balancer.
  */
 struct tdq_group {
-	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
-	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
-	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
-	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
-	int	tdg_load;		/* Total load of this group. */
+	struct mtx tdg_lock;		/* Protects all fields below. */
+	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
+	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
+	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
+	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
+	int	tdg_load;		/* Total load of this group. */
 	int	tdg_transferable;	/* Transferable load of this group. */
 	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
+	char	tdg_name[16];		/* lock name. */
 } __aligned(64);
 
 #define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 300))
@@ -249,10 +250,12 @@
 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
-#define	TDQ_ID(x)	((x) - tdq_cpu)
+#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
 #define	TDQ_GROUP(x)	(&tdq_groups[(x)])
+#define	TDG_ID(x)	((int)((x) - tdq_groups))
 #else	/* !SMP */
 static struct tdq	tdq_cpu;
+static struct mtx	tdq_lock;
 
 #define	TDQ_ID(x)	(0)
 #define	TDQ_SELF()	(&tdq_cpu)
@@ -263,7 +266,7 @@
 #define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
 #define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
-#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
+#define	TDQ_LOCKPTR(t)		((t)->tdq_lock)
 
 static void sched_priority(struct thread *);
 static void sched_thread_priority(struct thread *, u_char);
@@ -296,6 +299,7 @@
 static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
 static inline struct mtx *thread_block_switch(struct thread *);
 static inline void thread_unblock_switch(struct thread *, struct mtx *);
+static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
 #define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
 #endif
 
@@ -343,9 +347,8 @@
 
 	tdq = TDQ_CPU(cpu);
 
-	printf("tdq:\n");
+	printf("tdq %d:\n", TDQ_ID(tdq));
 	printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
-	printf("\tlock name %s\n", tdq->tdq_name);
 	printf("\tload: %d\n", tdq->tdq_load);
 	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
@@ -357,7 +360,9 @@
 	runq_print(&tdq->tdq_idle);
 #ifdef SMP
 	printf("\tload transferable: %d\n", tdq->tdq_transferable);
-	printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
+	printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
+	printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group));
+	printf("\tLock name: %s\n", tdq->tdq_group->tdg_name);
 #endif
 }
 
@@ -389,7 +394,7 @@
 	 * This queue contains only priorities between MIN and MAX
 	 * realtime. Use the whole queue to represent these values.
	 */
-	if ((flags & SRQ_BORROWING) == 0) {
+	if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
 		pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
 		pri = (pri + tdq->tdq_idx) % RQ_NQS;
 		/*
@@ -454,7 +459,7 @@
 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
 	class = PRI_BASE(ts->ts_thread->td_pri_class);
 	tdq->tdq_load++;
-	CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load);
+	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
 	if (class != PRI_ITHD &&
 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
 #ifdef SMP
@@ -484,7 +489,7 @@
 		tdq->tdq_sysload--;
 #endif
 	KASSERT(tdq->tdq_load != 0,
-	    ("tdq_load_rem: Removing with 0 load on queue %d", (int)TDQ_ID(tdq)));
+	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
 	tdq->tdq_load--;
 	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
 	ts->ts_runq = NULL;
@@ -916,6 +921,8 @@
 	tdq = TDQ_CPU(cpu);
 	td = ts->ts_thread;
 	ts->ts_cpu = cpu;
+
+	/* If the lock matches just return the queue. */
 	if (td->td_lock == TDQ_LOCKPTR(tdq))
 		return (tdq);
 #ifdef notyet
@@ -936,9 +943,7 @@
 	 */
 	thread_lock_block(td);
 	TDQ_LOCK(tdq);
-	/* Return to sched_switch() with the lock still blocked */
-	if ((flags & SRQ_OURSELF) == 0)
-		thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
+	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
 	return (tdq);
 }
 
@@ -1129,107 +1134,158 @@
 tdq_setup(struct tdq *tdq)
 {
 
-	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
-	    "sched lock %d", (int)TDQ_ID(tdq));
-	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
-	    MTX_SPIN | MTX_RECURSE);
+	if (bootverbose)
+		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
 	runq_init(&tdq->tdq_realtime);
 	runq_init(&tdq->tdq_timeshare);
 	runq_init(&tdq->tdq_idle);
 	tdq->tdq_load = 0;
 }
 
-/*
- * Setup the thread queues and initialize the topology based on MD
- * information.
- */
+#ifdef SMP
 static void
-sched_setup(void *dummy)
+tdg_setup(struct tdq_group *tdg)
 {
-	struct tdq *tdq;
-#ifdef SMP
+	if (bootverbose)
+		printf("ULE: setup cpu group %d\n", TDG_ID(tdg));
+	snprintf(tdg->tdg_name, sizeof(tdg->tdg_name),
+	    "sched lock %d", (int)TDG_ID(tdg));
+	mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock",
+	    MTX_SPIN | MTX_RECURSE);
+	LIST_INIT(&tdg->tdg_members);
+	tdg->tdg_load = 0;
+	tdg->tdg_transferable = 0;
+	tdg->tdg_cpus = 0;
+	tdg->tdg_mask = 0;
+	tdg->tdg_cpumask = 0;
+	tdg->tdg_idlemask = 0;
+}
+
+static void
+tdg_add(struct tdq_group *tdg, struct tdq *tdq)
+{
+	if (tdg->tdg_mask == 0)
+		tdg->tdg_mask |= 1 << TDQ_ID(tdq);
+	tdg->tdg_cpumask |= 1 << TDQ_ID(tdq);
+	tdg->tdg_cpus++;
+	tdq->tdq_group = tdg;
+	tdq->tdq_lock = &tdg->tdg_lock;
+	LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
+	if (bootverbose)
+		printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n",
+		    TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask);
+}
+
+static void
+sched_setup_topology(void)
+{
+	struct tdq_group *tdg;
+	struct cpu_group *cg;
 	int balance_groups;
+	struct tdq *tdq;
 	int i;
+	int j;
 
-	balance_groups = 0;
-	/*
-	 * Initialize the tdqs.
-	 */
-	for (i = 0; i < MAXCPU; i++) {
-		tdq = &tdq_cpu[i];
-		tdq_setup(&tdq_cpu[i]);
+	topology = 1;
+	for (i = 0; i < smp_topology->ct_count; i++) {
+		cg = &smp_topology->ct_group[i];
+		tdg = &tdq_groups[i];
+		/*
+		 * Initialize the group.
+		 */
+		tdg_setup(tdg);
+		/*
+		 * Find all of the group members and add them.
+		 */
+		for (j = 0; j < MAXCPU; j++) {
+			if ((cg->cg_mask & (1 << j)) != 0) {
+				tdq = TDQ_CPU(j);
+				tdq_setup(tdq);
+				tdg_add(tdg, tdq);
+			}
+		}
+		if (tdg->tdg_cpus > 1)
+			balance_groups = 1;
 	}
-	if (smp_topology == NULL) {
-		struct tdq_group *tdg;
-		int cpus;
+	tdg_maxid = smp_topology->ct_count - 1;
+	if (balance_groups)
+		sched_balance_groups(NULL);
+}
 
-		for (cpus = 0, i = 0; i < MAXCPU; i++) {
-			if (CPU_ABSENT(i))
-				continue;
-			tdq = &tdq_cpu[i];
-			tdg = &tdq_groups[cpus];
-			/*
-			 * Setup a tdq group with one member.
-			 */
-			tdq->tdq_transferable = 0;
-			tdq->tdq_group = tdg;
-			tdg->tdg_cpus = 1;
-			tdg->tdg_idlemask = 0;
-			tdg->tdg_cpumask = tdg->tdg_mask = 1 << i;
-			tdg->tdg_load = 0;
-			tdg->tdg_transferable = 0;
-			LIST_INIT(&tdg->tdg_members);
-			LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
-			cpus++;
-		}
-		tdg_maxid = cpus - 1;
-	} else {
-		struct tdq_group *tdg;
-		struct cpu_group *cg;
-		int j;
+static void
+sched_setup_smp(void)
+{
+	struct tdq_group *tdg;
+	struct tdq *tdq;
+	int cpus;
+	int i;
 
-		topology = 1;
-		for (i = 0; i < smp_topology->ct_count; i++) {
-			cg = &smp_topology->ct_group[i];
-			tdg = &tdq_groups[i];
-			/*
-			 * Initialize the group.
-			 */
-			tdg->tdg_idlemask = 0;
-			tdg->tdg_load = 0;
-			tdg->tdg_transferable = 0;
-			tdg->tdg_cpus = cg->cg_count;
-			tdg->tdg_cpumask = cg->cg_mask;
-			LIST_INIT(&tdg->tdg_members);
-			/*
-			 * Find all of the group members and add them.
-			 */
-			for (j = 0; j < MAXCPU; j++) {
-				if ((cg->cg_mask & (1 << j)) != 0) {
-					if (tdg->tdg_mask == 0)
-						tdg->tdg_mask = 1 << j;
-					tdq_cpu[j].tdq_transferable = 0;
-					tdq_cpu[j].tdq_group = tdg;
-					LIST_INSERT_HEAD(&tdg->tdg_members,
-					    &tdq_cpu[j], tdq_siblings);
-				}
-			}
-			if (tdg->tdg_cpus > 1)
-				balance_groups = 1;
-		}
-		tdg_maxid = smp_topology->ct_count - 1;
+	for (cpus = 0, i = 0; i < MAXCPU; i++) {
+		if (CPU_ABSENT(i))
+			continue;
+		tdq = &tdq_cpu[i];
+		tdg = &tdq_groups[i];
+		/*
+		 * Setup a tdq group with one member.
+		 */
+		tdg_setup(tdg);
+		tdq_setup(tdq);
+		tdg_add(tdg, tdq);
+		cpus++;
 	}
+	tdg_maxid = cpus - 1;
+}
+
+/*
+ * Fake a topology with one group containing all CPUs.
+ */
+static void
+sched_fake_topo(void)
+{
+#ifdef SCHED_FAKE_TOPOLOGY
+	static struct cpu_top top;
+	static struct cpu_group group;
+
+	top.ct_count = 1;
+	top.ct_group = &group;
+	group.cg_mask = all_cpus;
+	group.cg_count = mp_ncpus;
+	group.cg_children = 0;
+	smp_topology = &top;
+#endif
+}
+#endif
+
+/*
+ * Setup the thread queues and initialize the topology based on MD
+ * information.
+ */
+static void
+sched_setup(void *dummy)
+{
+	struct tdq *tdq;
+
+	tdq = TDQ_SELF();
+#ifdef SMP
 	/*
 	 * Initialize long-term cpu balancing algorithm.
 	 */
 	callout_init(&balco, CALLOUT_MPSAFE);
 	callout_init(&gbalco, CALLOUT_MPSAFE);
+	sched_fake_topo();
+	/*
+	 * Setup tdqs based on a topology configuration or vanilla SMP based
+	 * on mp_maxid.
+	 */
+	if (smp_topology == NULL)
+		sched_setup_smp();
+	else
+		sched_setup_topology();
 	sched_balance(NULL);
-	if (balance_groups)
-		sched_balance_groups(NULL);
-
 #else
-	tdq_setup(TDQ_SELF());
+	tdq_setup(tdq);
+	mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE);
+	tdq->tdq_lock = &tdq_lock;
 #endif
 	/*
 	 * To avoid divide-by-zero, we set realstathz a dummy value
@@ -1240,8 +1296,8 @@
 	tickincr = 1 << SCHED_TICK_SHIFT;
 
 	/* Add thread0's load since it's running. */
-	tdq = TDQ_SELF();
 	TDQ_LOCK(tdq);
+	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
 	tdq_load_add(tdq, &td_sched0);
 	TDQ_UNLOCK(tdq);
 }
@@ -1441,7 +1497,6 @@
 	 */
 	proc0.p_sched = NULL; /* XXX */
 	thread0.td_sched = &td_sched0;
-	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
 	td_sched0.ts_ltick = ticks;
 	td_sched0.ts_ftick = ticks;
 	td_sched0.ts_thread = &thread0;
@@ -1660,6 +1715,42 @@
 }
 
 /*
+ * Handle migration from sched_switch().  This happens only for
+ * cpu binding.
+ */
+static struct mtx *
+sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
+{
+	struct tdq *tdn;
+
+	tdn = TDQ_CPU(td->td_sched->ts_cpu);
+#ifdef SMP
+	/*
+	 * Do the lock dance required to avoid LOR.  We grab an extra
+	 * spinlock nesting to prevent preemption while we're
+	 * not holding either run-queue lock.
+	 */
+	spinlock_enter();
+	thread_block_switch(td);	/* This releases the lock on tdq. */
+	TDQ_LOCK(tdn);
+	tdq_add(tdn, td, flags);
+	tdq_notify(td->td_sched);
+	/*
+	 * After we unlock tdn the new cpu still can't switch into this
+	 * thread until we've unblocked it in cpu_switch().  The lock
+	 * pointers may match in the case of HTT cores.  Don't unlock here
+	 * or we can deadlock when the other CPU runs the IPI handler.
+	 */
+	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
+		TDQ_UNLOCK(tdn);
+		TDQ_LOCK(tdq);
+	}
+	spinlock_exit();
+#endif
+	return (TDQ_LOCKPTR(tdn));
+}
+
+/*
  * Block a thread for switching.  Similar to thread_block() but does not
  * bump the spin count.
  */
@@ -1698,6 +1789,7 @@
 	struct tdq *tdq;
 	struct td_sched *ts;
 	struct mtx *mtx;
+	int srqflag;
 	int cpuid;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
@@ -1705,7 +1797,7 @@
 	cpuid = PCPU_GET(cpuid);
 	tdq = TDQ_CPU(cpuid);
 	ts = td->td_sched;
-	mtx = TDQ_LOCKPTR(tdq);
+	mtx = td->td_lock;
 #ifdef SMP
 	ts->ts_rltick = ticks;
 	if (newtd && newtd->td_priority < tdq->tdq_lowpri)
@@ -1724,22 +1816,14 @@
 		TD_SET_CAN_RUN(td);
 	} else if (TD_IS_RUNNING(td)) {
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
-		/* Remove our load so the selection algorithm is not biased. */
 		tdq_load_rem(tdq, ts);
-		sched_add(td, (flags & SW_PREEMPT) ?
+		srqflag = (flags & SW_PREEMPT) ?
 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
-		    SRQ_OURSELF|SRQ_YIELDING);
-		/*
-		 * When migrating we return from sched_add with an extra
-		 * spinlock nesting, the tdq locked, and a blocked thread.
-		 * This is to optimize out an extra block/unblock cycle here.
-		 */
-		if (ts->ts_cpu != cpuid) {
-			mtx = TDQ_LOCKPTR(TDQ_CPU(ts->ts_cpu));
-			mtx_unlock_spin(mtx);
-			TDQ_LOCK(tdq);
-			spinlock_exit();
-		}
+		    SRQ_OURSELF|SRQ_YIELDING;
+		if (ts->ts_cpu == cpuid)
+			tdq_add(tdq, td, srqflag);
+		else
+			mtx = sched_switch_migrate(tdq, td, srqflag);
 	} else {
 		/* This thread must be going to sleep. */
 		TDQ_LOCK(tdq);
@@ -2358,7 +2442,7 @@
 {
 	struct td_sched *ts;
 
-	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
 	ts = td->td_sched;
 	if (ts->ts_flags & TSF_BOUND)
 		sched_unbind(td);
Index: kern_switch.c
===================================================================
RCS file: /usr/home/ncvs/src/sys/kern/kern_switch.c,v
retrieving revision 1.131
diff -u -r1.131 kern_switch.c
--- kern_switch.c	12 Jun 2007 19:50:31 -0000	1.131
+++ kern_switch.c	3 Aug 2007 01:24:18 -0000
@@ -192,7 +192,7 @@
 			thread_lock(td);
 			td->td_critnest--;
 			SCHED_STAT_INC(switch_owepreempt);
-			mi_switch(SW_INVOL, NULL);
+			mi_switch(SW_INVOL|SW_PREEMPT, NULL);
 			thread_unlock(td);
 		}
 	} else
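
Illustrative sketch (not part of the patch, and not FreeBSD code): the central change above turns tdq_lock into a pointer so that every per-CPU queue in a tdq_group shares the group's spin lock, which is why sched_switch_migrate() can skip the unlock/relock when TDQ_LOCKPTR() of the source and destination queues compare equal. The userland C below mimics only that indirection, with made-up names (rq, rq_group, RQ_LOCKPTR) and a pthread mutex standing in for the kernel spin mutex.

/*
 * Minimal sketch: two run queues share one lock by storing a pointer
 * to their group's lock, so work can move between group members while
 * holding a single lock.  All names here are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

struct rq_group {
	pthread_mutex_t	g_lock;		/* One lock shared by the group. */
	int		g_load;		/* Aggregate load, guarded by g_lock. */
};

struct rq {
	pthread_mutex_t	*q_lock;	/* Points at the group's lock. */
	struct rq_group	*q_group;
	int		q_load;		/* Guarded by *q_lock. */
};

#define	RQ_LOCKPTR(q)	((q)->q_lock)	/* Analogue of TDQ_LOCKPTR(). */

static void
rq_join(struct rq *q, struct rq_group *g)
{
	q->q_group = g;
	q->q_lock = &g->g_lock;		/* Share the group lock. */
	q->q_load = 0;
}

static void
rq_load_add(struct rq *q)
{
	pthread_mutex_lock(RQ_LOCKPTR(q));
	q->q_load++;
	q->q_group->g_load++;
	pthread_mutex_unlock(RQ_LOCKPTR(q));
}

int
main(void)
{
	struct rq_group g;
	struct rq a, b;

	pthread_mutex_init(&g.g_lock, NULL);
	g.g_load = 0;
	rq_join(&a, &g);
	rq_join(&b, &g);
	rq_load_add(&a);
	rq_load_add(&b);
	/*
	 * Both queues are covered by the same lock, so moving work from
	 * a to b needs no second lock acquisition, mirroring the
	 * TDQ_LOCKPTR(tdn) == TDQ_LOCKPTR(tdq) case in the patch.
	 */
	printf("group load %d (a %d, b %d), same lock: %d\n",
	    g.g_load, a.q_load, b.q_load, RQ_LOCKPTR(&a) == RQ_LOCKPTR(&b));
	pthread_mutex_destroy(&g.g_lock);
	return (0);
}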