Index: sched_ule.c
===================================================================
RCS file: /usr/home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.200
diff -u -r1.200 sched_ule.c
--- sched_ule.c	17 Jul 2007 22:53:23 -0000	1.200
+++ sched_ule.c	19 Jul 2007 00:17:51 -0000
@@ -177,8 +177,6 @@
 static int sched_slice;
 static int preempt_thresh = PRI_MIN_KERN;
 
-#define SCHED_BAL_SECS 2 /* How often we run the rebalance algorithm. */
-
 /*
  * tdq - per processor runqs and statistics. All fields are protected by the
  * tdq_lock. The load and lowpri may be accessed without to avoid excess
@@ -229,14 +227,14 @@
 /*
  * Run-time tunables.
  */
-static int rebalance = 0;
-static int pick_pri = 0;
-static int pick_zero = 0;
+static int rebalance = 1;
+static int balance_secs = 1;
+static int pick_pri = 1;
 static int affinity;
 static int tryself = 1;
-static int tryselfidle = 1;
 static int steal_htt = 0;
-static int steal_idle = 0;
+static int steal_idle = 1;
+static int steal_thresh = 2;
 static int topology = 0;
 
 /*
@@ -514,7 +512,7 @@
         int cnt;
         int i;
 
-        callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+        callout_reset(&balco, max(hz / 2, random() % (hz * balance_secs)),
             sched_balance, NULL);
         if (smp_started == 0 || rebalance == 0)
                 return;
@@ -547,7 +545,7 @@
 {
         int i;
 
-        callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+        callout_reset(&gbalco, max(hz / 2, random() % (hz * balance_secs)),
             sched_balance_groups, NULL);
         if (smp_started == 0 || rebalance == 0)
                 return;
@@ -683,6 +681,7 @@
         ts->ts_cpu = cpu;
         td->td_lock = TDQ_LOCKPTR(to);
         tdq_add(to, td, SRQ_YIELDING);
+        tdq_notify(ts);
 }
 
 /*
@@ -734,11 +733,11 @@
                         highload = load;
                         highcpu = cpu;
                 }
-                if (highload < 2)
+                if (highload < steal_thresh)
                         break;
                 steal = TDQ_CPU(highcpu);
                 TDQ_LOCK(steal);
-                if (steal->tdq_transferable > 1 &&
+                if (steal->tdq_transferable >= steal_thresh &&
                     (ts = tdq_steal(steal, 1)) != NULL)
                         goto steal;
                 TDQ_UNLOCK(steal);
@@ -863,11 +862,9 @@
         struct rqhead *rqh;
         struct rqbits *rqb;
         struct td_sched *ts;
-        int first;
         int word;
         int bit;
 
-        first = 0;
         rqb = &rq->rq_status;
         for (word = 0; word < RQB_LEN; word++) {
                 if (rqb->rqb_bits[word] == 0)
@@ -876,11 +873,9 @@
                         if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
                                 continue;
                         rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
-                        TAILQ_FOREACH(ts, rqh, ts_procq) {
-                                if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
+                        TAILQ_FOREACH(ts, rqh, ts_procq)
+                                if (THREAD_CAN_MIGRATE(ts->ts_thread))
                                         return (ts);
-                                first = 1;
-                        }
                 }
         }
         return (NULL);
@@ -1036,6 +1031,14 @@
         cpu = self = PCPU_GET(cpuid);
         if (smp_started == 0)
                 return (self);
+        /*
+         * Don't migrate a running thread from sched_switch().
+         */
+        if (flags & SRQ_OURSELF) {
+                CTR1(KTR_ULE, "YIELDING %d",
+                    curthread->td_priority);
+                return (self);
+        }
         pri = ts->ts_thread->td_priority;
         cpu = ts->ts_cpu;
         /*
@@ -1060,42 +1063,22 @@
                 return (ts->ts_cpu);
         }
         /*
-         * Try ourself first; If we're running something lower priority this
-         * may have some locality with the waking thread and execute faster
-         * here.
-         */
-        if (tryself) {
-                /*
-                 * If we're being awoken by an interrupt thread or the waker
-                 * is going right to sleep run here as well.
-                 */
-                if ((TDQ_SELF()->tdq_load <= 1) && (flags & (SRQ_YIELDING) ||
-                    curthread->td_pri_class == PRI_ITHD)) {
-                        CTR2(KTR_ULE, "tryself load %d flags %d",
-                            TDQ_SELF()->tdq_load, flags);
-                        return (self);
-                }
-        }
-        /*
          * Look for an idle group.
          */
         CTR1(KTR_ULE, "tdq_idle %X", tdq_idle);
         cpu = ffs(tdq_idle);
         if (cpu)
                 return (--cpu);
-        if (tryselfidle && pri < curthread->td_priority) {
-                CTR1(KTR_ULE, "tryselfidle %d",
+        /*
+         * If there are no idle cores see if we can run the thread locally. This may
+         * improve locality among sleepers and wakers when there is shared data.
+         */
+        if (tryself && pri < curthread->td_priority) {
+                CTR1(KTR_ULE, "tryself %d",
                     curthread->td_priority);
                 return (self);
         }
         /*
-         * XXX Under heavy load mysql performs way better if you
-         * serialize the non-running threads on one cpu. This is
-         * a horrible hack.
-         */
-        if (pick_zero)
-                return (0);
-        /*
          * Now search for the cpu running the lowest priority thread with
          * the least load.
          */
@@ -1657,6 +1640,22 @@
 }
 
 /*
+ * Add the thread passed as 'newtd' to the run queue before selecting
+ * the next thread to run. This is only used for KSE.
+ */
+static void
+sched_switchin(struct tdq *tdq, struct thread *newtd)
+{
+        spinlock_enter();
+        TDQ_UNLOCK(tdq);
+        thread_lock(newtd);
+        spinlock_exit();
+        sched_setcpu(newtd->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
+        tdq_add(tdq, newtd, SRQ_YIELDING);
+        MPASS(newtd->td_lock == TDQ_LOCKPTR(tdq));
+}
+
+/*
  * Block a thread for switching. Similar to thread_block() but does not
  * bump the spin count.
  */
@@ -1750,14 +1749,11 @@
          */
         TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
         /*
-         * If KSE assigned a new thread just add it here and pick the best one.
+         * If KSE assigned a new thread just add it here and let choosethread
+         * select the best one.
          */
-        if (newtd != NULL) {
-                /* XXX This is bogus. What if the thread is locked elsewhere? */
-                td->td_lock = TDQ_LOCKPTR(tdq);
-                td->td_sched->ts_cpu = cpuid;
-                tdq_add(tdq, td, SRQ_YIELDING);
-        }
+        if (newtd != NULL)
+                sched_switchin(tdq, newtd);
         newtd = choosethread();
         /*
          * Call the MD code to switch contexts if necessary.
@@ -2528,19 +2524,19 @@
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
     "Pick the target cpu based on priority rather than load.");
-SYSCTL_INT(_kern_sched, OID_AUTO, pick_zero, CTLFLAG_RW, &pick_zero, 0,
-    "If there are no idle cpus pick cpu0");
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
 SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, tryselfidle, CTLFLAG_RW,
-    &tryselfidle, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
     "Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance_secs, CTLFLAG_RW, &balance_secs, 0,
+    "Average frequency in seconds to run the long-term balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
     "Steals work from another hyper-threaded core on idle");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
     "Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
+    "Minimum load on remote cpu before we'll steal");
 SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0,
     "True when a topology has been specified by the MD code.");
 #endif
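
For reference, a minimal userland sketch (not part of the patch) showing how the new run-time tunables exposed above can be queried through sysctlbyname(3). The OID names kern.sched.balance_secs and kern.sched.steal_thresh follow directly from the SYSCTL_INT() declarations in the diff; the program name, structure, and error handling are illustrative only, and the OIDs exist only on an SMP kernel built with this change, since they are declared under #ifdef SMP.

/*
 * sched_tunables.c - illustrative sketch only, not part of the diff.
 * Reads the balance_secs and steal_thresh sysctls added by this patch.
 * Assumes an SMP kernel with the change applied.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        int balance_secs, steal_thresh;
        size_t len;

        len = sizeof(balance_secs);
        if (sysctlbyname("kern.sched.balance_secs", &balance_secs, &len,
            NULL, 0) == -1)
                err(1, "kern.sched.balance_secs");

        len = sizeof(steal_thresh);
        if (sysctlbyname("kern.sched.steal_thresh", &steal_thresh, &len,
            NULL, 0) == -1)
                err(1, "kern.sched.steal_thresh");

        printf("balance_secs=%d steal_thresh=%d\n", balance_secs,
            steal_thresh);
        return (0);
}

The same values can of course be inspected or adjusted from the shell with sysctl(8), e.g. sysctl kern.sched.steal_thresh, which is likely the more common way to experiment with the defaults this patch enables.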