Index: sched_ule.c =================================================================== RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v retrieving revision 1.214 diff -u -r1.214 sched_ule.c --- sched_ule.c 8 Oct 2007 23:50:39 -0000 1.214 +++ sched_ule.c 12 Oct 2007 01:19:02 -0000 @@ -88,6 +88,7 @@ short ts_flags; /* TSF_* flags. */ u_char ts_rqindex; /* Run queue index. */ u_char ts_cpu; /* CPU that we have affinity for. */ + int ts_score; /* Interactivity score. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ @@ -102,6 +103,7 @@ /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ +#define TSF_INTERLOAD 0x0004 /* Interactive load on runq. */ static struct td_sched td_sched0; @@ -167,13 +169,16 @@ * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. - * sched_slice: Runtime of each thread before rescheduling. + * sched_slice_max: Maximum runtime of each thread before rescheduling. + * sched_slice_min: Minimum runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static int sched_interact = SCHED_INTERACT_THRESH; static int realstathz; static int tickincr; -static int sched_slice; +static int sched_slice_max = 1; +static int sched_slice_min = 1; +static int sched_late_target = 1; #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int preempt_thresh = PRI_MAX_IDLE; @@ -194,6 +199,8 @@ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ + unsigned int tdq_interload; /* Interactive load. */ + int tdq_lastslice; /* Last slice size assigned. */ int tdq_load; /* Aggregate load. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ @@ -290,8 +297,9 @@ static void tdq_load_rem(struct tdq *, struct td_sched *); static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); -void tdq_print(int cpu); -static void runq_print(struct runq *rq); +void tdq_print(int); +void sched_print(struct thread *); +static void runq_print(struct runq *); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static void tdq_move(struct tdq *, struct tdq *); @@ -345,6 +353,26 @@ } } +void +sched_print(struct thread *td) +{ + struct td_sched *ts; + + if (td == NULL) + td = curthread; + ts = td->td_sched; + printf("flags:\t\t0x%X\n", ts->ts_flags); + printf("rqindex:\t%d\n", ts->ts_rqindex); + printf("cpu:\t\t%d\n", ts->ts_cpu); + printf("score:\t\t%d\n", ts->ts_score); + printf("slice:\t\t%d\n", ts->ts_slice); + printf("slptime:\t%d\n", ts->ts_slptime); + printf("runtime:\t%d\n", ts->ts_runtime); + printf("ltick:\t\t%d\n", ts->ts_ltick); + printf("ftick:\t\t%d\n", ts->ts_ftick); + printf("ticks:\t\t%d\n", ts->ts_ticks); +} + /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ @@ -357,6 +385,7 @@ printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tinterload: %d\n", tdq->tdq_interload); printf("\tload: %d\n", tdq->tdq_load); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); @@ -383,8 +412,12 @@ static __inline void tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) { + u_char pri; + + pri = ts->ts_thread->td_priority; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + TD_SET_RUNQ(ts->ts_thread); #ifdef SMP if (THREAD_CAN_MIGRATE(ts->ts_thread)) { tdq->tdq_transferable++; @@ -392,15 +425,15 @@ ts->ts_flags |= TSF_XFERABLE; } #endif - if (ts->ts_runq == &tdq->tdq_timeshare) { - u_char pri; - - pri = ts->ts_thread->td_priority; + if (pri <= PRI_MAX_REALTIME) { + ts->ts_runq = &tdq->tdq_realtime; + } else if (pri <= PRI_MAX_TIMESHARE) { + ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX - * realtime. Use the whole queue to represent these values. + * timeshare. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; @@ -416,8 +449,10 @@ } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, ts, pri, flags); + return; } else - runq_add(ts->ts_runq, ts, flags); + ts->ts_runq = &tdq->tdq_idle; + runq_add(ts->ts_runq, ts, flags); } /* @@ -443,13 +478,6 @@ runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, ts, NULL); - /* - * For timeshare threads we update the priority here so - * the priority reflects the time we've been sleeping. - */ - ts->ts_ltick = ticks; - sched_pctcpu_update(ts); - sched_priority(ts->ts_thread); } else runq_remove(ts->ts_runq, ts); } @@ -466,6 +494,8 @@ TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); + tdq->tdq_interload += ts->ts_score; + ts->ts_flags |= TSF_INTERLOAD; tdq->tdq_load++; CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && @@ -498,6 +528,8 @@ #endif KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); + tdq->tdq_interload -= ts->ts_score; + ts->ts_flags &= ~TSF_INTERLOAD; tdq->tdq_load--; CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); ts->ts_runq = NULL; @@ -1175,6 +1207,7 @@ runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); tdq->tdq_load = 0; + tdq->tdq_interload = 0; } #ifdef SMP @@ -1324,12 +1357,12 @@ * in case which sched_clock() called before sched_initticks(). */ realstathz = hz; - sched_slice = (realstathz/10); /* ~100ms */ tickincr = 1 << SCHED_TICK_SHIFT; /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); + td_sched0.ts_score = 0; tdq_load_add(tdq, &td_sched0); TDQ_UNLOCK(tdq); } @@ -1344,7 +1377,9 @@ int incr; realstathz = stathz ? stathz : hz; - sched_slice = (realstathz/10); /* ~100ms */ + sched_slice_max = realstathz / 10; /* ~100ms */ + sched_slice_min = realstathz / 100; /* ~10ms */ + sched_late_target = realstathz / 4; /* ~250ms */ /* * tickincr is shifted out by 10 to avoid rounding errors due to @@ -1374,6 +1409,30 @@ #endif } +static void +tdq_slice_update(struct tdq *tdq, struct td_sched *ts) +{ + int slice; + int load; + + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + /* + * An interload of 100 is roughly equivalent to 100% cpu utilization + * requested. Calculate how many times overloaded we are and then + * divide the latency target by this number. None of this is precise + * but it does yield decreasing slice values within the [min, max] + * range as load increases. + */ + load = (tdq->tdq_interload + 99) / 100; + if (load) { + slice = sched_late_target / load; + slice = max(min(sched_slice_max, slice), sched_slice_min); + } else + slice = sched_slice_max; + tdq->tdq_lastslice = slice; + ts->ts_slice = slice; +} /* * This is the core of the interactivity algorithm. Determines a score based @@ -1389,15 +1448,6 @@ int div; ts = td->td_sched; - /* - * The score is only needed if this is likely to be an interactive - * task. Don't go through the expense of computing it if there's - * no chance. - */ - if (sched_interact <= SCHED_INTERACT_HALF && - ts->ts_runtime >= ts->ts_slptime) - return (SCHED_INTERACT_HALF); - if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + @@ -1443,7 +1493,7 @@ * score. Negative nice values make it easier for a thread to be * considered interactive. */ - score = imax(0, sched_interact_score(td) - td->td_proc->p_nice); + score = imax(0, td->td_sched->ts_score - td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_REALTIME; pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact) @@ -1477,9 +1527,20 @@ sched_interact_update(struct thread *td) { struct td_sched *ts; + struct tdq *tdq; u_int sum; + int score; + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; + score = sched_interact_score(td); + if (ts->ts_flags & TSF_INTERLOAD) { + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + tdq->tdq_interload -= ts->ts_score; + tdq->tdq_interload += score; + } + ts->ts_score = score; sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; @@ -1559,7 +1620,7 @@ { /* Convert sched_slice to hz */ - return (hz/(realstathz/sched_slice)); + return (hz/(realstathz/sched_slice_max)); } /* @@ -1598,15 +1659,20 @@ sched_thread_priority(struct thread *td, u_char prio) { struct td_sched *ts; + struct tdq *tdq; CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); ts = td->td_sched; + tdq = TDQ_CPU(ts->ts_cpu); THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; - +#ifdef SMP + if (prio < tdq->tdq_lowpri) + tdq->tdq_lowpri = prio; +#endif if (TD_ON_RUNQ(td) && prio < td->td_priority) { /* * If the priority has been elevated due to priority @@ -1617,16 +1683,8 @@ sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING); - } else { -#ifdef SMP - struct tdq *tdq; - - tdq = TDQ_CPU(ts->ts_cpu); - if (prio < tdq->tdq_lowpri) - tdq->tdq_lowpri = prio; -#endif + } else td->td_priority = prio; - } } /* @@ -1772,6 +1830,8 @@ tdn = TDQ_CPU(td->td_sched->ts_cpu); #ifdef SMP + /* The load is being removed from the current cpu. */ + tdq_load_rem(tdq, td->td_sched); /* * Do the lock dance required to avoid LOR. We grab an extra * spinlock nesting to prevent preemption while we're @@ -1863,12 +1923,11 @@ TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); - tdq_load_rem(tdq, ts); srqflag = (flags & SW_PREEMPT) ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; if (ts->ts_cpu == cpuid) - tdq_add(tdq, td, srqflag); + tdq_runq_add(tdq, ts, srqflag); else mtx = sched_switch_migrate(tdq, td, srqflag); } else { @@ -1970,22 +2029,17 @@ THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; /* - * If we slept for more than a tick update our interactivity and - * priority. + * Update interactivity and priority after a sleep. */ slptick = td->td_slptick; td->td_slptick = 0; - if (slptick && slptick != ticks) { - u_int hzticks; - - hzticks = (ticks - slptick) << SCHED_TICK_SHIFT; - ts->ts_slptime += hzticks; + if (slptick && slptick != ticks) { + ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts); - sched_priority(td); + /* Reset the slice value after we sleep. */ + ts->ts_slice = sched_slice_max; } - /* Reset the slice value after we sleep. */ - ts->ts_slice = sched_slice; sched_add(td, SRQ_BORING); } @@ -2040,7 +2094,8 @@ */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; - ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */ + /* Attempt to quickly learn interactivity. */ + ts2->ts_slice = sched_slice_min; } /* @@ -2188,25 +2243,31 @@ } ts = td->td_sched; /* + * We used a tick; charge it to the thread so that we can compute our + * interactivity. + */ + td->td_sched->ts_runtime += tickincr; + sched_interact_update(td); + /* * We only do slicing code for TIMESHARE threads. */ if (td->td_pri_class != PRI_TIMESHARE) return; + sched_priority(td); /* - * We used a tick; charge it to the thread so that we can compute our - * interactivity. + * Clamp the slice if recent load has shrunk it significantly. */ - td->td_sched->ts_runtime += tickincr; - sched_interact_update(td); + if (ts->ts_slice > tdq->tdq_lastslice) + ts->ts_slice = tdq->tdq_lastslice; /* * We used up one time slice. */ if (--ts->ts_slice > 0) return; /* - * We're out of time, recompute priorities and requeue. + * We're out of time, force a requeue later. */ - sched_priority(td); + tdq_slice_update(tdq, ts); td->td_flags |= TDF_NEEDRESCHED; } @@ -2328,11 +2389,10 @@ tdq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; - int class; #ifdef SMP + int class; int cpumask; #endif - TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); @@ -2342,20 +2402,11 @@ ("sched_add: thread swapped out")); ts = td->td_sched; - class = PRI_BASE(td->td_pri_class); - TD_SET_RUNQ(td); - if (ts->ts_slice == 0) - ts->ts_slice = sched_slice; - /* - * Pick the run queue based on priority. - */ - if (td->td_priority <= PRI_MAX_REALTIME) - ts->ts_runq = &tdq->tdq_realtime; - else if (td->td_priority <= PRI_MAX_TIMESHARE) - ts->ts_runq = &tdq->tdq_timeshare; - else - ts->ts_runq = &tdq->tdq_idle; + tdq_runq_add(tdq, ts, flags); + tdq_load_add(tdq, ts); + tdq_slice_update(tdq, ts); #ifdef SMP + class = PRI_BASE(td->td_pri_class); cpumask = 1 << ts->ts_cpu; /* * If we had been idle, clear our bit in the group and potentially @@ -2378,8 +2429,6 @@ if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; #endif - tdq_runq_add(tdq, ts, flags); - tdq_load_add(tdq, ts); } /* @@ -2660,8 +2709,12 @@ "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); -SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, - "Slice size for timeshare threads"); +SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice_max, 0, + "Maximum slice size for timeshare threads"); +SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &sched_slice_min, 0, + "Minimum slice size for timeshare threads"); +SYSCTL_INT(_kern_sched, OID_AUTO, latency, CTLFLAG_RW, &sched_late_target, 0, + "Target maximum latency for timeshare threads"); SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,