commit f09824ff2dd6a3c883e4f789257380aa238c5f47
Author: Mateusz Guzik
Date:   Fri Mar 31 19:55:41 2023 +0000

    ule: queue partial slice users on tdq_realtime and enable more preemption

    This is a low-effort attempt at damage-controlling one of the bugs, simple
    enough to be suitable for inclusion in the pending release. It comes with
    its own woes, which will be addressed in a more involved patch down the
    road.

    The problem at hand: a thread going off CPU has to wait the full slice to
    get back on if there is a CPU hog running. Should a thread of this sort
    keep going off CPU frequently, each time using only a small fraction of
    its slice, it will struggle to get any work done, as it waits a full slice
    every time. This is trivially reproducible by running a bunch of CPU hogs
    (one per hardware thread) alongside make -j $(nproc) buildkernel. A sample
    timing from an 8-core VM: ~7 minutes of total real time is extended to
    over 1 hour(!), even if the hogs are niced to 20.

    Another bug (not fixed here) is that the calendar queue does not properly
    distribute CPU time between different priorities; for example, running a
    nice 0 hog against a nice 20 hog gives them roughly a 50:50 split. This
    once more negatively affects scheduling for buildkernel vs the hogs.

    One more bug which needs to be mentioned is the general starvation
    potential of the runq mechanism. In principle the calendar queue sorts it
    out for tdq_timeshare (except for the above bug), but it remains
    unaddressed for tdq_realtime, all while regular user threads can land
    there as-is.

    Work around the problem by:
    1. queueing threads on tdq_realtime if they only used part of their slice
    2. bumping the preemption threshold to PRI_MAX_TIMESHARE

    Upsides: the near-starvation of threads which frequently go off CPU is
    worked around.

    Downsides: there is more starvation potential for CPU hogs, and the entire
    ordeal negatively affects some workloads.

    This in particular extends -j 8 buildkernel total real time by about
    0.25%. Interestingly, a kernel running 4BSD takes slightly *less* total
    real time for the same build than stock ULE, all while not having the
    problem fixed here. Put differently, with enough work the scheduler can be
    made faster than stock ULE while also avoiding the problem. This will be
    sorted out for the next release.

    Example:
    x 4bsd.out
    + ule.out
    * ulev2.out
    +--------------------------------------------------------------------------+
    |                                                                         * |
    |x   +x x xx x  x xx  ++++ ++*+  *  * *   *  *  *                         * |
    | |___________M_A_________|___|_______A__M________||_______AM_______|       |
    +--------------------------------------------------------------------------+
        N           Min           Max        Median           Avg        Stddev
    x   9        434.32        436.63        435.53     435.66667    0.73333144
    +   9        435.08         437.3           437     436.85111    0.68641176
    Difference at 95.0% confidence
            1.18444 +/- 0.709817
            0.271869% +/- 0.163163%
            (Student's t, pooled s = 0.710259)
    *   9        437.25        438.54        438.01     437.94333    0.46384265
    Difference at 95.0% confidence
            2.27667 +/- 0.613184
            0.522571% +/- 0.141272%

    Users who want to restore the previous behavior can put the following into
    their /etc/sysctl.conf:
    kern.sched.preempt_thresh=48
    kern.sched.preempt_bottom=0
    kern.sched.pick_short=0

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 881413ca5e73..904ae3c2b763 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -95,6 +95,7 @@ struct td_sched {
 	int		ts_cpu;		/* CPU that we have affinity for. */
 	int		ts_rltick;	/* Real last tick, for affinity. */
 	int		ts_slice;	/* Ticks of slice remaining. */
+	int		ts_usedslice;
 	u_int		ts_slptime;	/* Number of ticks we vol. slept */
 	u_int		ts_runtime;	/* Number of ticks we were running */
 	int		ts_ltick;	/* Last tick that we were running on */
@@ -212,11 +213,13 @@ static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT;
 static int __read_mostly realstathz = 127;	/* reset during boot. */
 static int __read_mostly sched_slice = 10;	/* reset during boot. */
 static int __read_mostly sched_slice_min = 1;	/* reset during boot. */
+static bool __read_mostly sched_pick_short = true;
+static int __read_mostly preempt_bottom = PRI_MAX_INTERACT;
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int __read_mostly preempt_thresh = PRI_MAX_IDLE;
 #else
-static int __read_mostly preempt_thresh = PRI_MIN_KERN;
+static int __read_mostly preempt_thresh = PRI_MAX_TIMESHARE + 1;
 #endif
 #else
 static int __read_mostly preempt_thresh = 0;
@@ -340,6 +343,7 @@ static __inline void tdq_runq_rem(struct tdq *, struct thread *);
 static inline int sched_shouldpreempt(int, int, int);
 static void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
+static inline int td_slice(struct thread *td, struct tdq *tdq);
 static int tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
 static int tdq_move(struct tdq *, struct tdq *);
@@ -460,7 +464,7 @@ sched_shouldpreempt(int pri, int cpri, int remote)
 	/*
 	 * Preempt if we exceed the threshold.
 	 */
-	if (pri <= preempt_thresh)
+	if (pri <= preempt_thresh && pri > preempt_bottom)
 		return (1);
 	/*
 	 * If we're interactive or better and there is non-interactive
@@ -495,6 +499,12 @@ tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
 	if (pri < PRI_MIN_BATCH) {
 		ts->ts_runq = &tdq->tdq_realtime;
 	} else if (pri <= PRI_MAX_BATCH) {
+		if (sched_pick_short && ts->ts_usedslice < td_slice(td, tdq)) {
+			ts->ts_runq = &tdq->tdq_realtime;
+			runq_add(ts->ts_runq, td, flags);
+			return;
+		}
+
 		ts->ts_runq = &tdq->tdq_timeshare;
 		KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
 		    ("Invalid priority %d on timeshare runq", pri));
@@ -1779,6 +1789,7 @@ schedinit(void)
 	ts0->ts_ltick = ticks;
 	ts0->ts_ftick = ticks;
 	ts0->ts_slice = 0;
+	ts0->ts_usedslice = 0;
 	ts0->ts_cpu = curcpu;	/* set valid CPU number */
 }
 
@@ -2298,6 +2309,8 @@ sched_switch(struct thread *td, int flags)
 		cpu_switch(td, newtd, mtx);
 		cpuid = td->td_oncpu = PCPU_GET(cpuid);
+		ts = td_get_sched(td);
+		ts->ts_usedslice = 0;
 		SDT_PROBE0(sched, , , on__cpu);
#ifdef HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
@@ -2632,6 +2645,7 @@ sched_clock(struct thread *td, int cnt)
 	 * time slice (default is 100ms).
 	 */
 	ts->ts_slice += cnt;
+	ts->ts_usedslice += cnt;
 	if (ts->ts_slice >= td_slice(td, tdq)) {
 		ts->ts_slice = 0;
@@ -3326,6 +3340,9 @@ SYSCTL_UINT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
     &preempt_thresh, 0,
     "Maximal (lowest) priority for preemption");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_bottom, CTLFLAG_RW,
+    &preempt_bottom, 0,
+    "Minimal (highest) priority for preemption");
 SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
     "Assign static kernel priorities to sleeping threads");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
@@ -3333,6 +3350,8 @@ SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
     &sched_idlespinthresh, 0,
     "Threshold before we will permit idle thread spinning");
+SYSCTL_BOOL(_kern_sched, OID_AUTO, pick_short, CTLFLAG_RW,
+    &sched_pick_short, 0, "");
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
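
The mechanism in the diff above touches three places: ts_usedslice is reset to 0
when a thread goes back on CPU in sched_switch(), charged per tick in
sched_clock(), and consulted in tdq_runq_add() to decide whether a batch-priority
thread goes on tdq_realtime or on the calendar (tdq_timeshare) queue. The
following is a minimal userspace sketch of that decision, not kernel code; the
names toy_thread, toy_on_cpu, toy_run_ticks and toy_pick_runq are simplified,
hypothetical stand-ins for td_sched, sched_switch(), sched_clock() and
tdq_runq_add(), and the slice length of 10 ticks is purely illustrative.

/*
 * Standalone sketch of the ts_usedslice idea from the patch above.
 * Build with any C99 compiler; it does not use any kernel interfaces.
 */
#include <stdbool.h>
#include <stdio.h>

enum runq { RUNQ_REALTIME, RUNQ_TIMESHARE };

struct toy_thread {
	const char	*name;
	int		usedslice;	/* ticks used since last going on CPU */
};

static bool pick_short = true;		/* models kern.sched.pick_short */
static const int slice = 10;		/* illustrative full slice, in ticks */

/* Mirrors the reset of ts_usedslice when a thread goes back on CPU. */
static void
toy_on_cpu(struct toy_thread *t)
{
	t->usedslice = 0;
}

/* Mirrors the per-tick accounting added to sched_clock(). */
static void
toy_run_ticks(struct toy_thread *t, int ticks)
{
	t->usedslice += ticks;
}

/*
 * Mirrors the new branch in tdq_runq_add() for batch-priority threads:
 * a thread that consumed less than a full slice is queued on the realtime
 * queue; everything else stays on the calendar (timeshare) queue.
 */
static enum runq
toy_pick_runq(const struct toy_thread *t)
{
	if (pick_short && t->usedslice < slice)
		return (RUNQ_REALTIME);
	return (RUNQ_TIMESHARE);
}

int
main(void)
{
	struct toy_thread shortrunner = { "short runner", 0 };
	struct toy_thread hog = { "cpu hog", 0 };

	/* The short runner blocks after 2 ticks; the hog burns a full slice. */
	toy_on_cpu(&shortrunner);
	toy_run_ticks(&shortrunner, 2);
	toy_on_cpu(&hog);
	toy_run_ticks(&hog, slice);

	printf("%s -> %s\n", shortrunner.name,
	    toy_pick_runq(&shortrunner) == RUNQ_REALTIME ?
	    "tdq_realtime" : "tdq_timeshare");
	printf("%s -> %s\n", hog.name,
	    toy_pick_runq(&hog) == RUNQ_REALTIME ?
	    "tdq_realtime" : "tdq_timeshare");
	return (0);
}

With the defaults above the short runner lands on tdq_realtime while the hog
stays on tdq_timeshare; setting pick_short to false (the kern.sched.pick_short=0
knob mentioned in the commit message) restores the old behavior where both are
queued on the calendar queue.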