commit f09824ff2dd6a3c883e4f789257380aa238c5f47
Author: Mateusz Guzik
Date:   Fri Mar 31 19:55:41 2023 +0000

    ule: queue partial slice users on tdq_realtime and enable more preemption

    This is a low-effort attempt at damage-controlling one of the bugs, simple
    enough to be suitable for inclusion in the pending release. It comes with
    its own woes, which will be addressed in a more involved patch down the
    road.

    The problem at hand: a thread going off CPU has to wait the full slice to
    get back on if there is a CPU hog running. Should a thread of this sort
    keep going off CPU frequently, each time using only a small fraction of
    its slice, it will struggle to get any work done, as it waits a full slice
    every time. This is trivially reproducible by running a bunch of CPU hogs
    (one per hardware thread) alongside make -j $(nproc) buildkernel. A sample
    timing from an 8-core VM: ~7 minutes of total real time is extended to
    over 1 hour(!), even if the hogs are niced to 20.

    Another bug (not fixed here) is that the calendar queue does not properly
    distribute CPU time between different priorities; for example, running a
    nice 0 hog against a nice 20 hog gives them roughly a 50:50 split. This
    once more negatively affects scheduling for buildkernel vs the hogs.

    One more bug which needs to be mentioned is the general starvation
    potential of the runq mechanism. In principle the calendar queue sorts it
    out for tdq_timeshare (except for the above bug), but it remains
    unaddressed for tdq_realtime, all while regular user threads can land
    there as-is.

    Work around the problem by:
    1. queueing threads on tdq_realtime if they only used part of their slice
    2. bumping the preemption threshold to PRI_MAX_TIMESHARE

    Upsides: the near-starvation of threads which frequently go off CPU is
    worked around.

    Downsides: there is more starvation potential for CPU hogs, and the entire
    ordeal negatively affects some workloads.

    This in particular extends -j 8 buildkernel total real time by about
    0.25%. Interestingly, a kernel running 4BSD takes slightly *less* total
    real time for the same build than stock ULE, all while not having the
    problem fixed here. Put differently, with enough work the scheduler can be
    made faster than stock ULE while also avoiding the problem. This will be
    sorted out for the next release.

    Example:
    x 4bsd.out
    + ule.out
    * ulev2.out
    +--------------------------------------------------------------------------+
    |                                                                         * |
    |x   +x x xx x  x xx  ++++ ++*+  *  * *   *  *  *                         * |
    | |___________M_A_________|___|_______A__M________||_______AM_______|       |
    +--------------------------------------------------------------------------+
        N           Min           Max        Median           Avg        Stddev
    x   9        434.32        436.63        435.53     435.66667    0.73333144
    +   9        435.08         437.3           437     436.85111    0.68641176
    Difference at 95.0% confidence
            1.18444 +/- 0.709817
            0.271869% +/- 0.163163%
            (Student's t, pooled s = 0.710259)
    *   9        437.25        438.54        438.01     437.94333    0.46384265
    Difference at 95.0% confidence
            2.27667 +/- 0.613184
            0.522571% +/- 0.141272%

    Users who want to restore the previous behavior can put the following into
    their /etc/sysctl.conf:
    kern.sched.preempt_thresh=48
    kern.sched.preempt_bottom=0
    kern.sched.pick_short=0

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 881413ca5e73..904ae3c2b763 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -95,6 +95,7 @@ struct td_sched {
 	int		ts_cpu;		/* CPU that we have affinity for. */
 	int		ts_rltick;	/* Real last tick, for affinity. */
 	int		ts_slice;	/* Ticks of slice remaining. */
+	int		ts_usedslice;
 	u_int		ts_slptime;	/* Number of ticks we vol. slept */
 	u_int		ts_runtime;	/* Number of ticks we were running */
 	int		ts_ltick;	/* Last tick that we were running on */
@@ -212,11 +213,13 @@ static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT;
 static int __read_mostly realstathz = 127;	/* reset during boot. */
 static int __read_mostly sched_slice = 10;	/* reset during boot. */
 static int __read_mostly sched_slice_min = 1;	/* reset during boot. */
+static bool __read_mostly sched_pick_short = true;
+static int __read_mostly preempt_bottom = PRI_MAX_INTERACT;
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int __read_mostly preempt_thresh = PRI_MAX_IDLE;
 #else
-static int __read_mostly preempt_thresh = PRI_MIN_KERN;
+static int __read_mostly preempt_thresh = PRI_MAX_TIMESHARE + 1;
 #endif
 #else
 static int __read_mostly preempt_thresh = 0;
@@ -340,6 +343,7 @@ static __inline void tdq_runq_rem(struct tdq *, struct thread *);
 static inline int sched_shouldpreempt(int, int, int);
 static void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
+static inline int td_slice(struct thread *td, struct tdq *tdq);
 static int tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
 static int tdq_move(struct tdq *, struct tdq *);
@@ -460,7 +464,7 @@ sched_shouldpreempt(int pri, int cpri, int remote)
 	/*
 	 * Preempt if we exceed the threshold.
 	 */
-	if (pri <= preempt_thresh)
+	if (pri <= preempt_thresh && pri > preempt_bottom)
 		return (1);
 	/*
 	 * If we're interactive or better and there is non-interactive
@@ -495,6 +499,12 @@ tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
 	if (pri < PRI_MIN_BATCH) {
 		ts->ts_runq = &tdq->tdq_realtime;
 	} else if (pri <= PRI_MAX_BATCH) {
+		if (sched_pick_short && ts->ts_usedslice < td_slice(td, tdq)) {
+			ts->ts_runq = &tdq->tdq_realtime;
+			runq_add(ts->ts_runq, td, flags);
+			return;
+		}
+
 		ts->ts_runq = &tdq->tdq_timeshare;
 		KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
 		    ("Invalid priority %d on timeshare runq", pri));
@@ -1779,6 +1789,7 @@ schedinit(void)
 	ts0->ts_ltick = ticks;
 	ts0->ts_ftick = ticks;
 	ts0->ts_slice = 0;
+	ts0->ts_usedslice = 0;
 	ts0->ts_cpu = curcpu;	/* set valid CPU number */
 }
 
@@ -2298,6 +2309,8 @@ sched_switch(struct thread *td, int flags)
 		cpu_switch(td, newtd, mtx);
 		cpuid = td->td_oncpu = PCPU_GET(cpuid);
+		ts = td_get_sched(td);
+		ts->ts_usedslice = 0;
 		SDT_PROBE0(sched, , , on__cpu);
#ifdef HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
@@ -2632,6 +2645,7 @@ sched_clock(struct thread *td, int cnt)
 	 * time slice (default is 100ms).
 	 */
 	ts->ts_slice += cnt;
+	ts->ts_usedslice += cnt;
 	if (ts->ts_slice >= td_slice(td, tdq)) {
 		ts->ts_slice = 0;
@@ -3326,6 +3340,9 @@ SYSCTL_UINT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
     &preempt_thresh, 0,
     "Maximal (lowest) priority for preemption");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_bottom, CTLFLAG_RW,
+    &preempt_bottom, 0,
+    "Minimal (highest) priority for preemption");
 SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
     "Assign static kernel priorities to sleeping threads");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
@@ -3333,6 +3350,8 @@ SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
     &sched_idlespinthresh, 0,
     "Threshold before we will permit idle thread spinning");
+SYSCTL_BOOL(_kern_sched, OID_AUTO, pick_short, CTLFLAG_RW,
+    &sched_pick_short, 0, "");
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
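
The mechanism in the diff above touches three places: ts_usedslice is reset to 0
when a thread goes back on CPU in sched_switch(), charged per tick in
sched_clock(), and consulted in tdq_runq_add() to decide whether a batch-priority
thread goes on tdq_realtime or on the calendar (tdq_timeshare) queue. The
following is a minimal userspace sketch of that decision, not kernel code; the
names toy_thread, toy_on_cpu, toy_run_ticks and toy_pick_runq are simplified,
hypothetical stand-ins for td_sched, sched_switch(), sched_clock() and
tdq_runq_add(), and the slice length of 10 ticks is purely illustrative.

/*
 * Standalone sketch of the ts_usedslice idea from the patch above.
 * Build with any C99 compiler; it does not use any kernel interfaces.
 */
#include <stdbool.h>
#include <stdio.h>

enum runq { RUNQ_REALTIME, RUNQ_TIMESHARE };

struct toy_thread {
	const char	*name;
	int		usedslice;	/* ticks used since last going on CPU */
};

static bool pick_short = true;		/* models kern.sched.pick_short */
static const int slice = 10;		/* illustrative full slice, in ticks */

/* Mirrors the reset of ts_usedslice when a thread goes back on CPU. */
static void
toy_on_cpu(struct toy_thread *t)
{
	t->usedslice = 0;
}

/* Mirrors the per-tick accounting added to sched_clock(). */
static void
toy_run_ticks(struct toy_thread *t, int ticks)
{
	t->usedslice += ticks;
}

/*
 * Mirrors the new branch in tdq_runq_add() for batch-priority threads:
 * a thread that consumed less than a full slice is queued on the realtime
 * queue; everything else stays on the calendar (timeshare) queue.
 */
static enum runq
toy_pick_runq(const struct toy_thread *t)
{
	if (pick_short && t->usedslice < slice)
		return (RUNQ_REALTIME);
	return (RUNQ_TIMESHARE);
}

int
main(void)
{
	struct toy_thread shortrunner = { "short runner", 0 };
	struct toy_thread hog = { "cpu hog", 0 };

	/* The short runner blocks after 2 ticks; the hog burns a full slice. */
	toy_on_cpu(&shortrunner);
	toy_run_ticks(&shortrunner, 2);
	toy_on_cpu(&hog);
	toy_run_ticks(&hog, slice);

	printf("%s -> %s\n", shortrunner.name,
	    toy_pick_runq(&shortrunner) == RUNQ_REALTIME ?
	    "tdq_realtime" : "tdq_timeshare");
	printf("%s -> %s\n", hog.name,
	    toy_pick_runq(&hog) == RUNQ_REALTIME ?
	    "tdq_realtime" : "tdq_timeshare");
	return (0);
}

With the defaults above the short runner lands on tdq_realtime while the hog
stays on tdq_timeshare; setting pick_short to false (the kern.sched.pick_short=0
knob mentioned in the commit message) restores the old behavior where both are
queued on the calendar queue.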