Index: sys/conf/files =================================================================== --- sys/conf/files (revision 248027) +++ sys/conf/files (working copy) @@ -2244,6 +2244,7 @@ kern/posix4_mib.c standard kern/sched_4bsd.c optional sched_4bsd kern/sched_ule.c optional sched_ule +kern/sched_fbfs.c optional sched_fbfs kern/serdev_if.m standard kern/stack_protector.c standard \ compile-with "${NORMAL_C:N-fstack-protector*}" Index: sys/conf/options =================================================================== --- sys/conf/options (revision 248027) +++ sys/conf/options (working copy) @@ -153,6 +153,7 @@ SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h SCHED_ULE opt_sched.h +SCHED_FBFS opt_sched.h opt_runq.h SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h SPX_HACK Index: sys/kern/kern_switch.c =================================================================== --- sys/kern/kern_switch.c (revision 248027) +++ sys/kern/kern_switch.c (working copy) @@ -334,7 +334,21 @@ struct rqhead *rqh; int pri; +#ifdef SCHED_FBFS + if (td->td_priority >= PRI_MIN_IDLE) { + pri = RQ_IDLE; + } else if (td->td_priority >= PRI_MIN_TIMESHARE) { + pri = RQ_TIMESHARE; + } else if (td->td_priority >= PRI_MIN_REALTIME) { + pri = min(RQ_MIN_REALTIME + td->td_priority - PRI_MIN_REALTIME, + RQ_MAX_REALTIME); + } else { + pri = td->td_priority / RQ_PPQ; + } +#else pri = td->td_priority / RQ_PPQ; +#endif + td->td_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; Index: sys/kern/sched_fbfs.c =================================================================== --- sys/kern/sched_fbfs.c (revision 0) +++ sys/kern/sched_fbfs.c (working copy) @@ -0,0 +1,1205 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.131.2.7.2.1 2010/12/21 17:09:25 kensmith Exp $"); + +#include "opt_hwpmc_hooks.h" +#include "opt_sched.h" +#include "opt_kdtrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HWPMC_HOOKS +#include +#endif + +#ifdef KDTRACE_HOOKS +#include +int dtrace_vtime_active; +dtrace_vtime_switch_func_t dtrace_vtime_switch_func; +#endif + +#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) + +static int realstathz; +static int sched_slice = 1; + +/* + * The time window size over which we compute the CPU utilization percentage. + */ +#define PCT_WINDOW 5 + +/* + * The schedulable entity that runs a context. + * This is an extension to the thread structure and is tailored to + * the requirements of this scheduler + */ +struct td_sched { + int ts_flags; + int ts_vdeadline; /* virtual deadline. */ + int ts_slice; /* Remaining slice in number of ticks */ + int ts_cswtick; + int ts_incrtick; + int ts_used; + struct runq *ts_runq; /* runq the thread is currently on */ +#ifdef KTR + char ts_name[TS_NAME_LEN]; +#endif +}; + +static struct cpu_group * cpu_top; +static struct cpu_group * cpu_topology[MAXCPU]; + +/* flags kept in td_flags */ +#define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */ +#define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */ + +/* flags kept in ts_flags */ +#define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */ + +#define SKE_RUNQ_PCPU(ts) \ + ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) + +#define THREAD_CAN_SCHED(td, cpu) \ + CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) + +static struct td_sched td_sched0; +struct mtx sched_lock; + +static int sched_tdcnt; /* Total runnable threads in the system. */ + +static void setup_runqs(void); +static void sched_priority(struct thread *td, u_char prio); +static void sched_setup(void *dummy); +static void sched_initticks(void *dummy); + +static struct thread *edf_choose(struct rqhead * rqh); +static struct thread *runq_choose_bfs(struct runq * rq); +static int preempt_lastcpu(struct thread *td); +static struct thread *worst_running_thread(void); + +SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); +SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); + +/* + * Global run queue. + */ +static struct runq runq; + +/* + * Priority ratios for virtual deadline per nice value calculations. 
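+ * prio_ratios[0] corresponds to PRIO_MIN (nice -20) and holds the base
+ * ratio of 128; each following nice level is roughly 10% larger (see
+ * sched_setup()), so nicer threads are assigned proportionally later
+ * virtual deadlines in sched_clock().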
+ */ +static int prio_ratios[PRIO_MAX - PRIO_MIN + 1]; + +static void +setup_runqs(void) +{ + runq_init(&runq); +} + +SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler"); + +SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "FBFS", 0, + "Scheduler name"); + +SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, + "Slice size for timeshare threads"); + +SDT_PROVIDER_DEFINE(sched); + +SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", + "struct proc *", "uint8_t"); +SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", + "struct proc *", "void *"); +SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", + "struct proc *", "void *", "int"); +SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", + "struct proc *", "uint8_t", "struct thread *"); +SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int"); +SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *", + "struct proc *"); +SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu); +SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu); +SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *", + "struct proc *"); + +static __inline void +sched_load_add(void) +{ + + sched_tdcnt++; + KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); + SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt); +} + +static __inline void +sched_load_rem(void) +{ + + sched_tdcnt--; + KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); + SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt); +} + +int +maybe_preempt(struct thread *td) +{ + return (0); +} + +/* I keep it here because the top command wants it. */ +static fixpt_t ccpu = 0; +SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); + +/* ARGSUSED */ +static void +sched_setup(void *dummy) +{ + int i; + + cpu_top = smp_topo(); + for (i = 0; i < MAXCPU; i++) { + if (CPU_ABSENT(i)) + continue; + cpu_topology[i] = smp_topo_find(cpu_top, i); + if (cpu_topology[i] == NULL) + panic("Can't find cpu group for %d\n", i); + } + + realstathz = hz; + sched_slice = (realstathz/10); + + prio_ratios[0] = 128; + for (i = 1; i <= PRIO_MAX - PRIO_MIN; ++i) { + prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + } + + setup_runqs(); + + /* Account for thread0. */ + sched_load_add(); +} + +static void +sched_initticks(void *dummy) +{ + realstathz = stathz ? stathz : hz; + sched_slice = (realstathz/10); /* ~100ms */ +} + +/* External interfaces start here */ + +/* + * Very early in the boot some setup of scheduler-specific + * parts of proc0 and of some scheduler resources needs to be done. + * Called from: + * proc0_init() + */ +void +schedinit(void) +{ + /* + * Set up the scheduler specific parts of proc0. 
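+	 * Note that sched_slice still holds its static initial value at this
+	 * point; it is scaled from the real stathz later in sched_setup() and
+	 * sched_initticks().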
+ */ + proc0.p_sched = NULL; /* XXX */ + thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; + mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + td_sched0.ts_used = 0; + td_sched0.ts_slice = sched_slice; +} + +int +sched_runnable(void) +{ + return runq_check(&runq); +} + +int +sched_rr_interval(void) +{ + return (hz/(realstathz/sched_slice)); +} + +void +sched_clock(struct thread *td) +{ + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + + if (--ts->ts_slice > 0) + return; + + ts->ts_vdeadline = ticks + sched_slice * + prio_ratios[td->td_proc->p_nice - PRIO_MIN] / 128; + ts->ts_slice = sched_slice; + td->td_flags |= TDF_NEEDRESCHED; + + CTR4(KTR_SCHED, "timeslice fill: t: %d, i: %d, r: %d, d: %d", + ticks, td->td_proc->p_nice - PRIO_MIN, + prio_ratios[td->td_proc->p_nice - PRIO_MIN], + ts->ts_vdeadline + ); + + CTR1(KTR_SCHED, "queue number: %d", td->td_rqindex); + CTR1(KTR_SCHED, "thread: 0x%x", td); +} + +/* + * Charge child's scheduling CPU usage to parent. + */ +void +sched_exit(struct proc *p, struct thread *td) +{ + KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit", + "prio:%d", td->td_priority); + + PROC_LOCK_ASSERT(p, MA_OWNED); + sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); +} + +void +sched_exit_thread(struct thread *td, struct thread *child) +{ + KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit", + "prio:%d", child->td_priority); + thread_lock(child); + if ((child->td_flags & TDF_NOLOAD) == 0) + sched_load_rem(); + thread_unlock(child); +} + +void +sched_fork(struct thread *td, struct thread *childtd) +{ + sched_fork_thread(td, childtd); +} + +void +sched_fork_thread(struct thread *td, struct thread *childtd) +{ + struct td_sched *ts; + + childtd->td_lock = &sched_lock; + childtd->td_cpuset = cpuset_ref(td->td_cpuset); + ts = childtd->td_sched; + bzero(ts, sizeof(*ts)); + td->td_sched->ts_slice /= 2; + ts->ts_flags |= (td->td_sched->ts_flags & TSF_AFFINITY); + ts->ts_vdeadline = td->td_sched->ts_vdeadline; + ts->ts_slice = td->td_sched->ts_slice; + ts->ts_used = td->td_sched->ts_used; +} + +void +sched_nice(struct proc *p, int nice) +{ + PROC_LOCK_ASSERT(p, MA_OWNED); + p->p_nice = nice; +} + +void +sched_class(struct thread *td, int class) +{ + THREAD_LOCK_ASSERT(td, MA_OWNED); + td->td_pri_class = class; +} + +/* + * Adjust the priority of a thread. + */ +static void +sched_priority(struct thread *td, u_char prio) +{ + KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change", + "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, + sched_tdname(curthread)); + SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio); + if (td != curthread && prio > td->td_priority) { + KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), + "lend prio", "prio:%d", td->td_priority, "new prio:%d", + prio, KTR_ATTR_LINKED, sched_tdname(td)); + SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, + curthread); + } + THREAD_LOCK_ASSERT(td, MA_OWNED); + if (td->td_priority == prio) + return; + td->td_priority = prio; + if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) { + sched_rem(td); + sched_add(td, SRQ_BORING); + } +} + +/* + * Update a thread's priority when it is lent another thread's + * priority. + */ +void +sched_lend_prio(struct thread *td, u_char prio) +{ + + td->td_flags |= TDF_BORROWING; + sched_priority(td, prio); +} + +/* + * Restore a thread's priority when priority propagation is + * over. 
+ * The prio argument is the minimum priority the thread
+ * needs to have to satisfy other possible priority lending
+ * requests.  If the thread's regular priority is less
+ * important than prio, the thread will keep a priority boost
+ * of prio.
+ */
+void
+sched_unlend_prio(struct thread *td, u_char prio)
+{
+	u_char base_pri;
+
+	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
+	    td->td_base_pri <= PRI_MAX_TIMESHARE)
+		base_pri = td->td_user_pri;
+	else
+		base_pri = td->td_base_pri;
+	if (prio >= base_pri) {
+		td->td_flags &= ~TDF_BORROWING;
+		sched_prio(td, base_pri);
+	} else
+		sched_lend_prio(td, prio);
+}
+
+void
+sched_prio(struct thread *td, u_char prio)
+{
+	u_char oldprio;
+
+	/* First, update the base priority. */
+	td->td_base_pri = prio;
+
+	/*
+	 * If the thread is borrowing another thread's priority, don't ever
+	 * lower the priority.
+	 */
+	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
+		return;
+
+	/* Change the real priority. */
+	oldprio = td->td_priority;
+	sched_priority(td, prio);
+
+	/*
+	 * If the thread is on a turnstile, then let the turnstile update
+	 * its state.
+	 */
+	if (TD_ON_LOCK(td) && oldprio != prio)
+		turnstile_adjust(td, oldprio);
+}
+
+void
+sched_user_prio(struct thread *td, u_char prio)
+{
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_base_user_pri = prio;
+	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
+		return;
+	td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
+{
+	u_char oldprio;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_flags |= TDF_UBORROWING;
+	oldprio = td->td_user_pri;
+	td->td_user_pri = prio;
+}
+
+void
+sched_unlend_user_prio(struct thread *td, u_char prio)
+{
+	u_char base_pri;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	base_pri = td->td_base_user_pri;
+	if (prio >= base_pri) {
+		td->td_flags &= ~TDF_UBORROWING;
+		sched_user_prio(td, base_pri);
+	} else {
+		sched_lend_user_prio(td, prio);
+	}
+}
+
+void
+sched_sleep(struct thread *td, int pri)
+{
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_slptick = ticks;
+	if (pri)
+		sched_prio(td, pri);
+	if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
+		td->td_flags |= TDF_CANSWAP;
+}
+
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+	struct mtx *tmtx;
+	struct td_sched *ts;
+	int time_passed;
+
+	tmtx = NULL;
+	ts = td->td_sched;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+	/*
+	 * Switch to the sched lock to fix things up and pick
+	 * a new thread.
+	 * Block the td_lock in order to avoid breaking the critical path.
+	 */
+	if (td->td_lock != &sched_lock) {
+		mtx_lock_spin(&sched_lock);
+		tmtx = thread_lock_block(td);
+	}
+
+	if ((td->td_flags & TDF_NOLOAD) == 0)
+		sched_load_rem();
+
+	if (newtd) {
+		MPASS(newtd->td_lock == &sched_lock);
+		newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);
+	}
+
+	td->td_lastcpu = td->td_oncpu;
+	td->td_flags &= ~TDF_NEEDRESCHED;
+	td->td_owepreempt = 0;
+	td->td_oncpu = NOCPU;
+
+	/*
+	 * At the last moment, if this thread is still marked RUNNING,
+	 * then put it back on the run queue as it has not been suspended
+	 * or stopped or any thing else similar.  We never put the idle
+	 * threads on the run queue, however.
+	 */
+	if (td->td_flags & TDF_IDLETD) {
+		TD_SET_CAN_RUN(td);
+#ifdef SMP
+		idle_cpus_mask &= ~PCPU_GET(cpumask);
+#endif
+	} else {
+		if (TD_IS_RUNNING(td)) {
+			/* Put us back on the run queue. */
+			sched_add(td, (flags & SW_PREEMPT) ?
+			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+			    SRQ_OURSELF|SRQ_YIELDING);
+		}
+	}
+	if (newtd) {
+		/*
+		 * The thread we are about to run needs to be counted
+		 * as if it had been added to the run queue and selected.
+		 * It came from:
+		 * * A preemption
+		 * * An upcall
+		 * * A followon
+		 */
+		KASSERT((newtd->td_inhibitors == 0),
+		    ("trying to run inhibited thread"));
+		newtd->td_flags |= TDF_DIDRUN;
+		TD_SET_RUNNING(newtd);
+		if ((newtd->td_flags & TDF_NOLOAD) == 0)
+			sched_load_add();
+	} else {
+		newtd = choosethread();
+		MPASS(newtd->td_lock == &sched_lock);
+	}
+
+	if (td != newtd) {
+#ifdef HWPMC_HOOKS
+		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+
+		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
+
+		/* I feel sleepy */
+		lock_profile_release_lock(&sched_lock.lock_object);
+#ifdef KDTRACE_HOOKS
+		/*
+		 * If DTrace has set the active vtime enum to anything
+		 * other than INACTIVE (0), then it should have set the
+		 * function to call.
+		 */
+		if (dtrace_vtime_active)
+			(*dtrace_vtime_switch_func)(newtd);
+#endif
+
+		ts->ts_cswtick = ticks;
+		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
+		lock_profile_obtain_lock_success(&sched_lock.lock_object,
+		    0, 0, __FILE__, __LINE__);
+		/*
+		 * Where am I?  What year is it?
+		 * We are in the same thread that went to sleep above,
+		 * but any amount of time may have passed.  All our context
+		 * will still be available as will local variables.
+		 * PCPU values however may have changed as we may have
+		 * changed CPU so don't trust cached values of them.
+		 * New threads will go to fork_exit() instead of here
+		 * so if you change things here you may need to change
+		 * things there too.
+		 *
+		 * If the thread above was exiting it will never wake
+		 * up again here, so either it has saved everything it
+		 * needed to, or the thread_wait() or wait() will
+		 * need to reap it.
+		 */
+		time_passed = ticks - ts->ts_cswtick;
+		ts->ts_used = imax(ts->ts_used - time_passed, 0);
+		if (ts->ts_used < 0)
+			panic("Negative ts_used value\n");
+
+		SDT_PROBE0(sched, , , on_cpu);
+#ifdef HWPMC_HOOKS
+		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
+#endif
+	} else
+		SDT_PROBE0(sched, , , remain_cpu);
+
+#ifdef SMP
+	if (td->td_flags & TDF_IDLETD)
+		idle_cpus_mask |= PCPU_GET(cpumask);
+#endif
+	sched_lock.mtx_lock = (uintptr_t)td;
+	td->td_oncpu = PCPU_GET(cpuid);
+	MPASS(td->td_lock == &sched_lock);
+}
+
+static int
+preempt_lastcpu(struct thread *td)
+{
+	int cpri;
+	struct pcpu *pcpu;
+	struct td_sched *ts;
+	struct td_sched *tsc;
+	struct thread *pcpu_thr;
+	u_char c;
+
+	c = td->td_lastcpu;
+	if (c == NOCPU)
+		return (0);
+	pcpu = pcpu_find(c);
+	pcpu_thr = pcpu->pc_curthread;
+	if (pcpu_thr == NULL)
+		return (0);
+	if (pcpu_thr == pcpu->pc_idlethread) {
+		if (PCPU_GET(cpuid) != c)
+			ipi_cpu(c, IPI_AST);
+		return (1);
+	}
+	cpri = pcpu_thr->td_priority;
+	if (cpri < td->td_priority)
+		return (0);
+	if (cpri > td->td_priority) {
+		pcpu_thr->td_flags |= TDF_NEEDRESCHED;
+		if (PCPU_GET(cpuid) != c)
+			ipi_cpu(c, IPI_AST);
+		return (1);
+	}
+	ts = td->td_sched;
+	tsc = pcpu_thr->td_sched;
+	if ((td->td_pri_class == PRI_TIMESHARE) ||
+	    (td->td_pri_class == PRI_IDLE)) {
+		if (ts->ts_vdeadline >= tsc->ts_vdeadline)
+			return (0);
+	} else
+		return (0);
+	/*
+	 * Here the priorities of td and of the thread currently running on
+	 * td_lastcpu are equal, td is in the PRI_TIMESHARE or PRI_IDLE
+	 * class, and td's virtual deadline is earlier.  Therefore we
+	 * reschedule td_lastcpu.
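+	 * The same earliest-virtual-deadline rule drives queue selection in
+	 * edf_choose() and the preemption decision in sched_add().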
+	 */
+	pcpu_thr->td_flags |= TDF_NEEDRESCHED;
+	if (PCPU_GET(cpuid) != c)
+		ipi_cpu(c, IPI_AST);
+
+	return (1);
+}
+
+static struct thread *
+worst_running_thread(void)
+{
+	struct td_sched *ts, *ts2;
+	struct thread *max_thread, *cthr;
+	struct pcpu *pc;
+	u_char max_prio;
+
+	max_thread = curthread;
+	MPASS(max_thread != NULL);
+	max_prio = max_thread->td_priority;
+	ts = max_thread->td_sched;
+	SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
+		cthr = pc->pc_curthread;
+		if (cthr == NULL) {
+			continue;
+		}
+		if (max_prio < cthr->td_priority) {
+			max_thread = cthr;
+			max_prio = max_thread->td_priority;
+			ts = max_thread->td_sched;
+		} else if (max_prio == cthr->td_priority) {
+			ts2 = cthr->td_sched;
+			if (ts->ts_vdeadline > ts2->ts_vdeadline) {
+				max_thread = cthr;
+				ts = ts2;
+			}
+		}
+	}
+	MPASS(max_thread != NULL);
+	return (max_thread);
+}
+
+void
+sched_wakeup(struct thread *td)
+{
+	struct td_sched *ts;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
+	td->td_flags &= ~TDF_CANSWAP;
+	td->td_slptick = 0;
+	sched_add(td, SRQ_BORING);
+}
+
+void
+sched_add(struct thread *td, int flags)
+{
+	struct td_sched *ts;
+	struct thread *thr_worst;
+	cpumask_t dontuse, map, me;
+	struct cpu_group *cg;
+	u_char c;
+
+	ts = td->td_sched;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	KASSERT((td->td_inhibitors == 0),
+	    ("sched_add: trying to run inhibited thread"));
+	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+	    ("sched_add: bad thread state"));
+	KASSERT(td->td_flags & TDF_INMEM,
+	    ("sched_add: thread swapped out"));
+	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
+	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+	    sched_tdname(curthread));
+	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
+	    KTR_ATTR_LINKED, sched_tdname(td));
+	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
+	    flags & SRQ_PREEMPTED);
+
+	/*
+	 * Now that the thread is moving to the run-queue, set the lock
+	 * to the scheduler's lock.
+	 */
+	if (td->td_lock != &sched_lock) {
+		mtx_lock_spin(&sched_lock);
+		thread_lock_set(td, &sched_lock);
+	}
+	TD_SET_RUNQ(td);
+	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
+	ts->ts_runq = &runq;
+
+	if ((td->td_flags & TDF_NOLOAD) == 0)
+		sched_load_add();
+	runq_add(ts->ts_runq, td, flags);
+
+	me = PCPU_GET(cpumask);
+	dontuse = me | stopped_cpus | hlt_cpus_mask;
+	map = idle_cpus_mask & ~dontuse;
+
+	/*
+	 * First check whether we should reschedule the last CPU the thread
+	 * ran on.
+	 */
+	if (preempt_lastcpu(td)) {
+		if (map)
+			ipi_selected(map, IPI_AST);
+		return;
+	}
+	/*
+	 * Is there any idle CPU?
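+	 * If so, prefer one that is close to td_lastcpu in the CPU topology:
+	 * walk up the cpu_group tree until a level containing an idle CPU is
+	 * found, and fall back to any idle CPU otherwise.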
+	 */
+	if (map) {
+		cg = (td->td_lastcpu != NOCPU) ?
+		    cpu_topology[td->td_lastcpu] : NULL;
+		while ((cg != NULL) && ((map & cg->cg_mask) == 0))
+			cg = cg->cg_parent;
+		if ((cg != NULL) && (map & cg->cg_mask)) {
+			ipi_selected(map & cg->cg_mask, IPI_AST);
+			return;
+		}
+		ipi_selected(map, IPI_AST);
+		return;
+	}
+	/*
+	 * We did not wake up lastcpu and there is no suitable idle CPU.
+	 */
+	thr_worst = worst_running_thread();
+	MPASS(thr_worst != NULL);
+	c = thr_worst->td_oncpu;
+	if (thr_worst->td_priority < td->td_priority)
+		return;
+	if (thr_worst->td_priority > td->td_priority) {
+		thr_worst->td_flags |= TDF_NEEDRESCHED;
+		if ((thr_worst != curthread) && (c != NOCPU))
+			ipi_cpu(c, IPI_AST);
+		return;
+	}
+	/*
+	 * thr_worst->td_priority == td->td_priority
+	 */
+	if (ts->ts_vdeadline < thr_worst->td_sched->ts_vdeadline) {
+		thr_worst->td_flags |= TDF_NEEDRESCHED;
+		if ((thr_worst != curthread) && (c != NOCPU))
+			ipi_cpu(c, IPI_AST);
+	}
+}
+
+void
+sched_rem(struct thread *td)
+{
+	struct td_sched *ts;
+
+	ts = td->td_sched;
+	KASSERT(td->td_flags & TDF_INMEM,
+	    ("sched_rem: thread swapped out"));
+	KASSERT(TD_ON_RUNQ(td),
+	    ("sched_rem: thread not on run queue"));
+	mtx_assert(&sched_lock, MA_OWNED);
+	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
+	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+	    sched_tdname(curthread));
+	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
+
+	if ((td->td_flags & TDF_NOLOAD) == 0)
+		sched_load_rem();
+	runq_remove(ts->ts_runq, td);
+	TD_SET_CAN_RUN(td);
+}
+
+static struct thread *
+edf_choose(struct rqhead *rqh)
+{
+	struct thread *td;
+	struct thread *td_min;
+	struct td_sched *ts;
+	int deadline_min;
+	int c;
+
+	td_min = NULL;
+	deadline_min = 0;
+	td = TAILQ_FIRST(rqh);
+	MPASS(td != NULL);
+	while (td != NULL) {
+		c = PCPU_GET(cpuid);
+		if (!THREAD_CAN_SCHED(td, c)) {
+			td = TAILQ_NEXT(td, td_runq);
+			continue;
+		}
+		if (td_min == NULL) {
+			td_min = td;
+			deadline_min = td->td_sched->ts_vdeadline;
+		}
+		ts = td->td_sched;
+		if (ts->ts_vdeadline < deadline_min) {
+			deadline_min = ts->ts_vdeadline;
+			td_min = td;
+		}
+		td = TAILQ_NEXT(td, td_runq);
+	}
+	return (td_min);
+}
+
+static struct thread *
+runq_choose_bfs(struct runq *rq)
+{
+	struct rqhead *rqh;
+	struct thread *td;
+	struct rqbits *rqb;
+	int pri;
+	int i;
+
+	rqb = &rq->rq_status;
+	for (i = 0; i < RQB_LEN; i++) {
+		if (rqb->rqb_bits[i] == 0)
+			continue;
+		pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
+		if ((pri == RQ_TIMESHARE) || (pri == RQ_IDLE)) {
+			td = edf_choose(&rq->rq_queues[pri]);
+			return (td);
+		}
+		rqh = &rq->rq_queues[pri];
+		td = TAILQ_FIRST(rqh);
+		KASSERT(td != NULL,
+		    ("runq_choose_bfs: no thread on busy queue"));
+		CTR3(KTR_RUNQ,
+		    "runq_choose_bfs: pri=%d thread=%p rqh=%p", pri, td, rqh);
+		return (td);
+	}
+	CTR0(KTR_RUNQ, "runq_choose_bfs: idlethread");
+
+	return (NULL);
+}
+
+/*
+ * Select threads to run.  Note that running threads still consume a
+ * slot.
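+ * The timeshare and idle queues are searched earliest-virtual-deadline-first
+ * via edf_choose(); all other (higher-priority) queues are served in FIFO
+ * order by runq_choose_bfs().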
+ */
+struct thread *
+sched_choose(void)
+{
+	struct thread *td;
+	struct runq *rq;
+
+	mtx_assert(&sched_lock, MA_OWNED);
+
+	rq = &runq;
+	td = runq_choose_bfs(&runq);
+
+	if (td != NULL) {
+		runq_remove(rq, td);
+		td->td_flags |= TDF_DIDRUN;
+
+		KASSERT(td->td_flags & TDF_INMEM,
+		    ("sched_choose: thread swapped out"));
+		return (td);
+	}
+	return (PCPU_GET(idlethread));
+}
+
+void
+sched_preempt(struct thread *td)
+{
+
+	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
+	thread_lock(td);
+	if (td->td_critnest > 1)
+		td->td_owepreempt = 1;
+	else
+		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
+	thread_unlock(td);
+}
+
+void
+sched_userret(struct thread *td)
+{
+	/*
+	 * XXX we cheat slightly on the locking here to avoid locking in
+	 * the usual case.  Setting td_priority here is essentially an
+	 * incomplete workaround for not setting it properly elsewhere.
+	 * Now that some interrupt handlers are threads, not setting it
+	 * properly elsewhere can clobber it in the window between setting
+	 * it here and returning to user mode, so don't waste time setting
+	 * it perfectly here.
+	 */
+	KASSERT((td->td_flags & TDF_BORROWING) == 0,
+	    ("thread with borrowed priority returning to userland"));
+	if (td->td_priority != td->td_user_pri) {
+		thread_lock(td);
+		td->td_priority = td->td_user_pri;
+		td->td_base_pri = td->td_user_pri;
+		thread_unlock(td);
+	}
+}
+
+void
+sched_bind(struct thread *td, int cpu)
+{
+	struct td_sched *ts;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
+	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
+
+	ts = td->td_sched;
+
+	td->td_flags |= TDF_BOUND;
+#ifdef SMP
+	if (PCPU_GET(cpuid) == cpu)
+		return;
+
+	mi_switch(SW_VOL, NULL);
+#endif
+}
+
+void
+sched_unbind(struct thread *td)
+{
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
+	td->td_flags &= ~TDF_BOUND;
+}
+
+int
+sched_is_bound(struct thread *td)
+{
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	return (td->td_flags & TDF_BOUND);
+}
+
+void
+sched_relinquish(struct thread *td)
+{
+	thread_lock(td);
+	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+	thread_unlock(td);
+}
+
+int
+sched_load(void)
+{
+	return (sched_tdcnt);
+}
+
+int
+sched_sizeof_proc(void)
+{
+	return (sizeof(struct proc));
+}
+
+int
+sched_sizeof_thread(void)
+{
+	return (sizeof(struct thread) + sizeof(struct td_sched));
+}
+
+fixpt_t
+sched_pctcpu(struct thread *td)
+{
+	struct td_sched *ts;
+	int time_passed;
+	int nticks;
+	fixpt_t pct;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
+
+	switch (td->td_state) {
+	case TDS_RUNNING:
+		if (ts->ts_used < 0)
+			panic("Bad ts_used value\n");
+		nticks = ts->ts_used;
+		break;
+	default:
+		time_passed = ticks - ts->ts_cswtick;
+		nticks = imax(ts->ts_used - time_passed, 0);
+	}
+	nticks /= PCT_WINDOW;
+
+	if (nticks > hz)
+		panic("too big nticks value.\n");
+	if (nticks < 0)
+		panic("bad nticks value.\n");
+
+	pct = (FSCALE * ((FSCALE * nticks) / hz)) >> FSHIFT;
+
+	return (pct);
+}
+
+void
+sched_tick(void)
+{
+	struct td_sched *ts;
+
+	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
+	ts = curthread->td_sched;
+	if (ts->ts_incrtick == ticks)
+		return;
+	if (ts->ts_used < (hz * PCT_WINDOW)) {
+		ts->ts_used += 1;
+		ts->ts_incrtick = ticks;
+	}
+}
+
+/*
+ * The actual idle process.
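+ * It calls cpu_idle() until sched_runnable() reports runnable threads and
+ * then switches away with SW_VOL | SWT_IDLE.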
+ */ +void +sched_idletd(void *dummy) +{ + + for (;;) { + mtx_assert(&Giant, MA_NOTOWNED); + + while (sched_runnable() == 0) + cpu_idle(0); + + mtx_lock_spin(&sched_lock); + mi_switch(SW_VOL | SWT_IDLE, NULL); + mtx_unlock_spin(&sched_lock); + } +} + +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + lock_profile_release_lock(&sched_lock.lock_object); + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *td) +{ + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + td->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)td; + lock_profile_obtain_lock_success(&sched_lock.lock_object, + 0, 0, __FILE__, __LINE__); + THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); +} + +char * +sched_tdname(struct thread *td) +{ +#ifdef KTR + struct td_sched *ts; + + ts = td->td_sched; + if (ts->ts_name[0] == '\0') + snprintf(ts->ts_name, sizeof(ts->ts_name), + "%s tid %d", td->td_name, td->td_tid); + return (ts->ts_name); +#else + return (td->td_name); +#endif +} + +void +sched_affinity(struct thread *td) +{ +#ifdef SMP + struct td_sched *ts; + int cpu; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + + /* + * Set the TSF_AFFINITY flag if there is at least one CPU this + * thread can't run on. + */ + ts = td->td_sched; + ts->ts_flags &= ~TSF_AFFINITY; + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) + continue; + if (!THREAD_CAN_SCHED(td, cpu)) { + ts->ts_flags |= TSF_AFFINITY; + break; + } + } + + /* + * If this thread can run on all CPUs, nothing else to do. + */ + if (!(ts->ts_flags & TSF_AFFINITY)) + return; + + /* Pinned threads and bound threads should be left alone. */ + if (td->td_pinned != 0 || td->td_flags & TDF_BOUND) + return; + + switch (td->td_state) { + case TDS_RUNNING: + /* + * See if our current CPU is in the set. If not, force a + * context switch. + */ + if (THREAD_CAN_SCHED(td, td->td_oncpu)) + return; + + td->td_flags |= TDF_NEEDRESCHED; + if (td != curthread) + ipi_cpu(cpu, IPI_AST); + break; + default: + break; + } +#endif +} Index: sys/sys/runq.h =================================================================== --- sys/sys/runq.h (revision 248027) +++ sys/sys/runq.h (working copy) @@ -40,6 +40,13 @@ #define RQ_NQS (64) /* Number of run queues. */ #define RQ_PPQ (4) /* Priorities per queue. */ +#ifdef SCHED_FBFS +#define RQ_IDLE (RQ_NQS - 1) +#define RQ_TIMESHARE (RQ_IDLE - 1) +#define RQ_MIN_REALTIME (PRI_MIN_REALTIME / 4) +#define RQ_MAX_REALTIME (RQ_TIMESHARE - 1) +#endif + /* * Head of run queues. */