commit 43a7473cf143e8830766ed62a22fcee04e3f4426
Author: Andriy Gapon
Date:   Thu Sep 15 12:09:34 2011 +0300

    [wip] re-implement smp rendezvous code

    - create one rendezvous (outgoing) mailbox per cpu, in which that cpu
      places its rendezvous request directed to other cpus
    - create a cpu mask for each cpu, in which other cpus set a bit to
      indicate that they have sent a rendezvous request to the cpu in
      question
    - send an ipi only for the first rv request; piggyback subsequent
      requests if a target cpu is still processing previously received
      requests (see the sketch after this message)
    - many-to-many rv requests can now be sent and there is no locking;
      the only limitation is that a cpu can have only a single outgoing
      request at a time
    - to avoid deadlocks, when a cpu waits for its request to be completed
      by the target cpus, it also checks for and processes incoming
      requests
    - to avoid a deadlock with the cpu stopping logic, cpus also check for
      stop requests while waiting
    - when a rendezvous has a setup and/or teardown action, target CPUs do
      not spin-wait for other target CPUs between executing the steps of
      the rv; instead, sequential execution of the steps is always
      coordinated via the master CPU; target CPUs can process other
      incoming rv requests between steps of such a "complex" rv; this is
      done to avoid a potential deadlock (see the example below)
    - there can be only one cpu asking other cpus to stop; this is
      implemented via a hand-rolled spin mutex analogue; similarly to the
      above, a cpu spinning for this lock also checks for an incoming stop
      request, to avoid deadlocks

    An example of a deadlock that can happen if target/slave CPUs were to
    spin-wait between execution of the steps of a complex rendezvous
    action:
    - processors Ps1 and Ps2 issue complex rendezvous requests R1 and R2
      to processors Pt1 and Pt2
    - because of request timing, Pt1 gets R1 before R2 while Pt2 gets R2
      before R1
    - Pt1 executes the setup action of R1 and waits for Pt2 to do the same
    - Pt2 executes the setup action of R2 and waits for Pt1 to do the same
    - deadlock

    Some implementation details:
    - the master CPU increases the work count of the slave CPUs by the
      total count of non-no-barrier actions in a rendezvous, so that the
      slave CPUs do not leave the rendezvous handling context until all
      actions are done
    - the master CPU processes incoming requests in between posting its
      own actions
    - the master CPU bumps its own work count before posting an action, so
      that other CPUs know that the master CPU can process incoming events
      without being IPI-ed
    - the master CPU processes all incoming events before leaving
      smp_rendezvous, so that it fully completes all incoming complex
      rendezvous without leaving the context
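
    Illustrative sketch (not part of the patch): a minimal userland model
    of the notify/coalesce step described above.  The names NCPU, struct
    cpu_state and notify_cpu are made up for the illustration; the real
    code in the diff below uses per-CPU (DPCPU) data, a cpuset_t sender
    mask and ipi_cpu() instead.

        /* Userland model of the per-CPU work count + sender mask. */
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        #define NCPU 4                     /* hypothetical CPU count */

        struct cpu_state {
                atomic_int   rv_count;     /* outstanding rendezvous work items */
                atomic_ulong rv_senders;   /* mask of CPUs with a pending request */
        };

        static struct cpu_state cpus[NCPU];

        /* Post nhold work items from "from" to "to"; return whether to IPI. */
        static bool
        notify_cpu(int from, int to, int nhold)
        {
                bool send_ipi = false;

                if (nhold != 0) {
                        /* Only the poster that finds the count at zero sends an IPI. */
                        int old = atomic_fetch_add(&cpus[to].rv_count, nhold);
                        send_ipi = (old == 0);
                }
                /* Publish the request bit only after bumping the count. */
                atomic_fetch_or(&cpus[to].rv_senders, 1UL << from);
                return (send_ipi);
        }

        int
        main(void)
        {
                /* CPUs 1 and 2 both target CPU 0; only the first post IPIs. */
                printf("cpu1 -> cpu0: %s\n", notify_cpu(1, 0, 2) ? "ipi" : "coalesced");
                printf("cpu2 -> cpu0: %s\n", notify_cpu(2, 0, 2) ? "ipi" : "coalesced");
                return (0);
        }

    The same fetch-and-add pattern is what the target CPU's work count is
    for: only the sender that moves the count away from zero raises the
    IPI, every other sender just sets its bit and bumps the pending count.
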
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 5dd0c55..d9391bc 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -97,6 +97,10 @@ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0,
     "Topology override setting; 0 is default provided by hardware.");
 TUNABLE_INT("kern.smp.topology", &smp_topology);
 
+unsigned int coalesced_ipi_count;
+SYSCTL_INT(_kern_smp, OID_AUTO, coalesced_ipi_count, CTLFLAG_RD,
+    &coalesced_ipi_count, 0, "Count of coalesced SMP rendezvous IPIs");
+
 #ifdef SMP
 /* Enable forwarding of a signal to a process running on a different CPU */
 static int forward_signal_enabled = 1;
@@ -104,15 +108,23 @@ SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
     &forward_signal_enabled, 0,
     "Forwarding of a signal to a process on a different CPU");
 
+#define CPUSET_FOREACH(cpu, mask)                                      \
+        for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++)                    \
+                if (CPU_ISSET(cpu, &mask))
+
 /* Variables needed for SMP rendezvous. */
-static volatile int smp_rv_ncpus;
-static void (*volatile smp_rv_setup_func)(void *arg);
-static void (*volatile smp_rv_action_func)(void *arg);
-static void (*volatile smp_rv_teardown_func)(void *arg);
-static void *volatile smp_rv_func_arg;
-static volatile int smp_rv_waiters[4];
-
-/*
+struct smp_rendezvous_data {
+        void (*smp_rv_action_func)(void *arg);
+        void *smp_rv_func_arg;
+        volatile int smp_rv_done;
+        int smp_rv_ncpus;
+};
+
+static DPCPU_DEFINE(struct smp_rendezvous_data, smp_rv_data);
+static volatile DPCPU_DEFINE(cpuset_t, smp_rv_senders);
+static volatile DPCPU_DEFINE(int, smp_rv_count);
+
+/*
  * Shared mutex to restrict busywaits between smp_rendezvous() and
  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
  * functions trigger at once and cause multiple CPUs to busywait with
@@ -313,28 +325,16 @@ restart_cpus(cpuset_t map)
  * Note that the supplied external functions _must_ be reentrant and aware
  * that they are running in parallel and in an unknown lock context.
  */
-void
-smp_rendezvous_action(void)
+static __inline void
+smp_rendezvous_action_body(int cpu)
 {
+        struct smp_rendezvous_data *rv;
         struct thread *td;
-        void *local_func_arg;
-        void (*local_setup_func)(void*);
-        void (*local_action_func)(void*);
-        void (*local_teardown_func)(void*);
 #ifdef INVARIANTS
         int owepreempt;
 #endif
 
-        /* Ensure we have up-to-date values. */
-        atomic_add_acq_int(&smp_rv_waiters[0], 1);
-        while (smp_rv_waiters[0] < smp_rv_ncpus)
-                cpu_spinwait();
-
-        /* Fetch rendezvous parameters after acquire barrier. */
-        local_func_arg = smp_rv_func_arg;
-        local_setup_func = smp_rv_setup_func;
-        local_action_func = smp_rv_action_func;
-        local_teardown_func = smp_rv_teardown_func;
+        rv = DPCPU_ID_PTR(cpu, smp_rv_data);
 
         /*
          * Use a nested critical section to prevent any preemptions
@@ -363,50 +363,168 @@ smp_rendezvous_action(void)
 #ifdef INVARIANTS
         owepreempt = td->td_owepreempt;
 #endif
-
-        /*
-         * If requested, run a setup function before the main action
-         * function.  Ensure all CPUs have completed the setup
-         * function before moving on to the action function.
-         */
-        if (local_setup_func != smp_no_rendevous_barrier) {
-                if (smp_rv_setup_func != NULL)
-                        smp_rv_setup_func(smp_rv_func_arg);
-                atomic_add_int(&smp_rv_waiters[1], 1);
-                while (smp_rv_waiters[1] < smp_rv_ncpus)
-                        cpu_spinwait();
+        if (rv->smp_rv_action_func != NULL)
+                rv->smp_rv_action_func(rv->smp_rv_func_arg);
+
+        td->td_critnest--;
+        KASSERT(owepreempt == td->td_owepreempt,
+            ("rendezvous action changed td_owepreempt"));
+
+        atomic_add_int(&rv->smp_rv_done, 1);
+}
+
+static __inline int
+smp_rendezvous_action_pass(void)
+{
+        volatile cpuset_t *senders;
+        int count;
+        int cpu;
+
+        senders = DPCPU_PTR(smp_rv_senders);
+        if (CPU_EMPTY(senders))
+                return (0);
+
+        count = 0;
+
+        CPUSET_FOREACH(cpu, *senders) {
+                count++;
+#define CPU_CLR_ACQ_ATOMIC(n, p)                                        \
+        atomic_clear_long(&(p)->__bits[(n)/_NCPUBITS], __cpuset_mask(n))
+                CPU_CLR_ACQ_ATOMIC(cpu, senders);
+                smp_rendezvous_action_body(cpu);
         }
 
-        if (local_action_func != NULL)
-                local_action_func(local_func_arg);
-
-        if (local_teardown_func != smp_no_rendevous_barrier) {
-                /*
-                 * Signal that the main action has been completed.  If a
-                 * full exit rendezvous is requested, then all CPUs will
-                 * wait here until all CPUs have finished the main action.
-                 */
-                atomic_add_int(&smp_rv_waiters[2], 1);
-                while (smp_rv_waiters[2] < smp_rv_ncpus)
+        return (count);
+}
+
+void
+smp_rendezvous_action(void)
+{
+        int pending;
+        int count;
+
+        pending = atomic_fetchadd_int(DPCPU_PTR(smp_rv_count), 0);
+        while (pending != 0) {
+                KASSERT(pending > 0, ("negative pending rendezvous count"));
+                while ((count = smp_rendezvous_action_pass()) == 0) {
+                        //wasted_spins++;
                         cpu_spinwait();
+                }
+                pending = atomic_fetchadd_int(DPCPU_PTR(smp_rv_count), -count);
+                pending -= count;
+        }
+}
 
-                if (local_teardown_func != NULL)
-                        local_teardown_func(local_func_arg);
+static void
+smp_rendezvous_wait(struct smp_rendezvous_data *rv)
+{
+        int ncpus;
+        int count;
+
+        ncpus = rv->smp_rv_ncpus;
+
+        while (rv->smp_rv_done < ncpus) {
+                /* check for incoming events */
+                if (CPU_ISSET(curcpu, &stopping_cpus))
+                        cpustop_handler();
+
+                count = 0;
+                if (atomic_fetchadd_int(DPCPU_PTR(smp_rv_count), 0) != 0)
+                        count = smp_rendezvous_action_pass();
+                if (count != 0)
+                        atomic_add_int(DPCPU_PTR(smp_rv_count), -count);
+                else
+                        cpu_spinwait();
         }
+}
+
+
+static void
+smp_rendezvous_notify(int cpu, int nhold)
+{
+        int send_ipi;
+        int x;
+
+        if (cpu == curcpu)
+                return;
+
+        KASSERT(!CPU_ISSET(curcpu, DPCPU_ID_PTR(cpu, smp_rv_senders)),
+            ("curcpu bit is set in target cpu's senders map"));
 
         /*
-         * Signal that the rendezvous is fully completed by this CPU.
-         * This means that no member of smp_rv_* pseudo-structure will be
-         * accessed by this target CPU after this point; in particular,
-         * memory pointed by smp_rv_func_arg.
+         * If this is a first action of a rendezvous invocation
+         * and we are the first to send an event, then send an ipi.
          */
-        atomic_add_int(&smp_rv_waiters[3], 1);
+        send_ipi = 0;
+        if (nhold != 0) {
+                x = atomic_fetchadd_int(DPCPU_ID_PTR(cpu, smp_rv_count), nhold);
+                send_ipi = (x == 0);
+                if (!send_ipi)
+                        coalesced_ipi_count++;
+        }
+#define CPU_SET_REL_ATOMIC(n, p)                                        \
+        atomic_set_rel_long(&(p)->__bits[(n)/_NCPUBITS], __cpuset_mask(n))
+        CPU_SET_REL_ATOMIC(curcpu, DPCPU_ID_PTR(cpu, smp_rv_senders));
+        if (send_ipi)
+                ipi_cpu(cpu, IPI_RENDEZVOUS);
+}
 
-        td->td_critnest--;
-        KASSERT(owepreempt == td->td_owepreempt,
-            ("rendezvous action changed td_owepreempt"));
+static void
+smp_rendezvous_cpus_oneaction(cpuset_t map,
+    struct smp_rendezvous_data *rv,
+    int nhold,
+    void (*action_func)(void *),
+    void *arg)
+{
+        int ncpus;
+        int cpu;
+
+
+        rv->smp_rv_func_arg = arg;
+        rv->smp_rv_action_func = action_func;
+        rv->smp_rv_done = 0;
+        ncpus = 0;
+
+        CPUSET_FOREACH(cpu, map) {
+                ncpus++;
+                if (cpu != curcpu)
+                        smp_rendezvous_notify(cpu, nhold);
+        }
+        rv->smp_rv_ncpus = ncpus;
+
+        /* Check if the current CPU is in the map */
+        if (CPU_ISSET(curcpu, &map))
+                smp_rendezvous_action_body(curcpu);
 }
 
+/*
+ * Execute the action_func on the targeted CPUs.
+ *
+ * setup_func:
+ * - if a function pointer is given, then first execute the function;
+ *   only after the function is executed on all targeted CPUs can they
+ *   proceed to the next step;
+ * - if NULL is given, this is equivalent to specifying a pointer to an
+ *   empty function; as such there is no actual setup function, but all
+ *   targeted CPUs proceed to the next step at about the same time;
+ * - smp_no_rendevous_barrier is a special value that signifies that there
+ *   is no setup function, nor should the targeted CPUs wait for anything
+ *   before proceeding to the next step.
+ *
+ * action_func:
+ * - a function to be executed on the targeted CPUs;
+ *   NULL is equivalent to specifying a pointer to an empty function.
+ *
+ * teardown_func:
+ * - if a function pointer is given, then first wait for all targeted CPUs
+ *   to complete execution of action_func, then execute this function;
+ * - if NULL is given, this is equivalent to specifying a pointer to an
+ *   empty function; as such there is no actual teardown action, but all
+ *   targeted CPUs wait for each other to complete execution of action_func;
+ * - smp_no_rendevous_barrier is a special value that signifies that there
+ *   is no teardown function, nor should the targeted CPUs wait for anything
+ *   after completing action_func.
+ */
 void
 smp_rendezvous_cpus(cpuset_t map,
         void (* setup_func)(void *),
@@ -414,7 +532,8 @@ smp_rendezvous_cpus(cpuset_t map,
         void (* teardown_func)(void *),
         void *arg)
 {
-        int curcpumap, i, ncpus = 0;
+        struct smp_rendezvous_data *rv;
+        int nhold;
 
         /* Look comments in the !SMP case. */
         if (!smp_started) {
@@ -429,48 +548,39 @@ smp_rendezvous_cpus(cpuset_t map,
                 return;
         }
 
-        CPU_FOREACH(i) {
-                if (CPU_ISSET(i, &map))
-                        ncpus++;
+        nhold = 1;
+        if (setup_func != smp_no_rendevous_barrier)
+                nhold++;
+        if (teardown_func != smp_no_rendevous_barrier)
+                nhold++;
+
+        spinlock_enter();
+        rv = DPCPU_PTR(smp_rv_data);
+
+        /* Let other CPUs know that we are here, no need to IPI us. */
+        atomic_add_int(DPCPU_PTR(smp_rv_count), 1);
+
+        if (setup_func != smp_no_rendevous_barrier) {
+                smp_rendezvous_cpus_oneaction(map, rv, nhold, setup_func, arg);
+                smp_rendezvous_wait(rv);
+                nhold = 0;
         }
-        if (ncpus == 0)
-                panic("ncpus is 0 with non-zero map");
-
-        mtx_lock_spin(&smp_ipi_mtx);
-
-        /* Pass rendezvous parameters via global variables. */
-        smp_rv_ncpus = ncpus;
-        smp_rv_setup_func = setup_func;
-        smp_rv_action_func = action_func;
-        smp_rv_teardown_func = teardown_func;
-        smp_rv_func_arg = arg;
-        smp_rv_waiters[1] = 0;
-        smp_rv_waiters[2] = 0;
-        smp_rv_waiters[3] = 0;
-        atomic_store_rel_int(&smp_rv_waiters[0], 0);
 
-        /*
-         * Signal other processors, which will enter the IPI with
-         * interrupts off.
-         */
-        curcpumap = CPU_ISSET(curcpu, &map);
-        CPU_CLR(curcpu, &map);
-        ipi_selected(map, IPI_RENDEZVOUS);
+        smp_rendezvous_cpus_oneaction(map, rv, nhold, action_func, arg);
 
-        /* Check if the current CPU is in the map */
-        if (curcpumap != 0)
-                smp_rendezvous_action();
+        if (teardown_func != smp_no_rendevous_barrier) {
+                smp_rendezvous_wait(rv);
+                smp_rendezvous_cpus_oneaction(map, rv, 0, teardown_func, arg);
+        }
+        smp_rendezvous_wait(rv);
 
-        /*
-         * Ensure that the master CPU waits for all the other
-         * CPUs to finish the rendezvous, so that smp_rv_*
-         * pseudo-structure and the arg are guaranteed to not
-         * be in use.
-         */
-        while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
-                cpu_spinwait();
+        /* We are done with our work. */
+        atomic_add_int(DPCPU_PTR(smp_rv_count), -1);
 
-        mtx_unlock_spin(&smp_ipi_mtx);
+        /* Process all pending incoming actions. */
+        smp_rendezvous_action();
+
+        spinlock_exit();
 }
 
 void
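
For illustration only (not part of the patch): a hedged sketch of a caller
relying on the setup/action/teardown contract documented in the comment
block added above.  The function, callback and counter names are made up;
smp_rendezvous_cpus(), smp_no_rendevous_barrier and atomic_add_int() are
the existing kernel interfaces, used as documented.

static void
count_action(void *arg)
{

        /* Runs on every CPU in the map, possibly concurrently. */
        atomic_add_int((volatile u_int *)arg, 1);
}

static void
count_cpus_in_map(cpuset_t map)
{
        static u_int counter;

        counter = 0;
        /*
         * No setup step and no post-action barrier are requested, so both
         * are short-circuited with smp_no_rendevous_barrier; passing NULL
         * as the teardown would instead make the target CPUs wait for each
         * other to finish count_action before leaving.
         */
        smp_rendezvous_cpus(map, smp_no_rendevous_barrier, count_action,
            smp_no_rendevous_barrier, &counter);
        printf("%u CPUs ran the action\n", counter);
}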