commit 43a7473cf143e8830766ed62a22fcee04e3f4426
Author: Andriy Gapon
Date:   Thu Sep 15 12:09:34 2011 +0300

    [wip] re-implement smp rendezvous code

    - create one rendezvous (outgoing) mailbox per cpu, in which that cpu
      places its rendezvous request directed to other cpus
    - create a cpu mask for each cpu, in which other cpus set a bit to
      indicate that they have sent a rendezvous request to the cpu in
      question
    - send an ipi only for the first rv request; piggyback subsequent
      requests if a target cpu is still processing previously received
      requests (see the sketch after this message)
    - many-to-many rv requests can now be sent and there is no locking;
      the only limitation is that a cpu can have only a single outgoing
      request at a time
    - to avoid deadlocks, when a cpu waits for its request to be completed
      by the target cpus, it also checks for and processes incoming
      requests
    - to avoid a deadlock with the cpu stopping logic, cpus also check for
      stop requests while waiting
    - when a rendezvous has a setup and/or teardown action, target CPUs do
      not spin-wait for other target CPUs between executing the steps of
      the rv; instead, sequential execution of the steps is always
      coordinated via the master CPU; target CPUs can process other
      incoming rv requests between steps of such a "complex" rv; this is
      done to avoid a potential deadlock (see the example below)
    - there can be only one cpu asking other cpus to stop; this is
      implemented via a hand-rolled spin mutex analogue; similarly to the
      above, a cpu spinning for this lock also checks for an incoming stop
      request, to avoid deadlocks

    An example of a deadlock that can happen if target/slave CPUs were to
    spin-wait between execution of the steps of a complex rendezvous
    action:
    - processors Ps1 and Ps2 issue complex rendezvous requests R1 and R2
      to processors Pt1 and Pt2
    - because of request timing, Pt1 gets R1 before R2 while Pt2 gets R2
      before R1
    - Pt1 executes the setup action of R1 and waits for Pt2 to do the same
    - Pt2 executes the setup action of R2 and waits for Pt1 to do the same
    - deadlock

    Some implementation details:
    - the master CPU increases the work count of the slave CPUs by the
      total count of non-no-barrier actions in a rendezvous, so that the
      slave CPUs do not leave the rendezvous handling context until all
      actions are done
    - the master CPU processes incoming requests in between posting its
      own actions
    - the master CPU bumps its own work count before posting an action, so
      that other CPUs know that the master CPU can process incoming events
      without being IPI-ed
    - the master CPU processes all incoming events before leaving
      smp_rendezvous, so that it fully completes all incoming complex
      rendezvous without leaving the context
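
    Illustrative sketch (not part of the patch): a minimal userland model
    of the notify/coalesce step described above.  The names NCPU, struct
    cpu_state and notify_cpu are made up for the illustration; the real
    code in the diff below uses per-CPU (DPCPU) data, a cpuset_t sender
    mask and ipi_cpu() instead.

        /* Userland model of the per-CPU work count + sender mask. */
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        #define NCPU 4                     /* hypothetical CPU count */

        struct cpu_state {
                atomic_int   rv_count;     /* outstanding rendezvous work items */
                atomic_ulong rv_senders;   /* mask of CPUs with a pending request */
        };

        static struct cpu_state cpus[NCPU];

        /* Post nhold work items from "from" to "to"; return whether to IPI. */
        static bool
        notify_cpu(int from, int to, int nhold)
        {
                bool send_ipi = false;

                if (nhold != 0) {
                        /* Only the poster that finds the count at zero sends an IPI. */
                        int old = atomic_fetch_add(&cpus[to].rv_count, nhold);
                        send_ipi = (old == 0);
                }
                /* Publish the request bit only after bumping the count. */
                atomic_fetch_or(&cpus[to].rv_senders, 1UL << from);
                return (send_ipi);
        }

        int
        main(void)
        {
                /* CPUs 1 and 2 both target CPU 0; only the first post IPIs. */
                printf("cpu1 -> cpu0: %s\n", notify_cpu(1, 0, 2) ? "ipi" : "coalesced");
                printf("cpu2 -> cpu0: %s\n", notify_cpu(2, 0, 2) ? "ipi" : "coalesced");
                return (0);
        }

    The same fetch-and-add pattern is what the target CPU's work count is
    for: only the sender that moves the count away from zero raises the
    IPI, every other sender just sets its bit and bumps the pending count.
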
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 5dd0c55..d9391bc 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -97,6 +97,10 @@ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0,
     "Topology override setting; 0 is default provided by hardware.");
 TUNABLE_INT("kern.smp.topology", &smp_topology);
 
+unsigned int coalesced_ipi_count;
+SYSCTL_INT(_kern_smp, OID_AUTO, coalesced_ipi_count, CTLFLAG_RD,
+    &coalesced_ipi_count, 0, "Count of coalesced SMP rendezvous IPIs");
+
 #ifdef SMP
 /* Enable forwarding of a signal to a process running on a different CPU */
 static int forward_signal_enabled = 1;
@@ -104,15 +108,23 @@ SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
     &forward_signal_enabled, 0,
     "Forwarding of a signal to a process on a different CPU");
 
+#define CPUSET_FOREACH(cpu, mask)                                      \
+        for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++)                    \
+                if (CPU_ISSET(cpu, &mask))
+
 /* Variables needed for SMP rendezvous. */
-static volatile int smp_rv_ncpus;
-static void (*volatile smp_rv_setup_func)(void *arg);
-static void (*volatile smp_rv_action_func)(void *arg);
-static void (*volatile smp_rv_teardown_func)(void *arg);
-static void *volatile smp_rv_func_arg;
-static volatile int smp_rv_waiters[4];
-
-/*
+struct smp_rendezvous_data {
+        void (*smp_rv_action_func)(void *arg);
+        void *smp_rv_func_arg;
+        volatile int smp_rv_done;
+        int smp_rv_ncpus;
+};
+
+static DPCPU_DEFINE(struct smp_rendezvous_data, smp_rv_data);
+static volatile DPCPU_DEFINE(cpuset_t, smp_rv_senders);
+static volatile DPCPU_DEFINE(int, smp_rv_count);
+
+/*
  * Shared mutex to restrict busywaits between smp_rendezvous() and
  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
  * functions trigger at once and cause multiple CPUs to busywait with
@@ -313,28 +325,16 @@ restart_cpus(cpuset_t map)
  * Note that the supplied external functions _must_ be reentrant and aware
  * that they are running in parallel and in an unknown lock context.
  */
-void
-smp_rendezvous_action(void)
+static __inline void
+smp_rendezvous_action_body(int cpu)
 {
+        struct smp_rendezvous_data *rv;
         struct thread *td;
-        void *local_func_arg;
-        void (*local_setup_func)(void*);
-        void (*local_action_func)(void*);
-        void (*local_teardown_func)(void*);
 #ifdef INVARIANTS
         int owepreempt;
 #endif
 
-        /* Ensure we have up-to-date values. */
-        atomic_add_acq_int(&smp_rv_waiters[0], 1);
-        while (smp_rv_waiters[0] < smp_rv_ncpus)
-                cpu_spinwait();
-
-        /* Fetch rendezvous parameters after acquire barrier. */
-        local_func_arg = smp_rv_func_arg;
-        local_setup_func = smp_rv_setup_func;
-        local_action_func = smp_rv_action_func;
-        local_teardown_func = smp_rv_teardown_func;
+        rv = DPCPU_ID_PTR(cpu, smp_rv_data);
 
         /*
          * Use a nested critical section to prevent any preemptions
@@ -363,50 +363,168 @@ smp_rendezvous_action(void)
 #ifdef INVARIANTS
         owepreempt = td->td_owepreempt;
 #endif
-
-        /*
-         * If requested, run a setup function before the main action
-         * function.  Ensure all CPUs have completed the setup
-         * function before moving on to the action function.
-         */
-        if (local_setup_func != smp_no_rendevous_barrier) {
-                if (smp_rv_setup_func != NULL)
-                        smp_rv_setup_func(smp_rv_func_arg);
-                atomic_add_int(&smp_rv_waiters[1], 1);
-                while (smp_rv_waiters[1] < smp_rv_ncpus)
-                        cpu_spinwait();
+        if (rv->smp_rv_action_func != NULL)
+                rv->smp_rv_action_func(rv->smp_rv_func_arg);
+
+        td->td_critnest--;
+        KASSERT(owepreempt == td->td_owepreempt,
+            ("rendezvous action changed td_owepreempt"));
+
+        atomic_add_int(&rv->smp_rv_done, 1);
+}
+
+static __inline int
+smp_rendezvous_action_pass(void)
+{
+        volatile cpuset_t *senders;
+        int count;
+        int cpu;
+
+        senders = DPCPU_PTR(smp_rv_senders);
+        if (CPU_EMPTY(senders))
+                return (0);
+
+        count = 0;
+
+        CPUSET_FOREACH(cpu, *senders) {
+                count++;
+#define CPU_CLR_ACQ_ATOMIC(n, p)                                        \
+        atomic_clear_long(&(p)->__bits[(n)/_NCPUBITS], __cpuset_mask(n))
+                CPU_CLR_ACQ_ATOMIC(cpu, senders);
+                smp_rendezvous_action_body(cpu);
         }
 
-        if (local_action_func != NULL)
-                local_action_func(local_func_arg);
-
-        if (local_teardown_func != smp_no_rendevous_barrier) {
-                /*
-                 * Signal that the main action has been completed.  If a
-                 * full exit rendezvous is requested, then all CPUs will
-                 * wait here until all CPUs have finished the main action.
-                 */
-                atomic_add_int(&smp_rv_waiters[2], 1);
-                while (smp_rv_waiters[2] < smp_rv_ncpus)
+        return (count);
+}
+
+void
+smp_rendezvous_action(void)
+{
+        int pending;
+        int count;
+
+        pending = atomic_fetchadd_int(DPCPU_PTR(smp_rv_count), 0);
+        while (pending != 0) {
+                KASSERT(pending > 0, ("negative pending rendezvous count"));
+                while ((count = smp_rendezvous_action_pass()) == 0) {
+                        //wasted_spins++;
                         cpu_spinwait();
+                }
+                pending = atomic_fetchadd_int(DPCPU_PTR(smp_rv_count), -count);
+                pending -= count;
+        }
+}
 
-                if (local_teardown_func != NULL)
-                        local_teardown_func(local_func_arg);
+static void
+smp_rendezvous_wait(struct smp_rendezvous_data *rv)
+{
+        int ncpus;
+        int count;
+
+        ncpus = rv->smp_rv_ncpus;
+
+        while (rv->smp_rv_done < ncpus) {
+                /* check for incoming events */
+                if (CPU_ISSET(curcpu, &stopping_cpus))
+                        cpustop_handler();
+
+                count = 0;
+                if (atomic_fetchadd_int(DPCPU_PTR(smp_rv_count), 0) != 0)
+                        count = smp_rendezvous_action_pass();
+                if (count != 0)
+                        atomic_add_int(DPCPU_PTR(smp_rv_count), -count);
+                else
+                        cpu_spinwait();
         }
+}
+
+
+static void
+smp_rendezvous_notify(int cpu, int nhold)
+{
+        int send_ipi;
+        int x;
+
+        if (cpu == curcpu)
+                return;
+
+        KASSERT(!CPU_ISSET(curcpu, DPCPU_ID_PTR(cpu, smp_rv_senders)),
+            ("curcpu bit is set in target cpu's senders map"));
 
         /*
-         * Signal that the rendezvous is fully completed by this CPU.
-         * This means that no member of smp_rv_* pseudo-structure will be
-         * accessed by this target CPU after this point; in particular,
-         * memory pointed by smp_rv_func_arg.
+         * If this is a first action of a rendezvous invocation
+         * and we are the first to send an event, then send an ipi.
          */
-        atomic_add_int(&smp_rv_waiters[3], 1);
+        send_ipi = 0;
+        if (nhold != 0) {
+                x = atomic_fetchadd_int(DPCPU_ID_PTR(cpu, smp_rv_count), nhold);
+                send_ipi = (x == 0);
+                if (!send_ipi)
+                        coalesced_ipi_count++;
+        }
+#define CPU_SET_REL_ATOMIC(n, p)                                        \
+        atomic_set_rel_long(&(p)->__bits[(n)/_NCPUBITS], __cpuset_mask(n))
+        CPU_SET_REL_ATOMIC(curcpu, DPCPU_ID_PTR(cpu, smp_rv_senders));
+        if (send_ipi)
+                ipi_cpu(cpu, IPI_RENDEZVOUS);
+}
 
-        td->td_critnest--;
-        KASSERT(owepreempt == td->td_owepreempt,
-            ("rendezvous action changed td_owepreempt"));
+static void
+smp_rendezvous_cpus_oneaction(cpuset_t map,
+    struct smp_rendezvous_data *rv,
+    int nhold,
+    void (*action_func)(void *),
+    void *arg)
+{
+        int ncpus;
+        int cpu;
+
+
+        rv->smp_rv_func_arg = arg;
+        rv->smp_rv_action_func = action_func;
+        rv->smp_rv_done = 0;
+        ncpus = 0;
+
+        CPUSET_FOREACH(cpu, map) {
+                ncpus++;
+                if (cpu != curcpu)
+                        smp_rendezvous_notify(cpu, nhold);
+        }
+        rv->smp_rv_ncpus = ncpus;
+
+        /* Check if the current CPU is in the map */
+        if (CPU_ISSET(curcpu, &map))
+                smp_rendezvous_action_body(curcpu);
 }
 
+/*
+ * Execute the action_func on the targeted CPUs.
+ *
+ * setup_func:
+ * - if a function pointer is given, then first execute the function;
+ *   only after the function is executed on all targeted CPUs can they
+ *   proceed to the next step;
+ * - if NULL is given, this is equivalent to specifying a pointer to an
+ *   empty function; as such there is no actual setup function, but all
+ *   targeted CPUs proceed to the next step at about the same time;
+ * - smp_no_rendevous_barrier is a special value that signifies that there
+ *   is no setup function, nor should the targeted CPUs wait for anything
+ *   before proceeding to the next step.
+ *
+ * action_func:
+ * - a function to be executed on the targeted CPUs;
+ *   NULL is equivalent to specifying a pointer to an empty function.
+ *
+ * teardown_func:
+ * - if a function pointer is given, then first wait for all targeted CPUs
+ *   to complete execution of action_func, then execute this function;
+ * - if NULL is given, this is equivalent to specifying a pointer to an
+ *   empty function; as such there is no actual teardown action, but all
+ *   targeted CPUs wait for each other to complete execution of action_func;
+ * - smp_no_rendevous_barrier is a special value that signifies that there
+ *   is no teardown function, nor should the targeted CPUs wait for anything
+ *   after completing action_func.
+ */
 void
 smp_rendezvous_cpus(cpuset_t map,
         void (* setup_func)(void *),
@@ -414,7 +532,8 @@ smp_rendezvous_cpus(cpuset_t map,
         void (* teardown_func)(void *),
         void *arg)
 {
-        int curcpumap, i, ncpus = 0;
+        struct smp_rendezvous_data *rv;
+        int nhold;
 
         /* Look comments in the !SMP case. */
         if (!smp_started) {
@@ -429,48 +548,39 @@ smp_rendezvous_cpus(cpuset_t map,
                 return;
         }
 
-        CPU_FOREACH(i) {
-                if (CPU_ISSET(i, &map))
-                        ncpus++;
+        nhold = 1;
+        if (setup_func != smp_no_rendevous_barrier)
+                nhold++;
+        if (teardown_func != smp_no_rendevous_barrier)
+                nhold++;
+
+        spinlock_enter();
+        rv = DPCPU_PTR(smp_rv_data);
+
+        /* Let other CPUs know that we are here, no need to IPI us. */
+        atomic_add_int(DPCPU_PTR(smp_rv_count), 1);
+
+        if (setup_func != smp_no_rendevous_barrier) {
+                smp_rendezvous_cpus_oneaction(map, rv, nhold, setup_func, arg);
+                smp_rendezvous_wait(rv);
+                nhold = 0;
         }
-        if (ncpus == 0)
-                panic("ncpus is 0 with non-zero map");
-
-        mtx_lock_spin(&smp_ipi_mtx);
-
-        /* Pass rendezvous parameters via global variables. */
-        smp_rv_ncpus = ncpus;
-        smp_rv_setup_func = setup_func;
-        smp_rv_action_func = action_func;
-        smp_rv_teardown_func = teardown_func;
-        smp_rv_func_arg = arg;
-        smp_rv_waiters[1] = 0;
-        smp_rv_waiters[2] = 0;
-        smp_rv_waiters[3] = 0;
-        atomic_store_rel_int(&smp_rv_waiters[0], 0);
 
-        /*
-         * Signal other processors, which will enter the IPI with
-         * interrupts off.
-         */
-        curcpumap = CPU_ISSET(curcpu, &map);
-        CPU_CLR(curcpu, &map);
-        ipi_selected(map, IPI_RENDEZVOUS);
+        smp_rendezvous_cpus_oneaction(map, rv, nhold, action_func, arg);
 
-        /* Check if the current CPU is in the map */
-        if (curcpumap != 0)
-                smp_rendezvous_action();
+        if (teardown_func != smp_no_rendevous_barrier) {
+                smp_rendezvous_wait(rv);
+                smp_rendezvous_cpus_oneaction(map, rv, 0, teardown_func, arg);
+        }
+        smp_rendezvous_wait(rv);
 
-        /*
-         * Ensure that the master CPU waits for all the other
-         * CPUs to finish the rendezvous, so that smp_rv_*
-         * pseudo-structure and the arg are guaranteed to not
-         * be in use.
-         */
-        while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
-                cpu_spinwait();
+        /* We are done with our work. */
+        atomic_add_int(DPCPU_PTR(smp_rv_count), -1);
 
-        mtx_unlock_spin(&smp_ipi_mtx);
+        /* Process all pending incoming actions. */
+        smp_rendezvous_action();
+
+        spinlock_exit();
 }
 
 void
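
For illustration only (not part of the patch): a hedged sketch of a caller
relying on the setup/action/teardown contract documented in the comment
block added above.  The function, callback and counter names are made up;
smp_rendezvous_cpus(), smp_no_rendevous_barrier and atomic_add_int() are
the existing kernel interfaces, used as documented.

static void
count_action(void *arg)
{

        /* Runs on every CPU in the map, possibly concurrently. */
        atomic_add_int((volatile u_int *)arg, 1);
}

static void
count_cpus_in_map(cpuset_t map)
{
        static u_int counter;

        counter = 0;
        /*
         * No setup step and no post-action barrier are requested, so both
         * are short-circuited with smp_no_rendevous_barrier; passing NULL
         * as the teardown would instead make the target CPUs wait for each
         * other to finish count_action before leaving.
         */
        smp_rendezvous_cpus(map, smp_no_rendevous_barrier, count_action,
            smp_no_rendevous_barrier, &counter);
        printf("%u CPUs ran the action\n", counter);
}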