Index: amd64/include/asmacros.h
===================================================================
--- amd64/include/asmacros.h   (revision 225462)
+++ amd64/include/asmacros.h   (working copy)
@@ -192,6 +192,22 @@
 1:      addq    $TF_RIP,%rsp    /* skip over tf_err, tf_trapno */
 
 /*
+ * Macros to manage %gs on interrupt entry/exit for cheaper IPI
+ * handlers that don't push a full trap frame.
+ */
+#define IPI_START \
+        testb   $SEL_RPL_MASK,0x8(%rsp) ; /* come from kernel? */      \
+        jz      1f ;            /* Yes, don't swapgs again */          \
+        swapgs ;                                                       \
+1: ;
+
+#define IPI_END \
+        testb   $SEL_RPL_MASK,0x8(%rsp) ; /* come from kernel? */      \
+        jz      1f ;            /* keep kernel GS.base */              \
+        swapgs ;                                                       \
+1: ;
+
+/*
  * Access per-CPU data.
  */
 #define PCPU(member)    %gs:PC_ ## member
Index: amd64/include/pcpu.h
===================================================================
--- amd64/include/pcpu.h        (revision 225462)
+++ amd64/include/pcpu.h        (working copy)
@@ -51,6 +51,8 @@
 #define PCPU_XEN_FIELDS
 #endif
 
+struct tlb_group;
+
 /*
  * The SMP parts are setup in pmap.c and locore.s for the BSP, and
  * mp_machdep.c sets up the data for the AP's to "see" when they awake.
@@ -76,6 +78,8 @@
        struct system_segment_descriptor *pc_ldt;                       \
        /* Pointer to the CPU TSS descriptor */                         \
        struct system_segment_descriptor *pc_tss;                       \
+       struct tlb_group *pc_tlb_group;                                 \
+       volatile int *pc_tlb_wait;                                      \
        u_int   pc_cmci_mask            /* MCx banks for CMCI */        \
        PCPU_XEN_FIELDS
Index: amd64/amd64/genassym.c
===================================================================
--- amd64/amd64/genassym.c      (revision 225462)
+++ amd64/amd64/genassym.c      (working copy)
@@ -221,6 +221,7 @@
 ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+ASSYM(PC_TLB_WAIT, offsetof(struct pcpu, pc_tlb_wait));
 
 ASSYM(LA_VER, offsetof(struct LAPIC, version));
 ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
Index: amd64/amd64/mp_machdep.c
===================================================================
--- amd64/amd64/mp_machdep.c    (revision 225462)
+++ amd64/amd64/mp_machdep.c    (working copy)
@@ -108,9 +108,18 @@
 struct pcb **susppcbs = NULL;
 
 /* Variables needed for SMP tlb shootdown. */
+struct tlb_group {
+       volatile int    tg_wait;
+       int             tg_package_id;
+       int             tg_count;
+       STAILQ_ENTRY(tlb_group) tg_link;
+} __aligned(CACHE_LINE_SIZE);
+
+static MALLOC_DEFINE(M_TLB_GROUP, "tlb_group", "TLB group");
+static STAILQ_HEAD(, tlb_group) smp_tlb_groups;
 vm_offset_t smp_tlb_addr1;
 vm_offset_t smp_tlb_addr2;
-volatile int smp_tlb_wait;
+volatile int smp_tlb_wait __aligned(CACHE_LINE_SIZE);
 
 extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
 
@@ -145,12 +154,13 @@
 static u_int boot_address;
 static int cpu_logical;                 /* logical cpus per core */
 static int cpu_cores;                   /* cores per package */
+static int package_id_bits;             /* APIC ID bits below the package ID */
 
 static void    assign_cpu_ids(void);
 static void    set_interrupt_apic_ids(void);
 static int     start_all_aps(void);
 static int     start_ap(int apic_id);
-static void    release_aps(void *dummy);
+static void    tlb_groups_init(void);
 
 static int     hlt_logical_cpus;
 static u_int   hyperthreading_cpus;     /* logical cpus sharing L1 cache */
@@ -197,6 +207,7 @@
                        continue;
                cpu_cores++;
        }
+       package_id_bits = core_id_bits;
 }
 
 /*
@@ -215,7 +226,6 @@
 topo_probe_0x4(void)
 {
        u_int p[4];
-       int pkg_id_bits;
        int core_id_bits;
        int max_cores;
        int max_logical;
@@ -242,14 +252,14 @@
        core_id_bits = mask_width(max_logical/max_cores);
        if (core_id_bits < 0)
                return;
-       pkg_id_bits = core_id_bits + mask_width(max_cores);
+       package_id_bits = core_id_bits + mask_width(max_cores);
        for (id = 0; id <= MAX_APIC_ID; id++) {
                /* Check logical CPU availability. */
                if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
                        continue;
                /* Check if logical CPU has the same package ID. */
-               if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
+               if ((id >> package_id_bits) != (boot_cpu_id >> package_id_bits))
                        continue;
                cpu_cores++;
                /* Check if logical CPU has the same package and core IDs. */
@@ -288,7 +298,7 @@
                bits = p[0] & 0x1f;
                logical = p[1] &= 0xffff;
                type = (p[2] >> 8) & 0xff;
-               if (type == 0 || logical == 0)
+               if (type == CPUID_TYPE_INVAL || logical == 0)
                        break;
                /*
                 * Because of uniformity assumption we examine only
@@ -304,8 +314,10 @@
                }
                if (type == CPUID_TYPE_SMT)
                        cpu_logical = cnt;
-               else if (type == CPUID_TYPE_CORE)
+               else if (type == CPUID_TYPE_CORE) {
                        cpu_cores = cnt;
+                       package_id_bits = bits;
+               }
        }
        if (cpu_logical == 0)
                cpu_logical = 1;
@@ -367,7 +379,7 @@
 
        /*
         * Determine whether any threading flags are
-        * necessry.
+        * necessary.
         */
        topo_probe();
        if (cpu_logical > 1 && hyperthreading_cpus)
@@ -551,6 +563,7 @@
 
        start_all_aps();
 
        set_interrupt_apic_ids();
+       tlb_groups_init();
 }
 
@@ -1045,24 +1058,108 @@
 }
 
 /*
+ * To minimize cache traffic during TLB shootdown processing we place
+ * CPUs into groups.  Each CPU in a group decrements a per-group counter
+ * until it drops to zero.  When the last CPU from a group finishes its
+ * handler it decrements a global count of groups stored in smp_tlb_wait.
+ *
+ * In a multi-core system we place all CPUs in a package within a single
+ * group.  For other systems we simply use a single TLB group containing
+ * all CPUs.
+ */
+static void
+tlb_groups_init(void)
+{
+       struct tlb_group *tg;
+       struct pcpu *pc;
+       int cpu, package_id;
+
+       STAILQ_INIT(&smp_tlb_groups);
+       CPU_FOREACH(cpu) {
+               pc = pcpu_find(cpu);
+               if (package_id_bits == 0)
+                       package_id = 0;
+               else
+                       package_id = pc->pc_apic_id >> package_id_bits;
+               STAILQ_FOREACH(tg, &smp_tlb_groups, tg_link) {
+                       if (tg->tg_package_id == package_id)
+                               break;
+               }
+               if (tg == NULL) {
+                       tg = malloc(sizeof(*tg), M_TLB_GROUP, M_WAITOK);
+                       tg->tg_wait = 0;
+                       tg->tg_package_id = package_id;
+                       tg->tg_count = 0;
+                       STAILQ_INSERT_TAIL(&smp_tlb_groups, tg, tg_link);
+                       /* XXX */
+                       printf("TLB: Creating group for package %d\n",
+                           package_id);
+               }
+               pc->pc_tlb_group = tg;
+               pc->pc_tlb_wait = &tg->tg_wait;
+               tg->tg_count++;
+               /* XXX */
+               printf("TLB: Adding CPU %d (APIC ID %d) to TLB group %d\n",
+                   pc->pc_cpuid, pc->pc_apic_id, package_id);
+       }
+}
+
+static __inline void
+tlb_groups_populate(cpumask_t mask)
+{
+       struct tlb_group *tg;
+       struct pcpu *pc;
+       int cpu;
+
+       KASSERT(smp_tlb_wait == 0, ("non-zero global TLB wait count"));
+#ifdef INVARIANTS
+       STAILQ_FOREACH(tg, &smp_tlb_groups, tg_link) {
+               KASSERT(tg->tg_wait == 0, ("non-zero TLB wait for group %d",
+                   tg->tg_package_id));
+       }
+#endif
+
+       if (mask == -1) {
+               STAILQ_FOREACH(tg, &smp_tlb_groups, tg_link) {
+                       tg->tg_wait = tg->tg_count;
+                       smp_tlb_wait++;
+               }
+               tg = PCPU_GET(tlb_group);
+               tg->tg_wait--;
+               if (tg->tg_wait == 0)
+                       smp_tlb_wait--;
+               return;
+       }
+
+       while (mask != 0) {
+               cpu = ffs(mask) - 1;
+               mask &= ~(1 << cpu);
+               if (cpu == PCPU_GET(cpuid))
+                       continue;
+               pc = pcpu_find(cpu);
+               pc->pc_tlb_group->tg_wait++;
+               if (pc->pc_tlb_group->tg_wait == 1)
+                       smp_tlb_wait++;
+       }
+}
+
+/*
  * Flush the TLB on all other CPU's
  */
 static void
 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 {
-       u_int ncpu;
 
-       ncpu = mp_ncpus - 1;    /* does not shootdown self */
-       if (ncpu < 1)
+       if (mp_ncpus == 1)
                return;         /* no other cpus */
        if (!(read_rflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        smp_tlb_addr1 = addr1;
        smp_tlb_addr2 = addr2;
-       atomic_store_rel_int(&smp_tlb_wait, 0);
+       tlb_groups_populate(-1);
        ipi_all_but_self(vector);
-       while (smp_tlb_wait < ncpu)
+       while (smp_tlb_wait != 0)
                ia32_pause();
        mtx_unlock_spin(&smp_ipi_mtx);
 }
@@ -1070,39 +1167,26 @@
 static void
 smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 {
-       int ncpu, othercpus;
 
-       othercpus = mp_ncpus - 1;
        if (mask == (cpumask_t)-1) {
-               ncpu = othercpus;
-               if (ncpu < 1)
+               if (mp_ncpus == 1)
                        return;
        } else {
                mask &= ~PCPU_GET(cpumask);
                if (mask == 0)
                        return;
-               ncpu = bitcount32(mask);
-               if (ncpu > othercpus) {
-                       /* XXX this should be a panic offence */
-                       printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
-                           ncpu, othercpus);
-                       ncpu = othercpus;
-               }
-               /* XXX should be a panic, implied by mask == 0 above */
-               if (ncpu < 1)
-                       return;
        }
        if (!(read_rflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        smp_tlb_addr1 = addr1;
        smp_tlb_addr2 = addr2;
-       atomic_store_rel_int(&smp_tlb_wait, 0);
+       tlb_groups_populate(mask);
        if (mask == (cpumask_t)-1)
                ipi_all_but_self(vector);
        else
                ipi_selected(mask, vector);
-       while (smp_tlb_wait < ncpu)
+       while (smp_tlb_wait != 0)
                ia32_pause();
        mtx_unlock_spin(&smp_ipi_mtx);
 }
Index: amd64/amd64/apic_vector.S
===================================================================
--- amd64/amd64/apic_vector.S   (revision 225462)
+++ amd64/amd64/apic_vector.S   (working copy)
@@ -129,12 +129,30 @@
        jmp     doreti
 
 #ifdef SMP
+/*
+ * Acknowledge a TLB IPI by decrementing the group's counter.  When the
+ * group counter reaches zero, decrement the global counter.
+ *
+ * Assumes we can use %rax.
+ */
+#define TLB_ACK \
+       movq    PCPU(TLB_WAIT), %rax ;  \
+       lock                            \
+       decl    (%rax) ;                \
+       jnz     1f ;                    \
+       lock                            \
+       decl    smp_tlb_wait ;          \
+1:
+
+
 /*
  * Global address space TLB shootdown.
  */
        .text
        SUPERALIGN_TEXT
 IDTVEC(invltlb)
+       IPI_START
        pushq   %rax
 
        movq    %cr3, %rax              /* invalidate the TLB */
@@ -143,10 +161,10 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
@@ -155,6 +173,7 @@
        .text
        SUPERALIGN_TEXT
 IDTVEC(invlpg)
+       IPI_START
        pushq   %rax
 
        movq    smp_tlb_addr1, %rax
@@ -163,10 +182,10 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
@@ -175,6 +194,7 @@
        .text
        SUPERALIGN_TEXT
 IDTVEC(invlrng)
+       IPI_START
        pushq   %rax
        pushq   %rdx
 
@@ -188,11 +208,11 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rdx
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
@@ -201,6 +221,7 @@
        .text
        SUPERALIGN_TEXT
 IDTVEC(invlcache)
+       IPI_START
        pushq   %rax
 
        wbinvd
@@ -208,10 +229,10 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
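
For readers following the counter protocol above, here is a minimal standalone sketch, in plain C11 with <stdatomic.h>, of the two-level accounting that tlb_groups_populate() and TLB_ACK implement.  It is not kernel code: the sketch_* names, the fixed group layout, and the use of userland atomics instead of lock-prefixed instructions are illustrative assumptions only, not part of this patch.

/*
 * Standalone sketch of the two-level shootdown accounting: each target CPU
 * decrements its group's counter; the CPU that takes a group to zero also
 * decrements the global group count.  The initiator spins until the global
 * count reaches zero.  All names here are invented for illustration.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NGROUPS 2
#define NCPUS   8               /* 4 CPUs per group in this example */

struct sketch_group {
        atomic_int      wait;   /* CPUs in this group still pending */
        int             count;  /* CPUs that belong to this group */
};

static struct sketch_group groups[NGROUPS];
static atomic_int global_wait;  /* groups still pending (cf. smp_tlb_wait) */

/* Initiator side: arm every group, then retire itself (cf. tlb_groups_populate). */
static void
sketch_populate_all(int self_group)
{
        for (int g = 0; g < NGROUPS; g++) {
                atomic_store(&groups[g].wait, groups[g].count);
                atomic_fetch_add(&global_wait, 1);
        }
        /* The initiating CPU never handles its own IPI. */
        if (atomic_fetch_sub(&groups[self_group].wait, 1) == 1)
                atomic_fetch_sub(&global_wait, 1);
}

/* Handler side: what TLB_ACK does with its two lock decl instructions. */
static void
sketch_ack(int group)
{
        if (atomic_fetch_sub(&groups[group].wait, 1) == 1)
                atomic_fetch_sub(&global_wait, 1);
}

int
main(void)
{
        for (int g = 0; g < NGROUPS; g++)
                groups[g].count = NCPUS / NGROUPS;

        sketch_populate_all(0);         /* initiator lives in group 0 */

        /* Every other CPU acknowledges; in the kernel this happens in the IPI handler. */
        for (int cpu = 1; cpu < NCPUS; cpu++)
                sketch_ack(cpu / (NCPUS / NGROUPS));

        /* The initiator's spin loop would now fall through. */
        printf("global_wait = %d\n", atomic_load(&global_wait));  /* prints 0 */
        return (0);
}

The point of the two counter levels, as the patch comment says, is that acknowledging CPUs mostly write a counter shared only within their own package; only the CPU that retires a group touches the globally shared smp_tlb_wait cache line, so an all-CPUs shootdown costs one cross-package store per group rather than one per CPU.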