Index: amd64/include/asmacros.h
===================================================================
--- amd64/include/asmacros.h   (revision 225462)
+++ amd64/include/asmacros.h   (working copy)
@@ -192,6 +192,22 @@
 1:      addq    $TF_RIP,%rsp    /* skip over tf_err, tf_trapno */
 
 /*
+ * Macros to manage %gs on interrupt entry/exit for cheaper IPI
+ * handlers that don't push a full trap frame.
+ */
+#define IPI_START \
+        testb   $SEL_RPL_MASK,0x8(%rsp) ; /* come from kernel? */      \
+        jz      1f ;            /* Yes, don't swapgs again */          \
+        swapgs ;                                                       \
+1: ;
+
+#define IPI_END \
+        testb   $SEL_RPL_MASK,0x8(%rsp) ; /* come from kernel? */      \
+        jz      1f ;            /* keep kernel GS.base */              \
+        swapgs ;                                                       \
+1: ;
+
+/*
  * Access per-CPU data.
  */
 #define PCPU(member)    %gs:PC_ ## member
Index: amd64/include/pcpu.h
===================================================================
--- amd64/include/pcpu.h        (revision 225462)
+++ amd64/include/pcpu.h        (working copy)
@@ -51,6 +51,8 @@
 #define PCPU_XEN_FIELDS
 #endif
 
+struct tlb_group;
+
 /*
  * The SMP parts are setup in pmap.c and locore.s for the BSP, and
  * mp_machdep.c sets up the data for the AP's to "see" when they awake.
@@ -76,6 +78,8 @@
        struct system_segment_descriptor *pc_ldt;                       \
        /* Pointer to the CPU TSS descriptor */                         \
        struct system_segment_descriptor *pc_tss;                       \
+       struct tlb_group *pc_tlb_group;                                 \
+       volatile int *pc_tlb_wait;                                      \
        u_int   pc_cmci_mask            /* MCx banks for CMCI */        \
        PCPU_XEN_FIELDS
Index: amd64/amd64/genassym.c
===================================================================
--- amd64/amd64/genassym.c      (revision 225462)
+++ amd64/amd64/genassym.c      (working copy)
@@ -221,6 +221,7 @@
 ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+ASSYM(PC_TLB_WAIT, offsetof(struct pcpu, pc_tlb_wait));
 
 ASSYM(LA_VER, offsetof(struct LAPIC, version));
 ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
Index: amd64/amd64/mp_machdep.c
===================================================================
--- amd64/amd64/mp_machdep.c    (revision 225462)
+++ amd64/amd64/mp_machdep.c    (working copy)
@@ -108,9 +108,18 @@
 struct pcb **susppcbs = NULL;
 
 /* Variables needed for SMP tlb shootdown. */
+struct tlb_group {
+       volatile int    tg_wait;
+       int             tg_package_id;
+       int             tg_count;
+       STAILQ_ENTRY(tlb_group) tg_link;
+} __aligned(CACHE_LINE_SIZE);
+
+static MALLOC_DEFINE(M_TLB_GROUP, "tlb_group", "TLB group");
+static STAILQ_HEAD(, tlb_group) smp_tlb_groups;
 vm_offset_t smp_tlb_addr1;
 vm_offset_t smp_tlb_addr2;
-volatile int smp_tlb_wait;
+volatile int smp_tlb_wait __aligned(CACHE_LINE_SIZE);
 
 extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
 
@@ -145,12 +154,13 @@
 static u_int boot_address;
 static int cpu_logical;                 /* logical cpus per core */
 static int cpu_cores;                   /* cores per package */
+static int package_id_bits;             /* APIC ID bits below the package ID */
 
 static void    assign_cpu_ids(void);
 static void    set_interrupt_apic_ids(void);
 static int     start_all_aps(void);
 static int     start_ap(int apic_id);
-static void    release_aps(void *dummy);
+static void    tlb_groups_init(void);
 
 static int     hlt_logical_cpus;
 static u_int   hyperthreading_cpus;     /* logical cpus sharing L1 cache */
@@ -197,6 +207,7 @@
                        continue;
                cpu_cores++;
        }
+       package_id_bits = core_id_bits;
 }
 
 /*
@@ -215,7 +226,6 @@
 topo_probe_0x4(void)
 {
        u_int p[4];
-       int pkg_id_bits;
        int core_id_bits;
        int max_cores;
        int max_logical;
@@ -242,14 +252,14 @@
        core_id_bits = mask_width(max_logical/max_cores);
        if (core_id_bits < 0)
                return;
-       pkg_id_bits = core_id_bits + mask_width(max_cores);
+       package_id_bits = core_id_bits + mask_width(max_cores);
        for (id = 0; id <= MAX_APIC_ID; id++) {
                /* Check logical CPU availability. */
                if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
                        continue;
                /* Check if logical CPU has the same package ID. */
-               if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
+               if ((id >> package_id_bits) != (boot_cpu_id >> package_id_bits))
                        continue;
                cpu_cores++;
                /* Check if logical CPU has the same package and core IDs. */
@@ -288,7 +298,7 @@
                bits = p[0] & 0x1f;
                logical = p[1] &= 0xffff;
                type = (p[2] >> 8) & 0xff;
-               if (type == 0 || logical == 0)
+               if (type == CPUID_TYPE_INVAL || logical == 0)
                        break;
                /*
                 * Because of uniformity assumption we examine only
@@ -304,8 +314,10 @@
                }
                if (type == CPUID_TYPE_SMT)
                        cpu_logical = cnt;
-               else if (type == CPUID_TYPE_CORE)
+               else if (type == CPUID_TYPE_CORE) {
                        cpu_cores = cnt;
+                       package_id_bits = bits;
+               }
        }
        if (cpu_logical == 0)
                cpu_logical = 1;
@@ -367,7 +379,7 @@
 
        /*
         * Determine whether any threading flags are
-        * necessry.
+        * necessary.
         */
        topo_probe();
        if (cpu_logical > 1 && hyperthreading_cpus)
@@ -551,6 +563,7 @@
 
        start_all_aps();
 
        set_interrupt_apic_ids();
+       tlb_groups_init();
 }
 
@@ -1045,24 +1058,108 @@
 }
 
 /*
+ * To minimize cache traffic during TLB shootdown processing we place
+ * CPUs into groups.  Each CPU in a group decrements a per-group counter
+ * until it drops to zero.  When the last CPU from a group finishes its
+ * handler it decrements a global count of groups stored in smp_tlb_wait.
+ *
+ * In a multi-core system we place all CPUs in a package within a single
+ * group.  For other systems we simply use a single TLB group containing
+ * all CPUs.
+ */
+static void
+tlb_groups_init(void)
+{
+       struct tlb_group *tg;
+       struct pcpu *pc;
+       int cpu, package_id;
+
+       STAILQ_INIT(&smp_tlb_groups);
+       CPU_FOREACH(cpu) {
+               pc = pcpu_find(cpu);
+               if (package_id_bits == 0)
+                       package_id = 0;
+               else
+                       package_id = pc->pc_apic_id >> package_id_bits;
+               STAILQ_FOREACH(tg, &smp_tlb_groups, tg_link) {
+                       if (tg->tg_package_id == package_id)
+                               break;
+               }
+               if (tg == NULL) {
+                       tg = malloc(sizeof(*tg), M_TLB_GROUP, M_WAITOK);
+                       tg->tg_wait = 0;
+                       tg->tg_package_id = package_id;
+                       tg->tg_count = 0;
+                       STAILQ_INSERT_TAIL(&smp_tlb_groups, tg, tg_link);
+                       /* XXX */
+                       printf("TLB: Creating group for package %d\n",
+                           package_id);
+               }
+               pc->pc_tlb_group = tg;
+               pc->pc_tlb_wait = &tg->tg_wait;
+               tg->tg_count++;
+               /* XXX */
+               printf("TLB: Adding CPU %d (APIC ID %d) to TLB group %d\n",
+                   pc->pc_cpuid, pc->pc_apic_id, package_id);
+       }
+}
+
+static __inline void
+tlb_groups_populate(cpumask_t mask)
+{
+       struct tlb_group *tg;
+       struct pcpu *pc;
+       int cpu;
+
+       KASSERT(smp_tlb_wait == 0, ("non-zero global TLB wait count"));
+#ifdef INVARIANTS
+       STAILQ_FOREACH(tg, &smp_tlb_groups, tg_link) {
+               KASSERT(tg->tg_wait == 0, ("non-zero TLB wait for group %d",
+                   tg->tg_package_id));
+       }
+#endif
+
+       if (mask == -1) {
+               STAILQ_FOREACH(tg, &smp_tlb_groups, tg_link) {
+                       tg->tg_wait = tg->tg_count;
+                       smp_tlb_wait++;
+               }
+               tg = PCPU_GET(tlb_group);
+               tg->tg_wait--;
+               if (tg->tg_wait == 0)
+                       smp_tlb_wait--;
+               return;
+       }
+
+       while (mask != 0) {
+               cpu = ffs(mask) - 1;
+               mask &= ~(1 << cpu);
+               if (cpu == PCPU_GET(cpuid))
+                       continue;
+               pc = pcpu_find(cpu);
+               pc->pc_tlb_group->tg_wait++;
+               if (pc->pc_tlb_group->tg_wait == 1)
+                       smp_tlb_wait++;
+       }
+}
+
+/*
  * Flush the TLB on all other CPU's
  */
 static void
 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 {
-       u_int ncpu;
 
-       ncpu = mp_ncpus - 1;    /* does not shootdown self */
-       if (ncpu < 1)
+       if (mp_ncpus == 1)
                return;         /* no other cpus */
        if (!(read_rflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        smp_tlb_addr1 = addr1;
        smp_tlb_addr2 = addr2;
-       atomic_store_rel_int(&smp_tlb_wait, 0);
+       tlb_groups_populate(-1);
        ipi_all_but_self(vector);
-       while (smp_tlb_wait < ncpu)
+       while (smp_tlb_wait != 0)
                ia32_pause();
        mtx_unlock_spin(&smp_ipi_mtx);
 }
@@ -1070,39 +1167,26 @@
 static void
 smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 {
-       int ncpu, othercpus;
 
-       othercpus = mp_ncpus - 1;
        if (mask == (cpumask_t)-1) {
-               ncpu = othercpus;
-               if (ncpu < 1)
+               if (mp_ncpus == 1)
                        return;
        } else {
                mask &= ~PCPU_GET(cpumask);
                if (mask == 0)
                        return;
-               ncpu = bitcount32(mask);
-               if (ncpu > othercpus) {
-                       /* XXX this should be a panic offence */
-                       printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
-                           ncpu, othercpus);
-                       ncpu = othercpus;
-               }
-               /* XXX should be a panic, implied by mask == 0 above */
-               if (ncpu < 1)
-                       return;
        }
        if (!(read_rflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        smp_tlb_addr1 = addr1;
        smp_tlb_addr2 = addr2;
-       atomic_store_rel_int(&smp_tlb_wait, 0);
+       tlb_groups_populate(mask);
        if (mask == (cpumask_t)-1)
                ipi_all_but_self(vector);
        else
                ipi_selected(mask, vector);
-       while (smp_tlb_wait < ncpu)
+       while (smp_tlb_wait != 0)
                ia32_pause();
        mtx_unlock_spin(&smp_ipi_mtx);
 }
Index: amd64/amd64/apic_vector.S
===================================================================
--- amd64/amd64/apic_vector.S   (revision 225462)
+++ amd64/amd64/apic_vector.S   (working copy)
@@ -129,12 +129,30 @@
        jmp     doreti
 
 #ifdef SMP
+/*
+ * Acknowledge a TLB IPI by decrementing the group's counter.  When the
+ * group counter reaches zero, decrement the global counter.
+ *
+ * Assumes we can use %rax.
+ */
+#define TLB_ACK \
+       movq    PCPU(TLB_WAIT), %rax ;  \
+       lock                            \
+       decl    (%rax) ;                \
+       jnz     1f ;                    \
+       lock                            \
+       decl    smp_tlb_wait ;          \
+1:
+
+
 /*
  * Global address space TLB shootdown.
  */
        .text
        SUPERALIGN_TEXT
 IDTVEC(invltlb)
+       IPI_START
        pushq   %rax
 
        movq    %cr3, %rax              /* invalidate the TLB */
@@ -143,10 +161,10 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
@@ -155,6 +173,7 @@
        .text
        SUPERALIGN_TEXT
 IDTVEC(invlpg)
+       IPI_START
        pushq   %rax
 
        movq    smp_tlb_addr1, %rax
@@ -163,10 +182,10 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
@@ -175,6 +194,7 @@
        .text
        SUPERALIGN_TEXT
 IDTVEC(invlrng)
+       IPI_START
        pushq   %rax
        pushq   %rdx
 
@@ -188,11 +208,11 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rdx
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
@@ -201,6 +221,7 @@
        .text
        SUPERALIGN_TEXT
 IDTVEC(invlcache)
+       IPI_START
        pushq   %rax
 
        wbinvd
@@ -208,10 +229,10 @@
 
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
-       lock
-       incl    smp_tlb_wait
+       TLB_ACK
 
        popq    %rax
+       IPI_END
        jmp     doreti_iret
 
 /*
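
For readers following the counter protocol above, here is a minimal standalone sketch, in plain C11 with <stdatomic.h>, of the two-level accounting that tlb_groups_populate() and TLB_ACK implement.  It is not kernel code: the sketch_* names, the fixed group layout, and the use of userland atomics instead of lock-prefixed instructions are illustrative assumptions only, not part of this patch.

/*
 * Standalone sketch of the two-level shootdown accounting: each target CPU
 * decrements its group's counter; the CPU that takes a group to zero also
 * decrements the global group count.  The initiator spins until the global
 * count reaches zero.  All names here are invented for illustration.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NGROUPS 2
#define NCPUS   8               /* 4 CPUs per group in this example */

struct sketch_group {
        atomic_int      wait;   /* CPUs in this group still pending */
        int             count;  /* CPUs that belong to this group */
};

static struct sketch_group groups[NGROUPS];
static atomic_int global_wait;  /* groups still pending (cf. smp_tlb_wait) */

/* Initiator side: arm every group, then retire itself (cf. tlb_groups_populate). */
static void
sketch_populate_all(int self_group)
{
        for (int g = 0; g < NGROUPS; g++) {
                atomic_store(&groups[g].wait, groups[g].count);
                atomic_fetch_add(&global_wait, 1);
        }
        /* The initiating CPU never handles its own IPI. */
        if (atomic_fetch_sub(&groups[self_group].wait, 1) == 1)
                atomic_fetch_sub(&global_wait, 1);
}

/* Handler side: what TLB_ACK does with its two lock decl instructions. */
static void
sketch_ack(int group)
{
        if (atomic_fetch_sub(&groups[group].wait, 1) == 1)
                atomic_fetch_sub(&global_wait, 1);
}

int
main(void)
{
        for (int g = 0; g < NGROUPS; g++)
                groups[g].count = NCPUS / NGROUPS;

        sketch_populate_all(0);         /* initiator lives in group 0 */

        /* Every other CPU acknowledges; in the kernel this happens in the IPI handler. */
        for (int cpu = 1; cpu < NCPUS; cpu++)
                sketch_ack(cpu / (NCPUS / NGROUPS));

        /* The initiator's spin loop would now fall through. */
        printf("global_wait = %d\n", atomic_load(&global_wait));  /* prints 0 */
        return (0);
}

The point of the two counter levels, as the patch comment says, is that acknowledging CPUs mostly write a counter shared only within their own package; only the CPU that retires a group touches the globally shared smp_tlb_wait cache line, so an all-CPUs shootdown costs one cross-package store per group rather than one per CPU.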