diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6465247..876d39e 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -43,6 +43,12 @@
 
 #include "assym.s"
 
+#ifdef SMP
+#define LK	lock ;
+#else
+#define LK
+#endif
+
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
@@ -133,6 +139,38 @@ IDTVEC(errorint)
  * Global address space TLB shootdown.
  */
 	.text
+
+#define	NAKE_INTR_CS	24
+
+	SUPERALIGN_TEXT
+global_invltlb:
+	movl	%cr4,%eax
+	andl	$~0x80,%eax
+	movl	%eax,%cr4
+	orl	$0x80,%eax
+	movl	%eax,%cr4
+invltlb_ret_clear_pm_save:
+	movq	smp_tlb_pmap,%rdx
+	testq	%rdx,%rdx
+	jz	invltlb_ret
+	testb	$SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+	jz	1f
+	swapgs
+1:
+	movl	PCPU(CPUID),%eax
+	LK btcl	%eax,PM_SAVE(%rdx)
+	testb	$SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+	jz	2f
+	swapgs
+2:
+	SUPERALIGN_TEXT
+invltlb_ret:
+	movq	lapic, %rax
+	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
+	LK incl	smp_tlb_wait
+	popq	%rdx
+	popq	%rax
+	jmp	doreti_iret
+
 	SUPERALIGN_TEXT
 IDTVEC(invltlb)
 #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
@@ -149,18 +187,44 @@ IDTVEC(invltlb)
 #endif
 
 	pushq	%rax
+	pushq	%rdx
 
-	movq	%cr3, %rax		/* invalidate the TLB */
-	movq	%rax, %cr3
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rax
-	jmp	doreti_iret
+	movq	%cr3,%rax
+	cmpl	$0,pmap_pcid_enabled
+	je	2f
+
+	movq	$smp_tlb_invpcid,%rdx
+	cmpl	$0,(%rdx)
+	je	global_invltlb
+	cmpl	$-1,(%rdx)
+	je	global_invltlb
+
+	/*
+	 * Non-zero smp_tlb_invpcid, only invalidate TLB for entries with
+	 * current PCID.
+	 */
+	cmpl	$0,invpcid_works
+	je	1f
+	/* Use invpcid if available. */
+	movl	$1,%eax		/* INVPCID_CTX */
+	/* invpcid (%rdx),%rax */
+	.byte	0x66,0x0f,0x38,0x82,0x02
+	jmp	invltlb_ret_clear_pm_save
+1:
+	/* Otherwise reload %cr3 twice. */
+	movq	pcid_cr3,%rdx
+	cmpq	%rax,%rdx
+	je	2f
+	movq	%rdx,%cr3	/* Invalidate, bit 63 is zero. */
+	btsq	$63,%rax
+
+	/*
+	 * Invalidate the TLB if PCID is not enabled.
+	 * Restore the old address space.
+	 */
+2:
+	movq	%rax,%cr3
+	jmp	invltlb_ret_clear_pm_save
 
 /*
  * Single page TLB shootdown
@@ -182,18 +246,54 @@ IDTVEC(invlpg)
 #endif
 
 	pushq	%rax
-
-	movq	smp_tlb_addr1, %rax
-	invlpg	(%rax)			/* invalidate single page */
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rax
-	jmp	doreti_iret
+	pushq	%rdx
+	movq	$smp_tlb_invpcid,%rdx
+	cmpl	$0,pmap_pcid_enabled
+	je	3f
+	cmpl	$0,invpcid_works
+	jne	2f
+
+	/* kernel pmap - use invlpg to invalidate global mapping */
+	cmpl	$0,(%rdx)
+	je	3f
+	cmpl	$-1,(%rdx)
+	je	global_invltlb
+
+	/*
+	 * PCID supported, but INVPCID is not.
+	 * Temporarily switch to the target address space and do INVLPG.
+	 */
+	pushq	%rcx
+	movq	%cr3,%rcx
+	movq	pcid_cr3,%rax
+	cmp	%rcx,%rax
+	je	1f
+	btsq	$63,%rax
+	movq	%rax,%cr3
+1:	movq	8(%rdx),%rax
+	invlpg	(%rax)
+	btsq	$63,%rcx
+	movq	%rcx,%cr3
+	popq	%rcx
+	jmp	invltlb_ret
+
+	/*
+	 * Invalidate the TLB entry using INVPCID_ADDR.
+	 */
+2:
+	xorl	%eax,%eax
+/* invpcid (%rdx),%rax */
+	.byte	0x66,0x0f,0x38,0x82,0x02
+	jmp	invltlb_ret
+
+	/*
+	 * PCID is not supported or kernel pmap.
+	 * Invalidate single page using INVLPG.
+	 */
+3:
+	movq	8(%rdx),%rax
+	invlpg	(%rax)
+	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
@@ -216,23 +316,76 @@ IDTVEC(invlrng)
 
 	pushq	%rax
 	pushq	%rdx
-
-	movq	smp_tlb_addr1, %rdx
-	movq	smp_tlb_addr2, %rax
+	movq	$smp_tlb_invpcid,%rdx
+	cmpl	$0,pmap_pcid_enabled
+	je	invlrng_single_page
+	cmpl	$0,invpcid_works
+	jne	invlrng_invpcid
+
+	/* kernel pmap - use invlpg to invalidate global mapping */
+	cmpl	$0,(%rdx)
+	je	invlrng_single_page
+	cmpl	$-1,(%rdx)
+	je	global_invltlb
+
+	pushq	%rcx
+	movq	%cr3,%rcx
+	movq	pcid_cr3,%rax
+	cmpq	%rcx,%rax
+	je	1f
+	btsq	$63,%rax
+	movq	%rax,%cr3
+1:
+	movq	8(%rdx),%rdx
+	movq	smp_tlb_addr2,%rax
+2:
+	invlpg	(%rdx)
+	addq	$PAGE_SIZE,%rdx
+	cmpq	%rax,%rdx
+	jb	2b
+	btsq	$63,%rcx
+	movq	%rcx,%cr3
+	popq	%rcx
+	jmp	invltlb_ret
+
+invlrng_invpcid:
+	testb	$SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+	jz	1f
+	swapgs
+1:
+	pushq	%rcx
+	movq	(%rdx),%rcx
+	movq	%rcx,PCPU(INVPCID_DESCR)
+	movq	8(%rdx),%rax
+	movq	%rax,PCPU(INVPCID_DESCR)+8
+	movq	smp_tlb_addr2,%rcx
+	xorl	%eax,%eax
+	movq	$PC_INVPCID_DESCR,%rdx
+	gs
+	subq	8(%rdx),%rcx
+	shrq	$PAGE_SHIFT,%rcx
+2:
+	gs
+/* invpcid (%rdx),%rax */
+	.byte	0x66,0x0f,0x38,0x82,0x02
+	gs
+	addq	$PAGE_SIZE,8(%rdx)
+	dec	%rcx
+	jne	2b
+	popq	%rcx
+	testb	$SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+	jz	invltlb_ret
+	swapgs
+	jmp	invltlb_ret
+
+invlrng_single_page:
+	movq	8(%rdx),%rdx
+	movq	smp_tlb_addr2,%rax
 1:
 	invlpg	(%rdx)			/* invalidate single page */
-	addq	$PAGE_SIZE, %rdx
-	cmpq	%rax, %rdx
+	addq	$PAGE_SIZE,%rdx
+	cmpq	%rax,%rdx
 	jb	1b
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rdx
-	popq	%rax
-	jmp	doreti_iret
+	jmp	invltlb_ret
 
 /*
  * Invalidate cache.
@@ -249,17 +402,9 @@ IDTVEC(invlcache)
 #endif
 
 	pushq	%rax
-
+	pushq	%rdx
 	wbinvd
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rax
-	jmp	doreti_iret
+	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
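Note (illustration, not part of the patch): the shootdown handlers above branch on pmap_pcid_enabled and invpcid_works, which pmap_bootstrap() derives from the CPUID2_PCID and CPUID_STDEXT_INVPCID feature bits. As a rough standalone sketch of those same architectural bits (ECX bit 17 of leaf 1, EBX bit 10 of leaf 7), a userland check could look like this; the program and its names are hypothetical:

/* pcidchk.c - print whether the CPU advertises PCID and INVPCID.
 * Illustrative only; it mirrors the cpu_feature2/cpu_stdext_feature
 * tests that pmap_bootstrap() performs in this patch.
 */
#include <stdio.h>
#include <stdint.h>

static void
cpuid_count(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
{
	__asm __volatile("cpuid"
	    : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
	    : "a" (leaf), "c" (subleaf));
}

int
main(void)
{
	uint32_t r[4];

	cpuid_count(1, 0, r);
	printf("PCID:    %s\n", (r[2] & (1u << 17)) ? "yes" : "no");
	cpuid_count(7, 0, r);
	printf("INVPCID: %s\n", (r[1] & (1u << 10)) ? "yes" : "no");
	return (0);
}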
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index ed1ccb5..ac30990 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -77,8 +77,7 @@ ENTRY(cpu_throw)
 	LK btrl	%eax,PM_ACTIVE(%rdx)	/* clear old */
 1:
 	movq	TD_PCB(%rsi),%r8	/* newtd->td_pcb */
-	movq	PCB_CR3(%r8),%rdx
-	movq	%rdx,%cr3		/* new address space */
+	movq	PCB_CR3(%r8),%rcx	/* new address space */
 	jmp	swact
 END(cpu_throw)
 
@@ -145,20 +144,41 @@ ctx_switch_xsave:
 	SETLK	%rdx, TD_LOCK(%rdi)	/* Release the old thread */
 	jmp	sw1
 swinact:
-	movq	%rcx,%cr3		/* new address space */
-	movl	PCPU(CPUID), %eax
+	movl	PCPU(CPUID),%eax
 	/* Release bit from old pmap->pm_active */
-	movq	PCPU(CURPMAP),%rcx
-	LK btrl	%eax,PM_ACTIVE(%rcx)	/* clear old */
-	SETLK	%rdx, TD_LOCK(%rdi)	/* Release the old thread */
+	movq	PCPU(CURPMAP),%r12
+	LK btrl	%eax,PM_ACTIVE(%r12)	/* clear old */
+	SETLK	%rdx,TD_LOCK(%rdi)	/* Release the old thread */
 swact:
 	/* Set bit in new pmap->pm_active */
 	movq	TD_PROC(%rsi),%rdx	/* newproc */
 	movq	P_VMSPACE(%rdx), %rdx
 	addq	$VM_PMAP,%rdx
+	cmpl	$-1,PM_PCID(%rdx)
+	je	1f
+	LK btsl	%eax,PM_SAVE(%rdx)
+	jnc	1f
+	btsq	$63,%rcx		/* CR3_PCID_SAVE */
+	incq	PCPU(PM_SAVE_CNT)
+1:
+	movq	%rcx,%cr3		/* new address space */
 	LK btsl	%eax,PM_ACTIVE(%rdx)	/* set new */
 	movq	%rdx,PCPU(CURPMAP)
+
+	/*
+	 * We might lose the race and another CPU might have changed
+	 * the pmap after we set our bit in pmap->pm_save.  Recheck.
+	 * Reload %cr3 with CR3_PCID_SAVE bit cleared if pmap was
+	 * modified, causing TLB flush for this pcid.
+	 */
+	btrq	$63,%rcx
+	jnc	1f
+	LK btsl	%eax,PM_SAVE(%rdx)
+	jc	1f
+	decq	PCPU(PM_SAVE_CNT)
+	movq	%rcx,%cr3
+1:
+
 sw1:
 #if defined(SCHED_ULE) && defined(SMP)
 	/* Wait for the new thread to become unblocked */
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index 3043bb5..62017e7 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -76,6 +76,8 @@ __FBSDID("$FreeBSD$");
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
 ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
+ASSYM(PM_SAVE, offsetof(struct pmap, pm_save));
+ASSYM(PM_PCID, offsetof(struct pmap, pm_pcid));
 ASSYM(P_MD, offsetof(struct proc, p_md));
 ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
 
@@ -225,6 +227,8 @@ ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p));
 ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
+ASSYM(PC_INVPCID_DESCR, offsetof(struct pcpu, pc_invpcid_descr));
 
 ASSYM(LA_VER, offsetof(struct LAPIC, version));
 ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 7a39ef8..deca6a6 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1903,7 +1903,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
-	thread0.td_pcb->pcb_cr3 = KPML4phys;
+	thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */
 	thread0.td_frame = &proc0_tf;
 
 	env = getenv("kernelname");
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 79aeb9c..2f1df0a 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -103,9 +103,11 @@ struct pcb stoppcbs[MAXCPU];
 struct pcb **susppcbs;
 
 /* Variables needed for SMP tlb shootdown. */
-vm_offset_t smp_tlb_addr1;
 vm_offset_t smp_tlb_addr2;
+struct invpcid_descr smp_tlb_invpcid;
 volatile int smp_tlb_wait;
+uint64_t pcid_cr3;
+pmap_t smp_tlb_pmap;
 
 #ifdef COUNT_IPIS
 /* Interrupt counts. */
@@ -599,6 +601,8 @@ cpu_mp_announce(void)
 	}
 }
 
+extern int pmap_pcid_enabled;
+
 /*
  * AP CPU's call this to initialize themselves.
  */
@@ -759,6 +763,8 @@ init_secondary(void)
 	 */
 	load_cr4(rcr4() | CR4_PGE);
 
+	if (pmap_pcid_enabled)
+		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
@@ -1110,7 +1116,8 @@ ipi_send_cpu(int cpu, u_int ipi)
  * Flush the TLB on all other CPU's
 */
 static void
-smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1,
+    vm_offset_t addr2)
 {
 	u_int ncpu;
 
@@ -1120,8 +1127,16 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 	if (!(read_rflags() & PSL_I))
 		panic("%s: interrupts disabled", __func__);
 	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
+	smp_tlb_invpcid.addr = addr1;
+	if (pmap == NULL) {
+		smp_tlb_invpcid.pcid = 0;
+	} else {
+		smp_tlb_invpcid.pcid = pmap->pm_pcid;
+		pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) |
+		    (pmap->pm_pcid == -1 ? 0 : pmap->pm_pcid);
+	}
 	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
 	ipi_all_but_self(vector);
 	while (smp_tlb_wait < ncpu)
@@ -1130,7 +1145,8 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 }
 
 static void
-smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+    vm_offset_t addr1, vm_offset_t addr2)
 {
 	int cpu, ncpu, othercpus;
 
@@ -1146,8 +1162,16 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_of
 	if (!(read_rflags() & PSL_I))
 		panic("%s: interrupts disabled", __func__);
 	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
+	smp_tlb_invpcid.addr = addr1;
+	if (pmap == NULL) {
+		smp_tlb_invpcid.pcid = 0;
+	} else {
+		smp_tlb_invpcid.pcid = pmap->pm_pcid;
+		pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) |
+		    (pmap->pm_pcid == -1 ? 0 : pmap->pm_pcid);
+	}
 	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
 	if (CPU_ISFULLSET(&mask)) {
 		ncpu = othercpus;
@@ -1173,15 +1197,15 @@ smp_cache_flush(void)
 {
 
 	if (smp_started)
-		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
+		smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0);
 }
 
 void
-smp_invltlb(void)
+smp_invltlb(pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+		smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_global++;
 #endif
@@ -1189,11 +1213,11 @@ smp_invltlb(void)
 }
 
 void
-smp_invlpg(vm_offset_t addr)
+smp_invlpg(pmap_t pmap, vm_offset_t addr)
 {
 
 	if (smp_started) {
-		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+		smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_page++;
 #endif
@@ -1201,11 +1225,11 @@ smp_invlpg(vm_offset_t addr)
 }
 
 void
-smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2)
 {
 
 	if (smp_started) {
-		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+		smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_range++;
 		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
@@ -1214,11 +1238,11 @@ smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
 }
 
 void
-smp_masked_invltlb(cpuset_t mask)
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_global++;
 #endif
@@ -1226,11 +1250,11 @@ smp_masked_invltlb(cpuset_t mask)
 }
 
 void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_page++;
 #endif
@@ -1238,11 +1262,13 @@ smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
 }
 
 void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
+    vm_offset_t addr2)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1,
+		    addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_range++;
 		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
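Note (illustration, not part of the patch): pcid_cr3 above packs the allocated PCID into the low 12 bits of the PML4 physical address, and the shootdown and context-switch paths set bit 63 (CR3_PCID_SAVE) when the cached context may be kept on load. A minimal standalone sketch of that packing, with a made-up physical address:

/* cr3pcid.c - illustrate CR3 composition when PCID is in use.
 * Mirrors DMAP_TO_PHYS(...) | pm_pcid and the btsq $63 (CR3_PCID_SAVE)
 * operation used by this patch; the address below is hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

#define	CR3_PCID_MASK	0xfffULL	/* low 12 bits carry the PCID */
#define	CR3_PCID_SAVE	(1ULL << 63)	/* do not flush this PCID on load */

int
main(void)
{
	uint64_t pml4_phys = 0x1234000;	/* hypothetical, page-aligned */
	uint64_t pcid = 42;		/* as if allocated from pcid_unr */
	uint64_t cr3, cr3_noflush;

	cr3 = pml4_phys | (pcid & CR3_PCID_MASK);
	cr3_noflush = cr3 | CR3_PCID_SAVE;
	printf("cr3 (flushing):      %#llx\n", (unsigned long long)cr3);
	printf("cr3 (CR3_PCID_SAVE): %#llx\n", (unsigned long long)cr3_noflush);
	return (0);
}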
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 7fb1277..3a471b9 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -116,11 +116,8 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
-#ifdef SMP
+#include
 #include
-#else
-#include
-#endif
 #include
 #include
 
@@ -250,6 +247,53 @@ static struct md_page *pv_table;
 pt_entry_t *CMAP1 = 0;
 caddr_t CADDR1 = 0;
 
+static struct unrhdr pcid_unr;
+static struct mtx pcid_mtx;
+int pmap_pcid_enabled = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
+    0, "Is TLB Context ID enabled ?");
+int invpcid_works = 0;
+
+/*
+ * Perform the guaranteed invalidation of all TLB entries.  This
+ * includes the global entries, and entries in all PCIDs, not only the
+ * current context.  The function works both on non-PCID CPUs and CPUs
+ * with the PCID turned off or on.  See IA-32 SDM Vol. 3a 4.10.4.1
+ * Operations that Invalidate TLBs and Paging-Structure Caches.
+ */
+static __inline void
+invltlb_globpcid(void)
+{
+	uint64_t cr4;
+
+	cr4 = rcr4();
+	load_cr4(cr4 & ~CR4_PGE);
+	/*
+	 * Although preemption at this point could be detrimental to
+	 * performance, it would not lead to an error.  PG_G is simply
+	 * ignored if CR4.PGE is clear.  Moreover, in case this block
+	 * is re-entered, the load_cr4() either above or below will
+	 * modify CR4.PGE flushing the TLB.
+	 */
+	load_cr4(cr4 | CR4_PGE);
+}
+
+static int
+pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
+{
+	int i;
+	uint64_t res;
+
+	res = 0;
+	CPU_FOREACH(i) {
+		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
+	}
+	return (sysctl_handle_64(oidp, &res, 0, req));
+}
+SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
+    "Count of saved TLB context on switch");
+
 /*
  * Crashdump maps.
  */
@@ -685,6 +729,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
+	CPU_ZERO(&kernel_pmap->pm_save);
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 
 	/*
@@ -716,6 +761,19 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
+
+	/* Initialize TLB Context Id. */
+	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
+	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
+		load_cr4(rcr4() | CR4_PCIDE);
+		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
+		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
+		/* Check for INVPCID support */
+		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
+		    != 0;
+		kernel_pmap->pm_pcid = 0;
+	} else
+		pmap_pcid_enabled = 0;
 }
 
 /*
@@ -952,7 +1010,6 @@ pmap_cache_bits(int mode, boolean_t is_pde)
 static void
 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
 {
-	u_long cr4;
 
 	if ((newpde & PG_PS) == 0)
 		/* Demotion: flush a specific 2MB page mapping. */
@@ -968,19 +1025,34 @@ pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
 		 * Promotion: flush every 4KB page mapping from the TLB,
 		 * including any global (PG_G) mappings.
 		 */
-		cr4 = rcr4();
-		load_cr4(cr4 & ~CR4_PGE);
-		/*
-		 * Although preemption at this point could be detrimental to
-		 * performance, it would not lead to an error.  PG_G is simply
-		 * ignored if CR4.PGE is clear.  Moreover, in case this block
-		 * is re-entered, the load_cr4() either above or below will
-		 * modify CR4.PGE flushing the TLB.
-		 */
-		load_cr4(cr4 | CR4_PGE);
+		invltlb_globpcid();
 	}
 }
 
 #ifdef SMP
+
+static void
+pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
+{
+	struct invpcid_descr d;
+	uint64_t cr3;
+
+	if (invpcid_works) {
+		d.pcid = pmap->pm_pcid;
+		d.pad = 0;
+		d.addr = va;
+		invpcid(&d, INVPCID_ADDR);
+		return;
+	}
+
+	cr3 = rcr3();
+	critical_enter();
+	load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid |
+	    CR3_PCID_SAVE);
+	invlpg(va);
+	load_cr3(cr3 | CR3_PCID_SAVE);
+	critical_exit();
+}
+
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  *
@@ -1008,21 +1080,68 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
-		invlpg(va);
-		smp_invlpg(va);
+		if (!pmap_pcid_enabled) {
+			invlpg(va);
+		} else {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+				if (pmap == vmspace_pmap(curproc->p_vmspace))
+					invlpg(va);
+				else
+					pmap_invalidate_page_pcid(pmap, va);
+			} else {
+				invltlb_globpcid();
+			}
+		}
+		smp_invlpg(pmap, va);
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		if (CPU_ISSET(cpuid, &pmap->pm_active))
 			invlpg(va);
-		CPU_AND(&other_cpus, &pmap->pm_active);
+		else if (pmap_pcid_enabled) {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+				pmap_invalidate_page_pcid(pmap, va);
+			else
+				invltlb_globpcid();
+		}
+		if (pmap_pcid_enabled)
+			CPU_AND(&other_cpus, &pmap->pm_save);
+		else
+			CPU_AND(&other_cpus, &pmap->pm_active);
 		if (!CPU_EMPTY(&other_cpus))
-			smp_masked_invlpg(other_cpus, va);
+			smp_masked_invlpg(other_cpus, pmap, va);
 	}
 	sched_unpin();
 }
 
+static void
+pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	struct invpcid_descr d;
+	uint64_t cr3;
+	vm_offset_t addr;
+
+	if (invpcid_works) {
+		d.pcid = pmap->pm_pcid;
+		d.pad = 0;
+		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
+			d.addr = addr;
+			invpcid(&d, INVPCID_ADDR);
+		}
+		return;
+	}
+
+	cr3 = rcr3();
+	critical_enter();
+	load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid |
+	    CR3_PCID_SAVE);
+	for (addr = sva; addr < eva; addr += PAGE_SIZE)
+		invlpg(addr);
+	load_cr3(cr3 | CR3_PCID_SAVE);
+	critical_exit();
+}
+
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
@@ -1032,19 +1151,43 @@
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
-		smp_invlpg_range(sva, eva);
+		if (!pmap_pcid_enabled) {
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		} else {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+				if (pmap == vmspace_pmap(curproc->p_vmspace)) {
+					for (addr = sva; addr < eva;
+					    addr += PAGE_SIZE)
+						invlpg(addr);
+				} else {
+					pmap_invalidate_range_pcid(pmap,
+					    sva, eva);
+				}
+			} else {
+				invltlb_globpcid();
+			}
+		}
+		smp_invlpg_range(pmap, sva, eva);
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
-		if (CPU_ISSET(cpuid, &pmap->pm_active))
+		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
 				invlpg(addr);
-		CPU_AND(&other_cpus, &pmap->pm_active);
+		} else if (pmap_pcid_enabled) {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+				pmap_invalidate_range_pcid(pmap, sva, eva);
+			else
+				invltlb_globpcid();
+		}
+		if (pmap_pcid_enabled)
+			CPU_AND(&other_cpus, &pmap->pm_save);
+		else
+			CPU_AND(&other_cpus, &pmap->pm_active);
 		if (!CPU_EMPTY(&other_cpus))
-			smp_masked_invlpg_range(other_cpus, sva, eva);
+			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
 	}
 	sched_unpin();
 }
@@ -1053,21 +1196,63 @@ void
 pmap_invalidate_all(pmap_t pmap)
 {
 	cpuset_t other_cpus;
+	struct invpcid_descr d;
+	uint64_t cr3;
 	u_int cpuid;
 
 	sched_pin();
-	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
-		invltlb();
-		smp_invltlb();
+	cpuid = PCPU_GET(cpuid);
+	if (pmap == kernel_pmap ||
+	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
+	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+		if (invpcid_works) {
+			bzero(&d, sizeof(d));
+			invpcid(&d, INVPCID_CTXGLOB);
+		} else {
+			invltlb_globpcid();
+		}
+		CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+		smp_invltlb(pmap);
 	} else {
-		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
+
+		/*
+		 * This logic is duplicated in the Xinvltlb shootdown
+		 * IPI handler.
+		 */
+		if (pmap_pcid_enabled) {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+				if (invpcid_works) {
+					d.pcid = pmap->pm_pcid;
+					d.pad = 0;
+					d.addr = 0;
+					invpcid(&d, INVPCID_CTX);
+				} else {
+					cr3 = rcr3();
+					critical_enter();
+
+					/*
+					 * Bit 63 is clear, pcid TLB
+					 * entries are invalidated.
+					 */
+					load_cr3(DMAP_TO_PHYS((vm_offset_t)
+					    pmap->pm_pml4) | pmap->pm_pcid);
+					load_cr3(cr3 | CR3_PCID_SAVE);
+					critical_exit();
+				}
+			} else {
+				invltlb_globpcid();
+			}
+		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
 			invltlb();
-		CPU_AND(&other_cpus, &pmap->pm_active);
+		CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+		if (pmap_pcid_enabled)
+			CPU_AND(&other_cpus, &pmap->pm_save);
+		else
+			CPU_AND(&other_cpus, &pmap->pm_active);
 		if (!CPU_EMPTY(&other_cpus))
-			smp_masked_invltlb(other_cpus);
+			smp_masked_invltlb(other_cpus, pmap);
 	}
 	sched_unpin();
 }
@@ -1129,8 +1314,10 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 	CPU_CLR(cpuid, &other_cpus);
 	if (pmap == kernel_pmap)
 		active = all_cpus;
-	else
+	else {
 		active = pmap->pm_active;
+		CPU_AND_ATOMIC(&pmap->pm_save, &active);
+	}
 	if (CPU_OVERLAP(&active, &other_cpus)) {
 		act.store = cpuid;
 		act.invalidate = active;
@@ -1193,6 +1380,8 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 	pde_store(pde, newpde);
 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 		pmap_update_pde_invalidate(va, newpde);
+	else
+		CPU_ZERO(&pmap->pm_save);
 }
 #endif /* !SMP */
 
@@ -1675,6 +1864,11 @@ pmap_pinit0(pmap_t pmap)
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+	if (pmap_pcid_enabled)
+		pmap->pm_pcid = 0;
+	else
+		pmap->pm_pcid = -1;
+	CPU_ZERO(&pmap->pm_save);
 }
 
 /*
@@ -1717,6 +1911,13 @@ pmap_pinit(pmap_t pmap)
 
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+	if (pmap_pcid_enabled) {
+		pmap->pm_pcid = alloc_unr(&pcid_unr);
+		pmap_invalidate_all(pmap);
+	} else
+		pmap->pm_pcid = -1;
+	CPU_ZERO(&pmap->pm_save);
+
 	return (1);
 }
 
@@ -1957,6 +2158,14 @@ pmap_release(pmap_t pmap)
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 
+	if (pmap_pcid_enabled) {
+		/*
+		 * Invalidate any left TLB entries, to allow the reuse
+		 * of the pcid.
+		 */
+		pmap_invalidate_all(pmap);
+	}
+
 	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 
 	for (i = 0; i < NKPML4E; i++)	/* KVA */
@@ -1968,6 +2177,8 @@ pmap_release(pmap_t pmap)
 	m->wire_count--;
 	atomic_subtract_int(&cnt.v_wire_count, 1);
 	vm_page_free_zero(m);
+	if (pmap->pm_pcid != -1)
+		free_unr(&pcid_unr, pmap->pm_pcid);
 }
 
 static int
@@ -5612,15 +5823,20 @@ pmap_activate(struct thread *td)
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	oldpmap = PCPU_GET(curpmap);
+	CPU_ZERO(&pmap->pm_save);
 	cpuid = PCPU_GET(cpuid);
 #ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
+	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
#else
 	CPU_CLR(cpuid, &oldpmap->pm_active);
 	CPU_SET(cpuid, &pmap->pm_active);
+	CPU_SET(cpuid, &pmap->pm_save);
 #endif
 	cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
+	if (pmap->pm_pcid != -1)
+		cr3 |= pmap->pm_pcid;
 	td->td_pcb->pcb_cr3 = cr3;
 	load_cr3(cr3);
 	PCPU_SET(curpmap, pmap);
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index ed0e7e9..3764f72 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -221,6 +221,8 @@ cpu_fork(td1, p2, td2, flags)
 	 */
 	pmap2 = vmspace_pmap(p2->p_vmspace);
 	pcb2->pcb_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap2->pm_pml4);
+	if (pmap2->pm_pcid != -1)
+		pcb2->pcb_cr3 |= pmap2->pm_pcid;
 	pcb2->pcb_r12 = (register_t)fork_return;	/* fork_trampoline argument */
 	pcb2->pcb_rbp = 0;
 	pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h
index 881fcd2..3d381c6 100644
--- a/sys/amd64/include/cpufunc.h
+++ b/sys/amd64/include/cpufunc.h
@@ -472,6 +472,26 @@ invlpg(u_long addr)
 	__asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
 }
 
+#define	INVPCID_ADDR	0
+#define	INVPCID_CTX	1
+#define	INVPCID_CTXGLOB	2
+#define	INVPCID_ALLCTX	3
+
+struct invpcid_descr {
+	uint64_t	pcid:12 __packed;
+	uint64_t	pad:52 __packed;
+	uint64_t	addr;
+} __packed;
+
+static __inline void
+invpcid(struct invpcid_descr *d, int type)
+{
+
+	/* invpcid (%rdx),%rax */
+	__asm __volatile(".byte 0x66,0x0f,0x38,0x82,0x02"
+	    : : "d" (d), "a" ((u_long)type) : "memory");
+}
+
 static __inline u_short
 rfs(void)
 {
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index ba4c618..44b3bab 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -76,6 +76,8 @@
 	struct system_segment_descriptor *pc_ldt;			\
 	/* Pointer to the CPU TSS descriptor */				\
 	struct system_segment_descriptor *pc_tss;			\
+	uint64_t pc_pm_save_cnt;					\
+	char	pc_invpcid_descr[16];					\
 	u_int	pc_cmci_mask		/* MCx banks for CMCI */	\
 	PCPU_XEN_FIELDS;						\
 	uint64_t pc_dbreg[16];		/* ddb debugging regs */	\
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index aacb9ba..fa42389 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -240,6 +240,8 @@ struct pmap {
 	pml4_entry_t	*pm_pml4;	/* KVA of level 4 page table */
 	TAILQ_HEAD(,pv_chunk)	pm_pvchunk;	/* list of mappings in pmap */
 	cpuset_t	pm_active;	/* active on cpus */
+	cpuset_t	pm_save;	/* Context valid on cpus mask */
+	int		pm_pcid;	/* context id */
 	/* spare u_int here due to padding */
 	struct pmap_statistics	pm_stats;	/* pmap statistics */
 	struct vm_radix		pm_root;	/* spare page table pages */
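Note (illustration, not part of the patch): the invpcid_descr added to cpufunc.h above is the 16-byte descriptor the INVPCID instruction expects — the PCID in the low 12 bits of the first quadword, the rest reserved, and the linear address in the second quadword. A standalone layout check, with the struct copied here purely for illustration outside the kernel tree:

/* invpcid_descr_chk.c - verify the 16-byte INVPCID descriptor layout. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct invpcid_descr {
	uint64_t pcid:12;
	uint64_t pad:52;
	uint64_t addr;
} __attribute__((packed));

int
main(void)
{
	struct invpcid_descr d;
	uint64_t first;

	assert(sizeof(d) == 16);
	memset(&d, 0, sizeof(d));
	d.pcid = 0xfff;			/* fills only the low 12 bits */
	d.addr = 0xdeadbeef;
	memcpy(&first, &d, sizeof(first));
	printf("first quadword: %#llx\n", (unsigned long long)first);
	return (0);
}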
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 16d87ea..d6cd476 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -54,6 +54,8 @@ inthand_t
 	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
 	IDTVEC(rendezvous);	/* handle CPU rendezvous */
 
+struct pmap;
+
 /* functions in mp_machdep.c */
 void	cpu_add(u_int apic_id, char boot_cpu);
 void	cpustop_handler(void);
@@ -67,13 +69,14 @@ int	ipi_nmi_handler(void);
 void	ipi_selected(cpuset_t cpus, u_int ipi);
 u_int	mp_bootaddress(u_int);
 void	smp_cache_flush(void);
-void	smp_invlpg(vm_offset_t addr);
-void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr);
-void	smp_invlpg_range(vm_offset_t startva, vm_offset_t endva);
-void	smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
+void	smp_invlpg(struct pmap *pmap, vm_offset_t addr);
+void	smp_masked_invlpg(cpuset_t mask, struct pmap *pmap, vm_offset_t addr);
+void	smp_invlpg_range(struct pmap *pmap, vm_offset_t startva,
 	    vm_offset_t endva);
-void	smp_invltlb(void);
-void	smp_masked_invltlb(cpuset_t mask);
+void	smp_masked_invlpg_range(cpuset_t mask, struct pmap *pmap,
+	    vm_offset_t startva, vm_offset_t endva);
+void	smp_invltlb(struct pmap *pmap);
+void	smp_masked_invltlb(cpuset_t mask, struct pmap *pmap);
 
 #endif /* !LOCORE */
 #endif /* SMP */
diff --git a/sys/kern/subr_unit.c b/sys/kern/subr_unit.c
index 9cf1781..3bf7aaf 100644
--- a/sys/kern/subr_unit.c
+++ b/sys/kern/subr_unit.c
@@ -68,8 +68,8 @@
  */
 
 #include
-#include
 #include
+#include
 
 #ifdef _KERNEL
 
@@ -187,22 +187,6 @@ CTASSERT(sizeof(struct unr) == sizeof(struct unrb));
 /* Number of bits in the bitmap */
 #define	NBITS	((int)sizeof(((struct unrb *)NULL)->map) * 8)
 
-/* Header element for a unr number space. */
-
-struct unrhdr {
-	TAILQ_HEAD(unrhd,unr)	head;
-	u_int			low;	/* Lowest item */
-	u_int			high;	/* Highest item */
-	u_int			busy;	/* Count of allocated items */
-	u_int			alloc;	/* Count of memory allocations */
-	u_int			first;	/* items in allocated from start */
-	u_int			last;	/* items free at end */
-	struct mtx		*mtx;
-	TAILQ_HEAD(unrfr,unr)	ppfree;	/* Items to be freed after mtx
-					   lock dropped */
-};
-
-
 #if defined(DIAGNOSTIC) || !defined(_KERNEL)
 /*
  * Consistency check function.
@@ -315,20 +299,12 @@ clean_unrhdr(struct unrhdr *uh)
 	mtx_unlock(uh->mtx);
 }
 
-/*
- * Allocate a new unrheader set.
- *
- * Highest and lowest valid values given as parameters.
- */
-
-struct unrhdr *
-new_unrhdr(int low, int high, struct mtx *mutex)
+void
+init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex)
 {
-	struct unrhdr *uh;
 
 	KASSERT(low >= 0 && low <= high,
 	    ("UNR: use error: new_unrhdr(%d, %d)", low, high));
-	uh = Malloc(sizeof *uh);
 	if (mutex != NULL)
 		uh->mtx = mutex;
 	else
@@ -340,6 +316,21 @@ new_unrhdr(int low, int high, struct mtx *mutex)
 	uh->first = 0;
 	uh->last = 1 + (high - low);
 	check_unrhdr(uh, __LINE__);
+}
+
+/*
+ * Allocate a new unrheader set.
+ *
+ * Highest and lowest valid values given as parameters.
+ */
+
+struct unrhdr *
+new_unrhdr(int low, int high, struct mtx *mutex)
+{
+	struct unrhdr *uh;
+
+	uh = Malloc(sizeof *uh);
+	init_unrhdr(uh, low, high, mutex);
 	return (uh);
 }
diff --git a/sys/sys/_unrhdr.h b/sys/sys/_unrhdr.h
new file mode 100644
index 0000000..f3c25d1
--- /dev/null
+++ b/sys/sys/_unrhdr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_UNRHDR_H
+#define _SYS_UNRHDR_H
+
+#include
+
+struct mtx;
+
+/* Header element for a unr number space. */
+
+struct unrhdr {
+	TAILQ_HEAD(unrhd,unr)	head;
+	u_int			low;	/* Lowest item */
+	u_int			high;	/* Highest item */
+	u_int			busy;	/* Count of allocated items */
+	u_int			alloc;	/* Count of memory allocations */
+	u_int			first;	/* items in allocated from start */
+	u_int			last;	/* items free at end */
+	struct mtx		*mtx;
+	TAILQ_HEAD(unrfr,unr)	ppfree;	/* Items to be freed after mtx
+					   lock dropped */
+};
+
+#endif
diff --git a/sys/sys/bitset.h b/sys/sys/bitset.h
index dee5542..7c24ecd 100644
--- a/sys/sys/bitset.h
+++ b/sys/sys/bitset.h
@@ -135,7 +135,14 @@
 	atomic_set_long(&(p)->__bits[__bitset_word(_s, n)],		\
 	    __bitset_mask((_s), n))
 
-/* Convenience functions catering special cases. */
+/* Convenience functions catering special cases. */
+#define	BIT_AND_ATOMIC(_s, d, s) do {					\
+	__size_t __i;							\
+	for (__i = 0; __i < __bitset_words((_s)); __i++)		\
+		atomic_clear_long(&(d)->__bits[__i],			\
+		    ~(s)->__bits[__i]);					\
+} while (0)
+
 #define	BIT_OR_ATOMIC(_s, d, s) do {					\
 	__size_t __i;							\
 	for (__i = 0; __i < __bitset_words((_s)); __i++)		\
diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h
index fc078d3..e1ee37d 100644
--- a/sys/sys/cpuset.h
+++ b/sys/sys/cpuset.h
@@ -55,6 +55,7 @@
 #define	CPU_NAND(d, s)			BIT_NAND(CPU_SETSIZE, d, s)
 #define	CPU_CLR_ATOMIC(n, p)		BIT_CLR_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_SET_ATOMIC(n, p)		BIT_SET_ATOMIC(CPU_SETSIZE, n, p)
+#define	CPU_AND_ATOMIC(n, p)		BIT_AND_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_OR_ATOMIC(d, s)		BIT_OR_ATOMIC(CPU_SETSIZE, d, s)
 #define	CPU_COPY_STORE_REL(f, t)	BIT_COPY_STORE_REL(CPU_SETSIZE, f, t)
 #define	CPU_FFS(p)			BIT_FFS(CPU_SETSIZE, p)
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index 4887d71..e3ea9cf 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -396,6 +396,7 @@ int	root_mounted(void);
 */
 struct unrhdr;
 struct unrhdr	*new_unrhdr(int low, int high, struct mtx *mutex);
+void	init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex);
 void	delete_unrhdr(struct unrhdr *uh);
 void	clean_unrhdr(struct unrhdr *uh);
 void	clean_unrhdrl(struct unrhdr *uh);
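Note (illustration, not part of the patch): the pmap.c changes export two knobs, vm.pmap.pcid_enabled (a read-only tunable) and vm.pmap.pcid_save_cnt (the summed per-CPU pc_pm_save_cnt counters). A quick way to watch them from userland, sketched with sysctlbyname(3):

/* pcidstat.c - read the PCID sysctls added by this patch. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int enabled;
	uint64_t save_cnt;
	size_t len;

	len = sizeof(enabled);
	if (sysctlbyname("vm.pmap.pcid_enabled", &enabled, &len, NULL, 0) == -1)
		perror("vm.pmap.pcid_enabled");
	else
		printf("pcid_enabled: %d\n", enabled);

	len = sizeof(save_cnt);
	if (sysctlbyname("vm.pmap.pcid_save_cnt", &save_cnt, &len, NULL, 0) == -1)
		perror("vm.pmap.pcid_save_cnt");
	else
		printf("pcid_save_cnt: %llu\n", (unsigned long long)save_cnt);
	return (0);
}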