diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index 1254f3f..ed1ccb5 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -122,6 +122,9 @@ done_store_dr:
 1:	movq	%rdx,%rcx
 	movl	xsave_mask,%eax
 	movl	xsave_mask+4,%edx
+	.globl	ctx_switch_xsave
+ctx_switch_xsave:
+	/* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
 	xsave	(%r8)
 	movq	%rcx,%rdx
 2:	smsw	%ax
diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c
index 77e11bc..1483314 100644
--- a/sys/amd64/amd64/fpu.c
+++ b/sys/amd64/amd64/fpu.c
@@ -135,10 +135,16 @@ static void	fpu_clean_state(void);
 SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
     NULL, 1, "Floating point instructions executed in hardware");
 
+static int use_xsaveopt;
 int use_xsave;			/* non-static for cpu_switch.S */
 uint64_t xsave_mask;		/* the same */
 static	struct savefpu *fpu_initialstate;
 
+struct xsave_area_elm_descr {
+	u_int	offset;
+	u_int	size;
+} *xsave_area_desc;
+
 void
 fpusave(void *addr)
 {
@@ -185,6 +191,17 @@ fpuinit_bsp1(void)
 	TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user);
 	xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
 	xsave_mask &= xsave_mask_user;
+
+	cpuid_count(0xd, 0x1, cp);
+	if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
+		/*
+		 * Patch the XSAVE instruction in the cpu_switch code
+		 * to XSAVEOPT.  We assume that XSAVE encoding used
+		 * REX byte, and set the bit 4 of the r/m byte.
+		 */
+		ctx_switch_xsave[3] |= 0x10;
+		use_xsaveopt = 1;
+	}
 }
 
 /*
@@ -255,6 +272,7 @@ static void
 fpuinitstate(void *arg __unused)
 {
 	register_t saveintr;
+	int cp[4], i, max_ext_n;
 
 	fpu_initialstate = malloc(cpu_max_ext_state_size, M_DEVBUF,
 	    M_WAITOK | M_ZERO);
@@ -276,6 +294,28 @@ fpuinitstate(void *arg __unused)
 	 */
 	bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc));
 
+	/*
+	 * Create a table describing the layout of the CPU Extended
+	 * Save Area.
+	 */
+	if (use_xsaveopt) {
+		max_ext_n = flsl(xsave_mask);
+		xsave_area_desc = malloc(max_ext_n * sizeof(struct
+		    xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO);
+		/* x87 state */
+		xsave_area_desc[0].offset = 0;
+		xsave_area_desc[0].size = 160;
+		/* XMM */
+		xsave_area_desc[1].offset = 160;
+		xsave_area_desc[1].size = 288 - 160;
+
+		for (i = 2; i < max_ext_n; i++) {
+			cpuid_count(0xd, i, cp);
+			xsave_area_desc[i].offset = cp[1];
+			xsave_area_desc[i].size = cp[0];
+		}
+	}
+
 	start_emulating();
 	intr_restore(saveintr);
 }
@@ -599,6 +639,9 @@ int
 fpugetregs(struct thread *td)
 {
 	struct pcb *pcb;
+	uint64_t *xstate_bv, bit;
+	char *sa;
+	int max_ext_n, i;
 
 	pcb = td->td_pcb;
 	if ((pcb->pcb_flags & PCB_USERFPUINITDONE) == 0) {
@@ -616,6 +659,25 @@ fpugetregs(struct thread *td)
 		return (_MC_FPOWNED_FPU);
 	} else {
 		critical_exit();
+		if (use_xsaveopt) {
+			/*
+			 * Handle partially saved state.
+			 */
+			sa = (char *)get_pcb_user_save_pcb(pcb);
+			xstate_bv = (uint64_t *)(sa + sizeof(struct savefpu) +
+			    offsetof(struct xstate_hdr, xstate_bv));
+			max_ext_n = flsl(xsave_mask);
+			for (i = 0; i < max_ext_n; i++) {
+				bit = 1ULL << i;
+				if ((*xstate_bv & bit) != 0)
+					continue;
+				bcopy((char *)fpu_initialstate +
+				    xsave_area_desc[i].offset,
+				    sa + xsave_area_desc[i].offset,
+				    xsave_area_desc[i].size);
+				*xstate_bv |= bit;
+			}
+		}
 		return (_MC_FPOWNED_PCB);
 	}
 }
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index ff11ea1..ff322bb 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -57,6 +57,7 @@ extern	u_int	cpu_procinfo;
 extern	u_int	cpu_procinfo2;
 extern	char	cpu_vendor[];
 extern	u_int	cpu_vendor_id;
+extern	char	ctx_switch_xsave[];
 extern	char	kstack[];
 extern	char	sigcode[];
 extern	int	szsigcode;
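
A note on the binary patch, not part of the diff itself: XSAVE is encoded as 0F AE /4 and XSAVEOPT as 0F AE /6, so the two instructions differ only in bit 4 of the ModRM byte (the reg-field bit that turns /4 into /6). Because "xsave (%r8)" needs a REX.B prefix to address %r8, the ModRM byte is the fourth byte of the instruction, which is why fpuinit_bsp1() ORs 0x10 into ctx_switch_xsave[3]. The userland sketch below illustrates the byte-level rewrite; the literal byte values are assumptions derived from the x86 encoding rules, not taken from the patch.

/*
 * Sketch only: rewrite the 4-byte "xsave (%r8)" into "xsaveopt (%r8)"
 * the same way fpuinit_bsp1() patches cpu_switch.S at boot.
 */
#include <stdio.h>

int
main(void)
{
	/* 41 0f ae 20: REX.B + 0F AE + ModRM 00.100.000 = xsave (%r8) */
	unsigned char insn[4] = { 0x41, 0x0f, 0xae, 0x20 };

	insn[3] |= 0x10;	/* reg field /4 -> /6: xsaveopt (%r8) */

	/* Prints "41 0f ae 30", which disassembles to xsaveopt (%r8). */
	printf("%02x %02x %02x %02x\n",
	    insn[0], insn[1], insn[2], insn[3]);
	return (0);
}

Patching the instruction once at boot keeps the context-switch path free of a use_xsaveopt branch, which is presumably why ctx_switch_xsave is exported instead of testing a flag in cpu_switch().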