diff -u -p --recursive head2/sys/amd64/conf/GENERIC head/sys/amd64/conf/GENERIC
--- head2/sys/amd64/conf/GENERIC	2015-04-22 10:35:57.545270000 -0700
+++ head/sys/amd64/conf/GENERIC	2015-04-19 17:40:06.405129000 -0700
@@ -354,3 +354,6 @@ device		vmx			# VMware VMXNET3 Ethernet
 
 # Netmap provides direct access to TX/RX rings on supported NICs
 device		netmap			# netmap(4) support
+options 	KTR
+options 	KTR_MASK=(KTR_GEN)
+options 	KTR_ENTRIES=(1024*1024)
diff -u -p --recursive head2/sys/amd64/include/vmm.h head/sys/amd64/include/vmm.h
--- head2/sys/amd64/include/vmm.h	2015-04-22 10:36:09.784930000 -0700
+++ head/sys/amd64/include/vmm.h	2015-04-22 10:27:03.437946000 -0700
@@ -497,6 +497,10 @@ enum vm_exitcode {
 	VM_EXITCODE_MONITOR,
 	VM_EXITCODE_MWAIT,
 	VM_EXITCODE_SVM,
+#ifdef NMI_IRET_TESTING
+	VM_EXITCODE_INVAL_NPT,	/* invalidate nested PTEs when executing iret */
+	VM_EXITCODE_IRET_GPF,	/* induce a #GP when executing iret */
+#endif
 	VM_EXITCODE_MAX
 };
 
diff -u -p --recursive head2/sys/amd64/vmm/amd/svm.c head/sys/amd64/vmm/amd/svm.c
--- head2/sys/amd64/vmm/amd/svm.c	2015-04-22 10:36:42.901410000 -0700
+++ head/sys/amd64/vmm/amd/svm.c	2015-04-22 17:39:14.702594000 -0700
@@ -1280,6 +1280,52 @@ vm_exit_svm(struct vm_exit *vme, uint64_
 	vme->u.svm.exitinfo2 = info2;
 }
 
+#ifdef NMI_IRET_TESTING
+void svm_swapgs(void *arg, int vcpu);
+
+SYSCTL_NODE(_debug, OID_AUTO, nit, CTLFLAG_RW, 0, "");
+
+static int invalidate_nested_mappings;
+SYSCTL_INT(_debug_nit, OID_AUTO, invalidate_nested_mappings, CTLFLAG_RW,
+    &invalidate_nested_mappings, 0,
+    "Invalidate nested page table entries before IRET tracing");
+
+static int bounce_to_userspace;
+SYSCTL_INT(_debug_nit, OID_AUTO, bounce_to_userspace, CTLFLAG_RW,
+    &bounce_to_userspace, 0, "Bounce up to userspace before executing iret");
+
+static int iret_trigger_gpf;
+SYSCTL_INT(_debug_nit, OID_AUTO, iret_trigger_gpf, CTLFLAG_RW,
+    &iret_trigger_gpf, 0, "Trigger a #GP when executing iret");
+
+void
+svm_swapgs(void *arg, int vcpu)
+{
+	struct seg_desc desc;
+	struct vmcb_state *state;
+	struct svm_softc *sc;
+	struct vmcb *vmcb;
+	uint64_t tmp;
+	int error;
+
+	sc = arg;
+	vmcb = svm_get_vmcb(sc, vcpu);
+	state = &vmcb->state;
+
+	error = vmcb_getdesc(sc, vcpu, VM_REG_GUEST_GS, &desc);
+	KASSERT(error == 0, ("%s: vmcb_getdesc error %d", __func__, error));
+
+	VCPU_CTR2(sc->vm, vcpu, "Swapping gs.base (%#lx) and kgsbase (%#lx)",
+	    desc.base, state->kernelgsbase);
+
+	tmp = state->kernelgsbase;
+	state->kernelgsbase = desc.base;
+	desc.base = tmp;
+	error = vmcb_setdesc(sc, vcpu, VM_REG_GUEST_GS, &desc);
+	KASSERT(error == 0, ("%s: vmcb_setdesc error %d", __func__, error));
+}
+#endif /* NMI_IRET_TESTING */
+
 static int
 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 {
@@ -1338,6 +1384,21 @@ svm_vmexit(struct svm_softc *svm_sc, int
 		vmexit->inst_length = 0;
 		nmi_enable_iret_tracing(svm_sc, vcpu);
 		handled = 1;
+#ifdef NMI_IRET_TESTING
+		/* Invalidate all nested page table entries */
+		if (invalidate_nested_mappings) {
+			vmexit->exitcode = VM_EXITCODE_INVAL_NPT;
+			vmexit->u.svm.exitinfo1 = bounce_to_userspace;
+			invalidate_nested_mappings = 0;
+			bounce_to_userspace = 0;
+			handled = 0;
+		} else if (iret_trigger_gpf) {
+			vmexit->exitcode = VM_EXITCODE_IRET_GPF;
+			svm_paging_info(vmcb, &vmexit->u.inst_emul.paging);
+			iret_trigger_gpf = 0;
+			handled = 0;
+		}
+#endif
 		break;
 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
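Two notes on the hunks above. The VCPU_CTR* tracepoints in vmm(4) log under KTR_GEN (vmm_ktr.h defines KTR_VMM as KTR_GEN), which is what the KTR options added to GENERIC capture; the trace buffer can be read back with ktrdump(8) or DDB's "show ktr". The debug.nit.* sysctls are one-shot triggers: the iret intercept in svm_vmexit() consumes and clears them on the next vNMI. A hypothetical host-side helper to arm one of the knobs might look like the sketch below; the program is illustrative, not part of the patch, and assumes vmm.ko was built with -DNMI_IRET_TESTING so the OIDs exist.

    /* nit_arm.c: arm a one-shot debug.nit knob on the host (sketch). */
    #include <sys/types.h>
    #include <sys/sysctl.h>

    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            int one = 1;

            /* The next vNMI iret becomes a VM_EXITCODE_IRET_GPF exit. */
            if (sysctlbyname("debug.nit.iret_trigger_gpf", NULL, NULL,
                &one, sizeof(one)) == -1)
                    err(1, "sysctlbyname(debug.nit.iret_trigger_gpf)");
            printf("armed; now inject a vNMI (e.g. bhyvectl --inject-nmi)\n");
            return (0);
    }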
diff -u -p --recursive head2/sys/amd64/vmm/io/vatpit.c head/sys/amd64/vmm/io/vatpit.c
--- head2/sys/amd64/vmm/io/vatpit.c	2015-04-22 10:37:03.975427000 -0700
+++ head/sys/amd64/vmm/io/vatpit.c	2015-04-22 10:29:35.369513000 -0700
@@ -399,6 +399,19 @@ vatpit_handler(struct vm *vm, int vcpuid
 	return (0);
 }
 
+#ifdef NMI_IRET_TESTING
+#include <sys/sysctl.h>
+SYSCTL_DECL(_debug_nit);
+
+static int recursive_nmi = 0;
+SYSCTL_INT(_debug_nit, OID_AUTO, recursive_nmi, CTLFLAG_RW,
+    &recursive_nmi, 0, NULL);
+
+static int nmisc_delay_usecs = 0;
+SYSCTL_INT(_debug_nit, OID_AUTO, delay_usecs, CTLFLAG_RW,
+    &nmisc_delay_usecs, 0, NULL);
+#endif
+
 int
 vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *eax)
@@ -415,6 +428,22 @@ vatpit_nmisc_handler(struct vm *vm, int
 		*eax = 0;
 
 		VATPIT_UNLOCK(vatpit);
+#ifdef NMI_IRET_TESTING
+		if (recursive_nmi) {
+			recursive_nmi = 0;
+			VCPU_CTR0(vm, vcpuid, "Injecting recursive "
+			    "vNMI");
+			vm_inject_nmi(vm, vcpuid);
+		}
+
+		if (nmisc_delay_usecs) {
+			VCPU_CTR1(vm, vcpuid, "Delaying %d usecs",
+			    nmisc_delay_usecs);
+			pause("nmisc",
+			    nmisc_delay_usecs * hz / 1000000);
+			nmisc_delay_usecs = 0;
+		}
+#endif
 	}
 
 	return (0);
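The vatpit.c hooks piggyback on the guest's read of system control port B: an x86 NMI handler conventionally reads that port to classify the NMI, and bhyve routes the access to vatpit_nmisc_handler(), which makes it a convenient spot to inject a recursive vNMI or stretch the handler's execution window. Note that the delay is converted to scheduler ticks, so values below one tick truncate toward zero. A guest-side sketch of the access that lands in this handler follows; the port constant (0x61, matching bhyve's NMISC_PORT) and the function name are illustrative, not part of the patch.

    #include <sys/types.h>
    #include <machine/cpufunc.h>        /* inb() */

    #define NMI_STATUS_PORT     0x61    /* system control port B */

    static uint8_t
    guest_nmi_classify(void)
    {
            /*
             * This inb exits to the hypervisor; with the knobs above
             * armed, bhyve may inject a recursive vNMI or pause before
             * completing the port read.
             */
            return (inb(NMI_STATUS_PORT));
    }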
diff -u -p --recursive head2/sys/amd64/vmm/vmm.c head/sys/amd64/vmm/vmm.c
--- head2/sys/amd64/vmm/vmm.c	2015-04-22 10:37:31.011927000 -0700
+++ head/sys/amd64/vmm/vmm.c	2015-04-22 10:33:06.851686000 -0700
@@ -1501,6 +1501,84 @@ restart:
 	retu = false;
 	vcpu->nextrip = vme->rip + vme->inst_length;
 	switch (vme->exitcode) {
+#ifdef NMI_IRET_TESTING
+	case VM_EXITCODE_INVAL_NPT:
+		VCPU_CTR0(vm, vcpuid, "Removing all nested pages");
+		pmap_remove(vmspace_pmap(vm->vmspace), 0,
+		    VM_MAXUSER_ADDRESS);
+		if (vme->u.svm.exitinfo1) {
+			VCPU_CTR0(vm, vcpuid, "Bouncing to userspace");
+			vme->exitcode = VM_EXITCODE_BOGUS;
+			retu = true;
+		}
+		break;
+
+	case VM_EXITCODE_IRET_GPF:
+	{
+		void svm_swapgs(void *arg, int vcpuid);
+		struct vm_copyinfo copyinfo[2];
+		struct vm_guest_paging *paging;
+		register_t rsp = ~0UL;		/* invalid */
+		register_t val = 0xdeadbeefbeefdead;
+
+		/*
+		 * When the NMI handler iret is intercepted the
+		 * guest %rsp is pointing to the return address.
+		 */
+		vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
+		KASSERT((rsp & 0x7) == 0, ("Guest NMI stack pointer "
+		    "not aligned: %#lx", rsp));
+
+		/*
+		 * Update the return address on the stack with a
+		 * non-canonical address which should trigger a
+		 * protection fault.
+		 */
+		paging = &vme->u.inst_emul.paging;
+		error = vm_copy_setup(vm, vcpuid, paging, rsp, 8,
+		    PROT_WRITE, copyinfo, nitems(copyinfo));
+		KASSERT(error == 0, ("vm_copy_setup error %d", error));
+		vm_copyout(vm, vcpuid, &val, copyinfo, 8);
+		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+		VCPU_CTR2(vm, vcpuid, "vNMI return address at %#lx "
+		    "updated to non-canonical value %#lx", rsp, val);
+
+		/*
+		 * Copy 'frame->tf_cs' to 'val' to inspect whether
+		 * the 'iret' is returning to userspace or kernel.
+		 */
+		error = vm_copy_setup(vm, vcpuid, paging, rsp + 8, 8,
+		    PROT_READ, copyinfo, nitems(copyinfo));
+		KASSERT(error == 0, ("vm_copy_setup error %d", error));
+		vm_copyin(vm, vcpuid, copyinfo, &val, 8);
+		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+
+		/*
+		 * XXX
+		 * The #GP fault handler will do a 'swapgs' if the
+		 * fault was caused by the 'iret'. This assumes that
+		 * kgsbase points to the pcpu region.
+		 *
+		 * If the NMI handler interrupted userspace then it
+		 * will do a 'swapgs' on entry and another 'swapgs'
+		 * right before 'iret'. In this case kgsbase would
+		 * be initialized as expected by the #GP handler.
+		 *
+		 * However, if the NMI handler interrupted the kernel
+		 * it will modify gsbase directly and not modify
+		 * kgsbase at all. Therefore the contents of kgsbase
+		 * are not guaranteed to be valid. On the other hand
+		 * the contents of gsbase are very likely to point to
+		 * the pcpu area. By swapping gsbase and kgsbase we
+		 * initialize kgsbase to point to the pcpu area as
+		 * expected by the #GP handler.
+		 */
+		if (ISPL(val) == SEL_KPL)
+			svm_swapgs(vm->cookie, vcpuid);
+
+		break;
+	}
+#endif
 	case VM_EXITCODE_SUSPENDED:
 		error = vm_handle_suspend(vm, vcpuid, &retu);
 		break;
diff -u -p --recursive head2/sys/modules/vmm/Makefile head/sys/modules/vmm/Makefile
--- head2/sys/modules/vmm/Makefile	2015-04-22 10:37:49.734428000 -0700
+++ head/sys/modules/vmm/Makefile	2015-04-21 11:06:42.010943000 -0700
@@ -6,6 +6,8 @@ SRCS=	opt_acpi.h opt_ddb.h device_if.h b
 SRCS+=	vmx_assym.h svm_assym.h
 DPSRCS=	vmx_genassym.c svm_genassym.c
 
+CFLAGS+= -DNMI_IRET_TESTING
+
 CFLAGS+= -DVMM_KEEP_STATS -DSMP
 CFLAGS+= -I${.CURDIR}/../../amd64/vmm
 CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io
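Putting it together: build vmm.ko with -DNMI_IRET_TESTING (the Makefile hunk above), arm one of the debug.nit knobs, and inject a vNMI into the guest. In the IRET_GPF case the exit handler overwrites the vNMI return address with 0xdeadbeefbeefdead, which is non-canonical on amd64, so the guest's iret immediately takes a #GP and exercises the NMI/iret tracing path. The canonical-address rule the test relies on is sketched below; this check is illustrative and not part of the patch.

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    is_canonical(uint64_t va)
    {
            /* amd64: bits 63:47 must all be copies of bit 47. */
            uint64_t high = va >> 47;

            return (high == 0 || high == 0x1ffff);
    }
    /* is_canonical(0xdeadbeefbeefdead) is false, hence the induced #GP. */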