Index: lib/libvmmapi/vmmapi.c
===================================================================
--- lib/libvmmapi/vmmapi.c	(revision 277046)
+++ lib/libvmmapi/vmmapi.c	(working copy)
@@ -1002,6 +1002,7 @@
 vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
 {
+	void *va;
 	uint64_t gpa;
 	int error, fault, i, n, off;
 
@@ -1021,7 +1022,11 @@
 		off = gpa & PAGE_MASK;
 		n = min(len, PAGE_SIZE - off);
 
-		iov->iov_base = (void *)gpa;
+		va = vm_map_gpa(ctx, gpa, n);
+		if (va == NULL)
+			return (-1);
+
+		iov->iov_base = va;
 		iov->iov_len = n;
 		iov++;
 		iovcnt--;
@@ -1033,19 +1038,24 @@
 }
 
 void
+vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt)
+{
+
+	return;
+}
+
+void
 vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
 {
 	const char *src;
 	char *dst;
-	uint64_t gpa;
 	size_t n;
 
 	dst = vp;
 	while (len) {
 		assert(iov->iov_len);
-		gpa = (uint64_t)iov->iov_base;
 		n = min(len, iov->iov_len);
-		src = vm_map_gpa(ctx, gpa, n);
+		src = iov->iov_base;
 		bcopy(src, dst, n);
 
 		iov++;
@@ -1060,15 +1070,13 @@
 {
 	const char *src;
 	char *dst;
-	uint64_t gpa;
 	size_t n;
 
 	src = vp;
 	while (len) {
 		assert(iov->iov_len);
-		gpa = (uint64_t)iov->iov_base;
 		n = min(len, iov->iov_len);
-		dst = vm_map_gpa(ctx, gpa, n);
+		dst = iov->iov_base;
 		bcopy(src, dst, n);
 
 		iov++;
Index: lib/libvmmapi/vmmapi.h
===================================================================
--- lib/libvmmapi/vmmapi.h	(revision 277046)
+++ lib/libvmmapi/vmmapi.h	(working copy)
@@ -132,6 +132,8 @@
     void *host_dst, size_t len);
 void	vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
     struct iovec *guest_iov, size_t len);
+void	vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
+    int iovcnt);
 
 /* RTC */
 int	vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value);
Index: sys/amd64/include/vmm.h
===================================================================
--- sys/amd64/include/vmm.h	(revision 277046)
+++ sys/amd64/include/vmm.h	(working copy)
@@ -445,8 +445,11 @@
 		rex_x:1,
 		rex_b:1,
 		rex_present:1,
+		repz_present:1,		/* REP/REPE/REPZ prefix */
+		repnz_present:1,	/* REPNE/REPNZ prefix */
 		opsize_override:1,	/* Operand size override */
-		addrsize_override:1;	/* Address size override */
+		addrsize_override:1,	/* Address size override */
+		segment_override:1;	/* Segment override */
 
 	uint8_t		mod:2,		/* ModRM byte */
 			reg:4,
@@ -462,6 +465,7 @@
 	uint8_t		scale;
 	int		base_register;		/* VM_REG_GUEST_xyz */
 	int		index_register;		/* VM_REG_GUEST_xyz */
+	int		segment_register;	/* VM_REG_GUEST_xyz */
 
 	int64_t		displacement;		/* optional addr displacement */
 	int64_t		immediate;		/* optional immediate operand */
@@ -628,4 +632,6 @@
 
 void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
 
+void vm_restart_instruction(void *vm, int vcpuid);
+
 #endif	/* _VMM_H_ */
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c	(revision 277046)
+++ sys/amd64/vmm/vmm.c	(working copy)
@@ -1525,6 +1525,19 @@
 	return (error);
 }
 
+void
+vm_restart_instruction(void *vm, int vcpuid)
+{
+	struct vm_exit *vmexit;
+
+	/*
+	 * By setting 'inst_length' to 0 we ensure that the instruction
+	 * pointer remains at the current instruction.
+	 */
+	vmexit = vm_exitinfo(vm, vcpuid);
+	vmexit->inst_length = 0;
+}
+
 int
 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
 {
@@ -1771,7 +1784,6 @@
     int errcode)
 {
 	struct vm_exception exception;
-	struct vm_exit *vmexit;
 	struct vm *vm;
 	int error;
 
@@ -1786,12 +1798,8 @@
 	/*
 	 * A fault-like exception allows the instruction to be restarted
 	 * after the exception handler returns.
-	 *
-	 * By setting the inst_length to 0 we ensure that the instruction
-	 * pointer remains at the faulting instruction.
 	 */
-	vmexit = vm_exitinfo(vm, vcpuid);
-	vmexit->inst_length = 0;
+	vm_restart_instruction(vm, vcpuid);
 }
 
 void
Index: sys/amd64/vmm/vmm_instruction_emul.c
===================================================================
--- sys/amd64/vmm/vmm_instruction_emul.c	(revision 277046)
+++ sys/amd64/vmm/vmm_instruction_emul.c	(working copy)
@@ -70,6 +70,7 @@
 	VIE_OP_TYPE_PUSH,
 	VIE_OP_TYPE_CMP,
 	VIE_OP_TYPE_POP,
+	VIE_OP_TYPE_MOVS,
 	VIE_OP_TYPE_LAST
 };
 
@@ -78,6 +79,7 @@
 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
 #define	VIE_OP_F_NO_MODRM	(1 << 3)
+#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
 
 static const struct vie_op two_byte_opcodes[256] = {
 	[0xB6] = {
@@ -133,6 +135,16 @@
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
 	},
+	[0xA4] = {
+		.op_byte = 0xA4,
+		.op_type = VIE_OP_TYPE_MOVS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
+	[0xA5] = {
+		.op_byte = 0xA5,
+		.op_type = VIE_OP_TYPE_MOVS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
 	[0xC6] = {
 		/* XXX Group 11 extended opcode - not just MOV */
 		.op_byte = 0xC6,
@@ -559,7 +571,221 @@
 	return (error);
 }
 
+/*
+ * Helper function to calculate and validate a linear address.
+ *
+ * Returns 0 on success and 1 if an exception was injected into the guest.
+ */
 static int
+get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
+    int opsize, int addrsize, int prot, enum vm_reg_name seg,
+    enum vm_reg_name gpr, uint64_t *gla)
+{
+	struct seg_desc desc;
+	uint64_t cr0, val, rflags;
+	int error;
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
+	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
+	    __func__, error, seg));
+
+	error = vie_read_register(vm, vcpuid, gpr, &val);
+	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
+	    error, gpr));
+
+	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
+	    addrsize, prot, gla)) {
+		if (seg == VM_REG_GUEST_SS)
+			vm_inject_ss(vm, vcpuid, 0);
+		else
+			vm_inject_gp(vm, vcpuid);
+		return (1);
+	}
+
+	if (vie_canonical_check(paging->cpu_mode, *gla)) {
+		if (seg == VM_REG_GUEST_SS)
+			vm_inject_ss(vm, vcpuid, 0);
+		else
+			vm_inject_gp(vm, vcpuid);
+		return (1);
+	}
+
+	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
+		vm_inject_ac(vm, vcpuid, 0);
+		return (1);
+	}
+
+	return (0);
+}
+
+static int
+emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+	struct vm_copyinfo copyinfo[2];
+#else
+	struct iovec copyinfo[2];
+#endif
+	uint64_t dstaddr, srcaddr, val;
+	uint64_t rcx, rdi, rsi, rflags;
+	int error, opsize, seg, repeat;
+
+	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
+	val = 0;
+	error = 0;
+
+	/*
+	 * XXX although the MOVS instruction is only supposed to be used with
+	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
+	 *
+	 * Empirically the "repnz" prefix has identical behavior to "rep"
+	 * and the zero flag does not make a difference.
+	 */
+	repeat = vie->repz_present | vie->repnz_present;
+
+	if (repeat) {
+		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
+		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
+
+		/*
+		 * The count register is %rcx, %ecx or %cx depending on the
+		 * address size of the instruction.
+		 */
+		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
+			return (0);
+	}
+
+	/*
+	 *	Source		Destination	Comments
+	 *	--------------------------------------------
+	 * (1)  memory		memory		n/a
+	 * (2)  memory		mmio		emulated
+	 * (3)  mmio		memory		emulated
+	 * (4)  mmio		mmio		not emulated
+	 *
+	 * At this point we don't have sufficient information to distinguish
+	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
+	 * out because it will succeed only when operating on regular memory.
+	 */
+
+	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
+	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr);
+	if (error)
+		goto done;
+
+	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
+	    copyinfo, nitems(copyinfo));
+	if (error == 0) {
+		/*
+		 * case (2): read from system memory and write to mmio.
+		 */
+		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
+		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
+		goto done;
+	} else if (error > 0) {
+		/*
+		 * Resume guest execution to handle fault.
+		 */
+#ifndef _KERNEL
+		vm_restart_instruction(vm, vcpuid);	/* XXX fixme */
+#endif
+		goto done;
+	} else {
+		/*
+		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
+		 * if 'srcaddr' is in the mmio space.
+		 */
+	}
+
+	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+	    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
+	if (error)
+		goto done;
+
+	error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
+	    PROT_WRITE, copyinfo, nitems(copyinfo));
+	if (error == 0) {
+		/*
+		 * case (3): read from MMIO and write to system memory.
+		 *
+		 * An MMIO read can have side-effects so we commit to it
+		 * only after vm_copy_setup() is successful. If a page-fault
+		 * needs to be injected into the guest then it will happen
+		 * before the MMIO read is attempted.
+		 */
+		error = memread(vm, vcpuid, gpa, &val, opsize, arg);
+		if (error)
+			goto done;
+
+		vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
+		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+	} else if (error > 0) {
+		/*
+		 * Resume guest execution to handle fault.
+		 */
+#ifndef _KERNEL
+		vm_restart_instruction(vm, vcpuid);	/* XXX fixme */
+#endif
+		goto done;
+	} else {
+		goto done;
+	}
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
+	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
+	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+	if (rflags & PSL_D) {
+		rsi -= opsize;
+		rdi -= opsize;
+	} else {
+		rsi += opsize;
+		rdi += opsize;
+	}
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
+	    vie->addrsize);
+	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
+	    vie->addrsize);
+	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
+
+	if (repeat) {
+		rcx = rcx - 1;
+		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
+		    rcx, vie->addrsize);
+		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
+
+		/*
+		 * Repeat the instruction if the count register is not zero.
+		 */
+		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
+			vm_restart_instruction(vm, vcpuid);
+	}
+done:
+	if (error < 0)
+		return (EFAULT);
+	else
+		return (0);
+}
+
+static int
 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
@@ -926,9 +1152,7 @@
 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
 		rsp += size;
 	}
-#ifdef _KERNEL
 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-#endif
 
 	if (error == 0) {
 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
@@ -1012,6 +1236,10 @@
 		error = emulate_movx(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg);
 		break;
+	case VIE_OP_TYPE_MOVS:
+		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
+		    memwrite, memarg);
+		break;
 	case VIE_OP_TYPE_AND:
 		error = emulate_and(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg);
@@ -1193,6 +1421,7 @@
 
 	vie->base_register = VM_REG_LAST;
 	vie->index_register = VM_REG_LAST;
+	vie->segment_register = VM_REG_LAST;
 
 	if (inst_length) {
 		bcopy(inst_bytes, vie->inst, inst_length);
@@ -1458,6 +1687,35 @@
 	vie->num_processed++;
 }
 
+static bool
+segment_override(uint8_t x, int *seg)
+{
+
+	switch (x) {
+	case 0x2E:
+		*seg = VM_REG_GUEST_CS;
+		break;
+	case 0x36:
+		*seg = VM_REG_GUEST_SS;
+		break;
+	case 0x3E:
+		*seg = VM_REG_GUEST_DS;
+		break;
+	case 0x26:
+		*seg = VM_REG_GUEST_ES;
+		break;
+	case 0x64:
+		*seg = VM_REG_GUEST_FS;
+		break;
+	case 0x65:
+		*seg = VM_REG_GUEST_GS;
+		break;
+	default:
+		return (false);
+	}
+	return (true);
+}
+
 static int
 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
 {
@@ -1471,6 +1729,12 @@
 			vie->opsize_override = 1;
 		else if (x == 0x67)
 			vie->addrsize_override = 1;
+		else if (x == 0xF3)
+			vie->repz_present = 1;
+		else if (x == 0xF2)
+			vie->repnz_present = 1;
+		else if (segment_override(x, &vie->segment_register))
+			vie->segment_override = 1;
 		else
 			break;
 
@@ -1923,8 +2187,10 @@
 	if (verify_inst_length(vie))
 		return (-1);
 
-	if (verify_gla(vm, cpuid, gla, vie))
-		return (-1);
+	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
+		if (verify_gla(vm, cpuid, gla, vie))
+			return (-1);
+	}
 
 	vie->decoded = 1;	/* success */
 
Index: usr.sbin/bhyve/bhyverun.c
===================================================================
--- usr.sbin/bhyve/bhyverun.c	(revision 277046)
+++ usr.sbin/bhyve/bhyverun.c	(working copy)
@@ -181,6 +181,16 @@
 }
 
 void
+vm_restart_instruction(void *arg, int vcpu)
+{
+	/*
+	 * Set the instruction length to 0 to ensure that the instruction is
+	 * restarted when the fault handler returns.
+	 */
+	vmexit[vcpu].inst_length = 0;
+}
+
+void
 vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
     int errcode)
 {
@@ -194,11 +204,7 @@
 	error = vm_inject_exception(ctx, vcpu, vector);
 	assert(error == 0);
 
-	/*
-	 * Set the instruction length to 0 to ensure that the instruction is
-	 * restarted when the fault handler returns.
-	 */
-	vmexit[vcpu].inst_length = 0;
+	vm_restart_instruction(ctx, vcpu);
 }
 
 void *
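
Usage note for the libvmmapi changes: vm_copy_setup() now hands back iovecs that already point into the host mapping of guest memory, and every successful setup should be paired with vm_copy_teardown(). A minimal userspace sketch follows; the helper name read_guest_lin() and its error-handling policy are illustrative assumptions rather than part of the patch, and 'paging' is assumed to describe the vcpu's paging state at the time of the exit.

#include <sys/param.h>		/* nitems() */
#include <sys/mman.h>		/* PROT_READ */
#include <sys/uio.h>

#include <machine/vmm.h>
#include <vmmapi.h>

/*
 * Hypothetical helper: copy 'len' bytes (len <= PAGE_SIZE, so at most two
 * iovecs are needed) from guest linear address 'gla' into 'buf'.
 */
static int
read_guest_lin(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[2];
	int error;

	/* Succeeds only when 'gla' maps to regular guest memory. */
	error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
	    iov, nitems(iov));
	if (error)
		return (error);	/* guest fault raised or unmappable (mmio) */

	vm_copyin(ctx, vcpu, iov, buf, len);
	vm_copy_teardown(ctx, vcpu, iov, nitems(iov));
	return (0);
}

The same pairing appears in emulate_movs() above. The userspace vm_copy_teardown() added by this patch is a no-op today, but calling it keeps device-model code correct should the library ever need to unmap.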
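A worked example of why emulate_movs() masks the count with vie_size2mask(vie->addrsize): the REP count lives in %cx, %ecx or %rcx depending on the address size. The size2mask table in the standalone program below mirrors what vie_size2mask() is understood to return; the table itself is not shown in this diff, so treat it as an assumption.

#include <stdint.h>
#include <stdio.h>

static const uint64_t size2mask[] = {
	[2] = 0xffff,			/* 16-bit address size: count in %cx */
	[4] = 0xffffffff,		/* 32-bit address size: count in %ecx */
	[8] = 0xffffffffffffffffULL,	/* 64-bit address size: count in %rcx */
};

int
main(void)
{
	/* %ecx is zero but the upper half of %rcx is not. */
	uint64_t rcx = 0x100000000ULL;

	/* addrsize 4: (rcx & 0xffffffff) == 0, so "rep movs" is finished. */
	printf("addrsize 4: %s\n",
	    (rcx & size2mask[4]) == 0 ? "count expired" : "keep iterating");
	/* addrsize 8: 2^32 iterations remain. */
	printf("addrsize 8: %s\n",
	    (rcx & size2mask[8]) == 0 ? "count expired" : "keep iterating");
	return (0);
}

With a 32-bit address size the count is already exhausted even though %rcx as a whole is non-zero, which is why emulate_movs() returns before touching guest memory, and why the decrement/restart path masks the updated count the same way.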
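Finally, the vm_restart_instruction() refactoring relies on a single invariant, stated in the patch's own comments: advancing the guest instruction pointer by 'inst_length' skips past the handled instruction, so zeroing it makes the guest re-execute the same instruction (after a fault injection, or for the next REP iteration). The toy program below is a schematic of that invariant only, not bhyve's actual run loop; all names in it are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the exit state; bhyve's struct vm_exit has more fields. */
struct toy_exit {
	uint64_t rip;		/* guest %rip at the exit */
	int	 inst_length;	/* length of the exited instruction */
};

static void
toy_restart_instruction(struct toy_exit *vmexit)
{
	/* Mirrors vm_restart_instruction(): keep %rip on this instruction. */
	vmexit->inst_length = 0;
}

int
main(void)
{
	struct toy_exit vmexit = { .rip = 0x1000, .inst_length = 2 };

	/* Normal completion: resume after the emulated instruction. */
	printf("next rip: 0x%llx\n",
	    (unsigned long long)(vmexit.rip + vmexit.inst_length));

	/* Fault injected, or a REP iteration pending: run it again. */
	toy_restart_instruction(&vmexit);
	printf("next rip: 0x%llx\n",
	    (unsigned long long)(vmexit.rip + vmexit.inst_length));
	return (0);
}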