Index: usr.sbin/bhyve/Makefile =================================================================== --- usr.sbin/bhyve/Makefile (revision 243595) +++ usr.sbin/bhyve/Makefile (working copy) @@ -7,11 +7,14 @@ DEBUG_FLAGS= -g -O0 SRCS= acpi.c atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c -SRCS+= instruction_emul.c ioapic.c mem.c mevent.c mptbl.c +SRCS+= ioapic.c mem.c mevent.c mptbl.c SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c uart.c SRCS+= xmsr.c spinup_ap.c +.PATH: ${.CURDIR}/../../sys/amd64/vmm +SRCS+= vmm_instruction_emul.c + NO_MAN= DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD} Index: usr.sbin/bhyve/fbsdrun.c =================================================================== --- usr.sbin/bhyve/fbsdrun.c (revision 243595) +++ usr.sbin/bhyve/fbsdrun.c (working copy) @@ -57,7 +57,6 @@ #include "mptbl.h" #include "pci_emul.h" #include "xmsr.h" -#include "instruction_emul.h" #include "ioapic.h" #include "spinup_ap.h" @@ -455,7 +454,8 @@ stats.vmexit_paging++; err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa, vmexit->rip, - vmexit->u.paging.cr3, vmexit->u.paging.rwx); + vmexit->u.paging.cr3, vmexit->u.paging.rwx, + &vmexit->u.paging.vie); if (err) { if (err == EINVAL) { Index: usr.sbin/bhyve/ioapic.c =================================================================== --- usr.sbin/bhyve/ioapic.c (revision 243595) +++ usr.sbin/bhyve/ioapic.c (working copy) @@ -42,7 +42,6 @@ #include "inout.h" #include "mem.h" -#include "instruction_emul.h" #include "fbsdrun.h" #include Index: usr.sbin/bhyve/mem.c =================================================================== --- usr.sbin/bhyve/mem.c (revision 243595) +++ usr.sbin/bhyve/mem.c (working copy) @@ -51,7 +51,6 @@ #include #include "mem.h" -#include "instruction_emul.h" struct mmio_rb_range { RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ @@ -134,33 +133,57 @@ RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, 
mmio_rb_range_compare); +static int +mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, + rval, mr->arg1, mr->arg2); + return (error); +} + +static int +mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, + &wval, mr->arg1, mr->arg2); + return (error); +} + int emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, uint64_t rip, - uint64_t cr3, int mode) + uint64_t cr3, int mode, struct vie *vie) { struct mmio_rb_range *entry; int err; - err = 0; - /* * First check the per-vCPU cache */ if (mmio_hint[vcpu] && paddr >= mmio_hint[vcpu]->mr_base && paddr <= mmio_hint[vcpu]->mr_end) { - err = emulate_instruction(ctx, vcpu, rip, cr3, paddr, mode, - &mmio_hint[vcpu]->mr_param); - } else { - if (mmio_rb_lookup(paddr, &entry)) { - err = ENOENT; - } else { - mmio_hint[vcpu] = entry; - err = emulate_instruction(ctx, vcpu, rip, cr3, paddr, - mode, &entry->mr_param); - } + entry = mmio_hint[vcpu]; + } else + entry = NULL; + + if (entry == NULL) { + if (mmio_rb_lookup(paddr, &entry)) + return (ESRCH); + + /* Update the per-vCPU cache */ + mmio_hint[vcpu] = entry; } + assert(entry != NULL && entry == mmio_hint[vcpu]); + + err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, + mem_read, mem_write, &entry->mr_param); return (err); } Index: usr.sbin/bhyve/mem.h =================================================================== --- usr.sbin/bhyve/mem.h (revision 243595) +++ usr.sbin/bhyve/mem.h (working copy) @@ -51,7 +51,7 @@ void init_mem(void); int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, uint64_t rip, - uint64_t cr3, int mode); + uint64_t cr3, int mode, struct vie *vie); int register_mem(struct mem_range *memp); Index: usr.sbin/bhyve/instruction_emul.c 
=================================================================== --- usr.sbin/bhyve/instruction_emul.c (revision 243595) +++ usr.sbin/bhyve/instruction_emul.c (working copy) @@ -1,641 +0,0 @@ -/*- - * Copyright (c) 2012 Sandvine, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#include -#include -#include -#include -#include - -#include "fbsdrun.h" -#include "mem.h" -#include "instruction_emul.h" - -#define PREFIX_LOCK 0xF0 -#define PREFIX_REPNE 0xF2 -#define PREFIX_REPE 0xF3 -#define PREFIX_CS_OVERRIDE 0x2E -#define PREFIX_SS_OVERRIDE 0x36 -#define PREFIX_DS_OVERRIDE 0x3E -#define PREFIX_ES_OVERRIDE 0x26 -#define PREFIX_FS_OVERRIDE 0x64 -#define PREFIX_GS_OVERRIDE 0x65 -#define PREFIX_BRANCH_NOT_TAKEN 0x2E -#define PREFIX_BRANCH_TAKEN 0x3E -#define PREFIX_OPSIZE 0x66 -#define is_opsz_prefix(x) ((x) == PREFIX_OPSIZE) -#define PREFIX_ADDRSIZE 0x67 - -#define OPCODE_2BYTE_ESCAPE 0x0F -#define OPCODE_3BYTE_ESCAPE 0x38 - -#define MODRM_MOD_MASK 0xC0 -#define MODRM_MOD_SHIFT 6 -#define MODRM_RM_MASK 0x07 -#define MODRM_RM_SHIFT 0 -#define MODRM_REG_MASK 0x38 -#define MODRM_REG_SHIFT 3 - -#define MOD_INDIRECT 0x0 -#define MOD_INDIRECT_DISP8 0x1 -#define MOD_INDIRECT_DISP32 0x2 -#define MOD_DIRECT 0x3 - -#define RM_EAX 0x0 -#define RM_ECX 0x1 -#define RM_EDX 0x2 -#define RM_EBX 0x3 -#define RM_SIB 0x4 -#define RM_DISP32 0x5 -#define RM_EBP RM_DISP32 -#define RM_ESI 0x6 -#define RM_EDI 0x7 - -#define REG_EAX 0x0 -#define REG_ECX 0x1 -#define REG_EDX 0x2 -#define REG_EBX 0x3 -#define REG_ESP 0x4 -#define REG_EBP 0x5 -#define REG_ESI 0x6 -#define REG_EDI 0x7 -#define REG_R8 0x8 -#define REG_R9 0x9 -#define REG_R10 0xA -#define REG_R11 0xB -#define REG_R12 0xC -#define REG_R13 0xD -#define REG_R14 0xE -#define REG_R15 0xF - -#define HAS_MODRM 1 -#define FROM_RM (1<<1) -#define FROM_REG (1<<2) -#define TO_RM (1<<3) -#define TO_REG (1<<4) -#define ZEXT (1<<5) -#define FROM_8 (1<<6) -#define FROM_16 (1<<7) -#define TO_8 (1<<8) -#define TO_16 (1<<9) - -#define REX_MASK 0xF0 -#define REX_PREFIX 0x40 -#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX ) -#define REX_W_MASK 0x8 -#define REX_R_MASK 0x4 -#define REX_X_MASK 0x2 -#define REX_B_MASK 0x1 - -#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \ - 
(x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \ - (x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \ - (x) == PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \ - (x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \ - (x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \ - (x) == PREFIX_ADDRSIZE || is_rex_prefix((x))) - -#define PAGE_FRAME_MASK 0x80 -#define PAGE_OFFSET_MASK 0xFFF -#define PAGE_TABLE_ENTRY_MASK (~PAGE_OFFSET_MASK) -#define PML4E_OFFSET_MASK 0x0000FF8000000000 -#define PML4E_SHIFT 39 - -#define INSTR_VERIFY - -struct decoded_instruction -{ - void *instruction; - uint8_t *opcode; - uint8_t *modrm; - uint8_t *sib; - uint8_t *displacement; - uint8_t *immediate; - - uint16_t opcode_flags; - - uint8_t addressing_mode; - uint8_t rm; - uint8_t reg; - uint8_t opsz; - uint8_t rex_r; - uint8_t rex_w; - uint8_t rex_b; - uint8_t rex_x; - - int32_t disp; -}; - -static enum vm_reg_name vm_reg_name_mappings[] = { - [REG_EAX] = VM_REG_GUEST_RAX, - [REG_EBX] = VM_REG_GUEST_RBX, - [REG_ECX] = VM_REG_GUEST_RCX, - [REG_EDX] = VM_REG_GUEST_RDX, - [REG_ESP] = VM_REG_GUEST_RSP, - [REG_EBP] = VM_REG_GUEST_RBP, - [REG_ESI] = VM_REG_GUEST_RSI, - [REG_EDI] = VM_REG_GUEST_RDI, - [REG_R8] = VM_REG_GUEST_R8, - [REG_R9] = VM_REG_GUEST_R9, - [REG_R10] = VM_REG_GUEST_R10, - [REG_R11] = VM_REG_GUEST_R11, - [REG_R12] = VM_REG_GUEST_R12, - [REG_R13] = VM_REG_GUEST_R13, - [REG_R14] = VM_REG_GUEST_R14, - [REG_R15] = VM_REG_GUEST_R15 -}; - -uint16_t one_byte_opcodes[256] = { - [0x88] = HAS_MODRM | FROM_REG | TO_RM | TO_8 | FROM_8, - [0x89] = HAS_MODRM | FROM_REG | TO_RM, - [0x8B] = HAS_MODRM | FROM_RM | TO_REG, -}; - -uint16_t two_byte_opcodes[256] = { - [0xB6] = HAS_MODRM | FROM_RM | TO_REG | ZEXT | FROM_8, - [0xB7] = HAS_MODRM | FROM_RM | TO_REG | ZEXT | FROM_16, -}; - -static uintptr_t -gla2gpa(uint64_t gla, uint64_t guest_cr3) -{ - uint64_t *table; - uint64_t mask, entry; - int level, shift; - uintptr_t page_frame; - - table = paddr_guest2host(guest_cr3 
& PAGE_TABLE_ENTRY_MASK); - mask = PML4E_OFFSET_MASK; - shift = PML4E_SHIFT; - for (level = 0; level < 4; ++level) - { - entry = table[(gla & mask) >> shift]; - table = (uint64_t*)(entry & PAGE_TABLE_ENTRY_MASK); - - /* This entry does not point to another page table */ - if (entry & PAGE_FRAME_MASK || level >= 3) - break; - - table = paddr_guest2host((uintptr_t)table); - mask >>= 9; - shift -= 9; - } - - mask = (1 << shift) - 1; - page_frame = ((uintptr_t)table & ~mask); - return (page_frame | (gla & mask)); -} - -static void * -gla2hla(uint64_t gla, uint64_t guest_cr3) -{ - uintptr_t gpa; - - gpa = gla2gpa(gla, guest_cr3); - - return (paddr_guest2host(gpa)); -} - -/* - * Decodes all of the prefixes of the instruction. Only a subset of REX - * prefixes are currently supported. If any unsupported prefix is - * encountered, returns -1. - */ -static int -decode_prefixes(struct decoded_instruction *decoded) -{ - uint8_t *current_prefix; - - current_prefix = decoded->instruction; - - if (is_rex_prefix(*current_prefix)) { - decoded->rex_w = *current_prefix & REX_W_MASK; - decoded->rex_r = *current_prefix & REX_R_MASK; - decoded->rex_x = *current_prefix & REX_X_MASK; - decoded->rex_b = *current_prefix & REX_B_MASK; - current_prefix++; - } else if (is_opsz_prefix(*current_prefix)) { - decoded->opsz = 1; - current_prefix++; - } else if (is_prefix(*current_prefix)) { - return (-1); - } - - decoded->opcode = current_prefix; - return (0); -} - -/* - * Decodes the instruction's opcode. If the opcode is not understood, returns - * -1 indicating an error. Sets the instruction's mod_rm pointer to the - * location of the ModR/M field. 
- */ -static int -decode_opcode(struct decoded_instruction *decoded) -{ - uint8_t opcode; - uint16_t flags; - int extra; - - opcode = *decoded->opcode; - extra = 0; - - if (opcode != 0xf) - flags = one_byte_opcodes[opcode]; - else { - opcode = *(decoded->opcode + 1); - flags = two_byte_opcodes[opcode]; - extra = 1; - } - - if (!flags) - return (-1); - - if (flags & HAS_MODRM) { - decoded->modrm = decoded->opcode + 1 + extra; - } - - decoded->opcode_flags = flags; - - return (0); -} - -/* - * Decodes the instruction's ModR/M field. Sets the instruction's sib pointer - * to the location of the SIB if one is expected to be present, or 0 if not. - */ -static int -decode_mod_rm(struct decoded_instruction *decoded) -{ - uint8_t modrm; - uint8_t *extension_operands; - - if (decoded->modrm) { - modrm = *decoded->modrm; - - decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT; - decoded->rm = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT; - decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT; - - if (decoded->rex_b) - decoded->rm |= (1<<3); - - if (decoded->rex_r) - decoded->reg |= (1<<3); - - extension_operands = decoded->modrm + 1; - - if (decoded->rm == RM_SIB) { - decoded->sib = decoded->modrm + 1; - extension_operands = decoded->sib + 1; - } - - switch (decoded->addressing_mode) { - case MOD_INDIRECT: - case MOD_DIRECT: - decoded->displacement = 0; - break; - case MOD_INDIRECT_DISP8: - decoded->displacement = extension_operands; - break; - case MOD_INDIRECT_DISP32: - decoded->displacement = extension_operands; - break; - } - } - - return (0); -} - -/* - * Decodes the instruction's SIB field. No such instructions are currently - * supported, so do nothing and return -1 if there is a SIB field, 0 otherwise. - */ -static int -decode_sib(struct decoded_instruction *decoded) -{ - - if (decoded->sib) - return (-1); - - return (0); -} - -/* - * Grabs and saves the instruction's immediate operand and displacement if - * they are present. 
Immediates are not currently supported, so if an - * immediate is present it will return -1 indicating an error. - */ -static int -decode_extension_operands(struct decoded_instruction *decoded) -{ - - if (decoded->displacement) { - if (decoded->addressing_mode == MOD_INDIRECT_DISP8) { - decoded->disp = *((int8_t *)decoded->displacement); - } else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) { - decoded->disp = *((int32_t *)decoded->displacement); - } - } - - if (decoded->immediate) { - return (-1); - } - - return (0); -} - -static int -decode_instruction(void *instr, struct decoded_instruction *decoded) -{ - int error; - - bzero(decoded, sizeof(*decoded)); - decoded->instruction = instr; - - error = decode_prefixes(decoded); - if (error) - return (error); - - error = decode_opcode(decoded); - if (error) - return (error); - - error = decode_mod_rm(decoded); - if (error) - return (error); - - error = decode_sib(decoded); - if (error) - return (error); - - error = decode_extension_operands(decoded); - if (error) - return (error); - - return (0); -} - -static enum vm_reg_name -get_vm_reg_name(uint8_t reg) -{ - - return (vm_reg_name_mappings[reg]); -} - -static uint64_t -adjust_operand(const struct decoded_instruction *instruction, uint64_t val, - int size) -{ - uint64_t ret; - - if (instruction->opcode_flags & ZEXT) { - switch (size) { - case 1: - ret = val & 0xff; - break; - case 2: - ret = val & 0xffff; - break; - case 4: - ret = val & 0xffffffff; - break; - case 8: - ret = val; - break; - default: - break; - } - } else { - /* - * Extend the sign - */ - switch (size) { - case 1: - ret = (int8_t)(val & 0xff); - break; - case 2: - ret = (int16_t)(val & 0xffff); - break; - case 4: - ret = (int32_t)(val & 0xffffffff); - break; - case 8: - ret = val; - break; - default: - break; - } - } - - return (ret); -} - -static int -get_operand(struct vmctx *vm, int vcpu, uint64_t gpa, uint64_t guest_cr3, - const struct decoded_instruction *instruction, uint64_t *operand, - 
struct mem_range *mr) -{ - enum vm_reg_name regname; - uint64_t reg; - int error; - uint8_t rm, addressing_mode, size; - - if (instruction->opcode_flags & FROM_RM) { - rm = instruction->rm; - addressing_mode = instruction->addressing_mode; - } else if (instruction->opcode_flags & FROM_REG) { - rm = instruction->reg; - addressing_mode = MOD_DIRECT; - } else - return (-1); - - /* - * Determine size of operand - */ - size = 4; - if (instruction->opcode_flags & FROM_8) { - size = 1; - } else if (instruction->opcode_flags & FROM_16 || - instruction->opsz) { - size = 2; - } - - regname = get_vm_reg_name(rm); - error = vm_get_register(vm, vcpu, regname, &reg); - if (error) - return (error); - - switch (addressing_mode) { - case MOD_DIRECT: - *operand = reg; - error = 0; - break; - case MOD_INDIRECT: - case MOD_INDIRECT_DISP8: - case MOD_INDIRECT_DISP32: -#ifdef INSTR_VERIFY - { - uintptr_t target; - - target = gla2gpa(reg, guest_cr3); - target += instruction->disp; - assert(gpa == target); - } -#endif - error = (*mr->handler)(vm, vcpu, MEM_F_READ, gpa, size, - operand, mr->arg1, mr->arg2); - break; - default: - return (-1); - } - - if (!error) - *operand = adjust_operand(instruction, *operand, size); - - return (error); -} - -static uint64_t -adjust_write(uint64_t reg, uint64_t operand, int size) -{ - uint64_t val; - - switch (size) { - case 1: - val = (reg & ~0xff) | (operand & 0xff); - break; - case 2: - val = (reg & ~0xffff) | (operand & 0xffff); - break; - case 4: - val = (reg & ~0xffffffff) | (operand & 0xffffffff); - break; - case 8: - val = operand; - default: - break; - } - - return (val); -} - -static int -perform_write(struct vmctx *vm, int vcpu, uint64_t gpa, uint64_t guest_cr3, - const struct decoded_instruction *instruction, uint64_t operand, - struct mem_range *mr) -{ - enum vm_reg_name regname; - uintptr_t target; - int error; - int size; - uint64_t reg; - uint8_t addressing_mode; - - if (instruction->opcode_flags & TO_RM) { - reg = instruction->rm; - 
addressing_mode = instruction->addressing_mode; - } else if (instruction->opcode_flags & TO_REG) { - reg = instruction->reg; - addressing_mode = MOD_DIRECT; - } else - return (-1); - - /* - * Determine the operand size. rex.w has priority - */ - size = 4; - if (instruction->rex_w) { - size = 8; - } else if (instruction->opcode_flags & TO_8) { - size = 1; - } else if (instruction->opsz) { - size = 2; - }; - - switch(addressing_mode) { - case MOD_DIRECT: - regname = get_vm_reg_name(reg); - error = vm_get_register(vm, vcpu, regname, &reg); - if (error) - return (error); - operand = adjust_write(reg, operand, size); - - return (vm_set_register(vm, vcpu, regname, operand)); - case MOD_INDIRECT: - case MOD_INDIRECT_DISP8: - case MOD_INDIRECT_DISP32: -#ifdef INSTR_VERIFY - regname = get_vm_reg_name(reg); - error = vm_get_register(vm, vcpu, regname, &reg); - assert(!error); - target = gla2gpa(reg, guest_cr3); - target += instruction->disp; - assert(gpa == target); -#endif - error = (*mr->handler)(vm, vcpu, MEM_F_WRITE, gpa, size, - &operand, mr->arg1, mr->arg2); - return (error); - default: - return (-1); - } -} - -static int -emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t gpa, - uint64_t cr3, - const struct decoded_instruction *instruction, - struct mem_range *mr) -{ - uint64_t operand; - int error; - - error = get_operand(vm, vcpu, gpa, cr3, instruction, &operand, mr); - if (error) - return (error); - - return perform_write(vm, vcpu, gpa, cr3, instruction, operand, mr); -} - -int -emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3, - uint64_t gpa, int flags, struct mem_range *mr) -{ - struct decoded_instruction instr; - int error; - void *instruction; - - instruction = gla2hla(rip, cr3); - - error = decode_instruction(instruction, &instr); - if (!error) - error = emulate_decoded_instruction(vm, vcpu, gpa, cr3, - &instr, mr); - - return (error); -} Index: usr.sbin/bhyve/pci_passthru.c 
=================================================================== --- usr.sbin/bhyve/pci_passthru.c (revision 243595) +++ usr.sbin/bhyve/pci_passthru.c (working copy) @@ -48,7 +48,6 @@ #include #include "pci_emul.h" #include "mem.h" -#include "instruction_emul.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" Index: usr.sbin/bhyve/instruction_emul.h =================================================================== --- usr.sbin/bhyve/instruction_emul.h (revision 243595) +++ usr.sbin/bhyve/instruction_emul.h (working copy) @@ -1,36 +0,0 @@ -/*- - * Copyright (c) 2012 Sandvine, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _INSTRUCTION_EMUL_H_ -#define _INSTRUCTION_EMUL_H_ - -int emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, - uint64_t cr3, uint64_t gpa, int flags, - struct mem_range *mr); - -#endif Index: sys/amd64/include/vmm_instruction_emul.h =================================================================== --- sys/amd64/include/vmm_instruction_emul.h (revision 0) +++ sys/amd64/include/vmm_instruction_emul.h (working copy) @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1; + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. 
+ * + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t mrr, mem_region_write_t mrw, + void *mrarg); + +#ifdef _KERNEL +/* + * APIs to fetch and decode the instruction from nested page fault handler. + */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + uint64_t rip, int inst_length, uint64_t cr3, + struct vie *vie); + +int vmm_decode_instruction(struct vm *vm, int cpuid, + uint64_t gla, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ Property changes on: sys/amd64/include/vmm_instruction_emul.h ___________________________________________________________________ Added: svn:mime-type ## -0,0 +1 ## +text/plain Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H Added: svn:eol-style ## -0,0 +1 ## +native Index: sys/amd64/include/vmm.h =================================================================== --- sys/amd64/include/vmm.h (revision 243595) +++ sys/amd64/include/vmm.h (working copy) @@ -150,6 +150,8 @@ #endif /* KERNEL */ +#include + #define VM_MAXCPU 8 /* maximum virtual cpus */ /* @@ -268,6 +270,7 @@ uint64_t cr3; uint64_t gpa; int rwx; + struct vie vie; } paging; /* * VMX specific payload. 
Used when there is no "better" Index: sys/amd64/vmm/vmm_lapic.c =================================================================== --- sys/amd64/vmm/vmm_lapic.c (revision 243595) +++ sys/amd64/vmm/vmm_lapic.c (working copy) @@ -34,12 +34,12 @@ #include #include +#include #include #include "vmm_ipi.h" #include "vmm_lapic.h" #include "vlapic.h" -#include "vmm_instruction_emul.h" static int lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) @@ -177,64 +177,45 @@ } int -lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie) +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) { - int handled, error; - uint64_t val; + int error; + uint64_t off; struct vlapic *vlapic; - const int UNHANDLED = 0; + off = gpa - DEFAULT_APIC_BASE; - vlapic = vm_lapic(vm, cpu); - - /* Only 32-bit accesses to local apic */ - if (vie->op_size != VIE_OP_SIZE_32BIT) - return (UNHANDLED); - /* - * XXX - * The operand register in which we store the result of the - * read must be a GPR that we can modify even if the vcpu - * is "running". All the GPRs qualify except for %rsp. - * - * This is a limitation of the vm_set_register() API - * and can be fixed if necessary. + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. 
*/ - if (vie->operand_register == VM_REG_GUEST_RSP) - return (UNHANDLED); + if (size != 4 || off & 0xf) + return (EINVAL); - if (read) { - if ((vie->opcode_flags & VIE_F_TO_REG) == 0) - return (UNHANDLED); + vlapic = vm_lapic(vm, cpu); + error = vlapic_op_mem_write(vlapic, off, DWORD, wval); + return (error); +} - if (vie->operand_register >= VM_REG_LAST) - return (UNHANDLED); +int +lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; - handled = lapic_read(vlapic, offset, &val); - if (handled) { - error = vm_set_register(vm, cpu, vie->operand_register, - val); - if (error) - panic("lapic_mmio: error %d setting gpr %d", - error, vie->operand_register); - } - } else { - if ((vie->opcode_flags & VIE_F_FROM_REG) && - (vie->operand_register < VM_REG_LAST)) { - error = vm_get_register(vm, cpu, vie->operand_register, - &val); - if (error) { - panic("lapic_mmio: error %d getting gpr %d", - error, vie->operand_register); - } - } else if (vie->opcode_flags & VIE_F_FROM_IMM) { - val = vie->immediate; - } else { - return (UNHANDLED); - } + off = gpa - DEFAULT_APIC_BASE; - handled = lapic_write(vlapic, offset, val); - } + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. 
+ */ + if (size != 4 || off & 0xf) + return (EINVAL); - return (handled); + vlapic = vm_lapic(vm, cpu); + error = vlapic_op_mem_read(vlapic, off, DWORD, rval); + return (error); } Index: sys/amd64/vmm/vmm_lapic.h =================================================================== --- sys/amd64/vmm/vmm_lapic.h (revision 243595) +++ sys/amd64/vmm/vmm_lapic.h (working copy) @@ -30,13 +30,15 @@ #define _VMM_LAPIC_H_ struct vm; -struct vie; boolean_t lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); -int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *); +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); int lapic_timer_tick(struct vm *vm, int cpu); Index: sys/amd64/vmm/vmm_instruction_emul.c =================================================================== --- sys/amd64/vmm/vmm_instruction_emul.c (revision 243595) +++ sys/amd64/vmm/vmm_instruction_emul.c (working copy) @@ -30,6 +30,7 @@ #include __FBSDID("$FreeBSD$"); +#ifdef _KERNEL #include #include #include @@ -40,11 +41,61 @@ #include #include #include +#else /* !_KERNEL */ +#include +#include -#include "vmm_instruction_emul.h" +#include -#define GB (1024 * 1024 * 1024) +#include +#endif /* _KERNEL */ + + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ + +static const struct vie_op one_byte_opcodes[256] = { + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, 
+ }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RCX, @@ -64,17 +115,232 @@ VM_REG_GUEST_R15 }; +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_valid_register(enum vm_reg_name reg) +{ +#ifdef _KERNEL + /* + * XXX + * The operand register in which we store the result of the + * read must be a GPR that we can modify even if the vcpu + * is "running". All the GPRs qualify except for %rsp. + * + * This is a limitation of the vm_set_register() API + * and can be fixed if necessary. + */ + if (reg == VM_REG_GUEST_RSP) + return (0); +#endif + return (1); +} + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + if (!vie_valid_register(reg)) + return (EINVAL); + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + if (!vie_valid_register(reg)) + return (EINVAL); + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +/* + * The following simplifying assumptions are made during emulation: + * + * - guest is in 64-bit mode + * - default address size is 64-bits + * - default operand size is 32-bits + * + * - operand size 
override is not supported + * + * - address size override is not supported + */ +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = 4; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + if (vie->rex_w) + size = 8; + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + if (vie->rex_w) + size = 8; + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xC7: + /* + * MOV from imm32 to mem (ModRM:r/m) + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate; /* already sign-extended */ + + if (vie->rex_w) + size = 8; + + if (size != 8) + val &= size2mask[size]; + + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val1, val2; + + size = 4; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. 
+ * + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + if (vie->rex_w) + size = 8; + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + val1 &= val2; + error = vie_update_register(vm, vcpuid, reg, val1, size); + break; + default: + break; + } + return (error); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, + void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +#ifdef _KERNEL static void vie_init(struct vie *vie) { bzero(vie, sizeof(struct vie)); - vie->op_size = VIE_OP_SIZE_32BIT; - vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; - vie->operand_register = VM_REG_LAST; } static int @@ -129,7 +395,7 @@ } int -vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, +vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length, uint64_t cr3, struct vie *vie) { int n, err; @@ -172,6 +438,7 @@ static int vie_peek(struct vie *vie, uint8_t *x) { + if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); @@ -182,8 +449,6 @@ static void vie_advance(struct vie *vie) { - if (vie->num_processed >= vie->num_valid) - panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid); vie->num_processed++; } @@ -213,24 +478,16 @@ { uint8_t x; - static const uint8_t flags[256] = { - [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | 
VIE_F_TO_RM, - [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG, - [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM, - }; - if (vie_peek(vie, &x)) return (-1); - vie->opcode_byte = x; - vie->opcode_flags = flags[x]; + vie->op = one_byte_opcodes[x]; + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + vie_advance(vie); - - if (vie->opcode_flags == 0) - return (-1); - else - return (0); + return (0); } /* @@ -241,9 +498,6 @@ { uint8_t x; - if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0) - return (0); - if (vie_peek(vie, &x)) return (-1); @@ -251,35 +505,40 @@ vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { - /* - * Table 2-5: Special Cases of REX Encodings - * - * mod=0, r/m=5 is used in the compatibility mode to - * indicate a disp32 without a base register. - * - * mod!=3, r/m=4 is used in the compatibility mode to - * indicate that the SIB byte is present. - * - * The 'b' bit in the REX prefix is don't care in - * this case. - */ + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. 
+ */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); - /* SIB addressing not supported yet */ + /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) - return (-1); + goto done; vie->base_register = gpr_map[vie->rm]; - if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG)) - vie->operand_register = gpr_map[vie->reg]; - switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; @@ -295,19 +554,83 @@ break; } - /* calculate the operand size */ - if (vie->rex_w) - vie->op_size = VIE_OP_SIZE_64BIT; - - if (vie->opcode_flags & VIE_F_FROM_IMM) + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) vie->imm_bytes = 4; + else if (vie->op.op_flags & VIE_OP_F_IMM8) + vie->imm_bytes = 1; +done: vie_advance(vie); return (0); } static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). 
+ * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; + + vie_advance(vie); + + return (0); +} + +static int decode_displacement(struct vie *vie) { int n, i; @@ -348,13 +671,14 @@ uint8_t x; union { char buf[4]; + int8_t signed8; int32_t signed32; } u; if ((n = vie->imm_bytes) == 0) return (0); - if (n != 4) + if (n != 1 && n != 4) panic("decode_immediate: invalid imm_bytes %d", n); for (i = 0; i < n; i++) { @@ -365,14 +689,62 @@ vie_advance(vie); } - vie->immediate = u.signed32; /* sign-extended */ + if (n == 1) + vie->immediate = u.signed8; /* sign-extended */ + else + vie->immediate = u.signed32; /* sign-extended */ return (0); } +#define VERIFY_GLA +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. 
+ */ +#ifdef VERIFY_GLA +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +{ + int error; + uint64_t base, idx; + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + if (base + vie->scale * idx + vie->displacement != gla) { + printf("verify_gla mismatch: " + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx)\n", + base, vie->scale, idx, vie->displacement, gla); + return (-1); + } + + return (0); +} +#endif /* VERIFY_GLA */ + int -vmm_decode_instruction(struct vie *vie) +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) { + if (decode_rex(vie)) return (-1); @@ -382,11 +754,22 @@ if (decode_modrm(vie)) return (-1); + if (decode_sib(vie)) + return (-1); + if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); +#ifdef VERIFY_GLA + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); +#endif + + vie->decoded = 1; /* success */ + return (0); } +#endif /* _KERNEL */ Index: sys/amd64/vmm/vmm_instruction_emul.h =================================================================== --- sys/amd64/vmm/vmm_instruction_emul.h (revision 243595) +++ sys/amd64/vmm/vmm_instruction_emul.h (working copy) @@ -1,91 +0,0 @@ -/*- - * Copyright (c) 2012 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _VMM_INSTRUCTION_EMUL_H_ -#define _VMM_INSTRUCTION_EMUL_H_ - -enum vie_op_size { - VIE_OP_SIZE_32BIT, /* default */ - VIE_OP_SIZE_64BIT, - VIE_OP_SIZE_8BIT -}; - -#define VIE_INST_SIZE 15 -struct vie { - uint8_t inst[VIE_INST_SIZE]; - - uint8_t rex_w:1, - rex_r:1, - rex_x:1, - rex_b:1; - - uint8_t mod:2, - reg:4, - rm:4; - - - uint8_t opcode_byte; - uint16_t opcode_flags; - uint8_t disp_bytes; - uint8_t imm_bytes; - - int num_valid; - int num_processed; - - enum vm_reg_name base_register; - enum vm_reg_name index_register; - enum vm_reg_name operand_register; - - int op_size; - int64_t displacement; - int64_t immediate; -}; - -#define VIE_F_HAS_MODRM (1 << 0) -#define VIE_F_FROM_RM (1 << 1) -#define VIE_F_FROM_REG (1 << 2) -#define VIE_F_TO_RM (1 << 3) -#define VIE_F_TO_REG (1 << 4) -#define VIE_F_FROM_IMM (1 << 5) - -#define VIE_MOD_INDIRECT 0 -#define VIE_MOD_INDIRECT_DISP8 1 -#define VIE_MOD_INDIRECT_DISP32 2 -#define VIE_MOD_DIRECT 3 - -#define VIE_RM_SIB 4 -#define VIE_RM_DISP32 5 - -struct vm; - -int vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, - uint64_t cr3, struct vie *vie); - -int vmm_decode_instruction(struct vie *vie); - -#endif Index: sys/amd64/vmm/intel/vmx.c =================================================================== --- sys/amd64/vmm/intel/vmx.c (revision 243595) +++ sys/amd64/vmm/intel/vmx.c (working copy) @@ -63,7 +63,6 @@ #include "vmx.h" #include "x86.h" #include "vmx_controls.h" -#include "vmm_instruction_emul.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ @@ -1150,24 +1149,12 @@ } static int -vmx_lapic_fault(struct vm *vm, int cpu, - uint64_t gpa, uint64_t rip, int inst_length, - uint64_t cr3, uint64_t ept_qual) +vmx_ept_fault(struct vm *vm, int cpu, + uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length, + uint64_t cr3, uint64_t ept_qual, struct vie *vie) { - int read, write, handled; - struct vie vie; + int read, write, error; - /* - * For this to be a 
legitimate access to the local apic: - * - the GPA in the local apic page - * - the GPA must be aligned on a 16 byte boundary - */ - if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) - return (UNHANDLED); - - if ((gpa & 0xF) != 0) - return (UNHANDLED); - /* EPT violation on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (UNHANDLED); @@ -1188,15 +1175,22 @@ } /* Fetch, decode and emulate the faulting instruction */ - if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0) + if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0) return (UNHANDLED); - if (vmm_decode_instruction(&vie) != 0) + if (vmm_decode_instruction(vm, cpu, gla, vie) != 0) return (UNHANDLED); - handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie); + /* + * Check if this is a local apic access + */ + if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) + return (UNHANDLED); - return (handled); + error = vmm_emulate_instruction(vm, cpu, gpa, vie, + lapic_mmio_read, lapic_mmio_write, 0); + + return (error ? 
UNHANDLED : HANDLED); } static int @@ -1206,7 +1200,7 @@ struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx; - uint64_t qual, gpa, cr3, intr_info; + uint64_t qual, gla, gpa, cr3, intr_info; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1299,11 +1293,12 @@ handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EPT_FAULT: + gla = vmcs_gla(); gpa = vmcs_gpa(); cr3 = vmcs_guest_cr3(); - handled = vmx_lapic_fault(vmx->vm, vcpu, - gpa, vmexit->rip, vmexit->inst_length, - cr3, qual); + handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa, + vmexit->rip, vmexit->inst_length, + cr3, qual, &vmexit->u.paging.vie); if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.cr3 = cr3; Index: sys/amd64/vmm/intel/vmcs.h =================================================================== --- sys/amd64/vmm/intel/vmcs.h (revision 243595) +++ sys/amd64/vmm/intel/vmcs.h (working copy) @@ -67,6 +67,7 @@ #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) #endif /* _KERNEL */