Index: sys/amd64/include/vmm.h =================================================================== --- sys/amd64/include/vmm.h (revision 258953) +++ sys/amd64/include/vmm.h (working copy) @@ -264,6 +264,7 @@ VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_SPINDOWN_CPU, VM_EXITCODE_MAX }; Index: sys/amd64/vmm/intel/vmx.c =================================================================== --- sys/amd64/vmm/intel/vmx.c (revision 258953) +++ sys/amd64/vmm/intel/vmx.c (working copy) @@ -1560,7 +1560,6 @@ panic("vmx_run: error %d setting up pcpu defaults", error); do { - lapic_timer_tick(vmx->vm, vcpu); vmx_inject_interrupts(vmx, vcpu); vmx_run_trace(vmx, vcpu); rc = vmx_setjmp(vmxctx); Index: sys/amd64/vmm/io/vlapic.c =================================================================== --- sys/amd64/vmm/io/vlapic.c (revision 258953) +++ sys/amd64/vmm/io/vlapic.c (working copy) @@ -30,8 +30,10 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include +#include #include #include @@ -53,6 +55,9 @@ #define VLAPIC_CTR1(vlapic, format, p1) \ VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + #define VLAPIC_CTR_IRR(vlapic, msg) \ do { \ uint32_t *irrptr = &(vlapic)->apic.irr0; \ @@ -100,12 +105,15 @@ struct vm *vm; int vcpuid; - struct LAPIC apic; + struct LAPIC apic; int esr_update; - int divisor; - int ccr_ticks; + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; /* * The 'isrvec_stk' is a stack of vectors injected by the local apic. 
@@ -120,6 +128,21 @@ enum boot_state boot_state; }; +/* + * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to the following registers: + * - initial count register aka icr_timer + * - current count register aka ccr_timer + * - divide config register aka dcr_timer + * - timer LVT register + * + * Note that the vlapic_callout_handler() does not write to any of these + * registers so they can be safely read from the vcpu context without locking. + */ +#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) + #define VLAPIC_BUS_FREQ tsc_freq static int @@ -167,14 +190,65 @@ } #endif -static uint64_t +static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { - struct LAPIC *lapic = &vlapic->apic; - return lapic->ccr_timer; + struct bintime bt_now, bt_rem; + struct LAPIC *lapic; + uint32_t ccr; + + ccr = 0; + lapic = &vlapic->apic; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. 
+ */ + binuptime(&bt_now); + if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { + bt_rem = vlapic->timer_fire_bt; + bintime_sub(&bt_rem, &bt_now); + ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); + ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; + } + } + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " + "icr_timer is %#x", ccr, lapic->icr_timer)); + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); } static void +vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr) +{ + struct LAPIC *lapic; + int divisor; + + lapic = &vlapic->apic; + VLAPIC_TIMER_LOCK(vlapic); + + lapic->dcr_timer = dcr; + divisor = vlapic_timer_divisor(dcr); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", dcr, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +static void vlapic_update_errors(struct vlapic *vlapic) { struct LAPIC *lapic = &vlapic->apic; @@ -200,7 +274,7 @@ memset(lapic, 0, sizeof(*lapic)); lapic->apr = vlapic->vcpuid; vlapic_init_ipi(vlapic); - vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer); + vlapic_set_dcr(vlapic, 0); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ @@ -221,6 +295,12 @@ if (vector < 0 || vector >= 256) panic("vlapic_set_intr_ready: invalid vector %d\n", vector); + if (!(lapic->svr & APIC_SVR_ENABLE)) { + VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " + "interrupt %d", vector); + return; + } + idx = (vector / 32) * 4; mask = 1 << (vector % 32); @@ -241,30 +321,8 @@ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); } -static void -vlapic_start_timer(struct vlapic 
*vlapic, uint32_t elapsed) -{ - uint32_t icr_timer; - - icr_timer = vlapic->apic.icr_timer; - - vlapic->ccr_ticks = ticks; - if (elapsed < icr_timer) - vlapic->apic.ccr_timer = icr_timer - elapsed; - else { - /* - * This can happen when the guest is trying to run its local - * apic timer higher that the setting of 'hz' in the host. - * - * We deal with this by running the guest local apic timer - * at the rate of the host's 'hz' setting. - */ - vlapic->apic.ccr_timer = 0; - } -} - static __inline uint32_t * -vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = &vlapic->apic; int i; @@ -276,6 +334,33 @@ return ((&lapic->lvt_timer) + i);; } +static __inline uint32_t +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + + return (*vlapic_get_lvtptr(vlapic, offset)); +} + +static void +vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val) +{ + uint32_t *lvtptr; + struct LAPIC *lapic; + + lapic = &vlapic->apic; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + + if (offset == APIC_OFFSET_TIMER_LVT) + VLAPIC_TIMER_LOCK(vlapic); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + *lvtptr = val; + + if (offset == APIC_OFFSET_TIMER_LVT) + VLAPIC_TIMER_UNLOCK(vlapic); +} + #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) @@ -398,15 +483,16 @@ } static __inline int -vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask) +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { - return (*lvt & mask); + + return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { - uint32_t *lvt; + uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); @@ -419,17 +505,111 @@ vlapic_fire_timer(struct vlapic *vlapic) { int vector; - uint32_t *lvt; + uint32_t lvt; + + KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) { 
vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); - vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR); + vector = vlapic_get_lvt_field(lvt, APIC_LVTT_VECTOR); vlapic_set_intr_ready(vlapic, vector, false); + vcpu_notify_event(vlapic->vm, vlapic->vcpuid); } } +static void +vlapic_callout_handler(void *arg) +{ + struct vlapic *vlapic; + struct bintime bt, btnow; + sbintime_t rem_sbt; + + vlapic = arg; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_pending(&vlapic->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vlapic->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vlapic->callout); + + KASSERT(vlapic->apic.icr_timer != 0, ("vlapic timer is disabled")); + + vlapic_fire_timer(vlapic); + + if (vlapic_periodic_timer(vlapic)) { + binuptime(&btnow); + KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("vlapic callout at %#lx.%#lx, expected at %#lx.%#lx", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); + + /* + * Compute the delta between when the timer was supposed to + * fire and the present time. + */ + bt = btnow; + bintime_sub(&bt, &vlapic->timer_fire_bt); + + rem_sbt = bttosbt(vlapic->timer_period_bt); + if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { + /* + * Adjust the time until the next countdown downward + * to account for the lost time. + */ + rem_sbt -= bttosbt(bt); + } else { + /* + * If the delta is greater than the timer period then + * just reset our time base instead of trying to catch + * up. 
+ */ + vlapic->timer_fire_bt = btnow; + VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " + "usecs, period is %lu usecs - resetting time base", + bttosbt(bt) / SBT_1US, + bttosbt(vlapic->timer_period_bt) / SBT_1US); + } + + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, rem_sbt, 0, + vlapic_callout_handler, vlapic, 0); + } +done: + VLAPIC_TIMER_UNLOCK(vlapic); +} + +static void +vlapic_set_icr_timer(struct vlapic *vlapic, uint32_t icr_timer) +{ + struct LAPIC *lapic; + sbintime_t sbt; + + VLAPIC_TIMER_LOCK(vlapic); + + lapic = &vlapic->apic; + lapic->icr_timer = icr_timer; + + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, icr_timer); + + if (icr_timer != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + + sbt = bttosbt(vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else + callout_stop(&vlapic->callout); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); static int @@ -555,7 +735,6 @@ break; } } - VLAPIC_CTR0(vlapic, "no pending intr"); return (-1); } @@ -593,6 +772,37 @@ vlapic_update_ppr(vlapic); } +static void +lapic_set_svr(struct vlapic *vlapic, uint32_t new) +{ + struct LAPIC *lapic; + uint32_t old, changed; + + lapic = &vlapic->apic; + old = lapic->svr; + changed = old ^ new; + if ((changed & APIC_SVR_ENABLE) != 0) { + if ((new & APIC_SVR_ENABLE) == 0) { + /* + * The apic is now disabled so stop the apic timer. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + VLAPIC_TIMER_UNLOCK(vlapic); + } else { + /* + * The apic is now enabled so restart the apic timer + * if it is configured in periodic mode. 
+ */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_set_icr_timer(vlapic, lapic->icr_timer); + } + } + lapic->svr = new; +} + int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data) { @@ -663,8 +873,7 @@ *data = lapic->icr_hi; break; case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: - reg = vlapic_get_lvt(vlapic, offset); - *data = *(reg); + *data = vlapic_get_lvt(vlapic, offset); break; case APIC_OFFSET_ICR: *data = lapic->icr_timer; @@ -680,6 +889,7 @@ *data = 0; break; } + VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } @@ -687,9 +897,10 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data) { struct LAPIC *lapic = &vlapic->apic; - uint32_t *reg; int retval; + VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data); + if (offset > sizeof(*lapic)) { return 0; } @@ -712,7 +923,7 @@ case APIC_OFFSET_DFR: break; case APIC_OFFSET_SVR: - lapic->svr = data; + lapic_set_svr(vlapic, data); break; case APIC_OFFSET_ICR_LOW: if (!x2apic(vlapic)) { @@ -728,21 +939,14 @@ } break; case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: - reg = vlapic_get_lvt(vlapic, offset); - if (!(lapic->svr & APIC_SVR_ENABLE)) { - data |= APIC_LVT_M; - } - *reg = data; - // vlapic_dump_lvt(offset, reg); + vlapic_set_lvt(vlapic, offset, data); break; case APIC_OFFSET_ICR: - lapic->icr_timer = data; - vlapic_start_timer(vlapic, 0); + vlapic_set_icr_timer(vlapic, data); break; case APIC_OFFSET_DCR: - lapic->dcr_timer = data; - vlapic->divisor = vlapic_timer_divisor(data); + vlapic_set_dcr(vlapic, data); break; case APIC_OFFSET_ESR: @@ -764,70 +968,6 @@ return (retval); } -int -vlapic_timer_tick(struct vlapic *vlapic) -{ - int curticks, delta, periodic, fired; - uint32_t ccr; - uint32_t decrement, leftover; - -restart: - curticks = ticks; - delta = curticks - vlapic->ccr_ticks; - - /* Local APIC timer is disabled */ - if (vlapic->apic.icr_timer == 0) - return (-1); - - /* One-shot mode and timer has already counted down to zero */ - periodic = vlapic_periodic_timer(vlapic); - if (!periodic && vlapic->apic.ccr_timer == 0) - return (-1); - /* - * The 'curticks' and 'ccr_ticks' are out of sync by more than - * 2^31 ticks. We deal with this by restarting the timer. - */ - if (delta < 0) { - vlapic_start_timer(vlapic, 0); - goto restart; - } - - fired = 0; - decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; - - vlapic->ccr_ticks = curticks; - ccr = vlapic->apic.ccr_timer; - - while (delta-- > 0) { - if (ccr > decrement) { - ccr -= decrement; - continue; - } - - /* Trigger the local apic timer interrupt */ - vlapic_fire_timer(vlapic); - if (periodic) { - leftover = decrement - ccr; - vlapic_start_timer(vlapic, leftover); - ccr = vlapic->apic.ccr_timer; - } else { - /* - * One-shot timer has counted down to zero. 
- */ - ccr = 0; - } - fired = 1; - break; - } - - vlapic->apic.ccr_timer = ccr; - - if (!fired) - return ((ccr / decrement) + 1); - else - return (0); -} - struct vlapic * vlapic_init(struct vm *vm, int vcpuid) { @@ -837,6 +977,9 @@ vlapic->vm = vm; vlapic->vcpuid = vcpuid; + mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_DEF); + callout_init(&vlapic->callout, 1); + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vcpuid == 0) @@ -851,6 +994,7 @@ vlapic_cleanup(struct vlapic *vlapic) { + callout_drain(&vlapic->callout); free(vlapic, M_VLAPIC); } @@ -887,3 +1031,15 @@ if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; } + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} Index: sys/amd64/vmm/io/vlapic.h =================================================================== --- sys/amd64/vmm/io/vlapic.h (revision 258953) +++ sys/amd64/vmm/io/vlapic.h (working copy) @@ -95,10 +95,10 @@ int vlapic_pending_intr(struct vlapic *vlapic); void vlapic_intr_accepted(struct vlapic *vlapic, int vector); void vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); -int vlapic_timer_tick(struct vlapic *vlapic); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); #endif /* _VLAPIC_H_ */ Index: sys/amd64/vmm/vmm.c =================================================================== --- sys/amd64/vmm/vmm.c (revision 258953) +++ sys/amd64/vmm/vmm.c (working copy) @@ -861,39 +861,37 @@ static int vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu) { + struct vm_exit *vmexit; struct vcpu *vcpu; - int sleepticks, t; + int t; vcpu = &vm->vcpu[vcpuid]; 
vcpu_lock(vcpu); /* - * Figure out the number of host ticks until the next apic - * timer interrupt in the guest. - */ - sleepticks = lapic_timer_tick(vm, vcpuid); - - /* - * If the guest local apic timer is disabled then sleep for - * a long time but not forever. - */ - if (sleepticks < 0) - sleepticks = hz; - - /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. * * These interrupts could have happened any time after we * returned from VMRUN() and before we grabbed the vcpu lock. */ - if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) { - if (sleepticks <= 0) - panic("invalid sleepticks %d", sleepticks); + if (!vm_nmi_pending(vm, vcpuid) && + vlapic_pending_intr(vcpu->vlapic) < 0) { t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); - msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks); + if (vlapic_enabled(vcpu->vlapic)) { + msleep_spin(vcpu, &vcpu->mtx, "vmidle", 0); + } else { + /* + * Spindown the vcpu if the apic is disabled and it + * had entered the halted state. 
+ */ + *retu = TRUE; + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; + VCPU_CTR0(vm, vcpuid, "vm_handle_hlt: spinning down"); + } vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } Index: sys/amd64/vmm/vmm_lapic.c =================================================================== --- sys/amd64/vmm/vmm_lapic.c (revision 258953) +++ sys/amd64/vmm/vmm_lapic.c (working copy) @@ -80,16 +80,6 @@ return (0); } -int -lapic_timer_tick(struct vm *vm, int cpu) -{ - struct vlapic *vlapic; - - vlapic = vm_lapic(vm, cpu); - - return (vlapic_timer_tick(vlapic)); -} - static boolean_t x2apic_msr(u_int msr) { Index: sys/amd64/vmm/vmm_lapic.h =================================================================== --- sys/amd64/vmm/vmm_lapic.h (revision 258953) +++ sys/amd64/vmm/vmm_lapic.h (working copy) @@ -40,8 +40,6 @@ int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, void *arg); -int lapic_timer_tick(struct vm *vm, int cpu); - /* * Returns a vector between 32 and 255 if an interrupt is pending in the * IRR that can be delivered based on the current state of ISR and TPR. 
Index: usr.sbin/bhyve/bhyverun.c =================================================================== --- usr.sbin/bhyve/bhyverun.c (revision 258953) +++ usr.sbin/bhyve/bhyverun.c (working copy) @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -85,8 +86,6 @@ static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic; static int virtio_msix = 1; -static int foundcpus; - static int strictio; static int acpi; @@ -210,8 +209,7 @@ exit(1); } - cpumask |= 1 << vcpu; - foundcpus++; + atomic_set_int(&cpumask, 1 << vcpu); /* * Set up the vmexit struct to allow execution to start @@ -229,6 +227,20 @@ } static int +fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) +{ + + if ((cpumask & (1 << vcpu)) == 0) { + fprintf(stderr, "deletecpu: attempting to delete unknown cpu %d\n", + vcpu); + exit(1); + } + + atomic_clear_int(&cpumask, 1 << vcpu); + return (cpumask == 0); +} + +static int vmexit_catch_reset(void) { stats.io_reset++; @@ -327,6 +339,17 @@ } static int +vmexit_spindown_cpu(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int lastcpu; + + lastcpu = fbsdrun_deletecpu(ctx, *pvcpu); + if (!lastcpu) + pthread_exit(NULL); + return (vmexit_catch_reset()); +} + +static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { @@ -417,6 +440,7 @@ [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, + [VM_EXITCODE_SPINDOWN_CPU] = vmexit_spindown_cpu, }; static void