xen: add suspend/resume support for PVHVM (PV timer and PV IPIs)

sys/amd64/amd64/mp_machdep.c:
sys/i386/i386/mp_machdep.c:
 - Make sure there are no MMU related IPIs pending on migration.
 - Reset the pending IPI_BITMAP on resume.
 - Init vcpu_info on resume.

sys/dev/acpica/acpi_timer.c:
 - Don't switch timers on migration if we are using the XENTIMER
   (a timecounter usage sketch is appended after the patch).

sys/dev/uart/uart_dev_ns8250.c:
 - Set the WAIT_LIMIT to a lower value when running under Xen, since
   QEMU seems to be really slow when emulating register reads/writes
   if the serial console has not been initialized.

sys/dev/xen/control/control.c:
 - Perform a proper suspend/resume process for PVHVM:
   - Suspend all APs before going into suspension; this allows us to
     reset the vcpu_info on resume for each AP.
   - Reset the shared info page and the callback on resume.

sys/dev/xen/timer/timer.c:
 - Implement suspend/resume support for the PV timer.  Since FreeBSD
   doesn't perform a per-cpu resume of the timer, we need to call
   smp_rendezvous in order to correctly resume the timer on each CPU
   (the general pattern is sketched after the patch).

sys/dev/xen/xenpci/xenpci.c:
 - Don't reset the PCI interrupt on each suspend/resume.

sys/kern/subr_smp.c:
 - When suspending a PVHVM domain make sure there are no MMU IPIs
   in-flight, or we will get a lockup on resume due to the fact that
   pending event channels are not carried over on migration.
 - Implement a generic version of restart_cpus that can be used by
   suspended and stopped cpus (a caller-side sketch is appended after
   the patch).

sys/x86/xen/hvm.c:
 - Implement resume support for the hypercall page and shared info.
 - Clear vcpu_info so it can be reset by APs when resuming from
   suspension.

sys/x86/xen/xen_intr.c:
 - Properly rebind per-cpu VIRQs and IPIs on resume (the new
   pic_resume hook is sketched after the patch).

Index: amd64/amd64/mp_machdep.c
===================================================================
--- amd64/amd64/mp_machdep.c	(revision 255520)
+++ amd64/amd64/mp_machdep.c	(working copy)
@@ -1468,6 +1468,10 @@ cpususpend_handler(void)
 
 	cpu = PCPU_GET(cpuid);
 
+#ifdef XENHVM
+	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+#endif
+
 	if (savectx(susppcbs[cpu])) {
 		ctx_fpusave(susppcbs[cpu]->pcb_fpususpend);
 		wbinvd();
@@ -1486,11 +1490,23 @@ cpususpend_handler(void)
 	while (!CPU_ISSET(cpu, &started_cpus))
 		ia32_pause();
 
+#ifdef XENHVM
+	/*
+	 * Reset pending bitmap IPIs, because Xen doesn't preserve pending
+	 * event channels on migration.
+	 */
+	cpu_ipi_pending[cpu] = 0;
+
+	/* register vcpu_info area */
+	xen_hvm_init_cpu();
+#endif
+
 	/* Resume MCA and local APIC */
 	mca_resume();
 	lapic_setup(0);
 
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
+	/* Indicate that we are resumed */
+	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 }
 
 /*
Index: amd64/include/intr_machdep.h
===================================================================
--- amd64/include/intr_machdep.h	(revision 255520)
+++ amd64/include/intr_machdep.h	(working copy)
@@ -102,7 +102,7 @@ struct pic {
 	int (*pic_vector)(struct intsrc *);
 	int (*pic_source_pending)(struct intsrc *);
 	void (*pic_suspend)(struct pic *);
-	void (*pic_resume)(struct pic *);
+	void (*pic_resume)(struct pic *, bool suspend_cancelled);
 	int (*pic_config_intr)(struct intsrc *, enum intr_trigger,
 	    enum intr_polarity);
 	int (*pic_assign_cpu)(struct intsrc *, u_int apic_id);
@@ -170,7 +170,7 @@ struct intsrc *intr_lookup_source(int vector);
 int	intr_register_pic(struct pic *pic);
 int	intr_register_source(struct intsrc *isrc);
 int	intr_remove_handler(void *cookie);
-void	intr_resume(void);
+void	intr_resume(bool suspend_cancelled);
 void	intr_suspend(void);
 void	intrcnt_add(const char *name, u_long **countp);
 void	nexus_add_irq(u_long irq);
Index: dev/acpica/acpi_timer.c
===================================================================
--- dev/acpica/acpi_timer.c	(revision 255520)
+++ dev/acpica/acpi_timer.c	(working copy)
@@ -189,6 +189,7 @@ acpi_timer_probe(device_t dev)
 	else
 		acpi_timer_timecounter.tc_counter_mask = 0x00ffffff;
 	acpi_timer_timecounter.tc_frequency = acpi_timer_frequency;
+	acpi_timer_timecounter.tc_flags = TC_FLAGS_SUSPEND_SAFE;
 	if (testenv("debug.acpi.timer_test"))
 		acpi_timer_boot_test();
 
@@ -285,6 +286,14 @@ acpi_timer_suspend_handler(struct timecounter *new
 		acpi_timer_eh = NULL;
 	}
 
+	if ((timecounter->tc_flags & TC_FLAGS_SUSPEND_SAFE) != 0) {
+		/*
+		 * If we are using a suspend-safe timecounter, don't
+		 * save/restore it across suspend/resume.
+		 */
+		return;
+	}
+
 	KASSERT(newtc == &acpi_timer_timecounter,
 	    ("acpi_timer_suspend_handler: wrong timecounter"));
Index: dev/uart/uart_dev_ns8250.c
===================================================================
--- dev/uart/uart_dev_ns8250.c	(revision 255520)
+++ dev/uart/uart_dev_ns8250.c	(working copy)
@@ -54,6 +54,18 @@ __FBSDID("$FreeBSD$");
 
 #define	DEFAULT_RCLK	1843200
 
+#ifdef XENHVM
+/*
+ * If we are running in a virtual environment, the device model has to
+ * emulate the inb/outb calls, which takes much more time than on native
+ * hardware, so reduce the timeout to something more sensible.
+ */
+#define	WAIT_LIMIT	100
+#else
+#define	WAIT_LIMIT	250000
+#endif
+
 static int broken_txfifo = 0;
 SYSCTL_INT(_hw, OID_AUTO, broken_txfifo, CTLFLAG_RW | CTLFLAG_TUN,
 	&broken_txfifo, 0, "UART FIFO has QEMU emulation bug");
@@ -316,12 +328,12 @@ ns8250_putc(struct uart_bas *bas, int c)
 {
 	int limit;
 
-	limit = 250000;
+	limit = WAIT_LIMIT;
 	while ((uart_getreg(bas, REG_LSR) & LSR_THRE) == 0 && --limit)
 		DELAY(4);
 	uart_setreg(bas, REG_DATA, c);
 	uart_barrier(bas);
-	limit = 250000;
+	limit = WAIT_LIMIT;
 	while ((uart_getreg(bas, REG_LSR) & LSR_TEMT) == 0 && --limit)
 		DELAY(4);
 }
Index: dev/xen/control/control.c
===================================================================
--- dev/xen/control/control.c	(revision 255520)
+++ dev/xen/control/control.c	(working copy)
@@ -119,11 +119,9 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
-
-#ifndef XENHVM
 #include
 #include
-#endif
+#include
 
 #include
@@ -140,6 +138,10 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 
+#ifdef XENHVM
+#include
+#endif
+
 #include
 #include
@@ -199,7 +201,7 @@ extern void xencons_resume(void);
 static void
 xctrl_suspend()
 {
-	int i, j, k, fpp;
+	int i, j, k, fpp, suspend_cancelled;
 	unsigned long max_pfn, start_info_mfn;
 
 	EVENTHANDLER_INVOKE(power_suspend);
@@ -264,7 +266,7 @@ xctrl_suspend()
 	 */
 	start_info_mfn = VTOMFN(xen_start_info);
 	pmap_suspend();
-	HYPERVISOR_suspend(start_info_mfn);
+	suspend_cancelled = HYPERVISOR_suspend(start_info_mfn);
 	pmap_resume();
 
 	pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info);
@@ -287,7 +289,7 @@ xctrl_suspend()
 	HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
 
 	gnttab_resume();
-	intr_resume();
+	intr_resume(suspend_cancelled != 0);
 	local_irq_enable();
 	xencons_resume();
@@ -331,17 +333,32 @@ xen_pv_shutdown_final(void *arg, int howto)
 }
 
 #else
 
-extern void xenpci_resume(void);
 
 /* HVM mode suspension. */
 static void
 xctrl_suspend()
 {
+#ifdef SMP
+	cpuset_t cpu_suspend_map;
+#endif
 	int suspend_cancelled;
 
 	EVENTHANDLER_INVOKE(power_suspend);
 
+	if (smp_started) {
+		thread_lock(curthread);
+		sched_bind(curthread, 0);
+		thread_unlock(curthread);
+	}
+	KASSERT((PCPU_GET(cpuid) == 0), ("Not running on CPU#0"));
+
+	/*
+	 * Clear our XenStore node so the toolstack knows we are
+	 * responding to the suspend request.
+	 */
+	xs_write(XST_NIL, "control", "shutdown", "");
+
 	/*
 	 * Be sure to hold Giant across DEVICE_SUSPEND/RESUME since non-MPSAFE
 	 * drivers need this.
	 */
@@ -353,31 +370,67 @@ xctrl_suspend()
 	}
 	mtx_unlock(&Giant);
 
+#ifdef SMP
+	if (smp_started) {
+		/*
+		 * Suspend other CPUs. This prevents IPIs while we
+		 * are resuming, and will allow us to reset per-cpu
+		 * vcpu_info on resume.
+		 */
+		cpu_suspend_map = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &cpu_suspend_map);
+		if (!CPU_EMPTY(&cpu_suspend_map))
+			suspend_cpus(cpu_suspend_map);
+	}
+#endif
+
 	/*
 	 * Prevent any races with evtchn_interrupt() handler.
 	 */
 	disable_intr();
 	intr_suspend();
+	xen_hvm_suspend();
 
 	suspend_cancelled = HYPERVISOR_suspend(0);
 
-	intr_resume();
+	xen_hvm_resume(suspend_cancelled != 0);
+	intr_resume(suspend_cancelled != 0);
+	enable_intr();
 
 	/*
-	 * Re-enable interrupts and put the scheduler back to normal.
+	 * Reset grant table info.
 	 */
-	enable_intr();
+	gnttab_resume();
 
+#ifdef SMP
+	if (smp_started && !CPU_EMPTY(&cpu_suspend_map)) {
+		/*
+		 * Now that event channels have been initialized,
+		 * resume CPUs.
+		 */
+		resume_cpus(cpu_suspend_map);
+	}
+#endif
+
 	/*
 	 * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or
	 * similar.
	 */
 	mtx_lock(&Giant);
-	if (!suspend_cancelled)
-		DEVICE_RESUME(root_bus);
+	DEVICE_RESUME(root_bus);
 	mtx_unlock(&Giant);
 
+	if (smp_started) {
+		thread_lock(curthread);
+		sched_unbind(curthread);
+		thread_unlock(curthread);
+	}
+
 	EVENTHANDLER_INVOKE(power_resume);
+
+	if (bootverbose)
+		printf("System resumed after suspension\n");
+
 }
 #endif
Index: dev/xen/timer/timer.c
===================================================================
--- dev/xen/timer/timer.c	(revision 255520)
+++ dev/xen/timer/timer.c	(working copy)
@@ -1,4 +1,4 @@
-/**
+/*-
  * Copyright (c) 2009 Adrian Chadd
  * Copyright (c) 2012 Spectra Logic Corporation
  * All rights reserved.
@@ -57,6 +57,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 
 #include "clock_if.h"
@@ -316,7 +317,7 @@ xentimer_settime(device_t dev __unused, struct tim
 	 * Don't return EINVAL here; just silently fail if the domain isn't
 	 * privileged enough to set the TOD.
 	 */
-	return(0);
+	return (0);
 }
 
 /**
@@ -339,7 +340,7 @@ xentimer_gettime(device_t dev, struct timespec *ts
 	xen_fetch_uptime(&u_ts);
 	timespecadd(ts, &u_ts);
 
-	return(0);
+	return (0);
 }
 
 /**
@@ -457,8 +458,9 @@ xentimer_attach(device_t dev)
 
 	/* Bind an event channel to a VIRQ on each VCPU. */
 	CPU_FOREACH(i) {
-		struct xentimer_pcpu_data *pcpu = DPCPU_ID_PTR(i, xentimer_pcpu);
+		struct xentimer_pcpu_data *pcpu;
 
+		pcpu = DPCPU_ID_PTR(i, xentimer_pcpu);
 		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL);
 		if (error) {
 			device_printf(dev, "Error disabling Xen periodic timer "
@@ -493,6 +495,7 @@ xentimer_attach(device_t dev)
 	/* Register the timecounter. */
 	sc->tc.tc_name = "XENTIMER";
 	sc->tc.tc_quality = XENTIMER_QUALITY;
+	sc->tc.tc_flags = TC_FLAGS_SUSPEND_SAFE;
 
 	/*
 	 * The underlying resolution is in nanoseconds, since the timer info
 	 * scales TSC frequencies using a fraction that represents time in
@@ -523,75 +526,60 @@ xentimer_detach(device_t dev)
 	return (EBUSY);
 }
 
-/**
- * The following device methods are disabled because they wouldn't work
- * properly.
- */
-#ifdef NOTYET
+static void
+xentimer_percpu_resume(void *arg)
+{
+	device_t dev = (device_t) arg;
+	struct xentimer_softc *sc = device_get_softc(dev);
+
+	xentimer_et_start(&sc->et, sc->et.et_min_period, 0);
+}
+
 static int
 xentimer_resume(device_t dev)
 {
-	struct xentimer_softc *sc = device_get_softc(dev);
-	int error = 0;
+	int error;
 	int i;
 
-	device_printf(sc->dev, "%s", __func__);
+	/* Disable the periodic timer */
 	CPU_FOREACH(i) {
-		struct xentimer_pcpu_data *pcpu = DPCPU_ID_PTR(i, xentimer_pcpu);
-
-		/* Skip inactive timers. */
-		if (pcpu->timer == 0)
-			continue;
-
-		/*
-		 * XXX This won't actually work, because Xen requires that
-		 * singleshot timers be set while running on the given CPU.
-		 */
-		error = xentimer_vcpu_start_timer(i, pcpu->timer);
-		if (error == -ETIME) {
-			/* Event time has already passed; process.
			 */
-			xentimer_intr(sc);
-		} else if (error != 0) {
-			panic("%s: error %d restarting vcpu %d\n",
-			    __func__, error, i);
+		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL);
+		if (error != 0) {
+			device_printf(dev,
+			    "Error disabling Xen periodic timer on CPU %d\n",
+			    i);
+			return (error);
 		}
 	}
 
-	return (error);
+	/* Reset the last uptime value */
+	xen_timer_last_time = 0;
+
+	/* Reset the RTC clock */
+	inittodr(time_second);
+
+	/* Kick the timers on all CPUs */
+	smp_rendezvous(NULL, xentimer_percpu_resume, NULL, dev);
+
+	if (bootverbose)
+		device_printf(dev, "resumed operation after suspension\n");
+
+	return (0);
 }
 
 static int
 xentimer_suspend(device_t dev)
 {
-	struct xentimer_softc *sc = device_get_softc(dev);
-	int error = 0;
-	int i;
 
-	device_printf(sc->dev, "%s", __func__);
-	CPU_FOREACH(i) {
-		struct xentimer_pcpu_data *pcpu = DPCPU_ID_PTR(i, xentimer_pcpu);
-
-		/* Skip inactive timers. */
-		if (pcpu->timer == 0)
-			continue;
-		error = xentimer_vcpu_stop_timer(i);
-		if (error)
-			panic("Error %d stopping VCPU %d timer\n", error, i);
-	}
-
-	return (error);
+	return (0);
 }
-#endif
 
 static device_method_t xentimer_methods[] = {
 	DEVMETHOD(device_identify, xentimer_identify),
 	DEVMETHOD(device_probe, xentimer_probe),
 	DEVMETHOD(device_attach, xentimer_attach),
 	DEVMETHOD(device_detach, xentimer_detach),
-#ifdef NOTYET
 	DEVMETHOD(device_suspend, xentimer_suspend),
 	DEVMETHOD(device_resume, xentimer_resume),
-#endif
 
 	/* clock interface */
 	DEVMETHOD(clock_gettime, xentimer_gettime),
 	DEVMETHOD(clock_settime, xentimer_settime),
Index: dev/xen/xenpci/xenpci.c
===================================================================
--- dev/xen/xenpci/xenpci.c	(revision 255520)
+++ dev/xen/xenpci/xenpci.c	(working copy)
@@ -77,6 +77,7 @@ xenpci_irq_init(device_t device, struct xenpci_sof
 	if (error)
 		return error;
 
+#ifdef SMP
 	/*
 	 * When using the PCI event delivery callback we cannot assign
 	 * events to specific vCPUs, so all events are delivered to vCPU#0 by
@@ -88,6 +89,7 @@ xenpci_irq_init(device_t device, struct xenpci_sof
 	    scp->res_irq, 0);
 	if (error)
 		return error;
+#endif
 
 	xen_hvm_set_callback(device);
 	return (0);
@@ -309,28 +311,12 @@ xenpci_detach(device_t dev)
 static int
 xenpci_suspend(device_t dev)
 {
-	struct xenpci_softc *scp = device_get_softc(dev);
-	device_t parent = device_get_parent(dev);
-
-	if (scp->intr_cookie != NULL) {
-		if (BUS_TEARDOWN_INTR(parent, dev, scp->res_irq,
-		    scp->intr_cookie) != 0)
-			printf("intr teardown failed.. continuing\n");
-		scp->intr_cookie = NULL;
-	}
-
 	return (bus_generic_suspend(dev));
 }
 
 static int
 xenpci_resume(device_t dev)
 {
-	struct xenpci_softc *scp = device_get_softc(dev);
-	device_t parent = device_get_parent(dev);
-
-	BUS_SETUP_INTR(parent, dev, scp->res_irq,
-	    INTR_MPSAFE|INTR_TYPE_MISC, xenpci_intr_filter, NULL,
-	    /*trap_frame*/NULL, &scp->intr_cookie);
 	xen_hvm_set_callback(dev);
 	return (bus_generic_resume(dev));
 }
Index: i386/i386/mp_machdep.c
===================================================================
--- i386/i386/mp_machdep.c	(revision 255520)
+++ i386/i386/mp_machdep.c	(working copy)
@@ -1529,6 +1529,10 @@ cpususpend_handler(void)
 
 	cpu = PCPU_GET(cpuid);
 
+#ifdef XENHVM
+	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+#endif
+
 	if (savectx(susppcbs[cpu])) {
 		wbinvd();
 		CPU_SET_ATOMIC(cpu, &suspended_cpus);
@@ -1545,10 +1549,22 @@ cpususpend_handler(void)
 	while (!CPU_ISSET(cpu, &started_cpus))
 		ia32_pause();
 
+#ifdef XENHVM
+	/*
+	 * Reset pending bitmap IPIs, because Xen doesn't preserve pending
+	 * event channels on migration.
+	 */
+	cpu_ipi_pending[cpu] = 0;
+
+	/* register vcpu_info area */
+	xen_hvm_init_cpu();
+#endif
+
 	/* Resume MCA and local APIC */
 	mca_resume();
 	lapic_setup(0);
 
+	/* Indicate that we are resumed */
+	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
 }
 
 /*
Index: kern/subr_smp.c
===================================================================
--- kern/subr_smp.c	(revision 255520)
+++ kern/subr_smp.c	(working copy)
@@ -225,6 +225,18 @@ generic_stop_cpus(cpuset_t map, u_int type)
 	CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
 	    cpusetobj_strprint(cpusetbuf, &map), type);
 
+#ifdef XENHVM
+	/*
+	 * When migrating a PVHVM domain we need to make sure there are
+	 * no IPIs in progress.  IPIs that have been issued, but not
+	 * yet delivered (not pending on a vCPU) will be lost in the
+	 * IPI rebinding process, violating FreeBSD's assumption of
+	 * reliable IPI delivery.
+	 */
+	if (type == IPI_SUSPEND)
+		mtx_lock_spin(&smp_ipi_mtx);
+#endif
+
 	if (stopping_cpu != PCPU_GET(cpuid))
 		while (atomic_cmpset_int(&stopping_cpu, NOCPU,
 		    PCPU_GET(cpuid)) == 0)
@@ -252,6 +264,11 @@ generic_stop_cpus(cpuset_t map, u_int type)
 		}
 	}
 
+#ifdef XENHVM
+	if (type == IPI_SUSPEND)
+		mtx_unlock_spin(&smp_ipi_mtx);
+#endif
+
 	stopping_cpu = NOCPU;
 	return (1);
 }
@@ -292,28 +309,60 @@ suspend_cpus(cpuset_t map)
  *   0: NA
  *   1: ok
  */
-int
-restart_cpus(cpuset_t map)
+static int
+generic_restart_cpus(cpuset_t map, u_int type)
 {
 #ifdef KTR
 	char cpusetbuf[CPUSETBUFSIZ];
 #endif
+	volatile cpuset_t *cpus;
 
+	KASSERT(
+#if defined(__amd64__) || defined(__i386__)
+	    type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
+#else
+	    type == IPI_STOP || type == IPI_STOP_HARD,
+#endif
+	    ("%s: invalid stop type", __func__));
+
 	if (!smp_started)
 		return 0;
 
 	CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
 
+#if defined(__amd64__) || defined(__i386__)
+	if (type == IPI_SUSPEND)
+		cpus = &suspended_cpus;
+	else
+#endif
+		cpus = &stopped_cpus;
+
 	/* signal other cpus to restart */
 	CPU_COPY_STORE_REL(&map, &started_cpus);
 
 	/* wait for each to clear its bit */
-	while (CPU_OVERLAP(&stopped_cpus, &map))
+	while (CPU_OVERLAP(cpus, &map))
 		cpu_spinwait();
 
 	return 1;
 }
 
+int
+restart_cpus(cpuset_t map)
+{
+
+	return (generic_restart_cpus(map, IPI_STOP));
+}
+
+#if defined(__amd64__) || defined(__i386__)
+int
+resume_cpus(cpuset_t map)
+{
+
+	return (generic_restart_cpus(map, IPI_SUSPEND));
+}
+#endif
+
 /*
  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
  * (if specified), rendezvous, execute the action function (if specified),
Index: sys/smp.h
===================================================================
--- sys/smp.h	(revision 255520)
+++ sys/smp.h	(working copy)
@@ -166,6 +166,7 @@ int	stop_cpus(cpuset_t);
 int	stop_cpus_hard(cpuset_t);
 #if defined(__amd64__) || defined(__i386__)
 int	suspend_cpus(cpuset_t);
+int	resume_cpus(cpuset_t);
 #endif
 void	smp_rendezvous_action(void);
Index: sys/timetc.h
===================================================================
--- sys/timetc.h	(revision 255520)
+++ sys/timetc.h	(working copy)
@@ -59,6 +59,10 @@ struct timecounter {
 		 */
 	u_int			tc_flags;
 #define	TC_FLAGS_C3STOP		1	/* Timer dies in C3. */
+#define	TC_FLAGS_SUSPEND_SAFE	2	/*
+					 * Timer functional across
+					 * suspend/resume.
+					 */
 
 	void			*tc_priv;
 		/* Pointer to the timecounter's private parts.
		 */
Index: x86/acpica/acpi_wakeup.c
===================================================================
--- x86/acpica/acpi_wakeup.c	(revision 255520)
+++ x86/acpica/acpi_wakeup.c	(working copy)
@@ -266,7 +266,7 @@ acpi_wakeup_machdep(struct acpi_softc *sc, int sta
 			restart_cpus(suspcpus);
 #endif
 		mca_resume();
-		intr_resume();
+		intr_resume(/*suspend_cancelled*/false);
 		AcpiSetFirmwareWakingVector(0);
 	} else {
Index: x86/x86/intr_machdep.c
===================================================================
--- x86/x86/intr_machdep.c	(revision 255520)
+++ x86/x86/intr_machdep.c	(working copy)
@@ -279,7 +279,7 @@ intr_execute_handlers(struct intsrc *isrc, struct
 }
 
 void
-intr_resume(void)
+intr_resume(bool suspend_cancelled)
 {
 	struct pic *pic;
 
@@ -289,7 +289,7 @@ void
 	mtx_lock(&intr_table_lock);
 	TAILQ_FOREACH(pic, &pics, pics) {
 		if (pic->pic_resume != NULL)
-			pic->pic_resume(pic);
+			pic->pic_resume(pic, suspend_cancelled);
 	}
 	mtx_unlock(&intr_table_lock);
 }
Index: x86/x86/io_apic.c
===================================================================
--- x86/x86/io_apic.c	(revision 255520)
+++ x86/x86/io_apic.c	(working copy)
@@ -119,7 +119,7 @@ static int ioapic_vector(struct intsrc *isrc);
 static int ioapic_source_pending(struct intsrc *isrc);
 static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig,
     enum intr_polarity pol);
-static void ioapic_resume(struct pic *pic);
+static void ioapic_resume(struct pic *pic, bool suspend_cancelled);
 static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id);
 static void ioapic_program_intpin(struct ioapic_intsrc *intpin);
 
@@ -486,7 +486,7 @@ ioapic_config_intr(struct intsrc *isrc, enum intr_
 }
 
 static void
-ioapic_resume(struct pic *pic)
+ioapic_resume(struct pic *pic, bool suspend_cancelled)
 {
 	struct ioapic *io = (struct ioapic *)pic;
 	int i;
Index: x86/x86/local_apic.c
===================================================================
--- x86/x86/local_apic.c	(revision 255520)
+++ x86/x86/local_apic.c	(working copy)
@@ -161,7 +161,7 @@ static u_long lapic_timer_divisor;
 static struct eventtimer lapic_et;
 
 static void	lapic_enable(void);
-static void	lapic_resume(struct pic *pic);
+static void	lapic_resume(struct pic *pic, bool suspend_cancelled);
 static void	lapic_timer_oneshot(struct lapic *,
     u_int count, int enable_int);
 static void	lapic_timer_periodic(struct lapic *,
@@ -566,7 +566,7 @@ lapic_enable(void)
 
 /* Reset the local APIC on the BSP during resume.
 */
 static void
-lapic_resume(struct pic *pic)
+lapic_resume(struct pic *pic, bool suspend_cancelled)
 {
 
 	lapic_setup(0);
Index: x86/xen/hvm.c
===================================================================
--- x86/xen/hvm.c	(revision 255520)
+++ x86/xen/hvm.c	(working copy)
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include
 
 /*--------------------------- Forward Declarations ---------------------------*/
+#ifdef SMP
 static driver_filter_t xen_smp_rendezvous_action;
 static driver_filter_t xen_invltlb;
 static driver_filter_t xen_invlpg;
@@ -70,6 +71,7 @@ static driver_filter_t xen_ipi_bitmap_handler;
 static driver_filter_t xen_cpustop_handler;
 static driver_filter_t xen_cpususpend_handler;
 static driver_filter_t xen_cpustophard_handler;
+#endif
 
 /*---------------------------- Extern Declarations ---------------------------*/
 /* Variables used by mp_machdep to perform the MMU related IPIs */
@@ -93,6 +95,12 @@ extern void pmap_lazyfix_action(void);
 
 #define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
 
 /*-------------------------------- Local Types -------------------------------*/
+enum xen_hvm_init_type {
+	XEN_HVM_INIT_COLD,
+	XEN_HVM_INIT_CANCELLED_SUSPEND,
+	XEN_HVM_INIT_RESUME
+};
+
 struct xen_ipi_handler
 {
 	driver_filter_t	*filter;
@@ -104,6 +112,7 @@ enum xen_domain_type xen_domain_type = XEN_NATIVE;
 
 static MALLOC_DEFINE(M_XENHVM, "xen_hvm", "Xen HVM PV Support");
 
+#ifdef SMP
 static struct xen_ipi_handler xen_ipis[] =
 {
 	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r" },
@@ -119,6 +128,7 @@ static struct xen_ipi_handler xen_ipis[] =
 	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp" },
 	[IPI_TO_IDX(IPI_STOP_HARD)]	= { xen_cpustophard_handler,	"sth" },
 };
+#endif
 
 /**
  * If non-zero, the hypervisor has been configured to use a direct
@@ -129,13 +139,16 @@ int xen_vector_callback_enabled;
 
 /*------------------------------- Per-CPU Data -------------------------------*/
 DPCPU_DEFINE(struct vcpu_info, vcpu_local_info);
 DPCPU_DEFINE(struct vcpu_info *, vcpu_info);
+#ifdef SMP
 DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
+#endif
 
 /*------------------ Hypervisor Access Shared Memory Regions -----------------*/
 /** Hypercall table accessed via HYPERVISOR_*_op() methods.
 */
 char *hypercall_stubs;
 shared_info_t *HYPERVISOR_shared_info;
 
+#ifdef SMP
 /*---------------------------- XEN PV IPI Handlers ---------------------------*/
 /*
  * This are C clones of the ASM functions found in apic_vector.s
@@ -496,6 +509,7 @@ xen_init_ipis(void)
 	/* Set the xen pv ipi ops to replace the native ones */
 	cpu_ops.ipi_vectored = xen_ipi_vectored;
 }
+#endif
 
 /*---------------------- XEN Hypervisor Probe and Setup ----------------------*/
 static uint32_t
@@ -579,6 +593,9 @@ xen_hvm_set_callback(device_t dev)
 	struct xen_hvm_param xhp;
 	int irq;
 
+	if (xen_vector_callback_enabled)
+		return;
+
 	xhp.domid = DOMID_SELF;
 	xhp.index = HVM_PARAM_CALLBACK_IRQ;
 	if (xen_feature(XENFEAT_hvm_callback_vector) != 0) {
@@ -637,41 +654,83 @@ xen_hvm_disable_emulated_devices(void)
 	outw(XEN_MAGIC_IOPORT, XMI_UNPLUG_IDE_DISKS|XMI_UNPLUG_NICS);
 }
 
+static void
+xen_hvm_init(enum xen_hvm_init_type init_type)
+{
+	int error;
+	int i;
+
+	if (init_type == XEN_HVM_INIT_CANCELLED_SUSPEND)
+		return;
+
+	error = xen_hvm_init_hypercall_stubs();
+
+	switch (init_type) {
+	case XEN_HVM_INIT_COLD:
+		if (error != 0)
+			return;
+
+		setup_xen_features();
+		break;
+	case XEN_HVM_INIT_RESUME:
+		if (error != 0)
+			panic("Unable to init Xen hypercall stubs on resume");
+		break;
+	default:
+		panic("Unsupported HVM initialization type");
+	}
+
+	/* Clear any stale vcpu_info. */
+	CPU_FOREACH(i)
+		DPCPU_ID_SET(i, vcpu_info, NULL);
+
+	xen_vector_callback_enabled = 0;
+	xen_domain_type = XEN_HVM_DOMAIN;
+	xen_hvm_init_shared_info_page();
+	xen_hvm_set_callback(NULL);
+	xen_hvm_disable_emulated_devices();
+}
+
 void
 xen_hvm_suspend(void)
 {
 }
 
 void
-xen_hvm_resume(void)
+xen_hvm_resume(bool suspend_cancelled)
 {
 
-	xen_hvm_init_hypercall_stubs();
-	xen_hvm_init_shared_info_page();
+	xen_hvm_init(suspend_cancelled ?
+	    XEN_HVM_INIT_CANCELLED_SUSPEND : XEN_HVM_INIT_RESUME);
+
+	/* Register vcpu_info area for CPU#0. */
+	xen_hvm_init_cpu();
 }
 
 static void
-xen_hvm_init(void *dummy __unused)
+xen_hvm_sysinit(void *arg __unused)
 {
 
+	xen_hvm_init(XEN_HVM_INIT_COLD);
+}
 
-	if (xen_hvm_init_hypercall_stubs() != 0)
-		return;
-
-	xen_domain_type = XEN_HVM_DOMAIN;
-	setup_xen_features();
-	xen_hvm_init_shared_info_page();
-	xen_hvm_set_callback(NULL);
-	xen_hvm_disable_emulated_devices();
-}
-
-void xen_hvm_init_cpu(void)
+void
+xen_hvm_init_cpu(void)
 {
 	struct vcpu_register_vcpu_info info;
 	struct vcpu_info *vcpu_info;
 	int cpu, rc;
 
+	if (DPCPU_GET(vcpu_info) != NULL) {
+		/*
+		 * vcpu_info has already been set; this happens on a
		 * failed migration, so do nothing and just return.
		 */
+		return;
+	}
+
+	vcpu_info = DPCPU_PTR(vcpu_local_info);
 	cpu = PCPU_GET(acpi_id);
-	vcpu_info = DPCPU_PTR(vcpu_local_info);
 	info.mfn = vtophys(vcpu_info) >> PAGE_SHIFT;
 	info.offset = vtophys(vcpu_info) - trunc_page(vtophys(vcpu_info));
 
@@ -682,6 +741,8 @@ static void
 	DPCPU_SET(vcpu_info, vcpu_info);
 }
 
-SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_init, NULL);
+SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL);
+#ifdef SMP
 SYSINIT(xen_init_ipis, SI_SUB_SMP, SI_ORDER_FIRST, xen_init_ipis, NULL);
+#endif
 SYSINIT(xen_hvm_init_cpu, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_init_cpu, NULL);
Index: x86/xen/xen_intr.c
===================================================================
--- x86/xen/xen_intr.c	(revision 255520)
+++ x86/xen/xen_intr.c	(working copy)
@@ -120,7 +120,7 @@ struct xenisrc {
 #define	ARRAY_SIZE(a)	(sizeof(a) / sizeof(a[0]))
 
 static void	xen_intr_suspend(struct pic *);
-static void	xen_intr_resume(struct pic *);
+static void	xen_intr_resume(struct pic *, bool suspend_cancelled);
 static void	xen_intr_enable_source(struct intsrc *isrc);
 static void	xen_intr_disable_source(struct intsrc *isrc, int eoi);
 static void	xen_intr_eoi_source(struct intsrc *isrc);
@@ -334,7 +334,7 @@ xen_intr_release_isrc(struct xenisrc *isrc)
 		evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 		evtchn_cpu_unmask_port(0, isrc->xi_port);
 
-	if (isrc->xi_close != 0) {
+	if (isrc->xi_close != 0 && is_valid_evtchn(isrc->xi_port)) {
 		struct evtchn_close close = { .port = isrc->xi_port };
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
 			panic("EVTCHNOP_close failed");
@@ -408,6 +408,7 @@ xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_
 		return (error);
 	}
 	*isrcp = isrc;
+	evtchn_unmask_port(local_port);
 	return (0);
 }
 
@@ -571,6 +572,9 @@ xen_intr_init(void *dummy __unused)
 	struct xen_intr_pcpu_data *pcpu;
 	int i;
 
+	if (!xen_domain())
+		return (0);
+
 	mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF);
 
 	/*
@@ -602,20 +606,116 @@ xen_intr_suspend(struct pic *unused)
 {
 }
 
+static void
+xen_rebind_ipi(struct xenisrc *isrc)
+{
+#ifdef SMP
+	int cpu = isrc->xi_cpu;
+	int acpi_id = pcpu_find(cpu)->pc_acpi_id;
+	int error;
+	struct evtchn_bind_ipi bind_ipi = { .vcpu = acpi_id };
+
+	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+	    &bind_ipi);
+	if (error != 0)
+		panic("unable to rebind xen IPI: %d", error);
+
+	isrc->xi_port = bind_ipi.port;
+	isrc->xi_cpu = 0;
+	xen_intr_port_to_isrc[bind_ipi.port] = isrc;
+
+	error = xen_intr_assign_cpu(&isrc->xi_intsrc,
+	    cpu_apic_ids[cpu]);
+	if (error)
+		panic("unable to bind xen IPI to CPU#%d: %d",
+		    cpu, error);
+
+	evtchn_unmask_port(bind_ipi.port);
+#else
+	panic("Resume IPI event channel on UP");
+#endif
+}
+
+static void
+xen_rebind_virq(struct xenisrc *isrc)
+{
+	int cpu = isrc->xi_cpu;
+	int acpi_id = pcpu_find(cpu)->pc_acpi_id;
+	int error;
+	struct evtchn_bind_virq bind_virq = { .virq = isrc->xi_virq,
+	                                      .vcpu = acpi_id };
+
+	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+	    &bind_virq);
+	if (error != 0)
+		panic("unable to rebind xen VIRQ#%d: %d", isrc->xi_virq, error);
+
+	isrc->xi_port = bind_virq.port;
+	isrc->xi_cpu = 0;
+	xen_intr_port_to_isrc[bind_virq.port] = isrc;
+
+#ifdef SMP
+	error = xen_intr_assign_cpu(&isrc->xi_intsrc,
+	    cpu_apic_ids[cpu]);
+	if (error)
+		panic("unable to bind xen VIRQ#%d to CPU#%d: %d",
+		    isrc->xi_virq, cpu, error);
+#endif
+
+	evtchn_unmask_port(bind_virq.port);
+}
+
 /**
  * Return this PIC to service after being suspended.
 */
 static void
-xen_intr_resume(struct pic *unused)
+xen_intr_resume(struct pic *unused, bool suspend_cancelled)
 {
-	u_int port;
+	shared_info_t *s = HYPERVISOR_shared_info;
+	struct xenisrc *isrc;
+	u_int isrc_idx;
+	int i;
 
-	/*
-	 * Mask events for all ports.  They will be unmasked after
-	 * drivers have re-registered their handlers.
-	 */
-	for (port = 0; port < NR_EVENT_CHANNELS; port++)
-		evtchn_mask_port(port);
+	if (suspend_cancelled)
+		return;
+
+	/* Reset the per-CPU masks */
+	CPU_FOREACH(i) {
+		struct xen_intr_pcpu_data *pcpu;
+
+		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
+		memset(pcpu->evtchn_enabled,
+		    i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled));
+	}
+
+	/* Mask all event channels. */
+	for (i = 0; i < nitems(s->evtchn_mask); i++)
+		atomic_store_rel_long(&s->evtchn_mask[i], ~0);
+
+	/* Remove port -> isrc mappings */
+	memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc));
+
+	/* Free unused isrcs and rebind VIRQs and IPIs */
+	for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx++) {
+		u_int vector;
+
+		vector = FIRST_EVTCHN_INT + isrc_idx;
+		isrc = (struct xenisrc *)intr_lookup_source(vector);
+		if (isrc != NULL) {
+			isrc->xi_port = 0;
+			switch (isrc->xi_type) {
+			case EVTCHN_TYPE_IPI:
+				xen_rebind_ipi(isrc);
+				break;
+			case EVTCHN_TYPE_VIRQ:
+				xen_rebind_virq(isrc);
+				break;
+			default:
+				isrc->xi_cpu = 0;
+				break;
+			}
+		}
+	}
 }
 
 /**
@@ -693,6 +793,7 @@ xen_intr_config_intr(struct intsrc *isrc, enum int
 static int
 xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
 {
+#ifdef SMP
 	struct evtchn_bind_vcpu bind_vcpu;
 	struct xenisrc *isrc;
 	u_int to_cpu, acpi_id;
@@ -749,6 +850,9 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_in
 	}
 	mtx_unlock(&xen_intr_isrc_lock);
 	return (0);
+#else
+	return (EOPNOTSUPP);
+#endif
 }
 
 /*------------------- Virtual Interrupt Source PIC Functions -----------------*/
@@ -979,8 +1083,11 @@ xen_intr_bind_virq(device_t dev, u_int virq, u_int
 
 	error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ,
 	    dev, filter, handler, arg, flags, port_handlep);
+
+#ifdef SMP
 	if (error == 0)
 		error = intr_event_bind(isrc->xi_intsrc.is_event, cpu);
+#endif
 
 	if (error != 0) {
 		evtchn_close_t close = { .port = bind_virq.port };
@@ -991,6 +1098,7 @@ xen_intr_bind_virq(device_t dev, u_int virq, u_int
 		return (error);
 	}
 
+#ifdef SMP
 	if (isrc->xi_cpu != cpu) {
 		/*
 		 * Too early in the boot process for the generic interrupt
@@ -1000,12 +1108,15 @@ xen_intr_bind_virq(device_t dev, u_int virq, u_int
 		 */
 		xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]);
 	}
+#endif
 
 	/*
 	 * The Event Channel API opened this port, so it is
 	 * responsible for closing it automatically on unbind.
 	 */
 	isrc->xi_close = 1;
+	isrc->xi_virq = virq;
+
 	return (0);
 }
 
@@ -1014,6 +1125,7 @@ xen_intr_alloc_and_bind_ipi(device_t dev, u_int cp
     driver_filter_t filter, enum intr_type flags,
     xen_intr_handle_t *port_handlep)
 {
+#ifdef SMP
 	int acpi_id = pcpu_find(cpu)->pc_acpi_id;
 	struct xenisrc *isrc;
 	struct evtchn_bind_ipi bind_ipi = { .vcpu = acpi_id };
@@ -1063,6 +1175,9 @@ xen_intr_alloc_and_bind_ipi(device_t dev, u_int cp
 	 */
 	isrc->xi_close = 1;
 	return (0);
+#else
+	return (EOPNOTSUPP);
+#endif
 }
 
 int
Index: xen/hvm.h
===================================================================
--- xen/hvm.h	(revision 255520)
+++ xen/hvm.h	(working copy)
@@ -93,6 +93,6 @@ enum {
 void	xen_hvm_set_callback(device_t);
 void	xen_hvm_suspend(void);
-void	xen_hvm_resume(void);
+void	xen_hvm_resume(bool suspend_cancelled);
 void	xen_hvm_init_cpu(void);
 
 #endif	/* __XEN_HVM_H__ */
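
The sketches below are editorial illustrations, not part of the patch.
First, the new TC_FLAGS_SUSPEND_SAFE flag: a clock driver whose counter
keeps ticking across a suspension can set it so that
acpi_timer_suspend_handler() leaves the timecounter alone, which is what
xentimer_attach() does above.  The driver, its "EXAMPLE" name and
read_hw_counter() are hypothetical.

/*
 * Minimal sketch of registering a timecounter that stays functional
 * across suspend/resume.  With TC_FLAGS_SUSPEND_SAFE set, the ACPI
 * timer code returns early and never swaps the active timecounter
 * around a suspension.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timetc.h>

static u_int	example_get_timecount(struct timecounter *tc);

static struct timecounter example_tc = {
	.tc_get_timecount = example_get_timecount,
	.tc_counter_mask = ~0u,
	.tc_frequency = 0,		/* filled in at attach time */
	.tc_name = "EXAMPLE",
	.tc_quality = 950,
	.tc_flags = TC_FLAGS_SUSPEND_SAFE,	/* survives suspend */
};

static u_int
example_get_timecount(struct timecounter *tc)
{

	return (read_hw_counter());	/* hypothetical hardware read */
}

static void
example_tc_attach(uint64_t freq)
{

	example_tc.tc_frequency = freq;
	tc_init(&example_tc);
}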
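
Next, a caller-side sketch of the suspend_cpus()/resume_cpus() pairing
added in subr_smp.c, condensed from the xctrl_suspend() flow above.  It
assumes the calling thread is already bound to CPU#0, as the patch
arranges with sched_bind(); the actual suspend operation is elided.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/pcpu.h>
#include <sys/smp.h>

static void
example_suspend_aps(void)
{
	cpuset_t map;

	/* Park every CPU except ourselves in cpususpend_handler(). */
	map = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &map);
	if (!CPU_EMPTY(&map))
		suspend_cpus(map);

	/* ... the suspend operation itself would happen here ... */

	/*
	 * Release the APs.  resume_cpus() waits for each CPU to clear
	 * its bit in suspended_cpus rather than stopped_cpus, which is
	 * why generic_restart_cpus() grew the type argument.
	 */
	if (!CPU_EMPTY(&map))
		resume_cpus(map);
}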
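
The pic_resume signature change threads suspend_cancelled down to every
interrupt controller.  A hypothetical x86 PIC built against the new
signature could use the flag to skip reprogramming when the hardware
state was never lost, mirroring the early return in xen_intr_resume()
above; example_pic and example_reprogram() are placeholders.

#include <sys/param.h>
#include <sys/systm.h>
#include <machine/intr_machdep.h>

static void	example_pic_resume(struct pic *pic, bool suspend_cancelled);

static struct pic example_pic = {
	.pic_resume = example_pic_resume,
	/* remaining methods omitted for brevity */
};

static void
example_pic_resume(struct pic *pic, bool suspend_cancelled)
{

	/*
	 * On a cancelled suspend the hardware was never powered down,
	 * so the controller state is still intact and reprogramming
	 * it would only discard pending state.
	 */
	if (suspend_cancelled)
		return;

	example_reprogram(pic);		/* hypothetical full re-init */
}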
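
Finally, the per-CPU resume issue noted for timer.c generalizes:
device_resume() methods run on a single CPU, so per-CPU hardware has to
be reprogrammed on every processor explicitly.  A distilled sketch of
the smp_rendezvous() pattern used by xentimer_resume() follows;
example_restart_local_timer() stands in for the real per-CPU work.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/smp.h>

static void
example_percpu_resume(void *arg)
{
	device_t dev = arg;

	/*
	 * Runs on every CPU from the rendezvous IPI handler, which is
	 * the right context for programming per-CPU timer hardware.
	 */
	example_restart_local_timer(dev);	/* hypothetical */
}

static int
example_resume(device_t dev)
{

	/* NULL setup/teardown; run the action function on all CPUs. */
	smp_rendezvous(NULL, example_percpu_resume, NULL, dev);
	return (0);
}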