Index: conf/options.i386 =================================================================== --- conf/options.i386 (.../stable/6/sys) (revision 184012) +++ conf/options.i386 (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -163,3 +163,6 @@ # Debugging KDB_STOP_NMI opt_kdb.h NPX_DEBUG opt_npx.h + +NATIVE opt_global.h +XEN opt_global.h Index: conf/kern.pre.mk =================================================================== --- conf/kern.pre.mk (.../stable/6/sys) (revision 184012) +++ conf/kern.pre.mk (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -20,12 +20,12 @@ COPTFLAGS?= -O .else . if defined(DEBUG) -_MINUS_O= -O +_MINUS_O= -O -fno-optimize-sibling-calls . else _MINUS_O= -O2 . endif . if ${MACHINE_ARCH} == "amd64" -COPTFLAGS?=-O2 -frename-registers -pipe +COPTFLAGS?=${_MINUS_O} -frename-registers -pipe . else COPTFLAGS?=${_MINUS_O} -pipe . endif @@ -70,6 +70,9 @@ # .. and the same for em INCLUDES+= -I$S/dev/em +INCLUDES+= -I$S/xen/interface -I$S/xen/interface/io -I$S/xen/interface/hvm + + CFLAGS= ${COPTFLAGS} ${CWARNFLAGS} ${DEBUG} CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h .if ${CC} != "icc" Index: conf/files.i386 =================================================================== --- conf/files.i386 (.../stable/6/sys) (revision 184012) +++ conf/files.i386 (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -291,8 +291,8 @@ i386/i386/atomic.c standard \ compile-with "${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" i386/i386/autoconf.c standard -i386/i386/bios.c standard -i386/i386/bioscall.s standard +i386/i386/bios.c optional native +i386/i386/bioscall.s optional native i386/i386/busdma_machdep.c standard i386/i386/db_disasm.c optional ddb i386/i386/db_interface.c optional ddb @@ -301,7 +301,8 @@ i386/i386/elan-mmcr.c optional cpu_elan i386/i386/elan-mmcr.c optional cpu_soekris i386/i386/elf_machdep.c standard -i386/i386/exception.s standard +i386/i386/exception.s optional native 
+i386/xen/exception.s optional xen i386/i386/gdb_machdep.c optional gdb i386/i386/geode.c optional cpu_geode i386/i386/i686_mem.c optional mem @@ -314,22 +315,27 @@ i386/i386/k6_mem.c optional mem i386/i386/legacy.c standard i386/i386/local_apic.c optional apic -i386/i386/locore.s standard no-obj +i386/i386/locore.s optional native no-obj +i386/xen/locore.s optional xen no-obj i386/i386/longrun.c optional cpu_enable_longrun i386/i386/machdep.c standard i386/i386/mem.c optional mem i386/i386/minidump_machdep.c standard i386/i386/mp_clock.c optional smp -i386/i386/mp_machdep.c optional smp +i386/i386/mp_machdep.c optional native smp +i386/xen/mp_machdep.c optional xen smp i386/i386/mp_watchdog.c optional mp_watchdog smp -i386/i386/mpboot.s optional smp -i386/i386/mptable.c optional apic +i386/i386/mpboot.s optional native smp +i386/xen/mptable.c optional apic xen +i386/i386/mptable.c optional apic native i386/i386/mptable_pci.c optional apic pci i386/i386/msi.c optional apic pci i386/i386/nexus.c standard i386/i386/perfmon.c optional perfmon i386/i386/perfmon.c optional perfmon profiling-routine -i386/i386/pmap.c standard +i386/i386/pmap.c optional native +i386/xen/pmap.c optional xen +i386/xen/xen_machdep.c optional xen i386/i386/ptrace_machdep.c standard i386/i386/support.s standard i386/i386/swtch.s standard @@ -358,9 +364,10 @@ i386/ibcs2/ibcs2_xenix.c optional ibcs2 i386/ibcs2/ibcs2_xenix_sysent.c optional ibcs2 i386/ibcs2/imgact_coff.c optional ibcs2 -i386/isa/atpic.c standard +i386/isa/atpic.c optional atpic #i386/isa/atpic_vector.s standard -i386/isa/clock.c standard +i386/isa/clock.c optional native +i386/xen/clock.c optional xen i386/isa/elcr.c standard i386/isa/elink.c optional ep i386/isa/elink.c optional ie Index: conf/files =================================================================== --- conf/files (.../stable/6/sys) (revision 184012) +++ conf/files (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -1475,6 +1475,7 @@ libkern/strcat.c standard 
libkern/strcmp.c standard libkern/strcpy.c standard +libkern/strcspn.c standard libkern/strdup.c standard libkern/strlcat.c standard libkern/strlcpy.c standard @@ -2043,4 +2044,41 @@ xdr/xdr_mbuf.c optional nfslockd xdr/xdr_mem.c optional nfslockd xdr/xdr_reference.c optional nfslockd -xdr/xdr_sizeof.c optional nfslockd \ No newline at end of file +xdr/xdr_sizeof.c optional nfslockd + + +xen/gnttab.c optional xen +xen/features.c optional xen +xen/evtchn/evtchn.c optional xen +xen/evtchn/evtchn_dev.c optional xen +xen/reboot.c optional xen +xen/xenbus/xenbus_client.c optional xen +xen/xenbus/xenbus_comms.c optional xen +xen/xenbus/xenbus_dev.c optional xen +xen/xenbus/xenbus_if.m optional xen +xen/xenbus/xenbus_probe.c optional xen +#xen/xenbus/xenbus_probe_backend.c optional xen +xen/xenbus/xenbus_xs.c optional xen +dev/xen/balloon/balloon.c optional xen +dev/xen/balloon/balloon.c optional xenhvm +dev/xen/console/console.c optional xen +dev/xen/console/xencons_ring.c optional xen +dev/xen/blkfront/blkfront.c optional xen +dev/xen/netfront/netfront.c optional xen +dev/xen/blkfront/blkfront.c optional xenhvm +dev/xen/netfront/netfront.c optional xenhvm + +xen/gnttab.c optional xenhvm +xen/features.c optional xenhvm +dev/xen/xenpci/evtchn.c optional xenhvm +dev/xen/xenpci/machine_reboot.c optional xenhvm +xen/evtchn/evtchn_dev.c optional xenhvm +xen/reboot.c optional xenhvm +xen/xenbus/xenbus_client.c optional xenhvm +xen/xenbus/xenbus_comms.c optional xenhvm +xen/xenbus/xenbus_dev.c optional xenhvm +xen/xenbus/xenbus_if.m optional xenhvm +xen/xenbus/xenbus_probe.c optional xenhvm +#xen/xenbus/xenbus_probe_backend.c optional xenhvm +xen/xenbus/xenbus_xs.c optional xenhvm +dev/xen/xenpci/xenpci.c optional xenpci Index: conf/options.amd64 =================================================================== --- conf/options.amd64 (.../stable/6/sys) (revision 184012) +++ conf/options.amd64 (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -58,3 +58,5 @@ # Debugging 
KDB_STOP_NMI opt_kdb.h + +XENHVM opt_global.h Index: kern/kern_timeout.c =================================================================== --- kern/kern_timeout.c (.../stable/6/sys) (revision 184012) +++ kern/kern_timeout.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -557,7 +557,7 @@ mtx_unlock_spin(&callout_lock); sleepq_add(&callout_wait, &callout_lock.mtx_object, "codrain", - SLEEPQ_MSLEEP, 0); + SLEEPQ_SLEEP, 0); sleepq_wait(&callout_wait); sq_locked = 0; Index: kern/kern_mutex.c =================================================================== --- kern/kern_mutex.c (.../stable/6/sys) (revision 184012) +++ kern/kern_mutex.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -92,25 +92,66 @@ #ifdef DDB static void db_show_mtx(struct lock_object *lock); #endif +static void lock_mtx(struct lock_object *lock, int how); +static void lock_spin(struct lock_object *lock, int how); +static int unlock_mtx(struct lock_object *lock); +static int unlock_spin(struct lock_object *lock); + /* * Lock classes for sleep and spin mutexes. 
*/ struct lock_class lock_class_mtx_sleep = { - "sleep mutex", - LC_SLEEPLOCK | LC_RECURSABLE, + .lc_name = "sleep mutex", + .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE, #ifdef DDB - db_show_mtx + .lc_ddb_show = db_show_mtx, #endif + .lc_lock = lock_mtx, + .lc_unlock = unlock_mtx, }; struct lock_class lock_class_mtx_spin = { - "spin mutex", - LC_SPINLOCK | LC_RECURSABLE, + .lc_name = "spin mutex", + .lc_flags = LC_SPINLOCK | LC_RECURSABLE, #ifdef DDB - db_show_mtx + .lc_ddb_show = db_show_mtx, #endif + .lc_lock = lock_spin, + .lc_unlock = unlock_spin, }; +void +lock_mtx(struct lock_object *lock, int how) +{ + + mtx_lock((struct mtx *)lock); +} + +void +lock_spin(struct lock_object *lock, int how) +{ + + panic("spin locks can only use msleep_spin"); +} + +int +unlock_mtx(struct lock_object *lock) +{ + struct mtx *m; + + m = (struct mtx *)lock; + mtx_assert(m, MA_OWNED | MA_NOTRECURSED); + mtx_unlock(m); + return (0); +} + +int +unlock_spin(struct lock_object *lock) +{ + + panic("spin locks can only use msleep_spin"); +} + /* * System-wide mutexes */ Index: kern/kern_synch.c =================================================================== --- kern/kern_synch.c (.../stable/6/sys) (revision 184012) +++ kern/kern_synch.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -64,11 +64,18 @@ #include +#ifdef XEN +#include +#include +#include +#endif + static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL) int hogticks; int lbolt; +static int pause_wchan; static struct callout loadav_callout; static struct callout lbolt_callout; @@ -100,7 +107,144 @@ init_sleepqueues(); } + /* + * General sleep call. Suspends the current thread until a wakeup is + * performed on the specified identifier. The thread will then be made + * runnable with the specified priority. Sleeps at most timo/hz seconds + * (0 means no timeout). 
If pri includes PCATCH flag, signals are checked + * before and after sleeping, else signals are not checked. Returns 0 if + * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a + * signal needs to be delivered, ERESTART is returned if the current system + * call should be restarted if possible, and EINTR is returned if the system + * call should be interrupted by the signal (return EINTR). + * + * The lock argument is unlocked before the caller is suspended, and + * re-locked before _sleep() returns. If priority includes the PDROP + * flag the lock is not re-locked before returning. + */ +int +_sleep(void *ident, struct lock_object *lock, int priority, + const char *wmesg, int timo) +{ + struct thread *td; + struct proc *p; + struct lock_class *class; + int catch, flags, lock_state, pri, rval; + WITNESS_SAVE_DECL(lock_witness); + + td = curthread; + p = td->td_proc; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, + "Sleeping on \"%s\"", wmesg); + KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL || + ident == &lbolt, ("sleeping without a lock")); + KASSERT(p != NULL, ("msleep1")); + KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); + if (lock != NULL) + class = LOCK_CLASS(lock); + else + class = NULL; + + if (cold) { + /* + * During autoconfiguration, just return; + * don't run any other threads or panic below, + * in case this is the idle thread and already asleep. + * XXX: this used to do "s = splhigh(); splx(safepri); + * splx(s);" to give interrupts a chance, but there is + * no way to give interrupts a chance now. + */ + if (lock != NULL && priority & PDROP) + class->lc_unlock(lock); + return (0); + } + catch = priority & PCATCH; + rval = 0; + + /* + * If we are already on a sleep queue, then remove us from that + * sleep queue first. We have to do this to handle recursive + * sleeps. 
+ */ + if (TD_ON_SLEEPQ(td)) + sleepq_remove(td, td->td_wchan); + + if (ident == &pause_wchan) + flags = SLEEPQ_PAUSE; + else + flags = SLEEPQ_SLEEP; + if (catch) + flags |= SLEEPQ_INTERRUPTIBLE; + + sleepq_lock(ident); + CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", + td->td_tid, p->p_pid, p->p_comm, wmesg, ident); + + DROP_GIANT(); + if (lock != NULL && !(class->lc_flags & LC_SLEEPABLE)) { + WITNESS_SAVE(lock, lock_witness); + lock_state = class->lc_unlock(lock); + } else + /* GCC needs to follow the Yellow Brick Road */ + lock_state = -1; + + /* + * We put ourselves on the sleep queue and start our timeout + * before calling thread_suspend_check, as we could stop there, + * and a wakeup or a SIGCONT (or both) could occur while we were + * stopped without resuming us. Thus, we must be ready for sleep + * when cursig() is called. If the wakeup happens while we're + * stopped, then td will no longer be on a sleep queue upon + * return from cursig(). + */ + sleepq_add(ident, ident == &lbolt ? NULL : lock, wmesg, flags, 0); + if (timo) + sleepq_set_timeout(ident, timo); + if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { + sleepq_release(ident); + WITNESS_SAVE(lock, lock_witness); + lock_state = class->lc_unlock(lock); + sleepq_lock(ident); + } + + /* + * Adjust this thread's priority, if necessary. + */ + pri = priority & PRIMASK; + if (pri != 0 && pri != td->td_priority) { + mtx_lock_spin(&sched_lock); + sched_prio(td, pri); + mtx_unlock_spin(&sched_lock); + } + + if (timo && catch) + rval = sleepq_timedwait_sig(ident); + else if (timo) + rval = sleepq_timedwait(ident); + else if (catch) + rval = sleepq_wait_sig(ident); + else { + sleepq_wait(ident); + rval = 0; + } +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + if (lock != NULL && !(priority & PDROP)) { + class->lc_lock(lock, lock_state); + WITNESS_RESTORE(lock, lock_witness); + } + return (rval); +} + +/* * General sleep call. 
Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds @@ -164,7 +308,7 @@ if (TD_ON_SLEEPQ(td)) sleepq_remove(td, td->td_wchan); - flags = SLEEPQ_MSLEEP; + flags = SLEEPQ_SLEEP; if (catch) flags |= SLEEPQ_INTERRUPTIBLE; @@ -265,7 +409,7 @@ /* * We put ourselves on the sleep queue and start our timeout. */ - sleepq_add(ident, &mtx->mtx_object, wmesg, SLEEPQ_MSLEEP, 0); + sleepq_add(ident, &mtx->mtx_object, wmesg, SLEEPQ_SLEEP, 0); if (timo) sleepq_set_timeout(ident, timo); @@ -314,7 +458,7 @@ { sleepq_lock(ident); - sleepq_broadcast(ident, SLEEPQ_MSLEEP, -1, 0); + sleepq_broadcast(ident, SLEEPQ_SLEEP, -1, 0); } /* @@ -328,7 +472,7 @@ { sleepq_lock(ident); - sleepq_signal(ident, SLEEPQ_MSLEEP, -1, 0); + sleepq_signal(ident, SLEEPQ_SLEEP, -1, 0); } /* @@ -417,6 +561,9 @@ td, td->td_proc->p_comm, td->td_priority, td->td_inhibitors, td->td_wmesg, td->td_lockname); #endif +#ifdef XEN + PT_UPDATES_FLUSH(); +#endif sched_switch(td, newtd, flags); CTR3(KTR_SCHED, "mi_switch: running %p(%s) prio %d", td, td->td_proc->p_comm, td->td_priority); Index: kern/subr_trap.c =================================================================== --- kern/subr_trap.c (.../stable/6/sys) (revision 184012) +++ kern/subr_trap.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -67,6 +67,12 @@ #include #include +#ifdef XEN +#include +#include +#include +#endif + /* * Define the code needed before returning to user mode, for * trap and syscall. 
@@ -139,6 +145,9 @@ sched_userret(td); KASSERT(td->td_locks == 0, ("userret: Returning with %d locks held.", td->td_locks)); +#ifdef XEN + PT_UPDATES_FLUSH(); +#endif } /* Index: kern/kern_rwlock.c =================================================================== --- kern/kern_rwlock.c (.../stable/6/sys) (revision 184012) +++ kern/kern_rwlock.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -60,13 +60,19 @@ static void db_show_rwlock(struct lock_object *lock); #endif +static void lock_rw(struct lock_object *lock, int how); +static int unlock_rw(struct lock_object *lock); + struct lock_class lock_class_rw = { .lc_name = "rw", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE, #ifdef DDB .lc_ddb_show = db_show_rwlock, #endif + .lc_lock = lock_rw, + .lc_unlock = unlock_rw }; + /* * Return a pointer to the owning thread if the lock is write-locked or @@ -99,6 +105,34 @@ #endif void +lock_rw(struct lock_object *lock, int how) +{ + struct rwlock *rw; + + rw = (struct rwlock *)lock; + if (how) + rw_wlock(rw); + else + rw_rlock(rw); +} + +int +unlock_rw(struct lock_object *lock) +{ + struct rwlock *rw; + + rw = (struct rwlock *)lock; + rw_assert(rw, RA_LOCKED | LA_NOTRECURSED); + if (rw->rw_lock & RW_LOCK_READ) { + rw_runlock(rw); + return (0); + } else { + rw_wunlock(rw); + return (1); + } +} + +void rw_init_flags(struct rwlock *rw, const char *name, int opts) { int flags; Index: kern/kern_sx.c =================================================================== --- kern/kern_sx.c (.../stable/6/sys) (revision 184012) +++ kern/kern_sx.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -108,12 +108,17 @@ static void db_show_sx(struct lock_object *lock); #endif +static void lock_sx(struct lock_object *lock, int how); +static int unlock_sx(struct lock_object *lock); + struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, #ifdef DDB .lc_ddb_show = db_show_sx, #endif + .lc_lock = lock_sx, + 
.lc_unlock = unlock_sx, }; #ifndef INVARIANTS @@ -121,6 +126,34 @@ #endif void +lock_sx(struct lock_object *lock, int how) +{ + struct sx *sx; + + sx = (struct sx *)lock; + if (how) + sx_xlock(sx); + else + sx_slock(sx); +} + +int +unlock_sx(struct lock_object *lock) +{ + struct sx *sx; + + sx = (struct sx *)lock; + sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); + if (sx_xlocked(sx)) { + sx_xunlock(sx); + return (1); + } else { + sx_sunlock(sx); + return (0); + } +} + +void sx_sysinit(void *arg) { struct sx_args *sargs = arg; @@ -845,6 +878,7 @@ } } +#if 0 /* * Atomically drop an sx lock while going to sleep. This is just a hack * for 6.x. In 7.0 and later this is done more cleanly. @@ -961,6 +995,7 @@ } return (rval); } +#endif #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS Index: kern/kern_fork.c =================================================================== --- kern/kern_fork.c (.../stable/6/sys) (revision 184012) +++ kern/kern_fork.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -112,10 +112,15 @@ struct thread *td; struct vfork_args *uap; { - int error; + int error, flags; struct proc *p2; - error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); +#ifdef XEN + flags = RFFDG | RFPROC; +#else + flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; +#endif + error = fork1(td, flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; Index: kern/kern_lock.c =================================================================== --- kern/kern_lock.c (.../stable/6/sys) (revision 184012) +++ kern/kern_lock.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -62,11 +62,48 @@ #include #endif + +#ifdef DDB +#include +static void db_show_lockmgr(struct lock_object *lock); +#endif +static void lock_lockmgr(struct lock_object *lock, int how); +static int unlock_lockmgr(struct lock_object *lock); + +struct lock_class lock_class_lockmgr = { + .lc_name = "lockmgr", + .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, +#ifdef DDB + 
.lc_ddb_show = db_show_lockmgr, +#endif + .lc_lock = lock_lockmgr, + .lc_unlock = unlock_lockmgr, +}; + /* * Locking primitives implementation. * Locks provide shared/exclusive sychronization. */ +void +lock_lockmgr(struct lock_object *lock, int how) +{ + + panic("lockmgr locks do not support sleep interlocking"); +} + +int +unlock_lockmgr(struct lock_object *lock) +{ + + panic("lockmgr locks do not support sleep interlocking"); +} + +/* + * Locking primitives implementation. + * Locks provide shared/exclusive sychronization. + */ + #define COUNT(td, x) if ((td)) (td)->td_locks += (x) #define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \ LK_SHARE_NONZERO | LK_WAIT_NONZERO) @@ -639,14 +676,13 @@ return (1); } -DB_SHOW_COMMAND(lockmgr, db_show_lockmgr) +void +db_show_lockmgr(struct lock_object *lock) { struct thread *td; struct lock *lkp; - if (!have_addr) - return; - lkp = (struct lock *)addr; + lkp = (struct lock *)lock; db_printf("lock type: %s\n", lkp->lk_wmesg); db_printf("state: "); Index: kern/kern_condvar.c =================================================================== --- kern/kern_condvar.c (.../stable/6/sys) (revision 184012) +++ kern/kern_condvar.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -124,8 +124,7 @@ DROP_GIANT(); mtx_unlock(mp); - sleepq_add(cvp, &mp->mtx_object, cvp->cv_description, SLEEPQ_CONDVAR, - 0); + sleepq_add(cvp, &mp->mtx_object, cvp->cv_description, SLEEPQ_CONDVAR, 0); sleepq_wait(cvp); #ifdef KTRACE @@ -232,8 +231,7 @@ DROP_GIANT(); mtx_unlock(mp); - sleepq_add(cvp, &mp->mtx_object, cvp->cv_description, SLEEPQ_CONDVAR, - 0); + sleepq_add(cvp, &mp->mtx_object, cvp->cv_description, SLEEPQ_CONDVAR, 0); sleepq_set_timeout(cvp, timo); rval = sleepq_timedwait(cvp); Index: dev/xen/netfront/mbufq.h =================================================================== --- dev/xen/netfront/mbufq.h (.../stable/6/sys) (revision 0) +++ dev/xen/netfront/mbufq.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,123 @@ 
+/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +$FreeBSD$ + +***************************************************************************/ + +#ifndef CXGB_MBUFQ_H_ +#define CXGB_MBUFQ_H_ + +struct mbuf_head { + struct mbuf *head; + struct mbuf *tail; + uint32_t qlen; + uint32_t qsize; + struct mtx lock; +}; + +static __inline void +mbufq_init(struct mbuf_head *l) +{ + l->head = l->tail = NULL; + l->qlen = l->qsize = 0; +} + +static __inline int +mbufq_empty(struct mbuf_head *l) +{ + return (l->head == NULL); +} + +static __inline int +mbufq_len(struct mbuf_head *l) +{ + return (l->qlen); +} + +static __inline int +mbufq_size(struct mbuf_head *l) +{ + return (l->qsize); +} + +static __inline int +mbufq_head_size(struct mbuf_head *l) +{ + return (l->head ? l->head->m_pkthdr.len : 0); +} + +static __inline void +mbufq_tail(struct mbuf_head *l, struct mbuf *m) +{ + l->qlen++; + if (l->head == NULL) + l->head = m; + else + l->tail->m_nextpkt = m; + l->tail = m; + l->qsize += m->m_pkthdr.len; +} + +static __inline struct mbuf * +mbufq_dequeue(struct mbuf_head *l) +{ + struct mbuf *m; + + m = l->head; + if (m) { + if (m == l->tail) + l->head = l->tail = NULL; + else + l->head = m->m_nextpkt; + m->m_nextpkt = NULL; + l->qlen--; + l->qsize -= m->m_pkthdr.len; + } + + return (m); +} + +static __inline struct mbuf * +mbufq_peek(struct mbuf_head *l) +{ + return (l->head); +} + +static __inline void +mbufq_append(struct mbuf_head *a, struct mbuf_head *b) +{ + if (a->tail) + a->tail->m_nextpkt = b->head; + if (b->tail) + a->tail = b->tail; + a->qlen += b->qlen; + a->qsize += b->qsize; + + +} +#endif /* CXGB_MBUFQ_H_ */ Property changes on: dev/xen/netfront/mbufq.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/netfront/netfront.c =================================================================== --- dev/xen/netfront/netfront.c (.../stable/6/sys) (revision 0) +++ dev/xen/netfront/netfront.c 
(.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,1988 @@ +/* + * + * Copyright (c) 2004-2006 Kip Macy + * All rights reserved. + * + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#if __FreeBSD_version >= 700000 +#include +#include +#endif + +#include +#include + +#include /* for DELAY */ +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xenbus_if.h" + +#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO) + +#define GRANT_INVALID_REF 0 + +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) + +#if __FreeBSD_version >= 700000 +/* + * Should the driver do LRO on the RX end + * this can be toggled on the fly, but the + * interface must be reset (down/up) for it + * to take effect. 
+ */ +static int xn_enable_lro = 1; +TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro); +#else + +#define IFCAP_TSO4 0 +#define CSUM_TSO 0 + +#endif + +#ifdef CONFIG_XEN +static int MODPARM_rx_copy = 0; +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)"); +static int MODPARM_rx_flip = 0; +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0); +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)"); +#else +static const int MODPARM_rx_copy = 1; +static const int MODPARM_rx_flip = 0; +#endif + +#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2) +#define RX_COPY_THRESHOLD 256 + +#define net_ratelimit() 0 + +struct netfront_info; +struct netfront_rx_info; + +static void xn_txeof(struct netfront_info *); +static void xn_rxeof(struct netfront_info *); +static void network_alloc_rx_buffers(struct netfront_info *); + +static void xn_tick_locked(struct netfront_info *); +static void xn_tick(void *); + +static void xn_intr(void *); +static void xn_start_locked(struct ifnet *); +static void xn_start(struct ifnet *); +static int xn_ioctl(struct ifnet *, u_long, caddr_t); +static void xn_ifinit_locked(struct netfront_info *); +static void xn_ifinit(void *); +static void xn_stop(struct netfront_info *); +#ifdef notyet +static void xn_watchdog(struct ifnet *); +#endif + +static void show_device(struct netfront_info *sc); +#ifdef notyet +static void netfront_closing(device_t dev); +#endif +static void netif_free(struct netfront_info *info); +static int netfront_detach(device_t dev); + +static int talk_to_backend(device_t dev, struct netfront_info *info); +static int create_netdev(device_t dev); +static void netif_disconnect_backend(struct netfront_info *info); +static int setup_device(device_t dev, struct netfront_info *info); +static void end_access(int ref, void *page); + +/* Xenolinux helper functions */ +int network_connect(struct netfront_info *); + +static void 
xn_free_rx_ring(struct netfront_info *); + +static void xn_free_tx_ring(struct netfront_info *); + +static int xennet_get_responses(struct netfront_info *np, + struct netfront_rx_info *rinfo, RING_IDX rp, struct mbuf **list, + int *pages_flipped_p); + +#define virt_to_mfn(x) (vtomach(x) >> PAGE_SHIFT) + +#define INVALID_P2M_ENTRY (~0UL) + +/* + * Mbuf pointers. We need these to keep track of the virtual addresses + * of our mbuf chains since we can only convert from virtual to physical, + * not the other way around. The size must track the free index arrays. + */ +struct xn_chain_data { + struct mbuf *xn_tx_chain[NET_TX_RING_SIZE+1]; + struct mbuf *xn_rx_chain[NET_RX_RING_SIZE+1]; +}; + + +struct net_device_stats +{ + u_long rx_packets; /* total packets received */ + u_long tx_packets; /* total packets transmitted */ + u_long rx_bytes; /* total bytes received */ + u_long tx_bytes; /* total bytes transmitted */ + u_long rx_errors; /* bad packets received */ + u_long tx_errors; /* packet transmit problems */ + u_long rx_dropped; /* no space in linux buffers */ + u_long tx_dropped; /* no space available in linux */ + u_long multicast; /* multicast packets received */ + u_long collisions; + + /* detailed rx_errors: */ + u_long rx_length_errors; + u_long rx_over_errors; /* receiver ring buff overflow */ + u_long rx_crc_errors; /* recved pkt with crc error */ + u_long rx_frame_errors; /* recv'd frame alignment error */ + u_long rx_fifo_errors; /* recv'r fifo overrun */ + u_long rx_missed_errors; /* receiver missed packet */ + + /* detailed tx_errors */ + u_long tx_aborted_errors; + u_long tx_carrier_errors; + u_long tx_fifo_errors; + u_long tx_heartbeat_errors; + u_long tx_window_errors; + + /* for cslip etc */ + u_long rx_compressed; + u_long tx_compressed; +}; + +struct netfront_info { + + struct ifnet *xn_ifp; +#if __FreeBSD_version >= 700000 + struct lro_ctrl xn_lro; +#endif + + struct net_device_stats stats; + u_int tx_full; + + netif_tx_front_ring_t tx; + 
netif_rx_front_ring_t rx; + + struct mtx tx_lock; + struct mtx rx_lock; + struct sx sc_lock; + + u_int handle; + u_int irq; + u_int copying_receiver; + u_int carrier; + + /* Receive-ring batched refills. */ +#define RX_MIN_TARGET 32 +#define RX_MAX_TARGET NET_RX_RING_SIZE + int rx_min_target, rx_max_target, rx_target; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + + grant_ref_t gref_tx_head; + grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; + grant_ref_t gref_rx_head; + grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1]; + +#define TX_MAX_TARGET min(NET_RX_RING_SIZE, 256) + device_t xbdev; + int tx_ring_ref; + int rx_ring_ref; + uint8_t mac[ETHER_ADDR_LEN]; + struct xn_chain_data xn_cdata; /* mbufs */ + struct mbuf_head xn_rx_batch; /* head of the batch queue */ + + int xn_if_flags; + struct callout xn_stat_ch; + + u_long rx_pfn_array[NET_RX_RING_SIZE]; + multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1]; + mmu_update_t rx_mmu[NET_RX_RING_SIZE]; +}; + +#define rx_mbufs xn_cdata.xn_rx_chain +#define tx_mbufs xn_cdata.xn_tx_chain + +#define XN_LOCK_INIT(_sc, _name) \ + mtx_init(&(_sc)->tx_lock, #_name"_tx", "network transmit lock", MTX_DEF); \ + mtx_init(&(_sc)->rx_lock, #_name"_rx", "network receive lock", MTX_DEF); \ + sx_init(&(_sc)->sc_lock, #_name"_rx") + +#define XN_RX_LOCK(_sc) mtx_lock(&(_sc)->rx_lock) +#define XN_RX_UNLOCK(_sc) mtx_unlock(&(_sc)->rx_lock) + +#define XN_TX_LOCK(_sc) mtx_lock(&(_sc)->tx_lock) +#define XN_TX_UNLOCK(_sc) mtx_unlock(&(_sc)->tx_lock) + +#define XN_LOCK(_sc) sx_xlock(&(_sc)->sc_lock); +#define XN_UNLOCK(_sc) sx_xunlock(&(_sc)->sc_lock); + +#define XN_LOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_lock, SX_LOCKED); +#define XN_RX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->rx_lock, MA_OWNED); +#define XN_TX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->tx_lock, MA_OWNED); +#define XN_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->rx_lock); \ + mtx_destroy(&(_sc)->tx_lock); \ + 
sx_destroy(&(_sc)->sc_lock); + +struct netfront_rx_info { + struct netif_rx_response rx; + struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; +}; + +#define netfront_carrier_on(netif) ((netif)->carrier = 1) +#define netfront_carrier_off(netif) ((netif)->carrier = 0) +#define netfront_carrier_ok(netif) ((netif)->carrier) + +/* Access macros for acquiring freeing slots in xn_free_{tx,rx}_idxs[]. */ + + + +/* + * Access macros for acquiring freeing slots in tx_skbs[]. + */ + +static inline void +add_id_to_freelist(struct mbuf **list, unsigned short id) +{ + list[id] = list[0]; + list[0] = (void *)(u_long)id; +} + +static inline unsigned short +get_id_from_freelist(struct mbuf **list) +{ + u_int id = (u_int)(u_long)list[0]; + list[0] = list[id]; + return (id); +} + +static inline int +xennet_rxidx(RING_IDX idx) +{ + return idx & (NET_RX_RING_SIZE - 1); +} + +static inline struct mbuf * +xennet_get_rx_mbuf(struct netfront_info *np, + RING_IDX ri) +{ + int i = xennet_rxidx(ri); + struct mbuf *m; + + m = np->rx_mbufs[i]; + np->rx_mbufs[i] = NULL; + return (m); +} + +static inline grant_ref_t +xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri) +{ + int i = xennet_rxidx(ri); + grant_ref_t ref = np->grant_rx_ref[i]; + np->grant_rx_ref[i] = GRANT_INVALID_REF; + return ref; +} + +#ifdef DEBUG + +#endif +#define IPRINTK(fmt, args...) \ + printf("[XEN] " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printf("[XEN] " fmt, ##args) +#if 0 +#define DPRINTK(fmt, args...) \ + printf("[XEN] %s: " fmt, __func__, ##args) +#else +#define DPRINTK(fmt, args...) +#endif + +/** + * Read the 'mac' node at the given device's node in the store, and parse that + * as colon-separated octets, placing result the given mac array. mac must be + * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h). + * Return 0 on success, or errno on error. 
+ */
+static int 
+xen_net_read_mac(device_t dev, uint8_t mac[])
+{
+	int error, i;
+	char *s, *e, *macstr;
+	unsigned long octet;
+
+	error = xenbus_read(XBT_NIL, xenbus_get_node(dev), "mac", NULL,
+	    (void **) &macstr);
+	if (error)
+		return (error);
+
+	s = macstr;
+	for (i = 0; i < ETHER_ADDR_LEN; i++) {
+		octet = strtoul(s, &e, 16);
+		/*
+		 * Reject: no digits consumed, a value that does not fit in
+		 * one octet, an unexpected separator, or a string that ends
+		 * before all octets were read (stepping past the NUL would
+		 * make the next strtoul() read beyond the buffer).
+		 */
+		if (s == e || octet > 0xff ||
+		    (e[0] != ':' && e[0] != 0) ||
+		    (e[0] == 0 && i < ETHER_ADDR_LEN - 1)) {
+			free(macstr, M_DEVBUF);
+			return (ENOENT);
+		}
+		mac[i] = (uint8_t)octet;
+		s = &e[1];
+	}
+	free(macstr, M_DEVBUF);
+	return (0);
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and the ring buffers for communication with the backend, and
+ * inform the backend of the appropriate details for those.  Switch to
+ * Connected state.
+ */
+static int 
+netfront_probe(device_t dev)
+{
+
+	/* Only xenbus devices of type "vif" are claimed by this driver. */
+	if (!strcmp(xenbus_get_type(dev), "vif")) {
+		device_set_desc(dev, "Virtual Network Interface");
+		return (0);
+	}
+
+	return (ENXIO);
+}
+
+static int
+netfront_attach(device_t dev)
+{	
+	int err;
+
+	err = create_netdev(dev);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "creating netdev");
+		return err;
+	}
+
+#if __FreeBSD_version >= 700000
+	SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "enable_lro", CTLTYPE_INT|CTLFLAG_RW,
+	    &xn_enable_lro, 0, "Large Receive Offload");
+#endif
+
+	return 0;
+}
+
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart.  We tear down our netif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int
+netfront_resume(device_t dev)
+{
+	struct netfront_info *info = device_get_softc(dev);
+	
+	netif_disconnect_backend(info);
+	return (0);
+}
+
+
+/* Common code used when first setting up, and when resuming.
*/ +static int +talk_to_backend(device_t dev, struct netfront_info *info) +{ + const char *message; + struct xenbus_transaction xbt; + const char *node = xenbus_get_node(dev); + int err; + + err = xen_net_read_mac(dev, info->mac); + if (err) { + xenbus_dev_fatal(dev, err, "parsing %s/mac", node); + goto out; + } + + /* Create shared ring, alloc event channel. */ + err = setup_device(dev, info); + if (err) + goto out; + + again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_ring; + } + err = xenbus_printf(xbt, node, "tx-ring-ref","%u", + info->tx_ring_ref); + if (err) { + message = "writing tx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, node, "rx-ring-ref","%u", + info->rx_ring_ref); + if (err) { + message = "writing rx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, node, + "event-channel", "%u", irq_to_evtchn_port(info->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + err = xenbus_printf(xbt, node, "request-rx-copy", "%u", + info->copying_receiver); + if (err) { + message = "writing request-rx-copy"; + goto abort_transaction; + } + err = xenbus_printf(xbt, node, "feature-rx-notify", "%d", 1); + if (err) { + message = "writing feature-rx-notify"; + goto abort_transaction; + } + err = xenbus_printf(xbt, node, "feature-sg", "%d", 1); + if (err) { + message = "writing feature-sg"; + goto abort_transaction; + } +#if __FreeBSD_version >= 700000 + err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1); + if (err) { + message = "writing feature-gso-tcpv4"; + goto abort_transaction; + } +#endif + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_ring; + } + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, err, "%s", message); + destroy_ring: + 
netif_free(info); + out: + return err; +} + + +static int +setup_device(device_t dev, struct netfront_info *info) +{ + netif_tx_sring_t *txs; + netif_rx_sring_t *rxs; + int error; + struct ifnet *ifp; + + ifp = info->xn_ifp; + + info->tx_ring_ref = GRANT_INVALID_REF; + info->rx_ring_ref = GRANT_INVALID_REF; + info->rx.sring = NULL; + info->tx.sring = NULL; + info->irq = 0; + + txs = (netif_tx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!txs) { + error = ENOMEM; + xenbus_dev_fatal(dev, error, "allocating tx ring page"); + goto fail; + } + SHARED_RING_INIT(txs); + FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE); + error = xenbus_grant_ring(dev, virt_to_mfn(txs), &info->tx_ring_ref); + if (error) + goto fail; + + rxs = (netif_rx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!rxs) { + error = ENOMEM; + xenbus_dev_fatal(dev, error, "allocating rx ring page"); + goto fail; + } + SHARED_RING_INIT(rxs); + FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE); + + error = xenbus_grant_ring(dev, virt_to_mfn(rxs), &info->rx_ring_ref); + if (error) + goto fail; + + error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev), + "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, &info->irq); + + if (error) { + xenbus_dev_fatal(dev, error, + "bind_evtchn_to_irqhandler failed"); + goto fail; + } + + show_device(info); + + return (0); + + fail: + netif_free(info); + return (error); +} + +/** + * If this interface has an ipv4 address, send an arp for it. This + * helps to get the network going again after migrating hosts. + */ +static void +netfront_send_fake_arp(device_t dev, struct netfront_info *info) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + + ifp = info->xn_ifp; + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + arp_ifinit(ifp, ifa); + } + } +} + +/** + * Callback received when the backend's state changes. 
+ */ +static void +netfront_backend_changed(device_t dev, XenbusState newstate) +{ + struct netfront_info *sc = device_get_softc(dev); + + DPRINTK("newstate=%d\n", newstate); + + switch (newstate) { + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateConnected: + case XenbusStateUnknown: + case XenbusStateClosed: + case XenbusStateReconfigured: + case XenbusStateReconfiguring: + break; + case XenbusStateInitWait: + if (xenbus_get_state(dev) != XenbusStateInitialising) + break; + if (network_connect(sc) != 0) + break; + xenbus_set_state(dev, XenbusStateConnected); + netfront_send_fake_arp(dev, sc); + break; + case XenbusStateClosing: + xenbus_set_state(dev, XenbusStateClosed); + break; + } +} + +static void +xn_free_rx_ring(struct netfront_info *sc) +{ +#if 0 + int i; + + for (i = 0; i < NET_RX_RING_SIZE; i++) { + if (sc->xn_cdata.xn_rx_chain[i] != NULL) { + m_freem(sc->xn_cdata.xn_rx_chain[i]); + sc->xn_cdata.xn_rx_chain[i] = NULL; + } + } + + sc->rx.rsp_cons = 0; + sc->xn_rx_if->req_prod = 0; + sc->xn_rx_if->event = sc->rx.rsp_cons ; +#endif +} + +static void +xn_free_tx_ring(struct netfront_info *sc) +{ +#if 0 + int i; + + for (i = 0; i < NET_TX_RING_SIZE; i++) { + if (sc->xn_cdata.xn_tx_chain[i] != NULL) { + m_freem(sc->xn_cdata.xn_tx_chain[i]); + sc->xn_cdata.xn_tx_chain[i] = NULL; + } + } + + return; +#endif +} + +static inline int +netfront_tx_slot_available(struct netfront_info *np) +{ + return ((np->tx.req_prod_pvt - np->tx.rsp_cons) < + (TX_MAX_TARGET - /* MAX_SKB_FRAGS */ 24 - 2)); +} +static void +netif_release_tx_bufs(struct netfront_info *np) +{ + struct mbuf *m; + int i; + + for (i = 1; i <= NET_TX_RING_SIZE; i++) { + m = np->xn_cdata.xn_tx_chain[i]; + + if (((u_long)m) < KERNBASE) + continue; + gnttab_grant_foreign_access_ref(np->grant_tx_ref[i], + xenbus_get_otherend_id(np->xbdev), + virt_to_mfn(mtod(m, vm_offset_t)), + GNTMAP_readonly); + gnttab_release_grant_reference(&np->gref_tx_head, + np->grant_tx_ref[i]); + 
np->grant_tx_ref[i] = GRANT_INVALID_REF; + add_id_to_freelist(np->tx_mbufs, i); + m_freem(m); + } +} + +static void +network_alloc_rx_buffers(struct netfront_info *sc) +{ + int otherend_id = xenbus_get_otherend_id(sc->xbdev); + unsigned short id; + struct mbuf *m_new; + int i, batch_target, notify; + RING_IDX req_prod; + struct xen_memory_reservation reservation; + grant_ref_t ref; + int nr_flips; + netif_rx_request_t *req; + vm_offset_t vaddr; + u_long pfn; + + req_prod = sc->rx.req_prod_pvt; + + if (unlikely(sc->carrier == 0)) + return; + + /* + * Allocate skbuffs greedily, even though we batch updates to the + * receive ring. This creates a less bursty demand on the memory + * allocator, so should reduce the chance of failed allocation + * requests both for ourself and for other kernel subsystems. + */ + batch_target = sc->rx_target - (req_prod - sc->rx.rsp_cons); + for (i = mbufq_len(&sc->xn_rx_batch); i < batch_target; i++) { + MGETHDR(m_new, M_DONTWAIT, MT_DATA); + if (m_new == NULL) + goto no_mbuf; + + m_cljget(m_new, M_DONTWAIT, MJUMPAGESIZE); + if ((m_new->m_flags & M_EXT) == 0) { + m_freem(m_new); + +no_mbuf: + if (i != 0) + goto refill; + /* + * XXX set timer + */ + break; + } + m_new->m_len = m_new->m_pkthdr.len = MJUMPAGESIZE; + + /* queue the mbufs allocated */ + mbufq_tail(&sc->xn_rx_batch, m_new); + } + + /* Is the batch large enough to be worthwhile? */ + if (i < (sc->rx_target/2)) { + if (req_prod >sc->rx.sring->req_prod) + goto push; + return; + } + /* Adjust floating fill target if we risked running out of buffers. 
*/ + if ( ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) && + ((sc->rx_target *= 2) > sc->rx_max_target) ) + sc->rx_target = sc->rx_max_target; + +refill: + for (nr_flips = i = 0; ; i++) { + if ((m_new = mbufq_dequeue(&sc->xn_rx_batch)) == NULL) + break; + + m_new->m_ext.ext_args = (vm_paddr_t *)(uintptr_t)( + vtophys(m_new->m_ext.ext_buf) >> PAGE_SHIFT); + + id = xennet_rxidx(req_prod + i); + + KASSERT(sc->xn_cdata.xn_rx_chain[id] == NULL, + ("non-NULL xm_rx_chain")); + sc->xn_cdata.xn_rx_chain[id] = m_new; + + ref = gnttab_claim_grant_reference(&sc->gref_rx_head); + KASSERT((short)ref >= 0, ("negative ref")); + sc->grant_rx_ref[id] = ref; + + vaddr = mtod(m_new, vm_offset_t); + pfn = vtophys(vaddr) >> PAGE_SHIFT; + req = RING_GET_REQUEST(&sc->rx, req_prod + i); + + if (sc->copying_receiver == 0) { + gnttab_grant_foreign_transfer_ref(ref, + otherend_id, pfn); + sc->rx_pfn_array[nr_flips] = PFNTOMFN(pfn); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Remove this page before passing + * back to Xen. + */ + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + MULTI_update_va_mapping(&sc->rx_mcl[i], + vaddr, 0, 0); + } + nr_flips++; + } else { + gnttab_grant_foreign_access_ref(ref, + otherend_id, + PFNTOMFN(pfn), 0); + } + req->id = id; + req->gref = ref; + + sc->rx_pfn_array[i] = + vtomach(mtod(m_new,vm_offset_t)) >> PAGE_SHIFT; + } + + KASSERT(i, ("no mbufs processed")); /* should have returned earlier */ + KASSERT(mbufq_len(&sc->xn_rx_batch) == 0, ("not all mbufs processed")); + /* + * We may have allocated buffers which have entries outstanding + * in the page * update queue -- make sure we flush those first! + */ + PT_UPDATES_FLUSH(); + if (nr_flips != 0) { +#ifdef notyet + /* Tell the ballon driver what is going on. 
*/
+		balloon_update_driver_allowance(i);
+#endif
+		set_xen_guest_handle(reservation.extent_start, sc->rx_pfn_array);
+		reservation.nr_extents   = i;
+		reservation.extent_order = 0;
+		reservation.address_bits = 0;
+		reservation.domid        = DOMID_SELF;
+
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+
+			/* After all PTEs have been zapped, flush the TLB. */
+			sc->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
+			    UVMF_TLB_FLUSH|UVMF_ALL;
+	
+			/* Give away a batch of pages. */
+			sc->rx_mcl[i].op = __HYPERVISOR_memory_op;
+			sc->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
+			sc->rx_mcl[i].args[1] =  (u_long)&reservation;
+			/* Zap PTEs and give away pages in one big multicall. */
+			(void)HYPERVISOR_multicall(sc->rx_mcl, i+1);
+
+			/* Check return status of HYPERVISOR_dom_mem_op(). */
+			if (unlikely(sc->rx_mcl[i].result != i))
+				panic("Unable to reduce memory reservation\n");
+		} else {
+			if (HYPERVISOR_memory_op(
+			    XENMEM_decrease_reservation, &reservation)
+			    != i)
+				panic("Unable to reduce memory "
+				    "reservation\n");
+		}
+	} else {
+		wmb();
+	}
+
+	/* Above is a suitable barrier to ensure backend will see requests. */
+	sc->rx.req_prod_pvt = req_prod + i;
+push:
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->rx, notify);
+	if (notify)
+		notify_remote_via_irq(sc->irq);
+}
+
+/*
+ * Receive completion: drain every response on the rx ring, hand the
+ * resulting mbufs to the network stack and refill the ring.  Called with
+ * the rx lock held; the lock is dropped around each if_input() call.
+ */
+static void
+xn_rxeof(struct netfront_info *np)
+{
+	struct ifnet *ifp;
+#if __FreeBSD_version >= 700000
+	struct lro_ctrl *lro = &np->xn_lro;
+	struct lro_entry *queued;
+#endif
+	struct netfront_rx_info rinfo;
+	struct netif_rx_response *rx = &rinfo.rx;
+	struct netif_extra_info *extras = rinfo.extras;
+	RING_IDX i, rp;
+	multicall_entry_t *mcl;
+	struct mbuf *m;
+	struct mbuf_head rxq, errq;
+	int err, pages_flipped = 0, work_to_do;
+
+	do {
+		XN_RX_LOCK_ASSERT(np);
+		if (!netfront_carrier_ok(np))
+			return;
+
+		mbufq_init(&errq);
+		mbufq_init(&rxq);
+
+		ifp = np->xn_ifp;
+
+		/*
+		 * Each pass builds (and submits) its own batch of remap
+		 * operations, so the batch index must restart at zero;
+		 * otherwise entries already applied by the previous pass
+		 * would be submitted again and the index would keep growing.
+		 */
+		pages_flipped = 0;
+	
+		rp = np->rx.sring->rsp_prod;
+		rmb();	/* Ensure we see queued responses up to 'rp'. */
+
+		i = np->rx.rsp_cons;
+		while ((i != rp)) {
+			memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
+			memset(extras, 0, sizeof(rinfo.extras));
+
+			m = NULL;
+			err = xennet_get_responses(np, &rinfo, rp, &m,
+			    &pages_flipped);
+
+			if (unlikely(err)) {
+				if (m)
+					mbufq_tail(&errq, m);
+				np->stats.rx_errors++;
+				/* xennet_get_responses() advanced rsp_cons. */
+				i = np->rx.rsp_cons;
+				continue;
+			}
+
+			m->m_pkthdr.rcvif = ifp;
+			if ( rx->flags & NETRXF_data_validated ) {
+				/* Tell the stack the checksums are okay */
+				/*
+				 * XXX this isn't necessarily the case - need to add
+				 * check
+				 */
+				
+				m->m_pkthdr.csum_flags |=
+					(CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID
+					    | CSUM_PSEUDO_HDR);
+				m->m_pkthdr.csum_data = 0xffff;
+			}
+
+			np->stats.rx_packets++;
+			np->stats.rx_bytes += m->m_pkthdr.len;
+
+			mbufq_tail(&rxq, m);
+			np->rx.rsp_cons = ++i;
+		}
+
+		if (pages_flipped) {
+			/* Some pages are no longer absent... */
+#ifdef notyet
+			balloon_update_driver_allowance(-pages_flipped);
+#endif
+			/* Do all the remapping work, and M->P updates, in one big
+			 * hypercall.
+			 *
+			 * The per-page entries in rx_mcl/rx_mmu are only
+			 * filled in by xennet_get_responses() when the guest
+			 * is NOT auto-translated, so that is the only case in
+			 * which there is anything to submit.
+			 */
+			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+				mcl = np->rx_mcl + pages_flipped;
+				mcl->op = __HYPERVISOR_mmu_update;
+				mcl->args[0] = (u_long)np->rx_mmu;
+				mcl->args[1] = pages_flipped;
+				mcl->args[2] = 0;
+				mcl->args[3] = DOMID_SELF;
+				(void)HYPERVISOR_multicall(np->rx_mcl,
+				    pages_flipped + 1);
+			}
+		}
+	
+		while ((m = mbufq_dequeue(&errq)))
+			m_freem(m);
+
+		/* 
+		 * Process all the mbufs after the remapping is complete.
+		 * Break the mbuf chain first though.
+		 */
+		while ((m = mbufq_dequeue(&rxq)) != NULL) {
+			ifp->if_ipackets++;
+			
+			/*
+			 * Do we really need to drop the rx lock?
+			 */
+			XN_RX_UNLOCK(np);
+#if __FreeBSD_version >= 700000
+			/* Use LRO if possible */
+			if ((ifp->if_capenable & IFCAP_LRO) == 0 ||
+			    lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) {
+				/*
+				 * If LRO fails, pass up to the stack
+				 * directly.
+				 */
+				(*ifp->if_input)(ifp, m);
+			}
+#else
+			(*ifp->if_input)(ifp, m);
+#endif
+			XN_RX_LOCK(np);
+		}
+	
+		np->rx.rsp_cons = i;
+
+#if __FreeBSD_version >= 700000
+		/*
+		 * Flush any outstanding LRO work
+		 */
+		while (!SLIST_EMPTY(&lro->lro_active)) {
+			queued = SLIST_FIRST(&lro->lro_active);
+			SLIST_REMOVE_HEAD(&lro->lro_active, next);
+			tcp_lro_flush(lro, queued);
+		}
+#endif
+
+#if 0
+		/* If we get a callback with very few responses, reduce fill target. */
+		/* NB. Note exponential increase, linear decrease. */
+		if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) > 
+			((3*np->rx_target) / 4)) && (--np->rx_target < np->rx_min_target))
+			np->rx_target = np->rx_min_target;
+#endif
+	
+		network_alloc_rx_buffers(np);
+
+		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, work_to_do);
+	} while (work_to_do);
+}
+
+/*
+ * Transmit completion: for every tx response, revoke the grant, return the
+ * slot id to the free chain and free the mbuf.  Called with the tx lock held.
+ */
+static void 
+xn_txeof(struct netfront_info *np)
+{
+	RING_IDX i, prod;
+	unsigned short id;
+	struct ifnet *ifp;
+	netif_tx_response_t *txr;
+	struct mbuf *m;
+	
+	XN_TX_LOCK_ASSERT(np);
+	
+	if (!netfront_carrier_ok(np))
+		return;
+	
+	ifp = np->xn_ifp;
+	ifp->if_timer = 0;
+	
+	do {
+		prod = np->tx.sring->rsp_prod;
+		rmb(); /* Ensure we see responses up to 'rp'. */
+		
+		for (i = np->tx.rsp_cons; i != prod; i++) {
+			txr = RING_GET_RESPONSE(&np->tx, i);
+			if (txr->status == NETIF_RSP_NULL)
+				continue;
+
+			id = txr->id;
+			m = np->xn_cdata.xn_tx_chain[id]; 
+
+			/* Validate the slot before touching the mbuf. */
+			KASSERT(m != NULL, ("mbuf not found in xn_tx_chain"));
+			M_ASSERTVALID(m);
+
+			/*
+			 * Increment packet count if this is the last
+			 * mbuf of the chain.
+			 */
+			if (!m->m_next)
+				ifp->if_opackets++;
+			if (unlikely(gnttab_query_foreign_access(
+			    np->grant_tx_ref[id]) != 0)) {
+				printf("network_tx_buf_gc: warning "
+				    "-- grant still in use by backend "
+				    "domain.\n");
+				goto out; 
+			}
+			gnttab_end_foreign_access_ref(
+				np->grant_tx_ref[id]);
+			gnttab_release_grant_reference(
+				&np->gref_tx_head, np->grant_tx_ref[id]);
+			np->grant_tx_ref[id] = GRANT_INVALID_REF;
+			
+			np->xn_cdata.xn_tx_chain[id] = NULL;
+			add_id_to_freelist(np->xn_cdata.xn_tx_chain, id);
+			m_free(m);
+		}
+		np->tx.rsp_cons = prod;
+		
+		/*
+		 * Set a new event, then check for race with update of
+		 * tx_cons. Note that it is essential to schedule a
+		 * callback, no matter how few buffers are pending. Even if
+		 * there is space in the transmit ring, higher layers may
+		 * be blocked because too much data is outstanding: in such
+		 * cases notification from Xen is likely to be the only kick
+		 * that we'll get.
+		 */
+		np->tx.sring->rsp_event =
+		    prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
+
+		mb();
+		
+	} while (prod != np->tx.sring->rsp_prod);
+	
+ out: 
+	if (np->tx_full &&
+	    ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
+		np->tx_full = 0;
+#if 0
+		if (np->user_state == UST_OPEN)
+			netif_wake_queue(dev);
+#endif
+	}
+
+}
+
+/*
+ * Event-channel interrupt handler: reap tx completions, process received
+ * packets, then restart transmission if packets are queued.
+ */
+static void
+xn_intr(void *xsc)
+{
+	struct netfront_info *np = xsc;
+	struct ifnet *ifp = np->xn_ifp;
+
+#if 0
+	if (!(np->rx.rsp_cons != np->rx.sring->rsp_prod &&
+	    likely(netfront_carrier_ok(np)) &&
+	    ifp->if_drv_flags & IFF_DRV_RUNNING))
+		return;
+#endif
+	if (np->tx.rsp_cons != np->tx.sring->rsp_prod) {
+		XN_TX_LOCK(np);
+		xn_txeof(np);
+		XN_TX_UNLOCK(np);			
+	}	
+
+	XN_RX_LOCK(np);
+	xn_rxeof(np);
+	XN_RX_UNLOCK(np);
+
+	if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
+	    !IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+		xn_start(ifp);
+}
+
+
+/*
+ * Re-post an rx buffer (mbuf + grant reference) at the current private
+ * request producer index and advance that index.
+ */
+static void
+xennet_move_rx_slot(struct netfront_info *np, struct mbuf *m,
+	grant_ref_t ref)
+{
+	int new = xennet_rxidx(np->rx.req_prod_pvt);
+
+	KASSERT(np->rx_mbufs[new] == NULL, ("rx_mbufs != NULL"));
+	np->rx_mbufs[new] = m;
+	np->grant_rx_ref[new] = ref;
+	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
+	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
+	np->rx.req_prod_pvt++;
+}
+
+static int
+xennet_get_extras(struct netfront_info *np,
+    struct netif_extra_info *extras, RING_IDX rp)
+{
+	struct netif_extra_info *extra;
+	RING_IDX cons = np->rx.rsp_cons;
+
+	int err = 0;
+
+	do {
+		struct mbuf *m;
+		grant_ref_t ref;
+
+		if (unlikely(cons + 1 == rp)) {
+#if 0			
+			if (net_ratelimit())
+				WPRINTK("Missing extra info\n");
+#endif			
+			err = -EINVAL;
+			break;
+		}
+
+		extra = (struct netif_extra_info *)
+		RING_GET_RESPONSE(&np->rx, ++cons);
+
+		if (unlikely(!extra->type ||
+			extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+#if 0				
+			if (net_ratelimit())
+				WPRINTK("Invalid extra type: %d\n",
+					extra->type);
+#endif			
+			err = -EINVAL;
+		} else {
+			memcpy(&extras[extra->type - 1], extra, sizeof(*extra));
+		}
+ + m = xennet_get_rx_mbuf(np, cons); + ref = xennet_get_rx_ref(np, cons); + xennet_move_rx_slot(np, m, ref); + } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + + np->rx.rsp_cons = cons; + return err; +} + +static int +xennet_get_responses(struct netfront_info *np, + struct netfront_rx_info *rinfo, RING_IDX rp, + struct mbuf **list, + int *pages_flipped_p) +{ + int pages_flipped = *pages_flipped_p; + struct mmu_update *mmu; + struct multicall_entry *mcl; + struct netif_rx_response *rx = &rinfo->rx; + struct netif_extra_info *extras = rinfo->extras; + RING_IDX cons = np->rx.rsp_cons; + struct mbuf *m, *m0, *m_prev; + grant_ref_t ref = xennet_get_rx_ref(np, cons); + int max = 5 /* MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD) */; + int frags = 1; + int err = 0; + u_long ret; + + m0 = m = m_prev = xennet_get_rx_mbuf(np, cons); + + + if (rx->flags & NETRXF_extra_info) { + err = xennet_get_extras(np, extras, rp); + cons = np->rx.rsp_cons; + } + + + if (m0 != NULL) { + m0->m_pkthdr.len = 0; + m0->m_next = NULL; + } + + for (;;) { + u_long mfn; + +#if 0 + printf("rx->status=%hd rx->offset=%hu frags=%u\n", + rx->status, rx->offset, frags); +#endif + if (unlikely(rx->status < 0 || + rx->offset + rx->status > PAGE_SIZE)) { +#if 0 + if (net_ratelimit()) + WPRINTK("rx->offset: %x, size: %u\n", + rx->offset, rx->status); +#endif + xennet_move_rx_slot(np, m, ref); + err = -EINVAL; + goto next; + } + + /* + * This definitely indicates a bug, either in this driver or in + * the backend driver. In future this should flag the bad + * situation to the system controller to reboot the backed. + */ + if (ref == GRANT_INVALID_REF) { +#if 0 + if (net_ratelimit()) + WPRINTK("Bad rx response id %d.\n", rx->id); +#endif + err = -EINVAL; + goto next; + } + + if (!np->copying_receiver) { + /* Memory pressure, insufficient buffer + * headroom, ... 
+ */ + if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) { + if (net_ratelimit()) + WPRINTK("Unfulfilled rx req " + "(id=%d, st=%d).\n", + rx->id, rx->status); + xennet_move_rx_slot(np, m, ref); + err = -ENOMEM; + goto next; + } + + if (!xen_feature( XENFEAT_auto_translated_physmap)) { + /* Remap the page. */ + void *vaddr = mtod(m, void *); + uint32_t pfn; + + mcl = np->rx_mcl + pages_flipped; + mmu = np->rx_mmu + pages_flipped; + + MULTI_update_va_mapping(mcl, (u_long)vaddr, + (((vm_paddr_t)mfn) << PAGE_SHIFT) | PG_RW | + PG_V | PG_M | PG_A, 0); + pfn = (uintptr_t)m->m_ext.ext_args; + mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE; + mmu->val = pfn; + + set_phys_to_machine(pfn, mfn); + } + pages_flipped++; + } else { + ret = gnttab_end_foreign_access_ref(ref); + KASSERT(ret, ("ret != 0")); + } + + gnttab_release_grant_reference(&np->gref_rx_head, ref); + +next: + if (m != NULL) { + m->m_len = rx->status; + m->m_data += rx->offset; + m0->m_pkthdr.len += rx->status; + } + + if (!(rx->flags & NETRXF_more_data)) + break; + + if (cons + frags == rp) { + if (net_ratelimit()) + WPRINTK("Need more frags\n"); + err = -ENOENT; + break; + } + m_prev = m; + + rx = RING_GET_RESPONSE(&np->rx, cons + frags); + m = xennet_get_rx_mbuf(np, cons + frags); + + m_prev->m_next = m; + m->m_next = NULL; + ref = xennet_get_rx_ref(np, cons + frags); + frags++; + } + *list = m0; + + if (unlikely(frags > max)) { + if (net_ratelimit()) + WPRINTK("Too many frags\n"); + err = -E2BIG; + } + + if (unlikely(err)) + np->rx.rsp_cons = cons + frags; + + *pages_flipped_p = pages_flipped; + + return err; +} + +static void +xn_tick_locked(struct netfront_info *sc) +{ + XN_RX_LOCK_ASSERT(sc); + callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); + + /* XXX placeholder for printing debug information */ + +} + + +static void +xn_tick(void *xsc) +{ + struct netfront_info *sc; + + sc = xsc; + XN_RX_LOCK(sc); + xn_tick_locked(sc); + XN_RX_UNLOCK(sc); + +} +static void 
+xn_start_locked(struct ifnet *ifp) +{ + int otherend_id; + unsigned short id; + struct mbuf *m_head, *m; + struct netfront_info *sc; + netif_tx_request_t *tx; + netif_extra_info_t *extra; + RING_IDX i; + grant_ref_t ref; + u_long mfn, tx_bytes; + int notify, nfrags; + + sc = ifp->if_softc; + otherend_id = xenbus_get_otherend_id(sc->xbdev); + tx_bytes = 0; + + if (!netfront_carrier_ok(sc)) + return; + + for (i = sc->tx.req_prod_pvt; TRUE; i++) { + IF_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; + + if (!netfront_tx_slot_available(sc)) { + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + break; + } + + + /* + * Defragment the mbuf if necessary. + */ + for (m = m_head, nfrags = 0; m; m = m->m_next) + nfrags++; + if (nfrags > MAX_SKB_FRAGS) { + m = m_defrag(m_head, M_DONTWAIT); + if (!m) { + m_freem(m_head); + break; + } + m_head = m; + } + + /* + * Start packing the mbufs in this chain into + * the fragment pointers. Stop when we run out + * of fragments or hit the end of the mbuf chain. + */ + m = m_head; + extra = NULL; + for (m = m_head; m; m = m->m_next) { + tx = RING_GET_REQUEST(&sc->tx, i); + id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain); + sc->xn_cdata.xn_tx_chain[id] = m; + tx->id = id; + ref = gnttab_claim_grant_reference(&sc->gref_tx_head); + KASSERT((short)ref >= 0, ("Negative ref")); + mfn = virt_to_mfn(mtod(m, vm_offset_t)); + gnttab_grant_foreign_access_ref(ref, otherend_id, + mfn, GNTMAP_readonly); + tx->gref = sc->grant_tx_ref[id] = ref; + tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1); + tx->flags = 0; + if (m == m_head) { + /* + * The first fragment has the entire packet + * size, subsequent fragments have just the + * fragment size. The backend works out the + * true size of the first fragment by + * subtracting the sizes of the other + * fragments. 
+ */ + tx->size = m->m_pkthdr.len; + + /* + * The first fragment contains the + * checksum flags and is optionally + * followed by extra data for TSO etc. + */ + if (m->m_pkthdr.csum_flags + & CSUM_DELAY_DATA) { + tx->flags |= (NETTXF_csum_blank + | NETTXF_data_validated); + } +#if __FreeBSD_version >= 700000 + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + struct netif_extra_info *gso = + (struct netif_extra_info *) + RING_GET_REQUEST(&sc->tx, ++i); + + if (extra) + extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE; + else + tx->flags |= NETTXF_extra_info; + + gso->u.gso.size = m->m_pkthdr.tso_segsz; + gso->u.gso.type = + XEN_NETIF_GSO_TYPE_TCPV4; + gso->u.gso.pad = 0; + gso->u.gso.features = 0; + + gso->type = XEN_NETIF_EXTRA_TYPE_GSO; + gso->flags = 0; + extra = gso; + } +#endif + } else { + tx->size = m->m_len; + } + if (m->m_next) { + tx->flags |= NETTXF_more_data; + i++; + } + } + + BPF_MTAP(ifp, m_head); + + sc->stats.tx_bytes += m_head->m_pkthdr.len; + sc->stats.tx_packets++; + } + + sc->tx.req_prod_pvt = i; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->tx, notify); + if (notify) + notify_remote_via_irq(sc->irq); + + xn_txeof(sc); + + if (RING_FULL(&sc->tx)) { + sc->tx_full = 1; +#if 0 + netif_stop_queue(dev); +#endif + } + + return; +} + +static void +xn_start(struct ifnet *ifp) +{ + struct netfront_info *sc; + sc = ifp->if_softc; + XN_TX_LOCK(sc); + xn_start_locked(ifp); + XN_TX_UNLOCK(sc); +} + +/* equivalent of network_open() in Linux */ +static void +xn_ifinit_locked(struct netfront_info *sc) +{ + struct ifnet *ifp; + + XN_LOCK_ASSERT(sc); + + ifp = sc->xn_ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + xn_stop(sc); + + network_alloc_rx_buffers(sc); + sc->rx.sring->rsp_event = sc->rx.rsp_cons + 1; + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); + +} + + +static void +xn_ifinit(void *xsc) +{ + struct netfront_info *sc = xsc; + + XN_LOCK(sc); + xn_ifinit_locked(sc); 
+	XN_UNLOCK(sc);
+
+}
+
+
+/*
+ * Interface ioctl handler.  Returns 0 on success or an errno value;
+ * requests not handled here are passed to ether_ioctl().
+ */
+static int
+xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+	struct netfront_info *sc = ifp->if_softc;
+	struct ifreq *ifr = (struct ifreq *) data;
+	struct ifaddr *ifa = (struct ifaddr *)data;
+
+	int mask, error = 0;
+	switch(cmd) {
+	case SIOCSIFADDR:
+	case SIOCGIFADDR:
+		XN_LOCK(sc);
+		if (ifa->ifa_addr->sa_family == AF_INET) {
+			ifp->if_flags |= IFF_UP;
+			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) 
+				xn_ifinit_locked(sc);
+			arp_ifinit(ifp, ifa);
+			XN_UNLOCK(sc);	
+		} else {
+			XN_UNLOCK(sc);	
+			error = ether_ioctl(ifp, cmd, data);
+		}
+		break;
+	case SIOCSIFMTU:
+		/* XXX can we alter the MTU on a VN ?*/
+#ifdef notyet
+		if (ifr->ifr_mtu > XN_JUMBO_MTU)
+			error = EINVAL;
+		else 
+#endif
+		{
+			ifp->if_mtu = ifr->ifr_mtu;
+			/* Clear RUNNING so that xn_ifinit() re-initialises. */
+			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+			xn_ifinit(sc);
+		}
+		break;
+	case SIOCSIFFLAGS:
+		XN_LOCK(sc);
+		if (ifp->if_flags & IFF_UP) {
+			/*
+			 * If only the state of the PROMISC flag changed,
+			 * then just use the 'set promisc mode' command
+			 * instead of reinitializing the entire NIC. Doing
+			 * a full re-init means reloading the firmware and
+			 * waiting for it to start up, which may take a
+			 * second or two.
+			 */
+#ifdef notyet
+			/* No promiscuous mode with Xen */
+			if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
+			    ifp->if_flags & IFF_PROMISC &&
+			    !(sc->xn_if_flags & IFF_PROMISC)) {
+				XN_SETBIT(sc, XN_RX_MODE,
+					  XN_RXMODE_RX_PROMISC);
+			} else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
+				   !(ifp->if_flags & IFF_PROMISC) &&
+				   sc->xn_if_flags & IFF_PROMISC) {
+				XN_CLRBIT(sc, XN_RX_MODE,
+					  XN_RXMODE_RX_PROMISC);
+			} else
+#endif
+				xn_ifinit_locked(sc);
+		} else {
+			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+				xn_stop(sc);
+			}
+		}
+		sc->xn_if_flags = ifp->if_flags;
+		XN_UNLOCK(sc);
+		error = 0;
+		break;
+	case SIOCSIFCAP:
+		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
+		if (mask & IFCAP_TXCSUM) {
+			if (IFCAP_TXCSUM & ifp->if_capenable) {
+				/* TSO depends on tx checksum offload. */
+				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
+				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
+				    | CSUM_IP | CSUM_TSO);
+			} else {
+				ifp->if_capenable |= IFCAP_TXCSUM;
+				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP
+				    | CSUM_IP);
+			}
+		}
+		if (mask & IFCAP_RXCSUM) {
+			ifp->if_capenable ^= IFCAP_RXCSUM;
+		}
+#if __FreeBSD_version >= 700000
+		if (mask & IFCAP_TSO4) {
+			if (IFCAP_TSO4 & ifp->if_capenable) {
+				ifp->if_capenable &= ~IFCAP_TSO4;
+				ifp->if_hwassist &= ~CSUM_TSO;
+			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
+				ifp->if_capenable |= IFCAP_TSO4;
+				ifp->if_hwassist |= CSUM_TSO;
+			} else {
+				DPRINTK("Xen requires tx checksum offload"
+				    " be enabled to use TSO\n");
+				error = EINVAL;
+			}
+		}
+		if (mask & IFCAP_LRO) {
+			ifp->if_capenable ^= IFCAP_LRO;
+			
+		}
+#endif
+		/*
+		 * 'error' is still 0 here unless the TSO request above was
+		 * refused; do not reset it, or the EINVAL would be lost.
+		 */
+		break;
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+#ifdef notyet
+		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+			XN_LOCK(sc);
+			xn_setmulti(sc);
+			XN_UNLOCK(sc);
+			error = 0;
+		}
+#endif
+		/* FALLTHROUGH */
+	case SIOCSIFMEDIA:
+	case SIOCGIFMEDIA:
+		error = EINVAL;
+		break;
+	default:
+		error = ether_ioctl(ifp, cmd, data);
+	}
+    
+	return (error);
+}
+
+static void
+xn_stop(struct netfront_info *sc)
+{	
+	struct ifnet *ifp;
+
+	XN_LOCK_ASSERT(sc);
+    
+	ifp = sc->xn_ifp;
+
+
callout_stop(&sc->xn_stat_ch); + + xn_free_rx_ring(sc); + xn_free_tx_ring(sc); + + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); +} + +/* START of Xenolinux helper functions adapted to FreeBSD */ +int +network_connect(struct netfront_info *np) +{ + int i, requeue_idx, error; + grant_ref_t ref; + netif_rx_request_t *req; + u_int feature_rx_copy, feature_rx_flip; + + error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev), + "feature-rx-copy", NULL, "%u", &feature_rx_copy); + if (error) + feature_rx_copy = 0; + error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev), + "feature-rx-flip", NULL, "%u", &feature_rx_flip); + if (error) + feature_rx_flip = 1; + + /* + * Copy packets on receive path if: + * (a) This was requested by user, and the backend supports it; or + * (b) Flipping was requested, but this is unsupported by the backend. + */ + np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || + (MODPARM_rx_flip && !feature_rx_flip)); + + XN_LOCK(np); + /* Recovery procedure: */ + error = talk_to_backend(np->xbdev, np); + if (error) + return (error); + + /* Step 1: Reinitialise variables. */ + netif_release_tx_bufs(np); + + /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. 
*/ + for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { + struct mbuf *m; + u_long pfn; + + if (np->rx_mbufs[i] == NULL) + continue; + + m = np->rx_mbufs[requeue_idx] = xennet_get_rx_mbuf(np, i); + ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); + req = RING_GET_REQUEST(&np->rx, requeue_idx); + pfn = vtophys(mtod(m, vm_offset_t)) >> PAGE_SHIFT; + + if (!np->copying_receiver) { + gnttab_grant_foreign_transfer_ref(ref, + xenbus_get_otherend_id(np->xbdev), + pfn); + } else { + gnttab_grant_foreign_access_ref(ref, + xenbus_get_otherend_id(np->xbdev), + PFNTOMFN(pfn), 0); + } + req->gref = ref; + req->id = requeue_idx; + + requeue_idx++; + } + + np->rx.req_prod_pvt = requeue_idx; + + /* Step 3: All public and private state should now be sane. Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. + */ + netfront_carrier_on(np); + notify_remote_via_irq(np->irq); + XN_TX_LOCK(np); + xn_txeof(np); + XN_TX_UNLOCK(np); + network_alloc_rx_buffers(np); + XN_UNLOCK(np); + + return (0); +} + +static void +show_device(struct netfront_info *sc) +{ +#ifdef DEBUG + if (sc) { + IPRINTK("\n", + sc->xn_ifno, + be_state_name[sc->xn_backend_state], + sc->xn_user_state ? "open" : "closed", + sc->xn_evtchn, + sc->xn_irq, + sc->xn_tx_if, + sc->xn_rx_if); + } else { + IPRINTK("\n"); + } +#endif +} + +/** Create a network device. + * @param handle device handle + */ +int +create_netdev(device_t dev) +{ + int i; + struct netfront_info *np; + int err; + struct ifnet *ifp; + + np = device_get_softc(dev); + + np->xbdev = dev; + + XN_LOCK_INIT(np, xennetif); + np->rx_target = RX_MIN_TARGET; + np->rx_min_target = RX_MIN_TARGET; + np->rx_max_target = RX_MAX_TARGET; + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
*/
+	for (i = 0; i <= NET_TX_RING_SIZE; i++) {
+		np->tx_mbufs[i] = (void *) ((u_long) i+1);
+		np->grant_tx_ref[i] = GRANT_INVALID_REF;
+	}
+	for (i = 0; i <= NET_RX_RING_SIZE; i++) {
+		np->rx_mbufs[i] = NULL;
+		np->grant_rx_ref[i] = GRANT_INVALID_REF;
+	}
+	/* A grant for every tx ring slot */
+	if (gnttab_alloc_grant_references(TX_MAX_TARGET,
+	    &np->gref_tx_head) < 0) {
+		printf("#### netfront can't alloc tx grant refs\n");
+		err = ENOMEM;
+		/* Nothing has been allocated yet, so there is nothing to undo. */
+		goto fail;
+	}
+	/* A grant for every rx ring slot */
+	if (gnttab_alloc_grant_references(RX_MAX_TARGET,
+	    &np->gref_rx_head) < 0) {
+		printf("#### netfront can't alloc rx grant refs\n");
+		err = ENOMEM;
+		/* The tx references are released exactly once, at the label. */
+		goto fail_tx_refs;
+	}
+
+	err = xen_net_read_mac(dev, np->mac);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "parsing %s/mac",
+		    xenbus_get_node(dev));
+		goto fail_rx_refs;
+	}
+
+	/* Set up ifnet structure */
+	ifp = np->xn_ifp = if_alloc(IFT_ETHER);
+	if (ifp == NULL) {
+		device_printf(dev, "can not if_alloc()\n");
+		err = ENOSPC;
+		goto fail_rx_refs;
+	}
+	ifp->if_softc = np;
+	if_initname(ifp, "xn", device_get_unit(dev));
+	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+	ifp->if_ioctl = xn_ioctl;
+	ifp->if_output = ether_output;
+	ifp->if_start = xn_start;
+#ifdef notyet
+	ifp->if_watchdog = xn_watchdog;
+#endif
+	ifp->if_init = xn_ifinit;
+	ifp->if_mtu = ETHERMTU;
+	ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1;
+
+	ifp->if_hwassist = XN_CSUM_FEATURES;
+	ifp->if_capabilities = IFCAP_HWCSUM;
+#if __FreeBSD_version >= 700000
+	ifp->if_capabilities |= IFCAP_TSO4;
+	if (xn_enable_lro) {
+		/* Use the function-level 'err' so the failure code is returned. */
+		err = tcp_lro_init(&np->xn_lro);
+		if (err) {
+			device_printf(dev, "LRO initialization failed\n");
+			if_free(ifp);
+			np->xn_ifp = NULL;
+			goto fail_rx_refs;
+		}
+		np->xn_lro.ifp = ifp;
+		ifp->if_capabilities |= IFCAP_LRO;
+	}
+#endif
+	ifp->if_capenable = ifp->if_capabilities;
+
+	ether_ifattach(ifp, np->mac);
+	callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE);
+	netfront_carrier_off(np);
+
+	return (0);
+
+	/*
+	 * Error unwinding: release what was acquired, in reverse order, and
+	 * hand the error back to the caller instead of panicking.
+	 */
+fail_rx_refs:
+	gnttab_free_grant_references(np->gref_rx_head);
+fail_tx_refs:
+	gnttab_free_grant_references(np->gref_tx_head);
+fail:
+	return (err);
+}
+
+/**
+ * Handle the change of state of 
the backend to Closing. We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend.  Once this is done, we can switch to Closed in
+ * acknowledgement.
+ */
+#if 0
+static void netfront_closing(device_t dev)
+{
+#if 0
+	struct netfront_info *info = dev->dev_driver_data;
+
+	DPRINTK("netfront_closing: %s removed\n", dev->nodename);
+
+	close_netdev(info);
+#endif
+	xenbus_switch_state(dev, XenbusStateClosed);
+}
+#endif
+
+/* device_detach method: tear down the backend connection; always returns 0. */
+static int netfront_detach(device_t dev)
+{
+	struct netfront_info *info = device_get_softc(dev);
+
+	DPRINTK("%s\n", xenbus_get_node(dev));
+
+	netif_free(info);
+
+	return 0;
+}
+
+
+/* Currently only disconnects from the backend; close_netdev() is disabled. */
+static void netif_free(struct netfront_info *info)
+{
+	netif_disconnect_backend(info);
+#if 0
+	close_netdev(info);
+#endif
+}
+
+/*
+ * Mark the carrier down (under the RX and TX locks), end foreign access to
+ * both shared rings, reset the ring references and pointers, and unbind the
+ * interrupt handler if one is bound.
+ */
+static void netif_disconnect_backend(struct netfront_info *info)
+{
+	XN_RX_LOCK(info);
+	XN_TX_LOCK(info);
+	netfront_carrier_off(info);
+	XN_TX_UNLOCK(info);
+	XN_RX_UNLOCK(info);
+
+	end_access(info->tx_ring_ref, info->tx.sring);
+	end_access(info->rx_ring_ref, info->rx.sring);
+	info->tx_ring_ref = GRANT_INVALID_REF;
+	info->rx_ring_ref = GRANT_INVALID_REF;
+	info->tx.sring = NULL;
+	info->rx.sring = NULL;
+
+	if (info->irq)
+		unbind_from_irqhandler(info->irq);
+
+	info->irq = 0;
+}
+
+
+/* End foreign access for 'ref' unless it is GRANT_INVALID_REF. */
+static void end_access(int ref, void *page)
+{
+	if (ref != GRANT_INVALID_REF)
+		gnttab_end_foreign_access(ref, page);
+}
+
+/* ** Driver registration ** */
+static device_method_t netfront_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,         netfront_probe),
+	DEVMETHOD(device_attach,        netfront_attach),
+	DEVMETHOD(device_detach,        netfront_detach),
+	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
+	DEVMETHOD(device_suspend,       bus_generic_suspend),
+	DEVMETHOD(device_resume,        netfront_resume),
+
+	/* Xenbus interface */
+	DEVMETHOD(xenbus_backend_changed, netfront_backend_changed),
+
+	{ 0, 0 }
+};
+
+static driver_t netfront_driver = {
+	"xn",
+	netfront_methods,
+	sizeof(struct 
netfront_info), +}; +devclass_t netfront_devclass; + +DRIVER_MODULE(xe, xenbus, netfront_driver, netfront_devclass, 0, 0); Property changes on: dev/xen/netfront/netfront.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Property changes on: dev/xen/netfront ___________________________________________________________________ Added: svn:mergeinfo Merged /stable/7/sys/dev/xen/netfront:r172506,172810,175956,179044,179776,180149,182402 Merged /head/sys/dev/xen/netfront:r153880,155086,155957,157624,158737,159574,159762,159802,159806,159810-159812,160052,162099,162118,162122,162458,162473,162619,162687-162688,163246,163398-163399,164281,164375,165225,165727,165852,165854,166067,166181,166901,169152,169451,169562,169609,169611,169796,169876,170273,170284,170405,170478,170802,170872,171053,171821-171822,171980,172025,172334,172607,172825,172919,172998,173081,173468,173592,173804,174385,174510,174756,174987,175005,175019-175021,175053,175162,175328-175329,175417,175466,176431,176526,176596,176996,177104,177228,177274,177289,177296,177462,177560,177567,177619,177635,177662,177685,177695,177862,177899,178033,178112,178241,178280,178589,178667,178719,178814,178920,178996,179057,179159,179174,179296,179335-179338,179343,179347,179425,179445,179488,179510,179631,179637,179655,179705,179716,179765,179831,179879,179925,179969,179971,180037-180038,180073,180077,180145,180152-180153,180220,180252-180253,180298-180299,180374,180382-180384,180437,180447,180503,180515,180567,180582,180612,180668,180753,180869,180946,180950,180952,180954,180981,181000,181002,181007,181016,181018,181020,181024,181089,181093,181129,181132,181333,181336,181399,181433,181436,181556-181557,181603,181606,181617-181619,181701,181824,181934,181953,181972,181976,181992,182003,182020,182046,182055,182060,182062,182066,182070,182078,182108,182110-182111,182115,182119,182122,182161,182321,182380,182391
,182401,182461,182488,182600,182688,182713,182885,182887-182888,182913,182936,183078,183135,183236,183264,183628 Merged /user/dfr/xenhvm/7/sys/dev/xen/netfront:r188754,188757,188991,188996 Index: dev/xen/blkfront/block.h =================================================================== --- dev/xen/blkfront/block.h (.../stable/6/sys) (revision 0) +++ dev/xen/blkfront/block.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,97 @@ +/* + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * $FreeBSD$ + */ + + +#ifndef __XEN_DRIVERS_BLOCK_H__ +#define __XEN_DRIVERS_BLOCK_H__ +#include + +struct xlbd_type_info +{ + int partn_shift; + int disks_per_major; + char *devname; + char *diskname; +}; + +struct xlbd_major_info +{ + int major; + int index; + int usage; + struct xlbd_type_info *type; +}; + +struct blk_shadow { + blkif_request_t req; + unsigned long request; + unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) + + +struct xb_softc { + device_t xb_dev; + struct disk *xb_disk; /* disk params */ + struct bio_queue_head xb_bioq; /* sort queue */ + int xb_unit; + int xb_flags; + struct blkfront_info *xb_info; + LIST_ENTRY(xb_softc) entry; +#define XB_OPEN (1<<0) /* drive is open (can't shut down) */ +}; + + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. 
We may end up + * putting all kinds of interesting stuff here :-) + */ +struct blkfront_info +{ + device_t xbdev; + dev_t dev; + struct gendisk *gd; + int vdevice; + blkif_vdev_t handle; + int connected; + int ring_ref; + blkif_front_ring_t ring; + unsigned int irq; + struct xlbd_major_info *mi; +#if 0 + request_queue_t *rq; + struct work_struct work; +#endif + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_RING_SIZE]; + unsigned long shadow_free; + struct xb_softc *sc; + int feature_barrier; + int is_ready; + /** + * The number of people holding this device open. We won't allow a + * hot-unplug unless this is 0. + */ + int users; +}; +/* Note that xlvbd_add doesn't call add_disk for you: you're expected + to call add_disk on info->gd once the disk is properly connected + up. */ +int xlvbd_add(device_t, blkif_sector_t capacity, int device, + uint16_t vdisk_info, uint16_t sector_size, struct blkfront_info *info); +void xlvbd_del(struct blkfront_info *info); + +#endif /* __XEN_DRIVERS_BLOCK_H__ */ + Property changes on: dev/xen/blkfront/block.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/blkfront/blkfront.c =================================================================== --- dev/xen/blkfront/blkfront.c (.../stable/6/sys) (revision 0) +++ dev/xen/blkfront/blkfront.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,1119 @@ +/*- + * All rights reserved. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * XenoBSD block device driver + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "xenbus_if.h" + +#define ASSERT(S) KASSERT(S, (#S)) +/* prototypes */ +struct xb_softc; +static void xb_startio(struct xb_softc *sc); +static void connect(device_t, struct blkfront_info *); +static void blkfront_closing(device_t); +static int blkfront_detach(device_t); +static int talk_to_backend(device_t, struct blkfront_info *); +static int setup_blkring(device_t, struct blkfront_info *); +static void blkif_int(void *); +#if 0 +static void blkif_restart_queue(void *arg); +#endif +static void blkif_recover(struct blkfront_info *); +static void blkif_completion(struct blk_shadow *); +static void blkif_free(struct blkfront_info *, int); + +#define GRANT_INVALID_REF 0 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) + +LIST_HEAD(xb_softc_list_head, xb_softc) xbsl_head; + +/* Control whether runtime update of vbds is enabled. 
*/ +#define ENABLE_VBD_UPDATE 0 + +#if ENABLE_VBD_UPDATE +static void vbd_update(void); +#endif + + +#define BLKIF_STATE_DISCONNECTED 0 +#define BLKIF_STATE_CONNECTED 1 +#define BLKIF_STATE_SUSPENDED 2 + +#ifdef notyet +static char *blkif_state_name[] = { + [BLKIF_STATE_DISCONNECTED] = "disconnected", + [BLKIF_STATE_CONNECTED] = "connected", + [BLKIF_STATE_SUSPENDED] = "closed", +}; + +static char * blkif_status_name[] = { + [BLKIF_INTERFACE_STATUS_CLOSED] = "closed", + [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", + [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected", + [BLKIF_INTERFACE_STATUS_CHANGED] = "changed", +}; +#endif +#define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args) +#if 0 +#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args) +#else +#define DPRINTK(fmt, args...) +#endif + +static grant_ref_t gref_head; +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) + +static void kick_pending_request_queues(struct blkfront_info *); +static int blkif_open(struct disk *dp); +static int blkif_close(struct disk *dp); +static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td); +static int blkif_queue_request(struct bio *bp); +static void xb_strategy(struct bio *bp); + + + +/* XXX move to xb_vbd.c when VBD update support is added */ +#define MAX_VBDS 64 + +#define XBD_SECTOR_SIZE 512 /* XXX: assume for now */ +#define XBD_SECTOR_SHFT 9 + +static struct mtx blkif_io_lock; + +static vm_paddr_t +pfn_to_mfn(vm_paddr_t pfn) +{ + return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT); +} + +/* + * Translate Linux major/minor to an appropriate name and unit + * number. For HVM guests, this allows us to use the same drive names + * with blkfront as the emulated drives, easing transition slightly. 
+ */
+static void
+blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
+{
+	/*
+	 * Linux major number -> FreeBSD driver name.  'shift' is the number
+	 * of minor bits reserved for partitions, 'base' is the unit number
+	 * of the first disk on that major.
+	 */
+	static struct vdev_info {
+		int major;
+		int shift;
+		int base;
+		const char *name;
+	} info[] = {
+		{3,	6,	0,	"ad"},	/* ide0 */
+		{22,	6,	2,	"ad"},	/* ide1 */
+		{33,	6,	4,	"ad"},	/* ide2 */
+		{34,	6,	6,	"ad"},	/* ide3 */
+		{56,	6,	8,	"ad"},	/* ide4 */
+		{57,	6,	10,	"ad"},	/* ide5 */
+		{88,	6,	12,	"ad"},	/* ide6 */
+		{89,	6,	14,	"ad"},	/* ide7 */
+		{90,	6,	16,	"ad"},	/* ide8 */
+		{91,	6,	18,	"ad"},	/* ide9 */
+
+		{8,	4,	0,	"da"},	/* scsi disk0 */
+		{65,	4,	16,	"da"},	/* scsi disk1 */
+		{66,	4,	32,	"da"},	/* scsi disk2 */
+		{67,	4,	48,	"da"},	/* scsi disk3 */
+		{68,	4,	64,	"da"},	/* scsi disk4 */
+		{69,	4,	80,	"da"},	/* scsi disk5 */
+		{70,	4,	96,	"da"},	/* scsi disk6 */
+		{71,	4,	112,	"da"},	/* scsi disk7 */
+		{128,	4,	128,	"da"},	/* scsi disk8 */
+		{129,	4,	144,	"da"},	/* scsi disk9 */
+		{130,	4,	160,	"da"},	/* scsi disk10 */
+		{131,	4,	176,	"da"},	/* scsi disk11 */
+		{132,	4,	192,	"da"},	/* scsi disk12 */
+		{133,	4,	208,	"da"},	/* scsi disk13 */
+		{134,	4,	224,	"da"},	/* scsi disk14 */
+		{135,	4,	240,	"da"},	/* scsi disk15 */
+
+		{202,	4,	0,	"xbd"},	/* xbd */
+
+		{0,	0,	0,	NULL},
+	};
+	int major = vdevice >> 8;
+	int minor = vdevice & 0xff;
+	int i;
+
+	/*
+	 * Bit 28 set: the disk number is carried in bits 8..27.  Return here;
+	 * otherwise the fallback at the bottom overwrites *unit with a value
+	 * derived from the low byte.
+	 */
+	if (vdevice & (1 << 28)) {
+		*unit = (vdevice & ((1 << 28) - 1)) >> 8;
+		*name = "xbd";
+		return;
+	}
+
+	for (i = 0; info[i].major; i++) {
+		if (info[i].major == major) {
+			*unit = info[i].base + (minor >> info[i].shift);
+			*name = info[i].name;
+			return;
+		}
+	}
+
+	/* Unknown major: treat it like an xbd device with 16 minors per disk. */
+	*unit = minor >> 4;
+	*name = "xbd";
+}
+
+int
+xlvbd_add(device_t dev, blkif_sector_t capacity,
+    int vdevice, uint16_t vdisk_info, uint16_t sector_size,
+    struct blkfront_info *info)
+{
+	struct xb_softc	*sc;
+	int	unit, error = 0;
+	const char *name;
+
+	blkfront_vdevice_to_unit(vdevice, &unit, &name);
+
+	sc = (struct xb_softc *)malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
+	sc->xb_unit = unit;
+	sc->xb_info = info;
+	info->sc = sc;
+
+	if (strcmp(name, "xbd")) 
+		device_printf(dev, "attaching as %s%d\n", name, unit);
+
+	/* Allocate the disk(9) object and fill in its methods and geometry. */
+	memset(&sc->xb_disk, 0, sizeof(sc->xb_disk));
+	sc->xb_disk = disk_alloc();
+	sc->xb_disk->d_unit = sc->xb_unit;
+	sc->xb_disk->d_open = blkif_open;
+	sc->xb_disk->d_close = blkif_close;
+	sc->xb_disk->d_ioctl = blkif_ioctl;
+	sc->xb_disk->d_strategy = xb_strategy;
+	sc->xb_disk->d_name = name;
+	sc->xb_disk->d_drv1 = sc;
+	sc->xb_disk->d_sectorsize = sector_size;
+
+	/* XXX */
+	/* NOTE(review): capacity is scaled by 512 regardless of sector_size. */
+	sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
+#if 0
+	sc->xb_disk->d_maxsize = DFLTPHYS;
+#else /* XXX: xen can't handle large single i/o requests */
+	sc->xb_disk->d_maxsize = 4096;
+#endif
+#ifdef notyet
+	XENPRINTF("attaching device 0x%x unit %d capacity %llu\n",
+		  xb_diskinfo[sc->xb_unit].device, sc->xb_unit,
+		  sc->xb_disk->d_mediasize);
+#endif
+	sc->xb_disk->d_flags = 0;
+	disk_create(sc->xb_disk, DISK_VERSION_00);
+	/* NOTE(review): the bio queue is initialised after disk_create(). */
+	bioq_init(&sc->xb_bioq);
+
+	return error;
+}
+
+/* Destroy the disk(9) object that xlvbd_add() created for 'info'. */
+void
+xlvbd_del(struct blkfront_info *info)
+{
+	struct xb_softc	*sc;
+
+	sc = info->sc;
+	disk_destroy(sc->xb_disk);
+}
+/************************ end VBD support *****************/
+
+/*
+ * Read/write routine for a buffer.  Finds the proper unit, places it on
+ * the sort queue and kicks the controller.  A bio whose disk has no softc
+ * is completed immediately with EINVAL.
+ */
+static void
+xb_strategy(struct bio *bp)
+{
+	struct xb_softc	*sc = (struct xb_softc *)bp->bio_disk->d_drv1;
+
+	/* bogus disk? */
+	if (sc == NULL) {
+		bp->bio_error = EINVAL;
+		bp->bio_flags |= BIO_ERROR;
+		goto bad;
+	}
+
+	DPRINTK("");
+
+	/*
+	 * Place it in the queue of disk activities for this disk
+	 */
+	mtx_lock(&blkif_io_lock);
+	bioq_disksort(&sc->xb_bioq, bp);
+
+	xb_startio(sc);
+	mtx_unlock(&blkif_io_lock);
+	return;
+
+ bad:
+	/*
+	 * Correctly set the bio to indicate a failed transfer. 
+	 */
+	bp->bio_resid = bp->bio_bcount;
+	biodone(bp);
+	return;
+}
+
+/* Probe: claim xenbus devices of type "vbd"; return ENXIO for anything else. */
+static int
+blkfront_probe(device_t dev)
+{
+
+	if (!strcmp(xenbus_get_type(dev), "vbd")) {
+		device_set_desc(dev, "Virtual Block Device");
+		device_quiet(dev);
+		return (0);
+	}
+
+	return (ENXIO);
+}
+
+/*
+ * Setup supplies the backend dir, virtual device.  We place an event
+ * channel and shared frame entries.  We watch backend to wait if it's
+ * ok.
+ *
+ * Returns 0 on success, or the error from reading "virtual-device" or from
+ * talk_to_backend().
+ */
+static int
+blkfront_attach(device_t dev)
+{
+	int error, vdevice, i, unit;
+	struct blkfront_info *info;
+	const char *name;
+
+	/* FIXME: Use dynamic device id if this is not set. */
+	error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
+	    "virtual-device", NULL, "%i", &vdevice);
+	if (error) {
+		xenbus_dev_fatal(dev, error, "reading virtual-device");
+		/* Terminate the line so later console output is not glued on. */
+		printf("couldn't find virtual device\n");
+		return (error);
+	}
+
+	/* Only native "xbd" names take their unit number from the vdevice. */
+	blkfront_vdevice_to_unit(vdevice, &unit, &name);
+	if (!strcmp(name, "xbd"))
+		device_set_unit(dev, unit);
+
+	info = device_get_softc(dev);
+
+	/*
+	 * XXX debug only
+	 */
+	for (i = 0; i < sizeof(*info); i++)
+		if (((uint8_t *)info)[i] != 0)
+			panic("non-null memory");
+
+	info->shadow_free = 0;
+	info->xbdev = dev;
+	info->vdevice = vdevice;
+	info->connected = BLKIF_STATE_DISCONNECTED;
+
+	/* work queue needed ? */
+	/* Chain every shadow entry into the free list; the last is a sentinel. */
+	for (i = 0; i < BLK_RING_SIZE; i++)
+		info->shadow[i].req.id = i+1;
+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+	/* Front end dir is a number, which is used as the id. */
+	info->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);
+
+	error = talk_to_backend(dev, info);
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+static int
+blkfront_suspend(device_t dev)
+{
+	struct blkfront_info *info = device_get_softc(dev);
+
+	/* Prevent new requests being issued until we fix things up. 
*/
+	mtx_lock(&blkif_io_lock);
+	info->connected = BLKIF_STATE_SUSPENDED;
+	mtx_unlock(&blkif_io_lock);
+
+	return (0);
+}
+
+/*
+ * device_resume method: drop the old ring and interrupt binding, redo the
+ * backend handshake and, if the device is still marked suspended and the
+ * handshake succeeded, requeue in-flight requests via blkif_recover().
+ */
+static int
+blkfront_resume(device_t dev)
+{
+	struct blkfront_info *info = device_get_softc(dev);
+	int err;
+
+	DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
+
+	blkif_free(info, 1);
+	err = talk_to_backend(dev, info);
+	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
+		blkif_recover(info);
+
+	return (err);
+}
+
+/*
+ * Common code used when first setting up, and when resuming.
+ *
+ * Creates the shared ring, then publishes "ring-ref", "event-channel" and
+ * "protocol" in a single xenbus transaction (restarted on EAGAIN) and
+ * switches the device to XenbusStateInitialised.  Returns 0 or an error;
+ * failures after ring setup release the ring through blkif_free().
+ */
+static int
+talk_to_backend(device_t dev, struct blkfront_info *info)
+{
+	const char *message = NULL;
+	struct xenbus_transaction xbt;
+	int err;
+
+	/* Create shared ring, alloc event channel. */
+	err = setup_blkring(dev, info);
+	if (err)
+		goto out;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		goto destroy_blkring;
+	}
+
+	err = xenbus_printf(xbt, xenbus_get_node(dev),
+			    "ring-ref","%u", info->ring_ref);
+	if (err) {
+		message = "writing ring-ref";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, xenbus_get_node(dev),
+		"event-channel", "%u", irq_to_evtchn_port(info->irq));
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, xenbus_get_node(dev),
+		"protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
+	if (err) {
+		message = "writing protocol";
+		goto abort_transaction;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		/* EAGAIN: redo the whole transaction from the start. */
+		if (err == EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto destroy_blkring;
+	}
+	xenbus_set_state(dev, XenbusStateInitialised);
+
+	return 0;
+
+ abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	if (message)
+		xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+	blkif_free(info, 0);
+ out:
+	return err;
+}
+
+static int
+setup_blkring(device_t dev, struct blkfront_info *info)
+{
+	blkif_sring_t *sring;
+	int error;
+
+	info->ring_ref = 
GRANT_INVALID_REF;
+
+	/* One zeroed page backs the shared ring; the NULL return is handled. */
+	sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (sring == NULL) {
+		xenbus_dev_fatal(dev, ENOMEM, "allocating shared ring");
+		return ENOMEM;
+	}
+	SHARED_RING_INIT(sring);
+	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+	/* Grant the ring page to the backend; the reference lands in ring_ref. */
+	error = xenbus_grant_ring(dev,
+	    (vtomach(info->ring.sring) >> PAGE_SHIFT), &info->ring_ref);
+	if (error) {
+		free(sring, M_DEVBUF);
+		info->ring.sring = NULL;
+		goto fail;
+	}
+
+	/* Bind blkif_int() as the handler for the backend's event channel. */
+	error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev),
+	    "xbd", (driver_intr_t *)blkif_int, info,
+	    INTR_TYPE_BIO | INTR_MPSAFE, &info->irq);
+	if (error) {
+		xenbus_dev_fatal(dev, error,
+		    "bind_evtchn_to_irqhandler failed");
+		goto fail;
+	}
+
+	return (0);
+ fail:
+	blkif_free(info, 0);
+	return (error);
+}
+
+
+/**
+ * Callback received when the backend's state changes.
+ *
+ * Only Connected (run connect()) and Closing (close unless the device is
+ * still open) are acted upon; every other state is ignored.
+ */
+static void
+blkfront_backend_changed(device_t dev, XenbusState backend_state)
+{
+	struct blkfront_info *info = device_get_softc(dev);
+
+	DPRINTK("backend_state=%d\n", backend_state);
+
+	switch (backend_state) {
+	case XenbusStateUnknown:
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+	case XenbusStateClosed:
+	case XenbusStateReconfigured:
+	case XenbusStateReconfiguring:
+		break;
+
+	case XenbusStateConnected:
+		connect(dev, info);
+		break;
+
+	case XenbusStateClosing:
+		/* NOTE(review): errno is passed negated here (Linux style). */
+		if (info->users > 0)
+			xenbus_dev_error(dev, -EBUSY,
+					 "Device in use; refusing to close");
+		else
+			blkfront_closing(dev);
+#ifdef notyet
+		bd = bdget(info->dev);
+		if (bd == NULL)
+			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+
+		down(&bd->bd_sem);
+		if (info->users > 0)
+			xenbus_dev_error(dev, -EBUSY,
+					 "Device in use; refusing to close");
+		else
+			blkfront_closing(dev);
+		up(&bd->bd_sem);
+		bdput(bd);
+#endif
+	}
+}
+
+/*
+** Invoked when the backend is finally 'ready' (and has produced
+** the details about the physical device - #sectors, size, etc). 
+*/
+static void
+connect(device_t dev, struct blkfront_info *info)
+{
+	unsigned long sectors, sector_size;
+	unsigned long feature_barrier;
+	unsigned int binfo;
+	int err;
+
+	/* Nothing to do if already connected or waiting for a resume. */
+	if( (info->connected == BLKIF_STATE_CONNECTED) ||
+	    (info->connected == BLKIF_STATE_SUSPENDED) )
+		return;
+
+	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
+
+	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
+			    "sectors", "%lu", &sectors,
+			    "info", "%u", &binfo,
+			    "sector-size", "%lu", &sector_size,
+			    NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+		    "reading backend fields at %s",
+		    xenbus_get_otherend_path(dev));
+		return;
+	}
+
+	/*
+	 * The size report below divides by (1048576 / sector_size); reject
+	 * values that would make that divisor zero.
+	 */
+	if (sector_size == 0 || sector_size > 1048576) {
+		xenbus_dev_fatal(dev, EINVAL,
+		    "invalid sector-size %lu at %s", sector_size,
+		    xenbus_get_otherend_path(dev));
+		return;
+	}
+
+	/*
+	 * "%lu" needs an unsigned long target; info->feature_barrier is an
+	 * int, so scan into a local and convert.
+	 */
+	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
+			    "feature-barrier", "%lu", &feature_barrier,
+			    NULL);
+	if (err)
+		info->feature_barrier = 0;
+	else
+		info->feature_barrier = (feature_barrier != 0);
+
+	device_printf(dev, "%juMB <%s> at %s",
+	    (uintmax_t) sectors / (1048576 / sector_size),
+	    device_get_desc(dev),
+	    xenbus_get_node(dev));
+	bus_print_child_footer(device_get_parent(dev), dev);
+
+	xlvbd_add(dev, sectors, info->vdevice, binfo, sector_size, info);
+
+	(void)xenbus_set_state(dev, XenbusStateConnected);
+
+	/* Kick pending requests. */
+	mtx_lock(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;
+	kick_pending_request_queues(info);
+	mtx_unlock(&blkif_io_lock);
+	info->is_ready = 1;
+
+#if 0
+	add_disk(info->gd);
+#endif
+}
+
+/**
+ * Handle the change of state of the backend to Closing.  We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend.  Once this is done, we can switch to Closed in
+ * acknowledgement. 
+ */
+static void
+blkfront_closing(device_t dev)
+{
+	struct blkfront_info *info = device_get_softc(dev);
+
+	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
+
+	if (info->mi) {
+		DPRINTK("Calling xlvbd_del\n");
+		xlvbd_del(info);
+		info->mi = NULL;
+	}
+
+	xenbus_set_state(dev, XenbusStateClosed);
+}
+
+
+/* device_detach method: release the ring and interrupt; always returns 0. */
+static int
+blkfront_detach(device_t dev)
+{
+	struct blkfront_info *info = device_get_softc(dev);
+
+	DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));
+
+	blkif_free(info, 0);
+
+	return 0;
+}
+
+
+/*
+ * Pop the head of the shadow free list and return its index.  The popped
+ * entry's id is overwritten with a debug marker.
+ */
+static inline int
+GET_ID_FROM_FREELIST(struct blkfront_info *info)
+{
+	unsigned long nfree = info->shadow_free;
+
+	/* shadow[] has BLK_RING_SIZE entries, so valid indices stop one short. */
+	KASSERT(nfree < BLK_RING_SIZE, ("free %lu >= RING_SIZE", nfree));
+	info->shadow_free = info->shadow[nfree].req.id;
+	info->shadow[nfree].req.id = 0x0fffffee; /* debug */
+	return nfree;
+}
+
+/* Push shadow entry 'id' back onto the free list and clear its bio pointer. */
+static inline void
+ADD_ID_TO_FREELIST(struct blkfront_info *info, unsigned long id)
+{
+	info->shadow[id].req.id  = info->shadow_free;
+	info->shadow[id].request = 0;
+	info->shadow_free = id;
+}
+
+/* Publish queued requests and notify the backend if the ring asks for it. */
+static inline void
+flush_requests(struct blkfront_info *info)
+{
+	int notify;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+
+	if (notify)
+		notify_remote_via_irq(info->irq);
+}
+
+static void
+kick_pending_request_queues(struct blkfront_info *info)
+{
+	/* XXX check if we can't simplify */
+#if 0
+	if (!RING_FULL(&info->ring)) {
+		/* Re-enable calldowns. */
+		blk_start_queue(info->rq);
+		/* Kick things off immediately. */
+		do_blkif_request(info->rq);
+	}
+#endif
+	if (!RING_FULL(&info->ring)) {
+#if 0
+		sc = LIST_FIRST(&xbsl_head);
+		LIST_REMOVE(sc, entry);
+		/* Re-enable calldowns. */
+		blk_start_queue(di->rq);
+#endif
+		/* Kick things off immediately. 
*/
+		xb_startio(info->sc);
+	}
+}
+
+#if 0
+/* XXX */
+static void blkif_restart_queue(void *arg)
+{
+	struct blkfront_info *info = (struct blkfront_info *)arg;
+
+	mtx_lock(&blkif_io_lock);
+	kick_pending_request_queues(info);
+	mtx_unlock(&blkif_io_lock);
+}
+#endif
+
+/* Grant-table free callback; currently a no-op (body is compiled out). */
+static void blkif_restart_queue_callback(void *arg)
+{
+#if 0
+	struct blkfront_info *info = (struct blkfront_info *)arg;
+	/* XXX BSD equiv ? */
+
+	schedule_work(&info->work);
+#endif
+}
+
+/*
+ * disk(9) open method: mark the softc open and count the opener.
+ * Returns ENXIO when the disk has no softc attached.
+ */
+static int
+blkif_open(struct disk *dp)
+{
+	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
+
+	if (sc == NULL) {
+		/* sc is NULL here, so take the unit number from the disk. */
+		printf("xb%u: not found\n", dp->d_unit);
+		return (ENXIO);
+	}
+
+	sc->xb_flags |= XB_OPEN;
+	sc->xb_info->users++;
+	return (0);
+}
+
+/*
+ * disk(9) close method: drop one opener; when the last one goes away and
+ * the backend is in the Closing state, finish the close that was refused
+ * while the device was in use.
+ */
+static int
+blkif_close(struct disk *dp)
+{
+	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
+
+	if (sc == NULL)
+		return (ENXIO);
+	sc->xb_flags &= ~XB_OPEN;
+	if (--(sc->xb_info->users) == 0) {
+		/* Check whether we have been instructed to close.  We will
+		   have ignored this request initially, as the device was
+		   still mounted. */
+		device_t dev = sc->xb_info->xbdev;
+		XenbusState state =
+			xenbus_read_driver_state(xenbus_get_otherend_path(dev));
+
+		if (state == XenbusStateClosing)
+			blkfront_closing(dev);
+	}
+	return (0);
+}
+
+/* disk(9) ioctl method: no ioctls are implemented; returns ENOTTY. */
+static int
+blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
+{
+	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
+
+	if (sc == NULL)
+		return (ENXIO);
+
+	return (ENOTTY);
+}
+
+
+/*
+ * blkif_queue_request
+ *
+ * request block io
+ *
+ * id: for guest use only.
+ * operation: BLKIF_OP_{READ,WRITE,PROBE}
+ * buffer: buffer to read/write into. this should be a
+ *   virtual address in the guest os. 
+ */
+/*
+ * Returns 0 when 'bp' was placed on the ring, 1 when it could not be queued
+ * right now (not connected, no grant references, or no memory for a bounce
+ * buffer); in the latter case the caller keeps the bio for a later retry.
+ */
+static int blkif_queue_request(struct bio *bp)
+{
+	caddr_t alignbuf;
+	vm_paddr_t buffer_ma;
+	blkif_request_t     *ring_req;
+	unsigned long id;
+	uint64_t fsect, lsect;
+	struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1;
+	struct blkfront_info *info = sc->xb_info;
+	int ref;
+
+	if (unlikely(sc->xb_info->connected != BLKIF_STATE_CONNECTED))
+		return 1;
+
+	if (gnttab_alloc_grant_references(
+		    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
+		gnttab_request_free_callback(
+			&info->callback,
+			blkif_restart_queue_callback,
+			info,
+			BLKIF_MAX_SEGMENTS_PER_REQUEST);
+		return 1;
+	}
+
+	/* Check if the buffer is properly aligned */
+	if ((vm_offset_t)bp->bio_data & PAGE_MASK) {
+		int align = (bp->bio_bcount < PAGE_SIZE/2) ? XBD_SECTOR_SIZE :
+			PAGE_SIZE;
+		caddr_t newbuf = malloc(bp->bio_bcount + align, M_DEVBUF,
+					M_NOWAIT);
+
+		/*
+		 * M_NOWAIT can fail.  Give the reserved grant references
+		 * back and report "not queued" so the bio is retried.
+		 */
+		if (newbuf == NULL) {
+			gnttab_free_grant_references(gref_head);
+			return 1;
+		}
+
+		alignbuf = (char *)roundup2((u_long)newbuf, align);
+
+		/* save a copy of the current buffer */
+		bp->bio_driver1 = newbuf;
+		bp->bio_driver2 = alignbuf;
+
+		/* Copy the data for a write */
+		if (bp->bio_cmd == BIO_WRITE)
+			bcopy(bp->bio_data, alignbuf, bp->bio_bcount);
+	} else
+		alignbuf = bp->bio_data;
+
+	/* Fill out a communications ring structure. */
+	ring_req = RING_GET_REQUEST(&info->ring,
+				    info->ring.req_prod_pvt);
+	id = GET_ID_FROM_FREELIST(info);
+	info->shadow[id].request = (unsigned long)bp;
+
+	ring_req->id = id;
+	ring_req->operation = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
+		BLKIF_OP_WRITE;
+
+	ring_req->sector_number= (blkif_sector_t)bp->bio_pblkno;
+	ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
+
+	ring_req->nr_segments = 0;	/* XXX not doing scatter/gather since buffer
+					 * chaining is not supported.
+					 */
+
+	buffer_ma = vtomach(alignbuf);
+	fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
+	lsect = fsect + (bp->bio_bcount >> XBD_SECTOR_SHFT) - 1;
+	/* install a grant reference. */
+	ref = gnttab_claim_grant_reference(&gref_head);
+	KASSERT( ref != -ENOSPC, ("grant_reference failed") );
+
+	gnttab_grant_foreign_access_ref(
+		ref,
+		xenbus_get_otherend_id(info->xbdev),
+		buffer_ma >> PAGE_SHIFT,
+		ring_req->operation & 1 ); /* ??? */
+	info->shadow[id].frame[ring_req->nr_segments] =
+		buffer_ma >> PAGE_SHIFT;
+
+	ring_req->seg[ring_req->nr_segments] =
+		(struct blkif_request_segment) {
+			.gref       = ref,
+			.first_sect = fsect,
+			.last_sect  = lsect };
+
+	ring_req->nr_segments++;
+	KASSERT((buffer_ma & (XBD_SECTOR_SIZE-1)) == 0,
+		("XEN buffer must be sector aligned"));
+	KASSERT(lsect <= 7,
+		("XEN disk driver data cannot cross a page boundary"));
+
+	buffer_ma &= ~PAGE_MASK;
+
+	info->ring.req_prod_pvt++;
+
+	/* Keep a private copy so we can reissue requests when recovering. */
+	info->shadow[id].req = *ring_req;
+
+	/* Return the references that were reserved but not claimed. */
+	gnttab_free_grant_references(gref_head);
+
+	return 0;
+}
+
+
+
+/*
+ * Dequeue buffers and place them in the shared communication ring.
+ * Return when no more requests can be accepted or all buffers have
+ * been queued.
+ *
+ * Signal XEN once the ring has been filled out. 
+ */
+static void
+xb_startio(struct xb_softc *sc)
+{
+	struct bio		*bp;
+	int			queued = 0;
+	struct blkfront_info *info = sc->xb_info;
+	DPRINTK("");
+
+	/* Caller must hold blkif_io_lock. */
+	mtx_assert(&blkif_io_lock, MA_OWNED);
+
+	while ((bp = bioq_takefirst(&sc->xb_bioq)) != NULL) {
+
+		if (RING_FULL(&info->ring))
+			goto wait;
+
+		if (blkif_queue_request(bp)) {
+		wait:
+			/* Could not queue: put the bio back at the head and stop. */
+			bioq_insert_head(&sc->xb_bioq, bp);
+			break;
+		}
+		queued++;
+	}
+
+	if (queued != 0)
+		flush_requests(sc->xb_info);
+}
+
+/*
+ * Interrupt handler bound to the backend's event channel.  Under
+ * blkif_io_lock it consumes every response on the ring, releases the
+ * request's grants and shadow slot, completes the bio, and finally kicks
+ * the pending request queue.  Does nothing unless the device is connected.
+ */
+static void
+blkif_int(void *xsc)
+{
+	struct xb_softc *sc = NULL;
+	struct bio *bp;
+	blkif_response_t *bret;
+	RING_IDX i, rp;
+	struct blkfront_info *info = xsc;
+	DPRINTK("");
+
+	TRACE_ENTER;
+
+	mtx_lock(&blkif_io_lock);
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+		mtx_unlock(&blkif_io_lock);
+		return;
+	}
+
+ again:
+	rp = info->ring.sring->rsp_prod;
+	rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+	for (i = info->ring.rsp_cons; i != rp; i++) {
+		unsigned long id;
+
+		bret = RING_GET_RESPONSE(&info->ring, i);
+		/* NOTE(review): id comes from the backend and is not range-checked. */
+		id   = bret->id;
+		bp   = (struct bio *)info->shadow[id].request;
+
+		blkif_completion(&info->shadow[id]);
+
+		ADD_ID_TO_FREELIST(info, id);
+
+		switch (bret->operation) {
+		case BLKIF_OP_READ:
+			/* had an unaligned buffer that needs to be copied */
+			if (bp->bio_driver1)
+				bcopy(bp->bio_driver2, bp->bio_data, bp->bio_bcount);
+			/* FALLTHROUGH */
+		case BLKIF_OP_WRITE:
+
+			/* free the copy buffer */
+			if (bp->bio_driver1) {
+				free(bp->bio_driver1, M_DEVBUF);
+				bp->bio_driver1 = NULL;
+			}
+
+			if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) {
+				printf("Bad return from blkdev data request: %x\n",
+				       bret->status);
+				bp->bio_flags |= BIO_ERROR;
+			}
+
+			sc = (struct xb_softc *)bp->bio_disk->d_drv1;
+
+			if (bp->bio_flags & BIO_ERROR)
+				bp->bio_error = EIO;
+			else
+				bp->bio_resid = 0;
+
+			biodone(bp);
+			break;
+		default:
+			panic("received invalid operation");
+			break;
+		}
+	}
+
+	info->ring.rsp_cons = i;
+
+	/* Requests still outstanding: re-check for responses that raced in. */
+	if (i != info->ring.req_prod_pvt) {
+		int more_to_do;
+		
RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + if (more_to_do) + goto again; + } else { + info->ring.sring->rsp_event = i + 1; + } + + kick_pending_request_queues(info); + + mtx_unlock(&blkif_io_lock); +} + +static void +blkif_free(struct blkfront_info *info, int suspend) +{ + +/* Prevent new requests being issued until we fix things up. */ + mtx_lock(&blkif_io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + mtx_unlock(&blkif_io_lock); + + /* Free resources associated with old device channel. */ + if (info->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref, + info->ring.sring); + info->ring_ref = GRANT_INVALID_REF; + info->ring.sring = NULL; + } + if (info->irq) + unbind_from_irqhandler(info->irq); + info->irq = 0; + +} + +static void +blkif_completion(struct blk_shadow *s) +{ + int i; + + for (i = 0; i < s->req.nr_segments; i++) + gnttab_end_foreign_access(s->req.seg[i].gref, 0UL); +} + +static void +blkif_recover(struct blkfront_info *info) +{ + int i, j; + blkif_request_t *req; + struct blk_shadow *copy; + + if (!info->sc) + return; + + /* Stage 1: Make a safe copy of the shadow state. */ + copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF, M_NOWAIT|M_ZERO); + memcpy(copy, info->shadow, sizeof(info->shadow)); + + /* Stage 2: Set up free list. */ + memset(&info->shadow, 0, sizeof(info->shadow)); + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.id = i+1; + info->shadow_free = info->ring.req_prod_pvt; + info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + + /* Stage 3: Find pending requests and requeue them. */ + for (i = 0; i < BLK_RING_SIZE; i++) { + /* Not in use? */ + if (copy[i].request == 0) + continue; + + /* Grab a request slot and copy shadow state into it. */ + req = RING_GET_REQUEST( + &info->ring, info->ring.req_prod_pvt); + *req = copy[i].req; + + /* We get a new request id, and must reset the shadow state. 
*/
+		req->id = GET_ID_FROM_FREELIST(info);
+		/*
+		 * Carry the saved shadow entry for this request over to its
+		 * new id (source is element i of the stage-1 copy).
+		 */
+		memcpy(&info->shadow[req->id], copy + i, sizeof(copy[i]));
+
+		/* Rewrite any grant references invalidated by suspend/resume. */
+		for (j = 0; j < req->nr_segments; j++)
+			gnttab_grant_foreign_access_ref(
+				req->seg[j].gref,
+				xenbus_get_otherend_id(info->xbdev),
+				pfn_to_mfn(info->shadow[req->id].frame[j]),
+				0 /* assume not readonly */);
+
+		/* Keep the shadow's request in step with what sits on the ring. */
+		info->shadow[req->id].req = *req;
+
+		info->ring.req_prod_pvt++;
+	}
+
+	free(copy, M_DEVBUF);
+
+	xenbus_set_state(info->xbdev, XenbusStateConnected);
+
+	/* Now safe for us to use the shared ring */
+	mtx_lock(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;
+	mtx_unlock(&blkif_io_lock);
+
+	/* Send off requeued requests */
+	mtx_lock(&blkif_io_lock);
+	flush_requests(info);
+
+	/* Kick any other new requests queued since we resumed */
+	kick_pending_request_queues(info);
+	mtx_unlock(&blkif_io_lock);
+}
+
+/* ** Driver registration ** */
+static device_method_t blkfront_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		blkfront_probe),
+	DEVMETHOD(device_attach,	blkfront_attach),
+	DEVMETHOD(device_detach,	blkfront_detach),
+	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
+	DEVMETHOD(device_suspend,	blkfront_suspend),
+	DEVMETHOD(device_resume,	blkfront_resume),
+
+	/* Xenbus interface */
+	DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),
+
+	{ 0, 0 }
+};
+
+/* newbus driver description: name "xbd", softc is a struct blkfront_info. */
+static driver_t blkfront_driver = {
+	"xbd",
+	blkfront_methods,
+	sizeof(struct blkfront_info),
+};
+devclass_t blkfront_devclass;
+
+DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0);
+
+MTX_SYSINIT(ioreq, &blkif_io_lock, "BIO LOCK", MTX_NOWITNESS); /* XXX how does one enroll a lock? 
*/ + Property changes on: dev/xen/blkfront/blkfront.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/console/xencons_ring.h =================================================================== --- dev/xen/console/xencons_ring.h (.../stable/6/sys) (revision 0) +++ dev/xen/console/xencons_ring.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,20 @@ +/* + * $FreeBSD$ + * + */ +#ifndef _XENCONS_RING_H +#define _XENCONS_RING_H + +int xencons_ring_init(void); +int xencons_ring_send(const char *data, unsigned len); +void xencons_rx(char *buf, unsigned len); +void xencons_tx(void); + + +typedef void (xencons_receiver_func)(char *buf, unsigned len); +void xencons_ring_register_receiver(xencons_receiver_func *f); + +void xencons_handle_input(void *unused); +int xencons_has_input(void); + +#endif /* _XENCONS_RING_H */ Property changes on: dev/xen/console/xencons_ring.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/console/console.c =================================================================== --- dev/xen/console/console.c (.../stable/6/sys) (revision 0) +++ dev/xen/console/console.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,569 @@ +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#include "opt_ddb.h" +#ifdef DDB +#include +#endif + +static char driver_name[] = "xc"; +devclass_t xc_devclass; /* do not make static */ +static void xcstart (struct tty *); +static int xcparam (struct tty *, struct termios *); +static void xcstop (struct tty *, int); +static void xc_timeout(void *); +static void 
__xencons_tx_flush(void); +static boolean_t xcons_putc(int c); + +/* switch console so that shutdown can occur gracefully */ +static void xc_shutdown(void *arg, int howto); +static int xc_mute; + +static void xcons_force_flush(void); +static void xencons_priv_interrupt(void *); + +static cn_probe_t xccnprobe; +static cn_init_t xccninit; +static cn_getc_t xccngetc; +static cn_putc_t xccnputc; +static cn_putc_t xccnputc_dom0; +static cn_checkc_t xccncheckc; + +#define XC_POLLTIME (hz/10) + +CONS_DRIVER(xc, xccnprobe, xccninit, NULL, xccngetc, + xccncheckc, xccnputc, NULL); + +static int xen_console_up; +static boolean_t xc_start_needed; +static struct callout xc_callout; +struct mtx cn_mtx; + +#define RBUF_SIZE 1024 +#define RBUF_MASK(_i) ((_i)&(RBUF_SIZE-1)) +#define WBUF_SIZE 4096 +#define WBUF_MASK(_i) ((_i)&(WBUF_SIZE-1)) +static char wbuf[WBUF_SIZE]; +static char rbuf[RBUF_SIZE]; +static int rc, rp; +static unsigned int cnsl_evt_reg; +static unsigned int wc, wp; /* write_cons, write_prod */ + +#define CDEV_MAJOR 12 +#define XCUNIT(x) (minor(x)) +#define ISTTYOPEN(tp) ((tp) && ((tp)->t_state & TS_ISOPEN)) +#define CN_LOCK_INIT(x, _name) \ + mtx_init(&x, _name, NULL, MTX_DEF|MTX_RECURSE) + +#define CN_LOCK(l) \ + do { \ + if (panicstr == NULL) \ + mtx_lock(&(l)); \ + } while (0) +#define CN_UNLOCK(l) \ + do { \ + if (panicstr == NULL) \ + mtx_unlock(&(l)); \ + } while (0) +#define CN_LOCK_ASSERT(x) mtx_assert(&x, MA_OWNED) +#define CN_LOCK_DESTROY(x) mtx_destroy(&x) + + +static struct tty *xccons; + +struct xc_softc { + int xc_unit; + struct cdev *xc_dev; +}; + + +static d_open_t xcopen; +static d_close_t xcclose; +static d_ioctl_t xcioctl; + +static struct cdevsw xc_cdevsw = { + .d_version = D_VERSION, + .d_flags = D_TTY | D_NEEDGIANT, + .d_name = driver_name, + .d_open = xcopen, + .d_close = xcclose, + .d_read = ttyread, + .d_write = ttywrite, + .d_ioctl = xcioctl, + .d_poll = ttypoll, + .d_kqfilter = ttykqfilter, +}; + +static void +xccnprobe(struct consdev 
*cp) +{ + cp->cn_pri = CN_REMOTE; + cp->cn_tp = xccons; + sprintf(cp->cn_name, "%s0", driver_name); +} + + +static void +xccninit(struct consdev *cp) +{ + CN_LOCK_INIT(cn_mtx,"XCONS LOCK"); + +} +int +xccngetc(struct consdev *dev) +{ + int c; + if (xc_mute) + return 0; + do { + if ((c = xccncheckc(dev)) == -1) { +#ifdef KDB + if (!kdb_active) +#endif + /* + * Polling without sleeping in Xen + * doesn't work well. Sleeping gives + * other things like clock a chance to + * run + */ + tsleep(&cn_mtx, PWAIT | PCATCH, + "console sleep", XC_POLLTIME); + } + } while(c == -1); + return c; +} + +int +xccncheckc(struct consdev *dev) +{ + int ret = (xc_mute ? 0 : -1); + + if (xencons_has_input()) + xencons_handle_input(NULL); + + CN_LOCK(cn_mtx); + if ((rp - rc)) { + if (kdb_active) printf("%s:%d\n", __func__, __LINE__); + /* we need to return only one char */ + ret = (int)rbuf[RBUF_MASK(rc)]; + rc++; + } + CN_UNLOCK(cn_mtx); + return(ret); +} + +static void +xccnputc(struct consdev *dev, int c) +{ + xcons_putc(c); +} + +static void +xccnputc_dom0(struct consdev *dev, int c) +{ + HYPERVISOR_console_io(CONSOLEIO_write, 1, (char *)&c); +} + +extern int db_active; +static boolean_t +xcons_putc(int c) +{ + int force_flush = xc_mute || +#ifdef DDB + db_active || +#endif + panicstr; /* we're not gonna recover, so force + * flush + */ + + if ((wp-wc) < (WBUF_SIZE-1)) { + if ((wbuf[WBUF_MASK(wp++)] = c) == '\n') { + wbuf[WBUF_MASK(wp++)] = '\r'; +#ifdef notyet + if (force_flush) + xcons_force_flush(); +#endif + } + } else if (force_flush) { +#ifdef notyet + xcons_force_flush(); +#endif + } + if (cnsl_evt_reg) + __xencons_tx_flush(); + + /* inform start path that we're pretty full */ + return ((wp - wc) >= WBUF_SIZE - 100) ? 
TRUE : FALSE;
+}
+
+/*
+ * Bus identify hook: add a single console child to the parent bus and
+ * bind this driver to it.
+ */
+static void
+xc_identify(driver_t *driver, device_t parent)
+{
+	device_t child;
+
+	child = BUS_ADD_CHILD(parent, 0, driver_name, 0);
+	/* Nothing to bind or describe if the bus refused the child. */
+	if (child == NULL)
+		return;
+	device_set_driver(child, driver);
+	device_set_desc(child, "Xen Console");
+}
+
+/*
+ * Probe hook: record the unit number in the softc.  Always succeeds.
+ */
+static int
+xc_probe(device_t dev)
+{
+	struct xc_softc *sc = (struct xc_softc *)device_get_softc(dev);
+
+	sc->xc_unit = device_get_unit(dev);
+	return (0);
+}
+
+/*
+ * Attach hook: create the "xc" character device and its tty, start the
+ * console ring and the polling callout, bind VIRQ_CONSOLE when running as
+ * the initial domain, and register the shutdown flush handler.
+ * Always returns 0.
+ */
+static int
+xc_attach(device_t dev)
+{
+	struct xc_softc *sc = (struct xc_softc *)device_get_softc(dev);
+	int error;
+
+	/* The initial domain writes straight to the hypervisor console. */
+	if (xen_start_info->flags & SIF_INITDOMAIN) {
+		xc_consdev.cn_putc = xccnputc_dom0;
+	}
+
+	sc->xc_dev = make_dev(&xc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "xc%r", 0);
+	xccons = ttyalloc();
+
+	sc->xc_dev->si_drv1 = (void *)sc;
+	sc->xc_dev->si_tty = xccons;
+
+	xccons->t_oproc = xcstart;
+	xccons->t_param = xcparam;
+	xccons->t_stop = xcstop;
+	xccons->t_dev = sc->xc_dev;
+
+	callout_init(&xc_callout, 0);
+
+	xencons_ring_init();
+
+	/* From here on xcons_putc() pushes output out through the tx path. */
+	cnsl_evt_reg = 1;
+	callout_reset(&xc_callout, XC_POLLTIME, xc_timeout, xccons);
+
+	if (xen_start_info->flags & SIF_INITDOMAIN) {
+		error = bind_virq_to_irqhandler(
+			 VIRQ_CONSOLE,
+			 0,
+			 "console",
+			 xencons_priv_interrupt,
+			 INTR_TYPE_TTY, NULL);
+
+		KASSERT(error >= 0, ("can't register console interrupt"));
+	}
+
+	/* register handler to flush console on shutdown */
+	if ((EVENTHANDLER_REGISTER(shutdown_post_sync, xc_shutdown,
+				   NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
+		printf("xencons: shutdown event registration failed!\n");
+
+	return (0);
+}
+
+/*
+ * return 0 for all console input, force flush all output. 
+ */ +static void +xc_shutdown(void *arg, int howto) +{ + xc_mute = 1; + xcons_force_flush(); +} + +void +xencons_rx(char *buf, unsigned len) +{ + int i; + struct tty *tp = xccons; + +#if 1 + if (len > 0 && buf[0] == '`') + printf("%08lx %08lx\r", + HYPERVISOR_shared_info->evtchn_pending[0], + HYPERVISOR_shared_info->evtchn_mask[0]); +#endif + for (i = 0; i < len; i++) { + if (xen_console_up +#ifdef DDB + && !kdb_active +#endif + ) + (*linesw[tp->t_line]->l_rint)(buf[i], tp); + else + rbuf[RBUF_MASK(rp++)] = buf[i]; + } +} + +static void +__xencons_tx_flush(void) +{ + int sz, work_done = 0; + + CN_LOCK(cn_mtx); + while (wc != wp) { + int sent; + sz = wp - wc; + if (sz > (WBUF_SIZE - WBUF_MASK(wc))) + sz = WBUF_SIZE - WBUF_MASK(wc); + if (xen_start_info->flags & SIF_INITDOMAIN) { + HYPERVISOR_console_io(CONSOLEIO_write, sz, &wbuf[WBUF_MASK(wc)]); + wc += sz; + } else { + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent == 0) + break; + wc += sent; + } + work_done = 1; + } + CN_UNLOCK(cn_mtx); + + /* + * ttwakeup calls routines using blocking locks + * + */ + if (work_done && xen_console_up && curthread->td_critnest == 0) + ttwakeup(xccons); +} + +void +xencons_tx(void) +{ + __xencons_tx_flush(); +} + +static void +xencons_priv_interrupt(void *arg) +{ + + static char rbuf[16]; + int l; + + while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0) + xencons_rx(rbuf, l); + + xencons_tx(); +} + +int +xcopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct xc_softc *sc; + int unit = XCUNIT(dev); + struct tty *tp; + int s, error; + + sc = (struct xc_softc *)device_get_softc( + devclass_get_device(xc_devclass, unit)); + if (sc == NULL) + return (ENXIO); + + tp = dev->si_tty; + s = spltty(); + if (!ISTTYOPEN(tp)) { + tp->t_state |= TS_CARR_ON; + ttychars(tp); + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_cflag = TTYDEF_CFLAG|CLOCAL; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + 
xcparam(tp, &tp->t_termios); + ttsetwater(tp); + } else if (tp->t_state & TS_XCLUDE && suser(td)) { + splx(s); + return (EBUSY); + } + splx(s); + + xen_console_up = 1; + + error = (*linesw[tp->t_line]->l_open)(dev, tp); + return error; +} + +int +xcclose(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct tty *tp = dev->si_tty; + + if (tp == NULL) + return (0); + xen_console_up = 0; + + spltty(); + (*linesw[tp->t_line]->l_close)(tp, flag); + tty_close(tp); + spl0(); + return (0); +} + + +int +xcioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) +{ + struct tty *tp = dev->si_tty; + int error; + + error = (*linesw[tp->t_line]->l_ioctl)(tp, cmd, data, flag, td); + if (error != ENOIOCTL) + return (error); + + error = ttioctl(tp, cmd, data, flag); + + if (error != ENOIOCTL) + return (error); + + return (ENOTTY); +} + +static inline int +__xencons_put_char(int ch) +{ + char _ch = (char)ch; + if ((wp - wc) == WBUF_SIZE) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; +} + + +static void +xcstart(struct tty *tp) +{ + boolean_t cons_full = FALSE; + + CN_LOCK(cn_mtx); + if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { + CN_UNLOCK(cn_mtx); + + ttwwakeup(tp); + return; + } + + tp->t_state |= TS_BUSY; + CN_UNLOCK(cn_mtx); + + while (tp->t_outq.c_cc != 0 && !cons_full) + cons_full = xcons_putc(getc(&tp->t_outq)); + + /* if the console is close to full leave our state as busy */ + if (!cons_full) { + CN_LOCK(cn_mtx); + tp->t_state &= ~TS_BUSY; + CN_UNLOCK(cn_mtx); + ttwwakeup(tp); + } else { + /* let the timeout kick us in a bit */ + xc_start_needed = TRUE; + } + +} + +static void +xcstop(struct tty *tp, int flag) +{ + + if (tp->t_state & TS_BUSY) { + if ((tp->t_state & TS_TTSTOP) == 0) { + tp->t_state |= TS_FLUSH; + } + } +} + +static void +xc_timeout(void *v) +{ + struct tty *tp; + int c; + + tp = (struct tty *)v; + + while ((c = xccncheckc(NULL)) != -1) { + if (tp->t_state & TS_ISOPEN) { + (*linesw[tp->t_line]->l_rint)(c, tp); + 
} + } + + if (xc_start_needed) { + xc_start_needed = FALSE; + xcstart(tp); + } + + callout_reset(&xc_callout, XC_POLLTIME, xc_timeout, tp); +} + +/* + * Set line parameters. + */ +int +xcparam(struct tty *tp, struct termios *t) +{ + tp->t_ispeed = t->c_ispeed; + tp->t_ospeed = t->c_ospeed; + tp->t_cflag = t->c_cflag; + return (0); +} + + +static device_method_t xc_methods[] = { + DEVMETHOD(device_identify, xc_identify), + DEVMETHOD(device_probe, xc_probe), + DEVMETHOD(device_attach, xc_attach), + {0, 0} +}; + +static driver_t xc_driver = { + driver_name, + xc_methods, + sizeof(struct xc_softc), +}; + +/*** Forcibly flush console data before dying. ***/ +void +xcons_force_flush(void) +{ + int sz; + + if (xen_start_info->flags & SIF_INITDOMAIN) + return; + + /* Spin until console data is flushed through to the domain controller. */ + while (wc != wp) { + int sent = 0; + if ((sz = wp - wc) == 0) + continue; + + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent > 0) + wc += sent; + } +} + +DRIVER_MODULE(xc, nexus, xc_driver, xc_devclass, 0, 0); Property changes on: dev/xen/console/console.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/console/xencons_ring.c =================================================================== --- dev/xen/console/xencons_ring.c (.../stable/6/sys) (revision 0) +++ dev/xen/console/xencons_ring.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,165 @@ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define console_evtchn console.domU.evtchn +static unsigned int console_irq; +extern char *console_page; +extern struct mtx cn_mtx; + +static inline struct 
xencons_interface * +xencons_interface(void) +{ + return (struct xencons_interface *)console_page; +} + + +int +xencons_has_input(void) +{ + struct xencons_interface *intf; + + intf = xencons_interface(); + + return (intf->in_cons != intf->in_prod); +} + + +int +xencons_ring_send(const char *data, unsigned len) +{ + struct xencons_interface *intf; + XENCONS_RING_IDX cons, prod; + int sent; + + intf = xencons_interface(); + cons = intf->out_cons; + prod = intf->out_prod; + sent = 0; + + mb(); + KASSERT((prod - cons) <= sizeof(intf->out), + ("console send ring inconsistent")); + + while ((sent < len) && ((prod - cons) < sizeof(intf->out))) + intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; + + wmb(); + intf->out_prod = prod; + + notify_remote_via_evtchn(xen_start_info->console_evtchn); + + return sent; + +} + + +static xencons_receiver_func *xencons_receiver; + +void +xencons_handle_input(void *unused) +{ + struct xencons_interface *intf; + XENCONS_RING_IDX cons, prod; + + mtx_lock(&cn_mtx); + intf = xencons_interface(); + + cons = intf->in_cons; + prod = intf->in_prod; + + /* XXX needs locking */ + while (cons != prod) { + xencons_rx(intf->in + MASK_XENCONS_IDX(cons, intf->in), 1); + cons++; + } + + mb(); + intf->in_cons = cons; + + notify_remote_via_evtchn(xen_start_info->console_evtchn); + + xencons_tx(); + mtx_unlock(&cn_mtx); +} + +void +xencons_ring_register_receiver(xencons_receiver_func *f) +{ + xencons_receiver = f; +} + +int +xencons_ring_init(void) +{ + int err; + + if (!xen_start_info->console_evtchn) + return 0; + + err = bind_caller_port_to_irqhandler(xen_start_info->console_evtchn, + "xencons", xencons_handle_input, NULL, + INTR_TYPE_MISC | INTR_MPSAFE, &console_irq); + if (err) { + return err; + } + + return 0; +} + +extern void xencons_suspend(void); +extern void xencons_resume(void); + +void +xencons_suspend(void) +{ + + if (!xen_start_info->console_evtchn) + return; + + unbind_from_irqhandler(console_irq); +} + +void 
+xencons_resume(void) +{ + + (void)xencons_ring_init(); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 8 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ Property changes on: dev/xen/console/xencons_ring.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/pcifront/pcifront.c =================================================================== --- dev/xen/pcifront/pcifront.c (.../stable/6/sys) (revision 0) +++ dev/xen/pcifront/pcifront.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2006, Cisco Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "pcib_if.h" + +#ifdef XEN_PCIDEV_FE_DEBUG +#define DPRINTF(fmt, args...) \ + printf("pcifront (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) ((void)0) +#endif +#define WPRINTF(fmt, args...) 
\
+	printf("pcifront (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
+
+#define INVALID_GRANT_REF (0)
+#define INVALID_EVTCHN (-1)
+#define virt_to_mfn(x) (vtomach(x) >> PAGE_SHIFT)
+
+/* Per-frontend state for one xenbus "pci" device. */
+struct pcifront_device {
+	STAILQ_ENTRY(pcifront_device) next;	/* linkage on pdev_list */
+
+	struct xenbus_device *xdev;
+
+	int unit;
+	int evtchn;		/* INVALID_EVTCHN until allocated */
+	int gnt_ref;		/* INVALID_GRANT_REF until granted */
+
+	/* Lock this when doing any operations in sh_info */
+	struct mtx sh_info_lock;
+	struct xen_pci_sharedinfo *sh_info;
+
+	device_t ndev;
+
+	int ref_cnt;		/* managed by get_pdev()/put_pdev() */
+};
+
+static STAILQ_HEAD(pcifront_dlist, pcifront_device) pdev_list = STAILQ_HEAD_INITIALIZER(pdev_list);
+
+struct xpcib_softc {
+	int domain;
+	int bus;
+	struct pcifront_device *pdev;
+};
+
+/*
+ * Allocate a PCI device structure.
+ *
+ * Parses the unit number out of xdev->nodename, allocates the device
+ * structure and its shared-info buffer, links the result onto pdev_list
+ * and stores it in xdev->data.  Returns the new structure holding one
+ * reference, or NULL on failure after reporting the error through
+ * xenbus_dev_fatal().
+ */
+static struct pcifront_device *
+alloc_pdev(struct xenbus_device *xdev)
+{
+	struct pcifront_device *pdev = NULL;
+	int err, unit;
+
+	err = sscanf(xdev->nodename, "device/pci/%d", &unit);
+	if (err != 1) {
+		if (err == 0)
+			err = -EINVAL;
+		/* pdev is still NULL here, so report against xdev itself. */
+		xenbus_dev_fatal(xdev, err, "Error scanning pci device instance number");
+		goto out;
+	}
+
+	/*
+	 * Zeroed so the embedded mutex and the fields not assigned below
+	 * start from a known state.
+	 */
+	pdev = malloc(sizeof(*pdev), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (pdev == NULL) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(xdev, err, "Error allocating pcifront_device struct");
+		goto out;
+	}
+	pdev->unit = unit;
+	pdev->xdev = xdev;
+	pdev->ref_cnt = 1;
+
+	pdev->sh_info = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
+	if (pdev->sh_info == NULL) {
+		free(pdev, M_DEVBUF);
+		pdev = NULL;
+		err = -ENOMEM;
+		xenbus_dev_fatal(xdev, err, "Error allocating sh_info struct");
+		goto out;
+	}
+	pdev->sh_info->flags = 0;
+
+	xdev->data = pdev;
+
+	mtx_init(&pdev->sh_info_lock, "info_lock", "pci shared dev info lock", MTX_DEF);
+
+	pdev->evtchn = INVALID_EVTCHN;
+	pdev->gnt_ref = INVALID_GRANT_REF;
+
+	STAILQ_INSERT_TAIL(&pdev_list, pdev, next);
+
+	DPRINTF("Allocated pdev @ 0x%p (unit=%d)\n", pdev, unit);
+
+ out:
+	return pdev;
+}
+
+/* Hold a reference to a pcifront device */
+static 
void +get_pdev(struct pcifront_device *pdev) +{ + pdev->ref_cnt++; +} + +/* Release a reference to a pcifront device */ +static void +put_pdev(struct pcifront_device *pdev) +{ + if (--pdev->ref_cnt > 0) + return; + + DPRINTF("freeing pdev @ 0x%p (ref_cnt=%d)\n", pdev, pdev->ref_cnt); + + if (pdev->evtchn != INVALID_EVTCHN) + xenbus_free_evtchn(pdev->xdev, pdev->evtchn); + + if (pdev->gnt_ref != INVALID_GRANT_REF) + gnttab_end_foreign_access(pdev->gnt_ref, 0, (void *)pdev->sh_info); + + pdev->xdev->data = NULL; + + free(pdev, M_DEVBUF); +} + + +/* Write to the xenbus info needed by backend */ +static int +pcifront_publish_info(struct pcifront_device *pdev) +{ + int err = 0; + struct xenbus_transaction *trans; + + err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info)); + if (err < 0) { + WPRINTF("error granting access to ring page\n"); + goto out; + } + + pdev->gnt_ref = err; + + err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn); + if (err) + goto out; + + do_publish: + trans = xenbus_transaction_start(); + if (IS_ERR(trans)) { + xenbus_dev_fatal(pdev->xdev, err, + "Error writing configuration for backend " + "(start transaction)"); + goto out; + } + + err = xenbus_printf(trans, pdev->xdev->nodename, + "pci-op-ref", "%u", pdev->gnt_ref); + if (!err) + err = xenbus_printf(trans, pdev->xdev->nodename, + "event-channel", "%u", pdev->evtchn); + if (!err) + err = xenbus_printf(trans, pdev->xdev->nodename, + "magic", XEN_PCI_MAGIC); + if (!err) + err = xenbus_switch_state(pdev->xdev, trans, + XenbusStateInitialised); + + if (err) { + xenbus_transaction_end(trans, 1); + xenbus_dev_fatal(pdev->xdev, err, + "Error writing configuration for backend"); + goto out; + } else { + err = xenbus_transaction_end(trans, 0); + if (err == -EAGAIN) + goto do_publish; + else if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error completing transaction for backend"); + goto out; + } + } + + out: + return err; +} + +/* The backend is now connected so complete the connection 
process on our side */
+/*
+ * Creates an "xpcife" child of nexus0 for this frontend, takes a reference
+ * on pdev for the child's ivars, switches the xenbus state to Connected and
+ * probes/attaches the child under Giant.
+ * Returns 0 on success, or a negative value if nexus0 cannot be found or
+ * the child cannot be created.
+ */
+static int
+pcifront_connect(struct pcifront_device *pdev)
+{
+	device_t nexus;
+	devclass_t nexus_devclass;
+
+	/* We will add our device as a child of the nexus0 device */
+	if (!(nexus_devclass = devclass_find("nexus")) ||
+		!(nexus = devclass_get_device(nexus_devclass, 0))) {
+		WPRINTF("could not find nexus0!\n");
+		return -1;
+	}
+
+	/* Create a newbus device representing this frontend instance */
+	pdev->ndev = BUS_ADD_CHILD(nexus, 0, "xpcife", pdev->unit);
+	if (!pdev->ndev) {
+		WPRINTF("could not create xpcife%d!\n", pdev->unit);
+		return -EFAULT;
+	}
+	/* The child keeps pdev in its ivars, so it holds its own reference. */
+	get_pdev(pdev);
+	device_set_ivars(pdev->ndev, pdev);
+
+	/* Good to go connected now */
+	xenbus_switch_state(pdev->xdev, NULL, XenbusStateConnected);
+
+	printf("pcifront: connected to %s\n", pdev->xdev->nodename);
+
+	mtx_lock(&Giant);
+	device_probe_and_attach(pdev->ndev);
+	mtx_unlock(&Giant);
+
+	return 0;
+}
+
+/*
+ * The backend is closing so process a disconnect.
+ * Moves our xenbus state to Closing unless it is already there or beyond.
+ * Returns the result of the state switch, or 0 if no switch was needed.
+ */
+static int
+pcifront_disconnect(struct pcifront_device *pdev)
+{
+	int err = 0;
+	XenbusState prev_state;
+
+	prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
+
+	if (prev_state < XenbusStateClosing) {
+		err = xenbus_switch_state(pdev->xdev, NULL, XenbusStateClosing);
+		if (!err && prev_state == XenbusStateConnected) {
+			/* TODO - need to detach the newbus devices */
+		}
+	}
+
+	return err;
+}
+
+/*
+ * Process a probe from the xenbus.
+ * Allocates the per-device state and publishes it to the backend.
+ * Returns 0 on success or a negative error code; on a publish failure the
+ * reference taken by alloc_pdev() is dropped again.
+ */
+static int
+pcifront_probe(struct xenbus_device *xdev,
+			   const struct xenbus_device_id *id)
+{
+	int err;
+	struct pcifront_device *pdev;
+
+	DPRINTF("xenbus probing\n");
+
+	pdev = alloc_pdev(xdev);
+	if (pdev == NULL) {
+		/*
+		 * A failed allocation must not be reported as a successful
+		 * probe, and there is no pdev to release.
+		 */
+		return -ENOMEM;
+	}
+
+	err = pcifront_publish_info(pdev);
+	if (err)
+		put_pdev(pdev);
+	return err;
+}
+
+/* Remove the xenbus PCI device: drop the reference held via xdev->data. */
+static int
+pcifront_remove(struct xenbus_device *xdev)
+{
+	DPRINTF("removing xenbus device node (%s)\n", xdev->nodename);
+	if (xdev->data)
+		put_pdev(xdev->data);
+	return 0;
+}
+
+/* Called by xenbus when our 
backend node changes state */
+/*
+ * Closing and Closed both trigger a disconnect; Connected completes the
+ * connection on our side.  All other backend states are ignored.
+ */
+static void
+pcifront_backend_changed(struct xenbus_device *xdev,
+						 XenbusState be_state)
+{
+	struct pcifront_device *pdev = xdev->data;
+
+	/* xdev->data is cleared by put_pdev(); nothing to act on without it. */
+	if (pdev == NULL)
+		return;
+
+	switch (be_state) {
+	case XenbusStateClosing:
+		DPRINTF("backend closing (%s)\n", xdev->nodename);
+		pcifront_disconnect(pdev);
+		break;
+
+	case XenbusStateClosed:
+		DPRINTF("backend closed (%s)\n", xdev->nodename);
+		pcifront_disconnect(pdev);
+		break;
+
+	case XenbusStateConnected:
+		DPRINTF("backend connected (%s)\n", xdev->nodename);
+		pcifront_connect(pdev);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Process PCI operation.
+ *
+ * Copies *op into the shared page, raises _XEN_PCIF_active, notifies the
+ * backend and polls the event channel until the backend clears the flag.
+ * On completion the shared op is copied back into *op and its err field is
+ * returned.  If the flag is still set once time_uptime passes the deadline,
+ * the flag is cleared locally and XEN_PCI_ERR_dev_not_found is returned
+ * without touching *op.  Serialised by pdev->sh_info_lock.
+ */
+static int
+do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
+{
+	int err = 0;
+	struct xen_pci_op *active_op = &pdev->sh_info->op;
+	evtchn_port_t port = pdev->evtchn;
+	time_t timeout;
+
+	mtx_lock(&pdev->sh_info_lock);
+
+	memcpy(active_op, op, sizeof(struct xen_pci_op));
+
+	/* Go */
+	wmb();
+	set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+	notify_remote_via_evtchn(port);
+
+	timeout = time_uptime + 2;
+
+	clear_evtchn(port);
+
+	/* Spin while waiting for the answer */
+	while (test_bit
+		   (_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)) {
+		/*
+		 * Separate name on purpose: the function-level err must
+		 * receive the timeout code assigned below.
+		 */
+		int poll_err = HYPERVISOR_poll(&port, 1, 3 * hz);
+		if (poll_err)
+			panic("Failed HYPERVISOR_poll: err=%d", poll_err);
+		clear_evtchn(port);
+		if (time_uptime > timeout) {
+			WPRINTF("pciback not responding!!!\n");
+			clear_bit(_XEN_PCIF_active,
+					  (unsigned long *)&pdev->sh_info->flags);
+			err = XEN_PCI_ERR_dev_not_found;
+			goto out;
+		}
+	}
+
+	memcpy(op, active_op, sizeof(struct xen_pci_op));
+
+	err = op->err;
+ out:
+	mtx_unlock(&pdev->sh_info_lock);
+	return err;
+}
+
+/* ** XenBus Driver registration ** */
+
+static struct xenbus_device_id pcifront_ids[] = {
+	{ "pci" },
+	{ "" }
+};
+
+static struct xenbus_driver pcifront = {
+	.name = "pcifront",
+	.ids = pcifront_ids,
+	.probe = pcifront_probe,
+	.remove = pcifront_remove,
+	.otherend_changed = pcifront_backend_changed,
+};
+
+/* 
Register the driver with xenbus during sys init */ +static void +pcifront_init(void *unused) +{ + if ((xen_start_info->flags & SIF_INITDOMAIN)) + return; + + DPRINTF("xenbus registering\n"); + + xenbus_register_frontend(&pcifront); +} + +SYSINIT(pciif, SI_SUB_PSEUDO, SI_ORDER_ANY, pcifront_init, NULL) + + +/* Newbus xpcife device driver probe */ +static int +xpcife_probe(device_t dev) +{ +#ifdef XEN_PCIDEV_FE_DEBUG + struct pcifront_device *pdev = (struct pcifront_device *)device_get_ivars(dev); + DPRINTF("xpcife probe (unit=%d)\n", pdev->unit); +#endif + return 0; +} + +/* Newbus xpcife device driver attach */ +static int +xpcife_attach(device_t dev) +{ + struct pcifront_device *pdev = (struct pcifront_device *)device_get_ivars(dev); + int i, num_roots, len, err; + char str[64]; + unsigned int domain, bus; + + DPRINTF("xpcife attach (unit=%d)\n", pdev->unit); + + err = xenbus_scanf(NULL, pdev->xdev->otherend, + "root_num", "%d", &num_roots); + if (err != 1) { + if (err == 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of PCI roots"); + goto out; + } + + /* Add a pcib device for each root */ + for (i = 0; i < num_roots; i++) { + device_t child; + + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(NULL, pdev->xdev->otherend, str, + "%x:%x", &domain, &bus); + if (err != 2) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading PCI root %d", i); + goto out; + } + err = 0; + if (domain != pdev->xdev->otherend_id) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Domain mismatch %d != %d", domain, pdev->xdev->otherend_id); + goto out; + } + + child = device_add_child(dev, "pcib", bus); + if (!child) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "Unable to create pcib%d", bus); + goto out; + } + } + + out: + return bus_generic_attach(dev); +} + +static devclass_t xpcife_devclass; + +static 
device_method_t xpcife_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, xpcife_probe), + DEVMETHOD(device_attach, xpcife_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + /* Bus interface */ + DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + {0, 0} +}; + +static driver_t xpcife_driver = { + "xpcife", + xpcife_methods, + 0, +}; + +DRIVER_MODULE(xpcife, nexus, xpcife_driver, xpcife_devclass, 0, 0); + + +/* Newbus xen pcib device driver probe */ +static int +xpcib_probe(device_t dev) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + struct pcifront_device *pdev = (struct pcifront_device *)device_get_ivars(device_get_parent(dev)); + + DPRINTF("xpcib probe (bus=%d)\n", device_get_unit(dev)); + + sc->domain = pdev->xdev->otherend_id; + sc->bus = device_get_unit(dev); + sc->pdev = pdev; + + return 0; +} + +/* Newbus xen pcib device driver attach */ +static int +xpcib_attach(device_t dev) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + + DPRINTF("xpcib attach (bus=%d)\n", sc->bus); + + device_add_child(dev, "pci", sc->bus); + return bus_generic_attach(dev); +} + +static int +xpcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + switch (which) { + case PCIB_IVAR_BUS: + *result = sc->bus; + return 0; + } + return ENOENT; +} + +/* Return the number of slots supported */ +static 
int +xpcib_maxslots(device_t dev) +{ + return 31; +} + +#define PCI_DEVFN(slot,func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) + +/* Read configuration space register */ +static u_int32_t +xpcib_read_config(device_t dev, int bus, int slot, int func, + int reg, int bytes) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + struct xen_pci_op op = { + .cmd = XEN_PCI_OP_conf_read, + .domain = sc->domain, + .bus = sc->bus, + .devfn = PCI_DEVFN(slot, func), + .offset = reg, + .size = bytes, + }; + int err; + + err = do_pci_op(sc->pdev, &op); + + DPRINTF("read config (b=%d, s=%d, f=%d, reg=%d, len=%d, val=%x, err=%d)\n", + bus, slot, func, reg, bytes, op.value, err); + + if (err) + op.value = ~0; + + return op.value; +} + +/* Write configuration space register */ +static void +xpcib_write_config(device_t dev, int bus, int slot, int func, + int reg, u_int32_t data, int bytes) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + struct xen_pci_op op = { + .cmd = XEN_PCI_OP_conf_write, + .domain = sc->domain, + .bus = sc->bus, + .devfn = PCI_DEVFN(slot, func), + .offset = reg, + .size = bytes, + .value = data, + }; + int err; + + err = do_pci_op(sc->pdev, &op); + + DPRINTF("write config (b=%d, s=%d, f=%d, reg=%d, len=%d, val=%x, err=%d)\n", + bus, slot, func, reg, bytes, data, err); +} + +static int +xpcib_route_interrupt(device_t pcib, device_t dev, int pin) +{ + struct pci_devinfo *dinfo = device_get_ivars(dev); + pcicfgregs *cfg = &dinfo->cfg; + + DPRINTF("route intr (pin=%d, line=%d)\n", pin, cfg->intline); + + return cfg->intline; +} + +static device_method_t xpcib_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, xpcib_probe), + DEVMETHOD(device_attach, xpcib_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + 
DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_read_ivar, xpcib_read_ivar), + DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + + /* pcib interface */ + DEVMETHOD(pcib_maxslots, xpcib_maxslots), + DEVMETHOD(pcib_read_config, xpcib_read_config), + DEVMETHOD(pcib_write_config, xpcib_write_config), + DEVMETHOD(pcib_route_interrupt, xpcib_route_interrupt), + { 0, 0 } +}; + +static devclass_t xpcib_devclass; + +DEFINE_CLASS_0(pcib, xpcib_driver, xpcib_methods, sizeof(struct xpcib_softc)); +DRIVER_MODULE(pcib, xpcife, xpcib_driver, xpcib_devclass, 0, 0); + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ Property changes on: dev/xen/pcifront/pcifront.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/balloon/balloon.c =================================================================== --- dev/xen/balloon/balloon.c (.../stable/6/sys) (revision 0) +++ dev/xen/balloon/balloon.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,565 @@ +/****************************************************************************** + * balloon.c + * + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. 
Smith, IBM Corporation + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver"); + +struct mtx balloon_mutex; + +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and driver_pages, and + * balloon lists. + */ +struct mtx balloon_lock; + +/* We increase/decrease in batches which fit in a page */ +static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +#define ARRAY_SIZE(A) (sizeof(A) / sizeof(A[0])) + +struct balloon_stats { + /* We aim for 'current allocation' == 'target allocation'. 
*/ + unsigned long current_pages; + unsigned long target_pages; + /* We may hit the hard limit in Xen. If we do then we remember it. */ + unsigned long hard_limit; + /* + * Drivers may alter the memory reservation independently, but they + * must inform the balloon driver so we avoid hitting the hard limit. + */ + unsigned long driver_pages; + /* Number of pages in high- and low-memory balloons. */ + unsigned long balloon_low; + unsigned long balloon_high; +}; + +static struct balloon_stats balloon_stats; +#define bs balloon_stats + +SYSCTL_DECL(_dev_xen); +SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD, NULL, "Balloon"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD, + &bs.current_pages, 0, "Current allocation"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD, + &bs.target_pages, 0, "Target allocation"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD, + &bs.driver_pages, 0, "Driver pages"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD, + &bs.hard_limit, 0, "Xen hard limit"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD, + &bs.balloon_low, 0, "Low-mem balloon"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD, + &bs.balloon_high, 0, "High-mem balloon"); + +struct balloon_entry { + vm_page_t page; + STAILQ_ENTRY(balloon_entry) list; +}; + +/* List of ballooned pages, threaded through the mem_map array. */ +static STAILQ_HEAD(,balloon_entry) ballooned_pages; + +/* Main work function, always executed in process context. */ +static void balloon_process(void *unused); + +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_mem: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_mem: " fmt, ##args) + +/* balloon_append: add the given page to the balloon. 
*/ +static void +balloon_append(vm_page_t page) +{ + struct balloon_entry *entry; + + entry = malloc(sizeof(struct balloon_entry), M_BALLOON, M_WAITOK); + entry->page = page; + STAILQ_INSERT_HEAD(&ballooned_pages, entry, list); + bs.balloon_low++; +} + +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ +static vm_page_t +balloon_retrieve(void) +{ + vm_page_t page; + struct balloon_entry *entry; + + if (STAILQ_EMPTY(&ballooned_pages)) + return NULL; + + entry = STAILQ_FIRST(&ballooned_pages); + STAILQ_REMOVE_HEAD(&ballooned_pages, list); + + page = entry->page; + free(entry, M_DEVBUF); + + bs.balloon_low--; + + return page; +} + +static void +balloon_alarm(void *unused) +{ + wakeup(balloon_process); +} + +static unsigned long +current_target(void) +{ + unsigned long target = min(bs.target_pages, bs.hard_limit); + if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high)) + target = bs.current_pages + bs.balloon_low + bs.balloon_high; + return target; +} + +static unsigned long +minimum_target(void) +{ +#ifdef XENHVM +#define max_pfn physmem +#endif + unsigned long min_pages, curr_pages = current_target(); + +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + /* Simple continuous piecewiese linear function: + * max MiB -> min MiB gradient + * 0 0 + * 16 16 + * 32 24 + * 128 72 (1/2) + * 512 168 (1/4) + * 2048 360 (1/8) + * 8192 552 (1/32) + * 32768 1320 + * 131072 4392 + */ + if (max_pfn < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (max_pfn >> 1); + else if (max_pfn < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (max_pfn >> 2); + else if (max_pfn < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (max_pfn >> 3); + else + min_pages = MB2PAGES(296) + (max_pfn >> 5); +#undef MB2PAGES + + /* Don't enforce growth */ + return min(min_pages, curr_pages); +#ifndef CONFIG_XEN +#undef max_pfn +#endif +} + +static int +increase_reservation(unsigned long nr_pages) +{ + unsigned long pfn, i; + struct balloon_entry *entry; + vm_page_t page; + long 
rc;
+	struct xen_memory_reservation reservation = {
+		.address_bits = 0,
+		.extent_order = 0,
+		.domid        = DOMID_SELF
+	};
+
+	/* frame_list is a single page, so work in batches of that size. */
+	if (nr_pages > ARRAY_SIZE(frame_list))
+		nr_pages = ARRAY_SIZE(frame_list);
+
+	mtx_lock(&balloon_lock);
+
+	/* Collect the frame numbers of the first nr_pages ballooned pages. */
+	for (entry = STAILQ_FIRST(&ballooned_pages), i = 0;
+	     i < nr_pages; i++, entry = STAILQ_NEXT(entry, list)) {
+		KASSERT(entry, ("ballooned_pages list corrupt"));
+		page = entry->page;
+		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
+	}
+
+	set_xen_guest_handle(reservation.extent_start, frame_list);
+	reservation.nr_extents   = nr_pages;
+	rc = HYPERVISOR_memory_op(
+		XENMEM_populate_physmap, &reservation);
+	if (rc < 0) {
+		/*
+		 * The hypercall failed outright.  Comparing the signed rc
+		 * against the unsigned nr_pages would convert the error to
+		 * a huge positive count and fall through to the success
+		 * path, so handle it first.  A non-zero return tells
+		 * balloon_process() to back off instead of spinning.
+		 */
+		mtx_unlock(&balloon_lock);
+		return (1);
+	}
+	if ((unsigned long)rc < nr_pages) {
+		if (rc > 0) {
+			int ret;
+
+			/* We hit the Xen hard limit: reprobe. */
+			reservation.nr_extents = rc;
+			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+					&reservation);
+			KASSERT(ret == rc, ("HYPERVISOR_memory_op failed"));
+		}
+		/* rc is known to be >= 0 here. */
+		bs.hard_limit = (bs.current_pages + rc -
+				 bs.driver_pages);
+		goto out;
+	}
+
+	/* Every requested frame was populated: hand the pages back. */
+	for (i = 0; i < nr_pages; i++) {
+		page = balloon_retrieve();
+		KASSERT(page, ("balloon_retrieve failed"));
+
+		pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
+		KASSERT((xen_feature(XENFEAT_auto_translated_physmap) ||
+			 !phys_to_machine_mapping_valid(pfn)),
+		    ("auto translated physmap but mapping is valid"));
+
+		set_phys_to_machine(pfn, frame_list[i]);
+
+#ifndef XENHVM
+		/* Link back into the page tables if not highmem. */
+		if (pfn < max_low_pfn) {
+			int ret;
+			ret = HYPERVISOR_update_va_mapping(
+				(unsigned long)__va(pfn << PAGE_SHIFT),
+				pfn_pte_ma(frame_list[i], PAGE_KERNEL),
+				0);
+			/* Was PASSING(); use KASSERT like the other checks. */
+			KASSERT(ret == 0,
+				("HYPERVISOR_update_va_mapping failed"));
+		}
+#endif
+
+		/* Relinquish the page back to the allocator.
*/ + vm_page_unwire(page, 0); + vm_page_free(page); + } + + bs.current_pages += nr_pages; + //totalram_pages = bs.current_pages; + + out: + mtx_unlock(&balloon_lock); + + return 0; +} + +static int +decrease_reservation(unsigned long nr_pages) +{ + unsigned long pfn, i; + vm_page_t page; + int need_sleep = 0; + int ret; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + for (i = 0; i < nr_pages; i++) { + int color = 0; + if ((page = vm_page_alloc(NULL, color++, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + nr_pages = i; + need_sleep = 1; + break; + } + + pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); + frame_list[i] = PFNTOMFN(pfn); + +#if 0 + if (!PageHighMem(page)) { + v = phys_to_virt(pfn << PAGE_SHIFT); + scrub_pages(v, 1); +#ifdef CONFIG_XEN + ret = HYPERVISOR_update_va_mapping( + (unsigned long)v, __pte_ma(0), 0); + BUG_ON(ret); +#endif + } +#endif +#ifdef CONFIG_XEN_SCRUB_PAGES + else { + v = kmap(page); + scrub_pages(v, 1); + kunmap(page); + } +#endif + } + +#ifdef CONFIG_XEN + /* Ensure that ballooned highmem pages don't have kmaps. */ + kmap_flush_unused(); + flush_tlb_all(); +#endif + + mtx_lock(&balloon_lock); + + /* No more mappings: invalidate P2M and add to balloon. 
*/ + for (i = 0; i < nr_pages; i++) { + pfn = MFNTOPFN(frame_list[i]); + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + balloon_append(PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT)); + } + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed")); + + bs.current_pages -= nr_pages; + //totalram_pages = bs.current_pages; + + mtx_unlock(&balloon_lock); + + return (need_sleep); +} + +/* + * We avoid multiple worker processes conflicting via the balloon mutex. + * We may of course race updates of the target counts (which are protected + * by the balloon lock), or with changes to the Xen hard limit, but we will + * recover from these in time. + */ +static void +balloon_process(void *unused) +{ + int need_sleep = 0; + long credit; + + mtx_lock(&balloon_mutex); + for (;;) { + do { + credit = current_target() - bs.current_pages; + if (credit > 0) + need_sleep = (increase_reservation(credit) != 0); + if (credit < 0) + need_sleep = (decrease_reservation(-credit) != 0); + + } while ((credit != 0) && !need_sleep); + + /* Schedule more work if there is some still to be done. */ + if (current_target() != bs.current_pages) + timeout(balloon_alarm, NULL, ticks + hz); + + msleep(balloon_process, &balloon_mutex, 0, "balloon", -1); + } + mtx_unlock(&balloon_mutex); +} + +/* Resets the Xen limit, sets new target, and kicks off processing. */ +static void +set_new_target(unsigned long target) +{ + /* No need for lock. Not read-modify-write updates. 
*/ + bs.hard_limit = ~0UL; + bs.target_pages = max(target, minimum_target()); + wakeup(balloon_process); +} + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" +}; + +/* React to a change in the target key */ +static void +watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(XBT_NIL, "memory", "target", NULL, + "%llu", &new_target); + if (err) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. + */ + set_new_target(new_target >> (PAGE_SHIFT - 10)); + +} + +static void +balloon_init_watcher(void *arg) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + printf("Failed to set balloon watcher\n"); + +} +SYSINIT(balloon_init_watcher, SI_SUB_PSEUDO, SI_ORDER_ANY, + balloon_init_watcher, NULL); + +static void +balloon_init(void *arg) +{ +#ifndef XENHVM + vm_page_t page; +#endif + + if (!is_running_on_xen()) + return; + + mtx_init(&balloon_lock, "balloon_lock", NULL, MTX_DEF); + mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF); + +#ifndef XENHVM + bs.current_pages = min(xen_start_info->nr_pages, max_pfn); +#else + bs.current_pages = physmem; +#endif + bs.target_pages = bs.current_pages; + bs.balloon_low = 0; + bs.balloon_high = 0; + bs.driver_pages = 0UL; + bs.hard_limit = ~0UL; + + kthread_create(balloon_process, NULL, NULL, 0, 0, "balloon"); +// init_timer(&balloon_timer); +// balloon_timer.data = 0; +// balloon_timer.function = balloon_alarm; + +#ifndef XENHVM + /* Initialise the balloon with excess memory space. 
*/ + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { + page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT); + balloon_append(page); + } +#endif + + target_watch.callback = watch_target; + + return; +} +SYSINIT(balloon_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, balloon_init, NULL); + +void balloon_update_driver_allowance(long delta); + +void +balloon_update_driver_allowance(long delta) +{ + mtx_lock(&balloon_lock); + bs.driver_pages += delta; + mtx_unlock(&balloon_lock); +} + +#if 0 +static int dealloc_pte_fn( + pte_t *pte, struct page *pte_page, unsigned long addr, void *data) +{ + unsigned long mfn = pte_mfn(*pte); + int ret; + struct xen_memory_reservation reservation = { + .extent_start = &mfn, + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_pte_at(&init_mm, addr, pte, __pte_ma(0)); + set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + KASSERT(ret == 1, ("HYPERVISOR_memory_op failed")); + return 0; +} + +#endif + +#if 0 +vm_page_t +balloon_alloc_empty_page_range(unsigned long nr_pages) +{ + vm_page_t pages; + int i, rc; + unsigned long *mfn_list; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + pages = vm_page_alloc_contig(nr_pages, 0, -1, 4, 4) + if (pages == NULL) + return NULL; + + mfn_list = malloc(nr_pages*sizeof(unsigned long), M_DEVBUF, M_WAITOK); + + for (i = 0; i < nr_pages; i++) { + mfn_list[i] = PFNTOMFN(VM_PAGE_TO_PHYS(pages[i]) >> PAGE_SHIFT); + PFNTOMFN(i) = INVALID_P2M_ENTRY; + reservation.extent_start = mfn_list; + reservation.nr_extents = nr_pages; + rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + KASSERT(rc == nr_pages, ("HYPERVISOR_memory_op failed")); + } + + current_pages -= nr_pages; + + wakeup(balloon_process); + + return pages; +} + +void +balloon_dealloc_empty_page_range(vm_page_t page, unsigned long nr_pages) +{ + unsigned long i; 
+ + for (i = 0; i < nr_pages; i++) + balloon_append(page + i); + + wakeup(balloon_process); +} +#endif Property changes on: dev/xen/balloon/balloon.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/xenpci/machine_reboot.c =================================================================== --- dev/xen/xenpci/machine_reboot.c (.../stable/6/sys) (revision 0) +++ dev/xen/xenpci/machine_reboot.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 2008 Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include + +#include + +void +xen_suspend() +{ + int suspend_cancelled; + + if (DEVICE_SUSPEND(root_bus)) { + printf("xen_suspend: device_suspend failed\n"); + return; + } + + /* + * Make sure we don't change cpus or switch to some other + * thread. for the duration. + */ + critical_enter(); + + /* + * Prevent any races with evtchn_interrupt() handler. + */ + irq_suspend(); + disable_intr(); + + suspend_cancelled = HYPERVISOR_suspend(0); + if (!suspend_cancelled) + xenpci_resume(); + + /* + * Re-enable interrupts and put the scheduler back to normal. + */ + enable_intr(); + critical_exit(); + + /* + * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or + * similar. + */ + if (!suspend_cancelled) + DEVICE_RESUME(root_bus); +} Property changes on: dev/xen/xenpci/machine_reboot.c ___________________________________________________________________ Added: svn:keywords + FreeBSD=%H Index: dev/xen/xenpci/xenpcivar.h =================================================================== --- dev/xen/xenpci/xenpcivar.h (.../stable/6/sys) (revision 0) +++ dev/xen/xenpci/xenpcivar.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2008 Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * One of these per allocated device. + */ +struct xenpci_softc { + int rid_ioport; + int rid_memory; + int rid_irq; + struct resource* res_memory; /* Resource for mem range. */ + struct resource* res_irq; /* Resource for irq range. */ + void *intr_cookie; + + vm_paddr_t phys_next; /* next page from mem range */ +}; + +extern int xenpci_irq_init(device_t device, struct xenpci_softc *scp); +extern int xenpci_alloc_space(size_t sz, vm_paddr_t *pa); +extern void xenpci_resume(void); +extern void xen_suspend(void); Property changes on: dev/xen/xenpci/xenpcivar.h ___________________________________________________________________ Added: svn:keywords + FreeBSD=%H Index: dev/xen/xenpci/xenpci.c =================================================================== --- dev/xen/xenpci/xenpci.c (.../stable/6/sys) (revision 0) +++ dev/xen/xenpci/xenpci.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2008 Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +/* + * These variables are used by the rest of the kernel to access the + * hypervisor. + */ +char *hypercall_stubs; +shared_info_t *HYPERVISOR_shared_info; +static vm_paddr_t shared_info_pa; + +/* + * This is used to find our platform device instance. + */ +static devclass_t xenpci_devclass; + +/* + * Return the CPUID base address for Xen functions. 
+ */ +static uint32_t +xenpci_cpuid_base(void) +{ + uint32_t base, regs[4]; + + for (base = 0x40000000; base < 0x40001000; base += 0x100) { + do_cpuid(base, regs); + if (!memcmp("XenVMMXenVMM", ®s[1], 12) + && (regs[0] - base) >= 2) + return (base); + } + return (0); +} + +/* + * Allocate and fill in the hypcall page. + */ +static int +xenpci_init_hypercall_stubs(device_t dev, struct xenpci_softc * scp) +{ + uint32_t base, regs[4]; + int i; + + base = xenpci_cpuid_base(); + if (!base) { + device_printf(dev, "Xen platform device but not Xen VMM\n"); + return (EINVAL); + } + + if (bootverbose) { + do_cpuid(base + 1, regs); + device_printf(dev, "Xen version %d.%d.\n", + regs[0] >> 16, regs[0] & 0xffff); + } + + /* + * Find the hypercall pages. + */ + do_cpuid(base + 2, regs); + + hypercall_stubs = malloc(regs[0] * PAGE_SIZE, M_TEMP, M_WAITOK); + + for (i = 0; i < regs[0]; i++) { + wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i); + } + + return (0); +} + +/* + * After a resume, re-initialise the hypercall page. + */ +static void +xenpci_resume_hypercall_stubs(device_t dev, struct xenpci_softc * scp) +{ + uint32_t base, regs[4]; + int i; + + base = xenpci_cpuid_base(); + + do_cpuid(base + 2, regs); + for (i = 0; i < regs[0]; i++) { + wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i); + } +} + +/* + * Tell the hypervisor how to contact us for event channel callbacks. + */ +static void +xenpci_set_callback(device_t dev) +{ + int irq; + uint64_t callback; + struct xen_hvm_param xhp; + + irq = pci_get_irq(dev); + if (irq < 16) { + callback = irq; + } else { + callback = (pci_get_intpin(dev) - 1) & 3; + callback |= pci_get_slot(dev) << 11; + callback |= 1ull << 56; + } + + xhp.domid = DOMID_SELF; + xhp.index = HVM_PARAM_CALLBACK_IRQ; + xhp.value = callback; + if (HYPERVISOR_hvm_op(HVMOP_set_param, &xhp)) + panic("Can't set evtchn callback"); +} + + +/* + * Deallocate anything allocated by xenpci_allocate_resources. 
+ */ +static int +xenpci_deallocate_resources(device_t dev) +{ + struct xenpci_softc *scp = device_get_softc(dev); + + if (scp->res_irq != 0) { + bus_deactivate_resource(dev, SYS_RES_IRQ, + scp->rid_irq, scp->res_irq); + bus_release_resource(dev, SYS_RES_IRQ, + scp->rid_irq, scp->res_irq); + scp->res_irq = 0; + } + if (scp->res_memory != 0) { + bus_deactivate_resource(dev, SYS_RES_MEMORY, + scp->rid_memory, scp->res_memory); + bus_release_resource(dev, SYS_RES_MEMORY, + scp->rid_memory, scp->res_memory); + scp->res_memory = 0; + } + + return (0); +} + +/* + * Allocate irq and memory resources. + */ +static int +xenpci_allocate_resources(device_t dev) +{ + struct xenpci_softc *scp = device_get_softc(dev); + + scp->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &scp->rid_irq, RF_SHAREABLE|RF_ACTIVE); + if (scp->res_irq == NULL) + goto errexit; + + scp->rid_memory = PCIR_BAR(1); + scp->res_memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &scp->rid_memory, RF_ACTIVE); + if (scp->res_memory == NULL) + goto errexit; + return (0); + +errexit: + /* Cleanup anything we may have assigned. */ + xenpci_deallocate_resources(dev); + return (ENXIO); /* For want of a better idea. */ +} + +/* + * Allocate a physical address range from our mmio region. + */ +static int +xenpci_alloc_space_int(struct xenpci_softc *scp, size_t sz, + vm_paddr_t *pa) +{ + + if (scp->phys_next + sz > rman_get_end(scp->res_memory)) { + return (ENOMEM); + } + + *pa = scp->phys_next; + scp->phys_next += sz; + + return (0); +} + +/* + * Allocate a physical address range from our mmio region. + */ +int +xenpci_alloc_space(size_t sz, vm_paddr_t *pa) +{ + device_t dev = devclass_get_device(xenpci_devclass, 0); + + if (dev) { + return (xenpci_alloc_space_int(device_get_softc(dev), + sz, pa)); + } else { + return (ENOMEM); + } +} + +/* + * Called very early in the resume sequence - reinitialise the various + * bits of Xen machinery including the hypercall page and the shared + * info page. 
+ */ +void +xenpci_resume() +{ + device_t dev = devclass_get_device(xenpci_devclass, 0); + struct xenpci_softc *scp = device_get_softc(dev); + struct xen_add_to_physmap xatp; + + xenpci_resume_hypercall_stubs(dev, scp); + + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = shared_info_pa >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + panic("HYPERVISOR_memory_op failed"); + + pmap_kenter((vm_offset_t) HYPERVISOR_shared_info, shared_info_pa); + + xenpci_set_callback(dev); + + gnttab_resume(); + irq_resume(); +} + +/* + * Probe - just check device ID. + */ +static int +xenpci_probe(device_t dev) +{ + + if (pci_get_devid(dev) != 0x00015853) + return (ENXIO); + + device_set_desc(dev, "Xen Platform Device"); + return (bus_generic_probe(dev)); +} + +/* + * Attach - find resources and talk to Xen. + */ +static int +xenpci_attach(device_t dev) +{ + int error; + struct xenpci_softc *scp = device_get_softc(dev); + struct xen_add_to_physmap xatp; + vm_offset_t shared_va; + + error = xenpci_allocate_resources(dev); + if (error) + goto errexit; + + scp->phys_next = rman_get_start(scp->res_memory); + + error = xenpci_init_hypercall_stubs(dev, scp); + if (error) + goto errexit; + + setup_xen_features(); + + xenpci_alloc_space_int(scp, PAGE_SIZE, &shared_info_pa); + + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = shared_info_pa >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + panic("HYPERVISOR_memory_op failed"); + + shared_va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); + pmap_kenter(shared_va, shared_info_pa); + HYPERVISOR_shared_info = (void *) shared_va; + + /* + * Hook the irq up to evtchn + */ + xenpci_irq_init(dev, scp); + xenpci_set_callback(dev); + + return (bus_generic_attach(dev)); + +errexit: + /* + * Undo anything we may have done. 
+ */ + xenpci_deallocate_resources(dev); + return (error); +} + +/* + * Detach - reverse anything done by attach. + */ +static int +xenpci_detach(device_t dev) +{ + struct xenpci_softc *scp = device_get_softc(dev); + device_t parent = device_get_parent(dev); + + /* + * Take our interrupt handler out of the list of handlers + * that can handle this irq. + */ + if (scp->intr_cookie != NULL) { + if (BUS_TEARDOWN_INTR(parent, dev, + scp->res_irq, scp->intr_cookie) != 0) + printf("intr teardown failed.. continuing\n"); + scp->intr_cookie = NULL; + } + + /* + * Deallocate any system resources we may have + * allocated on behalf of this driver. + */ + return (xenpci_deallocate_resources(dev)); +} + +static device_method_t xenpci_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, xenpci_probe), + DEVMETHOD(device_attach, xenpci_attach), + DEVMETHOD(device_detach, xenpci_detach), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_add_child, bus_generic_add_child), + + { 0, 0 } +}; + +static driver_t xenpci_driver = { + "xenpci", + xenpci_methods, + sizeof(struct xenpci_softc), +}; + +DRIVER_MODULE(xenpci, pci, xenpci_driver, xenpci_devclass, 0, 0); Property changes on: dev/xen/xenpci/xenpci.c ___________________________________________________________________ Added: svn:keywords + FreeBSD=%H Index: dev/xen/xenpci/evtchn.c =================================================================== --- dev/xen/xenpci/evtchn.c (.../stable/6/sys) (revision 0) +++ dev/xen/xenpci/evtchn.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,418 @@ +/****************************************************************************** + * evtchn.c + * + * A simplified event channel for para-drivers in unmodified linux + * + * Copyright (c) 2002-2005, K A Fraser + * Copyright (c) 2005, Intel Corporation + * + * This file may be distributed separately from the Linux kernel, or + * incorporated 
into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +static inline unsigned long __ffs(unsigned long word) +{ + __asm__("bsfq %1,%0" + :"=r" (word) + :"rm" (word)); + return word; +} + +#define is_valid_evtchn(x) ((x) != 0) +#define evtchn_from_irq(x) (irq_evtchn[irq].evtchn) + +static struct { + struct mtx lock; + driver_intr_t *handler; + void *arg; + int evtchn; + int close:1; /* close on unbind_from_irqhandler()? */ + int inuse:1; + int in_handler:1; + int mpsafe:1; +} irq_evtchn[256]; +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... 
NR_EVENT_CHANNELS-1] = -1 }; + +static struct mtx irq_alloc_lock; +static device_t xenpci_device; + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +static unsigned int +alloc_xen_irq(void) +{ + static int warned; + unsigned int irq; + + mtx_lock(&irq_alloc_lock); + + for (irq = 1; irq < ARRAY_SIZE(irq_evtchn); irq++) { + if (irq_evtchn[irq].inuse) + continue; + irq_evtchn[irq].inuse = 1; + mtx_unlock(&irq_alloc_lock); + return irq; + } + + if (!warned) { + warned = 1; + printf("alloc_xen_irq: No available IRQ to bind to: " + "increase irq_evtchn[] size in evtchn.c.\n"); + } + + mtx_unlock(&irq_alloc_lock); + + return -ENOSPC; +} + +static void +free_xen_irq(int irq) +{ + + mtx_lock(&irq_alloc_lock); + irq_evtchn[irq].inuse = 0; + mtx_unlock(&irq_alloc_lock); +} + +int +irq_to_evtchn_port(int irq) +{ + + return irq_evtchn[irq].evtchn; +} + +void +mask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + synch_set_bit(port, &s->evtchn_mask[0]); +} + +void +unmask_evtchn(int port) +{ + evtchn_unmask_t op = { .port = port }; + + HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &op); +} + +int +bind_listening_port_to_irqhandler(unsigned int remote_domain, + const char *devname, driver_intr_t handler, void *arg, + unsigned long irqflags, unsigned int *irqp) +{ + struct evtchn_alloc_unbound alloc_unbound; + unsigned int irq; + int error; + + irq = alloc_xen_irq(); + if (irq < 0) + return irq; + + mtx_lock(&irq_evtchn[irq].lock); + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = remote_domain; + error = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (error) { + mtx_unlock(&irq_evtchn[irq].lock); + free_xen_irq(irq); + return (-error); + } + + irq_evtchn[irq].handler = handler; + irq_evtchn[irq].arg = arg; + irq_evtchn[irq].evtchn = alloc_unbound.port; + irq_evtchn[irq].close = 1; + irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0; + + evtchn_to_irq[alloc_unbound.port] = irq; + + 
unmask_evtchn(alloc_unbound.port); + + mtx_unlock(&irq_evtchn[irq].lock); + + if (irqp) + *irqp = irq; + return (0); +} + +int +bind_caller_port_to_irqhandler(unsigned int caller_port, + const char *devname, driver_intr_t handler, void *arg, + unsigned long irqflags, unsigned int *irqp) +{ + unsigned int irq; + + irq = alloc_xen_irq(); + if (irq < 0) + return irq; + + mtx_lock(&irq_evtchn[irq].lock); + + irq_evtchn[irq].handler = handler; + irq_evtchn[irq].arg = arg; + irq_evtchn[irq].evtchn = caller_port; + irq_evtchn[irq].close = 0; + irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0; + + evtchn_to_irq[caller_port] = irq; + + unmask_evtchn(caller_port); + + mtx_unlock(&irq_evtchn[irq].lock); + + if (irqp) + *irqp = irq; + return (0); +} + +void +unbind_from_irqhandler(unsigned int irq) +{ + int evtchn; + + mtx_lock(&irq_evtchn[irq].lock); + + evtchn = evtchn_from_irq(irq); + + if (is_valid_evtchn(evtchn)) { + evtchn_to_irq[evtchn] = -1; + mask_evtchn(evtchn); + if (irq_evtchn[irq].close) { + struct evtchn_close close = { .port = evtchn }; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + panic("EVTCHNOP_close failed"); + } + } + + irq_evtchn[irq].handler = NULL; + irq_evtchn[irq].evtchn = 0; + + mtx_unlock(&irq_evtchn[irq].lock); + + while (irq_evtchn[irq].in_handler) + cpu_relax(); + + free_xen_irq(irq); +} + +void notify_remote_via_irq(int irq) +{ + int evtchn; + + evtchn = evtchn_from_irq(irq); + if (is_valid_evtchn(evtchn)) + notify_remote_via_evtchn(evtchn); +} + +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]); +} + +static void +evtchn_interrupt(void *arg) +{ + unsigned int l1i, l2i, port; + unsigned long masked_l1, masked_l2; + /* XXX: All events are bound to vcpu0 but irq may be redirected. 
*/ + int cpu = 0; /*smp_processor_id();*/ + driver_intr_t *handler; + void *handler_arg; + int irq, handler_mpsafe; + shared_info_t *s = HYPERVISOR_shared_info; + vcpu_info_t *v = &s->vcpu_info[cpu]; + struct pcpu *pc = pcpu_find(cpu); + unsigned long l1, l2; + + v->evtchn_upcall_pending = 0; + +#if 0 +#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ + /* Clear master flag /before/ clearing selector flag. */ + wmb(); +#endif +#endif + + l1 = atomic_readandclear_long(&v->evtchn_pending_sel); + + l1i = pc->pc_last_processed_l1i; + l2i = pc->pc_last_processed_l2i; + + while (l1 != 0) { + + l1i = (l1i + 1) % LONG_BIT; + masked_l1 = l1 & ((~0UL) << l1i); + + if (masked_l1 == 0) { /* if we masked out all events, wrap around to the beginning */ + l1i = LONG_BIT - 1; + l2i = LONG_BIT - 1; + continue; + } + l1i = __ffs(masked_l1); + + do { + l2 = active_evtchns(cpu, s, l1i); + + l2i = (l2i + 1) % LONG_BIT; + masked_l2 = l2 & ((~0UL) << l2i); + + if (masked_l2 == 0) { /* if we masked out all events, move on */ + l2i = LONG_BIT - 1; + break; + } + l2i = __ffs(masked_l2); + + /* process port */ + port = (l1i * LONG_BIT) + l2i; + synch_clear_bit(port, &s->evtchn_pending[0]); + + irq = evtchn_to_irq[port]; + if (irq < 0) + continue; + + mtx_lock(&irq_evtchn[irq].lock); + handler = irq_evtchn[irq].handler; + handler_arg = irq_evtchn[irq].arg; + handler_mpsafe = irq_evtchn[irq].mpsafe; + if (unlikely(handler == NULL)) { + printf("Xen IRQ%d (port %d) has no handler!\n", + irq, port); + mtx_unlock(&irq_evtchn[irq].lock); + continue; + } + irq_evtchn[irq].in_handler = 1; + mtx_unlock(&irq_evtchn[irq].lock); + + //local_irq_enable(); + if (!handler_mpsafe) + mtx_lock(&Giant); + handler(handler_arg); + if (!handler_mpsafe) + mtx_unlock(&Giant); + //local_irq_disable(); + + mtx_lock(&irq_evtchn[irq].lock); + irq_evtchn[irq].in_handler = 0; + mtx_unlock(&irq_evtchn[irq].lock); + + /* if this is the final port processed, we'll pick up here+1 next time */ + 
pc->pc_last_processed_l1i = l1i; + pc->pc_last_processed_l2i = l2i; + + } while (l2i != LONG_BIT - 1); + + l2 = active_evtchns(cpu, s, l1i); + if (l2 == 0) /* we handled all ports, so we can clear the selector bit */ + l1 &= ~(1UL << l1i); + } +} + +void +irq_suspend(void) +{ + struct xenpci_softc *scp = device_get_softc(xenpci_device); + + /* + * Take our interrupt handler out of the list of handlers + * that can handle this irq. + */ + if (scp->intr_cookie != NULL) { + if (BUS_TEARDOWN_INTR(device_get_parent(xenpci_device), + xenpci_device, scp->res_irq, scp->intr_cookie) != 0) + printf("intr teardown failed.. continuing\n"); + scp->intr_cookie = NULL; + } +} + +void +irq_resume(void) +{ + struct xenpci_softc *scp = device_get_softc(xenpci_device); + int evtchn, irq; + + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) { + mask_evtchn(evtchn); + evtchn_to_irq[evtchn] = -1; + } + + for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) + irq_evtchn[irq].evtchn = 0; + + BUS_SETUP_INTR(device_get_parent(xenpci_device), + xenpci_device, scp->res_irq, INTR_TYPE_MISC, + evtchn_interrupt, NULL, &scp->intr_cookie); +} + +int +xenpci_irq_init(device_t device, struct xenpci_softc *scp) +{ + int irq, cpu; + int error; + + mtx_init(&irq_alloc_lock, "xen-irq-lock", NULL, MTX_DEF); + + for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) + mtx_init(&irq_evtchn[irq].lock, "irq-evtchn", NULL, MTX_DEF); + + for (cpu = 0; cpu < mp_ncpus; cpu++) { + pcpu_find(cpu)->pc_last_processed_l1i = LONG_BIT - 1; + pcpu_find(cpu)->pc_last_processed_l2i = LONG_BIT - 1; + } + + error = BUS_SETUP_INTR(device_get_parent(device), device, + scp->res_irq, INTR_MPSAFE|INTR_TYPE_MISC, evtchn_interrupt, NULL, + &scp->intr_cookie); + if (error) + return (error); + + xenpci_device = device; + + return (0); +} Property changes on: dev/xen/xenpci/evtchn.c ___________________________________________________________________ Added: svn:keywords + FreeBSD=%H Index: dev/xen/evtchn/evtchn_dev.c 
=================================================================== --- dev/xen/evtchn/evtchn_dev.c (.../stable/6/sys) (revision 0) +++ dev/xen/evtchn/evtchn_dev.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,394 @@ +/****************************************************************************** + * evtchn.c + * + * Xenolinux driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004, K A Fraser + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +typedef struct evtchn_sotfc { + + struct selinfo ev_rsel; +} evtchn_softc_t; + + +#ifdef linuxcrap +/* NB. This must be shared amongst drivers if more things go in /dev/xen */ +static devfs_handle_t xen_dev_dir; +#endif + +/* Only one process may open /dev/xen/evtchn at any time. */ +static unsigned long evtchn_dev_inuse; + +/* Notification ring, accessed via /dev/xen/evtchn. */ + +#define EVTCHN_RING_SIZE 2048 /* 2048 16-bit entries */ + +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) +static uint16_t *ring; +static unsigned int ring_cons, ring_prod, ring_overflow; + +/* Which ports is user-space bound to? 
*/ +static uint32_t bound_ports[32]; + +/* Unique address for processes to sleep on */ +static void *evtchn_waddr = ˚ + +static struct mtx lock, upcall_lock; + +static d_read_t evtchn_read; +static d_write_t evtchn_write; +static d_ioctl_t evtchn_ioctl; +static d_poll_t evtchn_poll; +static d_open_t evtchn_open; +static d_close_t evtchn_close; + + +void +evtchn_device_upcall(int port) +{ + mtx_lock(&upcall_lock); + + mask_evtchn(port); + clear_evtchn(port); + + if ( ring != NULL ) { + if ( (ring_prod - ring_cons) < EVTCHN_RING_SIZE ) { + ring[EVTCHN_RING_MASK(ring_prod)] = (uint16_t)port; + if ( ring_cons == ring_prod++ ) { + wakeup(evtchn_waddr); + } + } + else { + ring_overflow = 1; + } + } + + mtx_unlock(&upcall_lock); +} + +static void +__evtchn_reset_buffer_ring(void) +{ + /* Initialise the ring to empty. Clear errors. */ + ring_cons = ring_prod = ring_overflow = 0; +} + +static int +evtchn_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + int rc; + unsigned int count, c, p, sst = 0, bytes1 = 0, bytes2 = 0; + count = uio->uio_resid; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + for ( ; ; ) { + if ( (c = ring_cons) != (p = ring_prod) ) + break; + + if ( ring_overflow ) { + rc = EFBIG; + goto out; + } + + if (sst != 0) { + rc = EINTR; + goto out; + } + + /* PCATCH == check for signals before and after sleeping + * PWAIT == priority of waiting on resource + */ + sst = tsleep(evtchn_waddr, PWAIT|PCATCH, "evchwt", 10); + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if ( ((c ^ p) & EVTCHN_RING_SIZE) != 0 ) { + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * sizeof(uint16_t); + bytes2 = EVTCHN_RING_MASK(p) * sizeof(uint16_t); + } + else { + bytes1 = (p - c) * sizeof(uint16_t); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. 
*/ + if ( bytes1 > count ) { + bytes1 = count; + bytes2 = 0; + } + else if ( (bytes1 + bytes2) > count ) { + bytes2 = count - bytes1; + } + + if ( uiomove(&ring[EVTCHN_RING_MASK(c)], bytes1, uio) || + ((bytes2 != 0) && uiomove(&ring[0], bytes2, uio))) + /* keeping this around as its replacement is not equivalent + * copyout(&ring[0], &buf[bytes1], bytes2) + */ + { + rc = EFAULT; + goto out; + } + + ring_cons += (bytes1 + bytes2) / sizeof(uint16_t); + + rc = bytes1 + bytes2; + + out: + + return rc; +} + +static int +evtchn_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + int rc, i, count; + + count = uio->uio_resid; + + uint16_t *kbuf = (uint16_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + + + if ( kbuf == NULL ) + return ENOMEM; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + if ( uiomove(kbuf, count, uio) != 0 ) { + rc = EFAULT; + goto out; + } + + mtx_lock_spin(&lock); + for ( i = 0; i < (count/2); i++ ) + if ( test_bit(kbuf[i], &bound_ports[0]) ) + unmask_evtchn(kbuf[i]); + mtx_unlock_spin(&lock); + + rc = count; + + out: + free(kbuf, M_DEVBUF); + return rc; +} + +static int +evtchn_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg, + int mode, struct thread *td __unused) +{ + int rc = 0; + + mtx_lock_spin(&lock); + + switch ( cmd ) + { + case EVTCHN_RESET: + __evtchn_reset_buffer_ring(); + break; + case EVTCHN_BIND: + if ( !synch_test_and_set_bit((int)arg, &bound_ports[0]) ) + unmask_evtchn((int)arg); + else + rc = EINVAL; + break; + case EVTCHN_UNBIND: + if ( synch_test_and_clear_bit((int)arg, &bound_ports[0]) ) + mask_evtchn((int)arg); + else + rc = EINVAL; + break; + default: + rc = ENOSYS; + break; + } + + mtx_unlock_spin(&lock); + + return rc; +} + +static int +evtchn_poll(struct cdev *dev, int poll_events, struct thread *td) +{ + + evtchn_softc_t *sc; + unsigned int mask = POLLOUT | POLLWRNORM; + + sc = dev->si_drv1; + + if ( ring_cons != 
ring_prod ) + mask |= POLLIN | POLLRDNORM; + else if ( ring_overflow ) + mask = POLLERR; + else + selrecord(td, &sc->ev_rsel); + + + return mask; +} + + +static int +evtchn_open(struct cdev *dev, int flag, int otyp, struct thread *td) +{ + uint16_t *_ring; + + if (flag & O_NONBLOCK) + return EBUSY; + + if ( synch_test_and_set_bit(0, &evtchn_dev_inuse) ) + return EBUSY; + + if ( (_ring = (uint16_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK)) == NULL ) + return ENOMEM; + + mtx_lock_spin(&lock); + ring = _ring; + __evtchn_reset_buffer_ring(); + mtx_unlock_spin(&lock); + + + return 0; +} + +static int +evtchn_close(struct cdev *dev, int flag, int otyp, struct thread *td __unused) +{ + int i; + + mtx_lock_spin(&lock); + if (ring != NULL) { + free(ring, M_DEVBUF); + ring = NULL; + } + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + if ( synch_test_and_clear_bit(i, &bound_ports[0]) ) + mask_evtchn(i); + mtx_unlock_spin(&lock); + + evtchn_dev_inuse = 0; + + return 0; +} + +static struct cdevsw evtchn_devsw = { + d_version: D_VERSION, + d_open: evtchn_open, + d_close: evtchn_close, + d_read: evtchn_read, + d_write: evtchn_write, + d_ioctl: evtchn_ioctl, + d_poll: evtchn_poll, + d_name: "evtchn", + d_flags: 0, +}; + + +/* XXX - if this device is ever supposed to support use by more than one process + * this global static will have to go away + */ +static struct cdev *evtchn_dev; + + + +static int +evtchn_init(void *dummy __unused) +{ + /* XXX I believe we don't need these leaving them here for now until we + * have some semblance of it working + */ + mtx_init(&upcall_lock, "evtchup", NULL, MTX_DEF); + + /* (DEVFS) create '/dev/misc/evtchn'. 
*/ + evtchn_dev = make_dev(&evtchn_devsw, 0, UID_ROOT, GID_WHEEL, 0600, "xen/evtchn"); + + mtx_init(&lock, "evch", NULL, MTX_SPIN | MTX_NOWITNESS); + + evtchn_dev->si_drv1 = malloc(sizeof(evtchn_softc_t), M_DEVBUF, M_WAITOK); + bzero(evtchn_dev->si_drv1, sizeof(evtchn_softc_t)); + + /* XXX I don't think we need any of this rubbish */ +#if 0 + if ( err != 0 ) + { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + /* (DEVFS) create directory '/dev/xen'. */ + xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); + + /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ + pos = devfs_generate_path(evtchn_miscdev.devfs_handle, + &link_dest[3], + sizeof(link_dest) - 3); + if ( pos >= 0 ) + strncpy(&link_dest[pos], "../", 3); + /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ + (void)devfs_mk_symlink(xen_dev_dir, + "evtchn", + DEVFS_FL_DEFAULT, + &link_dest[pos], + &symlink_handle, + NULL); + + /* (DEVFS) automatically destroy the symlink with its destination. */ + devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); +#endif + printk("Event-channel device installed.\n"); + + return 0; +} + + +SYSINIT(evtchn_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, evtchn_init, NULL); + + Property changes on: dev/xen/evtchn/evtchn_dev.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: dev/xen/netback/netback.c =================================================================== --- dev/xen/netback/netback.c (.../stable/6/sys) (revision 0) +++ dev/xen/netback/netback.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,1585 @@ +/* + * Copyright (c) 2006, Cisco Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef XEN_NETBACK_DEBUG +#define DPRINTF(fmt, args...) \ + printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) 
((void)0) +#endif + +#ifdef XEN_NETBACK_DEBUG_LOTS +#define DDPRINTF(fmt, args...) \ + printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#define DPRINTF_MBUF(_m) print_mbuf(_m, 0) +#define DPRINTF_MBUF_LEN(_m, _len) print_mbuf(_m, _len) +#else +#define DDPRINTF(fmt, args...) ((void)0) +#define DPRINTF_MBUF(_m) ((void)0) +#define DPRINTF_MBUF_LEN(_m, _len) ((void)0) +#endif + +#define WPRINTF(fmt, args...) \ + printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) +#define BUG_ON PANIC_IF + +#define IFNAME(_np) (_np)->ifp->if_xname + +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) + +struct ring_ref { + vm_offset_t va; + grant_handle_t handle; + uint64_t bus_addr; +}; + +typedef struct netback_info { + + /* Schedule lists */ + STAILQ_ENTRY(netback_info) next_tx; + STAILQ_ENTRY(netback_info) next_rx; + int on_tx_sched_list; + int on_rx_sched_list; + + struct xenbus_device *xdev; + XenbusState frontend_state; + + domid_t domid; + int handle; + char *bridge; + + int rings_connected; + struct ring_ref tx_ring_ref; + struct ring_ref rx_ring_ref; + netif_tx_back_ring_t tx; + netif_rx_back_ring_t rx; + evtchn_port_t evtchn; + int irq; + void *irq_cookie; + + struct ifnet *ifp; + int ref_cnt; + + device_t ndev; + int attached; +} netif_t; + + +#define MAX_PENDING_REQS 256 +#define PKT_PROT_LEN 64 + +static struct { + netif_tx_request_t req; + netif_t *netif; +} pending_tx_info[MAX_PENDING_REQS]; +static uint16_t pending_ring[MAX_PENDING_REQS]; +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +static unsigned long mmap_vstart; +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) + +/* Freed TX mbufs get batched on this ring before 
return to pending_ring. */ +static uint16_t dealloc_ring[MAX_PENDING_REQS]; +static PEND_RING_IDX dealloc_prod, dealloc_cons; + +static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1]; +static mmu_update_t rx_mmu[NET_RX_RING_SIZE]; +static gnttab_transfer_t grant_rx_op[NET_RX_RING_SIZE]; + +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS]; +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS]; + +static struct task net_tx_task, net_rx_task; +static struct callout rx_task_callout; + +static STAILQ_HEAD(netback_tx_sched_list, netback_info) tx_sched_list = + STAILQ_HEAD_INITIALIZER(tx_sched_list); +static STAILQ_HEAD(netback_rx_sched_list, netback_info) rx_sched_list = + STAILQ_HEAD_INITIALIZER(rx_sched_list); +static struct mtx tx_sched_list_lock; +static struct mtx rx_sched_list_lock; + +static int vif_unit_maker = 0; + +/* Protos */ +static void netback_start(struct ifnet *ifp); +static int netback_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); +static int vif_add_dev(struct xenbus_device *xdev); +static void disconnect_rings(netif_t *netif); + +#ifdef XEN_NETBACK_DEBUG_LOTS +/* Debug code to display the contents of an mbuf */ +static void +print_mbuf(struct mbuf *m, int max) +{ + int i, j=0; + printf("mbuf %08x len = %d", (unsigned int)m, m->m_pkthdr.len); + for (; m; m = m->m_next) { + unsigned char *d = m->m_data; + for (i=0; i < m->m_len; i++) { + if (max && j == max) + break; + if ((j++ % 16) == 0) + printf("\n%04x:", j); + printf(" %02x", d[i]); + } + } + printf("\n"); +} +#endif + + +#define MAX_MFN_ALLOC 64 +static unsigned long mfn_list[MAX_MFN_ALLOC]; +static unsigned int alloc_index = 0; + +static unsigned long +alloc_mfn(void) +{ + unsigned long mfn = 0; + struct xen_memory_reservation reservation = { + .extent_start = mfn_list, + .nr_extents = MAX_MFN_ALLOC, + .extent_order = 0, + .domid = DOMID_SELF + }; + if ( unlikely(alloc_index == 0) ) + alloc_index = 
HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation); + if ( alloc_index != 0 ) + mfn = mfn_list[--alloc_index]; + return mfn; +} + +static unsigned long +alloc_empty_page_range(unsigned long nr_pages) +{ + void *pages; + int i = 0, j = 0; + multicall_entry_t mcl[17]; + unsigned long mfn_list[16]; + struct xen_memory_reservation reservation = { + .extent_start = mfn_list, + .nr_extents = 0, + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (pages == NULL) + return 0; + + memset(mcl, 0, sizeof(mcl)); + + while (i < nr_pages) { + unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE); + + mcl[j].op = __HYPERVISOR_update_va_mapping; + mcl[j].args[0] = va; + + mfn_list[j++] = vtomach(va) >> PAGE_SHIFT; + + xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY; + + if (j == 16 || i == nr_pages) { + mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL; + + reservation.nr_extents = j; + + mcl[j].op = __HYPERVISOR_memory_op; + mcl[j].args[0] = XENMEM_decrease_reservation; + mcl[j].args[1] = (unsigned long)&reservation; + + (void)HYPERVISOR_multicall(mcl, j+1); + + mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0; + j = 0; + } + } + + return (unsigned long)pages; +} + +#ifdef XEN_NETBACK_FIXUP_CSUM +static void +fixup_checksum(struct mbuf *m) +{ + struct ether_header *eh = mtod(m, struct ether_header *); + struct ip *ip = (struct ip *)(eh + 1); + int iphlen = ip->ip_hl << 2; + int iplen = ntohs(ip->ip_len); + + if ((m->m_pkthdr.csum_flags & CSUM_TCP)) { + struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iphlen); + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(IPPROTO_TCP + (iplen - iphlen))); + th->th_sum = in_cksum_skip(m, iplen + sizeof(*eh), sizeof(*eh) + iphlen); + m->m_pkthdr.csum_flags &= ~CSUM_TCP; + } else { + u_short csum; + struct udphdr *uh = (struct udphdr *)((caddr_t)ip + iphlen); + uh->uh_sum = 
in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(IPPROTO_UDP + (iplen - iphlen))); + if ((csum = in_cksum_skip(m, iplen + sizeof(*eh), sizeof(*eh) + iphlen)) == 0) + csum = 0xffff; + uh->uh_sum = csum; + m->m_pkthdr.csum_flags &= ~CSUM_UDP; + } +} +#endif + +/* Add the interface to the specified bridge */ +static int +add_to_bridge(struct ifnet *ifp, char *bridge) +{ + struct ifdrv ifd; + struct ifbreq ifb; + struct ifnet *ifp_bridge = ifunit(bridge); + + if (!ifp_bridge) + return ENOENT; + + bzero(&ifd, sizeof(ifd)); + bzero(&ifb, sizeof(ifb)); + + strcpy(ifb.ifbr_ifsname, ifp->if_xname); + strcpy(ifd.ifd_name, ifp->if_xname); + ifd.ifd_cmd = BRDGADD; + ifd.ifd_len = sizeof(ifb); + ifd.ifd_data = &ifb; + + return bridge_ioctl_kern(ifp_bridge, SIOCSDRVSPEC, &ifd); + +} + +static int +netif_create(int handle, struct xenbus_device *xdev, char *bridge) +{ + netif_t *netif; + struct ifnet *ifp; + + netif = (netif_t *)malloc(sizeof(*netif), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!netif) + return ENOMEM; + + netif->ref_cnt = 1; + netif->handle = handle; + netif->domid = xdev->otherend_id; + netif->xdev = xdev; + netif->bridge = bridge; + xdev->data = netif; + + /* Set up ifnet structure */ + ifp = netif->ifp = if_alloc(IFT_ETHER); + if (!ifp) { + if (bridge) + free(bridge, M_DEVBUF); + free(netif, M_DEVBUF); + return ENOMEM; + } + + ifp->if_softc = netif; + if_initname(ifp, "vif", + atomic_fetchadd_int(&vif_unit_maker, 1) /* ifno */ ); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX; + ifp->if_output = ether_output; + ifp->if_start = netback_start; + ifp->if_ioctl = netback_ioctl; + ifp->if_mtu = ETHERMTU; + ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1; + + DPRINTF("Created %s for domid=%d handle=%d\n", IFNAME(netif), netif->domid, netif->handle); + + return 0; +} + +static void +netif_get(netif_t *netif) +{ + atomic_add_int(&netif->ref_cnt, 1); +} + +static void +netif_put(netif_t *netif) +{ + if (atomic_fetchadd_int(&netif->ref_cnt, -1) == 1) { + DPRINTF("%s\n", 
IFNAME(netif)); + disconnect_rings(netif); + if (netif->ifp) { + if_free(netif->ifp); + netif->ifp = NULL; + } + if (netif->bridge) + free(netif->bridge, M_DEVBUF); + free(netif, M_DEVBUF); + } +} + +static int +netback_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + switch (cmd) { + case SIOCSIFFLAGS: + DDPRINTF("%s cmd=SIOCSIFFLAGS flags=%x\n", + IFNAME((struct netback_info *)ifp->if_softc), ((struct ifreq *)data)->ifr_flags); + return 0; + } + + DDPRINTF("%s cmd=%lx\n", IFNAME((struct netback_info *)ifp->if_softc), cmd); + + return ether_ioctl(ifp, cmd, data); +} + +static inline void +maybe_schedule_tx_action(void) +{ + smp_mb(); + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && !STAILQ_EMPTY(&tx_sched_list)) + taskqueue_enqueue(taskqueue_swi, &net_tx_task); +} + +/* Removes netif from front of list and does not call netif_put() (caller must) */ +static netif_t * +remove_from_tx_schedule_list(void) +{ + netif_t *netif; + + mtx_lock(&tx_sched_list_lock); + + if ((netif = STAILQ_FIRST(&tx_sched_list))) { + STAILQ_REMOVE(&tx_sched_list, netif, netback_info, next_tx); + STAILQ_NEXT(netif, next_tx) = NULL; + netif->on_tx_sched_list = 0; + } + + mtx_unlock(&tx_sched_list_lock); + + return netif; +} + +/* Adds netif to end of list and calls netif_get() */ +static void +add_to_tx_schedule_list_tail(netif_t *netif) +{ + if (netif->on_tx_sched_list) + return; + + mtx_lock(&tx_sched_list_lock); + if (!netif->on_tx_sched_list && (netif->ifp->if_drv_flags & IFF_DRV_RUNNING)) { + netif_get(netif); + STAILQ_INSERT_TAIL(&tx_sched_list, netif, next_tx); + netif->on_tx_sched_list = 1; + } + mtx_unlock(&tx_sched_list_lock); +} + +/* + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER: + * If this driver is pipelining transmit requests then we can be very + * aggressive in avoiding new-packet notifications -- frontend only needs to + * send a notification if there are no outstanding unreceived responses. 
+ * If we may be buffer transmit buffers for any reason then we must be rather + * more conservative and treat this as the final check for pending work. + */ +static void +netif_schedule_tx_work(netif_t *netif) +{ + int more_to_do; + +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER + more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx); +#else + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); +#endif + + if (more_to_do) { + DDPRINTF("Adding %s to tx sched list\n", IFNAME(netif)); + add_to_tx_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +static struct mtx dealloc_lock; +MTX_SYSINIT(netback_dealloc, &dealloc_lock, "DEALLOC LOCK", MTX_SPIN | MTX_NOWITNESS); + +static void +netif_idx_release(uint16_t pending_idx) +{ + mtx_lock_spin(&dealloc_lock); + dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; + mtx_unlock_spin(&dealloc_lock); + + taskqueue_enqueue(taskqueue_swi, &net_tx_task); +} + +static void +make_tx_response(netif_t *netif, + uint16_t id, + int8_t st) +{ + RING_IDX i = netif->tx.rsp_prod_pvt; + netif_tx_response_t *resp; + int notify; + + resp = RING_GET_RESPONSE(&netif->tx, i); + resp->id = id; + resp->status = st; + + netif->tx.rsp_prod_pvt = ++i; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify); + if (notify) + notify_remote_via_irq(netif->irq); + +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER + if (i == netif->tx.req_cons) { + int more_to_do; + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); + if (more_to_do) + add_to_tx_schedule_list_tail(netif); + } +#endif +} + +inline static void +net_tx_action_dealloc(void) +{ + gnttab_unmap_grant_ref_t *gop; + uint16_t pending_idx; + PEND_RING_IDX dc, dp; + netif_t *netif; + int ret; + + dc = dealloc_cons; + dp = dealloc_prod; + + /* + * Free up any grants we have finished using + */ + gop = tx_unmap_ops; + while (dc != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; + gop->host_addr = MMAP_VADDR(pending_idx); + gop->dev_bus_addr = 0; + gop->handle = 
grant_tx_handle[pending_idx]; + gop++; + } + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops); + BUG_ON(ret); + + while (dealloc_cons != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_tx_info[pending_idx].netif; + + make_tx_response(netif, pending_tx_info[pending_idx].req.id, + NETIF_RSP_OKAY); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + netif_put(netif); + } +} + +static void +netif_page_release(void *buf, void *args) +{ + uint16_t pending_idx = (unsigned int)args; + + DDPRINTF("pending_idx=%u\n", pending_idx); + + KASSERT(pending_idx < MAX_PENDING_REQS, ("%s: bad index %u", __func__, pending_idx)); + + netif_idx_release(pending_idx); +} + +static void +net_tx_action(void *context, int pending) +{ + struct mbuf *m; + netif_t *netif; + netif_tx_request_t txreq; + uint16_t pending_idx; + RING_IDX i; + gnttab_map_grant_ref_t *mop; + int ret, work_to_do; + struct mbuf *txq = NULL, *txq_last = NULL; + + if (dealloc_cons != dealloc_prod) + net_tx_action_dealloc(); + + mop = tx_map_ops; + while ((NR_PENDING_REQS < MAX_PENDING_REQS) && !STAILQ_EMPTY(&tx_sched_list)) { + + /* Get a netif from the list with work to do. */ + netif = remove_from_tx_schedule_list(); + + DDPRINTF("Processing %s (prod=%u, cons=%u)\n", + IFNAME(netif), netif->tx.sring->req_prod, netif->tx.req_cons); + + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do); + if (!work_to_do) { + netif_put(netif); + continue; + } + + i = netif->tx.req_cons; + rmb(); /* Ensure that we see the request before we copy it. 
*/ + memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq)); + + /* If we want credit-based scheduling, coud add it here - WORK */ + + netif->tx.req_cons++; + + netif_schedule_tx_work(netif); + + if (unlikely(txreq.size < ETHER_HDR_LEN) || + unlikely(txreq.size > (ETHER_MAX_LEN-ETHER_CRC_LEN))) { + WPRINTF("Bad packet size: %d\n", txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + /* No crossing a page as the payload mustn't fragment. */ + if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) { + WPRINTF("txreq.offset: %x, size: %u, end: %u\n", + txreq.offset, txreq.size, + (txreq.offset & PAGE_MASK) + txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) { + WPRINTF("Failed to allocate mbuf\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + m->m_pkthdr.rcvif = netif->ifp; + + if ((m->m_pkthdr.len = txreq.size) > PKT_PROT_LEN) { + struct mbuf *n; + MGET(n, M_DONTWAIT, MT_DATA); + if (!(m->m_next = n)) { + m_freem(m); + WPRINTF("Failed to allocate second mbuf\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + n->m_len = txreq.size - PKT_PROT_LEN; + m->m_len = PKT_PROT_LEN; + } else + m->m_len = txreq.size; + + mop->host_addr = MMAP_VADDR(pending_idx); + mop->dom = netif->domid; + mop->ref = txreq.gref; + mop->flags = GNTMAP_host_map | GNTMAP_readonly; + mop++; + + memcpy(&pending_tx_info[pending_idx].req, + &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; + *((uint16_t *)m->m_data) = pending_idx; + + if (txq_last) + txq_last->m_nextpkt = m; + else + txq = m; + txq_last = m; + + pending_cons++; + + if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) + break; + } + + if (!txq) + return; + + ret = HYPERVISOR_grant_table_op( + 
GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops); + BUG_ON(ret); + + mop = tx_map_ops; + while ((m = txq) != NULL) { + caddr_t data; + + txq = m->m_nextpkt; + m->m_nextpkt = NULL; + + pending_idx = *((uint16_t *)m->m_data); + netif = pending_tx_info[pending_idx].netif; + memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq)); + + /* Check the remap error code. */ + if (unlikely(mop->status)) { + WPRINTF("#### netback grant fails\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + m_freem(m); + mop++; + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + continue; + } + +#if 0 + /* Can't do this in FreeBSD since vtophys() returns the pfn */ + /* of the remote domain who loaned us the machine page - DPT */ + xen_phys_machine[(vtophys(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT)] = + mop->dev_bus_addr >> PAGE_SHIFT; +#endif + grant_tx_handle[pending_idx] = mop->handle; + + /* Setup data in mbuf (lengths are already set) */ + data = (caddr_t)(MMAP_VADDR(pending_idx)|txreq.offset); + bcopy(data, m->m_data, m->m_len); + if (m->m_next) { + struct mbuf *n = m->m_next; + MEXTADD(n, MMAP_VADDR(pending_idx), PAGE_SIZE, netif_page_release, + (void *)(unsigned int)pending_idx, M_RDONLY, EXT_NET_DRV); + n->m_data = &data[PKT_PROT_LEN]; + } else { + /* Schedule a response immediately. 
*/ + netif_idx_release(pending_idx); + } + + if ((txreq.flags & NETTXF_data_validated)) { + /* Tell the stack the checksums are okay */ + m->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m->m_pkthdr.csum_data = 0xffff; + } + + /* If necessary, inform stack to compute the checksums if it forwards the packet */ + if ((txreq.flags & NETTXF_csum_blank)) { + struct ether_header *eh = mtod(m, struct ether_header *); + if (ntohs(eh->ether_type) == ETHERTYPE_IP) { + struct ip *ip = (struct ip *)&m->m_data[14]; + if (ip->ip_p == IPPROTO_TCP) + m->m_pkthdr.csum_flags |= CSUM_TCP; + else if (ip->ip_p == IPPROTO_UDP) + m->m_pkthdr.csum_flags |= CSUM_UDP; + } + } + + netif->ifp->if_ibytes += m->m_pkthdr.len; + netif->ifp->if_ipackets++; + + DDPRINTF("RECV %d bytes from %s (cflags=%x)\n", + m->m_pkthdr.len, IFNAME(netif), m->m_pkthdr.csum_flags); + DPRINTF_MBUF_LEN(m, 128); + + (*netif->ifp->if_input)(netif->ifp, m); + + mop++; + } +} + +/* Handle interrupt from a frontend */ +static void +netback_intr(void *arg) +{ + netif_t *netif = arg; + DDPRINTF("%s\n", IFNAME(netif)); + add_to_tx_schedule_list_tail(netif); + maybe_schedule_tx_action(); +} + +/* Removes netif from front of list and does not call netif_put() (caller must) */ +static netif_t * +remove_from_rx_schedule_list(void) +{ + netif_t *netif; + + mtx_lock(&rx_sched_list_lock); + + if ((netif = STAILQ_FIRST(&rx_sched_list))) { + STAILQ_REMOVE(&rx_sched_list, netif, netback_info, next_rx); + STAILQ_NEXT(netif, next_rx) = NULL; + netif->on_rx_sched_list = 0; + } + + mtx_unlock(&rx_sched_list_lock); + + return netif; +} + +/* Adds netif to end of list and calls netif_get() */ +static void +add_to_rx_schedule_list_tail(netif_t *netif) +{ + if (netif->on_rx_sched_list) + return; + + mtx_lock(&rx_sched_list_lock); + if (!netif->on_rx_sched_list && (netif->ifp->if_drv_flags & IFF_DRV_RUNNING)) { + netif_get(netif); + STAILQ_INSERT_TAIL(&rx_sched_list, netif, next_rx); + 
netif->on_rx_sched_list = 1; + } + mtx_unlock(&rx_sched_list_lock); +} + +static int +make_rx_response(netif_t *netif, uint16_t id, int8_t st, + uint16_t offset, uint16_t size, uint16_t flags) +{ + RING_IDX i = netif->rx.rsp_prod_pvt; + netif_rx_response_t *resp; + int notify; + + resp = RING_GET_RESPONSE(&netif->rx, i); + resp->offset = offset; + resp->flags = flags; + resp->id = id; + resp->status = (int16_t)size; + if (st < 0) + resp->status = (int16_t)st; + + DDPRINTF("rx resp(%d): off=%x fl=%x id=%x stat=%d\n", + i, resp->offset, resp->flags, resp->id, resp->status); + + netif->rx.rsp_prod_pvt = ++i; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, notify); + + return notify; +} + +static int +netif_rx(netif_t *netif) +{ + struct ifnet *ifp = netif->ifp; + struct mbuf *m; + multicall_entry_t *mcl; + mmu_update_t *mmu; + gnttab_transfer_t *gop; + unsigned long vdata, old_mfn, new_mfn; + struct mbuf *rxq = NULL, *rxq_last = NULL; + int ret, notify = 0, pkts_dequeued = 0; + + DDPRINTF("%s\n", IFNAME(netif)); + + mcl = rx_mcl; + mmu = rx_mmu; + gop = grant_rx_op; + + while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + + /* Quit if the target domain has no receive buffers */ + if (netif->rx.req_cons == netif->rx.sring->req_prod) + break; + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + + pkts_dequeued++; + + /* Check if we need to copy the data */ + if (((m->m_flags & (M_RDONLY|M_EXT)) != M_EXT) || + (*m->m_ext.ref_cnt > 1) || m->m_next != NULL) { + struct mbuf *n; + + DDPRINTF("copying mbuf (fl=%x ext=%x rc=%d n=%x)\n", + m->m_flags, + (m->m_flags & M_EXT) ? m->m_ext.ext_type : 0, + (m->m_flags & M_EXT) ? 
*m->m_ext.ref_cnt : 0, + (unsigned int)m->m_next); + + /* Make copy */ + MGETHDR(n, M_DONTWAIT, MT_DATA); + if (!n) + goto drop; + + MCLGET(n, M_DONTWAIT); + if (!(n->m_flags & M_EXT)) { + m_freem(n); + goto drop; + } + + /* Leave space at front and keep current alignment */ + n->m_data += 16 + ((unsigned int)m->m_data & 0x3); + + if (m->m_pkthdr.len > M_TRAILINGSPACE(n)) { + WPRINTF("pkt to big %d\n", m->m_pkthdr.len); + m_freem(n); + goto drop; + } + m_copydata(m, 0, m->m_pkthdr.len, n->m_data); + n->m_pkthdr.len = n->m_len = m->m_pkthdr.len; + n->m_pkthdr.csum_flags = (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA); + m_freem(m); + m = n; + } + + vdata = (unsigned long)m->m_data; + old_mfn = vtomach(vdata) >> PAGE_SHIFT; + + if ((new_mfn = alloc_mfn()) == 0) + goto drop; + +#ifdef XEN_NETBACK_FIXUP_CSUM + /* Check if we need to compute a checksum. This happens */ + /* when bridging from one domain to another. */ + if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) + fixup_checksum(m); +#endif + + xen_phys_machine[(vtophys(vdata) >> PAGE_SHIFT)] = new_mfn; + + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = vdata; + mcl->args[1] = (new_mfn << PAGE_SHIFT) | PG_V | PG_RW | PG_M | PG_A; + mcl->args[2] = 0; + mcl->args[3] = 0; + mcl++; + + gop->mfn = old_mfn; + gop->domid = netif->domid; + gop->ref = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons)->gref; + netif->rx.req_cons++; + gop++; + + mmu->ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + mmu->val = vtophys(vdata) >> PAGE_SHIFT; + mmu++; + + if (rxq_last) + rxq_last->m_nextpkt = m; + else + rxq = m; + rxq_last = m; + + DDPRINTF("XMIT %d bytes to %s\n", m->m_pkthdr.len, IFNAME(netif)); + DPRINTF_MBUF_LEN(m, 128); + + /* Filled the batch queue? 
*/ + if ((gop - grant_rx_op) == ARRAY_SIZE(grant_rx_op)) + break; + + continue; + drop: + DDPRINTF("dropping pkt\n"); + ifp->if_oerrors++; + m_freem(m); + } + + if (mcl == rx_mcl) + return pkts_dequeued; + + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + mcl++; + + mcl[-2].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; + ret = HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + BUG_ON(ret != 0); + + ret = HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, gop - grant_rx_op); + BUG_ON(ret != 0); + + mcl = rx_mcl; + gop = grant_rx_op; + + while ((m = rxq) != NULL) { + int8_t status; + uint16_t id, flags = 0; + + rxq = m->m_nextpkt; + m->m_nextpkt = NULL; + + /* Rederive the machine addresses. */ + new_mfn = mcl->args[1] >> PAGE_SHIFT; + old_mfn = gop->mfn; + + ifp->if_obytes += m->m_pkthdr.len; + ifp->if_opackets++; + + /* The update_va_mapping() must not fail. */ + BUG_ON(mcl->result != 0); + + /* Setup flags */ + if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) + flags |= NETRXF_csum_blank | NETRXF_data_validated; + else if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) + flags |= NETRXF_data_validated; + + /* Check the reassignment error code. */ + status = NETIF_RSP_OKAY; + if (gop->status != 0) { + DPRINTF("Bad status %d from grant transfer to DOM%u\n", + gop->status, netif->domid); + /* + * Page no longer belongs to us unless GNTST_bad_page, + * but that should be a fatal error anyway. 
+ */ + BUG_ON(gop->status == GNTST_bad_page); + status = NETIF_RSP_ERROR; + } + id = RING_GET_REQUEST(&netif->rx, netif->rx.rsp_prod_pvt)->id; + notify |= make_rx_response(netif, id, status, + (unsigned long)m->m_data & PAGE_MASK, + m->m_pkthdr.len, flags); + + m_freem(m); + mcl++; + gop++; + } + + if (notify) + notify_remote_via_irq(netif->irq); + + return pkts_dequeued; +} + +static void +rx_task_timer(void *arg) +{ + DDPRINTF("\n"); + taskqueue_enqueue(taskqueue_swi, &net_rx_task); +} + +static void +net_rx_action(void *context, int pending) +{ + netif_t *netif, *last_zero_work = NULL; + + DDPRINTF("\n"); + + while ((netif = remove_from_rx_schedule_list())) { + struct ifnet *ifp = netif->ifp; + + if (netif == last_zero_work) { + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + add_to_rx_schedule_list_tail(netif); + netif_put(netif); + if (!STAILQ_EMPTY(&rx_sched_list)) + callout_reset(&rx_task_callout, 1, rx_task_timer, NULL); + break; + } + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if (netif_rx(netif)) + last_zero_work = NULL; + else if (!last_zero_work) + last_zero_work = netif; + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + add_to_rx_schedule_list_tail(netif); + } + + netif_put(netif); + } +} + +static void +netback_start(struct ifnet *ifp) +{ + netif_t *netif = (netif_t *)ifp->if_softc; + + DDPRINTF("%s\n", IFNAME(netif)); + + add_to_rx_schedule_list_tail(netif); + taskqueue_enqueue(taskqueue_swi, &net_rx_task); +} + +/* Map a grant ref to a ring */ +static int +map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring) +{ + struct gnttab_map_grant_ref op; + + ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); + if (ring->va == 0) + return ENOMEM; + + op.host_addr = ring->va; + op.flags = GNTMAP_host_map; + op.ref = ref; + op.dom = dom; + HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1); + if (op.status) { + WPRINTF("grant table op err=%d\n", op.status); + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; + return EACCES; + } + + ring->handle 
= op.handle; + ring->bus_addr = op.dev_bus_addr; + + return 0; +} + +/* Unmap grant ref for a ring */ +static void +unmap_ring(struct ring_ref *ring) +{ + struct gnttab_unmap_grant_ref op; + + op.host_addr = ring->va; + op.dev_bus_addr = ring->bus_addr; + op.handle = ring->handle; + HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1); + if (op.status) + WPRINTF("grant table op err=%d\n", op.status); + + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; +} + +static int +connect_rings(netif_t *netif) +{ + struct xenbus_device *xdev = netif->xdev; + netif_tx_sring_t *txs; + netif_rx_sring_t *rxs; + unsigned long tx_ring_ref, rx_ring_ref; + evtchn_port_t evtchn; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; + + // Grab FE data and map his memory + err = xenbus_gather(NULL, xdev->otherend, + "tx-ring-ref", "%lu", &tx_ring_ref, + "rx-ring-ref", "%lu", &rx_ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(xdev, err, + "reading %s/ring-ref and event-channel", + xdev->otherend); + return err; + } + + err = map_ring(tx_ring_ref, netif->domid, &netif->tx_ring_ref); + if (err) { + xenbus_dev_fatal(xdev, err, "mapping tx ring"); + return err; + } + txs = (netif_tx_sring_t *)netif->tx_ring_ref.va; + BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE); + + err = map_ring(rx_ring_ref, netif->domid, &netif->rx_ring_ref); + if (err) { + unmap_ring(&netif->tx_ring_ref); + xenbus_dev_fatal(xdev, err, "mapping rx ring"); + return err; + } + rxs = (netif_rx_sring_t *)netif->rx_ring_ref.va; + BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE); + + op.u.bind_interdomain.remote_dom = netif->domid; + op.u.bind_interdomain.remote_port = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_ring(&netif->tx_ring_ref); + unmap_ring(&netif->rx_ring_ref); + xenbus_dev_fatal(xdev, err, "binding event channel"); + return err; + } + netif->evtchn = op.u.bind_interdomain.local_port; + + /* bind evtchn to irq handler */ + 
netif->irq = + bind_evtchn_to_irqhandler(netif->evtchn, "netback", + netback_intr, netif, INTR_TYPE_NET|INTR_MPSAFE, &netif->irq_cookie); + + netif->rings_connected = 1; + + DPRINTF("%s connected! evtchn=%d irq=%d\n", + IFNAME(netif), netif->evtchn, netif->irq); + + return 0; +} + +static void +disconnect_rings(netif_t *netif) +{ + DPRINTF("\n"); + + if (netif->rings_connected) { + unbind_from_irqhandler(netif->irq, netif->irq_cookie); + netif->irq = 0; + unmap_ring(&netif->tx_ring_ref); + unmap_ring(&netif->rx_ring_ref); + netif->rings_connected = 0; + } +} + +static void +connect(netif_t *netif) +{ + if (!netif->xdev || + !netif->attached || + netif->frontend_state != XenbusStateConnected) { + return; + } + + if (!connect_rings(netif)) { + xenbus_switch_state(netif->xdev, NULL, XenbusStateConnected); + + /* Turn on interface */ + netif->ifp->if_drv_flags |= IFF_DRV_RUNNING; + netif->ifp->if_flags |= IFF_UP; + } +} + +static int +netback_remove(struct xenbus_device *xdev) +{ + netif_t *netif = xdev->data; + device_t ndev; + + DPRINTF("remove %s\n", xdev->nodename); + + if ((ndev = netif->ndev)) { + netif->ndev = NULL; + mtx_lock(&Giant); + device_detach(ndev); + mtx_unlock(&Giant); + } + + xdev->data = NULL; + netif->xdev = NULL; + netif_put(netif); + + return 0; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and the ring buffers for communication with the frontend. + * Switch to Connected state. 
+ */
+static int
+netback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id)
+{
+	int err;
+	long handle;
+	char *bridge;
+
+	DPRINTF("node=%s\n", xdev->nodename);
+
+	/*
+	 * Grab the handle.  Exactly one converted field is expected; anything
+	 * else is a failure.  Never hand 0 back on this path, or the caller
+	 * would read the probe as a success although no netif was created.
+	 */
+	err = xenbus_scanf(NULL, xdev->nodename, "handle", "%li", &handle);
+	if (err != 1) {
+		xenbus_dev_fatal(xdev, err, "reading handle");
+		return (err ? err : EINVAL);
+	}
+
+	/* Check for bridge; a failed read is treated as "no bridge" */
+	bridge = xenbus_read(NULL, xdev->nodename, "bridge", NULL);
+	if (IS_ERR(bridge))
+		bridge = NULL;
+
+	err = xenbus_switch_state(xdev, NULL, XenbusStateInitWait);
+	if (err) {
+		xenbus_dev_fatal(xdev, err, "writing switch state");
+		/*
+		 * No netif exists yet, so nothing else can release the bridge
+		 * string.  M_DEVBUF is the type netif_put() uses when it frees
+		 * netif->bridge.
+		 */
+		if (bridge != NULL)
+			free(bridge, M_DEVBUF);
+		return err;
+	}
+
+	/*
+	 * NOTE(review): the bridge string is handed to netif_create(); whether
+	 * it is released when netif_create() fails is not visible from here --
+	 * confirm before adding a free on that path.
+	 */
+	err = netif_create(handle, xdev, bridge);
+	if (err) {
+		xenbus_dev_fatal(xdev, err, "creating netif");
+		return err;
+	}
+
+	err = vif_add_dev(xdev);
+	if (err) {
+		netif_t *netif = xdev->data;
+
+		/*
+		 * Unhook the netif from the xenbus device before dropping the
+		 * reference (the same order netback_remove() uses), so that
+		 * xdev->data never points at a released structure.
+		 */
+		xdev->data = NULL;
+		netif->xdev = NULL;
+		netif_put(netif);
+		xenbus_dev_fatal(xdev, err, "adding vif device");
+		return err;
+	}
+
+	return 0;
+}
+
+/**
+ * Resume hook, installed as .resume in the netback xenbus_driver table.
+ * It performs no reconnection work: it emits a debug trace with the node
+ * name and reports success.  Nothing is torn down or recreated here.
+ */
+static int netback_resume(struct xenbus_device *xdev)
+{
+	DPRINTF("node=%s\n", xdev->nodename);
+	return 0;
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *xdev,
+    XenbusState frontend_state)
+{
+	netif_t *netif = xdev->data;
+
+	DPRINTF("state=%d\n", frontend_state);
+
+	/*
+	 * netback_remove() clears xdev->data, so a notification that arrives
+	 * without a netif is dropped instead of dereferencing NULL.
+	 */
+	if (netif == NULL)
+		return;
+
+	netif->frontend_state = frontend_state;
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+		break;
+	case XenbusStateConnected:
+		connect(netif);
+		break;
+	case XenbusStateClosing:
+		xenbus_switch_state(xdev, NULL, XenbusStateClosing);
+		break;
+	case XenbusStateClosed:
+		xenbus_remove_device(xdev);
+		break;
+	case XenbusStateUnknown:
+	case XenbusStateInitWait:
+		xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend",
+		    frontend_state);
+		break;
+	default:
+		/* Any state not listed above is ignored, as before. */
+		break;
+	}
+}
+
+/* ** Driver registration ** */
+
+static struct xenbus_device_id netback_ids[] = {
+	{ "vif" },
+	{ "" }
+};
+
+static struct xenbus_driver netback = {
+	.name = "netback",
+	.ids = netback_ids,
+	.probe = netback_probe,
+	.remove = netback_remove,
+	.resume= netback_resume,
+	.otherend_changed = frontend_changed,
+};
+
+/*
+ * One-time setup, run through the SYSINIT entry below.  Sets up the rx
+ * callout, obtains MAX_PENDING_REQS pages from alloc_empty_page_range(),
+ * fills pending_ring with the indices 0..MAX_PENDING_REQS-1 (leaving
+ * pending_cons at 0 and pending_prod at MAX_PENDING_REQS), initialises the
+ * tx/rx tasks and their schedule-list mutexes, and registers the backend.
+ */
+static void
+netback_init(void *unused)
+{
+	callout_init(&rx_task_callout, CALLOUT_MPSAFE);
+
+	mmap_vstart = alloc_empty_page_range(MAX_PENDING_REQS);
+	BUG_ON(!mmap_vstart);
+
+	pending_cons = 0;
+	for (pending_prod = 0; pending_prod < MAX_PENDING_REQS; pending_prod++)
+		pending_ring[pending_prod] = pending_prod;
+
+	TASK_INIT(&net_tx_task, 0, net_tx_action, NULL);
+	TASK_INIT(&net_rx_task, 0, net_rx_action, NULL);
+	mtx_init(&tx_sched_list_lock, "nb_tx_sched_lock", "netback tx sched lock", MTX_DEF);
+	mtx_init(&rx_sched_list_lock, "nb_rx_sched_lock", "netback rx sched lock", MTX_DEF);
+
+	DPRINTF("registering %s\n", netback.name);
+
+	xenbus_register_backend(&netback);
+}
+
+SYSINIT(xnbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, netback_init, NULL)
+
+/*
+ * Create the newbus "vif" device for the netif stored in xdev->data, as a
+ * child of nexus unit 0, and probe/attach it.  The whole operation runs
+ * under Giant.  Returns 0, ENOENT if nexus0 cannot be found, or EFAULT if
+ * the child device cannot be created.  A reference on the netif is taken
+ * for the device's ivars.
+ */
+static int
+vif_add_dev(struct xenbus_device *xdev)
+{
+	netif_t *netif = xdev->data;
+	device_t nexus, ndev;
+	devclass_t dc;
+	int err = 0;
+
+	mtx_lock(&Giant);
+
+	/* We will add a vif device as a child of nexus0 (for now) */
+	if (!(dc = devclass_find("nexus")) ||
+	    !(nexus = devclass_get_device(dc, 0))) {
+		WPRINTF("could not find nexus0!\n");
+		err = ENOENT;
+		goto done;
+	}
+
+
+	/* Create a newbus device representing the vif */
+	ndev = BUS_ADD_CHILD(nexus, 0, "vif", netif->ifp->if_dunit);
+	if (!ndev) {
+		WPRINTF("could not create newbus device %s!\n", IFNAME(netif));
+		err = EFAULT;
+		goto done;
+	}
+
+	netif_get(netif);
+	device_set_ivars(ndev, netif);
+	netif->ndev = ndev;
+
+	/*
+	 * NOTE(review): the result of device_probe_and_attach() is not
+	 * checked, so an attach failure still returns 0 from here -- confirm
+	 * this is intended.
+	 */
+	device_probe_and_attach(ndev);
+
+ done:
+
+	mtx_unlock(&Giant);
+
+	return err;
+}
+
+/* Selector values passed as arg2 to vif_sysctl_handler() */
+enum {
+	VIF_SYSCTL_DOMID,
+	VIF_SYSCTL_HANDLE,
+	VIF_SYSCTL_TXRING,
+	VIF_SYSCTL_RXRING,
+};
+
+/*
+ * Format the tx ring (cmd == VIF_SYSCTL_TXRING) or, for any other cmd, the
+ * rx ring indices into a freshly allocated M_DEVBUF string.  The caller
+ * frees the result.  All writes are bounded by the buffer size.
+ */
+static char *
+vif_sysctl_ring_info(netif_t *netif, int cmd)
+{
+	const size_t buflen = 256;
+	char *buf = malloc(buflen, M_DEVBUF, M_WAITOK);
+	if (buf) {
+		if (!netif->rings_connected)
+			snprintf(buf, buflen, "rings not connected\n");
+		else if (cmd == VIF_SYSCTL_TXRING) {
+			netif_tx_back_ring_t *tx = &netif->tx;
+			snprintf(buf, buflen, "nr_ents=%x req_cons=%x"
+			    " req_prod=%x req_event=%x"
+			    " rsp_prod=%x rsp_event=%x",
+			    tx->nr_ents, tx->req_cons,
+			    tx->sring->req_prod, tx->sring->req_event,
+			    tx->sring->rsp_prod, tx->sring->rsp_event);
+		} else {
+			netif_rx_back_ring_t *rx = &netif->rx;
+			snprintf(buf, buflen, "nr_ents=%x req_cons=%x"
+			    " req_prod=%x req_event=%x"
+			    " rsp_prod=%x rsp_event=%x",
+			    rx->nr_ents, rx->req_cons,
+			    rx->sring->req_prod, rx->sring->req_event,
+			    rx->sring->rsp_prod, rx->sring->rsp_event);
+		}
+	}
+	return buf;
+}
+
+/*
+ * Sysctl handler shared by all vif nodes.  arg1 is the device and arg2 one
+ * of the VIF_SYSCTL_* selectors.  The integer nodes are answered through
+ * sysctl_handle_int(); the ring nodes copy out the text built by
+ * vif_sysctl_ring_info() (without its terminating NUL) and then free it.
+ * An unknown selector yields EINVAL.
+ */
+static int
+vif_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+	device_t dev = (device_t)arg1;
+	netif_t *netif = (netif_t *)device_get_ivars(dev);
+	const char *value;
+	char *buf = NULL;
+	int err;
+
+	switch (arg2) {
+	case VIF_SYSCTL_DOMID:
+		return sysctl_handle_int(oidp, NULL, netif->domid, req);
+	case VIF_SYSCTL_HANDLE:
+		return sysctl_handle_int(oidp, NULL, netif->handle, req);
+	case VIF_SYSCTL_TXRING:
+	case VIF_SYSCTL_RXRING:
+		value = buf = vif_sysctl_ring_info(netif, arg2);
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	err = SYSCTL_OUT(req, value, strlen(value));
+	if (buf != NULL)
+		free(buf, M_DEVBUF);
+
+	return err;
+}
+
+/* Newbus vif device driver probe */
+static int
+vif_probe(device_t dev)
+{
+	DDPRINTF("vif%d\n", device_get_unit(dev));
+	return 0;
+}
+
+/*
+ * Newbus vif device driver attach.  Registers the sysctl nodes, attaches
+ * the ifnet to the Ethernet layer, tries to connect to the frontend and,
+ * if a bridge name is set, adds the interface to that bridge.
+ */
+static int
+vif_attach(device_t dev)
+{
+	netif_t *netif = (netif_t *)device_get_ivars(dev);
+	uint8_t mac[ETHER_ADDR_LEN];
+
+	DDPRINTF("%s\n", IFNAME(netif));
+
+	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD,
+	    dev, VIF_SYSCTL_DOMID, vif_sysctl_handler, "I",
+	    "domid of frontend");
+	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "handle", CTLTYPE_INT|CTLFLAG_RD,
+	    dev, VIF_SYSCTL_HANDLE, vif_sysctl_handler, "I",
+	    "handle of frontend");
+#ifdef XEN_NETBACK_DEBUG
+	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "txring", CTLFLAG_RD,
+	    dev, VIF_SYSCTL_TXRING, vif_sysctl_handler, "A",
+	    "tx ring info");
+	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "rxring", CTLFLAG_RD,
+	    dev, VIF_SYSCTL_RXRING, vif_sysctl_handler, "A",
+	    "rx ring info");
+#endif
+
+	/* Address is all ones with bit 0 of the first octet cleared: fe:ff:ff:ff:ff:ff */
+	memset(mac, 0xff, sizeof(mac));
+	mac[0] &= ~0x01;
+
+	ether_ifattach(netif->ifp, mac);
+	netif->attached = 1;
+
+	connect(netif);
+
+	if (netif->bridge) {
+		int err;
+
+		DPRINTF("Adding %s to bridge %s\n", IFNAME(netif), netif->bridge);
+		/* A bridge failure is only reported; the attach still proceeds */
+		err = add_to_bridge(netif->ifp, netif->bridge);
+		if (err) {
+			WPRINTF("Error adding %s to %s; err=%d\n",
+			    IFNAME(netif), netif->bridge, err);
+		}
+	}
+
+	return bus_generic_attach(dev);
+}
+
+/*
+ * Newbus vif device driver detach.  Marks the interface as not running,
+ * detaches it from the Ethernet layer and drops one netif reference
+ * (presumably the one vif_add_dev() took for the ivars -- confirm).
+ */
+static int
+vif_detach(device_t dev)
+{
+	netif_t *netif = (netif_t *)device_get_ivars(dev);
+	struct ifnet *ifp = netif->ifp;
+
+	DDPRINTF("%s\n", IFNAME(netif));
+
+	/* Tell the stack that the interface is no longer active */
+	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+	ether_ifdetach(ifp);
+
+	bus_generic_detach(dev);
+
+	netif->attached = 0;
+
+	netif_put(netif);
+
+	return 0;
+}
+
+static device_method_t vif_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe, vif_probe),
+	DEVMETHOD(device_attach, vif_attach),
+	DEVMETHOD(device_detach, vif_detach),
+	DEVMETHOD(device_shutdown, bus_generic_shutdown),
+	DEVMETHOD(device_suspend, bus_generic_suspend),
+	DEVMETHOD(device_resume, bus_generic_resume),
+	{0, 0}
+};
+
+static devclass_t vif_devclass;
+
+static driver_t vif_driver = {
+	"vif",
+	vif_methods,
+	0,
+};
+
+DRIVER_MODULE(vif, nexus, vif_driver, vif_devclass, 0, 0);
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: t
+ * End:
+ */

Property changes on: dev/xen/netback/netback.c
___________________________________________________________________
Added: svn:mime-type
   + text/plain
Added: svn:keywords
   + FreeBSD=%H
Added: svn:eol-style
   + native

Index: dev/xen/blkback/blkback.c
===================================================================
--- dev/xen/blkback/blkback.c (.../stable/6/sys) (revision 0)
+++ dev/xen/blkback/blkback.c (.../user/dfr/xenhvm/6/sys) (revision 190588)
@@ -0,0 +1,1349 @@
+/*
+ * Copyright (c) 2006, Cisco Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Cisco Systems, Inc.
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#if XEN_BLKBACK_DEBUG +#define DPRINTF(fmt, args...) \ + printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) ((void)0) +#endif + +#define WPRINTF(fmt, args...) 
\ + printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) + +#define BLKBACK_INVALID_HANDLE (~0) + +struct ring_ref { + vm_offset_t va; + grant_handle_t handle; + uint64_t bus_addr; +}; + +typedef struct blkback_info { + + /* Schedule lists */ + STAILQ_ENTRY(blkback_info) next_req; + int on_req_sched_list; + + struct xenbus_device *xdev; + XenbusState frontend_state; + + domid_t domid; + + int state; + int ring_connected; + struct ring_ref rr; + blkif_back_ring_t ring; + evtchn_port_t evtchn; + int irq; + void *irq_cookie; + + int ref_cnt; + + int handle; + char *mode; + char *type; + char *dev_name; + + struct vnode *vn; + struct cdev *cdev; + struct cdevsw *csw; + u_int sector_size; + int sector_size_shift; + off_t media_size; + u_int media_num_sectors; + int major; + int minor; + int read_only; + + struct mtx blk_ring_lock; + + device_t ndev; + + /* Stats */ + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_err_req; +} blkif_t; + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** + * + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +static int blkif_reqs = 64; +TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs); + +static int mmap_pages; + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
+ */
+typedef struct pending_req {
+	blkif_t *blkif;		/* interface that gets the response */
+	uint64_t id;		/* saved request id, passed back in the response */
+	int nr_pages;		/* number of segments fast_flush_area() walks */
+	int pendcnt;		/* completions still outstanding */
+	unsigned short operation;	/* operation passed to make_response() */
+	int status;		/* status passed to make_response() */
+	STAILQ_ENTRY(pending_req) free_list;	/* link on pending_free */
+} pending_req_t;
+/* pending_reqs is the base vaddr_pagenr() subtracts to number a request. */
+static pending_req_t *pending_reqs;
+static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free =
+	STAILQ_HEAD_INITIALIZER(pending_free);
+static struct mtx pending_free_lock;	/* guards pending_free */
+
+static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list =
+	STAILQ_HEAD_INITIALIZER(req_sched_list);
+static struct mtx req_sched_list_lock;	/* guards req_sched_list */
+
+static unsigned long mmap_vstart;
+static unsigned long *pending_vaddrs;	/* indexed by vaddr_pagenr() */
+static grant_handle_t *pending_grant_handles;	/* indexed by vaddr_pagenr() */
+
+static struct task blk_req_task;
+
+/* Protos */
+static void disconnect_ring(blkif_t *blkif);
+static int vbd_add_dev(struct xenbus_device *xdev);
+/* Flat slot number of segment 'seg' of request 'req' in the per-segment tables. */
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+/* Address recorded in pending_vaddrs for segment 'seg' of request 'req'. */
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+	return pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+/* Grant-handle slot (usable as an lvalue) for segment _seg of request _req. */
+#define pending_handle(_req, _seg) \
+	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
+/*
+ * Allocate nr_pages pages with malloc() and, in batches of up to 16 pages,
+ * issue one multicall holding an update_va_mapping entry per page followed
+ * by a single XENMEM_decrease_reservation over the batch's machine frames.
+ * The xen_phys_machine entry of every page is set to INVALID_P2M_ENTRY.
+ * Returns the start address of the range, or 0 if malloc() fails.
+ * NOTE(review): the HYPERVISOR_multicall() result is discarded, so a failed
+ * batch goes unnoticed -- confirm this is intended.
+ */
+static unsigned long
+alloc_empty_page_range(unsigned long nr_pages)
+{
+	void *pages;
+	int i = 0, j = 0;	/* i: pages queued so far, j: entries in the current batch */
+	multicall_entry_t mcl[17];	/* 16 mapping updates + 1 memory_op */
+	unsigned long mfn_list[16];
+	struct xen_memory_reservation reservation = {
+		.extent_start = mfn_list,
+		.nr_extents   = 0,
+		.address_bits = 0,
+		.extent_order = 0,
+		.domid        = DOMID_SELF
+	};
+
+	pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT);
+	if (pages == NULL)
+		return 0;
+
+	memset(mcl, 0, sizeof(mcl));
+
+	while (i < nr_pages) {
+		unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE);
+
+		mcl[j].op = __HYPERVISOR_update_va_mapping;
+		mcl[j].args[0] = va;	/* other args are left as zeroed by the memset */
+
+		mfn_list[j++] = vtomach(va) >> PAGE_SHIFT;
+
+		xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY;
+
+		if (j == 16 || i == nr_pages) {	/* batch full, or last page queued */
+			mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL;
+
+			reservation.nr_extents = j;
+
+			mcl[j].op = __HYPERVISOR_memory_op;
+			mcl[j].args[0] = XENMEM_decrease_reservation;
+			mcl[j].args[1] = (unsigned long)&reservation;
+
+			(void)HYPERVISOR_multicall(mcl, j+1);
+
+			/* clear the flush flags so the slot is clean for the next batch */
+			mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0;
+			j = 0;
+		}
+	}
+
+	return (unsigned long)pages;
+}
+/* Take the first entry off pending_free; returns NULL if the list is empty. */
+static pending_req_t *
+alloc_req(void)
+{
+	pending_req_t *req;
+	mtx_lock(&pending_free_lock);
+	if ((req = STAILQ_FIRST(&pending_free))) {
+		STAILQ_REMOVE(&pending_free, req, pending_req, free_list);
+		STAILQ_NEXT(req, free_list) = NULL;
+	}
+	mtx_unlock(&pending_free_lock);
+	return req;
+}
+/*
+ * Put req back on the tail of pending_free.  If the list was empty before
+ * the insert, blk_req_task is enqueued on taskqueue_swi.
+ */
+static void
+free_req(pending_req_t *req)
+{
+	int was_empty;
+
+	mtx_lock(&pending_free_lock);
+	was_empty = STAILQ_EMPTY(&pending_free);
+	STAILQ_INSERT_TAIL(&pending_free, req, free_list);
+	mtx_unlock(&pending_free_lock);
+	if (was_empty)
+		taskqueue_enqueue(taskqueue_swi, &blk_req_task);
+}
+/*
+ * Unmap every segment of req that still holds a valid grant handle with one
+ * GNTTABOP_unmap_grant_ref call and mark those handles invalid.  The call
+ * is made even when no handle was valid (count 0).
+ */
+static void
+fast_flush_area(pending_req_t *req)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int i, invcount = 0;
+	grant_handle_t handle;
+	int ret;
+
+	for (i = 0; i < req->nr_pages; i++) {
+		handle = pending_handle(req, i);
+		if (handle == BLKBACK_INVALID_HANDLE)
+			continue;
+		unmap[invcount].host_addr    = vaddr(req, i);
+		unmap[invcount].dev_bus_addr = 0;
+		unmap[invcount].handle       = handle;
+		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+		invcount++;
+	}
+
+	ret = HYPERVISOR_grant_table_op(
+		GNTTABOP_unmap_grant_ref, unmap, invcount);
+	PANIC_IF(ret);
+}
+/* Take a reference on blkif. */
+static void
+blkif_get(blkif_t *blkif)
+{
+	atomic_add_int(&blkif->ref_cnt, 1);
+}
+/*
+ * Drop a reference on blkif.  Dropping the last one disconnects the ring
+ * and frees the mode, type and dev_name strings and the structure itself.
+ */
+static void
+blkif_put(blkif_t *blkif)
+{
+	if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) {
+		DPRINTF("Removing %x\n", (unsigned int)blkif);
+		disconnect_ring(blkif);
+		if (blkif->mode)
+			free(blkif->mode, M_DEVBUF);
+		if (blkif->type)
+			free(blkif->type, M_DEVBUF);
+		if (blkif->dev_name)
+			free(blkif->dev_name, M_DEVBUF);
+		free(blkif, M_DEVBUF);
+	}
+}
+
+static int +blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params) +{ + blkif_t *blkif; + + blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!blkif) + return ENOMEM; + + DPRINTF("Created %x\n", (unsigned int)blkif); + + blkif->ref_cnt = 1; + blkif->domid = xdev->otherend_id; + blkif->handle = handle; + blkif->mode = mode; + blkif->type = type; + blkif->dev_name = params; + blkif->xdev = xdev; + xdev->data = blkif; + + mtx_init(&blkif->blk_ring_lock, "blk_ring_ock", "blkback ring lock", MTX_DEF); + + if (strcmp(mode, "w")) + blkif->read_only = 1; + + return 0; +} + +static void +add_to_req_schedule_list_tail(blkif_t *blkif) +{ + if (!blkif->on_req_sched_list) { + mtx_lock(&req_sched_list_lock); + if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) { + blkif_get(blkif); + STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); + blkif->on_req_sched_list = 1; + taskqueue_enqueue(taskqueue_swi, &blk_req_task); + } + mtx_unlock(&req_sched_list_lock); + } +} + +/* This routine does not call blkif_get(), does not schedule the blk_req_task to run, + and assumes that the state is connected */ +static void +add_to_req_schedule_list_tail2(blkif_t *blkif) +{ + mtx_lock(&req_sched_list_lock); + if (!blkif->on_req_sched_list) { + STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); + blkif->on_req_sched_list = 1; + } + mtx_unlock(&req_sched_list_lock); +} + +/* Removes blkif from front of list and does not call blkif_put() (caller must) */ +static blkif_t * +remove_from_req_schedule_list(void) +{ + blkif_t *blkif; + + mtx_lock(&req_sched_list_lock); + + if ((blkif = STAILQ_FIRST(&req_sched_list))) { + STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req); + STAILQ_NEXT(blkif, next_req) = NULL; + blkif->on_req_sched_list = 0; + } + + mtx_unlock(&req_sched_list_lock); + + return blkif; +} + +static void +make_response(blkif_t *blkif, uint64_t id, + unsigned short op, int st) +{ + 
blkif_response_t *resp; + blkif_back_ring_t *blk_ring = &blkif->ring; + int more_to_do = 0; + int notify; + + mtx_lock(&blkif->blk_ring_lock); + + + /* Place on the response ring for the relevant domain. */ + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); + resp->id = id; + resp->operation = op; + resp->status = st; + blk_ring->rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify); + + if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). + */ + RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do); + + } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) + more_to_do = 1; + + mtx_unlock(&blkif->blk_ring_lock); + + if (more_to_do) + add_to_req_schedule_list_tail(blkif); + + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static void +end_block_io_op(struct bio *bio) +{ + pending_req_t *pending_req = bio->bio_caller2; + + if (bio->bio_error) { + DPRINTF("BIO returned error %d for operation on device %s\n", + bio->bio_error, pending_req->blkif->dev_name); + pending_req->status = BLKIF_RSP_ERROR; + pending_req->blkif->st_err_req++; + } + +#if 0 + printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n", + (unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags); +#endif + + if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) { + fast_flush_area(pending_req); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + free_req(pending_req); + } + + g_destroy_bio(bio); +} + +static void +dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req) +{ + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct { + unsigned long buf; unsigned int nsec; + } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int nseg = 
req->nr_segments, nr_sects = 0; + struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int operation, ret, i, nbio = 0; + + /* Check that number of segments is sane. */ + if (unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + DPRINTF("Bad number of segments in request (%d)\n", nseg); + goto fail_response; + } + + if (req->operation == BLKIF_OP_WRITE) { + if (blkif->read_only) { + DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name); + goto fail_response; + } + operation = BIO_WRITE; + } else + operation = BIO_READ; + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = req->operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + + for (i = 0; i < nseg; i++) { + seg[i].nsec = req->seg[i].last_sect - + req->seg[i].first_sect + 1; + + if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (seg[i].nsec <= 0)) + goto fail_response; + nr_sects += seg[i].nsec; + + map[i].host_addr = vaddr(pending_req, i); + map[i].dom = blkif->domid; + map[i].ref = req->seg[i].gref; + map[i].flags = GNTMAP_host_map; + if (operation == BIO_WRITE) + map[i].flags |= GNTMAP_readonly; + } + + /* Convert to the disk's sector size */ + nr_sects = (nr_sects << 9) >> blkif->sector_size_shift; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + PANIC_IF(ret); + + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].status != 0)) { + DPRINTF("invalid buffer -- could not remap it\n"); + goto fail_flush; + } + + pending_handle(pending_req, i) = map[i].handle; +#if 0 + /* Can't do this in FreeBSD since vtophys() returns the pfn */ + /* of the remote domain who loaned us the machine page - DPT */ + xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] = + map[i]dev_bus_addr >> PAGE_SHIFT; +#endif + seg[i].buf = map[i].dev_bus_addr | + (req->seg[i].first_sect << 9); + } + + if (req->sector_number + nr_sects > blkif->media_num_sectors) { + DPRINTF("%s of [%llu,%llu] 
extends past end of device %s\n", + operation == BIO_READ ? "read" : "write", + req->sector_number, + req->sector_number + nr_sects, blkif->dev_name); + goto fail_flush; + } + + for (i = 0; i < nseg; i++) { + struct bio *bio; + + if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) { + DPRINTF("Misaligned I/O request from domain %d", blkif->domid); + goto fail_put_bio; + } + + bio = biolist[nbio++] = g_new_bio(); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bio_cmd = operation; + bio->bio_offset = req->sector_number << blkif->sector_size_shift; + bio->bio_length = seg[i].nsec << 9; + bio->bio_bcount = bio->bio_length; + bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK)); + bio->bio_done = end_block_io_op; + bio->bio_caller2 = pending_req; + bio->bio_dev = blkif->cdev; + + req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift; +#if 0 + printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n", + (unsigned int)bio, req->operation, req->sector_number, seg[i].nsec, + blkif->cdev->si_iosize_max, seg[i].buf); +#endif + } + + pending_req->pendcnt = nbio; + blkif_get(blkif); + + for (i = 0; i < nbio; i++) + (*blkif->csw->d_strategy)(biolist[i]); + + return; + + fail_put_bio: + for (i = 0; i < (nbio-1); i++) + g_destroy_bio(biolist[i]); + fail_flush: + fast_flush_area(pending_req); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); +} + +static void +blk_req_action(void *context, int pending) +{ + blkif_t *blkif; + + DPRINTF("\n"); + + while (!STAILQ_EMPTY(&req_sched_list)) { + blkif_back_ring_t *blk_ring; + RING_IDX rc, rp; + + blkif = remove_from_req_schedule_list(); + + blk_ring = &blkif->ring; + rc = blk_ring->req_cons; + rp = blk_ring->sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ + + while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) { + blkif_request_t *req; + pending_req_t *pending_req; + + pending_req = alloc_req(); + if (pending_req == NULL) + goto out_of_preqs; + + req = RING_GET_REQUEST(blk_ring, rc); + blk_ring->req_cons = ++rc; /* before make_response() */ + + switch (req->operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + dispatch_rw_block_io(blkif, req, pending_req); + break; + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + dispatch_rw_block_io(blkif, req, pending_req); + break; + default: + blkif->st_err_req++; + DPRINTF("error: unknown block io operation [%d]\n", + req->operation); + make_response(blkif, req->id, req->operation, + BLKIF_RSP_ERROR); + free_req(pending_req); + break; + } + } + + blkif_put(blkif); + } + + return; + + out_of_preqs: + /* We ran out of pending req structs */ + /* Just requeue interface and wait to be rescheduled to run when one is freed */ + add_to_req_schedule_list_tail2(blkif); + blkif->st_oo_req++; +} + +/* Handle interrupt from a frontend */ +static void +blkback_intr(void *arg) +{ + blkif_t *blkif = arg; + DPRINTF("%x\n", (unsigned int)blkif); + add_to_req_schedule_list_tail(blkif); +} + +/* Map grant ref for ring */ +static int +map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring) +{ + struct gnttab_map_grant_ref op; + + ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); + if (ring->va == 0) + return ENOMEM; + + op.host_addr = ring->va; + op.flags = GNTMAP_host_map; + op.ref = ref; + op.dom = dom; + HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1); + if (op.status) { + WPRINTF("grant table op err=%d\n", op.status); + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; + return EACCES; + } + + ring->handle = op.handle; + ring->bus_addr = op.dev_bus_addr; + + return 0; +} + +/* Unmap grant ref for ring */ +static void +unmap_ring(struct ring_ref *ring) +{ + struct gnttab_unmap_grant_ref op; + + op.host_addr = ring->va; + op.dev_bus_addr = 
ring->bus_addr; + op.handle = ring->handle; + HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1); + if (op.status) + WPRINTF("grant table op err=%d\n", op.status); + + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; +} + +static int +connect_ring(blkif_t *blkif) +{ + struct xenbus_device *xdev = blkif->xdev; + blkif_sring_t *ring; + unsigned long ring_ref; + evtchn_port_t evtchn; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; + + if (blkif->ring_connected) + return 0; + + // Grab FE data and map his memory + err = xenbus_gather(NULL, xdev->otherend, + "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(xdev, err, + "reading %s/ring-ref and event-channel", + xdev->otherend); + return err; + } + + err = map_ring(ring_ref, blkif->domid, &blkif->rr); + if (err) { + xenbus_dev_fatal(xdev, err, "mapping ring"); + return err; + } + ring = (blkif_sring_t *)blkif->rr.va; + BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE); + + op.u.bind_interdomain.remote_dom = blkif->domid; + op.u.bind_interdomain.remote_port = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_ring(&blkif->rr); + xenbus_dev_fatal(xdev, err, "binding event channel"); + return err; + } + blkif->evtchn = op.u.bind_interdomain.local_port; + + /* bind evtchn to irq handler */ + blkif->irq = + bind_evtchn_to_irqhandler(blkif->evtchn, "blkback", + blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie); + + blkif->ring_connected = 1; + + DPRINTF("%x rings connected! 
evtchn=%d irq=%d\n", + (unsigned int)blkif, blkif->evtchn, blkif->irq); + + return 0; +} + +static void +disconnect_ring(blkif_t *blkif) +{ + DPRINTF("\n"); + + if (blkif->ring_connected) { + unbind_from_irqhandler(blkif->irq, blkif->irq_cookie); + blkif->irq = 0; + unmap_ring(&blkif->rr); + blkif->ring_connected = 0; + } +} + +static void +connect(blkif_t *blkif) +{ + struct xenbus_transaction *xbt; + struct xenbus_device *xdev = blkif->xdev; + int err; + + if (!blkif->ring_connected || + blkif->vn == NULL || + blkif->state == XenbusStateConnected) + return; + + DPRINTF("%s\n", xdev->otherend); + + /* Supply the information about the device the frontend needs */ +again: + xbt = xenbus_transaction_start(); + if (IS_ERR(xbt)) { + xenbus_dev_fatal(xdev, PTR_ERR(xbt), + "Error writing configuration for backend " + "(start transaction)"); + return; + } + + err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u", + blkif->media_num_sectors); + if (err) { + xenbus_dev_fatal(xdev, err, "writing %s/sectors", + xdev->nodename); + goto abort; + } + + err = xenbus_printf(xbt, xdev->nodename, "info", "%u", + blkif->read_only ? 
VDISK_READONLY : 0); + if (err) { + xenbus_dev_fatal(xdev, err, "writing %s/info", + xdev->nodename); + goto abort; + } + err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u", + blkif->sector_size); + if (err) { + xenbus_dev_fatal(xdev, err, "writing %s/sector-size", + xdev->nodename); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + xenbus_dev_fatal(xdev, err, "ending transaction"); + + err = xenbus_switch_state(xdev, NULL, XenbusStateConnected); + if (err) + xenbus_dev_fatal(xdev, err, "switching to Connected state", + xdev->nodename); + + blkif->state = XenbusStateConnected; + + return; + + abort: + xenbus_transaction_end(xbt, 1); +} + +static int +blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id) +{ + int err; + char *p, *mode = NULL, *type = NULL, *params = NULL; + long handle; + + DPRINTF("node=%s\n", xdev->nodename); + + p = strrchr(xdev->otherend, '/') + 1; + handle = strtoul(p, NULL, 0); + + mode = xenbus_read(NULL, xdev->nodename, "mode", NULL); + if (IS_ERR(mode)) { + xenbus_dev_fatal(xdev, PTR_ERR(mode), "reading mode"); + err = PTR_ERR(mode); + goto error; + } + + type = xenbus_read(NULL, xdev->nodename, "type", NULL); + if (IS_ERR(type)) { + xenbus_dev_fatal(xdev, PTR_ERR(type), "reading type"); + err = PTR_ERR(type); + goto error; + } + + params = xenbus_read(NULL, xdev->nodename, "params", NULL); + if (IS_ERR(type)) { + xenbus_dev_fatal(xdev, PTR_ERR(params), "reading params"); + err = PTR_ERR(params); + goto error; + } + + err = blkif_create(xdev, handle, mode, type, params); + if (err) { + xenbus_dev_fatal(xdev, err, "creating blkif"); + goto error; + } + + err = vbd_add_dev(xdev); + if (err) { + blkif_put((blkif_t *)xdev->data); + xenbus_dev_fatal(xdev, err, "adding vbd device"); + } + + return err; + + error: + if (mode) + free(mode, M_DEVBUF); + if (type) + free(type, M_DEVBUF); + if (params) + free(params, M_DEVBUF); + return err; +} + +static int 
+blkback_remove(struct xenbus_device *xdev) +{ + blkif_t *blkif = xdev->data; + device_t ndev; + + DPRINTF("node=%s\n", xdev->nodename); + + blkif->state = XenbusStateClosing; + + if ((ndev = blkif->ndev)) { + blkif->ndev = NULL; + mtx_lock(&Giant); + device_detach(ndev); + mtx_unlock(&Giant); + } + + xdev->data = NULL; + blkif->xdev = NULL; + blkif_put(blkif); + + return 0; +} + +static int +blkback_resume(struct xenbus_device *xdev) +{ + DPRINTF("node=%s\n", xdev->nodename); + return 0; +} + +static void +frontend_changed(struct xenbus_device *xdev, + XenbusState frontend_state) +{ + blkif_t *blkif = xdev->data; + + DPRINTF("state=%d\n", frontend_state); + + blkif->frontend_state = frontend_state; + + switch (frontend_state) { + case XenbusStateInitialising: + break; + case XenbusStateInitialised: + case XenbusStateConnected: + connect_ring(blkif); + connect(blkif); + break; + case XenbusStateClosing: + xenbus_switch_state(xdev, NULL, XenbusStateClosing); + break; + case XenbusStateClosed: + xenbus_remove_device(xdev); + break; + case XenbusStateUnknown: + case XenbusStateInitWait: + xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +/* ** Driver registration ** */ + +static struct xenbus_device_id blkback_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver blkback = { + .name = "blkback", + .ids = blkback_ids, + .probe = blkback_probe, + .remove = blkback_remove, + .resume = blkback_resume, + .otherend_changed = frontend_changed, +}; + +static void +blkback_init(void *unused) +{ + int i; + + TASK_INIT(&blk_req_task, 0, blk_req_action, NULL); + mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF); + + mtx_init(&pending_free_lock, "blk_pending_req_ock", "blkback pending request lock", MTX_DEF); + + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; + pending_reqs = malloc(sizeof(pending_reqs[0]) * + blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT); + pending_grant_handles 
= malloc(sizeof(pending_grant_handles[0]) * + mmap_pages, M_DEVBUF, M_NOWAIT); + pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) * + mmap_pages, M_DEVBUF, M_NOWAIT); + mmap_vstart = alloc_empty_page_range(mmap_pages); + if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) { + if (pending_reqs) + free(pending_reqs, M_DEVBUF); + if (pending_grant_handles) + free(pending_grant_handles, M_DEVBUF); + if (pending_vaddrs) + free(pending_vaddrs, M_DEVBUF); + WPRINTF("out of memory\n"); + return; + } + + for (i = 0; i < mmap_pages; i++) { + pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); + pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + } + + for (i = 0; i < blkif_reqs; i++) { + STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list); + } + + DPRINTF("registering %s\n", blkback.name); + xenbus_register_backend(&blkback); +} + +SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL) + +static void +close_device(blkif_t *blkif) +{ + DPRINTF("closing dev=%s\n", blkif->dev_name); + if (blkif->vn) { + int flags = FREAD; + + if (!blkif->read_only) + flags |= FWRITE; + + if (blkif->csw) { + dev_relthread(blkif->cdev); + blkif->csw = NULL; + } + + (void)vn_close(blkif->vn, flags, NOCRED, curthread); + blkif->vn = NULL; + } +} + +static int +open_device(blkif_t *blkif) +{ + struct nameidata nd; + struct vattr vattr; + struct cdev *dev; + struct cdevsw *devsw; + int flags = FREAD, err = 0; + + DPRINTF("opening dev=%s\n", blkif->dev_name); + + if (!blkif->read_only) + flags |= FWRITE; + + if (!curthread->td_proc->p_fd->fd_cdir) { + curthread->td_proc->p_fd->fd_cdir = rootvnode; + VREF(rootvnode); + } + if (!curthread->td_proc->p_fd->fd_rdir) { + curthread->td_proc->p_fd->fd_rdir = rootvnode; + VREF(rootvnode); + } + if (!curthread->td_proc->p_fd->fd_jdir) { + curthread->td_proc->p_fd->fd_jdir = rootvnode; + VREF(rootvnode); + } + + again: + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, curthread); + err = 
vn_open(&nd, &flags, 0, -1); + if (err) { + if (blkif->dev_name[0] != '/') { + char *dev_path = "/dev/"; + char *dev_name; + + /* Try adding device path at beginning of name */ + dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT); + if (dev_name) { + sprintf(dev_name, "%s%s", dev_path, blkif->dev_name); + free(blkif->dev_name, M_DEVBUF); + blkif->dev_name = dev_name; + goto again; + } + } + xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name); + return err; + } + NDFREE(&nd, NDF_ONLY_PNBUF); + + blkif->vn = nd.ni_vp; + + /* We only support disks for now */ + if (!vn_isdisk(blkif->vn, &err)) { + xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name); + VOP_UNLOCK(blkif->vn, 0, curthread); + goto error; + } + + blkif->cdev = blkif->vn->v_rdev; + blkif->csw = dev_refthread(blkif->cdev); + PANIC_IF(blkif->csw == NULL); + + err = VOP_GETATTR(blkif->vn, &vattr, NOCRED); + if (err) { + xenbus_dev_fatal(blkif->xdev, err, + "error getting vnode attributes for device %s", blkif->dev_name); + VOP_UNLOCK(blkif->vn, 0, curthread); + goto error; + } + + VOP_UNLOCK(blkif->vn, 0, curthread); + + dev = blkif->vn->v_rdev; + devsw = dev->si_devsw; + if (!devsw->d_ioctl) { + err = ENODEV; + xenbus_dev_fatal(blkif->xdev, err, + "no d_ioctl for device %s!", blkif->dev_name); + goto error; + } + + err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread); + if (err) { + xenbus_dev_fatal(blkif->xdev, err, + "error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name); + goto error; + } + blkif->sector_size_shift = fls(blkif->sector_size) - 1; + + err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread); + if (err) { + xenbus_dev_fatal(blkif->xdev, err, + "error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name); + goto error; + } + blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift; + + 
blkif->major = umajor(vattr.va_rdev); + blkif->minor = uminor(vattr.va_rdev); + + DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n", + blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size); + + return 0; + + error: + close_device(blkif); + return err; +} + +static int +vbd_add_dev(struct xenbus_device *xdev) +{ + blkif_t *blkif = xdev->data; + device_t nexus, ndev; + devclass_t dc; + int err = 0; + + mtx_lock(&Giant); + + /* We will add a vbd device as a child of nexus0 (for now) */ + if (!(dc = devclass_find("nexus")) || + !(nexus = devclass_get_device(dc, 0))) { + WPRINTF("could not find nexus0!\n"); + err = ENOENT; + goto done; + } + + + /* Create a newbus device representing the vbd */ + ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle); + if (!ndev) { + WPRINTF("could not create newbus device vbd%d!\n", blkif->handle); + err = EFAULT; + goto done; + } + + blkif_get(blkif); + device_set_ivars(ndev, blkif); + blkif->ndev = ndev; + + device_probe_and_attach(ndev); + + done: + + mtx_unlock(&Giant); + + return err; +} + +enum { + VBD_SYSCTL_DOMID, + VBD_SYSCTL_ST_RD_REQ, + VBD_SYSCTL_ST_WR_REQ, + VBD_SYSCTL_ST_OO_REQ, + VBD_SYSCTL_ST_ERR_REQ, + VBD_SYSCTL_RING, +}; + +static char * +vbd_sysctl_ring_info(blkif_t *blkif, int cmd) +{ + char *buf = malloc(256, M_DEVBUF, M_WAITOK); + if (buf) { + if (!blkif->ring_connected) + sprintf(buf, "ring not connected\n"); + else { + blkif_back_ring_t *ring = &blkif->ring; + sprintf(buf, "nr_ents=%x req_cons=%x" + " req_prod=%x req_event=%x" + " rsp_prod=%x rsp_event=%x", + ring->nr_ents, ring->req_cons, + ring->sring->req_prod, ring->sring->req_event, + ring->sring->rsp_prod, ring->sring->rsp_event); + } + } + return buf; +} + +static int +vbd_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + device_t dev = (device_t)arg1; + blkif_t *blkif = (blkif_t *)device_get_ivars(dev); + const char *value; + char *buf = NULL; + int err; + + switch (arg2) { + case VBD_SYSCTL_DOMID: + return 
sysctl_handle_int(oidp, NULL, blkif->domid, req); + case VBD_SYSCTL_ST_RD_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req); + case VBD_SYSCTL_ST_WR_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req); + case VBD_SYSCTL_ST_OO_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req); + case VBD_SYSCTL_ST_ERR_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req); + case VBD_SYSCTL_RING: + value = buf = vbd_sysctl_ring_info(blkif, arg2); + break; + default: + return (EINVAL); + } + + err = SYSCTL_OUT(req, value, strlen(value)); + if (buf != NULL) + free(buf, M_DEVBUF); + + return err; +} + +/* Newbus vbd device driver probe */ +static int +vbd_probe(device_t dev) +{ + DPRINTF("vbd%d\n", device_get_unit(dev)); + return 0; +} + +/* Newbus vbd device driver attach */ +static int +vbd_attach(device_t dev) +{ + blkif_t *blkif = (blkif_t *)device_get_ivars(dev); + + DPRINTF("%s\n", blkif->dev_name); + + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I", + "domid of frontend"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I", + "number of read reqs"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I", + "number of write reqs"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I", + "number of deferred reqs"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, 
VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I", + "number of reqs that returned error"); +#if XEN_BLKBACK_DEBUG + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "ring", CTLFLAG_RD, + dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A", + "req ring info"); +#endif + + if (!open_device(blkif)) + connect(blkif); + + return bus_generic_attach(dev); +} + +/* Newbus vbd device driver detach */ +static int +vbd_detach(device_t dev) +{ + blkif_t *blkif = (blkif_t *)device_get_ivars(dev); + + DPRINTF("%s\n", blkif->dev_name); + + close_device(blkif); + + bus_generic_detach(dev); + + blkif_put(blkif); + + return 0; +} + +static device_method_t vbd_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vbd_probe), + DEVMETHOD(device_attach, vbd_attach), + DEVMETHOD(device_detach, vbd_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + {0, 0} +}; + +static devclass_t vbd_devclass; + +static driver_t vbd_driver = { + "vbd", + vbd_methods, + 0, +}; + +DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0); + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ Property changes on: dev/xen/blkback/blkback.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: libkern/strcspn.c =================================================================== --- libkern/strcspn.c (.../stable/6/sys) (revision 0) +++ libkern/strcspn.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2005 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#define IDX(c) ((u_char)(c) / LONG_BIT) +#define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT)) + +size_t +strcspn(const char *s, const char *charset) +{ + /* + * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to + * generate better code. Without them, gcc gets a little confused. 
+ */ + const char *s1; + u_long bit; + u_long tbl[(UCHAR_MAX + 1) / LONG_BIT]; + int idx; + + if(*s == '\0') + return (0); + +#if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */ + tbl[0] = 1; + tbl[3] = tbl[2] = tbl[1] = 0; +#else + for (tbl[0] = idx = 1; idx < sizeof(tbl) / sizeof(tbl[0]); idx++) + tbl[idx] = 0; +#endif + for (; *charset != '\0'; charset++) { + idx = IDX(*charset); + bit = BIT(*charset); + tbl[idx] |= bit; + } + + for(s1 = s; ; s1++) { + idx = IDX(*s1); + bit = BIT(*s1); + if ((tbl[idx] & bit) != 0) + break; + } + return (s1 - s); +} Property changes on: libkern/strcspn.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/include/smp.h =================================================================== --- i386/include/smp.h (.../stable/6/sys) (revision 184012) +++ i386/include/smp.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -68,7 +68,9 @@ void ipi_all(u_int ipi); void ipi_all_but_self(u_int ipi); void ipi_self(u_int ipi); +#ifndef XEN void ipi_bitmap_handler(struct clockframe frame); +#endif u_int mp_bootaddress(u_int); int mp_grab_cpu_hlt(void); void mp_topology(void); @@ -85,7 +87,14 @@ int ipi_nmi_handler(void); void ipi_nmi_selected(u_int32_t cpus); #endif +#ifdef XEN +void ipi_to_irq_init(void); +#define RESCHEDULE_VECTOR 0 +#define CALL_FUNCTION_VECTOR 1 +#define NR_IPIS 2 + +#endif #endif /* !LOCORE */ #endif /* SMP */ Index: i386/include/param.h =================================================================== --- i386/include/param.h (.../stable/6/sys) (revision 184012) +++ i386/include/param.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -86,9 +86,11 @@ #ifdef PAE #define NPGPTD 4 #define PDRSHIFT 21 /* LOG2(NBPDR) */ +#define NPGPTD_SHIFT 9 #else #define NPGPTD 1 #define PDRSHIFT 22 /* LOG2(NBPDR) */ +#define NPGPTD_SHIFT 10 #endif #define NBPTD (NPGPTD< +#include +#include 
+#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#define PG_KERNEL (PG_V | PG_A | PG_RW | PG_M) + +#define MACH_TO_VM_PAGE(ma) PHYS_TO_VM_PAGE(xpmap_mtop((ma))) +#define VM_PAGE_TO_MACH(m) xpmap_ptom(VM_PAGE_TO_PHYS((m))) + +static __inline vm_paddr_t +pmap_kextract_ma(vm_offset_t va) +{ + vm_paddr_t ma; + if ((ma = PTD[va >> PDRSHIFT]) & PG_PS) { + ma = (ma & ~(NBPDR - 1)) | (va & (NBPDR - 1)); + } else { + ma = (*vtopte(va) & PG_FRAME) | (va & PAGE_MASK); + } + return ma; +} + +static __inline vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + return xpmap_mtop(pmap_kextract_ma(va)); +} +#define vtomach(va) pmap_kextract_ma(((vm_offset_t) (va))) + +vm_paddr_t pmap_extract_ma(struct pmap *pmap, vm_offset_t va); + +void pmap_kenter_ma(vm_offset_t va, vm_paddr_t pa); +void pmap_map_readonly(struct pmap *pmap, vm_offset_t va, int len); +void pmap_map_readwrite(struct pmap *pmap, vm_offset_t va, int len); + +static __inline pt_entry_t +pte_load_store(pt_entry_t *ptep, pt_entry_t v) +{ + pt_entry_t r; + + v = xpmap_ptom(v); + r = *ptep; + PT_SET_VA(ptep, v, TRUE); + return (r); +} + +static __inline pt_entry_t +pte_load_store_ma(pt_entry_t *ptep, pt_entry_t v) +{ + pt_entry_t r; + + r = *ptep; + PT_SET_VA_MA(ptep, v, TRUE); + return (r); +} + +#define pte_load_clear(ptep) pte_load_store((ptep), (pt_entry_t)0ULL) + +#define pte_store(ptep, pte) pte_load_store((ptep), (pt_entry_t)pte) +#define pte_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) +#define pde_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) + +#elif !defined(XEN) + /* * Routine: pmap_kextract * Function: @@ -195,11 +273,10 @@ } return pa; } +#endif -#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) +#if defined(PAE) && !defined(XEN) -#ifdef PAE - static __inline pt_entry_t pte_load(pt_entry_t *ptep) { @@ -231,7 +308,7 @@ #define pte_store(ptep, pte) pte_load_store((ptep), (pt_entry_t)pte) -#else /* PAE */ +#elif !defined (PAE) && !defined(XEN) 
static __inline pt_entry_t pte_load(pt_entry_t *ptep) Index: i386/include/vmparam.h =================================================================== --- i386/include/vmparam.h (.../stable/6/sys) (revision 184012) +++ i386/include/vmparam.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -83,8 +83,12 @@ * Kernel physical load address. */ #ifndef KERNLOAD +#if defined(XEN) && !defined(XEN_PRIVILEGED_GUEST) +#define KERNLOAD 0 +#else #define KERNLOAD (1 << PDRSHIFT) #endif +#endif /* * Virtual addresses of things. Derived from the page directory and @@ -93,7 +97,11 @@ * messy at times, but hey, we'll do anything to save a page :-) */ +#ifdef XEN +#define VM_MAX_KERNEL_ADDRESS HYPERVISOR_VIRT_START +#else #define VM_MAX_KERNEL_ADDRESS VADDR(KPTDI+NKPDE-1, NPTEPG-1) +#endif #define VM_MIN_KERNEL_ADDRESS VADDR(PTDPTDI, PTDPTDI) #define KERNBASE VADDR(KPTDI, 0) Index: i386/include/xen/xen-os.h =================================================================== --- i386/include/xen/xen-os.h (.../stable/6/sys) (revision 0) +++ i386/include/xen/xen-os.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,291 @@ +/****************************************************************************** + * os.h + * + * random collection of macros and definition + */ + +#ifndef _XEN_OS_H_ +#define _XEN_OS_H_ + +#ifdef PAE +#define CONFIG_X86_PAE +#endif + +#if defined(XEN) && !defined(__XEN_INTERFACE_VERSION__) +/* + * Can update to a more recent version when we implement + * the hypercall page + */ +#define __XEN_INTERFACE_VERSION__ 0x00030204 +#endif + +#include + +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +extern int gdtset; + +extern shared_info_t *HYPERVISOR_shared_info; + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
 */ +static inline void rep_nop(void) +{ + __asm__ __volatile__ ( "rep;nop" : : : "memory" ); +} +#define cpu_relax() rep_nop() + +/* crude memory allocator for memory allocation early in + * boot + */ +void *bootmem_alloc(unsigned int size); +void bootmem_free(void *ptr, unsigned int size); + + +/* Everything below this point is not included by assembler (.S) files. */ +#ifndef __ASSEMBLY__ + +/* some function prototypes */ +void trap_init(void); + +/* + * STI/CLI equivalents. These set and clear the per-VCPU + * evtchn_upcall_mask flag in the shared_info structure. Note that when + * events are unmasked, there may be pending events to be handled, so + * __sti() and __restore_flags() call force_evtchn_callback() in that case. + */ +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + + + +#define __cli() \ +do { \ + vcpu_info_t *_vcpu; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + _vcpu->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define __sti() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ +} while (0) + +#define __restore_flags(x) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ + } \ +} while (0) + +/* + * Add critical_{enter, exit}?
+ * + */ +#define __save_and_cli(x) \ +do { \ + vcpu_info_t *_vcpu; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + (x) = _vcpu->evtchn_upcall_mask; \ + _vcpu->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + + +#define cli() __cli() +#define sti() __sti() +#define save_flags(x) __save_flags(x) +#define restore_flags(x) __restore_flags(x) +#define save_and_cli(x) __save_and_cli(x) + +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#define local_irq_disable() __cli() +#define local_irq_enable() __sti() + +#define mtx_lock_irqsave(lock, x) {local_irq_save((x)); mtx_lock_spin((lock));} +#define mtx_unlock_irqrestore(lock, x) {mtx_unlock_spin((lock)); local_irq_restore((x)); } +#define spin_lock_irqsave mtx_lock_irqsave +#define spin_unlock_irqrestore mtx_unlock_irqrestore + + +#ifndef mb +#define mb() __asm__ __volatile__("lock; addl $0, 0(%%esp)": : :"memory") +#endif +#ifndef rmb +#define rmb() mb() +#endif +#ifndef wmb +#define wmb() barrier() +#endif +#ifdef SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while(0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + + +/* This is a barrier for the compiler only, NOT the processor! */ +#define barrier() __asm__ __volatile__("": : :"memory") + +#define LOCK_PREFIX "" +#define LOCK "" +#define ADDR (*(volatile long *) addr) +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. 
 */ +typedef struct { volatile int counter; } atomic_t; + + + +#define xen_xchg(ptr,v) \ + ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) +struct __xchg_dummy { unsigned long a[100]; }; +#define __xg(x) ((volatile struct __xchg_dummy *)(x)) +static __inline unsigned long __xchg(unsigned long x, volatile void * ptr, + int size) +{ + switch (size) { + case 1: + __asm__ __volatile__("xchgb %b0,%1" + :"=q" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 2: + __asm__ __volatile__("xchgw %w0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 4: + __asm__ __volatile__("xchgl %0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + } + return x; +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is intended to be atomic and cannot be reordered. + * It also implies a memory barrier (NOTE(review): LOCK_PREFIX is "" in this header -- confirm for SMP). + */ +static __inline int test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + "btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"=m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +static __inline int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline int variable_test_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( + "btl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit) + :"m" (ADDR),"Ir" (nr)); + return oldbit; +} + +#define test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + constant_test_bit((nr),(addr)) : \ + variable_test_bit((nr),(addr))) + + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees.
+ * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btsl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static __inline__ void clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + LOCK "incl %0" + :"=m" (v->counter) + :"m" (v->counter)); +} + + +#define rdtscll(val) \ + __asm__ __volatile__("rdtsc" : "=A" (val)) + +#endif /* !__ASSEMBLY__ */ + +#endif /* _OS_H_ */ Property changes on: i386/include/xen/xen-os.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/include/xen/hypercall.h =================================================================== --- i386/include/xen/hypercall.h (.../stable/6/sys) (revision 0) +++ i386/include/xen/hypercall.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,402 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. 
+ * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __MACHINE_XEN_HYPERCALL_H__ +#define __MACHINE_XEN_HYPERCALL_H__ + +#define __STR(x) #x +#define STR(x) __STR(x) +#define ENOXENSYS 38 +#define CONFIG_XEN_COMPAT 0x030002 + + +#if defined(XEN) +#define HYPERCALL_STR(name) \ + "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)" +#else +#define HYPERCALL_STR(name) \ + "mov hypercall_stubs,%%eax; " \ + "add $("STR(__HYPERVISOR_##name)" * 32),%%eax; " \ + "call *%%eax" +#endif + +#define _hypercall0(type, name) \ +({ \ + long __res; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res) \ + : \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + long __res, __ign1; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1) \ + : "1" ((long)(a1)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + long __res, __ign1, __ign2; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ + : "1" ((long)(a1)), "2" ((long)(a2)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + long __res, __ign1, __ign2, __ign3; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + long __res, __ign1, __ign2, __ign3, __ign4; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" (__ign4) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)), "4" ((long)(a4)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" 
(__ign4), "=D" (__ign5) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)), "4" ((long)(a4)), \ + "5" ((long)(a5)) \ + : "memory" ); \ + (type)__res; \ +}) + +static inline int +HYPERVISOR_set_trap_table( + trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int +HYPERVISOR_mmu_update( + mmu_update_t *req, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int +HYPERVISOR_mmuext_op( + mmuext_op_t *op, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int +HYPERVISOR_set_gdt( + unsigned long *frame_list, int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +static inline int +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} + +static inline int +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +static inline long +HYPERVISOR_set_timer_op( + uint64_t timeout) +{ + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); +} +#if 0 +static inline int +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + 
platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} +#endif +static inline int +HYPERVISOR_set_debugreg( + int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long +HYPERVISOR_get_debugreg( + int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int +HYPERVISOR_update_descriptor( + uint64_t ma, uint64_t desc) +{ + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); +} + +static inline int +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int +HYPERVISOR_multicall( + void *call_list, int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int +HYPERVISOR_update_va_mapping( + unsigned long va, uint64_t new_val, unsigned long flags) +{ + uint32_t hi, lo; + + lo = (uint32_t)(new_val & 0xffffffff); + hi = (uint32_t)(new_val >> 32); + + return _hypercall4(int, update_va_mapping, va, + lo, hi, flags); +} + +static inline int +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (__predict_false(rc == -ENOXENSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + return (rc); +} + +static inline int +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int +HYPERVISOR_console_io( + int cmd, int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (__predict_false(rc == -ENOXENSYS)) { + struct physdev_op op; 
+ op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + return (rc); +} + +static inline int +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); +} + +static inline int +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, uint64_t new_val, unsigned long flags, domid_t domid) +{ + uint32_t hi, lo; + + lo = (uint32_t)(new_val & 0xffffffff); + hi = (uint32_t)(new_val >> 32); + + return _hypercall5(int, update_va_mapping_otherdomain, va, + lo, hi, flags, domid); +} + +static inline int +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int +HYPERVISOR_vcpu_op( + int cmd, int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOXENSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + return (rc); +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +static inline int +HYPERVISOR_callback_op( + int cmd, void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +#ifndef CONFIG_XEN +static inline unsigned long +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + 
return _hypercall2(int, kexec_op, op, args); +} + +#endif /* __MACHINE_XEN_HYPERCALL_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ Property changes on: i386/include/xen/hypercall.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/include/xen/xenvar.h =================================================================== --- i386/include/xen/xenvar.h (.../stable/6/sys) (revision 0) +++ i386/include/xen/xenvar.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2008 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * $FreeBSD$ + */ +#ifndef XENVAR_H_ +#define XENVAR_H_ +#define XBOOTUP 0x1 +#define XPMAP 0x2 +extern int xendebug_flags; +#ifndef NOXENDEBUG +#define XENPRINTF printk +#else +#define XENPRINTF printf +#endif + +extern xen_pfn_t *xen_phys_machine; +extern xen_pfn_t *xen_pfn_to_mfn_frame_list[16]; +extern xen_pfn_t *xen_pfn_to_mfn_frame_list_list; + +#if 0 +#define TRACE_ENTER XENPRINTF("(file=%s, line=%d) entered %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_EXIT XENPRINTF("(file=%s, line=%d) exiting %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_DEBUG(argflags, _f, _a...) \ +if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a); +#else +#define TRACE_ENTER +#define TRACE_EXIT +#define TRACE_DEBUG(argflags, _f, _a...) 
+#endif + +extern xen_pfn_t *xen_machine_phys; +/* Xen starts physical pages after the 4MB ISA hole - + * FreeBSD doesn't + */ + + +#undef ADD_ISA_HOLE /* XXX */ + +#ifdef ADD_ISA_HOLE +#define ISA_INDEX_OFFSET 1024 +#define ISA_PDR_OFFSET 1 +#else +#define ISA_INDEX_OFFSET 0 +#define ISA_PDR_OFFSET 0 +#endif + + +#define PFNTOMFN(i) (xen_phys_machine[(i)]) +#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)]) + +#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE) +#define PTOV(x) (((uintptr_t)(x)) + KERNBASE) + +#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT) +#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT) + +#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT) +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) + +#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT) +#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT) + + +void xpq_init(void); + +int xen_create_contiguous_region(vm_page_t pages, int npages); + +void xen_destroy_contiguous_region(void * addr, int npages); + +#endif Property changes on: i386/include/xen/xenvar.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/include/xen/synch_bitops.h =================================================================== --- i386/include/xen/synch_bitops.h (.../stable/6/sys) (revision 0) +++ i386/include/xen/synch_bitops.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,139 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. + * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. 
+ */ + + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +struct __synch_xchg_dummy { unsigned long a[100]; }; +#define __synch_xg(x) ((volatile struct __synch_xchg_dummy *)(x)) + +#define synch_cmpxchg(ptr, old, new) \ +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ + (unsigned long)(old), \ + (unsigned long)(new), \ + sizeof(*(ptr)))) + +static inline unsigned long __synch_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#ifdef 
CONFIG_X86_64 + case 4: + __asm__ __volatile__("lock; cmpxchgl %k1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#else + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#endif + } + return old; +} + +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline__ int synch_var_test_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "btl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) ); + return oldbit; +} + +#define synch_test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + synch_const_test_bit((nr),(addr)) : \ + synch_var_test_bit((nr),(addr))) + +#endif /* __XEN_SYNCH_BITOPS_H__ */ Property changes on: i386/include/xen/synch_bitops.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/include/xen/xenfunc.h =================================================================== --- i386/include/xen/xenfunc.h (.../stable/6/sys) (revision 0) +++ i386/include/xen/xenfunc.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,79 @@ +/* + * + * Copyright (c) 2004,2005 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENFUNC_H_ +#define _XEN_XENFUNC_H_ + +#include +#include +#include +#include +#include +#define BKPT __asm__("int3"); +#define XPQ_CALL_DEPTH 5 +#define XPQ_CALL_COUNT 2 +#define PG_PRIV PG_AVAIL3 +typedef struct { + unsigned long pt_ref; + unsigned long pt_eip[XPQ_CALL_COUNT][XPQ_CALL_DEPTH]; +} pteinfo_t; + +extern pteinfo_t *pteinfo_list; +#ifdef XENDEBUG_LOW +#define __PRINTK(x) printk x +#else +#define __PRINTK(x) +#endif + +char *xen_setbootenv(char *cmd_line); + +int xen_boothowto(char *envp); + +void _xen_machphys_update(vm_paddr_t, vm_paddr_t, char *file, int line); + +#ifdef INVARIANTS +#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), __FILE__, __LINE__) +#else +#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), NULL, 0) +#endif + +void xen_update_descriptor(union descriptor *, union descriptor *); + +extern struct mtx balloon_lock; +#if 0 +#define balloon_lock(__flags) mtx_lock_irqsave(&balloon_lock, __flags) +#define balloon_unlock(__flags) mtx_unlock_irqrestore(&balloon_lock, __flags) +#else +#define balloon_lock(__flags) __flags = 1 +#define balloon_unlock(__flags) __flags = 0 +#endif + + + +#endif /* _XEN_XENFUNC_H_ */ Property changes on: i386/include/xen/xenfunc.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/include/xen/xenpmap.h =================================================================== --- i386/include/xen/xenpmap.h (.../stable/6/sys) (revision 0) +++ i386/include/xen/xenpmap.h (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,230 @@ +/* + * + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004,2005 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENPMAP_H_ +#define _XEN_XENPMAP_H_ + +#include + +void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int); +void xen_pt_switch(vm_paddr_t); +void xen_set_ldt(vm_paddr_t, unsigned long); +void xen_pgdpt_pin(vm_paddr_t); +void xen_pgd_pin(vm_paddr_t); +void xen_pgd_unpin(vm_paddr_t); +void xen_pt_pin(vm_paddr_t); +void xen_pt_unpin(vm_paddr_t); +void xen_flush_queue(void); +void xen_check_queue(void); +#if 0 +void pmap_ref(pt_entry_t *pte, vm_paddr_t ma); +#endif +void pmap_suspend(void); +void pmap_resume(void); + +#ifdef INVARIANTS +#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__) +#else +#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0) +#endif + +#ifdef PMAP_DEBUG +#define PMAP_REF pmap_ref +#define PMAP_DEC_REF_PAGE pmap_dec_ref_page +#define PMAP_MARK_PRIV pmap_mark_privileged +#define PMAP_MARK_UNPRIV pmap_mark_unprivileged +#else +#define PMAP_MARK_PRIV(a) +#define PMAP_MARK_UNPRIV(a) +#define PMAP_REF(a, b) +#define PMAP_DEC_REF_PAGE(a) +#endif + +#define ALWAYS_SYNC 0 + +#ifdef PT_DEBUG +#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__) +#else +#define PT_LOG() +#endif + +#define INVALID_P2M_ENTRY (~0UL) + +#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */ + +#define SH_PD_SET_VA 1 +#define SH_PD_SET_VA_MA 2 +#define SH_PD_SET_VA_CLEAR 3 + +struct pmap; +void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type); +#ifdef notyet +static vm_paddr_t +vptetomachpte(vm_paddr_t *pte) +{ + vm_offset_t offset, ppte; + vm_paddr_t pgoffset, retval, *pdir_shadow_ptr; + int pgindex; + + ppte = (vm_offset_t)pte; + pgoffset = (ppte & PAGE_MASK); + offset = ppte - (vm_offset_t)PTmap; + pgindex = ppte >> PDRSHIFT; + + pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow); + retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset; + return (retval); +} +#endif +#define PT_GET(_ptp) \ + (pmap_valid_entry(*(_ptp)) ? 
xpmap_mtop(*(_ptp)) : (0)) + +#ifdef WRITABLE_PAGETABLES + +#define PT_SET_VA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + PT_LOG(); \ + *(_ptp) = xpmap_ptom((_npte)); \ +} while (/*CONSTCOND*/0) +#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + PT_LOG(); \ + *(_ptp) = (_npte); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR_VA(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + PT_LOG(); \ + *(_ptp) = 0; \ +} while (/*CONSTCOND*/0) + +#define PD_SET_VA(_pmap, _ptp, _npte, sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_SET_VA_MA(_pmap, _ptp, _npte, sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA_MA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_CLEAR_VA(_pmap, _ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + pd_set((_pmap),(_ptp), 0, SH_PD_SET_VA_CLEAR); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#else /* !WRITABLE_PAGETABLES */ + +#define PT_SET_VA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + xen_queue_pt_update(vtomach(_ptp), \ + xpmap_ptom(_npte)); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + xen_queue_pt_update(vtomach(_ptp), _npte); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR_VA(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + xen_queue_pt_update(vtomach(_ptp), 0); \ + if (sync || ALWAYS_SYNC) \ + xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#define PD_SET_VA(_pmap, _ptepindex,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} 
while (/*CONSTCOND*/0) +#define PD_SET_VA_MA(_pmap, _ptepindex,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#endif + +#define PT_SET_MA(_va, _ma) \ +do { \ + int err; \ + err = HYPERVISOR_update_va_mapping(((unsigned long)(_va)), \ + (_ma), UVMF_INVLPG| UVMF_ALL); \ + KASSERT(err >= 0, ("unexpected result from update_va_mapping")); \ +} while (/*CONSTCOND*/0) + +#define PT_UPDATES_FLUSH() do { \ + xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +static __inline vm_paddr_t +xpmap_mtop(vm_paddr_t mpa) +{ + vm_paddr_t tmp = (mpa & PG_FRAME); + + return machtophys(tmp) | (mpa & ~PG_FRAME); +} + +static __inline vm_paddr_t +xpmap_ptom(vm_paddr_t ppa) +{ + vm_paddr_t tmp = (ppa & PG_FRAME); + + return phystomach(tmp) | (ppa & ~PG_FRAME); +} + +static __inline void +set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ +#ifdef notyet + PANIC_IF(max_mapnr && pfn >= max_mapnr); +#endif + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef notyet + PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY)); +#endif + return; + } + xen_phys_machine[pfn] = mfn; +} + + + + +#endif /* _XEN_XENPMAP_H_ */ Property changes on: i386/include/xen/xenpmap.h ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/conf/DEFAULTS =================================================================== --- i386/conf/DEFAULTS (.../stable/6/sys) (revision 184012) +++ i386/conf/DEFAULTS (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -15,3 +15,5 @@ # Pseudo devices. 
device mem # Memory and kernel memory devices device io # I/O device + +options NATIVE Index: i386/conf/XEN =================================================================== --- i386/conf/XEN (.../stable/6/sys) (revision 0) +++ i386/conf/XEN (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,156 @@ +# +# XEN -- Xen kernel configuration file for FreeBSD/i386 +# +# For more information on this file, please read the handbook section on +# Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD$ + +machine i386 +cpu I686_CPU +ident XEN + +# To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" # Default places to look for devices. 
+ +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols +makeoptions MODULES_OVERRIDE="" + +#options SCHED_ULE # ULE scheduler +#options PREEMPTION # Enable kernel thread preemption +options SCHED_4BSD +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options MD_ROOT # MD is a potential root device +options NFSCLIENT # Network Filesystem Client +options NFSSERVER # Network Filesystem Server +options NFS_ROOT # NFS usable as /, requires NFSCLIENT +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options GEOM_LABEL # Provides labelization +options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI +options KTRACE # ktrace(1) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options AUDIT # Security event auditing + +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options GDB # Support remote GDB. 
+#options INVARIANTS # Enable calls of extra sanity checking +#options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +#options WITNESS # Enable checks to detect deadlocks and cycles +#options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + +# To make an SMP kernel, the next two lines are needed +#options SMP # Symmetric MultiProcessor Kernel +#device apic # I/O APIC +options PAE + + +# CPU frequency control +#device cpufreq # native only + +# Bus support. +#device pci + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device ch # SCSI media changers +device da # Direct Access (disks) +device sa # Sequential Access (tape etc) +device cd # CD +device pass # Passthrough device (direct SCSI access) +device ses # SCSI Environmental Services (and SAF-TE) + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse +device kbdmux # keyboard multiplexer +#device vga # VGA video card driver +device splash # Splash screen and screen saver support + +# syscons is the default console driver, resembling an SCO console + +#device agp # support several AGP chipsets + +# Power management support (see NOTES for more options) +#device apm +# Add suspend/resume support for the i8254. +#device pmtimer # native + +device pci + +# Serial (COM) ports +device uart # Generic UART driver + +# If you've got a "dumb" serial or parallel PCI card that is +# supported by the puc(4) glue driver, uncomment the following +# line to enable it (connects to sio, uart and/or ppc drivers): +#device puc + +# PCI Ethernet NICs. +device em # Intel PRO/1000 adapter Gigabit Ethernet Card + +# PCI Ethernet NICs that use the common MII bus controller code. +# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! +device miibus # MII bus support + +# Pseudo devices. 
+device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device sl # Kernel SLIP +device ppp # Kernel PPP +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device gif # IPv6 and IPv4 tunneling +device faith # IPv6-to-IPv4 relaying (translation) +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. +device bpf # Berkeley packet filter + + +options XEN +nooption NATIVE +nodevice atpic +options MCLSHIFT=12 + +nodevice isa +nooption ISAPNP + +options KTR +options KTR_COMPILE=(KTR_PMAP) +options KTR_CPUMASK=0xff +options KTR_ENTRIES=65536 +options KTR_MASK=(KTR_PMAP) +options KVA_PAGES=1600 Index: i386/i386/vm_machdep.c =================================================================== --- i386/i386/vm_machdep.c (.../stable/6/sys) (revision 184012) +++ i386/i386/vm_machdep.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -89,6 +89,9 @@ #include #include +#ifdef XEN +#include +#endif #ifdef PC98 #include #else @@ -264,7 +267,7 @@ /* Setup to release sched_lock in fork_exit(). */ td2->td_md.md_spinlock_count = 1; - td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; + td2->td_md.md_saved_flags = PSL_USER; /* * Now, cpu_switch() can schedule the new process. @@ -436,7 +439,7 @@ /* Setup to release sched_lock in fork_exit(). 
*/ td->td_md.md_spinlock_count = 1; - td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; + td->td_md.md_saved_flags = PSL_USER; } /* @@ -593,6 +596,9 @@ int b; #endif +#ifdef XEN + HYPERVISOR_shutdown(SHUTDOWN_poweroff); +#endif disable_intr(); #ifdef CPU_ELAN if (elan_mmcr != NULL) @@ -762,8 +768,11 @@ */ ptep = vtopte(sf->kva); opte = *ptep; +#ifdef XEN + PT_SET_MA(sf->kva, xpmap_ptom(VM_PAGE_TO_PHYS(m)) | pgeflag | PG_RW | PG_V); +#else *ptep = VM_PAGE_TO_PHYS(m) | pgeflag | PG_RW | PG_V; - +#endif /* * Avoid unnecessary TLB invalidations: If the sf_buf's old * virtual-to-physical mapping was not used, then any processor @@ -812,6 +821,14 @@ if (sf->ref_count == 0) { TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry); nsfbufsused--; +#ifdef XEN + /* + * Xen doesn't like having dangling R/W mappings + */ + pmap_qremove(sf->kva, 1); + sf->m = NULL; + LIST_REMOVE(sf, list_entry); +#endif if (sf_buf_alloc_want > 0) wakeup_one(&sf_buf_freelist); } Index: i386/i386/swtch.s =================================================================== --- i386/i386/swtch.s (.../stable/6/sys) (revision 184012) +++ i386/i386/swtch.s (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -71,7 +71,7 @@ movl 8(%esp),%ecx /* New thread */ movl TD_PCB(%ecx),%edx movl PCB_CR3(%edx),%eax - movl %eax,%cr3 /* new address space */ + LOAD_CR3(%eax) /* new address space */ /* set bit in new pm_active */ movl TD_PROC(%ecx),%eax movl P_VMSPACE(%eax), %ebx @@ -114,11 +114,13 @@ movl %gs,PCB_GS(%edx) pushfl /* PSL */ popl PCB_PSL(%edx) +#ifndef XEN /* Check to see if we need to call a switchout function. */ movl PCB_SWITCHOUT(%edx),%eax cmpl $0, %eax je 1f call *%eax +#endif 1: /* Test if debug registers should be saved. */ testl $PCB_DBREGS,PCB_FLAGS(%edx) @@ -171,7 +173,7 @@ movl %cr3,%ebx /* The same address space? 
*/ cmpl %ebx,%eax je sw1 - movl %eax,%cr3 /* new address space */ + LOAD_CR3(%eax) /* new address space */ /* Release bit from old pmap->pm_active */ movl PCPU(CURPMAP), %ebx @@ -191,7 +193,19 @@ btsl %esi, PM_ACTIVE(%ebx) /* set new */ sw1: +#ifdef XEN + pushl %eax + pushl %ecx + pushl %edx + call xen_handle_thread_switch + popl %edx + popl %ecx + popl %eax /* + * XXX set IOPL + */ +#else + /* * At this point, we've switched address spaces and are ready * to load up the rest of the next context. */ @@ -238,7 +252,7 @@ movl 12(%esi), %ebx movl %eax, 8(%edi) movl %ebx, 12(%edi) - +#endif /* Restore context. */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp @@ -263,7 +277,7 @@ movl _default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f - lldt _default_ldt + LLDT(_default_ldt) movl %eax,PCPU(CURRENTLDT) jmp 2f 1: @@ -366,7 +380,7 @@ * parent's npx state for forks by forgetting to reload. */ pushfl - cli + CLI movl PCPU(FPCURTHREAD),%eax testl %eax,%eax je 1f Index: i386/i386/apic_vector.s =================================================================== --- i386/i386/apic_vector.s (.../stable/6/sys) (revision 184012) +++ i386/i386/apic_vector.s (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -299,6 +299,7 @@ /* * Handler for IPIs sent via the per-cpu IPI bitmap. 
*/ +#ifndef XEN .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) @@ -320,7 +321,7 @@ addl $4, %esp /* XXX convert clockframe to trapframe */ MEXITCOUNT jmp doreti - +#endif /* * Executed by a CPU when it receives an Xcpustop IPI from another CPU, * Index: i386/i386/genassym.c =================================================================== --- i386/i386/genassym.c (.../stable/6/sys) (revision 184012) +++ i386/i386/genassym.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -227,3 +227,9 @@ ASSYM(BUS_SPACE_HANDLE_BASE, offsetof(struct bus_space_handle, bsh_base)); ASSYM(BUS_SPACE_HANDLE_IAT, offsetof(struct bus_space_handle, bsh_iat)); #endif + +#ifdef XEN +#include +ASSYM(PC_CR3, offsetof(struct pcpu, pc_cr3)); +ASSYM(HYPERVISOR_VIRT_START, __HYPERVISOR_VIRT_START); +#endif Index: i386/i386/support.s =================================================================== --- i386/i386/support.s (.../stable/6/sys) (revision 184012) +++ i386/i386/support.s (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -1426,10 +1426,11 @@ */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) +#ifndef XEN /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) - +#endif /* flush the prefetch q */ jmp 1f nop Index: i386/i386/busdma_machdep.c =================================================================== --- i386/i386/busdma_machdep.c (.../stable/6/sys) (revision 184012) +++ i386/i386/busdma_machdep.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -140,6 +140,11 @@ static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage); static __inline int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr); +#ifdef XEN +#undef pmap_kextract +#define pmap_kextract pmap_kextract_ma +#endif + /* * Return true if a match is made. 
* Index: i386/i386/sys_machdep.c =================================================================== --- i386/i386/sys_machdep.c (.../stable/6/sys) (revision 184012) +++ i386/i386/sys_machdep.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -58,6 +58,27 @@ #include /* for kernel_map */ +#ifdef XEN +#include +#include +#include + +void i386_reset_ldt(struct proc_ldt *pldt); + +void +i386_reset_ldt(struct proc_ldt *pldt) +{ + xen_set_ldt((vm_offset_t)pldt->ldt_base, pldt->ldt_len); +} +#define SEG_VIRT_END ((HYPERVISOR_VIRT_START >> 12) & 0xffff) /* low 16 bits of HYPERVISOR_VIRT_START's page number */ +#define SET_DESCRIPTOR(index, sd) \ + HYPERVISOR_update_descriptor(vtomach(&PCPU_GET(fsgs_gdt)[(index)]), *(uint64_t *)&(sd)) +#else +#define i386_reset_ldt(x) +#define SEG_VIRT_END 0xffff +#define SET_DESCRIPTOR(index, sd) (PCPU_GET(fsgs_gdt)[(index)] = (sd)) +#endif + #define MAX_LD 8192 #define LD_PER_PAGE 512 #define NEW_MAX_LD(num) ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1)) @@ -163,7 +184,7 @@ */ sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; - sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ + sd.sd_lolimit = SEG_VIRT_END; /* 4GB limit, wraps */ sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; @@ -173,7 +194,7 @@ sd.sd_gran = 1; critical_enter(); td->td_pcb->pcb_fsd = sd; - PCPU_GET(fsgs_gdt)[0] = sd; + SET_DESCRIPTOR(0, sd); critical_exit(); td->td_frame->tf_fs = GSEL(GUFS_SEL, SEL_UPL); } @@ -193,7 +214,7 @@ */ sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; - sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ + sd.sd_lolimit = SEG_VIRT_END; /* 4GB limit, wraps */ sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; @@ -203,7 +224,7 @@ sd.sd_gran = 1; critical_enter(); td->td_pcb->pcb_gsd = sd; - PCPU_GET(fsgs_gdt)[1] = sd; + SET_DESCRIPTOR(1, sd); critical_exit(); load_gs(GSEL(GUGS_SEL, SEL_UPL)); } @@ -364,6 +385,10 @@ struct proc_ldt *pldt; pldt = mdp->md_ldt; +#ifdef XEN + i386_reset_ldt(pldt); + PCPU_SET(currentldt, (int)pldt); +#else 
#ifdef SMP gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pldt->ldt_sd; #else @@ -371,6 +396,7 @@ #endif lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL)); +#endif /* !XEN */ } #ifdef SMP @@ -385,6 +411,39 @@ } #endif +#ifdef XEN + +struct proc_ldt * +user_ldt_alloc(struct mdproc *mdp, int len) +{ + struct proc_ldt *pldt, *new_ldt; + + MALLOC(new_ldt, struct proc_ldt *, sizeof(struct proc_ldt), + M_SUBPROC, M_WAITOK); + + new_ldt->ldt_len = len = NEW_MAX_LD(len); + new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map, + round_page(len * sizeof(union descriptor))); + if (new_ldt->ldt_base == NULL) { + FREE(new_ldt, M_SUBPROC); + return NULL; + } + new_ldt->ldt_refcnt = 1; + new_ldt->ldt_active = 0; + + if ((pldt = mdp->md_ldt)) { + if (len > pldt->ldt_len) + len = pldt->ldt_len; + bcopy(pldt->ldt_base, new_ldt->ldt_base, + len * sizeof(union descriptor)); + } else { + bcopy(ldt, new_ldt->ldt_base, PAGE_SIZE); + } + pmap_map_readonly(kernel_pmap, (vm_offset_t)new_ldt->ldt_base, + new_ldt->ldt_len*sizeof(union descriptor)); + return new_ldt; +} +#else /* * Must be called with either sched_lock free or held but not recursed. * If it does not return NULL, it will return with it owned. @@ -425,6 +484,7 @@ } return new_ldt; } +#endif /* * Must be called either with sched_lock free or held but not recursed. 
@@ -443,8 +503,11 @@ mtx_lock_spin(&sched_lock); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); if (td == PCPU_GET(curthread)) { +#ifndef XEN lldt(_default_ldt); +#endif PCPU_SET(currentldt, _default_ldt); + i386_reset_ldt((struct proc_ldt *)_default_ldt); } mdp->md_ldt = NULL; @@ -549,6 +612,9 @@ } if (!(uap->start == LDT_AUTO_ALLOC && uap->num == 1)) { +#ifdef XEN + load_gs(0); /* XXX check if we really still need this */ +#endif /* complain a for a while if using old methods */ if (ldt_warnings++ < NUM_LDT_WARNINGS) { printf("Warning: pid %d used static ldt allocation.\n", @@ -671,6 +737,23 @@ return (error); } +#ifdef XEN +static int +i386_set_ldt_data(struct thread *td, int start, int num, + union descriptor *descs) +{ + struct mdproc *mdp = &td->td_proc->p_md; + struct proc_ldt *pldt = mdp->md_ldt; + int i, error; + + for (i = 0; i < num; i++) { + error = HYPERVISOR_update_descriptor(vtomach(&((union descriptor *)(pldt->ldt_base))[start + i]), *(uint64_t *)(descs + i)); + if (error) + panic("failed to update ldt: %d", error); + } + return (0); +} +#else static int i386_set_ldt_data(struct thread *td, int start, int num, union descriptor *descs) @@ -686,6 +769,7 @@ num * sizeof(union descriptor)); return (0); } +#endif static int i386_ldt_grow(struct thread *td, int len) Index: i386/i386/machdep.c =================================================================== --- i386/i386/machdep.c (.../stable/6/sys) (revision 184012) +++ i386/i386/machdep.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -141,6 +141,24 @@ uint32_t arch_i386_xbox_memsize = 0; #endif +#ifdef XEN +/* XEN includes */ +#include +#include +#include +#include +#include + +void Xhypervisor_callback(void); +void failsafe_callback(void); + +extern trap_info_t trap_table[]; +struct proc_ldt default_proc_ldt; +extern int init_first; +int running_xen = 1; +extern unsigned long physfree; +#endif /* XEN */ + /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 
@@ -282,8 +300,9 @@ */ bufinit(); vm_pager_bufferinit(); - +#ifndef XEN cpu_setregs(); +#endif } /* @@ -1108,6 +1127,25 @@ return (0); } +static int cpu_idle_hlt = 1; +SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, + &cpu_idle_hlt, 0, "Idle loop HLT enable"); +#ifdef XEN + +void +cpu_halt(void) +{ + HYPERVISOR_shutdown(SHUTDOWN_poweroff); +} + +static void +cpu_idle_default(void) +{ + idle_block(); +} + +#else + /* * Shutdown the CPU as much as possible */ @@ -1133,9 +1171,6 @@ * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. -Peter */ -static int cpu_idle_hlt = 1; -SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, - &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) @@ -1147,6 +1182,7 @@ */ __asm __volatile("sti; hlt"); } +#endif /* !XEN */ /* * Note that we have to be careful here to avoid a race between checking @@ -1158,7 +1194,7 @@ cpu_idle(void) { -#ifdef SMP +#if defined(SMP) && !defined(XEN) if (mp_grab_cpu_hlt()) return; #endif @@ -1317,10 +1353,16 @@ */ int _default_ldt; + +#ifdef XEN +union descriptor *gdt; +union descriptor *ldt; +#else union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ +union descriptor ldt[NLDT]; /* local descriptor table */ +#endif static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ -union descriptor ldt[NLDT]; /* local descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ int private_tss; /* flag indicating private tss */ @@ -1355,7 +1397,7 @@ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ @@ -1382,7 +1424,7 @@ { 0x0, /* segment base address */ 0xfffff, /* length - all 
address space */ SDT_MEMERA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ @@ -1391,7 +1433,7 @@ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ @@ -1418,11 +1460,12 @@ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, +#ifndef XEN /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ @@ -1514,6 +1557,7 @@ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, +#endif /* !XEN */ }; static struct soft_segment_descriptor ldt_segs[] = { @@ -1735,7 +1779,17 @@ goto physmap_done; } #endif - +#ifdef XEN + has_smap = 0; + Maxmem = xen_start_info->nr_pages - init_first; + physmem = Maxmem; + basemem = 0; + physmap[0] = init_first << PAGE_SHIFT; + physmap[1] = ptoa(Maxmem) - round_page(MSGBUF_SIZE); + physmap_idx = 0; + goto physmap_done; +#endif + hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); bzero(&vmf, sizeof(vmf)); @@ -1898,7 +1952,7 @@ vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; -#else +#elif !defined(XEN) /* * Prefer the RTC value for extended memory. */ @@ -1988,7 +2042,7 @@ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; - +#ifndef XEN /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. 
@@ -2106,7 +2160,10 @@ } *pte = 0; invltlb(); - +#else + phys_avail[0] = physfree; + phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; +#endif /* * XXX * The last chunk must contain at least one page plus the message @@ -2128,7 +2185,261 @@ avail_end = phys_avail[pa_indx]; } +#ifdef XEN + +#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) void +init386(int first) +{ + int error, gsel_tss, metadata_missing, x; + unsigned long off, gdtmachpfn; + struct pcpu *pc; + struct callback_register event = { + .type = CALLBACKTYPE_event, + .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, + }; + struct callback_register failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, + }; + + thread0.td_kstack = proc0kstack; + thread0.td_pcb = (struct pcb *) + (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; + + /* + * This may be done better later if it gets more high level + * components in it. If so just link td->td_proc here. 
+ */ + proc_linkup(&proc0, &ksegrp0, &thread0); + + metadata_missing = 0; + if (xen_start_info->mod_start) { + preload_metadata = (caddr_t)xen_start_info->mod_start; + preload_bootstrap_relocate(KERNBASE); + } else { + metadata_missing = 1; + } + if (envmode == 1) + kern_envp = static_env; + else if ((caddr_t)xen_start_info->cmd_line) + kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); + + boothowto |= xen_boothowto(kern_envp); + + /* Init basic tunables, hz etc */ + init_param1(); + + /* + * XEN occupies a portion of the upper virtual address space + * At its base it manages an array mapping machine page frames + * to physical page frames - hence we need to be able to + * access 4GB - (64MB - 4MB + 64k) + */ + gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + + pc = &__pcpu[0]; + gdt_segs[GPRIV_SEL].ssd_base = (int) pc; + gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + + PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); + bzero(gdt, PAGE_SIZE); + for (x = 0; x < NGDT; x++) + ssdtosd(&gdt_segs[x], &gdt[x].sd); + + + if (bootverbose) { + printf("gdt=%p\n", gdt); + printf("PTmap=%p\n", PTmap); + printf("addr=%#jx\n", (uintmax_t)*vtopte((unsigned long)gdt) & ~PG_RW); + } + + gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; + PT_SET_MA(gdt, *vtopte((unsigned long)gdt) & ~(PG_RW|PG_M|PG_A)); + error = HYPERVISOR_set_gdt(&gdtmachpfn, 512); + KASSERT(error == 0, ("unexpected result from set_gdt")); + lgdt(&r_gdt 
/* unused */); + gdtset = 1; + + if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { + panic("set_trap_table failed - error %d\n", error); + } + + error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (error == 0) + error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (error == -ENOXENSYS) + HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), + (unsigned long)Xhypervisor_callback, + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); +#endif + pcpu_init(pc, 0, sizeof(struct pcpu)); + PCPU_SET(prvspace, pc); + PCPU_SET(curthread, &thread0); + PCPU_SET(curpcb, thread0.td_pcb); + PCPU_SET(pdir, (unsigned long)IdlePTD); + + /* + * Initialize mutexes. + * + * icu_lock: in order to allow an interrupt to occur in a critical + * section, to set pcpu->ipending (etc...) properly, we + * must be able to get the icu lock, so it can't be + * under witness. + */ + mutex_init(); + mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); + + /* make ldt memory segments */ + PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); + bzero(ldt, PAGE_SIZE); + ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); + ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); + for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) + ssdtosd(&ldt_segs[x], &ldt[x].sd); + + default_proc_ldt.ldt_base = (caddr_t)ldt; + default_proc_ldt.ldt_len = 6; + _default_ldt = (int)&default_proc_ldt; + PCPU_SET(currentldt, _default_ldt); + PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); /* drop PG_RW on the LDT page before xen_set_ldt() */ + xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); + +#ifdef XBOX + /* + * The following code queries the PCI ID of 0:0:0. For the XBOX, + * This should be 0x10de / 0x02a5. + * + * This is exactly what Linux does. + */ + outl(0xcf8, 0x80000000); + if (inl(0xcfc) == 0x02a510de) { + arch_i386_is_xbox = 1; + pic16l_setled(XBOX_LED_GREEN); + + /* + * We are an XBOX, but we may have either 64MB or 128MB of + * memory. 
The PCI host bridge should be programmed for this, + * so we just query it. + */ + outl(0xcf8, 0x80000084); + arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; + } +#endif /* XBOX */ +#if defined (XEN_PRIVILEGED) + /* + * Initialize the i8254 before the console so that console + * initialization can use DELAY(). + */ + i8254_init(); +#endif + /* + * Initialize the console before we print anything out. + */ + cninit(); + + if (metadata_missing) + printf("WARNING: loader(8) metadata is missing!\n"); + +#ifdef DEV_ISA + if (xen_start_info->flags & SIF_PRIVILEGED) { + elcr_probe(); +#ifdef DEV_ATPIC + atpic_startup(); +#endif + } +#endif + +#ifdef DDB + ksym_start = bootinfo.bi_symtab; + ksym_end = bootinfo.bi_esymtab; +#endif + + kdb_init(); + +#ifdef KDB + if (boothowto & RB_KDB) + kdb_enter("Boot flags requested debugger"); +#endif + + finishidentcpu(); /* Final stage of CPU initialization */ + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + initializecpu(); /* Initialize CPU registers */ + + /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ + /* Note: -16 is so we can grow the trapframe if we came from vm86 */ + PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); + HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), + PCPU_GET(common_tss.tss_esp0)); + + + /* pointer to selector slot for %fs/%gs */ + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + + dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = + dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; + dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = + dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); +#ifdef PAE + dblfault_tss.tss_cr3 = (int)IdlePDPT; +#else + dblfault_tss.tss_cr3 = (int)IdlePTD; +#endif + dblfault_tss.tss_eip = (int)dblfault_handler; + dblfault_tss.tss_eflags = PSL_KERNEL; + dblfault_tss.tss_ds = dblfault_tss.tss_es = + dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); + dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); + dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); + dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + + vm86_initialize(); + getmemsize(first); + init_param2(physmem); + + + /* Map the message buffer. 
*/ + for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) + pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); + + /* now running on new page tables, configured,and u/iom is accessible */ + + msgbufinit(msgbufp, MSGBUF_SIZE); + + /* transfer to user mode */ + + _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); + _udatasel = GSEL(GUDATA_SEL, SEL_UPL); + + /* setup proc 0's pcb */ + thread0.td_pcb->pcb_flags = 0; +#ifdef PAE + thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; +#else + thread0.td_pcb->pcb_cr3 = (int)IdlePTD; +#endif + thread0.td_pcb->pcb_ext = 0; + thread0.td_frame = &proc0_tf; + thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; + thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; +} + +#else +void init386(first) int first; { @@ -2389,6 +2700,7 @@ thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; } +#endif /* !XEN */ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) Index: i386/i386/trap.c =================================================================== --- i386/i386/trap.c (.../stable/6/sys) (revision 184012) +++ i386/i386/trap.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -215,6 +215,7 @@ goto out; #endif +#ifndef XEN if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled @@ -245,6 +246,7 @@ enable_intr(); } } +#endif eva = 0; code = frame.tf_err; Index: i386/i386/intr_machdep.c =================================================================== --- i386/i386/intr_machdep.c (.../stable/6/sys) (revision 184012) +++ i386/i386/intr_machdep.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -284,7 +284,12 @@ /* Schedule the ithread if needed. 
*/ if (thread) { error = intr_event_schedule_thread(ie); +#ifndef XEN KASSERT(error == 0, ("bad stray interrupt")); +#else + if (error != 0) + log(LOG_CRIT, "bad stray interrupt %d", vector); +#endif } critical_exit(); td->td_intr_nesting_level--; Index: i386/xen/exception.s =================================================================== --- i386/xen/exception.s (.../stable/6/sys) (revision 0) +++ i386/xen/exception.s (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,489 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz. + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_apic.h" +#include "opt_npx.h" + +#include +#include +#include + +#include "assym.s" + +#define SEL_RPL_MASK 0x0002 +#define __HYPERVISOR_iret 23 + +/* Offsets into shared_info_t. */ + +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 6 + + +#ifdef SMP +#define GET_VCPU_INFO(reg) movl PCPU(CPUID),reg ; \ + shl $sizeof_vcpu_shift,reg ; \ + addl HYPERVISOR_shared_info,reg +#else +#define GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg +#endif + +#define __DISABLE_INTERRUPTS(reg) movb $1,evtchn_upcall_mask(reg) +#define __ENABLE_INTERRUPTS(reg) movb $0,evtchn_upcall_mask(reg) +#define DISABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ + __DISABLE_INTERRUPTS(reg) +#define ENABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ + __ENABLE_INTERRUPTS(reg) +#define __TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) + +#define POPA \ + popl %edi; \ + popl %esi; \ + popl %ebp; \ + popl %ebx; \ + popl %ebx; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + + .text + +/*****************************************************************************/ +/* Trap handling */ +/*****************************************************************************/ +/* + * Trap and fault vector routines. + * + * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on + * the stack that mostly looks like an interrupt, but does not disable + * interrupts. 
A few of the traps we use are interrupt gates, + * SDT_SYS386IGT, which are nearly the same thing except interrupts are + * disabled on entry. + * + * The cpu will push a certain amount of state onto the kernel stack for + * the current process. The amount of state depends on the type of trap + * and whether the trap crossed rings or not. See i386/include/frame.h. + * At the very least the current EFLAGS (status register, which includes + * the interrupt disable state prior to the trap), the code segment register, + * and the return instruction pointer are pushed by the cpu. The cpu + * will also push an 'error' code for certain traps. We push a dummy + * error code for those traps where the cpu doesn't in order to maintain + * a consistent frame. We also push a contrived 'trap number'. + * + * The cpu does not push the general registers, we must do that, and we + * must restore them prior to calling 'iret'. The cpu adjusts the %cs and + * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we + * must load them with appropriate values for supervisor mode operation. 
+ */ + +MCOUNT_LABEL(user) +MCOUNT_LABEL(btrap) + +IDTVEC(div) + pushl $0; TRAP(T_DIVIDE) +IDTVEC(dbg) + pushl $0; TRAP(T_TRCTRAP) +IDTVEC(nmi) + pushl $0; TRAP(T_NMI) +IDTVEC(bpt) + pushl $0; TRAP(T_BPTFLT) +IDTVEC(ofl) + pushl $0; TRAP(T_OFLOW) +IDTVEC(bnd) + pushl $0; TRAP(T_BOUND) +IDTVEC(ill) + pushl $0; TRAP(T_PRIVINFLT) +IDTVEC(dna) + pushl $0; TRAP(T_DNA) +IDTVEC(fpusegm) + pushl $0; TRAP(T_FPOPFLT) +IDTVEC(tss) + TRAP(T_TSSFLT) +IDTVEC(missing) + TRAP(T_SEGNPFLT) +IDTVEC(stk) + TRAP(T_STKFLT) +IDTVEC(prot) + TRAP(T_PROTFLT) +IDTVEC(page) + TRAP(T_PAGEFLT) +IDTVEC(mchk) + pushl $0; TRAP(T_MCHK) +IDTVEC(rsvd) + pushl $0; TRAP(T_RESERVED) +IDTVEC(fpu) + pushl $0; TRAP(T_ARITHTRAP) +IDTVEC(align) + TRAP(T_ALIGNFLT) +IDTVEC(xmm) + pushl $0; TRAP(T_XMMFLT) + +IDTVEC(hypervisor_callback) + pushl $0; + pushl $0; + pushal + pushl %ds + pushl %es + pushl %fs +upcall_with_regs_pushed: + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) +call_evtchn_upcall: + movl TF_EIP(%esp),%eax + cmpl $scrit,%eax + jb 10f + cmpl $ecrit,%eax + jb critical_region_fixup + +10: pushl %esp + call evtchn_do_upcall + addl $4,%esp + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + + +hypervisor_callback_pending: + DISABLE_INTERRUPTS(%esi) /* cli */ + jmp 10b + + /* + * alltraps entry point. Interrupts are enabled if this was a trap + * gate (TGT), else disabled if this was an interrupt gate (IGT). + * Note that int0x80_syscall is a trap gate. Only page faults + * use an interrupt gate. + */ + + SUPERALIGN_TEXT + .globl alltraps + .type alltraps,@function +alltraps: + pushal + pushl %ds + pushl %es + pushl %fs +alltraps_with_regs_pushed: + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) +calltrap: + call trap + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + +/* + * SYSCALL CALL GATE (old entry point for a.out binaries) + * + * The intersegment call has been set up to specify one dummy parameter. 
+ * + * This leaves a place to put eflags so that the call frame can be + * converted to a trap frame. Note that the eflags is (semi-)bogusly + * pushed into (what will be) tf_err and then copied later into the + * final spot. It has to be done this way because esp can't be just + * temporarily altered for the pushfl - an interrupt might come in + * and clobber the saved cs/eip. + */ + SUPERALIGN_TEXT +IDTVEC(lcall_syscall) + pushfl /* save eflags */ + popl 8(%esp) /* shuffle into tf_eflags */ + pushl $7 /* sizeof "lcall 7,0" */ + subl $4,%esp /* skip over tf_trapno */ + pushal + pushl %ds + pushl %es + pushl %fs + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + call syscall + MEXITCOUNT + jmp doreti + +/* + * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) + * + * Even though the name says 'int0x80', this is actually a TGT (trap gate) + * rather than an IGT (interrupt gate). Thus interrupts are enabled on + * entry just as they are for a normal syscall. + */ + SUPERALIGN_TEXT +IDTVEC(int0x80_syscall) + pushl $2 /* sizeof "int 0x80" */ + subl $4,%esp /* skip over tf_trapno */ + pushal + pushl %ds + pushl %es + pushl %fs + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + call syscall + MEXITCOUNT + jmp doreti + +/* + * fork_trampoline: calls fork_exit() with %esi (function), %ebx (arg1) and + * the trapframe pointer as its three stack arguments, pops them again, and + * then leaves through doreti like the syscall paths above. + */ +ENTRY(fork_trampoline) + pushl %esp /* trapframe pointer */ + pushl %ebx /* arg1 */ + pushl %esi /* function */ + call fork_exit + addl $12,%esp + /* cut from syscall */ + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + + +/* + * To efficiently implement classification of trap and interrupt handlers + * for profiling, there must be only trap handlers between the labels btrap + * and bintr, and only interrupt handlers between the labels bintr and + * eintr. This is implemented (partly) by including files that contain + * some of the handlers. Before including the files, set up a normal asm + * environment so that the included files don't need to know that they are + * included. 
+ */ + + .data + .p2align 4 + .text + SUPERALIGN_TEXT +MCOUNT_LABEL(bintr) + +#ifdef DEV_ATPIC +#include +#endif +#ifdef DEV_APIC + .data + .p2align 4 + .text + SUPERALIGN_TEXT + +#include +#endif + + .data + .p2align 4 + .text + SUPERALIGN_TEXT +#include + + .text +MCOUNT_LABEL(eintr) + +/* + * void doreti(struct trapframe) + * + * Handle return from interrupts, traps and syscalls. + */ + .text + SUPERALIGN_TEXT + .type doreti,@function +doreti: + FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ +doreti_next: +#ifdef notyet + /* + * Check if ASTs can be handled now. PSL_VM must be checked first + * since segment registers only have an RPL in non-VM86 mode. + */ + testl $PSL_VM,TF_EFLAGS(%esp) /* are we in vm86 mode? */ + jz doreti_notvm86 + movl PCPU(CURPCB),%ecx + testl $PCB_VM86CALL,PCB_FLAGS(%ecx) /* are we in a vm86 call? */ + jz doreti_ast /* can handle ASTS now if not */ + jmp doreti_exit + +doreti_notvm86: +#endif + testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ + jz doreti_exit /* can't handle ASTs now if not */ + +doreti_ast: + /* + * Check for ASTs atomically with returning. Disabling CPU + * interrupts provides sufficient locking even in the SMP case, + * since we will be informed of any new ASTs by an IPI. + */ + DISABLE_INTERRUPTS(%esi) /* cli */ + movl PCPU(CURTHREAD),%eax + testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) + je doreti_exit + ENABLE_INTERRUPTS(%esi) /* sti */ + pushl %esp /* pass a pointer to the trapframe */ + call ast + add $4,%esp + jmp doreti_ast + + /* + * doreti_exit: pop registers, iret. + * + * The segment register pop is a special case, since it may + * fault if (for example) a sigreturn specifies bad segment + * registers. The fault is handled in trap.c. 
+ */ +doreti_exit: + ENABLE_INTERRUPTS(%esi) # reenable event callbacks (sti) + + .globl scrit +scrit: + __TEST_PENDING(%esi) + jnz hypervisor_callback_pending /* More to go */ + + MEXITCOUNT + + .globl doreti_popl_fs +doreti_popl_fs: + popl %fs + .globl doreti_popl_es +doreti_popl_es: + popl %es + .globl doreti_popl_ds +doreti_popl_ds: + popl %ds + + /* + * This is important: as nothing is atomic over here (we can get + * interrupted any time), we use the critical_region_fixup() in + * order to figure out where our stack is. Therefore, do NOT use + * 'popal' here without fixing up the table! + */ + POPA + addl $8,%esp /* drop the tf_trapno/tf_err slots (doreti_iret_fault re-creates them) */ + .globl doreti_iret +doreti_iret: + jmp hypercall_page + (__HYPERVISOR_iret * 32) + .globl ecrit +ecrit: + /* + * doreti_iret_fault and friends. Alternative return code for + * the case where we get a fault in the doreti_exit code + * above. trap() (i386/i386/trap.c) catches this specific + * case, sends the process a signal and continues in the + * corresponding place in the code below. + */ + ALIGN_TEXT + .globl doreti_iret_fault +doreti_iret_fault: + subl $8,%esp + pushal + pushl %ds + .globl doreti_popl_ds_fault +doreti_popl_ds_fault: + pushl %es + .globl doreti_popl_es_fault +doreti_popl_es_fault: + pushl %fs + .globl doreti_popl_fs_fault +doreti_popl_fs_fault: + movl $0,TF_ERR(%esp) /* XXX should be the error code */ + movl $T_PROTFLT,TF_TRAPNO(%esp) + jmp alltraps_with_regs_pushed + + /* +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. 
+*/ + +.globl critical_region_fixup +critical_region_fixup: + addl $critical_fixup_table-scrit,%eax + movzbl (%eax),%eax # %eax contains num bytes popped + movl %esp,%esi + add %eax,%esi # %esi points at end of src region + movl %esp,%edi + add $0x40,%edi # %edi points at end of dst region + movl %eax,%ecx + shr $2,%ecx # convert bytes to words + je 16f # skip loop if nothing to copy +15: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 15b +16: movl %edi,%esp # final %edi is top of merged stack + jmp hypervisor_callback_pending + + +critical_fixup_table: +.byte 0x0,0x0,0x0 #testb $0x1,(%esi) +.byte 0x0,0x0,0x0,0x0,0x0,0x0 #jne ea +.byte 0x0,0x0 #pop %fs +.byte 0x04 #pop %es +.byte 0x08 #pop %ds +.byte 0x0c #pop %edi +.byte 0x10 #pop %esi +.byte 0x14 #pop %ebp +.byte 0x18 #pop %ebx +.byte 0x1c #pop %ebx +.byte 0x20 #pop %edx +.byte 0x24 #pop %ecx +.byte 0x28 #pop %eax +.byte 0x2c,0x2c,0x2c #add $0x8,%esp +#if 0 +.byte 0x34 #iret +#endif +.byte 0x34,0x34,0x34,0x34,0x34 #HYPERVISOR_iret + + +/* # Hypervisor uses this for application faults while it executes.*/ +ENTRY(failsafe_callback) + pushal + call xen_failsafe_handler +/*# call install_safe_pf_handler */ + movl 28(%esp),%ebx +1: movl %ebx,%ds + movl 32(%esp),%ebx +2: movl %ebx,%es + movl 36(%esp),%ebx +3: movl %ebx,%fs + movl 40(%esp),%ebx +4: movl %ebx,%gs +/*# call install_normal_pf_handler */ + popal + addl $12,%esp + iret + + Property changes on: i386/xen/exception.s ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/xen/locore.s =================================================================== --- i386/xen/locore.s (.../stable/6/sys) (revision 0) +++ i386/xen/locore.s (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,373 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 + * $FreeBSD$ + * + * originally from: locore.s, by William F. Jolitz + * + * Substantially rewritten by David Greenman, Rod Grimes, + * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp + * and many others. 
+ */ + +#include "opt_bootp.h" +#include "opt_compat.h" +#include "opt_nfsroot.h" +#include "opt_global.h" +#include "opt_pmap.h" + +#include +#include + +#include +#include +#include +#include +#include + +#define __ASSEMBLY__ +#include + +/* The defines below have been lifted out of */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define KERNEL_CS FLAT_RING1_CS +#define KERNEL_DS FLAT_RING1_DS + +#include "assym.s" + +.section __xen_guest + .ascii "LOADER=generic,GUEST_OS=freebsd,GUEST_VER=7.0,XEN_VER=xen-3.0,BSD_SYMTAB,VIRT_BASE=0xc0000000" + .byte 0 + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "FreeBSD") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "HEAD") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, btext) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) +#if 0 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|supervisor_mode_kernel|writable_descriptor_tables") + +#ifdef PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) + + + +/* + * XXX + * + * Note: This version greatly munged to avoid various assembler errors + * that may be fixed in newer versions of gas. Perhaps newer versions + * will have more pleasant appearance. 
+ */ + +/* + * PTmap is recursive pagemap at top of virtual address space. + * Within PTmap, the page directory can be found (third indirection). + */ + .globl PTmap,PTD,PTDpde + .set PTmap,(PTDPTDI << PDRSHIFT) + .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) + .set PTDpde,PTD + (PTDPTDI * PDESIZE) + +/* + * Compiled KERNBASE location and the kernel load address + */ + .globl kernbase + .set kernbase,KERNBASE + .globl kernload + .set kernload,KERNLOAD + +/* + * Globals + */ + .data + ALIGN_DATA /* just to be sure */ + + .space 0x2000 /* space for tmpstk - temporary stack */ +tmpstk: + + .globl bootinfo +bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + + .globl KERNend +KERNend: .long 0 /* phys addr end of kernel (just after bss) */ + .globl physfree +physfree: .long 0 /* phys addr of next free page */ + +#ifdef SMP + .globl cpu0prvpage +cpu0pp: .long 0 /* phys addr cpu0 private pg */ +cpu0prvpage: .long 0 /* relocated version */ + + .globl SMPpt +SMPptpa: .long 0 /* phys addr SMP page table */ +SMPpt: .long 0 /* relocated version */ +#endif /* SMP */ + + .globl IdlePTD +IdlePTD: .long 0 /* phys addr of kernel PTD */ + +#ifdef PAE + .globl IdlePDPT +IdlePDPT: .long 0 /* phys addr of kernel PDPT */ +#endif + +#ifdef SMP + .globl KPTphys +#endif + .globl gdtset +KPTphys: .long 0 /* phys addr of kernel page tables */ +gdtset: .long 0 + + .globl proc0kstack +proc0uarea: .long 0 /* address of proc 0 uarea (unused)*/ +proc0kstack: .long 0 /* address of proc 0 kstack space */ +p0upa: .long 0 /* phys addr of proc0 UAREA (unused) */ +p0kpa: .long 0 /* phys addr of proc0's STACK */ + +vm86phystk: .long 0 /* PA of vm86/bios stack */ + + .globl vm86paddr, vm86pa +vm86paddr: .long 0 /* address of vm86 region */ +vm86pa: .long 0 /* phys addr of vm86 region */ + +#ifdef PC98 + .globl pc98_system_parameter +pc98_system_parameter: + .space 0x240 +#endif + + .globl avail_space +avail_space: .long 0 + +/********************************************************************** 
+ * + * Some handy macros + * + */ + +/* + * We're already in protected mode, so no remapping is needed. + */ +#define R(foo) (foo) + +#define ALLOCPAGES(foo) \ + movl R(physfree), %esi ; \ + movl $((foo)*PAGE_SIZE), %eax ; \ + addl %esi, %eax ; \ + movl %eax, R(physfree) ; \ + movl %esi, %edi ; \ + movl $((foo)*PAGE_SIZE),%ecx ; \ + xorl %eax,%eax ; \ + cld ; \ + rep ; \ + stosb + +/* + * fillkpt + * eax = page frame address + * ebx = index into page table + * ecx = how many pages to map + * base = base address of page dir/table + * prot = protection bits + */ +#define fillkpt(base, prot) \ + shll $PTESHIFT,%ebx ; \ + addl base,%ebx ; \ + orl $PG_V,%eax ; \ + orl prot,%eax ; \ +1: movl %eax,(%ebx) ; \ + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ + addl $PTESIZE,%ebx ; /* next pte */ \ + loop 1b + +/* + * fillkptphys(prot) + * eax = physical address + * ecx = how many pages to map + * prot = protection bits + */ +#define fillkptphys(prot) \ + movl %eax, %ebx ; \ + shrl $PAGE_SHIFT, %ebx ; \ + fillkpt(R(KPTphys), prot) + +/* Temporary stack */ +.space 8192 +tmpstack: + .long tmpstack, KERNEL_DS + + .text + +.p2align 12, 0x90 + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + .cfi_startproc + .skip 0x1000 + .cfi_endproc + +/********************************************************************** + * + * This is where the bootblocks start us, set the ball rolling... 
+ * + */ +NON_GPROF_ENTRY(btext) + /* At the end of our stack, we shall have free space - so store it */ + movl %esp,%ebx + movl %ebx,R(avail_space) + + lss tmpstack,%esp + + pushl %esi + call initvalues + popl %esi + + /* Store the CPUID information */ + xorl %eax,%eax + cpuid # cpuid 0 + movl %eax,R(cpu_high) # highest capability + movl %ebx,R(cpu_vendor) # store vendor string + movl %edx,R(cpu_vendor+4) + movl %ecx,R(cpu_vendor+8) + movb $0,R(cpu_vendor+12) + + movl $1,%eax + cpuid # cpuid 1 + movl %eax,R(cpu_id) # store cpu_id + movl %ebx,R(cpu_procinfo) # store cpu_procinfo + movl %edx,R(cpu_feature) # store cpu_feature + movl %ecx,R(cpu_feature2) # store cpu_feature2 + rorl $8,%eax # extract family type + andl $15,%eax + cmpl $5,%eax + movl $CPU_686,R(cpu) + + movl proc0kstack,%eax + leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp + xorl %ebp,%ebp /* mark end of frames */ +#ifdef PAE + movl IdlePDPT,%esi +#else + movl IdlePTD,%esi +#endif + movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) + pushl physfree + call init386 + addl $4, %esp + call mi_startup + /* NOTREACHED */ + int $3 + +/* + * Signal trampoline, copied to top of user stack + */ +NON_GPROF_ENTRY(sigcode) + calll *SIGF_HANDLER(%esp) + leal SIGF_UC(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC_EFLAGS(%eax) + jne 1f + mov UC_GS(%eax), %gs /* restore %gs */ +1: + movl $SYS_sigreturn,%eax + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b + +#ifdef COMPAT_FREEBSD4 + ALIGN_TEXT +freebsd4_sigcode: + calll *SIGF_HANDLER(%esp) + leal SIGF_UC4(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC4_EFLAGS(%eax) + jne 1f + mov UC4_GS(%eax),%gs /* restore %gs */ +1: + movl $344,%eax /* 4.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. 
*/ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b +#endif + +#ifdef COMPAT_43 + ALIGN_TEXT +osigcode: + call *SIGF_HANDLER(%esp) /* call signal handler */ + lea SIGF_SC(%esp),%eax /* get sigcontext */ + pushl %eax + testl $PSL_VM,SC_PS(%eax) + jne 9f + movl SC_GS(%eax),%gs /* restore %gs */ +9: + movl $103,%eax /* 3.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ +0: jmp 0b +#endif /* COMPAT_43 */ + + ALIGN_TEXT +esigcode: + + .data + .globl szsigcode +szsigcode: + .long esigcode-sigcode +#ifdef COMPAT_FREEBSD4 + .globl szfreebsd4_sigcode +szfreebsd4_sigcode: + .long esigcode-freebsd4_sigcode +#endif +#ifdef COMPAT_43 + .globl szosigcode +szosigcode: + .long esigcode-osigcode +#endif Property changes on: i386/xen/locore.s ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/xen/xen_bus.c =================================================================== --- i386/xen/xen_bus.c (.../stable/6/sys) (revision 0) +++ i386/xen/xen_bus.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,238 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +static MALLOC_DEFINE(M_XENDEV, "xenintrdrv", "xen system device"); + +struct xenbus_device { + struct resource_list xen_resources; +}; + +#define DEVTOXEN(dev) ((struct xenbus_device *)device_get_ivars(dev)) + +static void xenbus_identify(driver_t *, device_t); +static int xenbus_probe(device_t); +static int xenbus_attach(device_t); +static int xenbus_print_child(device_t, device_t); +static device_t xenbus_add_child(device_t bus, int order, const char *name, + int unit); +static struct resource *xenbus_alloc_resource(device_t, device_t, int, int *, + u_long, u_long, u_long, u_int); +static int 
xenbus_release_resource(device_t, device_t, int, int, + struct resource *); +static int xenbus_set_resource(device_t, device_t, int, int, u_long, u_long); +static int xenbus_get_resource(device_t, device_t, int, int, u_long *, u_long *); +static void xenbus_delete_resource(device_t, device_t, int, int); + + +static device_method_t xenbus_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, xenbus_identify), + DEVMETHOD(device_probe, xenbus_probe), + DEVMETHOD(device_attach, xenbus_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_print_child, xenbus_print_child), + DEVMETHOD(bus_add_child, xenbus_add_child), + DEVMETHOD(bus_read_ivar, bus_generic_read_ivar), + DEVMETHOD(bus_write_ivar, bus_generic_write_ivar), + DEVMETHOD(bus_set_resource, xenbus_set_resource), + DEVMETHOD(bus_get_resource, xenbus_get_resource), + DEVMETHOD(bus_alloc_resource, xenbus_alloc_resource), + DEVMETHOD(bus_release_resource, xenbus_release_resource), + DEVMETHOD(bus_delete_resource, xenbus_delete_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + + { 0, 0 } +}; + + +static driver_t xenbus_driver = { + "xenbus", + xenbus_methods, + 1, /* no softc */ +}; +static devclass_t xenbus_devclass; +static device_t xenbus_dev; +static boolean_t xenbus_probe_delay = TRUE; /* delay child probes */ + +DRIVER_MODULE(xenbus, nexus, xenbus_driver, xenbus_devclass, 0, 0); + +static void +xenbus_identify(driver_t *driver, device_t parent) +{ + + /* + * Add child device with order of 0 so it gets probed + * first + */ + xenbus_dev = BUS_ADD_CHILD(parent, 0, "xenbus", 0); + if (xenbus_dev == 
NULL) + panic("xenbus: could not attach"); +} + +static int +xenbus_probe(device_t dev) +{ + device_set_desc(dev, "xen system"); + device_quiet(dev); + return (0); +} + +static int +xenbus_attach(device_t dev) +{ + /* + * First, let our child driver's identify any child devices that + * they can find. Once that is done attach any devices that we + * found. + */ + if (!xenbus_probe_delay) { + bus_generic_probe(dev); + bus_generic_attach(dev); + } + + return 0; +} + + +static int +xenbus_print_all_resources(device_t dev) +{ + struct xenbus_device *xdev = device_get_ivars(dev); + struct resource_list *rl = &xdev->xen_resources; + int retval = 0; + + if (STAILQ_FIRST(rl)) + retval += printf(" at"); + + retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx"); + retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx"); + retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld"); + + return retval; +} + + +static int +xenbus_print_child(device_t bus, device_t child) +{ + int retval = 0; + + retval += bus_print_child_header(bus, child); + retval += xenbus_print_all_resources(child); + retval += printf(" on motherboard\n"); /* XXX "motherboard", ick */ + + return (retval); +} + +static device_t +xenbus_add_child(device_t bus, int order, const char *name, int unit) +{ + device_t child; + struct xenbus_device *xendev; + + xendev = malloc(sizeof(struct xenbus_device), M_XENDEV, + M_NOWAIT | M_ZERO); + if (!xendev) + return(0); + resource_list_init(&xendev->xen_resources); + + child = device_add_child_ordered(bus, order, name, unit); + + /* should we free this in xenbus_child_detached? 
*/ + device_set_ivars(child, xendev); + + return(child); +} + +static struct resource * +xenbus_alloc_resource(device_t bus, device_t child, int type, int *rid, + u_long start, u_long end, u_long count, u_int flags) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + return (resource_list_alloc(rl, bus, child, type, rid, start, end, + count, flags)); +} + + +static int +xenbus_release_resource(device_t bus, device_t child, int type, int rid, + struct resource *r) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + return (resource_list_release(rl, bus, child, type, rid, r)); +} + +static int +xenbus_set_resource(device_t dev, device_t child, int type, int rid, + u_long start, u_long count) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + resource_list_add(rl, type, rid, start, start + count - 1, count); + return(0); +} + +static int +xenbus_get_resource(device_t dev, device_t child, int type, int rid, + u_long *startp, u_long *countp) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + struct resource_list_entry *rle; + + rle = resource_list_find(rl, type, rid); + if (!rle) + return(ENOENT); + if (startp) + *startp = rle->start; + if (countp) + *countp = rle->count; + return(0); +} + +static void +xenbus_delete_resource(device_t dev, device_t child, int type, int rid) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + resource_list_delete(rl, type, rid); +} + +static void +xenbus_init(void *unused) +{ + xenbus_probe_delay = FALSE; + xenbus_attach(xenbus_dev); +} +SYSINIT(xenbusdev, SI_SUB_PSEUDO, SI_ORDER_FIRST, xenbus_init, NULL); Property changes on: i386/xen/xen_bus.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: 
svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/xen/mptable.c =================================================================== --- i386/xen/mptable.c (.../stable/6/sys) (revision 0) +++ i386/xen/mptable.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,130 @@ +/*- + * Copyright (c) 2003 John Baldwin + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +static int mptable_probe(void); +static int mptable_probe_cpus(void); +static void mptable_register(void *dummy); +static int mptable_setup_local(void); +static int mptable_setup_io(void); + +static struct apic_enumerator mptable_enumerator = { + "MPTable", + mptable_probe, + mptable_probe_cpus, + mptable_setup_local, + mptable_setup_io +}; + +static int +mptable_probe(void) +{ + + return (-100); +} + +static int +mptable_probe_cpus(void) +{ + int i, rc; + + for (i = 0; i < MAXCPU; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_add(i, (i == 0)); + } + + return (0); +} + +/* + * Initialize the local APIC on the BSP. + */ +static int +mptable_setup_local(void) +{ + + return (0); +} + +static int +mptable_setup_io(void) +{ + + return (0); +} + +static void +mptable_register(void *dummy __unused) +{ + + apic_register_enumerator(&mptable_enumerator); +} +SYSINIT(mptable_register, SI_SUB_CPU - 1, SI_ORDER_FIRST, mptable_register, + NULL); + + + +int +mptable_pci_probe_table(int bus) +{ + + return (0); +} + +int +mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin) +{ + + return (0); +} + Property changes on: i386/xen/mptable.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/xen/clock.c =================================================================== --- i386/xen/clock.c (.../stable/6/sys) (revision 0) +++ i386/xen/clock.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,963 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * William Jolitz and Don Ahn. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 + */ + +#include +__FBSDID("$FreeBSD$"); + +/* #define DELAYDEBUG */ +/* + * Routines to handle clock hardware. 
+ */
+
+#include "opt_ddb.h"
+#include "opt_clock.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#if defined(SMP)
+#include
+#endif
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we
+ * can use a simple formula for leap years.
+ */
+#define	LEAPYEAR(y) (!((y) % 4))
+#define DAYSPERYEAR   (28+30*4+31*7)	/* = 365, a non-leap year */
+
+#ifndef TIMER_FREQ
+#define TIMER_FREQ   1193182		/* initial value of timer_freq */
+#endif
+
+#ifdef CYC2NS_SCALE_FACTOR
+#undef	CYC2NS_SCALE_FACTOR
+#endif
+#define CYC2NS_SCALE_FACTOR	10	/* shift count used by cycles_2_ns() */
+
+/* Values for timerX_state: */
+#define	RELEASED	0
+#define	RELEASE_PENDING	1
+#define	ACQUIRED	2
+#define	ACQUIRE_PENDING	3
+/*
+ * Spin-mutex wrappers around clock_lock: initialise, acquire, release.
+ */
+#define RTC_LOCK_INIT							\
+	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN)
+#define	RTC_LOCK	mtx_lock_spin(&clock_lock)
+#define	RTC_UNLOCK	mtx_unlock_spin(&clock_lock)
+
+int	adjkerntz;		/* local offset from GMT in seconds */
+int	clkintr_pending;
+int	pscnt = 1;
+int	psdiv = 1;
+int	statclock_disable;
+int	disable_rtc_set = 0;
+int	wall_cmos_clock;	/* non-zero: adjkerntz is applied to RTC time */
+u_int	timer_freq = TIMER_FREQ;	/* overwritten by startrtclock() */
+static	int	independent_wallclock;	/* machdep.independent_wallclock */
+static	int	xen_disable_rtc_set;	/* machdep.xen_disable_rtc_set */
+static	u_long	cached_gtm;	/* cached quotient for TSC -> microseconds */
+static	u_long	cyc2ns_scale;	/* written by set_cyc2ns_scale() */
+static	u_char	timer2_state = RELEASED;	/* see acquire_timer2() */
+static	struct timespec shadow_tv;	/* filled by update_wallclock() */
+static	uint32_t shadow_tv_version;	/* XXX: lazy locking */
+static	uint64_t processed_system_time;   /* stime (ns) at last processing.
*/ +static unsigned int time_irq; + +#ifdef XEN_PRIVILEGED_GUEST +static struct mtx clock_lock; +static int rtc_reg; +#endif + +static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; + +SYSCTL_INT(_machdep, OID_AUTO, independent_wallclock, + CTLFLAG_RW, &independent_wallclock, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, xen_disable_rtc_set, + CTLFLAG_RW, &xen_disable_rtc_set, 1, ""); + + +#define do_div(n,base) ({ \ + unsigned long __upper, __low, __high, __mod, __base; \ + __base = (base); \ + __asm("":"=a" (__low), "=d" (__high):"A" (n)); \ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + __asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ + __asm("":"=A" (n):"a" (__low),"d" (__high)); \ + __mod; \ +}) + + +#define NS_PER_TICK (1000000000ULL/hz) + +#define rdtscll(val) \ + __asm__ __volatile__("rdtsc" : "=A" (val)) + + +/* convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_mhz * 10^6)) + * ns = cycles * (10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. 
+ */
+static inline uint64_t
+scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+	uint64_t product;
+	uint32_t tmp1, tmp2;
+
+	/* Pre-scale: a negative shift means shift right by its magnitude. */
+	if ( shift < 0 )
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+	/*
+	 * product = (delta * mul_frac) >> 32, built from two 32x32 multiplies:
+	 *   lo = delta_lo * mul_frac	(only its high word is kept, in %4)
+	 *   hi = delta_hi * mul_frac	(edx:eax)
+	 *   product = hi + (lo >> 32)
+	 * The scratch register must be zeroed BEFORE the add.  xor clears
+	 * the carry flag, so zeroing it between the add and the adc would
+	 * discard the carry out of the low word and drop 2^32 from the
+	 * result whenever that addition wraps.
+	 */
+	__asm__ (
+		"mul %5 ; "
+		"mov %4,%%eax ; "
+		"mov %%edx,%4 ; "
+		"mul %5 ; "
+		"xor %5,%5 ; "
+		"add %4,%%eax ; "
+		"adc %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), "2" (mul_frac) );
+
+	return product;
+}
+
+/*
+ * Nanoseconds elapsed since the TSC stamp recorded in *shadow: the TSC
+ * delta scaled by the shadow's multiplier and shift.
+ */
+static uint64_t get_nsec_offset(struct shadow_time_info *shadow)
+{
+	uint64_t now, delta;
+	rdtscll(now);
+	delta = now - shadow->tsc_timestamp;
+	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Copy the wall-clock seconds/nanoseconds published in the shared info
+ * page into shadow_tv and remember the version they were read at.  The
+ * copy is retried while the version is odd or changed during the read.
+ */
+static void update_wallclock(void)
+{
+	shared_info_t *s = HYPERVISOR_shared_info;
+
+	do {
+		shadow_tv_version = s->wc_version;
+		rmb();
+		shadow_tv.tv_sec  = s->wc_sec;
+		shadow_tv.tv_nsec = s->wc_nsec;
+		rmb();
+	}
+	while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
+
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area. Must be called with the xtime_lock held for writing.
+ */
+static void __get_time_values_from_xen(void)
+{
+	shared_info_t           *s = HYPERVISOR_shared_info;
+	struct vcpu_time_info   *src;
+	struct shadow_time_info *dst;
+
+	src = &s->vcpu_info[PCPU_GET(cpuid)].time;
+	dst = PCPU_PTR(shadow_time);
+	/*
+	 * Copy the fields between two reads of the version and retry while
+	 * the version is odd or changed underneath us (presumably an update
+	 * by the hypervisor in progress -- TODO confirm against the Xen
+	 * interface headers).
+	 */
+	do {
+		dst->version = src->version;
+		rmb();
+		dst->tsc_timestamp     = src->tsc_timestamp;
+		dst->system_timestamp  = src->system_time;
+		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+		dst->tsc_shift         = src->tsc_shift;
+		rmb();
+	}
+	while ((src->version & 1) | (dst->version ^ src->version));
+
+	dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+}
+/*
+ * Non-zero when this CPU's shadow copy still carries the same version as
+ * the hypervisor's vcpu_time_info.
+ */
+static inline int time_values_up_to_date(void)
+{
+	struct vcpu_time_info   *src;
+	struct shadow_time_info *dst;
+
+	src = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)].time;
+	dst = PCPU_PTR(shadow_time);
+
+	rmb();
+	return (dst->version == src->version);
+}
+
+static unsigned xen_get_timecount(struct timecounter *tc);
+/*
+ * Timecounter registered by startrtclock(), which also fills in the
+ * frequency before calling tc_init().
+ */
+static struct timecounter xen_timecounter = {
+	xen_get_timecount,	/* get_timecount */
+	0,			/* no poll_pps */
+	~0u,			/* counter_mask */
+	0,			/* frequency */
+	"ixen",			/* name */
+	0			/* quality */
+};
+/*
+ * Handler bound to VIRQ_TIMER.  arg is used as the clockframe passed on to
+ * hardclock().
+ */
+static void
+clkintr(void *arg)
+{
+	int64_t delta_cpu, delta;
+	struct shadow_time_info *shadow = PCPU_PTR(shadow_time);
+	struct clockframe *frame = (struct clockframe *)arg;
+	/*
+	 * Compute the time elapsed since the global and the per-CPU
+	 * processed_system_time, redoing the work if the hypervisor
+	 * changed the time values while we were reading them.
+	 */
+	do {
+		__get_time_values_from_xen();
+
+		delta = delta_cpu =
+			shadow->system_timestamp + get_nsec_offset(shadow);
+		delta     -= processed_system_time;
+		delta_cpu -= PCPU_GET(processed_system_time);
+
+	} while (!time_values_up_to_date());
+
+	if (unlikely(delta < (int64_t)0) || unlikely(delta_cpu < (int64_t)0)) {
+		printf("Timer ISR: Time went backwards: %lld\n", delta);
+		return;
+	}
+
+	/* Process elapsed ticks since last call.
*/
+	if (delta >= NS_PER_TICK) {
+		/* Advance both counters by a whole number of ticks. */
+		processed_system_time += (delta / NS_PER_TICK) * NS_PER_TICK;
+		*PCPU_PTR(processed_system_time) += (delta_cpu / NS_PER_TICK) * NS_PER_TICK;
+	}
+	hardclock(frame);	/* called once per interrupt, whatever delta is */
+
+	/*
+	 * Take the wall-clock time from Xen again whenever the version
+	 * published in the shared info page differs from the one captured
+	 * by update_wallclock().
+	 * NOTE(review): the earlier wording spoke of a once-a-minute resync
+	 * that honours independent_wallclock; neither is implemented here.
+	 */
+
+	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
+		update_wallclock();
+		tc_setclock(&shadow_tv);
+	}
+
+	/* XXX TODO */
+}
+/*
+ * Current Xen system time (shadow timestamp plus the TSC-derived offset),
+ * truncated to its low 32 bits by the return type.
+ */
+static uint32_t
+getit(void)
+{
+	struct shadow_time_info *shadow;
+	shadow = PCPU_PTR(shadow_time);
+	__get_time_values_from_xen();
+	return shadow->system_timestamp + get_nsec_offset(shadow);
+}
+
+
+/*
+ * Wait "n" microseconds.
+ * Polls getit() and counts down a number of ticks derived from timer_freq.
+ * NOTE(review): much of the commentary inside this function was written for
+ * an i8254 counter and does not describe getit() as implemented above.
+ */
+void
+DELAY(int n)
+{
+	int delta, ticks_left;
+	uint32_t tick, prev_tick;
+#ifdef DELAYDEBUG
+	int getit_calls = 1;
+	int n1;
+	static int state = 0;
+
+	if (state == 0) {
+		state = 1;
+		for (n1 = 1; n1 <= 10000000; n1 *= 10)
+			DELAY(n1);
+		state = 2;
+	}
+	if (state == 1)
+		printf("DELAY(%d)...", n);
+#endif
+	/*
+	 * Read the counter first, so that the rest of the setup overhead is
+	 * counted.  Guess the initial overhead is 20 usec (on most systems it
+	 * takes about 1.5 usec for each of the i/o's in getit().  The loop
+	 * takes about 6 usec on a 486/33 and 13 usec on a 386/20.  The
+	 * multiplications and divisions to scale the count take a while).
+	 *
+	 * However, if ddb is active then use a fake counter since reading
+	 * the i8254 counter involves acquiring a lock.  ddb must not go
+	 * locking for many reasons, but it calls here for at least atkbd
+	 * input.
+	 */
+	prev_tick = getit();
+
+	n -= 0;			/* XXX actually guess no initial overhead */
+	/*
+	 * Calculate (n * (timer_freq / 1e6)) without using floating point
+	 * and without any avoidable overflows.
+ */ + if (n <= 0) + ticks_left = 0; + else if (n < 256) + /* + * Use fixed point to avoid a slow division by 1000000. + * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest. + * 2^15 is the first power of 2 that gives exact results + * for n between 0 and 256. + */ + ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15; + else + /* + * Don't bother using fixed point, although gcc-2.7.2 + * generates particularly poor code for the long long + * division, since even the slow way will complete long + * before the delay is up (unless we're interrupted). + */ + ticks_left = ((u_int)n * (long long)timer_freq + 999999) + / 1000000; + + while (ticks_left > 0) { + tick = getit(); +#ifdef DELAYDEBUG + ++getit_calls; +#endif + delta = tick - prev_tick; + prev_tick = tick; + if (delta < 0) { + /* + * Guard against timer0_max_count being wrong. + * This shouldn't happen in normal operation, + * but it may happen if set_timer_freq() is + * traced. + */ + /* delta += timer0_max_count; ??? */ + if (delta < 0) + delta = 0; + } + ticks_left -= delta; + } +#ifdef DELAYDEBUG + if (state == 1) + printf(" %d calls to getit() at %d usec each\n", + getit_calls, (n + 5) / getit_calls); +#endif +} + + +int +sysbeep(int pitch, int period) +{ + return (0); +} + +/* + * Restore all the timers non-atomically (XXX: should be atomically). + * + * This function is called from pmtimer_resume() to restore all the timers. + * This should not be necessary, but there are broken laptops that do not + * restore all the timers on resume. + */ +void +timer_restore(void) +{ + /* Get timebases for new environment. */ + __get_time_values_from_xen(); + + /* Reset our own concept of passage of system time. 
*/ + processed_system_time = pcpu_find(0)->pc_shadow_time.system_timestamp; + pcpu_find(0)->pc_processed_system_time = processed_system_time; +} + +void +startrtclock() +{ + unsigned long long alarm; + uint64_t __cpu_khz; + uint32_t cpu_khz; + struct vcpu_time_info *info; + + /* initialize xen values */ + __get_time_values_from_xen(); + processed_system_time = pcpu_find(0)->pc_shadow_time.system_timestamp; + pcpu_find(0)->pc_processed_system_time = processed_system_time; + + __cpu_khz = 1000000ULL << 32; + info = &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(__cpu_khz, info->tsc_to_system_mul); + if ( info->tsc_shift < 0 ) + cpu_khz = __cpu_khz << -info->tsc_shift; + else + cpu_khz = __cpu_khz >> info->tsc_shift; + + printf("Xen reported: %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + + /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = + (2^32 * 1 / (clocks/us)) */ + { + unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (cached_gtm), "=d" (edx) + :"r" (cpu_khz), + "0" (eax), "1" (edx)); + } + + set_cyc2ns_scale(cpu_khz/1000); + tsc_freq = cpu_khz * 1000; + + timer_freq = xen_timecounter.tc_frequency = 1000000000LL; + tc_init(&xen_timecounter); + + + rdtscll(alarm); +} + +#ifdef XEN_PRIVILEGED_GUEST +/* + * RTC support routines + */ + +int +rtcin(reg) + int reg; +{ + u_char val; + + RTC_LOCK; + outb(IO_RTC, reg); + inb(0x84); + val = inb(IO_RTC + 1); + inb(0x84); + RTC_UNLOCK; + return (val); +} + + +static __inline int +readrtc(int port) +{ + return(bcd2bin(rtcin(port))); +} + +void +writertc(int reg, u_char val) +{ + + RTC_LOCK; + if (rtc_reg != reg) { + inb(0x84); + outb(IO_RTC, reg); + rtc_reg = reg; + inb(0x84); + } + outb(IO_RTC + 1, val); + inb(0x84); + RTC_UNLOCK; +} + + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. 
+ */
+static void
+domu_inittodr(time_t base)
+{
+	int y;
+	struct timespec ts;
+
+	/* Refresh shadow_tv from the hypervisor before comparing against it. */
+	update_wallclock();
+
+	RTC_LOCK;
+
+	if (base) {
+		ts.tv_sec = base;
+		ts.tv_nsec = 0;
+		tc_setclock(&ts);
+	}
+
+	/*
+	 * The time-zone adjustment that used to sit here added into a local
+	 * that was never initialised and never read again, so it has been
+	 * dropped together with that variable.
+	 */
+
+	y = time_second - shadow_tv.tv_sec;
+	if (y <= -2 || y >= 2) {
+		/* badly off, adjust it */
+		tc_setclock(&shadow_tv);
+	}
+	RTC_UNLOCK;
+}
+
+/*
+ * Write system time back to RTC.
+ * Does nothing when xen_disable_rtc_set is set.  Otherwise the time is
+ * pushed to the hypervisor with DOM0_SETTIME, but only for the initial
+ * domain and only when independent_wallclock is clear.
+ */
+static void
+domu_resettodr(void)
+{
+	unsigned long tm;
+	int s;
+	dom0_op_t op;
+	struct shadow_time_info *shadow;
+
+	shadow = PCPU_PTR(shadow_time);
+	if (xen_disable_rtc_set)
+		return;
+
+	s = splclock();
+	tm = time_second;
+	splx(s);
+
+	tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
+
+	if ((xen_start_info->flags & SIF_INITDOMAIN) &&
+	    !independent_wallclock)
+	{
+		op.cmd = DOM0_SETTIME;
+		op.u.settime.secs        = tm;
+		op.u.settime.nsecs       = 0;
+		op.u.settime.system_time = shadow->system_timestamp;
+		HYPERVISOR_dom0_op(&op);
+		update_wallclock();
+	} else if (independent_wallclock) {
+		/* notyet */
+		;
+	}
+}
+
+/*
+ * Initialize the time of day register, based on the time base which is, e.g.
+ * from a filesystem.
+ */
+void
+inittodr(time_t base)
+{
+	unsigned long	sec, days;
+	int		year, month;
+	int		y, m, s;
+	struct timespec ts;
+	/* Domains without SIF_INITDOMAIN take the hypervisor-based path. */
+	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+		domu_inittodr(base);
+		return;
+	}
+
+	if (base) {
+		s = splclock();
+		ts.tv_sec = base;
+		ts.tv_nsec = 0;
+		tc_setclock(&ts);
+		splx(s);
+	}
+
+	/* Look if we have a RTC present and the time is valid */
+	if (!(rtcin(RTC_STATUSD) & RTCSD_PWR))
+		goto wrong_time;
+
+	/* wait for time update to complete */
+	/* If RTCSA_TUP is zero, we have at least 244us before next update */
+	s = splhigh();
+	while (rtcin(RTC_STATUSA) & RTCSA_TUP) {
+		splx(s);
+		s = splhigh();
+	}
+
+	days = 0;
+#ifdef USE_RTC_CENTURY
+	year = readrtc(RTC_YEAR) + readrtc(RTC_CENTURY) * 100;
+#else
+	year = readrtc(RTC_YEAR) + 1900;
+	if (year < 1970)
+		year += 100;
+#endif
+	if (year < 1970) {
+		splx(s);
+		goto wrong_time;
+	}
+	month = readrtc(RTC_MONTH);
+	/* NOTE(review): month is not range-checked before indexing daysinmonth. */
+	for (m = 1; m < month; m++)
+		days += daysinmonth[m-1];
+	if ((month > 2) && LEAPYEAR(year))
+		days ++;
+	days += readrtc(RTC_DAY) - 1;
+	for (y = 1970; y < year; y++)
+		days += DAYSPERYEAR + LEAPYEAR(y);
+	sec = ((( days * 24 +
+		  readrtc(RTC_HRS)) * 60 +
+		readrtc(RTC_MIN)) * 60 +
+	       readrtc(RTC_SEC));
+	/* sec now contains the number of seconds, since Jan 1 1970,
+	   in the local time zone */
+
+	sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
+
+	y = time_second - sec;
+	if (y <= -2 || y >= 2) {
+		/* badly off, adjust it */
+		ts.tv_sec = sec;
+		ts.tv_nsec = 0;
+		tc_setclock(&ts);
+	}
+	splx(s);
+	return;
+
+ wrong_time:
+	printf("Invalid time in real time clock.\n");
+	printf("Check and reset the date immediately!\n");
+}
+
+
+
+/*
+ * Write system time back to RTC
+ */
+void
+resettodr()
+{
+	unsigned long	tm;
+	int		y, m, s;
+	/* Domains without SIF_INITDOMAIN take the hypervisor-based path. */
+	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+		domu_resettodr();
+		return;
+	}
+
+	if (xen_disable_rtc_set)
+		return;
+
+	s = splclock();
+	tm = time_second;
+	splx(s);
+
+	/* Disable RTC updates and interrupts.
*/ + writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); + + /* Calculate local time to put in RTC */ + + tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + writertc(RTC_SEC, bin2bcd(tm%60)); tm /= 60; /* Write back Seconds */ + writertc(RTC_MIN, bin2bcd(tm%60)); tm /= 60; /* Write back Minutes */ + writertc(RTC_HRS, bin2bcd(tm%24)); tm /= 24; /* Write back Hours */ + + /* We have now the days since 01-01-1970 in tm */ + writertc(RTC_WDAY, (tm + 4) % 7 + 1); /* Write back Weekday */ + for (y = 1970, m = DAYSPERYEAR + LEAPYEAR(y); + tm >= m; + y++, m = DAYSPERYEAR + LEAPYEAR(y)) + tm -= m; + + /* Now we have the years in y and the day-of-the-year in tm */ + writertc(RTC_YEAR, bin2bcd(y%100)); /* Write back Year */ +#ifdef USE_RTC_CENTURY + writertc(RTC_CENTURY, bin2bcd(y/100)); /* ... and Century */ +#endif + for (m = 0; ; m++) { + int ml; + + ml = daysinmonth[m]; + if (m == 1 && LEAPYEAR(y)) + ml++; + if (tm < ml) + break; + tm -= ml; + } + + writertc(RTC_MONTH, bin2bcd(m + 1)); /* Write back Month */ + writertc(RTC_DAY, bin2bcd(tm + 1)); /* Write back Month Day */ + + /* Reenable RTC updates and interrupts. */ + writertc(RTC_STATUSB, RTCSB_24HR); + rtcin(RTC_INTR); +} +#else +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +void +inittodr(time_t base) +{ + int s, y; + struct timespec ts; + + s = splclock(); + if (base) { + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + + y = time_second - shadow_tv.tv_sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + ts.tv_sec = shadow_tv.tv_sec; + ts.tv_nsec = shadow_tv.tv_nsec * 1000000000; /* :-/ */ + tc_setclock(&ts); + } + splx(s); +} + +/* + * Write system time back to RTC. Not supported for guest domains. 
+ */
+void
+resettodr()
+{
+}
+#endif
+
+/*
+ * Claim timer 2: fails with -1 unless it is currently RELEASED, otherwise
+ * marks it ACQUIRED and writes the requested mode to the mode port.
+ */
+int
+acquire_timer2(int mode)
+{
+
+	if (timer2_state != RELEASED)
+		return (-1);
+	timer2_state = ACQUIRED;
+
+	/*
+	 * This access to the timer registers is as atomic as possible
+	 * because it is a single instruction.  We could do better if we
+	 * knew the rate.  Use of splclock() limits glitches to 10-100us,
+	 * and this is probably good enough for timer2, so we aren't as
+	 * careful with it as with timer0.
+	 */
+	outb(TIMER_MODE, TIMER_SEL2 | (mode & 0x3f));
+
+	return (0);
+}
+/*
+ * Give timer 2 back: fails with -1 unless it is currently ACQUIRED,
+ * otherwise marks it RELEASED and reprograms the mode port.
+ */
+int
+release_timer2()
+{
+
+	if (timer2_state != ACQUIRED)
+		return (-1);
+	timer2_state = RELEASED;
+	outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT);
+	return (0);
+}
+/* Argument block for VCPUOP_set_periodic_timer, shared by all CPUs. */
+static struct vcpu_set_periodic_timer xen_set_periodic_tick;
+
+/*
+ * Start clocks running.
+ * Requests a periodic timer of NS_PER_TICK on vcpu 0, drops any earlier
+ * binding recorded in time_irq, and binds VIRQ_TIMER to clkintr().
+ */
+void
+cpu_initclocks(void)
+{
+	int error;
+
+	xen_set_periodic_tick.period_ns = NS_PER_TICK;
+
+	HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
+			   &xen_set_periodic_tick);
+
+	if (time_irq)
+		unbind_from_irqhandler(time_irq);
+	time_irq = 0;
+
+	error = bind_virq_to_irqhandler(VIRQ_TIMER, 0, "clk",
+	    clkintr,
+	    INTR_TYPE_CLK | INTR_FAST, &time_irq);
+	if (error)
+		panic("failed to register clock interrupt\n");
+
+	/* should fast clock be enabled ?
*/ +} + +int +ap_cpu_initclocks(int cpu) +{ + unsigned int time_irq; + int error; + + xen_set_periodic_tick.period_ns = NS_PER_TICK; + + HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu, + &xen_set_periodic_tick); + error = bind_virq_to_irqhandler(VIRQ_TIMER, 0, "clk", + clkintr, + INTR_TYPE_CLK | INTR_FAST, &time_irq); + if (error) + panic("failed to register clock interrupt\n"); + + + return (0); +} + +void +cpu_startprofclock(void) +{ + + printf("cpu_startprofclock: profiling clock is not supported\n"); +} + +void +cpu_stopprofclock(void) +{ + + printf("cpu_stopprofclock: profiling clock is not supported\n"); +} +#define NSEC_PER_USEC 1000 + +static uint32_t +xen_get_timecount(struct timecounter *tc) +{ + uint64_t clk; + struct shadow_time_info *shadow = PCPU_PTR(shadow_time); + + __get_time_values_from_xen(); + + clk = shadow->system_timestamp + get_nsec_offset(shadow); + + return (uint32_t)((clk / NS_PER_TICK) * NS_PER_TICK); + +} + +/* Return system time offset by ticks */ +uint64_t +get_system_time(int ticks) +{ + return processed_system_time + (ticks * NS_PER_TICK); +} + +/* + * Track behavior of cur_timer->get_offset() functionality in timer_tsc.c + */ + +#if 0 +static uint32_t +xen_get_offset(void) +{ + register unsigned long eax, edx; + + /* Read the Time Stamp Counter */ + + rdtsc(eax,edx); + + /* .. relative to previous jiffy (32 bits is enough) */ + eax -= shadow_tsc_stamp; + + /* + * Time offset = (tsc_low delta) * cached_gtm + * = (tsc_low delta) * (usecs_per_clock) + * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) + * + * Using a mull instead of a divl saves up to 31 clock cycles + * in the critical path. 
+	 */
+
+	__asm__("mull %2"
+		:"=a" (eax), "=d" (edx)
+		:"rm" (cached_gtm),
+		"0" (eax));
+
+	/* our adjusted time offset in microseconds */
+	return edx;
+}
+#endif
+void
+idle_block(void)
+{
+	int err;
+	/*
+	 * Refresh the time values, ask the hypervisor for a timer one tick
+	 * after the last processed system time, then block this vcpu.
+	 */
+	__get_time_values_from_xen();
+	err = HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK);
+	KASSERT(err == 0, ("set_timer_op failed"));
+	HYPERVISOR_sched_op(SCHEDOP_block, 0);
+}

Property changes on: i386/xen/clock.c
___________________________________________________________________
Added: svn:mime-type
   + text/plain
Added: svn:keywords
   + FreeBSD=%H
Added: svn:eol-style
   + native

Index: i386/xen/xen_machdep.c
===================================================================
--- i386/xen/xen_machdep.c	(.../stable/6/sys)	(revision 0)
+++ i386/xen/xen_machdep.c	(.../user/dfr/xenhvm/6/sys)	(revision 190588)
@@ -0,0 +1,1280 @@
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004-2006,2008 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef SMP
+#include
+#endif
+
+
+#include
+
+#define	IDTVEC(name)	__CONCAT(X,name)	/* IDTVEC(div) -> Xdiv */
+
+extern inthand_t
+IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
+	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
+	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
+	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
+	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
+
+
+int xendebug_flags;
+start_info_t *xen_start_info;	/* its flags field is tested for SIF_INITDOMAIN */
+shared_info_t *HYPERVISOR_shared_info;	/* vcpu_info[] and wall-clock fields are read from here */
+xen_pfn_t *xen_machine_phys = machine_to_phys_mapping;
+xen_pfn_t *xen_phys_machine;
+xen_pfn_t *xen_pfn_to_mfn_frame_list[16];
+xen_pfn_t *xen_pfn_to_mfn_frame_list_list;
+int preemptable, init_first;
+extern unsigned int avail_space;
+
+static void printk(const char *fmt, ...);
+
+void ni_cli(void);
+void ni_sti(void);
+
+/*
+ * Function-call wrapper around __cli() that pushes %edx and %eax before
+ * the call and pops them again afterwards.
+ */
+void
+ni_cli(void)
+{
+	__asm__("pushl %edx;"
+		"pushl %eax;"
+		);
+
__cli(); + __asm__("popl %eax;" + "popl %edx;" + ); +} + + +void +ni_sti(void) +{ + __asm__("pushl %edx;" + "pushl %esi;" + "pushl %eax;" + ); + __sti(); + __asm__("popl %eax;" + "popl %esi;" + "popl %edx;" + ); +} + +/* + * Modify the cmd_line by converting ',' to NULLs so that it is in a format + * suitable for the static env vars. + */ +char * +xen_setbootenv(char *cmd_line) +{ + char *cmd_line_next; + + /* Skip leading spaces */ + for (; *cmd_line == ' '; cmd_line++); + + printk("xen_setbootenv(): cmd_line='%s'\n", cmd_line); + + for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); + return cmd_line; +} + +static struct +{ + const char *ev; + int mask; +} howto_names[] = { + {"boot_askname", RB_ASKNAME}, + {"boot_single", RB_SINGLE}, + {"boot_nosync", RB_NOSYNC}, + {"boot_halt", RB_ASKNAME}, + {"boot_serial", RB_SERIAL}, + {"boot_cdrom", RB_CDROM}, + {"boot_gdb", RB_GDB}, + {"boot_gdb_pause", RB_RESERVED1}, + {"boot_verbose", RB_VERBOSE}, + {"boot_multicons", RB_MULTIPLE}, + {NULL, 0} +}; + +int +xen_boothowto(char *envp) +{ + int i, howto = 0; + + /* get equivalents from the environment */ + for (i = 0; howto_names[i].ev != NULL; i++) + if (getenv(howto_names[i].ev) != NULL) + howto |= howto_names[i].mask; + return howto; +} + +#define PRINTK_BUFSIZE 1024 +static void +printk(const char *fmt, ...) 
+{ + __va_list ap; + int retval; + static char buf[PRINTK_BUFSIZE]; + + return; + + va_start(ap, fmt); + retval = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap); + va_end(ap); + buf[retval] = 0; + (void)HYPERVISOR_console_write(buf, retval); +} + + +#define XPQUEUE_SIZE 128 + +struct mmu_log { + char *file; + int line; +}; + +#ifdef SMP +/* per-cpu queues and indices */ +#ifdef INVARIANTS +static struct mmu_log xpq_queue_log[MAX_VIRT_CPUS][XPQUEUE_SIZE]; +#endif + +static int xpq_idx[MAX_VIRT_CPUS]; +static mmu_update_t xpq_queue[MAX_VIRT_CPUS][XPQUEUE_SIZE]; + +#define XPQ_QUEUE xpq_queue[vcpu] +#define XPQ_IDX xpq_idx[vcpu] +#define SET_VCPU() int vcpu = gdtset ? PCPU_GET(cpuid) : 0 + +#define XPQ_QUEUE_LOG xpq_queue_log[vcpu] +#else + +static mmu_update_t xpq_queue[XPQUEUE_SIZE]; +static struct mmu_log xpq_queue_log[XPQUEUE_SIZE]; +static int xpq_idx = 0; + +#define XPQ_QUEUE_LOG xpq_queue_log +#define XPQ_QUEUE xpq_queue +#define XPQ_IDX xpq_idx +#define SET_VCPU() +#endif /* !SMP */ + +#define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1); + +#if 0 +static void +xen_dump_queue(void) +{ + int _xpq_idx = XPQ_IDX; + int i; + + if (_xpq_idx <= 1) + return; + + printk("xen_dump_queue(): %u entries\n", _xpq_idx); + for (i = 0; i < _xpq_idx; i++) { + printk(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); + } +} +#endif + + +static __inline void +_xen_flush_queue(void) +{ + SET_VCPU(); + int _xpq_idx = XPQ_IDX; + int error, i; + /* window of vulnerability here? */ + + if (__predict_true(gdtset)) + critical_enter(); + XPQ_IDX = 0; + /* Make sure index is cleared first to avoid double updates. 
*/
+	error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE,
+				      _xpq_idx, NULL, DOMID_SELF);
+
+#if 0
+	if (__predict_true(gdtset))
+	for (i = _xpq_idx; i > 0;) {
+		if (i >= 3) {
+			CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx "
+			    "ptr: %lx val: %lx ptr: %lx",
+			    (XPQ_QUEUE[i-1].val & 0xffffffff),
+			    (XPQ_QUEUE[i-1].ptr & 0xffffffff),
+			    (XPQ_QUEUE[i-2].val & 0xffffffff),
+			    (XPQ_QUEUE[i-2].ptr & 0xffffffff),
+			    (XPQ_QUEUE[i-3].val & 0xffffffff),
+			    (XPQ_QUEUE[i-3].ptr & 0xffffffff));
+			    i -= 3;
+		} else if (i == 2) {
+			CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx",
+			    (XPQ_QUEUE[i-1].val & 0xffffffff),
+			    (XPQ_QUEUE[i-1].ptr & 0xffffffff),
+			    (XPQ_QUEUE[i-2].val & 0xffffffff),
+			    (XPQ_QUEUE[i-2].ptr & 0xffffffff));
+			i = 0;
+		} else {
+			CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx",
+			    (XPQ_QUEUE[i-1].val & 0xffffffff),
+			    (XPQ_QUEUE[i-1].ptr & 0xffffffff));
+			i = 0;
+		}
+	}
+#endif
+	if (__predict_true(gdtset))
+		critical_exit();
+	if (__predict_false(error < 0)) {
+		/* Dump what was submitted before giving up. */
+		for (i = 0; i < _xpq_idx; i++)
+			printf("val: %llx ptr: %llx\n",
+			    XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr);
+		panic("Failed to execute MMU updates: %d", error);
+	}
+
+}
+/*
+ * Flush the pending MMU updates, if there are any.
+ */
+void
+xen_flush_queue(void)
+{
+	SET_VCPU();
+	if (XPQ_IDX != 0) _xen_flush_queue();
+}
+/*
+ * Account for one newly queued update and flush once the queue is full.
+ */
+static __inline void
+xen_increment_idx(void)
+{
+	SET_VCPU();
+
+	XPQ_IDX++;
+	if (__predict_false(XPQ_IDX == XPQUEUE_SIZE))
+		xen_flush_queue();
+}
+/*
+ * Under INVARIANTS, assert that no MMU updates are pending; otherwise a
+ * no-op.
+ */
+void
+xen_check_queue(void)
+{
+#ifdef INVARIANTS
+	SET_VCPU();
+
+	KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX));
+#endif
+}
+/*
+ * Issue MMUEXT_INVLPG_ALL for the page containing va.
+ */
+void
+xen_invlpg(vm_offset_t va)
+{
+	struct mmuext_op op;
+	int err;
+	op.cmd = MMUEXT_INVLPG_ALL;
+	op.arg1.linear_addr = va & ~PAGE_MASK;
+	err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
+	KASSERT(err >= 0, ("mmuext_op failed"));
+}
+/*
+ * Issue MMUEXT_NEW_BASEPTR for the page directory at address val.
+ * Under INVARIANTS the update queue must be empty on entry.
+ */
+void
+xen_load_cr3(u_int val)
+{
+	struct mmuext_op op;
+	int err;
+#ifdef INVARIANTS
+	SET_VCPU();
+
+	KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX));
+#endif
+	op.cmd = MMUEXT_NEW_BASEPTR;
+
op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT; + err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void +xen_restore_flags(u_int eflags) +{ + + if (eflags > 1) + eflags = ((eflags & PSL_I) == 0); + + __restore_flags(eflags); +} + +int +xen_save_and_cli(void) +{ + int eflags; + + __save_and_cli(eflags); + return (eflags); +} + +void +xen_cli(void) +{ + __cli(); +} + +void +xen_sti(void) +{ + __sti(); +} + +u_int +xen_rcr2(void) +{ + + return (HYPERVISOR_shared_info->vcpu_info[curcpu].arch.cr2); +} + +void +_xen_machphys_update(vm_paddr_t mfn, vm_paddr_t pfn, char *file, int line) +{ + SET_VCPU(); + + if (__predict_true(gdtset)) + critical_enter(); + XPQ_QUEUE[XPQ_IDX].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = pfn; +#ifdef INVARIANTS + XPQ_QUEUE_LOG[XPQ_IDX].file = file; + XPQ_QUEUE_LOG[XPQ_IDX].line = line; +#endif + xen_increment_idx(); + if (__predict_true(gdtset)) + critical_exit(); +} + +void +_xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line) +{ + SET_VCPU(); +#if 0 + if (__predict_true(gdtset)) + mtx_assert(&vm_page_queue_mtx, MA_OWNED); +#endif + + KASSERT((ptr & 7) == 0, ("misaligned update")); + + if (__predict_true(gdtset)) + critical_enter(); + + XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val; +#ifdef INVARIANTS + XPQ_QUEUE_LOG[XPQ_IDX].file = file; + XPQ_QUEUE_LOG[XPQ_IDX].line = line; +#endif + xen_increment_idx(); + if (__predict_true(gdtset)) + critical_exit(); +} + +void +xen_pgdpt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + int err; + op.cmd = MMUEXT_PIN_L3_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void +xen_pgd_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + int err; + op.cmd = MMUEXT_PIN_L2_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + err 
= HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void +xen_pgd_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + int err; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void +xen_pt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + int err; + op.cmd = MMUEXT_PIN_L1_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + printk("xen_pt_pin(): mfn=%x\n", op.arg1.mfn); + xen_flush_queue(); + err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void +xen_pt_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + int err; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void +xen_set_ldt(vm_paddr_t ptr, unsigned long len) +{ + struct mmuext_op op; + int err; + op.cmd = MMUEXT_SET_LDT; + op.arg1.linear_addr = ptr; + op.arg2.nr_ents = len; + xen_flush_queue(); + err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + int err; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + xen_flush_queue(); + err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF); + KASSERT(err >= 0, ("mmuext_op failed")); +} + +void +xen_update_descriptor(union descriptor *table, union descriptor *entry) +{ + vm_paddr_t pa; + pt_entry_t *ptp; + + ptp = vtopte((vm_offset_t)table); + pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK); + if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry)) + panic("HYPERVISOR_update_descriptor failed\n"); +} + + +#if 0 +/* + * Bitmap is indexed by page number. If bit is set, the page is part of a + * xen_create_contiguous_region() area of memory. 
+ */ +unsigned long *contiguous_bitmap; + +static void +contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] |= + ((1UL<> PAGE_SHIFT; + mfn = PFNTOMFN(pfn); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + err = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + KASSERT(err == 1, ("memory_op failed")); + } + + + /* 2. Get a new contiguous memory extent. */ + reservation.extent_order = order; + /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not + * running with a broxen driver XXXEN + */ + reservation.address_bits = 31; + if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1) + goto fail; + + /* 3. Map the new extent in place of old pages. 
*/ + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + xen_machphys_update(mfn+i, pfn); + PFNTOMFN(pfn) = mfn+i; + } + + xen_tlb_flush(); + +#if 0 + contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order); +#endif + + balloon_unlock(flags); + + return 0; + + fail: + reservation.extent_order = 0; + reservation.address_bits = 0; + + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + err = HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation); + KASSERT(err == 1, ("memory_op failed")); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); + + return ENOMEM; +} + +void +xen_destroy_contiguous_region(void *addr, int npages) +{ + unsigned long mfn, i, flags, order, pfn0; + int err; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + + pfn0 = vtophys(addr) >> PAGE_SHIFT; +#if 0 + scrub_pages(vstart, 1 << order); +#endif + /* can currently only handle power of two allocation */ + KASSERT(ffs(npages) == fls(npages), ("non-power of 2 page count")); + + /* 0. determine order */ + order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); + + balloon_lock(flags); + +#if 0 + contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order); +#endif + + /* 1. Zap current PTEs, giving away the underlying pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val = 0; + pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT; + + err = HYPERVISOR_update_va_mapping((vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0); + KASSERT(err == 0, ("update_va_mapping failed")); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + err = HYPERVISOR_memory_op( + XENMEM_decrease_reservation, &reservation); + KASSERT(err == 1, ("memory_op failed")); + } + + /* 2. 
Map new pages in place of old pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val; + pfn = pfn0 + i; + err = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation); + KASSERT(err == 1, ("memory_op failed")); + + new_val = mfn << PAGE_SHIFT; + err = HYPERVISOR_update_va_mapping( + (vm_offset_t)addr + (i * PAGE_SIZE), + new_val, PG_KERNEL); + KASSERT(err == 0, ("update_va_mapping failed")); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); +} + +extern unsigned long cpu0prvpage; +extern unsigned long *SMPpt; +extern struct user *proc0uarea; +extern vm_offset_t proc0kstack; +extern int vm86paddr, vm86phystk; +char *bootmem_start, *bootmem_current, *bootmem_end; + +pteinfo_t *pteinfo_list; +void initvalues(start_info_t *startinfo); + +struct ringbuf_head *xen_store; /* XXX move me */ +char *console_page; + +void * +bootmem_alloc(unsigned int size) +{ + char *retptr; + + retptr = bootmem_current; + KASSERT(retptr + size <= bootmem_end, ("bootmem_alloc failed")); + bootmem_current += size; + + return retptr; +} + +void +bootmem_free(void *ptr, unsigned int size) +{ + char *tptr; + + tptr = ptr; + KASSERT(tptr == bootmem_current - size && + bootmem_current - size >= bootmem_start, + ("bootmem_free failed")); + + bootmem_current -= size; +} + +#if 0 +static vm_paddr_t +xpmap_mtop2(vm_paddr_t mpa) +{ + return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) + ) | (mpa & ~PG_FRAME); +} + +static pd_entry_t +xpmap_get_bootpde(vm_paddr_t va) +{ + + return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22]; +} + +static pd_entry_t +xpmap_get_vbootpde(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_bootpde(va); + if ((pde & PG_V) == 0) + return (pde & ~PG_FRAME); + return (pde & ~PG_FRAME) | + (xpmap_mtop2(pde & PG_FRAME) + KERNBASE); +} + +static pt_entry_t 8* +xpmap_get_bootptep(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_vbootpde(va); + if ((pde & PG_V) 
== 0) + return (void *)-1; +#define PT_MASK 0x003ff000 /* page table address bits */ + return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); +} + +static pt_entry_t +xpmap_get_bootpte(vm_paddr_t va) +{ + + return xpmap_get_bootptep(va)[0]; +} +#endif + + +#ifdef ADD_ISA_HOLE +static void +shift_phys_machine(unsigned long *phys_machine, int nr_pages) +{ + + unsigned long *tmp_page, *current_page, *next_page; + int i; + + tmp_page = bootmem_alloc(PAGE_SIZE); + current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long)); + next_page = current_page - (PAGE_SIZE/sizeof(unsigned long)); + bcopy(phys_machine, tmp_page, PAGE_SIZE); + + while (current_page > phys_machine) { + /* save next page */ + bcopy(next_page, tmp_page, PAGE_SIZE); + /* shift down page */ + bcopy(current_page, next_page, PAGE_SIZE); + /* finish swap */ + bcopy(tmp_page, current_page, PAGE_SIZE); + + current_page -= (PAGE_SIZE/sizeof(unsigned long)); + next_page -= (PAGE_SIZE/sizeof(unsigned long)); + } + bootmem_free(tmp_page, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + xen_machphys_update(phys_machine[i], i); + } + memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE); + +} +#endif /* ADD_ISA_HOLE */ + +/* + * Build a directory of the pages that make up our Physical to Machine + * mapping table. The Xen suspend/restore code uses this to find our + * mapping table. 
+ */ +static void +init_frame_list_list(void *arg) +{ + unsigned long nr_pages = xen_start_info->nr_pages; +#define FPP (PAGE_SIZE/sizeof(xen_pfn_t)) + int i, j, k; + + xen_pfn_to_mfn_frame_list_list = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + for (i = 0, j = 0, k = -1; i < nr_pages; + i += FPP, j++) { + if ((j & (FPP - 1)) == 0) { + k++; + xen_pfn_to_mfn_frame_list[k] = + malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + xen_pfn_to_mfn_frame_list_list[k] = + VTOMFN(xen_pfn_to_mfn_frame_list[k]); + j = 0; + } + xen_pfn_to_mfn_frame_list[k][j] = + VTOMFN(&xen_phys_machine[i]); + } + + HYPERVISOR_shared_info->arch.max_pfn = nr_pages; + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list + = VTOMFN(xen_pfn_to_mfn_frame_list_list); +} +SYSINIT(init_fll, SI_SUB_DEVFS, SI_ORDER_ANY, init_frame_list_list, NULL); + +extern unsigned long physfree; + +int pdir, curoffset; +extern int nkpt; + +void +initvalues(start_info_t *startinfo) +{ + int l3_pages, l2_pages, l1_pages, offset; + vm_offset_t cur_space, cur_space_pt; + struct physdev_set_iopl set_iopl; + + vm_paddr_t KPTphys, IdlePTDma; + vm_paddr_t console_page_ma, xen_store_ma; + vm_offset_t KPTphysoff, tmpva; + vm_paddr_t shinfo; +#ifdef PAE + vm_paddr_t IdlePDPTma, IdlePDPTnewma; + vm_paddr_t IdlePTDnewma[4]; + pd_entry_t *IdlePDPTnew, *IdlePTDnew; +#else + vm_paddr_t pdir_shadow_ma; +#endif + unsigned long i; + int ncpus, err; + + nkpt = min( + min( + max((startinfo->nr_pages >> NPGPTD_SHIFT), nkpt), + NPGPTD*NPDEPG - KPTDI), + (HYPERVISOR_VIRT_START - KERNBASE) >> PDRSHIFT); + +#ifdef SMP + ncpus = MAXCPU; +#else + ncpus = 1; +#endif + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); +#ifdef notyet + /* + * need to install handler + */ + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); +#endif + xen_start_info = startinfo; + xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list; + + IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE); + l1_pages = 0; + +#ifdef PAE + 
l3_pages = 1; + l2_pages = 0; + IdlePDPT = (pd_entry_t *)startinfo->pt_base; + IdlePDPTma = xpmap_ptom(VTOP(startinfo->pt_base)); + for (i = (KERNBASE >> 30); + (i < 4) && (IdlePDPT[i] != 0); i++) + l2_pages++; + /* + * Note that only one page directory has been allocated at this point. + * Thus, if KERNBASE + */ +#if 0 + for (i = 0; i < l2_pages; i++) + IdlePTDma[i] = xpmap_ptom(VTOP(IdlePTD + i*PAGE_SIZE)); +#endif + + l2_pages = (l2_pages == 0) ? 1 : l2_pages; +#else + l3_pages = 0; + l2_pages = 1; +#endif + for (i = (((KERNBASE>>18) & PAGE_MASK)>>PAGE_SHIFT); + (i>PDRSHIFT)); i++) { + + if (IdlePTD[i] == 0) + break; + l1_pages++; + } + + /* number of pages allocated after the pts + 1*/; + cur_space = xen_start_info->pt_base + + ((xen_start_info->nr_pt_frames) + 3 )*PAGE_SIZE; + printk("initvalues(): wooh - availmem=%x,%x\n", avail_space, cur_space); + + printk("KERNBASE=%x,pt_base=%x, VTOPFN(base)=%x, nr_pt_frames=%x\n", + KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base), + xen_start_info->nr_pt_frames); + xendebug_flags = 0; /* 0xffffffff; */ + + /* allocate 4 pages for bootmem allocator */ + bootmem_start = bootmem_current = (char *)cur_space; + cur_space += (4 * PAGE_SIZE); + bootmem_end = (char *)cur_space; + + /* allocate page for gdt */ + gdt = (union descriptor *)cur_space; + cur_space += PAGE_SIZE*ncpus; + + /* allocate page for ldt */ + ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; + cur_space += PAGE_SIZE; + + HYPERVISOR_shared_info = (shared_info_t *)cur_space; + cur_space += PAGE_SIZE; + + xen_store = (struct ringbuf_head *)cur_space; + cur_space += PAGE_SIZE; + + console_page = (char *)cur_space; + cur_space += PAGE_SIZE; + +#ifdef ADD_ISA_HOLE + shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages); +#endif + /* + * pre-zero unused mapped pages - mapped on 4MB boundary + */ +#ifdef PAE + IdlePDPT = (pd_entry_t *)startinfo->pt_base; + IdlePDPTma = xpmap_ptom(VTOP(startinfo->pt_base)); + /* + * Note that only 
one page directory has been allocated at this point. + * Thus, if KERNBASE + */ + IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE); + IdlePTDma = xpmap_ptom(VTOP(IdlePTD)); + l3_pages = 1; +#else + IdlePTD = (pd_entry_t *)startinfo->pt_base; + IdlePTDma = xpmap_ptom(VTOP(startinfo->pt_base)); + l3_pages = 0; +#endif + l2_pages = 1; + l1_pages = xen_start_info->nr_pt_frames - l2_pages - l3_pages; + + KPTphysoff = (l2_pages + l3_pages)*PAGE_SIZE; + + KPTphys = xpmap_ptom(VTOP(startinfo->pt_base + KPTphysoff)); + XENPRINTF("IdlePTD %p\n", IdlePTD); + XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx " + "mod_start: 0x%lx mod_len: 0x%lx\n", + xen_start_info->nr_pages, xen_start_info->shared_info, + xen_start_info->flags, xen_start_info->pt_base, + xen_start_info->mod_start, xen_start_info->mod_len); + /* Map proc0's KSTACK */ + + proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE); + printk("proc0kstack=%u\n", proc0kstack); + + /* vm86/bios stack */ + cur_space += PAGE_SIZE; + + /* Map space for the vm86 region */ + vm86paddr = (vm_offset_t)cur_space; + cur_space += (PAGE_SIZE * 3); + +#ifdef PAE + IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE; + bzero(IdlePDPTnew, PAGE_SIZE); + + IdlePDPTnewma = xpmap_ptom(VTOP(IdlePDPTnew)); + IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE; + bzero(IdlePTDnew, 4*PAGE_SIZE); + + for (i = 0; i < 4; i++) + IdlePTDnewma[i] = + xpmap_ptom(VTOP((uint8_t *)IdlePTDnew + i*PAGE_SIZE)); + /* + * L3 + * + * Copy the 4 machine addresses of the new PTDs in to the PDPT + * + */ + for (i = 0; i < 4; i++) + IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V; + + __asm__("nop;"); + /* + * + * re-map the new PDPT read-only + */ + PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V); + /* + * + * Unpin the current PDPT + */ + xen_pt_unpin(IdlePDPTma); + + for (i = 0; i < 20; i++) { + int startidx = ((KERNBASE >> 18) & PAGE_MASK) >> 3; + + if (IdlePTD[startidx + i] == 0) { + l1_pages = 
i; + break; + } + } + +#endif /* PAE */ + + /* unmap remaining pages from initial 4MB chunk + * + */ + for (tmpva = cur_space; (tmpva & ((1<<22)-1)) != 0; tmpva += PAGE_SIZE) { + bzero((char *)tmpva, PAGE_SIZE); + PT_SET_MA(tmpva, (vm_paddr_t)0); + } + + PT_UPDATES_FLUSH(); + + memcpy(((uint8_t *)IdlePTDnew) + ((unsigned int)(KERNBASE >> 18)), + ((uint8_t *)IdlePTD) + ((KERNBASE >> 18) & PAGE_MASK), + l1_pages*sizeof(pt_entry_t)); + + for (i = 0; i < 4; i++) { + PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE, + IdlePTDnewma[i] | PG_V); + } + xen_load_cr3(VTOP(IdlePDPTnew)); + xen_pgdpt_pin(xpmap_ptom(VTOP(IdlePDPTnew))); + + /* allocate remainder of nkpt pages */ + cur_space_pt = cur_space; + for (offset = (KERNBASE >> PDRSHIFT), i = l1_pages; i < nkpt; + i++, cur_space += PAGE_SIZE) { + pdir = (offset + i) / NPDEPG; + curoffset = ((offset + i) % NPDEPG); + if (((offset + i) << PDRSHIFT) == VM_MAX_KERNEL_ADDRESS) + break; + + /* + * make sure that all the initial page table pages + * have been zeroed + */ + PT_SET_MA(cur_space_pt, + xpmap_ptom(VTOP(cur_space)) | PG_V | PG_RW); + bzero((char *)cur_space_pt, PAGE_SIZE); + PT_SET_MA(cur_space_pt, (vm_paddr_t)0); + xen_pt_pin(xpmap_ptom(VTOP(cur_space))); + xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + + curoffset*sizeof(vm_paddr_t)), + xpmap_ptom(VTOP(cur_space)) | PG_KERNEL); + PT_UPDATES_FLUSH(); + } + + for (i = 0; i < 4; i++) { + pdir = (PTDPTDI + i) / NPDEPG; + curoffset = (PTDPTDI + i) % NPDEPG; + + xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + + curoffset*sizeof(vm_paddr_t)), + IdlePTDnewma[i] | PG_V); + } + + PT_UPDATES_FLUSH(); + + IdlePTD = IdlePTDnew; + IdlePDPT = IdlePDPTnew; + IdlePDPTma = IdlePDPTnewma; + + /* + * shared_info is an unsigned long so this will randomly break if + * it is allocated above 4GB - I guess people are used to that + * sort of thing with Xen ... 
sigh + */ + shinfo = xen_start_info->shared_info; + PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL); + + printk("#4\n"); + + xen_store_ma = (((vm_paddr_t)xen_start_info->store_mfn) << PAGE_SHIFT); + PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL); + console_page_ma = (((vm_paddr_t)xen_start_info->console.domU.mfn) << PAGE_SHIFT); + PT_SET_MA(console_page, console_page_ma | PG_KERNEL); + + printk("#5\n"); + + set_iopl.iopl = 1; + err = HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl); + KASSERT(err == 0, ("physdev_op failed")); + printk("#6\n"); +#if 0 + /* add page table for KERNBASE */ + xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t), + xpmap_ptom(VTOP(cur_space) | PG_KERNEL)); + xen_flush_queue(); +#ifdef PAE + xen_queue_pt_update(pdir_shadow_ma[3] + KPTDI*sizeof(vm_paddr_t), + xpmap_ptom(VTOP(cur_space) | PG_V | PG_A)); +#else + xen_queue_pt_update(pdir_shadow_ma + KPTDI*sizeof(vm_paddr_t), + xpmap_ptom(VTOP(cur_space) | PG_V | PG_A)); +#endif + xen_flush_queue(); + cur_space += PAGE_SIZE; + printk("#6\n"); +#endif /* 0 */ +#ifdef notyet + if (xen_start_info->flags & SIF_INITDOMAIN) { + /* Map first megabyte */ + for (i = 0; i < (256 << PAGE_SHIFT); i += PAGE_SIZE) + PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD); + xen_flush_queue(); + } +#endif + /* + * re-map kernel text read-only + * + */ + for (i = (((vm_offset_t)&btext) & ~PAGE_MASK); + i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE) + PT_SET_MA(i, xpmap_ptom(VTOP(i)) | PG_V | PG_A); + + printk("#7\n"); + physfree = VTOP(cur_space); + init_first = physfree >> PAGE_SHIFT; + IdlePTD = (pd_entry_t *)VTOP(IdlePTD); + IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT); + setup_xen_features(); + printk("#8, proc0kstack=%u\n", proc0kstack); +} + + +trap_info_t trap_table[] = { + { 0, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)}, + { 1, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)}, + { 3, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)}, + { 4, 3, 
GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)}, + /* This is UPL on Linux and KPL on BSD */ + { 5, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)}, + { 6, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)}, + { 7, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)}, + /* + * { 8, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)}, + * no handler for double fault + */ + { 9, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)}, + {10, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)}, + {11, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)}, + {12, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)}, + {13, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)}, + {14, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)}, + {15, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)}, + {16, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)}, + {17, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)}, + {18, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)}, + {19, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)}, + {0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)}, + { 0, 0, 0, 0 } +}; + +/********** CODE WORTH KEEPING ABOVE HERE *****************/ + +void xen_failsafe_handler(void); + +void +xen_failsafe_handler(void) +{ + + panic("xen_failsafe_handler called!\n"); +} + +void xen_handle_thread_switch(struct pcb *pcb); + +/* This is called by cpu_switch() when switching threads. 
*/
+/* The pcb arg refers to the process control block of the */
+/* next thread which is to run
+ *
+ * Builds one multicall batch of up to three entries and submits it with a
+ * single HYPERVISOR_multicall() call:
+ *   - always a __HYPERVISOR_stack_switch entry carrying the
+ *     GSEL(GDATA_SEL, SEL_KPL) selector and the pcb pointer;
+ *   - a __HYPERVISOR_update_descriptor entry for the fsd slot, only when
+ *     the copy held in the per-CPU fsgs_gdt differs from pcb->pcb_fsd;
+ *   - the same for the following 8-byte slot (gsd).
+ * NOTE(review): stepping b by two 32-bit words assumes pcb_gsd sits
+ * directly after pcb_fsd in struct pcb -- confirm against its definition.
+ * The return value of the multicall is deliberately discarded.
+ */
+void
+xen_handle_thread_switch(struct pcb *pcb)
+{
+	uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0];
+	uint32_t *b = (uint32_t *)&pcb->pcb_fsd;
+	multicall_entry_t mcl[3];
+	int i = 0;
+
+	/* Notify Xen of task switch (entry 0 is always queued) */
+	mcl[i].op = __HYPERVISOR_stack_switch;
+	mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL);
+	mcl[i++].args[1] = (unsigned long)pcb;
+
+	/*
+	 * Check for update of fsd: both 32-bit halves are compared and an
+	 * update_descriptor call is queued only on a mismatch.  The two
+	 * 64-bit stores start at args[0] (result of vtomach() on the slot,
+	 * presumably its machine address) and at args[2] (the new
+	 * descriptor value); on i386 each presumably spans two args[] slots.
+	 */
+	if (*a != *b || *(a+1) != *(b+1)) {
+		mcl[i].op = __HYPERVISOR_update_descriptor;
+		*(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
+		*(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
+	}
+
+	a += 2;
+	b += 2;
+
+	/* Check for update of gsd (same comparison, next 8-byte slot) */
+	if (*a != *b || *(a+1) != *(b+1)) {
+		mcl[i].op = __HYPERVISOR_update_descriptor;
+		*(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
+		*(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
+	}
+
+	(void)HYPERVISOR_multicall(mcl, i);
+}
Property changes on: i386/xen/xen_machdep.c ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: i386/xen/mp_machdep.c =================================================================== --- i386/xen/mp_machdep.c (.../stable/6/sys) (revision 0) +++ i386/xen/mp_machdep.c (.../user/dfr/xenhvm/6/sys) (revision 190588) @@ -0,0 +1,1407 @@ +/*- + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_apic.h" +#include "opt_cpu.h" +#include "opt_kdb.h" +#include "opt_kstack_pages.h" +#include "opt_mp_watchdog.h" +#include "opt_sched.h" + +#if !defined(lint) +#if !defined(SMP) +#error How did you get here? +#endif + +#ifndef DEV_APIC +#error The apic device is required for SMP, add "device apic" to your config file. 
+#endif +#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) +#error SMP not supported with CPU_DISABLE_CMPXCHG +#endif +#endif /* not lint */ + +#include +#include +#include +#include /* cngetc() */ +#ifdef GPROF +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include /** COUNT_XINVLTLB_HITS */ +#include + +#include +#include +#include +#include +#include +#include + +#define WARMBOOT_TARGET 0 +#define WARMBOOT_OFF (KERNBASE + 0x0467) +#define WARMBOOT_SEG (KERNBASE + 0x0469) + +#define CMOS_REG (0x70) +#define CMOS_DATA (0x71) +#define BIOS_RESET (0x0f) +#define BIOS_WARM (0x0a) + +/* + * this code MUST be enabled here and in mpboot.s. + * it follows the very early stages of AP boot by placing values in CMOS ram. + * it NORMALLY will never be needed and thus the primitive method for enabling. + * +#define CHECK_POINTS + */ + +/* lock region used by kernel profiling */ +int mcount_lock; + +/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ +int current_postcode; + +int mp_naps; /* # of Applications processors */ +int boot_cpu_id = -1; /* designated BSP */ +extern int nkpt; + +extern struct pcpu __pcpu[]; + +/* + * CPU topology map datastructures for HTT. + */ +static struct cpu_group mp_groups[MAXCPU]; +static struct cpu_top mp_top; + +/* AP uses this during bootstrap. Do not staticize. */ +char *bootSTK; +static int bootAP; +static union descriptor *bootAPgdt; + +static char resched_name[MAX_VIRT_CPUS][15]; +static char callfunc_name[MAX_VIRT_CPUS][15]; + +/* Free these after use */ +void *bootstacks[MAXCPU]; + +/* Hotwire a 0->4MB V==P mapping */ +extern pt_entry_t *KPTphys; + +struct pcb stoppcbs[MAXCPU]; + +/* Variables needed for SMP tlb shootdown. 
*/ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; + +typedef void call_data_func_t(uintptr_t , uintptr_t); + + +#ifdef COUNT_IPIS +/* Interrupt counts. */ +#ifdef IPI_PREEMPTION +static u_long *ipi_preempt_counts[MAXCPU]; +#endif +static u_long *ipi_ast_counts[MAXCPU]; +u_long *ipi_invltlb_counts[MAXCPU]; +u_long *ipi_invlrng_counts[MAXCPU]; +u_long *ipi_invlpg_counts[MAXCPU]; +u_long *ipi_invlcache_counts[MAXCPU]; +u_long *ipi_rendezvous_counts[MAXCPU]; +u_long *ipi_lazypmap_counts[MAXCPU]; +#endif + +/* + * Local data and functions. + */ + +static u_int logical_cpus; + +/* used to hold the AP's until we are ready to release them */ +static struct mtx ap_boot_mtx; + +/* Set to 1 once we're ready to let the APs out of the pen. */ +static volatile int aps_ready = 0; + +/* + * Store data from cpu_add() until later in the boot when we actually setup + * the APs. + */ +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; + int cpu_disabled:1; +} static cpu_info[MAX_APIC_ID + 1]; +static int cpu_apic_ids[MAXCPU]; + +/* Holds pending bitmap based IPIs per CPU */ +static volatile u_int cpu_ipi_pending[MAXCPU]; + +static u_int boot_address; + +static void assign_cpu_ids(void); +static void set_interrupt_apic_ids(void); +static int start_all_aps(void); +static int start_ap(int apic_id); +static void release_aps(void *dummy); + +static u_int hyperthreading_cpus; +static cpumask_t hyperthreading_cpus_mask; +extern void Xhypervisor_callback(void); +extern void failsafe_callback(void); +extern void pmap_lazyfix_action(void); + +void +mp_topology(void) +{ + struct cpu_group *group; + int logical_cpus; + int apic_id; + int groups; + int cpu; + + /* Build the smp_topology map. */ + /* Nothing to do if there is no HTT support. 
*/ + if ((cpu_feature & CPUID_HTT) == 0) + return; + logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; + if (logical_cpus <= 1) + return; + group = &mp_groups[0]; + groups = 1; + for (cpu = 0, apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) { + if (!cpu_info[apic_id].cpu_present) + continue; + /* + * If the current group has members and we're not a logical + * cpu, create a new group. + */ + if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) { + group++; + groups++; + } + group->cg_count++; + group->cg_mask |= 1 << cpu; + cpu++; + } + + mp_top.ct_count = groups; + mp_top.ct_group = mp_groups; + smp_topology = &mp_top; +} + + +/* + * Calculate usable address in base memory for AP trampoline code. + */ +u_int +mp_bootaddress(u_int basemem) +{ + + return (basemem); +} + +void +cpu_add(u_int apic_id, char boot_cpu) +{ + + if (apic_id > MAX_APIC_ID) { + panic("SMP: APIC ID %d too high", apic_id); + return; + } + KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", + apic_id)); + cpu_info[apic_id].cpu_present = 1; + if (boot_cpu) { + KASSERT(boot_cpu_id == -1, + ("CPU %d claims to be BSP, but CPU %d already is", apic_id, + boot_cpu_id)); + boot_cpu_id = apic_id; + cpu_info[apic_id].cpu_bsp = 1; + } + if (mp_ncpus < MAXCPU) + mp_ncpus++; + if (bootverbose) + printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : + "AP"); +} + +void +cpu_mp_setmaxid(void) +{ + + mp_maxid = MAXCPU - 1; +} + +int +cpu_mp_probe(void) +{ + + /* + * Always record BSP in CPU map so that the mbuf init code works + * correctly. + */ + all_cpus = 1; + if (mp_ncpus == 0) { + /* + * No CPUs were found, so this must be a UP system. Setup + * the variables to represent a system with a single CPU + * with an id of 0. + */ + mp_ncpus = 1; + return (0); + } + + /* At least one CPU was found. */ + if (mp_ncpus == 1) { + /* + * One CPU was found, so this must be a UP system with + * an I/O APIC. + */ + return (0); + } + + /* At least two CPUs were found. 
*/ + return (1); +} + +/* + * Initialize the IPI handlers and start up the AP's. + */ +void +cpu_mp_start(void) +{ + int i; + + /* Initialize the logical ID to APIC ID table. */ + for (i = 0; i < MAXCPU; i++) { + cpu_apic_ids[i] = -1; + cpu_ipi_pending[i] = 0; + } + +#if 0 + /* + * IPI list that has to be converted to Xen + * + */ + /* Install an inter-CPU IPI for TLB invalidation */ + setidt(IPI_INVLTLB, IDTVEC(invltlb), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_INVLPG, IDTVEC(invlpg), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_INVLRNG, IDTVEC(invlrng), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for cache invalidation. */ + setidt(IPI_INVLCACHE, IDTVEC(invlcache), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for lazy pmap release */ + setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for all-CPU rendezvous */ + setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install generic inter-CPU IPI handler */ + setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for CPU stop/restart */ + setidt(IPI_STOP, IDTVEC(cpustop), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + + /* Set boot_cpu_id if needed. */ + if (boot_cpu_id == -1) { + boot_cpu_id = PCPU_GET(apic_id); + cpu_info[boot_cpu_id].cpu_bsp = 1; + } else + KASSERT(boot_cpu_id == PCPU_GET(apic_id), + ("BSP's APIC ID doesn't match boot_cpu_id")); + cpu_apic_ids[0] = boot_cpu_id; + + assign_cpu_ids(); + + /* Start each Application Processor */ + start_all_aps(); + + /* Setup the initial logical CPUs info. 
*/ + logical_cpus = logical_cpus_mask = 0; + if (cpu_feature & CPUID_HTT) + logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; + + set_interrupt_apic_ids(); +} + + +static void +iv_rendezvous(uintptr_t a, uintptr_t b) +{ + smp_rendezvous_action(); +} + +static void +iv_invltlb(uintptr_t a, uintptr_t b) +{ + xen_tlb_flush(); +} + +static void +iv_invlpg(uintptr_t a, uintptr_t b) +{ + xen_invlpg(a); +} + +static void +iv_invlrng(uintptr_t a, uintptr_t b) +{ + vm_offset_t start = (vm_offset_t)a; + vm_offset_t end = (vm_offset_t)b; + + while (start < end) { + xen_invlpg(start); + start += PAGE_SIZE; + } +} + + +static void +iv_invlcache(uintptr_t a, uintptr_t b) +{ + + wbinvd(); +} + +static void +iv_lazypmap(uintptr_t a, uintptr_t b) +{ + pmap_lazyfix_action(); +} + + +static void +iv_noop(uintptr_t a, uintptr_t b) +{ +} + +static call_data_func_t *ipi_vectors[IPI_BITMAP_VECTOR] = +{ + iv_noop, + iv_noop, + iv_rendezvous, + iv_invltlb, + iv_invlpg, + iv_invlrng, + iv_invlcache, + iv_lazypmap, +}; + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. 
+ */
+/*
+ * Handler for the reschedule IPI: atomically fetches and clears this CPU's
+ * cpu_ipi_pending word and services the bitmapped IPIs recorded in it.
+ * With IPI_PREEMPTION, a pending IPI_PREEMPT either marks the running
+ * (non-idle) thread as owing a preemption or switches away immediately.
+ */
+static void
+smp_reschedule_interrupt(void *unused)
+{
+	int cpu = PCPU_GET(cpuid);
+	u_int ipi_bitmap;
+
+	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
+
+#ifdef IPI_PREEMPTION
+	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
+#ifdef COUNT_IPIS
+		/*
+		 * Parenthesised on purpose: postfix ++ binds tighter than
+		 * unary *, so "*p[cpu]++" would advance the pointer instead
+		 * of bumping the counter it points at.
+		 */
+		(*ipi_preempt_counts[cpu])++;
+#endif
+		mtx_lock_spin(&sched_lock);
+		/* Don't preempt the idle thread */
+		if (curthread != PCPU_GET(idlethread)) {
+			struct thread *running_thread = curthread;
+			if (running_thread->td_critnest > 1)
+				running_thread->td_owepreempt = 1;
+			else
+				mi_switch(SW_INVOL | SW_PREEMPT, NULL);
+		}
+		mtx_unlock_spin(&sched_lock);
+	}
+#endif
+
+	if (ipi_bitmap & (1 << IPI_AST)) {
+#ifdef COUNT_IPIS
+		(*ipi_ast_counts[cpu])++;
+#endif
+		/* Nothing to do for AST */
+	}
+}
+
+/*
+ * Request block handed from the CPU initiating a cross-call to the CPUs
+ * that receive the call-function IPI.
+ */
+struct _call_data {
+	uint16_t func_id;	/* index into ipi_vectors[] */
+	uint16_t wait;		/* non-zero: handler also bumps 'finished' */
+	uintptr_t arg1;		/* first argument passed to the handler */
+	uintptr_t arg2;		/* second argument passed to the handler */
+	atomic_t started;	/* incremented once a CPU has copied the request */
+	atomic_t finished;	/* incremented after the handler ran (wait != 0) */
+};
+
+/* Request currently being serviced; NULL when no cross-call is in flight. */
+static struct _call_data *call_data;
+
+/*
+ * Handler for the call-function IPI: copies the request out of call_data,
+ * acknowledges it via 'started', runs the ipi_vectors[] entry selected by
+ * func_id and finally increments smp_tlb_wait so the initiator can stop
+ * spinning.
+ */
+static void
+smp_call_function_interrupt(void *arg)
+{
+	call_data_func_t *func;
+	uintptr_t arg1 = call_data->arg1;
+	uintptr_t arg2 = call_data->arg2;
+	int wait = call_data->wait;
+	atomic_t *started = &call_data->started;
+	atomic_t *finished = &call_data->finished;
+
+	/*
+	 * ipi_vectors[] has IPI_BITMAP_VECTOR slots, so the valid ids are
+	 * 0 .. IPI_BITMAP_VECTOR - 1; an id equal to the size is already
+	 * out of bounds.
+	 */
+	if (call_data->func_id >= IPI_BITMAP_VECTOR)
+		panic("invalid function id %u", call_data->func_id);
+
+	func = ipi_vectors[call_data->func_id];
+	/*
+	 * Notify initiating CPU that I've grabbed the data and am
+	 * about to execute the function
+	 */
+	mb();
+	atomic_inc(started);
+	/*
+	 * At this point the info structure may be out of scope unless wait==1
+	 */
+	(*func)(arg1, arg2);
+
+	if (wait) {
+		mb();
+		atomic_inc(finished);
+	}
+	atomic_add_int(&smp_tlb_wait, 1);
+}
+
+/*
+ * Print various information about the SMP system hardware and setup.
+ */
+void
+cpu_mp_announce(void)
+{
+	int i, x;
+
+	/* List CPUs */
+	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
+	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
+		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
+			continue;
+		if (cpu_info[x].cpu_disabled)
+			printf(" cpu (AP): APIC ID: %2d (disabled)\n", x);
+		else {
+			KASSERT(i < mp_ncpus,
+			    ("mp_ncpus and actual cpus are out of whack"));
+			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
+		}
+	}
+}
+
+/*
+ * Bind the reschedule and call-function IPIs of 'cpu' to their handlers
+ * and record the resulting irqs in that CPU's per-cpu data.  For CPUs other
+ * than 0 the AP clocks are initialised as well.
+ *
+ * Returns 0 on success.  On failure the error from the failing step is
+ * returned and any irq that was already bound is unbound again.
+ */
+static int
+xen_smp_intr_init(unsigned int cpu)
+{
+	int rc;
+	unsigned int irq;
+
+	/*
+	 * Mark both irqs as "not bound" through the same per_cpu() accessor
+	 * that the rest of this function uses (there is no local 'pc' here),
+	 * so the fail path only unbinds what was really bound.
+	 */
+	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
+	sprintf(resched_name[cpu], "resched%u", cpu);
+	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
+	    cpu,
+	    resched_name[cpu],
+	    smp_reschedule_interrupt,
+	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
+	/* 'irq' is only meaningful when the bind succeeded. */
+	if (rc < 0)
+		goto fail;
+	per_cpu(resched_irq, cpu) = irq;
+
+	printf("cpu=%d irq=%d vector=%d\n",
+	    cpu, per_cpu(resched_irq, cpu), RESCHEDULE_VECTOR);
+
+	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
+	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
+	    cpu,
+	    callfunc_name[cpu],
+	    smp_call_function_interrupt,
+	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
+	if (rc < 0)
+		goto fail;
+	per_cpu(callfunc_irq, cpu) = irq;
+
+	printf("cpu=%d irq=%d vector=%d\n",
+	    cpu, per_cpu(callfunc_irq, cpu), CALL_FUNCTION_VECTOR);
+
+	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
+		goto fail;
+
+	return 0;
+
+ fail:
+	if (per_cpu(resched_irq, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
+	if (per_cpu(callfunc_irq, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
+	return rc;
+}
+
+/*
+ * Set up the IPI handlers for every CPU counted in mp_ncpus.
+ * NOTE(review): the result of xen_smp_intr_init() is ignored here.
+ */
+static void
+xen_smp_intr_init_cpus(void *unused)
+{
+	int i;
+
+	for (i = 0; i < mp_ncpus; i++)
+		xen_smp_intr_init(i);
+}
+
+#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
+/*
+ * AP CPU's call this to initialize themselves.
+ */
+void
+init_secondary(void)
+{
+	vm_offset_t addr;
+	int gsel_tss;
+
+	/* bootAP is set in start_ap() to our ID.
*/ + PCPU_SET(currentldt, _default_ldt); + + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); +#if 0 + gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; +#endif + PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); +#if 0 + PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); + ltr(gsel_tss); +#endif + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + + /* signal our startup to the BSP. */ + mp_naps++; + + /* Spin until the BSP releases the AP's. */ + while (!aps_ready) + ia32_pause(); + + /* BSP may have changed PTD while we were waiting */ + invltlb(); + for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) + invlpg(addr); + + /* set up FPU state on the AP */ + npxinit(__INITIAL_NPXCW__); + +#if 0 + /* set up SSE registers */ + enable_sse(); + + /* A quick check from sanity claus */ + if (PCPU_GET(apic_id) != lapic_id()) { + printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); + printf("SMP: actual apic_id = %d\n", lapic_id()); + printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); + panic("cpuid mismatch! boom!!"); + } +#endif + /* Initialize curthread. */ + KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); + PCPU_SET(curthread, PCPU_GET(idlethread)); + + mtx_lock_spin(&ap_boot_mtx); +#if 0 + /* Init local apic for irq's */ + lapic_setup(1); + + /* Set memory range attributes for this CPU to match the BSP */ + mem_range_AP_init(); +#endif + smp_cpus++; + + CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); + + /* Determine if we are a logical CPU. */ + if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) + logical_cpus_mask |= PCPU_GET(cpumask); + + /* Determine if we are a hyperthread. 
*/ + if (hyperthreading_cpus > 1 && + PCPU_GET(apic_id) % hyperthreading_cpus != 0) + hyperthreading_cpus_mask |= PCPU_GET(cpumask); + + /* Build our map of 'other' CPUs. */ + PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); +#if 0 + if (bootverbose) + lapic_dump("AP"); +#endif + if (smp_cpus == mp_ncpus) { + /* enable IPI's, tlb shootdown, freezes etc */ + atomic_store_rel_int(&smp_started, 1); + smp_active = 1; /* historic */ + } + + mtx_unlock_spin(&ap_boot_mtx); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ia32_pause(); + + /* ok, now grab sched_lock and enter the scheduler */ + mtx_lock_spin(&sched_lock); + + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + spinlock_exit(); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + + cpu_throw(NULL, choosethread()); /* doesn't return */ + + panic("scheduler returned us to %s", __func__); + /* NOTREACHED */ +} + +/******************************************************************* + * local functions and data + */ + +/* + * We tell the I/O APIC code about all the CPUs we want to receive + * interrupts. If we don't want certain CPUs to receive IRQs we + * can simply not tell the I/O APIC code about them in this function. + * We also do not tell it about the BSP since it tells itself about + * the BSP internally to work with UP kernels and on UP machines. 
+ */ +static void +set_interrupt_apic_ids(void) +{ + u_int apic_id; + + for (apic_id = 0; apic_id < MAXCPU; apic_id++) { + if (!cpu_info[apic_id].cpu_present) + continue; + if (cpu_info[apic_id].cpu_bsp) + continue; + if (cpu_info[apic_id].cpu_disabled) + continue; + + /* Don't let hyperthreads service interrupts. */ + if (hyperthreading_cpus > 1 && + apic_id % hyperthreading_cpus != 0) + continue; + + intr_add_cpu(apic_id); + } +} + +/* + * Assign logical CPU IDs to local APICs. + */ +static void +assign_cpu_ids(void) +{ + u_int i; + + /* Check for explicitly disabled CPUs. */ + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) + continue; + + /* Don't use this CPU if it has been disabled by a tunable. */ + if (resource_disabled("lapic", i)) { + cpu_info[i].cpu_disabled = 1; + continue; + } + } + + /* + * Assign CPU IDs to local APIC IDs and disable any CPUs + * beyond MAXCPU. CPU 0 has already been assigned to the BSP, + * so we only have to assign IDs for APs. 
+ */ + mp_ncpus = 1; + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || + cpu_info[i].cpu_disabled) + continue; + + if (mp_ncpus < MAXCPU) { + cpu_apic_ids[mp_ncpus] = i; + mp_ncpus++; + } else + cpu_info[i].cpu_disabled = 1; + } + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, + mp_ncpus)); +} + +/* + * start each AP in our list + */ +static int +start_all_aps(void) +{ + int apic_id, cpu, i; + struct pcpu *pc; + + mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); + + /* start each AP */ + for (cpu = 1; cpu < mp_ncpus; cpu++) { + apic_id = cpu_apic_ids[cpu]; + + bootstacks[cpu] = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); + + /* setup a vector to our boot code */ + *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); + + bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4; + bootAP = cpu; + bootAPgdt = gdt + (512*cpu); + + /* Get per-cpu data */ + pc = &__pcpu[bootAP]; + pcpu_init(pc, bootAP, sizeof(struct pcpu)); + pc->pc_apic_id = cpu_apic_ids[bootAP]; + pc->pc_prvspace = pc; + pc->pc_curthread = 0; + + gdt_segs[GPRIV_SEL].ssd_base = (int) pc; + gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + + PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW); + bzero(bootAPgdt, PAGE_SIZE); + for (i = 0; i < NGDT; i++) + ssdtosd(&gdt_segs[i], &bootAPgdt[i].sd); + PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); +#ifdef notyet + + if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { + apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); + acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); +#ifdef CONFIG_ACPI + if (acpiid != 0xff) + x86_acpiid_to_apicid[acpiid] = apicid; +#endif + } +#endif + + /* attempt to start the Application Processor */ + if (!start_ap(apic_id)) { + printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); + /* better panic as 
the AP may be running loose */ + printf("panic y/n? [y] "); + if (cngetc() != 'n') + panic("bye-bye"); + } + + all_cpus |= (1 << cpu); /* record AP in CPU map */ + } + + /* build our map of 'other' CPUs */ + PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + + pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); + + /* number of APs actually started */ + return mp_naps; +} + +extern uint8_t *pcpu_boot_stack; +extern trap_info_t trap_table[]; + +static void +smp_trap_init(trap_info_t *trap_ctxt) +{ + const trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + +void cpu_initialize_context(unsigned int cpu); + +void +cpu_initialize_context(unsigned int cpu) +{ + /* vcpu_guest_context_t is too large to allocate on the stack. + * Hence we allocate statically and protect it with a lock */ + vm_page_t m[4]; + static vcpu_guest_context_t ctxt; + vm_offset_t boot_stack; + vm_offset_t newPTD; + vm_paddr_t ma[NPGPTD]; + static int color; + int i, err; + + /* + * Page 0,[0-3] PTD + * Page 1, [4] boot stack + * Page [5] PDPT + + * + */ + for (i = 0; i < NPGPTD + 2; i++) { + m[i] = vm_page_alloc(NULL, color++, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + + pmap_zero_page(m[i]); + + } + boot_stack = kmem_alloc_nofault(kernel_map, 1); + newPTD = kmem_alloc_nofault(kernel_map, NPGPTD); + ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V; + +#ifdef PAE + pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); + for (i = 0; i < NPGPTD; i++) { + ((vm_paddr_t *)boot_stack)[i] = + ma[i] = + xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V; + } +#endif + + /* + * Copy cpu0 IdlePTD to new IdlePTD - copying only + * kernel mappings + */ + pmap_qenter(newPTD, m, 4); + + memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), + (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), + nkpt*sizeof(vm_paddr_t)); + + 
pmap_qremove(newPTD, 4); + kmem_free(kernel_map, newPTD, 4); + /* + * map actual idle stack to boot_stack + */ + pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); + + + xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]))); + vm_page_lock_queues(); + for (i = 0; i < 4; i++) { + int pdir = (PTDPTDI + i) / NPDEPG; + int curoffset = (PTDPTDI + i) % NPDEPG; + + xen_queue_pt_update((vm_paddr_t) + ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), + ma[i]); + } + PT_UPDATES_FLUSH(); + vm_page_unlock_queues(); + + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); + ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.eip = (unsigned long)init_secondary; + ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ + + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + smp_trap_init(ctxt.trap_ctxt); + + ctxt.ldt_ents = 0; + ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); + ctxt.gdt_ents = 512; + +#ifdef __i386__ + ctxt.user_regs.esp = boot_stack + PAGE_SIZE; + + ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.kernel_sp = boot_stack + PAGE_SIZE; + + ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; + ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])); +#else /* __x86_64__ */ + ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); + ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.kernel_sp = idle->thread.rsp0; + + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + 
ctxt.syscall_callback_eip = (unsigned long)system_call; + + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); + + ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); +#endif + + printf("gdtpfn=%lx pdptpfn=%lx\n", + ctxt.gdt_frames[0], + ctxt.ctrlreg[3] >> PAGE_SHIFT); + + err = HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt); + KASSERT(err == 0, ("VCPUOP_initialise failed")); + DELAY(3000); + err = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); + KASSERT(err == 0, ("VCPUOP_up failed")); +} + +/* + * This function starts the AP (application processor) identified + * by the APIC ID 'physicalCpu'. It does quite a "song and dance" + * to accomplish this. This is necessary because of the nuances + * of the different hardware we might encounter. It isn't pretty, + * but it seems to work. + */ +static int +start_ap(int apic_id) +{ + int cpus, ms; + + /* used as a watchpoint to signal AP startup */ + cpus = mp_naps; + + cpu_initialize_context(apic_id); + + /* Wait up to 5 seconds for it to start. 
*/
+	for (ms = 0; ms < 5000; ms++) {
+		if (mp_naps > cpus)
+			return 1;	/* return SUCCESS */
+		DELAY(1000);
+	}
+	return 0;		/* return FAILURE */
+}
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+    0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+    &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+    &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+    &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+    &ipi_masked_range_size, 0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Flush the TLB on all other CPU's
+ *
+ * 'vector' selects the operation, 'addr1'/'addr2' are handed to it as
+ * arguments.  The caller spins until every other CPU has incremented
+ * smp_tlb_wait.  Panics if called with interrupts disabled.
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+	u_int ncpu;
+	struct _call_data data;
+
+	ncpu = mp_ncpus - 1;	/* does not shootdown self */
+	if (ncpu < 1)
+		return;		/* no other cpus */
+	if (!(read_eflags() & PSL_I))
+		panic("%s: interrupts disabled", __func__);
+	mtx_lock_spin(&smp_ipi_mtx);
+	/*
+	 * Publish the request only while holding smp_ipi_mtx: the global
+	 * call_data pointer is shared with every other initiator, and it
+	 * must never be left pointing at this stack frame on an early
+	 * return.
+	 */
+	call_data = &data;
+	call_data->func_id = vector;
+	call_data->wait = 0;	/* read by the IPI handler; don't leave it indeterminate */
+	call_data->arg1 = addr1;
+	call_data->arg2 = addr2;
+	atomic_store_rel_int(&smp_tlb_wait, 0);
+	ipi_all_but_self(vector);
+	while (smp_tlb_wait < ncpu)
+		ia32_pause();
+	call_data = NULL;
+	mtx_unlock_spin(&smp_ipi_mtx);
+}
+
+/*
+ * Like smp_tlb_shootdown(), but restricted to the CPUs in 'mask' (the
+ * calling CPU is removed from the mask).  A mask of (u_int)-1 means
+ * "all other CPUs".
+ */
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+	int ncpu, othercpus;
+	struct _call_data data;
+
+	othercpus = mp_ncpus - 1;
+	if (mask == (u_int)-1) {
+		ncpu = othercpus;
+		if (ncpu < 1)
+			return;
+	} else {
+		mask &= ~PCPU_GET(cpumask);
+		if (mask == 0)
+			return;
+		ncpu = bitcount32(mask);
+		if (ncpu > othercpus) {
+			/* XXX this should be a panic offence */
+			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+			    ncpu, othercpus);
+			ncpu = othercpus;
+		}
+		/* XXX should be a panic, implied by mask == 0 above */
+		if (ncpu < 1)
+			return;
+	}
+	if (!(read_eflags() & PSL_I))
+		panic("%s: interrupts disabled", __func__);
+	mtx_lock_spin(&smp_ipi_mtx);
+	call_data = &data;
+	call_data->func_id = vector;
+	call_data->wait = 0;	/* read by the IPI handler; don't leave it indeterminate */
+	call_data->arg1 = addr1;
+	call_data->arg2 = addr2;
+	atomic_store_rel_int(&smp_tlb_wait, 0);
+	if (mask == (u_int)-1)
+		ipi_all_but_self(vector);
+	else
+		ipi_selected(mask, vector);
+	while (smp_tlb_wait < ncpu)
+		ia32_pause();
+	call_data = NULL;
+	mtx_unlock_spin(&smp_ipi_mtx);
+}
+
+void
+smp_cache_flush(void)
+{
+
+	if (smp_started)
+		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
+}
+
+void
+smp_invltlb(void)
+{
+
+	if (smp_started) {
+		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_global++;
+#endif
+	}
+}
+
+void
+smp_invlpg(vm_offset_t addr)
+{
+
+	if (smp_started) {
+		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_page++;
+#endif
+	}
+}
+
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+
+	if (smp_started) {
+		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_range++;
+		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+	}
+}
+
+void
+smp_masked_invltlb(u_int mask)
+{
+ + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } +} + +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +} + +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +} + +void +ipi_bitmap_handler(struct clockframe frame); + +void +ipi_bitmap_handler(struct clockframe frame) +{ + int cpu = PCPU_GET(cpuid); + u_int ipi_bitmap; + + ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); + +#ifdef IPI_PREEMPTION + if (ipi_bitmap & (1 << IPI_PREEMPT)) { +#ifdef COUNT_IPIS + *ipi_preempt_counts[cpu]++; +#endif + mtx_lock_spin(&sched_lock); + /* Don't preempt the idle thread */ + if (curthread != PCPU_GET(idlethread)) { + struct thread *running_thread = curthread; + if (running_thread->td_critnest > 1) + running_thread->td_owepreempt = 1; + else + mi_switch(SW_INVOL | SW_PREEMPT, NULL); + } + mtx_unlock_spin(&sched_lock); + } +#endif + + if (ipi_bitmap & (1 << IPI_AST)) { +#ifdef COUNT_IPIS + *ipi_ast_counts[cpu]++; +#endif + /* Nothing to do for AST */ + } +} + +/* + * send an IPI to a set of cpus. 
+ */ +void +ipi_selected(uint32_t cpus, u_int ipi) +{ + int cpu; + u_int bitmap = 0; + u_int old_pending; + u_int new_pending; + + if (IPI_IS_BITMAPED(ipi)) { + bitmap = 1 << ipi; + ipi = IPI_BITMAP_VECTOR; + } + + CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); + while ((cpu = ffs(cpus)) != 0) { + cpu--; + cpus &= ~(1 << cpu); + + KASSERT(cpu_apic_ids[cpu] != -1, + ("IPI to non-existent CPU %d", cpu)); + + if (bitmap) { + do { + old_pending = cpu_ipi_pending[cpu]; + new_pending = old_pending | bitmap; + } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending)); + + if (!old_pending) + ipi_pcpu(cpu, RESCHEDULE_VECTOR); + continue; + + } + + KASSERT(call_data != NULL, ("call_data not set")); + + ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); + } + +} + +/* + * send an IPI INTerrupt containing 'vector' to all CPUs, including myself + */ +void +ipi_all(u_int ipi) +{ + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + ipi_selected(PCPU_GET(other_cpus), ipi); +} + +/* + * send an IPI to all CPUs EXCEPT myself + */ +void +ipi_all_but_self(u_int ipi) +{ + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + ipi_selected(all_cpus & ~(1< + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + */ +/*- + * Copyright (c) 2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jake Burkholder, + * Safeport Network Services, and Network Associates Laboratories, the + * Security Research Division of Network Associates, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA + * CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Manages physical address maps. + * + * In addition to hardware address maps, this + * module is called upon to provide software-use-only + * maps which may or may not be stored in the same + * form as hardware maps. These pseudo-maps are + * used to store intermediate results from copy + * operations to and from address spaces. + * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. + * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. 
This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. + */ + +#include "opt_cpu.h" +#include "opt_pmap.h" +#include "opt_msgbuf.h" +#include "opt_xbox.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef SMP +#include +#endif + +#ifdef XBOX +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef XEN +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#ifdef SMP +#include +#endif + +#ifdef XBOX +#include +#endif + +#include +#include +#include +#include +#include + +#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif + +#ifndef PMAP_SHPGPERPROC +#define PMAP_SHPGPERPROC 200 +#endif + +#define PMAP_DIAGNOSTIC + +#if defined(DIAGNOSTIC) +#define PMAP_DIAGNOSTIC +#endif + +#if !defined(PMAP_DIAGNOSTIC) +#define PMAP_INLINE __inline +#else +#define PMAP_INLINE +#endif + +/* + * Get PDEs and PTEs for user/kernel address space + */ +#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) +#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) + +#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) +#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) +#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) +#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) +#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) + +#ifndef XEN +#define pmap_pte_set_w(pte, v) ((v) ? 
atomic_set_int((u_int *)(pte), PG_W) : \ + atomic_clear_int((u_int *)(pte), PG_W)) +#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) +#endif + +struct pmap kernel_pmap_store; +LIST_HEAD(pmaplist, pmap); +static struct pmaplist allpmaps; +static struct mtx allpmaps_lock; + +vm_paddr_t avail_end; /* PA of last available physical page */ +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ +int pgeflag = 0; /* PG_G or-in */ +int pseflag = 0; /* PG_PS or-in */ + +int nkpt; +vm_offset_t kernel_vm_end; +extern u_int32_t KERNend; + +#if defined(PAE) && !defined(XEN) +static uma_zone_t pdptzone; +#endif + +/* + * Data for the pv entry allocation mechanism + */ +static uma_zone_t pvzone; +static struct vm_object pvzone_obj; +static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +int pmap_pagedaemon_waken; + +/* + * All those kernel PT submaps that BSD is so fond of + */ +struct sysmaps { + struct mtx lock; + pt_entry_t *CMAP1; + pt_entry_t *CMAP2; + caddr_t CADDR1; + caddr_t CADDR2; +}; +static struct sysmaps sysmaps_pcpu[MAXCPU]; +pt_entry_t *CMAP1 = 0; +static pt_entry_t *CMAP3; +caddr_t CADDR1 = 0, ptvmmap = 0; +static caddr_t CADDR3; +struct msgbuf *msgbufp = 0; + +/* + * Crashdump maps. 
+ */ +static caddr_t crashdumpmap; + +#ifdef SMP +extern pt_entry_t *SMPpt; +#endif +static pt_entry_t *PMAP1 = 0, *PMAP2; +static pt_entry_t *PADDR1 = 0, *PADDR2; +#ifdef SMP +static int PMAP1cpu; +static int PMAP1changedcpu; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, + &PMAP1changedcpu, 0, + "Number of times pmap_pte_quick changed CPU with same PMAP1"); +#endif +static int PMAP1changed; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, + &PMAP1changed, 0, + "Number of times pmap_pte_quick changed PMAP1"); +static int PMAP1unchanged; +SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, + &PMAP1unchanged, 0, + "Number of times pmap_pte_quick didn't change PMAP1"); +static struct mtx PMAP2mutex; + +static PMAP_INLINE void free_pv_entry(pv_entry_t pv); +static pv_entry_t get_pv_entry(void); +static void pmap_clear_ptes(vm_page_t m, int bit); + +static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, + vm_page_t *free); +static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); +static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, + vm_offset_t va); +static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); +static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, + vm_page_t m); + +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); + +static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); +static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); +static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); +static void pmap_pte_release(pt_entry_t *pte); +static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); +static vm_offset_t pmap_kmem_choose(vm_offset_t addr); +#if defined(PAE) && !defined(XEN) +static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); +#endif + 
+CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); +CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); + +/* + * If you get an error here, then you set KVA_PAGES wrong! See the + * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a + * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel. + */ +CTASSERT(KERNBASE % (1 << 24) == 0); + +static __inline void +pagezero(void *page) +{ +#if defined(I686_CPU) + if (cpu_class == CPUCLASS_686) { +#if defined(CPU_ENABLE_SSE) + if (cpu_feature & CPUID_SSE2) + sse2_pagezero(page); + else +#endif + i686_pagezero(page); + } else +#endif + bzero(page, PAGE_SIZE); /* used when no 686-class routine was selected */ +} + +void +pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type) +{ + vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]); + + switch (type) { + case SH_PD_SET_VA: /* val goes through xpmap_ptom() before being queued */ +#if 0 + xen_queue_pt_update(shadow_pdir_ma, + xpmap_ptom(val & ~(PG_RW))); +#endif + xen_queue_pt_update(pdir_ma, + xpmap_ptom(val)); + break; + case SH_PD_SET_VA_MA: /* val is queued unmodified */ +#if 0 + xen_queue_pt_update(shadow_pdir_ma, + val & ~(PG_RW)); +#endif + xen_queue_pt_update(pdir_ma, val); + break; + case SH_PD_SET_VA_CLEAR: /* entry is zeroed; val is ignored */ +#if 0 + xen_queue_pt_update(shadow_pdir_ma, 0); +#endif + xen_queue_pt_update(pdir_ma, 0); + break; + } +} + +/* + * Move the kernel virtual free pointer to the next + * 4MB. This is used to help improve performance + * by using a large (4MB) page for much of the kernel + * (.text, .data, .bss). Rounds up only if the CPU has PSE and DISABLE_PSE is unset. + */ +static vm_offset_t +pmap_kmem_choose(vm_offset_t addr) +{ + vm_offset_t newaddr = addr; + +#ifndef DISABLE_PSE + if (cpu_feature & CPUID_PSE) + newaddr = (addr + PDRMASK) & ~PDRMASK; +#endif + return newaddr; +} + +/* + * Bootstrap the system enough to run with virtual memory. + * + * On the i386 this is called after mapping has already been enabled + * and just syncs the pmap module with what has already been done. 
+ * [We can't call it easily with mapping off since the kernel is not + * mapped with PA == VA, hence we would have to relocate every address + * from the linked base (virtual) address "KERNBASE" to the actual + * (physical) address starting relative to 0] + */ +void +pmap_bootstrap(firstaddr, loadaddr) + vm_paddr_t firstaddr; + vm_paddr_t loadaddr; +{ + vm_offset_t va; + pt_entry_t *pte, *unused; + struct sysmaps *sysmaps; + int i; + + /* + * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too + * large. It should instead be correctly calculated in locore.s and + * not based on 'firstaddr' (which is a physical address, not a virtual + * address, for the start of unused physical memory). The kernel + * page tables are NOT double mapped and thus should not be included + * in this calculation. + */ + virtual_avail = (vm_offset_t) KERNBASE + firstaddr; + virtual_avail = pmap_kmem_choose(virtual_avail); + + virtual_end = VM_MAX_KERNEL_ADDRESS; + + /* + * Initialize the kernel pmap (which is statically allocated). + */ + PMAP_LOCK_INIT(kernel_pmap); + kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); +#ifdef PAE + kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); +#endif + kernel_pmap->pm_active = -1; /* don't allow deactivation */ + TAILQ_INIT(&kernel_pmap->pm_pvlist); + LIST_INIT(&allpmaps); + mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + nkpt = NKPT; + + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +#define SYSMAP(c, p, v, n) \ + v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); + + va = virtual_avail; + pte = vtopte(va); + + /* + * CMAP1/CMAP2 are used for zeroing and copying pages. + * CMAP3 is used for the idle process page zeroing. 
+ */ + for (i = 0; i < MAXCPU; i++) { + sysmaps = &sysmaps_pcpu[i]; + mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); + SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) + SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) + } + SYSMAP(caddr_t, CMAP1, CADDR1, 1) + SYSMAP(caddr_t, CMAP3, CADDR3, 1) +#ifdef XEN + PT_SET_MA(CADDR3, 0); +#else + *CMAP3 = 0; +#endif + /* + * Crashdump maps. + */ + SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) + + /* + * ptvmmap is used for reading arbitrary physical pages via /dev/mem. + */ + SYSMAP(caddr_t, unused, ptvmmap, 1) + + /* + * msgbufp is used to map the system message buffer. + */ + SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) + + /* + * PMAP1/PADDR1 serve pmap_pte_quick(); PMAP2/PADDR2 serve pmap_pte(). + */ + SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); + SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); + + mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); + + virtual_avail = va; +#ifdef XEN + PT_SET_MA(CADDR1, 0); +#else + *CMAP1 = 0; +#endif + +#if !defined(XEN) +#ifdef XBOX + /* FIXME: This is gross, but needed for the XBOX. Since we are at such + * an early stage, we cannot yet neatly map video memory ... :-( + * Better fixes are very welcome! + */ + if (!arch_i386_is_xbox) +#endif + for (i = 0; i < NKPT; i++) + PTD[i] = 0; + + /* Initialize the PAT MSR if present. */ + pmap_init_pat(); + + /* Turn on PG_G on kernel page(s) */ + pmap_set_pg(); +#endif /* !XEN */ +} + +/* + * Setup the PAT MSR. + */ +void +pmap_init_pat(void) +{ + uint64_t pat_msr; + + /* Bail if this CPU doesn't implement PAT. */ + if (!(cpu_feature & CPUID_PAT)) + return; + +#ifdef PAT_WORKS + /* + * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. + * Program 4 and 5 as WP and WC. + * Leave 6 and 7 as UC- and UC. 
+ */ + pat_msr = rdmsr(MSR_PAT); + pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); + pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | + PAT_VALUE(5, PAT_WRITE_COMBINING); +#else + /* + * Due to some Intel errata, we can only safely use the lower 4 + * PAT entries. Thus, just replace PAT Index 2 with WC instead + * of UC-. + * + * Intel Pentium III Processor Specification Update + * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B + * or Mode C Paging) + * + * Intel Pentium IV Processor Specification Update + * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) + */ + pat_msr = rdmsr(MSR_PAT); + pat_msr &= ~PAT_MASK(2); + pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); +#endif + wrmsr(MSR_PAT, pat_msr); +} + +/* + * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. Does nothing when pgeflag is 0. + */ +void +pmap_set_pg(void) +{ + pd_entry_t pdir; + pt_entry_t *pte; + vm_offset_t va, endva; + int i; + + if (pgeflag == 0) + return; + + i = KERNLOAD/NBPDR; + endva = KERNBASE + KERNend; + + if (pseflag) { /* set the flag in the page directory entries */ + va = KERNBASE + KERNLOAD; + while (va < endva) { + pdir = kernel_pmap->pm_pdir[KPTDI+i]; + pdir |= pgeflag; + kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; + invltlb(); /* Play it safe, invltlb() every time */ + i++; + va += NBPDR; + } + } else { /* set the flag in every non-zero pte from btext up to endva */ + va = (vm_offset_t)btext; + while (va < endva) { + pte = vtopte(va); + if (*pte) { +#ifdef XEN + PT_SET_MA(va, *pte | pgeflag); +#else + *pte |= pgeflag; +#endif + } + invltlb(); /* Play it safe, invltlb() every time */ + va += PAGE_SIZE; + } + } +} + +/* + * Initialize a vm_page's machine-dependent fields. 
+ */ +void +pmap_page_init(vm_page_t m) +{ + + TAILQ_INIT(&m->md.pv_list); + m->md.pv_list_count = 0; +} + +#if defined(PAE) && !defined(XEN) + +static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); + +static void * +pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +{ + *flags = UMA_SLAB_PRIV; /* zone, bytes and wait are not used here */ + return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, + 1, 0)); +} +#endif + +/* + * Initialize the pmap module. + * Called by vm_init, to initialize any structures that the pmap + * system needs to map virtual memory. + */ +void +pmap_init(void) +{ + int shpgperproc = PMAP_SHPGPERPROC; + + /* + * Initialize the address space (zone) for the pv entries. Set a + * high water mark so that the system can recover from excessive + * numbers of pv entries. The mark is nine tenths of pv_entry_max. + */ + pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); + TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); + pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); + pv_entry_high_water = 9 * (pv_entry_max / 10); + uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); + +#if defined(PAE) && !defined(XEN) + pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, + NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); +#endif +} + +void +pmap_init2() +{ +} + + +/*************************************************** + * Low level helper routines..... + ***************************************************/ + +/* + * Determine the appropriate bits to set in a PTE or PDE for a specified + * caching mode. + */ +static int +pmap_cache_bits(int mode, boolean_t is_pde) +{ + int pat_flag, pat_index, cache_bits; + + /* The PAT bit is different for PTEs and PDEs. */ + pat_flag = is_pde ? 
PG_PDE_PAT : PG_PTE_PAT; + + /* If the CPU lacks PAT, demote the extended modes to PAT_UNCACHEABLE. */ + if (!(cpu_feature & CPUID_PAT)) { + switch (mode) { + case PAT_UNCACHEABLE: + case PAT_WRITE_THROUGH: + case PAT_WRITE_BACK: + break; + case PAT_UNCACHED: + case PAT_WRITE_COMBINING: + case PAT_WRITE_PROTECTED: + mode = PAT_UNCACHEABLE; + break; + } + } + + /* Map the caching mode to a PAT index. */ + switch (mode) { +#ifdef PAT_WORKS + case PAT_UNCACHEABLE: + pat_index = 3; + break; + case PAT_WRITE_THROUGH: + pat_index = 1; + break; + case PAT_WRITE_BACK: + pat_index = 0; + break; + case PAT_UNCACHED: + pat_index = 2; + break; + case PAT_WRITE_COMBINING: + pat_index = 5; + break; + case PAT_WRITE_PROTECTED: + pat_index = 4; + break; +#else + case PAT_UNCACHED: + case PAT_UNCACHEABLE: + case PAT_WRITE_PROTECTED: + pat_index = 3; + break; + case PAT_WRITE_THROUGH: + pat_index = 1; + break; + case PAT_WRITE_BACK: + pat_index = 0; + break; + case PAT_WRITE_COMBINING: + pat_index = 2; + break; +#endif + default: + panic("Unknown caching mode %d\n", mode); + } + + /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ + cache_bits = 0; + if (pat_index & 0x4) + cache_bits |= pat_flag; + if (pat_index & 0x2) + cache_bits |= PG_NC_PCD; + if (pat_index & 0x1) + cache_bits |= PG_NC_PWT; + return (cache_bits); +} +#ifdef SMP +/* + * For SMP, these functions have to use the IPI mechanism for coherence. + * + * N.B.: Before calling any of the following TLB invalidation functions, + * the calling processor must ensure that all stores updating a non- + * kernel page table are globally performed. Otherwise, another + * processor could cache an old, pre-update entry without being + * invalidated. This can happen in one of two ways: (1) The pmap becomes + * active on another processor after its pm_active field is checked by + * one of the following functions but before a store updating the page + * table is globally performed. 
(2) The pmap becomes active on another + * processor before its pm_active field is checked but due to + * speculative loads one of the following functions still reads the + * pmap as inactive on the other processor. + * + * The kernel page table is exempt because its pm_active field is + * immutable. The kernel page table is always active on every + * processor. + */ +void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + u_int cpumask; + u_int other_cpus; + + CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", + pmap, va); + + sched_pin(); + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { /* local flush plus broadcast */ + invlpg(va); + smp_invlpg(va); + } else { /* flush only the CPUs named in pm_active */ + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invlpg(va); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg(pmap->pm_active & other_cpus, va); + } + PT_UPDATES_FLUSH(); + sched_unpin(); +} + +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + u_int cpumask; + u_int other_cpus; + vm_offset_t addr; + + CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x", + pmap, sva, eva); + + sched_pin(); + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { /* local flush plus broadcast */ + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + smp_invlpg_range(sva, eva); + } else { /* flush only the CPUs named in pm_active */ + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg_range(pmap->pm_active & other_cpus, + sva, eva); + } + PT_UPDATES_FLUSH(); + sched_unpin(); +} + +void +pmap_invalidate_all(pmap_t pmap) +{ + u_int cpumask; + u_int other_cpus; + + CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap); + sched_pin(); + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { /* local flush plus broadcast */ + invltlb(); + smp_invltlb(); + } else { /* flush only the CPUs named in pm_active */ + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) 
+ invltlb(); + if (pmap->pm_active & other_cpus) + smp_masked_invltlb(pmap->pm_active & other_cpus); + } + sched_unpin(); +} + +void +pmap_invalidate_cache(void) +{ + + sched_pin(); + wbinvd(); + smp_cache_flush(); + sched_unpin(); +} +#else /* !SMP */ +/* + * Normal, non-SMP, 486+ invalidation functions. + * We inline these within pmap.c for speed. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + if (pmap == kernel_pmap || pmap->pm_active) { + CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", + pmap, va); + invlpg(va); + PT_UPDATES_FLUSH(); + } +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + + if (pmap == kernel_pmap || pmap->pm_active) { + if (eva - sva > PAGE_SIZE) /* trace only ranges longer than one page */ + CTR3(KTR_PMAP, + "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x", + pmap, sva, eva); + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + PT_UPDATES_FLUSH(); + } +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + + if (pmap == kernel_pmap || pmap->pm_active) { + CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap); + invltlb(); + } +} + +PMAP_INLINE void +pmap_invalidate_cache(void) +{ + + wbinvd(); +} +#endif /* !SMP */ + +/* + * Is pmap the current address space or the kernel pmap? N.B. We return FALSE when + * a pmap's page table is in use because a kernel thread is borrowing + * it. The borrowed page table can change spontaneously, making any + * dependence on its continued use subject to a race condition. + */ +static __inline int +pmap_is_current(pmap_t pmap) +{ + + return (pmap == kernel_pmap || + (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && + (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); +} + +/* + * If the given pmap is not the current or kernel pmap, the returned pte must + * be released by passing it to pmap_pte_release(). 
+ */ +pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) /* PG_PS set: hand back the pde itself */ + return (pde); + if (*pde != 0) { + /* are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_lock(&PMAP2mutex); /* stays held until pmap_pte_release() */ + newpf = *pde & PG_FRAME; + if ((*PMAP2 & PG_FRAME) != newpf) { +#ifdef XEN + PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M); + CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x", + pmap, va, (*PMAP2 & 0xffffffff)); +#else + *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; + pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); +#endif + } + return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); + } + return (0); /* the pde is zero: nothing to return */ +} + +/* + * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte + * being NULL. Only a pte inside the PADDR2 page clears PMAP2 and drops PMAP2mutex. + */ +static __inline void +pmap_pte_release(pt_entry_t *pte) +{ + + if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) { + CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx", + *PMAP2); + PT_SET_VA_MA(PMAP2, 0, TRUE); + mtx_unlock(&PMAP2mutex); + } +} + +static __inline void +invlcaddr(void *caddr) +{ + + invlpg((u_int)caddr); + PT_UPDATES_FLUSH(); +} + +/* + * Super fast pmap_pte routine best used when scanning + * the pv lists. This eliminates many coarse-grained + * invltlb calls. Note that many of the pv list + * scans are across different pmaps. It is very wasteful + * to do an entire invltlb for checking a single mapping. + * + * If the given pmap is not the current pmap, vm_page_queue_mtx + * must be held and curthread pinned to a CPU. + */ +static pt_entry_t * +pmap_pte_quick(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) + return (pde); + + /* + * + * XXX hitting this indicates that things are AFU + */ + if (*pde != 0) { + /* are we current address space or kernel? 
*/ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); + newpf = *pde & PG_FRAME; + if ((*PMAP1 & PG_FRAME) != n