From 26c93e057d701641f65a66c22c94fcbb40dae5c4 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 4 Mar 2020 12:41:33 -0500 Subject: [PATCH 1/5] Rescue dumper support for the arm64 kernel. This allows a kernel to save a minidump following a panic, by kexec'ing an embedded "rescue" kernel, which boots with pre-reserved memory into an embedded root image containing a script. The script uses some facilities of the rescue kernel to save a minidump from the panicked kernel in the root filesystem, after which it reboots. I tried to make this as self-contained as possible so as to make upstreaming more straightforward. Two new kernel options are added: - RESCUE_SUPPORT: add the ability to exec a rescue kernel in order to obtain a minidump. One must also set RESCUE_EMBED to the path of a compiled rescue kernel. - RESCUE: a kernel which "rescues" a panicked kernel (compiled with RESCUE_SUPPORT) specifies this option. In typical usage, GENERIC sets RESCUE_SUPPORT and the RESCUE_EMBED make option, which is functionally the same as MFS_IMAGE. RESCUE_EMBED points to a kernel compiled with the RESCUE option. I provided a sample rescue kernel config; it just eliminates a few options from GENERIC to save memory, but this is not strictly necessary in practice with a 64MB reservation. To enable rescue dumps, the GENERIC kernel must set the "debug.rescue_minidump" tunable to "1". If enabled, during boot the GENERIC kernel will reserve 64MB of contiguous memory for use by the rescue kernel. The rescue kernel image is copied into the reserved memory, along with some additional data needed to bootstrap. No modification to the DTB is required. The kernel will automatically create a dump device, so that the kernel will call into the minidump code during a panic. 
Once minidump parameters are calculated, it jumps into the rescue "kexec" code, which finalizes bootstrap parameters, sanitizes the execution environment and hands execution to the rescue kernel that was copied into the memory reservation. The implementation is somewhat coupled to the internals of locore. It also assumes that the embedded rescue kernel is an ELF file. The rescue kernel must be compiled without "options SMP", or debug.rescue.kern.smp.disabled must be set in the host. The former is preferable to avoid wasting memory. The rescue kernel boots like any other arm64 kernel, except that it must synthesize some module metadata in initarm(). It provides a device file, /dev/dumper, which can be read to obtain a minidump for the panicked kernel. The device is seekable and can be read multiple times without modifying host memory. The idea is to be able to simply read the minidump using dd(1). Tunables can be set in the rescue kernel by setting the same tunables in the host with the prefix "debug.rescue.". For example, to set debug.debugger_on_panic="0" in the rescue kernel, set debug.rescue.debug.debugger_on_panic="0" in the host. I provided an rc script and a minimal set of utilities which are sufficient to recover a minidump. The utilities are hardlinks of a cross-compiled rescue(8) executable; the result fits in 8MB. The rc script looks for the rescue_rootfs tunable (specified as debug.rescue.rescue_rootfs in the host), runs fsck_ffs on the specified rootfs, mounts it, and saves the vmcore to /var/crash. 
--- etc/rc-rescue | 25 ++ etc/rescue.mtree | 43 +++ sys/arm64/arm64/machdep.c | 94 ++++++ sys/arm64/arm64/minidump_machdep.c | 10 + sys/arm64/arm64/mp_machdep.c | 11 + sys/arm64/arm64/pmap.c | 8 - sys/arm64/arm64/rescue_dumper.c | 452 +++++++++++++++++++++++++++++ sys/arm64/arm64/rescue_machdep.c | 357 +++++++++++++++++++++++ sys/arm64/conf/GENERIC | 3 + sys/arm64/conf/RESCUE | 16 + sys/arm64/include/pmap.h | 7 + sys/arm64/include/rescue.h | 75 +++++ sys/conf/files.arm64 | 2 + sys/conf/kern.post.mk | 14 + sys/conf/kern.pre.mk | 7 +- sys/conf/options.arm64 | 14 + sys/kern/kern_shutdown.c | 2 + sys/kern/subr_intr.c | 28 +- sys/sys/intr.h | 3 + 19 files changed, 1161 insertions(+), 10 deletions(-) create mode 100644 etc/rc-rescue create mode 100644 etc/rescue.mtree create mode 100644 sys/arm64/arm64/rescue_dumper.c create mode 100644 sys/arm64/arm64/rescue_machdep.c create mode 100644 sys/arm64/conf/RESCUE create mode 100644 sys/arm64/include/rescue.h diff --git a/etc/rc-rescue b/etc/rc-rescue new file mode 100644 index 000000000000..be04c93401ba --- /dev/null +++ b/etc/rc-rescue @@ -0,0 +1,25 @@ +#!/rescue/sh + +set -x + +export PATH=/rescue + +rootfs=$(kenv rescue_rootfs) +if [ -z "$rootfs" ]; then + echo "rc: failed to locate root filesystem" >&2 + exit 1 +fi +mountdir=/mnt +destfile=${mountdir}/var/crash/vmcore + +fsck_ffs -fy $rootfs +if [ $? -eq 16 ]; then + fsck_ffs -fy $rootfs +fi + +set -e + +mount $rootfs $mountdir +dd if=/dev/dumper of=${destfile} bs=1M +umount $mountdir +reboot diff --git a/etc/rescue.mtree b/etc/rescue.mtree new file mode 100644 index 000000000000..1faf897b9c3e --- /dev/null +++ b/etc/rescue.mtree @@ -0,0 +1,43 @@ +/set type=file uid=0 gid=0 mode=0755 nlink=1 flags=none +. type=dir nlink=8 + +.snap type=dir gid=5 mode=0775 nlink=2 +.. + + +dev type=dir nlink=2 +.. + + +/set type=file uid=0 gid=0 mode=0644 nlink=1 flags=none +etc type=dir mode=0755 nlink=2 + rc size=385 +.. + + +mnt type=dir mode=0755 nlink=2 +.. 
+ + +/set type=file uid=0 gid=1001 mode=0755 nlink=1 flags=none +rescue type=dir gid=0 nlink=2 + dd nlink=15 size=6035760 + fsck nlink=15 size=6035760 + fsck_ffs nlink=15 size=6035760 + init nlink=15 size=6035760 + kenv nlink=15 size=6035760 + ln nlink=15 size=6035760 + ls nlink=15 size=6035760 + mkdir nlink=15 size=6035760 + mount nlink=15 size=6035760 + reboot nlink=15 size=6035760 + rescue nlink=15 size=6035760 + rm nlink=15 size=6035760 + sh nlink=15 size=6035760 + sysctl nlink=15 size=6035760 + umount nlink=15 size=6035760 +.. + + +sbin type=dir gid=0 nlink=2 +.. diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c index 56bc67d5dc5f..7f1509a26d4e 100644 --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -78,6 +78,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#ifdef RESCUE +#include +#endif #include #include @@ -888,6 +891,87 @@ print_efi_map_entries(struct efi_map_header *efihdr) foreach_efi_map_entry(efihdr, print_efi_map_entry); } +#ifdef RESCUE +static vm_offset_t +preload_add_string(vm_offset_t dst, int type, const char *s) +{ + uint32_t *data, len; + + data = (uint32_t *)dst; + len = strlen(s) + 1; + + *data++ = type; + *data++ = len; + strcpy((void *)data, s); + return (roundup2((vm_offset_t)data + len, sizeof(long))); +} + +static vm_offset_t +preload_add_u64(vm_offset_t dst, int type, uint64_t val) +{ + uint32_t *data; + + data = (uint32_t *)dst; + + *data++ = type; + *data++ = sizeof(val); + memcpy(data, &val, sizeof(val)); + return ((vm_offset_t)data + sizeof(val)); +} + +static void +preload_add_terminator(vm_offset_t dst) +{ + memset((void *)dst, 0, sizeof(uint32_t) * 2); +} + +/* + * Fake some preloaded metadata for the rescue kernel using parameters passed by + * the panicked kernel. 
+ */ +void +rescue_preload_init(struct arm64_bootparams *abp) +{ + extern u_long _end; + struct rescue_kernel_params *params; + vm_offset_t dtb, env, kernend, md, mdstart; + + /* + * Get the parameter structure, making use of the identity map loaded in + * TTBR0. This relies on locore using a L1 (1GB) block mapping. + */ + params = (void *)(KERNBASE - abp->kern_delta - + RESCUE_RESERV_KERNEL_OFFSET); + + /* + * Copy the DTB and environment strings to memory following the kernel. + * This ensures that they remain mapped after the pmap is bootstrapped. + * This relies on locore providing some extra space in region following + * the kernel mapped by TTBR1. + */ + dtb = round_page((uintptr_t)&_end); + memcpy((void *)dtb, (void *)params->kp_dtbstart, params->kp_dtblen); + env = round_page(dtb + params->kp_dtblen); + memcpy((void *)env, (void *)params->kp_kenvstart, params->kp_kenvlen); + + md = mdstart = round_page(env + params->kp_kenvlen); + kernend = mdstart + PAGE_SIZE; + + md = preload_add_string(md, MODINFO_NAME, "kernel"); + md = preload_add_string(md, MODINFO_TYPE, "elf64 kernel"); + md = preload_add_u64(md, MODINFO_ADDR, VM_MIN_KERNEL_ADDRESS); + md = preload_add_u64(md, MODINFO_SIZE, (uintptr_t)&_end - KERNBASE); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_KERNEND, kernend); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_DTBP, dtb); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_ENVP, env); + preload_add_terminator(md); + + rescue_dumper_init(¶ms->kp_dumpparams); + + abp->modulep = mdstart; +} +#endif /* RESCUE */ + #ifdef FDT static void try_load_dtb(caddr_t kmdp) @@ -1013,6 +1097,16 @@ initarm(struct arm64_bootparams *abp) caddr_t kmdp; bool valid; +#ifdef RESCUE + /* + * The rescue kernel runs without any module metadata. The panicked + * kernel could provide it, but some variables, like the size of the + * loaded rescue kernel, can't easily be determined there. So, fake it + * here. 
+ */ + rescue_preload_init(abp); +#endif + /* Set the module data location */ preload_metadata = (caddr_t)(uintptr_t)(abp->modulep); diff --git a/sys/arm64/arm64/minidump_machdep.c b/sys/arm64/arm64/minidump_machdep.c index 27c2081ef78d..ada1ae251014 100644 --- a/sys/arm64/arm64/minidump_machdep.c +++ b/sys/arm64/arm64/minidump_machdep.c @@ -54,6 +54,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#ifdef RESCUE_SUPPORT +#include +#endif CTASSERT(sizeof(struct kerneldumpheader) == 512); @@ -280,6 +283,13 @@ minidumpsys(struct dumperinfo *di) } dumpsize += PAGE_SIZE; +#ifdef RESCUE_SUPPORT + if (do_rescue_minidump) { + rescue_kernel_exec(); + return (ENXIO); + } +#endif + /* Determine dump offset on device. */ if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) { error = E2BIG; diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c index 5d8e23e59a58..258a88776999 100644 --- a/sys/arm64/arm64/mp_machdep.c +++ b/sys/arm64/arm64/mp_machdep.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -54,6 +55,9 @@ __FBSDID("$FreeBSD$"); #include #include +#ifdef RESCUE_SUPPORT +#include +#endif #include #ifdef VFP #include @@ -380,6 +384,13 @@ ipi_stop(void *dummy __unused) while (!CPU_ISSET(cpu, &started_cpus)) cpu_spinwait(); +#ifdef RESCUE_SUPPORT + if (dumping) { + /* Never returns. */ + rescue_kernel_exec(); + } +#endif + CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); CTR0(KTR_SMP, "IPI_STOP (restart)"); diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index c2181373fc53..6ece6a711c05 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -167,14 +167,6 @@ __FBSDID("$FreeBSD$"); #define PMAP_INLINE #endif -/* - * These are configured by the mair_el1 register. 
This is set up in locore.S - */ -#define DEVICE_MEMORY 0 -#define UNCACHED_MEMORY 1 -#define CACHED_MEMORY 2 - - #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else diff --git a/sys/arm64/arm64/rescue_dumper.c b/sys/arm64/arm64/rescue_dumper.c new file mode 100644 index 000000000000..cfff207c9412 --- /dev/null +++ b/sys/arm64/arm64/rescue_dumper.c @@ -0,0 +1,452 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Juniper Networks Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +enum dump_segs { + DUMP_SEG_MDHDR = 0, /* minidump header */ + DUMP_SEG_MSGBUF, /* kernel message buffer */ + DUMP_SEG_BITMAP, /* vm_page_dump array */ + DUMP_SEG_PTPS, /* kernel page table pages */ + DUMP_SEG_PAGES, /* pages marked in vm_page_dump */ + DUMP_SEG_COUNT, +}; + +struct dump_seg { + vm_offset_t ds_addr; + vm_size_t ds_sz; +}; + +struct dump_softc { + struct minidumphdr *sc_mdhdr; + struct dump_seg sc_segs[DUMP_SEG_COUNT]; + vm_offset_t sc_kernl0; + vm_offset_t sc_scratchkva; + char *sc_scratchbuf; + u_long sc_npages; + off_t sc_cursor; +}; + +FEATURE(rescue, "rescue kernel dumper"); + +static MALLOC_DEFINE(M_DUMPER, "dumper", "Rescue dumper structures"); + +static struct rescue_dump_params params; + +void +rescue_dumper_init(struct rescue_dump_params *p) +{ + memcpy(¶ms, p, sizeof(params)); +} + +static void +dump_seg_init(struct dump_seg *seg, vm_offset_t addr, vm_size_t sz) +{ + seg->ds_addr = addr; + seg->ds_sz = sz; +} + +static vm_offset_t +map_host_seg(vm_paddr_t pa, vm_size_t size) +{ + vm_offset_t va; + + va = kva_alloc(size); + if (va != 0) + pmap_kenter(va, size, pa, CACHED_MEMORY); + return (va); +} + +static void +unmap_host_seg(vm_offset_t va, vm_size_t size) +{ + vm_size_t off; + + for (off = 0; off < size; off += PAGE_SIZE) + pmap_kremove(va + off); + kva_free(va, size); +} + +static void +dumper_cdevpriv_dtr(void *arg) +{ + struct dump_softc *sc; + struct dump_seg *seg; + + sc = arg; + + free(sc->sc_scratchbuf, M_DUMPER); + if (sc->sc_scratchkva != 0) + kva_free(sc->sc_scratchkva, PAGE_SIZE); + if (sc->sc_kernl0 != 0) + unmap_host_seg(sc->sc_kernl0, PAGE_SIZE); + + seg = &sc->sc_segs[DUMP_SEG_BITMAP]; + if (seg->ds_addr != 0) + unmap_host_seg(seg->ds_addr, seg->ds_sz); + seg = &sc->sc_segs[DUMP_SEG_MSGBUF]; + if (seg->ds_addr != 0) + 
unmap_host_seg(seg->ds_addr, seg->ds_sz); + + free(sc->sc_mdhdr, M_DUMPER); + free(sc, M_DUMPER); +} + +CTASSERT(sizeof(struct minidumphdr) <= PAGE_SIZE); + +static int +dumper_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + struct dump_softc *sc; + struct minidumphdr *mdhdr; + uint64_t *bitmap; + vm_offset_t va; + u_long i; + int error; + + sc = malloc(sizeof(*sc), M_DUMPER, M_WAITOK | M_ZERO); + + /* + * The minidump header gets padded out to a full page. + */ + mdhdr = malloc(PAGE_SIZE, M_DUMPER, M_WAITOK | M_ZERO); + (void)strcpy(mdhdr->magic, MINIDUMP_MAGIC); + mdhdr->version = MINIDUMP_VERSION; + mdhdr->msgbufsize = round_page(params.dp_msgbufsz); + mdhdr->bitmapsize = round_page(params.dp_vmdumpsz); + mdhdr->pmapsize = howmany(params.dp_kernend - params.dp_kernstart, + L2_SIZE) * PAGE_SIZE; + mdhdr->kernbase = params.dp_kernstart; + mdhdr->dmapphys = params.dp_dmapbasepa; + mdhdr->dmapbase = params.dp_dmapmin; + mdhdr->dmapend = params.dp_dmapmax; + sc->sc_mdhdr = mdhdr; + + dump_seg_init(&sc->sc_segs[DUMP_SEG_MDHDR], (vm_offset_t)mdhdr, + PAGE_SIZE); + + /* + * Map the root kernel page table page. It is not included in the dump, + * but is needed in order to walk the page tables so it might as well be + * statically mapped. + * + * Also allocate a page of KVA to map the rest of the kernel page table + * pages during walks. + */ + sc->sc_kernl0 = map_host_seg(params.dp_kernl0pa, PAGE_SIZE); + if (sc->sc_kernl0 == 0) { + error = ENOMEM; + goto err; + } + sc->sc_scratchkva = kva_alloc(PAGE_SIZE); + if (sc->sc_scratchkva == 0) { + error = ENOMEM; + goto err; + } + + /* + * In some cases it is necessary to synthesize a fake page table page. + */ + sc->sc_scratchbuf = malloc(PAGE_SIZE, M_DUMPER, M_WAITOK | M_ZERO); + + /* + * Map segments of the host kernel that get included in the minidump. 
+ */ + va = map_host_seg(params.dp_msgbufpa, mdhdr->msgbufsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_MSGBUF], va, mdhdr->msgbufsize); + + va = map_host_seg(params.dp_vmdumppa, mdhdr->bitmapsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_BITMAP], va, mdhdr->bitmapsize); + + /* + * Create a virtual dump segment for the kernel page tables and marked + * host pages. + */ + dump_seg_init(&sc->sc_segs[DUMP_SEG_PTPS], 0, mdhdr->pmapsize); + + sc->sc_npages = 0; + bitmap = (uint64_t *)sc->sc_segs[DUMP_SEG_BITMAP].ds_addr; + for (i = 0; i < mdhdr->bitmapsize / sizeof(uint64_t); i++) + sc->sc_npages += bitcount64(bitmap[i]); + dump_seg_init(&sc->sc_segs[DUMP_SEG_PAGES], 0, + sc->sc_npages * PAGE_SIZE); + + error = devfs_set_cdevpriv(sc, dumper_cdevpriv_dtr); + if (error != 0) + goto err; + + return (0); + +err: + dumper_cdevpriv_dtr(sc); + return (error); +} + +/* + * Map a host page directory page. + */ +static pd_entry_t * +map_pde(struct dump_softc *sc, pd_entry_t pde) +{ + vm_offset_t scratch; + + scratch = sc->sc_scratchkva; + pmap_kenter(scratch, PAGE_SIZE, pde & ~ATTR_MASK, CACHED_MEMORY); + return ((pd_entry_t *)scratch); +} + +/* + * Return a host page table page mapping the specified virtual address. + */ +static void * +map_ptp(struct dump_softc *sc, vm_offset_t va) +{ + pd_entry_t *l0, *l1, *l2, *l3; + vm_paddr_t pa; + int i; + + KASSERT((va & L2_OFFSET) == 0, ("%s: unaligned VA %#lx", __func__, va)); + + l0 = (pd_entry_t *)sc->sc_kernl0 + pmap_l0_index(va); + if ((*l0 & ATTR_DESCR_MASK) != L0_TABLE) { + /* Invalid entry, return a zero-filled page. */ + memset(sc->sc_scratchbuf, 0, PAGE_SIZE); + return (sc->sc_scratchbuf); + } + + l1 = map_pde(sc, *l0); + l1 = &l1[pmap_l1_index(va)]; + if ((*l1 & ATTR_DESCR_MASK) == L1_BLOCK) { + /* Dump a 1GB mapping using a fake PTP. 
*/ + pa = (*l1 & ~ATTR_MASK) | (va & L1_OFFSET); + l3 = (pd_entry_t *)sc->sc_scratchbuf; + for (i = 0; i < Ln_ENTRIES; i++) + l3[i] = pa + (i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; + return (l3); + } + if ((*l1 & ATTR_DESCR_MASK) != L1_TABLE) { + /* Invalid entry, return a zero-filled page. */ + memset(sc->sc_scratchbuf, 0, PAGE_SIZE); + return (sc->sc_scratchbuf); + } + + l2 = map_pde(sc, *l1); + l2 = &l2[pmap_l2_index(va)]; + if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) { + /* Dump a 2MB mapping using a fake PTP. */ + pa = *l2 & ~ATTR_MASK; + l3 = (pd_entry_t *)sc->sc_scratchbuf; + for (i = 0; i < Ln_ENTRIES; i++) + l3[i] = pa + (i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; + return (l3); + } + if ((*l2 & ATTR_DESCR_MASK) != L2_TABLE) { + /* Invalid entry, return a zero-filled page. */ + memset(sc->sc_scratchbuf, 0, PAGE_SIZE); + return (sc->sc_scratchbuf); + } + + /* Dump the leaf page table page. */ + l3 = map_pde(sc, *l2); + return (l3); +} + +static int +dumper_read_seg(struct dump_softc *sc, enum dump_segs idx, struct dump_seg *seg, + off_t baseoff, struct uio *uio) +{ + uint64_t bit, bitcount, bits, *bitmap; + char *ptp; + vm_offset_t va; + vm_paddr_t pa; + off_t off, off1; + u_long i; + uint32_t bitmapsize; + int error; + + KASSERT(baseoff <= uio->uio_offset && + baseoff + seg->ds_sz > uio->uio_offset, + ("%s: invalid offset %#lx into seg at %#lx-%#lx", __func__, + uio->uio_offset, baseoff, baseoff + seg->ds_sz)); + + error = 0; + off = uio->uio_offset - baseoff; + switch (idx) { + case DUMP_SEG_MDHDR: + case DUMP_SEG_MSGBUF: + case DUMP_SEG_BITMAP: + /* Linear segments can simply be copied. */ + error = uiomove((char *)seg->ds_addr + off, seg->ds_sz - off, + uio); + break; + case DUMP_SEG_PTPS: + /* Dump leaf page table pages. 
*/ + for (va = params.dp_kernstart + (off / PAGE_SIZE) * L2_SIZE; + va < params.dp_kernend; va += L2_SIZE) { + ptp = map_ptp(sc, va); + error = uiomove(ptp + (off & PAGE_MASK), + PAGE_SIZE - (off & PAGE_MASK), uio); + if (error != 0 || uio->uio_resid == 0) + break; + off = uio->uio_offset - baseoff; + } + break; + case DUMP_SEG_PAGES: + /* Dump pages marked in the bitmap. This is non-destructive. */ + bitmap = (uint64_t *)sc->sc_segs[DUMP_SEG_BITMAP].ds_addr; + bitmapsize = (uint32_t)sc->sc_segs[DUMP_SEG_BITMAP].ds_sz; + off1 = 0; + for (i = 0; i < bitmapsize / sizeof(uint64_t); i++) { + bits = bitmap[i]; + if (off1 < off) { + /* + * Seek forward in the array until we find where + * we left off during the last read. + */ + bitcount = bitcount64(bits); + if (off1 + bitcount * PAGE_SIZE <= off) { + off1 += bitcount * PAGE_SIZE; + continue; + } + do { + bits &= ~(1ul << (ffsl(bits) - 1)); + off1 += PAGE_SIZE; + } while (off1 < off); + } + while (bits != 0) { + bit = ffsl(bits) - 1; + bits &= ~(1ul << bit); +#define NBDUMPSLOT (sizeof(uint64_t) * NBBY) + pa = ((uint64_t)i * NBDUMPSLOT + bit) * + PAGE_SIZE; + pmap_kenter(sc->sc_scratchkva, PAGE_SIZE, pa, + CACHED_MEMORY); + error = uiomove((char *)sc->sc_scratchkva + + (off % PAGE_SIZE), + PAGE_SIZE - (off % PAGE_SIZE), uio); + if (error != 0) + goto out; + if (uio->uio_resid == 0) + goto out; + off = off1 = uio->uio_offset - baseoff; + } + } +out: + break; + default: + panic("%s: unknown segment index %d", __func__, idx); + } + + return (error); +} + +static int +dumper_read(struct cdev *dev, struct uio *uio, int flags) +{ + struct dump_softc *sc; + struct dump_seg *seg; + off_t baseoff, off; + int error, i; + + error = devfs_get_cdevpriv((void **)&sc); + if (error != 0) + return (error); + + off = uio->uio_offset; + if (off < 0) + return (EINVAL); + + /* Seeks are not supported. 
*/ + if (off != sc->sc_cursor) + return (ESPIPE); + + for (baseoff = 0, i = 0; i < DUMP_SEG_COUNT; i++) { + seg = &sc->sc_segs[i]; + if (off >= baseoff && off < baseoff + seg->ds_sz) { + error = dumper_read_seg(sc, i, seg, baseoff, uio); + break; + } + baseoff += seg->ds_sz; + MPASS((baseoff & PAGE_MASK) == 0); + } + + sc->sc_cursor = uio->uio_offset; + return (error); +} + +static struct cdevsw dumper_cdevsw = { + .d_version = D_VERSION, + .d_open = dumper_open, + .d_read = dumper_read, + .d_name = "dumper", +}; + +static int +dumper_modevent(module_t mod __unused, int type, void *data __unused) +{ + static struct cdev *dumper_dev; + + switch (type) { + case MOD_LOAD: + dumper_dev = make_dev(&dumper_cdevsw, 0, UID_ROOT, GID_WHEEL, + 0600, "dumper"); + break; + case MOD_UNLOAD: + destroy_dev(dumper_dev); + break; + } + return (0); +} +DEV_MODULE(dumper, dumper_modevent, NULL); +MODULE_VERSION(dumper, 1); diff --git a/sys/arm64/arm64/rescue_machdep.c b/sys/arm64/arm64/rescue_machdep.c new file mode 100644 index 000000000000..fb00865aff69 --- /dev/null +++ b/sys/arm64/arm64/rescue_machdep.c @@ -0,0 +1,357 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Juniper Networks Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +int do_rescue_minidump; + +/* + * Parameters for memory reserved for the rescue kernel. The boundary and + * alignment are fixed by the requirements of locore. The size is configurable + * but of course must be satisfiable by an allocation with the defined alignment + * and boundary requirements. + */ +#define RESCUE_RESERV_ALIGN (2 * 1024 * 1024u) /* 2MB */ +#define RESCUE_RESERV_BOUNDARY (1024 * 1024 * 1024u) /* 1GB */ +#define RESCUE_RESERV_SIZE (64 * 1024 * 1024u) /* 64MB */ + +/* + * Environment variables beginning with this prefix are copied into the rescue + * kernel's environment with the prefix stripped. + */ +#define RESCUE_KENV_PREFIX "debug.rescue." + +static vm_offset_t rescue_va; +static vm_paddr_t rescue_pa; + +/* + * Called from the host kernel to populate rescue dumper parameters. + * The returned structure is passed to the rescue kernel. 
+ */ +static void +rescue_dump_params_init(struct rescue_dump_params *rdp) +{ + rdp->dp_msgbufpa = vtophys(msgbufp->msg_ptr); + rdp->dp_msgbufsz = msgbufp->msg_size; + rdp->dp_vmdumppa = vtophys(vm_page_dump); + rdp->dp_vmdumpsz = vm_page_dump_size; + rdp->dp_kernl0pa = vtophys(kernel_pmap->pm_l0); + rdp->dp_kernstart = VM_MIN_KERNEL_ADDRESS; + rdp->dp_kernend = kernel_vm_end; + rdp->dp_kernmax = VM_MAX_KERNEL_ADDRESS; + rdp->dp_dmapbasepa = DMAP_MIN_PHYSADDR; + rdp->dp_dmapmin = DMAP_MIN_ADDRESS; + rdp->dp_dmapmax = DMAP_MAX_ADDRESS; +} + +static void +rescue_kernel_cpu_switch(void) +{ + extern struct pcpu __pcpu[]; + struct pcpu *pcpu; + + pcpu = &__pcpu[0]; + if (get_pcpu() != pcpu) { + CPU_SET_ATOMIC(pcpu->pc_cpuid, &started_cpus); + for (;;) + cpu_spinwait(); + } +} + +/* + * Make the final preparations to jump into the rescue kernel, and then do it. + */ +void +rescue_kernel_exec(void) +{ + static pd_entry_t pt_l0[Ln_ENTRIES] __aligned(PAGE_SIZE); + static pd_entry_t pt_l1[Ln_ENTRIES] __aligned(PAGE_SIZE); + static pd_entry_t pt_l2[Ln_ENTRIES] __aligned(PAGE_SIZE); + struct rescue_kernel_params *params; + void (*rescue)(u_long modulep); + vm_paddr_t pa; + + /* + * Switch to the boot CPU if we are not already on it. + */ + rescue_kernel_cpu_switch(); + + /* + * Acknowledge any active interrupts to avoid leaving the PIC in an + * indeterminate state. + */ + intr_isrc_reset(); + + /* + * Prepare the dump parameters structure for the rescue kernel. The + * rest of the parameters must already have been initialized. These + * will be accessed via an aliasing mapping, so make sure the cache is + * written back. + */ + params = (struct rescue_kernel_params *)rescue_va; + rescue_dump_params_init(¶ms->kp_dumpparams); + cpu_dcache_wb_range((vm_offset_t)params, sizeof(*params)); + + /* + * Construct an identity map for the rescue kernel's locore. This + * covers the entire reservation. Because it does not span a 1GB + * boundary, only three pages are needed. 
This will be replaced by + * locore. + */ + pt_l0[pmap_l0_index(rescue_pa)] = L0_TABLE | vtophys(pt_l1); + pt_l1[pmap_l1_index(rescue_pa)] = L1_TABLE | vtophys(pt_l2); + for (pa = rescue_pa; pa < rescue_pa + RESCUE_RESERV_SIZE; pa += L2_SIZE) + pt_l2[pmap_l2_index(pa)] = L2_BLOCK | ATTR_DEFAULT | + ATTR_IDX(UNCACHED_MEMORY) | pa; + + cpu_setttb(pmap_kextract((vm_offset_t)pt_l0)); + + /* + * Jump to the entry point. Currently we pass a dummy module pointer to + * ensure that locore maps some memory following the rescue kernel, but + * this is really a hack to avoid modifying locore. + */ + rescue = (void *)(rescue_pa + RESCUE_RESERV_KERNEL_OFFSET + PAGE_SIZE); + (rescue)(KERNBASE + RESCUE_RESERV_SIZE); +} + +/* + * Dummy function to satisfy the set_dumper() interface. This should never be + * called. + */ +static int +rescue_dumper_dummy(void *priv, void *virtual, vm_offset_t physical, + off_t offset, size_t length) +{ + printf("%s: unexpected call\n", __func__); + return (EOPNOTSUPP); +} + +static void +rescue_kernel_init(void *arg __unused) +{ + extern u_long __rescue_kernel_start, __rescue_kernel_end; + struct dumperinfo di; + struct rescue_kernel_params *params; + void *dtbp, *fdtp; + char *envp, *p; + const uint32_t *addr_cellsp, *size_cellsp; + uint8_t *buf, *sb; + caddr_t kmdp; + size_t dtblen, envlen, kernlen, prefixlen, varlen; + vm_offset_t off; + uint32_t addr_cells, size_cells; + int enabled, error, i, len, memoff, rootoff; + + enabled = 0; + TUNABLE_INT_FETCH("debug.rescue_minidump", &enabled); + if (!enabled) + return; + if (!do_minidump) { + printf("rescue: minidumps are not enabled\n"); + return; + } + + rescue_va = kmem_alloc_contig(kernel_arena, RESCUE_RESERV_SIZE, + M_WAITOK, 0, ~(vm_paddr_t)0, + RESCUE_RESERV_ALIGN, RESCUE_RESERV_BOUNDARY, VM_MEMATTR_DEFAULT); + if (rescue_va == 0) { + printf("rescue: failed to reserve contiguous memory\n"); + goto out; + } + rescue_pa = pmap_kextract(rescue_va); + + params = (struct rescue_kernel_params 
*)rescue_va; + off = round_page(sizeof(*params)); + + /* + * Copy the DTB into the reserved area. It would be simpler to copy the + * kernel to the base of the reservation and copy the DTB to the space + * following the kernel, but we do not know the kernel's full size. + * Thus the DTB is copied first and the kernel is copied to the next + * 2MB-aligned address. + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, void *); + dtblen = fdt_totalsize(dtbp); + + fdtp = (void *)(rescue_va + off); + memcpy(fdtp, dtbp, dtblen); + + params->kp_dtbstart = rescue_pa + off; + params->kp_dtblen = dtblen; + + /* + * Fix up the DTB used by the rescue kernel: update the memory node to + * point at reserved memory, and delete the rescue and memreserve nodes. + */ + rootoff = fdt_path_offset(fdtp, "/"); + if (rootoff < 0) { + printf("rescue: failed to look up FDT root offset\n"); + goto out; + } + memoff = fdt_path_offset(fdtp, "/memory"); + if (memoff < 0) { + printf("rescue: failed to look up FDT memory offset\n"); + goto out; + } + addr_cellsp = fdt_getprop(fdtp, rootoff, "#address-cells", NULL); + if (addr_cellsp == NULL) { + printf("rescue: failed to look up address-cells property\n"); + goto out; + } + size_cellsp = fdt_getprop(fdtp, rootoff, "#size-cells", NULL); + if (addr_cellsp == NULL || size_cellsp == NULL) { + printf("rescue: failed to look up address-cells property\n"); + goto out; + } + addr_cells = fdt32_to_cpu(*addr_cellsp); + size_cells = fdt32_to_cpu(*size_cellsp); + + len = (addr_cells + size_cells) * sizeof(uint32_t); + sb = buf = malloc(len, M_TEMP, M_WAITOK | M_ZERO); + if (addr_cells == 2) + *(uint64_t *)buf = cpu_to_fdt64(rescue_pa); + else + *(uint32_t *)buf = cpu_to_fdt32(rescue_pa); + buf += addr_cells * sizeof(uint32_t); + if (size_cells == 2) + *(uint64_t *)buf = cpu_to_fdt64(RESCUE_RESERV_SIZE); + else + *(uint32_t *)buf = 
cpu_to_fdt32(RESCUE_RESERV_SIZE); + error = fdt_setprop_inplace(fdtp, memoff, "reg", sb, len); + if (error != 0) { + printf("rescue: failed to update reg property: %d\n", error); + goto out; + } + free(sb, M_TEMP); + + /* + * Copy select variables from the host kernel's environment to the + * rescue kernel's memory following the DTB. + */ + off += round_page(dtblen); + envp = (char *)(rescue_va + off); + envlen = 0; + prefixlen = strlen(RESCUE_KENV_PREFIX); + for (i = 0; kenvp[i] != NULL; i++) { + p = kenvp[i]; + varlen = strlen(p); + if (strncmp(p, RESCUE_KENV_PREFIX, prefixlen) == 0) { + p += prefixlen; + varlen -= prefixlen; + memcpy(envp, p, varlen); + envp += varlen; + *envp++ = '\0'; + envlen += varlen + 1; + } + } + *envp++ = '\0'; + envlen++; + + params->kp_kenvstart = rescue_pa + off; + params->kp_kenvlen = envlen; + + /* + * The kernel must be loaded at a 2MB-aligned address. To simplify + * location of the parameter structure, we require that the parameters, + * DTB and rescue kernel environment all fit in the first 2MB of the + * reservation. + */ + if (roundup2(off, L2_SIZE) != RESCUE_RESERV_KERNEL_OFFSET) { + printf("rescue: DTB (%zd bytes) and kenv are too large\n", + dtblen); + goto out; + } + off = RESCUE_RESERV_KERNEL_OFFSET; + params->kp_kernstart = rescue_pa + off; + + /* + * Copy the kernel image. This must come last since the length does not + * include that of allocated sections. + */ + kernlen = (u_long)&__rescue_kernel_end - (u_long)&__rescue_kernel_start; + memcpy((void *)(rescue_va + off), (void *)&__rescue_kernel_start, + kernlen); + cpu_idcache_wbinv_range(rescue_va, RESCUE_RESERV_SIZE); + + /* + * Finally tell the generic kernel dump layer that a dump device + * exists, so that it calls into rescue_kernel_exec(). 
+ */ + memset(&di, 0, sizeof(di)); + di.dumper = rescue_dumper_dummy; + error = set_dumper(&di, "rescue", curthread); + if (error != 0) { + printf("rescue: failed to set dump device: %d\n", error); + goto out; + } + + do_rescue_minidump = 1; + printf("rescue: initialized\n"); + return; + +out: + if (rescue_va != 0) { + kmem_free(kernel_arena, rescue_va, RESCUE_RESERV_SIZE); + rescue_va = 0; + rescue_pa = 0; + } +} +SYSINIT(rescue_kernel, SI_SUB_VM_CONF, SI_ORDER_ANY, rescue_kernel_init, NULL); diff --git a/sys/arm64/conf/GENERIC b/sys/arm64/conf/GENERIC index cbb7095aa668..14701bd873cb 100644 --- a/sys/arm64/conf/GENERIC +++ b/sys/arm64/conf/GENERIC @@ -86,6 +86,9 @@ options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones +options RESCUE_SUPPORT +makeoptions RESCUE_EMBED=${.CURDIR}/../RESCUE/kernel + # SoC support #options SOC_CAVM_THUNDERX options SOC_HISI_HI6220 diff --git a/sys/arm64/conf/RESCUE b/sys/arm64/conf/RESCUE new file mode 100644 index 000000000000..0edc1ea9e7fa --- /dev/null +++ b/sys/arm64/conf/RESCUE @@ -0,0 +1,16 @@ +include "./GENERIC" + +ident RESCUE + +nooptions RESCUE_SUPPORT +nomakeoptions RESCUE_EMBED +makeoptions RESCUE_EMBED="no" + +makeoptions MFS_IMAGE=/root/rescue.img + +# Try to keep the rescue kernel small. +options NO_SYSCTL_DESCR +nooptions WITNESS +nooptions SMP + +options RESCUE diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h index 736ce0f3333b..5038ffa2fa91 100644 --- a/sys/arm64/include/pmap.h +++ b/sys/arm64/include/pmap.h @@ -178,6 +178,13 @@ pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) return (0); } +/* + * These are configured by the mair_el1 register. 
This is set up in locore.S + */ +#define DEVICE_MEMORY 0 +#define UNCACHED_MEMORY 1 +#define CACHED_MEMORY 2 + #endif /* _KERNEL */ #endif /* !LOCORE */ diff --git a/sys/arm64/include/rescue.h b/sys/arm64/include/rescue.h new file mode 100644 index 000000000000..7eab9f9df08f --- /dev/null +++ b/sys/arm64/include/rescue.h @@ -0,0 +1,75 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Juniper Networks Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _RESCUE_H_ +#define _RESCUE_H_ + +/* + * Dump parameters passed from the panicked kernel to the rescue kernel. Some + * of these are known at compile-time, but pass them anyway to avoid surprises. 
+ */ +struct rescue_dump_params { + vm_paddr_t dp_msgbufpa; /* message buffer physaddr */ + vm_size_t dp_msgbufsz; /* message buffer size */ + vm_paddr_t dp_vmdumppa; /* vm_dump_array physaddr */ + vm_size_t dp_vmdumpsz; /* vm_dump_array size (bytes) */ + vm_paddr_t dp_kernl0pa; /* L0 page table page physaddr */ + vm_offset_t dp_kernstart; /* beginning of KVA */ + vm_offset_t dp_kernend; /* end of mapped KVA */ + vm_offset_t dp_kernmax; /* maximum KVA */ + vm_paddr_t dp_dmapbasepa; /* lowest addr mapped by direct map */ + vm_offset_t dp_dmapmin; /* beginning of direct map range */ + vm_offset_t dp_dmapmax; /* end of direct map range */ +}; + +/* + * Memory layout parameters passed to the rescue kernel. These are used to + * bootstrap the kernel and to initialize the dumper. + */ +struct rescue_kernel_params { + struct rescue_dump_params kp_dumpparams; + vm_paddr_t kp_dtbstart; + vm_size_t kp_dtblen; + vm_paddr_t kp_kenvstart; + vm_size_t kp_kenvlen; + vm_paddr_t kp_kernstart; +}; + +/* + * The rescue kernel is copied at this offset into the rescue reservation. The + * offset must be a multiple of 2MB. 
+ */ +#define RESCUE_RESERV_KERNEL_OFFSET L2_SIZE + +extern int do_rescue_minidump; + +struct arm64_bootparams; +extern void rescue_dumper_init(struct rescue_dump_params *); +extern void rescue_kernel_exec(void); +extern void rescue_preload_init(struct arm64_bootparams *); + +#endif /* !_RESCUE_H_ */ diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 651288e0106b..d2b82d0607ff 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -35,6 +35,8 @@ arm64/arm64/db_interface.c optional ddb arm64/arm64/db_trace.c optional ddb arm64/arm64/debug_monitor.c optional ddb arm64/arm64/disassem.c optional ddb +arm64/arm64/rescue_dumper.c optional rescue +arm64/arm64/rescue_machdep.c optional rescue_support arm64/arm64/dump_machdep.c standard arm64/arm64/elf_machdep.c standard arm64/arm64/exception.S standard diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk index 1891277a0e21..c40e12e779a9 100644 --- a/sys/conf/kern.post.mk +++ b/sys/conf/kern.post.mk @@ -398,6 +398,20 @@ embedfs_${MFS_IMAGE:T:R}.o: ${MFS_IMAGE} .endif .endif +.if ${RESCUE_EMBED:Uno} != "no" +rescue.o: ${RESCUE_EMBED} + ${OBJCOPY} --input-target binary \ + --output-target ${EMBEDFS_FORMAT.${MACHINE_ARCH}} \ + --binary-architecture ${EMBEDFS_ARCH.${MACHINE_ARCH}} \ + ${RESCUE_EMBED} ${.TARGET} + ${OBJCOPY} \ + --rename-section .data=rescue,contents,alloc,load,readonly,data \ + --redefine-sym _binary_${RESCUE_EMBED:C,[^[:alnum:]],_,g}_size=__rescue_kernel_size \ + --redefine-sym _binary_${RESCUE_EMBED:C,[^[:alnum:]],_,g}_start=__rescue_kernel_start \ + --redefine-sym _binary_${RESCUE_EMBED:C,[^[:alnum:]],_,g}_end=__rescue_kernel_end \ + ${.TARGET} +.endif + # XXX strictly, everything depends on Makefile because changes to ${PROF} # only appear there, but we don't handle that. 
diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index 949e3704472c..e1042a427310 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -184,6 +184,11 @@ MD_ROOT_SIZE_CONFIGURED!= grep MD_ROOT_SIZE opt_md.h || true ; echo SYSTEM_OBJS+= embedfs_${MFS_IMAGE:T:R}.o .endif .endif + +.if ${RESCUE_EMBED:Uno} != "no" +SYSTEM_OBJS+= rescue.o +.endif + SYSTEM_LD= @${LD} -Bdynamic -T ${LDSCRIPT} ${_LDFLAGS} --no-warn-mismatch \ --warn-common --export-dynamic --dynamic-linker /red/herring \ -o ${.TARGET} -X ${SYSTEM_OBJS} vers.o @@ -219,7 +224,7 @@ MKMODULESENV+= __MPATH="${__MPATH}" # Architecture and output format arguments for objdump to convert image to # object file -.if ${MFS_IMAGE:Uno} != "no" +.if ${MFS_IMAGE:Uno} != "no" || ${RESCUE_EMBED:Uno} != "no" .if empty(MD_ROOT_SIZE_CONFIGURED) .if !defined(EMBEDFS_FORMAT.${MACHINE_ARCH}) EMBEDFS_FORMAT.${MACHINE_ARCH}!= awk -F'"' '/OUTPUT_FORMAT/ {print $$2}' ${LDSCRIPT} diff --git a/sys/conf/options.arm64 b/sys/conf/options.arm64 index f5c276ca1316..8df36443a439 100644 --- a/sys/conf/options.arm64 +++ b/sys/conf/options.arm64 @@ -10,3 +10,17 @@ VFP opt_global.h # SoC Support SOC_CAVM_THUNDERX opt_soc.h SOC_HISI_HI6220 opt_soc.h + +# +# Compile the kernel to be run as a rescue kernel after a panic +# and enable dumping the host kernel's memory. +# +RESCUE opt_global.h + +# +# Enable recovery of a memory dump by a rescue kernel. The rescue kernel must +# be compiled with the RESCUE option configured, and the rescue kernel image +# must be embedded by setting the RESCUE_EMBED make option to the path of a +# rescue kernel. The RESCUE and RESCUE_SUPPORT options are mutually exclusive. +# +RESCUE_SUPPORT opt_global.h diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index ea085a0fce3f..2c45ea51580d 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -690,6 +690,8 @@ panic(const char *fmt, ...) 
vpanic(fmt, ap); } +void exec_rescue_kernel(void); + void vpanic(const char *fmt, va_list ap) { diff --git a/sys/kern/subr_intr.c b/sys/kern/subr_intr.c index 98e7b47085d1..c57fce5b69a9 100644 --- a/sys/kern/subr_intr.c +++ b/sys/kern/subr_intr.c @@ -136,6 +136,13 @@ static boolean_t irq_assign_cpu = FALSE; #define INTRCNT_COUNT (NIRQ * 2) #endif +#ifdef RESCUE_SUPPORT +static DPCPU_DEFINE(struct intr_irqsrc *, isrc_active); +#define ISRC_ACTIVE_SET(ptr) DPCPU_SET(isrc_active, (ptr)) +#else +#define ISRC_ACTIVE_SET(ptr) +#endif + /* Data for MI statistics reporting. */ u_long intrcnt[INTRCNT_COUNT]; char intrnames[INTRCNT_COUNT * INTRNAME_LEN]; @@ -350,8 +357,11 @@ intr_isrc_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf) } else #endif if (isrc->isrc_event != NULL) { - if (intr_event_handle(isrc->isrc_event, tf) == 0) + ISRC_ACTIVE_SET(isrc); + if (intr_event_handle(isrc->isrc_event, tf) == 0) { + ISRC_ACTIVE_SET(NULL); return (0); + } } isrc_increment_straycount(isrc); @@ -472,6 +482,22 @@ intr_isrc_deregister(struct intr_irqsrc *isrc) return (error); } +#ifdef RESCUE_SUPPORT +/* + * Make sure that active interrupts are acknowledged before executing the rescue + * kernel. Otherwise it will not be possible to reconfigure the PIC. + */ +void +intr_isrc_reset(void) +{ + struct intr_irqsrc *isrc; + + isrc = DPCPU_GET(isrc_active); + if (isrc != NULL) + PIC_POST_FILTER(isrc->isrc_dev, isrc); +} +#endif + #ifdef SMP /* * A support function for a PIC to decide if provided ISRC should be inited diff --git a/sys/sys/intr.h b/sys/sys/intr.h index a82ffda5876e..79ced09919f5 100644 --- a/sys/sys/intr.h +++ b/sys/sys/intr.h @@ -98,6 +98,9 @@ struct intr_irqsrc { int intr_isrc_deregister(struct intr_irqsrc *); int intr_isrc_register(struct intr_irqsrc *, device_t, u_int, const char *, ...) __printflike(4, 5); +#ifdef RESCUE_SUPPORT +void intr_isrc_reset(void); +#endif #ifdef SMP bool intr_isrc_init_on_cpu(struct intr_irqsrc *isrc, u_int cpu); -- 2.36.1