Message ID | cover.1414099246.git.geoff@infradead.org |
---|---|
State | New |
Headers | show |
On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > Some of the macros defined in kvm_arm.h are useful in assembly files, but are > not compatible with the assembler. Change any C language integer constant > definitions using appended U, UL, or ULL to the UL() preprocessor macro. Also, > add a preprocessor include of the asm/memory.h file which defines the UL() > macro. > > Fixes build errors like these when using kvm_arm.h in assembly > source files: > > Error: unexpected characters following instruction at operand 3 -- `and x0,x1,#((1U<<25)-1)' > > Signed-off-by: Geoff Levand <geoff@infradead.org> Thanks for putting this together, Geoff. Acked-by: Mark Rutland <mark.rutland@arm.com> Thanks, Mark. > --- > arch/arm64/include/asm/kvm_arm.h | 21 +++++++++++---------- > 1 file changed, 11 insertions(+), 10 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h > index cc83520..9038a26 100644 > --- a/arch/arm64/include/asm/kvm_arm.h > +++ b/arch/arm64/include/asm/kvm_arm.h > @@ -18,6 +18,7 @@ > #ifndef __ARM64_KVM_ARM_H__ > #define __ARM64_KVM_ARM_H__ > > +#include <asm/memory.h> > #include <asm/types.h> > > /* Hyp Configuration Register (HCR) bits */ > @@ -149,9 +150,9 @@ > #endif > > #define VTTBR_BADDR_SHIFT (VTTBR_X - 1) > -#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) > -#define VTTBR_VMID_SHIFT (48LLU) > -#define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT) > +#define VTTBR_BADDR_MASK (((UL(1) << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) > +#define VTTBR_VMID_SHIFT (UL(48)) > +#define VTTBR_VMID_MASK (UL(0xFF) << VTTBR_VMID_SHIFT) > > /* Hyp System Trap Register */ > #define HSTR_EL2_TTEE (1 << 16) > @@ -174,13 +175,13 @@ > > /* Exception Syndrome Register (ESR) bits */ > #define ESR_EL2_EC_SHIFT (26) > -#define ESR_EL2_EC (0x3fU << ESR_EL2_EC_SHIFT) > -#define ESR_EL2_IL (1U << 25) > +#define ESR_EL2_EC (UL(0x3f) << ESR_EL2_EC_SHIFT) > +#define ESR_EL2_IL (UL(1) << 25) > #define ESR_EL2_ISS (ESR_EL2_IL - 1) > #define ESR_EL2_ISV_SHIFT (24) > -#define ESR_EL2_ISV (1U << ESR_EL2_ISV_SHIFT) > +#define ESR_EL2_ISV (UL(1) << ESR_EL2_ISV_SHIFT) > #define ESR_EL2_SAS_SHIFT (22) > -#define ESR_EL2_SAS (3U << ESR_EL2_SAS_SHIFT) > +#define ESR_EL2_SAS (UL(3) << ESR_EL2_SAS_SHIFT) > #define ESR_EL2_SSE (1 << 21) > #define ESR_EL2_SRT_SHIFT (16) > #define ESR_EL2_SRT_MASK (0x1f << ESR_EL2_SRT_SHIFT) > @@ -194,16 +195,16 @@ > #define ESR_EL2_FSC_TYPE (0x3c) > > #define ESR_EL2_CV_SHIFT (24) > -#define ESR_EL2_CV (1U << ESR_EL2_CV_SHIFT) > +#define ESR_EL2_CV (UL(1) << ESR_EL2_CV_SHIFT) > #define ESR_EL2_COND_SHIFT (20) > -#define ESR_EL2_COND (0xfU << ESR_EL2_COND_SHIFT) > +#define ESR_EL2_COND (UL(0xf) << ESR_EL2_COND_SHIFT) > > > #define FSC_FAULT (0x04) > #define FSC_PERM (0x0c) > > /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ > -#define HPFAR_MASK (~0xFUL) > +#define HPFAR_MASK (~UL(0xf)) > > #define ESR_EL2_EC_UNKNOWN (0x00) > #define ESR_EL2_EC_WFI (0x01) > -- > 1.9.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
[Adding Vivek to Cc] Hi Geoff, This is looking rather good now. My only major concerns with this patch are the DTB handling (which I think should be left to the userspace purgatory), and the incompatibility with KVM (which is in defconfig currently). Otherwise, there are just a couple of minor fixups I'd like to see below. On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > Add three new files, kexec.h, machine_kexec.c and relocate_kernel.S to the > arm64 architecture that add support for the kexec re-boot mechanism > (CONFIG_KEXEC) on arm64 platforms. > > Signed-off-by: Geoff Levand <geoff@infradead.org> > --- > arch/arm64/Kconfig | 9 ++ > arch/arm64/include/asm/kexec.h | 47 +++++++++ > arch/arm64/kernel/Makefile | 1 + > arch/arm64/kernel/machine_kexec.c | 169 +++++++++++++++++++++++++++++++++ > arch/arm64/kernel/relocate_kernel.S | 184 ++++++++++++++++++++++++++++++++++++ > include/uapi/linux/kexec.h | 1 + > 6 files changed, 411 insertions(+) > create mode 100644 arch/arm64/include/asm/kexec.h > create mode 100644 arch/arm64/kernel/machine_kexec.c > create mode 100644 arch/arm64/kernel/relocate_kernel.S > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index f0d3a2d..af03449 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -313,6 +313,15 @@ config ARCH_HAS_CACHE_LINE_SIZE > > source "mm/Kconfig" > > +config KEXEC > + depends on (!SMP || PM_SLEEP_SMP) In its current state this also depends on !KVM && !EFI (technically you could detect those cases at runtime, but I don't see that in this series). > + bool "kexec system call" > + ---help--- > + kexec is a system call that implements the ability to shutdown your > + current kernel, and to start another kernel. It is like a reboot > + but it is independent of the system firmware. And like a reboot > + you can start any kernel with it, not just Linux. > + > config XEN_DOM0 > def_bool y > depends on XEN > diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h > new file mode 100644 > index 0000000..e7bd7ab > --- /dev/null > +++ b/arch/arm64/include/asm/kexec.h > @@ -0,0 +1,47 @@ > +/* > + * kexec for arm64 > + * > + * Copyright (C) Linaro. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#if !defined(_ARM64_KEXEC_H) > +#define _ARM64_KEXEC_H > + > +/* Maximum physical address we can use pages from */ > + > +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) > + > +/* Maximum address we can reach in physical address mode */ > + > +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) > + > +/* Maximum address we can use for the control code buffer */ > + > +#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL) > + > +#define KEXEC_CONTROL_PAGE_SIZE 4096 > + > +#define KEXEC_ARCH KEXEC_ARCH_ARM64 > + > +#if !defined(__ASSEMBLY__) > + > +/** > + * crash_setup_regs() - save registers for the panic kernel > + * > + * @newregs: registers are saved here > + * @oldregs: registers to be saved (may be %NULL) > + */ > + > +static inline void crash_setup_regs(struct pt_regs *newregs, > + struct pt_regs *oldregs) > +{ > + /* Empty routine needed to avoid build errors. */ > +} > + > +#endif /* !defined(__ASSEMBLY__) */ > + > +#endif > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile > index 6e9538c..77a7351 100644 > --- a/arch/arm64/kernel/Makefile > +++ b/arch/arm64/kernel/Makefile > @@ -30,6 +30,7 @@ arm64-obj-$(CONFIG_CPU_IDLE) += cpuidle.o > arm64-obj-$(CONFIG_JUMP_LABEL) += jump_label.o > arm64-obj-$(CONFIG_KGDB) += kgdb.o > arm64-obj-$(CONFIG_EFI) += efi.o efi-stub.o efi-entry.o > +arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o > > obj-y += $(arm64-obj-y) vdso/ > obj-m += $(arm64-obj-m) > diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c > new file mode 100644 > index 0000000..95bc8d9 > --- /dev/null > +++ b/arch/arm64/kernel/machine_kexec.c > @@ -0,0 +1,169 @@ > +/* > + * kexec for arm64 > + * > + * Copyright (C) Linaro. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/kexec.h> > +#include <linux/of_fdt.h> > +#include <linux/slab.h> > +#include <linux/uaccess.h> > + > +#include <asm/cacheflush.h> > +#include <asm/system_misc.h> > + > +/* Global variables for the relocate_kernel routine. */ > + > +extern const unsigned char relocate_new_kernel[]; > +extern const unsigned long relocate_new_kernel_size; > +extern unsigned long arm64_kexec_dtb_addr; > +extern unsigned long arm64_kexec_kimage_head; > +extern unsigned long arm64_kexec_kimage_start; > + > +/** > + * kexec_is_dtb - Helper routine to check the device tree header signature. > + */ > + > +static bool kexec_is_dtb(const void *dtb) > +{ > + __be32 magic; > + > + return get_user(magic, (__be32 *)dtb) ? false : > + (be32_to_cpu(magic) == OF_DT_HEADER); > +} > + > +/** > + * kexec_find_dtb_seg - Helper routine to find the dtb segment. > + */ > + > +static const struct kexec_segment *kexec_find_dtb_seg( > + const struct kimage *image) > +{ > + int i; > + > + for (i = 0; i < image->nr_segments; i++) { > + if (kexec_is_dtb(image->segment[i].buf)) > + return &image->segment[i]; > + } > + > + return NULL; > +} > + > +void machine_kexec_cleanup(struct kimage *image) > +{ > + /* Empty routine needed to avoid build errors. */ > +} > + > +/** > + * machine_kexec_prepare - Prepare for a kexec reboot. > + * > + * Called from the core kexec code when a kernel image is loaded. > + */ > + > +int machine_kexec_prepare(struct kimage *image) > +{ > + const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image); > + > + if (!dtb_seg) > + pr_warn("%s: No device tree segment found.\n", __func__); > + > + arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0; > + arm64_kexec_kimage_start = image->start; > + > + return 0; > +} I thought all of the DTB handling was moving to purgatory? > + > +/** > + * kexec_list_flush - Helper to flush the kimage list to PoC. > + */ > + > +static void kexec_list_flush(unsigned long kimage_head) > +{ > + void *dest; > + unsigned long *entry; > + > + for (entry = &kimage_head, dest = NULL; ; entry++) { > + unsigned int flag = *entry & > + (IND_DESTINATION | IND_INDIRECTION | IND_DONE | > + IND_SOURCE); > + void *addr = phys_to_virt(*entry & PAGE_MASK); > + > + switch (flag) { > + case IND_INDIRECTION: > + entry = (unsigned long *)addr - 1; > + __flush_dcache_area(addr, PAGE_SIZE); > + break; > + case IND_DESTINATION: > + dest = addr; > + break; > + case IND_SOURCE: > + __flush_dcache_area(addr, PAGE_SIZE); > + dest += PAGE_SIZE; > + break; > + case IND_DONE: > + return; > + default: > + break; Can an image ever have no flags? Given the presence of IND_NONE I'd assume not, so this looks like a candidate for a BUG(). > + } > + } > +} > + > +/** > + * machine_kexec - Do the kexec reboot. > + * > + * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC. > + */ > + > +void machine_kexec(struct kimage *image) > +{ > + phys_addr_t reboot_code_buffer_phys; > + void *reboot_code_buffer; > + > + BUG_ON(num_online_cpus() > 1); > + > + arm64_kexec_kimage_head = image->head; > + > + reboot_code_buffer_phys = page_to_phys(image->control_code_page); > + reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); > + > + /* > + * Copy relocate_new_kernel to the reboot_code_buffer for use > + * after the kernel is shut down. > + */ > + > + memcpy(reboot_code_buffer, relocate_new_kernel, > + relocate_new_kernel_size); Can we get rid of the line gaps between comments and the single function calls they apply to, please? I realise it's a minor thing, but this looks rather inconsistent with the rest of arch/arm64/. > + > + /* Flush the reboot_code_buffer in preparation for its execution. */ > + > + __flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size); That code should already be at the PoC per the boot protocol (the entire kernel image should have been clean to the PoC at boot, so the instructions forming relocate_new_kernel are globally visible). >From the looks of it you only need to flush the variables at the very end. > + > + /* Flush the kimage list. */ > + > + kexec_list_flush(image->head); > + > + pr_info("Bye!\n"); > + > + /* Disable all DAIF exceptions. */ > + > + asm volatile ("msr daifset, #0xf" : : : "memory"); > + > + /* > + * soft_restart() will shutdown the MMU, disable data caches, then > + * transfer control to the reboot_code_buffer which contains a copy of > + * the relocate_new_kernel routine. relocate_new_kernel will use > + * physical addressing to relocate the new kernel to its final position > + * and then will transfer control to the entry point of the new kernel. > + */ > + > + soft_restart(reboot_code_buffer_phys); As mentioned above, either this needs to depend on !KVM, or this will blow up. > +} > + > +void machine_crash_shutdown(struct pt_regs *regs) > +{ > + /* Empty routine needed to avoid build errors. */ > +} > diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S > new file mode 100644 > index 0000000..49cf9a0 > --- /dev/null > +++ b/arch/arm64/kernel/relocate_kernel.S > @@ -0,0 +1,184 @@ > +/* > + * kexec for arm64 > + * > + * Copyright (C) Linaro. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <asm/assembler.h> > +#include <asm/kexec.h> > +#include <asm/memory.h> > +#include <asm/page.h> > +#include <asm/proc-macros.S> > + > +/* The list entry flags. */ > + > +#define IND_DESTINATION_BIT 0 > +#define IND_INDIRECTION_BIT 1 > +#define IND_DONE_BIT 2 > +#define IND_SOURCE_BIT 3 As previously, I think these need to be moved into a common header, and defined in terms of the existing IND_* macros (or vice-versa). I believe you had a patch doing so; what's the status of that? > + > +/* > + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it. > + * > + * The memory that the old kernel occupies may be overwritten when coping the > + * new image to its final location. To assure that the relocate_new_kernel > + * routine which does that copy is not overwritten all code and data needed > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and > + * relocate_new_kernel_end. The machine_kexec() routine will copy > + * relocate_new_kernel to the kexec control_code_page, a special page which > + * has been set up to be preserved during the copy operation. > + */ > + > +.globl relocate_new_kernel > +relocate_new_kernel: > + > + /* Setup the list loop variables. */ > + > + ldr x18, arm64_kexec_kimage_head /* x18 = list entry */ > + dcache_line_size x17, x0 /* x17 = dcache line size */ > + mov x16, xzr /* x16 = segment start */ > + mov x15, xzr /* x15 = entry ptr */ > + mov x14, xzr /* x14 = copy dest */ > + > + /* Check if the new image needs relocation. */ > + > + cbz x18, .Ldone > + tbnz x18, IND_DONE_BIT, .Ldone > + > +.Lloop: > + and x13, x18, PAGE_MASK /* x13 = addr */ > + > + /* Test the entry flags. */ > + > +.Ltest_source: > + tbz x18, IND_SOURCE_BIT, .Ltest_indirection > + > + /* copy_page(x20 = dest, x21 = src) */ > + > + mov x20, x14 > + mov x21, x13 > + > +1: ldp x22, x23, [x21] > + ldp x24, x25, [x21, #16] > + ldp x26, x27, [x21, #32] > + ldp x28, x29, [x21, #48] > + add x21, x21, #64 > + stnp x22, x23, [x20] > + stnp x24, x25, [x20, #16] > + stnp x26, x27, [x20, #32] > + stnp x28, x29, [x20, #48] > + add x20, x20, #64 > + tst x21, #(PAGE_SIZE - 1) > + b.ne 1b > + > + /* dest += PAGE_SIZE */ > + > + add x14, x14, PAGE_SIZE > + b .Lnext > + > +.Ltest_indirection: > + tbz x18, IND_INDIRECTION_BIT, .Ltest_destination > + > + /* ptr = addr */ > + > + mov x15, x13 > + b .Lnext > + > +.Ltest_destination: > + tbz x18, IND_DESTINATION_BIT, .Lnext > + > + /* flush segment */ > + > + bl .Lflush > + mov x16, x13 > + > + /* dest = addr */ > + > + mov x14, x13 > + > +.Lnext: > + /* entry = *ptr++ */ > + > + ldr x18, [x15], #8 > + > + /* while (!(entry & DONE)) */ > + > + tbz x18, IND_DONE_BIT, .Lloop > + > +.Ldone: > + /* flush last segment */ > + > + bl .Lflush > + > + dsb sy > + isb > + ic ialluis > + dsb sy > + isb > + > + /* start_new_image */ > + > + ldr x4, arm64_kexec_kimage_start > + ldr x0, arm64_kexec_dtb_addr > + mov x1, xzr > + mov x2, xzr > + mov x3, xzr > + br x4 This last part should be in userspace-provided purgatory. If you have purgatory code which does this then we should be able to rely on that, and we don't have to try to maintain this DTB handling in kernelspace (which I suspect may become painful as the boot protocol evolves). Thanks, Mark. > + > +/* flush - x17 = line size, x16 = start addr, x14 = end addr. */ > + > +.Lflush: > + cbz x16, 2f > + mov x0, x16 > + sub x1, x17, #1 > + bic x0, x0, x1 > +1: dc civac, x0 > + add x0, x0, x17 > + cmp x0, x14 > + b.lo 1b > +2: ret > + > +.align 3 /* To keep the 64-bit values below naturally aligned. */ > + > +/* The machine_kexec routines set these variables. */ > + > +/* > + * arm64_kexec_kimage_start - Copy of image->start, the entry point of the new > + * image. > + */ > + > +.globl arm64_kexec_kimage_start > +arm64_kexec_kimage_start: > + .quad 0x0 > + > +/* > + * arm64_kexec_dtb_addr - Physical address of a device tree. > + */ > + > +.globl arm64_kexec_dtb_addr > +arm64_kexec_dtb_addr: > + .quad 0x0 > + > +/* > + * arm64_kexec_kimage_head - Copy of image->head, the list of kimage entries. > + */ > + > +.globl arm64_kexec_kimage_head > +arm64_kexec_kimage_head: > + .quad 0x0 > + > +.Lrelocate_new_kernel_end: > + > +/* > + * relocate_new_kernel_size - Number of bytes to copy to the control_code_page. > + */ > + > +.globl relocate_new_kernel_size > +relocate_new_kernel_size: > + .quad .Lrelocate_new_kernel_end - relocate_new_kernel > + > +.org KEXEC_CONTROL_PAGE_SIZE > diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h > index 6925f5b..04626b9 100644 > --- a/include/uapi/linux/kexec.h > +++ b/include/uapi/linux/kexec.h > @@ -39,6 +39,7 @@ > #define KEXEC_ARCH_SH (42 << 16) > #define KEXEC_ARCH_MIPS_LE (10 << 16) > #define KEXEC_ARCH_MIPS ( 8 << 16) > +#define KEXEC_ARCH_ARM64 (183 << 16) > > /* The artificial cap on the number of segments passed to kexec_load. */ > #define KEXEC_SEGMENT_MAX 16 > -- > 1.9.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
On Fri, Oct 24, 2014 at 12:10:59AM +0100, Geoff Levand wrote: > Signed-off-by: Geoff Levand <geoff@infradead.org> > --- > arch/arm64/configs/defconfig | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig > index d92ef3c..ebf8b3f 100644 > --- a/arch/arm64/configs/defconfig > +++ b/arch/arm64/configs/defconfig > @@ -39,6 +39,7 @@ CONFIG_PREEMPT=y > CONFIG_KSM=y > CONFIG_TRANSPARENT_HUGEPAGE=y > CONFIG_CMA=y > +CONFIG_KEXEC=y Given this is going to be incompatible with KVM, and KVM is already in defconfig, I don't think we can add this until that incompatibility is fixed (or at the very least detected and handled gracefully). Thanks, Mark. > CONFIG_CMDLINE="console=ttyAMA0" > # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set > CONFIG_COMPAT=y > -- > 1.9.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > Device tree regions described by /memreserve/ entries are not available in > /proc/device-tree, and hence are not available to user space utilities that use > /proc/device-tree. In order for these regions to be available, convert any > arm64 DTS files using /memreserve/ entries to use reserved-memory nodes. The limitation here is in the kernel (and a partially in userspace), so modifying the dts files is a workaround rather than a fix. It's perfectly valid for people to remain using /memreserve/, so this isn't sufficient. There are also existing DTBs using /memreserve/ which we can't rely on being modified to use reserved-memory. I think we need to expose memreserves to userspace somehow, potentially along with other DTB header fields. Grant, ideas? Are other architectures not affected by this limitation? > This change is required for kexec re-boot to work properly when a user does not > specify a second stage DTB via the kexec --dtb option. When the --dtb option is > not specified kexec will prepare the second stage DTB using data from > /proc/device-tree. > > Signed-off-by: Geoff Levand <geoff@infradead.org> > --- > arch/arm64/boot/dts/foundation-v8.dts | 13 +++++++++++-- > arch/arm64/boot/dts/rtsm_ve-aemv8a.dts | 13 +++++++++++-- > 2 files changed, 22 insertions(+), 4 deletions(-) > > diff --git a/arch/arm64/boot/dts/foundation-v8.dts b/arch/arm64/boot/dts/foundation-v8.dts > index 4a06090..2b76c2d 100644 > --- a/arch/arm64/boot/dts/foundation-v8.dts > +++ b/arch/arm64/boot/dts/foundation-v8.dts > @@ -6,8 +6,6 @@ > > /dts-v1/; > > -/memreserve/ 0x80000000 0x00010000; > - > / { > model = "Foundation-v8A"; > compatible = "arm,foundation-aarch64", "arm,vexpress"; > @@ -64,6 +62,17 @@ > <0x00000008 0x80000000 0 0x80000000>; > }; > > + reserved-memory { > + #address-cells = <2>; > + #size-cells = <2>; > + ranges; > + > + firmware-memory@0x80000000 { > + no-map; > + reg = <0x0 0x80000000 0x0 0x00010000>; > + }; > + }; > + For the spin-table code at present we currently permit cacheable mappings (that's part of the definition of /memreserve/), so it's not strictly necessary to have no-map. Until recently we accessed the cpu-release-addr through a cacheable mapping, and I don't know what other spin-table users (e.g. Xen) do. Linux should be able to handle this from now on, however. Thanks, Mark. > gic: interrupt-controller@2c001000 { > compatible = "arm,cortex-a15-gic", "arm,cortex-a9-gic"; > #interrupt-cells = <3>; > diff --git a/arch/arm64/boot/dts/rtsm_ve-aemv8a.dts b/arch/arm64/boot/dts/rtsm_ve-aemv8a.dts > index 572005e..0f80807 100644 > --- a/arch/arm64/boot/dts/rtsm_ve-aemv8a.dts > +++ b/arch/arm64/boot/dts/rtsm_ve-aemv8a.dts > @@ -9,8 +9,6 @@ > > /dts-v1/; > > -/memreserve/ 0x80000000 0x00010000; > - > / { > model = "RTSM_VE_AEMv8A"; > compatible = "arm,rtsm_ve,aemv8a", "arm,vexpress"; > @@ -67,6 +65,17 @@ > <0x00000008 0x80000000 0 0x80000000>; > }; > > + reserved-memory { > + #address-cells = <2>; > + #size-cells = <2>; > + ranges; > + > + firmware-memory@0x80000000 { > + no-map; > + reg = <0x0 0x80000000 0x0 0x00010000>; > + }; > + }; > + > gic: interrupt-controller@2c001000 { > compatible = "arm,cortex-a15-gic", "arm,cortex-a9-gic"; > #interrupt-cells = <3>; > -- > 1.9.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > Change any reference of device tree '/memreserve/' entries in the arm64 > booting.txt to refer to 'reserved-memory nodes'. Reserved-memory nodes > are the preferred method of specifying reserved memory. Per my comments on patch 5, I don't think this change is sufficient. However, we should probably update the document to allow reserved-memory nodes. On an unrelated note we probably need to work out how reserved-memory interacts with the UEFI memory map -- unmappable regions shouldn't be described by UEFI and I hope people don't use reserved-memory as a workaround for broken UEFI tables. Thanks, Mark. > Signed-off-by: Geoff Levand <geoff@infradead.org> > --- > Documentation/arm64/booting.txt | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt > index f3c05b5..7446822 100644 > --- a/Documentation/arm64/booting.txt > +++ b/Documentation/arm64/booting.txt > @@ -196,7 +196,7 @@ following manner: > naturally-aligned 64-bit zero-initalised memory location. > > These CPUs should spin outside of the kernel in a reserved area of > - memory (communicated to the kernel by a /memreserve/ region in the > + memory (communicated to the kernel by a reserved-memory node in the > device tree) polling their cpu-release-addr location, which must be > contained in the reserved region. A wfe instruction may be inserted > to reduce the overhead of the busy-loop and a sev will be issued by > @@ -209,7 +209,7 @@ following manner: > - CPUs with a "psci" enable method should remain outside of > the kernel (i.e. outside of the regions of memory described to the > kernel in the memory node, or in a reserved area of memory described > - to the kernel by a /memreserve/ region in the device tree). The > + to the kernel by a reserved-memory node in the device tree). The > kernel will issue CPU_ON calls as described in ARM document number ARM > DEN 0022A ("Power State Coordination Interface System Software on ARM > processors") to bring CPUs into the kernel. > -- > 1.9.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > When a CPU is reset it needs to be put into the exception level it had when it > entered the kernel. Update cpu_reset() to accept an argument el2_switch which > signals cpu_reset() to enter the soft reset address at EL2. If el2_switch is > not set the soft reset address will be entered at EL1. > > Update cpu_soft_restart() and soft_restart() to pass the return of > is_hyp_mode_available() as the el2_switch value to cpu_reset(). Also update the > comments of cpu_reset(), cpu_soft_restart() and soft_restart() to reflect this > change. This will blow up without warning with KVM, and I think we need to address that first. Mark. > Signed-off-by: Geoff Levand <geoff@infradead.org> > --- > arch/arm64/include/asm/proc-fns.h | 4 ++-- > arch/arm64/kernel/process.c | 6 ++++- > arch/arm64/mm/proc.S | 47 +++++++++++++++++++++++++++++---------- > 3 files changed, 42 insertions(+), 15 deletions(-) > > diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h > index 9a8fd84..339394d 100644 > --- a/arch/arm64/include/asm/proc-fns.h > +++ b/arch/arm64/include/asm/proc-fns.h > @@ -32,8 +32,8 @@ extern void cpu_cache_off(void); > extern void cpu_do_idle(void); > extern void cpu_do_switch_mm(unsigned long pgd_phys, struct mm_struct *mm); > extern void cpu_reset(unsigned long addr) __attribute__((noreturn)); > -void cpu_soft_restart(phys_addr_t cpu_reset, > - unsigned long addr) __attribute__((noreturn)); > +void cpu_soft_restart(phys_addr_t cpu_reset, unsigned long el2_switch, > + unsigned long addr) __attribute__((noreturn)); > extern void cpu_do_suspend(struct cpu_suspend_ctx *ptr); > extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr); > > diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c > index bf66922..0a3414b 100644 > --- a/arch/arm64/kernel/process.c > +++ b/arch/arm64/kernel/process.c > @@ -50,6 +50,7 @@ > #include <asm/mmu_context.h> > #include <asm/processor.h> > #include <asm/stacktrace.h> > +#include <asm/virt.h> > > #ifdef CONFIG_CC_STACKPROTECTOR > #include <linux/stackprotector.h> > @@ -60,7 +61,10 @@ EXPORT_SYMBOL(__stack_chk_guard); > void soft_restart(unsigned long addr) > { > setup_mm_for_reboot(); > - cpu_soft_restart(virt_to_phys(cpu_reset), addr); > + > + cpu_soft_restart(virt_to_phys(cpu_reset), is_hyp_mode_available(), > + addr); > + > /* Should never get here */ > BUG(); > } > diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S > index 4e778b1..7467199 100644 > --- a/arch/arm64/mm/proc.S > +++ b/arch/arm64/mm/proc.S > @@ -25,6 +25,7 @@ > #include <asm/hwcap.h> > #include <asm/pgtable-hwdef.h> > #include <asm/pgtable.h> > +#include <asm/virt.h> > > #include "proc-macros.S" > > @@ -59,27 +60,48 @@ ENTRY(cpu_cache_off) > ENDPROC(cpu_cache_off) > > /* > - * cpu_reset(loc) > + * cpu_reset(el2_switch, loc) - Helper for cpu_soft_restart. > * > - * Perform a soft reset of the system. Put the CPU into the same state > - * as it would be if it had been reset, and branch to what would be the > - * reset vector. It must be executed with the flat identity mapping. > + * @cpu_reset: Physical address of the cpu_reset routine. > + * @el2_switch: Flag to indicate a swich to EL2 is needed. > + * @addr: Location to jump to for soft reset. > * > - * - loc - location to jump to for soft reset > + * Put the CPU into the same state as it would be if it had been reset, and > + * branch to what would be the reset vector. It must be executed with the > + * flat identity mapping. > */ > + > .align 5 > + > ENTRY(cpu_reset) > - mrs x1, sctlr_el1 > - bic x1, x1, #1 > - msr sctlr_el1, x1 // disable the MMU > + mrs x2, sctlr_el1 > + bic x2, x2, #1 > + msr sctlr_el1, x2 // disable the MMU > isb > - ret x0 > + > + cbz x0, 1f // el2_switch? > + mov x0, x1 > + mov x1, xzr > + mov x2, xzr > + mov x3, xzr > + hvc #HVC_CALL_FUNC // no return > + > +1: ret x1 > ENDPROC(cpu_reset) > > +/* > + * cpu_soft_restart(cpu_reset, el2_switch, addr) - Perform a cpu soft reset. > + * > + * @cpu_reset: Physical address of the cpu_reset routine. > + * @el2_switch: Flag to indicate a swich to EL2 is needed, passed to cpu_reset. > + * @addr: Location to jump to for soft reset, passed to cpu_reset. > + * > + */ > + > ENTRY(cpu_soft_restart) > - /* Save address of cpu_reset() and reset address */ > - mov x19, x0 > - mov x20, x1 > + mov x19, x0 // cpu_reset > + mov x20, x1 // el2_switch > + mov x21, x2 // addr > > /* Turn D-cache off */ > bl cpu_cache_off > @@ -88,6 +110,7 @@ ENTRY(cpu_soft_restart) > bl flush_cache_all > > mov x0, x20 > + mov x1, x21 > ret x19 > ENDPROC(cpu_soft_restart) > > -- > 1.9.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
On Fri, Oct 24, 2014 at 11:51 AM, Mark Rutland <mark.rutland@arm.com> wrote: > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: >> Device tree regions described by /memreserve/ entries are not available in >> /proc/device-tree, and hence are not available to user space utilities that use >> /proc/device-tree. In order for these regions to be available, convert any >> arm64 DTS files using /memreserve/ entries to use reserved-memory nodes. > > The limitation here is in the kernel (and a partially in userspace), so > modifying the dts files is a workaround rather than a fix. > > It's perfectly valid for people to remain using /memreserve/, so this > isn't sufficient. There are also existing DTBs using /memreserve/ which > we can't rely on being modified to use reserved-memory. > > I think we need to expose memreserves to userspace somehow, potentially > along with other DTB header fields. Grant, ideas? Yes, I suggested the same thing to Geoff in a separate thread. Here's what I wrote: >> Geoff Levand wrote: >>> I did some work on this and I think we just need to convert all the >>> arm64 dts from /memreserve/ to reserved-memory nodes and update the >>> arm64 booting.txt to specify using reserved-memory. I'll prepare a >>> patch for it. >> >> I don't think that is going to be entirely sufficient. There will be >> platforms that don't get converted over, and this is a generic problem >> that covers all architectures using DT, not just aarch64. The solution >> probably needs to include exposing the /memreserve/ sections to >> userspace. I can see two ways to do this: >> >> 1) Create a new property in /sysfs with all the memreserve sctions >> 2) Live-modify the device tree to put the memreserve data into a node >> at boot time. >> >> Option 2 is probably the most generic solution, but it will require >> some care to make sure there aren't any overlaps if a reserved-memory >> node already exists. >> >> g. > > For reference, here is an old conversation about this exact thing. > Reading through it, the opinions I expressed then don't necessarily > match what I think now. I still don't think it is a good idea to > expose the physical address of the old .dtb blob, but I do agree that > the memreserve sections need to be exposed. > > https://lists.ozlabs.org/pipermail/linuxppc-dev/2010-July/083993.html > > g.
On Fri, Oct 24, 2014 at 11:54 AM, Mark Rutland <mark.rutland@arm.com> wrote: > > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > > Change any reference of device tree '/memreserve/' entries in the arm64 > > booting.txt to refer to 'reserved-memory nodes'. Reserved-memory nodes > > are the preferred method of specifying reserved memory. > > Per my comments on patch 5, I don't think this change is sufficient. > > However, we should probably update the document to allow reserved-memory > nodes. > > On an unrelated note we probably need to work out how reserved-memory > interacts with the UEFI memory map -- unmappable regions shouldn't be > described by UEFI and I hope people don't use reserved-memory as a > workaround for broken UEFI tables. When booting with UEFI, the boot stub will clear out all memory nodes and (should) clear out reserved regions so that the kernel can use the UEFI memory map as authoritative. g.
On Fri, Oct 24, 2014 at 12:04:00PM +0100, Grant Likely wrote: > On Fri, Oct 24, 2014 at 11:54 AM, Mark Rutland <mark.rutland@arm.com> wrote: > > > > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > > > Change any reference of device tree '/memreserve/' entries in the arm64 > > > booting.txt to refer to 'reserved-memory nodes'. Reserved-memory nodes > > > are the preferred method of specifying reserved memory. > > > > Per my comments on patch 5, I don't think this change is sufficient. > > > > However, we should probably update the document to allow reserved-memory > > nodes. > > > > On an unrelated note we probably need to work out how reserved-memory > > interacts with the UEFI memory map -- unmappable regions shouldn't be > > described by UEFI and I hope people don't use reserved-memory as a > > workaround for broken UEFI tables. > > > When booting with UEFI, the boot stub will clear out all memory nodes > and (should) clear out reserved regions so that the kernel can use the > UEFI memory map as authoritative. We clear memory nodes and memreserves currently. I was thinking about reserved-memory nodes (for CMA and such), which we don't currently clear. Mark.
On Fri, Oct 24, 2014 at 11:59:38AM +0100, Grant Likely wrote: > On Fri, Oct 24, 2014 at 11:51 AM, Mark Rutland <mark.rutland@arm.com> wrote: > > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > >> Device tree regions described by /memreserve/ entries are not available in > >> /proc/device-tree, and hence are not available to user space utilities that use > >> /proc/device-tree. In order for these regions to be available, convert any > >> arm64 DTS files using /memreserve/ entries to use reserved-memory nodes. > > > > The limitation here is in the kernel (and a partially in userspace), so > > modifying the dts files is a workaround rather than a fix. > > > > It's perfectly valid for people to remain using /memreserve/, so this > > isn't sufficient. There are also existing DTBs using /memreserve/ which > > we can't rely on being modified to use reserved-memory. > > > > I think we need to expose memreserves to userspace somehow, potentially > > along with other DTB header fields. Grant, ideas? > > Yes, I suggested the same thing to Geoff in a separate thread. Here's > what I wrote: > > >> Geoff Levand wrote: > >>> I did some work on this and I think we just need to convert all the > >>> arm64 dts from /memreserve/ to reserved-memory nodes and update the > >>> arm64 booting.txt to specify using reserved-memory. I'll prepare a > >>> patch for it. > >> > >> I don't think that is going to be entirely sufficient. There will be > >> platforms that don't get converted over, and this is a generic problem > >> that covers all architectures using DT, not just aarch64. The solution > >> probably needs to include exposing the /memreserve/ sections to > >> userspace. I can see two ways to do this: > >> > >> 1) Create a new property in /sysfs with all the memreserve sctions > >> 2) Live-modify the device tree to put the memreserve data into a node > >> at boot time. > >> > >> Option 2 is probably the most generic solution, but it will require > >> some care to make sure there aren't any overlaps if a reserved-memory > >> node already exists. I would prefer the former currently. While I currently believe that modifying the tree is something we're going to have to do for stateful properties, it's not someting I want to have to do unless absolutely necessary. > >> > >> g. > > > > For reference, here is an old conversation about this exact thing. > > Reading through it, the opinions I expressed then don't necessarily > > match what I think now. I still don't think it is a good idea to > > expose the physical address of the old .dtb blob, but I do agree that > > the memreserve sections need to be exposed. > > > > https://lists.ozlabs.org/pipermail/linuxppc-dev/2010-July/083993.html Another option would be to expose the original DTB as a (read-only) binary file somewhere. That might interact poorly with live tree modification in future, however. Mark.
On Fri, Oct 24, 2014 at 1:18 PM, Mark Rutland <mark.rutland@arm.com> wrote: > On Fri, Oct 24, 2014 at 12:04:00PM +0100, Grant Likely wrote: >> On Fri, Oct 24, 2014 at 11:54 AM, Mark Rutland <mark.rutland@arm.com> wrote: >> > >> > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: >> > > Change any reference of device tree '/memreserve/' entries in the arm64 >> > > booting.txt to refer to 'reserved-memory nodes'. Reserved-memory nodes >> > > are the preferred method of specifying reserved memory. >> > >> > Per my comments on patch 5, I don't think this change is sufficient. >> > >> > However, we should probably update the document to allow reserved-memory >> > nodes. >> > >> > On an unrelated note we probably need to work out how reserved-memory >> > interacts with the UEFI memory map -- unmappable regions shouldn't be >> > described by UEFI and I hope people don't use reserved-memory as a >> > workaround for broken UEFI tables. >> >> >> When booting with UEFI, the boot stub will clear out all memory nodes >> and (should) clear out reserved regions so that the kernel can use the >> UEFI memory map as authoritative. > > We clear memory nodes and memreserves currently. > > I was thinking about reserved-memory nodes (for CMA and such), which we > don't currently clear. We should either clear them, or make sure that they will coexist happily with the UEFI map. I can think of a few situations, like CMA, where still having the reserved-memory node would be a good idea. g.
On Fri, Oct 24, 2014 at 02:54:02PM +0100, Grant Likely wrote: > On Fri, Oct 24, 2014 at 1:18 PM, Mark Rutland <mark.rutland@arm.com> wrote: > > On Fri, Oct 24, 2014 at 12:04:00PM +0100, Grant Likely wrote: > >> On Fri, Oct 24, 2014 at 11:54 AM, Mark Rutland <mark.rutland@arm.com> wrote: > >> > > >> > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > >> > > Change any reference of device tree '/memreserve/' entries in the arm64 > >> > > booting.txt to refer to 'reserved-memory nodes'. Reserved-memory nodes > >> > > are the preferred method of specifying reserved memory. > >> > > >> > Per my comments on patch 5, I don't think this change is sufficient. > >> > > >> > However, we should probably update the document to allow reserved-memory > >> > nodes. > >> > > >> > On an unrelated note we probably need to work out how reserved-memory > >> > interacts with the UEFI memory map -- unmappable regions shouldn't be > >> > described by UEFI and I hope people don't use reserved-memory as a > >> > workaround for broken UEFI tables. > >> > >> > >> When booting with UEFI, the boot stub will clear out all memory nodes > >> and (should) clear out reserved regions so that the kernel can use the > >> UEFI memory map as authoritative. > > > > We clear memory nodes and memreserves currently. > > > > I was thinking about reserved-memory nodes (for CMA and such), which we > > don't currently clear. > > We should either clear them, or make sure that they will coexist > happily with the UEFI map. I can think of a few situations, like CMA, > where still having the reserved-memory node would be a good idea. My thinking would be that reservations which the kernel could choose to ignore for whatever reason and use for general allocation (e.g. CMA) make sense, but anything else is nonsense if it overlaps with available memory in the UEFI memory map -- it shouldn't have been there to begin with. I don't know what the best thing to do in that case is. I'd like to explode very loudly during bringup to get vendors to fix their UEFI tables, but in the long run we might just have to reconcile the two and align with the most stringent requirement. Good luck getting another OS working in that case... Mark.
On Fri, Oct 24, 2014 at 1:27 PM, Mark Rutland <mark.rutland@arm.com> wrote: > On Fri, Oct 24, 2014 at 11:59:38AM +0100, Grant Likely wrote: >> On Fri, Oct 24, 2014 at 11:51 AM, Mark Rutland <mark.rutland@arm.com> wrote: >> > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: >> >> Device tree regions described by /memreserve/ entries are not available in >> >> /proc/device-tree, and hence are not available to user space utilities that use >> >> /proc/device-tree. In order for these regions to be available, convert any >> >> arm64 DTS files using /memreserve/ entries to use reserved-memory nodes. >> > >> > The limitation here is in the kernel (and a partially in userspace), so >> > modifying the dts files is a workaround rather than a fix. >> > >> > It's perfectly valid for people to remain using /memreserve/, so this >> > isn't sufficient. There are also existing DTBs using /memreserve/ which >> > we can't rely on being modified to use reserved-memory. >> > >> > I think we need to expose memreserves to userspace somehow, potentially >> > along with other DTB header fields. Grant, ideas? >> >> Yes, I suggested the same thing to Geoff in a separate thread. Here's >> what I wrote: >> >> >> Geoff Levand wrote: >> >>> I did some work on this and I think we just need to convert all the >> >>> arm64 dts from /memreserve/ to reserved-memory nodes and update the >> >>> arm64 booting.txt to specify using reserved-memory. I'll prepare a >> >>> patch for it. >> >> >> >> I don't think that is going to be entirely sufficient. There will be >> >> platforms that don't get converted over, and this is a generic problem >> >> that covers all architectures using DT, not just aarch64. The solution >> >> probably needs to include exposing the /memreserve/ sections to >> >> userspace. I can see two ways to do this: >> >> >> >> 1) Create a new property in /sysfs with all the memreserve sctions >> >> 2) Live-modify the device tree to put the memreserve data into a node >> >> at boot time. >> >> >> >> Option 2 is probably the most generic solution, but it will require >> >> some care to make sure there aren't any overlaps if a reserved-memory >> >> node already exists. > > I would prefer the former currently. While I currently believe that > modifying the tree is something we're going to have to do for stateful > properties, it's not someting I want to have to do unless absolutely > necessary. We already have a file in debugfs that will expose the entire dtb to userspace, but I strongly discourage using that feature in production. Search for "DEBUG_FS" in fdt.c g.
On Fri, Oct 24, 2014 at 3:10 PM, Mark Rutland <mark.rutland@arm.com> wrote: > On Fri, Oct 24, 2014 at 02:54:02PM +0100, Grant Likely wrote: >> On Fri, Oct 24, 2014 at 1:18 PM, Mark Rutland <mark.rutland@arm.com> wrote: >> > On Fri, Oct 24, 2014 at 12:04:00PM +0100, Grant Likely wrote: >> >> On Fri, Oct 24, 2014 at 11:54 AM, Mark Rutland <mark.rutland@arm.com> wrote: >> >> > >> >> > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: >> >> > > Change any reference of device tree '/memreserve/' entries in the arm64 >> >> > > booting.txt to refer to 'reserved-memory nodes'. Reserved-memory nodes >> >> > > are the preferred method of specifying reserved memory. >> >> > >> >> > Per my comments on patch 5, I don't think this change is sufficient. >> >> > >> >> > However, we should probably update the document to allow reserved-memory >> >> > nodes. >> >> > >> >> > On an unrelated note we probably need to work out how reserved-memory >> >> > interacts with the UEFI memory map -- unmappable regions shouldn't be >> >> > described by UEFI and I hope people don't use reserved-memory as a >> >> > workaround for broken UEFI tables. >> >> >> >> >> >> When booting with UEFI, the boot stub will clear out all memory nodes >> >> and (should) clear out reserved regions so that the kernel can use the >> >> UEFI memory map as authoritative. >> > >> > We clear memory nodes and memreserves currently. >> > >> > I was thinking about reserved-memory nodes (for CMA and such), which we >> > don't currently clear. >> >> We should either clear them, or make sure that they will coexist >> happily with the UEFI map. I can think of a few situations, like CMA, >> where still having the reserved-memory node would be a good idea. > > My thinking would be that reservations which the kernel could choose to > ignore for whatever reason and use for general allocation (e.g. CMA) > make sense, but anything else is nonsense if it overlaps with available > memory in the UEFI memory map -- it shouldn't have been there to begin > with. > > I don't know what the best thing to do in that case is. I'd like to > explode very loudly during bringup to get vendors to fix their UEFI > tables, but in the long run we might just have to reconcile the two and > align with the most stringent requirement. Good luck getting another OS > working in that case... On ACPI platforms this is a non-issue. On UEFI+FDT platforms there are unlikely to be any other operating systems significantly affected by this. g.
On Fri, Oct 24, 2014 at 10:24:51AM +0100, Mark Rutland wrote: > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > > Some of the macros defined in kvm_arm.h are useful in assembly files, but are > > not compatible with the assembler. Change any C language integer constant > > definitions using appended U, UL, or ULL to the UL() preprocessor macro. Also, > > add a preprocessor include of the asm/memory.h file which defines the UL() > > macro. > > > > Fixes build errors like these when using kvm_arm.h in assembly > > source files: > > > > Error: unexpected characters following instruction at operand 3 -- `and x0,x1,#((1U<<25)-1)' > > > > Signed-off-by: Geoff Levand <geoff@infradead.org> > > Thanks for putting this together, Geoff. > > Acked-by: Mark Rutland <mark.rutland@arm.com> This doesn't apply against -rc2 (see below). If you send a rebased version of this patch, I'm happy to queue it for 3.19. Alternatively, it could go via the kvm tree. Christoffer, Marc, do you have a preference? (the kexec stuff depends on this, so that's why I was thinking of queuing it in the arm64 tree). Thanks, Will --->8 Applying: arm64/kvm: Fix assembler compatibility of macros error: patch failed: arch/arm64/include/asm/kvm_arm.h:149 error: arch/arm64/include/asm/kvm_arm.h: patch does not apply Patch failed at 0001 arm64/kvm: Fix assembler compatibility of macros The copy of the patch that failed is found in: /home/will/work/aarch32/linux/linux/.git/rebase-apply/patch When you have resolved this problem, run "git am --continue". If you prefer to skip this patch, run "git am --skip" instead. To restore the original branch and stop patching, run "git am --abort".
On Mon, Oct 27, 2014 at 1:13 PM, Will Deacon <will.deacon@arm.com> wrote: > On Fri, Oct 24, 2014 at 10:24:51AM +0100, Mark Rutland wrote: >> On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: >> > Some of the macros defined in kvm_arm.h are useful in assembly files, but are >> > not compatible with the assembler. Change any C language integer constant >> > definitions using appended U, UL, or ULL to the UL() preprocessor macro. Also, >> > add a preprocessor include of the asm/memory.h file which defines the UL() >> > macro. >> > >> > Fixes build errors like these when using kvm_arm.h in assembly >> > source files: >> > >> > Error: unexpected characters following instruction at operand 3 -- `and x0,x1,#((1U<<25)-1)' >> > >> > Signed-off-by: Geoff Levand <geoff@infradead.org> >> >> Thanks for putting this together, Geoff. >> >> Acked-by: Mark Rutland <mark.rutland@arm.com> > > This doesn't apply against -rc2 (see below). If you send a rebased version > of this patch, I'm happy to queue it for 3.19. > > Alternatively, it could go via the kvm tree. Christoffer, Marc, do you have > a preference? (the kexec stuff depends on this, so that's why I was thinking > of queuing it in the arm64 tree). > I'm fine with you picking it up. Thanks, -Chrsitoffer
Hi Dave, Thanks for testing. On Fri, 2014-10-31 at 15:52 +0800, Dave Young wrote: > I tested your patches. The macihne is using spin-table cpu enable method > so I tried maxcpus=1 as you suggested. > > There's below issues for me, thoughts? > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > to use. You can supply a DTB with the kexec --dtb option, then kexec will not need /proc/device-tree. Please try if you can and let me know what happens. > 2. adding "acpi=off" to 1st kernel boot cmdline, kexec load fails with error > like below: > machine_apply_elf_rel: ERROR Unknown type: 261 > > I did below hack then kexec load works fine, OK, thanks, I'll add support for R_AARCH64_PREL32 in. > but `kexec -e` still does not work > there's nothing more than "Disableing non-boot CPUS ...\n Bye!": It is really tough to say what happened. The 'Bye!' message is printed just before the 1st stage kernel exits and the 2nd stage kernel is entered. If you have a debugger you can put some breakpoints in there to see how far it gets. That is generally how I figure out what went wrong. You could try the kexec --lite option, it will do a non-purgatory boot. You could also try my master branch that has more debugging output. Some of it is through the VE's serial port, so you may need to change that to what your platform has. -Geoff
Hi, On Fri, 2014-10-24 at 13:27 +0100, Mark Rutland wrote: > On Fri, Oct 24, 2014 at 11:59:38AM +0100, Grant Likely wrote: > > On Fri, Oct 24, 2014 at 11:51 AM, Mark Rutland <mark.rutland@arm.com> wrote: > > > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > > >> Device tree regions described by /memreserve/ entries are not available in > > >> /proc/device-tree, and hence are not available to user space utilities that use > > >> /proc/device-tree. In order for these regions to be available, convert any > > >> arm64 DTS files using /memreserve/ entries to use reserved-memory nodes. > > > > > > The limitation here is in the kernel (and a partially in userspace), so > > > modifying the dts files is a workaround rather than a fix. > > > > > > It's perfectly valid for people to remain using /memreserve/, so this > > > isn't sufficient. There are also existing DTBs using /memreserve/ which > > > we can't rely on being modified to use reserved-memory. > > > > > > I think we need to expose memreserves to userspace somehow, potentially > > > along with other DTB header fields. Grant, ideas? > > > > Yes, I suggested the same thing to Geoff in a separate thread. Here's > > what I wrote: > > > > >> Geoff Levand wrote: > > >>> I did some work on this and I think we just need to convert all the > > >>> arm64 dts from /memreserve/ to reserved-memory nodes and update the > > >>> arm64 booting.txt to specify using reserved-memory. I'll prepare a > > >>> patch for it. > > >> > > >> I don't think that is going to be entirely sufficient. There will be > > >> platforms that don't get converted over, and this is a generic problem > > >> that covers all architectures using DT, not just aarch64. The solution > > >> probably needs to include exposing the /memreserve/ sections to > > >> userspace. I can see two ways to do this: > > >> > > >> 1) Create a new property in /sysfs with all the memreserve sctions > > >> 2) Live-modify the device tree to put the memreserve data into a node > > >> at boot time. > > >> > > >> Option 2 is probably the most generic solution, but it will require > > >> some care to make sure there aren't any overlaps if a reserved-memory > > >> node already exists. > > I would prefer the former currently. While I currently believe that > modifying the tree is something we're going to have to do for stateful > properties, it's not someting I want to have to do unless absolutely > necessary. Current user space kexec utilities use /proc/device-tree and nothing else. The intension of the device tree is to describe the system sufficiently for a kernel to boot, so I think we should put the /memreserve/ info into /proc/device-tree. We could put the /memreserve/ entries in there directly, or convert to reserved-memory nodes. At the moment I like the idea to convert to reserved-memory nodes. > > > For reference, here is an old conversation about this exact thing. > > > Reading through it, the opinions I expressed then don't necessarily > > > match what I think now. I still don't think it is a good idea to > > > expose the physical address of the old .dtb blob, but I do agree that > > > the memreserve sections need to be exposed. > > > > > > https://lists.ozlabs.org/pipermail/linuxppc-dev/2010-July/083993.html > > Another option would be to expose the original DTB as a (read-only) > binary file somewhere. That might interact poorly with live tree > modification in future, however. I don't like the idea of having two interfaces to get essentially the same info. I think it will be a maintenance problem over time. -Geoff
On Fri, 2014-10-24 at 11:57 +0100, Mark Rutland wrote: > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > > When a CPU is reset it needs to be put into the exception level it had when it > > entered the kernel. Update cpu_reset() to accept an argument el2_switch which > > signals cpu_reset() to enter the soft reset address at EL2. If el2_switch is > > not set the soft reset address will be entered at EL1. > > > > Update cpu_soft_restart() and soft_restart() to pass the return of > > is_hyp_mode_available() as the el2_switch value to cpu_reset(). Also update the > > comments of cpu_reset(), cpu_soft_restart() and soft_restart() to reflect this > > change. > > This will blow up without warning with KVM, and I think we need to > address that first. Yes. I think we can just put in a conditional on KVM as a workaround until KVM will work with this. -Geoff
On Fri, 2014-10-24 at 11:31 +0100, Mark Rutland wrote: > On Fri, Oct 24, 2014 at 12:10:59AM +0100, Geoff Levand wrote: > > Signed-off-by: Geoff Levand <geoff@infradead.org> > > --- > > arch/arm64/configs/defconfig | 1 + > > 1 file changed, 1 insertion(+) > > > > diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig > > index d92ef3c..ebf8b3f 100644 > > --- a/arch/arm64/configs/defconfig > > +++ b/arch/arm64/configs/defconfig > > @@ -39,6 +39,7 @@ CONFIG_PREEMPT=y > > CONFIG_KSM=y > > CONFIG_TRANSPARENT_HUGEPAGE=y > > CONFIG_CMA=y > > +CONFIG_KEXEC=y > > Given this is going to be incompatible with KVM, and KVM is already in > defconfig, I don't think we can add this until that incompatibility is > fixed (or at the very least detected and handled gracefully). Sure, I can put in a workaround to print out a message and fail the kexec_load sys call if KVM is configured. -Geoff
On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: > Hi Geoff > > I tested your patches. The macihne is using spin-table cpu enable method > so I tried maxcpus=1 as you suggested. > > There's below issues for me, thoughts? > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > to use. Are you absolutely certain of this? To use ACPI, you must have booted via EFI, as the only mechanism for finding the ACPI tables is via EFI. If booted via EFI, the stub will have created a stub DTB if there is no provided DTB, to pass the command line and pointers to EFI data structures. This stub DTB should be present in the usual place. Thanks, Mark.
On Fri, Oct 31, 2014 at 11:44:52PM +0000, Geoff Levand wrote: > Hi, > > On Fri, 2014-10-24 at 13:27 +0100, Mark Rutland wrote: > > On Fri, Oct 24, 2014 at 11:59:38AM +0100, Grant Likely wrote: > > > On Fri, Oct 24, 2014 at 11:51 AM, Mark Rutland <mark.rutland@arm.com> wrote: > > > > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: > > > >> Device tree regions described by /memreserve/ entries are not available in > > > >> /proc/device-tree, and hence are not available to user space utilities that use > > > >> /proc/device-tree. In order for these regions to be available, convert any > > > >> arm64 DTS files using /memreserve/ entries to use reserved-memory nodes. > > > > > > > > The limitation here is in the kernel (and a partially in userspace), so > > > > modifying the dts files is a workaround rather than a fix. > > > > > > > > It's perfectly valid for people to remain using /memreserve/, so this > > > > isn't sufficient. There are also existing DTBs using /memreserve/ which > > > > we can't rely on being modified to use reserved-memory. > > > > > > > > I think we need to expose memreserves to userspace somehow, potentially > > > > along with other DTB header fields. Grant, ideas? > > > > > > Yes, I suggested the same thing to Geoff in a separate thread. Here's > > > what I wrote: > > > > > > >> Geoff Levand wrote: > > > >>> I did some work on this and I think we just need to convert all the > > > >>> arm64 dts from /memreserve/ to reserved-memory nodes and update the > > > >>> arm64 booting.txt to specify using reserved-memory. I'll prepare a > > > >>> patch for it. > > > >> > > > >> I don't think that is going to be entirely sufficient. There will be > > > >> platforms that don't get converted over, and this is a generic problem > > > >> that covers all architectures using DT, not just aarch64. The solution > > > >> probably needs to include exposing the /memreserve/ sections to > > > >> userspace. I can see two ways to do this: > > > >> > > > >> 1) Create a new property in /sysfs with all the memreserve sctions > > > >> 2) Live-modify the device tree to put the memreserve data into a node > > > >> at boot time. > > > >> > > > >> Option 2 is probably the most generic solution, but it will require > > > >> some care to make sure there aren't any overlaps if a reserved-memory > > > >> node already exists. > > > > I would prefer the former currently. While I currently believe that > > modifying the tree is something we're going to have to do for stateful > > properties, it's not someting I want to have to do unless absolutely > > necessary. > > Current user space kexec utilities use /proc/device-tree and nothing > else. The intension of the device tree is to describe the system > sufficiently for a kernel to boot, so I think we should put the > /memreserve/ info into /proc/device-tree. I don't follow. The intention of device tree generically has precisely nothing to do with the Linux-specific filesystem hierarchy for exposing device tree to userspace. The memreserve entries aren't the same as other elements in /proc/device-tree. They're a flat table pointed to by the header rather than a hierarchy of nodes and properties. There's no way to expose that difference under /proc/device-tree, so exposing that as a parallel hierarchy makes the most sense to me. > We could put the /memreserve/ entries in there directly, or convert > to reserved-memory nodes. At the moment I like the idea to convert to > reserved-memory nodes. I think we need a parallel hierarchy to expose header information (though we likely only need the memreserve entries). I'm not keen on rewriting the device tree to work around a historical mistake. > > > > For reference, here is an old conversation about this exact thing. > > > > Reading through it, the opinions I expressed then don't necessarily > > > > match what I think now. I still don't think it is a good idea to > > > > expose the physical address of the old .dtb blob, but I do agree that > > > > the memreserve sections need to be exposed. > > > > > > > > https://lists.ozlabs.org/pipermail/linuxppc-dev/2010-July/083993.html > > > > Another option would be to expose the original DTB as a (read-only) > > binary file somewhere. That might interact poorly with live tree > > modification in future, however. > > I don't like the idea of having two interfaces to get essentially the same > info. I think it will be a maintenance problem over time. Given Grant's comments, I'm perfectly happy to not expose the raw DTB for situations otehr than debugging. Thanks, Mark.
On Fri, Oct 31, 2014 at 11:50:15PM +0000, Geoff Levand wrote: > On Fri, 2014-10-24 at 11:31 +0100, Mark Rutland wrote: > > On Fri, Oct 24, 2014 at 12:10:59AM +0100, Geoff Levand wrote: > > > Signed-off-by: Geoff Levand <geoff@infradead.org> > > > --- > > > arch/arm64/configs/defconfig | 1 + > > > 1 file changed, 1 insertion(+) > > > > > > diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig > > > index d92ef3c..ebf8b3f 100644 > > > --- a/arch/arm64/configs/defconfig > > > +++ b/arch/arm64/configs/defconfig > > > @@ -39,6 +39,7 @@ CONFIG_PREEMPT=y > > > CONFIG_KSM=y > > > CONFIG_TRANSPARENT_HUGEPAGE=y > > > CONFIG_CMA=y > > > +CONFIG_KEXEC=y > > > > Given this is going to be incompatible with KVM, and KVM is already in > > defconfig, I don't think we can add this until that incompatibility is > > fixed (or at the very least detected and handled gracefully). > > Sure, I can put in a workaround to print out a message and fail the > kexec_load sys call if KVM is configured. What would be the plan for fixing up kexec to work with kvm were we to do that? Thanks, Mark.
Adding Leif... On Fri, Oct 31, 2014 at 6:44 PM, Geoff Levand <geoff@infradead.org> wrote: > Hi, > > On Fri, 2014-10-24 at 13:27 +0100, Mark Rutland wrote: >> On Fri, Oct 24, 2014 at 11:59:38AM +0100, Grant Likely wrote: >> > On Fri, Oct 24, 2014 at 11:51 AM, Mark Rutland <mark.rutland@arm.com> wrote: >> > > On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote: >> > >> Device tree regions described by /memreserve/ entries are not available in >> > >> /proc/device-tree, and hence are not available to user space utilities that use >> > >> /proc/device-tree. In order for these regions to be available, convert any >> > >> arm64 DTS files using /memreserve/ entries to use reserved-memory nodes. >> > > >> > > The limitation here is in the kernel (and a partially in userspace), so >> > > modifying the dts files is a workaround rather than a fix. >> > > >> > > It's perfectly valid for people to remain using /memreserve/, so this >> > > isn't sufficient. There are also existing DTBs using /memreserve/ which >> > > we can't rely on being modified to use reserved-memory. >> > > >> > > I think we need to expose memreserves to userspace somehow, potentially >> > > along with other DTB header fields. Grant, ideas? >> > >> > Yes, I suggested the same thing to Geoff in a separate thread. Here's >> > what I wrote: >> > >> > >> Geoff Levand wrote: >> > >>> I did some work on this and I think we just need to convert all the >> > >>> arm64 dts from /memreserve/ to reserved-memory nodes and update the >> > >>> arm64 booting.txt to specify using reserved-memory. I'll prepare a >> > >>> patch for it. >> > >> >> > >> I don't think that is going to be entirely sufficient. There will be >> > >> platforms that don't get converted over, and this is a generic problem >> > >> that covers all architectures using DT, not just aarch64. The solution >> > >> probably needs to include exposing the /memreserve/ sections to >> > >> userspace. I can see two ways to do this: >> > >> >> > >> 1) Create a new property in /sysfs with all the memreserve sctions >> > >> 2) Live-modify the device tree to put the memreserve data into a node >> > >> at boot time. >> > >> >> > >> Option 2 is probably the most generic solution, but it will require >> > >> some care to make sure there aren't any overlaps if a reserved-memory >> > >> node already exists. >> >> I would prefer the former currently. While I currently believe that >> modifying the tree is something we're going to have to do for stateful >> properties, it's not someting I want to have to do unless absolutely >> necessary. > > Current user space kexec utilities use /proc/device-tree and nothing > else. The intension of the device tree is to describe the system > sufficiently for a kernel to boot, so I think we should put the > /memreserve/ info into /proc/device-tree. > > We could put the /memreserve/ entries in there directly, or convert > to reserved-memory nodes. At the moment I like the idea to convert to > reserved-memory nodes. I'm just wondering does UEFI being used for the memory information have any impact here as the DT would not have valid memory nodes either? I'd assume reserved memory comes from UEFI (or both) in that case? Perhaps we need to expose memory layout independent of DT, UEFI or anything else. Rob > >> > > For reference, here is an old conversation about this exact thing. >> > > Reading through it, the opinions I expressed then don't necessarily >> > > match what I think now. I still don't think it is a good idea to >> > > expose the physical address of the old .dtb blob, but I do agree that >> > > the memreserve sections need to be exposed. >> > > >> > > https://lists.ozlabs.org/pipermail/linuxppc-dev/2010-July/083993.html >> >> Another option would be to expose the original DTB as a (read-only) >> binary file somewhere. That might interact poorly with live tree >> modification in future, however. > > I don't like the idea of having two interfaces to get essentially the same > info. I think it will be a maintenance problem over time. > > -Geoff > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Hi Mark, On Mon, 2014-11-03 at 20:05 +0000, Mark Rutland wrote: > On Fri, Oct 31, 2014 at 11:50:15PM +0000, Geoff Levand wrote: > > On Fri, 2014-10-24 at 11:31 +0100, Mark Rutland wrote: > > > On Fri, Oct 24, 2014 at 12:10:59AM +0100, Geoff Levand wrote: > > > > Signed-off-by: Geoff Levand <geoff@infradead.org> > > > > --- > > > > arch/arm64/configs/defconfig | 1 + > > > > 1 file changed, 1 insertion(+) > > > > > > > > diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig > > > > index d92ef3c..ebf8b3f 100644 > > > > --- a/arch/arm64/configs/defconfig > > > > +++ b/arch/arm64/configs/defconfig > > > > @@ -39,6 +39,7 @@ CONFIG_PREEMPT=y > > > > CONFIG_KSM=y > > > > CONFIG_TRANSPARENT_HUGEPAGE=y > > > > CONFIG_CMA=y > > > > +CONFIG_KEXEC=y > > > > > > Given this is going to be incompatible with KVM, and KVM is already in > > > defconfig, I don't think we can add this until that incompatibility is > > > fixed (or at the very least detected and handled gracefully). > > > > Sure, I can put in a workaround to print out a message and fail the > > kexec_load sys call if KVM is configured. > > What would be the plan for fixing up kexec to work with kvm were we to > do that? As we discussed at on this ML and in detail at Linaro Connect, KVM needs to shut itself down properly and free up the resources it acquired at startup, mainly the hyp mode vectors. I don't have any time free to do that now, but if no one fixes it by the time I do have some time, then I'll work on it. Kexec needs no fixing for this that I know of. -Geoff
On Mon, Nov 03, 2014 at 10:26:02PM +0000, Rob Herring wrote: > Adding Leif... [...] > I'm just wondering does UEFI being used for the memory information > have any impact here as the DT would not have valid memory nodes > either? For UEFI, things should just work (TM). The dtb will be free of memory nodes (and memreserves), and the next kernel will discover the UEFI memory map via the linux,uefi-mmap-* properties as the first kernel did. We shouldn't need to pass any additional data in this case. > I'd assume reserved memory comes from UEFI (or both) in that > case? Yes. Any memory that should be reserved should be described as such in the UEFI memory map. One caveat is that reserved-memory nodes are preserved (because they can be referred to by phandle for CMA and such). There are few potential problems with this (we'll have to reconcile it with the UEFI memory map, prevent overlap when the kernel is relocated, etc). The reserved-memory nodes should only be used for CMA type carve outs rather than for protecting firmware and similar. > Perhaps we need to expose memory layout independent of DT, UEFI > or anything else. I can see that being useful for debugging, but I don't think that we require it. Thanks, Mark.
On Mon, Nov 3, 2014 at 10:26 PM, Rob Herring <robherring2@gmail.com> wrote: > On Fri, Oct 31, 2014 at 6:44 PM, Geoff Levand <geoff@infradead.org> wrote: >> On Fri, 2014-10-24 at 13:27 +0100, Mark Rutland wrote: >> Current user space kexec utilities use /proc/device-tree and nothing >> else. The intension of the device tree is to describe the system >> sufficiently for a kernel to boot, so I think we should put the >> /memreserve/ info into /proc/device-tree. >> >> We could put the /memreserve/ entries in there directly, or convert >> to reserved-memory nodes. At the moment I like the idea to convert to >> reserved-memory nodes. > > I'm just wondering does UEFI being used for the memory information > have any impact here as the DT would not have valid memory nodes > either? I'd assume reserved memory comes from UEFI (or both) in that > case? It should, but as always the answer is not simple. UEFI+DT, it may be that a CMA region is specified in /reserved-memory and we would want to respect that, even when getting memory information from UEFI. However, the /memreserve/ section shouldn't be an issue in this case. Anything firmware needs protected must be reflected in the UEFI memory map. g.
On 11/03/14 at 07:46pm, Mark Rutland wrote: > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: > > Hi Geoff > > > > I tested your patches. The macihne is using spin-table cpu enable method > > so I tried maxcpus=1 as you suggested. > > > > There's below issues for me, thoughts? > > > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > > to use. > > Are you absolutely certain of this? > > To use ACPI, you must have booted via EFI, as the only mechanism for > finding the ACPI tables is via EFI. If booted via EFI, the stub will > have created a stub DTB if there is no provided DTB, to pass the command > line and pointers to EFI data structures. This stub DTB should be > present in the usual place. Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs maybe I can pass external dtb to kexec-tools. What you mentioned should be true though but I have not get idea how to get the dtb which kernel is using for boot since it is not unflattened. Thanks Dave
On 10/31/14 at 04:25pm, Geoff Levand wrote: > Hi Dave, > > Thanks for testing. > > On Fri, 2014-10-31 at 15:52 +0800, Dave Young wrote: > > I tested your patches. The macihne is using spin-table cpu enable method > > so I tried maxcpus=1 as you suggested. > > > > There's below issues for me, thoughts? > > > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > > to use. > > You can supply a DTB with the kexec --dtb option, then kexec will not need > /proc/device-tree. Please try if you can and let me know what happens. I remember I tried but kexec load fails due to kexec-tools is still trying to access proc flattened dtb. I have little time last few days, will test it again to confirm. > > > 2. adding "acpi=off" to 1st kernel boot cmdline, kexec load fails with error > > like below: > > machine_apply_elf_rel: ERROR Unknown type: 261 > > > > I did below hack then kexec load works fine, > > OK, thanks, I'll add support for R_AARCH64_PREL32 in. > > > but `kexec -e` still does not work > > there's nothing more than "Disableing non-boot CPUS ...\n Bye!": > > It is really tough to say what happened. The 'Bye!' message is printed > just before the 1st stage kernel exits and the 2nd stage kernel is > entered. If you have a debugger you can put some breakpoints in > there to see how far it gets. That is generally how I figure out > what went wrong. > > You could try the kexec --lite option, it will do a non-purgatory > boot. > > You could also try my master branch that has more debugging output. > Some of it is through the VE's serial port, so you may need to > change that to what your platform has. I think I have will have to check if I can find a way to debug as you have done in your master branch. Thanks a lot Dave
Hi Geoff, I am trying this on my hardware (apm-mustang.dtb) On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: > Hi All, > > This series adds the core support for kexec re-boots on arm64. I have tested > with the ARM VE fast model, the ARM Base model and the ARM Foundation > model with various kernel config options for both the first and second stage > kernels. > > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to > kexec-tools [2], which have not yet been merged upstream, are needed. > > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() > routine the ability to switch exception levels from EL1 to EL2 for kernels that > were entered in EL2. > > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree > reserved-memory nodes. > > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the > dcache_line_size macro it defines can be uesd by kexec's relocate kernel > routine. > > Patches 8-10 add the actual kexec support. > > Please consider all patches for inclusion. Any comments or suggestions on how > to improve are welcome. > > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git > > Several things are known to have problems on kexec re-boot: > > spin-table > ---------- > > PROBLEM: The spin-table enable method does not implement all the methods needed > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. > > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' > on the first stage kernel command line. I have CONFIG_SMP=n > > FIX: Upgrade system firmware to provide PSCI enable method support. > > KVM > --- > > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those > resources on shutdown, so the first stage kernel cannot be shutdown properly. > > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. KVM also disabled. /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux --dtb=apm-mustang.dtb --command-line= "root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlyprintk=uart8250-32bit,0x1c020000" kexec version: 14.10.21.16.36-ga38e0a6 arch_process_options:85: command_line: root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlyprintk=uart8250-32bit,0x1 c020000 arch_process_options:87: initrd: (null) arch_process_options:88: dtb: apm-mustang.dtb arch_process_options:89: lite: 1 kernel: 0x7f756e7010 kernel_size: 0x488a308 Modified cmdline: root=/dev/nfs Unable to find /proc/device-tree//chosen/linux,stdout-path, printing from purgatory is diabled get_memory_ranges_dt:638: node_1516 memory get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 get_memory_ranges_dt:678: Success elf_arm64_load: PE format: yes p_vaddr: ffffffc000080000 virt_to_phys: ffffffc000080000 -> 0000004000080000 add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) -> 0000004000080000 - 00000040006fb000 (0067b000) elf_arm64_load: text_offset: 0000000000080000 elf_arm64_load: image_size: 000000000067f000 elf_arm64_load: page_offset: ffffffc000000000 elf_arm64_load: memstart: 0000004000000000 virt_to_phys: ffffffc000080000 -> 0000004000080000 elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 virt_to_phys: ffffffc000080000 -> 0000004000080000 Modified cmdline:root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlyprintk=uart8250-32bit,0x1c020000 Unable to find /proc/device-tree//chosen/linux,stdout-path, printing from purgatory is diabled read_cpu_info: dtb_1 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_1 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_1 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_1 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_1 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_1 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_1 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_1 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', cpu-release-addr 400000fff8 read_cpu_info: dtb_2 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', cpu-release-addr 400000fff8 check_cpu_properties: hwid-0: OK check_cpu_properties: hwid-1: OK check_cpu_properties: hwid-100: OK check_cpu_properties: hwid-101: OK check_cpu_properties: hwid-200: OK check_cpu_properties: hwid-201: OK check_cpu_properties: hwid-300: OK check_cpu_properties: hwid-301: OK dtb: base 4000700000, size 221ch (8732) add_segment_phys_virt: 0000000024c14c90 - 0000000024c16eac (0000221c) -> 0000004000700000 - 0000004000703000 (00003000) kexec_load: entry = 0x4000080000 flags = 0xb70000 nr_segments = 2 segment[0].buf = 0x7f756f7010 segment[0].bufsz = 0x645f60 segment[0].mem = 0x4000080000 segment[0].memsz = 0x67b000 segment[1].buf = 0x24c14c90 segment[1].bufsz = 0x221c segment[1].mem = 0x4000700000 segment[1].memsz = 0x3000 root@genericarmv8:~# /usr/local/sbin/kexec --lite -e kexec version: 14.10.21.16.36-ga38e0a6 arch_process_options:85: command_line: (null) arch_process_options:87: initrd: (null) arch_process_options:88: dtb: (null) arch_process_options:89: lite: 1 sd 0:0:0:0: [sda] Synchronizing SCSI cache kexec: Starting new kernel Bye! It fails to come up. In debugger I can see Core number : 0 Core state : debug (AArch64 EL1) Debug entry cause : External Debug Request Current PC : 0xffffffc000083200 Current CPSR : 0x600003c5 (EL1h) It went to the second stage. But hang. Am I missing something? --Arun
On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: > On 11/03/14 at 07:46pm, Mark Rutland wrote: > > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: > > > Hi Geoff > > > > > > I tested your patches. The macihne is using spin-table cpu enable method > > > so I tried maxcpus=1 as you suggested. > > > > > > There's below issues for me, thoughts? > > > > > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > > > to use. > > > > Are you absolutely certain of this? > > > > To use ACPI, you must have booted via EFI, as the only mechanism for > > finding the ACPI tables is via EFI. If booted via EFI, the stub will > > have created a stub DTB if there is no provided DTB, to pass the command > > line and pointers to EFI data structures. This stub DTB should be > > present in the usual place. > > Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs > maybe I can pass external dtb to kexec-tools. What you mentioned should be true > though but I have not get idea how to get the dtb which kernel is using for boot > since it is not unflattened. Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, so there's nothing to expose at in sysfs/procfs. Somehow we need to unflatten the DTB without exposing it to drivers, such that it can be exposed to userspace in the usual place but drivers don't being probing based off of it. Thanks, Mark.
On Thu, Nov 06, 2014 at 12:16:12PM +0000, Arun Chandran wrote: > Hi Geoff, > > I am trying this on my hardware (apm-mustang.dtb) > > On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: > > Hi All, > > > > This series adds the core support for kexec re-boots on arm64. I have tested > > with the ARM VE fast model, the ARM Base model and the ARM Foundation > > model with various kernel config options for both the first and second stage > > kernels. > > > > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to > > kexec-tools [2], which have not yet been merged upstream, are needed. > > > > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() > > routine the ability to switch exception levels from EL1 to EL2 for kernels that > > were entered in EL2. > > > > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree > > reserved-memory nodes. > > > > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the > > dcache_line_size macro it defines can be uesd by kexec's relocate kernel > > routine. > > > > Patches 8-10 add the actual kexec support. > > > > Please consider all patches for inclusion. Any comments or suggestions on how > > to improve are welcome. > > > > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git > > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git > > > > Several things are known to have problems on kexec re-boot: > > > > spin-table > > ---------- > > > > PROBLEM: The spin-table enable method does not implement all the methods needed > > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. > > > > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method > > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' > > on the first stage kernel command line. > > I have CONFIG_SMP=n > > > > > FIX: Upgrade system firmware to provide PSCI enable method support. > > > > KVM > > --- > > > > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those > > resources on shutdown, so the first stage kernel cannot be shutdown properly. > > > > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. > > KVM also disabled. > > /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux > --dtb=apm-mustang.dtb --command-line= > "root=/dev/nfs rw > nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > earlyprintk=uart8250-32bit,0x1c020000" > > kexec version: 14.10.21.16.36-ga38e0a6 > arch_process_options:85: command_line: root=/dev/nfs rw > nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > earlyprintk=uart8250-32bit,0x1 > c020000 > arch_process_options:87: initrd: (null) > arch_process_options:88: dtb: apm-mustang.dtb > arch_process_options:89: lite: 1 > kernel: 0x7f756e7010 kernel_size: 0x488a308 > Modified cmdline: root=/dev/nfs > Unable to find /proc/device-tree//chosen/linux,stdout-path, printing > from purgatory is diabled > get_memory_ranges_dt:638: node_1516 memory > get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 > get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > get_memory_ranges_dt:678: Success > elf_arm64_load: PE format: yes > p_vaddr: ffffffc000080000 > virt_to_phys: ffffffc000080000 -> 0000004000080000 > add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) > -> 0000004000080000 - 00000040006fb000 (0067b000) > elf_arm64_load: text_offset: 0000000000080000 > elf_arm64_load: image_size: 000000000067f000 > elf_arm64_load: page_offset: ffffffc000000000 > elf_arm64_load: memstart: 0000004000000000 > virt_to_phys: ffffffc000080000 -> 0000004000080000 > elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 > virt_to_phys: ffffffc000080000 -> 0000004000080000 > Modified cmdline:root=/dev/nfs rw > nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > earlyprintk=uart8250-32bit,0x1c020000 > Unable to find /proc/device-tree//chosen/linux,stdout-path, printing > from purgatory is diabled > read_cpu_info: dtb_1 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_1 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_1 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_1 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_1 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_1 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_1 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_1 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', > cpu-release-addr 400000fff8 > read_cpu_info: dtb_2 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', > cpu-release-addr 400000fff8 > check_cpu_properties: hwid-0: OK > check_cpu_properties: hwid-1: OK > check_cpu_properties: hwid-100: OK > check_cpu_properties: hwid-101: OK > check_cpu_properties: hwid-200: OK > check_cpu_properties: hwid-201: OK > check_cpu_properties: hwid-300: OK > check_cpu_properties: hwid-301: OK > dtb: base 4000700000, size 221ch (8732) > add_segment_phys_virt: 0000000024c14c90 - 0000000024c16eac (0000221c) > -> 0000004000700000 - 0000004000703000 (00003000) > kexec_load: entry = 0x4000080000 flags = 0xb70000 > nr_segments = 2 > segment[0].buf = 0x7f756f7010 > segment[0].bufsz = 0x645f60 > segment[0].mem = 0x4000080000 > segment[0].memsz = 0x67b000 > segment[1].buf = 0x24c14c90 > segment[1].bufsz = 0x221c > segment[1].mem = 0x4000700000 > segment[1].memsz = 0x3000 > > root@genericarmv8:~# /usr/local/sbin/kexec --lite -e > kexec version: 14.10.21.16.36-ga38e0a6 > arch_process_options:85: command_line: (null) > arch_process_options:87: initrd: (null) > arch_process_options:88: dtb: (null) > arch_process_options:89: lite: 1 > sd 0:0:0:0: [sda] Synchronizing SCSI cache > kexec: Starting new kernel > Bye! > > It fails to come up. In debugger I can see > > Core number : 0 > Core state : debug (AArch64 EL1) > Debug entry cause : External Debug Request > Current PC : 0xffffffc000083200 That's a kernel virtual address, and it looks to be somewhere early in the boot path, given it's close to PAGE_OFFSET + TEXT_OFFSET. Can you figure out what this corresponds to in your kernel image? Thanks, Mark.
On Thu, Nov 6, 2014 at 8:58 PM, Mark Rutland <mark.rutland@arm.com> wrote: > On Thu, Nov 06, 2014 at 12:16:12PM +0000, Arun Chandran wrote: >> Hi Geoff, >> >> I am trying this on my hardware (apm-mustang.dtb) >> >> On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: >> > Hi All, >> > >> > This series adds the core support for kexec re-boots on arm64. I have tested >> > with the ARM VE fast model, the ARM Base model and the ARM Foundation >> > model with various kernel config options for both the first and second stage >> > kernels. >> > >> > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to >> > kexec-tools [2], which have not yet been merged upstream, are needed. >> > >> > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() >> > routine the ability to switch exception levels from EL1 to EL2 for kernels that >> > were entered in EL2. >> > >> > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree >> > reserved-memory nodes. >> > >> > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the >> > dcache_line_size macro it defines can be uesd by kexec's relocate kernel >> > routine. >> > >> > Patches 8-10 add the actual kexec support. >> > >> > Please consider all patches for inclusion. Any comments or suggestions on how >> > to improve are welcome. >> > >> > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git >> > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git >> > >> > Several things are known to have problems on kexec re-boot: >> > >> > spin-table >> > ---------- >> > >> > PROBLEM: The spin-table enable method does not implement all the methods needed >> > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. >> > >> > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method >> > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' >> > on the first stage kernel command line. >> >> I have CONFIG_SMP=n >> >> > >> > FIX: Upgrade system firmware to provide PSCI enable method support. >> > >> > KVM >> > --- >> > >> > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those >> > resources on shutdown, so the first stage kernel cannot be shutdown properly. >> > >> > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. >> >> KVM also disabled. >> >> /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux >> --dtb=apm-mustang.dtb --command-line= >> "root=/dev/nfs rw >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> earlyprintk=uart8250-32bit,0x1c020000" >> >> kexec version: 14.10.21.16.36-ga38e0a6 >> arch_process_options:85: command_line: root=/dev/nfs rw >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> earlyprintk=uart8250-32bit,0x1 >> c020000 >> arch_process_options:87: initrd: (null) >> arch_process_options:88: dtb: apm-mustang.dtb >> arch_process_options:89: lite: 1 >> kernel: 0x7f756e7010 kernel_size: 0x488a308 >> Modified cmdline: root=/dev/nfs >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing >> from purgatory is diabled >> get_memory_ranges_dt:638: node_1516 memory >> get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> get_memory_ranges_dt:678: Success >> elf_arm64_load: PE format: yes >> p_vaddr: ffffffc000080000 >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) >> -> 0000004000080000 - 00000040006fb000 (0067b000) >> elf_arm64_load: text_offset: 0000000000080000 >> elf_arm64_load: image_size: 000000000067f000 >> elf_arm64_load: page_offset: ffffffc000000000 >> elf_arm64_load: memstart: 0000004000000000 >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> Modified cmdline:root=/dev/nfs rw >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> earlyprintk=uart8250-32bit,0x1c020000 >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing >> from purgatory is diabled >> read_cpu_info: dtb_1 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_1 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_1 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_1 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_1 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_1 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_1 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_1 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', >> cpu-release-addr 400000fff8 >> read_cpu_info: dtb_2 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', >> cpu-release-addr 400000fff8 >> check_cpu_properties: hwid-0: OK >> check_cpu_properties: hwid-1: OK >> check_cpu_properties: hwid-100: OK >> check_cpu_properties: hwid-101: OK >> check_cpu_properties: hwid-200: OK >> check_cpu_properties: hwid-201: OK >> check_cpu_properties: hwid-300: OK >> check_cpu_properties: hwid-301: OK >> dtb: base 4000700000, size 221ch (8732) >> add_segment_phys_virt: 0000000024c14c90 - 0000000024c16eac (0000221c) >> -> 0000004000700000 - 0000004000703000 (00003000) >> kexec_load: entry = 0x4000080000 flags = 0xb70000 >> nr_segments = 2 >> segment[0].buf = 0x7f756f7010 >> segment[0].bufsz = 0x645f60 >> segment[0].mem = 0x4000080000 >> segment[0].memsz = 0x67b000 >> segment[1].buf = 0x24c14c90 >> segment[1].bufsz = 0x221c >> segment[1].mem = 0x4000700000 >> segment[1].memsz = 0x3000 >> >> root@genericarmv8:~# /usr/local/sbin/kexec --lite -e >> kexec version: 14.10.21.16.36-ga38e0a6 >> arch_process_options:85: command_line: (null) >> arch_process_options:87: initrd: (null) >> arch_process_options:88: dtb: (null) >> arch_process_options:89: lite: 1 >> sd 0:0:0:0: [sda] Synchronizing SCSI cache >> kexec: Starting new kernel >> Bye! >> >> It fails to come up. In debugger I can see >> >> Core number : 0 >> Core state : debug (AArch64 EL1) >> Debug entry cause : External Debug Request >> Current PC : 0xffffffc000083200 > > That's a kernel virtual address, and it looks to be somewhere early in > the boot path, given it's close to PAGE_OFFSET + TEXT_OFFSET. > Yes. My earlyconsole setting was wrong that is why I did not see anything in console for the kexec rebooting. With correct earlycon setting I can see root@genericarmv8:~# /usr/local/sbin/kexec --lite -l vmlinux --dtb=apm-mustang.dtb --command-line= "root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlycon=uart8250,mmio32,0x1c020000" root@genericarmv8:~# /usr/local/sbin/kexec --lite -e kexec version: 14.10.21.16.36-ga38e0a6 arch_process_options:85: command_line: (null) arch_process_options:87: initrd: (null) arch_process_options:88: dtb: (null) arch_process_options:89: lite: 1 sd 0:0:0:0: [sda] Synchronizing SCSI cache kexec: Starting new kernel Bye! Initializing cgroup subsys cpu Linux version 3.17.0-rc4+ (arun@arun-OptiPlex-9010) (gcc version 4.9.1 20140505 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.05 - Linaro GCC 4.9-2014.05) ) #12 PREEMPT Thu Nov 6 21:38:03 IST 2014 CPU: AArch64 Processor [500f0000] revision 0 Detected PIPT I-cache on CPU0 Ignoring memory block 0x100000000 - 0x180000000 Early serial console at MMIO32 0x1c020000 (options '') bootconsole [uart0] enabled efi: Getting EFI parameters from FDT: efi: UEFI not found. cma: Failed to reserve 16 MiB Kernel panic - not syncing: ERROR: Failed to allocate 0x1000 bytes below 0x0. CPU: 0 PID: 0 Comm: swapper Not tainted 3.17.0-rc4+ #12 Call trace: [<ffffffc000087cf0>] dump_backtrace+0x0/0x124 [<ffffffc000087e24>] show_stack+0x10/0x1c [<ffffffc0004bcee8>] dump_stack+0x1c/0x28 [<ffffffc0004bc27c>] panic+0xd8/0x230 [<ffffffc00065510c>] memblock_alloc_base+0x2c/0x3c [<ffffffc000655128>] memblock_alloc+0xc/0x18 [<ffffffc00064d6cc>] early_alloc.constprop.5+0x14/0x4c [<ffffffc00064db50>] paging_init+0x124/0x1b8 [<ffffffc00064b758>] setup_arch+0x2a8/0x464 [<ffffffc00064966c>] start_kernel+0x88/0x38c ---[ end Kernel panic - not syncing: ERROR: Failed to allocate 0x1000 bytes below 0x0. Bad mode in Synchronous Abort handler detected, code 0x86000005 CPU: 0 PID: 0 Comm: swapper Not tainted 3.17.0-rc4+ #12 task: ffffffc000688460 ti: ffffffc00067c000 task.ti: ffffffc00067c000 PC is at 0x0 LR is at el1_irq+0x64/0xd0 pc : [<0000000000000000>] lr : [<ffffffc000083da4>] pstate: 800001c5 sp : ffffffc00067fc90 x29: ffffffc00067fdb0 x28: 0000000000000000 x27: ffffffc0005cf500 x26: 0000000000000000 x25: ffffffc000697000 x24: ffffffc000000000 x23: 0000000080000245 x22: ffffffc0004bc384 x21: ffffffc00067fdb0 x20: ffffffc0006c8000 x19: ffffffc0006c84d0 x18: 0000000000000004 x17: 0000000000000040 x16: ffffffc0006d6948 x15: 00000000ffffffff x14: 3030317830206574 x13: 61636f6c6c61206f x12: 742064656c696146 x11: 203a524f52524520 x10: 3a676e69636e7973 x9 : 20746f6e202d2063 x8 : 0000000000000018 x7 : 2073657479622030 x6 : ffffffc0006c9b57 x5 : ffffffc0006ca91c x4 : 0000000000000004 x3 : 0000000000000003 x2 : 0000000000200002 x1 : 0000000000000000 x0 : ffffffc00067fc90 Internal error: Oops - bad mode: 0 [#1] PREEMPT Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 3.17.0-rc4+ #12 task: ffffffc000688460 ti: ffffffc00067c000 task.ti: ffffffc00067c000 PC is at 0x0 LR is at el1_irq+0x64/0xd0 pc : [<0000000000000000>] lr : [<ffffffc000083da4>] pstate: 800001c5 sp : ffffffc00067fc90 x29: ffffffc00067fdb0 x28: 0000000000000000 x27: ffffffc0005cf500 x26: 0000000000000000 x25: ffffffc000697000 x24: ffffffc000000000 x23: 0000000080000245 x22: ffffffc0004bc384 x21: ffffffc00067fdb0 x20: ffffffc0006c8000 x19: ffffffc0006c84d0 x18: 0000000000000004 x17: 0000000000000040 x16: ffffffc0006d6948 x15: 00000000ffffffff x14: 3030317830206574 x13: 61636f6c6c61206f x12: 742064656c696146 x11: 203a524f52524520 x10: 3a676e69636e7973 x9 : 20746f6e202d2063 x8 : 0000000000000018 x7 : 2073657479622030 x6 : ffffffc0006c9b57 x5 : ffffffc0006ca91c x4 : 0000000000000004 x3 : 0000000000000003 x2 : 0000000000200002 x1 : 0000000000000000 x0 : ffffffc00067fc90 Process swapper (pid: 0, stack limit = 0xffffffc00067c058) Stack: (0xffffffc00067fc90 to 0xffffffc000680000) fc80: 00000057 00000000 00000057 00000000 fca0: 00200002 00000000 00000003 00000000 00000004 00000000 006ca91c ffffffc0 fcc0: 006c9b57 ffffffc0 79622030 20736574 00000018 00000000 202d2063 20746f6e fce0: 636e7973 3a676e69 52524520 203a524f 6c696146 74206465 6c61206f 61636f6c fd00: 30206574 30303178 ffffffff 00000000 006d6948 ffffffc0 00000040 00000000 fd20: 00000004 00000000 006c84d0 ffffffc0 006c8000 ffffffc0 00000000 00000000 fd40: 00000000 00000000 006c8000 ffffffc0 00000000 ffffffc0 00697000 ffffffc0 fd60: 00000000 00000000 005cf500 ffffffc0 00000000 00000000 0067fdb0 ffffffc0 fd80: 004bc380 ffffffc0 0067fdb0 ffffffc0 004bc384 ffffffc0 80000245 00000000 fda0: 006c9b2c ffffffc0 5d3e6336 61747320 0067fe70 ffffffc0 00655110 ffffffc0 fdc0: 00000000 00000000 00001000 00000000 006c8000 ffffffc0 00000018 00000000 fde0: ffffffff ffffff7f 00000000 ffffffc0 0067fe70 ffffffc0 0067fe70 ffffffc0 fe00: 0067fe30 ffffffc0 ffffffc8 00000000 0067fe70 ffffffc0 0067fe70 ffffffc0 fe20: 0067fe30 ffffffc0 ffffffc8 00000000 0067fe60 ffffffc0 00001000 00000000 fe40: 00000000 00000000 006979e8 ffffffc0 0067fe28 ffffffc0 0067fe30 ffffffc0 fe60: 00000000 00000000 ffffffff 00000000 0067fe90 ffffffc0 0065512c ffffffc0 fe80: 40000000 00000040 006d6948 ffffffc0 0067fea0 ffffffc0 0064d6d0 ffffffc0 fea0: 0067fec0 ffffffc0 0064db54 ffffffc0 40000000 00000040 006ecba8 ffffffc0 fec0: 0067ff40 ffffffc0 0064b75c ffffffc0 006c80c0 ffffffc0 00080000 ffffffc0 fee0: 0067ffe8 ffffffc0 0068a000 ffffffc0 00697000 ffffffc0 006c8000 ffffffc0 ff00: 006fb000 00000040 006fd000 00000040 000804a0 ffffffc0 00000000 00000080 ff20: 0067ff40 ffffffc0 0064b758 ffffffc0 006c80c0 ffffffc0 00080000 ffffffc0 ff40: 0067ffa0 ffffffc0 00649670 ffffffc0 0066e910 ffffffc0 006c8000 ffffffc0 ff60: 006c8000 ffffffc0 00000000 00000000 0068a000 00000040 00000000 00000040 ff80: 006fb000 00000040 006fd000 00000040 00000080 00000000 00700000 00000040 ffa0: 00000000 00000000 00080284 00000040 00094da0 00000040 00000e12 00000000 ffc0: 00700000 00000040 500f0000 00000000 0068a000 00000040 00000000 00000000 ffe0: 00000000 00000000 0066e910 ffffffc0 00000000 00000000 00000000 00000000 Call trace: [< (null)>] (null) [<ffffffc00065510c>] memblock_alloc_base+0x2c/0x3c [<ffffffc000655128>] memblock_alloc+0xc/0x18 [<ffffffc00064d6cc>] early_alloc.constprop.5+0x14/0x4c [<ffffffc00064db50>] paging_init+0x124/0x1b8 [<ffffffc00064b758>] setup_arch+0x2a8/0x464 [<ffffffc00064966c>] start_kernel+0x88/0x38c Code: bad PC value Bad mode in Synchronous Abort handler detected, code 0x86000005 CPU: 0 PID: 0 Comm: swapper Tainted: G D 3.17.0-rc4+ #12 task: ffffffc000688460 ti: ffffffc00067c000 task.ti: ffffffc00067c000 PC is at 0x0 LR is at el1_irq+0x64/0xd0 pc : [<0000000000000000>] lr : [<ffffffc000083da4>] pstate: 600001c5 sp : ffffffc00067f960 x29: ffffffc00067fa80 x28: 0000000000000000 x27: ffffffc0005cf500 x26: 0000000000000000 x25: ffffffc000697000 x24: 0000000000000021 x23: 0000000060000145 x22: ffffffc000087f00 x21: ffffffc00067fa80 x20: 0000000000000000 x19: ffffffc00067fb70 x18: 0000000000000004 x17: 0000000000000040 x16: ffffffc0006d6948 x15: 00000000ffffffff x14: 0ffffffffffffffe x13: 0000000000000020 x12: 0101010101010101 x11: 000000000000006d x10: 0000000000000002 x9 : 0000000000000000 x8 : 000000000000006e x7 : 0000000000000024 x6 : ffffffc0006c9b12 x5 : ffffffc0006cc108 x4 : 0000000000000008 x3 : 0000000000000080 x2 : 0000000000000080 x1 : 0000000000000000 x0 : ffffffc00067f960 --Arun > Can you figure out what this corresponds to in your kernel image? > > Thanks, > Mark.
Hi Arun, On Thu, 2014-11-06 at 21:43 +0530, Arun Chandran wrote: > Initializing cgroup subsys cpu > Linux version 3.17.0-rc4+ (arun@arun-OptiPlex-9010) (gcc version 4.9.1 > 20140505 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.05 - Linaro > GCC 4.9-2014.05) ) #12 PREEMPT Thu Nov 6 21:38:03 IST 2014 > CPU: AArch64 Processor [500f0000] revision 0 > Detected PIPT I-cache on CPU0 > Ignoring memory block 0x100000000 - 0x180000000 > Early serial console at MMIO32 0x1c020000 (options '') > bootconsole [uart0] enabled > efi: Getting EFI parameters from FDT: > efi: UEFI not found. > cma: Failed to reserve 16 MiB This error comes from cma_declare_contiguous(). > Kernel panic - not syncing: ERROR: Failed to allocate 0x1000 bytes below 0x0. This panic comes from memblock_alloc_base(). I guess something has passed in bad values. I wonder if your dtb is compatible with this platform. What if you don't pass a dtb to kexec? It could be caused by memory corruption. -Geoff
On Thu, Nov 06, 2014 at 04:13:04PM +0000, Arun Chandran wrote: > On Thu, Nov 6, 2014 at 8:58 PM, Mark Rutland <mark.rutland@arm.com> wrote: > > On Thu, Nov 06, 2014 at 12:16:12PM +0000, Arun Chandran wrote: > >> Hi Geoff, > >> > >> I am trying this on my hardware (apm-mustang.dtb) > >> > >> On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: > >> > Hi All, > >> > > >> > This series adds the core support for kexec re-boots on arm64. I have tested > >> > with the ARM VE fast model, the ARM Base model and the ARM Foundation > >> > model with various kernel config options for both the first and second stage > >> > kernels. > >> > > >> > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to > >> > kexec-tools [2], which have not yet been merged upstream, are needed. > >> > > >> > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() > >> > routine the ability to switch exception levels from EL1 to EL2 for kernels that > >> > were entered in EL2. > >> > > >> > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree > >> > reserved-memory nodes. > >> > > >> > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the > >> > dcache_line_size macro it defines can be uesd by kexec's relocate kernel > >> > routine. > >> > > >> > Patches 8-10 add the actual kexec support. > >> > > >> > Please consider all patches for inclusion. Any comments or suggestions on how > >> > to improve are welcome. > >> > > >> > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git > >> > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git > >> > > >> > Several things are known to have problems on kexec re-boot: > >> > > >> > spin-table > >> > ---------- > >> > > >> > PROBLEM: The spin-table enable method does not implement all the methods needed > >> > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. > >> > > >> > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method > >> > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' > >> > on the first stage kernel command line. > >> > >> I have CONFIG_SMP=n > >> > >> > > >> > FIX: Upgrade system firmware to provide PSCI enable method support. > >> > > >> > KVM > >> > --- > >> > > >> > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those > >> > resources on shutdown, so the first stage kernel cannot be shutdown properly. > >> > > >> > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. > >> > >> KVM also disabled. > >> > >> /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux > >> --dtb=apm-mustang.dtb --command-line= > >> "root=/dev/nfs rw > >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> earlyprintk=uart8250-32bit,0x1c020000" > >> > >> kexec version: 14.10.21.16.36-ga38e0a6 > >> arch_process_options:85: command_line: root=/dev/nfs rw > >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> earlyprintk=uart8250-32bit,0x1 > >> c020000 > >> arch_process_options:87: initrd: (null) > >> arch_process_options:88: dtb: apm-mustang.dtb > >> arch_process_options:89: lite: 1 > >> kernel: 0x7f756e7010 kernel_size: 0x488a308 > >> Modified cmdline: root=/dev/nfs > >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing > >> from purgatory is diabled > >> get_memory_ranges_dt:638: node_1516 memory > >> get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 Does this look correct to you? That doesn't match what I see in apm-mustang.dts in mainline (where memory starts at 0x1_00000000). > >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 This looks suspicious, but I haven't investigated the underlying code. > >> get_memory_ranges_dt:678: Success > >> elf_arm64_load: PE format: yes > >> p_vaddr: ffffffc000080000 > >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) > >> -> 0000004000080000 - 00000040006fb000 (0067b000) > >> elf_arm64_load: text_offset: 0000000000080000 > >> elf_arm64_load: image_size: 000000000067f000 > >> elf_arm64_load: page_offset: ffffffc000000000 > >> elf_arm64_load: memstart: 0000004000000000 > >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 > >> virt_to_phys: ffffffc000080000 -> 0000004000080000 So we'll load the kernel at this address... [...] > root@genericarmv8:~# /usr/local/sbin/kexec --lite -e > kexec version: 14.10.21.16.36-ga38e0a6 > arch_process_options:85: command_line: (null) > arch_process_options:87: initrd: (null) > arch_process_options:88: dtb: (null) > arch_process_options:89: lite: 1 > sd 0:0:0:0: [sda] Synchronizing SCSI cache > kexec: Starting new kernel > Bye! > Initializing cgroup subsys cpu > Linux version 3.17.0-rc4+ (arun@arun-OptiPlex-9010) (gcc version 4.9.1 > 20140505 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.05 - Linaro > GCC 4.9-2014.05) ) #12 PREEMPT Thu Nov 6 21:38:03 IST 2014 > CPU: AArch64 Processor [500f0000] revision 0 > Detected PIPT I-cache on CPU0 > Ignoring memory block 0x100000000 - 0x180000000 ... then the kernel finds the single memory region described in apm-mustang.dts which happens to be below PHYS_OFFSET for your board. So the kernel ignores it because it cannot address it ... > Early serial console at MMIO32 0x1c020000 (options '') > bootconsole [uart0] enabled > efi: Getting EFI parameters from FDT: > efi: UEFI not found. > cma: Failed to reserve 16 MiB > Kernel panic - not syncing: ERROR: Failed to allocate 0x1000 bytes below 0x0. ... and then explodes because it knows of no memory outside of the kernel Image. Mark.
On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: >> On 11/03/14 at 07:46pm, Mark Rutland wrote: >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: >> > > Hi Geoff >> > > >> > > I tested your patches. The macihne is using spin-table cpu enable method >> > > so I tried maxcpus=1 as you suggested. >> > > >> > > There's below issues for me, thoughts? >> > > >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb >> > > to use. >> > >> > Are you absolutely certain of this? >> > >> > To use ACPI, you must have booted via EFI, as the only mechanism for >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will >> > have created a stub DTB if there is no provided DTB, to pass the command >> > line and pointers to EFI data structures. This stub DTB should be >> > present in the usual place. >> >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true >> though but I have not get idea how to get the dtb which kernel is using for boot >> since it is not unflattened. > > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, > so there's nothing to expose at in sysfs/procfs. > > Somehow we need to unflatten the DTB without exposing it to drivers, > such that it can be exposed to userspace in the usual place but drivers > don't being probing based off of it. Is that even necessary? If the tree isn't unflattened, then it is just a stub tree. There really isn't anything interesting in it. Kexec should recreate the tree from scratch in that case. g.
On Thu, Nov 6, 2014 at 11:55 PM, Geoff Levand <geoff@infradead.org> wrote: > Hi Arun, > > On Thu, 2014-11-06 at 21:43 +0530, Arun Chandran wrote: >> Initializing cgroup subsys cpu >> Linux version 3.17.0-rc4+ (arun@arun-OptiPlex-9010) (gcc version 4.9.1 >> 20140505 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.05 - Linaro >> GCC 4.9-2014.05) ) #12 PREEMPT Thu Nov 6 21:38:03 IST 2014 >> CPU: AArch64 Processor [500f0000] revision 0 >> Detected PIPT I-cache on CPU0 >> Ignoring memory block 0x100000000 - 0x180000000 >> Early serial console at MMIO32 0x1c020000 (options '') >> bootconsole [uart0] enabled >> efi: Getting EFI parameters from FDT: >> efi: UEFI not found. >> cma: Failed to reserve 16 MiB > > This error comes from cma_declare_contiguous(). > >> Kernel panic - not syncing: ERROR: Failed to allocate 0x1000 bytes below 0x0. > > This panic comes from memblock_alloc_base(). I guess something has > passed in bad values. > > I wonder if your dtb is compatible with this platform. What if you > don't pass a dtb to kexec? Yes dtb was the problem. Without dtb it works fine root@genericarmv8:~# ./usr/local/sbin/kexec --lite -l vmlinux --command-line= "root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlycon=uart8250,mmio32,0x1c020000" root@genericarmv8:~# /usr/local/sbin/kexec --lite -e kexec version: 14.10.21.16.36-ga38e0a6 arch_process_options:85: command_line: (null) arch_process_options:87: initrd: (null) arch_process_options:88: dtb: (null) arch_process_options:89: lite: 1 sd 0:0:0:0: [sda] Synchronizing SCSI cache kexec: Starting new kernel Bye! Initializing cgroup subsys cpu Linux version 3.17.0-rc4+ (arun@arun-OptiPlex-9010) (gcc version 4.9.1 20140505 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.05 - Linaro GCC 4.9-2014.05) ) #12 PREEMPT Thu Nov 6 21:38:03 IST 2014 CPU: AArch64 Processor [500f0000] revision 0 Detected PIPT I-cache on CPU0 Early serial console at MMIO32 0x1c020000 (options '') bootconsole [uart0] enabled efi: Getting EFI parameters from FDT: efi: UEFI not found. cma: Reserved 16 MiB at 40ff000000 /cpus/cpu@000: unsupported enable-method property: spin-table Built 1 zonelists in Zone order, mobility grouping on. Total pages: 4136960 Kernel command line: root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlycon=uart8250,mmio32,0x1c020000 PID hash table entries: 4096 (order: 3, 32768 bytes) Dentry cache hash table entries: 2097152 (order: 12, 16777216 bytes) Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes) .... ... .. . root@genericarmv8:~# --Arun
On Fri, Nov 7, 2014 at 12:09 AM, Mark Rutland <mark.rutland@arm.com> wrote: > On Thu, Nov 06, 2014 at 04:13:04PM +0000, Arun Chandran wrote: >> On Thu, Nov 6, 2014 at 8:58 PM, Mark Rutland <mark.rutland@arm.com> wrote: >> > On Thu, Nov 06, 2014 at 12:16:12PM +0000, Arun Chandran wrote: >> >> Hi Geoff, >> >> >> >> I am trying this on my hardware (apm-mustang.dtb) >> >> >> >> On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: >> >> > Hi All, >> >> > >> >> > This series adds the core support for kexec re-boots on arm64. I have tested >> >> > with the ARM VE fast model, the ARM Base model and the ARM Foundation >> >> > model with various kernel config options for both the first and second stage >> >> > kernels. >> >> > >> >> > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to >> >> > kexec-tools [2], which have not yet been merged upstream, are needed. >> >> > >> >> > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() >> >> > routine the ability to switch exception levels from EL1 to EL2 for kernels that >> >> > were entered in EL2. >> >> > >> >> > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree >> >> > reserved-memory nodes. >> >> > >> >> > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the >> >> > dcache_line_size macro it defines can be uesd by kexec's relocate kernel >> >> > routine. >> >> > >> >> > Patches 8-10 add the actual kexec support. >> >> > >> >> > Please consider all patches for inclusion. Any comments or suggestions on how >> >> > to improve are welcome. >> >> > >> >> > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git >> >> > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git >> >> > >> >> > Several things are known to have problems on kexec re-boot: >> >> > >> >> > spin-table >> >> > ---------- >> >> > >> >> > PROBLEM: The spin-table enable method does not implement all the methods needed >> >> > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. >> >> > >> >> > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method >> >> > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' >> >> > on the first stage kernel command line. >> >> >> >> I have CONFIG_SMP=n >> >> >> >> > >> >> > FIX: Upgrade system firmware to provide PSCI enable method support. >> >> > >> >> > KVM >> >> > --- >> >> > >> >> > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those >> >> > resources on shutdown, so the first stage kernel cannot be shutdown properly. >> >> > >> >> > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. >> >> >> >> KVM also disabled. >> >> >> >> /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux >> >> --dtb=apm-mustang.dtb --command-line= >> >> "root=/dev/nfs rw >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> >> earlyprintk=uart8250-32bit,0x1c020000" >> >> >> >> kexec version: 14.10.21.16.36-ga38e0a6 >> >> arch_process_options:85: command_line: root=/dev/nfs rw >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> >> earlyprintk=uart8250-32bit,0x1 >> >> c020000 >> >> arch_process_options:87: initrd: (null) >> >> arch_process_options:88: dtb: apm-mustang.dtb >> >> arch_process_options:89: lite: 1 >> >> kernel: 0x7f756e7010 kernel_size: 0x488a308 >> >> Modified cmdline: root=/dev/nfs >> >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing >> >> from purgatory is diabled >> >> get_memory_ranges_dt:638: node_1516 memory >> >> get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 > > Does this look correct to you? That doesn't match what I see in > apm-mustang.dts in mainline (where memory starts at 0x1_00000000). > >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > > This looks suspicious, but I haven't investigated the underlying code. > >> >> get_memory_ranges_dt:678: Success >> >> elf_arm64_load: PE format: yes >> >> p_vaddr: ffffffc000080000 >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> >> add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) >> >> -> 0000004000080000 - 00000040006fb000 (0067b000) >> >> elf_arm64_load: text_offset: 0000000000080000 >> >> elf_arm64_load: image_size: 000000000067f000 >> >> elf_arm64_load: page_offset: ffffffc000000000 >> >> elf_arm64_load: memstart: 0000004000000000 >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> >> elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > > So we'll load the kernel at this address... > > [...] > >> root@genericarmv8:~# /usr/local/sbin/kexec --lite -e >> kexec version: 14.10.21.16.36-ga38e0a6 >> arch_process_options:85: command_line: (null) >> arch_process_options:87: initrd: (null) >> arch_process_options:88: dtb: (null) >> arch_process_options:89: lite: 1 >> sd 0:0:0:0: [sda] Synchronizing SCSI cache >> kexec: Starting new kernel >> Bye! >> Initializing cgroup subsys cpu >> Linux version 3.17.0-rc4+ (arun@arun-OptiPlex-9010) (gcc version 4.9.1 >> 20140505 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.05 - Linaro >> GCC 4.9-2014.05) ) #12 PREEMPT Thu Nov 6 21:38:03 IST 2014 >> CPU: AArch64 Processor [500f0000] revision 0 >> Detected PIPT I-cache on CPU0 >> Ignoring memory block 0x100000000 - 0x180000000 > > ... then the kernel finds the single memory region described in > apm-mustang.dts which happens to be below PHYS_OFFSET for your board. So > the kernel ignores it because it cannot address it ... > >> Early serial console at MMIO32 0x1c020000 (options '') >> bootconsole [uart0] enabled >> efi: Getting EFI parameters from FDT: >> efi: UEFI not found. >> cma: Failed to reserve 16 MiB >> Kernel panic - not syncing: ERROR: Failed to allocate 0x1000 bytes below 0x0. > > ... and then explodes because it knows of no memory outside of the kernel > Image. Thanks for the nice explanation. Yes with the mainline dtb kexec won't work. Kexec works when I don't pass dtb to it. It will work (mainline dtb) only when it passes through the bootloader. Bootloader does a lot of modification to the dtb. For exmple. Memory node is changed from memory { device_type = "memory"; reg = < 0x1 0x00000000 0x0 0x80000000 >; /* Updated by bootloader */ }; to root@genericarmv8:~# hexdump /proc/device-tree/memory/reg 0000000 0000 4000 0000 0000 0000 0400 0000 0000 0000010 0000 0000 0000 0000 0000 0000 0000 0000 * 0000040 --Arun
On Fri, Nov 07, 2014 at 12:41:45AM +0000, Grant Likely wrote: > On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: > > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: > >> On 11/03/14 at 07:46pm, Mark Rutland wrote: > >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: > >> > > Hi Geoff > >> > > > >> > > I tested your patches. The macihne is using spin-table cpu enable method > >> > > so I tried maxcpus=1 as you suggested. > >> > > > >> > > There's below issues for me, thoughts? > >> > > > >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > >> > > to use. > >> > > >> > Are you absolutely certain of this? > >> > > >> > To use ACPI, you must have booted via EFI, as the only mechanism for > >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will > >> > have created a stub DTB if there is no provided DTB, to pass the command > >> > line and pointers to EFI data structures. This stub DTB should be > >> > present in the usual place. > >> > >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs > >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true > >> though but I have not get idea how to get the dtb which kernel is using for boot > >> since it is not unflattened. > > > > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, > > so there's nothing to expose at in sysfs/procfs. > > > > Somehow we need to unflatten the DTB without exposing it to drivers, > > such that it can be exposed to userspace in the usual place but drivers > > don't being probing based off of it. > > Is that even necessary? If the tree isn't unflattened, then it is just > a stub tree. There really isn't anything interesting in it. We need to UEFI properties [1] from /chosen to access the memory map and system table (both of which are necessary to access any ACPI tables). > Kexec should recreate the tree from scratch in that case. I'm not sure if the required information is exposed to userspace elsewhere. Ard, Leif? Mark. [1] Documentation/arm/uefi.txt
On 7 November 2014 11:16, Mark Rutland <mark.rutland@arm.com> wrote: > On Fri, Nov 07, 2014 at 12:41:45AM +0000, Grant Likely wrote: >> On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: >> > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: >> >> On 11/03/14 at 07:46pm, Mark Rutland wrote: >> >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: >> >> > > Hi Geoff >> >> > > >> >> > > I tested your patches. The macihne is using spin-table cpu enable method >> >> > > so I tried maxcpus=1 as you suggested. >> >> > > >> >> > > There's below issues for me, thoughts? >> >> > > >> >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb >> >> > > to use. >> >> > >> >> > Are you absolutely certain of this? >> >> > >> >> > To use ACPI, you must have booted via EFI, as the only mechanism for >> >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will >> >> > have created a stub DTB if there is no provided DTB, to pass the command >> >> > line and pointers to EFI data structures. This stub DTB should be >> >> > present in the usual place. >> >> >> >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs >> >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true >> >> though but I have not get idea how to get the dtb which kernel is using for boot >> >> since it is not unflattened. >> > >> > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, >> > so there's nothing to expose at in sysfs/procfs. >> > >> > Somehow we need to unflatten the DTB without exposing it to drivers, >> > such that it can be exposed to userspace in the usual place but drivers >> > don't being probing based off of it. >> >> Is that even necessary? If the tree isn't unflattened, then it is just >> a stub tree. There really isn't anything interesting in it. > > We need to UEFI properties [1] from /chosen to access the memory map and > system table (both of which are necessary to access any ACPI tables). > >> Kexec should recreate the tree from scratch in that case. > > I'm not sure if the required information is exposed to userspace > elsewhere. Ard, Leif? > Personally, I think we should not be using /proc/device-tree at all, but instead retain the original blob in some way and expose that. We already have /sys/firmware/efi/systab which contains the physical addresses of the UEFI configuration tables. We should probably add an entry for the FDT there anyway, but we would still be looking at mmap(/dev/mem) to access it, which is not a practice we want to encourage, I suppose. Also, we *must* take the secure boot scenario into account. Booting with an arbitrary user generated DTB is nice, but if you are doing kexec without an initrd, for instance, it would also be nice if we could just reuse the existing DTB without bothering the user for it at all, which would be something we could also allow when running securely.
On 7 November 2014 11:41, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 7 November 2014 11:16, Mark Rutland <mark.rutland@arm.com> wrote: >> On Fri, Nov 07, 2014 at 12:41:45AM +0000, Grant Likely wrote: >>> On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: >>> > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: >>> >> On 11/03/14 at 07:46pm, Mark Rutland wrote: >>> >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: >>> >> > > Hi Geoff >>> >> > > >>> >> > > I tested your patches. The macihne is using spin-table cpu enable method >>> >> > > so I tried maxcpus=1 as you suggested. >>> >> > > >>> >> > > There's below issues for me, thoughts? >>> >> > > >>> >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb >>> >> > > to use. >>> >> > >>> >> > Are you absolutely certain of this? >>> >> > >>> >> > To use ACPI, you must have booted via EFI, as the only mechanism for >>> >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will >>> >> > have created a stub DTB if there is no provided DTB, to pass the command >>> >> > line and pointers to EFI data structures. This stub DTB should be >>> >> > present in the usual place. >>> >> >>> >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs >>> >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true >>> >> though but I have not get idea how to get the dtb which kernel is using for boot >>> >> since it is not unflattened. >>> > >>> > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, >>> > so there's nothing to expose at in sysfs/procfs. >>> > >>> > Somehow we need to unflatten the DTB without exposing it to drivers, >>> > such that it can be exposed to userspace in the usual place but drivers >>> > don't being probing based off of it. >>> >>> Is that even necessary? If the tree isn't unflattened, then it is just >>> a stub tree. There really isn't anything interesting in it. >> >> We need to UEFI properties [1] from /chosen to access the memory map and >> system table (both of which are necessary to access any ACPI tables). >> >>> Kexec should recreate the tree from scratch in that case. >> >> I'm not sure if the required information is exposed to userspace >> elsewhere. Ard, Leif? >> > > Personally, I think we should not be using /proc/device-tree at all, > but instead retain the original blob in some way and expose that. > > We already have /sys/firmware/efi/systab which contains the physical > addresses of the UEFI configuration tables. We should probably add an > entry for the FDT there anyway, but we would still be looking at > mmap(/dev/mem) to access it, which is not a practice we want to > encourage, I suppose. > Nah, strike that. The configuration table entry contains the original fdt, so with the device nodes etc still present. The stub makes changes to the DTB, and /that/ is the version we want to retain so subsequent kexec reboots can use it. > Also, we *must* take the secure boot scenario into account. Booting > with an arbitrary user generated DTB is nice, but if you are doing > kexec without an initrd, for instance, it would also be nice if we > could just reuse the existing DTB without bothering the user for it at > all, which would be something we could also allow when running > securely. > > -- > Ard.
On 7 November 2014 11:45, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 7 November 2014 11:41, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: >> On 7 November 2014 11:16, Mark Rutland <mark.rutland@arm.com> wrote: >>> On Fri, Nov 07, 2014 at 12:41:45AM +0000, Grant Likely wrote: >>>> On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: >>>> > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: >>>> >> On 11/03/14 at 07:46pm, Mark Rutland wrote: >>>> >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: >>>> >> > > Hi Geoff >>>> >> > > >>>> >> > > I tested your patches. The macihne is using spin-table cpu enable method >>>> >> > > so I tried maxcpus=1 as you suggested. >>>> >> > > >>>> >> > > There's below issues for me, thoughts? >>>> >> > > >>>> >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb >>>> >> > > to use. >>>> >> > >>>> >> > Are you absolutely certain of this? >>>> >> > >>>> >> > To use ACPI, you must have booted via EFI, as the only mechanism for >>>> >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will >>>> >> > have created a stub DTB if there is no provided DTB, to pass the command >>>> >> > line and pointers to EFI data structures. This stub DTB should be >>>> >> > present in the usual place. >>>> >> >>>> >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs >>>> >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true >>>> >> though but I have not get idea how to get the dtb which kernel is using for boot >>>> >> since it is not unflattened. >>>> > >>>> > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, >>>> > so there's nothing to expose at in sysfs/procfs. >>>> > >>>> > Somehow we need to unflatten the DTB without exposing it to drivers, >>>> > such that it can be exposed to userspace in the usual place but drivers >>>> > don't being probing based off of it. >>>> >>>> Is that even necessary? If the tree isn't unflattened, then it is just >>>> a stub tree. There really isn't anything interesting in it. >>> >>> We need to UEFI properties [1] from /chosen to access the memory map and >>> system table (both of which are necessary to access any ACPI tables). >>> >>>> Kexec should recreate the tree from scratch in that case. >>> >>> I'm not sure if the required information is exposed to userspace >>> elsewhere. Ard, Leif? >>> >> >> Personally, I think we should not be using /proc/device-tree at all, >> but instead retain the original blob in some way and expose that. >> >> We already have /sys/firmware/efi/systab which contains the physical >> addresses of the UEFI configuration tables. We should probably add an >> entry for the FDT there anyway, but we would still be looking at >> mmap(/dev/mem) to access it, which is not a practice we want to >> encourage, I suppose. >> > > Nah, strike that. The configuration table entry contains the original > fdt, so with the device nodes etc still present. The stub makes *memory* nodes not device nodes > changes to the DTB, and /that/ is the version we want to retain so > subsequent kexec reboots can use it. > Sorry for the noise.
On Fri, Nov 07, 2014 at 10:41:11AM +0000, Ard Biesheuvel wrote: > On 7 November 2014 11:16, Mark Rutland <mark.rutland@arm.com> wrote: > > On Fri, Nov 07, 2014 at 12:41:45AM +0000, Grant Likely wrote: > >> On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: > >> > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: > >> >> On 11/03/14 at 07:46pm, Mark Rutland wrote: > >> >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: > >> >> > > Hi Geoff > >> >> > > > >> >> > > I tested your patches. The macihne is using spin-table cpu enable method > >> >> > > so I tried maxcpus=1 as you suggested. > >> >> > > > >> >> > > There's below issues for me, thoughts? > >> >> > > > >> >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > >> >> > > to use. > >> >> > > >> >> > Are you absolutely certain of this? > >> >> > > >> >> > To use ACPI, you must have booted via EFI, as the only mechanism for > >> >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will > >> >> > have created a stub DTB if there is no provided DTB, to pass the command > >> >> > line and pointers to EFI data structures. This stub DTB should be > >> >> > present in the usual place. > >> >> > >> >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs > >> >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true > >> >> though but I have not get idea how to get the dtb which kernel is using for boot > >> >> since it is not unflattened. > >> > > >> > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, > >> > so there's nothing to expose at in sysfs/procfs. > >> > > >> > Somehow we need to unflatten the DTB without exposing it to drivers, > >> > such that it can be exposed to userspace in the usual place but drivers > >> > don't being probing based off of it. > >> > >> Is that even necessary? If the tree isn't unflattened, then it is just > >> a stub tree. There really isn't anything interesting in it. > > > > We need to UEFI properties [1] from /chosen to access the memory map and > > system table (both of which are necessary to access any ACPI tables). > > > >> Kexec should recreate the tree from scratch in that case. > > > > I'm not sure if the required information is exposed to userspace > > elsewhere. Ard, Leif? > > > > Personally, I think we should not be using /proc/device-tree at all, > but instead retain the original blob in some way and expose that. Grant took objection to that approach previously. > We already have /sys/firmware/efi/systab which contains the physical > addresses of the UEFI configuration tables. We should probably add an > entry for the FDT there anyway, but we would still be looking at > mmap(/dev/mem) to access it, which is not a practice we want to > encourage, I suppose. We should not encourage use of /dev/mem. Using /sys/firmware/efi/systab doesn't get us the memory map though, unless that's also exposed? > Also, we *must* take the secure boot scenario into account. Booting > with an arbitrary user generated DTB is nice, but if you are doing > kexec without an initrd, for instance, it would also be nice if we > could just reuse the existing DTB without bothering the user for it at > all, which would be something we could also allow when running > securely. Secure boot has to be handled completely differently. That will require new syscall support, in-kernel purgatory, and so on. That should not affect the non-secureboot cases where a user wants to load their own DTB (or other objects) into memory for the next OS (which might not be Linux). Mark.
On 7 November 2014 12:35, Mark Rutland <mark.rutland@arm.com> wrote: > On Fri, Nov 07, 2014 at 10:41:11AM +0000, Ard Biesheuvel wrote: >> On 7 November 2014 11:16, Mark Rutland <mark.rutland@arm.com> wrote: >> > On Fri, Nov 07, 2014 at 12:41:45AM +0000, Grant Likely wrote: >> >> On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: >> >> > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: >> >> >> On 11/03/14 at 07:46pm, Mark Rutland wrote: >> >> >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: >> >> >> > > Hi Geoff >> >> >> > > >> >> >> > > I tested your patches. The macihne is using spin-table cpu enable method >> >> >> > > so I tried maxcpus=1 as you suggested. >> >> >> > > >> >> >> > > There's below issues for me, thoughts? >> >> >> > > >> >> >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb >> >> >> > > to use. >> >> >> > >> >> >> > Are you absolutely certain of this? >> >> >> > >> >> >> > To use ACPI, you must have booted via EFI, as the only mechanism for >> >> >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will >> >> >> > have created a stub DTB if there is no provided DTB, to pass the command >> >> >> > line and pointers to EFI data structures. This stub DTB should be >> >> >> > present in the usual place. >> >> >> >> >> >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs >> >> >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true >> >> >> though but I have not get idea how to get the dtb which kernel is using for boot >> >> >> since it is not unflattened. >> >> > >> >> > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, >> >> > so there's nothing to expose at in sysfs/procfs. >> >> > >> >> > Somehow we need to unflatten the DTB without exposing it to drivers, >> >> > such that it can be exposed to userspace in the usual place but drivers >> >> > don't being probing based off of it. >> >> >> >> Is that even necessary? If the tree isn't unflattened, then it is just >> >> a stub tree. There really isn't anything interesting in it. >> > >> > We need to UEFI properties [1] from /chosen to access the memory map and >> > system table (both of which are necessary to access any ACPI tables). >> > >> >> Kexec should recreate the tree from scratch in that case. >> > >> > I'm not sure if the required information is exposed to userspace >> > elsewhere. Ard, Leif? >> > >> >> Personally, I think we should not be using /proc/device-tree at all, >> but instead retain the original blob in some way and expose that. > > Grant took objection to that approach previously. > >> We already have /sys/firmware/efi/systab which contains the physical >> addresses of the UEFI configuration tables. We should probably add an >> entry for the FDT there anyway, but we would still be looking at >> mmap(/dev/mem) to access it, which is not a practice we want to >> encourage, I suppose. > > We should not encourage use of /dev/mem. > Agreed. > Using /sys/firmware/efi/systab doesn't get us the memory map though, > unless that's also exposed? > The address of the memory map is in /chosen/linux,uefi-mmap-start, but -as I replied-to-self earlier- the DT UEFI configuration table does not contain the properties added by the stub, so that is not going to work anyway. >> Also, we *must* take the secure boot scenario into account. Booting >> with an arbitrary user generated DTB is nice, but if you are doing >> kexec without an initrd, for instance, it would also be nice if we >> could just reuse the existing DTB without bothering the user for it at >> all, which would be something we could also allow when running >> securely. > > Secure boot has to be handled completely differently. That will require > new syscall support, in-kernel purgatory, and so on. > > That should not affect the non-secureboot cases where a user wants to > load their own DTB (or other objects) into memory for the next OS (which > might not be Linux). > OK, fair enough. So ideally, we should expose the binary blob somewhere, in a similar fashion to how config.gz is exposed, perhaps?
On 7 Nov 2014 11:36, "Mark Rutland" <mark.rutland@arm.com> wrote: > > On Fri, Nov 07, 2014 at 10:41:11AM +0000, Ard Biesheuvel wrote: > > On 7 November 2014 11:16, Mark Rutland <mark.rutland@arm.com> wrote: > > > On Fri, Nov 07, 2014 at 12:41:45AM +0000, Grant Likely wrote: > > >> On Thu, Nov 6, 2014 at 3:08 PM, Mark Rutland <mark.rutland@arm.com> wrote: > > >> > On Thu, Nov 06, 2014 at 01:56:42AM +0000, Dave Young wrote: > > >> >> On 11/03/14 at 07:46pm, Mark Rutland wrote: > > >> >> > On Fri, Oct 31, 2014 at 07:52:09AM +0000, Dave Young wrote: > > >> >> > > Hi Geoff > > >> >> > > > > >> >> > > I tested your patches. The macihne is using spin-table cpu enable method > > >> >> > > so I tried maxcpus=1 as you suggested. > > >> >> > > > > >> >> > > There's below issues for me, thoughts? > > >> >> > > > > >> >> > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > > >> >> > > to use. > > >> >> > > > >> >> > Are you absolutely certain of this? > > >> >> > > > >> >> > To use ACPI, you must have booted via EFI, as the only mechanism for > > >> >> > finding the ACPI tables is via EFI. If booted via EFI, the stub will > > >> >> > have created a stub DTB if there is no provided DTB, to pass the command > > >> >> > line and pointers to EFI data structures. This stub DTB should be > > >> >> > present in the usual place. > > >> >> > > >> >> Mark, I used kexec-tools from Geoff's git tree, it will create dtb from procfs > > >> >> maybe I can pass external dtb to kexec-tools. What you mentioned should be true > > >> >> though but I have not get idea how to get the dtb which kernel is using for boot > > >> >> since it is not unflattened. > > >> > > > >> > Ah, sorry. I see the problem now. For ACPI you don't unflatten the tree, > > >> > so there's nothing to expose at in sysfs/procfs. > > >> > > > >> > Somehow we need to unflatten the DTB without exposing it to drivers, > > >> > such that it can be exposed to userspace in the usual place but drivers > > >> > don't being probing based off of it. > > >> > > >> Is that even necessary? If the tree isn't unflattened, then it is just > > >> a stub tree. There really isn't anything interesting in it. > > > > > > We need to UEFI properties [1] from /chosen to access the memory map and > > > system table (both of which are necessary to access any ACPI tables). > > > > > >> Kexec should recreate the tree from scratch in that case. > > > > > > I'm not sure if the required information is exposed to userspace > > > elsewhere. Ard, Leif? > > > > > > > Personally, I think we should not be using /proc/device-tree at all, > > but instead retain the original blob in some way and expose that. > > Grant took objection to that approach previously. I don't strongly object, but using the original DTB is problematic if the kernel makes changes to the tree that should be passed through to the next kernel. The only big objection I have is to using the existing debugfs implementation. Care also need to be taken to make sure the flat tree has not been modified on the fly. I think there are some scenarios where the kernel modifies the DT buffer in-place. That said, for a stub DT, the best place to generate it is still probably in kernel space. > > > We already have /sys/firmware/efi/systab which contains the physical > > addresses of the UEFI configuration tables. We should probably add an > > entry for the FDT there anyway, but we would still be looking at > > mmap(/dev/mem) to access it, which is not a practice we want to > > encourage, I suppose. > > We should not encourage use of /dev/mem. > > Using /sys/firmware/efi/systab doesn't get us the memory map though, > unless that's also exposed? > > > Also, we *must* take the secure boot scenario into account. Booting > > with an arbitrary user generated DTB is nice, but if you are doing > > kexec without an initrd, for instance, it would also be nice if we > > could just reuse the existing DTB without bothering the user for it at > > all, which would be something we could also allow when running > > securely. > > Secure boot has to be handled completely differently. That will require > new syscall support, in-kernel purgatory, and so on. > > That should not affect the non-secureboot cases where a user wants to > load their own DTB (or other objects) into memory for the next OS (which > might not be Linux). > > Mark.
On 11/06/14 at 09:43pm, Arun Chandran wrote: > On Thu, Nov 6, 2014 at 8:58 PM, Mark Rutland <mark.rutland@arm.com> wrote: > > On Thu, Nov 06, 2014 at 12:16:12PM +0000, Arun Chandran wrote: > >> Hi Geoff, > >> > >> I am trying this on my hardware (apm-mustang.dtb) > >> > >> On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: > >> > Hi All, > >> > > >> > This series adds the core support for kexec re-boots on arm64. I have tested > >> > with the ARM VE fast model, the ARM Base model and the ARM Foundation > >> > model with various kernel config options for both the first and second stage > >> > kernels. > >> > > >> > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to > >> > kexec-tools [2], which have not yet been merged upstream, are needed. > >> > > >> > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() > >> > routine the ability to switch exception levels from EL1 to EL2 for kernels that > >> > were entered in EL2. > >> > > >> > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree > >> > reserved-memory nodes. > >> > > >> > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the > >> > dcache_line_size macro it defines can be uesd by kexec's relocate kernel > >> > routine. > >> > > >> > Patches 8-10 add the actual kexec support. > >> > > >> > Please consider all patches for inclusion. Any comments or suggestions on how > >> > to improve are welcome. > >> > > >> > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git > >> > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git > >> > > >> > Several things are known to have problems on kexec re-boot: > >> > > >> > spin-table > >> > ---------- > >> > > >> > PROBLEM: The spin-table enable method does not implement all the methods needed > >> > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. > >> > > >> > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method > >> > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' > >> > on the first stage kernel command line. > >> > >> I have CONFIG_SMP=n > >> > >> > > >> > FIX: Upgrade system firmware to provide PSCI enable method support. > >> > > >> > KVM > >> > --- > >> > > >> > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those > >> > resources on shutdown, so the first stage kernel cannot be shutdown properly. > >> > > >> > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. > >> > >> KVM also disabled. > >> > >> /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux > >> --dtb=apm-mustang.dtb --command-line= > >> "root=/dev/nfs rw > >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> earlyprintk=uart8250-32bit,0x1c020000" > >> > >> kexec version: 14.10.21.16.36-ga38e0a6 > >> arch_process_options:85: command_line: root=/dev/nfs rw > >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> earlyprintk=uart8250-32bit,0x1 > >> c020000 > >> arch_process_options:87: initrd: (null) > >> arch_process_options:88: dtb: apm-mustang.dtb > >> arch_process_options:89: lite: 1 > >> kernel: 0x7f756e7010 kernel_size: 0x488a308 > >> Modified cmdline: root=/dev/nfs > >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing > >> from purgatory is diabled > >> get_memory_ranges_dt:638: node_1516 memory > >> get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 > >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> get_memory_ranges_dt:678: Success > >> elf_arm64_load: PE format: yes > >> p_vaddr: ffffffc000080000 > >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) > >> -> 0000004000080000 - 00000040006fb000 (0067b000) > >> elf_arm64_load: text_offset: 0000000000080000 > >> elf_arm64_load: image_size: 000000000067f000 > >> elf_arm64_load: page_offset: ffffffc000000000 > >> elf_arm64_load: memstart: 0000004000000000 > >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 > >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> Modified cmdline:root=/dev/nfs rw > >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> earlyprintk=uart8250-32bit,0x1c020000 > >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing > >> from purgatory is diabled > >> read_cpu_info: dtb_1 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_1 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_1 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_1 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_1 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_1 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_1 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_1 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', > >> cpu-release-addr 400000fff8 > >> read_cpu_info: dtb_2 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', > >> cpu-release-addr 400000fff8 > >> check_cpu_properties: hwid-0: OK > >> check_cpu_properties: hwid-1: OK > >> check_cpu_properties: hwid-100: OK > >> check_cpu_properties: hwid-101: OK > >> check_cpu_properties: hwid-200: OK > >> check_cpu_properties: hwid-201: OK > >> check_cpu_properties: hwid-300: OK > >> check_cpu_properties: hwid-301: OK > >> dtb: base 4000700000, size 221ch (8732) > >> add_segment_phys_virt: 0000000024c14c90 - 0000000024c16eac (0000221c) > >> -> 0000004000700000 - 0000004000703000 (00003000) > >> kexec_load: entry = 0x4000080000 flags = 0xb70000 > >> nr_segments = 2 > >> segment[0].buf = 0x7f756f7010 > >> segment[0].bufsz = 0x645f60 > >> segment[0].mem = 0x4000080000 > >> segment[0].memsz = 0x67b000 > >> segment[1].buf = 0x24c14c90 > >> segment[1].bufsz = 0x221c > >> segment[1].mem = 0x4000700000 > >> segment[1].memsz = 0x3000 > >> > >> root@genericarmv8:~# /usr/local/sbin/kexec --lite -e > >> kexec version: 14.10.21.16.36-ga38e0a6 > >> arch_process_options:85: command_line: (null) > >> arch_process_options:87: initrd: (null) > >> arch_process_options:88: dtb: (null) > >> arch_process_options:89: lite: 1 > >> sd 0:0:0:0: [sda] Synchronizing SCSI cache > >> kexec: Starting new kernel > >> Bye! > >> > >> It fails to come up. In debugger I can see > >> > >> Core number : 0 > >> Core state : debug (AArch64 EL1) > >> Debug entry cause : External Debug Request > >> Current PC : 0xffffffc000083200 > > > > That's a kernel virtual address, and it looks to be somewhere early in > > the boot path, given it's close to PAGE_OFFSET + TEXT_OFFSET. > > > Yes. My earlyconsole setting was wrong that is why I did not see > anything in console for the kexec rebooting. > > With correct earlycon setting I can see Arun, could you share how did you get the working earlycon? We might have same problem.. Thanks Dave
Hi dave, On Mon, Nov 10, 2014 at 12:47 PM, Dave Young <dyoung@redhat.com> wrote: > On 11/06/14 at 09:43pm, Arun Chandran wrote: >> On Thu, Nov 6, 2014 at 8:58 PM, Mark Rutland <mark.rutland@arm.com> wrote: >> > On Thu, Nov 06, 2014 at 12:16:12PM +0000, Arun Chandran wrote: >> >> Hi Geoff, >> >> >> >> I am trying this on my hardware (apm-mustang.dtb) >> >> >> >> On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: >> >> > Hi All, >> >> > >> >> > This series adds the core support for kexec re-boots on arm64. I have tested >> >> > with the ARM VE fast model, the ARM Base model and the ARM Foundation >> >> > model with various kernel config options for both the first and second stage >> >> > kernels. >> >> > >> >> > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to >> >> > kexec-tools [2], which have not yet been merged upstream, are needed. >> >> > >> >> > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() >> >> > routine the ability to switch exception levels from EL1 to EL2 for kernels that >> >> > were entered in EL2. >> >> > >> >> > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree >> >> > reserved-memory nodes. >> >> > >> >> > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the >> >> > dcache_line_size macro it defines can be uesd by kexec's relocate kernel >> >> > routine. >> >> > >> >> > Patches 8-10 add the actual kexec support. >> >> > >> >> > Please consider all patches for inclusion. Any comments or suggestions on how >> >> > to improve are welcome. >> >> > >> >> > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git >> >> > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git >> >> > >> >> > Several things are known to have problems on kexec re-boot: >> >> > >> >> > spin-table >> >> > ---------- >> >> > >> >> > PROBLEM: The spin-table enable method does not implement all the methods needed >> >> > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. >> >> > >> >> > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method >> >> > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' >> >> > on the first stage kernel command line. >> >> >> >> I have CONFIG_SMP=n >> >> >> >> > >> >> > FIX: Upgrade system firmware to provide PSCI enable method support. >> >> > >> >> > KVM >> >> > --- >> >> > >> >> > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those >> >> > resources on shutdown, so the first stage kernel cannot be shutdown properly. >> >> > >> >> > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. >> >> >> >> KVM also disabled. >> >> >> >> /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux >> >> --dtb=apm-mustang.dtb --command-line= >> >> "root=/dev/nfs rw >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> >> earlyprintk=uart8250-32bit,0x1c020000" >> >> >> >> kexec version: 14.10.21.16.36-ga38e0a6 >> >> arch_process_options:85: command_line: root=/dev/nfs rw >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> >> earlyprintk=uart8250-32bit,0x1 >> >> c020000 >> >> arch_process_options:87: initrd: (null) >> >> arch_process_options:88: dtb: apm-mustang.dtb >> >> arch_process_options:89: lite: 1 >> >> kernel: 0x7f756e7010 kernel_size: 0x488a308 >> >> Modified cmdline: root=/dev/nfs >> >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing >> >> from purgatory is diabled >> >> get_memory_ranges_dt:638: node_1516 memory >> >> get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 >> >> get_memory_ranges_dt:678: Success >> >> elf_arm64_load: PE format: yes >> >> p_vaddr: ffffffc000080000 >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> >> add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) >> >> -> 0000004000080000 - 00000040006fb000 (0067b000) >> >> elf_arm64_load: text_offset: 0000000000080000 >> >> elf_arm64_load: image_size: 000000000067f000 >> >> elf_arm64_load: page_offset: ffffffc000000000 >> >> elf_arm64_load: memstart: 0000004000000000 >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> >> elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 >> >> Modified cmdline:root=/dev/nfs rw >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 >> >> earlyprintk=uart8250-32bit,0x1c020000 >> >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing >> >> from purgatory is diabled >> >> read_cpu_info: dtb_1 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_1 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_1 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_1 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_1 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_1 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_1 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_1 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> read_cpu_info: dtb_2 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', >> >> cpu-release-addr 400000fff8 >> >> check_cpu_properties: hwid-0: OK >> >> check_cpu_properties: hwid-1: OK >> >> check_cpu_properties: hwid-100: OK >> >> check_cpu_properties: hwid-101: OK >> >> check_cpu_properties: hwid-200: OK >> >> check_cpu_properties: hwid-201: OK >> >> check_cpu_properties: hwid-300: OK >> >> check_cpu_properties: hwid-301: OK >> >> dtb: base 4000700000, size 221ch (8732) >> >> add_segment_phys_virt: 0000000024c14c90 - 0000000024c16eac (0000221c) >> >> -> 0000004000700000 - 0000004000703000 (00003000) >> >> kexec_load: entry = 0x4000080000 flags = 0xb70000 >> >> nr_segments = 2 >> >> segment[0].buf = 0x7f756f7010 >> >> segment[0].bufsz = 0x645f60 >> >> segment[0].mem = 0x4000080000 >> >> segment[0].memsz = 0x67b000 >> >> segment[1].buf = 0x24c14c90 >> >> segment[1].bufsz = 0x221c >> >> segment[1].mem = 0x4000700000 >> >> segment[1].memsz = 0x3000 >> >> >> >> root@genericarmv8:~# /usr/local/sbin/kexec --lite -e >> >> kexec version: 14.10.21.16.36-ga38e0a6 >> >> arch_process_options:85: command_line: (null) >> >> arch_process_options:87: initrd: (null) >> >> arch_process_options:88: dtb: (null) >> >> arch_process_options:89: lite: 1 >> >> sd 0:0:0:0: [sda] Synchronizing SCSI cache >> >> kexec: Starting new kernel >> >> Bye! >> >> >> >> It fails to come up. In debugger I can see >> >> >> >> Core number : 0 >> >> Core state : debug (AArch64 EL1) >> >> Debug entry cause : External Debug Request >> >> Current PC : 0xffffffc000083200 >> > >> > That's a kernel virtual address, and it looks to be somewhere early in >> > the boot path, given it's close to PAGE_OFFSET + TEXT_OFFSET. >> > >> Yes. My earlyconsole setting was wrong that is why I did not see >> anything in console for the kexec rebooting. >> >> With correct earlycon setting I can see > > Arun, could you share how did you get the working earlycon? We might have > same problem.. I invoked kexec like this root@genericarmv8:~# ./usr/local/sbin/kexec --lite -l vmlinux --command-line= "root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlycon=uart8250,mmio32,0x1c020000" Earlier my command line parameters for earlycon was wrong hence I was not getting earlyconsole. Also have a look at http://lists.infradead.org/pipermail/linux-arm-kernel/2014-November/300584.html If your board has an L3 cache; you may have to apply the modification in the above link. --Arun
[snip] > >> Yes. My earlyconsole setting was wrong that is why I did not see > >> anything in console for the kexec rebooting. > >> > >> With correct earlycon setting I can see > > > > Arun, could you share how did you get the working earlycon? We might have > > same problem.. > > I invoked kexec like this > > root@genericarmv8:~# ./usr/local/sbin/kexec --lite -l vmlinux --command-line= > "root=/dev/nfs rw > nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > earlycon=uart8250,mmio32,0x1c020000" > > > Earlier my command line parameters for earlycon was wrong hence I was > not getting > earlyconsole. > > Also have a look at > http://lists.infradead.org/pipermail/linux-arm-kernel/2014-November/300584.html > > If your board has an L3 cache; you may have to apply the modification in the > above link. Arun, thanks a lot. Will try when I get a machine this week. Dave
Hi, Arun On 11/10/14 at 02:05pm, Arun Chandran wrote: > Hi dave, > > On Mon, Nov 10, 2014 at 12:47 PM, Dave Young <dyoung@redhat.com> wrote: > > On 11/06/14 at 09:43pm, Arun Chandran wrote: > >> On Thu, Nov 6, 2014 at 8:58 PM, Mark Rutland <mark.rutland@arm.com> wrote: > >> > On Thu, Nov 06, 2014 at 12:16:12PM +0000, Arun Chandran wrote: > >> >> Hi Geoff, > >> >> > >> >> I am trying this on my hardware (apm-mustang.dtb) > >> >> > >> >> On Fri, Oct 24, 2014 at 4:40 AM, Geoff Levand <geoff@infradead.org> wrote: > >> >> > Hi All, > >> >> > > >> >> > This series adds the core support for kexec re-boots on arm64. I have tested > >> >> > with the ARM VE fast model, the ARM Base model and the ARM Foundation > >> >> > model with various kernel config options for both the first and second stage > >> >> > kernels. > >> >> > > >> >> > To load a second stage kernel and execute a kexec re-boot on arm64 my patches to > >> >> > kexec-tools [2], which have not yet been merged upstream, are needed. > >> >> > > >> >> > Patches 1-4 rework the arm64 hcall mechanism to give the arm64 soft_restart() > >> >> > routine the ability to switch exception levels from EL1 to EL2 for kernels that > >> >> > were entered in EL2. > >> >> > > >> >> > Patches 5 and 6 convert the use of device tree /memreserve/ to device tree > >> >> > reserved-memory nodes. > >> >> > > >> >> > Patch 7 moves proc-macros.S from arm64/mm to arm64/include/asm so that the > >> >> > dcache_line_size macro it defines can be uesd by kexec's relocate kernel > >> >> > routine. > >> >> > > >> >> > Patches 8-10 add the actual kexec support. > >> >> > > >> >> > Please consider all patches for inclusion. Any comments or suggestions on how > >> >> > to improve are welcome. > >> >> > > >> >> > [1] https://git.linaro.org/people/geoff.levand/linux-kexec.git > >> >> > [2] https://git.linaro.org/people/geoff.levand/kexec-tools.git > >> >> > > >> >> > Several things are known to have problems on kexec re-boot: > >> >> > > >> >> > spin-table > >> >> > ---------- > >> >> > > >> >> > PROBLEM: The spin-table enable method does not implement all the methods needed > >> >> > for CPU hot-plug, so the first stage kernel cannot be shutdown properly. > >> >> > > >> >> > WORK-AROUND: Upgrade to system firmware that provides PSCI enable method > >> >> > support, OR build the first stage kernel with CONFIG_SMP=n, OR pass 'maxcpus=1' > >> >> > on the first stage kernel command line. > >> >> > >> >> I have CONFIG_SMP=n > >> >> > >> >> > > >> >> > FIX: Upgrade system firmware to provide PSCI enable method support. > >> >> > > >> >> > KVM > >> >> > --- > >> >> > > >> >> > PROBLEM: KVM acquires hypervisor resources on startup, but does not free those > >> >> > resources on shutdown, so the first stage kernel cannot be shutdown properly. > >> >> > > >> >> > WORK-AROUND: Build the first stage kernel with CONFIG_KVM=n. > >> >> > >> >> KVM also disabled. > >> >> > >> >> /root@genericarmv8:~# usr/local/sbin/kexec --lite -l vmlinux > >> >> --dtb=apm-mustang.dtb --command-line= > >> >> "root=/dev/nfs rw > >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> >> earlyprintk=uart8250-32bit,0x1c020000" > >> >> > >> >> kexec version: 14.10.21.16.36-ga38e0a6 > >> >> arch_process_options:85: command_line: root=/dev/nfs rw > >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> >> earlyprintk=uart8250-32bit,0x1 > >> >> c020000 > >> >> arch_process_options:87: initrd: (null) > >> >> arch_process_options:88: dtb: apm-mustang.dtb > >> >> arch_process_options:89: lite: 1 > >> >> kernel: 0x7f756e7010 kernel_size: 0x488a308 > >> >> Modified cmdline: root=/dev/nfs > >> >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing > >> >> from purgatory is diabled > >> >> get_memory_ranges_dt:638: node_1516 memory > >> >> get_memory_ranges_dt:664: RAM: 0000004000000000 - 0000004400000000 > >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> >> get_memory_ranges_dt:659: SKIP: 0000000000000000 - 0000000000000000 > >> >> get_memory_ranges_dt:678: Success > >> >> elf_arm64_load: PE format: yes > >> >> p_vaddr: ffffffc000080000 > >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> >> add_segment_phys_virt: 0000007f756f7010 - 0000007f75d3cf70 (00645f60) > >> >> -> 0000004000080000 - 00000040006fb000 (0067b000) > >> >> elf_arm64_load: text_offset: 0000000000080000 > >> >> elf_arm64_load: image_size: 000000000067f000 > >> >> elf_arm64_load: page_offset: ffffffc000000000 > >> >> elf_arm64_load: memstart: 0000004000000000 > >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> >> elf_arm64_load: e_entry: ffffffc000080000 -> 0000004000080000 > >> >> virt_to_phys: ffffffc000080000 -> 0000004000080000 > >> >> Modified cmdline:root=/dev/nfs rw > >> >> nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > >> >> ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > >> >> earlyprintk=uart8250-32bit,0x1c020000 > >> >> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing > >> >> from purgatory is diabled > >> >> read_cpu_info: dtb_1 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_1 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_1 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_1 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_1 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_1 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_1 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_1 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-0 (/cpus/cpu@000): hwid-0, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-1 (/cpus/cpu@001): hwid-1, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-2 (/cpus/cpu@100): hwid-100, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-3 (/cpus/cpu@101): hwid-101, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-4 (/cpus/cpu@200): hwid-200, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-5 (/cpus/cpu@201): hwid-201, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-6 (/cpus/cpu@300): hwid-300, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> read_cpu_info: dtb_2 cpu-7 (/cpus/cpu@301): hwid-301, 'spin-table', > >> >> cpu-release-addr 400000fff8 > >> >> check_cpu_properties: hwid-0: OK > >> >> check_cpu_properties: hwid-1: OK > >> >> check_cpu_properties: hwid-100: OK > >> >> check_cpu_properties: hwid-101: OK > >> >> check_cpu_properties: hwid-200: OK > >> >> check_cpu_properties: hwid-201: OK > >> >> check_cpu_properties: hwid-300: OK > >> >> check_cpu_properties: hwid-301: OK > >> >> dtb: base 4000700000, size 221ch (8732) > >> >> add_segment_phys_virt: 0000000024c14c90 - 0000000024c16eac (0000221c) > >> >> -> 0000004000700000 - 0000004000703000 (00003000) > >> >> kexec_load: entry = 0x4000080000 flags = 0xb70000 > >> >> nr_segments = 2 > >> >> segment[0].buf = 0x7f756f7010 > >> >> segment[0].bufsz = 0x645f60 > >> >> segment[0].mem = 0x4000080000 > >> >> segment[0].memsz = 0x67b000 > >> >> segment[1].buf = 0x24c14c90 > >> >> segment[1].bufsz = 0x221c > >> >> segment[1].mem = 0x4000700000 > >> >> segment[1].memsz = 0x3000 > >> >> > >> >> root@genericarmv8:~# /usr/local/sbin/kexec --lite -e > >> >> kexec version: 14.10.21.16.36-ga38e0a6 > >> >> arch_process_options:85: command_line: (null) > >> >> arch_process_options:87: initrd: (null) > >> >> arch_process_options:88: dtb: (null) > >> >> arch_process_options:89: lite: 1 > >> >> sd 0:0:0:0: [sda] Synchronizing SCSI cache > >> >> kexec: Starting new kernel > >> >> Bye! > >> >> > >> >> It fails to come up. In debugger I can see > >> >> > >> >> Core number : 0 > >> >> Core state : debug (AArch64 EL1) > >> >> Debug entry cause : External Debug Request > >> >> Current PC : 0xffffffc000083200 > >> > > >> > That's a kernel virtual address, and it looks to be somewhere early in > >> > the boot path, given it's close to PAGE_OFFSET + TEXT_OFFSET. > >> > > >> Yes. My earlyconsole setting was wrong that is why I did not see > >> anything in console for the kexec rebooting. > >> > >> With correct earlycon setting I can see > > > > Arun, could you share how did you get the working earlycon? We might have > > same problem.. > > I invoked kexec like this > > root@genericarmv8:~# ./usr/local/sbin/kexec --lite -l vmlinux --command-line= > "root=/dev/nfs rw > nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 > ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 > earlycon=uart8250,mmio32,0x1c020000" > > > Earlier my command line parameters for earlycon was wrong hence I was > not getting > earlyconsole. > > Also have a look at > http://lists.infradead.org/pipermail/linux-arm-kernel/2014-November/300584.html > > If your board has an L3 cache; you may have to apply the modification in the > above link. Tested with CONFIG_KVM=n + maxcpus=1, I got a working earlycon with exact same cmdline as what you are using. But I got some panic which is caused by uefi init, I will try Ard's uefi stable mapping patchset. (Ccing him) Thanks Dave
Hi Arun, Dave, On Fri, 2014-11-07 at 16:31 +0530, Arun Chandran wrote: > I have issue in flushing with L3 cache on . kexec does not reboot with L3. > Is the above logic of flushing (clean + invalidate) after copying the > image data is correct? Is there exist the danger of image data being > overwritten by invalid cache lines? > > I tried with only invalidate(dc ivac); but that also not seems to be > working. > > Finally I moved the entire flushing logic before copying then > things started working. Please see the code change below. I added this with some minor changes. Please let me know if it works for you. -Geoff
Hi Mark, On Fri, 2014-10-24 at 11:28 +0100, Mark Rutland wrote: > > +++ b/arch/arm64/Kconfig > > @@ -313,6 +313,15 @@ config ARCH_HAS_CACHE_LINE_SIZE > > > > source "mm/Kconfig" > > > > +config KEXEC > > + depends on (!SMP || PM_SLEEP_SMP) > > In its current state this also depends on !KVM && !EFI (technically you > could detect those cases at runtime, but I don't see that in this > series). A kernel built with CONFIG_EFI is OK if run on a non-EFI system or without using a system's EFI support. I added a patch that adds runtime checks in the kexec_load syscall path to print a message and return failure for situations where KVM or EFI won't work. > > +/** > > + * machine_kexec_prepare - Prepare for a kexec reboot. > > + * > > + * Called from the core kexec code when a kernel image is loaded. > > + */ > > + > > +int machine_kexec_prepare(struct kimage *image) > > +{ > > + const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image); > > + > > + if (!dtb_seg) > > + pr_warn("%s: No device tree segment found.\n", __func__); > > + > > + arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0; > > + arm64_kexec_kimage_start = image->start; > > + > > + return 0; > > +} > > I thought all of the DTB handling was moving to purgatory? Non-purgatory booting is needed for kexec-lite. We can do this simple check here which optionally sets x0 to the dtb address to support that. The other solution is to have a trampoline in kexec-lite that sets x0 (basically an absolute minimal purgatory), but I think to do it here is nicer, and is also the same way that the arm arch code does it. Maybe removing this pr_warn message and just relying on the kexec_image_info() output would be better. > > +/** > > + * kexec_list_flush - Helper to flush the kimage list to PoC. > > + */ > > + > > +static void kexec_list_flush(unsigned long kimage_head) > > +{ > > + void *dest; > > + unsigned long *entry; > > + > > + for (entry = &kimage_head, dest = NULL; ; entry++) { > > + unsigned int flag = *entry & > > + (IND_DESTINATION | IND_INDIRECTION | IND_DONE | > > + IND_SOURCE); > > + void *addr = phys_to_virt(*entry & PAGE_MASK); > > + > > + switch (flag) { > > + case IND_INDIRECTION: > > + entry = (unsigned long *)addr - 1; > > + __flush_dcache_area(addr, PAGE_SIZE); > > + break; > > + case IND_DESTINATION: > > + dest = addr; > > + break; > > + case IND_SOURCE: > > + __flush_dcache_area(addr, PAGE_SIZE); > > + dest += PAGE_SIZE; > > + break; > > + case IND_DONE: > > + return; > > + default: > > + break; > > Can an image ever have no flags? Given the presence of IND_NONE I'd > assume not, so this looks like a candidate for a BUG(). Sure, I guess things will blow up before it ever gets here though. > > > + } > > + } > > +} > > + > > +/** > > + * machine_kexec - Do the kexec reboot. > > + * > > + * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC. > > + */ > > + > > +void machine_kexec(struct kimage *image) > > +{ > > + phys_addr_t reboot_code_buffer_phys; > > + void *reboot_code_buffer; > > + > > + BUG_ON(num_online_cpus() > 1); > > + > > + arm64_kexec_kimage_head = image->head; > > + > > + reboot_code_buffer_phys = page_to_phys(image->control_code_page); > > + reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); > > + > > + /* > > + * Copy relocate_new_kernel to the reboot_code_buffer for use > > + * after the kernel is shut down. > > + */ > > + > > + memcpy(reboot_code_buffer, relocate_new_kernel, > > + relocate_new_kernel_size); > > Can we get rid of the line gaps between comments and the single function > calls they apply to, please? I realise it's a minor thing, but this > looks rather inconsistent with the rest of arch/arm64/. checkpatch doesn't seem to mind, but sure, I can do that. > > + > > + /* Flush the reboot_code_buffer in preparation for its execution. */ > > + > > + __flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size); > > That code should already be at the PoC per the boot protocol (the entire > kernel image should have been clean to the PoC at boot, so the > instructions forming relocate_new_kernel are globally visible). > > From the looks of it you only need to flush the variables at the very > end. We copy the relocate_new_kernel routine to reboot_code_buffer, which is a buffer allocated by the kexec core with alloc_pages(). That copy of relocate_new_kernel is what we are flushing here. I believe we need to flush that buffer out to PoC so we can execute the code it contains after cpu_soft_restart(). > > --- /dev/null > > +++ b/arch/arm64/kernel/relocate_kernel.S > > @@ -0,0 +1,184 @@ > > +/* > > + * kexec for arm64 > > + * > > + * Copyright (C) Linaro. > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License version 2 as > > + * published by the Free Software Foundation. > > + */ > > + > > +#include <asm/assembler.h> > > +#include <asm/kexec.h> > > +#include <asm/memory.h> > > +#include <asm/page.h> > > +#include <asm/proc-macros.S> > > + > > +/* The list entry flags. */ > > + > > +#define IND_DESTINATION_BIT 0 > > +#define IND_INDIRECTION_BIT 1 > > +#define IND_DONE_BIT 2 > > +#define IND_SOURCE_BIT 3 > > As previously, I think these need to be moved into a common header, and > defined in terms of the existing IND_* macros (or vice-versa). I believe > you had a patch doing so; what's the status of that? Still working to get it merged: https://lkml.org/lkml/2014/11/12/675 > > +/* > > + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it. > > + * > > + * The memory that the old kernel occupies may be overwritten when coping the > > + * new image to its final location. To assure that the relocate_new_kernel > > + * routine which does that copy is not overwritten all code and data needed > > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and > > + * relocate_new_kernel_end. The machine_kexec() routine will copy > > + * relocate_new_kernel to the kexec control_code_page, a special page which > > + * has been set up to be preserved during the copy operation. > > + */ > > + > > +.globl relocate_new_kernel > > +relocate_new_kernel: ... > > + > > + /* start_new_image */ > > + > > + ldr x4, arm64_kexec_kimage_start > > + ldr x0, arm64_kexec_dtb_addr > > + mov x1, xzr > > + mov x2, xzr > > + mov x3, xzr > > + br x4 > > This last part should be in userspace-provided purgatory. If you have > purgatory code which does this then we should be able to rely on that, > and we don't have to try to maintain this DTB handling in kernelspace > (which I suspect may become painful as the boot protocol evolves). I think the putting the dtb address in x0 is already fixed. There are users with firmware that does this and any change to the boot protocol will have to work with it. As I mentioned above, we need a solution for non-purgatory re-boot and I think this is the best way. Thanks for taking the time to review. I'll post an updated patch set soon. -Geoff
On 10/31/14 at 04:25pm, Geoff Levand wrote: > Hi Dave, > > Thanks for testing. > > On Fri, 2014-10-31 at 15:52 +0800, Dave Young wrote: > > I tested your patches. The macihne is using spin-table cpu enable method > > so I tried maxcpus=1 as you suggested. > > > > There's below issues for me, thoughts? > > > > 1. For acpi booting there's no /proc/device-tree so kexec can not find dtb > > to use. > > You can supply a DTB with the kexec --dtb option, then kexec will not need > /proc/device-tree. Please try if you can and let me know what happens. > In your kexec-tools code, by default it will create dtb1 from procfs, as for external --dtb it will be dtb2 for comparing with dtb1. Since dtb1 is a must so it will not work without /proc/device-tree. There had some discussion about this in the thread. But I think we have no conclusion where to go. There maybe below possibilities: 1) unflattern dtb both for EFI/acpi booting even if acpi boot does not need it. 2) copy the bootloader passed dtb stub to somewhere, and export the stub to userspace ie. in debugfs or procfs like /proc/saved_dtb Thoughts to continue the discuss? Thanks Dave
Hi Geoff, On Thu, Nov 13, 2014 at 3:24 AM, Geoff Levand <geoff@infradead.org> wrote: > Hi Arun, Dave, > > On Fri, 2014-11-07 at 16:31 +0530, Arun Chandran wrote: >> I have issue in flushing with L3 cache on . kexec does not reboot with L3. >> Is the above logic of flushing (clean + invalidate) after copying the >> image data is correct? Is there exist the danger of image data being >> overwritten by invalid cache lines? >> >> I tried with only invalidate(dc ivac); but that also not seems to be >> working. >> >> Finally I moved the entire flushing logic before copying then >> things started working. Please see the code change below. > > I added this with some minor changes. Please let me know if > it works for you. > I tested master branch for both LE and BE booting with the script below. genericarmv8# cat /etc/rc5.d/S60reboot.sh -------------------------------------- #!/bin/sh i=$RANDOM j=$(( $i % 2)) count=`cat /home/root/cnt` if [ $j -eq 0 ] ; then echo "KEXEC rebootng to LE" /usr/local/sbin/kexec --lite -l /home/root/vmlinux_LE --command-line="root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlycon=uart8250,mmio32,0x1c020000" echo $RANDOM > /home/root/"$count""_LE" else echo "KEXEC rebootng to BE" /usr/local/sbin/kexec --lite -l /home/root/vmlinux_BE --command-line="root=/dev/nfs rw nfsroot=10.162.103.145:/nfs_root/linaro-image-lamp-genericarmv8b,nfsvers=3 ip=:::::eth0:dhcp panic=1 console=ttyS0,115200 earlycon=uart8250,mmio32,0x1c020000" echo $RANDOM > /home/root/"$count""_BE" fi count=$(( $count + 1 )) echo "$count">/home/root/cnt /usr/local/sbin/kexec --lite -e --------------------------------- It rebooted without any issues for more than 1 hour. --Arun
On Thu, 2014-11-13 at 16:37 +0800, Dave Young wrote: > In your kexec-tools code, by default it will create dtb1 from procfs, as for external > --dtb it will be dtb2 for comparing with dtb1. > > Since dtb1 is a must so it will not work without /proc/device-tree. I changed that to allow for no /proc/device-tree if --dtb is passed since it will also allow a kernel built with CONFIG_PROC_FS=n to do kexec re-boot. Please try it and let me know. > There had some discussion about this in the thread. But I think we have no > conclusion where to go. > > There maybe below possibilities: > 1) unflattern dtb both for EFI/acpi booting even if acpi boot does not need it. > > 2) copy the bootloader passed dtb stub to somewhere, and export the stub to > userspace ie. in debugfs or procfs > like /proc/saved_dtb > > Thoughts to continue the discuss? Some progress has been made, let's see what they come up with. -Geoff
On 11/13/14 at 03:50pm, Geoff Levand wrote: > On Thu, 2014-11-13 at 16:37 +0800, Dave Young wrote: > > In your kexec-tools code, by default it will create dtb1 from procfs, as for external > > --dtb it will be dtb2 for comparing with dtb1. > > > > Since dtb1 is a must so it will not work without /proc/device-tree. > > I changed that to allow for no /proc/device-tree if --dtb is passed > since it will also allow a kernel built with CONFIG_PROC_FS=n to do > kexec re-boot. > > Please try it and let me know. Tested the master branch or your kexec-tools, I got below with "mount -o bind /root /proc" because there's some other issues for my customized kernel to boot: arch_process_options:95: initrd: (null) arch_process_options:96: dtb: apm-mustang.dtb arch_process_options:97: lite: 1 arch_process_options:99: port: 0x0 kernel: 0x3ff9cbb0010 kernel_size: 0x602c947 unrecoverable error: could not scan "/proc/device-tree/": No such file or directory Thanks Dave
On 11/12/14 at 01:54pm, Geoff Levand wrote: > Hi Arun, Dave, > > On Fri, 2014-11-07 at 16:31 +0530, Arun Chandran wrote: > > I have issue in flushing with L3 cache on . kexec does not reboot with L3. > > Is the above logic of flushing (clean + invalidate) after copying the > > image data is correct? Is there exist the danger of image data being > > overwritten by invalid cache lines? > > > > I tried with only invalidate(dc ivac); but that also not seems to be > > working. > > > > Finally I moved the entire flushing logic before copying then > > things started working. Please see the code change below. > > I added this with some minor changes. Please let me know if > it works for you. I did not got a full boot log due to efi issues, but I think it works for me as for this cache issue. For UEFI, seems you applied Ard's original efi stable mapping patch, but he has an update version which solves fw_vendor panic etc. I tried but the patchset can not apply on your tree. I will have no time to test and work on kexec issues this week. Will try next week with more testing. Thanks Dave
On Thu, Nov 13, 2014 at 02:19:48AM +0000, Geoff Levand wrote: > Hi Mark, Hi Geoff, > On Fri, 2014-10-24 at 11:28 +0100, Mark Rutland wrote: > > > +++ b/arch/arm64/Kconfig > > > @@ -313,6 +313,15 @@ config ARCH_HAS_CACHE_LINE_SIZE > > > > > > source "mm/Kconfig" > > > > > > +config KEXEC > > > + depends on (!SMP || PM_SLEEP_SMP) > > > > In its current state this also depends on !KVM && !EFI (technically you > > could detect those cases at runtime, but I don't see that in this > > series). > > A kernel built with CONFIG_EFI is OK if run on a non-EFI system or > without using a system's EFI support. I added a patch that adds > runtime checks in the kexec_load syscall path to print a message > and return failure for situations where KVM or EFI won't work. > > > > +/** > > > + * machine_kexec_prepare - Prepare for a kexec reboot. > > > + * > > > + * Called from the core kexec code when a kernel image is loaded. > > > + */ > > > + > > > +int machine_kexec_prepare(struct kimage *image) > > > +{ > > > + const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image); > > > + > > > + if (!dtb_seg) > > > + pr_warn("%s: No device tree segment found.\n", __func__); > > > + > > > + arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0; > > > + arm64_kexec_kimage_start = image->start; > > > + > > > + return 0; > > > +} > > > > I thought all of the DTB handling was moving to purgatory? > > Non-purgatory booting is needed for kexec-lite. We can do > this simple check here which optionally sets x0 to the dtb > address to support that. The other solution is to have a > trampoline in kexec-lite that sets x0 (basically an absolute > minimal purgatory), but I think to do it here is nicer, and > is also the same way that the arm arch code does it. > > Maybe removing this pr_warn message and just relying on the > kexec_image_info() output would be better. I mentioned previously that I don't think the "kexec-lite" approach is a good one, especially if we're going to have userspace purgatory code anyway. It embeds a policy w.r.t. the segment handling within the kernel, on the assumption of a specific use-case for what is a more general mechanism. Unfortunately secureboot with kexec_file_load will require a kernelspace purgatory and likely special DT handling, but it's already a far more limited interface. > > > +/** > > > + * kexec_list_flush - Helper to flush the kimage list to PoC. > > > + */ > > > + > > > +static void kexec_list_flush(unsigned long kimage_head) > > > +{ > > > + void *dest; > > > + unsigned long *entry; > > > + > > > + for (entry = &kimage_head, dest = NULL; ; entry++) { > > > + unsigned int flag = *entry & > > > + (IND_DESTINATION | IND_INDIRECTION | IND_DONE | > > > + IND_SOURCE); > > > + void *addr = phys_to_virt(*entry & PAGE_MASK); > > > + > > > + switch (flag) { > > > + case IND_INDIRECTION: > > > + entry = (unsigned long *)addr - 1; > > > + __flush_dcache_area(addr, PAGE_SIZE); > > > + break; > > > + case IND_DESTINATION: > > > + dest = addr; > > > + break; > > > + case IND_SOURCE: > > > + __flush_dcache_area(addr, PAGE_SIZE); > > > + dest += PAGE_SIZE; > > > + break; > > > + case IND_DONE: > > > + return; > > > + default: > > > + break; > > > > Can an image ever have no flags? Given the presence of IND_NONE I'd > > assume not, so this looks like a candidate for a BUG(). > > Sure, I guess things will blow up before it ever gets here though. I would hope so. If we trigger the BUG() we know otherwise. > > > > > > + } > > > + } > > > +} > > > + > > > +/** > > > + * machine_kexec - Do the kexec reboot. > > > + * > > > + * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC. > > > + */ > > > + > > > +void machine_kexec(struct kimage *image) > > > +{ > > > + phys_addr_t reboot_code_buffer_phys; > > > + void *reboot_code_buffer; > > > + > > > + BUG_ON(num_online_cpus() > 1); > > > + > > > + arm64_kexec_kimage_head = image->head; > > > + > > > + reboot_code_buffer_phys = page_to_phys(image->control_code_page); > > > + reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); > > > + > > > + /* > > > + * Copy relocate_new_kernel to the reboot_code_buffer for use > > > + * after the kernel is shut down. > > > + */ > > > + > > > + memcpy(reboot_code_buffer, relocate_new_kernel, > > > + relocate_new_kernel_size); > > > > Can we get rid of the line gaps between comments and the single function > > calls they apply to, please? I realise it's a minor thing, but this > > looks rather inconsistent with the rest of arch/arm64/. > > checkpatch doesn't seem to mind, but sure, I can do that. Cheers. > > > + > > > + /* Flush the reboot_code_buffer in preparation for its execution. */ > > > + > > > + __flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size); > > > > That code should already be at the PoC per the boot protocol (the entire > > kernel image should have been clean to the PoC at boot, so the > > instructions forming relocate_new_kernel are globally visible). > > > > From the looks of it you only need to flush the variables at the very > > end. > > We copy the relocate_new_kernel routine to reboot_code_buffer, which is > a buffer allocated by the kexec core with alloc_pages(). That copy of > relocate_new_kernel is what we are flushing here. > > I believe we need to flush that buffer out to PoC so we can execute > the code it contains after cpu_soft_restart(). Apologies. I'd evidently confused myself here regarding what was being flushed. > > > > --- /dev/null > > > +++ b/arch/arm64/kernel/relocate_kernel.S > > > @@ -0,0 +1,184 @@ > > > +/* > > > + * kexec for arm64 > > > + * > > > + * Copyright (C) Linaro. > > > + * > > > + * This program is free software; you can redistribute it and/or modify > > > + * it under the terms of the GNU General Public License version 2 as > > > + * published by the Free Software Foundation. > > > + */ > > > + > > > +#include <asm/assembler.h> > > > +#include <asm/kexec.h> > > > +#include <asm/memory.h> > > > +#include <asm/page.h> > > > +#include <asm/proc-macros.S> > > > + > > > +/* The list entry flags. */ > > > + > > > +#define IND_DESTINATION_BIT 0 > > > +#define IND_INDIRECTION_BIT 1 > > > +#define IND_DONE_BIT 2 > > > +#define IND_SOURCE_BIT 3 > > > > As previously, I think these need to be moved into a common header, and > > defined in terms of the existing IND_* macros (or vice-versa). I believe > > you had a patch doing so; what's the status of that? > > Still working to get it merged: > > https://lkml.org/lkml/2014/11/12/675 Ok. Let's hope that goes through soon. > > > > +/* > > > + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it. > > > + * > > > + * The memory that the old kernel occupies may be overwritten when coping the > > > + * new image to its final location. To assure that the relocate_new_kernel > > > + * routine which does that copy is not overwritten all code and data needed > > > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and > > > + * relocate_new_kernel_end. The machine_kexec() routine will copy > > > + * relocate_new_kernel to the kexec control_code_page, a special page which > > > + * has been set up to be preserved during the copy operation. > > > + */ > > > + > > > +.globl relocate_new_kernel > > > +relocate_new_kernel: > > ... > > > > + > > > + /* start_new_image */ > > > + > > > + ldr x4, arm64_kexec_kimage_start > > > + ldr x0, arm64_kexec_dtb_addr > > > + mov x1, xzr > > > + mov x2, xzr > > > + mov x3, xzr > > > + br x4 > > > > This last part should be in userspace-provided purgatory. If you have > > purgatory code which does this then we should be able to rely on that, > > and we don't have to try to maintain this DTB handling in kernelspace > > (which I suspect may become painful as the boot protocol evolves). > > I think the putting the dtb address in x0 is already fixed. There are > users with firmware that does this and any change to the boot protocol > will have to work with it. Sure, but that is the _Linux_ boot protocol, and the Kconfig description of kexec stats "you can start any kernel with it, not just Linux". Why should we embed Linux-specific details into a supposedly generic mechanism? We may also extend the boot protocol, and I would rather not have to manage the complexity of each possible extension within the kernel, especially given that the only context we can pass in kexec is segments. > As I mentioned above, we need a solution for non-purgatory re-boot and I > think this is the best way. Why do we need a solution for "non-purgatory re-boot"? As far as I can see this is a non-problem. Thanks, Mark.
Hi Mark, On Thu, Nov 13, 2014 at 02:19:48AM +0000, Geoff Levand wrote: > > On Fri, 2014-10-24 at 11:28 +0100, Mark Rutland wrote: > > > > +/** > > > > + * machine_kexec_prepare - Prepare for a kexec reboot. > > > > + * > > > > + * Called from the core kexec code when a kernel image is loaded. > > > > + */ > > > > + > > > > +int machine_kexec_prepare(struct kimage *image) > > > > +{ > > > > + const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image); > > > > + > > > > + if (!dtb_seg) > > > > + pr_warn("%s: No device tree segment found.\n", __func__); > > > > + > > > > + arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0; > > > > + arm64_kexec_kimage_start = image->start; > > > > + > > > > + return 0; > > > > +} > > > > > > I thought all of the DTB handling was moving to purgatory? > > > > Non-purgatory booting is needed for kexec-lite. We can do > > this simple check here which optionally sets x0 to the dtb > > address to support that. The other solution is to have a > > trampoline in kexec-lite that sets x0 (basically an absolute > > minimal purgatory), but I think to do it here is nicer, and > > is also the same way that the arm arch code does it. > > > > Maybe removing this pr_warn message and just relying on the > > kexec_image_info() output would be better. > > I mentioned previously that I don't think the "kexec-lite" approach is a > good one, especially if we're going to have userspace purgatory code > anyway. It embeds a policy w.r.t. the segment handling within the > kernel, on the assumption of a specific use-case for what is a more > general mechanism. I don't think this support embeds a policy. It is completely optional. If one of the kexec segments is found to have a dtb header at its start the address of that segment is put into x0 so that it is available to the code that control is passed to. That code is free to use the value or not. In the case of the current kexec-tools implementation for example, its purgatory does not use that value in x0 since the address of the dtb is known to the purgatory code through its arm64_dtb_addr variable. One motivation for kexec-lite was to avoid the complicated user space of a purgatory when it wasn't really needed. From what I understand, kexec-lite is shipping to customers, so there is at least a desire for it on other architectures which I believe are in the same market as 64 bit ARM servers. Also, just to mention it, the arm (32 bit) arch provides a similar facility in its kexec kernel code, by setting r2 to the address of the dtb, and there doesn't seem to be any concern over that. I can't see any negative effect of setting x0 in this way. If a user space loader needs or wants to do something different it is completely free to ignore the value the 1st stage kernel has put into x0. If the boot protocol is changed new kernels will still need to be able to boot from old loaders, and old kernels from new loaders. Depending on what the protocol change introduces we can decide if it makes sense to update this part of kexec. If you can describe a clear situation where this would cause a problem we should remove it, but if the choice is to remove support that users want to provide kernel developers some flexibility that may not be needed, then I think we should keep it in. > Unfortunately secureboot with kexec_file_load will require a kernelspace > purgatory and likely special DT handling, but it's already a far more > limited interface. ... > > > > +/* > > > > + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it. > > > > + * > > > > + * The memory that the old kernel occupies may be overwritten when coping the > > > > + * new image to its final location. To assure that the relocate_new_kernel > > > > + * routine which does that copy is not overwritten all code and data needed > > > > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and > > > > + * relocate_new_kernel_end. The machine_kexec() routine will copy > > > > + * relocate_new_kernel to the kexec control_code_page, a special page which > > > > + * has been set up to be preserved during the copy operation. > > > > + */ > > > > + > > > > +.globl relocate_new_kernel > > > > +relocate_new_kernel: > > > > ... > > > > > > + > > > > + /* start_new_image */ > > > > + > > > > + ldr x4, arm64_kexec_kimage_start > > > > + ldr x0, arm64_kexec_dtb_addr > > > > + mov x1, xzr > > > > + mov x2, xzr > > > > + mov x3, xzr > > > > + br x4 > > > > > > This last part should be in userspace-provided purgatory. If you have > > > purgatory code which does this then we should be able to rely on that, > > > and we don't have to try to maintain this DTB handling in kernelspace > > > (which I suspect may become painful as the boot protocol evolves). > > > > I think the putting the dtb address in x0 is already fixed. There are > > users with firmware that does this and any change to the boot protocol > > will have to work with it. > > Sure, but that is the _Linux_ boot protocol, and the Kconfig description > of kexec stats "you can start any kernel with it, not just Linux". Why > should we embed Linux-specific details into a supposedly generic > mechanism? > > We may also extend the boot protocol, and I would rather not have to > manage the complexity of each possible extension within the kernel, > especially given that the only context we can pass in kexec is segments. > > > As I mentioned above, we need a solution for non-purgatory re-boot and I > > think this is the best way. > > Why do we need a solution for "non-purgatory re-boot"? As far as I can > see this is a non-problem. I tried to address these last concerns in my comments above. -Geoff