Message ID | 20161209155252.31381-1-npiggin@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Sat, 2016-12-10 at 01:52 +1000, Nicholas Piggin wrote: > This does not solve the hardlockup problem completely however, > because > interrupts can often become hard disabled when soft disabled for long > periods. And they can be hard disabled for other reasons. > > To make up for the lack of a periodic true NMI, this also has an SMP > hard lockup detector where all CPUs can observe lockups on others. > > This still needs a bit more polishing, testing, comments, config > options, and boot parameters, etc., so it's RFC quality only. Paulus and I discussed a plan with Balbir to also limit the cases of hard-disable. They typically happen as a result of an external interrupt. We could on P8 and earlier, just fetch the interrupt from the XICS in the "masked" path and stash it in the PACA. We already have a way to stash an interrupt there for later processing because KVM sometimes does it. That would cause the XICS to elevate the priority effectively masking subsequent interrupts. We'd have to change the XICS code to use the same priority for IPIs and externals too though. For XIVE (P9), we can just poke at the CPU priority register in the TM area to mask at the PIC level in that case and unmask later. Cheers, Ben.
On 10/12/16 02:52, Nicholas Piggin wrote: > Rather than use perf / PMU interrupts and the generic hardlockup > detector, this takes the decrementer interrupt as an "NMI" when > interrupts are soft disabled (XXX: will this do the right thing with a > large decrementer?). This will work even if we start soft-disabling PMU > interrupts. > > This does not solve the hardlockup problem completely however, because > interrupts can often become hard disabled when soft disabled for long > periods. And they can be hard disabled for other reasons. > Ben/Paul suggested a way to work around this with XICS. The idea was to have MSR_EE set and use XICS to stash away the current interrupt and acknowledge it/replay it later. Decrementer interrupts would not trigger timers, but trigger a special NMI watchdog, like you've implemented. > To make up for the lack of a periodic true NMI, this also has an SMP > hard lockup detector where all CPUs can observe lockups on others. > > This still needs a bit more polishing, testing, comments, config > options, and boot parameters, etc., so it's RFC quality only. 
> > Thanks, > Nick > --- > arch/powerpc/Kconfig | 2 + > arch/powerpc/include/asm/nmi.h | 5 + > arch/powerpc/kernel/Makefile | 1 + > arch/powerpc/kernel/exceptions-64s.S | 14 +- > arch/powerpc/kernel/kvm.c | 3 + > arch/powerpc/kernel/nmi.c | 288 +++++++++++++++++++++++++++++++++++ > arch/powerpc/kernel/setup_64.c | 18 --- > arch/powerpc/kernel/time.c | 2 + > arch/sparc/kernel/nmi.c | 2 +- > include/linux/nmi.h | 14 ++ > init/main.c | 3 + > kernel/watchdog.c | 16 +- > 12 files changed, 341 insertions(+), 27 deletions(-) > create mode 100644 arch/powerpc/kernel/nmi.c > > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig > index 65fba4c..adb3387 100644 > --- a/arch/powerpc/Kconfig > +++ b/arch/powerpc/Kconfig > @@ -124,6 +124,8 @@ config PPC > select HAVE_CBPF_JIT if !PPC64 > select HAVE_EBPF_JIT if PPC64 > select HAVE_ARCH_JUMP_LABEL > + select HAVE_NMI > + select HAVE_NMI_WATCHDOG if PPC64 > select ARCH_HAVE_NMI_SAFE_CMPXCHG > select ARCH_HAS_GCOV_PROFILE_ALL > select GENERIC_SMP_IDLE_THREAD > diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h > index ff1ccb3..d00e29b 100644 > --- a/arch/powerpc/include/asm/nmi.h > +++ b/arch/powerpc/include/asm/nmi.h > @@ -1,4 +1,9 @@ > #ifndef _ASM_NMI_H > #define _ASM_NMI_H > > +#define arch_nmi_init powerpc_nmi_init > +void __init powerpc_nmi_init(void); > +void touch_nmi_watchdog(void); > +void soft_nmi_interrupt(struct pt_regs *regs); > + > #endif /* _ASM_NMI_H */ > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile > index 1925341..77f199f 100644 > --- a/arch/powerpc/kernel/Makefile > +++ b/arch/powerpc/kernel/Makefile > @@ -42,6 +42,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ > signal_64.o ptrace32.o \ > paca.o nvram_64.o firmware.o > obj-$(CONFIG_VDSO32) += vdso32/ > +obj-$(CONFIG_HAVE_NMI_WATCHDOG) += nmi.o > obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o > obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o > obj-$(CONFIG_PPC_BOOK3S_64) += 
cpu_setup_power.o > diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S > index 1ba82ea..b159d02 100644 > --- a/arch/powerpc/kernel/exceptions-64s.S > +++ b/arch/powerpc/kernel/exceptions-64s.S > @@ -1295,7 +1295,7 @@ masked_##_H##interrupt: \ > lis r10,0x7fff; \ > ori r10,r10,0xffff; \ > mtspr SPRN_DEC,r10; \ > - b 2f; \ > + b masked_decrementer_##_H##interrupt; \ > 1: cmpwi r10,PACA_IRQ_DBELL; \ > beq 2f; \ > cmpwi r10,PACA_IRQ_HMI; \ > @@ -1312,6 +1312,16 @@ masked_##_H##interrupt: \ > ##_H##rfid; \ > b . > > +#define MASKED_NMI(_H) \ > +masked_decrementer_##_H##interrupt: \ > + std r12,PACA_EXGEN+EX_R12(r13); \ > + GET_SCRATCH0(r10); \ > + std r10,PACA_EXGEN+EX_R13(r13); \ > + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) > + > +EXC_COMMON(soft_nmi_common, 0x900, soft_nmi_interrupt) > + > + > /* > * Real mode exceptions actually use this too, but alternate > * instruction code patches (which end up in the common .text area) > @@ -1319,7 +1329,9 @@ masked_##_H##interrupt: \ > */ > USE_FIXED_SECTION(virt_trampolines) > MASKED_INTERRUPT() > + MASKED_NMI() > MASKED_INTERRUPT(H) > + MASKED_NMI(H) > > #ifdef CONFIG_KVM_BOOK3S_64_HANDLER > TRAMP_REAL_BEGIN(kvmppc_skip_interrupt) > diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c > index 9ad37f8..f0d215c 100644 > --- a/arch/powerpc/kernel/kvm.c > +++ b/arch/powerpc/kernel/kvm.c > @@ -25,6 +25,7 @@ > #include <linux/kvm_para.h> > #include <linux/slab.h> > #include <linux/of.h> > +#include <linux/nmi.h> > > #include <asm/reg.h> > #include <asm/sections.h> > @@ -718,6 +719,8 @@ static __init void kvm_free_tmp(void) > > static int __init kvm_guest_init(void) > { > + /* XXX: disable hardlockup watchdog? */ > + You mean the hypervisor watchdog? Did your testing catch anything here? 
> if (!kvm_para_available()) > goto free_tmp; > > diff --git a/arch/powerpc/kernel/nmi.c b/arch/powerpc/kernel/nmi.c > new file mode 100644 > index 0000000..8e89db4 > --- /dev/null > +++ b/arch/powerpc/kernel/nmi.c > @@ -0,0 +1,288 @@ > +/* > + * NMI support on powerpc systems. > + * > + * Copyright 2016, IBM Corporation. > + * > + * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c > + */ > +#include <linux/kernel.h> > +#include <linux/param.h> > +#include <linux/init.h> > +#include <linux/percpu.h> > +#include <linux/cpu.h> > +#include <linux/nmi.h> > +#include <linux/module.h> > +#include <linux/export.h> > +#include <linux/kprobes.h> > +#include <linux/hardirq.h> > +#include <linux/reboot.h> > +#include <linux/slab.h> > +#include <linux/kdebug.h> > +#include <linux/delay.h> > +#include <linux/smp.h> > + > +#include <asm/paca.h> > + > +/* > + * Powerpc NMI is implemented using either the 0x100 system reset interrupt > + * which is a real NMI, or using the soft-disable mechanism which can still > + * be hard masked. 
> + */ > +static int nmi_panic_timeout __read_mostly = 30; /* sec to panic */ > +static u64 nmi_panic_timeout_tb __read_mostly; /* timebase ticks */ > + > +static int nmi_timer_period __read_mostly = 10; /* sec between activity checks */ > +static u64 nmi_timer_period_tb __read_mostly; /* timebase ticks */ > + > +static DEFINE_PER_CPU(struct timer_list, nmi_timer); > +static DEFINE_PER_CPU(u64, last_activity_tb); > + > +static cpumask_t nmi_cpus_enabled __read_mostly; > + > +/* > + * Cross-CPU watchdog checker > + */ > +#ifdef CONFIG_SMP > +static int nmi_global_panic_timeout __read_mostly = 60; /* sec to panic other */ > +static u64 nmi_global_panic_timeout_tb __read_mostly; /* timebase ticks */ > +static cpumask_t nmi_global_cpus_pending; > +static u64 nmi_global_last_reset_tb; > +#endif > + > +static void watchdog_panic(struct pt_regs *regs, int cpu) > +{ > + static unsigned long lock = 0; > + > + while (unlikely(test_and_set_bit_lock(0, &lock))) > + cpu_relax(); > + > + if (cpu < 0) { > + pr_emerg("Watchdog detected hard LOCKUP other cpus %*pbl\n", > + cpumask_pr_args(&nmi_global_cpus_pending)); > + } else { > + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); > + } > + print_modules(); > + print_irqtrace_events(current); > + if (regs) > + show_regs(regs); > + else > + dump_stack(); > + > + /* XXX: trigger NMI IPI? 
*/ > + > + nmi_panic(regs, "Hard LOCKUP"); > +} > + > +#ifdef CONFIG_SMP > +static void nmi_global_clear_cpu_pending(int cpu, u64 tb) > +{ > + static unsigned long lock = 0; > + > + while (unlikely(test_and_set_bit_lock(0, &lock))) > + cpu_relax(); > + cpumask_clear_cpu(cpu, &nmi_global_cpus_pending); > + if (cpumask_empty(&nmi_global_cpus_pending)) { > + nmi_global_last_reset_tb = tb; > + cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled); > + } > + clear_bit_unlock(0, &lock); > +} > +#endif > + > +static void watchdog_timer_interrupt(int cpu) > +{ > + u64 tb = get_tb(); > + > + raw_cpu_write(last_activity_tb, tb); > + > +#ifdef CONFIG_SMP > + if (tb - nmi_global_last_reset_tb < nmi_timer_period_tb) > + return; > + > + if (cpumask_test_cpu(cpu, &nmi_global_cpus_pending)) > + nmi_global_clear_cpu_pending(cpu, tb); > + > + if (tb - nmi_global_last_reset_tb >= nmi_global_panic_timeout_tb) > + watchdog_panic(NULL, -1); > +#endif > +} > + > +static void nmi_timer_fn(unsigned long data) > +{ > + struct timer_list *t = this_cpu_ptr(&nmi_timer); > + int cpu = smp_processor_id(); > + > + watchdog_timer_interrupt(cpu); > + > + t->expires = round_jiffies(jiffies + nmi_timer_period * HZ); > + add_timer_on(t, cpu); > +} Do we have to have this running all the time? 
Can we do an on-demand version of NMI where we do periodic decrementers without any reliance on timers to implement NMI watchdog > + > +void touch_nmi_watchdog(void) > +{ > + int cpu = smp_processor_id(); > + > + if (cpumask_test_cpu(cpu, &nmi_cpus_enabled)) > + watchdog_timer_interrupt(cpu); > + > + touch_softlockup_watchdog(); > +} > +EXPORT_SYMBOL(touch_nmi_watchdog); > + > +static void watchdog_nmi_interrupt(struct pt_regs *regs, u64 tb) > +{ > + u64 interval = tb - __this_cpu_read(last_activity_tb); > + > + if (interval >= nmi_panic_timeout_tb) > + watchdog_panic(regs, smp_processor_id()); > +} > + > +/* > + * This is the soft-mask nmi handler (called by masked_interrupt() in > + * low level exception entry). > + */ > +static DEFINE_PER_CPU(u64, soft_nmi_last_tb); > + > +/* > + * Soft nmi interrupts only occur when linux irqs are disabled (but > + * powerpc hardware irqs are enabled), so they can not be relied > + * upon to be timely or delivered at all. Their only real use is > + * the nmi watchdog. 
> + */ > +void soft_nmi_interrupt(struct pt_regs *regs) > +{ > + u64 tb; > + > + if (!cpumask_test_cpu(smp_processor_id(), &nmi_cpus_enabled)) > + return; > + > + tb = get_tb(); > + if (tb - __this_cpu_read(soft_nmi_last_tb) < nmi_timer_period_tb) > + return; > + __this_cpu_write(soft_nmi_last_tb, tb); > + > + nmi_enter(); > + watchdog_nmi_interrupt(regs, tb); > + nmi_exit(); > +} > + > +static void start_nmi_on_cpu(int cpu) > +{ > + struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu); > + u64 *last_tb = per_cpu_ptr(&soft_nmi_last_tb, cpu); > + > + *last_tb = get_tb(); > + cpumask_set_cpu(cpu, &nmi_cpus_enabled); > + > + setup_pinned_timer(t, nmi_timer_fn, cpu); > + t->expires = round_jiffies(jiffies + nmi_timer_period * HZ); > + add_timer_on(t, cpu); > +} > + > +static void stop_nmi_on_cpu(int cpu) > +{ > + struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu); > + > + del_timer_sync(t); > + > + cpumask_clear_cpu(cpu, &nmi_cpus_enabled); > +#ifdef CONFIG_SMP > + nmi_global_clear_cpu_pending(cpu, get_tb()); > +#endif > +} > + > + > +static int nmi_startup(void) > +{ > + if (num_online_cpus() != 1) { > + WARN_ON(1); > + return -EINVAL; > + } > + > +#ifdef CONFIG_SMP > + nmi_global_last_reset_tb = get_tb(); > +#endif > + start_nmi_on_cpu(smp_processor_id()); > +#ifdef CONFIG_SMP > + cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled); > +#endif > + > + pr_info("NMI Watchdog started on cpus %*pbl\n", > + cpumask_pr_args(&nmi_cpus_enabled)); > + > + return 0; > +} > + > +static int nmi_cpu_notify(struct notifier_block *self, > + unsigned long action, void *hcpu) > +{ > + int cpu = (unsigned long)hcpu; > + > + switch (action & ~CPU_TASKS_FROZEN) { > + case CPU_ONLINE: > + case CPU_DOWN_FAILED: > + start_nmi_on_cpu(cpu); > + pr_info("NMI Watchdog running on cpus %*pbl\n", > + cpumask_pr_args(&nmi_cpus_enabled)); > + break; > + case CPU_DOWN_PREPARE: > + stop_nmi_on_cpu(cpu); > + pr_info("NMI Watchdog running on cpus %*pbl\n", > + cpumask_pr_args(&nmi_cpus_enabled)); 
> + break; > + } > + return NOTIFY_OK; > +} FYI: These bits are changing in linux-next > + > +static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p) > +{ > + stop_nmi_on_cpu(smp_processor_id()); > + return 0; > +} > + > +static struct notifier_block nmi_reboot_notifier = { > + .notifier_call = nmi_shutdown, > +}; > + > +void __init powerpc_nmi_init(void) > +{ > + int err; > + > + if (!nmi_panic_timeout) > + return; > + > + nmi_panic_timeout_tb = nmi_panic_timeout * ppc_tb_freq; > + nmi_timer_period_tb = nmi_timer_period * ppc_tb_freq; > +#ifdef CONFIG_SMP > + nmi_global_panic_timeout_tb = nmi_global_panic_timeout * ppc_tb_freq; > +#endif > + > + err = register_reboot_notifier(&nmi_reboot_notifier); > + if (err) > + return; > + > + err = nmi_startup(); > + if (err) { > + unregister_reboot_notifier(&nmi_reboot_notifier); > + return; > + } > + > + cpu_notifier(nmi_cpu_notify, 0); > +} > + > +static int __init setup_nmi_watchdog(char *str) > +{ > + if (!strncmp(str, "0", 1)) > + nmi_panic_timeout = 0; > + return 0; > +} > +__setup("nmi_watchdog=", setup_nmi_watchdog); > + > +static int __init nowatchdog_setup(char *str) > +{ > + nmi_panic_timeout = 0; > + return 1; > +} > +__setup("nowatchdog", nowatchdog_setup); > + > diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c > index 8d586cf..6a403a9 100644 > --- a/arch/powerpc/kernel/setup_64.c > +++ b/arch/powerpc/kernel/setup_64.c > @@ -652,21 +652,3 @@ struct ppc_pci_io ppc_pci_io; > EXPORT_SYMBOL(ppc_pci_io); > #endif > > -#ifdef CONFIG_HARDLOCKUP_DETECTOR > -u64 hw_nmi_get_sample_period(int watchdog_thresh) > -{ > - return ppc_proc_freq * watchdog_thresh; > -} > - > -/* > - * The hardlockup detector breaks PMU event based branches and is likely > - * to get false positives in KVM guests, so disable it by default. 
> - */ > -static int __init disable_hardlockup_detector(void) > -{ > - hardlockup_detector_disable(); > - > - return 0; > -} > -early_initcall(disable_hardlockup_detector); > -#endif > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c > index bc3f7d0..5aec1ad 100644 > --- a/arch/powerpc/kernel/time.c > +++ b/arch/powerpc/kernel/time.c > @@ -52,6 +52,7 @@ > #include <linux/jiffies.h> > #include <linux/posix-timers.h> > #include <linux/irq.h> > +#include <linux/nmi.h> > #include <linux/delay.h> > #include <linux/irq_work.h> > #include <linux/clk-provider.h> > @@ -66,6 +67,7 @@ > #include <asm/machdep.h> > #include <asm/uaccess.h> > #include <asm/time.h> > +#include <asm/nmi.h> > #include <asm/prom.h> > #include <asm/irq.h> > #include <asm/div64.h> > diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c > index a9973bb..ae3a4d5 100644 > --- a/arch/sparc/kernel/nmi.c > +++ b/arch/sparc/kernel/nmi.c > @@ -244,7 +244,7 @@ static struct notifier_block nmi_reboot_notifier = { > .notifier_call = nmi_shutdown, > }; > > -int __init nmi_init(void) > +void __init nmi_init(void) > { > int err; > > diff --git a/include/linux/nmi.h b/include/linux/nmi.h > index a78c35c..9552ab0 100644 > --- a/include/linux/nmi.h > +++ b/include/linux/nmi.h > @@ -18,6 +18,9 @@ > #include <asm/nmi.h> > extern void touch_nmi_watchdog(void); > #else > +static inline void nmi_init(void) > +{ > +} > static inline void touch_nmi_watchdog(void) > { > touch_softlockup_watchdog(); > @@ -30,6 +33,17 @@ extern void hardlockup_detector_disable(void); > static inline void hardlockup_detector_disable(void) {} > #endif > > +#ifdef arch_nmi_init > +static inline void nmi_init(void) > +{ > + arch_nmi_init(); > +} > +#else > +static inline void nmi_init(void) > +{ > +} > +#endif > + > /* > * Create trigger_all_cpu_backtrace() out of the arch-provided > * base function. 
Return whether such support was available, > diff --git a/init/main.c b/init/main.c > index 2858be7..36fd7e7 100644 > --- a/init/main.c > +++ b/init/main.c > @@ -33,6 +33,7 @@ > #include <linux/start_kernel.h> > #include <linux/security.h> > #include <linux/smp.h> > +#include <linux/nmi.h> > #include <linux/profile.h> > #include <linux/rcupdate.h> > #include <linux/moduleparam.h> > @@ -579,6 +580,8 @@ asmlinkage __visible void __init start_kernel(void) > > kmem_cache_init_late(); > > + nmi_init(); How did you test these? Balbir
On Sat, 10 Dec 2016 16:22:13 +1100 Balbir Singh <bsingharora@gmail.com> wrote: > On 10/12/16 02:52, Nicholas Piggin wrote: > > Rather than use perf / PMU interrupts and the generic hardlockup > > detector, this takes the decrementer interrupt as an "NMI" when > > interrupts are soft disabled (XXX: will this do the right thing with a > > large decrementer?). This will work even if we start soft-disabling PMU > > interrupts. > > > > This does not solve the hardlockup problem completely however, because > > interrupts can often become hard disabled when soft disabled for long > > periods. And they can be hard disabled for other reasons. > > > > Ben/Paul suggested a way to work around this with XICS. The idea was to > have MSR_EE set and use XICS to stash away the current > interrupt and acknowledge it/replay it later. Decrementer interrupts would > not trigger timers, but trigger a special NMI watchdog, like you've > implemented. Yeah that's a good idea, it should significantly avoid hard interrupt disable windows. > > @@ -718,6 +719,8 @@ static __init void kvm_free_tmp(void) > > > > static int __init kvm_guest_init(void) > > { > > + /* XXX: disable hardlockup watchdog? */ > > + > > You mean the hypervisor watchdog? Did your testing > catch anything here? I meant guest. Testing didn't catch anything but I put it there to investigate because I saw x86 does hardlockup_detector_disable() in their guest init. > > +static void nmi_timer_fn(unsigned long data) > > +{ > > + struct timer_list *t = this_cpu_ptr(&nmi_timer); > > + int cpu = smp_processor_id(); > > + > > + watchdog_timer_interrupt(cpu); > > + > > + t->expires = round_jiffies(jiffies + nmi_timer_period * HZ); > > + add_timer_on(t, cpu); > > +} > > Do we have to have this running all the time? 
Can we do an on-demand > version of NMI where we do periodic decrementers without any reliance > on timers to implement NMI watchdog We could, but it is trivial to do this and get all the timer and dynticks stuff taken care of for us. We could bump the period up to 30s or so and it should hardly be an issue. I didn't want to try getting too clever, there are times when you could shut it off, but then you still lose some lockup coverage. But... I'm open to suggestions. I don't know the timer code well. > > +static int nmi_cpu_notify(struct notifier_block *self, > > + unsigned long action, void *hcpu) > > +{ > > + int cpu = (unsigned long)hcpu; > > + > > + switch (action & ~CPU_TASKS_FROZEN) { > > + case CPU_ONLINE: > > + case CPU_DOWN_FAILED: > > + start_nmi_on_cpu(cpu); > > + pr_info("NMI Watchdog running on cpus %*pbl\n", > > + cpumask_pr_args(&nmi_cpus_enabled)); > > + break; > > + case CPU_DOWN_PREPARE: > > + stop_nmi_on_cpu(cpu); > > + pr_info("NMI Watchdog running on cpus %*pbl\n", > > + cpumask_pr_args(&nmi_cpus_enabled)); > > + break; > > + } > > + return NOTIFY_OK; > > +} > > FYI: These bits are changing in linux-next Yeah I'll have to update them. > > diff --git a/init/main.c b/init/main.c > > index 2858be7..36fd7e7 100644 > > --- a/init/main.c > > +++ b/init/main.c > > @@ -33,6 +33,7 @@ > > #include <linux/start_kernel.h> > > #include <linux/security.h> > > #include <linux/smp.h> > > +#include <linux/nmi.h> > > #include <linux/profile.h> > > #include <linux/rcupdate.h> > > #include <linux/moduleparam.h> > > @@ -579,6 +580,8 @@ asmlinkage __visible void __init start_kernel(void) > > > > kmem_cache_init_late(); > > > > + nmi_init(); > > How did you test these? I just tried a few place putting soft/hard irq disable and spinning forever. Soft disable case was getting caught by the local NMI, hard disable gets caught by the SMP check. When we also get the NMI IPI crash debug stuff, we should be able to get reasonable crash data with hard disabled hangs. 
Thanks, Nick
> @@ -1295,7 +1295,7 @@ masked_##_H##interrupt: \ > lis r10,0x7fff; \ > ori r10,r10,0xffff; \ > mtspr SPRN_DEC,r10; \ > - b 2f; \ > + b masked_decrementer_##_H##interrupt; \ > 1: cmpwi r10,PACA_IRQ_DBELL; \ > beq 2f; \ > cmpwi r10,PACA_IRQ_HMI; \ > @@ -1312,6 +1312,16 @@ masked_##_H##interrupt: \ > ##_H##rfid; \ > b . > > +#define MASKED_NMI(_H) \ > +masked_decrementer_##_H##interrupt: \ > + std r12,PACA_EXGEN+EX_R12(r13); \ > + GET_SCRATCH0(r10); \ > + std r10,PACA_EXGEN+EX_R13(r13); \ > + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) > + This bit is a little confusing, we are already handling an exception do a masked_##_H##interrupt check and we redo the exception prolog? Balbir Singh
On Sat, 10 Dec 2016 23:24:59 +1100 Balbir Singh <bsingharora@gmail.com> wrote: > > @@ -1295,7 +1295,7 @@ masked_##_H##interrupt: \ > > lis r10,0x7fff; \ > > ori r10,r10,0xffff; \ > > mtspr SPRN_DEC,r10; \ > > - b 2f; \ > > + b masked_decrementer_##_H##interrupt; \ > > 1: cmpwi r10,PACA_IRQ_DBELL; \ > > beq 2f; \ > > cmpwi r10,PACA_IRQ_HMI; \ > > @@ -1312,6 +1312,16 @@ masked_##_H##interrupt: \ > > ##_H##rfid; \ > > b . > > > > +#define MASKED_NMI(_H) \ > > +masked_decrementer_##_H##interrupt: \ > > + std r12,PACA_EXGEN+EX_R12(r13); \ > > + GET_SCRATCH0(r10); \ > > + std r10,PACA_EXGEN+EX_R13(r13); \ > > + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) > > + > > This bit a little confusing, we are already handling an exception > > do a masked_##_H##interrupt check and we redo the exception prolog? The soft mask test is done halfway through EXCEPTION_PROLOG_1, so I think we have to finish that then continue with PROLOG_PSERIES_1 to get into "common" code. That's the main tricky part of the code, so I'll have to go back and step through the asm and comment better, but it looked about right when I wrote it (famous last words). Thanks, Nick
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65fba4c..adb3387 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -124,6 +124,8 @@ config PPC select HAVE_CBPF_JIT if !PPC64 select HAVE_EBPF_JIT if PPC64 select HAVE_ARCH_JUMP_LABEL + select HAVE_NMI + select HAVE_NMI_WATCHDOG if PPC64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAS_GCOV_PROFILE_ALL select GENERIC_SMP_IDLE_THREAD diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index ff1ccb3..d00e29b 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -1,4 +1,9 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H +#define arch_nmi_init powerpc_nmi_init +void __init powerpc_nmi_init(void); +void touch_nmi_watchdog(void); +void soft_nmi_interrupt(struct pt_regs *regs); + #endif /* _ASM_NMI_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 1925341..77f199f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ +obj-$(CONFIG_HAVE_NMI_WATCHDOG) += nmi.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 1ba82ea..b159d02 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1295,7 +1295,7 @@ masked_##_H##interrupt: \ lis r10,0x7fff; \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ - b 2f; \ + b masked_decrementer_##_H##interrupt; \ 1: cmpwi r10,PACA_IRQ_DBELL; \ beq 2f; \ cmpwi r10,PACA_IRQ_HMI; \ @@ -1312,6 +1312,16 @@ masked_##_H##interrupt: \ ##_H##rfid; \ b . 
+#define MASKED_NMI(_H) \ +masked_decrementer_##_H##interrupt: \ + std r12,PACA_EXGEN+EX_R12(r13); \ + GET_SCRATCH0(r10); \ + std r10,PACA_EXGEN+EX_R13(r13); \ + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) + +EXC_COMMON(soft_nmi_common, 0x900, soft_nmi_interrupt) + + /* * Real mode exceptions actually use this too, but alternate * instruction code patches (which end up in the common .text area) @@ -1319,7 +1329,9 @@ masked_##_H##interrupt: \ */ USE_FIXED_SECTION(virt_trampolines) MASKED_INTERRUPT() + MASKED_NMI() MASKED_INTERRUPT(H) + MASKED_NMI(H) #ifdef CONFIG_KVM_BOOK3S_64_HANDLER TRAMP_REAL_BEGIN(kvmppc_skip_interrupt) diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 9ad37f8..f0d215c 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -25,6 +25,7 @@ #include <linux/kvm_para.h> #include <linux/slab.h> #include <linux/of.h> +#include <linux/nmi.h> #include <asm/reg.h> #include <asm/sections.h> @@ -718,6 +719,8 @@ static __init void kvm_free_tmp(void) static int __init kvm_guest_init(void) { + /* XXX: disable hardlockup watchdog? */ + if (!kvm_para_available()) goto free_tmp; diff --git a/arch/powerpc/kernel/nmi.c b/arch/powerpc/kernel/nmi.c new file mode 100644 index 0000000..8e89db4 --- /dev/null +++ b/arch/powerpc/kernel/nmi.c @@ -0,0 +1,288 @@ +/* + * NMI support on powerpc systems. + * + * Copyright 2016, IBM Corporation. 
+ * + * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c + */ +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/init.h> +#include <linux/percpu.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/module.h> +#include <linux/export.h> +#include <linux/kprobes.h> +#include <linux/hardirq.h> +#include <linux/reboot.h> +#include <linux/slab.h> +#include <linux/kdebug.h> +#include <linux/delay.h> +#include <linux/smp.h> + +#include <asm/paca.h> + +/* + * Powerpc NMI is implemented using either the 0x100 system reset interrupt + * which is a real NMI, or using the soft-disable mechanism which can still + * be hard masked. + */ +static int nmi_panic_timeout __read_mostly = 30; /* sec to panic */ +static u64 nmi_panic_timeout_tb __read_mostly; /* timebase ticks */ + +static int nmi_timer_period __read_mostly = 10; /* sec between activity checks */ +static u64 nmi_timer_period_tb __read_mostly; /* timebase ticks */ + +static DEFINE_PER_CPU(struct timer_list, nmi_timer); +static DEFINE_PER_CPU(u64, last_activity_tb); + +static cpumask_t nmi_cpus_enabled __read_mostly; + +/* + * Cross-CPU watchdog checker + */ +#ifdef CONFIG_SMP +static int nmi_global_panic_timeout __read_mostly = 60; /* sec to panic other */ +static u64 nmi_global_panic_timeout_tb __read_mostly; /* timebase ticks */ +static cpumask_t nmi_global_cpus_pending; +static u64 nmi_global_last_reset_tb; +#endif + +static void watchdog_panic(struct pt_regs *regs, int cpu) +{ + static unsigned long lock = 0; + + while (unlikely(test_and_set_bit_lock(0, &lock))) + cpu_relax(); + + if (cpu < 0) { + pr_emerg("Watchdog detected hard LOCKUP other cpus %*pbl\n", + cpumask_pr_args(&nmi_global_cpus_pending)); + } else { + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); + } + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* XXX: trigger NMI IPI? 
*/ + + nmi_panic(regs, "Hard LOCKUP"); +} + +#ifdef CONFIG_SMP +static void nmi_global_clear_cpu_pending(int cpu, u64 tb) +{ + static unsigned long lock = 0; + + while (unlikely(test_and_set_bit_lock(0, &lock))) + cpu_relax(); + cpumask_clear_cpu(cpu, &nmi_global_cpus_pending); + if (cpumask_empty(&nmi_global_cpus_pending)) { + nmi_global_last_reset_tb = tb; + cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled); + } + clear_bit_unlock(0, &lock); +} +#endif + +static void watchdog_timer_interrupt(int cpu) +{ + u64 tb = get_tb(); + + raw_cpu_write(last_activity_tb, tb); + +#ifdef CONFIG_SMP + if (tb - nmi_global_last_reset_tb < nmi_timer_period_tb) + return; + + if (cpumask_test_cpu(cpu, &nmi_global_cpus_pending)) + nmi_global_clear_cpu_pending(cpu, tb); + + if (tb - nmi_global_last_reset_tb >= nmi_global_panic_timeout_tb) + watchdog_panic(NULL, -1); +#endif +} + +static void nmi_timer_fn(unsigned long data) +{ + struct timer_list *t = this_cpu_ptr(&nmi_timer); + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); + + t->expires = round_jiffies(jiffies + nmi_timer_period * HZ); + add_timer_on(t, cpu); +} + +void touch_nmi_watchdog(void) +{ + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, &nmi_cpus_enabled)) + watchdog_timer_interrupt(cpu); + + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +static void watchdog_nmi_interrupt(struct pt_regs *regs, u64 tb) +{ + u64 interval = tb - __this_cpu_read(last_activity_tb); + + if (interval >= nmi_panic_timeout_tb) + watchdog_panic(regs, smp_processor_id()); +} + +/* + * This is the soft-mask nmi handler (called by masked_interrupt() in + * low level exception entry). + */ +static DEFINE_PER_CPU(u64, soft_nmi_last_tb); + +/* + * Soft nmi interrupts only occur when linux irqs are disabled (but + * powerpc hardware irqs are enabled), so they can not be relied + * upon to be timely or delivered at all. Their only real use is + * the nmi watchdog. 
 */
+void soft_nmi_interrupt(struct pt_regs *regs)
+{
+	u64 tb;
+
+	/* Only CPUs started via start_nmi_on_cpu() participate. */
+	if (!cpumask_test_cpu(smp_processor_id(), &nmi_cpus_enabled))
+		return;
+
+	/* Rate-limit: run the check at most once per nmi_timer_period_tb ticks. */
+	tb = get_tb();
+	if (tb - __this_cpu_read(soft_nmi_last_tb) < nmi_timer_period_tb)
+		return;
+	__this_cpu_write(soft_nmi_last_tb, tb);
+
+	nmi_enter(); /* mark NMI context around the lockup check */
+	watchdog_nmi_interrupt(regs, tb);
+	nmi_exit();
+}
+
+/* Enable the watchdog on @cpu: reset its timestamp, arm its pinned timer. */
+static void start_nmi_on_cpu(int cpu)
+{
+	struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu);
+	u64 *last_tb = per_cpu_ptr(&soft_nmi_last_tb, cpu);
+
+	*last_tb = get_tb();
+	cpumask_set_cpu(cpu, &nmi_cpus_enabled);
+
+	setup_pinned_timer(t, nmi_timer_fn, cpu);
+	t->expires = round_jiffies(jiffies + nmi_timer_period * HZ);
+	add_timer_on(t, cpu);
+}
+
+/* Disable the watchdog on @cpu (CPU_DOWN_PREPARE / reboot path). */
+static void stop_nmi_on_cpu(int cpu)
+{
+	struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu);
+
+	del_timer_sync(t);
+
+	cpumask_clear_cpu(cpu, &nmi_cpus_enabled);
+#ifdef CONFIG_SMP
+	/* Stop the SMP detector from flagging a CPU that is going away. */
+	nmi_global_clear_cpu_pending(cpu, get_tb());
+#endif
+}
+
+
+/* Boot-time start; expects to run before secondary CPUs are brought up. */
+static int nmi_startup(void)
+{
+	if (num_online_cpus() != 1) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_SMP
+	nmi_global_last_reset_tb = get_tb();
+#endif
+	start_nmi_on_cpu(smp_processor_id());
+#ifdef CONFIG_SMP
+	cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled);
+#endif
+
+	pr_info("NMI Watchdog started on cpus %*pbl\n",
+			cpumask_pr_args(&nmi_cpus_enabled));
+
+	return 0;
+}
+
+/* CPU hotplug callback: track online/offline CPUs in nmi_cpus_enabled. */
+static int nmi_cpu_notify(struct notifier_block *self,
+			  unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		start_nmi_on_cpu(cpu);
+		pr_info("NMI Watchdog running on cpus %*pbl\n",
+				cpumask_pr_args(&nmi_cpus_enabled));
+		break;
+	case CPU_DOWN_PREPARE:
+		stop_nmi_on_cpu(cpu);
+		pr_info("NMI Watchdog running on cpus %*pbl\n",
+				cpumask_pr_args(&nmi_cpus_enabled));
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/* Reboot notifier: quiesce the local watchdog before reboot. */
+static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
+{
+	stop_nmi_on_cpu(smp_processor_id());
+
+	return 0;
+}
+
+static struct notifier_block nmi_reboot_notifier = {
+	.notifier_call = nmi_shutdown,
+};
+
+/* Convert second-based tunables to timebase ticks and start the detector. */
+void __init powerpc_nmi_init(void)
+{
+	int err;
+
+	if (!nmi_panic_timeout) /* 0 == disabled via nmi_watchdog=0/nowatchdog */
+		return;
+
+	nmi_panic_timeout_tb = nmi_panic_timeout * ppc_tb_freq;
+	nmi_timer_period_tb = nmi_timer_period * ppc_tb_freq;
+#ifdef CONFIG_SMP
+	nmi_global_panic_timeout_tb = nmi_global_panic_timeout * ppc_tb_freq;
+#endif
+
+	err = register_reboot_notifier(&nmi_reboot_notifier);
+	if (err)
+		return;
+
+	err = nmi_startup();
+	if (err) { /* unwind the notifier if startup failed */
+		unregister_reboot_notifier(&nmi_reboot_notifier);
+		return;
+	}
+
+	cpu_notifier(nmi_cpu_notify, 0);
+}
+
+static int __init setup_nmi_watchdog(char *str)
+{
+	if (!strncmp(str, "0", 1))
+		nmi_panic_timeout = 0;
+	return 0; /* NOTE(review): sibling __setup handlers return 1 when consumed — confirm 0 is intended */
+}
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+static int __init nowatchdog_setup(char *str)
+{
+	nmi_panic_timeout = 0;
+	return 1;
+}
+__setup("nowatchdog", nowatchdog_setup);
+
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 8d586cf..6a403a9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -652,21 +652,3 @@ struct ppc_pci_io ppc_pci_io;
 EXPORT_SYMBOL(ppc_pci_io);
 #endif
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(int watchdog_thresh)
-{
-	return ppc_proc_freq * watchdog_thresh;
-}
-
-/*
- * The hardlockup detector breaks PMU event based branches and is likely
- * to get false positives in KVM guests, so disable it by default.
- */
-static int __init disable_hardlockup_detector(void)
-{
-	hardlockup_detector_disable();
-
-	return 0;
-}
-early_initcall(disable_hardlockup_detector);
-#endif
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bc3f7d0..5aec1ad 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -52,6 +52,7 @@
 #include <linux/jiffies.h>
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/delay.h>
 #include <linux/irq_work.h>
 #include <linux/clk-provider.h>
@@ -66,6 +67,7 @@
 #include <asm/machdep.h>
 #include <asm/uaccess.h>
 #include <asm/time.h>
+#include <asm/nmi.h>
 #include <asm/prom.h>
 #include <asm/irq.h>
 #include <asm/div64.h>
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a9973bb..ae3a4d5 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -244,7 +244,7 @@ static struct notifier_block nmi_reboot_notifier = {
 	.notifier_call = nmi_shutdown,
 };
 
-int __init nmi_init(void)
+void __init nmi_init(void) /* NOTE(review): now void — confirm all 'return err' paths below were converted */
 {
 	int err;
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a78c35c..9552ab0 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -18,6 +18,9 @@
 #include <asm/nmi.h>
 extern void touch_nmi_watchdog(void);
 #else
+static inline void nmi_init(void)
+{
+} /* NOTE(review): nmi_init is defined again below in the #ifdef arch_nmi_init block — redefinition in this config? */
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
@@ -30,6 +33,17 @@ extern void hardlockup_detector_disable(void);
 static inline void hardlockup_detector_disable(void) {}
 #endif
 
+#ifdef arch_nmi_init
+static inline void nmi_init(void)
+{
+	arch_nmi_init(); /* arch hook (e.g. powerpc_nmi_init) provided via asm/nmi.h */
+}
+#else
+static inline void nmi_init(void)
+{
+}
+#endif
+
 /*
  * Create trigger_all_cpu_backtrace() out of the arch-provided
  * base function.
Return whether such support was available,
diff --git a/init/main.c b/init/main.c
index 2858be7..36fd7e7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -33,6 +33,7 @@
 #include <linux/start_kernel.h>
 #include <linux/security.h>
 #include <linux/smp.h>
+#include <linux/nmi.h>
 #include <linux/profile.h>
 #include <linux/rcupdate.h>
 #include <linux/moduleparam.h>
@@ -579,6 +580,8 @@ asmlinkage __visible void __init start_kernel(void)
 
 	kmem_cache_init_late();
 
+	nmi_init(); /* arch NMI-watchdog hook; no-op unless arch_nmi_init is defined */
+
 	/*
 	 * HACK ALERT! This is early. We're enabling the console before
 	 * we've done PCI setups etc, and console_init() must be aware of
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f..309a588 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -142,6 +142,13 @@ static int __init hardlockup_panic_setup(char *str)
 	return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
+
+static int __init nowatchdog_setup(char *str)
+{
+	watchdog_enabled = 0;
+	return 1;
+}
+__setup("nowatchdog", nowatchdog_setup); /* NOTE(review): now compiled only under CONFIG_HARDLOCKUP_DETECTOR — on !HARDLOCKUP kernels "nowatchdog" no longer clears watchdog_enabled (softlockup); powerpc's handler only zeroes nmi_panic_timeout */
 #endif
 
 unsigned int __read_mostly softlockup_panic =
@@ -155,13 +162,6 @@ static int __init softlockup_panic_setup(char *str)
 }
 __setup("softlockup_panic=", softlockup_panic_setup);
 
-static int __init nowatchdog_setup(char *str)
-{
-	watchdog_enabled = 0;
-	return 1;
-}
-__setup("nowatchdog", nowatchdog_setup);
-
 static int __init nosoftlockup_setup(char *str)
 {
 	watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
@@ -177,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
 	return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 static int __init hardlockup_all_cpu_backtrace_setup(char *str)
 {
 	sysctl_hardlockup_all_cpu_backtrace =
@@ -185,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
 }
 __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif
+#endif /* NOTE(review): pairs with the pre-existing outer #ifdef above this hunk — verify nesting after the new CONFIG_HARDLOCKUP_DETECTOR guard */
 
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-