diff mbox series

[v4] powerpc: Avoid nmi_enter/nmi_exit in real mode interrupt.

Message ID 20240214095146.1527369-1-mahesh@linux.ibm.com (mailing list archive)
State Superseded
Headers show
Series [v4] powerpc: Avoid nmi_enter/nmi_exit in real mode interrupt. | expand

Checks

Context Check Description
snowpatch_ozlabs/github-powerpc_ppctests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_selftests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_sparse success Successfully ran 4 jobs.
snowpatch_ozlabs/github-powerpc_clang success Successfully ran 6 jobs.
snowpatch_ozlabs/github-powerpc_kernel_qemu success Successfully ran 23 jobs.

Commit Message

Mahesh J Salgaonkar Feb. 14, 2024, 9:51 a.m. UTC
nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
interrupt handler) if percpu allocation comes from vmalloc area.

Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
percpu allocation is from the embedded first chunk. However with
CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
allocation can come from the vmalloc area.

With kernel command line "percpu_alloc=page" we can force percpu allocation
to come from vmalloc area and can see kernel crash in machine_check_early:

[    1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110
[    1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0
[    1.215719] --- interrupt: 200
[    1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable)
[    1.215722] [c000000fffd731b0] [0000000000000000] 0x0
[    1.215724] [c000000fffd73210] [c000000000008364] machine_check_early_common+0x134/0x1f8

Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
first chunk is not embedded.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
---
Changes in v4:
- Fix coding style issues.

Changes in v3:
- Address comments from Christophe Leroy to avoid using #ifdefs in the
  code
- v2 at https://lore.kernel.org/linuxppc-dev/20240205053647.1763446-1-mahesh@linux.ibm.com/

Changes in v2:
- Rebase to upstream master
- Use jump_labels, if CONFIG_JUMP_LABEL is enabled, to avoid redoing the
  embed first chunk test at each interrupt entry.
- v1 is at https://lore.kernel.org/linuxppc-dev/164578465828.74956.6065296024817333750.stgit@jupiter/
---
 arch/powerpc/include/asm/interrupt.h | 10 ++++++++++
 arch/powerpc/include/asm/percpu.h    | 10 ++++++++++
 arch/powerpc/kernel/setup_64.c       |  3 +++
 3 files changed, 23 insertions(+)

Comments

Christophe Leroy Feb. 14, 2024, 9:54 a.m. UTC | #1
Le 14/02/2024 à 10:51, Mahesh Salgaonkar a écrit :
> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
> interrupt handler) if percpu allocation comes from vmalloc area.
> 
> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
> percpu allocation is from the embedded first chunk. However with
> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
> allocation can come from the vmalloc area.
> 
> With kernel command line "percpu_alloc=page" we can force percpu allocation
> to come from vmalloc area and can see kernel crash in machine_check_early:
> 
> [    1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110
> [    1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0
> [    1.215719] --- interrupt: 200
> [    1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable)
> [    1.215722] [c000000fffd731b0] [0000000000000000] 0x0
> [    1.215724] [c000000fffd73210] [c000000000008364] machine_check_early_common+0x134/0x1f8
> 
> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
> first chunk is not embedded.
> 
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>

> ---
> Changes in v4:
> - Fix coding style issues.
> 
> Changes in v3:
> - Address comments from Christophe Leroy to avoid using #ifdefs in the
>    code
> - v2 at https://lore.kernel.org/linuxppc-dev/20240205053647.1763446-1-mahesh@linux.ibm.com/
> 
> Changes in v2:
> - Rebase to upstream master
> - Use jump_labels, if CONFIG_JUMP_LABEL is enabled, to avoid redoing the
>    embed first chunk test at each interrupt entry.
> - v1 is at https://lore.kernel.org/linuxppc-dev/164578465828.74956.6065296024817333750.stgit@jupiter/
> ---
>   arch/powerpc/include/asm/interrupt.h | 10 ++++++++++
>   arch/powerpc/include/asm/percpu.h    | 10 ++++++++++
>   arch/powerpc/kernel/setup_64.c       |  3 +++
>   3 files changed, 23 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
> index a4196ab1d0167..0b96464ff0339 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -336,6 +336,14 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
>   	if (IS_ENABLED(CONFIG_KASAN))
>   		return;
>   
> +	/*
> +	 * Likewise, do not use it in real mode if percpu first chunk is not
> +	 * embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there
> +	 * are chances where percpu allocation can come from vmalloc area.
> +	 */
> +	if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk)
> +		return;
> +
>   	/* Otherwise, it should be safe to call it */
>   	nmi_enter();
>   }
> @@ -351,6 +359,8 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
>   		// no nmi_exit for a pseries hash guest taking a real mode exception
>   	} else if (IS_ENABLED(CONFIG_KASAN)) {
>   		// no nmi_exit for KASAN in real mode
> +	} else if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk) {
> +		// no nmi_exit if percpu first chunk is not embedded
>   	} else {
>   		nmi_exit();
>   	}
> diff --git a/arch/powerpc/include/asm/percpu.h b/arch/powerpc/include/asm/percpu.h
> index 8e5b7d0b851c6..e24063eb0b33b 100644
> --- a/arch/powerpc/include/asm/percpu.h
> +++ b/arch/powerpc/include/asm/percpu.h
> @@ -15,6 +15,16 @@
>   #endif /* CONFIG_SMP */
>   #endif /* __powerpc64__ */
>   
> +#ifdef CONFIG_PPC64
> +#include <linux/jump_label.h>
> +DECLARE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
> +
> +#define is_embed_first_chunk	\
> +		(static_key_enabled(&__percpu_embed_first_chunk.key))
> +#else
> +#define is_embed_first_chunk	true
> +#endif /* CONFIG_PPC64 */
> +
>   #include <asm-generic/percpu.h>
>   
>   #include <asm/paca.h>
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 2f19d5e944852..e04f0ff69d4b6 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -834,6 +834,7 @@ static __init int pcpu_cpu_to_node(int cpu)
>   
>   unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>   EXPORT_SYMBOL(__per_cpu_offset);
> +DEFINE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
>   
>   void __init setup_per_cpu_areas(void)
>   {
> @@ -869,6 +870,8 @@ void __init setup_per_cpu_areas(void)
>   			pr_warn("PERCPU: %s allocator failed (%d), "
>   				"falling back to page size\n",
>   				pcpu_fc_names[pcpu_chosen_fc], rc);
> +		else
> +			static_key_enable(&__percpu_embed_first_chunk.key);
>   	}
>   
>   	if (rc < 0)
Shirisha G March 6, 2024, 8:25 a.m. UTC | #2
On Wed, 2024-02-14 at 15:21 +0530, Mahesh Salgaonkar wrote:
> nmi_enter()/nmi_exit() touches per cpu variables which can lead to
> kernel
> crash when invoked during real mode interrupt handling (e.g. early
> HMI/MCE
> interrupt handler) if percpu allocation comes from vmalloc area.
> 
> Early HMI/MCE handlers are called through
> DEFINE_INTERRUPT_HANDLER_NMI()
> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any
> issue when
> percpu allocation is from the embedded first chunk. However with
> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where
> percpu
> allocation can come from the vmalloc area.
> 
> With kernel command line "percpu_alloc=page" we can force percpu
> allocation
> to come from vmalloc area and can see kernel crash in
> machine_check_early:
> 
> [    1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110
> [    1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0
> [    1.215719] --- interrupt: 200
> [    1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable)
> [    1.215722] [c000000fffd731b0] [0000000000000000] 0x0
> [    1.215724] [c000000fffd73210] [c000000000008364]
> machine_check_early_common+0x134/0x1f8
> 
> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if
> percpu
> first chunk is not embedded.
> 
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>

Thanks for the Patch.
I have tested the patch and the fix works fine.
selftests/powerpc/mce/inject-ra-err testcase is working as
expected after enabling percpu_alloc=page with the patch applied.

Output with Patch:
# ./inject-ra-err 
test: inject-ra-err
tags: git_version:unknown
success: inject-ra-err
#

Tested-by: Shirisha Ganta <shirisha@linux.ibm.com>


> ---
> Changes in v4:
> - Fix coding style issues.
> 
> Changes in v3:
> - Address comments from Christophe Leroy to avoid using #ifdefs in
> the
>   code
> - v2 at 
> https://lore.kernel.org/linuxppc-dev/20240205053647.1763446-1-mahesh@linux.ibm.com/
> 
> Changes in v2:
> - Rebase to upstream master
> - Use jump_labels, if CONFIG_JUMP_LABEL is enabled, to avoid redoing
> the
>   embed first chunk test at each interrupt entry.
> - v1 is at 
> https://lore.kernel.org/linuxppc-dev/164578465828.74956.6065296024817333750.stgit@jupiter/
> ---
>  arch/powerpc/include/asm/interrupt.h | 10 ++++++++++
>  arch/powerpc/include/asm/percpu.h    | 10 ++++++++++
>  arch/powerpc/kernel/setup_64.c       |  3 +++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/interrupt.h
> b/arch/powerpc/include/asm/interrupt.h
> index a4196ab1d0167..0b96464ff0339 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -336,6 +336,14 @@ static inline void
> interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
>  	if (IS_ENABLED(CONFIG_KASAN))
>  		return;
>  
> +	/*
> +	 * Likewise, do not use it in real mode if percpu first chunk
> is not
> +	 * embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled
> there
> +	 * are chances where percpu allocation can come from vmalloc
> area.
> +	 */
> +	if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) &&
> !is_embed_first_chunk)
> +		return;
> +
>  	/* Otherwise, it should be safe to call it */
>  	nmi_enter();
>  }
> @@ -351,6 +359,8 @@ static inline void
> interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
>  		// no nmi_exit for a pseries hash guest taking a real
> mode exception
>  	} else if (IS_ENABLED(CONFIG_KASAN)) {
>  		// no nmi_exit for KASAN in real mode
> +	} else if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) &&
> !is_embed_first_chunk) {
> +		// no nmi_exit if percpu first chunk is not embedded
>  	} else {
>  		nmi_exit();
>  	}
> diff --git a/arch/powerpc/include/asm/percpu.h
> b/arch/powerpc/include/asm/percpu.h
> index 8e5b7d0b851c6..e24063eb0b33b 100644
> --- a/arch/powerpc/include/asm/percpu.h
> +++ b/arch/powerpc/include/asm/percpu.h
> @@ -15,6 +15,16 @@
>  #endif /* CONFIG_SMP */
>  #endif /* __powerpc64__ */
>  
> +#ifdef CONFIG_PPC64
> +#include <linux/jump_label.h>
> +DECLARE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
> +
> +#define is_embed_first_chunk	\
> +		(static_key_enabled(&__percpu_embed_first_chunk.key))
> +#else
> +#define is_embed_first_chunk	true
> +#endif /* CONFIG_PPC64 */
> +
>  #include <asm-generic/percpu.h>
>  
>  #include <asm/paca.h>
> diff --git a/arch/powerpc/kernel/setup_64.c
> b/arch/powerpc/kernel/setup_64.c
> index 2f19d5e944852..e04f0ff69d4b6 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -834,6 +834,7 @@ static __init int pcpu_cpu_to_node(int cpu)
>  
>  unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>  EXPORT_SYMBOL(__per_cpu_offset);
> +DEFINE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
>  
>  void __init setup_per_cpu_areas(void)
>  {
> @@ -869,6 +870,8 @@ void __init setup_per_cpu_areas(void)
>  			pr_warn("PERCPU: %s allocator failed (%d), "
>  				"falling back to page size\n",
>  				pcpu_fc_names[pcpu_chosen_fc], rc);
> +		else
> +			static_key_enable(&__percpu_embed_first_chunk.k
> ey);
>  	}
>  
>  	if (rc < 0)
Michael Ellerman March 7, 2024, 11:43 a.m. UTC | #3
Hi Mahesh,

Mahesh Salgaonkar <mahesh@linux.ibm.com> writes:
> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
> interrupt handler) if percpu allocation comes from vmalloc area.
>
> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
> percpu allocation is from the embedded first chunk. However with
> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
> allocation can come from the vmalloc area.
>
> With kernel command line "percpu_alloc=page" we can force percpu allocation
> to come from vmalloc area and can see kernel crash in machine_check_early:
>
> [    1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110
> [    1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0
> [    1.215719] --- interrupt: 200
> [    1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable)
> [    1.215722] [c000000fffd731b0] [0000000000000000] 0x0
> [    1.215724] [c000000fffd73210] [c000000000008364] machine_check_early_common+0x134/0x1f8
>
> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
> first chunk is not embedded.

My system (powernv) doesn't even boot with percpu_alloc=page.

AFAIK the only reason we added support for it was to handle 4K kernels
with HPT. See commit eb553f16973a ("powerpc/64/mm: implement page
mapping percpu first chunk allocator").

So I wonder if we should change the Kconfig to only offer it as an
option in that case, and change the logic in setup_per_cpu_areas() to
only use it as a last resort.

I guess we probably still need this commit though, even if just for 4K HPT.


> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
> ---
> Changes in v4:
> - Fix coding style issues.
>
> Changes in v3:
> - Address comments from Christophe Leroy to avoid using #ifdefs in the
>   code
> - v2 at https://lore.kernel.org/linuxppc-dev/20240205053647.1763446-1-mahesh@linux.ibm.com/
>
> Changes in v2:
> - Rebase to upstream master
> - Use jump_labels, if CONFIG_JUMP_LABEL is enabled, to avoid redoing the
>   embed first chunk test at each interrupt entry.
> - v1 is at https://lore.kernel.org/linuxppc-dev/164578465828.74956.6065296024817333750.stgit@jupiter/
> ---
>  arch/powerpc/include/asm/interrupt.h | 10 ++++++++++
>  arch/powerpc/include/asm/percpu.h    | 10 ++++++++++
>  arch/powerpc/kernel/setup_64.c       |  3 +++
>  3 files changed, 23 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
> index a4196ab1d0167..0b96464ff0339 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -336,6 +336,14 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
>  	if (IS_ENABLED(CONFIG_KASAN))
>  		return;
>  
> +	/*
> +	 * Likewise, do not use it in real mode if percpu first chunk is not
> +	 * embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there
> +	 * are chances where percpu allocation can come from vmalloc area.
> +	 */
> +	if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk)

I think this would be clearer if it was inverted, eg:

        if (percpu_first_chunk_is_paged)
               return;

That way you shouldn't need to check CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK here.
Instead it can be part of the ifdef in the header.

> @@ -351,6 +359,8 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
>  		// no nmi_exit for a pseries hash guest taking a real mode exception
>  	} else if (IS_ENABLED(CONFIG_KASAN)) {
>  		// no nmi_exit for KASAN in real mode
> +	} else if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk) {
> +		// no nmi_exit if percpu first chunk is not embedded
>  	} else {
>  		nmi_exit();
>  	}
> diff --git a/arch/powerpc/include/asm/percpu.h b/arch/powerpc/include/asm/percpu.h
> index 8e5b7d0b851c6..e24063eb0b33b 100644
> --- a/arch/powerpc/include/asm/percpu.h
> +++ b/arch/powerpc/include/asm/percpu.h
> @@ -15,6 +15,16 @@
>  #endif /* CONFIG_SMP */
>  #endif /* __powerpc64__ */
>  
> +#ifdef CONFIG_PPC64
> +#include <linux/jump_label.h>
> +DECLARE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
> +
> +#define is_embed_first_chunk	\
> +		(static_key_enabled(&__percpu_embed_first_chunk.key))
> +#else
> +#define is_embed_first_chunk	true
> +#endif /* CONFIG_PPC64 */
> +

Something like:

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
#include <linux/jump_label.h>
DECLARE_STATIC_KEY_FALSE(__percpu_first_chunk_is_paged);

#define percpu_first_chunk_is_paged	\
		(static_key_enabled(&__percpu_first_chunk_is_paged.key))
#else
#define percpu_first_chunk_is_paged	false
#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */

> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 2f19d5e944852..e04f0ff69d4b6 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -834,6 +834,7 @@ static __init int pcpu_cpu_to_node(int cpu)
>  
>  unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>  EXPORT_SYMBOL(__per_cpu_offset);
> +DEFINE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
>  
>  void __init setup_per_cpu_areas(void)
>  {
> @@ -869,6 +870,8 @@ void __init setup_per_cpu_areas(void)
>  			pr_warn("PERCPU: %s allocator failed (%d), "
>  				"falling back to page size\n",
>  				pcpu_fc_names[pcpu_chosen_fc], rc);
> +		else
> +			static_key_enable(&__percpu_embed_first_chunk.key);
>  	}
>  
>  	if (rc < 0)
 
Finally, the current patch breaks the microwatt build:

  $ make microwatt_defconfig ; make -s -j (nproc)
  make[1]: Entering directory '/home/michael/linux/.build'
    GEN     Makefile
  #
  # configuration written to .config
  #
  make[1]: Leaving directory '/home/michael/linux/.build'
  ld: arch/powerpc/kernel/traps.o:(.toc+0x0): undefined reference to `__percpu_embed_first_chunk'
  ld: arch/powerpc/kernel/mce.o:(.toc+0x0): undefined reference to `__percpu_embed_first_chunk'
  make[3]: *** [../scripts/Makefile.vmlinux:37: vmlinux] Error 1

I guess because it has CONFIG_JUMP_LABEL=n?

cheers
Aneesh Kumar K V March 8, 2024, 4:41 a.m. UTC | #4
On 3/7/24 5:13 PM, Michael Ellerman wrote:
> Hi Mahesh,
> 
> Mahesh Salgaonkar <mahesh@linux.ibm.com> writes:
>> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
>> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
>> interrupt handler) if percpu allocation comes from vmalloc area.
>>
>> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
>> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
>> percpu allocation is from the embedded first chunk. However with
>> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
>> allocation can come from the vmalloc area.
>>
>> With kernel command line "percpu_alloc=page" we can force percpu allocation
>> to come from vmalloc area and can see kernel crash in machine_check_early:
>>
>> [    1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110
>> [    1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0
>> [    1.215719] --- interrupt: 200
>> [    1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable)
>> [    1.215722] [c000000fffd731b0] [0000000000000000] 0x0
>> [    1.215724] [c000000fffd73210] [c000000000008364] machine_check_early_common+0x134/0x1f8
>>
>> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
>> first chunk is not embedded.
> 
> My system (powernv) doesn't even boot with percpu_alloc=page.
> 


Can you share the crash details? 


> AFAIK the only reason we added support for it was to handle 4K kernels
> with HPT. See commit eb553f16973a ("powerpc/64/mm: implement page
> mapping percpu first chunk allocator").
> 
> So I wonder if we should change the Kconfig to only offer it as an
> option in that case, and change the logic in setup_per_cpu_areas() to
> only use it as a last resort.
> 
> I guess we probably still need this commit though, even if just for 4K HPT.
> 
>
We have also observed some errors when there is a large gap between the starting memory addresses of
NUMA nodes. That made the percpu offset really large, causing boot failures even on 64K.

-aneesh
Mahesh J Salgaonkar March 8, 2024, 5:19 a.m. UTC | #5
On 2024-03-07 22:43:07 Thu, Michael Ellerman wrote:
> > diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
> > index a4196ab1d0167..0b96464ff0339 100644
> > --- a/arch/powerpc/include/asm/interrupt.h
> > +++ b/arch/powerpc/include/asm/interrupt.h
> > @@ -336,6 +336,14 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
> >  	if (IS_ENABLED(CONFIG_KASAN))
> >  		return;
> >  
> > +	/*
> > +	 * Likewise, do not use it in real mode if percpu first chunk is not
> > +	 * embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there
> > +	 * are chances where percpu allocation can come from vmalloc area.
> > +	 */
> > +	if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk)
> 
> I think this would be clearer if it was inverted, eg:
> 
>         if (percpu_first_chunk_is_paged)
>                return;

Agree.

> 
> That way you shouldn't need to check CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK here.
> Instead it can be part of the ifdef in the header.
> 
> > @@ -351,6 +359,8 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
> >  		// no nmi_exit for a pseries hash guest taking a real mode exception
> >  	} else if (IS_ENABLED(CONFIG_KASAN)) {
> >  		// no nmi_exit for KASAN in real mode
> > +	} else if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk) {
> > +		// no nmi_exit if percpu first chunk is not embedded
> >  	} else {
> >  		nmi_exit();
> >  	}
> > diff --git a/arch/powerpc/include/asm/percpu.h b/arch/powerpc/include/asm/percpu.h
> > index 8e5b7d0b851c6..e24063eb0b33b 100644
> > --- a/arch/powerpc/include/asm/percpu.h
> > +++ b/arch/powerpc/include/asm/percpu.h
> > @@ -15,6 +15,16 @@
> >  #endif /* CONFIG_SMP */
> >  #endif /* __powerpc64__ */
> >  
> > +#ifdef CONFIG_PPC64
> > +#include <linux/jump_label.h>
> > +DECLARE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
> > +
> > +#define is_embed_first_chunk	\
> > +		(static_key_enabled(&__percpu_embed_first_chunk.key))
> > +#else
> > +#define is_embed_first_chunk	true
> > +#endif /* CONFIG_PPC64 */
> > +
> 
> Something like:
> 
> #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
> #include <linux/jump_label.h>
> DECLARE_STATIC_KEY_FALSE(__percpu_first_chunk_is_paged);
> 
> #define percpu_first_chunk_is_paged	\
> 		(static_key_enabled(&__percpu_first_chunk_is_paged.key))
> #else
> #define percpu_first_chunk_is_paged	false
> #endif /* CONFIG_PPC64 */

Sure, will fix it.

> 
> > diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> > index 2f19d5e944852..e04f0ff69d4b6 100644
> > --- a/arch/powerpc/kernel/setup_64.c
> > +++ b/arch/powerpc/kernel/setup_64.c
> > @@ -834,6 +834,7 @@ static __init int pcpu_cpu_to_node(int cpu)
> >  
> >  unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> >  EXPORT_SYMBOL(__per_cpu_offset);
> > +DEFINE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
> >  
> >  void __init setup_per_cpu_areas(void)
> >  {
> > @@ -869,6 +870,8 @@ void __init setup_per_cpu_areas(void)
> >  			pr_warn("PERCPU: %s allocator failed (%d), "
> >  				"falling back to page size\n",
> >  				pcpu_fc_names[pcpu_chosen_fc], rc);
> > +		else
> > +			static_key_enable(&__percpu_embed_first_chunk.key);
> >  	}
> >  
> >  	if (rc < 0)
>  
> Finally, the current patch breaks the microwatt build:
> 
>   $ make microwatt_defconfig ; make -s -j (nproc)
>   make[1]: Entering directory '/home/michael/linux/.build'
>     GEN     Makefile
>   #
>   # configuration written to .config
>   #
>   make[1]: Leaving directory '/home/michael/linux/.build'
>   ld: arch/powerpc/kernel/traps.o:(.toc+0x0): undefined reference to `__percpu_embed_first_chunk'
>   ld: arch/powerpc/kernel/mce.o:(.toc+0x0): undefined reference to `__percpu_embed_first_chunk'
>   make[3]: *** [../scripts/Makefile.vmlinux:37: vmlinux] Error 1
> 
> I guess because it has CONFIG_JUMP_LABEL=n?

Even with CONFIG_JUMP_LABEL=n it should still work. Let me take a look and
fix this for the microwatt build.

Thanks for your review.
-Mahesh.

> 
> cheers
Michael Ellerman March 8, 2024, 8:08 a.m. UTC | #6
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
> On 3/7/24 5:13 PM, Michael Ellerman wrote:
>> Mahesh Salgaonkar <mahesh@linux.ibm.com> writes:
>>> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
>>> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
>>> interrupt handler) if percpu allocation comes from vmalloc area.
>>>
>>> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
>>> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
>>> percpu allocation is from the embedded first chunk. However with
>>> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
>>> allocation can come from the vmalloc area.
>>>
>>> With kernel command line "percpu_alloc=page" we can force percpu allocation
>>> to come from vmalloc area and can see kernel crash in machine_check_early:
>>>
>>> [    1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110
>>> [    1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0
>>> [    1.215719] --- interrupt: 200
>>> [    1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable)
>>> [    1.215722] [c000000fffd731b0] [0000000000000000] 0x0
>>> [    1.215724] [c000000fffd73210] [c000000000008364] machine_check_early_common+0x134/0x1f8
>>>
>>> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
>>> first chunk is not embedded.
>> 
>> My system (powernv) doesn't even boot with percpu_alloc=page.
>
>
> Can you share the crash details?

Yes but it's not pretty :)

  [    1.725257][  T714] systemd-journald[714]: Collecting audit messages is disabled.
  [    1.729401][    T1] systemd[1]: Finished systemd-tmpfiles-setup-dev.service - Create Static Device Nodes in /dev.
  [^[[0;32m  OK  ^[[0m] Finished ^[[0;1;39msystemd-tmpfiles-…reate Static Device Nodes in /dev.
  [    1.773902][   C22] Disabling lock debugging due to kernel taint
  [    1.773905][   C23] Oops: Machine check, sig: 7 [#1]
  [    1.773911][   C23] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
  [    1.773916][   C23] Modules linked in:
  [    1.773920][   C23] CPU: 23 PID: 0 Comm: swapper/23 Tainted: G   M               6.8.0-rc7-02500-g23515c370cbb #1
  [    1.773924][   C23] Hardware name: 8335-GTH POWER9 0x4e1202 opal:skiboot-v6.5.3-35-g1851b2a06 PowerNV
  [    1.773926][   C23] NIP:  0000000000000000 LR: 0000000000000000 CTR: 0000000000000000
  [    1.773929][   C23] REGS: c000000fffa6ef50 TRAP: 0000   Tainted: G   M                (6.8.0-rc7-02500-g23515c370cbb)
  [    1.773932][   C23] MSR:  0000000000000000 <>  CR: 00000000  XER: 00000000
  [    1.773937][   C23] CFAR: 0000000000000000 IRQMASK: 3 
  [    1.773937][   C23] GPR00: 0000000000000000 c000000fffa6efe0 c000000fffa6efb0 0000000000000000 
  [    1.773937][   C23] GPR04: c00000000003d8c0 c000000001f5f000 0000000000000000 0000000000000103 
  [    1.773937][   C23] GPR08: 0000000000000003 653a0d962a590300 0000000000000000 0000000000000000 
  [    1.773937][   C23] GPR12: c000000fffa6f280 0000000000000000 c0000000000084a4 0000000000000000 
  [    1.773937][   C23] GPR16: 0000000053474552 0000000000000000 c00000000003d8c0 c000000fffa6f280 
  [    1.773937][   C23] GPR20: c000000001f5f000 c000000fffa6f340 c000000fffa6f2e8 0000000000000000 
  [    1.773937][   C23] GPR24: 0007fffffecf0000 c0000000065bbb80 0000000000550102 c000000002172b20 
  [    1.773937][   C23] GPR28: 0000000000000000 0000000053474552 0000000000000000 c000000ffffc6d80 
  [    1.773982][   C23] NIP [0000000000000000] 0x0
  [    1.773988][   C23] LR [0000000000000000] 0x0
  [    1.773990][   C23] Call Trace:
  [    1.773991][   C23] [c000000fffa6efe0] [c000000001f5f000] .TOC.+0x0/0xa1000 (unreliable)
  [    1.773999][   C23] Code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX 
  [    1.774021][   C23] ---[ end trace 0000000000000000 ]---

Something has gone badly wrong.

That was a test kernel with some other commits, but nothing that should
cause that. Removing percpu_alloc=page fix it.

It's based on fddff98e83b4b4d54470902ea0d520c4d423ca3b.

>> AFAIK the only reason we added support for it was to handle 4K kernels
>> with HPT. See commit eb553f16973a ("powerpc/64/mm: implement page
>> mapping percpu first chunk allocator").
>> 
>> So I wonder if we should change the Kconfig to only offer it as an
>> option in that case, and change the logic in setup_per_cpu_areas() to
>> only use it as a last resort.
>> 
>> I guess we probably still need this commit though, even if just for 4K HPT.
>> 
>>
> We have also observed some error when we have large gap between the start memory of
> NUMA nodes. That made the percpu offset really large causing boot failures even on 64K.

Yeah, I have vague memories of that :)

cheers
Mahesh J Salgaonkar April 10, 2024, 4:38 a.m. UTC | #7
On 2024-03-08 19:08:50 Fri, Michael Ellerman wrote:
> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
> > On 3/7/24 5:13 PM, Michael Ellerman wrote:
> >> Mahesh Salgaonkar <mahesh@linux.ibm.com> writes:
> >>> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
> >>> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
> >>> interrupt handler) if percpu allocation comes from vmalloc area.
> >>>
> >>> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
> >>> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
> >>> percpu allocation is from the embedded first chunk. However with
> >>> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
> >>> allocation can come from the vmalloc area.
> >>>
> >>> With kernel command line "percpu_alloc=page" we can force percpu allocation
> >>> to come from vmalloc area and can see kernel crash in machine_check_early:
> >>>
> >>> [    1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110
> >>> [    1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0
> >>> [    1.215719] --- interrupt: 200
> >>> [    1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable)
> >>> [    1.215722] [c000000fffd731b0] [0000000000000000] 0x0
> >>> [    1.215724] [c000000fffd73210] [c000000000008364] machine_check_early_common+0x134/0x1f8
> >>>
> >>> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
> >>> first chunk is not embedded.
> >> 
> >> My system (powernv) doesn't even boot with percpu_alloc=page.
> >
> >
> > Can you share the crash details?
> 
> Yes but it's not pretty :)
> 
>   [    1.725257][  T714] systemd-journald[714]: Collecting audit messages is disabled.
>   [    1.729401][    T1] systemd[1]: Finished systemd-tmpfiles-setup-dev.service - Create Static Device Nodes in /dev.
>   [^[[0;32m  OK  ^[[0m] Finished ^[[0;1;39msystemd-tmpfiles-…reate Static Device Nodes in /dev.
>   [    1.773902][   C22] Disabling lock debugging due to kernel taint
>   [    1.773905][   C23] Oops: Machine check, sig: 7 [#1]
>   [    1.773911][   C23] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
>   [    1.773916][   C23] Modules linked in:
>   [    1.773920][   C23] CPU: 23 PID: 0 Comm: swapper/23 Tainted: G   M               6.8.0-rc7-02500-g23515c370cbb #1
>   [    1.773924][   C23] Hardware name: 8335-GTH POWER9 0x4e1202 opal:skiboot-v6.5.3-35-g1851b2a06 PowerNV
>   [    1.773926][   C23] NIP:  0000000000000000 LR: 0000000000000000 CTR: 0000000000000000
>   [    1.773929][   C23] REGS: c000000fffa6ef50 TRAP: 0000   Tainted: G   M                (6.8.0-rc7-02500-g23515c370cbb)
>   [    1.773932][   C23] MSR:  0000000000000000 <>  CR: 00000000  XER: 00000000
>   [    1.773937][   C23] CFAR: 0000000000000000 IRQMASK: 3 
>   [    1.773937][   C23] GPR00: 0000000000000000 c000000fffa6efe0 c000000fffa6efb0 0000000000000000 
>   [    1.773937][   C23] GPR04: c00000000003d8c0 c000000001f5f000 0000000000000000 0000000000000103 
>   [    1.773937][   C23] GPR08: 0000000000000003 653a0d962a590300 0000000000000000 0000000000000000 
>   [    1.773937][   C23] GPR12: c000000fffa6f280 0000000000000000 c0000000000084a4 0000000000000000 
>   [    1.773937][   C23] GPR16: 0000000053474552 0000000000000000 c00000000003d8c0 c000000fffa6f280 
>   [    1.773937][   C23] GPR20: c000000001f5f000 c000000fffa6f340 c000000fffa6f2e8 0000000000000000 
>   [    1.773937][   C23] GPR24: 0007fffffecf0000 c0000000065bbb80 0000000000550102 c000000002172b20 
>   [    1.773937][   C23] GPR28: 0000000000000000 0000000053474552 0000000000000000 c000000ffffc6d80 
>   [    1.773982][   C23] NIP [0000000000000000] 0x0
>   [    1.773988][   C23] LR [0000000000000000] 0x0
>   [    1.773990][   C23] Call Trace:
>   [    1.773991][   C23] [c000000fffa6efe0] [c000000001f5f000] .TOC.+0x0/0xa1000 (unreliable)
>   [    1.773999][   C23] Code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX 
>   [    1.774021][   C23] ---[ end trace 0000000000000000 ]---
> 
> Something has gone badly wrong.
> 
> That was a test kernel with some other commits, but nothing that should
> cause that. Removing percpu_alloc=page fix it.

So, when I try this without my patch "Avoid nmi_enter/nmi_exit in real
mode interrupt", I see this getting recreated. However, I was not able
to recreate this even once with my changes. Are you able to see this
crash with my patch?

Thanks,
-Mahesh.
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index a4196ab1d0167..0b96464ff0339 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -336,6 +336,14 @@  static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
 	if (IS_ENABLED(CONFIG_KASAN))
 		return;
 
+	/*
+	 * Likewise, do not use it in real mode if the percpu first chunk is
+	 * not embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled,
+	 * percpu allocation can come from the vmalloc area.
+	 */
+	if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk)
+		return;
+
 	/* Otherwise, it should be safe to call it */
 	nmi_enter();
 }
@@ -351,6 +359,8 @@  static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
 		// no nmi_exit for a pseries hash guest taking a real mode exception
 	} else if (IS_ENABLED(CONFIG_KASAN)) {
 		// no nmi_exit for KASAN in real mode
+	} else if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && !is_embed_first_chunk) {
+		// no nmi_exit if percpu first chunk is not embedded
 	} else {
 		nmi_exit();
 	}
diff --git a/arch/powerpc/include/asm/percpu.h b/arch/powerpc/include/asm/percpu.h
index 8e5b7d0b851c6..e24063eb0b33b 100644
--- a/arch/powerpc/include/asm/percpu.h
+++ b/arch/powerpc/include/asm/percpu.h
@@ -15,6 +15,16 @@ 
 #endif /* CONFIG_SMP */
 #endif /* __powerpc64__ */
 
+#ifdef CONFIG_PPC64
+#include <linux/jump_label.h>
+DECLARE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
+
+#define is_embed_first_chunk	\
+		(static_key_enabled(&__percpu_embed_first_chunk.key))
+#else
+#define is_embed_first_chunk	true
+#endif /* CONFIG_PPC64 */
+
 #include <asm-generic/percpu.h>
 
 #include <asm/paca.h>
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 2f19d5e944852..e04f0ff69d4b6 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -834,6 +834,7 @@  static __init int pcpu_cpu_to_node(int cpu)
 
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
+DEFINE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
 
 void __init setup_per_cpu_areas(void)
 {
@@ -869,6 +870,8 @@  void __init setup_per_cpu_areas(void)
 			pr_warn("PERCPU: %s allocator failed (%d), "
 				"falling back to page size\n",
 				pcpu_fc_names[pcpu_chosen_fc], rc);
+		else
+			static_key_enable(&__percpu_embed_first_chunk.key);
 	}
 
 	if (rc < 0)