[v3,5/6] powerpc/64: Add support for out-of-line static calls

Message ID 20221005053234.29312-6-bgray@linux.ibm.com (mailing list archive)
State Superseded
Series Out-of-line static calls for powerpc64 ELF V2

Commit Message

Benjamin Gray Oct. 5, 2022, 5:32 a.m. UTC
Implement static call support for the 64-bit ELF V2 ABI. This requires
making sure the TOC is kept correct across kernel-module boundaries. As
a secondary concern, it tries to use the local entry point of a target
wherever possible. It does so by checking whether both tramp & target
are kernel code, falling back to detecting the common global entry
point patterns if modules are involved. Detecting the global entry
point is also required for setting the local entry point as the
trampoline target: if we cannot detect the local entry point, then we
need to conservatively initialise r12 and use the global entry point.
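
For context — illustrative, not part of the patch — the usual ELFv2
global entry sequence being detected here looks like:

func:
0:	addis	r2, r12, (.TOC. - 0b)@ha	# global entry: compute the TOC
	addi	r2, r2, (.TOC. - 0b)@l		#   pointer from r12
	.localentry	func, . - func		# local entry starts here;
	...					#   r2 is assumed already valid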

The trampolines are marked with `.localentry NAME, 1` to make the
linker save and restore the TOC on each call to the trampoline. This
allows the trampoline to safely target functions with different TOC
values.
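
Illustratively (my sketch, not from the patch), a call to such a
trampoline then ends up roughly as:

	bl	__SCT__name	# may be routed via a linker stub that
				#   first does: std r2, 24(r1)
	ld	r2, 24(r1)	# TOC restore, patched in by the linker
				#   in place of the compiler's nop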

However, this directive also implies the TOC is not initialised on entry
to the trampoline. The kernel TOC is easily found in the PACA, but not
an arbitrary module TOC. Therefore the trampoline implementation depends
on whether it's in the kernel or not. If in the kernel, we initialise
the TOC using the PACA. If in a module, we have to initialise the TOC
with zero context, so it's quite expensive.
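
A minimal sketch of the difference (the real trampolines are in the
patch below):

	ld	r2, 16(r13)			# kernel: TOC straight from the PACA

versus the module case, which has to derive it from its own address:

	mflr	r11				# preserve LR
	bcl	20, 31, $+4			# branch-and-link to obtain the PC
0:	mflr	r12				# r12 = address of label 0
	mtlr	r11				# restore LR
	addis	r2, r12, (.TOC. - 0b)@ha	# TOC = PC + link-time offset
	addi	r2, r2, (.TOC. - 0b)@l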

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
---
 arch/powerpc/Kconfig                     |  14 ++-
 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/include/asm/static_call.h   |  80 +++++++++++++-
 arch/powerpc/kernel/Makefile             |   3 +-
 arch/powerpc/kernel/static_call.c        | 130 +++++++++++++++++++++--
 5 files changed, 216 insertions(+), 12 deletions(-)

Comments

Christophe Leroy Oct. 5, 2022, 7:38 p.m. UTC | #1

This looks good to me

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>

However, thinking out loud, I'm wondering: could we make things any
simpler when CONFIG_MODULES is not selected, or is that too much of a
corner case on PPC64?

I'm asking because on embedded PPC32 it is common not to have
CONFIG_MODULES set.

Michael Ellerman Oct. 6, 2022, 12:39 a.m. UTC | #2
Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> However, thinking out loud, I'm wondering: could we make things any
> simpler when CONFIG_MODULES is not selected, or is that too much of a
> corner case on PPC64?

I'd say it's mostly a corner case.

Obviously no distros ship with modules disabled. 

AFAIK even the stripped down kernels we use in CPU bringup have modules
enabled.

So I think it's probably not worth worrying about, unless there's an
obvious and fairly simple optimisation.

cheers
Benjamin Gray Oct. 6, 2022, 5:01 a.m. UTC | #3
On Thu, 2022-10-06 at 11:39 +1100, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> > However, thinking out loud, I'm wondering: could we make things any
> > simpler when CONFIG_MODULES is not selected, or is that too much of a
> > corner case on PPC64?
> 
> I'd say it's mostly a corner case.
> 
> Obviously no distros ship with modules disabled. 
> 
> AFAIK even the stripped down kernels we use in CPU bringup have
> modules
> enabled.
> 
> So I think it's probably not worth worrying about, unless there's an
> obvious and fairly simple optimisation.
> 
> cheers

Yeah, I think supporting this case would amount to a whole new
trampoline implementation. Something like the original RFC
implementation would be best here as there would only be one TOC to
worry about.

Instead, I expect this can all be optimised better once we can apply
patches to call-sites. If so, we can patch the call-site NOPs ourselves
without marking the trampoline as caller-saved TOC, which would remove
the suboptimal save-r2 trampolines. Then we could use separate local &
global entry points like the RFC. This would unify the kernel/module
trampolines again.
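
Roughly, illustratively:

	bl	__SCT__name		# today: followed by a linker-kept
	ld	r2, 24(r1)		#   TOC restore

could become, once we can patch call-sites:

	bl	<target local entry>	# patched directly at the call site
	nop				# no TOC restore needed when the
					#   target shares our TOC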

The benefit of this series is that it works with just what the ABI
offers, without extra kernel tools / linker magic. But I would
definitely revisit it once call-site patching is possible, especially
when working on inline static calls.
Segher Boessenkool Oct. 6, 2022, 6:22 p.m. UTC | #4
On Thu, Oct 06, 2022 at 11:39:50AM +1100, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> > However, thinking out loud, I'm wondering: could we make things any
> > simpler when CONFIG_MODULES is not selected, or is that too much of a
> > corner case on PPC64?
> 
> I'd say it's mostly a corner case.
> 
> Obviously no distros ship with modules disabled. 
> 
> AFAIK even the stripped down kernels we use in CPU bringup have modules
> enabled.
> 
> So I think it's probably not worth worrying about, unless there's an
> obvious and fairly simple optimisation.

Long ago I built kernels that fit together with the boot firmware and a
root fs (busybox+dropbear essentially) in 4MB, but I doubt we can get
close to that at all these days :-)

What is the overhead if you enable modules but do not use them, these
days?


Segher
Christophe Leroy Oct. 6, 2022, 6:38 p.m. UTC | #5
On 06/10/2022 at 20:22, Segher Boessenkool wrote:
> On Thu, Oct 06, 2022 at 11:39:50AM +1100, Michael Ellerman wrote:
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>> However, thinking out loud, I'm wondering: could we make things any
>>> simpler when CONFIG_MODULES is not selected, or is that too much of a
>>> corner case on PPC64?
>>
>> I'd say it's mostly a corner case.
>>
>> Obviously no distros ship with modules disabled.
>>
>> AFAIK even the stripped down kernels we use in CPU bringup have modules
>> enabled.
>>
>> So I think it's probably not worth worrying about, unless there's an
>> obvious and fairly simple optimisation.
> 
> Long ago I built kernels that fit together with the boot firmware and a
> root fs (busybox+dropbear essentially) in 4MB, but I doubt we can get
> close to that at all these days :-)

4MB, not easy. But 8M is still achievable. Well, our smaller board has
32M; we have thousands of them spread all over Europe and have to keep
them up to date...

> 
> What is the overhead if you enable modules but do not use them, these
> days?
> 

On the 8xx it is mainly the instruction TLB miss handler:

#ifdef CONFIG_MODULES
	mfcr	r11
	not.	r10, r10
#endif
	mfspr	r10, SPRN_M_TWB	/* Get level 1 table */
#ifdef CONFIG_MODULES
	blt+	3f
	rlwinm	r10, r10, 0, 20, 31
	oris	r10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha
3:
	mtcr	r11
#endif


And also some patches which have an interesting impact, like commit 
cb3ac45214c0 ("powerpc/code-patching: Don't call 
is_vmalloc_or_module_addr() without CONFIG_MODULES")


On the other hand, if we one day want to replace nftables with BPF-jitted
iptables, CONFIG_MODULES will be required. So I don't know yet what the
trade-off will be, because BPF is not yet cross-compile friendly.

Christophe
Segher Boessenkool Oct. 6, 2022, 8:45 p.m. UTC | #6
On Thu, Oct 06, 2022 at 06:38:00PM +0000, Christophe Leroy wrote:
> On 06/10/2022 at 20:22, Segher Boessenkool wrote:
> > Long ago I built kernels that fit together with the boot firmware and a
> > root fs (busybox+dropbear essentially) in 4MB, but I doubt we can get
> > close to that at all these days :-)
> 
> 4MB, not easy. But 8M still achievable. Well our smaller board has 32M, 
> we have thousands of it spread all over Europe and have to keep it up to 
> date ....

The smallest of these systems had 256MB RAM.  This 4MB is flash ROM :-)

> > What is the overhead if you enable modules but do not use them, these
> > days?
> 
> On the 8xx it is mainly the instruction TLB miss handler:

I meant just an indicative code size number...  100 bytes, 100kB, 100MB,
or something like that :-)  And, on 64 bit, which is what the question
was about!


Segher
Christophe Leroy Oct. 6, 2022, 8:50 p.m. UTC | #7
On 06/10/2022 at 22:45, Segher Boessenkool wrote:
> On Thu, Oct 06, 2022 at 06:38:00PM +0000, Christophe Leroy wrote:
>> On 06/10/2022 at 20:22, Segher Boessenkool wrote:
>>> Long ago I built kernels that fit together with the boot firmware and a
>>> root fs (busybox+dropbear essentially) in 4MB, but I doubt we can get
>>> close to that at all these days :-)
>>
>> 4MB, not easy. But 8M still achievable. Well our smaller board has 32M,
>> we have thousands of it spread all over Europe and have to keep it up to
>> date ....
> 
> The smallest of these systems had 256MB RAM.  This 4MB is flash ROM :-)

I fit Uboot + DTB + Kernel + Initramfs with klibc and mtdutils in a 2MB 
flash ROM.

> 
>>> What is the overhead if you enable modules but do not use them, these
>>> days?
>>
>> On the 8xx it is mainly the instruction TLB miss handler:
> 
> I meant just an indicative code size number...  100 bytes, 100kB, 100MB,
> or something like that :-)  And, on 64 bit, which is what the question
> was about!

Ah, does the size really matter here? I was thinking more in terms of
performance when I made the comment.

Christophe
Segher Boessenkool Oct. 6, 2022, 9:04 p.m. UTC | #8
On Thu, Oct 06, 2022 at 08:50:31PM +0000, Christophe Leroy wrote:
> On 06/10/2022 at 22:45, Segher Boessenkool wrote:
> > I meant just an indicative code size number...  100 bytes, 100kB, 100MB,
> > or something like that :-)  And, on 64 bit, which is what the question
> > was about!
> 
> Ah, does the size really matters here ? I was thinking more in terms of 
> performance when I made the comment.

Other than some unused code there should not be much performance impact
at all from enabling modules when not needed, on 64 bit.  Security (in
depth) is a very different kettle of fish of course ;-)


Segher

Patch

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 4c466acdc70d..962e36ec34ec 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -102,6 +102,18 @@  config GENERIC_HWEIGHT
 	bool
 	default y
 
+config TOOLCHAIN_SUPPORTS_LOCALENTRY1
+	bool
+	depends on PPC64_ELF_ABI_V2
+	default y if LD_VERSION >= 23200 || LLD_VERSION >= 110000
+	help
+	  A section of the ELF symbol st_other field can be given the value 1
+	  using the directive '.localentry NAME, 1' to mean the local and global
+	  entry points are the same, and r2 should be treated as caller-saved.
+
+	  Older versions of Clang and binutils do not recognise this form of the
+	  directive and will error if it is used.
+
 config PPC
 	bool
 	default y
@@ -248,7 +260,7 @@  config PPC
 	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_STACKPROTECTOR		if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
 	select HAVE_STACKPROTECTOR		if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
-	select HAVE_STATIC_CALL			if PPC32
+	select HAVE_STATIC_CALL			if PPC32 || (PPC64_ELF_ABI_V2 && TOOLCHAIN_SUPPORTS_LOCALENTRY1)
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HUGETLB_PAGE_SIZE_VARIABLE	if PPC_BOOK3S_64 && HUGETLB_PAGE
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 170bfa848c7c..cb4629e55e57 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -152,6 +152,7 @@  int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src);
 bool is_conditional_branch(ppc_inst_t instr);
 
 #define OP_RT_RA_MASK	0xffff0000UL
+#define OP_SI_MASK	0x0000ffffUL
 #define LIS_R2		(PPC_RAW_LIS(_R2, 0))
 #define ADDIS_R2_R12	(PPC_RAW_ADDIS(_R2, _R12, 0))
 #define ADDI_R2_R2	(PPC_RAW_ADDI(_R2, _R2, 0))
diff --git a/arch/powerpc/include/asm/static_call.h b/arch/powerpc/include/asm/static_call.h
index de1018cc522b..3d6e82200cb7 100644
--- a/arch/powerpc/include/asm/static_call.h
+++ b/arch/powerpc/include/asm/static_call.h
@@ -2,12 +2,75 @@ 
 #ifndef _ASM_POWERPC_STATIC_CALL_H
 #define _ASM_POWERPC_STATIC_CALL_H
 
+#ifdef CONFIG_PPC64_ELF_ABI_V2
+
+#ifdef MODULE
+
+#define __PPC_SCT(name, inst)					\
+	asm(".pushsection .text, \"ax\"				\n"	\
+	    ".align 6						\n"	\
+	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
+	    ".localentry " STATIC_CALL_TRAMP_STR(name) ", 1	\n"	\
+	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
+	    "	mflr	11					\n"	\
+	    "	bcl	20, 31, $+4				\n"	\
+	    "0:	mflr	12					\n"	\
+	    "	mtlr	11					\n"	\
+	    "	addi	12, 12, (" STATIC_CALL_TRAMP_STR(name) " - 0b)	\n"	\
+	    "	addis 2, 12, (.TOC.-" STATIC_CALL_TRAMP_STR(name) ")@ha	\n"	\
+	    "	addi 2, 2, (.TOC.-" STATIC_CALL_TRAMP_STR(name) ")@l	\n"	\
+	    "	" inst "					\n"	\
+	    "	ld	12, (2f - " STATIC_CALL_TRAMP_STR(name) ")(12)	\n"	\
+	    "	mtctr	12					\n"	\
+	    "	bctr						\n"	\
+	    "1:	li	3, 0					\n"	\
+	    "	blr						\n"	\
+	    ".balign 8						\n"	\
+	    "2:	.8byte 0					\n"	\
+	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function	\n"	\
+	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
+	    ".popsection					\n")
+
+#else /* KERNEL */
+
+#define __PPC_SCT(name, inst)					\
+	asm(".pushsection .text, \"ax\"				\n"	\
+	    ".align 5						\n"	\
+	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
+	    ".localentry " STATIC_CALL_TRAMP_STR(name) ", 1	\n"	\
+	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
+	    "	ld	2, 16(13)				\n"	\
+	    "	" inst "					\n"	\
+	    "	addis	12, 2, 2f@toc@ha			\n"	\
+	    "	ld	12, 2f@toc@l(12)			\n"	\
+	    "	mtctr	12					\n"	\
+	    "	bctr						\n"	\
+	    "1:	li	3, 0					\n"	\
+	    "	blr						\n"	\
+	    ".balign 8						\n"	\
+	    "2:	.8byte 0					\n"	\
+	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function	\n"	\
+	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
+	    ".popsection					\n")
+
+#endif /* MODULE */
+
+#define PPC_SCT_INST_MODULE		28		/* Offset of instruction to update */
+#define PPC_SCT_RET0_MODULE		44		/* Offset of label 1 */
+#define PPC_SCT_DATA_MODULE		56		/* Offset of label 2 (aligned) */
+
+#define PPC_SCT_INST_KERNEL		4		/* Offset of instruction to update */
+#define PPC_SCT_RET0_KERNEL		24		/* Offset of label 1 */
+#define PPC_SCT_DATA_KERNEL		32		/* Offset of label 2 (aligned) */
+
+#elif defined(CONFIG_PPC32)
+
 #define __PPC_SCT(name, inst)					\
 	asm(".pushsection .text, \"ax\"				\n"	\
 	    ".align 5						\n"	\
 	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
 	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
-	    inst "						\n"	\
+	    "	" inst "					\n"	\
 	    "	lis	12,2f@ha				\n"	\
 	    "	lwz	12,2f@l(12)				\n"	\
 	    "	mtctr	12					\n"	\
@@ -19,11 +82,20 @@ 
 	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
 	    ".popsection					\n")
 
-#define PPC_SCT_RET0		20		/* Offset of label 1 */
-#define PPC_SCT_DATA		28		/* Offset of label 2 */
+#define PPC_SCT_INST_MODULE		0		/* Offset of instruction to update */
+#define PPC_SCT_RET0_MODULE		20		/* Offset of label 1 */
+#define PPC_SCT_DATA_MODULE		28		/* Offset of label 2 */
+
+#define PPC_SCT_INST_KERNEL		PPC_SCT_INST_MODULE
+#define PPC_SCT_RET0_KERNEL		PPC_SCT_RET0_MODULE
+#define PPC_SCT_DATA_KERNEL		PPC_SCT_DATA_MODULE
+
+#else /* !CONFIG_PPC64_ELF_ABI_V2 && !CONFIG_PPC32 */
+#error "Unsupported ABI"
+#endif /* CONFIG_PPC64_ELF_ABI_V2 */
 
 #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)	__PPC_SCT(name, "b " #func)
 #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)	__PPC_SCT(name, "blr")
-#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)	__PPC_SCT(name, "b .+20")
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)	__PPC_SCT(name, "b 1f")
 
 #endif /* _ASM_POWERPC_STATIC_CALL_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 06d2d1f78f71..a30d0d0f5499 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -128,8 +128,9 @@  extra-y				+= vmlinux.lds
 
 obj-$(CONFIG_RELOCATABLE)	+= reloc_$(BITS).o
 
-obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o early_32.o static_call.o
+obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o early_32.o
 obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
+obj-$(CONFIG_HAVE_STATIC_CALL)	+= static_call.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_SMP)		+= smp.o
diff --git a/arch/powerpc/kernel/static_call.c b/arch/powerpc/kernel/static_call.c
index 863a7aa24650..9211b2e189bb 100644
--- a/arch/powerpc/kernel/static_call.c
+++ b/arch/powerpc/kernel/static_call.c
@@ -1,33 +1,151 @@ 
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/bitops.h>
 #include <linux/memory.h>
 #include <linux/static_call.h>
 
 #include <asm/code-patching.h>
 
+static long sign_extend_long(unsigned long value, int index)
+{
+	if (sizeof(long) == 8)
+		return sign_extend64(value, index);
+	else
+		return sign_extend32(value, index);
+}
+
+static void *ppc_function_toc(u32 *func)
+{
+	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) {
+		/* There are two common global entry sequences we handle below
+		 *
+		 * 1. addis r2, r12, SI1
+		 *    addi r2, SI2
+		 *
+		 * 2. lis r2, SI1
+		 *    addi r2, SI2
+		 *
+		 * Where r12 contains the global entry point address (it is otherwise
+		 * uninitialised, so doesn't matter what value we use if this is not
+		 * a separate global entry point).
+		 *
+		 * Here we simulate running the given sequence and return the result it
+		 * would calculate. If the sequence is not recognised we return NULL.
+		 */
+		u32 insn1 = *func;
+		u32 insn2 = *(func + 1);
+		unsigned long op_regs1 = insn1 & OP_RT_RA_MASK;
+		unsigned long op_regs2 = insn2 & OP_RT_RA_MASK;
+		unsigned long si1 = insn1 & OP_SI_MASK;
+		unsigned long si2 = insn2 & OP_SI_MASK;
+		unsigned long imm1 = sign_extend_long(si1 << 16, 31);
+		unsigned long imm2 = sign_extend_long(si2, 15);
+		unsigned long addr = 0;
+
+		/* Simulate the first instruction */
+		if (op_regs1 == ADDIS_R2_R12)
+			addr += (unsigned long)func + imm1;
+		else if (op_regs1 == LIS_R2)
+			addr += imm1;
+		else
+			return NULL;
+
+		/* Simulate the second instruction */
+		if (op_regs2 == ADDI_R2_R2)
+			addr += imm2;
+		else
+			return NULL;
+
+		return (void *)addr;
+	}
+
+	return NULL;
+}
+
+static bool shares_toc(void *func1, void *func2)
+{
+	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) {
+		void *func1_toc;
+		void *func2_toc;
+
+		if (func1 == NULL || func2 == NULL)
+			return false;
+
+		/* Assume the kernel only uses a single TOC */
+		if (core_kernel_text((unsigned long)func1) &&
+		    core_kernel_text((unsigned long)func2))
+			return true;
+
+		/* Fall back to calculating the TOC from common patterns
+		 * if modules are involved
+		 */
+		func1_toc = ppc_function_toc(func1);
+		func2_toc = ppc_function_toc(func2);
+		return func1_toc != NULL && func2_toc != NULL && func1_toc == func2_toc;
+	}
+
+	return true;
+}
+
+static void *get_inst_addr(void *tramp)
+{
+	return tramp + (core_kernel_text((unsigned long)tramp)
+				? PPC_SCT_INST_KERNEL
+				: PPC_SCT_INST_MODULE);
+}
+
+static void *get_ret0_addr(void *tramp)
+{
+	return tramp + (core_kernel_text((unsigned long)tramp)
+				? PPC_SCT_RET0_KERNEL
+				: PPC_SCT_RET0_MODULE);
+}
+
+static void *get_data_addr(void *tramp)
+{
+	return tramp + (core_kernel_text((unsigned long) tramp)
+				? PPC_SCT_DATA_KERNEL
+				: PPC_SCT_DATA_MODULE);
+}
+
 void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
 {
 	int err;
 	bool is_ret0 = (func == __static_call_return0);
-	unsigned long target = (unsigned long)(is_ret0 ? tramp + PPC_SCT_RET0 : func);
-	bool is_short = is_offset_in_branch_range((long)target - (long)tramp);
+	bool is_short;
+	void *target = is_ret0 ? get_ret0_addr(tramp) : func;
+	void *tramp_inst = get_inst_addr(tramp);
 
 	if (!tramp)
 		return;
 
+	if (is_ret0)
+		is_short = true;
+	else if (shares_toc(tramp, target))
+		is_short = is_offset_in_branch_range(
+			(long)ppc_function_entry(target) - (long)tramp_inst);
+	else
+		/* Combine out-of-range with not sharing a TOC. Though it's possible
+		 * an out-of-range target shares a TOC, handling this separately
+		 * complicates the trampoline. It's simpler to always use the global
+		 * entry point in this case.
+		 */
+		is_short = false;
+
 	mutex_lock(&text_mutex);
 
 	if (func && !is_short) {
-		err = patch_instruction(tramp + PPC_SCT_DATA, ppc_inst(target));
+		err = patch_ulong(get_data_addr(tramp), (unsigned long)target);
 		if (err)
 			goto out;
 	}
 
 	if (!func)
-		err = patch_instruction(tramp, ppc_inst(PPC_RAW_BLR()));
+		err = patch_instruction(tramp_inst, ppc_inst(PPC_RAW_BLR()));
 	else if (is_short)
-		err = patch_branch(tramp, target, 0);
+		err = patch_branch(tramp_inst, ppc_function_entry(target), 0);
 	else
-		err = patch_instruction(tramp, ppc_inst(PPC_RAW_NOP()));
+		err = patch_instruction(tramp_inst, ppc_inst(PPC_RAW_NOP()));
+
 out:
 	mutex_unlock(&text_mutex);