[bpf-next,v4,02/10] bpf: powerpc64: pad function address loads with NOPs

Message ID d0db970711596827ba88209e545c686d77c22b7d.1527143877.git.sandipan@linux.vnet.ibm.com (mailing list archive)
State Not Applicable
Series bpf: enhancements for multi-function programs

Commit Message

Sandipan Das May 24, 2018, 6:56 a.m. UTC
For multi-function programs, loading the address of a callee
function into a register requires emitting one to five
instructions, depending on the nature of the address.

Since the callee's address only becomes known just before the
extra pass, the number of instructions required to load it may
differ from what was generated earlier. This can make the JITed
image grow or shrink.

To avoid this, we should always generate a five-instruction
sequence when loading function addresses, by padding the
optimized load sequence with NOPs.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)
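
For context, the load sequence varies in length because PPC_LI64()
skips instructions that would only set all-zero halfwords of the
immediate. A rough, illustration-only counterpart of that decision
logic is sketched below as a plain C instruction counter; the helper
name li64_insn_count is invented here, and the authoritative logic is
the PPC_LI64() macro in arch/powerpc/net/bpf_jit.h.

/* Illustration only: approximate number of instructions PPC_LI64()
 * emits for a given 64-bit immediate. Mirrors the shape of the macro
 * in arch/powerpc/net/bpf_jit.h; it is not a verbatim copy.
 */
static int li64_insn_count(unsigned long long imm)
{
	int n = 0;

	if ((long long)imm >= -32768 && (long long)imm < 32768) {
		n = 1;					/* single li   */
	} else if ((long long)imm >= -2147483648LL &&
		   (long long)imm < 2147483648LL) {
		n = (imm & 0xffff) ? 2 : 1;		/* lis [+ ori] */
	} else {
		if (imm & 0xffff000000000000ULL) {
			n++;				/* lis         */
			if (imm & 0x0000ffff00000000ULL)
				n++;			/* ori         */
		} else {
			n++;				/* li          */
		}
		n++;					/* sldi        */
		if (imm & 0x00000000ffff0000ULL)
			n++;				/* oris        */
		if (imm & 0x000000000000ffffULL)
			n++;				/* ori         */
	}
	return n;					/* 1 to 5      */
}

Padding every such load out to the worst case of five instructions
keeps the image size computed in the earlier JIT passes valid once the
real callee address is patched in during the extra pass.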

Comments

Daniel Borkmann May 24, 2018, 7:34 a.m. UTC | #1
On 05/24/2018 08:56 AM, Sandipan Das wrote:
> For multi-function programs, loading the address of a callee
> function into a register requires emitting one to five
> instructions, depending on the nature of the address.
> 
> Since the callee's address only becomes known just before the
> extra pass, the number of instructions required to load it may
> differ from what was generated earlier. This can make the JITed
> image grow or shrink.
> 
> To avoid this, we should always generate a five-instruction
> sequence when loading function addresses, by padding the
> optimized load sequence with NOPs.
> 
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
> ---
>  arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
>  1 file changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
> index 1bdb1aff0619..e4582744a31d 100644
> --- a/arch/powerpc/net/bpf_jit_comp64.c
> +++ b/arch/powerpc/net/bpf_jit_comp64.c
> @@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
>  
>  static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
>  {
> +	unsigned int i, ctx_idx = ctx->idx;
> +
> +	/* Load function address into r12 */
> +	PPC_LI64(12, func);
> +
> +	/* For bpf-to-bpf function calls, the callee's address is unknown
> +	 * until the last extra pass. As seen above, we use PPC_LI64() to
> +	 * load the callee's address, but this may optimize the number of
> +	 * instructions required based on the nature of the address.
> +	 *
> +	 * Since we don't want the number of instructions emitted to change,
> +	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
> +	 * we always have a five-instruction sequence, which is the maximum
> +	 * that PPC_LI64() can emit.
> +	 */
> +	for (i = ctx->idx - ctx_idx; i < 5; i++)
> +		PPC_NOP();

By the way, I think you can still optimize this. The nops are not really
needed in case of insn->src_reg != BPF_PSEUDO_CALL since the address of
a normal BPF helper call will always be at a fixed location and known a
priori.
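
One possible shape for that follow-up (a sketch only, not part of
this patch; the function name and the pad parameter are illustrative):
pad only when the target is a bpf-to-bpf call, and emit the plain
optimized load for helpers whose addresses are fixed and known up
front.

static void bpf_jit_emit_func_addr(u32 *image, struct codegen_context *ctx,
				   u64 func, bool pad)
{
	unsigned int i, ctx_idx = ctx->idx;

	/* Load function address into r12 */
	PPC_LI64(12, func);

	if (!pad)
		return;

	/* Pad to the PPC_LI64() worst case of five instructions so the
	 * image size cannot change once the real callee address is
	 * known in the extra pass.
	 */
	for (i = ctx->idx - ctx_idx; i < 5; i++)
		PPC_NOP();
}

The BPF_JMP | BPF_CALL case in bpf_jit_build_body() could then pass
insn->src_reg == BPF_PSEUDO_CALL as pad, leaving ordinary helper
calls unpadded.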

>  #ifdef PPC64_ELF_ABI_v1
> -	/* func points to the function descriptor */
> -	PPC_LI64(b2p[TMP_REG_2], func);
> -	/* Load actual entry point from function descriptor */
> -	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
> -	/* ... and move it to LR */
> -	PPC_MTLR(b2p[TMP_REG_1]);
>  	/*
>  	 * Load TOC from function descriptor at offset 8.
>  	 * We can clobber r2 since we get called through a
>  	 * function pointer (so caller will save/restore r2)
>  	 * and since we don't use a TOC ourself.
>  	 */
> -	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
> -#else
> -	/* We can clobber r12 */
> -	PPC_FUNC_ADDR(12, func);
> -	PPC_MTLR(12);
> +	PPC_BPF_LL(2, 12, 8);
> +	/* Load actual entry point from function descriptor */
> +	PPC_BPF_LL(12, 12, 0);
>  #endif
> +
> +	PPC_MTLR(12);
>  	PPC_BLRL();
>  }
>  
>
Sandipan Das May 24, 2018, 8:25 a.m. UTC | #2
On 05/24/2018 01:04 PM, Daniel Borkmann wrote:
> On 05/24/2018 08:56 AM, Sandipan Das wrote:
>> For multi-function programs, loading the address of a callee
>> function into a register requires emitting one to five
>> instructions, depending on the nature of the address.
>>
>> Since the callee's address only becomes known just before the
>> extra pass, the number of instructions required to load it may
>> differ from what was generated earlier. This can make the JITed
>> image grow or shrink.
>>
>> To avoid this, we should always generate a five-instruction
>> sequence when loading function addresses, by padding the
>> optimized load sequence with NOPs.
>>
>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
>>  1 file changed, 23 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
>> index 1bdb1aff0619..e4582744a31d 100644
>> --- a/arch/powerpc/net/bpf_jit_comp64.c
>> +++ b/arch/powerpc/net/bpf_jit_comp64.c
>> @@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
>>  
>>  static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
>>  {
>> +	unsigned int i, ctx_idx = ctx->idx;
>> +
>> +	/* Load function address into r12 */
>> +	PPC_LI64(12, func);
>> +
>> +	/* For bpf-to-bpf function calls, the callee's address is unknown
>> +	 * until the last extra pass. As seen above, we use PPC_LI64() to
>> +	 * load the callee's address, but this may optimize the number of
>> +	 * instructions required based on the nature of the address.
>> +	 *
>> +	 * Since we don't want the number of instructions emitted to change,
>> +	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
>> +	 * we always have a five-instruction sequence, which is the maximum
>> +	 * that PPC_LI64() can emit.
>> +	 */
>> +	for (i = ctx->idx - ctx_idx; i < 5; i++)
>> +		PPC_NOP();
> 
> By the way, I think you can still optimize this. The nops are not really
> needed in case of insn->src_reg != BPF_PSEUDO_CALL since the address of
> a normal BPF helper call will always be at a fixed location and known a
> priori.
> 

Ah, true. Thanks for pointing this out. There are a few other things that
we are planning to do for the ppc64 JIT compiler. Will put out a patch for
this with that series.

- Sandipan

>>  #ifdef PPC64_ELF_ABI_v1
>> -	/* func points to the function descriptor */
>> -	PPC_LI64(b2p[TMP_REG_2], func);
>> -	/* Load actual entry point from function descriptor */
>> -	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
>> -	/* ... and move it to LR */
>> -	PPC_MTLR(b2p[TMP_REG_1]);
>>  	/*
>>  	 * Load TOC from function descriptor at offset 8.
>>  	 * We can clobber r2 since we get called through a
>>  	 * function pointer (so caller will save/restore r2)
>>  	 * and since we don't use a TOC ourself.
>>  	 */
>> -	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
>> -#else
>> -	/* We can clobber r12 */
>> -	PPC_FUNC_ADDR(12, func);
>> -	PPC_MTLR(12);
>> +	PPC_BPF_LL(2, 12, 8);
>> +	/* Load actual entry point from function descriptor */
>> +	PPC_BPF_LL(12, 12, 0);
>>  #endif
>> +
>> +	PPC_MTLR(12);
>>  	PPC_BLRL();
>>  }
>>  
>>
>
Daniel Borkmann May 24, 2018, 9:25 a.m. UTC | #3
On 05/24/2018 10:25 AM, Sandipan Das wrote:
> On 05/24/2018 01:04 PM, Daniel Borkmann wrote:
>> On 05/24/2018 08:56 AM, Sandipan Das wrote:
>>> For multi-function programs, loading the address of a callee
>>> function into a register requires emitting one to five
>>> instructions, depending on the nature of the address.
>>>
>>> Since the callee's address only becomes known just before the
>>> extra pass, the number of instructions required to load it may
>>> differ from what was generated earlier. This can make the JITed
>>> image grow or shrink.
>>>
>>> To avoid this, we should always generate a five-instruction
>>> sequence when loading function addresses, by padding the
>>> optimized load sequence with NOPs.
>>>
>>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>>> ---
>>>  arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
>>>  1 file changed, 23 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
>>> index 1bdb1aff0619..e4582744a31d 100644
>>> --- a/arch/powerpc/net/bpf_jit_comp64.c
>>> +++ b/arch/powerpc/net/bpf_jit_comp64.c
>>> @@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
>>>  
>>>  static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
>>>  {
>>> +	unsigned int i, ctx_idx = ctx->idx;
>>> +
>>> +	/* Load function address into r12 */
>>> +	PPC_LI64(12, func);
>>> +
>>> +	/* For bpf-to-bpf function calls, the callee's address is unknown
>>> +	 * until the last extra pass. As seen above, we use PPC_LI64() to
>>> +	 * load the callee's address, but this may optimize the number of
>>> +	 * instructions required based on the nature of the address.
>>> +	 *
>>> +	 * Since we don't want the number of instructions emitted to change,
>>> +	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
>>> +	 * we always have a five-instruction sequence, which is the maximum
>>> +	 * that PPC_LI64() can emit.
>>> +	 */
>>> +	for (i = ctx->idx - ctx_idx; i < 5; i++)
>>> +		PPC_NOP();
>>
>> By the way, I think you can still optimize this. The nops are not really
>> needed in case of insn->src_reg != BPF_PSEUDO_CALL since the address of
>> a normal BPF helper call will always be at a fixed location and known a
>> priori.
> 
> Ah, true. Thanks for pointing this out. There are a few other things that
> we are planning to do for the ppc64 JIT compiler. Will put out a patch for
> this with that series.

Awesome, thanks Sandipan!

Patch

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1bdb1aff0619..e4582744a31d 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -167,25 +167,37 @@  static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 
 static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
 {
+	unsigned int i, ctx_idx = ctx->idx;
+
+	/* Load function address into r12 */
+	PPC_LI64(12, func);
+
+	/* For bpf-to-bpf function calls, the callee's address is unknown
+	 * until the last extra pass. As seen above, we use PPC_LI64() to
+	 * load the callee's address, but this may optimize the number of
+	 * instructions required based on the nature of the address.
+	 *
+	 * Since we don't want the number of instructions emitted to change,
+	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
+	 * we always have a five-instruction sequence, which is the maximum
+	 * that PPC_LI64() can emit.
+	 */
+	for (i = ctx->idx - ctx_idx; i < 5; i++)
+		PPC_NOP();
+
 #ifdef PPC64_ELF_ABI_v1
-	/* func points to the function descriptor */
-	PPC_LI64(b2p[TMP_REG_2], func);
-	/* Load actual entry point from function descriptor */
-	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
-	/* ... and move it to LR */
-	PPC_MTLR(b2p[TMP_REG_1]);
 	/*
 	 * Load TOC from function descriptor at offset 8.
 	 * We can clobber r2 since we get called through a
 	 * function pointer (so caller will save/restore r2)
 	 * and since we don't use a TOC ourself.
 	 */
-	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
-#else
-	/* We can clobber r12 */
-	PPC_FUNC_ADDR(12, func);
-	PPC_MTLR(12);
+	PPC_BPF_LL(2, 12, 8);
+	/* Load actual entry point from function descriptor */
+	PPC_BPF_LL(12, 12, 0);
 #endif
+
+	PPC_MTLR(12);
 	PPC_BLRL();
 }
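
As background for the PPC64_ELF_ABI_v1 branch above: under the ppc64
ELF ABI v1, a function symbol points at a three-doubleword function
descriptor rather than at the first instruction, which is why the JIT
loads the TOC pointer from offset 8 and the actual entry point from
offset 0 of the address in r12. A minimal sketch of that layout (the
struct and field names are illustrative, not taken from the kernel):

/* ppc64 ELF ABI v1 function descriptor: a function "address" points
 * here rather than at code. Names are illustrative.
 */
struct ppc64_elfv1_func_desc {
	unsigned long entry;	/* code address, loaded from offset 0 */
	unsigned long toc;	/* callee's TOC pointer, at offset 8   */
	unsigned long env;	/* environment pointer, unused here    */
};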