[RFC,bpf,1/2] bpf: allow 64-bit offsets for bpf function calls

Message ID 20180208120306.2568-1-sandipan@linux.vnet.ibm.com
State Not Applicable
Headers show
Series
  • [RFC,bpf,1/2] bpf: allow 64-bit offsets for bpf function calls
Related show

Commit Message

Sandipan Das Feb. 8, 2018, 12:03 p.m.
The imm field of a bpf_insn is a signed 32-bit integer. For
JIT-ed bpf-to-bpf function calls, it stores the offset from
__bpf_call_base to the start of the callee function.

For some architectures, such as powerpc64, it was found that
this offset may be as large as 64 bits because of which this
cannot be accomodated in the imm field without truncation.

To resolve this, we additionally use the aux data within each
bpf_prog associated with the caller functions to store the
addresses of their respective callees.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

Comments

Alexei Starovoitov Feb. 8, 2018, 5:08 p.m. | #1
On 2/8/18 4:03 AM, Sandipan Das wrote:
> The imm field of a bpf_insn is a signed 32-bit integer. For
> JIT-ed bpf-to-bpf function calls, it stores the offset from
> __bpf_call_base to the start of the callee function.
>
> For some architectures, such as powerpc64, it was found that
> this offset may be as large as 64 bits because of which this
> cannot be accomodated in the imm field without truncation.
>
> To resolve this, we additionally use the aux data within each
> bpf_prog associated with the caller functions to store the
> addresses of their respective callees.
>
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
> ---
>  kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 38 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 5fb69a85d967..52088b4ca02f 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -5282,6 +5282,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>  	 * run last pass of JIT
>  	 */
>  	for (i = 0; i <= env->subprog_cnt; i++) {
> +		u32 flen = func[i]->len, callee_cnt = 0;
> +		struct bpf_prog **callee;
> +
> +		/* for now assume that the maximum number of bpf function
> +		 * calls that can be made by a caller must be at most the
> +		 * number of bpf instructions in that function
> +		 */
> +		callee = kzalloc(sizeof(func[i]) * flen, GFP_KERNEL);
> +		if (!callee) {
> +			err = -ENOMEM;
> +			goto out_free;
> +		}
> +
>  		insn = func[i]->insnsi;
>  		for (j = 0; j < func[i]->len; j++, insn++) {
>  			if (insn->code != (BPF_JMP | BPF_CALL) ||
> @@ -5292,6 +5305,26 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>  			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
>  				func[subprog]->bpf_func -
>  				__bpf_call_base;
> +
> +			/* the offset to the callee from __bpf_call_base
> +			 * may be larger than what the 32 bit integer imm
> +			 * can accomodate which will truncate the higher
> +			 * order bits
> +			 *
> +			 * to avoid this, we additionally utilize the aux
> +			 * data of each caller function for storing the
> +			 * addresses of every callee associated with it
> +			 */
> +			callee[callee_cnt++] = func[subprog];

can you share typical /proc/kallsyms ?
Are you saying that kernel and kernel modules are allocated from
address spaces that are always more than 32-bit apart?
That would mean that all kernel calls into modules are far calls
and the other way around form .ko into kernel?
Performance is probably suffering because every call needs to be built
with full 64-bit offset. No ?
Naveen N. Rao Feb. 8, 2018, 5:59 p.m. | #2
Alexei Starovoitov wrote:
> On 2/8/18 4:03 AM, Sandipan Das wrote:
>> The imm field of a bpf_insn is a signed 32-bit integer. For
>> JIT-ed bpf-to-bpf function calls, it stores the offset from
>> __bpf_call_base to the start of the callee function.
>>
>> For some architectures, such as powerpc64, it was found that
>> this offset may be as large as 64 bits because of which this
>> cannot be accomodated in the imm field without truncation.
>>
>> To resolve this, we additionally use the aux data within each
>> bpf_prog associated with the caller functions to store the
>> addresses of their respective callees.
>>
>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>> ---
>>  kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 38 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 5fb69a85d967..52088b4ca02f 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -5282,6 +5282,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>>  	 * run last pass of JIT
>>  	 */
>>  	for (i = 0; i <= env->subprog_cnt; i++) {
>> +		u32 flen = func[i]->len, callee_cnt = 0;
>> +		struct bpf_prog **callee;
>> +
>> +		/* for now assume that the maximum number of bpf function
>> +		 * calls that can be made by a caller must be at most the
>> +		 * number of bpf instructions in that function
>> +		 */
>> +		callee = kzalloc(sizeof(func[i]) * flen, GFP_KERNEL);
>> +		if (!callee) {
>> +			err = -ENOMEM;
>> +			goto out_free;
>> +		}
>> +
>>  		insn = func[i]->insnsi;
>>  		for (j = 0; j < func[i]->len; j++, insn++) {
>>  			if (insn->code != (BPF_JMP | BPF_CALL) ||
>> @@ -5292,6 +5305,26 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>>  			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
>>  				func[subprog]->bpf_func -
>>  				__bpf_call_base;
>> +
>> +			/* the offset to the callee from __bpf_call_base
>> +			 * may be larger than what the 32 bit integer imm
>> +			 * can accomodate which will truncate the higher
>> +			 * order bits
>> +			 *
>> +			 * to avoid this, we additionally utilize the aux
>> +			 * data of each caller function for storing the
>> +			 * addresses of every callee associated with it
>> +			 */
>> +			callee[callee_cnt++] = func[subprog];
> 
> can you share typical /proc/kallsyms ?
> Are you saying that kernel and kernel modules are allocated from
> address spaces that are always more than 32-bit apart?

Yes. On ppc64, kernel text is linearly mapped from 0xc000000000000000, 
while vmalloc'ed area starts from 0xd000000000000000 (for radix, this is
different, but still beyond a 32-bit offset).

> That would mean that all kernel calls into modules are far calls
> and the other way around form .ko into kernel?
> Performance is probably suffering because every call needs to be built
> with full 64-bit offset. No ?

Possibly, and I think Michael can give a better perspective, but I think
this is due to our ABI. For inter-module calls, we need to setup the TOC
pointer (or the address of the function being called with ABIv2), which 
would require us to load a full address regardless.

- Naveen
Naveen N. Rao Feb. 9, 2018, 4:54 p.m. | #3
Naveen N. Rao wrote:
> Alexei Starovoitov wrote:
>> On 2/8/18 4:03 AM, Sandipan Das wrote:
>>> The imm field of a bpf_insn is a signed 32-bit integer. For
>>> JIT-ed bpf-to-bpf function calls, it stores the offset from
>>> __bpf_call_base to the start of the callee function.
>>>
>>> For some architectures, such as powerpc64, it was found that
>>> this offset may be as large as 64 bits because of which this
>>> cannot be accomodated in the imm field without truncation.
>>>
>>> To resolve this, we additionally use the aux data within each
>>> bpf_prog associated with the caller functions to store the
>>> addresses of their respective callees.
>>>
>>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>>> ---
>>>  kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 38 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index 5fb69a85d967..52088b4ca02f 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -5282,6 +5282,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>>>  	 * run last pass of JIT
>>>  	 */
>>>  	for (i = 0; i <= env->subprog_cnt; i++) {
>>> +		u32 flen = func[i]->len, callee_cnt = 0;
>>> +		struct bpf_prog **callee;
>>> +
>>> +		/* for now assume that the maximum number of bpf function
>>> +		 * calls that can be made by a caller must be at most the
>>> +		 * number of bpf instructions in that function
>>> +		 */
>>> +		callee = kzalloc(sizeof(func[i]) * flen, GFP_KERNEL);
>>> +		if (!callee) {
>>> +			err = -ENOMEM;
>>> +			goto out_free;
>>> +		}
>>> +
>>>  		insn = func[i]->insnsi;
>>>  		for (j = 0; j < func[i]->len; j++, insn++) {
>>>  			if (insn->code != (BPF_JMP | BPF_CALL) ||
>>> @@ -5292,6 +5305,26 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>>>  			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
>>>  				func[subprog]->bpf_func -
>>>  				__bpf_call_base;
>>> +
>>> +			/* the offset to the callee from __bpf_call_base
>>> +			 * may be larger than what the 32 bit integer imm
>>> +			 * can accomodate which will truncate the higher
>>> +			 * order bits
>>> +			 *
>>> +			 * to avoid this, we additionally utilize the aux
>>> +			 * data of each caller function for storing the
>>> +			 * addresses of every callee associated with it
>>> +			 */
>>> +			callee[callee_cnt++] = func[subprog];
>> 
>> can you share typical /proc/kallsyms ?
>> Are you saying that kernel and kernel modules are allocated from
>> address spaces that are always more than 32-bit apart?
> 
> Yes. On ppc64, kernel text is linearly mapped from 0xc000000000000000, 
> while vmalloc'ed area starts from 0xd000000000000000 (for radix, this is
> different, but still beyond a 32-bit offset).
> 
>> That would mean that all kernel calls into modules are far calls
>> and the other way around form .ko into kernel?
>> Performance is probably suffering because every call needs to be built
>> with full 64-bit offset. No ?
> 
> Possibly, and I think Michael can give a better perspective, but I think
> this is due to our ABI. For inter-module calls, we need to setup the TOC
> pointer (or the address of the function being called with ABIv2), which 
> would require us to load a full address regardless.

Thinking more about this, as an optimization, for bpf-to-bpf calls, we 
could detect a near call and just emit a relative branch since we don't 
care about TOC with BPF. But, this will depend on whether the different 
BPF functions are close enough (within 32MB) of one another.

We can attempt that once the generic changes are finalized on.

Thanks,
Naveen
Alexei Starovoitov Feb. 10, 2018, 12:38 a.m. | #4
On 2/9/18 8:54 AM, Naveen N. Rao wrote:
> Naveen N. Rao wrote:
>> Alexei Starovoitov wrote:
>>> On 2/8/18 4:03 AM, Sandipan Das wrote:
>>>> The imm field of a bpf_insn is a signed 32-bit integer. For
>>>> JIT-ed bpf-to-bpf function calls, it stores the offset from
>>>> __bpf_call_base to the start of the callee function.
>>>>
>>>> For some architectures, such as powerpc64, it was found that
>>>> this offset may be as large as 64 bits because of which this
>>>> cannot be accomodated in the imm field without truncation.
>>>>
>>>> To resolve this, we additionally use the aux data within each
>>>> bpf_prog associated with the caller functions to store the
>>>> addresses of their respective callees.
>>>>
>>>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>>>> ---
>>>>  kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++++++++++++-
>>>>  1 file changed, 38 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>>> index 5fb69a85d967..52088b4ca02f 100644
>>>> --- a/kernel/bpf/verifier.c
>>>> +++ b/kernel/bpf/verifier.c
>>>> @@ -5282,6 +5282,19 @@ static int jit_subprogs(struct
>>>> bpf_verifier_env *env)
>>>>       * run last pass of JIT
>>>>       */
>>>>      for (i = 0; i <= env->subprog_cnt; i++) {
>>>> +        u32 flen = func[i]->len, callee_cnt = 0;
>>>> +        struct bpf_prog **callee;
>>>> +
>>>> +        /* for now assume that the maximum number of bpf function
>>>> +         * calls that can be made by a caller must be at most the
>>>> +         * number of bpf instructions in that function
>>>> +         */
>>>> +        callee = kzalloc(sizeof(func[i]) * flen, GFP_KERNEL);
>>>> +        if (!callee) {
>>>> +            err = -ENOMEM;
>>>> +            goto out_free;
>>>> +        }
>>>> +
>>>>          insn = func[i]->insnsi;
>>>>          for (j = 0; j < func[i]->len; j++, insn++) {
>>>>              if (insn->code != (BPF_JMP | BPF_CALL) ||
>>>> @@ -5292,6 +5305,26 @@ static int jit_subprogs(struct
>>>> bpf_verifier_env *env)
>>>>              insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
>>>>                  func[subprog]->bpf_func -
>>>>                  __bpf_call_base;
>>>> +
>>>> +            /* the offset to the callee from __bpf_call_base
>>>> +             * may be larger than what the 32 bit integer imm
>>>> +             * can accomodate which will truncate the higher
>>>> +             * order bits
>>>> +             *
>>>> +             * to avoid this, we additionally utilize the aux
>>>> +             * data of each caller function for storing the
>>>> +             * addresses of every callee associated with it
>>>> +             */
>>>> +            callee[callee_cnt++] = func[subprog];
>>>
>>> can you share typical /proc/kallsyms ?
>>> Are you saying that kernel and kernel modules are allocated from
>>> address spaces that are always more than 32-bit apart?
>>
>> Yes. On ppc64, kernel text is linearly mapped from 0xc000000000000000,
>> while vmalloc'ed area starts from 0xd000000000000000 (for radix, this is
>> different, but still beyond a 32-bit offset).
>>
>>> That would mean that all kernel calls into modules are far calls
>>> and the other way around form .ko into kernel?
>>> Performance is probably suffering because every call needs to be built
>>> with full 64-bit offset. No ?
>>
>> Possibly, and I think Michael can give a better perspective, but I think
>> this is due to our ABI. For inter-module calls, we need to setup the TOC
>> pointer (or the address of the function being called with ABIv2),
>> which would require us to load a full address regardless.
>
> Thinking more about this, as an optimization, for bpf-to-bpf calls, we
> could detect a near call and just emit a relative branch since we don't
> care about TOC with BPF. But, this will depend on whether the different
> BPF functions are close enough (within 32MB) of one another.

so that will be just an optimization. Understood.
How about instead of doing callee = kzalloc(sizeof(func[i]) * flen..
we keep  insn->off pointing to subprog and move
prog->aux->func = func;
before the last JIT pass.
Then you won't need to alloc this extra array.
Sandipan Das Feb. 10, 2018, 4:36 p.m. | #5
On 02/10/2018 06:08 AM, Alexei Starovoitov wrote:
> On 2/9/18 8:54 AM, Naveen N. Rao wrote:
>> Naveen N. Rao wrote:
>>> Alexei Starovoitov wrote:
>>>> On 2/8/18 4:03 AM, Sandipan Das wrote:
>>>>> The imm field of a bpf_insn is a signed 32-bit integer. For
>>>>> JIT-ed bpf-to-bpf function calls, it stores the offset from
>>>>> __bpf_call_base to the start of the callee function.
>>>>>
>>>>> For some architectures, such as powerpc64, it was found that
>>>>> this offset may be as large as 64 bits because of which this
>>>>> cannot be accomodated in the imm field without truncation.
>>>>>
>>>>> To resolve this, we additionally use the aux data within each
>>>>> bpf_prog associated with the caller functions to store the
>>>>> addresses of their respective callees.
>>>>>
>>>>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>>>>> ---
>>>>>  kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++++++++++++-
>>>>>  1 file changed, 38 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>>>> index 5fb69a85d967..52088b4ca02f 100644
>>>>> --- a/kernel/bpf/verifier.c
>>>>> +++ b/kernel/bpf/verifier.c
>>>>> @@ -5282,6 +5282,19 @@ static int jit_subprogs(struct
>>>>> bpf_verifier_env *env)
>>>>>       * run last pass of JIT
>>>>>       */
>>>>>      for (i = 0; i <= env->subprog_cnt; i++) {
>>>>> +        u32 flen = func[i]->len, callee_cnt = 0;
>>>>> +        struct bpf_prog **callee;
>>>>> +
>>>>> +        /* for now assume that the maximum number of bpf function
>>>>> +         * calls that can be made by a caller must be at most the
>>>>> +         * number of bpf instructions in that function
>>>>> +         */
>>>>> +        callee = kzalloc(sizeof(func[i]) * flen, GFP_KERNEL);
>>>>> +        if (!callee) {
>>>>> +            err = -ENOMEM;
>>>>> +            goto out_free;
>>>>> +        }
>>>>> +
>>>>>          insn = func[i]->insnsi;
>>>>>          for (j = 0; j < func[i]->len; j++, insn++) {
>>>>>              if (insn->code != (BPF_JMP | BPF_CALL) ||
>>>>> @@ -5292,6 +5305,26 @@ static int jit_subprogs(struct
>>>>> bpf_verifier_env *env)
>>>>>              insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
>>>>>                  func[subprog]->bpf_func -
>>>>>                  __bpf_call_base;
>>>>> +
>>>>> +            /* the offset to the callee from __bpf_call_base
>>>>> +             * may be larger than what the 32 bit integer imm
>>>>> +             * can accomodate which will truncate the higher
>>>>> +             * order bits
>>>>> +             *
>>>>> +             * to avoid this, we additionally utilize the aux
>>>>> +             * data of each caller function for storing the
>>>>> +             * addresses of every callee associated with it
>>>>> +             */
>>>>> +            callee[callee_cnt++] = func[subprog];
>>>>
>>>> can you share typical /proc/kallsyms ?
>>>> Are you saying that kernel and kernel modules are allocated from
>>>> address spaces that are always more than 32-bit apart?
>>>
>>> Yes. On ppc64, kernel text is linearly mapped from 0xc000000000000000,
>>> while vmalloc'ed area starts from 0xd000000000000000 (for radix, this is
>>> different, but still beyond a 32-bit offset).
>>>
>>>> That would mean that all kernel calls into modules are far calls
>>>> and the other way around form .ko into kernel?
>>>> Performance is probably suffering because every call needs to be built
>>>> with full 64-bit offset. No ?
>>>
>>> Possibly, and I think Michael can give a better perspective, but I think
>>> this is due to our ABI. For inter-module calls, we need to setup the TOC
>>> pointer (or the address of the function being called with ABIv2),
>>> which would require us to load a full address regardless.
>>
>> Thinking more about this, as an optimization, for bpf-to-bpf calls, we
>> could detect a near call and just emit a relative branch since we don't
>> care about TOC with BPF. But, this will depend on whether the different
>> BPF functions are close enough (within 32MB) of one another.
> 
> so that will be just an optimization. Understood.
> How about instead of doing callee = kzalloc(sizeof(func[i]) * flen..
> we keep  insn->off pointing to subprog and move
> prog->aux->func = func;
> before the last JIT pass.
> Then you won't need to alloc this extra array.
> 

Agreed. Will make the necessary changes in v2.

--
With Regards,
Sandipan

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5fb69a85d967..52088b4ca02f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5282,6 +5282,19 @@  static int jit_subprogs(struct bpf_verifier_env *env)
 	 * run last pass of JIT
 	 */
 	for (i = 0; i <= env->subprog_cnt; i++) {
+		u32 flen = func[i]->len, callee_cnt = 0;
+		struct bpf_prog **callee;
+
+		/* for now assume that the maximum number of bpf function
+		 * calls that can be made by a caller must be at most the
+		 * number of bpf instructions in that function
+		 */
+		callee = kzalloc(sizeof(func[i]) * flen, GFP_KERNEL);
+		if (!callee) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
 			if (insn->code != (BPF_JMP | BPF_CALL) ||
@@ -5292,6 +5305,26 @@  static int jit_subprogs(struct bpf_verifier_env *env)
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
+
+			/* the offset to the callee from __bpf_call_base
+			 * may be larger than what the 32 bit integer imm
+			 * can accomodate which will truncate the higher
+			 * order bits
+			 *
+			 * to avoid this, we additionally utilize the aux
+			 * data of each caller function for storing the
+			 * addresses of every callee associated with it
+			 */
+			callee[callee_cnt++] = func[subprog];
+		}
+
+		/* free up callee list if no function calls were made */
+		if (!callee_cnt) {
+			kfree(callee);
+			callee = NULL;
+		} else {
+			func[i]->aux->func = callee;
+			func[i]->aux->func_cnt = callee_cnt;
 		}
 	}
 	for (i = 0; i <= env->subprog_cnt; i++) {
@@ -5338,8 +5371,12 @@  static int jit_subprogs(struct bpf_verifier_env *env)
 	return 0;
 out_free:
 	for (i = 0; i <= env->subprog_cnt; i++)
-		if (func[i])
+		if (func[i]) {
+			/* cleanup callee list */
+			if (func[i]->aux->func)
+				kfree(func[i]->aux->func);
 			bpf_jit_free(func[i]);
+		}
 	kfree(func);
 	/* cleanup main prog to be interpreted */
 	prog->jit_requested = 0;