[v3,bpf-next,02/18] bpf: Add bpf_arch_text_poke() helper

Message ID: 20191108064039.2041889-3-ast@kernel.org
State: Changes Requested
Delegated to: BPF Maintainers
Series: Introduce BPF trampoline

Commit Message

Alexei Starovoitov Nov. 8, 2019, 6:40 a.m. UTC
Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
nops/calls in kernel text into calls into BPF trampoline and to patch
calls/nops inside BPF programs too.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 51 +++++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |  8 ++++++
 kernel/bpf/core.c           |  6 +++++
 3 files changed, 65 insertions(+)
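
For illustration, a caller (e.g. the trampoline code later in this series)
would turn a 5-byte nop into a call roughly like this; 'ip' and 'tr_addr'
are made-up names for the patch site and the trampoline entry:

	/* old_addr is NULL because the site currently holds a nop, not a
	 * call; -EINVAL means ip is not in kernel text, -EBUSY means the
	 * bytes at ip were not the expected 5-byte atomic nop. */
	err = bpf_arch_text_poke(ip, BPF_MOD_NOP_TO_CALL, NULL, tr_addr);
	if (err)
		return err;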

Comments

Song Liu Nov. 8, 2019, 6:56 a.m. UTC | #1
> On Nov 7, 2019, at 10:40 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 
> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
> nops/calls in kernel text into calls into BPF trampoline and to patch
> calls/nops inside BPF programs too.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>


> ---
> arch/x86/net/bpf_jit_comp.c | 51 +++++++++++++++++++++++++++++++++++++
> include/linux/bpf.h         |  8 ++++++
> kernel/bpf/core.c           |  6 +++++
> 3 files changed, 65 insertions(+)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 0399b1f83c23..bb8467fd6715 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -9,9 +9,11 @@
> #include <linux/filter.h>
> #include <linux/if_vlan.h>
> #include <linux/bpf.h>
> +#include <linux/memory.h>
> #include <asm/extable.h>
> #include <asm/set_memory.h>
> #include <asm/nospec-branch.h>
> +#include <asm/text-patching.h>
> 
> static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
> {
> @@ -487,6 +489,55 @@ static int emit_call(u8 **pprog, void *func, void *ip)
> 	return 0;
> }
> 
> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> +		       void *old_addr, void *new_addr)
> +{
> +	u8 old_insn[X86_CALL_SIZE] = {};
> +	u8 new_insn[X86_CALL_SIZE] = {};
> +	u8 *prog;
> +	int ret;
> +
> +	if (!is_kernel_text((long)ip))
> +		/* BPF trampoline in modules is not supported */
> +		return -EINVAL;
> +
> +	if (old_addr) {
> +		prog = old_insn;
> +		ret = emit_call(&prog, old_addr, (void *)ip);
> +		if (ret)
> +			return ret;
> +	}
> +	if (new_addr) {
> +		prog = new_insn;
> +		ret = emit_call(&prog, new_addr, (void *)ip);
> +		if (ret)
> +			return ret;
> +	}
> +	ret = -EBUSY;
> +	mutex_lock(&text_mutex);
> +	switch (t) {
> +	case BPF_MOD_NOP_TO_CALL:
> +		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
> +			goto out;
> +		text_poke(ip, new_insn, X86_CALL_SIZE);
> +		break;
> +	case BPF_MOD_CALL_TO_CALL:
> +		if (memcmp(ip, old_insn, X86_CALL_SIZE))
> +			goto out;
> +		text_poke(ip, new_insn, X86_CALL_SIZE);
> +		break;
> +	case BPF_MOD_CALL_TO_NOP:
> +		if (memcmp(ip, old_insn, X86_CALL_SIZE))
> +			goto out;
> +		text_poke(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE);
> +		break;
> +	}
> +	ret = 0;
> +out:
> +	mutex_unlock(&text_mutex);
> +	return ret;
> +}
> +
> static bool ex_handler_bpf(const struct exception_table_entry *x,
> 			   struct pt_regs *regs, int trapnr,
> 			   unsigned long error_code, unsigned long fault_addr)
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 7c7f518811a6..8b90db25348a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1157,4 +1157,12 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
> }
> #endif /* CONFIG_INET */
> 
> +enum bpf_text_poke_type {
> +	BPF_MOD_NOP_TO_CALL,
> +	BPF_MOD_CALL_TO_CALL,
> +	BPF_MOD_CALL_TO_NOP,
> +};
> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> +		       void *addr1, void *addr2);
> +
> #endif /* _LINUX_BPF_H */
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index c1fde0303280..c4bcec1014a9 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2140,6 +2140,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
> 	return -EFAULT;
> }
> 
> +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> +			      void *addr1, void *addr2)
> +{
> +	return -ENOTSUPP;
> +}
> +
> DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> EXPORT_SYMBOL(bpf_stats_enabled_key);
> 
> -- 
> 2.23.0
>
Björn Töpel Nov. 8, 2019, 8:23 a.m. UTC | #2
On Fri, 8 Nov 2019 at 07:41, Alexei Starovoitov <ast@kernel.org> wrote:
>
> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
> nops/calls in kernel text into calls into BPF trampoline and to patch
> calls/nops inside BPF programs too.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  arch/x86/net/bpf_jit_comp.c | 51 +++++++++++++++++++++++++++++++++++++
>  include/linux/bpf.h         |  8 ++++++
>  kernel/bpf/core.c           |  6 +++++
>  3 files changed, 65 insertions(+)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 0399b1f83c23..bb8467fd6715 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -9,9 +9,11 @@
>  #include <linux/filter.h>
>  #include <linux/if_vlan.h>
>  #include <linux/bpf.h>
> +#include <linux/memory.h>
>  #include <asm/extable.h>
>  #include <asm/set_memory.h>
>  #include <asm/nospec-branch.h>
> +#include <asm/text-patching.h>
>
>  static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
>  {
> @@ -487,6 +489,55 @@ static int emit_call(u8 **pprog, void *func, void *ip)
>         return 0;
>  }
>
> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> +                      void *old_addr, void *new_addr)
> +{
> +       u8 old_insn[X86_CALL_SIZE] = {};
> +       u8 new_insn[X86_CALL_SIZE] = {};
> +       u8 *prog;
> +       int ret;
> +
> +       if (!is_kernel_text((long)ip))
> +               /* BPF trampoline in modules is not supported */
> +               return -EINVAL;
> +
> +       if (old_addr) {
> +               prog = old_insn;
> +               ret = emit_call(&prog, old_addr, (void *)ip);
> +               if (ret)
> +                       return ret;
> +       }
> +       if (new_addr) {
> +               prog = new_insn;
> +               ret = emit_call(&prog, new_addr, (void *)ip);
> +               if (ret)
> +                       return ret;
> +       }
> +       ret = -EBUSY;
> +       mutex_lock(&text_mutex);
> +       switch (t) {
> +       case BPF_MOD_NOP_TO_CALL:
> +               if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
> +                       goto out;
> +               text_poke(ip, new_insn, X86_CALL_SIZE);

I'm probably missing something, but why isn't text_poke_bp() needed here?

> +               break;
> +       case BPF_MOD_CALL_TO_CALL:
> +               if (memcmp(ip, old_insn, X86_CALL_SIZE))
> +                       goto out;
> +               text_poke(ip, new_insn, X86_CALL_SIZE);
> +               break;
> +       case BPF_MOD_CALL_TO_NOP:
> +               if (memcmp(ip, old_insn, X86_CALL_SIZE))
> +                       goto out;
> +               text_poke(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE);
> +               break;
> +       }
> +       ret = 0;
> +out:
> +       mutex_unlock(&text_mutex);
> +       return ret;
> +}
> +
>  static bool ex_handler_bpf(const struct exception_table_entry *x,
>                            struct pt_regs *regs, int trapnr,
>                            unsigned long error_code, unsigned long fault_addr)
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 7c7f518811a6..8b90db25348a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1157,4 +1157,12 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
>  }
>  #endif /* CONFIG_INET */
>
> +enum bpf_text_poke_type {
> +       BPF_MOD_NOP_TO_CALL,
> +       BPF_MOD_CALL_TO_CALL,
> +       BPF_MOD_CALL_TO_NOP,
> +};
> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> +                      void *addr1, void *addr2);
> +
>  #endif /* _LINUX_BPF_H */
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index c1fde0303280..c4bcec1014a9 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2140,6 +2140,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
>         return -EFAULT;
>  }
>
> +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> +                             void *addr1, void *addr2)
> +{
> +       return -ENOTSUPP;
> +}
> +
>  DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>  EXPORT_SYMBOL(bpf_stats_enabled_key);
>
> --
> 2.23.0
>
Peter Zijlstra Nov. 8, 2019, 9:11 a.m. UTC | #3
On Thu, Nov 07, 2019 at 10:40:23PM -0800, Alexei Starovoitov wrote:
> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
> nops/calls in kernel text into calls into BPF trampoline and to patch
> calls/nops inside BPF programs too.

This thing assumes the text is unused, right? That isn't spelled out
anywhere. The implementation is very much unsafe vs concurrent execution
of the text.
Peter Zijlstra Nov. 8, 2019, 9:36 a.m. UTC | #4
On Fri, Nov 08, 2019 at 10:11:56AM +0100, Peter Zijlstra wrote:
> On Thu, Nov 07, 2019 at 10:40:23PM -0800, Alexei Starovoitov wrote:
> > Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
> > nops/calls in kernel text into calls into BPF trampoline and to patch
> > calls/nops inside BPF programs too.
> 
> This thing assumes the text is unused, right? That isn't spelled out
> anywhere. The implementation is very much unsafe vs concurrent execution
> of the text.

Also, what NOP/CALL instructions will you be hijacking? If you're
planning on using the fentry nops, then what ensures this and ftrace
don't trample on one another? Similar for kprobes.

In general, what ensures every instruction only has a single modifier?

I'm very uncomfortable letting random bpf proglets poke around in the
kernel text.
Alexei Starovoitov Nov. 8, 2019, 1:41 p.m. UTC | #5
On 11/8/19 1:36 AM, Peter Zijlstra wrote:
> On Fri, Nov 08, 2019 at 10:11:56AM +0100, Peter Zijlstra wrote:
>> On Thu, Nov 07, 2019 at 10:40:23PM -0800, Alexei Starovoitov wrote:
>>> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
>>> nops/calls in kernel text into calls into BPF trampoline and to patch
>>> calls/nops inside BPF programs too.
>>
>> This thing assumes the text is unused, right? That isn't spelled out
>> anywhere. The implementation is very much unsafe vs concurrent execution
>> of the text.
> 
> Also, what NOP/CALL instructions will you be hijacking? If you're
> planning on using the fentry nops, then what ensures this and ftrace
> don't trample on one another? Similar for kprobes.
> 
> In general, what ensures every instruction only has a single modifier?

Looks like you didn't bother reading the cover letter and missed a month
of discussions between me and Steven regarding exactly this topic
though you were directly cc-ed in all threads :(
tldr for kernel fentry nops it will be converted to use 
register_ftrace_direct() whenever it's available.
For all other nops, calls, jumps that are inside BPF programs BPF infra
will continue modifying them through this helper.
Daniel's upcoming bpf_tail_call() optimization will use text_poke as well.
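
On the kernel side that will look roughly like this (a sketch only;
Steven's series is still in flight, so the exact API is an assumption):

	/* sketch: route the fentry nop of a traced kernel function
	 * directly to a BPF trampoline via ftrace; the function and
	 * trampoline names here are made up for illustration */
	unsigned long ip = (unsigned long)some_traced_func;
	int err = register_ftrace_direct(ip, (unsigned long)bpf_tramp);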

> I'm very uncomfortable letting random bpf proglets poke around in the
> kernel text.

1. There is no such thing as 'proglet'. Please don't invent meaningless 
names.
2. BPF programs have no ability to modify kernel text.
3. BPF infra is taking all necessary measures to make sure that poking
kernel's and BPF-generated text is safe.
If you see a specific issue please say so. We'll be happy to address
all issues. Being 'uncomfortable' is not constructive.
Alexei Starovoitov Nov. 8, 2019, 2:09 p.m. UTC | #6
On 11/8/19 12:23 AM, Björn Töpel wrote:
> On Fri, 8 Nov 2019 at 07:41, Alexei Starovoitov <ast@kernel.org> wrote:
>>
>> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
>> nops/calls in kernel text into calls into BPF trampoline and to patch
>> calls/nops inside BPF programs too.
>>
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> ---
>>   arch/x86/net/bpf_jit_comp.c | 51 +++++++++++++++++++++++++++++++++++++
>>   include/linux/bpf.h         |  8 ++++++
>>   kernel/bpf/core.c           |  6 +++++
>>   3 files changed, 65 insertions(+)
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 0399b1f83c23..bb8467fd6715 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -9,9 +9,11 @@
>>   #include <linux/filter.h>
>>   #include <linux/if_vlan.h>
>>   #include <linux/bpf.h>
>> +#include <linux/memory.h>
>>   #include <asm/extable.h>
>>   #include <asm/set_memory.h>
>>   #include <asm/nospec-branch.h>
>> +#include <asm/text-patching.h>
>>
>>   static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
>>   {
>> @@ -487,6 +489,55 @@ static int emit_call(u8 **pprog, void *func, void *ip)
>>          return 0;
>>   }
>>
>> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
>> +                      void *old_addr, void *new_addr)
>> +{
>> +       u8 old_insn[X86_CALL_SIZE] = {};
>> +       u8 new_insn[X86_CALL_SIZE] = {};
>> +       u8 *prog;
>> +       int ret;
>> +
>> +       if (!is_kernel_text((long)ip))
>> +               /* BPF trampoline in modules is not supported */
>> +               return -EINVAL;
>> +
>> +       if (old_addr) {
>> +               prog = old_insn;
>> +               ret = emit_call(&prog, old_addr, (void *)ip);
>> +               if (ret)
>> +                       return ret;
>> +       }
>> +       if (new_addr) {
>> +               prog = new_insn;
>> +               ret = emit_call(&prog, new_addr, (void *)ip);
>> +               if (ret)
>> +                       return ret;
>> +       }
>> +       ret = -EBUSY;
>> +       mutex_lock(&text_mutex);
>> +       switch (t) {
>> +       case BPF_MOD_NOP_TO_CALL:
>> +               if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
>> +                       goto out;
>> +               text_poke(ip, new_insn, X86_CALL_SIZE);
> 
> I'm probably missing something, but why isn't text_poke_bp() needed here?

I should have documented the intent better.
text_poke_bp() is being changed by Peter to emulate instructions
properly in his ftrace->text_poke conversion set,
so I cannot use it just yet.
To your point that text_poke() is technically incorrect here: yep,
well aware. This is temporary. As I said in the cover letter, this
needs to change to register_ftrace_direct() for kernel text poking to
play nice with ftrace. Thinking about it more... I guess I can use
text_poke_bp(); I just need to set up the handler properly. I may need to do it
for bpf prog poking anyway. I wanted to avoid extra churn that is going
to be removed during the merge window when the trees converge.
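
With the emulating text_poke_bp() from Peter's set (an assumption here,
since that signature is still in flight) the BPF_MOD_NOP_TO_CALL case
would become something like:

	/* sketch: int3-based patching; a CPU that hits the breakpoint
	 * mid-update emulates the instruction being written (NULL means
	 * "emulate the new opcode"), so concurrent execution is safe */
	text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);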

Since we're on this subject:
Peter,
why don't you do an 8-byte atomic rewrite when the start addr of the insn
is properly aligned? The trap dance would be unnecessary.
That would make everything so much simpler.
Alexei Starovoitov Nov. 8, 2019, 7:32 p.m. UTC | #7
On Fri, Nov 8, 2019 at 5:42 AM Alexei Starovoitov <ast@fb.com> wrote:
>
> On 11/8/19 1:36 AM, Peter Zijlstra wrote:
> > On Fri, Nov 08, 2019 at 10:11:56AM +0100, Peter Zijlstra wrote:
> >> On Thu, Nov 07, 2019 at 10:40:23PM -0800, Alexei Starovoitov wrote:
> >>> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
> >>> nops/calls in kernel text into calls into BPF trampoline and to patch
> >>> calls/nops inside BPF programs too.
> >>
> >> This thing assumes the text is unused, right? That isn't spelled out
> >> anywhere. The implementation is very much unsafe vs concurrent execution
> >> of the text.
> >
> > Also, what NOP/CALL instructions will you be hijacking? If you're
> > planning on using the fentry nops, then what ensures this and ftrace
> > don't trample on one another? Similar for kprobes.
> >
> > In general, what ensures every instruction only has a single modifier?
>
> Looks like you didn't bother reading the cover letter and missed a month
> of discussions between me and Steven regarding exactly this topic
> though you were directly cc-ed in all threads :(
> tldr for kernel fentry nops it will be converted to use
> register_ftrace_direct() whenever it's available.
> For all other nops, calls, jumps that are inside BPF programs BPF infra
> will continue modifying them through this helper.
> Daniel's upcoming bpf_tail_call() optimization will use text_poke as well.
>
> > I'm very uncomfortable letting random bpf proglets poke around in the
> > kernel text.
>
> 1. There is no such thing as 'proglet'. Please don't invent meaningless
> names.
> 2. BPF programs have no ability to modify kernel text.
> 3. BPF infra is taking all necessary measures to make sure that poking
> kernel's and BPF-generated text is safe.
> If you see a specific issue please say so. We'll be happy to address
> all issues. Being 'uncomfortable' is not constructive.
>

I was thinking more about this.
Peter,
do you mind if we apply your first patch:
https://lore.kernel.org/lkml/20191007081944.88332264.2@infradead.org/
to both tip and bpf-next trees?
Then I can use text_poke_bp() as-is without any additional ugliness
on my side that would need to be removed in a few weeks.
Do you have it in tip already?
I can cherry-pick from there to make sure it's exactly the same commit log,
then there will be no merge issues during the merge window.
Peter Zijlstra Nov. 8, 2019, 9:36 p.m. UTC | #8
On Fri, Nov 08, 2019 at 11:32:41AM -0800, Alexei Starovoitov wrote:
> On Fri, Nov 8, 2019 at 5:42 AM Alexei Starovoitov <ast@fb.com> wrote:
> >
> > On 11/8/19 1:36 AM, Peter Zijlstra wrote:
> > > On Fri, Nov 08, 2019 at 10:11:56AM +0100, Peter Zijlstra wrote:
> > >> On Thu, Nov 07, 2019 at 10:40:23PM -0800, Alexei Starovoitov wrote:
> > >>> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
> > >>> nops/calls in kernel text into calls into BPF trampoline and to patch
> > >>> calls/nops inside BPF programs too.
> > >>
> > >> This thing assumes the text is unused, right? That isn't spelled out
> > >> anywhere. The implementation is very much unsafe vs concurrent execution
> > >> of the text.
> > >
> > > Also, what NOP/CALL instructions will you be hijacking? If you're
> > > planning on using the fentry nops, then what ensures this and ftrace
> > > don't trample on one another? Similar for kprobes.
> > >
> > > In general, what ensures every instruction only has a single modifier?
> >
> > Looks like you didn't bother reading the cover letter and missed a month

I did indeed not. A Changelog should be self-sufficient and this one is
sorely lacking. The cover letter is not preserved and should therefore
not contain anything of value that is not also covered in the
Changelogs.

> > of discussions between me and Steven regarding exactly this topic
> > though you were directly cc-ed in all threads :(

I read some of it; it is a sad fact that I cannot read all email in my
inbox, esp. not if, like in the last week or so, I'm busy hunting a
regression.

And what I did remember of the emails I saw left me with the questions
that were not answered by the changelog.

> > tldr for kernel fentry nops it will be converted to use
> > register_ftrace_direct() whenever it's available.

So why the rush and not wait for that work to complete? It appears to me
that without due coordination between bpf and ftrace badness could
happen.

> > For all other nops, calls, jumps that are inside BPF programs BPF infra
> > will continue modifying them through this helper.
> > Daniel's upcoming bpf_tail_call() optimization will use text_poke as well.

This is probably off topic, but isn't tail-call optimization something
done at JIT time and therefore not in need of text_poke()?

> > > I'm very uncomfortable letting random bpf proglets poke around in the
> > > kernel text.
> >
> > 1. There is no such thing as 'proglet'. Please don't invent meaningless
> > names.

Again off topic, but I'm not inventing stuff here:

  /prog'let/ [UK] A short extempore program written to meet an immediate, transient need.

which, methinks, succinctly captures the spirit of BPF.

> > 2. BPF programs have no ability to modify kernel text.

OK, that is good.

> > 3. BPF infra is taking all necessary measures to make sure that poking
> > kernel's and BPF-generated text is safe.

From the other subthread and your response below, it seems you are aware
that you in fact need text_poke_bp(). Again, it would've been excellent
Changelog/comment material to call out that the patch as presented is in
fact broken.

> I was thinking more about this.
> Peter,
> do you mind if we apply your first patch:
> https://lore.kernel.org/lkml/20191007081944.88332264.2@infradead.org/
> to both tip and bpf-next trees?

That would indeed be a much better solution. I'll repost much of that on
Monday, and then we'll work on getting at the very least that one patch
in a tip/branch we can share.

> Then I can use text_poke_bp() as-is without any additional ugliness
> on my side that would need to be removed in a few weeks.

This I do _NOT_ understand. Why are you willing to merge a known broken
patch? What is the rush, why can't you wait for all the prerequisites to
land?
David Miller Nov. 8, 2019, 9:39 p.m. UTC | #9
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Nov 2019 22:36:24 +0100

> The cover letter is not preserved and should therefore
 ...

The cover letter is +ALWAYS+ preserved; we put it in the merge
commit.
Alexei Starovoitov Nov. 8, 2019, 11:05 p.m. UTC | #10
On Fri, Nov 08, 2019 at 10:36:24PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 08, 2019 at 11:32:41AM -0800, Alexei Starovoitov wrote:
> > On Fri, Nov 8, 2019 at 5:42 AM Alexei Starovoitov <ast@fb.com> wrote:
> > >
> > > On 11/8/19 1:36 AM, Peter Zijlstra wrote:
> > > > On Fri, Nov 08, 2019 at 10:11:56AM +0100, Peter Zijlstra wrote:
> > > >> On Thu, Nov 07, 2019 at 10:40:23PM -0800, Alexei Starovoitov wrote:
> > > >>> Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
> > > >>> nops/calls in kernel text into calls into BPF trampoline and to patch
> > > >>> calls/nops inside BPF programs too.
> > > >>
> > > >> This thing assumes the text is unused, right? That isn't spelled out
> > > >> anywhere. The implementation is very much unsafe vs concurrent execution
> > > >> of the text.
> > > >
> > > > Also, what NOP/CALL instructions will you be hijacking? If you're
> > > > planning on using the fentry nops, then what ensures this and ftrace
> > > > don't trample on one another? Similar for kprobes.
> > > >
> > > > In general, what ensures every instruction only has a single modifier?
> > >
> > > Looks like you didn't bother reading the cover letter and missed a month
> 
> I did indeed not. A Changelog should be self-sufficient and this one is
> sorely lacking. The cover letter is not preserved and should therefore
> not contain anything of value that is not also covered in the
> Changelogs.
> 
> > > of discussions between me and Steven regarding exactly this topic
> > > though you were directly cc-ed in all threads :(
> 
> I read some of it; it is a sad fact that I cannot read all email in my
> inbox, esp. not if, like in the last week or so, I'm busy hunting a
> regression.
> 
> And what I did remember of the emails I saw left me with the questions
> that were not answered by the changelog.
> 
> > > tldr for kernel fentry nops it will be converted to use
> > > register_ftrace_direct() whenever it's available.
> 
> So why the rush and not wait for that work to complete? It appears to me
> that without due coordination between bpf and ftrace badness could
> happen.
> 
> > > For all other nops, calls, jumps that are inside BPF programs BPF infra
> > > will continue modifying them through this helper.
> > > Daniel's upcoming bpf_tail_call() optimization will use text_poke as well.
> 
> This is probably off topic, but isn't tail-call optimization something
> done at JIT time and therefore not in need of text_poke()?

Not quite. bpf_tail_call() is done via prog_array, which is an indirect jmp, and
it suffers from retpoline. The verifier can see that in a lot of cases the
prog_array is used with a constant index into the array instead of a variable. In
such cases the indirect jmps can be optimized into direct jmps. That is
essentially what Daniel's patches are doing; they build on top of the
bpf_arch_text_poke() and trampoline that I'm introducing in this set.
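
E.g. a constant-index tail call like the one below is what the verifier
can prove and Daniel's patches can then patch into a direct jmp
(the prog array name and index are made up):

	/* sketch, BPF program side: index 2 is a compile-time constant,
	 * so the JITed indirect jmp through 'jmp_table' can be rewritten
	 * as a direct jmp to the target prog */
	bpf_tail_call(ctx, &jmp_table, 2);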

Another set is being prepared by Bjorn that also builds on top of
bpf_arch_text_poke() and the trampoline. It serves the purpose of getting rid of
the indirect call when a driver calls into a BPF program for the first time. We've
looked at your static_call and concluded that it doesn't quite cut it for this use
case.

The third framework is being worked on by Martin, who is using the BPF
trampoline for BPF-based TCP extensions. This bit is not related to indirect
call/jmp optimization, but needs the trampoline.

> > I was thinking more about this.
> > Peter,
> > do you mind if we apply your first patch:
> > https://lore.kernel.org/lkml/20191007081944.88332264.2@infradead.org/
> > to both tip and bpf-next trees?
> 
> That would indeed be a much better solution. I'll repost much of that on
> Monday, and then we'll work on getting at the very least that one patch
> in a tip/branch we can share.

Awesome! I can certainly wait till next week. I just don't want to miss the
merge window for the work that is ready. More below.

> > Then I can use text_poke_bp() as-is without any additional ugliness
> > on my side that would need to be removed in a few weeks.
> 
> This I do _NOT_ understand. Why are you willing to merge a known broken
> patch? What is the rush, why can't you wait for all the prerequisites to
> land?

People have deadlines and here I'm not talking about fb deadlines. If it was
only up to me I could have waited until your and Steven's patches land in
Linus's tree. Then Dave would pick them up after the merge window into net-next
and bpf things would be ready for the next release, which is in 1.5 + 2 + 8
weeks (assuming 1.5 weeks until merge window, 2 weeks merge window, and 8
weeks next release cycle).
But most of the bpf things are ready. I have one more follow-up to do for another
feature. The first 4-5 patches of my set will enable Bjorn, Daniel, and
Martin's work. So I'm mainly looking for a way to converge three trees during
the merge window with no conflicts.

Just saw that Steven posted his set. That is great. If you land your first part
of text_poke_bp() next week into tip it will enable us to cherry-pick the first
few patches from tip and continue with bpf trampoline in net-next. Then during
the merge window tip, Steven's, and net-next land into Linus's tree. Then I'll
send a small follow-up to switch to Steven's register_ftrace_direct() in places
that can use it, and the other bits of bpf will keep using your text_poke_bp()
because it's for the code inside generated bpf progs, various generated
trampolines and such. The conversion of some of the bpf bits to
register_ftrace_direct() can be delayed by a release if really necessary, since
the text_poke_bp() approach will work fine, just not as nicely as a full
integration via ftrace.
imo it's the best path for the 3 trees to converge without delaying things for bpf
folks by a full release. In the end the deadlines are met and a bunch of people
are unblocked and happy. I hope that explains the rush.
Thomas Gleixner Nov. 10, 2019, 10:54 a.m. UTC | #11
On Fri, 8 Nov 2019, Alexei Starovoitov wrote:
> On Fri, Nov 08, 2019 at 10:36:24PM +0100, Peter Zijlstra wrote:
> > This I do _NOT_ understand. Why are you willing to merge a known broken
> > patch? What is the rush, why can't you wait for all the prerequisites to
> > land?
> 
> People have deadlines and here I'm not talking about fb deadlines. If it was
> only up to me I could have waited until your and Steven's patches land in
> Linus's tree. Then Dave would pick them up after the merge window into net-next
> and bpf things would be ready for the next release, which is in 1.5 + 2 + 8
> weeks (assuming 1.5 weeks until merge window, 2 weeks merge window, and 8
> weeks next release cycle).
> But most of the bpf things are ready. I have one more follow-up to do for another
> feature. The first 4-5 patches of my set will enable Bjorn, Daniel, and
> Martin's work. So I'm mainly looking for a way to converge three trees during
> the merge window with no conflicts.

No. Nobody's deadlines justify anything.

You recently gave me a lecture on how to do proper kernel development just
because I did the right thing, i.e. prevented a bad interaction by adding
a Kconfig dependency which does not affect you in any way.

Now for your own interests you try to justify something which is
fundamentally worse: merging known-to-be-broken code.

BPF is not special and has to wait for the next merge window if the
prerequisites are not ready in time, as any other patch set does.

Thanks,

	tglx
Peter Zijlstra Nov. 11, 2019, 8:14 a.m. UTC | #12
On Fri, Nov 08, 2019 at 01:39:24PM -0800, David Miller wrote:
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Fri, 8 Nov 2019 22:36:24 +0100
> 
> > The cover letter is not preserved and should therefore
>  ...
> 
> The cover letter is +ALWAYS+ preserved; we put it in the merge
> commit.

Good to know; is this a netdev special? I've not seen this before.

Is there a convenient git command to find the merge commit for a given
regular commit? Say I've used git-blame to find a troublesome commit,
then how do I find the merge commit for it?

Also, I still think this does not excuse weak individual Changelogs.
Daniel Borkmann Nov. 11, 2019, 10:21 a.m. UTC | #13
On 11/11/19 9:14 AM, Peter Zijlstra wrote:
> On Fri, Nov 08, 2019 at 01:39:24PM -0800, David Miller wrote:
>> From: Peter Zijlstra <peterz@infradead.org>
>> Date: Fri, 8 Nov 2019 22:36:24 +0100
>>
>>> The cover letter is not preserved and should therefore
>>   ...
>>
>> The cover letter is +ALWAYS+ preserved; we put it in the merge
>> commit.
> 
> Good to know; is this a netdev special? I've not seen this before.

I think it might be a netdev special, and given developers are often used to
this practice on netdev, we've adopted the same for both bpf trees as well,
if the cover letter contains a useful high-level summary of the whole set.

We've recently changed our workflow a bit after the last maintainers summit and
now reuse Thomas' mb2q [0] in our small collection of scripts [1], so aside from
other useful features, for every commit under bpf/bpf-next there is now also
a 'Link:' tag pointing to the https://lore.kernel.org/bpf/ archive, thus the cover
letter or discussions can alternatively be found this way.

One downside of the merge commit as cover letter is that it is usually lost (*)
once commits get cherry-picked into stable trees or other downstream,
backport-heavy kernels, so a 'Link:' tag is a convenient way to quickly get more
context or discussions for those cases.
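
For this patch, for example, the tag constructed from the Message ID
above would be:

   Link: https://lore.kernel.org/bpf/20191108064039.2041889-3-ast@kernel.org/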

> Is there a convenient git command to find the merge commit for a given
> regular commit? Say I've used git-blame to find a troublesome commit,
> then how do I find the merge commit for it?

Once you have the sha, you could for example retrieve the corresponding
merge commit from the upstream tree (as top commit) via:

   $ git log <sha>..master --ancestry-path --merges --reverse

> Also, I still think this does not excuse weak individual Changelogs.

Ideally commit messages should be as self-contained as possible to have all
the necessary context w/o having to look up other resources, also given issue
(*) above.

Thanks,
Daniel

   [0] https://git.kernel.org/pub/scm/linux/kernel/git/tglx/quilttools.git/
   [1] https://git.kernel.org/pub/scm/linux/kernel/git/dborkman/pw.git/
Jonathan Corbet Nov. 11, 2019, 4:10 p.m. UTC | #14
On Mon, 11 Nov 2019 09:14:03 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Nov 08, 2019 at 01:39:24PM -0800, David Miller wrote:
> > From: Peter Zijlstra <peterz@infradead.org>
> > Date: Fri, 8 Nov 2019 22:36:24 +0100
> >   
> > > The cover letter is not preserved and should therefore
> >  ...
> > 
> > The cover letter is +ALWAYS+ preserved; we put it in the merge
> > commit.
> 
> Good to know; is this a netdev special? I've not seen this before.

I read a *lot* of merge commit changelogs; it's not just netdev that does
this.  I try to do it with significant documentation sets as well.  I
agree that changelogs should contain all relevant information, but there
is value in an overview as well.  But then, I make my living in the
overview business...:)

jon

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0399b1f83c23..bb8467fd6715 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -9,9 +9,11 @@ 
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
+#include <linux/memory.h>
 #include <asm/extable.h>
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
+#include <asm/text-patching.h>
 
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -487,6 +489,55 @@  static int emit_call(u8 **pprog, void *func, void *ip)
 	return 0;
 }
 
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+		       void *old_addr, void *new_addr)
+{
+	u8 old_insn[X86_CALL_SIZE] = {};
+	u8 new_insn[X86_CALL_SIZE] = {};
+	u8 *prog;
+	int ret;
+
+	if (!is_kernel_text((long)ip))
+		/* BPF trampoline in modules is not supported */
+		return -EINVAL;
+
+	if (old_addr) {
+		prog = old_insn;
+		ret = emit_call(&prog, old_addr, (void *)ip);
+		if (ret)
+			return ret;
+	}
+	if (new_addr) {
+		prog = new_insn;
+		ret = emit_call(&prog, new_addr, (void *)ip);
+		if (ret)
+			return ret;
+	}
+	ret = -EBUSY;
+	mutex_lock(&text_mutex);
+	switch (t) {
+	case BPF_MOD_NOP_TO_CALL:
+		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
+			goto out;
+		text_poke(ip, new_insn, X86_CALL_SIZE);
+		break;
+	case BPF_MOD_CALL_TO_CALL:
+		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+			goto out;
+		text_poke(ip, new_insn, X86_CALL_SIZE);
+		break;
+	case BPF_MOD_CALL_TO_NOP:
+		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+			goto out;
+		text_poke(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE);
+		break;
+	}
+	ret = 0;
+out:
+	mutex_unlock(&text_mutex);
+	return ret;
+}
+
 static bool ex_handler_bpf(const struct exception_table_entry *x,
 			   struct pt_regs *regs, int trapnr,
 			   unsigned long error_code, unsigned long fault_addr)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7c7f518811a6..8b90db25348a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1157,4 +1157,12 @@  static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 }
 #endif /* CONFIG_INET */
 
+enum bpf_text_poke_type {
+	BPF_MOD_NOP_TO_CALL,
+	BPF_MOD_CALL_TO_CALL,
+	BPF_MOD_CALL_TO_NOP,
+};
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+		       void *addr1, void *addr2);
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1fde0303280..c4bcec1014a9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2140,6 +2140,12 @@  int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 	return -EFAULT;
 }
 
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+			      void *addr1, void *addr2)
+{
+	return -ENOTSUPP;
+}
+
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);