
[bpf,v2] bpf: fix nested bpf tracepoints with per-cpu data

Message ID 20190611215304.28831-1-mmullins@fb.com
State Accepted
Delegated to: BPF Maintainers

Commit Message

Matt Mullins June 11, 2019, 9:53 p.m. UTC
BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
they do not increment bpf_prog_active while executing.

This enables three levels of nesting, to support
  - a kprobe or raw tp or perf event,
  - another one of the above that irq context happens to call, and
  - another one in nmi context
(at most one of which may be a kprobe or perf event).

Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
Signed-off-by: Matt Mullins <mmullins@fb.com>
---
v1->v2:
  * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output
  * instantiate err more readably

I've done additional testing with the original workload that hit the
irq+raw-tp reentrancy problem, and as far as I can tell, it's still
solved with this solution (as opposed to my earlier per-map-element
version).
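
For completeness, the kind of program that exercises this path is simply any
raw tracepoint program that calls bpf_perf_event_output(); below is a minimal,
hypothetical libbpf-style sketch (map name, section name, and tracepoint are
made up for illustration, not taken from the workload above):

/* Illustrative only: any raw_tp program using this helper ends up in
 * ____bpf_perf_event_output() and therefore in the per-CPU scratch area
 * that this patch makes nesting-safe.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") events = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(__u32),
	.max_entries	= 128,	/* sized >= nr_cpus by the loader in practice */
};

SEC("raw_tracepoint/sched_switch")
int rtp_emit(struct bpf_raw_tracepoint_args *ctx)
{
	__u64 cookie = 42;

	/* resolves to bpf_perf_event_output_raw_tp for this program type */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &cookie, sizeof(cookie));
	return 0;
}

char _license[] SEC("license") = "GPL";

Nesting happens when such a program fires while a kprobe or perf-event program
on the same CPU is already inside the helper (e.g. from irq or nmi context).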

 kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++-------
 1 file changed, 84 insertions(+), 16 deletions(-)

Comments

Andrii Nakryiko June 12, 2019, 5 a.m. UTC | #1
On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote:
>
> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> they do not increment bpf_prog_active while executing.
>
> This enables three levels of nesting, to support
>   - a kprobe or raw tp or perf event,
>   - another one of the above that irq context happens to call, and
>   - another one in nmi context
> (at most one of which may be a kprobe or perf event).
>
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> Signed-off-by: Matt Mullins <mmullins@fb.com>
> ---

LGTM, minor nit below.

Acked-by: Andrii Nakryiko <andriin@fb.com>

> v1->v2:
>   * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output
>   * instantiate err more readably
>
> I've done additional testing with the original workload that hit the
> irq+raw-tp reentrancy problem, and as far as I can tell, it's still
> solved with this solution (as opposed to my earlier per-map-element
> version).
>
>  kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++-------
>  1 file changed, 84 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f92d6ad5e080..1c9a4745e596 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
>         .arg4_type      = ARG_CONST_SIZE,
>  };
>
> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
> -
>  static __always_inline u64
>  __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>                         u64 flags, struct perf_sample_data *sd)
> @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>         return perf_event_output(event, sd, regs);
>  }
>
> +/*
> + * Support executing tracepoints in normal, irq, and nmi context that each call
> + * bpf_perf_event_output
> + */
> +struct bpf_trace_sample_data {
> +       struct perf_sample_data sds[3];
> +};
> +
> +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
> +static DEFINE_PER_CPU(int, bpf_trace_nest_level);
>  BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
>            u64, flags, void *, data, u64, size)
>  {
> -       struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
> +       struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
> +       int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
>         struct perf_raw_record raw = {
>                 .frag = {
>                         .size = size,
>                         .data = data,
>                 },
>         };
> +       struct perf_sample_data *sd;
> +       int err;
>
> -       if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
> -               return -EINVAL;
> +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
> +               err = -EBUSY;
> +               goto out;
> +       }
> +
> +       sd = &sds->sds[nest_level - 1];
> +
> +       if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
> +               err = -EINVAL;
> +               goto out;
> +       }

Feel free to ignore, but just stylistically, given this check doesn't
depend on sd, I'd move it either to the very top or right after
`nest_level > ARRAY_SIZE(sds->sds)` check, so that all the error
checking is grouped without interspersed assignment.
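
I.e., just sketching the reordering (untested, on top of your patch):

	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
		err = -EBUSY;
		goto out;
	}

	if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
		err = -EINVAL;
		goto out;
	}

	sd = &sds->sds[nest_level - 1];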

>
>         perf_sample_data_init(sd, 0, 0);
>         sd->raw = &raw;
>
> -       return __bpf_perf_event_output(regs, map, flags, sd);
> +       err = __bpf_perf_event_output(regs, map, flags, sd);
> +
> +out:
> +       this_cpu_dec(bpf_trace_nest_level);
> +       return err;
>  }
>
>  static const struct bpf_func_proto bpf_perf_event_output_proto = {
> @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  /*
>   * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
>   * to avoid potential recursive reuse issue when/if tracepoints are added
> - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
> + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
> + *
> + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
> + * in normal, irq, and nmi context.
>   */
> -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
> +struct bpf_raw_tp_regs {
> +       struct pt_regs regs[3];
> +};
> +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
> +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
> +static struct pt_regs *get_bpf_raw_tp_regs(void)
> +{
> +       struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
> +
> +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
> +               this_cpu_dec(bpf_raw_tp_nest_level);
> +               return ERR_PTR(-EBUSY);
> +       }
> +
> +       return &tp_regs->regs[nest_level - 1];
> +}
> +
> +static void put_bpf_raw_tp_regs(void)
> +{
> +       this_cpu_dec(bpf_raw_tp_nest_level);
> +}
> +
>  BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
>            struct bpf_map *, map, u64, flags, void *, data, u64, size)
>  {
> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> +       int ret;
> +
> +       if (IS_ERR(regs))
> +               return PTR_ERR(regs);
>
>         perf_fetch_caller_regs(regs);
> -       return ____bpf_perf_event_output(regs, map, flags, data, size);
> +       ret = ____bpf_perf_event_output(regs, map, flags, data, size);
> +
> +       put_bpf_raw_tp_regs();
> +       return ret;
>  }
>
>  static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
> @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
>  BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
>            struct bpf_map *, map, u64, flags)
>  {
> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> +       int ret;
> +
> +       if (IS_ERR(regs))
> +               return PTR_ERR(regs);
>
>         perf_fetch_caller_regs(regs);
>         /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
> -       return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
> -                              flags, 0, 0);
> +       ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
> +                             flags, 0, 0);
> +       put_bpf_raw_tp_regs();
> +       return ret;
>  }
>
>  static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
> @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
>  BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
>            void *, buf, u32, size, u64, flags)
>  {
> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> +       int ret;
> +
> +       if (IS_ERR(regs))
> +               return PTR_ERR(regs);
>
>         perf_fetch_caller_regs(regs);
> -       return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
> -                            (unsigned long) size, flags, 0);
> +       ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
> +                           (unsigned long) size, flags, 0);
> +       put_bpf_raw_tp_regs();
> +       return ret;
>  }
>
>  static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
> --
> 2.17.1
>
Daniel Borkmann June 13, 2019, 10:47 p.m. UTC | #2
On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote:
>>
>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
>> they do not increment bpf_prog_active while executing.
>>
>> This enables three levels of nesting, to support
>>   - a kprobe or raw tp or perf event,
>>   - another one of the above that irq context happens to call, and
>>   - another one in nmi context
>> (at most one of which may be a kprobe or perf event).
>>
>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")

Generally, looks good to me. Two things below:

Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
instead of the one you currently have?

One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().

Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
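
(For reference, the guard that prevents that nesting lives in trace_call_bpf();
roughly, from memory:

	if (in_nmi())	/* not supported yet */
		return 1;

	preempt_disable();
	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/* another BPF program is already running on this CPU: bail */
		ret = 0;
		goto out;
	}
	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
 out:
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

so the non-raw flavors only ever run one at a time per CPU.)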

1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
performance overhead (and desire to not miss events as a result of nesting)? (This
also means we're not protected by bpf_prog_active in all the map ops, of course.)
2) Wouldn't this also mean that we only need to fix the raw tp programs via
get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
the rest which relies upon trace_call_bpf()? I'm probably missing something, but
given they have separate pt_regs there, how could they be affected then?

Thanks,
Daniel

>> Signed-off-by: Matt Mullins <mmullins@fb.com>
>> ---
> 
> LGTM, minor nit below.
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
>> v1->v2:
>>   * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output
>>   * instantiate err more readably
>>
>> I've done additional testing with the original workload that hit the
>> irq+raw-tp reentrancy problem, and as far as I can tell, it's still
>> solved with this solution (as opposed to my earlier per-map-element
>> version).
>>
>>  kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++-------
>>  1 file changed, 84 insertions(+), 16 deletions(-)
>>
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index f92d6ad5e080..1c9a4745e596 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
>>         .arg4_type      = ARG_CONST_SIZE,
>>  };
>>
>> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
>> -
>>  static __always_inline u64
>>  __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>>                         u64 flags, struct perf_sample_data *sd)
>> @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>>         return perf_event_output(event, sd, regs);
>>  }
>>
>> +/*
>> + * Support executing tracepoints in normal, irq, and nmi context that each call
>> + * bpf_perf_event_output
>> + */
>> +struct bpf_trace_sample_data {
>> +       struct perf_sample_data sds[3];
>> +};
>> +
>> +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
>> +static DEFINE_PER_CPU(int, bpf_trace_nest_level);
>>  BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
>>            u64, flags, void *, data, u64, size)
>>  {
>> -       struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
>> +       struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
>> +       int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
>>         struct perf_raw_record raw = {
>>                 .frag = {
>>                         .size = size,
>>                         .data = data,
>>                 },
>>         };
>> +       struct perf_sample_data *sd;
>> +       int err;
>>
>> -       if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
>> -               return -EINVAL;
>> +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
>> +               err = -EBUSY;
>> +               goto out;
>> +       }
>> +
>> +       sd = &sds->sds[nest_level - 1];
>> +
>> +       if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
>> +               err = -EINVAL;
>> +               goto out;
>> +       }
> 
> Feel free to ignore, but just stylistically, given this check doesn't
> depend on sd, I'd move it either to the very top or right after
> `nest_level > ARRAY_SIZE(sds->sds)` check, so that all the error
> checking is grouped without interspersed assignment.

Makes sense.

>>         perf_sample_data_init(sd, 0, 0);
>>         sd->raw = &raw;
>>
>> -       return __bpf_perf_event_output(regs, map, flags, sd);
>> +       err = __bpf_perf_event_output(regs, map, flags, sd);
>> +
>> +out:
>> +       this_cpu_dec(bpf_trace_nest_level);
>> +       return err;
>>  }
>>
>>  static const struct bpf_func_proto bpf_perf_event_output_proto = {
>> @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>>  /*
>>   * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
>>   * to avoid potential recursive reuse issue when/if tracepoints are added
>> - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
>> + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
>> + *
>> + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
>> + * in normal, irq, and nmi context.
>>   */
>> -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
>> +struct bpf_raw_tp_regs {
>> +       struct pt_regs regs[3];
>> +};
>> +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
>> +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
>> +static struct pt_regs *get_bpf_raw_tp_regs(void)
>> +{
>> +       struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
>> +       int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
>> +
>> +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
>> +               this_cpu_dec(bpf_raw_tp_nest_level);
>> +               return ERR_PTR(-EBUSY);
>> +       }
>> +
>> +       return &tp_regs->regs[nest_level - 1];
>> +}
>> +
>> +static void put_bpf_raw_tp_regs(void)
>> +{
>> +       this_cpu_dec(bpf_raw_tp_nest_level);
>> +}
>> +
>>  BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
>>            struct bpf_map *, map, u64, flags, void *, data, u64, size)
>>  {
>> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
>> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
>> +       int ret;
>> +
>> +       if (IS_ERR(regs))
>> +               return PTR_ERR(regs);
>>
>>         perf_fetch_caller_regs(regs);
>> -       return ____bpf_perf_event_output(regs, map, flags, data, size);
>> +       ret = ____bpf_perf_event_output(regs, map, flags, data, size);
>> +
>> +       put_bpf_raw_tp_regs();
>> +       return ret;
>>  }
>>
>>  static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
>> @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
>>  BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
>>            struct bpf_map *, map, u64, flags)
>>  {
>> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
>> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
>> +       int ret;
>> +
>> +       if (IS_ERR(regs))
>> +               return PTR_ERR(regs);
>>
>>         perf_fetch_caller_regs(regs);
>>         /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
>> -       return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
>> -                              flags, 0, 0);
>> +       ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
>> +                             flags, 0, 0);
>> +       put_bpf_raw_tp_regs();
>> +       return ret;
>>  }
>>
>>  static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
>> @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
>>  BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
>>            void *, buf, u32, size, u64, flags)
>>  {
>> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
>> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
>> +       int ret;
>> +
>> +       if (IS_ERR(regs))
>> +               return PTR_ERR(regs);
>>
>>         perf_fetch_caller_regs(regs);
>> -       return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
>> -                            (unsigned long) size, flags, 0);
>> +       ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
>> +                           (unsigned long) size, flags, 0);
>> +       put_bpf_raw_tp_regs();
>> +       return ret;
>>  }
>>
>>  static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
>> --
>> 2.17.1
>>
Matt Mullins June 14, 2019, 12:51 a.m. UTC | #3
On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote:
> > > 
> > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > > they do not increment bpf_prog_active while executing.
> > > 
> > > This enables three levels of nesting, to support
> > >   - a kprobe or raw tp or perf event,
> > >   - another one of the above that irq context happens to call, and
> > >   - another one in nmi context
> > > (at most one of which may be a kprobe or perf event).
> > > 
> > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> 
> Generally, looks good to me. Two things below:
> 
> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
> instead of the one you currently have?

Ah, yeah, that's probably more reasonable; I haven't managed to come up
with a scenario where one could hit this without raw tracepoints.  I'll
fix up the nits that've accumulated since v2.

> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
> 
> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
> 
> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
> performance overhead (and desire to not miss events as a result of nesting)? (This
> also means we're not protected by bpf_prog_active in all the map ops, of course.)
> 2) Wouldn't this also mean that we only need to fix the raw tp programs via
> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
> the rest which relies upon trace_call_bpf()? I'm probably missing something, but
> given they have separate pt_regs there, how could they be affected then?

For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
the _raw_tp variants.  However, consider the following nesting:

                                    trace_nest_level raw_tp_nest_level
  (kprobe) bpf_perf_event_output            1               0
  (raw_tp) bpf_perf_event_output_raw_tp     2               1
  (raw_tp) bpf_get_stackid_raw_tp           2               2

I need to increment a nest level (and ideally increment it only once)
between the kprobe and the first raw_tp, because they would otherwise
share the struct perf_sample_data.  But I also need to increment a nest
level between the two raw_tps, since they share the pt_regs -- I can't
use trace_nest_level for everything because it's not used by
get_stackid, and I can't use raw_tp_nest_level for everything because
it's not incremented by kprobes.

If raw tracepoints were to bump bpf_prog_active, then I could get away
with just using that count in these callsites -- I'm reluctant to do
that, though, since it would prevent kprobes from ever running inside a
raw_tp.  I'd like to retain the ability to (e.g.)
  trace.py -K htab_map_update_elem
and get some stack traces from at least within raw tracepoints.

That said, as I wrote up this example, bpf_trace_nest_level seems to be
wildly misnamed; I should name those after the structure they're
protecting...

> Thanks,
> Daniel
> 
> > > Signed-off-by: Matt Mullins <mmullins@fb.com>
> > > ---
> > 
> > LGTM, minor nit below.
> > 
> > Acked-by: Andrii Nakryiko <andriin@fb.com>
> > 
> > > v1->v2:
> > >   * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output
> > >   * instantiate err more readably
> > > 
> > > I've done additional testing with the original workload that hit the
> > > irq+raw-tp reentrancy problem, and as far as I can tell, it's still
> > > solved with this solution (as opposed to my earlier per-map-element
> > > version).
> > > 
> > >  kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++-------
> > >  1 file changed, 84 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > > index f92d6ad5e080..1c9a4745e596 100644
> > > --- a/kernel/trace/bpf_trace.c
> > > +++ b/kernel/trace/bpf_trace.c
> > > @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
> > >         .arg4_type      = ARG_CONST_SIZE,
> > >  };
> > > 
> > > -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
> > > -
> > >  static __always_inline u64
> > >  __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
> > >                         u64 flags, struct perf_sample_data *sd)
> > > @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
> > >         return perf_event_output(event, sd, regs);
> > >  }
> > > 
> > > +/*
> > > + * Support executing tracepoints in normal, irq, and nmi context that each call
> > > + * bpf_perf_event_output
> > > + */
> > > +struct bpf_trace_sample_data {
> > > +       struct perf_sample_data sds[3];
> > > +};
> > > +
> > > +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
> > > +static DEFINE_PER_CPU(int, bpf_trace_nest_level);
> > >  BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
> > >            u64, flags, void *, data, u64, size)
> > >  {
> > > -       struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
> > > +       struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
> > > +       int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
> > >         struct perf_raw_record raw = {
> > >                 .frag = {
> > >                         .size = size,
> > >                         .data = data,
> > >                 },
> > >         };
> > > +       struct perf_sample_data *sd;
> > > +       int err;
> > > 
> > > -       if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
> > > -               return -EINVAL;
> > > +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
> > > +               err = -EBUSY;
> > > +               goto out;
> > > +       }
> > > +
> > > +       sd = &sds->sds[nest_level - 1];
> > > +
> > > +       if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
> > > +               err = -EINVAL;
> > > +               goto out;
> > > +       }
> > 
> > Feel free to ignore, but just stylistically, given this check doesn't
> > depend on sd, I'd move it either to the very top or right after
> > `nest_level > ARRAY_SIZE(sds->sds)` check, so that all the error
> > checking is grouped without interspersed assignment.
> 
> Makes sense.
> 
> > >         perf_sample_data_init(sd, 0, 0);
> > >         sd->raw = &raw;
> > > 
> > > -       return __bpf_perf_event_output(regs, map, flags, sd);
> > > +       err = __bpf_perf_event_output(regs, map, flags, sd);
> > > +
> > > +out:
> > > +       this_cpu_dec(bpf_trace_nest_level);
> > > +       return err;
> > >  }
> > > 
> > >  static const struct bpf_func_proto bpf_perf_event_output_proto = {
> > > @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> > >  /*
> > >   * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
> > >   * to avoid potential recursive reuse issue when/if tracepoints are added
> > > - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
> > > + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
> > > + *
> > > + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
> > > + * in normal, irq, and nmi context.
> > >   */
> > > -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
> > > +struct bpf_raw_tp_regs {
> > > +       struct pt_regs regs[3];
> > > +};
> > > +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
> > > +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
> > > +static struct pt_regs *get_bpf_raw_tp_regs(void)
> > > +{
> > > +       struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
> > > +       int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
> > > +
> > > +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
> > > +               this_cpu_dec(bpf_raw_tp_nest_level);
> > > +               return ERR_PTR(-EBUSY);
> > > +       }
> > > +
> > > +       return &tp_regs->regs[nest_level - 1];
> > > +}
> > > +
> > > +static void put_bpf_raw_tp_regs(void)
> > > +{
> > > +       this_cpu_dec(bpf_raw_tp_nest_level);
> > > +}
> > > +
> > >  BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
> > >            struct bpf_map *, map, u64, flags, void *, data, u64, size)
> > >  {
> > > -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> > > +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> > > +       int ret;
> > > +
> > > +       if (IS_ERR(regs))
> > > +               return PTR_ERR(regs);
> > > 
> > >         perf_fetch_caller_regs(regs);
> > > -       return ____bpf_perf_event_output(regs, map, flags, data, size);
> > > +       ret = ____bpf_perf_event_output(regs, map, flags, data, size);
> > > +
> > > +       put_bpf_raw_tp_regs();
> > > +       return ret;
> > >  }
> > > 
> > >  static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
> > > @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
> > >  BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
> > >            struct bpf_map *, map, u64, flags)
> > >  {
> > > -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> > > +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> > > +       int ret;
> > > +
> > > +       if (IS_ERR(regs))
> > > +               return PTR_ERR(regs);
> > > 
> > >         perf_fetch_caller_regs(regs);
> > >         /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
> > > -       return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
> > > -                              flags, 0, 0);
> > > +       ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
> > > +                             flags, 0, 0);
> > > +       put_bpf_raw_tp_regs();
> > > +       return ret;
> > >  }
> > > 
> > >  static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
> > > @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
> > >  BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
> > >            void *, buf, u32, size, u64, flags)
> > >  {
> > > -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> > > +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> > > +       int ret;
> > > +
> > > +       if (IS_ERR(regs))
> > > +               return PTR_ERR(regs);
> > > 
> > >         perf_fetch_caller_regs(regs);
> > > -       return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
> > > -                            (unsigned long) size, flags, 0);
> > > +       ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
> > > +                           (unsigned long) size, flags, 0);
> > > +       put_bpf_raw_tp_regs();
> > > +       return ret;
> > >  }
> > > 
> > >  static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
> > > --
> > > 2.17.1
> > > 
> 
>
Alexei Starovoitov June 14, 2019, 12:55 a.m. UTC | #4
On Thu, Jun 13, 2019 at 5:52 PM Matt Mullins <mmullins@fb.com> wrote:
>
> On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
> > On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> > > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote:
> > > >
> > > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > > > they do not increment bpf_prog_active while executing.
> > > >
> > > > This enables three levels of nesting, to support
> > > >   - a kprobe or raw tp or perf event,
> > > >   - another one of the above that irq context happens to call, and
> > > >   - another one in nmi context
> > > > (at most one of which may be a kprobe or perf event).
> > > >
> > > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> >
> > Generally, looks good to me. Two things below:
> >
> > Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
> > instead of the one you currently have?
>
> Ah, yeah, that's probably more reasonable; I haven't managed to come up
> with a scenario where one could hit this without raw tracepoints.  I'll
> fix up the nits that've accumulated since v2.
>
> > One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
> >
> > Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
> > counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
> > the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
> >
> > 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
> > the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
> > performance overhead (and desire to not miss events as a result of nesting)? (This
> > also means we're not protected by bpf_prog_active in all the map ops, of course.)
> > 2) Wouldn't this also mean that we only need to fix the raw tp programs via
> > get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
> > the rest which relies upon trace_call_bpf()? I'm probably missing something, but
> > given they have separate pt_regs there, how could they be affected then?
>
> For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
> the _raw_tp variants.  However, consider the following nesting:
>
>                                     trace_nest_level raw_tp_nest_level
>   (kprobe) bpf_perf_event_output            1               0
>   (raw_tp) bpf_perf_event_output_raw_tp     2               1
>   (raw_tp) bpf_get_stackid_raw_tp           2               2
>
> I need to increment a nest level (and ideally increment it only once)
> between the kprobe and the first raw_tp, because they would otherwise
> share the struct perf_sample_data.  But I also need to increment a nest
> level between the two raw_tps, since they share the pt_regs -- I can't
> use trace_nest_level for everything because it's not used by
> get_stackid, and I can't use raw_tp_nest_level for everything because
> it's not incremented by kprobes.
>
> If raw tracepoints were to bump bpf_prog_active, then I could get away
> with just using that count in these callsites -- I'm reluctant to do
> that, though, since it would prevent kprobes from ever running inside a
> raw_tp.  I'd like to retain the ability to (e.g.)
>   trace.py -K htab_map_update_elem
> and get some stack traces from at least within raw tracepoints.
>
> That said, as I wrote up this example, bpf_trace_nest_level seems to be
> wildly misnamed; I should name those after the structure they're
> protecting...

I still don't get what's wrong with the previous approach.
Didn't I manage to convince both of you that perf_sample_data
inside bpf_perf_event_array doesn't have any issue that Daniel brought up?
I think this refcnting approach is inferior.
Daniel Borkmann June 14, 2019, 2:50 p.m. UTC | #5
On 06/14/2019 02:51 AM, Matt Mullins wrote:
> On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
>> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
>>> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote:
>>>>
>>>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
>>>> they do not increment bpf_prog_active while executing.
>>>>
>>>> This enables three levels of nesting, to support
>>>>   - a kprobe or raw tp or perf event,
>>>>   - another one of the above that irq context happens to call, and
>>>>   - another one in nmi context
>>>> (at most one of which may be a kprobe or perf event).
>>>>
>>>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
>>
>> Generally, looks good to me. Two things below:
>>
>> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
>> instead of the one you currently have?
> 
> Ah, yeah, that's probably more reasonable; I haven't managed to come up
> with a scenario where one could hit this without raw tracepoints.  I'll
> fix up the nits that've accumulated since v2.
> 
>> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
>>
>> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
>> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
>> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
>>
>> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
>> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
>> performance overhead (and desire to not miss events as a result of nesting)? (This
>> also means we're not protected by bpf_prog_active in all the map ops, of course.)
>> 2) Wouldn't this also mean that we only need to fix the raw tp programs via
>> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
>> the rest which relies upon trace_call_bpf()? I'm probably missing something, but
>> given they have separate pt_regs there, how could they be affected then?
> 
> For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
> the _raw_tp variants.  However, consider the following nesting:
> 
>                                     trace_nest_level raw_tp_nest_level
>   (kprobe) bpf_perf_event_output            1               0
>   (raw_tp) bpf_perf_event_output_raw_tp     2               1
>   (raw_tp) bpf_get_stackid_raw_tp           2               2
> 
> I need to increment a nest level (and ideally increment it only once)
> between the kprobe and the first raw_tp, because they would otherwise
> share the struct perf_sample_data.  But I also need to increment a nest

I'm not sure I follow on this one: the former would still keep using the
bpf_trace_sd as-is today since only ever /one/ can be active on a given CPU
as we otherwise bail out in trace_call_bpf() due to bpf_prog_active counter.
Given these two are /not/ shared, you only need the code you have below for
nesting to deal with the raw_tps via get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs()
which should also simplify the code quite a bit.

> level between the two raw_tps, since they share the pt_regs -- I can't
> use trace_nest_level for everything because it's not used by
> get_stackid, and I can't use raw_tp_nest_level for everything because
> it's not incremented by kprobes.

(See above wrt kprobes.)

> If raw tracepoints were to bump bpf_prog_active, then I could get away
> with just using that count in these callsites -- I'm reluctant to do
> that, though, since it would prevent kprobes from ever running inside a
> raw_tp.  I'd like to retain the ability to (e.g.)
>   trace.py -K htab_map_update_elem
> and get some stack traces from at least within raw tracepoints.
> 
> That said, as I wrote up this example, bpf_trace_nest_level seems to be
> wildly misnamed; I should name those after the structure they're
> protecting...
Daniel Borkmann June 14, 2019, 3:16 p.m. UTC | #6
On 06/14/2019 02:55 AM, Alexei Starovoitov wrote:
> On Thu, Jun 13, 2019 at 5:52 PM Matt Mullins <mmullins@fb.com> wrote:
>> On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
>>> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
>>>> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote:
>>>>>
>>>>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
>>>>> they do not increment bpf_prog_active while executing.
>>>>>
>>>>> This enables three levels of nesting, to support
>>>>>   - a kprobe or raw tp or perf event,
>>>>>   - another one of the above that irq context happens to call, and
>>>>>   - another one in nmi context
>>>>> (at most one of which may be a kprobe or perf event).
>>>>>
>>>>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
>>>
>>> Generally, looks good to me. Two things below:
>>>
>>> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
>>> instead of the one you currently have?
>>
>> Ah, yeah, that's probably more reasonable; I haven't managed to come up
>> with a scenario where one could hit this without raw tracepoints.  I'll
>> fix up the nits that've accumulated since v2.
>>
>>> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
>>>
>>> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
>>> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
>>> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
>>>
>>> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
>>> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
>>> performance overhead (and desire to not miss events as a result of nesting)? (This
>>> also means we're not protected by bpf_prog_active in all the map ops, of course.)
>>> 2) Wouldn't this also mean that we only need to fix the raw tp programs via
>>> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
>>> the rest which relies upon trace_call_bpf()? I'm probably missing something, but
>>> given they have separate pt_regs there, how could they be affected then?
>>
>> For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
>> the _raw_tp variants.  However, consider the following nesting:
>>
>>                                     trace_nest_level raw_tp_nest_level
>>   (kprobe) bpf_perf_event_output            1               0
>>   (raw_tp) bpf_perf_event_output_raw_tp     2               1
>>   (raw_tp) bpf_get_stackid_raw_tp           2               2
>>
>> I need to increment a nest level (and ideally increment it only once)
>> between the kprobe and the first raw_tp, because they would otherwise
>> share the struct perf_sample_data.  But I also need to increment a nest
>> level between the two raw_tps, since they share the pt_regs -- I can't
>> use trace_nest_level for everything because it's not used by
>> get_stackid, and I can't use raw_tp_nest_level for everything because
>> it's not incremented by kprobes.
>>
>> If raw tracepoints were to bump bpf_prog_active, then I could get away
>> with just using that count in these callsites -- I'm reluctant to do
>> that, though, since it would prevent kprobes from ever running inside a
>> raw_tp.  I'd like to retain the ability to (e.g.)
>>   trace.py -K htab_map_update_elem
>> and get some stack traces from at least within raw tracepoints.
>>
>> That said, as I wrote up this example, bpf_trace_nest_level seems to be
>> wildly misnamed; I should name those after the structure they're
>> protecting...
> 
> I still don't get what's wrong with the previous approach.
> Didn't I manage to convince both of you that perf_sample_data
> inside bpf_perf_event_array doesn't have any issue that Daniel brought up?
> I think this refcnting approach is inferior.

Hm, but looking at the perf RB handling code, it can deal with the nesting
situation just fine. I think this was your main concern in the prior email:

  because I suspect that 'struct bpf_event_entry' is not reentrable
  (even w/o issues with 'struct perf_sample_data').

  Even if we always use 'struct perf_sample_data' on stack, I suspect
  the same 'struct bpf_event_entry' cannot be reused in the nested way.

Check perf_output_{get,put}_handle() and the way it updates the head pointer
and does the final propagation to user_page. So if it's designed to handle
these situations, then bailing out doesn't make sense from the BPF side.
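
Roughly, paraphrasing perf_output_put_handle() from memory (not the exact
upstream code):

again:
	head = local_read(&rb->head);

	/* a nested irq/nmi writer may still be active; only the outermost
	 * writer publishes the head to the user page
	 */
	if (!local_dec_and_test(&rb->nest))
		goto out;

	rb->user_page->data_head = head;

	/* if a nested writer advanced head meanwhile, redo the publish */
	if (unlikely(head != local_read(&rb->head))) {
		local_inc(&rb->nest);
		goto again;
	}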
Matt Mullins June 14, 2019, 5:25 p.m. UTC | #7
On Fri, 2019-06-14 at 16:50 +0200, Daniel Borkmann wrote:
> On 06/14/2019 02:51 AM, Matt Mullins wrote:
> > On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
> > > On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> > > > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote:
> > > > > 
> > > > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > > > > they do not increment bpf_prog_active while executing.
> > > > > 
> > > > > This enables three levels of nesting, to support
> > > > >   - a kprobe or raw tp or perf event,
> > > > >   - another one of the above that irq context happens to call, and
> > > > >   - another one in nmi context
> > > > > (at most one of which may be a kprobe or perf event).
> > > > > 
> > > > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> > > 
> > > Generally, looks good to me. Two things below:
> > > 
> > > Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
> > > instead of the one you currently have?
> > 
> > Ah, yeah, that's probably more reasonable; I haven't managed to come up
> > with a scenario where one could hit this without raw tracepoints.  I'll
> > fix up the nits that've accumulated since v2.
> > 
> > > One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
> > > 
> > > Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
> > > counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
> > > the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
> > > 
> > > 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
> > > the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
> > > performance overhead (and desire to not miss events as a result of nesting)? (This
> > > also means we're not protected by bpf_prog_active in all the map ops, of course.)
> > > 2) Wouldn't this also mean that we only need to fix the raw tp programs via
> > > get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
> > > the rest which relies upon trace_call_bpf()? I'm probably missing something, but
> > > given they have separate pt_regs there, how could they be affected then?
> > 
> > For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
> > the _raw_tp variants.  However, consider the following nesting:
> > 
> >                                     trace_nest_level raw_tp_nest_level
> >   (kprobe) bpf_perf_event_output            1               0
> >   (raw_tp) bpf_perf_event_output_raw_tp     2               1
> >   (raw_tp) bpf_get_stackid_raw_tp           2               2
> > 
> > I need to increment a nest level (and ideally increment it only once)
> > between the kprobe and the first raw_tp, because they would otherwise
> > share the struct perf_sample_data.  But I also need to increment a nest
> 
> I'm not sure I follow on this one: the former would still keep using the
> bpf_trace_sd as-is today since only ever /one/ can be active on a given CPU
> as we otherwise bail out in trace_call_bpf() due to bpf_prog_active counter.
> Given these two are /not/ shared, you only need the code you have below for
> nesting to deal with the raw_tps via get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs()
> which should also simplify the code quite a bit.

bpf_perf_event_output_raw_tp calls ____bpf_perf_event_output, so it
currently shares bpf_trace_sd with kprobes -- it _can_ be nested.
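
To spell out the sharing (BPF_CALL_5() expands the helper body into
____bpf_perf_event_output(), which both entry points reach):

  (kprobe)  bpf_perf_event_output()         --\
                                                >-- ____bpf_perf_event_output()
  (raw_tp)  bpf_perf_event_output_raw_tp()  --/    (per-CPU bpf_trace_sds + nest counter)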

> 
> > level between the two raw_tps, since they share the pt_regs -- I can't
> > use trace_nest_level for everything because it's not used by
> > get_stackid, and I can't use raw_tp_nest_level for everything because
> > it's not incremented by kprobes.
> 
> (See above wrt kprobes.)
> 
> > If raw tracepoints were to bump bpf_prog_active, then I could get away
> > with just using that count in these callsites -- I'm reluctant to do
> > that, though, since it would prevent kprobes from ever running inside a
> > raw_tp.  I'd like to retain the ability to (e.g.)
> >   trace.py -K htab_map_update_elem
> > and get some stack traces from at least within raw tracepoints.
> > 
> > That said, as I wrote up this example, bpf_trace_nest_level seems to be
> > wildly misnamed; I should name those after the structure they're
> > protecting...
Alexei Starovoitov June 15, 2019, 11:35 p.m. UTC | #8
On Tue, Jun 11, 2019 at 2:54 PM Matt Mullins <mmullins@fb.com> wrote:
>
> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> they do not increment bpf_prog_active while executing.
>
> This enables three levels of nesting, to support
>   - a kprobe or raw tp or perf event,
>   - another one of the above that irq context happens to call, and
>   - another one in nmi context
> (at most one of which may be a kprobe or perf event).
>
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> Signed-off-by: Matt Mullins <mmullins@fb.com>

Applied. Thanks

Patch

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f92d6ad5e080..1c9a4745e596 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -410,8 +410,6 @@  static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
-
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 			u64 flags, struct perf_sample_data *sd)
@@ -442,24 +440,50 @@  __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	return perf_event_output(event, sd, regs);
 }
 
+/*
+ * Support executing tracepoints in normal, irq, and nmi context that each call
+ * bpf_perf_event_output
+ */
+struct bpf_trace_sample_data {
+	struct perf_sample_data sds[3];
+};
+
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
+static DEFINE_PER_CPU(int, bpf_trace_nest_level);
 BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
-	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
+	struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
+	int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
 	struct perf_raw_record raw = {
 		.frag = {
 			.size = size,
 			.data = data,
 		},
 	};
+	struct perf_sample_data *sd;
+	int err;
 
-	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
-		return -EINVAL;
+	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	sd = &sds->sds[nest_level - 1];
+
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	perf_sample_data_init(sd, 0, 0);
 	sd->raw = &raw;
 
-	return __bpf_perf_event_output(regs, map, flags, sd);
+	err = __bpf_perf_event_output(regs, map, flags, sd);
+
+out:
+	this_cpu_dec(bpf_trace_nest_level);
+	return err;
 }
 
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -822,16 +846,48 @@  pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
+ *
+ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
+ * in normal, irq, and nmi context.
  */
-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+struct bpf_raw_tp_regs {
+	struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
+static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
+static struct pt_regs *get_bpf_raw_tp_regs(void)
+{
+	struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
+
+	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+		this_cpu_dec(bpf_raw_tp_nest_level);
+		return ERR_PTR(-EBUSY);
+	}
+
+	return &tp_regs->regs[nest_level - 1];
+}
+
+static void put_bpf_raw_tp_regs(void)
+{
+	this_cpu_dec(bpf_raw_tp_nest_level);
+}
+
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
 {
-	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	struct pt_regs *regs = get_bpf_raw_tp_regs();
+	int ret;
+
+	if (IS_ERR(regs))
+		return PTR_ERR(regs);
 
 	perf_fetch_caller_regs(regs);
-	return ____bpf_perf_event_output(regs, map, flags, data, size);
+	ret = ____bpf_perf_event_output(regs, map, flags, data, size);
+
+	put_bpf_raw_tp_regs();
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -848,12 +904,18 @@  static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
 BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   struct bpf_map *, map, u64, flags)
 {
-	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	struct pt_regs *regs = get_bpf_raw_tp_regs();
+	int ret;
+
+	if (IS_ERR(regs))
+		return PTR_ERR(regs);
 
 	perf_fetch_caller_regs(regs);
 	/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
-	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
-			       flags, 0, 0);
+	ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+			      flags, 0, 0);
+	put_bpf_raw_tp_regs();
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -868,11 +930,17 @@  static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   void *, buf, u32, size, u64, flags)
 {
-	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	struct pt_regs *regs = get_bpf_raw_tp_regs();
+	int ret;
+
+	if (IS_ERR(regs))
+		return PTR_ERR(regs);
 
 	perf_fetch_caller_regs(regs);
-	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
-			     (unsigned long) size, flags, 0);
+	ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			    (unsigned long) size, flags, 0);
+	put_bpf_raw_tp_regs();
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {