[bpf-next,1/4] bpf: enable program stats
diff mbox series

Message ID 20190222233644.1487087-2-ast@kernel.org
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series
  • bpf: per program stats
Related show

Commit Message

Alexei Starovoitov Feb. 22, 2019, 11:36 p.m. UTC
JITed BPF programs are indistinguishable from kernel functions, but unlike
kernel code BPF code can be changed often.
Typical approach of "perf record" + "perf report" profiling and tunning of
kernel code works just as well for BPF programs, but kernel code doesn't
need to be monitored whereas BPF programs do.
Users load and run large amount of BPF programs.
These BPF stats allow tools monitor the usage of BPF on the server.
The monitoring tools will turn sysctl kernel.bpf_stats_enabled
on and off for few seconds to sample average cost of the programs.
Aggregated data over hours and days will provide an insight into cost of BPF
and alarms can trigger in case given program suddenly gets more expensive.

The cost of two sched_clock() per program invocation adds ~20 nsec.
Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
from ~40 nsec to ~60 nsec.
static_key minimizes the cost of the stats collection.
There is no measurable difference before/after this patch
with kernel.bpf_stats_enabled=0

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    |  7 +++++++
 include/linux/filter.h | 16 +++++++++++++++-
 kernel/bpf/core.c      | 13 +++++++++++++
 kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
 kernel/bpf/verifier.c  |  5 +++++
 kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 96 insertions(+), 3 deletions(-)

Comments

Eric Dumazet Feb. 23, 2019, 12:05 a.m. UTC | #1
On 02/22/2019 03:36 PM, Alexei Starovoitov wrote:

>  
> +static void bpf_prog_get_stats(const struct bpf_prog *prog,
> +			       struct bpf_prog_stats *stats)
> +{
> +	u64 nsecs = 0, cnt = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		nsecs += per_cpu(prog->aux->stats->nsecs, cpu);
> +		cnt += per_cpu(prog->aux->stats->cnt, cpu);

This does not work reliably for 32 bit kernels.

> +	}
> +	stats->nsecs = nsecs;
> +	stats->cnt = cnt;
> +}
Roman Gushchin Feb. 23, 2019, 12:34 a.m. UTC | #2
On Fri, Feb 22, 2019 at 03:36:41PM -0800, Alexei Starovoitov wrote:
> JITed BPF programs are indistinguishable from kernel functions, but unlike
> kernel code BPF code can be changed often.
> Typical approach of "perf record" + "perf report" profiling and tunning of
> kernel code works just as well for BPF programs, but kernel code doesn't
> need to be monitored whereas BPF programs do.
> Users load and run large amount of BPF programs.
> These BPF stats allow tools monitor the usage of BPF on the server.
> The monitoring tools will turn sysctl kernel.bpf_stats_enabled
> on and off for few seconds to sample average cost of the programs.
> Aggregated data over hours and days will provide an insight into cost of BPF
> and alarms can trigger in case given program suddenly gets more expensive.
> 
> The cost of two sched_clock() per program invocation adds ~20 nsec.
> Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
> from ~40 nsec to ~60 nsec.
> static_key minimizes the cost of the stats collection.
> There is no measurable difference before/after this patch
> with kernel.bpf_stats_enabled=0
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf.h    |  7 +++++++
>  include/linux/filter.h | 16 +++++++++++++++-
>  kernel/bpf/core.c      | 13 +++++++++++++
>  kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
>  kernel/bpf/verifier.c  |  5 +++++
>  kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
>  6 files changed, 96 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index de18227b3d95..14260674bc57 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -340,6 +340,11 @@ enum bpf_cgroup_storage_type {
>  
>  #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
>  
> +struct bpf_prog_stats {
> +	u64 cnt;
> +	u64 nsecs;
> +};
> +
>  struct bpf_prog_aux {
>  	atomic_t refcnt;
>  	u32 used_map_cnt;
> @@ -389,6 +394,7 @@ struct bpf_prog_aux {
>  	 * main prog always has linfo_idx == 0
>  	 */
>  	u32 linfo_idx;
> +	struct bpf_prog_stats __percpu *stats;
>  	union {
>  		struct work_struct work;
>  		struct rcu_head	rcu;
> @@ -559,6 +565,7 @@ void bpf_map_area_free(void *base);
>  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
>  
>  extern int sysctl_unprivileged_bpf_disabled;
> +extern int sysctl_bpf_stats_enabled;
>  
>  int bpf_map_new_fd(struct bpf_map *map, int flags);
>  int bpf_prog_new_fd(struct bpf_prog *prog);
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index f32b3eca5a04..7b14235b6f7d 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -533,7 +533,21 @@ struct sk_filter {
>  	struct bpf_prog	*prog;
>  };
>  
> -#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
> +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> +
> +#define BPF_PROG_RUN(prog, ctx)	({				\
> +	u32 ret;						\
> +	cant_sleep();						\
> +	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
> +		u64 start = sched_clock();			\
> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> +		this_cpu_inc(prog->aux->stats->cnt);		\
> +		this_cpu_add(prog->aux->stats->nsecs,		\
> +			     sched_clock() - start);		\
> +	} else {						\
> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> +	}							\
> +	ret; })

Can we bump "cnt" unconditionally and gate only the "nsecs" calculation?

Why it might be useful?
If the performance cost is too high to keep the statistics gathering
always enabled, it will be possible to measure the average cost
periodically and extrapolate.

Thanks!
Daniel Borkmann Feb. 23, 2019, 1:06 a.m. UTC | #3
On 02/23/2019 01:34 AM, Roman Gushchin wrote:
> On Fri, Feb 22, 2019 at 03:36:41PM -0800, Alexei Starovoitov wrote:
>> JITed BPF programs are indistinguishable from kernel functions, but unlike
>> kernel code BPF code can be changed often.
>> Typical approach of "perf record" + "perf report" profiling and tunning of
>> kernel code works just as well for BPF programs, but kernel code doesn't
>> need to be monitored whereas BPF programs do.
>> Users load and run large amount of BPF programs.
>> These BPF stats allow tools monitor the usage of BPF on the server.
>> The monitoring tools will turn sysctl kernel.bpf_stats_enabled
>> on and off for few seconds to sample average cost of the programs.
>> Aggregated data over hours and days will provide an insight into cost of BPF
>> and alarms can trigger in case given program suddenly gets more expensive.
>>
>> The cost of two sched_clock() per program invocation adds ~20 nsec.
>> Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
>> from ~40 nsec to ~60 nsec.
>> static_key minimizes the cost of the stats collection.
>> There is no measurable difference before/after this patch
>> with kernel.bpf_stats_enabled=0
>>
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> ---
>>  include/linux/bpf.h    |  7 +++++++
>>  include/linux/filter.h | 16 +++++++++++++++-
>>  kernel/bpf/core.c      | 13 +++++++++++++
>>  kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
>>  kernel/bpf/verifier.c  |  5 +++++
>>  kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
>>  6 files changed, 96 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index de18227b3d95..14260674bc57 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -340,6 +340,11 @@ enum bpf_cgroup_storage_type {
>>  
>>  #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
>>  
>> +struct bpf_prog_stats {
>> +	u64 cnt;
>> +	u64 nsecs;
>> +};
>> +
>>  struct bpf_prog_aux {
>>  	atomic_t refcnt;
>>  	u32 used_map_cnt;
>> @@ -389,6 +394,7 @@ struct bpf_prog_aux {
>>  	 * main prog always has linfo_idx == 0
>>  	 */
>>  	u32 linfo_idx;
>> +	struct bpf_prog_stats __percpu *stats;
>>  	union {
>>  		struct work_struct work;
>>  		struct rcu_head	rcu;
>> @@ -559,6 +565,7 @@ void bpf_map_area_free(void *base);
>>  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
>>  
>>  extern int sysctl_unprivileged_bpf_disabled;
>> +extern int sysctl_bpf_stats_enabled;
>>  
>>  int bpf_map_new_fd(struct bpf_map *map, int flags);
>>  int bpf_prog_new_fd(struct bpf_prog *prog);
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index f32b3eca5a04..7b14235b6f7d 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -533,7 +533,21 @@ struct sk_filter {
>>  	struct bpf_prog	*prog;
>>  };
>>  
>> -#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
>> +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>> +
>> +#define BPF_PROG_RUN(prog, ctx)	({				\
>> +	u32 ret;						\
>> +	cant_sleep();						\
>> +	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
>> +		u64 start = sched_clock();			\
>> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>> +		this_cpu_inc(prog->aux->stats->cnt);		\
>> +		this_cpu_add(prog->aux->stats->nsecs,		\
>> +			     sched_clock() - start);		\
>> +	} else {						\
>> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>> +	}							\
>> +	ret; })
> 
> Can we bump "cnt" unconditionally and gate only the "nsecs" calculation?
> 
> Why it might be useful?
> If the performance cost is too high to keep the statistics gathering
> always enabled, it will be possible to measure the average cost
> periodically and extrapolate.

In general, having some stats and timing info would be useful, but I
guess people might want to customize it in future even more specifically
beyond number of runs + time it takes. One thing that would be super
useful is to have some notion of __attribute__((constructor)) and
__attribute__((destructor)) support in BPF which gets then inlined into
prologue / epilogue of program. E.g. such infrastructure would allow to
mark an skb and measure time it takes through the BPF prog till it hits
an exit point somewhere (without having to explicitly code this into the
program everywhere). Other examples may be histograms or p99 latencies
that might probably be useful. Feels like for monitoring more ways to
program would be nice and to move it into the BPF insns sequence (e.g.
enforced globally or by choice of prog as another option)? Thoughts?
Alexei Starovoitov Feb. 23, 2019, 2:29 a.m. UTC | #4
On Fri, Feb 22, 2019 at 04:05:55PM -0800, Eric Dumazet wrote:
> 
> 
> On 02/22/2019 03:36 PM, Alexei Starovoitov wrote:
> 
> >  
> > +static void bpf_prog_get_stats(const struct bpf_prog *prog,
> > +			       struct bpf_prog_stats *stats)
> > +{
> > +	u64 nsecs = 0, cnt = 0;
> > +	int cpu;
> > +
> > +	for_each_possible_cpu(cpu) {
> > +		nsecs += per_cpu(prog->aux->stats->nsecs, cpu);
> > +		cnt += per_cpu(prog->aux->stats->cnt, cpu);
> 
> This does not work reliably for 32 bit kernels.

right. my mind was in probe_read thread. will fix.
Alexei Starovoitov Feb. 23, 2019, 2:31 a.m. UTC | #5
On Sat, Feb 23, 2019 at 12:34:38AM +0000, Roman Gushchin wrote:
> On Fri, Feb 22, 2019 at 03:36:41PM -0800, Alexei Starovoitov wrote:
> > JITed BPF programs are indistinguishable from kernel functions, but unlike
> > kernel code BPF code can be changed often.
> > Typical approach of "perf record" + "perf report" profiling and tunning of
> > kernel code works just as well for BPF programs, but kernel code doesn't
> > need to be monitored whereas BPF programs do.
> > Users load and run large amount of BPF programs.
> > These BPF stats allow tools monitor the usage of BPF on the server.
> > The monitoring tools will turn sysctl kernel.bpf_stats_enabled
> > on and off for few seconds to sample average cost of the programs.
> > Aggregated data over hours and days will provide an insight into cost of BPF
> > and alarms can trigger in case given program suddenly gets more expensive.
> > 
> > The cost of two sched_clock() per program invocation adds ~20 nsec.
> > Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
> > from ~40 nsec to ~60 nsec.
> > static_key minimizes the cost of the stats collection.
> > There is no measurable difference before/after this patch
> > with kernel.bpf_stats_enabled=0
> > 
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf.h    |  7 +++++++
> >  include/linux/filter.h | 16 +++++++++++++++-
> >  kernel/bpf/core.c      | 13 +++++++++++++
> >  kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
> >  kernel/bpf/verifier.c  |  5 +++++
> >  kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
> >  6 files changed, 96 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index de18227b3d95..14260674bc57 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -340,6 +340,11 @@ enum bpf_cgroup_storage_type {
> >  
> >  #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
> >  
> > +struct bpf_prog_stats {
> > +	u64 cnt;
> > +	u64 nsecs;
> > +};
> > +
> >  struct bpf_prog_aux {
> >  	atomic_t refcnt;
> >  	u32 used_map_cnt;
> > @@ -389,6 +394,7 @@ struct bpf_prog_aux {
> >  	 * main prog always has linfo_idx == 0
> >  	 */
> >  	u32 linfo_idx;
> > +	struct bpf_prog_stats __percpu *stats;
> >  	union {
> >  		struct work_struct work;
> >  		struct rcu_head	rcu;
> > @@ -559,6 +565,7 @@ void bpf_map_area_free(void *base);
> >  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
> >  
> >  extern int sysctl_unprivileged_bpf_disabled;
> > +extern int sysctl_bpf_stats_enabled;
> >  
> >  int bpf_map_new_fd(struct bpf_map *map, int flags);
> >  int bpf_prog_new_fd(struct bpf_prog *prog);
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index f32b3eca5a04..7b14235b6f7d 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -533,7 +533,21 @@ struct sk_filter {
> >  	struct bpf_prog	*prog;
> >  };
> >  
> > -#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
> > +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> > +
> > +#define BPF_PROG_RUN(prog, ctx)	({				\
> > +	u32 ret;						\
> > +	cant_sleep();						\
> > +	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
> > +		u64 start = sched_clock();			\
> > +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> > +		this_cpu_inc(prog->aux->stats->cnt);		\
> > +		this_cpu_add(prog->aux->stats->nsecs,		\
> > +			     sched_clock() - start);		\
> > +	} else {						\
> > +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> > +	}							\
> > +	ret; })
> 
> Can we bump "cnt" unconditionally and gate only the "nsecs" calculation?

xdp cannot afford to pay this penalty unconditionally.
It's not only this_cpu_inc. It's prog->aux->stats loads.
Alexei Starovoitov Feb. 23, 2019, 2:38 a.m. UTC | #6
On Sat, Feb 23, 2019 at 02:06:56AM +0100, Daniel Borkmann wrote:
> 
> In general, having some stats and timing info would be useful, but I
> guess people might want to customize it in future even more specifically
> beyond number of runs + time it takes. One thing that would be super
> useful is to have some notion of __attribute__((constructor)) and
> __attribute__((destructor)) support in BPF which gets then inlined into
> prologue / epilogue of program. E.g. such infrastructure would allow to
> mark an skb and measure time it takes through the BPF prog till it hits
> an exit point somewhere (without having to explicitly code this into the
> program everywhere). Other examples may be histograms or p99 latencies
> that might probably be useful. Feels like for monitoring more ways to
> program would be nice and to move it into the BPF insns sequence (e.g.
> enforced globally or by choice of prog as another option)? Thoughts?

the constructor/destructor you mean to capture the full sequence of tail_calls?
Or full path of skb through the stack with all hook points?
That is likely very useful without any bpf, but I think hw timestamping
already serves that purpose.

I've been thinking about doing this stats per program
(instead of static_key for all).
Like replacing a bpf_prog->bpf_func with a wrapper function
that does stats, but that is more costly in retpoline world
due to extra indirect call.
Another alternative is to patch the stats in via JITs, but
it's much more complex and error prone.
So went with the simplest approach.

With all stats ideas we need to be careful not to reintroduce what
perf is already great at.
These stats are _not_ for performance analysis. That's what perf does.
These stats are for 24/7 monitoring to catch things that not suppose to happen.
Daniel Borkmann Feb. 25, 2019, 10:36 a.m. UTC | #7
On 02/23/2019 03:38 AM, Alexei Starovoitov wrote:
> On Sat, Feb 23, 2019 at 02:06:56AM +0100, Daniel Borkmann wrote:
>>
>> In general, having some stats and timing info would be useful, but I
>> guess people might want to customize it in future even more specifically
>> beyond number of runs + time it takes. One thing that would be super
>> useful is to have some notion of __attribute__((constructor)) and
>> __attribute__((destructor)) support in BPF which gets then inlined into
>> prologue / epilogue of program. E.g. such infrastructure would allow to
>> mark an skb and measure time it takes through the BPF prog till it hits
>> an exit point somewhere (without having to explicitly code this into the
>> program everywhere). Other examples may be histograms or p99 latencies
>> that might probably be useful. Feels like for monitoring more ways to
>> program would be nice and to move it into the BPF insns sequence (e.g.
>> enforced globally or by choice of prog as another option)? Thoughts?
> 
> the constructor/destructor you mean to capture the full sequence of tail_calls?
> Or full path of skb through the stack with all hook points?
> That is likely very useful without any bpf, but I think hw timestamping
> already serves that purpose.

Not through the stack, but was more thinking something like low-overhead
kprobes-style extension for a BPF prog where such sequence would be added
'inline' at beginning / exit of BPF prog invocation with normal ctx access
and helpers as the native program (e.g. time-stamping plus read/write into
mark as one example which kprobes couldn't do). But might go much beyond
context of this stats collection.

> I've been thinking about doing this stats per program
> (instead of static_key for all).
> Like replacing a bpf_prog->bpf_func with a wrapper function
> that does stats, but that is more costly in retpoline world
> due to extra indirect call.

That's actually an interesting thought, given the original prog->bpf_func
is a known address at that time, this could be templated where an inner
dummy bpf_func call to the normal BPF prog gets later replaced with the
actual prog->bpf_func address to have no indirect call. This would also
allow to keep the actual prog->bpf_func entry call-sites small and simple.

> Another alternative is to patch the stats in via JITs, but
> it's much more complex and error prone.
> So went with the simplest approach.

Pure BPF insns rewrite via JIT would indeed be unclear wrt per-cpu data
structures.

> With all stats ideas we need to be careful not to reintroduce what
> perf is already great at.

Yeah agree.

> These stats are _not_ for performance analysis. That's what perf does.
> These stats are for 24/7 monitoring to catch things that not suppose to happen.
Thanks,
Daniel
Alexei Starovoitov Feb. 26, 2019, 4:27 a.m. UTC | #8
On 2/25/19 2:36 AM, Daniel Borkmann wrote:
> 
> Not through the stack, but was more thinking something like low-overhead
> kprobes-style extension for a BPF prog where such sequence would be added
> 'inline' at beginning / exit of BPF prog invocation with normal ctx access
> and helpers as the native program (e.g. time-stamping plus read/write into
> mark as one example which kprobes couldn't do). But might go much beyond
> context of this stats collection.

see below.

> That's actually an interesting thought, given the original prog->bpf_func
> is a known address at that time, this could be templated where an inner
> dummy bpf_func call to the normal BPF prog gets later replaced with the
> actual prog->bpf_func address to have no indirect call. This would also
> allow to keep the actual prog->bpf_func entry call-sites small and simple.

My first thought was that we indeed can have a template of stats
collecting wrapper in .text,
then at 'switch stats on' event vmalloc exec region, copy wrapper code
into it and manually patch 'call foo' x86 insn with direct jump.
That is still quite involved from coding side.
It's similar to what kprobe/ftrace is doing, but probably
an overkill for stats due to potential security
concerns with new executable code.
But it won't work due to tail_calls.
I've looked into single wrapper approach with indirect call
(because it's much easer than patching x86) and realized it
won't work for the same reason.
Inline kprobe+kretprobe insertion won't work either.
All because we have tail_calls.
we cannot have executable epilogue in progs.
tail_call will jmp into next prog and second part of stats collection
won't run. Essentially no epilogue allowed unless we disable tail_call.
Or we somehow reimplement tail_calls so that prog returns,
an epilogue is executed and then jumps into the next prog.
tail_call turned out be the worst corner case feature to deal with.

With static_key approach the main prog accounts the time
of all progs called via tail_call.
We can argue whether it's ideal semantics, but looks like
it's the only thing we can do.
I don't see a way how tail_called progs can count their own time.
Stats would somehow need to be computed right before jumping
into next prog. Which means heavy changes in all JITs
plus extra performance cost at runtime, since such if (stats_on)
check in tail_call handling would have to be done at runtime,
since JITed code is read-only and regenerating progs due to stats
is non-starter.
Or tail-called next prog would need to update stats for previous
prog, but there is no pointer to 'aux->stats' there.
So imo that is dead end.
static_key approach is the simplest by far.
Daniel Borkmann Feb. 26, 2019, 3:29 p.m. UTC | #9
On 02/26/2019 05:27 AM, Alexei Starovoitov wrote:
> On 2/25/19 2:36 AM, Daniel Borkmann wrote:
>>
>> Not through the stack, but was more thinking something like low-overhead
>> kprobes-style extension for a BPF prog where such sequence would be added
>> 'inline' at beginning / exit of BPF prog invocation with normal ctx access
>> and helpers as the native program (e.g. time-stamping plus read/write into
>> mark as one example which kprobes couldn't do). But might go much beyond
>> context of this stats collection.
> 
> see below.
> 
>> That's actually an interesting thought, given the original prog->bpf_func
>> is a known address at that time, this could be templated where an inner
>> dummy bpf_func call to the normal BPF prog gets later replaced with the
>> actual prog->bpf_func address to have no indirect call. This would also
>> allow to keep the actual prog->bpf_func entry call-sites small and simple.
> 
> My first thought was that we indeed can have a template of stats
> collecting wrapper in .text,
> then at 'switch stats on' event vmalloc exec region, copy wrapper code
> into it and manually patch 'call foo' x86 insn with direct jump.
> That is still quite involved from coding side.
> It's similar to what kprobe/ftrace is doing, but probably
> an overkill for stats due to potential security
> concerns with new executable code.
> But it won't work due to tail_calls.

You mean for the scenario to measure the _individual_ progs that
are part of the given tail call processing such that each have
their individual inc in count and time-spent attribution. If so,
yeah, agree. Or for the case you have right now where everything
gets attributed to the "entry" program that triggers the tail
call processing? Latter would be similar as we have now in this
patch, imho.

> I've looked into single wrapper approach with indirect call
> (because it's much easer than patching x86) and realized it
> won't work for the same reason.
> Inline kprobe+kretprobe insertion won't work either.
> All because we have tail_calls.
> we cannot have executable epilogue in progs.
> tail_call will jmp into next prog and second part of stats collection
> won't run. Essentially no epilogue allowed unless we disable tail_call.
> Or we somehow reimplement tail_calls so that prog returns,
> an epilogue is executed and then jumps into the next prog.
> tail_call turned out be the worst corner case feature to deal with.

Agree, it's a pain as in most cases. Was more thinking that
'special' epilogue would inc the count, start first ktime/cycle
snapshot and then do a direct function call to the normally
JITed BPF prog (where it can do tail calls, etc, so nothing
changes there), and upon return from this call take the second
measurement, and update the per CPU buffer, similar as we have
now just with static keys approach. So, was thinking similar as
we JIT BPF-to-BPF calls on an individual prog basis.

> With static_key approach the main prog accounts the time
> of all progs called via tail_call.
> We can argue whether it's ideal semantics, but looks like
> it's the only thing we can do.

Yeah true, it's more about measuring how long the prog 'chain'
in the whole hook is taking to complete, which may be interesting
in itself, just perhaps a bit confusing when looking at individual
progs' share.

> I don't see a way how tail_called progs can count their own time.

Me neither, if so I guess it would be of similar fashion as you
have right now where it's accounted to the first prog.

> Stats would somehow need to be computed right before jumping
> into next prog. Which means heavy changes in all JITs
> plus extra performance cost at runtime, since such if (stats_on)
> check in tail_call handling would have to be done at runtime,
> since JITed code is read-only and regenerating progs due to stats
> is non-starter.
> Or tail-called next prog would need to update stats for previous
> prog, but there is no pointer to 'aux->stats' there.
> So imo that is dead end.
> static_key approach is the simplest by far.

It's simplest, yep. I guess if we get to the point of removing
indirect call for BPF_PROG_RUN itself, we might need to revisit
then and move it into bpf_func at that point. Kind of implementation
detail given we only expose count and time spent data to uapi,
though the direct switch at runtime may be a bit problematic in
future. Anyway, lets see how it goes when we get there.

Thanks,
Daniel
Alexei Starovoitov Feb. 27, 2019, 4:05 a.m. UTC | #10
On Tue, Feb 26, 2019 at 04:29:45PM +0100, Daniel Borkmann wrote:
> On 02/26/2019 05:27 AM, Alexei Starovoitov wrote:
> > On 2/25/19 2:36 AM, Daniel Borkmann wrote:
> >>
> >> Not through the stack, but was more thinking something like low-overhead
> >> kprobes-style extension for a BPF prog where such sequence would be added
> >> 'inline' at beginning / exit of BPF prog invocation with normal ctx access
> >> and helpers as the native program (e.g. time-stamping plus read/write into
> >> mark as one example which kprobes couldn't do). But might go much beyond
> >> context of this stats collection.
> > 
> > see below.
> > 
> >> That's actually an interesting thought, given the original prog->bpf_func
> >> is a known address at that time, this could be templated where an inner
> >> dummy bpf_func call to the normal BPF prog gets later replaced with the
> >> actual prog->bpf_func address to have no indirect call. This would also
> >> allow to keep the actual prog->bpf_func entry call-sites small and simple.
> > 
> > My first thought was that we indeed can have a template of stats
> > collecting wrapper in .text,
> > then at 'switch stats on' event vmalloc exec region, copy wrapper code
> > into it and manually patch 'call foo' x86 insn with direct jump.
> > That is still quite involved from coding side.
> > It's similar to what kprobe/ftrace is doing, but probably
> > an overkill for stats due to potential security
> > concerns with new executable code.
> > But it won't work due to tail_calls.
> 
> You mean for the scenario to measure the _individual_ progs that
> are part of the given tail call processing such that each have
> their individual inc in count and time-spent attribution. If so,
> yeah, agree. Or for the case you have right now where everything
> gets attributed to the "entry" program that triggers the tail
> call processing? Latter would be similar as we have now in this
> patch, imho.

There are two other issues with 'epilogue' approach.
1.
We call progs as (*(prog)->bpf_func)(ctx, (prog)->insnsi)
and from R2 we can get back to prog and from there go to prog->aux-stats,
but there is no room in the stack to save this R2.
So we'd need extra 16-bytes for start_ns and saved R2
that epilogue can use to update stats in the 'entry' prog.
With static_key approach gcc was smart enough to use callee-saved
registers and no extra stack was used in the caller
(at least for one callsite of BPF_PROG_RUN where I've looked at asm)

2.
when stats are flipped on and off we need to make sure that all bpf progs
in prog_array are with stats on or off. Cannot mix and match.
Otherwise this 16-byte stack mismatch will cause issues.
Which is impossible to do atomically for the whole prog_array.

> > With static_key approach the main prog accounts the time
> > of all progs called via tail_call.
> > We can argue whether it's ideal semantics, but looks like
> > it's the only thing we can do.
> 
> Yeah true, it's more about measuring how long the prog 'chain'
> in the whole hook is taking to complete, which may be interesting
> in itself, just perhaps a bit confusing when looking at individual
> progs' share.

yep. would be good to come up with a way to measure individual progs,
but I couldn't figure it out yet.

> It's simplest, yep. I guess if we get to the point of removing
> indirect call for BPF_PROG_RUN itself, we might need to revisit
> then and move it into bpf_func at that point. Kind of implementation
> detail given we only expose count and time spent data to uapi,
> though the direct switch at runtime may be a bit problematic in
> future. Anyway, lets see how it goes when we get there.

yep

Patch
diff mbox series

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de18227b3d95..14260674bc57 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -340,6 +340,11 @@  enum bpf_cgroup_storage_type {
 
 #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
 
+struct bpf_prog_stats {
+	u64 cnt;
+	u64 nsecs;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -389,6 +394,7 @@  struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct bpf_prog_stats __percpu *stats;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -559,6 +565,7 @@  void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
 extern int sysctl_unprivileged_bpf_disabled;
+extern int sysctl_bpf_stats_enabled;
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f32b3eca5a04..7b14235b6f7d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,21 @@  struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
+DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+
+#define BPF_PROG_RUN(prog, ctx)	({				\
+	u32 ret;						\
+	cant_sleep();						\
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
+		u64 start = sched_clock();			\
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+		this_cpu_inc(prog->aux->stats->cnt);		\
+		this_cpu_add(prog->aux->stats->nsecs,		\
+			     sched_clock() - start);		\
+	} else {						\
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+	}							\
+	ret; })
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ef88b167959d..574747f98d44 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -95,6 +95,13 @@  struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 		return NULL;
 	}
 
+	aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+	if (!aux->stats) {
+		kfree(aux);
+		vfree(fp);
+		return NULL;
+	}
+
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
 	fp->aux->prog = fp;
@@ -231,6 +238,8 @@  struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 
 void __bpf_prog_free(struct bpf_prog *fp)
 {
+	if (fp->aux && !fp->is_func)
+		free_percpu(fp->aux->stats);
 	kfree(fp->aux);
 	vfree(fp);
 }
@@ -2069,6 +2078,10 @@  int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 	return -EFAULT;
 }
 
+DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+EXPORT_SYMBOL(bpf_stats_enabled_key);
+int sysctl_bpf_stats_enabled __read_mostly;
+
 /* All definitions of tracepoints related to BPF. */
 #define CREATE_TRACE_POINTS
 #include <linux/bpf_trace.h>
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ec7c552af76b..079e7fa14f39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1283,24 +1283,44 @@  static int bpf_prog_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
+			       struct bpf_prog_stats *stats)
+{
+	u64 nsecs = 0, cnt = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		nsecs += per_cpu(prog->aux->stats->nsecs, cpu);
+		cnt += per_cpu(prog->aux->stats->cnt, cpu);
+	}
+	stats->nsecs = nsecs;
+	stats->cnt = cnt;
+}
+
 #ifdef CONFIG_PROC_FS
 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 {
 	const struct bpf_prog *prog = filp->private_data;
 	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+	struct bpf_prog_stats stats;
 
+	bpf_prog_get_stats(prog, &stats);
 	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
 	seq_printf(m,
 		   "prog_type:\t%u\n"
 		   "prog_jited:\t%u\n"
 		   "prog_tag:\t%s\n"
 		   "memlock:\t%llu\n"
-		   "prog_id:\t%u\n",
+		   "prog_id:\t%u\n"
+		   "runtime:\t%llu\n"
+		   "runcnt:\t%llu\n",
 		   prog->type,
 		   prog->jited,
 		   prog_tag,
 		   prog->pages * 1ULL << PAGE_SHIFT,
-		   prog->aux->id);
+		   prog->aux->id,
+		   stats.nsecs,
+		   stats.cnt);
 }
 #endif
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1b9496c41383..dff61920a316 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7330,6 +7330,11 @@  static int jit_subprogs(struct bpf_verifier_env *env)
 		if (bpf_prog_calc_tag(func[i]))
 			goto out_free;
 		func[i]->is_func = 1;
+		/* BPF_PROG_RUN doesn't call subprogs directly,
+		 * hence stats are reused from main prog
+		 */
+		free_percpu(func[i]->aux->stats);
+		func[i]->aux->stats = prog->aux->stats;
 		func[i]->aux->func_idx = i;
 		/* the btf and func_info will be freed only at prog->aux */
 		func[i]->aux->btf = prog->aux->btf;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba4d9e85feb8..86e0771352f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -224,6 +224,9 @@  static int proc_dostring_coredump(struct ctl_table *table, int write,
 #endif
 static int proc_dopipe_max_size(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+					  void __user *buffer, size_t *lenp,
+					  loff_t *ppos);
 
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses its own private copy */
@@ -1230,6 +1233,15 @@  static struct ctl_table kern_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "bpf_stats_enabled",
+		.data		= &sysctl_bpf_stats_enabled,
+		.maxlen		= sizeof(sysctl_bpf_stats_enabled),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax_bpf_stats,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
 	{
 		.procname	= "panic_on_rcu_stall",
@@ -3260,6 +3272,28 @@  int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 
 #endif /* CONFIG_PROC_SYSCTL */
 
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+					  void __user *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	int ret, bpf_stats = *(int *)table->data;
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	tmp.data = &bpf_stats;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret) {
+		*(int *)table->data = bpf_stats;
+		if (bpf_stats)
+			static_branch_enable(&bpf_stats_enabled_key);
+		else
+			static_branch_disable(&bpf_stats_enabled_key);
+	}
+	return ret;
+}
+
 /*
  * No sense putting this after each symbol definition, twice,
  * exception granted :-)