diff mbox series

[v2,bpf-next] bpf: sharing bpf runtime stats with /dev/bpf_stats

Message ID 20200316203329.2747779-1-songliubraving@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series [v2,bpf-next] bpf: sharing bpf runtime stats with /dev/bpf_stats | expand

Commit Message

Song Liu March 16, 2020, 8:33 p.m. UTC
Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
Typical userspace tools use kernel.bpf_stats_enabled as follows:

  1. Enable kernel.bpf_stats_enabled;
  2. Check program run_time_ns;
  3. Sleep for the monitoring period;
  4. Check program run_time_ns again, calculate the difference;
  5. Disable kernel.bpf_stats_enabled.

The problem with this approach is that only one userspace tool can toggle
this sysctl. If multiple tools toggle the sysctl at the same time, the
measurement may be inaccurate.

To fix this problem while keep backward compatibility, introduce a new
bpf command BPF_ENABLE_RUNTIME_STATS. On success, this command enables
run_time_ns stats and returns a valid fd.

With BPF_ENABLE_RUNTIME_STATS, user space tool would have the following
flow:

  1. Get a fd with BPF_ENABLE_RUNTIME_STATS, and make sure it is valid;
  2. Check program run_time_ns;
  3. Sleep for the monitoring period;
  4. Check program run_time_ns again, calculate the difference;
  5. Close the fd.

Signed-off-by: Song Liu <songliubraving@fb.com>

---
Changes RFC => v2:
1. Add a new bpf command instead of /dev/bpf_stats;
2. Remove the jump_label patch, which is no longer needed;
3. Add a static variable to save previous value of the sysctl.
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 43 ++++++++++++++++++++++++++++++++++
 kernel/sysctl.c                | 36 +++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h |  1 +
 5 files changed, 81 insertions(+), 1 deletion(-)

Comments

Daniel Borkmann March 17, 2020, 7:30 p.m. UTC | #1
On 3/16/20 9:33 PM, Song Liu wrote:
> Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
> Typical userspace tools use kernel.bpf_stats_enabled as follows:
> 
>    1. Enable kernel.bpf_stats_enabled;
>    2. Check program run_time_ns;
>    3. Sleep for the monitoring period;
>    4. Check program run_time_ns again, calculate the difference;
>    5. Disable kernel.bpf_stats_enabled.
> 
> The problem with this approach is that only one userspace tool can toggle
> this sysctl. If multiple tools toggle the sysctl at the same time, the
> measurement may be inaccurate.
> 
> To fix this problem while keep backward compatibility, introduce a new
> bpf command BPF_ENABLE_RUNTIME_STATS. On success, this command enables
> run_time_ns stats and returns a valid fd.
> 
> With BPF_ENABLE_RUNTIME_STATS, user space tool would have the following
> flow:
> 
>    1. Get a fd with BPF_ENABLE_RUNTIME_STATS, and make sure it is valid;
>    2. Check program run_time_ns;
>    3. Sleep for the monitoring period;
>    4. Check program run_time_ns again, calculate the difference;
>    5. Close the fd.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>

Hmm, I see no relation to /dev/bpf_stats anymore, yet the subject still talks
about it?

Also, should this have bpftool integration now that we have `bpftool prog profile`
support? Would be nice to then fetch the related stats via bpf_prog_info, so users
can consume this in an easy way.

> Changes RFC => v2:
> 1. Add a new bpf command instead of /dev/bpf_stats;
> 2. Remove the jump_label patch, which is no longer needed;
> 3. Add a static variable to save previous value of the sysctl.
> ---
>   include/linux/bpf.h            |  1 +
>   include/uapi/linux/bpf.h       |  1 +
>   kernel/bpf/syscall.c           | 43 ++++++++++++++++++++++++++++++++++
>   kernel/sysctl.c                | 36 +++++++++++++++++++++++++++-
>   tools/include/uapi/linux/bpf.h |  1 +
>   5 files changed, 81 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4fd91b7c95ea..d542349771df 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -970,6 +970,7 @@ _out:							\
>   
>   #ifdef CONFIG_BPF_SYSCALL
>   DECLARE_PER_CPU(int, bpf_prog_active);
> +extern struct mutex bpf_stats_enabled_mutex;
>   
>   /*
>    * Block execution of BPF programs attached to instrumentation (perf,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 40b2d9476268..8285ff37210c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -111,6 +111,7 @@ enum bpf_cmd {
>   	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
>   	BPF_MAP_UPDATE_BATCH,
>   	BPF_MAP_DELETE_BATCH,
> +	BPF_ENABLE_RUNTIME_STATS,
>   };
>   
>   enum bpf_map_type {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index b2f73ecacced..823dc9de7953 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -24,6 +24,9 @@
>   #include <linux/ctype.h>
>   #include <linux/nospec.h>
>   #include <linux/audit.h>
> +#include <linux/miscdevice.h>

Is this still needed?

> +#include <linux/fs.h>
> +#include <linux/jump_label.h>
>   #include <uapi/linux/btf.h>
>   
>   #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
> @@ -3550,6 +3553,43 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
>   	return err;
>   }
>   

Thanks,
Daniel
Song Liu March 17, 2020, 7:54 p.m. UTC | #2
> On Mar 17, 2020, at 12:30 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
> On 3/16/20 9:33 PM, Song Liu wrote:
>> Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
>> Typical userspace tools use kernel.bpf_stats_enabled as follows:
>>   1. Enable kernel.bpf_stats_enabled;
>>   2. Check program run_time_ns;
>>   3. Sleep for the monitoring period;
>>   4. Check program run_time_ns again, calculate the difference;
>>   5. Disable kernel.bpf_stats_enabled.
>> The problem with this approach is that only one userspace tool can toggle
>> this sysctl. If multiple tools toggle the sysctl at the same time, the
>> measurement may be inaccurate.
>> To fix this problem while keep backward compatibility, introduce a new
>> bpf command BPF_ENABLE_RUNTIME_STATS. On success, this command enables
>> run_time_ns stats and returns a valid fd.
>> With BPF_ENABLE_RUNTIME_STATS, user space tool would have the following
>> flow:
>>   1. Get a fd with BPF_ENABLE_RUNTIME_STATS, and make sure it is valid;
>>   2. Check program run_time_ns;
>>   3. Sleep for the monitoring period;
>>   4. Check program run_time_ns again, calculate the difference;
>>   5. Close the fd.
>> Signed-off-by: Song Liu <songliubraving@fb.com>
> 
> Hmm, I see no relation to /dev/bpf_stats anymore, yet the subject still talks
> about it?

My fault. Will fix..

> 
> Also, should this have bpftool integration now that we have `bpftool prog profile`
> support? Would be nice to then fetch the related stats via bpf_prog_info, so users
> can consume this in an easy way.

We can add "run_time_ns" as a metric to "bpftool prog profile". But the 
mechanism is not the same though. Let me think about this. 

> 
>> Changes RFC => v2:
>> 1. Add a new bpf command instead of /dev/bpf_stats;
>> 2. Remove the jump_label patch, which is no longer needed;
>> 3. Add a static variable to save previous value of the sysctl.
>> ---
>>  include/linux/bpf.h            |  1 +
>>  include/uapi/linux/bpf.h       |  1 +
>>  kernel/bpf/syscall.c           | 43 ++++++++++++++++++++++++++++++++++
>>  kernel/sysctl.c                | 36 +++++++++++++++++++++++++++-
>>  tools/include/uapi/linux/bpf.h |  1 +
>>  5 files changed, 81 insertions(+), 1 deletion(-)
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 4fd91b7c95ea..d542349771df 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -970,6 +970,7 @@ _out:							\
>>    #ifdef CONFIG_BPF_SYSCALL
>>  DECLARE_PER_CPU(int, bpf_prog_active);
>> +extern struct mutex bpf_stats_enabled_mutex;
>>    /*
>>   * Block execution of BPF programs attached to instrumentation (perf,
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 40b2d9476268..8285ff37210c 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -111,6 +111,7 @@ enum bpf_cmd {
>>  	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
>>  	BPF_MAP_UPDATE_BATCH,
>>  	BPF_MAP_DELETE_BATCH,
>> +	BPF_ENABLE_RUNTIME_STATS,
>>  };
>>    enum bpf_map_type {
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index b2f73ecacced..823dc9de7953 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -24,6 +24,9 @@
>>  #include <linux/ctype.h>
>>  #include <linux/nospec.h>
>>  #include <linux/audit.h>
>> +#include <linux/miscdevice.h>
> 
> Is this still needed?

My fault again. Will fix. 

Thanks,
Song
Daniel Borkmann March 17, 2020, 8:03 p.m. UTC | #3
On 3/17/20 8:54 PM, Song Liu wrote:
>> On Mar 17, 2020, at 12:30 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 3/16/20 9:33 PM, Song Liu wrote:
>>> Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
>>> Typical userspace tools use kernel.bpf_stats_enabled as follows:
>>>    1. Enable kernel.bpf_stats_enabled;
>>>    2. Check program run_time_ns;
>>>    3. Sleep for the monitoring period;
>>>    4. Check program run_time_ns again, calculate the difference;
>>>    5. Disable kernel.bpf_stats_enabled.
>>> The problem with this approach is that only one userspace tool can toggle
>>> this sysctl. If multiple tools toggle the sysctl at the same time, the
>>> measurement may be inaccurate.
>>> To fix this problem while keep backward compatibility, introduce a new
>>> bpf command BPF_ENABLE_RUNTIME_STATS. On success, this command enables
>>> run_time_ns stats and returns a valid fd.
>>> With BPF_ENABLE_RUNTIME_STATS, user space tool would have the following
>>> flow:
>>>    1. Get a fd with BPF_ENABLE_RUNTIME_STATS, and make sure it is valid;
>>>    2. Check program run_time_ns;
>>>    3. Sleep for the monitoring period;
>>>    4. Check program run_time_ns again, calculate the difference;
>>>    5. Close the fd.
>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>
>> Hmm, I see no relation to /dev/bpf_stats anymore, yet the subject still talks
>> about it?
> 
> My fault. Will fix..
> 
>> Also, should this have bpftool integration now that we have `bpftool prog profile`
>> support? Would be nice to then fetch the related stats via bpf_prog_info, so users
>> can consume this in an easy way.
> 
> We can add "run_time_ns" as a metric to "bpftool prog profile". But the
> mechanism is not the same though. Let me think about this.

Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
potentially deprecate bpf_stats counters?

Thanks,
Daniel
Song Liu March 17, 2020, 8:04 p.m. UTC | #4
> On Mar 17, 2020, at 12:54 PM, Song Liu <songliubraving@fb.com> wrote:
> 
> 
> 
>> On Mar 17, 2020, at 12:30 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> 
>> On 3/16/20 9:33 PM, Song Liu wrote:
>>> Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
>>> Typical userspace tools use kernel.bpf_stats_enabled as follows:
>>>  1. Enable kernel.bpf_stats_enabled;
>>>  2. Check program run_time_ns;
>>>  3. Sleep for the monitoring period;
>>>  4. Check program run_time_ns again, calculate the difference;
>>>  5. Disable kernel.bpf_stats_enabled.
>>> The problem with this approach is that only one userspace tool can toggle
>>> this sysctl. If multiple tools toggle the sysctl at the same time, the
>>> measurement may be inaccurate.
>>> To fix this problem while keep backward compatibility, introduce a new
>>> bpf command BPF_ENABLE_RUNTIME_STATS. On success, this command enables
>>> run_time_ns stats and returns a valid fd.
>>> With BPF_ENABLE_RUNTIME_STATS, user space tool would have the following
>>> flow:
>>>  1. Get a fd with BPF_ENABLE_RUNTIME_STATS, and make sure it is valid;
>>>  2. Check program run_time_ns;
>>>  3. Sleep for the monitoring period;
>>>  4. Check program run_time_ns again, calculate the difference;
>>>  5. Close the fd.
>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> 
>> Hmm, I see no relation to /dev/bpf_stats anymore, yet the subject still talks
>> about it?
> 
> My fault. Will fix..
> 
>> 
>> Also, should this have bpftool integration now that we have `bpftool prog profile`
>> support? Would be nice to then fetch the related stats via bpf_prog_info, so users
>> can consume this in an easy way.
> 
> We can add "run_time_ns" as a metric to "bpftool prog profile". But the 
> mechanism is not the same though. Let me think about this. 

Btw, I plan to add this in separate set. Please let me know if it 
preferable to have them in the same set. 

Thanks,
Song
Song Liu March 17, 2020, 8:13 p.m. UTC | #5
> On Mar 17, 2020, at 1:03 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
> On 3/17/20 8:54 PM, Song Liu wrote:
>>> On Mar 17, 2020, at 12:30 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>> On 3/16/20 9:33 PM, Song Liu wrote:
>>>> Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
>>>> Typical userspace tools use kernel.bpf_stats_enabled as follows:
>>>>   1. Enable kernel.bpf_stats_enabled;
>>>>   2. Check program run_time_ns;
>>>>   3. Sleep for the monitoring period;
>>>>   4. Check program run_time_ns again, calculate the difference;
>>>>   5. Disable kernel.bpf_stats_enabled.
>>>> The problem with this approach is that only one userspace tool can toggle
>>>> this sysctl. If multiple tools toggle the sysctl at the same time, the
>>>> measurement may be inaccurate.
>>>> To fix this problem while keep backward compatibility, introduce a new
>>>> bpf command BPF_ENABLE_RUNTIME_STATS. On success, this command enables
>>>> run_time_ns stats and returns a valid fd.
>>>> With BPF_ENABLE_RUNTIME_STATS, user space tool would have the following
>>>> flow:
>>>>   1. Get a fd with BPF_ENABLE_RUNTIME_STATS, and make sure it is valid;
>>>>   2. Check program run_time_ns;
>>>>   3. Sleep for the monitoring period;
>>>>   4. Check program run_time_ns again, calculate the difference;
>>>>   5. Close the fd.
>>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>> 
>>> Hmm, I see no relation to /dev/bpf_stats anymore, yet the subject still talks
>>> about it?
>> My fault. Will fix..
>>> Also, should this have bpftool integration now that we have `bpftool prog profile`
>>> support? Would be nice to then fetch the related stats via bpf_prog_info, so users
>>> can consume this in an easy way.
>> We can add "run_time_ns" as a metric to "bpftool prog profile". But the
>> mechanism is not the same though. Let me think about this.
> 
> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
> potentially deprecate bpf_stats counters?

I think run_time_ns has its own value as a simple monitoring framework. We can
use it in tools like top (and variations). It will be easier for these tools to 
adopt run_time_ns than using fentry/fexit. 

On the other hand, in long term, we may include a few fentry/fexit based programs
in the kernel binary (or the rpm), so that these tools can use them easily. At
that time, we can fully deprecate run_time_ns. Maybe this is not too far away?

Thanks,
Song
Daniel Borkmann March 17, 2020, 9:47 p.m. UTC | #6
On 3/17/20 9:13 PM, Song Liu wrote:
>> On Mar 17, 2020, at 1:03 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 3/17/20 8:54 PM, Song Liu wrote:
>>>> On Mar 17, 2020, at 12:30 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 3/16/20 9:33 PM, Song Liu wrote:
>>>>> Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
>>>>> Typical userspace tools use kernel.bpf_stats_enabled as follows:
>>>>>    1. Enable kernel.bpf_stats_enabled;
>>>>>    2. Check program run_time_ns;
>>>>>    3. Sleep for the monitoring period;
>>>>>    4. Check program run_time_ns again, calculate the difference;
>>>>>    5. Disable kernel.bpf_stats_enabled.
>>>>> The problem with this approach is that only one userspace tool can toggle
>>>>> this sysctl. If multiple tools toggle the sysctl at the same time, the
>>>>> measurement may be inaccurate.
>>>>> To fix this problem while keep backward compatibility, introduce a new
>>>>> bpf command BPF_ENABLE_RUNTIME_STATS. On success, this command enables
>>>>> run_time_ns stats and returns a valid fd.
>>>>> With BPF_ENABLE_RUNTIME_STATS, user space tool would have the following
>>>>> flow:
>>>>>    1. Get a fd with BPF_ENABLE_RUNTIME_STATS, and make sure it is valid;
>>>>>    2. Check program run_time_ns;
>>>>>    3. Sleep for the monitoring period;
>>>>>    4. Check program run_time_ns again, calculate the difference;
>>>>>    5. Close the fd.
>>>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>>>
>>>> Hmm, I see no relation to /dev/bpf_stats anymore, yet the subject still talks
>>>> about it?
>>> My fault. Will fix..
>>>> Also, should this have bpftool integration now that we have `bpftool prog profile`
>>>> support? Would be nice to then fetch the related stats via bpf_prog_info, so users
>>>> can consume this in an easy way.
>>> We can add "run_time_ns" as a metric to "bpftool prog profile". But the
>>> mechanism is not the same though. Let me think about this.
>>
>> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
>> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
>> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
>> potentially deprecate bpf_stats counters?
> 
> I think run_time_ns has its own value as a simple monitoring framework. We can
> use it in tools like top (and variations). It will be easier for these tools to
> adopt run_time_ns than using fentry/fexit.

Agree that this is easier; I presume there is no such official integration today
in tools like top, right, or is there anything planned?

> On the other hand, in long term, we may include a few fentry/fexit based programs
> in the kernel binary (or the rpm), so that these tools can use them easily. At
> that time, we can fully deprecate run_time_ns. Maybe this is not too far away?

Did you check how feasible it is to have something like `bpftool prog profile top`
which then enables fentry/fexit for /all/ existing BPF programs in the system? It
could then sort the sample interval by run_cnt, cycles, cache misses, aggregated
runtime, etc in a top-like output. Wdyt?

Thanks,
Daniel
Song Liu March 17, 2020, 11:08 p.m. UTC | #7
> On Mar 17, 2020, at 2:47 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>> 
>>> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
>>> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
>>> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
>>> potentially deprecate bpf_stats counters?
>> I think run_time_ns has its own value as a simple monitoring framework. We can
>> use it in tools like top (and variations). It will be easier for these tools to
>> adopt run_time_ns than using fentry/fexit.
> 
> Agree that this is easier; I presume there is no such official integration today
> in tools like top, right, or is there anything planned?

Yes, we do want more supports in different tools to increase the visibility. 
Here is the effort for atop: https://github.com/Atoptool/atop/pull/88 .

I wasn't pushing push hard on this one mostly because the sysctl interface requires 
a user space "owner". 

> 
>> On the other hand, in long term, we may include a few fentry/fexit based programs
>> in the kernel binary (or the rpm), so that these tools can use them easily. At
>> that time, we can fully deprecate run_time_ns. Maybe this is not too far away?
> 
> Did you check how feasible it is to have something like `bpftool prog profile top`
> which then enables fentry/fexit for /all/ existing BPF programs in the system? It
> could then sort the sample interval by run_cnt, cycles, cache misses, aggregated
> runtime, etc in a top-like output. Wdyt?

I wonder whether we can achieve this with one bpf prog (or a trampoline) that covers
all BPF programs, like a trampoline inside __BPF_PROG_RUN()? 

For long term direction, I think we could compare two different approaches: add new 
tools (like bpftool prog profile top) vs. add BPF support to existing tools. The 
first approach is easier. The latter approach would show BPF information to users
who are not expecting BPF programs in the systems. For many sysadmins, seeing BPF
programs in top/ps, and controlling them via kill is more natural than learning
bpftool. What's your thought on this? 

Thanks,
Song
Song Liu March 18, 2020, 6:33 a.m. UTC | #8
> On Mar 17, 2020, at 4:08 PM, Song Liu <songliubraving@fb.com> wrote:
> 
> 
> 
>> On Mar 17, 2020, at 2:47 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> 
>>>> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
>>>> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
>>>> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
>>>> potentially deprecate bpf_stats counters?
>>> I think run_time_ns has its own value as a simple monitoring framework. We can
>>> use it in tools like top (and variations). It will be easier for these tools to
>>> adopt run_time_ns than using fentry/fexit.
>> 
>> Agree that this is easier; I presume there is no such official integration today
>> in tools like top, right, or is there anything planned?
> 
> Yes, we do want more supports in different tools to increase the visibility. 
> Here is the effort for atop: https://github.com/Atoptool/atop/pull/88 .
> 
> I wasn't pushing push hard on this one mostly because the sysctl interface requires 
> a user space "owner". 
> 
>> 
>>> On the other hand, in long term, we may include a few fentry/fexit based programs
>>> in the kernel binary (or the rpm), so that these tools can use them easily. At
>>> that time, we can fully deprecate run_time_ns. Maybe this is not too far away?
>> 
>> Did you check how feasible it is to have something like `bpftool prog profile top`
>> which then enables fentry/fexit for /all/ existing BPF programs in the system? It
>> could then sort the sample interval by run_cnt, cycles, cache misses, aggregated
>> runtime, etc in a top-like output. Wdyt?
> 
> I wonder whether we can achieve this with one bpf prog (or a trampoline) that covers
> all BPF programs, like a trampoline inside __BPF_PROG_RUN()? 
> 
> For long term direction, I think we could compare two different approaches: add new 
> tools (like bpftool prog profile top) vs. add BPF support to existing tools. The 
> first approach is easier. The latter approach would show BPF information to users
> who are not expecting BPF programs in the systems. For many sysadmins, seeing BPF
> programs in top/ps, and controlling them via kill is more natural than learning
> bpftool. What's your thought on this? 

More thoughts on this. 

If we have a special trampoline that attach to all BPF programs at once, we really 
don't need the run_time_ns stats anymore. Eventually, tools that monitor BPF 
programs will depend on libbpf, so using fentry/fexit to monitor BPF programs doesn't
introduce extra dependency. I guess we also need a way to include BPF program in 
libbpf. 

To summarize this plan, we need:

1) A global trampoline that attaches to all BPF programs at once;
2) Embed fentry/fexit program in libbpf, which will be used by tools for monitoring;
3) BPF helpers to read time, which replaces current run_time_ns. 

Does this look reasonable?

Thanks,
Song
Daniel Borkmann March 18, 2020, 8:58 p.m. UTC | #9
On 3/18/20 7:33 AM, Song Liu wrote:
>> On Mar 17, 2020, at 4:08 PM, Song Liu <songliubraving@fb.com> wrote:
>>> On Mar 17, 2020, at 2:47 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>
>>>>> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
>>>>> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
>>>>> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
>>>>> potentially deprecate bpf_stats counters?
>>>> I think run_time_ns has its own value as a simple monitoring framework. We can
>>>> use it in tools like top (and variations). It will be easier for these tools to
>>>> adopt run_time_ns than using fentry/fexit.
>>>
>>> Agree that this is easier; I presume there is no such official integration today
>>> in tools like top, right, or is there anything planned?
>>
>> Yes, we do want more supports in different tools to increase the visibility.
>> Here is the effort for atop: https://github.com/Atoptool/atop/pull/88 .
>>
>> I wasn't pushing push hard on this one mostly because the sysctl interface requires
>> a user space "owner".
>>
>>>> On the other hand, in long term, we may include a few fentry/fexit based programs
>>>> in the kernel binary (or the rpm), so that these tools can use them easily. At
>>>> that time, we can fully deprecate run_time_ns. Maybe this is not too far away?
>>>
>>> Did you check how feasible it is to have something like `bpftool prog profile top`
>>> which then enables fentry/fexit for /all/ existing BPF programs in the system? It
>>> could then sort the sample interval by run_cnt, cycles, cache misses, aggregated
>>> runtime, etc in a top-like output. Wdyt?
>>
>> I wonder whether we can achieve this with one bpf prog (or a trampoline) that covers
>> all BPF programs, like a trampoline inside __BPF_PROG_RUN()?
>>
>> For long term direction, I think we could compare two different approaches: add new
>> tools (like bpftool prog profile top) vs. add BPF support to existing tools. The
>> first approach is easier. The latter approach would show BPF information to users
>> who are not expecting BPF programs in the systems. For many sysadmins, seeing BPF
>> programs in top/ps, and controlling them via kill is more natural than learning
>> bpftool. What's your thought on this?
> 
> More thoughts on this.
> 
> If we have a special trampoline that attach to all BPF programs at once, we really
> don't need the run_time_ns stats anymore. Eventually, tools that monitor BPF
> programs will depend on libbpf, so using fentry/fexit to monitor BPF programs doesn't
> introduce extra dependency. I guess we also need a way to include BPF program in
> libbpf.
> 
> To summarize this plan, we need:
> 
> 1) A global trampoline that attaches to all BPF programs at once;

Overall sounds good, I think the `at once` part might be tricky, at least it would
need to patch one prog after another, each prog also needs to store its own metrics
somewhere for later collection. The start-to-sample could be a shared global var (aka
shared map between all the programs) which would flip the switch though.

> 2) Embed fentry/fexit program in libbpf, which will be used by tools for monitoring;
> 3) BPF helpers to read time, which replaces current run_time_ns.
> 
> Does this look reasonable?
> 
> Thanks,
> Song
>
Song Liu March 18, 2020, 9:20 p.m. UTC | #10
> On Mar 18, 2020, at 1:58 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
> On 3/18/20 7:33 AM, Song Liu wrote:
>>> On Mar 17, 2020, at 4:08 PM, Song Liu <songliubraving@fb.com> wrote:
>>>> On Mar 17, 2020, at 2:47 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>> 
>>>>>> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
>>>>>> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
>>>>>> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
>>>>>> potentially deprecate bpf_stats counters?
>>>>> I think run_time_ns has its own value as a simple monitoring framework. We can
>>>>> use it in tools like top (and variations). It will be easier for these tools to
>>>>> adopt run_time_ns than using fentry/fexit.
>>>> 
>>>> Agree that this is easier; I presume there is no such official integration today
>>>> in tools like top, right, or is there anything planned?
>>> 
>>> Yes, we do want more supports in different tools to increase the visibility.
>>> Here is the effort for atop: https://github.com/Atoptool/atop/pull/88 .
>>> 
>>> I wasn't pushing push hard on this one mostly because the sysctl interface requires
>>> a user space "owner".
>>> 
>>>>> On the other hand, in long term, we may include a few fentry/fexit based programs
>>>>> in the kernel binary (or the rpm), so that these tools can use them easily. At
>>>>> that time, we can fully deprecate run_time_ns. Maybe this is not too far away?
>>>> 
>>>> Did you check how feasible it is to have something like `bpftool prog profile top`
>>>> which then enables fentry/fexit for /all/ existing BPF programs in the system? It
>>>> could then sort the sample interval by run_cnt, cycles, cache misses, aggregated
>>>> runtime, etc in a top-like output. Wdyt?
>>> 
>>> I wonder whether we can achieve this with one bpf prog (or a trampoline) that covers
>>> all BPF programs, like a trampoline inside __BPF_PROG_RUN()?
>>> 
>>> For long term direction, I think we could compare two different approaches: add new
>>> tools (like bpftool prog profile top) vs. add BPF support to existing tools. The
>>> first approach is easier. The latter approach would show BPF information to users
>>> who are not expecting BPF programs in the systems. For many sysadmins, seeing BPF
>>> programs in top/ps, and controlling them via kill is more natural than learning
>>> bpftool. What's your thought on this?
>> More thoughts on this.
>> If we have a special trampoline that attach to all BPF programs at once, we really
>> don't need the run_time_ns stats anymore. Eventually, tools that monitor BPF
>> programs will depend on libbpf, so using fentry/fexit to monitor BPF programs doesn't
>> introduce extra dependency. I guess we also need a way to include BPF program in
>> libbpf.
>> To summarize this plan, we need:
>> 1) A global trampoline that attaches to all BPF programs at once;
> 
> Overall sounds good, I think the `at once` part might be tricky, at least it would
> need to patch one prog after another, each prog also needs to store its own metrics
> somewhere for later collection. The start-to-sample could be a shared global var (aka
> shared map between all the programs) which would flip the switch though.

I was thinking about adding bpf_global_trampoline and use it in __BPF_PROG_RUN. 
Something like:

diff --git i/include/linux/filter.h w/include/linux/filter.h
index 9b5aa5c483cc..ac9497d1fa7b 100644
--- i/include/linux/filter.h
+++ w/include/linux/filter.h
@@ -559,9 +559,14 @@ struct sk_filter {

 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

+extern struct bpf_trampoline *bpf_global_trampoline;
+DECLARE_STATIC_KEY_FALSE(bpf_global_tr_active);
+
 #define __BPF_PROG_RUN(prog, ctx, dfunc)       ({                      \
        u32 ret;                                                        \
        cant_migrate();                                                 \
+       if (static_branch_unlikely(&bpf_global_tr_active))              \
+               run_the_trampoline();                                   \
        if (static_branch_unlikely(&bpf_stats_enabled_key)) {           \
                struct bpf_prog_stats *stats;                           \
                u64 start = sched_clock();                              \


I am not 100% sure this is OK. 

I am also not sure whether this is an overkill. Do we really want more complex
metric for all BPF programs? Or run_time_ns is enough? 

Thanks,
Song
Stanislav Fomichev March 18, 2020, 10:29 p.m. UTC | #11
On 03/18, Song Liu wrote:
> 
> 
> > On Mar 18, 2020, at 1:58 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > 
> > On 3/18/20 7:33 AM, Song Liu wrote:
> >>> On Mar 17, 2020, at 4:08 PM, Song Liu <songliubraving@fb.com> wrote:
> >>>> On Mar 17, 2020, at 2:47 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>>> 
> >>>>>> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
> >>>>>> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
> >>>>>> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
> >>>>>> potentially deprecate bpf_stats counters?
> >>>>> I think run_time_ns has its own value as a simple monitoring framework. We can
> >>>>> use it in tools like top (and variations). It will be easier for these tools to
> >>>>> adopt run_time_ns than using fentry/fexit.
> >>>> 
> >>>> Agree that this is easier; I presume there is no such official integration today
> >>>> in tools like top, right, or is there anything planned?
> >>> 
> >>> Yes, we do want more supports in different tools to increase the visibility.
> >>> Here is the effort for atop: https://github.com/Atoptool/atop/pull/88 .
> >>> 
> >>> I wasn't pushing push hard on this one mostly because the sysctl interface requires
> >>> a user space "owner".
> >>> 
> >>>>> On the other hand, in long term, we may include a few fentry/fexit based programs
> >>>>> in the kernel binary (or the rpm), so that these tools can use them easily. At
> >>>>> that time, we can fully deprecate run_time_ns. Maybe this is not too far away?
> >>>> 
> >>>> Did you check how feasible it is to have something like `bpftool prog profile top`
> >>>> which then enables fentry/fexit for /all/ existing BPF programs in the system? It
> >>>> could then sort the sample interval by run_cnt, cycles, cache misses, aggregated
> >>>> runtime, etc in a top-like output. Wdyt?
> >>> 
> >>> I wonder whether we can achieve this with one bpf prog (or a trampoline) that covers
> >>> all BPF programs, like a trampoline inside __BPF_PROG_RUN()?
> >>> 
> >>> For long term direction, I think we could compare two different approaches: add new
> >>> tools (like bpftool prog profile top) vs. add BPF support to existing tools. The
> >>> first approach is easier. The latter approach would show BPF information to users
> >>> who are not expecting BPF programs in the systems. For many sysadmins, seeing BPF
> >>> programs in top/ps, and controlling them via kill is more natural than learning
> >>> bpftool. What's your thought on this?
> >> More thoughts on this.
> >> If we have a special trampoline that attach to all BPF programs at once, we really
> >> don't need the run_time_ns stats anymore. Eventually, tools that monitor BPF
> >> programs will depend on libbpf, so using fentry/fexit to monitor BPF programs doesn't
> >> introduce extra dependency. I guess we also need a way to include BPF program in
> >> libbpf.
> >> To summarize this plan, we need:
> >> 1) A global trampoline that attaches to all BPF programs at once;
> > 
> > Overall sounds good, I think the `at once` part might be tricky, at least it would
> > need to patch one prog after another, each prog also needs to store its own metrics
> > somewhere for later collection. The start-to-sample could be a shared global var (aka
> > shared map between all the programs) which would flip the switch though.
> 
> I was thinking about adding bpf_global_trampoline and use it in __BPF_PROG_RUN. 
> Something like:
> 
> diff --git i/include/linux/filter.h w/include/linux/filter.h
> index 9b5aa5c483cc..ac9497d1fa7b 100644
> --- i/include/linux/filter.h
> +++ w/include/linux/filter.h
> @@ -559,9 +559,14 @@ struct sk_filter {
> 
>  DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> 
> +extern struct bpf_trampoline *bpf_global_trampoline;
> +DECLARE_STATIC_KEY_FALSE(bpf_global_tr_active);
> +
>  #define __BPF_PROG_RUN(prog, ctx, dfunc)       ({                      \
>         u32 ret;                                                        \
>         cant_migrate();                                                 \
> +       if (static_branch_unlikely(&bpf_global_tr_active))              \
> +               run_the_trampoline();                                   \
>         if (static_branch_unlikely(&bpf_stats_enabled_key)) {           \
>                 struct bpf_prog_stats *stats;                           \
>                 u64 start = sched_clock();                              \
> 
> 
> I am not 100% sure this is OK. 
> 
> I am also not sure whether this is an overkill. Do we really want more complex
> metric for all BPF programs? Or run_time_ns is enough? 
I was thinking about exporting a real distribution of the prog runtimes
instead of doing an average. It would be interesting to see
50%/95%/99%/max stats.
Song Liu March 18, 2020, 11:45 p.m. UTC | #12
> On Mar 18, 2020, at 3:29 PM, Stanislav Fomichev <sdf@fomichev.me> wrote:
> 
> On 03/18, Song Liu wrote:
>> 
>> 
>>> On Mar 18, 2020, at 1:58 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>> 
>>> On 3/18/20 7:33 AM, Song Liu wrote:
>>>>> On Mar 17, 2020, at 4:08 PM, Song Liu <songliubraving@fb.com> wrote:
>>>>>> On Mar 17, 2020, at 2:47 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>>>> 
>>>>>>>> Hm, true as well. Wouldn't long-term extending "bpftool prog profile" fentry/fexit
>>>>>>>> programs supersede this old bpf_stats infrastructure? Iow, can't we implement the
>>>>>>>> same (or even more elaborate stats aggregation) in BPF via fentry/fexit and then
>>>>>>>> potentially deprecate bpf_stats counters?
>>>>>>> I think run_time_ns has its own value as a simple monitoring framework. We can
>>>>>>> use it in tools like top (and variations). It will be easier for these tools to
>>>>>>> adopt run_time_ns than using fentry/fexit.
>>>>>> 
>>>>>> Agree that this is easier; I presume there is no such official integration today
>>>>>> in tools like top, right, or is there anything planned?
>>>>> 
>>>>> Yes, we do want more supports in different tools to increase the visibility.
>>>>> Here is the effort for atop: https://github.com/Atoptool/atop/pull/88 .
>>>>> 
>>>>> I wasn't pushing push hard on this one mostly because the sysctl interface requires
>>>>> a user space "owner".
>>>>> 
>>>>>>> On the other hand, in long term, we may include a few fentry/fexit based programs
>>>>>>> in the kernel binary (or the rpm), so that these tools can use them easily. At
>>>>>>> that time, we can fully deprecate run_time_ns. Maybe this is not too far away?
>>>>>> 
>>>>>> Did you check how feasible it is to have something like `bpftool prog profile top`
>>>>>> which then enables fentry/fexit for /all/ existing BPF programs in the system? It
>>>>>> could then sort the sample interval by run_cnt, cycles, cache misses, aggregated
>>>>>> runtime, etc in a top-like output. Wdyt?
>>>>> 
>>>>> I wonder whether we can achieve this with one bpf prog (or a trampoline) that covers
>>>>> all BPF programs, like a trampoline inside __BPF_PROG_RUN()?
>>>>> 
>>>>> For long term direction, I think we could compare two different approaches: add new
>>>>> tools (like bpftool prog profile top) vs. add BPF support to existing tools. The
>>>>> first approach is easier. The latter approach would show BPF information to users
>>>>> who are not expecting BPF programs in the systems. For many sysadmins, seeing BPF
>>>>> programs in top/ps, and controlling them via kill is more natural than learning
>>>>> bpftool. What's your thought on this?
>>>> More thoughts on this.
>>>> If we have a special trampoline that attach to all BPF programs at once, we really
>>>> don't need the run_time_ns stats anymore. Eventually, tools that monitor BPF
>>>> programs will depend on libbpf, so using fentry/fexit to monitor BPF programs doesn't
>>>> introduce extra dependency. I guess we also need a way to include BPF program in
>>>> libbpf.
>>>> To summarize this plan, we need:
>>>> 1) A global trampoline that attaches to all BPF programs at once;
>>> 
>>> Overall sounds good, I think the `at once` part might be tricky, at least it would
>>> need to patch one prog after another, each prog also needs to store its own metrics
>>> somewhere for later collection. The start-to-sample could be a shared global var (aka
>>> shared map between all the programs) which would flip the switch though.
>> 
>> I was thinking about adding bpf_global_trampoline and use it in __BPF_PROG_RUN. 
>> Something like:
>> 
>> diff --git i/include/linux/filter.h w/include/linux/filter.h
>> index 9b5aa5c483cc..ac9497d1fa7b 100644
>> --- i/include/linux/filter.h
>> +++ w/include/linux/filter.h
>> @@ -559,9 +559,14 @@ struct sk_filter {
>> 
>> DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>> 
>> +extern struct bpf_trampoline *bpf_global_trampoline;
>> +DECLARE_STATIC_KEY_FALSE(bpf_global_tr_active);
>> +
>> #define __BPF_PROG_RUN(prog, ctx, dfunc)       ({                      \
>>        u32 ret;                                                        \
>>        cant_migrate();                                                 \
>> +       if (static_branch_unlikely(&bpf_global_tr_active))              \
>> +               run_the_trampoline();                                   \
>>        if (static_branch_unlikely(&bpf_stats_enabled_key)) {           \
>>                struct bpf_prog_stats *stats;                           \
>>                u64 start = sched_clock();                              \
>> 
>> 
>> I am not 100% sure this is OK. 
>> 
>> I am also not sure whether this is an overkill. Do we really want more complex
>> metric for all BPF programs? Or run_time_ns is enough? 
> I was thinking about exporting a real distribution of the prog runtimes
> instead of doing an average. It would be interesting to see
> 50%/95%/99%/max stats.

Good point. Distribution logic fits well in fentry/fexit programs. 

Let me think more about this. 

Thanks,
Song
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fd91b7c95ea..d542349771df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -970,6 +970,7 @@  _out:							\
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
+extern struct mutex bpf_stats_enabled_mutex;
 
 /*
  * Block execution of BPF programs attached to instrumentation (perf,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 40b2d9476268..8285ff37210c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -111,6 +111,7 @@  enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
 	BPF_MAP_UPDATE_BATCH,
 	BPF_MAP_DELETE_BATCH,
+	BPF_ENABLE_RUNTIME_STATS,
 };
 
 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b2f73ecacced..823dc9de7953 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -24,6 +24,9 @@ 
 #include <linux/ctype.h>
 #include <linux/nospec.h>
 #include <linux/audit.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/jump_label.h>
 #include <uapi/linux/btf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -3550,6 +3553,43 @@  static int bpf_map_do_batch(const union bpf_attr *attr,
 	return err;
 }
 
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+	mutex_lock(&bpf_stats_enabled_mutex);
+	static_key_slow_dec(&bpf_stats_enabled_key.key);
+	mutex_unlock(&bpf_stats_enabled_mutex);
+	return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+	.release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+	int fd;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&bpf_stats_enabled_mutex);
+	/* Set a very high limit to avoid overflow */
+	if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+		mutex_unlock(&bpf_stats_enabled_mutex);
+		return -EBUSY;
+	}
+
+	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, 0);
+	if (fd >= 0)
+		static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+	mutex_unlock(&bpf_stats_enabled_mutex);
+	return fd;
+}
+
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -3660,6 +3700,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_DELETE_BATCH:
 		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
 		break;
+	case BPF_ENABLE_RUNTIME_STATS:
+		err = bpf_enable_runtime_stats();
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ad5b88a53c5a..14613d1e0b88 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -316,6 +316,40 @@  static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
 #endif
 
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp,
+			     loff_t *ppos)
+{
+	struct static_key *key = (struct static_key *)table->data;
+	int val, ret;
+	static int saved_val;
+	struct ctl_table tmp = {
+		.data   = &val,
+		.maxlen = sizeof(val),
+		.mode   = table->mode,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	};
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&bpf_stats_enabled_mutex);
+	val = saved_val;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret && val != saved_val) {
+		if (val)
+			static_key_slow_inc(key);
+		else
+			static_key_slow_dec(key);
+		saved_val = val;
+	}
+	mutex_unlock(&bpf_stats_enabled_mutex);
+	return ret;
+}
+#endif
+
 static struct ctl_table kern_table[] = {
 	{
 		.procname	= "sched_child_runs_first",
@@ -1256,7 +1290,7 @@  static struct ctl_table kern_table[] = {
 		.data		= &bpf_stats_enabled_key.key,
 		.maxlen		= sizeof(bpf_stats_enabled_key),
 		.mode		= 0644,
-		.proc_handler	= proc_do_static_key,
+		.proc_handler	= bpf_stats_handler,
 	},
 #endif
 #if defined(CONFIG_TREE_RCU)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 40b2d9476268..8285ff37210c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -111,6 +111,7 @@  enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
 	BPF_MAP_UPDATE_BATCH,
 	BPF_MAP_DELETE_BATCH,
+	BPF_ENABLE_RUNTIME_STATS,
 };
 
 enum bpf_map_type {