
[v5,perf,bpf-next,3/7] perf, bpf: introduce PERF_RECORD_BPF_EVENT

Message ID: 20181220182904.4193196-4-songliubraving@fb.com
State: Changes Requested, archived
Delegated to: BPF Maintainers
Series: reveal invisible bpf programs

Commit Message

Song Liu Dec. 20, 2018, 6:29 p.m. UTC
For better performance analysis of BPF programs, this patch introduces
PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
load/unload information to user space.

Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
The following example shows kernel symbols for a BPF program with 7
sub programs:

    ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
    ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
    ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
    ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
    ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
    ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
    ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
    ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi

When a bpf program is loaded, PERF_RECORD_KSYMBOL is generated for
each of these sub programs. Therefore, PERF_RECORD_BPF_EVENT is not
needed for simple profiling.

For annotation, user space needs to listen to PERF_RECORD_BPF_EVENT
and gather more information about these (sub) programs via sys_bpf.
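
A rough user space sketch of that flow (illustrative only; the record
struct mirrors the layout documented in the uapi header, and
record_prog_info() is a placeholder for whatever the tool does with the
data):

    /* needs <linux/bpf.h>, <linux/perf_event.h>, <string.h>,
     * <sys/syscall.h>, <unistd.h>
     */
    struct bpf_event_record {
        struct perf_event_header header;
        __u16 type;
        __u16 flags;
        __u32 id;
        __u8  tag[BPF_TAG_SIZE];
        /* struct sample_id follows */
    };

    extern void record_prog_info(__u32 id, struct bpf_prog_info *info);

    static void handle_bpf_event(struct bpf_event_record *rec)
    {
        union bpf_attr attr = {};
        struct bpf_prog_info info = {};
        int fd;

        if (rec->type != PERF_BPF_EVENT_PROG_LOAD)
            return;  /* on UNLOAD the program is already gone */

        attr.prog_id = rec->id;
        fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
        if (fd < 0)
            return;  /* short-lived prog: already unloaded */

        memset(&attr, 0, sizeof(attr));
        attr.info.bpf_fd = fd;
        attr.info.info_len = sizeof(info);
        attr.info.info = (__u64)(unsigned long)&info;
        if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)) == 0)
            record_prog_info(rec->id, &info);  /* hypothetical sink */

        close(fd);
    }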

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/filter.h          |   7 ++
 include/linux/perf_event.h      |   6 ++
 include/uapi/linux/perf_event.h |  29 +++++++-
 kernel/bpf/core.c               |   2 +-
 kernel/bpf/syscall.c            |   2 +
 kernel/events/core.c            | 120 ++++++++++++++++++++++++++++++++
 6 files changed, 164 insertions(+), 2 deletions(-)

Comments

Peter Zijlstra Jan. 8, 2019, 6:41 p.m. UTC | #1
On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
> @@ -986,9 +987,35 @@ enum perf_event_type {
>  	 */
>  	PERF_RECORD_KSYMBOL			= 17,
>  
> +	/*
> +	 * Record bpf events:
> +	 *  enum perf_bpf_event_type {
> +	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
> +	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
> +	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
> +	 *  };
> +	 *
> +	 * struct {
> +	 *	struct perf_event_header	header;
> +	 *	u16				type;
> +	 *	u16				flags;
> +	 *	u32				id;
> +	 *	u8				tag[BPF_TAG_SIZE];
> +	 *	struct sample_id		sample_id;
> +	 * };
> +	 */
> +	PERF_RECORD_BPF_EVENT			= 18,
> +

Elsewhere today, I raised the point that by the time (however short
interval) userspace gets around to reading this event, the actual
program could be gone again.

In this case the program has been with us for a very short period
indeed; but it could still have generated some samples or otherwise
generated trace data.

It was suggested to allow pinning modules/programs to avoid this
situation, but that of course has other undesirable effects, such as a
trivial DoS.

A truly horrible hack would be to include an open filedesc in the event
that needs closing to release the resource, but I'm sorry for even
suggesting that **shudder**.

Do we have any sane ideas?
Song Liu Jan. 8, 2019, 7:10 p.m. UTC | #2
> On Jan 8, 2019, at 10:41 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
>> @@ -986,9 +987,35 @@ enum perf_event_type {
>> 	 */
>> 	PERF_RECORD_KSYMBOL			= 17,
>> 
>> +	/*
>> +	 * Record bpf events:
>> +	 *  enum perf_bpf_event_type {
>> +	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
>> +	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
>> +	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
>> +	 *  };
>> +	 *
>> +	 * struct {
>> +	 *	struct perf_event_header	header;
>> +	 *	u16				type;
>> +	 *	u16				flags;
>> +	 *	u32				id;
>> +	 *	u8				tag[BPF_TAG_SIZE];
>> +	 *	struct sample_id		sample_id;
>> +	 * };
>> +	 */
>> +	PERF_RECORD_BPF_EVENT			= 18,
>> +
> 
> Elsewhere today, I raised the point that by the time (however short
> interval) userspace gets around to reading this event, the actual
> program could be gone again.
> 
> In this case the program has been with us for a very short period
> indeed; but it could still have generated some samples or otherwise
> generated trace data.

Since we already have the separate KSYMBOL events, BPF_EVENT is only 
required for advanced use cases, like annotation. So I guess missing 
it for very short-lived programs should not be a huge problem?

> It was suggested to allow pinning modules/programs to avoid this
> situation, but that of course has other undesirable effects, such as a
> trivial DoS.
> 
> A truly horrible hack would be to include an open filedesc in the event
> that needs closing to release the resource, but I'm sorry for even
> suggesting that **shudder**.
> 
> Do we have any sane ideas?

How about we gate the open filedesc solution with an option, and limit
that option to root only? If this still sounds hacky, maybe we should
just ignore it when short-lived programs are missed?

Thanks,
Song
Peter Zijlstra Jan. 8, 2019, 7:43 p.m. UTC | #3
On Tue, Jan 08, 2019 at 07:10:20PM +0000, Song Liu wrote:
> > On Jan 8, 2019, at 10:41 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
> >> @@ -986,9 +987,35 @@ enum perf_event_type {
> >> 	 */
> >> 	PERF_RECORD_KSYMBOL			= 17,
> >> 
> >> +	/*
> >> +	 * Record bpf events:
> >> +	 *  enum perf_bpf_event_type {
> >> +	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
> >> +	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
> >> +	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
> >> +	 *  };
> >> +	 *
> >> +	 * struct {
> >> +	 *	struct perf_event_header	header;
> >> +	 *	u16				type;
> >> +	 *	u16				flags;
> >> +	 *	u32				id;
> >> +	 *	u8				tag[BPF_TAG_SIZE];
> >> +	 *	struct sample_id		sample_id;
> >> +	 * };
> >> +	 */
> >> +	PERF_RECORD_BPF_EVENT			= 18,
> >> +
> > 
> > Elsewhere today, I raised the point that by the time (however short
> > interval) userspace gets around to reading this event, the actual
> > program could be gone again.
> > 
> > In this case the program has been with us for a very short period
> > indeed; but it could still have generated some samples or otherwise
> > generated trace data.
> 
> Since we already have the separate KSYMBOL events, BPF_EVENT is only 
> required for advanced use cases, like annotation. So I guess missing 
> it for very short-lived programs should not be a huge problem?
> 
> > It was suggested to allow pinning modules/programs to avoid this
> > situation, but that of course has other undesirable effects, such as a
> > trivial DoS.
> > 
> > A truly horrible hack would be to include an open filedesc in the event
> > that needs closing to release the resource, but I'm sorry for even
> > suggesting that **shudder**.
> > 
> > Do we have any sane ideas?
> 
> How about we gate the open filedesc solution with an option, and limit
> that option to root only? If this still sounds hacky, maybe we should
> just ignore it when short-lived programs are missed?

I'm afraid we might also 'need' this for the kallsym thing.

The problem is that things like Intel PT (ARM Coresight too IIRC) encode
a bitstream of branch-taken decisions. The only way to decode that and
reconstruct the actual code-flow is with an exact matching text image.

In order to have this matching text we need to be able to copy out every
piece of dynamic text (from kcore) that has ever executed before it
disappears.

Elsewhere (*), Andi suggests having a kind of text-free fence
interface, where userspace can issue a 'complete' call. And I suppose as
long as we know there is a consumer, we also know we'll not be blocked
indefinitely. So it would have to be slightly more complicated than
suggested, but I think that is something we could work with.

It would also not complicate these events.



[*] https://lkml.kernel.org/r/20190108172721.GN6118@tassilo.jf.intel.com
Peter Zijlstra Jan. 8, 2019, 7:59 p.m. UTC | #4
On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
> +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
> +					 enum perf_bpf_event_type type)
> +{
> +	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
> +	int i;
> +
> +	if (prog->aux->func_cnt == 0) {
> +		perf_event_ksymbol(PERF_RECORD_MISC_KSYMBOL_TYPE_BPF,
> +				   (u64)(unsigned long)prog->bpf_func,
> +				   prog->jited_len, unregister,
> +				   perf_event_bpf_get_name, prog);
> +	} else {
> +		for (i = 0; i < prog->aux->func_cnt; i++) {
> +			struct bpf_prog *subprog = prog->aux->func[i];
> +
> +			perf_event_ksymbol(
> +				PERF_RECORD_MISC_KSYMBOL_TYPE_BPF,
> +				(u64)(unsigned long)subprog->bpf_func,
> +				subprog->jited_len, unregister,
> +				perf_event_bpf_get_name, subprog);
> +		}
> +	}
> +}

That's a bit unexpected, but yes sure, that works for now.

I was expecting it to be hooked up in your kallsym rbtree thing, but
whatever, we can fix that when needed.
Arnaldo Carvalho de Melo Jan. 8, 2019, 8:16 p.m. UTC | #5
Em Tue, Jan 08, 2019 at 07:10:20PM +0000, Song Liu escreveu:
> > On Jan 8, 2019, at 10:41 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
> >> @@ -986,9 +987,35 @@ enum perf_event_type {
> >> 	 */
> >> 	PERF_RECORD_KSYMBOL			= 17,
> >> 
> >> +	/*
> >> +	 * Record bpf events:
> >> +	 *  enum perf_bpf_event_type {
> >> +	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
> >> +	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
> >> +	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
> >> +	 *  };
> >> +	 *
> >> +	 * struct {
> >> +	 *	struct perf_event_header	header;
> >> +	 *	u16				type;
> >> +	 *	u16				flags;
> >> +	 *	u32				id;
> >> +	 *	u8				tag[BPF_TAG_SIZE];
> >> +	 *	struct sample_id		sample_id;
> >> +	 * };
> >> +	 */
> >> +	PERF_RECORD_BPF_EVENT			= 18,

> > It was suggested to allow pinning modules/programs to avoid this
> > situation, but that of course has other undesirable effects, such as a
> > trivial DoS.
> > 
> > A truly horrible hack would be to include an open filedesc in the event
> > that needs closing to release the resource, but I'm sorry for even
> > suggesting that **shudder**.
> > 
> > Do we have any sane ideas?
> 
> How about we gate the open filedesc solution with an option, and limit
> that option for root only? If this still sounds hacky, maybe we should
> just ignore when short-living programs are missed?

Short-lived short programs could go in the event? Short-lived long
ones... One could ask for a max number of bytes of binary?

The smallest kernel modules are 16KB, a multiple of PAGE_SIZE:

[acme@quaco perf]$ cat /proc/modules | sort -k2 -nr | tail
ebtable_nat 16384 1 - Live 0x0000000000000000
ebtable_filter 16384 1 - Live 0x0000000000000000
crct10dif_pclmul 16384 0 - Live 0x0000000000000000
crc32_pclmul 16384 0 - Live 0x0000000000000000
coretemp 16384 0 - Live 0x0000000000000000
btrtl 16384 1 btusb, Live 0x0000000000000000
btbcm 16384 1 btusb, Live 0x0000000000000000
arc4 16384 2 - Live 0x0000000000000000
acpi_thermal_rel 16384 1 int3400_thermal, Live 0x0000000000000000
ac97_bus 16384 1 snd_soc_core, Live 0x0000000000000000
[acme@quaco perf]$

On a Fedora 29 I have these here, all rather small:

# bpftool prog
13: cgroup_skb  tag 7be49e3934a125ba  gpl
	loaded_at 2019-01-04T14:40:32-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 13,14
14: cgroup_skb  tag 2a142ef67aaad174  gpl
	loaded_at 2019-01-04T14:40:32-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 13,14
15: cgroup_skb  tag 7be49e3934a125ba  gpl
	loaded_at 2019-01-04T14:40:32-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 15,16
16: cgroup_skb  tag 2a142ef67aaad174  gpl
	loaded_at 2019-01-04T14:40:32-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 15,16
17: cgroup_skb  tag 7be49e3934a125ba  gpl
	loaded_at 2019-01-04T14:40:43-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 17,18
18: cgroup_skb  tag 2a142ef67aaad174  gpl
	loaded_at 2019-01-04T14:40:43-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 17,18
21: cgroup_skb  tag 7be49e3934a125ba  gpl
	loaded_at 2019-01-04T14:40:43-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 21,22
22: cgroup_skb  tag 2a142ef67aaad174  gpl
	loaded_at 2019-01-04T14:40:43-0300  uid 0
	xlated 296B  jited 229B  memlock 4096B  map_ids 21,22
[root@quaco IRPF2018]#


Running 'perf trace' with its BPF augmenter gets these two more:

158: tracepoint  name sys_enter  tag 12504ba9402f952f  gpl
	loaded_at 2019-01-08T17:12:39-0300  uid 0
	xlated 512B  jited 374B  memlock 4096B  map_ids 118,117,116
159: tracepoint  name sys_exit  tag c1bd85c092d6e4aa  gpl
	loaded_at 2019-01-08T17:12:39-0300  uid 0
	xlated 256B  jited 191B  memlock 4096B  map_ids 118,117
[root@quaco ~]#

A PERF_RECORD_MMAP gets as its payload up to PATH_MAX - sizeof(u64).

So for a class of programs, shoving it together with a
PERF_RECORD_MMAP-like event may be enough?

You started the shuddering suggestions... ;-)

- Arnaldo
Peter Zijlstra Jan. 8, 2019, 8:29 p.m. UTC | #6
On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
> The following example shows kernel symbols for a BPF program with 7
> sub programs:
> 
>     ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
>     ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
>     ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
>     ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
>     ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
>     ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
>     ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
>     ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi

We should probably specify somewhere that the name can include a
'[module]' part just like normal kallsyms. Even though you don't
currently use that.
Alexei Starovoitov Jan. 8, 2019, 8:45 p.m. UTC | #7
On 1/8/19 12:29 PM, Peter Zijlstra wrote:
> On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
>> The following example shows kernel symbols for a BPF program with 7
>> sub programs:
>>
>>      ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
>>      ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
>>      ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
>>      ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
>>      ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
>>      ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
>>      ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
>>      ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi
> 
> We should probably specify somewhere that the name can include a
> '[module]' part just like normal kallsyms. Even though you don't
> currently use that.

there is no [module] equivalent in bpf land.
The progs loaded by different users can be shared.
There is no strict tree hierarchy (like in modules)
where there is one root and a bunch of functions underneath.
In bpf all these functions form a graph and can call each other.
Same with maps that are shared by different progs of different types.
For example, a networking prog and a tracing prog can share a common map.
Alexei Starovoitov Jan. 8, 2019, 8:56 p.m. UTC | #8
On 1/8/19 10:41 AM, Peter Zijlstra wrote:
> On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
>> @@ -986,9 +987,35 @@ enum perf_event_type {
>>   	 */
>>   	PERF_RECORD_KSYMBOL			= 17,
>>   
>> +	/*
>> +	 * Record bpf events:
>> +	 *  enum perf_bpf_event_type {
>> +	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
>> +	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
>> +	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
>> +	 *  };
>> +	 *
>> +	 * struct {
>> +	 *	struct perf_event_header	header;
>> +	 *	u16				type;
>> +	 *	u16				flags;
>> +	 *	u32				id;
>> +	 *	u8				tag[BPF_TAG_SIZE];
>> +	 *	struct sample_id		sample_id;
>> +	 * };
>> +	 */
>> +	PERF_RECORD_BPF_EVENT			= 18,
>> +
> 
> Elsewhere today, I raised the point that by the time (however short
> interval) userspace gets around to reading this event, the actual
> program could be gone again.
> 
> In this case the program has been with us for a very short period
> indeed; but it could still have generated some samples or otherwise
> generated trace data.
> 
> It was suggested to allow pinning modules/programs to avoid this
> situation, but that of course has other undesirable effects, such as a
> trivial DoS.
> 
> A truly horrible hack would be to include an open filedesc in the event
> that needs closing to release the resource, but I'm sorry for even
> suggesting that **shudder**.
> 
> Do we have any sane ideas?

I think we should miss such ultra-short-lived progs.
If perf record is slower than some user doing for(;;) {load prog;
unload;} then it's a good thing.
I frankly don't think it's practically possible to miss a prog.
perf ring buffer is way faster than prog load/unload.
Even when all scheduling artifacts of perf user space are factored in.
Doing barriers is DoS-able.
Sending an FD via the ring buffer is not possible.
If there was a non-shudder solution to this we could do it,
but all these complications point out that this is not a problem
worth solving.
At least, since right now none of us sees a clean fix, I propose
to move ahead with what we have and address it later if better ideas
come up. We've been missing short-lived progs and prog notifications
all this time and users complain. That is the problem to address.
Song Liu Jan. 8, 2019, 11:37 p.m. UTC | #9
> On Jan 8, 2019, at 12:16 PM, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> 
> Em Tue, Jan 08, 2019 at 07:10:20PM +0000, Song Liu escreveu:
>>> On Jan 8, 2019, at 10:41 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
>>>> @@ -986,9 +987,35 @@ enum perf_event_type {
>>>> 	 */
>>>> 	PERF_RECORD_KSYMBOL			= 17,
>>>> 
>>>> +	/*
>>>> +	 * Record bpf events:
>>>> +	 *  enum perf_bpf_event_type {
>>>> +	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
>>>> +	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
>>>> +	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
>>>> +	 *  };
>>>> +	 *
>>>> +	 * struct {
>>>> +	 *	struct perf_event_header	header;
>>>> +	 *	u16				type;
>>>> +	 *	u16				flags;
>>>> +	 *	u32				id;
>>>> +	 *	u8				tag[BPF_TAG_SIZE];
>>>> +	 *	struct sample_id		sample_id;
>>>> +	 * };
>>>> +	 */
>>>> +	PERF_RECORD_BPF_EVENT			= 18,
> 
>>> It was suggested to allow pinning modules/programs to avoid this
>>> situation, but that of course has other undesirable effects, such as a
>>> trivial DoS.
>>> 
>>> A truly horrible hack would be to include an open filedesc in the event
>>> that needs closing to release the resource, but I'm sorry for even
>>> suggesting that **shudder**.
>>> 
>>> Do we have any sane ideas?
>> 
>> How about we gate the open filedesc solution with an option, and limit
>> that option to root only? If this still sounds hacky, maybe we should
>> just ignore it when short-lived programs are missed?
> 
> Short-lived short programs could go in the event? Short-lived long
> ones... One could ask for a max number of bytes of binary?
> 
> The smallest kernel modules are 16KB, a multiple of PAGE_SIZE:
> 
> [acme@quaco perf]$ cat /proc/modules | sort -k2 -nr | tail
> ebtable_nat 16384 1 - Live 0x0000000000000000
> ebtable_filter 16384 1 - Live 0x0000000000000000
> crct10dif_pclmul 16384 0 - Live 0x0000000000000000
> crc32_pclmul 16384 0 - Live 0x0000000000000000
> coretemp 16384 0 - Live 0x0000000000000000
> btrtl 16384 1 btusb, Live 0x0000000000000000
> btbcm 16384 1 btusb, Live 0x0000000000000000
> arc4 16384 2 - Live 0x0000000000000000
> acpi_thermal_rel 16384 1 int3400_thermal, Live 0x0000000000000000
> ac97_bus 16384 1 snd_soc_core, Live 0x0000000000000000
> [acme@quaco perf]$
> 
> On a Fedora 29 I have these here, all rather small:
> 
> # bpftool prog
> 13: cgroup_skb  tag 7be49e3934a125ba  gpl
> 	loaded_at 2019-01-04T14:40:32-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 13,14
> 14: cgroup_skb  tag 2a142ef67aaad174  gpl
> 	loaded_at 2019-01-04T14:40:32-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 13,14
> 15: cgroup_skb  tag 7be49e3934a125ba  gpl
> 	loaded_at 2019-01-04T14:40:32-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 15,16
> 16: cgroup_skb  tag 2a142ef67aaad174  gpl
> 	loaded_at 2019-01-04T14:40:32-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 15,16
> 17: cgroup_skb  tag 7be49e3934a125ba  gpl
> 	loaded_at 2019-01-04T14:40:43-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 17,18
> 18: cgroup_skb  tag 2a142ef67aaad174  gpl
> 	loaded_at 2019-01-04T14:40:43-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 17,18
> 21: cgroup_skb  tag 7be49e3934a125ba  gpl
> 	loaded_at 2019-01-04T14:40:43-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 21,22
> 22: cgroup_skb  tag 2a142ef67aaad174  gpl
> 	loaded_at 2019-01-04T14:40:43-0300  uid 0
> 	xlated 296B  jited 229B  memlock 4096B  map_ids 21,22
> [root@quaco IRPF2018]#
> 
> 
> Running 'perf trace' with its BPF augmenter gets these two more:
> 
> 158: tracepoint  name sys_enter  tag 12504ba9402f952f  gpl
> 	loaded_at 2019-01-08T17:12:39-0300  uid 0
> 	xlated 512B  jited 374B  memlock 4096B  map_ids 118,117,116
> 159: tracepoint  name sys_exit  tag c1bd85c092d6e4aa  gpl
> 	loaded_at 2019-01-08T17:12:39-0300  uid 0
> 	xlated 256B  jited 191B  memlock 4096B  map_ids 118,117
> [root@quaco ~]#
> 
> A PERF_RECORD_MMAP gets as its payload up to PATH_MAX - sizeof(u64).
> 
> So for a class of programs, shoving it together with a
> PERF_RECORD_MMAP-like event may be enough?
> 
> You started the shuddering suggestions... ;-)
> 
> - Arnaldo

Besides the cited binary, we are adding more information for each
BPF program, including source code. So even a short program could
easily exceed PATH_MAX...

Song
Song Liu Jan. 8, 2019, 11:54 p.m. UTC | #10
> On Jan 8, 2019, at 11:43 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, Jan 08, 2019 at 07:10:20PM +0000, Song Liu wrote:
>>> On Jan 8, 2019, at 10:41 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
>>>> @@ -986,9 +987,35 @@ enum perf_event_type {
>>>> 	 */
>>>> 	PERF_RECORD_KSYMBOL			= 17,
>>>> 
>>>> +	/*
>>>> +	 * Record bpf events:
>>>> +	 *  enum perf_bpf_event_type {
>>>> +	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
>>>> +	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
>>>> +	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
>>>> +	 *  };
>>>> +	 *
>>>> +	 * struct {
>>>> +	 *	struct perf_event_header	header;
>>>> +	 *	u16				type;
>>>> +	 *	u16				flags;
>>>> +	 *	u32				id;
>>>> +	 *	u8				tag[BPF_TAG_SIZE];
>>>> +	 *	struct sample_id		sample_id;
>>>> +	 * };
>>>> +	 */
>>>> +	PERF_RECORD_BPF_EVENT			= 18,
>>>> +
>>> 
>>> Elsewhere today, I raised the point that by the time (however short
>>> interval) userspace gets around to reading this event, the actual
>>> program could be gone again.
>>> 
>>> In this case the program has been with us for a very short period
>>> indeed; but it could still have generated some samples or otherwise
>>> generated trace data.
>> 
>> Since we already have the separate KSYMBOL events, BPF_EVENT is only 
>> required for advanced use cases, like annotation. So I guess missing 
>> it for very short-lived programs should not be a huge problem?
>> 
>>> It was suggested to allow pinning modules/programs to avoid this
>>> situation, but that of course has other undesirable effects, such as a
>>> trivial DoS.
>>> 
>>> A truly horrible hack would be to include an open filedesc in the event
>>> that needs closing to release the resource, but I'm sorry for even
>>> suggesting that **shudder**.
>>> 
>>> Do we have any sane ideas?
>> 
>> How about we gate the open filedesc solution with an option, and limit
>> that option to root only? If this still sounds hacky, maybe we should
>> just ignore it when short-lived programs are missed?
> 
> I'm afraid we might also 'need' this for the kallsym thing.
> 
> The problem is that things like Intel PT (ARM Coresight too IIRC) encode
> a bitstream of branch-taken decisions. The only way to decode that and
> reconstruct the actual code-flow is with an exact matching text image.
> 
> In order to have this matching text we need to be able to copy out every
> piece of dynamic text (from kcore) that has ever executed before it
> disappears.
> 
> Elsewhere (*), Andi suggests having a kind of text-free fence
> interface, where userspace can issue a 'complete' call. And I suppose as
> long as we know there is a consumer, we also know we'll not be blocked
> indefinitely. So it would have to be slightly more complicated than
> suggested, but I think that is something we could work with.
> 
> It would also not complicate these events.
> 
> 
> 
> [*] https://lkml.kernel.org/r/20190108172721.GN6118@tassilo.jf.intel.com

I think the Intel PT case is at instruction granularity (instead of ksymbol
granularity)? If this is true, modules, BPF, and PT could still share
the ksymbol record for basic profiling. And advanced use cases like
annotation will depend on user space recording BPF_EVENT (and equivalents
for other cases) in a timely manner. But at least, the ksymbol is already there.

Does this make sense?  

Thanks,
Song
Peter Zijlstra Jan. 9, 2019, 10:18 a.m. UTC | #11
On Tue, Jan 08, 2019 at 11:54:04PM +0000, Song Liu wrote:

> I think the Intel PT case is at instruction granularity (instead of ksymbol
> granularity)?

Yes.

> If this is true, modules, BPF, and PT could still share
> the ksymbol record for basic profiling. And advanced use cases like
> annotation will depend on user space recording BPF_EVENT (and equivalents
> for other cases) in a timely manner. But at least, the ksymbol is already there.
> 
> Does this make sense?  

I'm not sure I follow; the idea was that on ksym events we copy out the
instructions using kcore. The ksym event already has addr+len.
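
Doing the copy itself is just an ELF walk over /proc/kcore's PT_LOAD
program headers to translate the symbol address into a file offset; a
minimal sketch:

/* needs <elf.h>, <stdlib.h>, <unistd.h>; fd is an open /proc/kcore */
static ssize_t kcore_read(int fd, unsigned long addr, void *buf, size_t len)
{
    Elf64_Ehdr eh;
    Elf64_Phdr *ph;
    ssize_t ret = -1;
    int i;

    if (pread(fd, &eh, sizeof(eh), 0) != sizeof(eh))
        return -1;
    ph = calloc(eh.e_phnum, sizeof(*ph));
    if (!ph)
        return -1;
    if (pread(fd, ph, eh.e_phnum * sizeof(*ph), eh.e_phoff) !=
        (ssize_t)(eh.e_phnum * sizeof(*ph)))
        goto out;
    for (i = 0; i < eh.e_phnum; i++) {
        if (ph[i].p_type != PT_LOAD ||
            addr < ph[i].p_vaddr ||
            addr + len > ph[i].p_vaddr + ph[i].p_memsz)
            continue;
        /* translate vaddr -> file offset, then copy the text out */
        ret = pread(fd, buf, len, ph[i].p_offset + (addr - ph[i].p_vaddr));
        break;
    }
out:
    free(ph);
    return ret;
}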

All we need is some means of ensuring the symbol is still there by the
time we see the event and do the copy.

I think we can do this with a new ioctl() on /proc/kcore itself:

 - when we have kcore open, we queue all text-free operations on list-1.

 - when we close kcore, we drain all (text-free) list-* and perform the
   pending frees immediately.

 - on ioctl(KCORE_QC) we perform the pending free of list-3 and advance
   list-2 to list-3 and list-1 to list-2.

Perf would then open kcore at the start of the record, make a complete
copy and keep the FD open. At the end of every buffer process, we issue
KCORE_QC IFF we observed a ksym unreg in that buffer.

We use 3 lists instead of 2 to guard against races, if there was a
reg+unreg en-route but not yet visible in the buffer, then we don't want
that free to be processed. The next buffer (read) will have the event(s)
and all should be well.
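
Roughly, in kernel C the rotation would be something like this (a sketch
only; KCORE_QC, text_free_work and kcore_text_free() are made-up names):

struct text_free_work {
    struct list_head list;
    /* addr/size of the dynamic text range whose free is deferred */
};

static LIST_HEAD(kcore_free_1);  /* newest: frees queued while kcore is open */
static LIST_HEAD(kcore_free_2);
static LIST_HEAD(kcore_free_3);  /* oldest: freed on the next KCORE_QC */
static DEFINE_MUTEX(kcore_qc_lock);

static void kcore_do_frees(struct list_head *head)
{
    struct text_free_work *w, *tmp;

    list_for_each_entry_safe(w, tmp, head, list) {
        list_del(&w->list);
        kcore_text_free(w);  /* perform the deferred free */
    }
}

/* ioctl(kcore_fd, KCORE_QC): free gen-3, then age gen-1/gen-2 one step */
static long kcore_qc(void)
{
    mutex_lock(&kcore_qc_lock);
    kcore_do_frees(&kcore_free_3);
    list_splice_init(&kcore_free_2, &kcore_free_3);
    list_splice_init(&kcore_free_1, &kcore_free_2);
    mutex_unlock(&kcore_qc_lock);
    return 0;
}

/* close(kcore_fd) would drain all three lists and free immediately */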
Song Liu Jan. 9, 2019, 11:32 a.m. UTC | #12
> On Jan 9, 2019, at 2:18 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, Jan 08, 2019 at 11:54:04PM +0000, Song Liu wrote:
> 
>> I think the Intel PT case is at instruction granularity (instead of ksymbol
>> granularity)?
> 
> Yes.
> 
>> If this is true, modules, BPF, and PT could still share
>> the ksymbol record for basic profiling. And advanced use cases like
>> annotation will depend on user space recording BPF_EVENT (and equivalents
>> for other cases) in a timely manner. But at least, the ksymbol is already there.
>> 
>> Does this make sense?  
> 
> I'm not sure I follow; the idea was that on ksym events we copy out the
> instructions using kcore. The ksym event already has addr+len.

I was thinking about the modify-text-in-place scenario. In this case,
we can use something like

struct perf_record_text_modify {
    u64 addr;
    u_big_enough old_instr;
    u_big_enough new_instr;
    timestamp ;
};

It is a fixed-size record, and we don't need to process it immediately
in user space. At the end of the perf run, a series of these events will
help us reconstruct the exact text at any time.

> 
> All we need is some means of ensuring the symbol is still there by the
> time we see the event and do the copy.
> 
> I think we can do this with a new ioctl() on /proc/kcore itself:
> 
> - when we have kcore open, we queue all text-free operations on list-1.
> 
> - when we close kcore, we drain all (text-free) list-* and perform the
>   pending frees immediately.
> 
> - on ioctl(KCORE_QC) we perform the pending free of list-3 and advance
>   list-2 to list-3 and list-1 to list-2.
> 
> Perf would then open kcore at the start of the record, make a complete
> copy and keep the FD open. At the end of every buffer process, we issue
> KCORE_QC IFF we observed a ksym unreg in that buffer.

Does this mean we need to scan every buffer before writing it to perf.data 
during perf-record? 

Also, if we need ksym unreg here, I guess it is NOT really modifying text
in-place, but creating a new version and swapping? Then can we include something
like this in perf.data:

struct perf_record_text_modify {
    u64 old_addr;
    u64 new_addr;
    u32 old_len; /* up to MAX_SIZE */
    u32 new_len; /* up to MAX_SIZE */
    u8 old_text[MAX_SIZE];
    u8 new_text[MAX_SIZE];
    timestamp ;
};

In this way, this record is embedded in perf.data, and doesn't require
extra processing during perf-record (only at the end of perf-record). 
This would work for the text-modifying case, as modifying text is simply
old-text to new-text.
 
A similar solution would not work for the BPF case, as bpf_prog_info is
getting a lot more members in the near future.

Does this make sense...?

Thanks,
Song
Peter Zijlstra Jan. 9, 2019, 12:41 p.m. UTC | #13
On Tue, Jan 08, 2019 at 08:45:19PM +0000, Alexei Starovoitov wrote:
> On 1/8/19 12:29 PM, Peter Zijlstra wrote:
> > On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
> >> The following example shows kernel symbols for a BPF program with 7
> >> sub programs:
> >>
> >>      ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
> >>      ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
> >>      ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
> >>      ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
> >>      ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
> >>      ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
> >>      ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
> >>      ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi
> > 
> > We should probably specify somewhere that the name can include a
> > '[module]' part just like normal kallsyms. Even though you don't
> > currently use that.
> 
> there is no [module] equivalent in bpf land.

I know; although you could consider each program its own separate
module. But what I meant was, we should probably document the name[]
format somewhere, maybe in the PERF_RECORD_KSYMBOL comment.

The "symbol [module]" syntax can be used to create a DSO sort key, so
you could simply put in "[bpf]" for all BPF generated symbols and have
everything BPF grouped in perf-report when sorted on DSO.

It doesn't have any other implications.

Similarly, I would suggest "[ftrace]" for all the ftrace trampolines
(which are currently not exposed but really should be).
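
For illustration, annotated kallsyms-style entries would then look like
this (the second address and the trampoline name are made up):

    ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F	[bpf]
    ffffffffa0290000 t ftrace_trampoline_4711	[ftrace]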
Peter Zijlstra Jan. 9, 2019, 12:59 p.m. UTC | #14
On Wed, Jan 09, 2019 at 11:32:50AM +0000, Song Liu wrote:
> I was thinking about the modify-text-in-place scenario. In this case,
> we can use something like
> 
> struct perf_record_text_modify {
>     u64 addr;
>     u_big_enough old_instr;
>     u_big_enough new_instr;

char[15] for x86 ;-)

Also, I don't think we need old, we should already have the old text,
either from a previous event or from the initial kcore snapshot.

>     timestamp ;

that lives in struct sample_id.

> };
> 
> It is a fixed-size record, and we don't need to process it immediately
> in user space. At the end of the perf run, a series of these events will
> help us reconstruct the exact text at any time.

That works for text_poke users, see also:

  https://lkml.kernel.org/r/20190109103544.GH1900@hirez.programming.kicks-ass.net

But it is useless for module / bpf / ftrace dynamic text.

> > All we need is some means of ensuring the symbol is still there by the
> > time we see the event and do the copy.
> > 
> > I think we can do this with a new ioctl() on /proc/kcore itself:
> > 
> > - when we have kcore open, we queue all text-free operations on list-1.
> > 
> > - when we close kcore, we drain all (text-free) list-* and perform the
> >   pending frees immediately.
> > 
> > - on ioctl(KCORE_QC) we perform the pending free of list-3 and advance
> >   list-2 to list-3 and list-1 to list-2.
> > 
> > Perf would then open kcore at the start of the record, make a complete
> > copy and keep the FD open. At the end of every buffer process, we issue
> > KCORE_QC IFF we observed a ksym unreg in that buffer.
> 
> Does this mean we need to scan every buffer before writing it to perf.data 
> during perf-record? 

Just like the BPF events, yes. Now for PT most of the actual data is not
in the regular buffer, so it shouldn't be too horrible, but just like
the BPF event, it can get its own buffer if it does become a problem.

> Also, if we need ksym unreg here, I guess it is NOT really modifying text
> in-place, but creating a new version and swapping? Then can we include something
> like this in perf.data:
> 
> struct perf_record_text_modify {
>     u64 old_addr;
>     u64 new_addr;
>     u32 old_len; /* up to MAX_SIZE */
>     u32 new_len; /* up to MAX_SIZE */
>     u8 old_text[MAX_SIZE];
>     u8 new_text[MAX_SIZE];
>     timestamp ;
> };
> 
> In this way, this record is embedded in perf.data, and doesn't require
> extra processing during perf-record (only at the end of perf-record). 
> This would work for the text-modifying case, as modifying text is simply
> old-text to new-text.
>  
> A similar solution would not work for the BPF case, as bpf_prog_info is
> getting a lot more members in the near future.
> 
> Does this make sense...?

I don't think we actually need old_text here either. We're creating a
new text mapping, there was nothing there before.

But still, perf events are limited to 64k (header.size is a u16), so that
means we cannot support symbols larger than that (although I suppose that
would be fairly rare).

Something like that could work, but I'm not sure it is actually better.
Some PT person would have to play with things I suppose.
Song Liu Jan. 9, 2019, 3:51 p.m. UTC | #15
> On Jan 9, 2019, at 4:41 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, Jan 08, 2019 at 08:45:19PM +0000, Alexei Starovoitov wrote:
>> On 1/8/19 12:29 PM, Peter Zijlstra wrote:
>>> On Thu, Dec 20, 2018 at 10:29:00AM -0800, Song Liu wrote:
>>>> The following example shows kernel symbols for a BPF program with 7
>>>> sub programs:
>>>> 
>>>>     ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
>>>>     ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
>>>>     ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
>>>>     ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
>>>>     ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
>>>>     ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
>>>>     ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
>>>>     ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi
>>> 
>>> We should probably specify somewhere that the name can include a
>>> '[module]' part just like normal kallsyms. Even though you don't
>>> currently use that.
>> 
>> there is no [module] equivalent in bpf land.
> 
> I know; although you could consider each program its own separate
> module. But what I meant was, we should probably document the name[]
> format somewhere, maybe in the PERF_RECORD_KSYMBOL comment.
> 
> The "symbol [module]" syntax can be used to create a DSO sort key, so
> you could simply put in "[bpf]" for all BPF generated symbols and have
> everything BPF grouped in perf-report when sorted on DSO.

In the current version, I put [bpf_prog] as the DSO for BPF programs. We can
probably add something to /proc/kallsyms as well. On the other hand,
"bpf_prog_<tag>_XXX" also indicates this is a BPF program.

Thanks,
Song
Song Liu Jan. 9, 2019, 4:04 p.m. UTC | #16
> On Jan 9, 2019, at 4:59 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Jan 09, 2019 at 11:32:50AM +0000, Song Liu wrote:
>> I was thinking about modifying the text in-place scenario. In this case, 
>> we can use something like
>> 
>> struct perf_record_text_modify {
>>    u64 addr;
>>    u_big_enough old_instr;
>>    u_big_enough new_instr;
> 
> char[15] for x86 ;-)
> 
> Also, I don't think we need old, we should already have the old text,
> either from a previous event or from the initial kcore snapshot.
> 
>>    timestamp ;
> 
> that lives in struct sample_id.
> 
>> };
>> 
>> It is a fixed size record, and we don't need process it immediately 
>> in user space. At the end of perf run, a series of these events will 
>> help us reconstruct exact text at any time. 
> 
> That works for text_poke users, see also:
> 
>  https://lkml.kernel.org/r/20190109103544.GH1900@hirez.programming.kicks-ass.net
> 
> But it is useless for module / bpf / ftrace dynamic text.

I think we will end up with RECORD_KSYMBOL + something else for all cases. 
For bpf, it is RECORD_KSYMBOL + (optional) RECORD_BPF_EVENT. For text_poke, 
it will be RECORD_KSYMBOL + RECORD_TEXT_POKE. In all cases, RECORD_KSYMBOL
goes to the regular buffer and gets saved directly to perf.data. The other
record goes to a separate buffer and requires extra processing.

> 
>>> All we need is some means of ensuring the symbol is still there by the
>>> time we see the event and do the copy.
>>> 
>>> I think we can do this with a new ioctl() on /proc/kcore itself:
>>> 
>>> - when we have kcore open, we queue all text-free operations on list-1.
>>> 
>>> - when we close kcore, we drain all (text-free) list-* and perform the
>>>  pending frees immediately.
>>> 
>>> - on ioctl(KCORE_QC) we perform the pending free of list-3 and advance
>>>  list-2 to list-3 and list-1 to list-2.
>>> 
>>> Perf would then open kcore at the start of the record, make a complete
>>> copy and keep the FD open. At the end of every buffer process, we issue
>>> KCORE_QC IFF we observed a ksym unreg in that buffer.
>> 
>> Does this mean we need to scan every buffer before writing it to perf.data 
>> during perf-record? 
> 
> Just like the BPF events, yes. Now for PT most of the actual data is not
> in the regular buffer, so it shouldn't be too horrible, but just like
> the BPF event, it can get its own buffer if it does become a problem.

I see. A separate buffer does make it better.

> 
>> Also, if we need ksym unreg here, I guess it is NOT really modifying text 
>> in-place, but creating new version and swap? Then can we include something 
>> like this in perf.data:
>> 
>> struct perf_record_text_modify {
>>    u64 old_addr;
>>    u64 new_addr;
>>    u32 old_len; /* up to MAX_SIZE */
>>    u32 new_len; /* up to MAX_SIZE */
>>    u8 old_text[MAX_SIZE];
>>    u8 new_text[MAX_SIZE];
>>    timestamp ;
>> };
>> 
>> In this way, this record is embedded in perf.data, and doesn't require
>> extra processing during perf-record (only at the end of perf-record). 
>> This would work for text modifying case, as modifying text is simply
>> old-text to new-text.
>> 
>> Similar solution would not work for BPF case, as bpf_prog_info is 
>> getting a lot more members in the near future. 
>> 
>> Does this make sense...?
> 
> I don't think we actually need old_text here either. We're creating a
> new text mapping, there was nothing there before.
> 
> But still, perf events are limited to 64k, so that means we cannot
> support symbols larger than that (although I suppose that would be
> fairly rare).

For larger symbols, I guess we can do one RECORD_KSYMBOL and multiple 
RECORD_TEXT_MODIFY. 

Thanks,
Song

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 537e9e7c6e6f..45d23560f90b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -955,6 +955,7 @@  bpf_address_lookup(unsigned long addr, unsigned long *size,
 
 void bpf_prog_kallsyms_add(struct bpf_prog *fp);
 void bpf_prog_kallsyms_del(struct bpf_prog *fp);
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym);
 
 #else /* CONFIG_BPF_JIT */
 
@@ -1010,6 +1011,12 @@  static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 {
 }
+
+static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+	sym[0] = '\0';
+}
+
 #endif /* CONFIG_BPF_JIT */
 
 void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 037863e69bb2..8ea0ce650c6f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1118,6 +1118,9 @@  extern void perf_event_mmap(struct vm_area_struct *vma);
 typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
 extern void perf_event_ksymbol(int type, u64 addr, u64 len, bool unregister,
 			       perf_ksymbol_get_name_f get_name, void *data);
+extern void perf_event_bpf_event(struct bpf_prog *prog,
+				 enum perf_bpf_event_type type,
+				 u16 flags);
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1345,6 +1348,9 @@  static inline void perf_event_ksymbol(int type, u64 addr, u64 len,
 				      bool unregister,
 				      perf_ksymbol_get_name_f get_name,
 				      void *data) 			{ }
+static inline void perf_event_bpf_event(struct bpf_prog *prog,
+					enum perf_bpf_event_type type,
+					u16 flags)			{ }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 6c9e327e87ed..68db04058408 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -373,7 +373,8 @@  struct perf_event_attr {
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
 				ksymbol        :  1, /* include ksymbol events */
-				__reserved_1   : 34;
+				bpf_event      :  1, /* include bpf events */
+				__reserved_1   : 33;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -986,9 +987,35 @@  enum perf_event_type {
 	 */
 	PERF_RECORD_KSYMBOL			= 17,
 
+	/*
+	 * Record bpf events:
+	 *  enum perf_bpf_event_type {
+	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
+	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
+	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	 *  };
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u16				type;
+	 *	u16				flags;
+	 *	u32				id;
+	 *	u8				tag[BPF_TAG_SIZE];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_BPF_EVENT			= 18,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
+enum perf_bpf_event_type {
+	PERF_BPF_EVENT_UNKNOWN		= 0,
+	PERF_BPF_EVENT_PROG_LOAD	= 1,
+	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	PERF_BPF_EVENT_MAX,		/* non-ABI */
+};
+
 #define PERF_MAX_STACK_DEPTH		127
 #define PERF_MAX_CONTEXTS_PER_STACK	  8
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5cdd8da0e7f2..2a8364294f11 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -496,7 +496,7 @@  bpf_get_prog_addr_region(const struct bpf_prog *prog,
 	*symbol_end   = addr + hdr->pages * PAGE_SIZE;
 }
 
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
 	const char *end = sym + KSYM_NAME_LEN;
 	const struct btf_type *type;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0607db304def..4af63c8c95eb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1211,6 +1211,7 @@  static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
@@ -1554,6 +1555,7 @@  static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	}
 
 	bpf_prog_kallsyms_add(prog);
+	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
 	return err;
 
 free_used_maps:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c0ac6dee367c..04feb2b28c46 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@  static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
 static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4308,6 +4309,8 @@  static void unaccount_event(struct perf_event *event)
 		dec = true;
 	if (event->attr.ksymbol)
 		atomic_dec(&nr_ksymbol_events);
+	if (event->attr.bpf_event)
+		atomic_dec(&nr_bpf_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7744,6 +7747,121 @@  void perf_event_ksymbol(int ksym_type, u64 addr, u64 len, bool unregister,
 	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
 }
 
+/*
+ * bpf program load/unload tracking
+ */
+
+struct perf_bpf_event {
+	struct bpf_prog	*prog;
+	struct {
+		struct perf_event_header        header;
+		u16				type;
+		u16				flags;
+		u32				id;
+		u8				tag[BPF_TAG_SIZE];
+	} event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+	return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event, void *data)
+{
+	struct perf_bpf_event *bpf_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	if (!perf_event_bpf_match(event))
+		return;
+
+	perf_event_header__init_id(&bpf_event->event_id.header,
+				   &sample, event);
+	ret = perf_output_begin(&handle, event,
+				bpf_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, bpf_event->event_id);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+static int perf_event_bpf_get_name(char *name, int len, void *data)
+{
+	struct bpf_prog *prog = data;
+
+	bpf_get_prog_name(prog, name);
+	return 0;
+}
+
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
+					 enum perf_bpf_event_type type)
+{
+	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
+	int i;
+
+	if (prog->aux->func_cnt == 0) {
+		perf_event_ksymbol(PERF_RECORD_MISC_KSYMBOL_TYPE_BPF,
+				   (u64)(unsigned long)prog->bpf_func,
+				   prog->jited_len, unregister,
+				   perf_event_bpf_get_name, prog);
+	} else {
+		for (i = 0; i < prog->aux->func_cnt; i++) {
+			struct bpf_prog *subprog = prog->aux->func[i];
+
+			perf_event_ksymbol(
+				PERF_RECORD_MISC_KSYMBOL_TYPE_BPF,
+				(u64)(unsigned long)subprog->bpf_func,
+				subprog->jited_len, unregister,
+				perf_event_bpf_get_name, subprog);
+		}
+	}
+}
+
+void perf_event_bpf_event(struct bpf_prog *prog,
+			  enum perf_bpf_event_type type,
+			  u16 flags)
+{
+	struct perf_bpf_event bpf_event;
+
+	if (type <= PERF_BPF_EVENT_UNKNOWN ||
+	    type >= PERF_BPF_EVENT_MAX)
+		return;
+
+	switch (type) {
+	case PERF_BPF_EVENT_PROG_LOAD:
+	case PERF_BPF_EVENT_PROG_UNLOAD:
+		if (atomic_read(&nr_ksymbol_events))
+			perf_event_bpf_emit_ksymbols(prog, type);
+		break;
+	default:
+		break;
+	}
+
+	if (!atomic_read(&nr_bpf_events))
+		return;
+
+	bpf_event = (struct perf_bpf_event){
+		.prog = prog,
+		.event_id = {
+			.header = {
+				.type = PERF_RECORD_BPF_EVENT,
+				.size = sizeof(bpf_event.event_id),
+			},
+			.type = type,
+			.flags = flags,
+			.id = prog->aux->id,
+		},
+	};
+
+	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -9996,6 +10114,8 @@  static void account_event(struct perf_event *event)
 		inc = true;
 	if (event->attr.ksymbol)
 		atomic_inc(&nr_ksymbol_events);
+	if (event->attr.bpf_event)
+		atomic_inc(&nr_bpf_events);
 
 	if (inc) {
 		/*