
[v3,perf,bpf-next,1/4] perf, bpf: Introduce PERF_RECORD_BPF_EVENT

Message ID: 20181211233351.4036381-2-songliubraving@fb.com
State: Changes Requested, archived
Delegated to: BPF Maintainers
Series: reveal invisible bpf programs

Commit Message

Song Liu Dec. 11, 2018, 11:33 p.m. UTC
For better performance analysis of BPF programs, this patch introduces
PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
load/unload information to user space.

Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
The following example shows kernel symbols for a BPF program with 7
sub programs:

    ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
    ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
    ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
    ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
    ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
    ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
    ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
    ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi

Note that these sub programs are not allocated in contiguous memory
ranges; each of them occupies separate page(s). The starting addresses
of these sub programs are randomized within their page(s) for security
reasons.
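
For background, the JIT allocates each (sub)program image with page
granularity and picks a randomized start inside the allocation. Roughly,
abridged and paraphrased from bpf_jit_binary_alloc() in kernel/bpf/core.c:

	struct bpf_binary_header *
	bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
			     unsigned int alignment,
			     bpf_jit_fill_hole_t bpf_fill_ill_insns)
	{
		struct bpf_binary_header *hdr;
		u32 size, hole, start;

		/* Round the image up to full pages, leaving room for the header. */
		size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
		hdr = module_alloc(size);
		if (hdr == NULL)
			return NULL;

		/* Fill unused space with trapping instructions. */
		bpf_fill_ill_insns(hdr, size);

		/* Randomize the image start within the leftover hole. */
		hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
			     PAGE_SIZE - sizeof(*hdr));
		start = (get_random_int() % hole) & ~(alignment - 1);

		*image_ptr = &hdr->image[start];
		return hdr;
	}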

The following data structure is used for PERF_RECORD_BPF_EVENT. It is
generated for each _sub program_:

        /*
         * Record different types of bpf events:
         *  enum perf_bpf_event_type {
         *     PERF_BPF_EVENT_UNKNOWN           = 0,
         *     PERF_BPF_EVENT_PROG_LOAD         = 1,
         *     PERF_BPF_EVENT_PROG_UNLOAD       = 2,
         *  };
         *
         * struct {
         *      struct perf_event_header header;
         *      u32                             type;
         *      u32                             flags;
         *      u32                             id; // prog_id or other id
         *      u32                             sub_id; // subprog id
         *
         *      // for bpf_prog types, bpf prog or subprog
         *      u8                              tag[BPF_TAG_SIZE];
         *      u64                             addr;
         *      u64                             len;
         *      char                            name[];
         *      struct sample_id                sample_id;
         * };
         */

This data is designed for different use cases:

  1. For simple perf profiling, addr, len, and name[] work similarly to
     PERF_RECORD_MMAP. These raw records are stored in the perf.data
     file.
  2. For perf annotation and other cases that need more details of the
     BPF program, id and sub_id are used to extract detailed information
     about the prog through sys_bpf(BPF_OBJ_GET_INFO_BY_FD), as sketched
     below. User space tools are responsible for saving the detailed
     information properly, as it will not be available after the BPF
     program is unloaded.
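
For reference, a user-space reader could overlay the fixed-size part of
this record with a struct along these lines (a sketch; the struct name
here is illustrative, BPF_TAG_SIZE is 8 per the UAPI headers, and
name[] plus struct sample_id follow as variable-sized trailers):

	#include <linux/perf_event.h>
	#include <linux/types.h>

	#define BPF_TAG_SIZE 8			/* include/uapi/linux/bpf.h */

	struct bpf_event_record {
		struct perf_event_header header;
		__u32 type;			/* enum perf_bpf_event_type */
		__u32 flags;
		__u32 id;			/* prog_id or other id */
		__u32 sub_id;			/* subprog index */
		__u8  tag[BPF_TAG_SIZE];	/* prog or subprog tag */
		__u64 addr;			/* start of jited image */
		__u64 len;			/* length of jited image */
		/* char name[] and struct sample_id follow */
	};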

This follows the existing perf model of keeping ordered records with
enough information for profiling, while keeping keys for reliably
finding the extra, more voluminous information needed for further
analysis, like raw jited binaries augmented with line numbers that can
be used for disassembly, annotation, etc.
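
As an example of case 2, a tool that saved the id from a load record
could pull the detailed info before the program goes away, roughly like
this (a sketch using libbpf's syscall wrappers from tools/lib/bpf;
dump_prog_info() is a hypothetical helper, error handling trimmed):

	#include <bpf/bpf.h>		/* libbpf syscall wrappers */
	#include <linux/bpf.h>		/* struct bpf_prog_info */
	#include <unistd.h>

	static int dump_prog_info(__u32 id)
	{
		struct bpf_prog_info info = {};
		__u32 info_len = sizeof(info);
		int fd, err;

		fd = bpf_prog_get_fd_by_id(id);	/* BPF_PROG_GET_FD_BY_ID */
		if (fd < 0)
			return fd;		/* may already be unloaded */

		/* BPF_OBJ_GET_INFO_BY_FD: tag, jited_prog_len, ... */
		err = bpf_obj_get_info_by_fd(fd, &info, &info_len);
		close(fd);
		return err;
	}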

Currently, PERF_RECORD_BPF_EVENT only supports two event types:
PERF_BPF_EVENT_PROG_LOAD and PERF_BPF_EVENT_PROG_UNLOAD, but it can be
easily extended to support more.

Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/filter.h          |   7 ++
 include/linux/perf_event.h      |  10 +++
 include/uapi/linux/perf_event.h |  35 ++++++++-
 kernel/bpf/core.c               |   2 +-
 kernel/bpf/syscall.c            |   3 +
 kernel/events/core.c            | 123 +++++++++++++++++++++++++++++++-
 6 files changed, 177 insertions(+), 3 deletions(-)

Comments

Peter Zijlstra Dec. 12, 2018, 12:06 p.m. UTC | #1
On Tue, Dec 11, 2018 at 03:33:47PM -0800, Song Liu wrote:
> +static void perf_event_bpf_output(struct perf_event *event,
> +				   void *data)
> +{
> +	struct perf_bpf_event *bpf_event = data;
> +	struct perf_output_handle handle;
> +	struct perf_sample_data sample;
> +	char name[KSYM_NAME_LEN];
> +	int name_len;
> +	int ret;
> +
> +	if (!perf_event_bpf_match(event))
> +		return;
> +
> +	/* get prog name and round up to 64 bit aligned */
> +	bpf_get_prog_name(bpf_event->prog, name);
> +	name_len = strlen(name) + 1;
> +	while (!IS_ALIGNED(name_len, sizeof(u64)))
> +		name[name_len++] = '\0';

Does this not require something like:

	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
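
The concern being: name[] is KSYM_NAME_LEN bytes, and the loop pads
name_len up to the next multiple of sizeof(u64), so a maximally long
name only stays in bounds if KSYM_NAME_LEN is itself a multiple of
sizeof(u64). A minimal user-space analogue of the pattern, with that
guard spelled out:

	#include <stdint.h>
	#include <string.h>

	#define KSYM_NAME_LEN	128	/* multiple of 8, so the guard holds */
	#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

	/* Pad a NUL-terminated name out to a u64 boundary, as the kernel
	 * code above does; the assert documents why this cannot write
	 * past the end of name[]. */
	static int pad_name(char name[KSYM_NAME_LEN])
	{
		int name_len = strlen(name) + 1;

		_Static_assert(KSYM_NAME_LEN % sizeof(uint64_t) == 0,
			       "padding loop could overrun name[]");
		while (!IS_ALIGNED(name_len, sizeof(uint64_t)))
			name[name_len++] = '\0';
		return name_len;
	}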
Peter Zijlstra Dec. 12, 2018, 1:15 p.m. UTC | #2
On Tue, Dec 11, 2018 at 03:33:47PM -0800, Song Liu wrote:
> For better performance analysis of BPF programs, this patch introduces
> PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
> load/unload information to user space.
> 
> Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
> The following example shows kernel symbols for a BPF program with 7
> sub programs:
> 
>     ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
>     ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
>     ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
>     ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
>     ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
>     ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
>     ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
>     ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi

Doesn't BPF have enough information to generate 'saner' names? Going by
the thing below, these sub-progs are actually functions, right?

>         /*
>          * Record different types of bpf events:
>          *  enum perf_bpf_event_type {
>          *     PERF_BPF_EVENT_UNKNOWN           = 0,
>          *     PERF_BPF_EVENT_PROG_LOAD         = 1,
>          *     PERF_BPF_EVENT_PROG_UNLOAD       = 2,
>          *  };
>          *
>          * struct {
>          *      struct perf_event_header header;
>          *      u32                             type;
>          *      u32                             flags;
>          *      u32                             id; // prog_id or other id
>          *      u32                             sub_id; // subprog id
>          *
>          *      // for bpf_prog types, bpf prog or subprog
>          *      u8                              tag[BPF_TAG_SIZE];
>          *      u64                             addr;
>          *      u64                             len;
>          *      char                            name[];
>          *      struct sample_id                sample_id;
>          * };
>          */

Isn't this mixing two different things (poorly)? The kallsym update and
the BPF load/unload ?

And while this tracks the bpf kallsyms, it does not do all kallsyms.

.... Oooh, I see the problem, everybody is doing their own custom
kallsym_{add,del}() thing, instead of having that in generic code :-(

This, for example, doesn't track module load/unload nor ftrace
trampolines, even though both affect kallsyms.

> +void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
> +			       struct bpf_prog *prog)
> +{
> +	if (!atomic_read(&nr_bpf_events))
> +		return;
> +
> +	if (type != PERF_BPF_EVENT_PROG_LOAD &&
> +	    type != PERF_BPF_EVENT_PROG_UNLOAD)
> +		return;
> +
> +	if (prog->aux->func_cnt == 0) {
> +		perf_event_bpf_event_subprog(type, prog,
> +					     prog->aux->id, 0);
> +	} else {
> +		int i;
> +
> +		for (i = 0; i < prog->aux->func_cnt; i++)
> +			perf_event_bpf_event_subprog(type, prog->aux->func[i],
> +						     prog->aux->id, i);
> +	}
> +}
Arnaldo Carvalho de Melo Dec. 12, 2018, 1:27 p.m. UTC | #3
Em Wed, Dec 12, 2018 at 02:15:49PM +0100, Peter Zijlstra escreveu:
> On Tue, Dec 11, 2018 at 03:33:47PM -0800, Song Liu wrote:
> > For better performance analysis of BPF programs, this patch introduces
> > PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
> > load/unload information to user space.
> > 
> > Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
> > The following example shows kernel symbols for a BPF program with 7
> > sub programs:
> > 
> >     ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
> >     ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
> >     ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
> >     ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
> >     ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
> >     ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
> >     ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
> >     ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi
> 
> Doesn't BPF have enough information to generate 'saner' names? Going by
> the thing below, these sub-progs are actually functions, right?

Yeah, this looks just like a symbol table, albeit just with functions,
so far.
 
> >         /*
> >          * Record different types of bpf events:
> >          *  enum perf_bpf_event_type {
> >          *     PERF_BPF_EVENT_UNKNOWN           = 0,
> >          *     PERF_BPF_EVENT_PROG_LOAD         = 1,
> >          *     PERF_BPF_EVENT_PROG_UNLOAD       = 2,
> >          *  };
> >          *
> >          * struct {
> >          *      struct perf_event_header header;
> >          *      u32                             type;
> >          *      u32                             flags;
> >          *      u32                             id; // prog_id or other id
> >          *      u32                             sub_id; // subprog id
> >          *
> >          *      // for bpf_prog types, bpf prog or subprog
> >          *      u8                              tag[BPF_TAG_SIZE];
> >          *      u64                             addr;
> >          *      u64                             len;
> >          *      char                            name[];
> >          *      struct sample_id                sample_id;
> >          * };
> >          */
> 
> Isn't this mixing two different things (poorly)? The kallsym update and
> the BPF load/unload ?
> 
> And while this tracks the bpf kallsyms, it does not do all kallsyms.
> 
> .... Oooh, I see the problem, everybody is doing their own custom
> kallsym_{add,del}() thing, instead of having that in generic code :-(
 
> This, for example, doesn't track module load/unload nor ftrace
> trampolines, even though both affect kallsyms.

So you think the best would have to be a PERF_RECORD_ that just states
that code has been loaded at range (addr, len). I.e. much like
PERF_RECORD_MMAP does, just for userspace? Then it could be used by BPF
and any other kernel facility like the ones you described?

There would be an area that would be used by each of these facilities to
figure out further info, like we use the mmap->name to go and get the
symbol table from ELF files, etc, but BPF could use for their specific
stuff?

The above is almost like PERF_RECORD_MMAP (PERF_BPF_EVENT_PROG_LOAD) +
PERF_RECORD_MUNMAP(PERF_BPF_EVENT_PROG_UNLOAD) in one event, with this
'type' thing allowing for more stuff to be put in later, I guess.

- Arnaldo
 
> > +void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
> > +			       struct bpf_prog *prog)
> > +{
> > +	if (!atomic_read(&nr_bpf_events))
> > +		return;
> > +
> > +	if (type != PERF_BPF_EVENT_PROG_LOAD &&
> > +	    type != PERF_BPF_EVENT_PROG_UNLOAD)
> > +		return;
> > +
> > +	if (prog->aux->func_cnt == 0) {
> > +		perf_event_bpf_event_subprog(type, prog,
> > +					     prog->aux->id, 0);
> > +	} else {
> > +		int i;
> > +
> > +		for (i = 0; i < prog->aux->func_cnt; i++)
> > +			perf_event_bpf_event_subprog(type, prog->aux->func[i],
> > +						     prog->aux->id, i);
> > +	}
> > +}
>
Song Liu Dec. 12, 2018, 4:49 p.m. UTC | #4
> On Dec 12, 2018, at 4:06 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, Dec 11, 2018 at 03:33:47PM -0800, Song Liu wrote:
>> +static void perf_event_bpf_output(struct perf_event *event,
>> +				   void *data)
>> +{
>> +	struct perf_bpf_event *bpf_event = data;
>> +	struct perf_output_handle handle;
>> +	struct perf_sample_data sample;
>> +	char name[KSYM_NAME_LEN];
>> +	int name_len;
>> +	int ret;
>> +
>> +	if (!perf_event_bpf_match(event))
>> +		return;
>> +
>> +	/* get prog name and round up to 64 bit aligned */
>> +	bpf_get_prog_name(bpf_event->prog, name);
>> +	name_len = strlen(name) + 1;
>> +	while (!IS_ALIGNED(name_len, sizeof(u64)))
>> +		name[name_len++] = '\0';
> 
> Does this not require something like:
> 
> 	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));

Yeah, this makes sense. I will add this in next version. 

Thanks,
Song
Song Liu Dec. 12, 2018, 5:09 p.m. UTC | #5
> On Dec 12, 2018, at 5:15 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, Dec 11, 2018 at 03:33:47PM -0800, Song Liu wrote:
>> For better performance analysis of BPF programs, this patch introduces
>> PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
>> load/unload information to user space.
>> 
>> Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
>> The following example shows kernel symbols for a BPF program with 7
>> sub programs:
>> 
>>    ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
>>    ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
>>    ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
>>    ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
>>    ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
>>    ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
>>    ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
>>    ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi
> 
> Doesn't BPF have enough information to generate 'saner' names? Going by
> the thing below, these sub-progs are actually functions, right?

These sub programs/functions will have their descriptive names from BTF
function types (coming in 4.21). However, BTF is optional; when BTF is
missing, they will be named bpf_prog_<tag>_F. The main BPF program has
a name of up to 16 bytes. In the example above, the last program has
the name _dummy_tracepoint.

I think these sub programs are more like "programs" than "functions", 
because each sub program occupies its own page(s). 

> 
>>        /*
>>         * Record different types of bpf events:
>>         *  enum perf_bpf_event_type {
>>         *     PERF_BPF_EVENT_UNKNOWN           = 0,
>>         *     PERF_BPF_EVENT_PROG_LOAD         = 1,
>>         *     PERF_BPF_EVENT_PROG_UNLOAD       = 2,
>>         *  };
>>         *
>>         * struct {
>>         *      struct perf_event_header header;
>>         *      u32                             type;
>>         *      u32                             flags;
>>         *      u32                             id; // prog_id or other id
>>         *      u32                             sub_id; // subprog id
>>         *
>>         *      // for bpf_prog types, bpf prog or subprog
>>         *      u8                              tag[BPF_TAG_SIZE];
>>         *      u64                             addr;
>>         *      u64                             len;
>>         *      char                            name[];
>>         *      struct sample_id                sample_id;
>>         * };
>>         */
> 
> Isn't this mixing two different things (poorly)? The kallsym update and
> the BPF load/unload ?

I would say these two things are actually two parts of the same event. 
Fields id, sub_id, and tag provide information about which program is
mapped to this ksym. They are equivalent to "pgoff + filename" in 
PERF_RECORD_MMAP, or "maj, min, ino, and ino_generation" in 
PERF_RECORD_MMAP2. 

> 
> And while this tracks the bpf kallsyms, it does not do all kallsyms.
> 
> .... Oooh, I see the problem, everybody is doing their own custom
> kallsym_{add,del}() thing, instead of having that in generic code :-(
> 
> This, for example, doesn't track module load/unload nor ftrace
> trampolines, even though both affect kallsyms.

I think we can use PERF_RECORD_MMAP (or MMAP2) for module load/unload.
That could be a separate set of patches.

Thanks,
Song

> 
>> +void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
>> +			       struct bpf_prog *prog)
>> +{
>> +	if (!atomic_read(&nr_bpf_events))
>> +		return;
>> +
>> +	if (type != PERF_BPF_EVENT_PROG_LOAD &&
>> +	    type != PERF_BPF_EVENT_PROG_UNLOAD)
>> +		return;
>> +
>> +	if (prog->aux->func_cnt == 0) {
>> +		perf_event_bpf_event_subprog(type, prog,
>> +					     prog->aux->id, 0);
>> +	} else {
>> +		int i;
>> +
>> +		for (i = 0; i < prog->aux->func_cnt; i++)
>> +			perf_event_bpf_event_subprog(type, prog->aux->func[i],
>> +						     prog->aux->id, i);
>> +	}
>> +}
> 
>
Song Liu Dec. 12, 2018, 5:33 p.m. UTC | #6
> On Dec 12, 2018, at 5:27 AM, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> 
> Em Wed, Dec 12, 2018 at 02:15:49PM +0100, Peter Zijlstra escreveu:
>> On Tue, Dec 11, 2018 at 03:33:47PM -0800, Song Liu wrote:
>>> For better performance analysis of BPF programs, this patch introduces
>>> PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
>>> load/unload information to user space.
>>> 
>>> Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
>>> The following example shows kernel symbols for a BPF program with 7
>>> sub programs:
>>> 
>>>    ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
>>>    ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
>>>    ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
>>>    ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
>>>    ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
>>>    ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
>>>    ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
>>>    ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi
>> 
>> Doesn't BPF have enough information to generate 'saner' names? Going by
>> the thing below, these sub-progs are actually functions, right?
> 
> Yeah, this looks just like a symbol table, albeit just with functions,
> so far.
>>>        /*
>>>         * Record different types of bpf events:
>>>         *  enum perf_bpf_event_type {
>>>         *     PERF_BPF_EVENT_UNKNOWN           = 0,
>>>         *     PERF_BPF_EVENT_PROG_LOAD         = 1,
>>>         *     PERF_BPF_EVENT_PROG_UNLOAD       = 2,
>>>         *  };
>>>         *
>>>         * struct {
>>>         *      struct perf_event_header header;
>>>         *      u32                             type;
>>>         *      u32                             flags;
>>>         *      u32                             id; // prog_id or other id
>>>         *      u32                             sub_id; // subprog id
>>>         *
>>>         *      // for bpf_prog types, bpf prog or subprog
>>>         *      u8                              tag[BPF_TAG_SIZE];
>>>         *      u64                             addr;
>>>         *      u64                             len;
>>>         *      char                            name[];
>>>         *      struct sample_id                sample_id;
>>>         * };
>>>         */
>> 
>> Isn't this mixing two different things (poorly)? The kallsym update and
>> the BPF load/unload ?
>> 
>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
>> 
>> .... Oooh, I see the problem, everybody is doing their own custom
>> kallsym_{add,del}() thing, instead of having that in generic code :-(
> 
>> This, for example, doesn't track module load/unload nor ftrace
>> trampolines, even though both affect kallsyms.
> 
> So you think the best would have to be a PERF_RECORD_ that just states
> that code has been loaded at range (addr, len). I.e. much like
> PERF_RECORD_MMAP does, just for userspace? Then it could be used by BPF
> and any other kernel facility like the ones you described?
> 
> There would be an area that would be used by each of these facilities to
> figure out further info, like we use the mmap->name to go and get the
> symbol table from ELF files, etc, but BPF could use for their specific
> stuff?
> 
> The above is almost like PERF_RECORD_MMAP (PERF_BPF_EVENT_PROG_LOAD) +
> PERF_RECORD_MUNMAP(PERF_BPF_EVENT_PROG_UNLOAD) in one event, with this
> 'type' thing allowing for more stuff to be put in later, I guess.
> 
> - Arnaldo

PERF_RECORD_MMAP and PERF_RECORD_BPF_EVENT are issued at different 
granularities: 

   One PERF_RECORD_MMAP is issued for each mmap, which may contain
   many symbols;
   one PERF_RECORD_BPF_EVENT is issued for each symbol (sub program).

Unlike an mmap of an ELF file, where all symbols have static offsets
within the file, the different sub programs of a BPF program each have
their own page(s) and randomized starting addresses within those
page(s). Therefore, we cannot have one PERF_RECORD_BPF_EVENT cover
multiple sub programs.

The current version has these mmap-like PERF_RECORD_BPF_EVENT records
in perf.data, which is sufficient for basic profiling cases. For
annotation and source line matching, user space needs to process
PERF_RECORD_BPF_EVENT (from the kernel and synthesized).
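
On the synthesized side, a tool can walk the prog-id space at startup
and emit the equivalent per-subprog records for programs loaded before
the session began. A sketch, assuming libbpf's syscall wrappers, with
synthesize_one() a hypothetical record writer:

	#include <bpf/bpf.h>	/* bpf_prog_get_next_id(), bpf_prog_get_fd_by_id() */
	#include <unistd.h>

	static void synthesize_one(int fd, __u32 id);	/* hypothetical */

	/* Walk all loaded programs and emit the records the kernel would
	 * have produced had they been loaded during the session. */
	static void synthesize_existing_progs(void)
	{
		__u32 id = 0;
		int fd;

		while (!bpf_prog_get_next_id(id, &id)) { /* BPF_PROG_GET_NEXT_ID */
			fd = bpf_prog_get_fd_by_id(id);
			if (fd < 0)
				continue;	/* raced with an unload */
			synthesize_one(fd, id);
			close(fd);
		}
	}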

Thanks,
Song


>>> +void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
>>> +			       struct bpf_prog *prog)
>>> +{
>>> +	if (!atomic_read(&nr_bpf_events))
>>> +		return;
>>> +
>>> +	if (type != PERF_BPF_EVENT_PROG_LOAD &&
>>> +	    type != PERF_BPF_EVENT_PROG_UNLOAD)
>>> +		return;
>>> +
>>> +	if (prog->aux->func_cnt == 0) {
>>> +		perf_event_bpf_event_subprog(type, prog,
>>> +					     prog->aux->id, 0);
>>> +	} else {
>>> +		int i;
>>> +
>>> +		for (i = 0; i < prog->aux->func_cnt; i++)
>>> +			perf_event_bpf_event_subprog(type, prog->aux->func[i],
>>> +						     prog->aux->id, i);
>>> +	}
>>> +}
>> 
> 
> -- 
> 
> - Arnaldo
Peter Zijlstra Dec. 12, 2018, 6:05 p.m. UTC | #7
On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
> > And while this tracks the bpf kallsyms, it does not do all kallsyms.
> > 
> > .... Oooh, I see the problem, everybody is doing their own custom
> > kallsym_{add,del}() thing, instead of having that in generic code :-(
> > 
> > This, for example, doesn't track module load/unload nor ftrace
> > trampolines, even though both affect kallsyms.
> 
> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
> That could be separate sets of patches. 

So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
and also have ftrace use that.

Because currently the ftrace stuff is otherwise invisible.

A generic kallsym register/unregister for any JIT.
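
Something of this shape, say (names entirely made up, nothing like this
exists in kernel/kallsyms.c today):

	/* Hypothetical generic dynamic-kallsyms API: any JIT (bpf, ftrace
	 * trampolines, ...) registers its symbols here instead of keeping
	 * a private lock/tree and a private lookup function. */
	struct dyn_ksym {
		unsigned long	addr;
		unsigned long	size;
		char		name[KSYM_NAME_LEN];
	};

	void kallsyms_add_dyn(struct dyn_ksym *sym);
	void kallsyms_del_dyn(struct dyn_ksym *sym);
	const char *kallsyms_lookup_dyn(unsigned long addr,
					unsigned long *size,
					unsigned long *offset);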
Steven Rostedt Dec. 12, 2018, 6:33 p.m. UTC | #8
On Wed, 12 Dec 2018 19:05:53 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
> > > And while this tracks the bpf kallsyms, it does not do all kallsyms.
> > > 
> > > .... Oooh, I see the problem, everybody is doing their own custom
> > > kallsym_{add,del}() thing, instead of having that in generic code :-(
> > > 
> > > This, for example, doesn't track module load/unload nor ftrace
> > > trampolines, even though both affect kallsyms.  
> > 
> > I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
> > That could be separate sets of patches.   
> 
> So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
> bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
> and also have ftrace use that.
> 
> Because currently the ftrace stuff is otherwise invisible.
> 
> A generic kallsym register/unregister for any JIT.

That's only if it needs to look up symbols that were recorded before
init was unloaded.

The ftrace kallsyms is used to save the function names of init code
that was freed, but may have been recorded. Without the ftrace
kallsyms, the functions traced at init time would just show up as hex
addresses (not very useful).

I'm not sure why BPF would need those symbols unless they were executed
during init (module or core) and needed to see what the symbols used
to be.

-- Steve
Song Liu Dec. 12, 2018, 6:56 p.m. UTC | #9
> On Dec 12, 2018, at 10:05 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
>>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
>>> 
>>> .... Oooh, I see the problem, everybody is doing their own custom
>>> kallsym_{add,del}() thing, instead of having that in generic code :-(
>>> 
>>> This, for example, doesn't track module load/unload nor ftrace
>>> trampolines, even though both affect kallsyms.
>> 
>> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
>> That could be separate sets of patches. 
> 
> So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
> bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
> and also have ftrace use that.
> 
> Because currently the ftrace stuff is otherwise invisible.
> 
> A generic kallsym register/unregister for any JIT.

I guess this is _not_ a requirement for this patchset? BPF programs
have special data (id, sub_id, tag), which is why we need
PERF_RECORD_BPF_EVENT. So this patchset should be orthogonal to the
generic kallsym framework?

Thanks,
Song
Song Liu Dec. 12, 2018, 6:58 p.m. UTC | #10
> On Dec 12, 2018, at 10:33 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Wed, 12 Dec 2018 19:05:53 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
>> On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
>>>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
>>>> 
>>>> .... Oooh, I see the problem, everybody is doing their own custom
>>>> kallsym_{add,del}() thing, instead of having that in generic code :-(
>>>> 
>>>> This, for example, doesn't track module load/unload nor ftrace
>>>> trampolines, even though both affect kallsyms.  
>>> 
>>> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
>>> That could be separate sets of patches.   
>> 
>> So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
>> bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
>> and also have ftrace use that.
>> 
>> Because currently the ftrace stuff is otherwise invisible.
>> 
>> A generic kallsym register/unregister for any JIT.
> 
> That's if it needs to look up the symbols that were recorded when init
> was unloaded.
> 
> The ftrace kallsyms is used to save the function names of init code
> that was freed, but may have been recorded. With out the ftrace
> kallsyms the functions traced at init time would just show up as hex
> addresses (not very useful).
> 
> I'm not sure how BPF would need those symbols unless they were executed
> during init (module or core) and needed to see what the symbols use to
> be).
> 
> -- Steve

Currently, BPF programs are not executed during init. 

Thanks,
Song
Peter Zijlstra Dec. 13, 2018, 3:25 p.m. UTC | #11
On Wed, Dec 12, 2018 at 06:56:11PM +0000, Song Liu wrote:
> 
> 
> > On Dec 12, 2018, at 10:05 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
> >>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
> >>> 
> >>> .... Oooh, I see the problem, everybody is doing their own custom
> >>> kallsym_{add,del}() thing, instead of having that in generic code :-(
> >>> 
> >>> This, for example, doesn't track module load/unload nor ftrace
> >>> trampolines, even though both affect kallsyms.
> >> 
> >> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
> >> That could be separate sets of patches. 
> > 
> > So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
> > bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
> > and also have ftrace use that.
> > 
> > Because currently the ftrace stuff is otherwise invisible.
> > 
> > A generic kallsym register/unregister for any JIT.
> 
> I guess this is _not_ a requirement for this patchset? BPF program has
> special data (id, sub_id, tag) that we need PERF_RECORD_BPF_EVENT. So 
> this patchset should be orthogonal to the generic kallsym framework?

Well, it is a question of ABI. I don't like mixing the kallsym updates
with the BPF updates.
Song Liu Dec. 13, 2018, 4:07 p.m. UTC | #12
> On Dec 13, 2018, at 7:25 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Dec 12, 2018 at 06:56:11PM +0000, Song Liu wrote:
>> 
>> 
>>> On Dec 12, 2018, at 10:05 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
>>>>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
>>>>> 
>>>>> .... Oooh, I see the problem, everybody is doing their own custom
>>>>> kallsym_{add,del}() thing, instead of having that in generic code :-(
>>>>> 
>>>>> This, for example, doesn't track module load/unload nor ftrace
>>>>> trampolines, even though both affect kallsyms.
>>>> 
>>>> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
>>>> That could be separate sets of patches. 
>>> 
>>> So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
>>> bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
>>> and also have ftrace use that.
>>> 
>>> Because currently the ftrace stuff is otherwise invisible.
>>> 
>>> A generic kallsym register/unregister for any JIT.
>> 
>> I guess this is _not_ a requirement for this patchset? BPF program has
>> special data (id, sub_id, tag) that we need PERF_RECORD_BPF_EVENT. So 
>> this patchset should be orthogonal to the generic kallsym framework?
> 
> Well, it is a question of ABI. I don't like mixing the kallsym updates
> with the BPF updates.

I have always been thinking of the two as one update: "mapping this BPF
program to this ksym".

On the other hand, if we really want to separate the two, I guess we
need two PERF_RECORD_* types:

       /*
        * PERF_RECORD_KSYM_ADD/DEL or MMAP3/MUNMAP3
        *
        * struct {
        *      struct perf_event_header header;
        *      u64                             addr;
        *      u64                             len;
        *      char                            name[];
        *      struct sample_id                sample_id;
        * };
        */

and

       /*
        * PERF_RECORD_BPF_EVENT
        *
        * struct {
        *      struct perf_event_header header;
        *      u32                             type;
        *      u32                             flags;
        *      u32                             id; // prog_id or other id
        *      u32                             sub_id; // subprog id
        *
        *      // for bpf_prog types, bpf prog or subprog
        *      u8                              tag[BPF_TAG_SIZE];
        *      struct sample_id                sample_id;
        * };
        */

In this case, PERF_RECORD_BPF_EVENT is only needed when the user wants
annotation. When annotation is needed, the kernel will generate both
records for each BPF program load/unload. Then, user space will do
some work to match the two.

Personally, I think this is not as clean as the current version, but
it would work.

Would you recommend we go in this direction?

Thanks,
Song
Peter Zijlstra Dec. 13, 2018, 6:45 p.m. UTC | #13
On Wed, Dec 12, 2018 at 01:33:20PM -0500, Steven Rostedt wrote:
> On Wed, 12 Dec 2018 19:05:53 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
> > > > And while this tracks the bpf kallsyms, it does not do all kallsyms.
> > > > 
> > > > .... Oooh, I see the problem, everybody is doing their own custom
> > > > kallsym_{add,del}() thing, instead of having that in generic code :-(
> > > > 
> > > > This, for example, doesn't track module load/unload nor ftrace
> > > > trampolines, even though both affect kallsyms.  
> > > 
> > > I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
> > > That could be separate sets of patches.   
> > 
> > So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
> > bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
> > and also have ftrace use that.
> > 
> > Because currently the ftrace stuff is otherwise invisible.
> > 
> > A generic kallsym register/unregister for any JIT.
> 
> That's if it needs to look up the symbols that were recorded when init
> was unloaded.
> 
> The ftrace kallsyms is used to save the function names of init code
> that was freed, but may have been recorded. With out the ftrace
> kallsyms the functions traced at init time would just show up as hex
> addresses (not very useful).
> 
> I'm not sure how BPF would need those symbols unless they were executed
> during init (module or core) and needed to see what the symbols use to
> be).

Aah, that sounds entirely dodgy and possibly quite broken. We freed that
init code, so BPF or your trampolines (or a tiny module) could actually
fit in there and insert their own kallsyms, and then we have overlapping
symbols, which would be pretty bad.

I thought the ftrace kallsym stuff was for the trampolines, which would
be fairly similar to what BPF is doing. And why I'm trying to get a
generic dynamic kallsym thing sorted. There's bound to be other
code-gen things at some point.
Song Liu Dec. 13, 2018, 9:48 p.m. UTC | #14
> On Dec 13, 2018, at 10:45 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Dec 12, 2018 at 01:33:20PM -0500, Steven Rostedt wrote:
>> On Wed, 12 Dec 2018 19:05:53 +0100
>> Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>>> On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
>>>>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
>>>>> 
>>>>> .... Oooh, I see the problem, everybody is doing their own custom
>>>>> kallsym_{add,del}() thing, instead of having that in generic code :-(
>>>>> 
>>>>> This, for example, doesn't track module load/unload nor ftrace
>>>>> trampolines, even though both affect kallsyms.  
>>>> 
>>>> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
>>>> That could be separate sets of patches.   
>>> 
>>> So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
>>> bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
>>> and also have ftrace use that.
>>> 
>>> Because currently the ftrace stuff is otherwise invisible.
>>> 
>>> A generic kallsym register/unregister for any JIT.
>> 
>> That's if it needs to look up the symbols that were recorded when init
>> was unloaded.
>> 
>> The ftrace kallsyms is used to save the function names of init code
>> that was freed, but may have been recorded. With out the ftrace
>> kallsyms the functions traced at init time would just show up as hex
>> addresses (not very useful).
>> 
>> I'm not sure how BPF would need those symbols unless they were executed
>> during init (module or core) and needed to see what the symbols use to
>> be).
> 
> Aah, that sounds entirely dodgy and possibly quite broken. We freed that
> init code, so BPF or your trampolines (or a tiny module) could actually
> fit in there and insert their own kallsyms, and then we have overlapping
> symbols, which would be pretty bad.
> 
> I thought the ftrace kallsym stuff was for the trampolines, which would
> be fairly similar to what BPF is doing. And why I'm trying to get a
> generic dynamic kallsym thing sorted. There's bound the be other
> code-gen things at some point.

Hi Peter, 

I guess you are looking for something for all ksym add/delete events, like:

      /*
       * PERF_RECORD_KSYMBOL
       *
       * struct {
       *      struct perf_event_header header;
       *      u64                             addr;
       *      u32                             len;
       *      u16                             ksym_type;
       *      u16                             flags;
       *      char                            name[];
       *      struct sample_id                sample_id;
       * };
       */

We can use ksym_type to encode BPF_EVENT, trampolines, or other types
of ksym. We can use flags or header.misc to encode ksym add/delete. Is
this right?

If we go in this direction, shall we reserve a few more bytes in it for
different types to use, like:

      /*
       * PERF_RECORD_KSYMBOL
       *
       * struct {
       *      struct perf_event_header header;
       *      u64                             addr;
       *      u32                             len;
       *      u16                             ksym_type;
       *      u16                             flags;
       *      u64                             data[2];
       *      char                            name[];
       *      struct sample_id                sample_id;
       * };
       */

Thanks,
Song
Arnaldo Carvalho de Melo Dec. 14, 2018, 1:48 p.m. UTC | #15
Em Thu, Dec 13, 2018 at 09:48:57PM +0000, Song Liu escreveu:
> 
> 
> > On Dec 13, 2018, at 10:45 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Wed, Dec 12, 2018 at 01:33:20PM -0500, Steven Rostedt wrote:
> >> On Wed, 12 Dec 2018 19:05:53 +0100
> >> Peter Zijlstra <peterz@infradead.org> wrote:
> >> 
> >>> On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
> >>>>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
> >>>>> 
> >>>>> .... Oooh, I see the problem, everybody is doing their own custom
> >>>>> kallsym_{add,del}() thing, instead of having that in generic code :-(
> >>>>> 
> >>>>> This, for example, doesn't track module load/unload nor ftrace
> >>>>> trampolines, even though both affect kallsyms.  
> >>>> 
> >>>> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
> >>>> That could be separate sets of patches.   
> >>> 
> >>> So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
> >>> bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
> >>> and also have ftrace use that.
> >>> 
> >>> Because currently the ftrace stuff is otherwise invisible.
> >>> 
> >>> A generic kallsym register/unregister for any JIT.
> >> 
> >> That's if it needs to look up the symbols that were recorded when init
> >> was unloaded.
> >> 
> >> The ftrace kallsyms is used to save the function names of init code
> >> that was freed, but may have been recorded. With out the ftrace
> >> kallsyms the functions traced at init time would just show up as hex
> >> addresses (not very useful).
> >> 
> >> I'm not sure how BPF would need those symbols unless they were executed
> >> during init (module or core) and needed to see what the symbols use to
> >> be).
> > 
> > Aah, that sounds entirely dodgy and possibly quite broken. We freed that
> > init code, so BPF or your trampolines (or a tiny module) could actually
> > fit in there and insert their own kallsyms, and then we have overlapping
> > symbols, which would be pretty bad.
> > 
> > I thought the ftrace kallsym stuff was for the trampolines, which would
> > be fairly similar to what BPF is doing. And why I'm trying to get a
> > generic dynamic kallsym thing sorted. There's bound the be other
> > code-gen things at some point.
> 
> Hi Peter, 
> 
> I guess you are looking for something for all ksym add/delete events, like;
> 
>       /*
>        * PERF_RECORD_KSYMBOL
>        *
>        * struct {
>        *      struct perf_event_header header;
>        *      u64                             addr;
>        *      u32                             len;
>        *      u16                             ksym_type;
>        *      u16                             flags;
>        *      char                            name[];
>        *      struct sample_id                sample_id;
>        * };
>        */

Can't this reuse PERF_RECORD_MMAP2 with some bit in the header to mean
that the name is the symbol name, not a path to some ELF/whatever? The
ksym type could be encoded in the prot field, PROT_EXEC for functions,
PROT_READ for read only data, PROT_WRITE for rw data.

If we do it that way, older tools will show the DSO name and an
unresolved symbol, and even an indication of whether it's a function or
data, which is better than not showing anything when processing a new
PERF_RECORD_KSYMBOL.

New tools, seeing the perf_event_attr.header bit will know that this is
a "map" with just one symbol and will show that for both DSO name and
symbol.
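
I.e. something like this hypothetical helper, just to make the
encoding concrete:

	#include <sys/mman.h>

	/* Hypothetical: pick the prot bits for an MMAP2-based ksym record. */
	static unsigned int ksym_prot(int is_func, int is_writable)
	{
		if (is_func)
			return PROT_READ | PROT_EXEC;	/* functions */
		if (is_writable)
			return PROT_READ | PROT_WRITE;	/* rw data */
		return PROT_READ;			/* read-only data */
	}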
 
> We can use ksym_type to encode BPF_EVENT, trampolines, or other type of ksym.
> We can use flags or header.misc to encode ksym add/delete. Is this right?
> 
> If we go this direction, shall we reserve a few more bytes in it for different
> types to use, like:
> 
>       /*
>        * PERF_RECORD_KSYMBOL
>        *
>        * struct {
>        *      struct perf_event_header header;
>        *      u64                             addr;
>        *      u32                             len;
>        *      u16                             ksym_type;
>        *      u16                             flags;
>        *      u64                             data[2];
>        *      char                            name[];
>        *      struct sample_id                sample_id;
>        * };
>        */
> 
> Thanks,
> Song
>
Song Liu Dec. 14, 2018, 5:10 p.m. UTC | #16
> On Dec 14, 2018, at 5:48 AM, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> 
> Em Thu, Dec 13, 2018 at 09:48:57PM +0000, Song Liu escreveu:
>> 
>> 
>>> On Dec 13, 2018, at 10:45 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> On Wed, Dec 12, 2018 at 01:33:20PM -0500, Steven Rostedt wrote:
>>>> On Wed, 12 Dec 2018 19:05:53 +0100
>>>> Peter Zijlstra <peterz@infradead.org> wrote:
>>>> 
>>>>> On Wed, Dec 12, 2018 at 05:09:17PM +0000, Song Liu wrote:
>>>>>>> And while this tracks the bpf kallsyms, it does not do all kallsyms.
>>>>>>> 
>>>>>>> .... Oooh, I see the problem, everybody is doing their own custom
>>>>>>> kallsym_{add,del}() thing, instead of having that in generic code :-(
>>>>>>> 
>>>>>>> This, for example, doesn't track module load/unload nor ftrace
>>>>>>> trampolines, even though both affect kallsyms.  
>>>>>> 
>>>>>> I think we can use PERF_RECORD_MMAP(or MMAP2) for module load/unload. 
>>>>>> That could be separate sets of patches.   
>>>>> 
>>>>> So I would actually like to move bpf_lock/bpf_kallsyms/bpf_tree +
>>>>> bpf_prog_kallsyms_*() + __bpf_address_lookup() into kernel/kallsyms.c
>>>>> and also have ftrace use that.
>>>>> 
>>>>> Because currently the ftrace stuff is otherwise invisible.
>>>>> 
>>>>> A generic kallsym register/unregister for any JIT.
>>>> 
>>>> That's if it needs to look up the symbols that were recorded when init
>>>> was unloaded.
>>>> 
>>>> The ftrace kallsyms is used to save the function names of init code
>>>> that was freed, but may have been recorded. With out the ftrace
>>>> kallsyms the functions traced at init time would just show up as hex
>>>> addresses (not very useful).
>>>> 
>>>> I'm not sure how BPF would need those symbols unless they were executed
>>>> during init (module or core) and needed to see what the symbols use to
>>>> be).
>>> 
>>> Aah, that sounds entirely dodgy and possibly quite broken. We freed that
>>> init code, so BPF or your trampolines (or a tiny module) could actually
>>> fit in there and insert their own kallsyms, and then we have overlapping
>>> symbols, which would be pretty bad.
>>> 
>>> I thought the ftrace kallsym stuff was for the trampolines, which would
>>> be fairly similar to what BPF is doing. And why I'm trying to get a
>>> generic dynamic kallsym thing sorted. There's bound the be other
>>> code-gen things at some point.
>> 
>> Hi Peter, 
>> 
>> I guess you are looking for something for all ksym add/delete events, like;
>> 
>>      /*
>>       * PERF_RECORD_KSYMBOL
>>       *
>>       * struct {
>>       *      struct perf_event_header header;
>>       *      u64                             addr;
>>       *      u32                             len;
>>       *      u16                             ksym_type;
>>       *      u16                             flags;
>>       *      char                            name[];
>>       *      struct sample_id                sample_id;
>>       * };
>>       */
> 
> Can't this reuse PERF_RECORD_MMAP2 with some bit in the header to mean
> that the name is the symbol name, not a path to some ELF/whatever? The
> ksym type could be encoded in the prot field, PROT_EXEC for functions,
> PROT_READ for read only data, PROT_WRITE for rw data.

Thanks Arnaldo!

I think this works. PERF_RECORD_MMAP2 has many bits in it. We can
encode a lot of details. We can even have a bit to differentiate
map/unmap.

> 
> If we do it that way older tools will show the DSO name and an
> unresolved symbol, and even an indication if its a function or data,
> which is better than not showing anything when processing a new
> PERF_RECORD_KSYMBOL.

For compatibility, we can use an attr.bpf_event bit (or attr.mmap2_plus)
to turn on/off the new variations of PERF_RECORD_MMAP2. Unless the user
runs perf-record and perf-report with different versions of the perf
tools, we should not see weird events.

> 
> New tools, seeing the perf_event_attr.header bit will know that this is
> a "map" with just one symbol and will show that for both DSO name and
> symbol.
> 

Hi Peter, 

Could you please share your comments/suggestions on Arnaldo's proposal?

Thanks,
Song
Peter Zijlstra Dec. 17, 2018, 3:48 p.m. UTC | #17
On Fri, Dec 14, 2018 at 10:48:57AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Thu, Dec 13, 2018 at 09:48:57PM +0000, Song Liu escreveu:

> > I guess you are looking for something for all ksym add/delete events, like;
> > 
> >       /*
> >        * PERF_RECORD_KSYMBOL
> >        *
> >        * struct {
> >        *      struct perf_event_header header;
> >        *      u64                             addr;
> >        *      u32                             len;
> >        *      u16                             ksym_type;
> >        *      u16                             flags;
> >        *      char                            name[];
> >        *      struct sample_id                sample_id;
> >        * };
> >        */

Yes, something like that.

> Can't this reuse PERF_RECORD_MMAP2 with some bit in the header to mean
> that the name is the symbol name, not a path to some ELF/whatever? The
> ksym type could be encoded in the prot field, PROT_EXEC for functions,
> PROT_READ for read only data, PROT_WRITE for rw data.
> 
> If we do it that way older tools will show the DSO name and an
> unresolved symbol, and even an indication if its a function or data,
> which is better than not showing anything when processing a new
> PERF_RECORD_KSYMBOL.
> 
> New tools, seeing the perf_event_attr.header bit will know that this is
> a "map" with just one symbol and will show that for both DSO name and
> symbol.

That confuses me; the DSO for ksyms is [kernel|$modname] after all. And
BPF would like to have multiple symbols per 'program', so I can imagine
it would want to do something like:

	[bpf-progname1] function1
	[bpf-progname1] function2
	[bpf-progname2] progname2

The first being a bpf proglet with multiple functions, the second a
'legacy' bpf proglet with only a single function.

Trouble is; both PERF_RECORD_KSYM and MMAP* only have a single name[]
field. Now, I suppose we could add:

	char modname[MODULE_NAME_LEN]

or:

	u16 modlen;
	char modname[modlen];

or something along those lines.

Similarly; I would not expect the ftrace trampolines to all have a
different module name.

> > We can use ksym_type to encode BPF_EVENT, trampolines, or other type of ksym.
> > We can use flags or header.misc to encode ksym add/delete. Is this right?
> > 
> > If we go this direction, shall we reserve a few more bytes in it for different
> > types to use, like:
> > 
> >       /*
> >        * PERF_RECORD_KSYMBOL
> >        *
> >        * struct {
> >        *      struct perf_event_header header;
> >        *      u64                             addr;
> >        *      u32                             len;
> >        *      u16                             ksym_type;
> >        *      u16                             flags;
> >        *      u64                             data[2];
> >        *      char                            name[];
> >        *      struct sample_id                sample_id;
> >        * };
> >        */

Right; elsewhere you proposed keeping PERF_RECORD_BPF_EVENT for that,
which I think is clearer.

I think you can keep much of the current patches for that in fact.

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 537e9e7c6e6f..45d23560f90b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -955,6 +955,7 @@  bpf_address_lookup(unsigned long addr, unsigned long *size,
 
 void bpf_prog_kallsyms_add(struct bpf_prog *fp);
 void bpf_prog_kallsyms_del(struct bpf_prog *fp);
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym);
 
 #else /* CONFIG_BPF_JIT */
 
@@ -1010,6 +1011,12 @@  static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 {
 }
+
+static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+	sym[0] = '\0';
+}
+
 #endif /* CONFIG_BPF_JIT */
 
 void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 53c500f0ca79..02217bab64d0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1113,6 +1113,12 @@  static inline void perf_event_task_sched_out(struct task_struct *prev,
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
+
+#ifdef CONFIG_BPF_SYSCALL
+extern void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
+				      struct bpf_prog *prog);
+#endif
+
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1333,6 +1339,10 @@  static inline int perf_unregister_guest_info_callbacks
 (struct perf_guest_info_callbacks *callbacks)				{ return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
+#ifdef CONFIG_BPF_SYSCALL
+static inline void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
+					     struct bpf_prog *prog)	{ }
+#endif
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9de8780ac8d9..0ae3dae55fa8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -372,7 +372,8 @@  struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				bpf_event      :  1, /* include bpf events */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -965,9 +966,41 @@  enum perf_event_type {
 	 */
 	PERF_RECORD_NAMESPACES			= 16,
 
+	/*
+	 * Record different types of bpf events:
+	 *  enum perf_bpf_event_type {
+	 *     PERF_BPF_EVENT_UNKNOWN		= 0,
+	 *     PERF_BPF_EVENT_PROG_LOAD		= 1,
+	 *     PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	 *  };
+	 *
+	 * struct {
+	 *	struct perf_event_header header;
+	 *	u32				type;
+	 *	u32				flags;
+	 *	u32				id; // prog_id or other id
+	 *	u32				sub_id; // subprog id
+	 *
+	 *	// for bpf_prog types, bpf prog or subprog
+	 *	u8				tag[BPF_TAG_SIZE];
+	 *	u64				addr;
+	 *	u64				len;
+	 *	char				name[];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_BPF_EVENT			= 17,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
+enum perf_bpf_event_type {
+	PERF_BPF_EVENT_UNKNOWN		= 0,
+	PERF_BPF_EVENT_PROG_LOAD	= 1,
+	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	PERF_BPF_EVENT_MAX,		/* non-ABI */
+};
+
 #define PERF_MAX_STACK_DEPTH		127
 #define PERF_MAX_CONTEXTS_PER_STACK	  8
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5cdd8da0e7f2..2a8364294f11 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -496,7 +496,7 @@  bpf_get_prog_addr_region(const struct bpf_prog *prog,
 	*symbol_end   = addr + hdr->pages * PAGE_SIZE;
 }
 
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
 	const char *end = sym + KSYM_NAME_LEN;
 	const struct btf_type *type;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 496c2f695f2d..a97a3698b7a4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1210,6 +1210,8 @@  static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		perf_event_bpf_event_prog(PERF_BPF_EVENT_PROG_UNLOAD, prog);
+
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
@@ -1558,6 +1560,7 @@  static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	}
 
 	bpf_prog_kallsyms_add(prog);
+	perf_event_bpf_event_prog(PERF_BPF_EVENT_PROG_LOAD, prog);
 	return err;
 
 free_used_maps:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 84530ab358c3..4b2d1a58d3fe 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -385,6 +385,7 @@  static atomic_t nr_namespaces_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4235,7 +4236,7 @@  static bool is_sb_event(struct perf_event *event)
 
 	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 	    attr->comm || attr->comm_exec ||
-	    attr->task ||
+	    attr->task || attr->bpf_event ||
 	    attr->context_switch)
 		return true;
 	return false;
@@ -4305,6 +4306,8 @@  static void unaccount_event(struct perf_event *event)
 		dec = true;
 	if (has_branch_stack(event))
 		dec = true;
+	if (event->attr.bpf_event)
+		atomic_dec(&nr_bpf_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7650,6 +7653,122 @@  static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+/*
+ * bpf load/unload tracking
+ */
+
+struct perf_bpf_event {
+	struct bpf_prog *prog;
+
+	struct {
+		struct perf_event_header        header;
+		u32 type;
+		u32 flags;
+		u32 id;
+		u32 sub_id;
+		u8 tag[BPF_TAG_SIZE];
+		u64 addr;
+		u64 len;
+	} event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+	return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event,
+				   void *data)
+{
+	struct perf_bpf_event *bpf_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	char name[KSYM_NAME_LEN];
+	int name_len;
+	int ret;
+
+	if (!perf_event_bpf_match(event))
+		return;
+
+	/* get prog name and round up to 64 bit aligned */
+	bpf_get_prog_name(bpf_event->prog, name);
+	name_len = strlen(name) + 1;
+	while (!IS_ALIGNED(name_len, sizeof(u64)))
+		name[name_len++] = '\0';
+	bpf_event->event_id.header.size += name_len;
+
+	perf_event_header__init_id(&bpf_event->event_id.header, &sample, event);
+	ret = perf_output_begin(&handle, event,
+				bpf_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, bpf_event->event_id);
+
+	__output_copy(&handle, name, name_len);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+static void perf_event_bpf(struct perf_bpf_event *bpf_event)
+{
+	perf_iterate_sb(perf_event_bpf_output,
+		       bpf_event,
+		       NULL);
+}
+
+static void perf_event_bpf_event_subprog(
+	enum perf_bpf_event_type type,
+	struct bpf_prog *prog, u32 id, u32 sub_id)
+{
+	struct perf_bpf_event bpf_event = (struct perf_bpf_event){
+		.prog = prog,
+		.event_id = {
+			.header = {
+				.type = PERF_RECORD_BPF_EVENT,
+				.size = sizeof(bpf_event.event_id),
+			},
+			.type = type,
+			/* .flags = 0 */
+			.id = id,
+			.sub_id = sub_id,
+			.addr = (u64)(unsigned long)prog->bpf_func,
+			.len = prog->jited_len,
+		},
+	};
+
+	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+	perf_event_bpf(&bpf_event);
+}
+
+/*
+ * This is called per bpf_prog. In case of multiple sub programs,
+ * this function calls perf_event_bpf_event_subprog() multiple times
+ */
+void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
+			       struct bpf_prog *prog)
+{
+	if (!atomic_read(&nr_bpf_events))
+		return;
+
+	if (type != PERF_BPF_EVENT_PROG_LOAD &&
+	    type != PERF_BPF_EVENT_PROG_UNLOAD)
+		return;
+
+	if (prog->aux->func_cnt == 0) {
+		perf_event_bpf_event_subprog(type, prog,
+					     prog->aux->id, 0);
+	} else {
+		int i;
+
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			perf_event_bpf_event_subprog(type, prog->aux->func[i],
+						     prog->aux->id, i);
+	}
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -9900,6 +10019,8 @@  static void account_event(struct perf_event *event)
 		inc = true;
 	if (is_cgroup_event(event))
 		inc = true;
+	if (event->attr.bpf_event)
+		atomic_inc(&nr_bpf_events);
 
 	if (inc) {
 		/*