
[RFC,perf,bpf,1/5] perf, bpf: Introduce PERF_RECORD_BPF_EVENT

Message ID 20181106205246.567448-2-songliubraving@fb.com
State RFC, archived
Delegated to: BPF Maintainers
Series: reveal invisible bpf programs

Commit Message

Song Liu Nov. 6, 2018, 8:52 p.m. UTC
For better performance analysis of BPF programs, this patch introduces
PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
load/unload information to user space.

        /*
         * Record different types of bpf events:
         *   enum perf_bpf_event_type {
         *      PERF_BPF_EVENT_UNKNOWN          = 0,
         *      PERF_BPF_EVENT_PROG_LOAD        = 1,
         *      PERF_BPF_EVENT_PROG_UNLOAD      = 2,
         *   };
         *
         * struct {
         *      struct perf_event_header header;
         *      u16 type;
         *      u16 flags;
         *      u32 id;  // prog_id or map_id
         * };
         */
        PERF_RECORD_BPF_EVENT                   = 17,

PERF_RECORD_BPF_EVENT contains minimal information about the BPF program.
Perf utility (or other user space tools) should listen to this event and
fetch more details about the event via BPF syscalls
(BPF_PROG_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, etc.).
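
For illustration, a user space consumer could resolve the prog_id from
the record roughly as below. This is a minimal sketch using the raw
bpf(2) syscall; get_prog_info() is an illustrative name, not part of
this set, and error handling is trimmed:

        #include <linux/bpf.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Turn a prog_id from PERF_RECORD_BPF_EVENT into bpf_prog_info. */
        static int get_prog_info(__u32 prog_id, struct bpf_prog_info *info)
        {
                union bpf_attr attr;
                int fd, err;

                memset(&attr, 0, sizeof(attr));
                attr.prog_id = prog_id;
                fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
                if (fd < 0)
                        return -1;      /* e.g. the prog was already unloaded */

                memset(&attr, 0, sizeof(attr));
                memset(info, 0, sizeof(*info));
                attr.info.bpf_fd = fd;
                attr.info.info_len = sizeof(*info);
                attr.info.info = (__u64)(unsigned long)info;
                err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
                close(fd);
                return err;
        }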

Currently, PERF_RECORD_BPF_EVENT only supports two event types:
PERF_BPF_EVENT_PROG_LOAD and PERF_BPF_EVENT_PROG_UNLOAD, but it can
easily be extended to support more.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/perf_event.h      |  5 ++
 include/uapi/linux/perf_event.h | 27 ++++++++++-
 kernel/bpf/syscall.c            |  4 ++
 kernel/events/core.c            | 82 ++++++++++++++++++++++++++++++++-
 4 files changed, 116 insertions(+), 2 deletions(-)

Comments

Peter Zijlstra Nov. 7, 2018, 8:40 a.m. UTC | #1
On Tue, Nov 06, 2018 at 12:52:42PM -0800, Song Liu wrote:
> For better performance analysis of BPF programs, this patch introduces
> PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
> load/unload information to user space.
> 
>         /*
>          * Record different types of bpf events:
>          *   enum perf_bpf_event_type {
>          *      PERF_BPF_EVENT_UNKNOWN          = 0,
>          *      PERF_BPF_EVENT_PROG_LOAD        = 1,
>          *      PERF_BPF_EVENT_PROG_UNLOAD      = 2,
>          *   };
>          *
>          * struct {
>          *      struct perf_event_header header;
>          *      u16 type;
>          *      u16 flags;
>          *      u32 id;  // prog_id or map_id
>          * };
>          */
>         PERF_RECORD_BPF_EVENT                   = 17,
> 
> PERF_RECORD_BPF_EVENT contains minimal information about the BPF program.
> Perf utility (or other user space tools) should listen to this event and
> fetch more details about the event via BPF syscalls
> (BPF_PROG_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, etc.).

Why !? You're failing to explain why it cannot provide the full
information there.
Song Liu Nov. 7, 2018, 6:25 p.m. UTC | #2
> On Nov 7, 2018, at 12:40 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, Nov 06, 2018 at 12:52:42PM -0800, Song Liu wrote:
>> For better performance analysis of BPF programs, this patch introduces
>> PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
>> load/unload information to user space.
>> 
>>        /*
>>         * Record different types of bpf events:
>>         *   enum perf_bpf_event_type {
>>         *      PERF_BPF_EVENT_UNKNOWN          = 0,
>>         *      PERF_BPF_EVENT_PROG_LOAD        = 1,
>>         *      PERF_BPF_EVENT_PROG_UNLOAD      = 2,
>>         *   };
>>         *
>>         * struct {
>>         *      struct perf_event_header header;
>>         *      u16 type;
>>         *      u16 flags;
>>         *      u32 id;  // prog_id or map_id
>>         * };
>>         */
>>        PERF_RECORD_BPF_EVENT                   = 17,
>> 
>> PERF_RECORD_BPF_EVENT contains minimal information about the BPF program.
>> Perf utility (or other user space tools) should listen to this event and
>> fetch more details about the event via BPF syscalls
>> (BPF_PROG_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, etc.).
> 
> Why !? You're failing to explain why it cannot provide the full
> information there.

Aha, I missed this part. I will add the following to the next version.
Please let me know if anything is not clear.

Thanks,
Song



This design was chosen for the following reasons. First, BPF programs
may be loaded-and-JITed and/or unloaded before, during, or after a
perf-record run. Once a BPF program is unloaded, it is impossible to
recover details of the program, so the information cannot be provided
through a simple key (like a build ID). Second, BPF program annotation
is under rapid development, and more fields will be added to
bpf_prog_info over the next few releases. Including all the information
about a BPF program in the perf ring buffer would require frequent
changes to the perf ABI, and thus make it very difficult to manage
compatibility of the perf utility.
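
To make the compatibility point concrete: BPF_OBJ_GET_INFO_BY_FD already
negotiates the struct size via info_len, so bpf_prog_info can grow
without any perf ABI change. A rough sketch of that handshake, with
illustrative names (query_prog_info() is not part of this set; includes
as in the sketch in the patch description):

        #include <linux/bpf.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        static int query_prog_info(int prog_fd, struct bpf_prog_info *info)
        {
                union bpf_attr attr;
                int err;

                memset(&attr, 0, sizeof(attr));
                memset(info, 0, sizeof(*info));
                attr.info.bpf_fd = prog_fd;         /* from BPF_PROG_GET_FD_BY_ID */
                attr.info.info_len = sizeof(*info); /* layout this binary knows */
                attr.info.info = (__u64)(unsigned long)info;

                /*
                 * The kernel copies min(info_len, its own bpf_prog_info size)
                 * bytes and writes that length back to info_len, so fields
                 * appended to bpf_prog_info in later releases do not break
                 * tools built against older headers (or vice versa).
                 */
                err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
                if (!err && attr.info.info_len < sizeof(*info))
                        ; /* older kernel: only the first info_len bytes are valid */
                return err;
        }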

For more information on this topic, please refer to the following 
discussions: 

    https://www.spinics.net/lists/netdev/msg524230.html
Peter Zijlstra Nov. 8, 2018, 3 p.m. UTC | #3
On Wed, Nov 07, 2018 at 06:25:04PM +0000, Song Liu wrote:
> 
> 
> > On Nov 7, 2018, at 12:40 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Tue, Nov 06, 2018 at 12:52:42PM -0800, Song Liu wrote:
> >> For better performance analysis of BPF programs, this patch introduces
> >> PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
> >> load/unload information to user space.
> >> 
> >>        /*
> >>         * Record different types of bpf events:
> >>         *   enum perf_bpf_event_type {
> >>         *      PERF_BPF_EVENT_UNKNOWN          = 0,
> >>         *      PERF_BPF_EVENT_PROG_LOAD        = 1,
> >>         *      PERF_BPF_EVENT_PROG_UNLOAD      = 2,
> >>         *   };
> >>         *
> >>         * struct {
> >>         *      struct perf_event_header header;
> >>         *      u16 type;
> >>         *      u16 flags;
> >>         *      u32 id;  // prog_id or map_id
> >>         * };
> >>         */
> >>        PERF_RECORD_BPF_EVENT                   = 17,
> >> 
> >> PERF_RECORD_BPF_EVENT contains minimal information about the BPF program.
> >> Perf utility (or other user space tools) should listen to this event and
> >> fetch more details about the event via BPF syscalls
> >> (BPF_PROG_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, etc.).
> > 
> > Why !? You're failing to explain why it cannot provide the full
> > information there.
> 
> Aha, I missed this part. I will add the following to the next version.
> Please let me know if anything is not clear.

> 
> This design was chosen for the following reasons. First, BPF programs
> may be loaded-and-JITed and/or unloaded before, during, or after a
> perf-record run. Once a BPF program is unloaded, it is impossible to
> recover details of the program, so the information cannot be provided
> through a simple key (like a build ID). Second, BPF program annotation
> is under rapid development, and more fields will be added to
> bpf_prog_info over the next few releases. Including all the information
> about a BPF program in the perf ring buffer would require frequent
> changes to the perf ABI, and thus make it very difficult to manage
> compatibility of the perf utility.

So I don't agree with that reasoning. If you want symbol information
you'll just have to commit to some form of ABI. That bpf_prog_info is an
ABI too.

And relying on userspace to synchronously consume perf output to
directly call into the kernel again to get more info (through another
ABI) is a pretty terrible design.

So please try harder. NAK on this.
Song Liu Nov. 8, 2018, 6:04 p.m. UTC | #4
Hi Peter,

> On Nov 8, 2018, at 7:00 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Nov 07, 2018 at 06:25:04PM +0000, Song Liu wrote:
>> 
>> 
>>> On Nov 7, 2018, at 12:40 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> On Tue, Nov 06, 2018 at 12:52:42PM -0800, Song Liu wrote:
>>>> For better performance analysis of BPF programs, this patch introduces
>>>> PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
>>>> load/unload information to user space.
>>>> 
>>>>       /*
>>>>        * Record different types of bpf events:
>>>>        *   enum perf_bpf_event_type {
>>>>        *      PERF_BPF_EVENT_UNKNOWN          = 0,
>>>>        *      PERF_BPF_EVENT_PROG_LOAD        = 1,
>>>>        *      PERF_BPF_EVENT_PROG_UNLOAD      = 2,
>>>>        *   };
>>>>        *
>>>>        * struct {
>>>>        *      struct perf_event_header header;
>>>>        *      u16 type;
>>>>        *      u16 flags;
>>>>        *      u32 id;  // prog_id or map_id
>>>>        * };
>>>>        */
>>>>       PERF_RECORD_BPF_EVENT                   = 17,
>>>> 
>>>> PERF_RECORD_BPF_EVENT contains minimal information about the BPF program.
>>>> Perf utility (or other user space tools) should listen to this event and
>>>> fetch more details about the event via BPF syscalls
>>>> (BPF_PROG_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, etc.).
>>> 
>>> Why !? You're failing to explain why it cannot provide the full
>>> information there.
>> 
>> Aha, I missed this part. I will add the following to the next version.
>> Please let me know if anything is not clear.
> 
>> 
>> This design was chosen for the following reasons. First, BPF programs
>> may be loaded-and-JITed and/or unloaded before, during, or after a
>> perf-record run. Once a BPF program is unloaded, it is impossible to
>> recover details of the program, so the information cannot be provided
>> through a simple key (like a build ID). Second, BPF program annotation
>> is under rapid development, and more fields will be added to
>> bpf_prog_info over the next few releases. Including all the information
>> about a BPF program in the perf ring buffer would require frequent
>> changes to the perf ABI, and thus make it very difficult to manage
>> compatibility of the perf utility.
> 
> So I don't agree with that reasoning. If you want symbol information
> you'll just have to commit to some form of ABI. That bpf_prog_info is an
> ABI too.

At the beginning of a perf-record run, perf needs to query bpf_prog_info
for already-loaded BPF programs, so we have to commit to the bpf_prog_info
ABI anyway. If we also included full information about the BPF program in
the perf ring buffer, we would commit to TWO ABIs.

Also, perf-record writes the events to the perf.data file, so the data
needs to be serialized. This is implemented in patch 4/5. To include the
data in the ring buffer, we would need another piece of code in the kernel
to do the same serialization work.

On the other hand, processing BPF load/unload events synchronously should
not introduce too much overhead for meaningful use cases. If many BPF progs
are being loaded/unloaded within a short period of time, that is not the
steady state that profiling work cares about.
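
For reference, the synchronous handling on the perf side amounts to
roughly the sketch below, built against UAPI headers with this patch
applied; handle_record() and synthesize_bpf_prog_info() are illustrative
names, not code from this set:

        #include <linux/perf_event.h>   /* with this patch applied */

        /* illustrative: fetch and record bpf_prog_info for prog_id */
        extern void synthesize_bpf_prog_info(__u32 prog_id);

        struct bpf_event {                      /* layout from this patch */
                struct perf_event_header header;
                __u16 type;                     /* enum perf_bpf_event_type */
                __u16 flags;
                __u32 id;                       /* prog_id */
        };

        static void handle_record(struct perf_event_header *hdr)
        {
                struct bpf_event *e = (struct bpf_event *)hdr;

                if (hdr->type != PERF_RECORD_BPF_EVENT ||
                    e->type != PERF_BPF_EVENT_PROG_LOAD)
                        return;
                /*
                 * This must run before the program can be unloaded, hence
                 * the synchronous consumption discussed above.
                 */
                synthesize_bpf_prog_info(e->id);
        }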

Would this resolve your concerns?

Thanks,
Song
David Ahern Nov. 8, 2018, 6:26 p.m. UTC | #5
On 11/8/18 11:04 AM, Song Liu wrote:
> On the other hand, processing BPF load/unload events synchronously should
> not introduce too much overhead for meaningful use cases. If many BPF progs
> are being loaded/unloaded within a short period of time, that is not the
> steady state that profiling work cares about.

But profiling is not the only use case, and perf-record is common to
those other use cases.

I think that answers why your RFC set does not fork a thread for the bpf
events: you are focused on profiling, either of already-loaded programs or
of a small number of programs loaded by a specific workload started by
perf.
Song Liu Nov. 8, 2018, 6:49 p.m. UTC | #6
Hi David,

> On Nov 8, 2018, at 10:26 AM, David Ahern <dsahern@gmail.com> wrote:
> 
> On 11/8/18 11:04 AM, Song Liu wrote:
>> On the other hand, processing BPF load/unload events synchronously should
>> not introduce too much overhead for meaningful use cases. If many BPF progs
>> are being loaded/unloaded within a short period of time, that is not the
>> steady state that profiling work cares about.
> 
> but, profiling is not the only use case, and perf-record is common with
> those other use cases.
> 
> I think that answers why your RFC set does not fork a thread for the bpf
> events: you are focused on profiling, either of already-loaded programs or
> of a small number of programs loaded by a specific workload started by
> perf.

We sure can fork a thread for the BPF event. But I guess that's not Peter's
main concern here...

Could you please point me to more information about the use cases you worry 
about? I am more than happy to optimize the logic for those use cases. 

Thanks,
Song
David Ahern Nov. 9, 2018, 5:08 p.m. UTC | #7
On 11/8/18 11:49 AM, Song Liu wrote:
> Could you please point me to more information about the use cases you worry 
> about? I am more than happy to optimize the logic for those use cases. 

bpf load and unload are just another tracepoint to throw into a set of
events that are monitored. As mentioned before, auditing the loads and
unloads is one example.

And that brings up another comment: why are you adding a PERF_RECORD_*
rather than a tracepoint? From what I can see, the PERF_RECORD_BPF_EVENT
definition does not include who is loading/unloading a bpf program. That
is important information as well.
Song Liu Nov. 9, 2018, 6:49 p.m. UTC | #8
> On Nov 9, 2018, at 9:08 AM, David Ahern <dsahern@gmail.com> wrote:
> 
> On 11/8/18 11:49 AM, Song Liu wrote:
>> Could you please point me to more information about the use cases you worry 
>> about? I am more than happy to optimize the logic for those use cases. 
> 
> bpf load and unload are just another tracepoint to throw into a set of
> events that are monitored. As mentioned before, auditing the loads and
> unloads is one example.

For the tracepoint approach, we need similar synchronous logic in perf to 
process BPF load/unload events. If we agree this is the right direction, 
I will modify the set with tracepoints. 

> 
> And that brings up another comment: why are you adding a PERF_RECORD_*
> rather than a tracepoint? From what I can see, the PERF_RECORD_BPF_EVENT
> definition does not include who is loading/unloading a bpf program. That
> is important information as well.

bpf_prog_info has "__u32 created_by_uid;" in it, so this is not really
needed in the current version.

Thanks,
Song
Alexei Starovoitov Nov. 9, 2018, 7:14 p.m. UTC | #9
On 11/9/18 10:49 AM, Song Liu wrote:
> 
> 
>> On Nov 9, 2018, at 9:08 AM, David Ahern <dsahern@gmail.com> wrote:
>>
>> On 11/8/18 11:49 AM, Song Liu wrote:
>>> Could you please point me to more information about the use cases you worry
>>> about? I am more than happy to optimize the logic for those use cases.
>>
>> bpf load and unload are just another tracepoint to throw into a set of
>> events that are monitored. As mentioned before, auditing the loads and
>> unloads is one example.
> 
> For the tracepoint approach, we need similar synchronous logic in perf to
> process BPF load/unload events. If we agree this is the right direction,
> I will modify the set with tracepoints.

We had tracepoints in the bpf core. We removed them.
I don't think we'll be going back.

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 53c500f0ca79..a3126fd5b7f1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1113,6 +1113,9 @@  static inline void perf_event_task_sched_out(struct task_struct *prev,
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
+extern void perf_event_bpf_event(enum perf_bpf_event_type type,
+				 u16 flags, u32 id);
+
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1333,6 +1336,8 @@  static inline int perf_unregister_guest_info_callbacks
 (struct perf_guest_info_callbacks *callbacks)				{ return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
+static inline void perf_event_bpf_event(enum perf_bpf_event_type type,
+					u16 flags, u32 id)		{ }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index f35eb72739c0..d51cacb3077a 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -372,7 +372,8 @@  struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				bpf_event      :  1, /* include bpf events */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -963,9 +964,33 @@  enum perf_event_type {
 	 */
 	PERF_RECORD_NAMESPACES			= 16,
 
+	/*
+	 * Record different types of bpf events:
+	 *  enum perf_bpf_event_type {
+	 *     PERF_BPF_EVENT_UNKNOWN		= 0,
+	 *     PERF_BPF_EVENT_PROG_LOAD	= 1,
+	 *     PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	 *  };
+	 *
+	 * struct {
+	 *	struct perf_event_header header;
+	 *	u16 type;
+	 *	u16 flags;
+	 *	u32 id;  // prog_id or map_id
+	 * };
+	 */
+	PERF_RECORD_BPF_EVENT			= 17,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
+enum perf_bpf_event_type {
+	PERF_BPF_EVENT_UNKNOWN		= 0,
+	PERF_BPF_EVENT_PROG_LOAD	= 1,
+	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	PERF_BPF_EVENT_MAX,		/* non-ABI */
+};
+
 #define PERF_MAX_STACK_DEPTH		127
 #define PERF_MAX_CONTEXTS_PER_STACK	  8
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 18e3be193a05..b37051a13be6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1101,9 +1101,12 @@  static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		int prog_id = prog->aux->id;
+
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
+		perf_event_bpf_event(PERF_BPF_EVENT_PROG_UNLOAD, 0, prog_id);
 
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -1441,6 +1444,7 @@  static int bpf_prog_load(union bpf_attr *attr)
 	}
 
 	bpf_prog_kallsyms_add(prog);
+	perf_event_bpf_event(PERF_BPF_EVENT_PROG_LOAD, 0, prog->aux->id);
 	return err;
 
 free_used_maps:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5a97f34bc14c..54667be6669b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -385,6 +385,7 @@  static atomic_t nr_namespaces_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4235,7 +4236,7 @@  static bool is_sb_event(struct perf_event *event)
 
 	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 	    attr->comm || attr->comm_exec ||
-	    attr->task ||
+	    attr->task || attr->bpf_event ||
 	    attr->context_switch)
 		return true;
 	return false;
@@ -4305,6 +4306,8 @@  static void unaccount_event(struct perf_event *event)
 		dec = true;
 	if (has_branch_stack(event))
 		dec = true;
+	if (event->attr.bpf_event)
+		atomic_dec(&nr_bpf_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7650,6 +7653,81 @@  static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+/*
+ * bpf load/unload tracking
+ */
+
+struct perf_bpf_event {
+	struct {
+		struct perf_event_header        header;
+		u16 type;
+		u16 flags;
+		u32 id;
+	} event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+	return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event,
+				   void *data)
+{
+	struct perf_bpf_event *bpf_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int size = bpf_event->event_id.header.size;
+	int ret;
+
+	if (!perf_event_bpf_match(event))
+		return;
+
+	perf_event_header__init_id(&bpf_event->event_id.header, &sample, event);
+	ret = perf_output_begin(&handle, event,
+				bpf_event->event_id.header.size);
+	if (ret)
+		goto out;
+
+	perf_output_put(&handle, bpf_event->event_id);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+out:
+	bpf_event->event_id.header.size = size;
+}
+
+static void perf_event_bpf(struct perf_bpf_event *bpf_event)
+{
+	perf_iterate_sb(perf_event_bpf_output,
+		       bpf_event,
+		       NULL);
+}
+
+void perf_event_bpf_event(enum perf_bpf_event_type type, u16 flags, u32 id)
+{
+	struct perf_bpf_event bpf_event;
+
+	if (!atomic_read(&nr_bpf_events))
+		return;
+
+	if (type <= PERF_BPF_EVENT_UNKNOWN || type >= PERF_BPF_EVENT_MAX)
+		return;
+
+	bpf_event = (struct perf_bpf_event){
+		.event_id = {
+			.header = {
+				.type = PERF_RECORD_BPF_EVENT,
+				.size = sizeof(bpf_event.event_id),
+			},
+			.type = type,
+			.flags = flags,
+			.id = id,
+		},
+	};
+	perf_event_bpf(&bpf_event);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -9871,6 +9949,8 @@  static void account_event(struct perf_event *event)
 		inc = true;
 	if (is_cgroup_event(event))
 		inc = true;
+	if (event->attr.bpf_event)
+		atomic_inc(&nr_bpf_events);
 
 	if (inc) {
 		/*