diff mbox

[net-next,v3,1/2] bpf: add support for sys_enter_* and sys_exit_* tracepoints

Message ID 20170803162951.1564963-2-yhs@fb.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Yonghong Song Aug. 3, 2017, 4:29 p.m. UTC
Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_*
style tracepoints. The iovisor/bcc issue #748
(https://github.com/iovisor/bcc/issues/748) documents this issue.
For example, if you try to attach a bpf program to tracepoints
syscalls/sys_enter_newfstat, you will get the following error:
   # ./tools/trace.py t:syscalls:sys_enter_newfstat
   Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument
   Failed to attach BPF to tracepoint

The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_*
tracepoints are treated differently from other tracepoints and there
is no bpf hook to it.

This patch adds bpf support for these syscalls tracepoints by
  . permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF
  . calling bpf programs in perf_syscall_enter and perf_syscall_exit

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/syscalls.h      | 12 ++++++++++
 kernel/events/core.c          |  8 ++++---
 kernel/trace/trace_syscalls.c | 53 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 68 insertions(+), 5 deletions(-)

Comments

Alexei Starovoitov Aug. 4, 2017, 2:08 a.m. UTC | #1
On 8/3/17 6:29 AM, Yonghong Song wrote:
> @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  	if (!sys_data)
>  		return;
>
> +	prog = READ_ONCE(sys_data->enter_event->prog);
>  	head = this_cpu_ptr(sys_data->enter_event->perf_events);
> -	if (hlist_empty(head))
> +	if (!prog && hlist_empty(head))
>  		return;
>
>  	/* get the size after alignment with the u32 buffer size field */
> @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  	rec->nr = syscall_nr;
>  	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
>  			       (unsigned long *)&rec->args);
> +
> +	if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
> +	    hlist_empty(head)) {
> +		perf_swevent_put_recursion_context(rctx);
> +		return;
> +	}

hmm. if I read the patch correctly that makes it different from
kprobe/uprobe/tracepoints+bpf behavior. Why make it different and
force user space to perf_event_open() on every cpu?
In other cases it's the job of the bpf program to filter by cpu
if necessary and that is well understood by bcc scripts.
Y Song Aug. 4, 2017, 3:09 a.m. UTC | #2
On Thu, Aug 3, 2017 at 7:08 PM, Alexei Starovoitov <ast@fb.com> wrote:
> On 8/3/17 6:29 AM, Yonghong Song wrote:
>>
>> @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct
>> pt_regs *regs, long id)
>>         if (!sys_data)
>>                 return;
>>
>> +       prog = READ_ONCE(sys_data->enter_event->prog);
>>         head = this_cpu_ptr(sys_data->enter_event->perf_events);
>> -       if (hlist_empty(head))
>> +       if (!prog && hlist_empty(head))
>>                 return;
>>
>>         /* get the size after alignment with the u32 buffer size field */
>> @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct
>> pt_regs *regs, long id)
>>         rec->nr = syscall_nr;
>>         syscall_get_arguments(current, regs, 0, sys_data->nb_args,
>>                                (unsigned long *)&rec->args);
>> +
>> +       if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
>> +           hlist_empty(head)) {
>> +               perf_swevent_put_recursion_context(rctx);
>> +               return;
>> +       }
>
>
> hmm. if I read the patch correctly that makes it different from
> kprobe/uprobe/tracepoints+bpf behavior. Why make it different and
> force user space to perf_event_open() on every cpu?
> In other cases it's the job of the bpf program to filter by cpu
> if necessary and that is well understood by bcc scripts.

The patch actually does allow the bpf program to track all cpus.
The test:
>> +       if (!prog && hlist_empty(head))
>>                 return;
ensures that if prog is not empty, it will not return even if the
event in the current cpu is empty. Later on, perf_call_bpf_enter will
be called if prog is not empty. This ensures that
the bpf program will execute regardless of the current cpu.

Maybe I missed anything here?
Alexei Starovoitov Aug. 4, 2017, 6:40 p.m. UTC | #3
On 8/3/17 5:09 PM, Y Song wrote:
> On Thu, Aug 3, 2017 at 7:08 PM, Alexei Starovoitov <ast@fb.com> wrote:
>> On 8/3/17 6:29 AM, Yonghong Song wrote:
>>>
>>> @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct
>>> pt_regs *regs, long id)
>>>         if (!sys_data)
>>>                 return;
>>>
>>> +       prog = READ_ONCE(sys_data->enter_event->prog);
>>>         head = this_cpu_ptr(sys_data->enter_event->perf_events);
>>> -       if (hlist_empty(head))
>>> +       if (!prog && hlist_empty(head))
>>>                 return;
>>>
>>>         /* get the size after alignment with the u32 buffer size field */
>>> @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct
>>> pt_regs *regs, long id)
>>>         rec->nr = syscall_nr;
>>>         syscall_get_arguments(current, regs, 0, sys_data->nb_args,
>>>                                (unsigned long *)&rec->args);
>>> +
>>> +       if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
>>> +           hlist_empty(head)) {
>>> +               perf_swevent_put_recursion_context(rctx);
>>> +               return;
>>> +       }
>>
>>
>> hmm. if I read the patch correctly that makes it different from
>> kprobe/uprobe/tracepoints+bpf behavior. Why make it different and
>> force user space to perf_event_open() on every cpu?
>> In other cases it's the job of the bpf program to filter by cpu
>> if necessary and that is well understood by bcc scripts.
>
> The patch actually does allow the bpf program to track all cpus.
> The test:
>>> +       if (!prog && hlist_empty(head))
>>>                 return;
> ensures that if prog is not empty, it will not return even if the
> event in the current cpu is empty. Later on, perf_call_bpf_enter will
> be called if prog is not empty. This ensures that
> the bpf program will execute regardless of the current cpu.
>
> Maybe I missed anything here?

you're right. sorry. misread && for ||.
That part looks good indeed.

Another question...
that part:
         if (is_tracepoint) {
                 int off = trace_event_get_offsets(event->tp_event);

                 if (prog->aux->max_ctx_offset > off) {
seems to be not used in this new path...
or new is_syscall_tp is also is_tracepoint ?
If so, then it's ok...
and trace_event_get_offsets() returns the actual number
of syscall args or always upper bound of 6?
just curious how this new code checks that bpf prog cannot
access args[6+].

Thanks!
Yonghong Song Aug. 4, 2017, 6:50 p.m. UTC | #4
On 8/4/17 11:40 AM, Alexei Starovoitov wrote:
> On 8/3/17 5:09 PM, Y Song wrote:
>> On Thu, Aug 3, 2017 at 7:08 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>> On 8/3/17 6:29 AM, Yonghong Song wrote:
>>>>
>>>> @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct
>>>> pt_regs *regs, long id)
>>>>         if (!sys_data)
>>>>                 return;
>>>>
>>>> +       prog = READ_ONCE(sys_data->enter_event->prog);
>>>>         head = this_cpu_ptr(sys_data->enter_event->perf_events);
>>>> -       if (hlist_empty(head))
>>>> +       if (!prog && hlist_empty(head))
>>>>                 return;
>>>>
>>>>         /* get the size after alignment with the u32 buffer size 
>>>> field */
>>>> @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, 
>>>> struct
>>>> pt_regs *regs, long id)
>>>>         rec->nr = syscall_nr;
>>>>         syscall_get_arguments(current, regs, 0, sys_data->nb_args,
>>>>                                (unsigned long *)&rec->args);
>>>> +
>>>> +       if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, 
>>>> rec)) ||
>>>> +           hlist_empty(head)) {
>>>> +               perf_swevent_put_recursion_context(rctx);
>>>> +               return;
>>>> +       }
>>>
>>>
>>> hmm. if I read the patch correctly that makes it different from
>>> kprobe/uprobe/tracepoints+bpf behavior. Why make it different and
>>> force user space to perf_event_open() on every cpu?
>>> In other cases it's the job of the bpf program to filter by cpu
>>> if necessary and that is well understood by bcc scripts.
>>
>> The patch actually does allow the bpf program to track all cpus.
>> The test:
>>>> +       if (!prog && hlist_empty(head))
>>>>                 return;
>> ensures that if prog is not empty, it will not return even if the
>> event in the current cpu is empty. Later on, perf_call_bpf_enter will
>> be called if prog is not empty. This ensures that
>> the bpf program will execute regardless of the current cpu.
>>
>> Maybe I missed anything here?
> 
> you're right. sorry. misread && for ||.
> That part looks good indeed.
> 
> Another question...
> that part:
>          if (is_tracepoint) {
>                  int off = trace_event_get_offsets(event->tp_event);
> 
>                  if (prog->aux->max_ctx_offset > off) {
> seems to be not used in this new path...
> or new is_syscall_tp is also is_tracepoint ?

Good catch! I think I need "is_tracepoint || is_syscall_tp" here.
If trace_event_get_offsets can get the correct offset for the current
particular syscall_{enter|exit}_* event, we will be find.
I will double check this and have another patch.

> If so, then it's ok...
> and trace_event_get_offsets() returns the actual number
> of syscall args or always upper bound of 6?

Since the specific event is fed here, I think the actual number
will be returned.

> just curious how this new code checks that bpf prog cannot
> access args[6+].
> 
> Thanks!
>
diff mbox

Patch

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3cb15ea..c917021 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -172,8 +172,20 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	static struct syscall_metadata __used			\
 	  __attribute__((section("__syscalls_metadata")))	\
 	 *__p_syscall_meta_##sname = &__syscall_meta_##sname;
+
+static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
+{
+	return tp_event->class == &event_class_syscall_enter ||
+	       tp_event->class == &event_class_syscall_exit;
+}
+
 #else
 #define SYSCALL_METADATA(sname, nb, ...)
+
+static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
+{
+	return 0;
+}
 #endif
 
 #define SYSCALL_DEFINE0(sname)					\
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ff..750b8d3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8050,7 +8050,7 @@  static void perf_event_free_bpf_handler(struct perf_event *event)
 
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
-	bool is_kprobe, is_tracepoint;
+	bool is_kprobe, is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -8061,7 +8061,8 @@  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
-	if (!is_kprobe && !is_tracepoint)
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+	if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
 		/* bpf programs can only be attached to u/kprobe or tracepoint */
 		return -EINVAL;
 
@@ -8070,7 +8071,8 @@  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return PTR_ERR(prog);
 
 	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
-	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+	    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395..3bd9e1c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,11 +559,29 @@  static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
+static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
+			      struct syscall_metadata *sys_data,
+			      struct syscall_trace_enter *rec) {
+	struct syscall_tp_t {
+		unsigned long long regs;
+		unsigned long syscall_nr;
+		unsigned long args[6]; /* maximum 6 arguments */
+	} param;
+	int i;
+
+	*(struct pt_regs **)&param = regs;
+	param.syscall_nr = rec->nr;
+	for (i = 0; i < sys_data->nb_args && i < 6; i++)
+		param.args[i] = rec->args[i];
+	return trace_call_bpf(prog, &param);
+}
+
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -578,8 +596,9 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
+	prog = READ_ONCE(sys_data->enter_event->prog);
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	if (hlist_empty(head))
+	if (!prog && hlist_empty(head))
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
@@ -594,6 +613,13 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
+
+	if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+	    hlist_empty(head)) {
+		perf_swevent_put_recursion_context(rctx);
+		return;
+	}
+
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
 			      head, NULL);
@@ -633,11 +659,26 @@  static void perf_sysenter_disable(struct trace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
+static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
+			      struct syscall_trace_exit *rec) {
+	struct syscall_tp_t {
+		unsigned long long regs;
+		unsigned long syscall_nr;
+		unsigned long ret;
+	} param;
+
+	*(struct pt_regs **)&param = regs;
+	param.syscall_nr = rec->nr;
+	param.ret = rec->ret;
+	return trace_call_bpf(prog, &param);
+}
+
 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -652,8 +693,9 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
+	prog = READ_ONCE(sys_data->exit_event->prog);
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	if (hlist_empty(head))
+	if (!prog && hlist_empty(head))
 		return;
 
 	/* We can probably do that at build time */
@@ -666,6 +708,13 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
+
+	if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+	    hlist_empty(head)) {
+		perf_swevent_put_recursion_context(rctx);
+		return;
+	}
+
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
 			      1, regs, head, NULL);
 }