diff mbox

[net-next,v2,1/2] bpf: add support for sys_enter_* and sys_exit_* tracepoints

Message ID 20170803052828.2303723-2-yhs@fb.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Yonghong Song Aug. 3, 2017, 5:28 a.m. UTC
Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_*
style tracepoints. The iovisor/bcc issue #748
(https://github.com/iovisor/bcc/issues/748) documents this issue.
For example, if you try to attach a bpf program to tracepoints
syscalls/sys_enter_newfstat, you will get the following error:
   # ./tools/trace.py t:syscalls:sys_enter_newfstat
   Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument
   Failed to attach BPF to tracepoint

The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_*
tracepoints are treated differently from other tracepoints, and there
is no bpf hook for them.

This patch adds bpf support for these syscall tracepoints by
  . permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF
  . calling bpf programs in perf_syscall_enter and perf_syscall_exit

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/syscalls.h      |  6 +++++
 kernel/events/core.c          |  8 ++++---
 kernel/trace/trace_syscalls.c | 53 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 5 deletions(-)

Comments

Peter Zijlstra Aug. 3, 2017, 8:08 a.m. UTC | #1
On Wed, Aug 02, 2017 at 10:28:27PM -0700, Yonghong Song wrote:
> Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_*
> style tracepoints. The iovisor/bcc issue #748
> (https://github.com/iovisor/bcc/issues/748) documents this issue.
> For example, if you try to attach a bpf program to tracepoints
> syscalls/sys_enter_newfstat, you will get the following error:
>    # ./tools/trace.py t:syscalls:sys_enter_newfstat
>    Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument
>    Failed to attach BPF to tracepoint
> 
> The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_*
> tracepoints are treated differently from other tracepoints and there
> is no bpf hook to it.
> 
> This patch adds bpf support for these syscalls tracepoints by
>   . permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF
>   . calling bpf programs in perf_syscall_enter and perf_syscall_exit
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>

Ack for the perf bits, but you should've Cc'ed Steve too I suppose.

> ---
>  include/linux/syscalls.h      |  6 +++++
>  kernel/events/core.c          |  8 ++++---
>  kernel/trace/trace_syscalls.c | 53 +++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 62 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 3cb15ea..00fa3eb 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -117,6 +117,12 @@ extern struct trace_event_class event_class_syscall_exit;
>  extern struct trace_event_functions enter_syscall_print_funcs;
>  extern struct trace_event_functions exit_syscall_print_funcs;
>  
> +static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
> +{
> +	return tp_event->class == &event_class_syscall_enter ||
> +	       tp_event->class == &event_class_syscall_exit;
> +}
> +
>  #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
>  	static struct syscall_metadata __syscall_meta_##sname;		\
>  	static struct trace_event_call __used				\
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 426c2ff..750b8d3 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -8050,7 +8050,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
>  
>  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
>  {
> -	bool is_kprobe, is_tracepoint;
> +	bool is_kprobe, is_tracepoint, is_syscall_tp;
>  	struct bpf_prog *prog;
>  
>  	if (event->attr.type != PERF_TYPE_TRACEPOINT)
> @@ -8061,7 +8061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
>  
>  	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
>  	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
> -	if (!is_kprobe && !is_tracepoint)
> +	is_syscall_tp = is_syscall_trace_event(event->tp_event);
> +	if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
>  		/* bpf programs can only be attached to u/kprobe or tracepoint */
>  		return -EINVAL;
>  
> @@ -8070,7 +8071,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
>  		return PTR_ERR(prog);
>  
>  	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
> -	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
> +	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
> +	    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
>  		/* valid fd, but invalid bpf program type */
>  		bpf_prog_put(prog);
>  		return -EINVAL;
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index 5e10395..3bd9e1c 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
>  static int sys_perf_refcount_enter;
>  static int sys_perf_refcount_exit;
>  
> +static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
> +			      struct syscall_metadata *sys_data,
> +			      struct syscall_trace_enter *rec) {
> +	struct syscall_tp_t {
> +		unsigned long long regs;
> +		unsigned long syscall_nr;
> +		unsigned long args[6]; /* maximum 6 arguments */
> +	} param;
> +	int i;
> +
> +	*(struct pt_regs **)&param = regs;
> +	param.syscall_nr = rec->nr;
> +	for (i = 0; i < sys_data->nb_args && i < 6; i++)
> +		param.args[i] = rec->args[i];
> +	return trace_call_bpf(prog, &param);
> +}
> +
>  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  {
>  	struct syscall_metadata *sys_data;
>  	struct syscall_trace_enter *rec;
>  	struct hlist_head *head;
> +	struct bpf_prog *prog;
>  	int syscall_nr;
>  	int rctx;
>  	int size;
> @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  	if (!sys_data)
>  		return;
>  
> +	prog = READ_ONCE(sys_data->enter_event->prog);
>  	head = this_cpu_ptr(sys_data->enter_event->perf_events);
> -	if (hlist_empty(head))
> +	if (!prog && hlist_empty(head))
>  		return;
>  
>  	/* get the size after alignment with the u32 buffer size field */
> @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  	rec->nr = syscall_nr;
>  	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
>  			       (unsigned long *)&rec->args);
> +
> +	if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
> +	    hlist_empty(head)) {
> +		perf_swevent_put_recursion_context(rctx);
> +		return;
> +	}
> +
>  	perf_trace_buf_submit(rec, size, rctx,
>  			      sys_data->enter_event->event.type, 1, regs,
>  			      head, NULL);
> @@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call)
>  	mutex_unlock(&syscall_trace_lock);
>  }
>  
> +static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
> +			      struct syscall_trace_exit *rec) {
> +	struct syscall_tp_t {
> +		unsigned long long regs;
> +		unsigned long syscall_nr;
> +		unsigned long ret;
> +	} param;
> +
> +	*(struct pt_regs **)&param = regs;
> +	param.syscall_nr = rec->nr;
> +	param.ret = rec->ret;
> +	return trace_call_bpf(prog, &param);
> +}
> +
>  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>  {
>  	struct syscall_metadata *sys_data;
>  	struct syscall_trace_exit *rec;
>  	struct hlist_head *head;
> +	struct bpf_prog *prog;
>  	int syscall_nr;
>  	int rctx;
>  	int size;
> @@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>  	if (!sys_data)
>  		return;
>  
> +	prog = READ_ONCE(sys_data->exit_event->prog);
>  	head = this_cpu_ptr(sys_data->exit_event->perf_events);
> -	if (hlist_empty(head))
> +	if (!prog && hlist_empty(head))
>  		return;
>  
>  	/* We can probably do that at build time */
> @@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>  
>  	rec->nr = syscall_nr;
>  	rec->ret = syscall_get_return_value(current, regs);
> +
> +	if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
> +	    hlist_empty(head)) {
> +		perf_swevent_put_recursion_context(rctx);
> +		return;
> +	}
> +
>  	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
>  			      1, regs, head, NULL);
>  }
> -- 
> 2.9.4
>
kernel test robot Aug. 3, 2017, 1:47 p.m. UTC | #2
Hi Yonghong,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Yonghong-Song/bpf-add-support-for-sys_-enter-exit-_-tracepoints/20170803-213504
config: i386-randconfig-x019-201731 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   kernel/events/core.c: In function 'perf_event_set_bpf_prog':
>> kernel/events/core.c:8073:18: error: implicit declaration of function 'is_syscall_trace_event' [-Werror=implicit-function-declaration]
     is_syscall_tp = is_syscall_trace_event(event->tp_event);
                     ^~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/is_syscall_trace_event +8073 kernel/events/core.c

  8059	
  8060	static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  8061	{
  8062		bool is_kprobe, is_tracepoint, is_syscall_tp;
  8063		struct bpf_prog *prog;
  8064	
  8065		if (event->attr.type != PERF_TYPE_TRACEPOINT)
  8066			return perf_event_set_bpf_handler(event, prog_fd);
  8067	
  8068		if (event->tp_event->prog)
  8069			return -EEXIST;
  8070	
  8071		is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
  8072		is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
> 8073		is_syscall_tp = is_syscall_trace_event(event->tp_event);
  8074		if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
  8075			/* bpf programs can only be attached to u/kprobe or tracepoint */
  8076			return -EINVAL;
  8077	
  8078		prog = bpf_prog_get(prog_fd);
  8079		if (IS_ERR(prog))
  8080			return PTR_ERR(prog);
  8081	
  8082		if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
  8083		    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
  8084		    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
  8085			/* valid fd, but invalid bpf program type */
  8086			bpf_prog_put(prog);
  8087			return -EINVAL;
  8088		}
  8089	
  8090		if (is_tracepoint) {
  8091			int off = trace_event_get_offsets(event->tp_event);
  8092	
  8093			if (prog->aux->max_ctx_offset > off) {
  8094				bpf_prog_put(prog);
  8095				return -EACCES;
  8096			}
  8097		}
  8098		event->tp_event->prog = prog;
  8099	
  8100		return 0;
  8101	}
  8102	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Yonghong Song Aug. 3, 2017, 3:22 p.m. UTC | #3
On 8/3/17 1:08 AM, Peter Zijlstra wrote:
> On Wed, Aug 02, 2017 at 10:28:27PM -0700, Yonghong Song wrote:
>> Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_*
>> style tracepoints. The iovisor/bcc issue #748
>> (https://github.com/iovisor/bcc/issues/748) documents this issue.
>> For example, if you try to attach a bpf program to tracepoints
>> syscalls/sys_enter_newfstat, you will get the following error:
>>     # ./tools/trace.py t:syscalls:sys_enter_newfstat
>>     Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument
>>     Failed to attach BPF to tracepoint
>>
>> The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_*
>> tracepoints are treated differently from other tracepoints and there
>> is no bpf hook to it.
>>
>> This patch adds bpf support for these syscalls tracepoints by
>>    . permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF
>>    . calling bpf programs in perf_syscall_enter and perf_syscall_exit
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
> 
> Ack for the perf bits, but you should've Cc'ed steve too I suppose.

Thanks, Peter. This is the first time I have posted tracing-related changes.
I will be sure to remember this next time.

There is a build error:
======
    kernel/events/core.c: In function 'perf_event_set_bpf_prog':
 >> kernel/events/core.c:8073:18: error: implicit declaration of function 'is_syscall_trace_event' [-Werror=implicit-function-declaration]
      is_syscall_tp = is_syscall_trace_event(event->tp_event);
======

Will address this and send another patch soon.
diff mbox

Patch

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3cb15ea..00fa3eb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -117,6 +117,12 @@  extern struct trace_event_class event_class_syscall_exit;
 extern struct trace_event_functions enter_syscall_print_funcs;
 extern struct trace_event_functions exit_syscall_print_funcs;
 
+static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
+{
+	return tp_event->class == &event_class_syscall_enter ||
+	       tp_event->class == &event_class_syscall_exit;
+}
+
 #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
 	static struct syscall_metadata __syscall_meta_##sname;		\
 	static struct trace_event_call __used				\
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ff..750b8d3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8050,7 +8050,7 @@  static void perf_event_free_bpf_handler(struct perf_event *event)
 
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
-	bool is_kprobe, is_tracepoint;
+	bool is_kprobe, is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -8061,7 +8061,8 @@  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
-	if (!is_kprobe && !is_tracepoint)
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+	if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
 		/* bpf programs can only be attached to u/kprobe or tracepoint */
 		return -EINVAL;
 
@@ -8070,7 +8071,8 @@  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return PTR_ERR(prog);
 
 	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
-	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+	    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395..3bd9e1c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,11 +559,29 @@  static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
+static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
+			      struct syscall_metadata *sys_data,
+			      struct syscall_trace_enter *rec) {
+	struct syscall_tp_t {
+		unsigned long long regs;
+		unsigned long syscall_nr;
+		unsigned long args[6]; /* maximum 6 arguments */
+	} param;
+	int i;
+
+	*(struct pt_regs **)&param = regs;
+	param.syscall_nr = rec->nr;
+	for (i = 0; i < sys_data->nb_args && i < 6; i++)
+		param.args[i] = rec->args[i];
+	return trace_call_bpf(prog, &param);
+}
+
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -578,8 +596,9 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
+	prog = READ_ONCE(sys_data->enter_event->prog);
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	if (hlist_empty(head))
+	if (!prog && hlist_empty(head))
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
@@ -594,6 +613,13 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
+
+	if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+	    hlist_empty(head)) {
+		perf_swevent_put_recursion_context(rctx);
+		return;
+	}
+
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
 			      head, NULL);
@@ -633,11 +659,26 @@  static void perf_sysenter_disable(struct trace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
+static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
+			      struct syscall_trace_exit *rec) {
+	struct syscall_tp_t {
+		unsigned long long regs;
+		unsigned long syscall_nr;
+		unsigned long ret;
+	} param;
+
+	*(struct pt_regs **)&param = regs;
+	param.syscall_nr = rec->nr;
+	param.ret = rec->ret;
+	return trace_call_bpf(prog, &param);
+}
+
 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -652,8 +693,9 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
+	prog = READ_ONCE(sys_data->exit_event->prog);
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	if (hlist_empty(head))
+	if (!prog && hlist_empty(head))
 		return;
 
 	/* We can probably do that at build time */
@@ -666,6 +708,13 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
+
+	if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+	    hlist_empty(head)) {
+		perf_swevent_put_recursion_context(rctx);
+		return;
+	}
+
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
 			      1, regs, head, NULL);
 }