
[RFC,bpf-next,v2,1/3] bpf: implement bpf_send_signal() helper

Message ID 20190503000806.1340997-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Series bpf: implement bpf_send_signal() helper

Commit Message

Yonghong Song May 3, 2019, 12:08 a.m. UTC
This patch tries to solve the following specific use case.

Currently, a bpf program can already collect stack traces
when certain events happen (e.g., a cache miss counter or
cpu clock counter overflows). These stack traces can be
used for performance analysis. For jitted programs, e.g.,
hhvm (jitted PHP), it is very hard to get the true stack
trace in the bpf program due to jit complexity.

To resolve this issue, hhvm implements a signal handler,
e.g. for SIGALRM, and a set of program locations at which
it can dump stack traces. When it receives a signal, it
dumps the stack at the next such program location.

The current way to handle this use case is:
  . The profiler installs a bpf program and polls on a map.
    When a certain event happens, the bpf program writes to the map.
  . Upon receiving the information from the map, the profiler
    sends a signal to hhvm.
This method can introduce large delays, causing skewed
profiling results.

This patch implements a bpf_send_signal() helper to send a
signal to hhvm in real time, yielding the intended stack traces.
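
For illustration only (not part of this patch), a minimal sketch of
a profiler-side bpf program using the new helper; the target pid,
program name, and the hand-declared helper are placeholders:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical sketch: a perf_event program that signals the
 * profiled process directly when the sampling counter overflows,
 * instead of round-tripping through a map.
 */
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include "bpf_helpers.h"

/* the new helper from this patch, declared by hand until the
 * selftests header catches up
 */
static int (*bpf_send_signal)(__u32 pid, __u32 sig) =
	(void *) BPF_FUNC_send_signal;

#define TARGET_PID	1234	/* placeholder: pid of hhvm */
#define SIG_ALRM	14	/* SIGALRM */

SEC("perf_event")
int perf_event_signal(struct bpf_perf_event_data *ctx)
{
	/* on a PMU overflow this runs in nmi context, where the
	 * helper queues the signal via irq_work
	 */
	bpf_send_signal(TARGET_PID, SIG_ALRM);
	return 0;
}

char _license[] SEC("license") = "GPL";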

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h | 15 ++++++-
 kernel/trace/bpf_trace.c | 85 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 1 deletion(-)

v1 -> v2:
 fixed a compilation warning (a missing return value in one case)
 reported by the kbuild test robot.
 added the Reported-by tag above to notify the bot.

Comments

Alexei Starovoitov May 5, 2019, 7:29 a.m. UTC | #1
On Thu, May 2, 2019 at 5:08 PM Yonghong Song <yhs@fb.com> wrote:
>
> This patch tries to solve the following specific use case.
>
> Currently, a bpf program can already collect stack traces
> when certain events happen (e.g., a cache miss counter or
> cpu clock counter overflows). These stack traces can be
> used for performance analysis. For jitted programs, e.g.,
> hhvm (jitted PHP), it is very hard to get the true stack
> trace in the bpf program due to jit complexity.
>
> To resolve this issue, hhvm implements a signal handler,
> e.g. for SIGALRM, and a set of program locations at which
> it can dump stack traces. When it receives a signal, it
> dumps the stack at the next such program location.
>
> The current way to handle this use case is:
>   . The profiler installs a bpf program and polls on a map.
>     When a certain event happens, the bpf program writes to the map.
>   . Upon receiving the information from the map, the profiler
>     sends a signal to hhvm.
> This method can introduce large delays, causing skewed
> profiling results.
>
> This patch implements a bpf_send_signal() helper to send a
> signal to hhvm in real time, yielding the intended stack traces.
>
> Reported-by: kbuild test robot <lkp@intel.com>

A v2 addressing a buildbot issue doesn't need to have such a tag.

> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/uapi/linux/bpf.h | 15 ++++++-
>  kernel/trace/bpf_trace.c | 85 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 99 insertions(+), 1 deletion(-)
>
> v1 -> v2:
>  fixed a compilation warning (a missing return value in one case)
>  reported by the kbuild test robot.
>  added the Reported-by tag above to notify the bot.
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 72336bac7573..e3e824848335 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2667,6 +2667,18 @@ union bpf_attr {
>   *             0 on success.
>   *
>   *             **-ENOENT** if the bpf-local-storage cannot be found.
> + *
> + * int bpf_send_signal(u32 pid, u32 sig)
> + *     Description
> + *             Send signal *sig* to *pid*.
> + *     Return
> + *             0 on success or successfully queued.
> + *
> + *             **-ENOENT** if *pid* cannot be found.
> + *
> + *             **-EBUSY** if work queue under nmi is full.
> + *
> + *             Other negative values for actual signal sending errors.
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -2777,7 +2789,8 @@ union bpf_attr {
>         FN(strtol),                     \
>         FN(strtoul),                    \
>         FN(sk_storage_get),             \
> -       FN(sk_storage_delete),
> +       FN(sk_storage_delete),          \
> +       FN(send_signal),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 8607aba1d882..49a804d59bfb 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -559,6 +559,76 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
>         .arg3_type      = ARG_ANYTHING,
>  };
>
> +struct send_signal_irq_work {
> +       struct irq_work irq_work;
> +       u32 pid;
> +       u32 sig;
> +};
> +
> +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
> +
> +static int do_bpf_send_signal_pid(u32 pid, u32 sig)
> +{
> +       struct task_struct *task;
> +       struct pid *pidp;
> +
> +       pidp = find_vpid(pid);

find_vpid() takes the pidns from current, which should be valid;
this makes the bpf prog depend on current, but from nmi
context there are no guarantees.
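
(For reference, find_vpid() is defined in kernel/pid.c as a thin
wrapper that resolves the number in the current task's pid
namespace, which is where the dependency on current comes from:)

struct pid *find_vpid(int nr)
{
	return find_pid_ns(nr, task_active_pid_ns(current));
}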
I think find_pid_ns() would be cleaner, but then the question
would be how to pass pidns...
Another issue is the instability of a pid as an integer...
upcoming pidfd could be the answer.
At this point I think it's cleaner to make such an api send a signal
to an existing process only under the same conditions as in bpf_probe_write_user.
Would that work for your use case?
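
(For context, the bpf_probe_write_user conditions referred to above
look roughly like this in kernel/trace/bpf_trace.c of this era; the
helper refuses to run unless it is in a sane user task context:)

	if (unlikely(in_interrupt() ||
		     current->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;
	if (unlikely(uaccess_kernel()))
		return -EPERM;
	if (!access_ok(unsafe_ptr, size))
		return -EPERM;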
Yonghong Song May 5, 2019, 5:27 p.m. UTC | #2
On 5/5/19 12:29 AM, Alexei Starovoitov wrote:
> On Thu, May 2, 2019 at 5:08 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> This patch tries to solve the following specific use case.
>>
>> Currently, a bpf program can already collect stack traces
>> when certain events happen (e.g., a cache miss counter or
>> cpu clock counter overflows). These stack traces can be
>> used for performance analysis. For jitted programs, e.g.,
>> hhvm (jitted PHP), it is very hard to get the true stack
>> trace in the bpf program due to jit complexity.
>>
>> To resolve this issue, hhvm implements a signal handler,
>> e.g. for SIGALRM, and a set of program locations at which
>> it can dump stack traces. When it receives a signal, it
>> dumps the stack at the next such program location.
>>
>> The current way to handle this use case is:
>>    . The profiler installs a bpf program and polls on a map.
>>      When a certain event happens, the bpf program writes to the map.
>>    . Upon receiving the information from the map, the profiler
>>      sends a signal to hhvm.
>> This method can introduce large delays, causing skewed
>> profiling results.
>>
>> This patch implements a bpf_send_signal() helper to send a
>> signal to hhvm in real time, yielding the intended stack traces.
>>
>> Reported-by: kbuild test robot <lkp@intel.com>
> 
> A v2 addressing a buildbot issue doesn't need to have such a tag.

I will drop this in the next revision. The intention, as suggested
by the buildbot email, is to give credit to the bot and let it know
the bug is taken care of.

> 
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/uapi/linux/bpf.h | 15 ++++++-
>>   kernel/trace/bpf_trace.c | 85 ++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 99 insertions(+), 1 deletion(-)
>>
>> v1 -> v2:
>>   fixed a compilation warning (a missing return value in one case)
>>   reported by the kbuild test robot.
>>   added the Reported-by tag above to notify the bot.
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 72336bac7573..e3e824848335 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2667,6 +2667,18 @@ union bpf_attr {
>>    *             0 on success.
>>    *
>>    *             **-ENOENT** if the bpf-local-storage cannot be found.
>> + *
>> + * int bpf_send_signal(u32 pid, u32 sig)
>> + *     Description
>> + *             Send signal *sig* to *pid*.
>> + *     Return
>> + *             0 on success or successfully queued.
>> + *
>> + *             **-ENOENT** if *pid* cannot be found.
>> + *
>> + *             **-EBUSY** if work queue under nmi is full.
>> + *
>> + *             Other negative values for actual signal sending errors.
>>    */
>>   #define __BPF_FUNC_MAPPER(FN)          \
>>          FN(unspec),                     \
>> @@ -2777,7 +2789,8 @@ union bpf_attr {
>>          FN(strtol),                     \
>>          FN(strtoul),                    \
>>          FN(sk_storage_get),             \
>> -       FN(sk_storage_delete),
>> +       FN(sk_storage_delete),          \
>> +       FN(send_signal),
>>
>>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>>    * function eBPF program intends to call
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index 8607aba1d882..49a804d59bfb 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -559,6 +559,76 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
>>          .arg3_type      = ARG_ANYTHING,
>>   };
>>
>> +struct send_signal_irq_work {
>> +       struct irq_work irq_work;
>> +       u32 pid;
>> +       u32 sig;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
>> +
>> +static int do_bpf_send_signal_pid(u32 pid, u32 sig)
>> +{
>> +       struct task_struct *task;
>> +       struct pid *pidp;
>> +
>> +       pidp = find_vpid(pid);
> 
> find_vpid() takes the pidns from current, which should be valid;
> this makes the bpf prog depend on current, but from nmi
> context there are no guarantees.

This is right. For nmi, I have found that some fields of "current"
may not be valid.

> I think find_pid_ns() would be cleaner, but then the question
> would be how to pass pidns...

The "current" is mostly used to find pid namespace 
`task_active_pid_ns(current)`. Yes, we can use find_pid_ns(), but we
need to get pid_namespace in advance then pid, maybe by introducing
a new map or something, or may be directly using the below pidfd
based approach to hold the reference count in a map.

> Another issue is the instability of a pid as an integer...
> upcoming pidfd could be the answer.

Will take a look at the details. Thanks for the reference.

> At this point I think it's cleaner to make such an api send a signal
> to an existing process only under the same conditions as in bpf_probe_write_user.
> Would that work for your use case?

Let me double check this.

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 72336bac7573..e3e824848335 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2667,6 +2667,18 @@ union bpf_attr {
  *		0 on success.
  *
  *		**-ENOENT** if the bpf-local-storage cannot be found.
+ *
+ * int bpf_send_signal(u32 pid, u32 sig)
+ *	Description
+ *		Send signal *sig* to *pid*.
+ *	Return
+ *		0 on success or successfully queued.
+ *
+ *		**-ENOENT** if *pid* cannot be found.
+ *
+ *		**-EBUSY** if work queue under nmi is full.
+ *
+ * 		Other negative values for actual signal sending errors.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2777,7 +2789,8 @@ union bpf_attr {
 	FN(strtol),			\
 	FN(strtoul),			\
 	FN(sk_storage_get),		\
-	FN(sk_storage_delete),
+	FN(sk_storage_delete),		\
+	FN(send_signal),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8607aba1d882..49a804d59bfb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -559,6 +559,76 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+struct send_signal_irq_work {
+	struct irq_work irq_work;
+	u32 pid;
+	u32 sig;
+};
+
+static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
+
+static int do_bpf_send_signal_pid(u32 pid, u32 sig)
+{
+	struct task_struct *task;
+	struct pid *pidp;
+
+	pidp = find_vpid(pid);
+	if (!pidp)
+		return -ENOENT;
+
+	task = get_pid_task(pidp, PIDTYPE_PID);
+	if (!task)
+		return -ENOENT;
+
+	kill_pid_info(sig, SEND_SIG_PRIV, pidp);
+	put_task_struct(task);
+
+	return 0;
+}
+
+static void do_bpf_send_signal(struct irq_work *entry)
+{
+	struct send_signal_irq_work *work;
+
+	work = container_of(entry, struct send_signal_irq_work, irq_work);
+	do_bpf_send_signal_pid(work->pid, work->sig);
+}
+
+BPF_CALL_2(bpf_send_signal, u32, pid, u32, sig)
+{
+	struct send_signal_irq_work *work = NULL;
+
+	/* current may be in the middle of teardown and task_pid(current)
+	 * becomes NULL. task_pid(current) is needed to find pid namespace
+	 * in order to locate proper pid structure for the target pid.
+	 */
+	if (!task_pid(current))
+		return -ENOENT;
+
+	if (in_nmi()) {
+		work = this_cpu_ptr(&send_signal_work);
+		if (work->irq_work.flags & IRQ_WORK_BUSY)
+			return -EBUSY;
+	}
+
+	if (!work)
+		return do_bpf_send_signal_pid(pid, sig);
+
+	work->pid = pid;
+	work->sig = sig;
+	irq_work_queue(&work->irq_work);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_send_signal_proto = {
+	.func		= bpf_send_signal,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -609,6 +679,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
 #endif
+	case BPF_FUNC_send_signal:
+		return &bpf_send_signal_proto;
 	default:
 		return NULL;
 	}
@@ -1334,5 +1406,18 @@ int __init bpf_event_init(void)
 	return 0;
 }
 
+static int __init send_signal_irq_work_init(void)
+{
+	int cpu;
+	struct send_signal_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&send_signal_work, cpu);
+		init_irq_work(&work->irq_work, do_bpf_send_signal);
+	}
+	return 0;
+}
+
 fs_initcall(bpf_event_init);
+subsys_initcall(send_signal_irq_work_init);
 #endif /* CONFIG_MODULES */
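
To round out the commit message example, a hedged user-space sketch
(the target pid, sample period, and the omitted program-load step
are placeholders) of how such a "perf_event" program could be
attached to a sampling event via PERF_EVENT_IOC_SET_BPF:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical loader: open a cpu-clock sampling event on the
 * target process and attach a previously loaded bpf program.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd,
		       flags);
}

int main(void)
{
	struct perf_event_attr attr;
	pid_t target_pid = 1234;	/* placeholder: pid of hhvm */
	int prog_fd = -1;		/* placeholder: fd of loaded program */
	int pmu_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 1000000;	/* one sample per 1ms of cpu time */

	pmu_fd = perf_event_open(&attr, target_pid, -1 /* any cpu */, -1, 0);
	if (pmu_fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* attach the bpf program to the event and start sampling */
	if (ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0 ||
	    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
		perror("ioctl");
		return 1;
	}

	pause();	/* profile until interrupted */
	return 0;
}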