diff mbox series

[bpf-next,1/2] bpf: enable stackmap with build_id in nmi context

Message ID 20180502000220.2585320-1-songliubraving@fb.com
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series [bpf-next,1/2] bpf: enable stackmap with build_id in nmi context | expand

Commit Message

Song Liu May 2, 2018, 12:02 a.m. UTC
Currently, we cannot parse build_id in nmi context because of
up_read(&current->mm->mmap_sem), this makes stackmap with build_id
less useful. This patch enables parsing build_id in nmi by putting
the up_read() call in irq_work. To avoid memory allocation in nmi
context, we use per cpu variable for the irq_work. As a result, only
one irq_work per cpu is allowed. If the irq_work is in-use, we
fallback to only report ips.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 kernel/bpf/stackmap.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

Comments

kernel test robot May 2, 2018, 3:34 a.m. UTC | #1
Hi Song,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Song-Liu/bpf-enable-stackmap-with-build_id-in-nmi-context/20180502-081727
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> kernel/bpf/stackmap.c:51:1: sparse: symbol '__pcpu_scope_irq_work' was not declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Peter Zijlstra May 2, 2018, 9:21 a.m. UTC | #2
On Tue, May 01, 2018 at 05:02:19PM -0700, Song Liu wrote:
> @@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
>  {
>  	int i;
>  	struct vm_area_struct *vma;
> +	bool in_nmi_ctx = in_nmi();
> +	bool irq_work_busy = false;
> +	struct stack_map_irq_work *work;
> +
> +	if (in_nmi_ctx) {
> +		work = this_cpu_ptr(&irq_work);
> +		if (work->sem)
> +			/* cannot queue more up_read, fallback */
> +			irq_work_busy = true;
> +	}
>  
>  	/*
> +	 * We cannot do up_read() in nmi context. To do build_id lookup
> +	 * in nmi context, we need to run up_read() in irq_work. We use
> +	 * a percpu variable to do the irq_work. If the irq_work is
> +	 * already used by another lookup, we fall back to report ips.
>  	 *
>  	 * Same fallback is used for kernel stack (!user) on a stackmap
>  	 * with build_id.
>  	 */
> +	if (!user || !current || !current->mm || irq_work_busy ||
>  	    down_read_trylock(&current->mm->mmap_sem) == 0) {
>  		/* cannot access current->mm, fall back to ips */
>  		for (i = 0; i < trace_nr; i++) {
> @@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
>  			- vma->vm_start;
>  		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
>  	}
> +
> +	if (!in_nmi_ctx)
> +		up_read(&current->mm->mmap_sem);
> +	else {
> +		work->sem = &current->mm->mmap_sem;
> +		irq_work_queue(&work->work);
> +	}
>  }

This is disguisting.. :-)

It's broken though, I've bet you've never actually ran this with lockdep
enabled for example.

Also, you set work->sem before you do trylock, if the trylock fails you
return early and keep work->sem set, which will thereafter always result
in irq_work_busy.
Song Liu May 2, 2018, 4:48 p.m. UTC | #3
> On May 2, 2018, at 2:21 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, May 01, 2018 at 05:02:19PM -0700, Song Liu wrote:
>> @@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
>> {
>> 	int i;
>> 	struct vm_area_struct *vma;
>> +	bool in_nmi_ctx = in_nmi();
>> +	bool irq_work_busy = false;
>> +	struct stack_map_irq_work *work;
>> +
>> +	if (in_nmi_ctx) {
>> +		work = this_cpu_ptr(&irq_work);
>> +		if (work->sem)
>> +			/* cannot queue more up_read, fallback */
>> +			irq_work_busy = true;
>> +	}
>> 
>> 	/*
>> +	 * We cannot do up_read() in nmi context. To do build_id lookup
>> +	 * in nmi context, we need to run up_read() in irq_work. We use
>> +	 * a percpu variable to do the irq_work. If the irq_work is
>> +	 * already used by another lookup, we fall back to report ips.
>> 	 *
>> 	 * Same fallback is used for kernel stack (!user) on a stackmap
>> 	 * with build_id.
>> 	 */
>> +	if (!user || !current || !current->mm || irq_work_busy ||
>> 	    down_read_trylock(&current->mm->mmap_sem) == 0) {
>> 		/* cannot access current->mm, fall back to ips */
>> 		for (i = 0; i < trace_nr; i++) {
>> @@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
>> 			- vma->vm_start;
>> 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
>> 	}
>> +
>> +	if (!in_nmi_ctx)
>> +		up_read(&current->mm->mmap_sem);
>> +	else {
>> +		work->sem = &current->mm->mmap_sem;
>> +		irq_work_queue(&work->work);
>> +	}
>> }
> 
> This is disguisting.. :-)
> 
> It's broken though, I've bet you've never actually ran this with lockdep
> enabled for example.

I am not following here. I just run the new selftest with CONFIG_LOCKDEP on, 
and got no warning for this. 


> Also, you set work->sem before you do trylock, if the trylock fails you
> return early and keep work->sem set, which will thereafter always result
> in irq_work_busy.

work->sem was set after down_read_trylock(). I guess you misread the patch?

Thanks,
Song
Peter Zijlstra May 2, 2018, 5:30 p.m. UTC | #4
On Wed, May 02, 2018 at 04:48:32PM +0000, Song Liu wrote:
> > It's broken though, I've bet you've never actually ran this with lockdep
> > enabled for example.
> 
> I am not following here. I just run the new selftest with CONFIG_LOCKDEP on, 
> and got no warning for this. 

Weird, I would be expecting complaints about releasing an unheld lock.

nmi_enter(),nmi_exit() have lockdep_off(),lockdep_on() resp. Which means
that the down_trylock() will not be recorded. The up, which is done from
IRQ context, will not be so supressed and should hit
print_unlock_imbalance_bug().

> > Also, you set work->sem before you do trylock, if the trylock fails you
> > return early and keep work->sem set, which will thereafter always result
> > in irq_work_busy.
> 
> work->sem was set after down_read_trylock(). I guess you misread the patch?

Argh, yes indeed. Sorry.
Song Liu May 2, 2018, 5:57 p.m. UTC | #5
> On May 2, 2018, at 10:30 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, May 02, 2018 at 04:48:32PM +0000, Song Liu wrote:
>>> It's broken though, I've bet you've never actually ran this with lockdep
>>> enabled for example.
>> 
>> I am not following here. I just run the new selftest with CONFIG_LOCKDEP on, 
>> and got no warning for this. 
> 
> Weird, I would be expecting complaints about releasing an unheld lock.
> 
> nmi_enter(),nmi_exit() have lockdep_off(),lockdep_on() resp. Which means
> that the down_trylock() will not be recorded. The up, which is done from
> IRQ context, will not be so supressed and should hit
> print_unlock_imbalance_bug().
> 

I am still not sure whether I am following. I guess your concern apply to 
spinlock only? lock_acquire() has the following in the beginning:

	if (unlikely(current->lockdep_recursion))
		return;

So it will not run in nmi context?

On the other hand, semaphore and rw_semaphore should be ok in such cases?

Thanks,
Song
diff mbox series

Patch

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3ba102b..c33fec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@ 
 #include <linux/perf_event.h>
 #include <linux/elf.h>
 #include <linux/pagemap.h>
+#include <linux/irq_work.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -32,6 +33,23 @@  struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+	struct irq_work work;
+	struct rw_semaphore *sem;
+};
+
+static void up_read_work(struct irq_work *entry)
+{
+	struct stack_map_irq_work *work = container_of(entry,
+			struct stack_map_irq_work, work);
+
+	up_read(work->sem);
+	work->sem = NULL;
+}
+
+DEFINE_PER_CPU(struct stack_map_irq_work, irq_work);
+
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -267,17 +285,27 @@  static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 {
 	int i;
 	struct vm_area_struct *vma;
+	bool in_nmi_ctx = in_nmi();
+	bool irq_work_busy = false;
+	struct stack_map_irq_work *work;
+
+	if (in_nmi_ctx) {
+		work = this_cpu_ptr(&irq_work);
+		if (work->sem)
+			/* cannot queue more up_read, fallback */
+			irq_work_busy = true;
+	}
 
 	/*
-	 * We cannot do up_read() in nmi context, so build_id lookup is
-	 * only supported for non-nmi events. If at some point, it is
-	 * possible to run find_vma() without taking the semaphore, we
-	 * would like to allow build_id lookup in nmi context.
+	 * We cannot do up_read() in nmi context. To do build_id lookup
+	 * in nmi context, we need to run up_read() in irq_work. We use
+	 * a percpu variable to do the irq_work. If the irq_work is
+	 * already used by another lookup, we fall back to report ips.
 	 *
 	 * Same fallback is used for kernel stack (!user) on a stackmap
 	 * with build_id.
 	 */
-	if (!user || !current || !current->mm || in_nmi() ||
+	if (!user || !current || !current->mm || irq_work_busy ||
 	    down_read_trylock(&current->mm->mmap_sem) == 0) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
@@ -299,7 +327,13 @@  static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-	up_read(&current->mm->mmap_sem);
+
+	if (!in_nmi_ctx)
+		up_read(&current->mm->mmap_sem);
+	else {
+		work->sem = &current->mm->mmap_sem;
+		irq_work_queue(&work->work);
+	}
 }
 
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -575,3 +609,17 @@  const struct bpf_map_ops stack_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 };
+
+static int __init stack_map_init(void)
+{
+	int cpu;
+	struct stack_map_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&irq_work, cpu);
+		init_irq_work(&work->work, up_read_work);
+	}
+	pr_info("%s: done\n", __func__);
+	return 0;
+}
+subsys_initcall(stack_map_init);