
[RFC,14/19] bpf: Use migrate_disable() in hashtab code

Message ID 20200214161504.325142160@linutronix.de
State RFC
Delegated to: BPF Maintainers
Series bpf: Make BPF and PREEMPT_RT co-exist

Commit Message

Thomas Gleixner Feb. 14, 2020, 1:39 p.m. UTC
The required protection is that the caller cannot be migrated to a
different CPU as these places take either a hash bucket lock or might
trigger a kprobe inside the memory allocator. Both scenarios can lead to
deadlocks. The deadlock prevention is per CPU by incrementing a per CPU
variable which temporarily blocks the invocation of BPF programs from perf
and kprobes.

Replace the preempt_disable/enable() pairs with migrate_disable/enable()
pairs to prepare BPF to work on PREEMPT_RT enabled kernels. On a non-RT
kernel this maps to preempt_disable/enable(), i.e. no functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/bpf/hashtab.c |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
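
For reference, the "maps to preempt_disable/enable()" behaviour mentioned
above looks roughly like the sketch below on a kernel built without
PREEMPT_RT (a simplified illustration only, not the exact kernel
definitions):

	#ifndef CONFIG_PREEMPT_RT
	/* !RT fallback: disabling preemption trivially prevents migration */
	static inline void migrate_disable(void)
	{
		preempt_disable();
	}

	static inline void migrate_enable(void)
	{
		preempt_enable();
	}
	#endif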

Comments

Mathieu Desnoyers Feb. 14, 2020, 7:11 p.m. UTC | #1
On 14-Feb-2020 02:39:31 PM, Thomas Gleixner wrote:
> The required protection is that the caller cannot be migrated to a
> different CPU as these places take either a hash bucket lock or might
> trigger a kprobe inside the memory allocator. Both scenarios can lead to
> deadlocks. The deadlock prevention is per CPU by incrementing a per CPU
> variable which temporarily blocks the invocation of BPF programs from perf
> and kprobes.
> 
> Replace the preempt_disable/enable() pairs with migrate_disable/enable()
> pairs to prepare BPF to work on PREEMPT_RT enabled kernels. On a non-RT
> kernel this maps to preempt_disable/enable(), i.e. no functional change.

Will that _really_ work on RT?

I'm puzzled about what will happen in the following scenario on RT:

Thread A is preempted within e.g. htab_elem_free_rcu, and Thread B is
scheduled and runs through a bunch of tracepoints. Both are on the
same CPU's runqueue:

CPU 1

Thread A is scheduled
(Thread A) htab_elem_free_rcu()
(Thread A)   migrate disable
(Thread A)   __this_cpu_inc(bpf_prog_active); -> per-cpu variable for
                                               deadlock prevention.
Thread A is preempted
Thread B is scheduled
(Thread B) Runs through various tracepoints:
           trace_call_bpf()
           if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
               -> will skip any instrumentation that happens to be on
                  this CPU until...
Thread B is preempted
Thread A is scheduled
(Thread A)  __this_cpu_dec(bpf_prog_active);
(Thread A)  migrate enable

Having all those events randomly and silently discarded might be quite
unexpected from a user standpoint. This turns the deadlock prevention
mechanism into a random tracepoint-dropping facility, which is
unsettling. One alternative approach we could consider to solve this
is to make this deadlock prevention nesting counter per-thread rather
than per-cpu.
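
As a rough sketch of that per-thread alternative (illustrative only; the
field and helper names below are invented and not part of this series),
the nesting counter would live in task_struct instead of in per-CPU data:

	/*
	 * Hypothetical per-thread recursion guard replacing the per-CPU
	 * bpf_prog_active counter.  The bpf_prog_active_nest field would
	 * be added to struct task_struct; all names here are made up.
	 */
	static inline bool bpf_prog_enter(void)
	{
		if (current->bpf_prog_active_nest++)
			return false;	/* recursion: caller skips the prog but still calls bpf_prog_exit() */
		return true;
	}

	static inline void bpf_prog_exit(void)
	{
		current->bpf_prog_active_nest--;
	}

With a per-thread counter, Thread B in the scenario above would no longer
have its tracepoints suppressed just because Thread A was preempted in the
middle of its critical section.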

Also, I don't think using __this_cpu_inc() without preempt-disable or
irq off is safe. You'll probably want to move to this_cpu_inc/dec
instead, which can be heavier on some architectures.
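
For illustration, with this_cpu_inc/dec() the htab_elem_free_rcu() path
would look roughly like this (sketch only):

	migrate_disable();
	/*
	 * this_cpu_inc() is safe even though preemption stays enabled;
	 * __this_cpu_inc() assumes the caller has already disabled
	 * preemption (or interrupts).
	 */
	this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	this_cpu_dec(bpf_prog_active);
	migrate_enable();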

Thanks,

Mathieu


Thomas Gleixner Feb. 14, 2020, 7:56 p.m. UTC | #2
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> writes:
> On 14-Feb-2020 02:39:31 PM, Thomas Gleixner wrote:
>> Replace the preempt_disable/enable() pairs with migrate_disable/enable()
>> pairs to prepare BPF to work on PREEMPT_RT enabled kernels. On a non-RT
>> kernel this maps to preempt_disable/enable(), i.e. no functional change.

...

> Having all those events randomly and silently discarded might be quite
> unexpected from a user standpoint. This turns the deadlock prevention
> mechanism into a random tracepoint-dropping facility, which is
> unsettling.

Well, it already randomly drops events which might be unrelated to the
syscall target; this will just drop some more. Shrug.

> One alternative approach we could consider to solve this is to make
> this deadlock prevention nesting counter per-thread rather than
> per-cpu.

That should work both on !RT and RT.

> Also, I don't think using __this_cpu_inc() without preempt-disable or
> irq off is safe. You'll probably want to move to this_cpu_inc/dec
> instead, which can be heavier on some architectures.

Good catch.

Thanks,

        tglx
Alexei Starovoitov Feb. 18, 2020, 11:36 p.m. UTC | #3
On Fri, Feb 14, 2020 at 08:56:12PM +0100, Thomas Gleixner wrote:
> 
> > Also, I don't think using __this_cpu_inc() without preempt-disable or
> > irq off is safe. You'll probably want to move to this_cpu_inc/dec
> > instead, which can be heavier on some architectures.
> 
> Good catch.

Overall looks great.
Thank you for taking time to write commit logs and detailed cover letter.
I think s/__this_cpu_inc/this_cpu_inc/ is the only bit that needs to be
addressed for it to be merged.
There were a few other suggestions from Mathieu and Jakub.
Could you address them and resend?
I saw patch 1 landing in tip tree, but it needs to be in bpf-next as well
along with the rest of the series. Does it really need to be in the tip?
I would prefer to take the whole thing and avoid conflicts around
migrate_disable() especially if nothing in tip is going to use it in this
development cycle. So just drop patch 1 from the tip?

Regarding
union {
   raw_spinlock_t  raw_lock;
   spinlock_t      lock;
};
yeah. it's not pretty, but I also don't have better ideas.

Regarding migrate_disable()... can you enable it without the rest of RT?
I haven't seen its implementation. I suspect it's scheduler only change?
If I can use migrate_disable() without RT it will help my work on sleepable
BPF programs. I would only have to worry about rcu_read_lock() since
preempt_disable() is nicely addressed.
Thomas Gleixner Feb. 19, 2020, 12:49 a.m. UTC | #4
Alexei,

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> Overall looks great.
> Thank you for taking time to write commit logs and detailed cover letter.
> I think s/__this_cpu_inc/this_cpu_inc/ is the only bit that needs to be
> addressed for it to be merged.
> There were a few other suggestions from Mathieu and Jakub.
> Could you address them and resend?

I have them fixed up already, but I was waiting for further
comments. I'll send it out tomorrow morning as I'm dead tired by now.

> I saw patch 1 landing in tip tree, but it needs to be in bpf-next as well
> along with the rest of the series. Does it really need to be in the tip?
> I would prefer to take the whole thing and avoid conflicts around
> migrate_disable() especially if nothing in tip is going to use it in this
> development cycle. So just drop patch 1 from the tip?

I'll add patch 2 to a tip branch as well and I'll give you a tag to pull
into BPF (which has only those two commits). That allows us to further
tweak the relevant files without creating conflicts in next.

> Regarding
> union {
>    raw_spinlock_t  raw_lock;
>    spinlock_t      lock;
> };
> yeah. it's not pretty, but I also don't have better ideas.

Yeah. I really tried hard to avoid it, but the alternative solution was
code duplication which was even more horrible.
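
For context, a sketch of how such a union can be embedded in the hash
bucket (illustrative, not necessarily the exact code that was merged):

	struct bucket {
		struct hlist_nulls_head head;
		union {
			raw_spinlock_t raw_lock;	/* raw lock flavour */
			spinlock_t lock;		/* sleeping lock flavour on RT */
		};
	};

Which member is used can then be decided once per map, so the lookup and
update fast paths only have to call a small lock/unlock helper instead of
duplicating the code for both lock types.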

> Regarding migrate_disable()... can you enable it without the rest of RT?
> I haven't seen its implementation. I suspect it's scheduler only change?
> If I can use migrate_disable() without RT it will help my work on sleepable
> BPF programs. I would only have to worry about rcu_read_lock() since
> preempt_disable() is nicely addressed.

You have to talk to Peter Zijlstra about this as this is really
scheduler relevant stuff. FYI, he fundamentally hates migrate_disable()
from a schedulability POV, but as with the above lock construct the
number of better solutions is also close to zero.

Thanks,

        tglx
Alexei Starovoitov Feb. 19, 2020, 1:23 a.m. UTC | #5
On Wed, Feb 19, 2020 at 01:49:57AM +0100, Thomas Gleixner wrote:
> Alexei,
> 
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > Overall looks great.
> > Thank you for taking time to write commit logs and detailed cover letter.
> > I think s/__this_cpu_inc/this_cpu_inc/ is the only bit that needs to be
> > addressed for it to be merged.
> > There were a few other suggestions from Mathieu and Jakub.
> > Could you address them and resend?
> 
> I have them fixed up already, but I was waiting for further
> comments. I'll send it out tomorrow morning as I'm dead tired by now.
> 
> > I saw patch 1 landing in tip tree, but it needs to be in bpf-next as well
> > along with the rest of the series. Does it really need to be in the tip?
> > I would prefer to take the whole thing and avoid conflicts around
> > migrate_disable() especially if nothing in tip is going to use it in this
> > development cycle. So just drop patch 1 from the tip?
> 
> I'll add patch 2 to a tip branch as well and I'll give you a tag to pull
> into BPF (which has only those two commits). 

That works too.

> > Regarding
> > union {
> >    raw_spinlock_t  raw_lock;
> >    spinlock_t      lock;
> > };
> > yeah. it's not pretty, but I also don't have better ideas.
> 
> Yeah. I really tried hard to avoid it, but the alternative solution was
> code duplication which was even more horrible.
> 
> > Regarding migrate_disable()... can you enable it without the rest of RT?
> > I haven't seen its implementation. I suspect it's scheduler only change?
> > If I can use migrate_disable() without RT it will help my work on sleepable
> > BPF programs. I would only have to worry about rcu_read_lock() since
> > preempt_disable() is nicely addressed.
> 
> You have to talk to Peter Zijlstra about this as this is really
> scheduler relevant stuff. FYI, he fundamentally hates migrate_disable()
> from a schedulability POV, but as with the above lock construct the
> number of better solutions is also close to zero.

I would imagine that migrate_disable() is like temporary CPU pinning. The
scheduler already has to have all the logic to make scheduling decisions
quickly in the presence of pinned tasks, so an additional migrate_disable()
shouldn't introduce slowdowns or complexity on the critical path. Anyway,
we'll discuss further when the migrate_disable() patches hit the mailing
lists.
Mathieu Desnoyers Feb. 19, 2020, 3:17 p.m. UTC | #6
----- On Feb 18, 2020, at 6:36 PM, Alexei Starovoitov alexei.starovoitov@gmail.com wrote:

[...]

> If I can use migrate_disable() without RT it will help my work on sleepable
> BPF programs. I would only have to worry about rcu_read_lock() since
> preempt_disable() is nicely addressed.

Hi Alexei,

You may want to consider using SRCU rather than RCU if you need to sleep while
holding a RCU read-side lock.

This is the synchronization approach I consider for adding the ability to take page
faults when doing syscall tracing.

Then you'll be able to replace preempt_disable() by combining SRCU and
migrate_disable():

AFAIU eBPF currently uses preempt_disable() for two reasons:

- Ensure the thread is not migrated,
  -> can be replaced by migrate_disable() in RT
- Provide RCU existence guarantee through sched-RCU
  -> can be replaced by SRCU, which allows sleeping and taking page faults.

I wonder if it would be acceptable to take a page fault while migration is
disabled though?
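
A minimal sketch of that combination, assuming an SRCU domain dedicated
to BPF (all names below are invented for illustration; this is not code
from any posted series):

	DEFINE_STATIC_SRCU(bpf_srcu);

	static void run_sleepable_bpf_prog(void *ctx)
	{
		int idx;

		migrate_disable();			/* stay on this CPU, remain preemptible */
		idx = srcu_read_lock(&bpf_srcu);	/* existence guarantee, sleeping allowed */

		/* ... run the sleepable BPF program, which may fault/sleep ... */

		srcu_read_unlock(&bpf_srcu, idx);
		migrate_enable();
	}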

Thanks,

Mathieu
Alexei Starovoitov Feb. 20, 2020, 4:19 a.m. UTC | #7
On Wed, Feb 19, 2020 at 10:17:28AM -0500, Mathieu Desnoyers wrote:
> ----- On Feb 18, 2020, at 6:36 PM, Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
> 
> [...]
> 
> > If I can use migrate_disable() without RT it will help my work on sleepable
> > BPF programs. I would only have to worry about rcu_read_lock() since
> > preempt_disable() is nicely addressed.
> 
> Hi Alexei,
> 
> You may want to consider using SRCU rather than RCU if you need to sleep while
> holding a RCU read-side lock.
> 
> This is the synchronization approach I consider for adding the ability to take page
> faults when doing syscall tracing.
> 
> Then you'll be able to replace preempt_disable() by combining SRCU and
> migrate_disable():
> 
> AFAIU eBPF currently uses preempt_disable() for two reasons:
> 
> - Ensure the thread is not migrated,
>   -> can be replaced by migrate_disable() in RT
> - Provide RCU existence guarantee through sched-RCU
>   -> can be replaced by SRCU, which allows sleeping and taking page faults.

BPF is using normal RCU to protect map values
and RCU+preempt to protect per-CPU map values.
SRCU is certainly under consideration. It hasn't been used due to performance
implications; atomics and barriers are too heavy for certain use cases. So we
have to keep RCU where performance matters, but we cannot fork the map
implementations into RCU and SRCU variants due to the huge code bloat. So far
I've been thinking of introducing an explicit bpf_rcu_read_lock() helper and
letting programs use it directly, instead of the implicit rcu_read_lock() that
is done outside of the BPF prog. The tricky part is teaching the verifier to
enforce the critical section.
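
As a purely hypothetical illustration of that idea (the
bpf_rcu_read_lock()/bpf_rcu_read_unlock() helpers and the verifier support
did not exist at the time of this thread; the map and section names are
invented):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* hypothetical helpers as proposed above */
	extern void bpf_rcu_read_lock(void);
	extern void bpf_rcu_read_unlock(void);

	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u64);
	} counts SEC(".maps");

	SEC("kprobe/kfree")
	int count_kfree(void *ctx)
	{
		__u32 key = 0;
		__u64 *val;

		bpf_rcu_read_lock();		/* explicit RCU read-side begin */
		val = bpf_map_lookup_elem(&counts, &key);
		if (val)
			__sync_fetch_and_add(val, 1);	/* value stays valid inside the section */
		bpf_rcu_read_unlock();		/* explicit RCU read-side end */
		return 0;
	}

The tricky part mentioned above is that the verifier would have to track
that the map value is only dereferenced between the two helper calls.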

Patch

--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -698,11 +698,11 @@  static void htab_elem_free_rcu(struct rc
 	 * we're calling kfree, otherwise deadlock is possible if kprobes
 	 * are placed somewhere inside of slub
 	 */
-	preempt_disable();
+	migrate_disable();
 	__this_cpu_inc(bpf_prog_active);
 	htab_elem_free(htab, l);
 	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	migrate_enable();
 }
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
@@ -1327,7 +1327,7 @@  static int
 	}
 
 again:
-	preempt_disable();
+	migrate_disable();
 	this_cpu_inc(bpf_prog_active);
 	rcu_read_lock();
 again_nocopy:
@@ -1347,7 +1347,7 @@  static int
 		raw_spin_unlock_irqrestore(&b->lock, flags);
 		rcu_read_unlock();
 		this_cpu_dec(bpf_prog_active);
-		preempt_enable();
+		migrate_enable();
 		goto after_loop;
 	}
 
@@ -1356,7 +1356,7 @@  static int
 		raw_spin_unlock_irqrestore(&b->lock, flags);
 		rcu_read_unlock();
 		this_cpu_dec(bpf_prog_active);
-		preempt_enable();
+		migrate_enable();
 		kvfree(keys);
 		kvfree(values);
 		goto alloc;
@@ -1406,7 +1406,7 @@  static int
 
 	rcu_read_unlock();
 	this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	migrate_enable();
 	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
 	    key_size * bucket_cnt) ||
 	    copy_to_user(uvalues + total * value_size, values,