
[v4,bpf-next,1/9] bpf: introduce bpf_spin_lock

Message ID: 20190124041403.2100609-2-ast@kernel.org
State: Changes Requested
Delegated to: BPF Maintainers
Series: introduce bpf_spin_lock

Commit Message

Alexei Starovoitov Jan. 24, 2019, 4:13 a.m. UTC
Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
bpf programs serialize access to other variables.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem *val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}

Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- a BTF description of the map is mandatory for safety analysis.
- a bpf program can take at most one bpf_spin_lock at a time, since two or more
  could cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  This drastically simplifies the implementation yet still lets a bpf program
  use any number of bpf_spin_locks across different map elements.
- while a bpf_spin_lock is held, calls (either bpf2bpf or helper calls) are not allowed.
- the bpf program must call bpf_spin_unlock() before returning.
- a bpf program can access 'struct bpf_spin_lock' only via the
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- direct load/store to the 'struct bpf_spin_lock lock;' field is not allowed.
- to use the bpf_spin_lock() helper, the BTF description of the map value must be
  a struct with a 'struct bpf_spin_lock anyname;' field at the top level.
  A lock nested inside another struct is not allowed (see the sketch after this list).
- the map_lookup syscall doesn't copy the bpf_spin_lock field to user space.
- neither map_update from the syscall nor from a program updates the bpf_spin_lock field.
- a bpf_spin_lock cannot be on the stack or inside a network packet;
  it can only live inside a HASH or ARRAY map value.
- bpf_spin_lock is available to root only, but for all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside a spin_lock-ed region.
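
To make the top-level-field rule concrete, here is a minimal sketch of an
allowed and a rejected map value layout (struct names are illustrative only):

struct allowed_elem {
    struct bpf_spin_lock lock; /* ok: top-level field of the map value */
    int cnt;
};

struct wrapped {
    struct bpf_spin_lock lock;
};

struct rejected_elem {
    struct wrapped w; /* not ok: lock is nested inside another struct */
    int cnt;
};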

Implementation details:
- on !SMP bpf_spin_lock() becomes a nop
- on architectures that don't support queued_spin_lock a trivial lock is used.
  Note that arch_spin_lock cannot be used, since not all archs agree that
  zero == unlocked, and sizeof(arch_spinlock_t) may differ from sizeof(__u32).
- the presence of bpf_spin_lock inside a map value could have been indicated via
  an extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for the map key/value and reduces user coding mistakes.

Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce a BPF_F_LOCK flag for the bpf_map_update() syscall and helper
  to request that the kernel grab the bpf_spin_lock before rewriting the value.
  That will serialize access to map elements (a hypothetical usage sketch follows).
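
A hypothetical user space sketch of the proposed flag (BPF_F_LOCK does not exist
yet, so the name and semantics below are assumptions; map_fd and key are assumed
to exist):

struct hash_elem val = { .cnt = 0 };

/* the kernel would take the element's bpf_spin_lock, copy the new value
 * (excluding the lock field itself) and then release the lock
 */
if (bpf_map_update_elem(map_fd, &key, &val, BPF_F_LOCK))
    /* handle error */;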

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  37 ++++++++-
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h          |   1 +
 include/uapi/linux/bpf.h     |   7 +-
 kernel/bpf/arraymap.c        |   7 +-
 kernel/bpf/btf.c             |  42 ++++++++++
 kernel/bpf/core.c            |   2 +
 kernel/bpf/hashtab.c         |   6 +-
 kernel/bpf/helpers.c         |  57 ++++++++++++++
 kernel/bpf/map_in_map.c      |   5 ++
 kernel/bpf/syscall.c         |  21 ++++-
 kernel/bpf/verifier.c        | 149 ++++++++++++++++++++++++++++++++++-
 net/core/filter.c            |  16 +++-
 13 files changed, 335 insertions(+), 16 deletions(-)

Comments

Peter Zijlstra Jan. 24, 2019, 6:01 p.m. UTC | #1
Thanks for having kernel/locking people on Cc...

On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:

> Implementation details:
> - on !SMP bpf_spin_lock() becomes nop

Because no BPF program is preemptible? I don't see any assertions or
even a comment that says this code is non-preemptible.

AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
which is not sufficient.

> - on architectures that don't support queued_spin_lock trivial lock is used.
>   Note that arch_spin_lock cannot be used, since not all archs agree that
>   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).

I really don't much like direct usage of qspinlock; esp. not as a
surprise.

Why does it matter if 0 means unlocked; that's what
__ARCH_SPIN_LOCK_UNLOCKED is for.

I get the sizeof(__u32) thing, but why not key off of that?

> Next steps:
> - allow bpf_spin_lock in other map types (like cgroup local storage)
> - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
>   to request kernel to grab bpf_spin_lock before rewriting the value.
>   That will serialize access to map elements.

So clearly this map stuff is shared between bpf proglets, otherwise
there would not be a need for locking. But what happens if one is from
task context and another from IRQ context?

I don't see a local_irq_save()/restore() anywhere. What avoids the
trivial lock inversion?


> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index a74972b07e74..2e98e4caf5aa 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -221,6 +221,63 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
>  	.arg2_type	= ARG_CONST_SIZE,
>  };
>  
> +#ifndef CONFIG_QUEUED_SPINLOCKS
> +struct dumb_spin_lock {
> +	atomic_t val;
> +};
> +#endif
> +
> +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
> +{
> +#if defined(CONFIG_SMP)
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +	struct qspinlock *qlock = (void *)lock;
> +
> +	BUILD_BUG_ON(sizeof(*qlock) != sizeof(*lock));
> +	queued_spin_lock(qlock);
> +#else
> +	struct dumb_spin_lock *qlock = (void *)lock;
> +
> +	BUILD_BUG_ON(sizeof(*qlock) != sizeof(*lock));
> +	do {
> +		while (atomic_read(&qlock->val) != 0)
> +			cpu_relax();
> +	} while (atomic_cmpxchg(&qlock->val, 0, 1) != 0);
> +#endif
> +#endif
> +	return 0;
> +}
> +
> +const struct bpf_func_proto bpf_spin_lock_proto = {
> +	.func		= bpf_spin_lock,
> +	.gpl_only	= false,
> +	.ret_type	= RET_VOID,
> +	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
> +};
> +
> +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
> +{
> +#if defined(CONFIG_SMP)
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +	struct qspinlock *qlock = (void *)lock;
> +
> +	queued_spin_unlock(qlock);
> +#else
> +	struct dumb_spin_lock *qlock = (void *)lock;
> +
> +	atomic_set(&qlock->val, 0);

And this is broken... That should've been atomic_set_release() at the
very least.

And this would again be the moment where I go pester you about the BPF
memory model :-)

> +#endif
> +#endif
> +	return 0;
> +}
> +
> +const struct bpf_func_proto bpf_spin_unlock_proto = {
> +	.func		= bpf_spin_unlock,
> +	.gpl_only	= false,
> +	.ret_type	= RET_VOID,
> +	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
> +};
Peter Zijlstra Jan. 24, 2019, 6:56 p.m. UTC | #2
On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> 
> Thanks for having kernel/locking people on Cc...
> 
> On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> 
> > Implementation details:
> > - on !SMP bpf_spin_lock() becomes nop
> 
> Because no BPF program is preemptible? I don't see any assertions or
> even a comment that says this code is non-preemptible.
> 
> AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> which is not sufficient.
> 
> > - on architectures that don't support queued_spin_lock trivial lock is used.
> >   Note that arch_spin_lock cannot be used, since not all archs agree that
> >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> 
> I really don't much like direct usage of qspinlock; esp. not as a
> surprise.
> 
> Why does it matter if 0 means unlocked; that's what
> __ARCH_SPIN_LOCK_UNLOCKED is for.
> 
> I get the sizeof(__u32) thing, but why not key off of that?
> 
> > Next steps:
> > - allow bpf_spin_lock in other map types (like cgroup local storage)
> > - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
> >   to request kernel to grab bpf_spin_lock before rewriting the value.
> >   That will serialize access to map elements.
> 
> So clearly this map stuff is shared between bpf proglets, otherwise
> there would not be a need for locking. But what happens if one is from
> task context and another from IRQ context?
> 
> I don't see a local_irq_save()/restore() anywhere. What avoids the
> trivial lock inversion?
> 

Also; what about BPF running from NMI context and using locks?
Paul E. McKenney Jan. 24, 2019, 11:42 p.m. UTC | #3
On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > 
> > Thanks for having kernel/locking people on Cc...
> > 
> > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > 
> > > Implementation details:
> > > - on !SMP bpf_spin_lock() becomes nop
> > 
> > Because no BPF program is preemptible? I don't see any assertions or
> > even a comment that says this code is non-preemptible.
> > 
> > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > which is not sufficient.
> > 
> > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > 
> > I really don't much like direct usage of qspinlock; esp. not as a
> > surprise.

Substituting the lightweight-reader SRCU as discussed earlier would allow
use of a more generic locking primitive, for example, one that allowed
blocking, at least in cases where the context allowed this.

git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
branch srcu-lr.2019.01.16a.

One advantage of a more generic locking primitive would be keeping BPF
programs independent of internal changes to spinlock primitives.

							Thanx, Paul

> > Why does it matter if 0 means unlocked; that's what
> > __ARCH_SPIN_LOCK_UNLOCKED is for.
> > 
> > I get the sizeof(__u32) thing, but why not key off of that?
> > 
> > > Next steps:
> > > - allow bpf_spin_lock in other map types (like cgroup local storage)
> > > - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
> > >   to request kernel to grab bpf_spin_lock before rewriting the value.
> > >   That will serialize access to map elements.
> > 
> > So clearly this map stuff is shared between bpf proglets, otherwise
> > there would not be a need for locking. But what happens if one is from
> > task context and another from IRQ context?
> > 
> > I don't see a local_irq_save()/restore() anywhere. What avoids the
> > trivial lock inversion?
> 
> Also; what about BPF running from NMI context and using locks?
>
Alexei Starovoitov Jan. 24, 2019, 11:58 p.m. UTC | #4
On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> 
> Thanks for having kernel/locking people on Cc...
> 
> On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> 
> > Implementation details:
> > - on !SMP bpf_spin_lock() becomes nop
> 
> Because no BPF program is preemptible? I don't see any assertions or
> even a comment that says this code is non-preemptible.
> 
> AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> which is not sufficient.

nope. all bpf prog types disable preemption. That is a must-have for all
sorts of things to work properly.
If there is a prog type that is doing rcu_read_lock only, it's a serious bug.
About a year or so ago we audited everything specifically to make
sure everything disables preemption before calling bpf progs.
I'm pretty sure nothing crept in in the meantime.

> > - on architectures that don't support queued_spin_lock trivial lock is used.
> >   Note that arch_spin_lock cannot be used, since not all archs agree that
> >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> 
> I really don't much like direct usage of qspinlock; esp. not as a
> surprise.
> 
> Why does it matter if 0 means unlocked; that's what
> __ARCH_SPIN_LOCK_UNLOCKED is for.
> 
> I get the sizeof(__u32) thing, but why not key off of that?

what do you mean by 'key off of that'?
to use arch_spinlock_t instead of qspinlock?
That was my first attempt, but then I painfully found that
its size on parisc is 16 bytes and we're not going to penalize bpf
to waste that much space because of a single architecture.
sizeof(arch_spinlock_t) can be 1 byte too (on sparc).
That would fit in __u32, but I figured it's cleaner to use qspinlock
on all archs that support it and dumb_spin_lock on archs that don't.

Another option is to use arch_spinlock_t when its sizeof == 4
and use dumb_spin_lock otherwise.
It's doable, but imo still less clean than using qspinlock,
because of zero init: the map code already zero-inits all elements,
which is exactly what qspinlock and dumb_spin_lock need.

If arch_spinlock_t is used, then at map init time we would need to
walk all elements and do the __ARCH_SPIN_LOCK_UNLOCKED assignment
(and maps can have millions of elements).
Not horrible, but a 100% waste of cycles for x86/arm64 where qspinlock
is used. Such waste can be worked around further by doing an ugly
#ifdef __ARCH_SPIN_LOCK_UNLOCKED == 0 -> don't do the init loop.
And then add another #ifdef for archs with sizeof(arch_spinlock_t) != 4
to keep zero init for all map types that support bpf_spin_lock
via dumb_spin_lock.
Clearly at that point we're getting into ugliness everywhere.
Hence I've used qspinlock directly.
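
For illustration, the arch_spinlock_t route would need roughly this kind of
init pass at map creation time (value_ptr() and lock_off are made-up helpers
for the sketch, not real map internals):

	/* hypothetical per-element init loop for the arch_spinlock_t variant */
	u32 i;

	for (i = 0; i < map->max_entries; i++) {
		arch_spinlock_t *l = (void *)value_ptr(map, i) + lock_off;

		*l = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
	}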

> > Next steps:
> > - allow bpf_spin_lock in other map types (like cgroup local storage)
> > - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
> >   to request kernel to grab bpf_spin_lock before rewriting the value.
> >   That will serialize access to map elements.
> 
> So clearly this map stuff is shared between bpf proglets, otherwise
> there would not be a need for locking. But what happens if one is from
> task context and another from IRQ context?
> 
> I don't see a local_irq_save()/restore() anywhere. What avoids the
> trivial lock inversion?

> and from NMI ...

progs are not preemptible and map syscall accessors have bpf_prog_active counters.
So nmi/kprobe progs will not be running when the syscall is running.
Hence deadlock is not possible and irq_save is not needed.

> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index a74972b07e74..2e98e4caf5aa 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -221,6 +221,63 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
> >  	.arg2_type	= ARG_CONST_SIZE,
> >  };
> >  
> > +#ifndef CONFIG_QUEUED_SPINLOCKS
> > +struct dumb_spin_lock {
> > +	atomic_t val;
> > +};
> > +#endif
> > +
> > +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
> > +{
> > +#if defined(CONFIG_SMP)
> > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > +	struct qspinlock *qlock = (void *)lock;
> > +
> > +	BUILD_BUG_ON(sizeof(*qlock) != sizeof(*lock));
> > +	queued_spin_lock(qlock);
> > +#else
> > +	struct dumb_spin_lock *qlock = (void *)lock;
> > +
> > +	BUILD_BUG_ON(sizeof(*qlock) != sizeof(*lock));
> > +	do {
> > +		while (atomic_read(&qlock->val) != 0)
> > +			cpu_relax();
> > +	} while (atomic_cmpxchg(&qlock->val, 0, 1) != 0);
> > +#endif
> > +#endif
> > +	return 0;
> > +}
> > +
> > +const struct bpf_func_proto bpf_spin_lock_proto = {
> > +	.func		= bpf_spin_lock,
> > +	.gpl_only	= false,
> > +	.ret_type	= RET_VOID,
> > +	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
> > +};
> > +
> > +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
> > +{
> > +#if defined(CONFIG_SMP)
> > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > +	struct qspinlock *qlock = (void *)lock;
> > +
> > +	queued_spin_unlock(qlock);
> > +#else
> > +	struct dumb_spin_lock *qlock = (void *)lock;
> > +
> > +	atomic_set(&qlock->val, 0);
> 
> And this is broken... That should've been atomic_set_release() at the
> very least.

right. good catch.

> And this would again be the moment where I go pester you about the BPF
> memory model :-)

hehe :)
How do you propose to define it in a way that it applies to all archs
and yet doesn't penalize x86?
"Assume the minimum execution ordering model" the way the kernel does
is unfortunately not usable, since bpf doesn't have the luxury
of using nice #defines that turn into nops on x86.
Alexei Starovoitov Jan. 25, 2019, 12:05 a.m. UTC | #5
On Thu, Jan 24, 2019 at 03:42:32PM -0800, Paul E. McKenney wrote:
> On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > 
> > > Thanks for having kernel/locking people on Cc...
> > > 
> > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > 
> > > > Implementation details:
> > > > - on !SMP bpf_spin_lock() becomes nop
> > > 
> > > Because no BPF program is preemptible? I don't see any assertions or
> > > even a comment that says this code is non-preemptible.
> > > 
> > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > which is not sufficient.
> > > 
> > > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > > 
> > > I really don't much like direct usage of qspinlock; esp. not as a
> > > surprise.
> 
> Substituting the lightweight-reader SRCU as discussed earlier would allow
> use of a more generic locking primitive, for example, one that allowed
> blocking, at least in cases were the context allowed this.
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
> branch srcu-lr.2019.01.16a.
> 
> One advantage of a more generic locking primitive would be keeping BPF
> programs independent of internal changes to spinlock primitives.

Let's keep "srcu in bpf" discussion separate from bpf_spin_lock discussion.
bpf is not switching to srcu any time soon.
If/when it happens it will be only for certain prog+map types
like bpf syscall probes that need to be able to do copy_from_user
from bpf prog.
Jann Horn Jan. 25, 2019, 12:18 a.m. UTC | #6
On Fri, Jan 25, 2019 at 12:59 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > Thanks for having kernel/locking people on Cc...
> >
> > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> >
> > > Implementation details:
> > > - on !SMP bpf_spin_lock() becomes nop
> >
> > Because no BPF program is preemptible? I don't see any assertions or
> > even a comment that says this code is non-preemptible.
> >
> > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > which is not sufficient.
>
> nope. all bpf prog types disable preemption. That is must have for all
> sorts of things to work properly.
> If there is a prog type that doing rcu_read_lock only it's a serious bug.
> About a year or so ago we audited everything specifically to make
> sure everything disables preemption before calling bpf progs.
> I'm pretty sure nothing crept in in the mean time.

Hmm? What about
unix_dgram_sendmsg->sk_filter->sk_filter_trim_cap->bpf_prog_run_save_cb->BPF_PROG_RUN?
That just holds rcu_read_lock(), as far as I can tell...

In general, for tricky rules about locks that must be held or contexts
that code can run in, it makes sense to use some check that yells when
that rule is being violated, at least in debug builds. Otherwise it's
too easy for bugs to creep in...
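
Purely as a sketch of that kind of check (the config gating and placement are
assumptions, not something in the patch), the lock helper could do something like:

static inline void bpf_spin_lock_assert_ctx(void)
{
	/* hypothetical: yell if the helper ever runs preemptibly */
	if (IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP))
		WARN_ON_ONCE(preemptible());
}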
Paul E. McKenney Jan. 25, 2019, 1:22 a.m. UTC | #7
On Thu, Jan 24, 2019 at 04:05:16PM -0800, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 03:42:32PM -0800, Paul E. McKenney wrote:
> > On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > > 
> > > > Thanks for having kernel/locking people on Cc...
> > > > 
> > > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > > 
> > > > > Implementation details:
> > > > > - on !SMP bpf_spin_lock() becomes nop
> > > > 
> > > > Because no BPF program is preemptible? I don't see any assertions or
> > > > even a comment that says this code is non-preemptible.
> > > > 
> > > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > > which is not sufficient.
> > > > 
> > > > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > > > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > > > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > > > 
> > > > I really don't much like direct usage of qspinlock; esp. not as a
> > > > surprise.
> > 
> > Substituting the lightweight-reader SRCU as discussed earlier would allow
> > use of a more generic locking primitive, for example, one that allowed
> > blocking, at least in cases were the context allowed this.
> > 
> > git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
> > branch srcu-lr.2019.01.16a.
> > 
> > One advantage of a more generic locking primitive would be keeping BPF
> > programs independent of internal changes to spinlock primitives.
> 
> Let's keep "srcu in bpf" discussion separate from bpf_spin_lock discussion.
> bpf is not switching to srcu any time soon.
> If/when it happens it will be only for certain prog+map types
> like bpf syscall probes that need to be able to do copy_from_user
> from bpf prog.

Hmmm...  What prevents BPF programs from looping infinitely within an
RCU reader, and as you noted, preemption disabled?

If BPF programs are in fact allowed to loop infinitely, it would be
very good for the health of the kernel to have preemption enabled.
And to be within an SRCU read-side critical section instead of an RCU
read-side critical section.

							Thanx, Paul
Jann Horn Jan. 25, 2019, 1:46 a.m. UTC | #8
On Fri, Jan 25, 2019 at 2:22 AM Paul E. McKenney <paulmck@linux.ibm.com> wrote:
> On Thu, Jan 24, 2019 at 04:05:16PM -0800, Alexei Starovoitov wrote:
> > On Thu, Jan 24, 2019 at 03:42:32PM -0800, Paul E. McKenney wrote:
> > > On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> > > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > > >
> > > > > Thanks for having kernel/locking people on Cc...
> > > > >
> > > > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > > >
> > > > > > Implementation details:
> > > > > > - on !SMP bpf_spin_lock() becomes nop
> > > > >
> > > > > Because no BPF program is preemptible? I don't see any assertions or
> > > > > even a comment that says this code is non-preemptible.
> > > > >
> > > > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > > > which is not sufficient.
> > > > >
> > > > > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > > > > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > > > > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > > > >
> > > > > I really don't much like direct usage of qspinlock; esp. not as a
> > > > > surprise.
> > >
> > > Substituting the lightweight-reader SRCU as discussed earlier would allow
> > > use of a more generic locking primitive, for example, one that allowed
> > > blocking, at least in cases were the context allowed this.
> > >
> > > git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
> > > branch srcu-lr.2019.01.16a.
> > >
> > > One advantage of a more generic locking primitive would be keeping BPF
> > > programs independent of internal changes to spinlock primitives.
> >
> > Let's keep "srcu in bpf" discussion separate from bpf_spin_lock discussion.
> > bpf is not switching to srcu any time soon.
> > If/when it happens it will be only for certain prog+map types
> > like bpf syscall probes that need to be able to do copy_from_user
> > from bpf prog.
>
> Hmmm...  What prevents BPF programs from looping infinitely within an
> RCU reader, and as you noted, preemption disabled?
>
> If BPF programs are in fact allowed to loop infinitely, it would be
> very good for the health of the kernel to have preemption enabled.
> And to be within an SRCU read-side critical section instead of an RCU
> read-side critical section.

The BPF verifier prevents loops; this is in push_insn() in
kernel/bpf/verifier.c, which errors out with -EINVAL when a back edge
is encountered. For non-root programs, that limits the maximum number
of instructions per eBPF engine execution to
BPF_MAXINSNS*MAX_TAIL_CALL_CNT==4096*32==131072 (but that includes
call instructions, which can cause relatively expensive operations
like hash table lookups). For programs created with CAP_SYS_ADMIN,
things get more tricky because you can create your own functions and
call them repeatedly; I'm not sure whether the pessimal runtime there
becomes exponential, or whether there is some check that catches this.
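
As a small sketch in the test_verifier.c insn-macro style (array name is
illustrative), a program with a back edge that the verifier refuses to load:

	struct bpf_insn loop_prog[] = {
		BPF_MOV64_IMM(BPF_REG_0, 0),
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
		/* jump back to the add: a back edge, so check_cfg()/push_insn()
		 * errors out with -EINVAL before the program can ever run
		 */
		BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 10, -2),
		BPF_EXIT_INSN(),
	};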
Eric Dumazet Jan. 25, 2019, 2:29 a.m. UTC | #9
On 01/24/2019 03:58 PM, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:

>> and from NMI ...
> 
> progs are not preemptable and map syscall accessors have bpf_prog_active counters.
> So nmi/kprobe progs will not be running when syscall is running.
> Hence dead lock is not possible and irq_save is not needed.


Speaking of NMI, how can pcpu_freelist_push() and pop() actually work?

It seems bpf_get_stackid() can be called from NMI, and lockdep seems to complain loudly:

lpaa5:/export/hda3/google/edumazet# ./test_progs 
test_pkt_access:PASS:ipv4 48 nsec
test_pkt_access:PASS:ipv6 49 nsec
test_prog_run_xattr:PASS:load 0 nsec
test_prog_run_xattr:PASS:run 316 nsec
test_prog_run_xattr:PASS:data_size_out 316 nsec
test_prog_run_xattr:PASS:overflow 316 nsec
test_prog_run_xattr:PASS:run_no_output 431 nsec
test_prog_run_xattr:PASS:run_wrong_size_out 431 nsec
test_xdp:PASS:ipv4 1397 nsec
test_xdp:PASS:ipv6 523 nsec
test_xdp_adjust_tail:PASS:ipv4 382 nsec
test_xdp_adjust_tail:PASS:ipv6 210 nsec
test_l4lb:PASS:ipv4 143 nsec
test_l4lb:PASS:ipv6 164 nsec
libbpf: incorrect bpf_call opcode
libbpf: incorrect bpf_call opcode
test_tcp_estats:PASS: 0 nsec
test_bpf_obj_id:PASS:get-fd-by-notexist-prog-id 0 nsec
test_bpf_obj_id:PASS:get-fd-by-notexist-map-id 0 nsec
test_bpf_obj_id:PASS:get-map-info(fd) 0 nsec
test_bpf_obj_id:PASS:get-prog-info(fd) 0 nsec
test_bpf_obj_id:PASS:get-map-info(fd) 0 nsec
test_bpf_obj_id:PASS:get-prog-info(fd) 0 nsec
test_bpf_obj_id:PASS:get-prog-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-prog-fd-bad-nr-map-ids 0 nsec
test_bpf_obj_id:PASS:get-prog-info(next_id->fd) 0 nsec
test_bpf_obj_id:PASS:get-prog-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-prog-fd-bad-nr-map-ids 0 nsec
test_bpf_obj_id:PASS:get-prog-info(next_id->fd) 0 nsec
test_bpf_obj_id:PASS:check total prog id found by get_next_id 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:check get-map-info(next_id->fd) 0 nsec
test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec
test_bpf_obj_id:PASS:check get-map-info(next_id->fd) 0 nsec
test_bpf_obj_id:PASS:check total map id found by get_next_id 0 nsec
test_pkt_md_access:PASS: 76 nsec
test_obj_name:PASS:check-bpf-prog-name 0 nsec
test_obj_name:PASS:check-bpf-map-name 0 nsec
test_obj_name:PASS:check-bpf-prog-name 0 nsec
test_obj_name:PASS:check-bpf-map-name 0 nsec
test_obj_name:PASS:check-bpf-prog-name 0 nsec
test_obj_name:PASS:check-bpf-map-name 0 nsec
test_obj_name:PASS:check-bpf-prog-name 0 nsec
test_obj_name:PASS:check-bpf-map-name 0 nsec
test_tp_attach_query:PASS:open 0 nsec
test_tp_attach_query:PASS:read 0 nsec
test_tp_attach_query:PASS:prog_load 0 nsec
test_tp_attach_query:PASS:bpf_obj_get_info_by_fd 0 nsec
test_tp_attach_query:PASS:perf_event_open 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_enable 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_set_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:prog_load 0 nsec
test_tp_attach_query:PASS:bpf_obj_get_info_by_fd 0 nsec
test_tp_attach_query:PASS:perf_event_open 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_enable 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_set_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:prog_load 0 nsec
test_tp_attach_query:PASS:bpf_obj_get_info_by_fd 0 nsec
test_tp_attach_query:PASS:perf_event_open 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_enable 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_set_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_tp_attach_query:PASS:perf_event_ioc_query_bpf 0 nsec
test_stacktrace_map:PASS:prog_load 0 nsec
test_stacktrace_map:PASS:open 0 nsec
test_stacktrace_map:PASS:perf_event_open 0 nsec
test_stacktrace_map:PASS:compare_map_keys stackid_hmap vs. stackmap 0 nsec
test_stacktrace_map:PASS:compare_map_keys stackmap vs. stackid_hmap 0 nsec
test_stacktrace_map:PASS:compare_stack_ips stackmap vs. stack_amap 0 nsec
test_stacktrace_build_id:PASS:prog_load 0 nsec
test_stacktrace_build_id:PASS:open 0 nsec
test_stacktrace_build_id:PASS:read 0 nsec
test_stacktrace_build_id:PASS:perf_event_open 0 nsec
test_stacktrace_build_id:PASS:perf_event_ioc_enable 0 nsec
test_stacktrace_build_id:PASS:perf_event_ioc_set_bpf 0 nsec
test_stacktrace_build_id:PASS:bpf_find_map control_map 0 nsec
test_stacktrace_build_id:PASS:bpf_find_map stackid_hmap 0 nsec
test_stacktrace_build_id:PASS:bpf_find_map stackmap 0 nsec
test_stacktrace_build_id:PASS:bpf_find_map stack_amap 0 nsec
test_stacktrace_build_id:PASS:compare_map_keys stackid_hmap vs. stackmap 0 nsec
test_stacktrace_build_id:PASS:compare_map_keys stackmap vs. stackid_hmap 0 nsec
test_stacktrace_build_id:PASS:get build_id with readelf 0 nsec
test_stacktrace_build_id:PASS:get_next_key from stackmap 0 nsec
test_stacktrace_build_id:PASS:lookup_elem from stackmap 0 nsec
test_stacktrace_build_id:PASS:lookup_elem from stackmap 0 nsec
test_stacktrace_build_id:PASS:build id match 0 nsec
test_stacktrace_build_id:PASS:compare_stack_ips stackmap vs. stack_amap 0 nsec
test_stacktrace_build_id_nmi:PASS:prog_load 0 nsec
test_stacktrace_build_id_nmi:PASS:perf_event_open 0 nsec
test_stacktrace_build_id_nmi:PASS:perf_event_ioc_enable 0 nsec
test_stacktrace_build_id_nmi:PASS:perf_event_ioc_set_bpf 0 nsec
test_stacktrace_build_id_nmi:PASS:bpf_find_map control_map 0 nsec
test_stacktrace_build_id_nmi:PASS:bpf_find_map stackid_hmap 0 nsec
test_stacktrace_build_id_nmi:PASS:bpf_find_map stackmap 0 nsec
test_stacktrace_build_id_nmi:PASS:bpf_find_map stack_amap 0 nsec
test_stacktrace_build_id_nmi:PASS:compare_map_keys stackid_hmap vs. stackmap 0 nsec
test_stacktrace_build_id_nmi:PASS:compare_map_keys stackmap vs. stackid_hmap 0 nsec
test_stacktrace_build_id_nmi:PASS:get build_id with readelf 0 nsec
test_stacktrace_build_id_nmi:PASS:get_next_key from stackmap 0 nsec
test_stacktrace_build_id_nmi:PASS:lookup_elem from stackmap 0 nsec
test_stacktrace_build_id_nmi:PASS:lookup_elem from stackmap 0 nsec
test_stacktrace_build_id_nmi:PASS:lookup_elem from stackmap 0 nsec
test_stacktrace_build_id_nmi:PASS:lookup_elem from stackmap 0 nsec
test_stacktrace_build_id_nmi:PASS:build id match 0 nsec
test_stacktrace_map_raw_tp:PASS:prog_load raw tp 0 nsec
test_stacktrace_map_raw_tp:PASS:raw_tp_open 0 nsec
test_stacktrace_map_raw_tp:PASS:compare_map_keys stackid_hmap vs. stackmap 0 nsec
test_stacktrace_map_raw_tp:PASS:compare_map_keys stackmap vs. stackid_hmap 0 nsec
test_get_stack_raw_tp:PASS:prog_load raw tp 0 nsec
test_get_stack_raw_tp:PASS:raw_tp_open 0 nsec
test_get_stack_raw_tp:PASS:bpf_find_map 0 nsec
test_get_stack_raw_tp:PASS:load_kallsyms 0 nsec
test_get_stack_raw_tp:PASS:perf_event_open 0 nsec
test_get_stack_raw_tp:PASS:bpf_map_update_elem 0 nsec
test_get_stack_raw_tp:PASS:ioctl PERF_EVENT_IOC_ENABLE 0 nsec
test_get_stack_raw_tp:PASS:perf_event_mmap 0 nsec
test_get_stack_raw_tp:PASS:perf_event_poller 0 nsec
test_task_fd_query_rawtp:PASS:prog_load raw tp 0 nsec
test_task_fd_query_rawtp:PASS:raw_tp_open 0 nsec
test_task_fd_query_rawtp:PASS:bpf_task_fd_query 0 nsec
test_task_fd_query_rawtp:PASS:check_results 0 nsec
test_task_fd_query_rawtp:PASS:bpf_task_fd_query (len = 0) 0 nsec
test_task_fd_query_rawtp:PASS:check_results 0 nsec
test_task_fd_query_rawtp:PASS:bpf_task_fd_query (buf = 0) 0 nsec
test_task_fd_query_rawtp:PASS:check_results 0 nsec
test_task_fd_query_rawtp:PASS:bpf_task_fd_query (len = 3) 0 nsec
test_task_fd_query_rawtp:PASS:check_results 0 nsec
test_task_fd_query_tp_core:PASS:bpf_prog_load 0 nsec
test_task_fd_query_tp_core:PASS:open 0 nsec
test_task_fd_query_tp_core:PASS:read 0 nsec
test_task_fd_query_tp_core:PASS:perf_event_open 0 nsec
test_task_fd_query_tp_core:PASS:perf_event_ioc_enable 0 nsec
test_task_fd_query_tp_core:PASS:perf_event_ioc_set_bpf 0 nsec
test_task_fd_query_tp_core:PASS:bpf_task_fd_query 0 nsec
test_task_fd_query_tp_core:PASS:check_results 0 nsec
test_task_fd_query_tp_core:PASS:bpf_prog_load 0 nsec
test_task_fd_query_tp_core:PASS:open 0 nsec
test_task_fd_query_tp_core:PASS:read 0 nsec
test_task_fd_query_tp_core:PASS:perf_event_open 0 nsec
test_task_fd_query_tp_core:PASS:perf_event_ioc_enable 0 nsec
test_task_fd_query_tp_core:PASS:perf_event_ioc_set_bpf 0 nsec
test_task_fd_query_tp_core:PASS:bpf_task_fd_query 0 nsec
test_task_fd_query_tp_core:PASS:check_results 0 nsec
test_reference_tracking:PASS:sk_lookup_success 0 nsec
test_reference_tracking:PASS:sk_lookup_success_simple 0 nsec
test_reference_tracking:PASS:fail_use_after_free 0 nsec
test_reference_tracking:PASS:fail_modify_sk_pointer 0 nsec
test_reference_tracking:PASS:fail_modify_sk_or_null_pointer 0 nsec
test_reference_tracking:PASS:fail_no_release 0 nsec
test_reference_tracking:PASS:fail_release_twice 0 nsec
test_reference_tracking:PASS:fail_release_unchecked 0 nsec
test_reference_tracking:PASS:fail_no_release_subcall 0 nsec
test_queue_stack_map:PASS:bpf_map_pop_elem 342 nsec
test_queue_stack_map:PASS:check-queue-stack-map-empty 323 nsec
test_queue_stack_map:PASS:bpf_map_push_elem 323 nsec
test_queue_stack_map:PASS:bpf_map_pop_elem 325 nsec
test_queue_stack_map:PASS:check-queue-stack-map-empty 311 nsec
test_queue_stack_map:PASS:bpf_map_push_elem 311 nsec
Summary: 175 PASSED, 2 FAILED

[  429.727565] ========================================================
[  429.733916] WARNING: possible irq lock inversion dependency detected
[  429.740282] 5.0.0-dbg-DEV #550 Not tainted
[  429.744381] --------------------------------------------------------
[  429.750743] dd/16374 just changed the state of lock:
[  429.755751] 0000000062c2321f (&head->lock){+...}, at: pcpu_freelist_push+0x28/0x50
[  429.763348] but this lock was taken by another, HARDIRQ-safe lock in the past:
[  429.770609]  (&rq->lock){-.-.}
[  429.770610] 
               
               and interrupts could create inverse lock ordering between them.

[  429.785089] 
               other info that might help us debug this:
[  429.791630]  Possible interrupt unsafe locking scenario:

[  429.798432]        CPU0                    CPU1
[  429.802964]        ----                    ----
[  429.807513]   lock(&head->lock);
[  429.810746]                                local_irq_disable();
[  429.816700]                                lock(&rq->lock);
[  429.822322]                                lock(&head->lock);
[  429.828094]   <Interrupt>
[  429.830724]     lock(&rq->lock);
[  429.833977] 
                *** DEADLOCK ***

[  429.839929] 1 lock held by dd/16374:
[  429.843526]  #0: 00000000a4c09748 (rcu_read_lock){....}, at: trace_call_bpf+0x38/0x1f0
[  429.851498] 
               the shortest dependencies between 2nd lock and 1st lock:
[  429.859348]  -> (&rq->lock){-.-.} {
[  429.862847]     IN-HARDIRQ-W at:
[  429.866091]                       lock_acquire+0xa7/0x190
[  429.871531]                       _raw_spin_lock+0x2f/0x40
[  429.877051]                       scheduler_tick+0x51/0x100
[  429.882621]                       update_process_times+0x6f/0x90
[  429.888660]                       tick_periodic+0x2b/0xc0
[  429.894065]                       tick_handle_periodic+0x25/0x70
[  429.900096]                       timer_interrupt+0x15/0x20
[  429.905729]                       __handle_irq_event_percpu+0x44/0x2b0
[  429.912265]                       handle_irq_event+0x60/0xc0
[  429.917924]                       handle_edge_irq+0x8b/0x220
[  429.923570]                       handle_irq+0x21/0x30
[  429.928724]                       do_IRQ+0x64/0x120
[  429.933613]                       ret_from_intr+0x0/0x1d
[  429.938935]                       timer_irq_works+0x5b/0xfd
[  429.944528]                       setup_IO_APIC+0x378/0x82b
[  429.950101]                       apic_intr_mode_init+0x16d/0x172
[  429.956202]                       x86_late_time_init+0x15/0x1c
[  429.962049]                       start_kernel+0x44b/0x4fe
[  429.967532]                       x86_64_start_reservations+0x24/0x26
[  429.973988]                       x86_64_start_kernel+0x6f/0x72
[  429.979942]                       secondary_startup_64+0xa4/0xb0
[  429.985960]     IN-SOFTIRQ-W at:
[  429.989183]                       lock_acquire+0xa7/0x190
[  429.994582]                       _raw_spin_lock+0x2f/0x40
[  430.000067]                       try_to_wake_up+0x1ef/0x610
[  430.005740]                       wake_up_process+0x15/0x20
[  430.011314]                       swake_up_one+0x36/0x70
[  430.016648]                       rcu_gp_kthread_wake+0x3c/0x40
[  430.022591]                       rcu_accelerate_cbs_unlocked+0x8c/0xd0
[  430.029219]                       rcu_process_callbacks+0xdd/0xc50
[  430.035434]                       __do_softirq+0x105/0x465
[  430.040952]                       irq_exit+0xc8/0xd0
[  430.045932]                       smp_apic_timer_interrupt+0xa5/0x230
[  430.052387]                       apic_timer_interrupt+0xf/0x20
[  430.058331]                       clear_page_erms+0x7/0x10
[  430.063825]                       __alloc_pages_nodemask+0x165/0x390
[  430.070201]                       dsalloc_pages+0x65/0x90
[  430.075657]                       reserve_ds_buffers+0x136/0x500
[  430.081679]                       x86_reserve_hardware+0x16d/0x180
[  430.087874]                       x86_pmu_event_init+0x4b/0x210
[  430.093828]                       perf_try_init_event+0x8f/0xb0
[  430.099744]                       perf_event_alloc+0xa1d/0xc90
[  430.105576]                       perf_event_create_kernel_counter+0x24/0x150
[  430.112721]                       hardlockup_detector_event_create+0x41/0x90
[  430.119785]                       hardlockup_detector_perf_enable+0xe/0x40
[  430.126653]                       watchdog_nmi_enable+0xe/0x20
[  430.132501]                       watchdog_enable+0xb8/0xd0
[  430.138097]                       softlockup_start_fn+0x15/0x20
[  430.144015]                       smp_call_on_cpu_callback+0x2a/0x60
[  430.150393]                       process_one_work+0x1f4/0x580
[  430.156231]                       worker_thread+0x6f/0x430
[  430.161708]                       kthread+0x132/0x170
[  430.166791]                       ret_from_fork+0x24/0x30
[  430.172180]     INITIAL USE at:
[  430.175334]                      lock_acquire+0xa7/0x190
[  430.180656]                      _raw_spin_lock_irqsave+0x3a/0x50
[  430.186763]                      rq_attach_root+0x1d/0xd0
[  430.192153]                      sched_init+0x30b/0x40d
[  430.197409]                      start_kernel+0x283/0x4fe
[  430.202825]                      x86_64_start_reservations+0x24/0x26
[  430.209181]                      x86_64_start_kernel+0x6f/0x72
[  430.215032]                      secondary_startup_64+0xa4/0xb0
[  430.220964]   }
[  430.222724]   ... key      at: [<ffffffff88f7dc28>] __key.71964+0x0/0x8
[  430.229345]   ... acquired at:
[  430.232404]    _raw_spin_lock+0x2f/0x40
[  430.236260]    pcpu_freelist_pop+0x77/0xf0
[  430.240349]    bpf_get_stackid+0x1c4/0x440
[  430.244465]    bpf_get_stackid_tp+0x11/0x20

[  430.250156] -> (&head->lock){+...} {
[  430.253742]    HARDIRQ-ON-W at:
[  430.256878]                     lock_acquire+0xa7/0x190
[  430.262095]                     _raw_spin_lock+0x2f/0x40
[  430.267397]                     pcpu_freelist_push+0x28/0x50
[  430.273088]                     bpf_get_stackid+0x41e/0x440
[  430.278677]                     bpf_get_stackid_tp+0x11/0x20
[  430.284350]    INITIAL USE at:
[  430.287411]                    lock_acquire+0xa7/0x190
[  430.292591]                    _raw_spin_lock+0x2f/0x40
[  430.297833]                    pcpu_freelist_populate+0xc3/0x120
[  430.303840]                    htab_map_alloc+0x41e/0x550
[  430.309245]                    __do_sys_bpf+0x27e/0x1b50
[  430.314591]                    __x64_sys_bpf+0x1a/0x20
[  430.319738]                    do_syscall_64+0x5a/0x460
[  430.324962]                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  430.331598]  }
[  430.333274]  ... key      at: [<ffffffff89bfce48>] __key.13162+0x0/0x8
[  430.339821]  ... acquired at:
[  430.342795]    mark_lock+0x3c4/0x630
[  430.346371]    __lock_acquire+0x3b2/0x1850
[  430.350493]    lock_acquire+0xa7/0x190
[  430.354282]    _raw_spin_lock+0x2f/0x40
[  430.358129]    pcpu_freelist_push+0x28/0x50
[  430.362331]    bpf_get_stackid+0x41e/0x440
[  430.366438]    bpf_get_stackid_tp+0x11/0x20

[  430.372128] 
               stack backtrace:
[  430.376479] CPU: 29 PID: 16374 Comm: dd Not tainted 5.0.0-dbg-DEV #550
[  430.383008] Hardware name: Intel RML,PCH/Iota_QC_19, BIOS 2.54.0 06/07/2018
[  430.389967] Call Trace:
[  430.392433]  dump_stack+0x67/0x95
[  430.395756]  print_irq_inversion_bug.part.38+0x1b8/0x1c4
[  430.401066]  check_usage_backwards+0x156/0x160
[  430.405523]  mark_lock+0x3c4/0x630
[  430.408925]  ? mark_lock+0x3c4/0x630
[  430.412496]  ? print_shortest_lock_dependencies+0x1b0/0x1b0
[  430.418092]  __lock_acquire+0x3b2/0x1850
[  430.422018]  ? find_get_entry+0x1b1/0x320
[  430.426050]  ? find_get_entry+0x1d0/0x320
[  430.430063]  lock_acquire+0xa7/0x190
[  430.433667]  ? lock_acquire+0xa7/0x190
[  430.437433]  ? pcpu_freelist_push+0x28/0x50
[  430.441635]  _raw_spin_lock+0x2f/0x40
[  430.445317]  ? pcpu_freelist_push+0x28/0x50
[  430.449494]  pcpu_freelist_push+0x28/0x50
[  430.453513]  bpf_get_stackid+0x41e/0x440
[  430.457446]  bpf_get_stackid_tp+0x11/0x20
[  430.461450]  ? trace_call_bpf+0xe4/0x1f0
[  430.465365]  ? perf_trace_run_bpf_submit+0x42/0xb0
[  430.470155]  ? perf_trace_urandom_read+0xbf/0x100
[  430.474851]  ? urandom_read+0x20f/0x350
[  430.478685]  ? vfs_read+0xb8/0x190
[  430.482096]  ? __x64_sys_read+0x61/0xd0
[  430.485949]  ? do_syscall_64+0x21/0x460
[  430.489797]  ? do_syscall_64+0x5a/0x460
[  430.493635]  ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  451.715758] perf: interrupt took too long (2516 > 2500), lowering kernel.perf_event_max_sample_rate to 79000
Alexei Starovoitov Jan. 25, 2019, 2:34 a.m. UTC | #10
On Thu, Jan 24, 2019 at 06:29:55PM -0800, Eric Dumazet wrote:
> 
> 
> On 01/24/2019 03:58 PM, Alexei Starovoitov wrote:
> > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> 
> >> and from NMI ...
> > 
> > progs are not preemptable and map syscall accessors have bpf_prog_active counters.
> > So nmi/kprobe progs will not be running when syscall is running.
> > Hence dead lock is not possible and irq_save is not needed.
> 
> 
> Speaking of NMI, how pcpu_freelist_push() and pop() can actually work ?
> 
> It seems bpf_get_stackid() can be called from NMI, and lockdep seems to complain loudly

it's a known false positive.
https://lkml.org/lkml/2018/7/25/756
and the same answer as before:
we're not going to penalize performance to shut up a false positive.
Alexei Starovoitov Jan. 25, 2019, 2:38 a.m. UTC | #11
On Fri, Jan 25, 2019 at 02:46:55AM +0100, Jann Horn wrote:
> On Fri, Jan 25, 2019 at 2:22 AM Paul E. McKenney <paulmck@linux.ibm.com> wrote:
> > On Thu, Jan 24, 2019 at 04:05:16PM -0800, Alexei Starovoitov wrote:
> > > On Thu, Jan 24, 2019 at 03:42:32PM -0800, Paul E. McKenney wrote:
> > > > On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> > > > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > > > >
> > > > > > Thanks for having kernel/locking people on Cc...
> > > > > >
> > > > > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > > > >
> > > > > > > Implementation details:
> > > > > > > - on !SMP bpf_spin_lock() becomes nop
> > > > > >
> > > > > > Because no BPF program is preemptible? I don't see any assertions or
> > > > > > even a comment that says this code is non-preemptible.
> > > > > >
> > > > > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > > > > which is not sufficient.
> > > > > >
> > > > > > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > > > > > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > > > > > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > > > > >
> > > > > > I really don't much like direct usage of qspinlock; esp. not as a
> > > > > > surprise.
> > > >
> > > > Substituting the lightweight-reader SRCU as discussed earlier would allow
> > > > use of a more generic locking primitive, for example, one that allowed
> > > > blocking, at least in cases were the context allowed this.
> > > >
> > > > git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
> > > > branch srcu-lr.2019.01.16a.
> > > >
> > > > One advantage of a more generic locking primitive would be keeping BPF
> > > > programs independent of internal changes to spinlock primitives.
> > >
> > > Let's keep "srcu in bpf" discussion separate from bpf_spin_lock discussion.
> > > bpf is not switching to srcu any time soon.
> > > If/when it happens it will be only for certain prog+map types
> > > like bpf syscall probes that need to be able to do copy_from_user
> > > from bpf prog.
> >
> > Hmmm...  What prevents BPF programs from looping infinitely within an
> > RCU reader, and as you noted, preemption disabled?
> >
> > If BPF programs are in fact allowed to loop infinitely, it would be
> > very good for the health of the kernel to have preemption enabled.
> > And to be within an SRCU read-side critical section instead of an RCU
> > read-side critical section.
> 
> The BPF verifier prevents loops; this is in push_insn() in
> kernel/bpf/verifier.c, which errors out with -EINVAL when a back edge
> is encountered. For non-root programs, that limits the maximum number
> of instructions per eBPF engine execution to
> BPF_MAXINSNS*MAX_TAIL_CALL_CNT==4096*32==131072 (but that includes
> call instructions, which can cause relatively expensive operations
> like hash table lookups).

correct.

> For programs created with CAP_SYS_ADMIN,
> things get more tricky because you can create your own functions and
> call them repeatedly; I'm not sure whether the pessimal runtime there
> becomes exponential, or whether there is some check that catches this.

I think you're referring to bpf-to-bpf calls.
The limit it still the same. 4k per program including all calls.
tail calls are not allowed when bpf-to-bpf is used. So no 32 multiplier.

Note that classic bpf has the same 4k limit and it can call
expensive functions too via SKF_AD extensions.
Eric Dumazet Jan. 25, 2019, 2:44 a.m. UTC | #12
On 01/24/2019 06:34 PM, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 06:29:55PM -0800, Eric Dumazet wrote:
>>
>>
>> On 01/24/2019 03:58 PM, Alexei Starovoitov wrote:
>>> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
>>
>>>> and from NMI ...
>>>
>>> progs are not preemptable and map syscall accessors have bpf_prog_active counters.
>>> So nmi/kprobe progs will not be running when syscall is running.
>>> Hence dead lock is not possible and irq_save is not needed.
>>
>>
>> Speaking of NMI, how pcpu_freelist_push() and pop() can actually work ?
>>
>> It seems bpf_get_stackid() can be called from NMI, and lockdep seems to complain loudly
> 
> it's a known false positive.
> https://lkml.org/lkml/2018/7/25/756
> and the same answer as before:
> we're not going to penalize performance to shut up false positive.
> 

As far as lockdep is concerned, I do not believe we care about performance.

How can we remove this false positive, so that lockdep stays alive even after running bpf test_progs?


Let's see if we understood this well (a minimal sketch of steps 1-3 follows the list).

1. create perf event PERF_TYPE_HARDWARE:PERF_COUNT_HW_CPU_CYCLES
2. attach a bpf program to this event
3. since that's a hw event, the bpf program is executed in NMI context
4. the bpf program calls bpf_get_stackid to record the trace in a bpf map
5. bpf_get_stackid calls pcpu_freelist_pop and pcpu_freelist_push from NMI
6. userspace calls sys_bpf(bpf_map_lookup_elem) which calls bpf_stackmap_copy which can call pcpu_freelist_push
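
For reference, a minimal user space sketch of steps 1-3 (error handling omitted;
prog_fd is assumed to be an already-loaded BPF program fd):

	struct perf_event_attr attr = {
		.type = PERF_TYPE_HARDWARE,
		.size = sizeof(attr),
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.sample_period = 100000,
	};
	int pfd = syscall(__NR_perf_event_open, &attr, 0 /* this pid */,
			  -1 /* any cpu */, -1, 0);

	ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd);	/* step 2 */
	ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0);	/* samples now run the prog in NMI */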


It seems pcpu_freelist_pop and pcpu_freelist_push are not NMI safe,
so what prevents bad things from happening?

Thanks.
Alexei Starovoitov Jan. 25, 2019, 2:49 a.m. UTC | #13
On Fri, Jan 25, 2019 at 01:18:04AM +0100, Jann Horn wrote:
> On Fri, Jan 25, 2019 at 12:59 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > Thanks for having kernel/locking people on Cc...
> > >
> > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > >
> > > > Implementation details:
> > > > - on !SMP bpf_spin_lock() becomes nop
> > >
> > > Because no BPF program is preemptible? I don't see any assertions or
> > > even a comment that says this code is non-preemptible.
> > >
> > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > which is not sufficient.
> >
> > nope. all bpf prog types disable preemption. That is must have for all
> > sorts of things to work properly.
> > If there is a prog type that doing rcu_read_lock only it's a serious bug.
> > About a year or so ago we audited everything specifically to make
> > sure everything disables preemption before calling bpf progs.
> > I'm pretty sure nothing crept in in the mean time.
> 
> Hmm? What about
> unix_dgram_sendmsg->sk_filter->sk_filter_trim_cap->bpf_prog_run_save_cb->BPF_PROG_RUN?
> That just holds rcu_read_lock(), as far as I can tell...

Looking into it.
First reaction: per-cpu maps and bpf_get_smp_processor_id/numa_id
will return bogus values for socket filters attached on the sender side
in CONFIG_PREEMPT kernels. The receive side runs in bh.
Not a security issue, but something we have to fix.
Alexei Starovoitov Jan. 25, 2019, 2:57 a.m. UTC | #14
On Thu, Jan 24, 2019 at 06:44:20PM -0800, Eric Dumazet wrote:
> 
> 
> On 01/24/2019 06:34 PM, Alexei Starovoitov wrote:
> > On Thu, Jan 24, 2019 at 06:29:55PM -0800, Eric Dumazet wrote:
> >>
> >>
> >> On 01/24/2019 03:58 PM, Alexei Starovoitov wrote:
> >>> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> >>
> >>>> and from NMI ...
> >>>
> >>> progs are not preemptable and map syscall accessors have bpf_prog_active counters.
> >>> So nmi/kprobe progs will not be running when syscall is running.
> >>> Hence dead lock is not possible and irq_save is not needed.
> >>
> >>
> >> Speaking of NMI, how pcpu_freelist_push() and pop() can actually work ?
> >>
> >> It seems bpf_get_stackid() can be called from NMI, and lockdep seems to complain loudly
> > 
> > it's a known false positive.
> > https://lkml.org/lkml/2018/7/25/756
> > and the same answer as before:
> > we're not going to penalize performance to shut up false positive.
> > 
> 
> As far as lockdep is concerned, I do not believe we care about performance.
> 
> How can we remove this false positive, so that lockdep stays alive even after running bpf  test_progs ?

Like doing an irq_save version when lockdep is on?
Sure. Let's do that.
That splat has been bugging me for a very long time.
I see it every single day when I run this test before applying patches.
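
A rough sketch of that idea (the freelist struct layout below is an assumption
for illustration; only the locking pattern matters):

/* hypothetical: pay the irqsave cost only when lockdep is enabled */
static void pcpu_freelist_push_sketch(struct pcpu_freelist_head *head,
				      struct pcpu_freelist_node *node)
{
	unsigned long flags = 0;

	if (IS_ENABLED(CONFIG_LOCKDEP))
		raw_spin_lock_irqsave(&head->lock, flags);
	else
		raw_spin_lock(&head->lock);

	node->next = head->first;
	head->first = node;

	if (IS_ENABLED(CONFIG_LOCKDEP))
		raw_spin_unlock_irqrestore(&head->lock, flags);
	else
		raw_spin_unlock(&head->lock);
}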

> Let see if we understood this well.
> 
> 1. create perf event PERF_TYPE_HARDWARE:PERF_COUNT_HW_CPU_CYCLES
> 2. attach bpf probram to this event 
> 3. since that's a hw event, the bpf program is executed in NMI context
> 4. the bpf program calls bpf_get_stackid to record the trace in a bpf map
> 5. bpf_get_stackid calls pcpu_freelist_pop and pcpu_freelist_push from NMI
> 6. userspace calls sys_bpf(bpf_map_lookup_elem) which calls bpf_stackmap_copy which can call pcpu_freelist_push

argh. the lookup cmd is missing __this_cpu_inc(bpf_prog_active); like update/delete have.
Will fix.
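
Roughly, the lookup path would get the same guard update/delete already have
(the surrounding syscall code is elided; this is just the pattern):

	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	rcu_read_lock();
	/* ... existing element lookup and copy into the user buffer ... */
	rcu_read_unlock();
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();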

> It seems pcpu_freelist_pop and pcpu_freelist_push are not NMI safe,
> so what prevents bad things to happen ?

the nmi path checks for bpf_prog_active == 0. See bpf_overflow_handler.
Paul E. McKenney Jan. 25, 2019, 4:11 a.m. UTC | #15
On Fri, Jan 25, 2019 at 02:46:55AM +0100, Jann Horn wrote:
> On Fri, Jan 25, 2019 at 2:22 AM Paul E. McKenney <paulmck@linux.ibm.com> wrote:
> > On Thu, Jan 24, 2019 at 04:05:16PM -0800, Alexei Starovoitov wrote:
> > > On Thu, Jan 24, 2019 at 03:42:32PM -0800, Paul E. McKenney wrote:
> > > > On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> > > > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > > > >
> > > > > > Thanks for having kernel/locking people on Cc...
> > > > > >
> > > > > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > > > >
> > > > > > > Implementation details:
> > > > > > > - on !SMP bpf_spin_lock() becomes nop
> > > > > >
> > > > > > Because no BPF program is preemptible? I don't see any assertions or
> > > > > > even a comment that says this code is non-preemptible.
> > > > > >
> > > > > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > > > > which is not sufficient.
> > > > > >
> > > > > > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > > > > > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > > > > > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > > > > >
> > > > > > I really don't much like direct usage of qspinlock; esp. not as a
> > > > > > surprise.
> > > >
> > > > Substituting the lightweight-reader SRCU as discussed earlier would allow
> > > > use of a more generic locking primitive, for example, one that allowed
> > > > blocking, at least in cases were the context allowed this.
> > > >
> > > > git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
> > > > branch srcu-lr.2019.01.16a.
> > > >
> > > > One advantage of a more generic locking primitive would be keeping BPF
> > > > programs independent of internal changes to spinlock primitives.
> > >
> > > Let's keep "srcu in bpf" discussion separate from bpf_spin_lock discussion.
> > > bpf is not switching to srcu any time soon.
> > > If/when it happens it will be only for certain prog+map types
> > > like bpf syscall probes that need to be able to do copy_from_user
> > > from bpf prog.
> >
> > Hmmm...  What prevents BPF programs from looping infinitely within an
> > RCU reader, and as you noted, preemption disabled?
> >
> > If BPF programs are in fact allowed to loop infinitely, it would be
> > very good for the health of the kernel to have preemption enabled.
> > And to be within an SRCU read-side critical section instead of an RCU
> > read-side critical section.
> 
> The BPF verifier prevents loops; this is in push_insn() in
> kernel/bpf/verifier.c, which errors out with -EINVAL when a back edge
> is encountered. For non-root programs, that limits the maximum number
> of instructions per eBPF engine execution to
> BPF_MAXINSNS*MAX_TAIL_CALL_CNT==4096*32==131072 (but that includes
> call instructions, which can cause relatively expensive operations
> like hash table lookups). For programs created with CAP_SYS_ADMIN,
> things get more tricky because you can create your own functions and
> call them repeatedly; I'm not sure whether the pessimal runtime there
> becomes exponential, or whether there is some check that catches this.

Whew!!!  ;-)

So no more than (say) 100 milliseconds?

							Thanx, Paul
Alexei Starovoitov Jan. 25, 2019, 4:27 a.m. UTC | #16
On 1/24/19 6:38 PM, Alexei Starovoitov wrote:
>> For programs created with CAP_SYS_ADMIN,
>> things get more tricky because you can create your own functions and
>> call them repeatedly; I'm not sure whether the pessimal runtime there
>> becomes exponential, or whether there is some check that catches this.
> I think you're referring to bpf-to-bpf calls.
> The limit it still the same. 4k per program including all calls.
> tail calls are not allowed when bpf-to-bpf is used. So no 32 multiplier.

Jann,

I think you meant
main:
call A
call A
call A
exit
A:
call B
call B
call B
exit
B:
call C
...

scenario when everything fits into 4k?
Would be great if you can construct such test while we're fixing
the rest of the issues brought up in this thread.
It will definitely be no more than BPF_COMPLEXITY_LIMIT_INSNS,
which is 128k, but I wonder what the actual number of executed
insns will be.
I think such a cleverly constructed sequence can actually
hit 128k executed too.
It would be an awesome test to add to test_verifier.c.
We have some such pushing-the-boundary tests in lib/test_bpf.c
that are generated in assembler.
The longest takes 23853 nanoseconds, but a clever bpf2bpf call hack
like the above with a map_update call in the leaf function should
certainly take much longer.
I accept Paul's challenge to try to get such a fancy bpf prog
to take 100 milliseconds :)
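
To get a feel for the numbers, a quick back-of-the-envelope sketch (plain
userspace C, illustrative arithmetic only, not verifier code): with f calls
per function, b "real" insns per body and d call levels, the program text
grows roughly linearly in d while the executed count grows like f^d:

	#include <stdio.h>

	int main(void)
	{
		const long f = 3, b = 10;	/* 3 calls per function, ~10 body insns */

		for (int d = 1; d <= 8; d++) {
			long text = 0, executed = 0, calls = 1;

			for (int lvl = 0; lvl <= d; lvl++) {
				long insns = b + (lvl < d ? f : 0) + 1; /* body + calls + exit */

				text += insns;
				executed += calls * insns;
				calls *= f;
			}
			printf("depth %d: ~%ld text insns, ~%ld executed insns\n",
			       d, text, executed);
		}
		return 0;
	}

With these made-up numbers, depth 8 already lands around 118k executed insns
from only about 123 insns of program text.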
Paul E. McKenney Jan. 25, 2019, 4:31 a.m. UTC | #17
On Fri, Jan 25, 2019 at 04:27:02AM +0000, Alexei Starovoitov wrote:
> On 1/24/19 6:38 PM, Alexei Starovoitov wrote:
> >> For programs created with CAP_SYS_ADMIN,
> >> things get more tricky because you can create your own functions and
> >> call them repeatedly; I'm not sure whether the pessimal runtime there
> >> becomes exponential, or whether there is some check that catches this.
> > I think you're referring to bpf-to-bpf calls.
> > The limit it still the same. 4k per program including all calls.
> > tail calls are not allowed when bpf-to-bpf is used. So no 32 multiplier.
> 
> Jann,
> 
> I think you meant
> main:
> call A
> call A
> call A
> exit
> A:
> call B
> call B
> call B
> exit
> B:
> call C
> ...
> 
> scenario when everything fits into 4k?
> Would be great if you can construct such test while we're fixing
> the rest of the issues brought up in this thread.
> It will definitely be no more than BPF_COMPLEXITY_LIMIT_INSNS
> which is 128k, but I wonder what will be the actual number of
> executed insns.
> I think such clever constructed sequence can actually
> hit 128k executed too.
> It would be awesome test to add to test_verifier.c
> We have some of such pushing-the-boundary tests in lib/test_bpf.c
> that are generated in assembler.
> The longest takes 23853 nanoseconds, but clever bpf2bpf call hack
> like above with map_update call in the leaf function should
> certainly take much longer.
> I accept Paul's challenge to try to get such fancy bpf prog
> to take 100 millseconds :)

Fair enough!  But once you meet my challenge, the RCU CPU stall warning
code will challenge you to hit 21 seconds (or only three seconds given
an appropriately configured kernel).  ;-)

							Thanx, Paul
Alexei Starovoitov Jan. 25, 2019, 4:47 a.m. UTC | #18
On 1/24/19 8:31 PM, Paul E. McKenney wrote:
> On Fri, Jan 25, 2019 at 04:27:02AM +0000, Alexei Starovoitov wrote:
>> On 1/24/19 6:38 PM, Alexei Starovoitov wrote:
>>>> For programs created with CAP_SYS_ADMIN,
>>>> things get more tricky because you can create your own functions and
>>>> call them repeatedly; I'm not sure whether the pessimal runtime there
>>>> becomes exponential, or whether there is some check that catches this.
>>> I think you're referring to bpf-to-bpf calls.
>>> The limit it still the same. 4k per program including all calls.
>>> tail calls are not allowed when bpf-to-bpf is used. So no 32 multiplier.
>>
>> Jann,
>>
>> I think you meant
>> main:
>> call A
>> call A
>> call A
>> exit
>> A:
>> call B
>> call B
>> call B
>> exit
>> B:
>> call C
>> ...
>>
>> scenario when everything fits into 4k?
>> Would be great if you can construct such test while we're fixing
>> the rest of the issues brought up in this thread.
>> It will definitely be no more than BPF_COMPLEXITY_LIMIT_INSNS
>> which is 128k, but I wonder what will be the actual number of
>> executed insns.
>> I think such clever constructed sequence can actually
>> hit 128k executed too.
>> It would be awesome test to add to test_verifier.c
>> We have some of such pushing-the-boundary tests in lib/test_bpf.c
>> that are generated in assembler.
>> The longest takes 23853 nanoseconds, but clever bpf2bpf call hack
>> like above with map_update call in the leaf function should
>> certainly take much longer.
>> I accept Paul's challenge to try to get such fancy bpf prog
>> to take 100 millseconds :)
> 
> Fair enough!  But once you meet my challenge, the RCU CPU stall warning
> code will challenge you to hit 21 seconds (or only three seconds given
> an appropriately configured kernel).  ;-)

         if (till_stall_check < 3) {
                 WRITE_ONCE(rcu_cpu_stall_timeout, 3);
                 till_stall_check = 3;

let's change that limit to 1 !
Seriously though, folks have proposed teaching the bpf verifier
to sprinkle cond_resched() automatically into a bpf program
when the critical path through the program reaches a certain insn limit.
The verifier can easily be taught to compute the longest path.
Other folks proposed getting rid of the 4k limit when the prog
is preemptable and executing in user context.
That's when srcu will come into play.
Peter Zijlstra Jan. 25, 2019, 8:38 a.m. UTC | #19
On Thu, Jan 24, 2019 at 06:57:00PM -0800, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 06:44:20PM -0800, Eric Dumazet wrote:
> > Let see if we understood this well.
> > 
> > 1. create perf event PERF_TYPE_HARDWARE:PERF_COUNT_HW_CPU_CYCLES
> > 2. attach bpf probram to this event 
> > 3. since that's a hw event, the bpf program is executed in NMI context
> > 4. the bpf program calls bpf_get_stackid to record the trace in a bpf map
> > 5. bpf_get_stackid calls pcpu_freelist_pop and pcpu_freelist_push from NMI

How is this not a straight up bug? NMI code should not ever call code
that uses spinlocks.

> > 6. userspace calls sys_bpf(bpf_map_lookup_elem) which calls bpf_stackmap_copy which can call pcpu_freelist_push
> 
> argh. lookup cmd is missing __this_cpu_inc(bpf_prog_active); like update/delete do.
> Will fix.
> 
> > It seems pcpu_freelist_pop and pcpu_freelist_push are not NMI safe,
> > so what prevents bad things to happen ?
> 
> nmi checks for bpf_prog_active==0. See bpf_overflow_handler.

yuck yuck yuck.. That's horrific :-( That means the whole BPF crud is
unreliable and events can go randomly missing.
Peter Zijlstra Jan. 25, 2019, 9:10 a.m. UTC | #20
On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > 
> > Thanks for having kernel/locking people on Cc...
> > 
> > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > 
> > > Implementation details:
> > > - on !SMP bpf_spin_lock() becomes nop
> > 
> > Because no BPF program is preemptible? I don't see any assertions or
> > even a comment that says this code is non-preemptible.
> > 
> > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > which is not sufficient.
> 
> nope. all bpf prog types disable preemption. That is must have for all
> sorts of things to work properly.
> If there is a prog type that doing rcu_read_lock only it's a serious bug.
> About a year or so ago we audited everything specifically to make
> sure everything disables preemption before calling bpf progs.
> I'm pretty sure nothing crept in in the mean time.

Do we want something like (the completely untested) below to avoid
having to manually audit this over and over?

---
 include/linux/filter.h |  2 +-
 include/linux/kernel.h |  9 +++++++--
 kernel/sched/core.c    | 28 ++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d531d4250bff..4ab51e78da36 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -513,7 +513,7 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8f0e68e250a7..f4cea3260a28 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -245,8 +245,10 @@ extern int _cond_resched(void);
 #endif
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-  void ___might_sleep(const char *file, int line, int preempt_offset);
-  void __might_sleep(const char *file, int line, int preempt_offset);
+extern void ___might_sleep(const char *file, int line, int preempt_offset);
+extern void __might_sleep(const char *file, int line, int preempt_offset);
+extern void __cant_sleep(const char *file, int line, int preempt_offset);
+
 /**
  * might_sleep - annotation for functions that can sleep
  *
@@ -259,6 +261,8 @@ extern int _cond_resched(void);
  */
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
+# define cant_sleep() \
+	do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
 # define sched_annotate_sleep()	(current->task_state_change = 0)
 #else
   static inline void ___might_sleep(const char *file, int line,
@@ -266,6 +270,7 @@ extern int _cond_resched(void);
   static inline void __might_sleep(const char *file, int line,
 				   int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
+# define cant_sleep() do { } while (0)
 # define sched_annotate_sleep() do { } while (0)
 #endif
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee7763641348..799c285f4e0f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6162,6 +6162,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 EXPORT_SYMBOL(___might_sleep);
+
+void __cant_sleep(const char *file, int line, int preempt_offset)
+{
+	static unsigned long prev_jiffy;
+
+	if (irqs_disabled())
+		return;
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+		return;
+
+	if (preempt_count() > preempt_offset)
+		return;
+
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	dump_stack();
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
Peter Zijlstra Jan. 25, 2019, 9:59 a.m. UTC | #21
On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:

> > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > 
> > I really don't much like direct usage of qspinlock; esp. not as a
> > surprise.
> > 
> > Why does it matter if 0 means unlocked; that's what
> > __ARCH_SPIN_LOCK_UNLOCKED is for.
> > 
> > I get the sizeof(__u32) thing, but why not key off of that?
> 
> what do you mean by 'key off of that' ?
> to use arch_spinlock_t instead of qspinlock ?
> That was my first attempt, but then I painfully found that
> its size on parisc is 16 bytes and we're not going to penalize bpf
> to waste that much space because of single architecture.
> sizeof(arch_spinlock_t) can be 1 byte too (on sparc).

PowerPC has 8 bytes for some config options IIRC.

> That would fit in __u32, but I figured it's cleaner to use qspinlock
> on all archs that support it and dumb_spin_lock on archs that dont.
> 
> Another option is use to arch_spinlock_t when its sizeof==4

That's what I meant.

> and use dumb_spin_lock otherwise.
> It's doable, but imo still less clean than using qspinlock
> due to zero init. Since zero init is a lot less map work
> that zero inits all elements already.
> 
> If arch_spinlock_t is used than at map init time we would need to
> walk all elements and do __ARCH_SPIN_LOCK_UNLOCKED assignment
> (and maps can have millions of elements).
> Not horrible, but 100% waste of cycles for x86/arm64 where qspinlock
> is used. Such waste can be workaround further by doing ugly
> #idef __ARCH_SPIN_LOCK_UNLOCKED == 0 -> don't do init loop.
> And then add another #ifdef for archs with sizeof(arch_spinlock_t)!=4
> to keep zero init for all map types that support bpf_spin_lock
> via dumb_spin_lock.
> Clearly at that point we're getting into ugliness everywhere.
> Hence I've used qspinlock directly.

OK; I see.. but do these locks really have enough contention to run into
trouble with the simple test-and-set lock?

[ I tried to propose a simple ticket lock, but then realized the virt
  archs (s390,powerpc,etc.) would hate that and deleted everything again
]

Argh, what a mess..
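
For reference, the trivial test-and-set fallback being discussed is
essentially the following sketch (names and the atomic_t lock word are
illustrative, not the patch's exact code):

	static void dumb_spin_lock(atomic_t *l)
	{
		do {
			while (atomic_read(l))	/* read-only spin while held */
				cpu_relax();
		} while (atomic_xchg(l, 1));	/* fully ordered; retry if we lost the race */
	}

	static void dumb_spin_unlock(atomic_t *l)
	{
		atomic_set_release(l, 0);	/* release-order the critical section */
	}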
Peter Zijlstra Jan. 25, 2019, 10:09 a.m. UTC | #22
On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:

> > So clearly this map stuff is shared between bpf proglets, otherwise
> > there would not be a need for locking. But what happens if one is from
> > task context and another from IRQ context?
> > 
> > I don't see a local_irq_save()/restore() anywhere. What avoids the
> > trivial lock inversion?
> 
> > and from NMI ...
> 
> progs are not preemptable and map syscall accessors have bpf_prog_active counters.
> So nmi/kprobe progs will not be running when syscall is running.
> Hence dead lock is not possible and irq_save is not needed.

What about the progs that run from SoftIRQ ? Since that bpf_prog_active
thing isn't inside BPF_PROG_RUN() what is to stop say:

   reuseport_select_sock()
     ...
       BPF_PROG_RUN()
         bpf_spin_lock()
        <IRQ>
	  ...
	  BPF_PROG_RUN()
	    bpf_spin_lock() // forever more

	</IRQ>

Unless you stick that bpf_prog_active stuff inside BPF_PROG_RUN itself,
I don't see how you can fundamentally avoid this happening (now or in
the future).
Peter Zijlstra Jan. 25, 2019, 10:23 a.m. UTC | #23
On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:

> > And this would again be the moment where I go pester you about the BPF
> > memory model :-)
> 
> hehe :)
> How do you propose to define it in a way that it applies to all archs
> and yet doesn't penalize x86 ?
> "Assume minimum execution ordering model" the way kernel does
> unfortunately is not usable, since bpf doesn't have a luxury
> of using nice #defines that convert into nops on x86.

Why not? Surely the JIT can fix it up? That is, suppose you were to have
smp_rmb() as a eBPF instruction then the JIT (which knows what
architecture it is targeting) can simply avoid emitting instructions for
it.

Similarly; could something like this not also help with the spinlock
thing? Use that generic test-and-set thing for the interpreter, but
provide a better implementation in the JIT?
Paul E. McKenney Jan. 25, 2019, 4:02 p.m. UTC | #24
On Fri, Jan 25, 2019 at 04:47:20AM +0000, Alexei Starovoitov wrote:
> On 1/24/19 8:31 PM, Paul E. McKenney wrote:
> > On Fri, Jan 25, 2019 at 04:27:02AM +0000, Alexei Starovoitov wrote:
> >> On 1/24/19 6:38 PM, Alexei Starovoitov wrote:
> >>>> For programs created with CAP_SYS_ADMIN,
> >>>> things get more tricky because you can create your own functions and
> >>>> call them repeatedly; I'm not sure whether the pessimal runtime there
> >>>> becomes exponential, or whether there is some check that catches this.
> >>> I think you're referring to bpf-to-bpf calls.
> >>> The limit it still the same. 4k per program including all calls.
> >>> tail calls are not allowed when bpf-to-bpf is used. So no 32 multiplier.
> >>
> >> Jann,
> >>
> >> I think you meant
> >> main:
> >> call A
> >> call A
> >> call A
> >> exit
> >> A:
> >> call B
> >> call B
> >> call B
> >> exit
> >> B:
> >> call C
> >> ...
> >>
> >> scenario when everything fits into 4k?
> >> Would be great if you can construct such test while we're fixing
> >> the rest of the issues brought up in this thread.
> >> It will definitely be no more than BPF_COMPLEXITY_LIMIT_INSNS
> >> which is 128k, but I wonder what will be the actual number of
> >> executed insns.
> >> I think such clever constructed sequence can actually
> >> hit 128k executed too.
> >> It would be awesome test to add to test_verifier.c
> >> We have some of such pushing-the-boundary tests in lib/test_bpf.c
> >> that are generated in assembler.
> >> The longest takes 23853 nanoseconds, but clever bpf2bpf call hack
> >> like above with map_update call in the leaf function should
> >> certainly take much longer.
> >> I accept Paul's challenge to try to get such fancy bpf prog
> >> to take 100 millseconds :)
> > 
> > Fair enough!  But once you meet my challenge, the RCU CPU stall warning
> > code will challenge you to hit 21 seconds (or only three seconds given
> > an appropriately configured kernel).  ;-)
> 
>          if (till_stall_check < 3) {
>                  WRITE_ONCE(rcu_cpu_stall_timeout, 3);
>                  till_stall_check = 3;
> 
> let's change that limit to 1 !

Heh!  It was 1.5 seconds back in DYNIX/ptx.  However, taking it below
3 seconds would require some other adjustments.  Something about there
being a lot more moving parts in the Linux kernel.

> Seriously though folks have proposed to teach bpf verifier
> to sprinkle cond_resched() automatically into bpf program
> when critical path through the program reaches certain insn limit.
> The verifier can easily be taught to compute the longest path.

Good point, for PREEMPT=n, cond_resched() will help.

> Other folks proposed to get rid of 4k limit when prog
> is preemptable and executing in user context.
> That's when srcu will come into play.

OK, please let me know when you get to this point so that I can get
the lightweight variant of SRCU moving forward.

							Thanx, Paul
Jann Horn Jan. 25, 2019, 4:18 p.m. UTC | #25
On Fri, Jan 25, 2019 at 5:12 AM Paul E. McKenney <paulmck@linux.ibm.com> wrote:
> On Fri, Jan 25, 2019 at 02:46:55AM +0100, Jann Horn wrote:
> > On Fri, Jan 25, 2019 at 2:22 AM Paul E. McKenney <paulmck@linux.ibm.com> wrote:
> > > On Thu, Jan 24, 2019 at 04:05:16PM -0800, Alexei Starovoitov wrote:
> > > > On Thu, Jan 24, 2019 at 03:42:32PM -0800, Paul E. McKenney wrote:
> > > > > On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> > > > > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > > > > >
> > > > > > > Thanks for having kernel/locking people on Cc...
> > > > > > >
> > > > > > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > > > > >
> > > > > > > > Implementation details:
> > > > > > > > - on !SMP bpf_spin_lock() becomes nop
> > > > > > >
> > > > > > > Because no BPF program is preemptible? I don't see any assertions or
> > > > > > > even a comment that says this code is non-preemptible.
> > > > > > >
> > > > > > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > > > > > which is not sufficient.
> > > > > > >
> > > > > > > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > > > > > > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > > > > > > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > > > > > >
> > > > > > > I really don't much like direct usage of qspinlock; esp. not as a
> > > > > > > surprise.
> > > > >
> > > > > Substituting the lightweight-reader SRCU as discussed earlier would allow
> > > > > use of a more generic locking primitive, for example, one that allowed
> > > > > blocking, at least in cases were the context allowed this.
> > > > >
> > > > > git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
> > > > > branch srcu-lr.2019.01.16a.
> > > > >
> > > > > One advantage of a more generic locking primitive would be keeping BPF
> > > > > programs independent of internal changes to spinlock primitives.
> > > >
> > > > Let's keep "srcu in bpf" discussion separate from bpf_spin_lock discussion.
> > > > bpf is not switching to srcu any time soon.
> > > > If/when it happens it will be only for certain prog+map types
> > > > like bpf syscall probes that need to be able to do copy_from_user
> > > > from bpf prog.
> > >
> > > Hmmm...  What prevents BPF programs from looping infinitely within an
> > > RCU reader, and as you noted, preemption disabled?
> > >
> > > If BPF programs are in fact allowed to loop infinitely, it would be
> > > very good for the health of the kernel to have preemption enabled.
> > > And to be within an SRCU read-side critical section instead of an RCU
> > > read-side critical section.
> >
> > The BPF verifier prevents loops; this is in push_insn() in
> > kernel/bpf/verifier.c, which errors out with -EINVAL when a back edge
> > is encountered. For non-root programs, that limits the maximum number
> > of instructions per eBPF engine execution to
> > BPF_MAXINSNS*MAX_TAIL_CALL_CNT==4096*32==131072 (but that includes
> > call instructions, which can cause relatively expensive operations
> > like hash table lookups). For programs created with CAP_SYS_ADMIN,
> > things get more tricky because you can create your own functions and
> > call them repeatedly; I'm not sure whether the pessimal runtime there
> > becomes exponential, or whether there is some check that catches this.
>
> Whew!!!  ;-)
>
> So no more than (say) 100 milliseconds?

Depends on RLIMIT_MEMLOCK and on how hard userspace is trying to make
things slow, I guess - if userspace manages to create a hashtable,
a few dozen megabytes in size, with worst-case assignment of
elements to buckets (everything in a single bucket), every lookup call
on that bucket becomes a linked-list traversal through a list that
must be stored in main memory because it's too big for the CPU caches.
I don't know how much time that translates into.
Paul E. McKenney Jan. 25, 2019, 10:51 p.m. UTC | #26
On Fri, Jan 25, 2019 at 05:18:12PM +0100, Jann Horn wrote:
> On Fri, Jan 25, 2019 at 5:12 AM Paul E. McKenney <paulmck@linux.ibm.com> wrote:
> > On Fri, Jan 25, 2019 at 02:46:55AM +0100, Jann Horn wrote:
> > > On Fri, Jan 25, 2019 at 2:22 AM Paul E. McKenney <paulmck@linux.ibm.com> wrote:
> > > > On Thu, Jan 24, 2019 at 04:05:16PM -0800, Alexei Starovoitov wrote:
> > > > > On Thu, Jan 24, 2019 at 03:42:32PM -0800, Paul E. McKenney wrote:
> > > > > > On Thu, Jan 24, 2019 at 07:56:52PM +0100, Peter Zijlstra wrote:
> > > > > > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > > > > > >
> > > > > > > > Thanks for having kernel/locking people on Cc...
> > > > > > > >
> > > > > > > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > > > > > >
> > > > > > > > > Implementation details:
> > > > > > > > > - on !SMP bpf_spin_lock() becomes nop
> > > > > > > >
> > > > > > > > Because no BPF program is preemptible? I don't see any assertions or
> > > > > > > > even a comment that says this code is non-preemptible.
> > > > > > > >
> > > > > > > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > > > > > > which is not sufficient.
> > > > > > > >
> > > > > > > > > - on architectures that don't support queued_spin_lock trivial lock is used.
> > > > > > > > >   Note that arch_spin_lock cannot be used, since not all archs agree that
> > > > > > > > >   zero == unlocked and sizeof(arch_spinlock_t) != sizeof(__u32).
> > > > > > > >
> > > > > > > > I really don't much like direct usage of qspinlock; esp. not as a
> > > > > > > > surprise.
> > > > > >
> > > > > > Substituting the lightweight-reader SRCU as discussed earlier would allow
> > > > > > use of a more generic locking primitive, for example, one that allowed
> > > > > > blocking, at least in cases were the context allowed this.
> > > > > >
> > > > > > git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
> > > > > > branch srcu-lr.2019.01.16a.
> > > > > >
> > > > > > One advantage of a more generic locking primitive would be keeping BPF
> > > > > > programs independent of internal changes to spinlock primitives.
> > > > >
> > > > > Let's keep "srcu in bpf" discussion separate from bpf_spin_lock discussion.
> > > > > bpf is not switching to srcu any time soon.
> > > > > If/when it happens it will be only for certain prog+map types
> > > > > like bpf syscall probes that need to be able to do copy_from_user
> > > > > from bpf prog.
> > > >
> > > > Hmmm...  What prevents BPF programs from looping infinitely within an
> > > > RCU reader, and as you noted, preemption disabled?
> > > >
> > > > If BPF programs are in fact allowed to loop infinitely, it would be
> > > > very good for the health of the kernel to have preemption enabled.
> > > > And to be within an SRCU read-side critical section instead of an RCU
> > > > read-side critical section.
> > >
> > > The BPF verifier prevents loops; this is in push_insn() in
> > > kernel/bpf/verifier.c, which errors out with -EINVAL when a back edge
> > > is encountered. For non-root programs, that limits the maximum number
> > > of instructions per eBPF engine execution to
> > > BPF_MAXINSNS*MAX_TAIL_CALL_CNT==4096*32==131072 (but that includes
> > > call instructions, which can cause relatively expensive operations
> > > like hash table lookups). For programs created with CAP_SYS_ADMIN,
> > > things get more tricky because you can create your own functions and
> > > call them repeatedly; I'm not sure whether the pessimal runtime there
> > > becomes exponential, or whether there is some check that catches this.
> >
> > Whew!!!  ;-)
> >
> > So no more than (say) 100 milliseconds?
> 
> Depends on RLIMIT_MEMLOCK and on how hard userspace is trying to make
> things slow, I guess - if userspace manages to create a hashtable,
> with a few dozen megabytes in size, with worst-case assignment of
> elements to buckets (everything in a single bucket), every lookup call
> on that bucket becomes a linked list traversal through a list that
> must be stored in main memory because it's too big for the CPU caches.
> I don't know into how much time that translates.

So perhaps you have a candidate BPF program for the RCU CPU stall warning
challenge, then.  ;-)

							Thanx, Paul
Alexei Starovoitov Jan. 25, 2019, 11:42 p.m. UTC | #27
On Fri, Jan 25, 2019 at 10:10:57AM +0100, Peter Zijlstra wrote:
> On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > 
> > > Thanks for having kernel/locking people on Cc...
> > > 
> > > On Wed, Jan 23, 2019 at 08:13:55PM -0800, Alexei Starovoitov wrote:
> > > 
> > > > Implementation details:
> > > > - on !SMP bpf_spin_lock() becomes nop
> > > 
> > > Because no BPF program is preemptible? I don't see any assertions or
> > > even a comment that says this code is non-preemptible.
> > > 
> > > AFAICT some of the BPF_RUN_PROG things are under rcu_read_lock() only,
> > > which is not sufficient.
> > 
> > nope. all bpf prog types disable preemption. That is must have for all
> > sorts of things to work properly.
> > If there is a prog type that doing rcu_read_lock only it's a serious bug.
> > About a year or so ago we audited everything specifically to make
> > sure everything disables preemption before calling bpf progs.
> > I'm pretty sure nothing crept in in the mean time.
> 
> Do we want something like (the completely untested) below to avoid
> having to manually audit this over and over?
> 
> ---
>  include/linux/filter.h |  2 +-
>  include/linux/kernel.h |  9 +++++++--
>  kernel/sched/core.c    | 28 ++++++++++++++++++++++++++++
>  3 files changed, 36 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d531d4250bff..4ab51e78da36 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -513,7 +513,7 @@ struct sk_filter {
>  	struct bpf_prog	*prog;
>  };
>  
> -#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
> +#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })

That looks reasonable and I intent to apply this patch to bpf-next after testing.
Can you pls reply with a sob ?

With Daniel we did a bit of git archaeology on how we ended up with this
missing preempt_disable on the send side of socket filters...
The main reason is that classic BPF was preemptable from the beginning
and has had the SKF_AD_CPU feature for 10+ years.
Meaning that classic BPF programs can read the current cpu as a hint even
though they are preemptable.
raw_smp_processor_id was used to avoid triggering a false warning.
When eBPF came along we kept it as-is during the classic->extended conversion.
Back then there were no per-cpu maps.
When per-cpu maps were introduced we missed the preemptable sendmsg case.
This cant_sleep() check should help us in the future.

The easiest fix is to add preempt_disable/enable for socket filters.
There is a concern that such a fix will make classic bpf non-preemptable,
and classic bpf can be quite cpu expensive.
For example, it's allowed to have 4k classic instructions that do
'load SKF_AD_PAY_OFFSET', which will call the heavy flow_dissector 4k times.

We can do preempt_disable for extended only. Then the BPF_PROG_RUN macro and
cant_sleep would need to be adjusted accordingly.
My preference would be to do preempt_disable for both classic and extended,
since classic is only a legacy uapi at this point. Nothing on the kernel side
makes it special and I'd like to keep it that way.
Also, on the receive side classic runs in bh, so 4k flow_dissector calls
in classic have to be dealt with anyway.
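
A minimal sketch of that easiest fix on the send side (the wrapper name is
made up; the point is only where preempt_disable/enable sit relative to
BPF_PROG_RUN):

	static inline u32 sk_filter_run_nopreempt(const struct bpf_prog *prog,
						  struct sk_buff *skb)
	{
		u32 res;

		preempt_disable();	/* match the other BPF_PROG_RUN call sites */
		res = BPF_PROG_RUN(prog, skb);
		preempt_enable();
		return res;
	}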

>  > nmi checks for bpf_prog_active==0. See bpf_overflow_handler.
> yuck yuck yuck.. That's horrific :-( That means the whole BPF crud is
> unreliable and events can go randomly missing.

bpf_prog_active is the mechanism to work around non-reentrant pieces of the kernel.
Before that we couldn't do this:
sudo funccount.py *spin_lock*
Tracing 5 functions for "*spin_lock*"... Hit Ctrl-C to end.
^C
FUNC                                    COUNT
queued_spin_lock_slowpath                 682
_raw_spin_lock_bh                         997
_raw_spin_lock_irq                      10586
_raw_spin_lock_irqsave                  90666
_raw_spin_lock                         318300
Detaching...

Now we can.
This power comes with the 'horrific' caveat that two
tracing progs cannot execute on the same cpu at the same time.
It's not a pretty fix for the reentrancy issue.
If anyone has better ideas, I'm all ears.

> What about the progs that run from SoftIRQ ? Since that bpf_prog_active
> thing isn't inside BPF_PROG_RUN() what is to stop say:
> 
>    reuseport_select_sock()
>      ...
>        BPF_PROG_RUN()
>          bpf_spin_lock()
>         <IRQ>
> 	  ...
> 	  BPF_PROG_RUN()
> 	    bpf_spin_lock() // forever more
> 
> 	</IRQ>
> 
> Unless you stick that bpf_prog_active stuff inside BPF_PROG_RUN itself,
> I don't see how you can fundamentally avoid this happening (now or in
> the future).

The BPF_PROG_RUN macro is used as a template for another macro:
ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
see kernel/trace/bpf_trace.c.
Doing bpf_prog_active for every prog in the array is too expensive.

But your issue above is valid.
We don't use bpf_prog_active for networking progs, since we allow
for one level of nesting due to the classic SKF_AD_PAY_OFFSET legacy.
Also we allow tracing progs to nest with networking progs.
People are using this actively.
Typically it's not an issue, since in networking there is no
arbitrary nesting (unlike kprobe/nmi in tracing),
but for bpf_spin_lock it can be, since the same map can be shared
by networking and tracing progs and the above deadlock would be possible
(first BPF_PROG_RUN will be from the networking prog, then kprobe+bpf's
BPF_PROG_RUN accesses the same map with bpf_spin_lock).

So for now I'm going to allow bpf_spin_lock in networking progs only,
since there is no arbitrary nesting there.
And once we figure out the safety concerns for kprobe/tracepoint progs
we can enable bpf_spin_lock there too.
NMI bpf progs will never have bpf_spin_lock.

re: memory model.. will reply in the other thread.
Alexei Starovoitov Jan. 25, 2019, 11:44 p.m. UTC | #28
On Fri, Jan 25, 2019 at 02:51:12PM -0800, Paul E. McKenney wrote:
> > >
> > > So no more than (say) 100 milliseconds?
> > 
> > Depends on RLIMIT_MEMLOCK and on how hard userspace is trying to make
> > things slow, I guess - if userspace manages to create a hashtable,
> > with a few dozen megabytes in size, with worst-case assignment of
> > elements to buckets (everything in a single bucket), every lookup call
> > on that bucket becomes a linked list traversal through a list that
> > must be stored in main memory because it's too big for the CPU caches.
> > I don't know into how much time that translates.
> 
> So perhaps you have a candidate BPF program for the RCU CPU stall warning
> challenge, then.  ;-)

I'd like to see one that can defeat jhash + random seed.
Alexei Starovoitov Jan. 26, 2019, 12:17 a.m. UTC | #29
On Fri, Jan 25, 2019 at 11:23:12AM +0100, Peter Zijlstra wrote:
> On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> 
> > > And this would again be the moment where I go pester you about the BPF
> > > memory model :-)
> > 
> > hehe :)
> > How do you propose to define it in a way that it applies to all archs
> > and yet doesn't penalize x86 ?
> > "Assume minimum execution ordering model" the way kernel does
> > unfortunately is not usable, since bpf doesn't have a luxury
> > of using nice #defines that convert into nops on x86.
> 
> Why not? Surely the JIT can fix it up? That is, suppose you were to have
> smp_rmb() as a eBPF instruction then the JIT (which knows what
> architecture it is targeting) can simply avoid emitting instructions for
> it.

I'm all for adding new instructions that solve real use cases.
imo bpf_spin_lock() is the first step in helping with concurrency.
At plumbers conference we agreed to add new sync_fetch_and_add()
and xchg() instructions. That's a second step.
smp_rmb/wmb/mb should be added as well.
JITs will patch them depending on architecture.
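
As a sketch of what "JITs will patch them" could look like, assuming a
hypothetical BPF_SMP_RMB opcode (not an existing instruction): the
per-instruction switch of the x86-64 JIT could emit nothing, while an arm64
JIT would emit a real barrier:

	/* x86-64 JIT: loads are not reordered with other loads, emit nothing */
	case BPF_SMP_RMB:
		break;

	/* arm64 JIT: the same case would emit a load-load barrier */
	case BPF_SMP_RMB:
		emit_dmb_ishld(ctx);	/* hypothetical helper emitting "dmb ishld" */
		break;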

What I want to avoid is defining the whole execution ordering model upfront.
We cannot say that the BPF ISA is weakly ordered like alpha.
Most bpf progs are written and run on x86. We shouldn't
twist bpf developers' arms by artificially relaxing the memory model.
The BPF memory model is equal to the memory model of the underlying architecture.
What we can do is make bpf progs a bit more portable with
smp_rmb instructions, but we must not force weak execution ordering on the developer.

> Similarly; could something like this not also help with the spinlock
> thing? Use that generic test-and-set thing for the interpreter, but
> provide a better implementation in the JIT?

Maybe eventually. We can add a cmpxchg insn, but the verifier still
doesn't support loops. We made a lot of progress in bounded-loop research
over the last 2 years, but loops in bpf are still a year away.
We considered adding 'bpf_spin_lock' as a new instruction instead of a helper call,
but that approach has a lot of negatives, so we went with the helper.
Jann Horn Jan. 26, 2019, 12:43 a.m. UTC | #30
On Sat, Jan 26, 2019 at 12:44 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Jan 25, 2019 at 02:51:12PM -0800, Paul E. McKenney wrote:
> > > >
> > > > So no more than (say) 100 milliseconds?
> > >
> > > Depends on RLIMIT_MEMLOCK and on how hard userspace is trying to make
> > > things slow, I guess - if userspace manages to create a hashtable,
> > > with a few dozen megabytes in size, with worst-case assignment of
> > > elements to buckets (everything in a single bucket), every lookup call
> > > on that bucket becomes a linked list traversal through a list that
> > > must be stored in main memory because it's too big for the CPU caches.
> > > I don't know into how much time that translates.
> >
> > So perhaps you have a candidate BPF program for the RCU CPU stall warning
> > challenge, then.  ;-)
>
> I'd like to see one that can defeat jhash + random seed.

Assuming that the map isn't created by root with BPF_F_ZERO_SEED:

The dumb approach would be to put things into the map, try to measure
via timing/sidechannel whether you got collisions, and then keep
trying different keys, and keep them if the timing indicates a
collision. That'd probably be pretty slow and annoying though. Two
years ago, I implemented something similar to leak information about
virtual addresses from Firefox by measuring hash bucket collisions
from JavaScript (but to be fair, it was easier there because you can
resize the hash table):
https://thejh.net/misc/firefox-cve-2016-9904-and-cve-2017-5378-bugreport

But I think there's an easier way, too: The jhash seed is just 32
bits, and AFAICS the BPF API leaks information about that seed through
BPF_MAP_GET_NEXT_KEY: Stuff two random keys into the hash table, run
BPF_MAP_GET_NEXT_KEY with attr->key==NULL, and see which key is
returned. Do that around 32 times, and you should have roughly enough
information to bruteforce the jhash seed? Recovering the seed should
then be relatively quick, 2^32 iterations of a fast hash don't take
terribly long.

That said, I don't think this is interesting enough to spend the time
necessary to implement it. :P
Jann Horn Jan. 26, 2019, 12:59 a.m. UTC | #31
On Sat, Jan 26, 2019 at 1:43 AM Jann Horn <jannh@google.com> wrote:
> On Sat, Jan 26, 2019 at 12:44 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Jan 25, 2019 at 02:51:12PM -0800, Paul E. McKenney wrote:
> > > > >
> > > > > So no more than (say) 100 milliseconds?
> > > >
> > > > Depends on RLIMIT_MEMLOCK and on how hard userspace is trying to make
> > > > things slow, I guess - if userspace manages to create a hashtable,
> > > > with a few dozen megabytes in size, with worst-case assignment of
> > > > elements to buckets (everything in a single bucket), every lookup call
> > > > on that bucket becomes a linked list traversal through a list that
> > > > must be stored in main memory because it's too big for the CPU caches.
> > > > I don't know into how much time that translates.
> > >
> > > So perhaps you have a candidate BPF program for the RCU CPU stall warning
> > > challenge, then.  ;-)
> >
> > I'd like to see one that can defeat jhash + random seed.
>
> Assuming that the map isn't created by root with BPF_F_ZERO_SEED:
>
> The dumb approach would be to put things into the map, try to measure
> via timing/sidechannel whether you got collisions, and then keep
> trying different keys, and keep them if the timing indicates a
> collision. That'd probably be pretty slow and annoying though. Two
> years ago, I implemented something similar to leak information about
> virtual addresses from Firefox by measuring hash bucket collisions
> from JavaScript (but to be fair, it was easier there because you can
> resize the hash table):
> https://thejh.net/misc/firefox-cve-2016-9904-and-cve-2017-5378-bugreport
>
> But I think there's an easier way, too: The jhash seed is just 32
> bits, and AFAICS the BPF API leaks information about that seed through
> BPF_MAP_GET_NEXT_KEY: Stuff two random keys into the hash table, run
> BPF_MAP_GET_NEXT_KEY with attr->key==NULL, and see which key is
> returned. Do that around 32 times, and you should have roughly enough
> information to bruteforce the jhash seed? Recovering the seed should
> then be relatively quick, 2^32 iterations of a fast hash don't take
> terribly long.
>
> That said, I don't think this is interesting enough to spend the time
> necessary to implement it. :P

Oh, and actually, you can probably also detect a collision in a simpler way:

 - insert A
 - insert B
 - query BPF_MAP_GET_NEXT_KEY
 - delete A
 - delete B
 - insert B
 - insert A
 - query BPF_MAP_GET_NEXT_KEY
 - delete A
 - delete B

If the two BPF_MAP_GET_NEXT_KEY queries return the same result, A and
B are in different buckets; if they return different results, A and B
are in the same bucket, I think?
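
A userspace sketch of that recipe, using the low-level libbpf syscall
wrappers and assuming an otherwise empty hash map with 8-byte keys and
values (error handling omitted):

	#include <stdbool.h>
	#include <linux/types.h>
	#include <bpf/bpf.h>

	/* true if keys a and b appear to hash into the same bucket */
	static bool same_bucket(int map_fd, __u64 a, __u64 b)
	{
		__u64 first1, first2, val = 1;

		bpf_map_update_elem(map_fd, &a, &val, BPF_ANY);
		bpf_map_update_elem(map_fd, &b, &val, BPF_ANY);
		bpf_map_get_next_key(map_fd, NULL, &first1);	/* first key of iteration */
		bpf_map_delete_elem(map_fd, &a);
		bpf_map_delete_elem(map_fd, &b);

		bpf_map_update_elem(map_fd, &b, &val, BPF_ANY);
		bpf_map_update_elem(map_fd, &a, &val, BPF_ANY);
		bpf_map_get_next_key(map_fd, NULL, &first2);
		bpf_map_delete_elem(map_fd, &a);
		bpf_map_delete_elem(map_fd, &b);

		/* same first key for both insertion orders -> different buckets;
		 * different first key -> same bucket (head insertion shows the order)
		 */
		return first1 != first2;
	}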
Peter Zijlstra Jan. 28, 2019, 8:24 a.m. UTC | #32
On Fri, Jan 25, 2019 at 03:42:43PM -0800, Alexei Starovoitov wrote:
> On Fri, Jan 25, 2019 at 10:10:57AM +0100, Peter Zijlstra wrote:

> > Do we want something like (the completely untested) below to avoid
> > having to manually audit this over and over?
> > 
> > ---
> >  include/linux/filter.h |  2 +-
> >  include/linux/kernel.h |  9 +++++++--
> >  kernel/sched/core.c    | 28 ++++++++++++++++++++++++++++
> >  3 files changed, 36 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index d531d4250bff..4ab51e78da36 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -513,7 +513,7 @@ struct sk_filter {
> >  	struct bpf_prog	*prog;
> >  };
> >  
> > -#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
> > +#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
> 
> That looks reasonable and I intent to apply this patch to bpf-next after testing.
> Can you pls reply with a sob ?

Sure; with the caveat that I didn't even hold it near a compiler, and it
probably should grow a comment to explain the interface (similar to
might_sleep):

Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

> The easiest fix is to add preempt_disable/enable for socket filters.
> There is a concern that such fix will make classic bpf non-preemptable
> and classic bpf can be quite cpu expensive.

> Also on the receive side classic runs in bh, so 4k flow_dissector calls
> in classic has to be dealt with anyway.

Right and agreed; per that argument the worst case (legacy) BPF was
already present under non-preempt and thus making it consistently so
should not affect the worst case.
Peter Zijlstra Jan. 28, 2019, 8:31 a.m. UTC | #33
On Fri, Jan 25, 2019 at 03:42:43PM -0800, Alexei Starovoitov wrote:
> On Fri, Jan 25, 2019 at 10:10:57AM +0100, Peter Zijlstra wrote:
> > On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:

> >  > nmi checks for bpf_prog_active==0. See bpf_overflow_handler.

> > yuck yuck yuck.. That's horrific :-( That means the whole BPF crud is
> > unreliable and events can go randomly missing.
> 
> bpf_prog_active is the mechanism to workaround non-reentrant pieces of the kernel.

'the kernel' or 'bpf' ?

perf has a recursion counter per context (task,softirq,hardirq,nmi) and
that ensures that perf doesn't recurse in on itself while allowing the
nesting of these contexts.

But if BPF itself is not able to deal with such nesting that won't work
of course.
Peter Zijlstra Jan. 28, 2019, 8:35 a.m. UTC | #34
On Mon, Jan 28, 2019 at 09:31:23AM +0100, Peter Zijlstra wrote:
> On Fri, Jan 25, 2019 at 03:42:43PM -0800, Alexei Starovoitov wrote:
> > On Fri, Jan 25, 2019 at 10:10:57AM +0100, Peter Zijlstra wrote:
> > > On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> 
> > >  > nmi checks for bpf_prog_active==0. See bpf_overflow_handler.
> 
> > > yuck yuck yuck.. That's horrific :-( That means the whole BPF crud is
> > > unreliable and events can go randomly missing.
> > 
> > bpf_prog_active is the mechanism to workaround non-reentrant pieces of the kernel.
> 
> 'the kernel' or 'bpf' ?
> 
> perf has a recursion counter per context (task,softirq,hardirq,nmi) and
> that ensures that perf doesn't recurse in on itself while allowing the
> nesting of these contexts.
> 
> But if BPF itself is not able to deal with such nesting that won't work
> of course.

Ooh, later you say:

> Also we allow tracing progs to nest with networking progs.

Which seems to suggest BPF itself can support (limited) nesting.

See: kernel/events/internal.h:get_recursion_context()
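
That perf pattern looks roughly like this (paraphrased from
kernel/events/internal.h; recursion points at a per-cpu int[4], one slot
per context):

	static inline int get_recursion_context(int *recursion)
	{
		int rctx;

		if (unlikely(in_nmi()))
			rctx = 3;
		else if (in_irq())
			rctx = 2;
		else if (in_softirq())
			rctx = 1;
		else
			rctx = 0;

		if (recursion[rctx])
			return -1;	/* already recursing in this context: drop */

		recursion[rctx]++;
		barrier();
		return rctx;
	}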
Peter Zijlstra Jan. 28, 2019, 8:43 a.m. UTC | #35
On Fri, Jan 25, 2019 at 03:42:43PM -0800, Alexei Starovoitov wrote:
> On Fri, Jan 25, 2019 at 10:10:57AM +0100, Peter Zijlstra wrote:

> > What about the progs that run from SoftIRQ ? Since that bpf_prog_active
> > thing isn't inside BPF_PROG_RUN() what is to stop say:
> > 
> >    reuseport_select_sock()
> >      ...
> >        BPF_PROG_RUN()
> >          bpf_spin_lock()
> >         <IRQ>
> > 	  ...
> > 	  BPF_PROG_RUN()
> > 	    bpf_spin_lock() // forever more
> > 
> > 	</IRQ>
> > 
> > Unless you stick that bpf_prog_active stuff inside BPF_PROG_RUN itself,
> > I don't see how you can fundamentally avoid this happening (now or in
> > the future).

> But your issue above is valid.

> We don't use bpf_prog_active for networking progs, since we allow
> for one level of nesting due to the classic SKF_AD_PAY_OFFSET legacy.
> Also we allow tracing progs to nest with networking progs.
> People using this actively.
> Typically it's not an issue, since in networking there is no
> arbitrary nesting (unlike kprobe/nmi in tracing),
> but for bpf_spin_lock it can be, since the same map can be shared
> by networking and tracing progs and above deadlock would be possible:
> (first BPF_PROG_RUN will be from networking prog, then kprobe+bpf's
> BPF_PROG_RUN accessing the same map with bpf_spin_lock)
> 
> So for now I'm going to allow bpf_spin_lock in networking progs only,
> since there is no arbitrary nesting there.

Isn't that still broken? AFAIU networking progs can happen in task
context (TX) and SoftIRQ context (RX), which can nest.

> And once we figure out the safety concerns for kprobe/tracepoint progs
> we can enable bpf_spin_lock there too.
> NMI bpf progs will never have bpf_spin_lock.

kprobe is like NMI, since it pokes an INT3 instruction which can trigger
in the middle of IRQ-disabled or even in NMIs. Similar arguments can be
made for tracepoints, they can happen 'anywhere'.
Peter Zijlstra Jan. 28, 2019, 9:24 a.m. UTC | #36
On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> On Fri, Jan 25, 2019 at 11:23:12AM +0100, Peter Zijlstra wrote:
> > On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > 
> > > > And this would again be the moment where I go pester you about the BPF
> > > > memory model :-)
> > > 
> > > hehe :)
> > > How do you propose to define it in a way that it applies to all archs
> > > and yet doesn't penalize x86 ?
> > > "Assume minimum execution ordering model" the way kernel does
> > > unfortunately is not usable, since bpf doesn't have a luxury
> > > of using nice #defines that convert into nops on x86.
> > 
> > Why not? Surely the JIT can fix it up? That is, suppose you were to have
> > smp_rmb() as a eBPF instruction then the JIT (which knows what
> > architecture it is targeting) can simply avoid emitting instructions for
> > it.
> 
> I'm all for adding new instructions that solve real use cases.
> imo bpf_spin_lock() is the first step in helping with concurrency.
> At plumbers conference we agreed to add new sync_fetch_and_add()
> and xchg() instructions. That's a second step.
> smp_rmb/wmb/mb should be added as well.
> JITs will patch them depending on architecture.
> 
> What I want to avoid is to define the whole execution ordering model upfront.
> We cannot say that BPF ISA is weakly ordered like alpha.
> Most of the bpf progs are written and running on x86. We shouldn't
> twist bpf developer's arm by artificially relaxing memory model.
> BPF memory model is equal to memory model of underlying architecture.
> What we can do is to make it bpf progs a bit more portable with
> smp_rmb instructions, but we must not force weak execution on the developer.

Well, I agree with only introducing bits you actually need, and my
smp_rmb() example might have been poorly chosen, smp_load_acquire() /
smp_store_release() might have been a far more useful example.

But I disagree with the last part; we have to pick a model now;
otherwise you'll paint yourself into a corner.

Also; Alpha isn't very relevant these days; however ARM64 does seem to
be gaining a lot of attention and that is very much a weak architecture.
Adding strongly ordered assumptions to BPF now, will penalize them in
the long run.

> > Similarly; could something like this not also help with the spinlock
> > thing? Use that generic test-and-set thing for the interpreter, but
> > provide a better implementation in the JIT?
> 
> May be eventually. We can add cmpxchg insn, but the verifier still
> doesn't support loops. We made a lot of progress in bounded loop research
> over the last 2 years, but loops in bpf are still a year away.
> We considered adding 'bpf_spin_lock' as a new instruction instead of helper call,
> but that approach has a lot of negatives, so we went with the helper.

Ah, but the loop won't be in the BPF program itself. The BPF program
would only have had the BPF_SPIN_LOCK instruction; the JIT then emits
code similar to queued_spin_lock()/queued_spin_unlock() (or calls to
out-of-line versions of them).

There isn't anything that mandates the JIT uses the exact same locking
routines the interpreter does, is there?
Alexei Starovoitov Jan. 28, 2019, 8:49 p.m. UTC | #37
On Mon, Jan 28, 2019 at 09:35:08AM +0100, Peter Zijlstra wrote:
> On Mon, Jan 28, 2019 at 09:31:23AM +0100, Peter Zijlstra wrote:
> > On Fri, Jan 25, 2019 at 03:42:43PM -0800, Alexei Starovoitov wrote:
> > > On Fri, Jan 25, 2019 at 10:10:57AM +0100, Peter Zijlstra wrote:
> > > > On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> > 
> > > >  > nmi checks for bpf_prog_active==0. See bpf_overflow_handler.
> > 
> > > > yuck yuck yuck.. That's horrific :-( That means the whole BPF crud is
> > > > unreliable and events can go randomly missing.
> > > 
> > > bpf_prog_active is the mechanism to workaround non-reentrant pieces of the kernel.
> > 
> > 'the kernel' or 'bpf' ?
> > 
> > perf has a recursion counter per context (task,softirq,hardirq,nmi) and
> > that ensures that perf doesn't recurse in on itself while allowing the
> > nesting of these contexts.
> > 
> > But if BPF itself is not able to deal with such nesting that won't work
> > of course.
> 
> Ooh, later you say:
> 
> > Also we allow tracing progs to nest with networking progs.
> 
> Which seems to suggest BPF itself can suppord (limited) nesting.

well I'm not sure where the boundary between bpf and the kernel is :)
By non-reentrant I meant pieces of the kernel that bpf maps
or bpf helpers may use.
For example, a kmalloc-style bpf map uses rcu, and map update/delete
do call_rcu() from inside the bpf program.
If a kprobe or tracepoint bpf prog is called from the inner bits of
__call_rcu_core and the prog is doing a map update, we get into
a recursive __call_rcu_core and things go bad.
Hence we disallow such situations when possible.
If bpf had its own infrastructure for everything then it would be
easy enough to allow proper recursion using a mechanism similar
to what perf does with get_recursion_context.
Alexei Starovoitov Jan. 28, 2019, 9:37 p.m. UTC | #38
On Mon, Jan 28, 2019 at 09:43:10AM +0100, Peter Zijlstra wrote:
> On Fri, Jan 25, 2019 at 03:42:43PM -0800, Alexei Starovoitov wrote:
> > On Fri, Jan 25, 2019 at 10:10:57AM +0100, Peter Zijlstra wrote:
> 
> > > What about the progs that run from SoftIRQ ? Since that bpf_prog_active
> > > thing isn't inside BPF_PROG_RUN() what is to stop say:
> > > 
> > >    reuseport_select_sock()
> > >      ...
> > >        BPF_PROG_RUN()
> > >          bpf_spin_lock()
> > >         <IRQ>
> > > 	  ...
> > > 	  BPF_PROG_RUN()
> > > 	    bpf_spin_lock() // forever more
> > > 
> > > 	</IRQ>
> > > 
> > > Unless you stick that bpf_prog_active stuff inside BPF_PROG_RUN itself,
> > > I don't see how you can fundamentally avoid this happening (now or in
> > > the future).
> 
> > But your issue above is valid.
> 
> > We don't use bpf_prog_active for networking progs, since we allow
> > for one level of nesting due to the classic SKF_AD_PAY_OFFSET legacy.
> > Also we allow tracing progs to nest with networking progs.
> > People using this actively.
> > Typically it's not an issue, since in networking there is no
> > arbitrary nesting (unlike kprobe/nmi in tracing),
> > but for bpf_spin_lock it can be, since the same map can be shared
> > by networking and tracing progs and above deadlock would be possible:
> > (first BPF_PROG_RUN will be from networking prog, then kprobe+bpf's
> > BPF_PROG_RUN accessing the same map with bpf_spin_lock)
> > 
> > So for now I'm going to allow bpf_spin_lock in networking progs only,
> > since there is no arbitrary nesting there.
> 
> Isn't that still broken? AFAIU networking progs can happen in task
> context (TX) and SoftIRQ context (RX), which can nest.

Sure. The sendmsg side of networking can be interrupted by napi receive.
Both can have bpf progs attached at different points, but napi won't run
while a bpf prog is running, because the bpf prog disables preemption.
Moreover, the whole networking stack can be recursive and there is an
xmit_recursion counter to check for bad cases.
When bpf progs interact with networking they don't add to that recursion.
All of the *redirect*() helpers do so outside of the bpf preempt-disabled context.
Also there is no nesting of the same networking prog type:
xdp/tc/lwt/cgroup bpf progs cannot be called recursively by design.
There are no arbitrary entry points, unlike kprobe/tracepoint.
The only nesting is when a _classic_ socket filter bpf prog uses the legacy
SKF_AD_PAY_OFFSET. That calls the flow dissector, which may call the flow
dissector bpf prog. Classic bpf doesn't use bpf maps, so there are no deadlock issues.

> > And once we figure out the safety concerns for kprobe/tracepoint progs
> > we can enable bpf_spin_lock there too.
> > NMI bpf progs will never have bpf_spin_lock.
> 
> kprobe is like NMI, since it pokes an INT3 instruction which can trigger
> in the middle of IRQ-disabled or even in NMIs. Similar arguments can be
> made for tracepoints, they can happen 'anywhere'.

exactly. that's why there is bpf_prog_active to protect the kernel in general
for tracing bpf progs.
Alexei Starovoitov Jan. 28, 2019, 9:56 p.m. UTC | #39
On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > On Fri, Jan 25, 2019 at 11:23:12AM +0100, Peter Zijlstra wrote:
> > > On Thu, Jan 24, 2019 at 03:58:59PM -0800, Alexei Starovoitov wrote:
> > > > On Thu, Jan 24, 2019 at 07:01:09PM +0100, Peter Zijlstra wrote:
> > > 
> > > > > And this would again be the moment where I go pester you about the BPF
> > > > > memory model :-)
> > > > 
> > > > hehe :)
> > > > How do you propose to define it in a way that it applies to all archs
> > > > and yet doesn't penalize x86 ?
> > > > "Assume minimum execution ordering model" the way kernel does
> > > > unfortunately is not usable, since bpf doesn't have a luxury
> > > > of using nice #defines that convert into nops on x86.
> > > 
> > > Why not? Surely the JIT can fix it up? That is, suppose you were to have
> > > smp_rmb() as a eBPF instruction then the JIT (which knows what
> > > architecture it is targeting) can simply avoid emitting instructions for
> > > it.
> > 
> > I'm all for adding new instructions that solve real use cases.
> > imo bpf_spin_lock() is the first step in helping with concurrency.
> > At plumbers conference we agreed to add new sync_fetch_and_add()
> > and xchg() instructions. That's a second step.
> > smp_rmb/wmb/mb should be added as well.
> > JITs will patch them depending on architecture.
> > 
> > What I want to avoid is to define the whole execution ordering model upfront.
> > We cannot say that BPF ISA is weakly ordered like alpha.
> > Most of the bpf progs are written and running on x86. We shouldn't
> > twist bpf developer's arm by artificially relaxing memory model.
> > BPF memory model is equal to memory model of underlying architecture.
> > What we can do is to make it bpf progs a bit more portable with
> > smp_rmb instructions, but we must not force weak execution on the developer.
> 
> Well, I agree with only introducing bits you actually need, and my
> smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> smp_store_release() might have been a far more useful example.
> 
> But I disagree with the last part; we have to pick a model now;
> otherwise you'll pain yourself into a corner.
> 
> Also; Alpha isn't very relevant these days; however ARM64 does seem to
> be gaining a lot of attention and that is very much a weak architecture.
> Adding strongly ordered assumptions to BPF now, will penalize them in
> the long run.

arm64 is gaining attention just like riscV is gaining it too.
BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
BPF is not picking sides in CPU HW and ISA battles.
Memory model is CPU HW design decision. BPF ISA cannot dictate HW design.
We're not saying today that BPF is strongly ordered.
BPF load/stores are behaving differently on x86 vs arm64.
We can add new instructions, but we cannot 'define' how load/stores behave
from memory model perspective.
For example, take atomicity of single byte load/store.
Not all archs have them atomic, but we cannot say to bpf programmers
to always assume non-atomic byte loads.
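
To illustrate the byte-atomicity point: on an ISA without byte-sized stores, a
one-byte store has to be compiled as a read-modify-write of the containing word,
so it is not atomic with respect to the neighbouring bytes. A rough C equivalent
of such codegen (illustrative only; little-endian byte numbering assumed):

/* Illustration only: what "store one byte" can turn into when the hardware
 * has no byte stores. The RMW below is not atomic w.r.t. writers of the
 * other three bytes in the same word.
 */
void store_byte(unsigned char *p, unsigned char x)
{
	unsigned int *word = (unsigned int *)((unsigned long)p & ~3UL);
	unsigned int shift = ((unsigned long)p & 3UL) * 8;
	unsigned int w;

	w = *word;				/* load the containing word */
	w &= ~(0xffU << shift);			/* clear the target byte    */
	w |= (unsigned int)x << shift;		/* splice in the new byte   */
	*word = w;				/* store the whole word     */
}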

> > > Similarly; could something like this not also help with the spinlock
> > > thing? Use that generic test-and-set thing for the interpreter, but
> > > provide a better implementation in the JIT?
> > 
> > May be eventually. We can add cmpxchg insn, but the verifier still
> > doesn't support loops. We made a lot of progress in bounded loop research
> > over the last 2 years, but loops in bpf are still a year away.
> > We considered adding 'bpf_spin_lock' as a new instruction instead of helper call,
> > but that approach has a lot of negatives, so we went with the helper.
> 
> Ah, but the loop won't be in the BPF program itself. The BPF program
> would only have had the BPF_SPIN_LOCK instruction, the JIT them emits
> code similar to queued_spin_lock()/queued_spin_unlock() (or calls to
> out-of-line versions of them).

As I said we considered exactly that and such approach has a lot of downsides
comparing with the helper approach.
Pretty much every time new feature is added we're evaluating whether it
should be new instruction or new helper. 99% of the time we go with new helper.

> There isn't anything that mandates the JIT uses the exact same locking
> routines the interpreter does, is there?

sure. This bpf_spin_lock() helper can be optimized whichever way the kernel wants.
Like bpf_map_lookup_elem() call is _inlined_ by the verifier for certain map types.
JITs don't even need to do anything. It looks like function call from bpf prog
point of view, but in JITed code it is a sequence of native instructions.

Say tomorrow we find out that bpf_prog->bpf_spin_lock()->queued_spin_lock()
takes too much time then we can inline fast path of queued_spin_lock
directly into bpf prog and save function call cost.
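
For concreteness, here is roughly what the generic slow path behind such a
helper can look like: a trivial test-and-set lock (a sketch, not the exact patch
code; as discussed further down the thread, archs with queued spinlocks would
use those instead):

/* Sketch of a trivial test-and-set fallback behind the helper call. */
static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
{
	atomic_t *l = (void *)lock;

	do {
		atomic_cond_read_relaxed(l, !VAL);	/* wait until it reads 0 */
	} while (atomic_xchg(l, 1));			/* then try to grab it   */
}

static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
{
	atomic_t *l = (void *)lock;

	atomic_set_release(l, 0);	/* release pairs with the next acquirer's xchg */
}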
Peter Zijlstra Jan. 29, 2019, 8:59 a.m. UTC | #40
On Mon, Jan 28, 2019 at 01:37:12PM -0800, Alexei Starovoitov wrote:
> On Mon, Jan 28, 2019 at 09:43:10AM +0100, Peter Zijlstra wrote:

> > Isn't that still broken? AFAIU networking progs can happen in task
> > context (TX) and SoftIRQ context (RX), which can nest.
> 
> Sure. sendmsg side of networking can be interrupted by napi receive.
> Both can have bpf progs attached at different points, but napi won't run
> when bpf prog is running, because bpf prog disables preemption.

Disabling preemption is not sufficient, it needs to have done
local_bh_disable(), which isn't unlikely given this is all networking
code.

Anyway, if you're sure that's all good, I'll trust you on that. It's
been a very _very_ long time since I looked at the networking code.
Peter Zijlstra Jan. 29, 2019, 9:16 a.m. UTC | #41
On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:

> > Ah, but the loop won't be in the BPF program itself. The BPF program
> > would only have had the BPF_SPIN_LOCK instruction, the JIT them emits
> > code similar to queued_spin_lock()/queued_spin_unlock() (or calls to
> > out-of-line versions of them).
> 
> As I said we considered exactly that and such approach has a lot of downsides
> comparing with the helper approach.
> Pretty much every time new feature is added we're evaluating whether it
> should be new instruction or new helper. 99% of the time we go with new helper.

Ah; it seems I'm confused on helper vs instruction. As in, I've no idea
what a helper is.

> > There isn't anything that mandates the JIT uses the exact same locking
> > routines the interpreter does, is there?
> 
> sure. This bpf_spin_lock() helper can be optimized whichever way the kernel wants.
> Like bpf_map_lookup_elem() call is _inlined_ by the verifier for certain map types.
> JITs don't even need to do anything. It looks like function call from bpf prog
> point of view, but in JITed code it is a sequence of native instructions.
> 
> Say tomorrow we find out that bpf_prog->bpf_spin_lock()->queued_spin_lock()
> takes too much time then we can inline fast path of queued_spin_lock
> directly into bpf prog and save function call cost.

OK, so then the JIT can optimize helpers. Would it not make sense to
have the simple test-and-set spinlock in the generic code and have the
JITs use arch_spinlock_t where appropriate?
Alexei Starovoitov Jan. 30, 2019, 2:20 a.m. UTC | #42
On Tue, Jan 29, 2019 at 09:59:03AM +0100, Peter Zijlstra wrote:
> On Mon, Jan 28, 2019 at 01:37:12PM -0800, Alexei Starovoitov wrote:
> > On Mon, Jan 28, 2019 at 09:43:10AM +0100, Peter Zijlstra wrote:
> 
> > > Isn't that still broken? AFAIU networking progs can happen in task
> > > context (TX) and SoftIRQ context (RX), which can nest.
> > 
> > Sure. sendmsg side of networking can be interrupted by napi receive.
> > Both can have bpf progs attached at different points, but napi won't run
> > when bpf prog is running, because bpf prog disables preemption.
> 
> Disabling preemption is not sufficient, it needs to have done
> local_bh_disable(), which isn't unlikely given this is all networking
> code.

Right, excellent point. Since bh is not disabled, nesting of the sender-side
socket filter with the receive-side socket filter is possible.
Theoretically the same socket filter prog can be nested.
I believe that's fine with the current state of things, including
parallel access to bpf maps
(my patch to add preempt_disable around socket filters is still needed).
But this point regarding local_bh_disable changes the plans for bpf_spin_lock.
We'd need local_bh_disable around socket filters and around syscall access
to maps with 'struct bpf_spin_lock', or two new helpers, bpf_spin_lock_irqsave
and its unlock counterpart, plus corresponding verifier support.
I guess we'll argue about the best option when that time comes.
For now there will be no bpf_spin_lock in socket filters, and I need to
add local_bh_disable for syscall access to maps with bpf_spin_lock
to avoid deadlock.
I'll send a set of fixes for the lockdep false positives and
respin the bpf_spin_lock set.
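
An illustrative sketch of that syscall-side plan (hypothetical wrapper name,
reusing the __bpf_spin_lock() sketch earlier in the thread; copy_map_value()
and spin_lock_off come from the patch below):

/* Hypothetical sketch: syscall-side update of a map value that contains a
 * bpf_spin_lock. local_bh_disable() keeps a SoftIRQ bpf prog on this CPU
 * from spinning forever on a lock we hold.
 */
static void syscall_update_locked(struct bpf_map *map, void *val, void *new_val)
{
	struct bpf_spin_lock *lock = val + map->spin_lock_off;

	local_bh_disable();
	__bpf_spin_lock(lock);
	copy_map_value(map, val, new_val);	/* copies everything but the lock */
	__bpf_spin_unlock(lock);
	local_bh_enable();
}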
Alexei Starovoitov Jan. 30, 2019, 2:32 a.m. UTC | #43
On Tue, Jan 29, 2019 at 10:16:54AM +0100, Peter Zijlstra wrote:
> On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> 
> > > Ah, but the loop won't be in the BPF program itself. The BPF program
> > > would only have had the BPF_SPIN_LOCK instruction, the JIT them emits
> > > code similar to queued_spin_lock()/queued_spin_unlock() (or calls to
> > > out-of-line versions of them).
> > 
> > As I said we considered exactly that and such approach has a lot of downsides
> > comparing with the helper approach.
> > Pretty much every time new feature is added we're evaluating whether it
> > should be new instruction or new helper. 99% of the time we go with new helper.
> 
> Ah; it seems I'm confused on helper vs instruction. As in, I've no idea
> what a helper is.

bpf helper is a normal kernel function that can be called from bpf program.
In assembler it's a direct function call.

> > > There isn't anything that mandates the JIT uses the exact same locking
> > > routines the interpreter does, is there?
> > 
> > sure. This bpf_spin_lock() helper can be optimized whichever way the kernel wants.
> > Like bpf_map_lookup_elem() call is _inlined_ by the verifier for certain map types.
> > JITs don't even need to do anything. It looks like function call from bpf prog
> > point of view, but in JITed code it is a sequence of native instructions.
> > 
> > Say tomorrow we find out that bpf_prog->bpf_spin_lock()->queued_spin_lock()
> > takes too much time then we can inline fast path of queued_spin_lock
> > directly into bpf prog and save function call cost.
> 
> OK, so then the JIT can optimize helpers. Would it not make sense to
> have the simple test-and-set spinlock in the generic code and have the
> JITs use arch_spinlock_t where appropriate?

I think that pretty much the same as what I have with qspinlock.
Instead of taking a risk how JIT writers implement bpf_spin_lock optimization
I'm using qspinlock on architectures that are known to support it.
So instead of starting with dumb test-and-set there will be faster
qspinlock from the start on x86, arm64 and few others archs.
Those are the archs we care about the most anyway. Other archs can take
time to optimize it (if optimizations are necessary at all).
In general hacking JITs is much harder and more error prone than
changing core and adding helpers. Hence we avoid touching JITs
as much as possible.
Like map_lookup inlining optimization we do only when JIT is on.
And we do it purely in the generic core. See array_map_gen_lookup().
We generate bpf instructions only to feed them into JITs so they
can replace them with native asm. That is much easier to implement
correctly than if we were doing inlining in every JIT independently.
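
For readers unfamiliar with that mechanism, a trimmed-down sketch of a
map_gen_lookup callback in the spirit of array_map_gen_lookup() (the real one
also handles unprivileged index masking and prefers shifts over multiplies);
the verifier splices these instructions in place of the helper call and the JIT
then translates them like any other BPF code:

/* On entry R1 is the map pointer and R2 the key pointer; result in R0. */
static u32 example_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);

	/* r1 = &array->value[0] */
	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, value));
	/* r0 = *(u32 *)key */
	*insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
	/* if (index >= max_entries) goto return_null */
	*insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 3);
	/* r0 = &array->value[index * elem_size] */
	*insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	/* return_null: r0 = NULL */
	*insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);

	return insn - insn_buf;
}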
Peter Zijlstra Jan. 30, 2019, 8:58 a.m. UTC | #44
On Tue, Jan 29, 2019 at 06:32:13PM -0800, Alexei Starovoitov wrote:
> On Tue, Jan 29, 2019 at 10:16:54AM +0100, Peter Zijlstra wrote:
> > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > 
> > > > Ah, but the loop won't be in the BPF program itself. The BPF program
> > > > would only have had the BPF_SPIN_LOCK instruction, the JIT them emits
> > > > code similar to queued_spin_lock()/queued_spin_unlock() (or calls to
> > > > out-of-line versions of them).
> > > 
> > > As I said we considered exactly that and such approach has a lot of downsides
> > > comparing with the helper approach.
> > > Pretty much every time new feature is added we're evaluating whether it
> > > should be new instruction or new helper. 99% of the time we go with new helper.
> > 
> > Ah; it seems I'm confused on helper vs instruction. As in, I've no idea
> > what a helper is.
> 
> bpf helper is a normal kernel function that can be called from bpf program.
> In assembler it's a direct function call.

Ah, it is what is otherwise known as a standard library,

> > > > There isn't anything that mandates the JIT uses the exact same locking
> > > > routines the interpreter does, is there?
> > > 
> > > sure. This bpf_spin_lock() helper can be optimized whichever way the kernel wants.
> > > Like bpf_map_lookup_elem() call is _inlined_ by the verifier for certain map types.
> > > JITs don't even need to do anything. It looks like function call from bpf prog
> > > point of view, but in JITed code it is a sequence of native instructions.
> > > 
> > > Say tomorrow we find out that bpf_prog->bpf_spin_lock()->queued_spin_lock()
> > > takes too much time then we can inline fast path of queued_spin_lock
> > > directly into bpf prog and save function call cost.
> > 
> > OK, so then the JIT can optimize helpers. Would it not make sense to
> > have the simple test-and-set spinlock in the generic code and have the
> > JITs use arch_spinlock_t where appropriate?
> 
> I think that pretty much the same as what I have with qspinlock.
> Instead of taking a risk how JIT writers implement bpf_spin_lock optimization
> I'm using qspinlock on architectures that are known to support it.

I see the argument for it...

> So instead of starting with dumb test-and-set there will be faster
> qspinlock from the start on x86, arm64 and few others archs.
> Those are the archs we care about the most anyway. Other archs can take
> time to optimize it (if optimizations are necessary at all).
> In general hacking JITs is much harder and more error prone than
> changing core and adding helpers. Hence we avoid touching JITs
> as much as possible.

So archs/JITs are not trivially able to override those helper functions?
Because for example ARM (32bit) doesn't do qspinlock but it's
arch_spinlock_t is _much_ better than a TAS lock.

> Like map_lookup inlining optimization we do only when JIT is on.
> And we do it purely in the generic core. See array_map_gen_lookup().
> We generate bpf instructions only to feed them into JITs so they
> can replace them with native asm. That is much easier to implement
> correctly than if we were doing inlining in every JIT independently.

That seems prudent and fairly painful at the same time. I see why you
would want to share as much of it as possible, but being restricted to
BPF instructions seems very limiting.

Anyway, I'll not press the issue; but I do think that arch specific
helpers would make sense here.
Will Deacon Jan. 30, 2019, 6:11 p.m. UTC | #45
Hi Alexei,

On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > What I want to avoid is to define the whole execution ordering model upfront.
> > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > Most of the bpf progs are written and running on x86. We shouldn't
> > > twist bpf developer's arm by artificially relaxing memory model.
> > > BPF memory model is equal to memory model of underlying architecture.
> > > What we can do is to make it bpf progs a bit more portable with
> > > smp_rmb instructions, but we must not force weak execution on the developer.
> > 
> > Well, I agree with only introducing bits you actually need, and my
> > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > smp_store_release() might have been a far more useful example.
> > 
> > But I disagree with the last part; we have to pick a model now;
> > otherwise you'll pain yourself into a corner.
> > 
> > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > be gaining a lot of attention and that is very much a weak architecture.
> > Adding strongly ordered assumptions to BPF now, will penalize them in
> > the long run.
> 
> arm64 is gaining attention just like riscV is gaining it too.
> BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> BPF is not picking sides in CPU HW and ISA battles.

It's not about picking a side, it's about providing an abstraction of the
various CPU architectures out there so that the programmer doesn't need to
worry about where their program may run. Hell, even if you just said "eBPF
follows x86 semantics" that would be better than saying nothing (and then we
could have a discussion about whether x86 semantics are really what you
want).

> Memory model is CPU HW design decision. BPF ISA cannot dictate HW design.
> We're not saying today that BPF is strongly ordered.
> BPF load/stores are behaving differently on x86 vs arm64.
> We can add new instructions, but we cannot 'define' how load/stores behave
> from memory model perspective.
> For example, take atomicity of single byte load/store.
> Not all archs have them atomic, but we cannot say to bpf programmers
> to always assume non-atomic byte loads.

Hmm, I don't think this is a good approach to take for the future of eBPF.
Assuming that a desirable property of an eBPF program is portability between
CPU architectures, then you're effectively forcing the programmer to "assume
the worst", where the worst is almost certainly unusable for practical
purposes.

One easy thing you could consider would be to allow tagging of an eBPF
program with its supported target architectures (the JIT will refuse to
accept it for other architectures). This would at least prevent remove the
illusion of portability and force the programmer to be explicit.

However, I think we'd much better off if we defined some basic ordering
primitives such as relaxed and RCpc-style acquire/release operations
(including atomic RmW), single-copy atomic memory accesses up to the native
machine size and a full-fence instruction. If your program uses something
that the underlying arch doesn't support, then it is rejected (e.g. 64-bit
READ_ONCE on a 32-bit arch)

That should map straightforwardly to all modern architectures and allow for
efficient codegen on x86 and arm64. It would probably require a bunch of new
BPF instructions that would be defined to be atomic (you already have XADD
as a relaxed atomic add).

Apologies if this sounds patronising, but I'm keen to help figure out the
semantics *now* so that we don't end up having to infer them later on, which
is the usual painful case for memory models. I suspect Peter and Paul would
also prefer to attack it that way around. I appreciate that the temptation
is to avoid the problem by deferring to the underlying hardware memory
model, but I think that will create more problems than it solves and we're
here to help you get this right.

Will
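
As a concrete point of reference for the acquire/release primitives proposed
above, the classic message-passing pattern in kernel C (nothing BPF-specific;
struct and field names are made up):

/* Message passing with RCpc acquire/release semantics. */
struct shared {
	int msg;
	int ready;
};

static void producer(struct shared *s)
{
	s->msg = 42;				/* plain store                        */
	smp_store_release(&s->ready, 1);	/* orders the msg store before ready  */
}

static int consumer(struct shared *s)
{
	if (smp_load_acquire(&s->ready))	/* orders the ready load before msg   */
		return s->msg;			/* guaranteed to observe 42           */
	return -1;				/* not published yet                  */
}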
Paul E. McKenney Jan. 30, 2019, 6:36 p.m. UTC | #46
On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> Hi Alexei,
> 
> On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > BPF memory model is equal to memory model of underlying architecture.
> > > > What we can do is to make it bpf progs a bit more portable with
> > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > 
> > > Well, I agree with only introducing bits you actually need, and my
> > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > smp_store_release() might have been a far more useful example.
> > > 
> > > But I disagree with the last part; we have to pick a model now;
> > > otherwise you'll pain yourself into a corner.
> > > 
> > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > be gaining a lot of attention and that is very much a weak architecture.
> > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > the long run.
> > 
> > arm64 is gaining attention just like riscV is gaining it too.
> > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > BPF is not picking sides in CPU HW and ISA battles.
> 
> It's not about picking a side, it's about providing an abstraction of the
> various CPU architectures out there so that the programmer doesn't need to
> worry about where their program may run. Hell, even if you just said "eBPF
> follows x86 semantics" that would be better than saying nothing (and then we
> could have a discussion about whether x86 semantics are really what you
> want).

To reinforce this point, the Linux-kernel memory model (tools/memory-model)
is that abstraction for the Linux kernel.  Why not just use that for BPF?

							Thanx, Paul

> > Memory model is CPU HW design decision. BPF ISA cannot dictate HW design.
> > We're not saying today that BPF is strongly ordered.
> > BPF load/stores are behaving differently on x86 vs arm64.
> > We can add new instructions, but we cannot 'define' how load/stores behave
> > from memory model perspective.
> > For example, take atomicity of single byte load/store.
> > Not all archs have them atomic, but we cannot say to bpf programmers
> > to always assume non-atomic byte loads.
> 
> Hmm, I don't think this is a good approach to take for the future of eBPF.
> Assuming that a desirable property of an eBPF program is portability between
> CPU architectures, then you're effectively forcing the programmer to "assume
> the worst", where the worst is almost certainly unusable for practical
> purposes.
> 
> One easy thing you could consider would be to allow tagging of an eBPF
> program with its supported target architectures (the JIT will refuse to
> accept it for other architectures). This would at least prevent remove the
> illusion of portability and force the programmer to be explicit.
> 
> However, I think we'd much better off if we defined some basic ordering
> primitives such as relaxed and RCpc-style acquire/release operations
> (including atomic RmW), single-copy atomic memory accesses up to the native
> machine size and a full-fence instruction. If your program uses something
> that the underlying arch doesn't support, then it is rejected (e.g. 64-bit
> READ_ONCE on a 32-bit arch)
> 
> That should map straightforwardly to all modern architectures and allow for
> efficient codegen on x86 and arm64. It would probably require a bunch of new
> BPF instructions that would be defined to be atomic (you already have XADD
> as a relaxed atomic add).
> 
> Apologies if this sounds patronising, but I'm keen to help figure out the
> semantics *now* so that we don't end up having to infer them later on, which
> is the usual painful case for memory models. I suspect Peter and Paul would
> also prefer to attack it that way around. I appreciate that the temptation
> is to avoid the problem by deferring to the underlying hardware memory
> model, but I think that will create more problems than it solves and we're
> here to help you get this right.
> 
> Will
>
Alexei Starovoitov Jan. 30, 2019, 7:36 p.m. UTC | #47
On Wed, Jan 30, 2019 at 09:58:50AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 29, 2019 at 06:32:13PM -0800, Alexei Starovoitov wrote:
> > On Tue, Jan 29, 2019 at 10:16:54AM +0100, Peter Zijlstra wrote:
> > > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > 
> > > > > Ah, but the loop won't be in the BPF program itself. The BPF program
> > > > > would only have had the BPF_SPIN_LOCK instruction, the JIT them emits
> > > > > code similar to queued_spin_lock()/queued_spin_unlock() (or calls to
> > > > > out-of-line versions of them).
> > > > 
> > > > As I said we considered exactly that and such approach has a lot of downsides
> > > > comparing with the helper approach.
> > > > Pretty much every time new feature is added we're evaluating whether it
> > > > should be new instruction or new helper. 99% of the time we go with new helper.
> > > 
> > > Ah; it seems I'm confused on helper vs instruction. As in, I've no idea
> > > what a helper is.
> > 
> > bpf helper is a normal kernel function that can be called from bpf program.
> > In assembler it's a direct function call.
> 
> Ah, it is what is otherwise known as a standard library,
> 
> > > > > There isn't anything that mandates the JIT uses the exact same locking
> > > > > routines the interpreter does, is there?
> > > > 
> > > > sure. This bpf_spin_lock() helper can be optimized whichever way the kernel wants.
> > > > Like bpf_map_lookup_elem() call is _inlined_ by the verifier for certain map types.
> > > > JITs don't even need to do anything. It looks like function call from bpf prog
> > > > point of view, but in JITed code it is a sequence of native instructions.
> > > > 
> > > > Say tomorrow we find out that bpf_prog->bpf_spin_lock()->queued_spin_lock()
> > > > takes too much time then we can inline fast path of queued_spin_lock
> > > > directly into bpf prog and save function call cost.
> > > 
> > > OK, so then the JIT can optimize helpers. Would it not make sense to
> > > have the simple test-and-set spinlock in the generic code and have the
> > > JITs use arch_spinlock_t where appropriate?
> > 
> > I think that pretty much the same as what I have with qspinlock.
> > Instead of taking a risk how JIT writers implement bpf_spin_lock optimization
> > I'm using qspinlock on architectures that are known to support it.
> 
> I see the argument for it...
> 
> > So instead of starting with dumb test-and-set there will be faster
> > qspinlock from the start on x86, arm64 and few others archs.
> > Those are the archs we care about the most anyway. Other archs can take
> > time to optimize it (if optimizations are necessary at all).
> > In general hacking JITs is much harder and more error prone than
> > changing core and adding helpers. Hence we avoid touching JITs
> > as much as possible.
> 
> So archs/JITs are not trivially able to override those helper functions?
> Because for example ARM (32bit) doesn't do qspinlock but it's
> arch_spinlock_t is _much_ better than a TAS lock.

JITs can override. There is no 'ready to use' facility for all types
of helpers to do that, but it's easy enough to add.
Having said that, I'm going to reject arm32 JIT patches that
try to use arch_spinlock instead of the generic bpf_spin_lock.
The last thing the arm32 jit needs is this type of optimization.
Other JITs are a different story.
Alexei Starovoitov Jan. 30, 2019, 7:50 p.m. UTC | #48
On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> Assuming that a desirable property of an eBPF program is portability between
> CPU architectures, then you're effectively forcing the programmer to "assume

That is a fundamental misunderstanding being thrown around in this thread.
bpf is not fixated on portability.
All projects that tried to come up with a universal byte code failed miserably.
A bpf program compiled for big endian won't load on little endian.
A bpf program designed to be used on x86 will run horribly slowly on nfp.
It will work, but it will be inefficient. Hence we have alu32 mode in llvm.
More so, maps don't map one-to-one to all archs either.
The per-cpu map doesn't exist on nfp; we're still figuring out
an equivalent for it there.
So, no, programs are not portable across architectures.
The programmer cannot assume that.
They can be portable in some cases, and we're trying to preserve portability
as much as possible, but it's not a "desirable property" that we're going
to sacrifice performance and usability for.
If it helps, look at bpf as a safe kernel module.
Does a given kernel module work on all archs? No. Sometimes users only need
to recompile it and sometimes they make heavy changes. smp_mb and load-acquire
are the last things to worry about when folks are trying to make
such a 'safe kernel module' work on different archs.
Alexei Starovoitov Jan. 30, 2019, 7:51 p.m. UTC | #49
On Wed, Jan 30, 2019 at 10:36:18AM -0800, Paul E. McKenney wrote:
> On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> > Hi Alexei,
> > 
> > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > > BPF memory model is equal to memory model of underlying architecture.
> > > > > What we can do is to make it bpf progs a bit more portable with
> > > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > > 
> > > > Well, I agree with only introducing bits you actually need, and my
> > > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > > smp_store_release() might have been a far more useful example.
> > > > 
> > > > But I disagree with the last part; we have to pick a model now;
> > > > otherwise you'll pain yourself into a corner.
> > > > 
> > > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > > be gaining a lot of attention and that is very much a weak architecture.
> > > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > > the long run.
> > > 
> > > arm64 is gaining attention just like riscV is gaining it too.
> > > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > > BPF is not picking sides in CPU HW and ISA battles.
> > 
> > It's not about picking a side, it's about providing an abstraction of the
> > various CPU architectures out there so that the programmer doesn't need to
> > worry about where their program may run. Hell, even if you just said "eBPF
> > follows x86 semantics" that would be better than saying nothing (and then we
> > could have a discussion about whether x86 semantics are really what you
> > want).
> 
> To reinforce this point, the Linux-kernel memory model (tools/memory-model)
> is that abstraction for the Linux kernel.  Why not just use that for BPF?

I already answered this earlier in the thread.
tldr: not going to sacrifice performance.
Paul E. McKenney Jan. 30, 2019, 9:05 p.m. UTC | #50
On Wed, Jan 30, 2019 at 11:51:14AM -0800, Alexei Starovoitov wrote:
> On Wed, Jan 30, 2019 at 10:36:18AM -0800, Paul E. McKenney wrote:
> > On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> > > Hi Alexei,
> > > 
> > > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > > > BPF memory model is equal to memory model of underlying architecture.
> > > > > > What we can do is to make it bpf progs a bit more portable with
> > > > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > > > 
> > > > > Well, I agree with only introducing bits you actually need, and my
> > > > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > > > smp_store_release() might have been a far more useful example.
> > > > > 
> > > > > But I disagree with the last part; we have to pick a model now;
> > > > > otherwise you'll pain yourself into a corner.
> > > > > 
> > > > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > > > be gaining a lot of attention and that is very much a weak architecture.
> > > > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > > > the long run.
> > > > 
> > > > arm64 is gaining attention just like riscV is gaining it too.
> > > > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > > > BPF is not picking sides in CPU HW and ISA battles.
> > > 
> > > It's not about picking a side, it's about providing an abstraction of the
> > > various CPU architectures out there so that the programmer doesn't need to
> > > worry about where their program may run. Hell, even if you just said "eBPF
> > > follows x86 semantics" that would be better than saying nothing (and then we
> > > could have a discussion about whether x86 semantics are really what you
> > > want).
> > 
> > To reinforce this point, the Linux-kernel memory model (tools/memory-model)
> > is that abstraction for the Linux kernel.  Why not just use that for BPF?
> 
> I already answered this earlier in the thread.
> tldr: not going to sacrifice performance.

Understood.

But can we at least say that where there are no performance consequences,
BPF should follow LKMM?  You already mentioned smp_load_acquire()
and smp_store_release(), but the void atomics (e.g., atomic_inc())
should also work because they don't provide any ordering guarantees.
The _relaxed(), _release(), and _acquire() variants of the value-returning
atomics should be just fine as well.

The other value-returning atomics have strong ordering, which is fine
on many systems, but potentially suboptimal for the weakly ordered ones.
Though you have to have pretty good locality of reference to be able to
see the difference, because otherwise cache-miss overhead dominates.

Things like cmpxchg() don't seem to fit BPF because they are normally
used in spin loops, though there are some non-spinning use cases.

You correctly pointed out that READ_ONCE() and WRITE_ONCE() are suboptimal
on systems that don't support all sizes of loads, but I bet that there
are some sizes for which they are just fine across systems, for example,
pointer size and int size.

Does that help?  Or am I missing additional cases where performance
could be degraded?

							Thanx, Paul
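
For reference, kernel-side spellings of the operations listed above
(illustrative only; whether BPF grows equivalents is exactly the open question;
'v' and 'flag' are hypothetical shared variables):

/* Needs linux/atomic.h and asm/barrier.h via the usual kernel includes. */
static atomic_t v;
static int flag;

static void ordering_examples(void)
{
	int old, seen;

	atomic_inc(&v);				/* void atomic: no ordering implied   */
	old = atomic_fetch_add_relaxed(1, &v);	/* value-returning, relaxed ordering  */
	old = atomic_fetch_add(1, &v);		/* value-returning, fully ordered     */
	smp_store_release(&flag, 1);		/* RCpc release store                 */
	seen = smp_load_acquire(&flag);		/* RCpc acquire load                  */
	seen = READ_ONCE(flag);			/* single-copy atomic load (int size) */
	(void)old;
	(void)seen;
}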
Alexei Starovoitov Jan. 30, 2019, 10:57 p.m. UTC | #51
On Wed, Jan 30, 2019 at 01:05:36PM -0800, Paul E. McKenney wrote:
> On Wed, Jan 30, 2019 at 11:51:14AM -0800, Alexei Starovoitov wrote:
> > On Wed, Jan 30, 2019 at 10:36:18AM -0800, Paul E. McKenney wrote:
> > > On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> > > > Hi Alexei,
> > > > 
> > > > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > > > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > > > > BPF memory model is equal to memory model of underlying architecture.
> > > > > > > What we can do is to make it bpf progs a bit more portable with
> > > > > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > > > > 
> > > > > > Well, I agree with only introducing bits you actually need, and my
> > > > > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > > > > smp_store_release() might have been a far more useful example.
> > > > > > 
> > > > > > But I disagree with the last part; we have to pick a model now;
> > > > > > otherwise you'll pain yourself into a corner.
> > > > > > 
> > > > > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > > > > be gaining a lot of attention and that is very much a weak architecture.
> > > > > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > > > > the long run.
> > > > > 
> > > > > arm64 is gaining attention just like riscV is gaining it too.
> > > > > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > > > > BPF is not picking sides in CPU HW and ISA battles.
> > > > 
> > > > It's not about picking a side, it's about providing an abstraction of the
> > > > various CPU architectures out there so that the programmer doesn't need to
> > > > worry about where their program may run. Hell, even if you just said "eBPF
> > > > follows x86 semantics" that would be better than saying nothing (and then we
> > > > could have a discussion about whether x86 semantics are really what you
> > > > want).
> > > 
> > > To reinforce this point, the Linux-kernel memory model (tools/memory-model)
> > > is that abstraction for the Linux kernel.  Why not just use that for BPF?
> > 
> > I already answered this earlier in the thread.
> > tldr: not going to sacrifice performance.
> 
> Understood.
> 
> But can we at least say that where there are no performance consequences,
> BPF should follow LKMM?  You already mentioned smp_load_acquire()
> and smp_store_release(), but the void atomics (e.g., atomic_inc())
> should also work because they don't provide any ordering guarantees.
> The _relaxed(), _release(), and _acquire() variants of the value-returning
> atomics should be just fine as well.
> 
> The other value-returning atomics have strong ordering, which is fine
> on many systems, but potentially suboptimal for the weakly ordered ones.
> Though you have to have pretty good locality of reference to be able to
> see the difference, because otherwise cache-miss overhead dominates.
> 
> Things like cmpxchg() don't seem to fit BPF because they are normally
> used in spin loops, though there are some non-spinning use cases.
> 
> You correctly pointed out that READ_ONCE() and WRITE_ONCE() are suboptimal
> on systems that don't support all sizes of loads, but I bet that there
> are some sizes for which they are just fine across systems, for example,
> pointer size and int size.
> 
> Does that help?  Or am I missing additional cases where performance
> could be degraded?

bpf doesn't have smp_load_acquire, atomic_fetch_add, xchg, fence instructions.
They can be added step by step. That's easy.
I believe folks already started working on adding atomic_fetch_add.
What I have problem with is making a statement today that bpf's end
goal is LKMM. Even after adding all sorts of instructions it may
not be practical.
Only when real use case requires adding new instruction we do it.
Do you have a bpf program that needs smp_load_acquire ?
Paul E. McKenney Jan. 31, 2019, 2:01 p.m. UTC | #52
On Wed, Jan 30, 2019 at 02:57:43PM -0800, Alexei Starovoitov wrote:
> On Wed, Jan 30, 2019 at 01:05:36PM -0800, Paul E. McKenney wrote:
> > On Wed, Jan 30, 2019 at 11:51:14AM -0800, Alexei Starovoitov wrote:
> > > On Wed, Jan 30, 2019 at 10:36:18AM -0800, Paul E. McKenney wrote:
> > > > On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> > > > > Hi Alexei,
> > > > > 
> > > > > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > > > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > > > > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > > > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > > > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > > > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > > > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > > > > > BPF memory model is equal to memory model of underlying architecture.
> > > > > > > > What we can do is to make it bpf progs a bit more portable with
> > > > > > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > > > > > 
> > > > > > > Well, I agree with only introducing bits you actually need, and my
> > > > > > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > > > > > smp_store_release() might have been a far more useful example.
> > > > > > > 
> > > > > > > But I disagree with the last part; we have to pick a model now;
> > > > > > > otherwise you'll pain yourself into a corner.
> > > > > > > 
> > > > > > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > > > > > be gaining a lot of attention and that is very much a weak architecture.
> > > > > > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > > > > > the long run.
> > > > > > 
> > > > > > arm64 is gaining attention just like riscV is gaining it too.
> > > > > > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > > > > > BPF is not picking sides in CPU HW and ISA battles.
> > > > > 
> > > > > It's not about picking a side, it's about providing an abstraction of the
> > > > > various CPU architectures out there so that the programmer doesn't need to
> > > > > worry about where their program may run. Hell, even if you just said "eBPF
> > > > > follows x86 semantics" that would be better than saying nothing (and then we
> > > > > could have a discussion about whether x86 semantics are really what you
> > > > > want).
> > > > 
> > > > To reinforce this point, the Linux-kernel memory model (tools/memory-model)
> > > > is that abstraction for the Linux kernel.  Why not just use that for BPF?
> > > 
> > > I already answered this earlier in the thread.
> > > tldr: not going to sacrifice performance.
> > 
> > Understood.
> > 
> > But can we at least say that where there are no performance consequences,
> > BPF should follow LKMM?  You already mentioned smp_load_acquire()
> > and smp_store_release(), but the void atomics (e.g., atomic_inc())
> > should also work because they don't provide any ordering guarantees.
> > The _relaxed(), _release(), and _acquire() variants of the value-returning
> > atomics should be just fine as well.
> > 
> > The other value-returning atomics have strong ordering, which is fine
> > on many systems, but potentially suboptimal for the weakly ordered ones.
> > Though you have to have pretty good locality of reference to be able to
> > see the difference, because otherwise cache-miss overhead dominates.
> > 
> > Things like cmpxchg() don't seem to fit BPF because they are normally
> > used in spin loops, though there are some non-spinning use cases.
> > 
> > You correctly pointed out that READ_ONCE() and WRITE_ONCE() are suboptimal
> > on systems that don't support all sizes of loads, but I bet that there
> > are some sizes for which they are just fine across systems, for example,
> > pointer size and int size.
> > 
> > Does that help?  Or am I missing additional cases where performance
> > could be degraded?
> 
> bpf doesn't have smp_load_acquire, atomic_fetch_add, xchg, fence instructions.
> They can be added step by step. That's easy.
> I believe folks already started working on adding atomic_fetch_add.
> What I have problem with is making a statement today that bpf's end
> goal is LKMM. Even after adding all sorts of instructions it may
> not be practical.
> Only when real use case requires adding new instruction we do it.
> Do you have a bpf program that needs smp_load_acquire ?

We seem to be talking past each other.  Let me try again...

I believe that if BPF adds a given concurrency feature, it should follow
LKMM unless there is some specific problem with its doing so.

My paragraphs in my previous email list the concurrency features BPF
could follow LKMM without penalty, should BPF choose to add them.

Does that help?

							Thanx, Paul
Alexei Starovoitov Jan. 31, 2019, 6:47 p.m. UTC | #53
On Thu, Jan 31, 2019 at 06:01:56AM -0800, Paul E. McKenney wrote:
> On Wed, Jan 30, 2019 at 02:57:43PM -0800, Alexei Starovoitov wrote:
> > On Wed, Jan 30, 2019 at 01:05:36PM -0800, Paul E. McKenney wrote:
> > > On Wed, Jan 30, 2019 at 11:51:14AM -0800, Alexei Starovoitov wrote:
> > > > On Wed, Jan 30, 2019 at 10:36:18AM -0800, Paul E. McKenney wrote:
> > > > > On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> > > > > > Hi Alexei,
> > > > > > 
> > > > > > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > > > > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > > > > > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > > > > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > > > > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > > > > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > > > > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > > > > > > BPF memory model is equal to memory model of underlying architecture.
> > > > > > > > > What we can do is to make it bpf progs a bit more portable with
> > > > > > > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > > > > > > 
> > > > > > > > Well, I agree with only introducing bits you actually need, and my
> > > > > > > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > > > > > > smp_store_release() might have been a far more useful example.
> > > > > > > > 
> > > > > > > > But I disagree with the last part; we have to pick a model now;
> > > > > > > > otherwise you'll pain yourself into a corner.
> > > > > > > > 
> > > > > > > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > > > > > > be gaining a lot of attention and that is very much a weak architecture.
> > > > > > > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > > > > > > the long run.
> > > > > > > 
> > > > > > > arm64 is gaining attention just like riscV is gaining it too.
> > > > > > > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > > > > > > BPF is not picking sides in CPU HW and ISA battles.
> > > > > > 
> > > > > > It's not about picking a side, it's about providing an abstraction of the
> > > > > > various CPU architectures out there so that the programmer doesn't need to
> > > > > > worry about where their program may run. Hell, even if you just said "eBPF
> > > > > > follows x86 semantics" that would be better than saying nothing (and then we
> > > > > > could have a discussion about whether x86 semantics are really what you
> > > > > > want).
> > > > > 
> > > > > To reinforce this point, the Linux-kernel memory model (tools/memory-model)
> > > > > is that abstraction for the Linux kernel.  Why not just use that for BPF?
> > > > 
> > > > I already answered this earlier in the thread.
> > > > tldr: not going to sacrifice performance.
> > > 
> > > Understood.
> > > 
> > > But can we at least say that where there are no performance consequences,
> > > BPF should follow LKMM?  You already mentioned smp_load_acquire()
> > > and smp_store_release(), but the void atomics (e.g., atomic_inc())
> > > should also work because they don't provide any ordering guarantees.
> > > The _relaxed(), _release(), and _acquire() variants of the value-returning
> > > atomics should be just fine as well.
> > > 
> > > The other value-returning atomics have strong ordering, which is fine
> > > on many systems, but potentially suboptimal for the weakly ordered ones.
> > > Though you have to have pretty good locality of reference to be able to
> > > see the difference, because otherwise cache-miss overhead dominates.
> > > 
> > > Things like cmpxchg() don't seem to fit BPF because they are normally
> > > used in spin loops, though there are some non-spinning use cases.
> > > 
> > > You correctly pointed out that READ_ONCE() and WRITE_ONCE() are suboptimal
> > > on systems that don't support all sizes of loads, but I bet that there
> > > are some sizes for which they are just fine across systems, for example,
> > > pointer size and int size.
> > > 
> > > Does that help?  Or am I missing additional cases where performance
> > > could be degraded?
> > 
> > bpf doesn't have smp_load_acquire, atomic_fetch_add, xchg, fence instructions.
> > They can be added step by step. That's easy.
> > I believe folks already started working on adding atomic_fetch_add.
> > What I have problem with is making a statement today that bpf's end
> > goal is LKMM. Even after adding all sorts of instructions it may
> > not be practical.
> > Only when real use case requires adding new instruction we do it.
> > Do you have a bpf program that needs smp_load_acquire ?
> 
> We seem to be talking past each other.  Let me try again...
> 
> I believe that if BPF adds a given concurrency feature, it should follow
> LKMM unless there is some specific problem with its doing so.
> 
> My paragraphs in my previous email list the concurrency features BPF
> could follow LKMM without penalty, should BPF choose to add them.
> 
> Does that help?

yeah. we're talking past each other indeed.
Doesn't look like that more emails will help.
Let's resolve it either f2f during next conference or join our bi-weekly
bpf bluejeans call Wed 11am pacific.
Reminders and links are on this list
https://lists.iovisor.org/g/iovisor-dev/messages?p=created,0,,20,2,0,0
Paul E. McKenney Feb. 1, 2019, 2:05 p.m. UTC | #54
On Thu, Jan 31, 2019 at 10:47:50AM -0800, Alexei Starovoitov wrote:
> On Thu, Jan 31, 2019 at 06:01:56AM -0800, Paul E. McKenney wrote:
> > On Wed, Jan 30, 2019 at 02:57:43PM -0800, Alexei Starovoitov wrote:
> > > On Wed, Jan 30, 2019 at 01:05:36PM -0800, Paul E. McKenney wrote:
> > > > On Wed, Jan 30, 2019 at 11:51:14AM -0800, Alexei Starovoitov wrote:
> > > > > On Wed, Jan 30, 2019 at 10:36:18AM -0800, Paul E. McKenney wrote:
> > > > > > On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> > > > > > > Hi Alexei,
> > > > > > > 
> > > > > > > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > > > > > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > > > > > > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > > > > > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > > > > > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > > > > > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > > > > > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > > > > > > > BPF memory model is equal to memory model of underlying architecture.
> > > > > > > > > > What we can do is to make it bpf progs a bit more portable with
> > > > > > > > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > > > > > > > 
> > > > > > > > > Well, I agree with only introducing bits you actually need, and my
> > > > > > > > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > > > > > > > smp_store_release() might have been a far more useful example.
> > > > > > > > > 
> > > > > > > > > But I disagree with the last part; we have to pick a model now;
> > > > > > > > > otherwise you'll pain yourself into a corner.
> > > > > > > > > 
> > > > > > > > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > > > > > > > be gaining a lot of attention and that is very much a weak architecture.
> > > > > > > > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > > > > > > > the long run.
> > > > > > > > 
> > > > > > > > arm64 is gaining attention just like riscV is gaining it too.
> > > > > > > > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > > > > > > > BPF is not picking sides in CPU HW and ISA battles.
> > > > > > > 
> > > > > > > It's not about picking a side, it's about providing an abstraction of the
> > > > > > > various CPU architectures out there so that the programmer doesn't need to
> > > > > > > worry about where their program may run. Hell, even if you just said "eBPF
> > > > > > > follows x86 semantics" that would be better than saying nothing (and then we
> > > > > > > could have a discussion about whether x86 semantics are really what you
> > > > > > > want).
> > > > > > 
> > > > > > To reinforce this point, the Linux-kernel memory model (tools/memory-model)
> > > > > > is that abstraction for the Linux kernel.  Why not just use that for BPF?
> > > > > 
> > > > > I already answered this earlier in the thread.
> > > > > tldr: not going to sacrifice performance.
> > > > 
> > > > Understood.
> > > > 
> > > > But can we at least say that where there are no performance consequences,
> > > > BPF should follow LKMM?  You already mentioned smp_load_acquire()
> > > > and smp_store_release(), but the void atomics (e.g., atomic_inc())
> > > > should also work because they don't provide any ordering guarantees.
> > > > The _relaxed(), _release(), and _acquire() variants of the value-returning
> > > > atomics should be just fine as well.
> > > > 
> > > > The other value-returning atomics have strong ordering, which is fine
> > > > on many systems, but potentially suboptimal for the weakly ordered ones.
> > > > Though you have to have pretty good locality of reference to be able to
> > > > see the difference, because otherwise cache-miss overhead dominates.
> > > > 
> > > > Things like cmpxchg() don't seem to fit BPF because they are normally
> > > > used in spin loops, though there are some non-spinning use cases.
> > > > 
> > > > You correctly pointed out that READ_ONCE() and WRITE_ONCE() are suboptimal
> > > > on systems that don't support all sizes of loads, but I bet that there
> > > > are some sizes for which they are just fine across systems, for example,
> > > > pointer size and int size.
> > > > 
> > > > Does that help?  Or am I missing additional cases where performance
> > > > could be degraded?
> > > 
> > > bpf doesn't have smp_load_acquire, atomic_fetch_add, xchg, fence instructions.
> > > They can be added step by step. That's easy.
> > > I believe folks already started working on adding atomic_fetch_add.
> > > What I have problem with is making a statement today that bpf's end
> > > goal is LKMM. Even after adding all sorts of instructions it may
> > > not be practical.
> > > Only when real use case requires adding new instruction we do it.
> > > Do you have a bpf program that needs smp_load_acquire ?
> > 
> > We seem to be talking past each other.  Let me try again...
> > 
> > I believe that if BPF adds a given concurrency feature, it should follow
> > LKMM unless there is some specific problem with its doing so.
> > 
> > My paragraphs in my previous email list the concurrency features BPF
> > could follow LKMM without penalty, should BPF choose to add them.
> > 
> > Does that help?
> 
> yeah. we're talking past each other indeed.
> Doesn't look like that more emails will help.
> Let's resolve it either f2f during next conference or join our bi-weekly
> bpf bluejeans call Wed 11am pacific.
> Reminders and links are on this list
> https://lists.iovisor.org/g/iovisor-dev/messages?p=created,0,,20,2,0,0

There is an instance of this meeting next week, correct?

If so, I could make the instance on Feb 27th, assuming that I have
access to bluejeans.

							Thanx, Paul
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3851529062ec..ee0352bd728f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -72,14 +72,15 @@  struct bpf_map {
 	u32 value_size;
 	u32 max_entries;
 	u32 map_flags;
-	u32 pages;
+	int spin_lock_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
+	u32 pages;
 	bool unpriv_array;
-	/* 55 bytes hole */
+	/* 51 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -91,6 +92,34 @@  struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+static inline bool map_value_has_spin_lock(const struct bpf_map *map)
+{
+	return map->spin_lock_off >= 0;
+}
+
+static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+{
+	if (likely(!map_value_has_spin_lock(map)))
+		return;
+	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+		(struct bpf_spin_lock){};
+}
+
+/* copy everything but bpf_spin_lock */
+static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+{
+	if (unlikely(map_value_has_spin_lock(map))) {
+		u32 off = map->spin_lock_off;
+
+		memcpy(dst, src, off);
+		memcpy(dst + off + sizeof(struct bpf_spin_lock),
+		       src + off + sizeof(struct bpf_spin_lock),
+		       map->value_size - off - sizeof(struct bpf_spin_lock));
+	} else {
+		memcpy(dst, src, map->value_size);
+	}
+}
+
 struct bpf_offload_dev;
 struct bpf_offloaded_map;
 
@@ -162,6 +191,7 @@  enum bpf_arg_type {
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
+	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 };
 
 /* type of values returned from helper functions */
@@ -876,7 +906,8 @@  extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
-
+extern const struct bpf_func_proto bpf_spin_lock_proto;
+extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0620e418dde5..69f7a3449eda 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -148,6 +148,7 @@  struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
+	u32 active_spin_lock;
 	bool speculative;
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 12502e25e767..455d31b55828 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -50,6 +50,7 @@  u32 btf_id(const struct btf *btf);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91c43884f295..30f9dfd40f13 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2421,7 +2421,9 @@  union bpf_attr {
 	FN(map_peek_elem),		\
 	FN(msg_push_data),		\
 	FN(msg_pop_data),		\
-	FN(rc_pointer_rel),
+	FN(rc_pointer_rel),		\
+	FN(spin_lock),			\
+	FN(spin_unlock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3054,4 +3056,7 @@  struct bpf_line_info {
 	__u32	line_col;
 };
 
+struct bpf_spin_lock {
+	__u32	val;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..d6d979910a2a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -270,9 +270,10 @@  static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
 	else
-		memcpy(array->value +
-		       array->elem_size * (index & array->index_mask),
-		       value, map->value_size);
+		copy_map_value(map,
+			       array->value +
+			       array->elem_size * (index & array->index_mask),
+			       value);
 	return 0;
 }
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 022ef9ca1296..03b1a7a6195c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -355,6 +355,11 @@  static bool btf_type_is_struct(const struct btf_type *t)
 	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
 }
 
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
 static bool btf_type_is_array(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -2045,6 +2050,43 @@  static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_member *member;
+	u32 i, off = -ENOENT;
+
+	if (!__btf_type_is_struct(t))
+		return -EINVAL;
+
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								    member->type);
+		if (!__btf_type_is_struct(member_type))
+			continue;
+		if (member_type->size != sizeof(struct bpf_spin_lock))
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+			   "bpf_spin_lock"))
+			continue;
+		if (off != -ENOENT)
+			/* only one 'struct bpf_spin_lock' is allowed */
+			return -E2BIG;
+		off = btf_member_bit_offset(t, member);
+		if (off % 8)
+			/* valid C code cannot generate such BTF */
+			return -EINVAL;
+		off /= 8;
+		if (off % __alignof__(struct bpf_spin_lock))
+			/* valid struct bpf_spin_lock will be 4 byte aligned */
+			return -EINVAL;
+	}
+	return off;
+}
+
 static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
 				u32 type_id, void *data, u8 bits_offset,
 				struct seq_file *m)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2a81b8af3748..2b44eca22ef8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2050,6 +2050,8 @@  const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_map_push_elem_proto __weak;
 const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..48a41bf65e1b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -770,6 +770,8 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
+		check_and_init_map_lock(&htab->map,
+					l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -792,7 +794,9 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 		if (!prealloc)
 			htab_elem_set_ptr(l_new, key_size, pptr);
 	} else {
-		memcpy(l_new->key + round_up(key_size, 8), value, size);
+		copy_map_value(&htab->map,
+			       l_new->key + round_up(key_size, 8),
+			       value);
 	}
 
 	l_new->hash = hash;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a74972b07e74..2e98e4caf5aa 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -221,6 +221,63 @@  const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg2_type	= ARG_CONST_SIZE,
 };
 
+#ifndef CONFIG_QUEUED_SPINLOCKS
+struct dumb_spin_lock {
+	atomic_t val;
+};
+#endif
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+#if defined(CONFIG_SMP)
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	struct qspinlock *qlock = (void *)lock;
+
+	BUILD_BUG_ON(sizeof(*qlock) != sizeof(*lock));
+	queued_spin_lock(qlock);
+#else
+	struct dumb_spin_lock *qlock = (void *)lock;
+
+	BUILD_BUG_ON(sizeof(*qlock) != sizeof(*lock));
+	do {
+		while (atomic_read(&qlock->val) != 0)
+			cpu_relax();
+	} while (atomic_cmpxchg(&qlock->val, 0, 1) != 0);
+#endif
+#endif
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+	.func		= bpf_spin_lock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+#if defined(CONFIG_SMP)
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	struct qspinlock *qlock = (void *)lock;
+
+	queued_spin_unlock(qlock);
+#else
+	struct dumb_spin_lock *qlock = (void *)lock;
+
+	atomic_set(&qlock->val, 0);
+#endif
+#endif
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+	.func		= bpf_spin_unlock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 99d243e1ad6e..920eff1b677b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -36,6 +36,11 @@  struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (map_value_has_spin_lock(inner_map)) {
+		fdput(f);
+		return ERR_PTR(-ENOTSUPP);
+	}
+
 	inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
 	if (!inner_map_meta) {
 		fdput(f);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..ebf0a673cb83 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -463,7 +463,7 @@  int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 			 u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
@@ -478,6 +478,21 @@  static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
 	if (!value_type || value_size != map->value_size)
 		return -EINVAL;
 
+	map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+	if (map_value_has_spin_lock(map)) {
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -ENOTSUPP;
+		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+		    map->value_size) {
+			WARN_ONCE(1,
+				  "verifier bug spin_lock_off %d value_size %d\n",
+				  map->spin_lock_off, map->value_size);
+			return -EFAULT;
+		}
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
@@ -542,6 +557,8 @@  static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 		map->btf_key_type_id = attr->btf_key_type_id;
 		map->btf_value_type_id = attr->btf_value_type_id;
+	} else {
+		map->spin_lock_off = -EINVAL;
 	}
 
 	err = security_bpf_map_alloc(map);
@@ -740,7 +757,7 @@  static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			memcpy(value, ptr, value_size);
+			copy_map_value(map, value, ptr);
 		}
 		rcu_read_unlock();
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8cfe39ef770f..f372904fa691 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -213,6 +213,7 @@  struct bpf_call_arg_meta {
 	s64 msize_smax_value;
 	u64 msize_umax_value;
 	int ptr_id;
+	int func_id;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -351,6 +352,12 @@  static bool reg_is_refcounted(const struct bpf_reg_state *reg)
 	return type_is_refcounted(reg->type);
 }
 
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+	return reg->type == PTR_TO_MAP_VALUE &&
+		map_value_has_spin_lock(reg->map_ptr);
+}
+
 static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
 {
 	return type_is_refcounted_or_null(reg->type);
@@ -712,6 +719,7 @@  static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	}
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
+	dst_state->active_spin_lock = src->active_spin_lock;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -1483,6 +1491,21 @@  static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	if (err)
 		verbose(env, "R%d max value is outside of the array range\n",
 			regno);
+
+	if (map_value_has_spin_lock(reg->map_ptr)) {
+		u32 lock = reg->map_ptr->spin_lock_off;
+
+		/* if any part of struct bpf_spin_lock can be touched by
+		 * load/store reject this program
+		 */
+		if ((reg->smin_value + off <= lock &&
+		     lock < reg->umax_value + off + size) ||
+		    (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+		     lock + sizeof(struct bpf_spin_lock) <= reg->umax_value + off + size)) {
+			verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+			return -EACCES;
+		}
+	}
 	return err;
 }
 
@@ -2192,6 +2215,91 @@  static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+			     bool is_lock)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	struct bpf_verifier_state *cur = env->cur_state;
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (reg->type != PTR_TO_MAP_VALUE) {
+		verbose(env, "R%d is not a pointer to map_value\n", regno);
+		return -EINVAL;
+	}
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env,
+			"map '%s' has to have BTF in order to use bpf_spin_lock\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (!map_value_has_spin_lock(map)) {
+		if (map->spin_lock_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_spin_lock'\n",
+				map->name);
+		else if (map->spin_lock_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_spin_lock'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->spin_lock_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+			val + reg->off);
+		return -EINVAL;
+	}
+	if (is_lock) {
+		if (cur->active_spin_lock) {
+			verbose(env,
+				"Locking two bpf_spin_locks are not allowed\n");
+			return -EINVAL;
+		}
+		cur->active_spin_lock = reg->id;
+	} else {
+		if (!cur->active_spin_lock) {
+			verbose(env, "bpf_spin_unlock without taking a lock\n");
+			return -EINVAL;
+		}
+		if (cur->active_spin_lock != reg->id) {
+			verbose(env, "bpf_spin_unlock of different lock\n");
+			return -EINVAL;
+		}
+		cur->active_spin_lock = 0;
+	}
+	return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
 	return type == ARG_PTR_TO_MEM ||
@@ -2268,6 +2376,17 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			return -EFAULT;
 		}
 		meta->ptr_id = reg->id;
+	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+		if (meta->func_id == BPF_FUNC_spin_lock) {
+			if (process_spin_lock(env, regno, true))
+				return -EACCES;
+		} else if (meta->func_id == BPF_FUNC_spin_unlock) {
+			if (process_spin_lock(env, regno, false))
+				return -EACCES;
+		} else {
+			verbose(env, "verifier internal error\n");
+			return -EFAULT;
+		}
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2887,6 +3006,7 @@  static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return err;
 	}
 
+	meta.func_id = func_id;
 	/* check args */
 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
@@ -4344,7 +4464,8 @@  static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
 			reg->type = PTR_TO_SOCKET;
 		}
-		if (is_null || !reg_is_refcounted(reg)) {
+		if (is_null || !(reg_is_refcounted(reg) ||
+				 reg_may_point_to_spin_lock(reg))) {
 			/* We don't need id from this point onwards anymore,
 			 * thus we should better reset it, so that state
 			 * pruning has chances to take effect.
@@ -4713,6 +4834,11 @@  static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 	}
 
+	if (env->cur_state->active_spin_lock) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+		return -EINVAL;
+	}
+
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -5448,8 +5574,11 @@  static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_MAP_VALUE:
 		/* If the new min/max/var_off satisfy the old ones and
 		 * everything else matches, we are OK.
-		 * We don't care about the 'id' value, because nothing
-		 * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+		 * 'id' is not compared, since it's only used for maps with
+		 * bpf_spin_lock inside map element and in such cases if
+		 * the rest of the prog is valid for one map element then
+		 * it's valid for all map elements regardless of the key
+		 * used in bpf_map_lookup()
 		 */
 		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
 		       range_within(rold, rcur) &&
@@ -5652,6 +5781,9 @@  static bool states_equal(struct bpf_verifier_env *env,
 	if (old->speculative && !cur->speculative)
 		return false;
 
+	if (old->active_spin_lock != cur->active_spin_lock)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -6069,6 +6201,12 @@  static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_spin_lock &&
+				    (insn->src_reg == BPF_PSEUDO_CALL ||
+				     insn->imm != BPF_FUNC_spin_unlock)) {
+					verbose(env, "function calls are not allowed while holding a lock\n");
+					return -EINVAL;
+				}
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
 				else
@@ -6097,6 +6235,11 @@  static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_spin_lock) {
+					verbose(env, "bpf_spin_unlock is missing\n");
+					return -EINVAL;
+				}
+
 				if (state->curframe) {
 					/* exit from nested function */
 					env->prev_insn_idx = env->insn_idx;
diff --git a/net/core/filter.c b/net/core/filter.c
index 2b3b436ef545..24a5d874d156 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5306,10 +5306,20 @@  bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	default:
+		break;
+	}
+
+	if (!capable(CAP_SYS_ADMIN))
+		return NULL;
+
+	switch (func_id) {
+	case BPF_FUNC_spin_lock:
+		return &bpf_spin_lock_proto;
+	case BPF_FUNC_spin_unlock:
+		return &bpf_spin_unlock_proto;
 	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
-			return bpf_get_trace_printk_proto();
-		/* else: fall through */
+		return bpf_get_trace_printk_proto();
 	default:
 		return NULL;
 	}
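
For reference, below is a minimal usage sketch (not part of this patch) of how a
program could combine the two new helpers with a BTF-described ARRAY map once the
series lands. It follows the selftests conventions of the time (SEC(), struct
bpf_map_def and BPF_ANNOTATE_KV_PAIR from "bpf_helpers.h") and declares the new
helper stubs by hand, since this patch alone does not add them to that header; the
map, section and field names are made up for illustration.

/* Illustrative sketch only -- not part of this patch.  A socket filter
 * keeps two counters in one ARRAY map element consistent by updating
 * them under the element's bpf_spin_lock.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct counters {
	struct bpf_spin_lock lock;	/* exactly one lock, top level only */
	__u64 packets;
	__u64 bytes;
};

struct bpf_map_def SEC("maps") stats = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(struct counters),
	.max_entries	= 1,
};
/* attach BTF key/value ids so map_check_btf() can record spin_lock_off */
BPF_ANNOTATE_KV_PAIR(stats, __u32, struct counters);

/* stubs for the two helpers introduced by this patch */
static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) =
	(void *) BPF_FUNC_spin_lock;
static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) =
	(void *) BPF_FUNC_spin_unlock;

SEC("socket")
int account(struct __sk_buff *skb)
{
	__u32 key = 0;
	struct counters *val;

	val = bpf_map_lookup_elem(&stats, &key);
	if (!val)
		return 0;

	bpf_spin_lock(&val->lock);
	/* no helper or bpf-to-bpf calls are allowed until the unlock */
	val->packets++;
	val->bytes += skb->len;
	bpf_spin_unlock(&val->lock);
	return 0;
}

char _license[] SEC("license") = "GPL";

Per the net/core/filter.c hunk above, bpf_base_func_proto() only exposes the new
helpers under CAP_SYS_ADMIN, so a program like this sketch has to be loaded by root.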