
[bpf-next,v3,1/5] bpf: Support chain calling multiple BPF programs after each other

Message ID: 157046883614.2092443.9861796174814370924.stgit@alrua-x1
State: Changes Requested
Delegated to: BPF Maintainers
Series xdp: Support multiple programs on a single interface through chain calls

Commit Message

Toke Høiland-Jørgensen Oct. 7, 2019, 5:20 p.m. UTC
From: Toke Høiland-Jørgensen <toke@redhat.com>

This adds support for wrapping eBPF program dispatch in chain calling
logic. The code injection is controlled by a flag at program load time; if
the flag is set, the BPF program will carry a flag bit that changes the
program dispatch logic to wrap it in a chain call loop.

Ideally, it shouldn't be necessary to set the flag at program load time,
but rather to inject the calls when a chain call program is first loaded.
The allocation logic sets the whole of struct bpf_prog to be read-only
memory, so it can't immediately be modified; but conceivably we could just
unlock the first page of the struct and flip the bit when a chain call
program is first attached.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/bpf.h      |    3 +++
 include/linux/filter.h   |   34 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/bpf.h |    6 ++++++
 kernel/bpf/core.c        |    6 ++++++
 kernel/bpf/syscall.c     |    4 +++-
 5 files changed, 50 insertions(+), 3 deletions(-)
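
As a rough sketch of how a program would opt in to the new behaviour, the
following loads a trivial XDP program with the new flag via the raw bpf(2)
syscall. Only BPF_F_CHAIN_CALLS itself is added by this patch; the rest is
the existing BPF_PROG_LOAD interface (error handling omitted):

#include <linux/bpf.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef BPF_F_CHAIN_CALLS
#define BPF_F_CHAIN_CALLS (1U << 4) /* added by this patch */
#endif

static int load_chain_call_prog(void)
{
    /* r0 = XDP_PASS; exit */
    struct bpf_insn insns[] = {
        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0,
          .imm = XDP_PASS },
        { .code = BPF_JMP | BPF_EXIT },
    };
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.prog_type  = BPF_PROG_TYPE_XDP;
    attr.insns      = (__u64)(unsigned long)insns;
    attr.insn_cnt   = sizeof(insns) / sizeof(insns[0]);
    attr.license    = (__u64)(unsigned long)"GPL";
    attr.prog_flags = BPF_F_CHAIN_CALLS; /* opt in to the chain call wrapper */

    return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}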

Comments

Alexei Starovoitov Oct. 7, 2019, 8:42 p.m. UTC | #1
On Mon, Oct 07, 2019 at 07:20:36PM +0200, Toke Høiland-Jørgensen wrote:
> From: Toke Høiland-Jørgensen <toke@redhat.com>
> 
> This adds support for wrapping eBPF program dispatch in chain calling
> logic. The code injection is controlled by a flag at program load time; if
> the flag is set, the BPF program will carry a flag bit that changes the
> program dispatch logic to wrap it in a chain call loop.
> 
> Ideally, it shouldn't be necessary to set the flag on program load time,
> but rather inject the calls when a chain call program is first loaded. The
> allocation logic sets the whole of struct bpf_prog to be read-only memory,
> so it can't immediately be modified, but conceivably we could just unlock
> the first page of the struct and flip the bit when a chain call program is
> first attached.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---
>  include/linux/bpf.h      |    3 +++
>  include/linux/filter.h   |   34 ++++++++++++++++++++++++++++++++--
>  include/uapi/linux/bpf.h |    6 ++++++
>  kernel/bpf/core.c        |    6 ++++++
>  kernel/bpf/syscall.c     |    4 +++-
>  5 files changed, 50 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5b9d22338606..13e5f38cf5c6 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -365,6 +365,8 @@ struct bpf_prog_stats {
>  	struct u64_stats_sync syncp;
>  };
>  
> +#define BPF_NUM_CHAIN_SLOTS 8
> +
>  struct bpf_prog_aux {
>  	atomic_t refcnt;
>  	u32 used_map_cnt;
> @@ -383,6 +385,7 @@ struct bpf_prog_aux {
>  	struct list_head ksym_lnode;
>  	const struct bpf_prog_ops *ops;
>  	struct bpf_map **used_maps;
> +	struct bpf_prog *chain_progs[BPF_NUM_CHAIN_SLOTS];
>  	struct bpf_prog *prog;
>  	struct user_struct *user;
>  	u64 load_time; /* ns since boottime */
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 2ce57645f3cd..3d1e4991e61d 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -21,6 +21,7 @@
>  #include <linux/kallsyms.h>
>  #include <linux/if_vlan.h>
>  #include <linux/vmalloc.h>
> +#include <linux/nospec.h>
>  
>  #include <net/sch_generic.h>
>  
> @@ -528,6 +529,7 @@ struct bpf_prog {
>  				is_func:1,	/* program is a bpf function */
>  				kprobe_override:1, /* Do we override a kprobe? */
>  				has_callchain_buf:1, /* callchain buffer allocated? */
> +				chain_calls:1, /* should this use the chain_call wrapper */
>  				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
>  	enum bpf_prog_type	type;		/* Type of BPF program */
>  	enum bpf_attach_type	expected_attach_type; /* For some prog types */
> @@ -551,6 +553,30 @@ struct sk_filter {
>  	struct bpf_prog	*prog;
>  };
>  
> +#define BPF_MAX_CHAIN_CALLS 32
> +static __always_inline unsigned int do_chain_calls(const struct bpf_prog *prog,
> +						   const void *ctx)
> +{
> +	int i = BPF_MAX_CHAIN_CALLS;
> +	int idx;
> +	u32 ret;
> +
> +	do {
> +		ret = (*(prog)->bpf_func)(ctx, prog->insnsi);

This breaks program stats.

> +
> +		if (ret + 1 >= BPF_NUM_CHAIN_SLOTS) {
> +			prog = prog->aux->chain_progs[0];
> +			continue;
> +		}
> +		idx = ret + 1;
> +		idx = array_index_nospec(idx, BPF_NUM_CHAIN_SLOTS);
> +
> +		prog = prog->aux->chain_progs[idx] ?: prog->aux->chain_progs[0];
> +	} while (prog && --i);
> +
> +	return ret;
> +}
> +
>  DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>  
>  #define BPF_PROG_RUN(prog, ctx)	({				\
> @@ -559,14 +585,18 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>  	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
>  		struct bpf_prog_stats *stats;			\
>  		u64 start = sched_clock();			\
> -		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> +		ret = prog->chain_calls ?			\
> +			do_chain_calls(prog, ctx) :			\
> +			 (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\

I thought you agreed on 'no performance regressions' rule?

>  		stats = this_cpu_ptr(prog->aux->stats);		\
>  		u64_stats_update_begin(&stats->syncp);		\
>  		stats->cnt++;					\
>  		stats->nsecs += sched_clock() - start;		\
>  		u64_stats_update_end(&stats->syncp);		\
>  	} else {						\
> -		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> +		ret = prog->chain_calls ?				\
> +			do_chain_calls(prog, ctx) :			\
> +			 (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>  	}							\
>  	ret; })
>  
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 77c6be96d676..1ce80a227be3 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -288,6 +288,12 @@ enum bpf_attach_type {
>  /* The verifier internal test flag. Behavior is undefined */
>  #define BPF_F_TEST_STATE_FREQ	(1U << 3)
>  
> +/* Whether to enable chain call logic at program execution. If set, the program
> + * execution logic will check for and jump to chain call programs configured
> + * with the BPF_PROG_CHAIN_* commands to the bpf syscall.
> + */
> +#define BPF_F_CHAIN_CALLS	(1U << 4)
> +
>  /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
>   * two extensions:
>   *
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 66088a9e9b9e..5dfe3585bc5d 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -254,6 +254,12 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
>  void __bpf_prog_free(struct bpf_prog *fp)
>  {
>  	if (fp->aux) {
> +		int i;
> +
> +		for (i = 0; i < BPF_NUM_CHAIN_SLOTS; i++)
> +			if (fp->aux->chain_progs[i])
> +				bpf_prog_put(fp->aux->chain_progs[i]);
> +
>  		free_percpu(fp->aux->stats);
>  		kfree(fp->aux);
>  	}
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 82eabd4e38ad..b8a203a05881 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1630,7 +1630,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
>  	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
>  				 BPF_F_ANY_ALIGNMENT |
>  				 BPF_F_TEST_STATE_FREQ |
> -				 BPF_F_TEST_RND_HI32))
> +				 BPF_F_TEST_RND_HI32 |
> +				 BPF_F_CHAIN_CALLS))
>  		return -EINVAL;
>  
>  	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
> @@ -1665,6 +1666,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
>  		return -ENOMEM;
>  
>  	prog->expected_attach_type = attr->expected_attach_type;
> +	prog->chain_calls = !!(attr->prog_flags & BPF_F_CHAIN_CALLS);
>  
>  	prog->aux->offload_requested = !!attr->prog_ifindex;
>  
>
Toke Høiland-Jørgensen Oct. 8, 2019, 8:07 a.m. UTC | #2
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Mon, Oct 07, 2019 at 07:20:36PM +0200, Toke Høiland-Jørgensen wrote:
>> From: Toke Høiland-Jørgensen <toke@redhat.com>
>> 
>> This adds support for wrapping eBPF program dispatch in chain calling
>> logic. The code injection is controlled by a flag at program load time; if
>> the flag is set, the BPF program will carry a flag bit that changes the
>> program dispatch logic to wrap it in a chain call loop.
>> 
>> Ideally, it shouldn't be necessary to set the flag on program load time,
>> but rather inject the calls when a chain call program is first loaded. The
>> allocation logic sets the whole of struct bpf_prog to be read-only memory,
>> so it can't immediately be modified, but conceivably we could just unlock
>> the first page of the struct and flip the bit when a chain call program is
>> first attached.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> ---
>>  include/linux/bpf.h      |    3 +++
>>  include/linux/filter.h   |   34 ++++++++++++++++++++++++++++++++--
>>  include/uapi/linux/bpf.h |    6 ++++++
>>  kernel/bpf/core.c        |    6 ++++++
>>  kernel/bpf/syscall.c     |    4 +++-
>>  5 files changed, 50 insertions(+), 3 deletions(-)
>> 
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 5b9d22338606..13e5f38cf5c6 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -365,6 +365,8 @@ struct bpf_prog_stats {
>>  	struct u64_stats_sync syncp;
>>  };
>>  
>> +#define BPF_NUM_CHAIN_SLOTS 8
>> +
>>  struct bpf_prog_aux {
>>  	atomic_t refcnt;
>>  	u32 used_map_cnt;
>> @@ -383,6 +385,7 @@ struct bpf_prog_aux {
>>  	struct list_head ksym_lnode;
>>  	const struct bpf_prog_ops *ops;
>>  	struct bpf_map **used_maps;
>> +	struct bpf_prog *chain_progs[BPF_NUM_CHAIN_SLOTS];
>>  	struct bpf_prog *prog;
>>  	struct user_struct *user;
>>  	u64 load_time; /* ns since boottime */
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 2ce57645f3cd..3d1e4991e61d 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -21,6 +21,7 @@
>>  #include <linux/kallsyms.h>
>>  #include <linux/if_vlan.h>
>>  #include <linux/vmalloc.h>
>> +#include <linux/nospec.h>
>>  
>>  #include <net/sch_generic.h>
>>  
>> @@ -528,6 +529,7 @@ struct bpf_prog {
>>  				is_func:1,	/* program is a bpf function */
>>  				kprobe_override:1, /* Do we override a kprobe? */
>>  				has_callchain_buf:1, /* callchain buffer allocated? */
>> +				chain_calls:1, /* should this use the chain_call wrapper */
>>  				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
>>  	enum bpf_prog_type	type;		/* Type of BPF program */
>>  	enum bpf_attach_type	expected_attach_type; /* For some prog types */
>> @@ -551,6 +553,30 @@ struct sk_filter {
>>  	struct bpf_prog	*prog;
>>  };
>>  
>> +#define BPF_MAX_CHAIN_CALLS 32
>> +static __always_inline unsigned int do_chain_calls(const struct bpf_prog *prog,
>> +						   const void *ctx)
>> +{
>> +	int i = BPF_MAX_CHAIN_CALLS;
>> +	int idx;
>> +	u32 ret;
>> +
>> +	do {
>> +		ret = (*(prog)->bpf_func)(ctx, prog->insnsi);
>
> This breaks program stats.

Oh, right, silly me. Will fix.

>> +
>> +		if (ret + 1 >= BPF_NUM_CHAIN_SLOTS) {
>> +			prog = prog->aux->chain_progs[0];
>> +			continue;
>> +		}
>> +		idx = ret + 1;
>> +		idx = array_index_nospec(idx, BPF_NUM_CHAIN_SLOTS);
>> +
>> +		prog = prog->aux->chain_progs[idx] ?: prog->aux->chain_progs[0];
>> +	} while (prog && --i);
>> +
>> +	return ret;
>> +}
>> +
>>  DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>>  
>>  #define BPF_PROG_RUN(prog, ctx)	({				\
>> @@ -559,14 +585,18 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>>  	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
>>  		struct bpf_prog_stats *stats;			\
>>  		u64 start = sched_clock();			\
>> -		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>> +		ret = prog->chain_calls ?			\
>> +			do_chain_calls(prog, ctx) :			\
>> +			 (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>
> I thought you agreed on 'no performance regressions' rule?

As I wrote in the cover letter, I could not measure a performance
impact from this, even with the simplest possible XDP program (where
program setup time has the largest impact).

This was the performance before/after patch (also in the cover letter):

Before patch (XDP DROP program):  31.5 Mpps
After patch (XDP DROP program):   32.0 Mpps

So actually this *increases* performance ;)
(Or rather, the difference is within the measurement uncertainty on my
system).

-Toke
Alexei Starovoitov Oct. 9, 2019, 1:51 a.m. UTC | #3
On Tue, Oct 08, 2019 at 10:07:46AM +0200, Toke Høiland-Jørgensen wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Mon, Oct 07, 2019 at 07:20:36PM +0200, Toke Høiland-Jørgensen wrote:
> >> From: Toke Høiland-Jørgensen <toke@redhat.com>
> >> 
> >> This adds support for wrapping eBPF program dispatch in chain calling
> >> logic. The code injection is controlled by a flag at program load time; if
> >> the flag is set, the BPF program will carry a flag bit that changes the
> >> program dispatch logic to wrap it in a chain call loop.
> >> 
> >> Ideally, it shouldn't be necessary to set the flag on program load time,
> >> but rather inject the calls when a chain call program is first loaded. The
> >> allocation logic sets the whole of struct bpf_prog to be read-only memory,
> >> so it can't immediately be modified, but conceivably we could just unlock
> >> the first page of the struct and flip the bit when a chain call program is
> >> first attached.
> >> 
> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> >> ---
> >>  include/linux/bpf.h      |    3 +++
> >>  include/linux/filter.h   |   34 ++++++++++++++++++++++++++++++++--
> >>  include/uapi/linux/bpf.h |    6 ++++++
> >>  kernel/bpf/core.c        |    6 ++++++
> >>  kernel/bpf/syscall.c     |    4 +++-
> >>  5 files changed, 50 insertions(+), 3 deletions(-)
> >> 
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index 5b9d22338606..13e5f38cf5c6 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -365,6 +365,8 @@ struct bpf_prog_stats {
> >>  	struct u64_stats_sync syncp;
> >>  };
> >>  
> >> +#define BPF_NUM_CHAIN_SLOTS 8
> >> +
> >>  struct bpf_prog_aux {
> >>  	atomic_t refcnt;
> >>  	u32 used_map_cnt;
> >> @@ -383,6 +385,7 @@ struct bpf_prog_aux {
> >>  	struct list_head ksym_lnode;
> >>  	const struct bpf_prog_ops *ops;
> >>  	struct bpf_map **used_maps;
> >> +	struct bpf_prog *chain_progs[BPF_NUM_CHAIN_SLOTS];
> >>  	struct bpf_prog *prog;
> >>  	struct user_struct *user;
> >>  	u64 load_time; /* ns since boottime */
> >> diff --git a/include/linux/filter.h b/include/linux/filter.h
> >> index 2ce57645f3cd..3d1e4991e61d 100644
> >> --- a/include/linux/filter.h
> >> +++ b/include/linux/filter.h
> >> @@ -21,6 +21,7 @@
> >>  #include <linux/kallsyms.h>
> >>  #include <linux/if_vlan.h>
> >>  #include <linux/vmalloc.h>
> >> +#include <linux/nospec.h>
> >>  
> >>  #include <net/sch_generic.h>
> >>  
> >> @@ -528,6 +529,7 @@ struct bpf_prog {
> >>  				is_func:1,	/* program is a bpf function */
> >>  				kprobe_override:1, /* Do we override a kprobe? */
> >>  				has_callchain_buf:1, /* callchain buffer allocated? */
> >> +				chain_calls:1, /* should this use the chain_call wrapper */
> >>  				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
> >>  	enum bpf_prog_type	type;		/* Type of BPF program */
> >>  	enum bpf_attach_type	expected_attach_type; /* For some prog types */
> >> @@ -551,6 +553,30 @@ struct sk_filter {
> >>  	struct bpf_prog	*prog;
> >>  };
> >>  
> >> +#define BPF_MAX_CHAIN_CALLS 32
> >> +static __always_inline unsigned int do_chain_calls(const struct bpf_prog *prog,
> >> +						   const void *ctx)
> >> +{
> >> +	int i = BPF_MAX_CHAIN_CALLS;
> >> +	int idx;
> >> +	u32 ret;
> >> +
> >> +	do {
> >> +		ret = (*(prog)->bpf_func)(ctx, prog->insnsi);
> >
> > This breaks program stats.
> 
> Oh, right, silly me. Will fix.
> 
> >> +
> >> +		if (ret + 1 >= BPF_NUM_CHAIN_SLOTS) {
> >> +			prog = prog->aux->chain_progs[0];
> >> +			continue;
> >> +		}
> >> +		idx = ret + 1;
> >> +		idx = array_index_nospec(idx, BPF_NUM_CHAIN_SLOTS);
> >> +
> >> +		prog = prog->aux->chain_progs[idx] ?: prog->aux->chain_progs[0];
> >> +	} while (prog && --i);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >>  DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> >>  
> >>  #define BPF_PROG_RUN(prog, ctx)	({				\
> >> @@ -559,14 +585,18 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> >>  	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
> >>  		struct bpf_prog_stats *stats;			\
> >>  		u64 start = sched_clock();			\
> >> -		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> >> +		ret = prog->chain_calls ?			\
> >> +			do_chain_calls(prog, ctx) :			\
> >> +			 (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> >
> > I thought you agreed on 'no performance regressions' rule?
> 
> As I wrote in the cover letter I could not measurable a performance
> impact from this, even with the simplest possible XDP program (where
> program setup time has the largest impact).
> 
> This was the performance before/after patch (also in the cover letter):
> 
> Before patch (XDP DROP program):  31.5 Mpps
> After patch (XDP DROP program):   32.0 Mpps
> 
> So actually this *increases* performance ;)
> (Or rather, the difference is within the measurement uncertainty on my
> system).

I have a hard time believing such numbers.
If I wasn't clear before: Nack to such a hack in BPF_PROG_RUN.
Please implement proper indirect calls and jumps.
Apps have to cooperate with each other regardless,
whereas the above is a narrow solution to one problem.
Toke Høiland-Jørgensen Oct. 9, 2019, 8:03 a.m. UTC | #4
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> Please implement proper indirect calls and jumps.

I am still not convinced this will actually solve our problem; but OK, I
can give it a shot.

However, I don't actually have a clear picture of what exactly is
missing to add this support. Could you please provide a pointer or two?

-Toke
Jesper Dangaard Brouer Oct. 9, 2019, 10:19 a.m. UTC | #5
On Tue, 8 Oct 2019 18:51:19 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Tue, Oct 08, 2019 at 10:07:46AM +0200, Toke Høiland-Jørgensen wrote:
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >   
> > > On Mon, Oct 07, 2019 at 07:20:36PM +0200, Toke Høiland-Jørgensen wrote:  
> > >> From: Toke Høiland-Jørgensen <toke@redhat.com>
> > >> 
> > >> This adds support for wrapping eBPF program dispatch in chain calling
> > >> logic. The code injection is controlled by a flag at program load time; if
> > >> the flag is set, the BPF program will carry a flag bit that changes the
> > >> program dispatch logic to wrap it in a chain call loop.
[...]
> > >> diff --git a/include/linux/filter.h b/include/linux/filter.h
> > >> index 2ce57645f3cd..3d1e4991e61d 100644
> > >> --- a/include/linux/filter.h
> > >> +++ b/include/linux/filter.h
[...]
> > >>  #define BPF_PROG_RUN(prog, ctx)	({				\
> > >> @@ -559,14 +585,18 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> > >>  	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
> > >>  		struct bpf_prog_stats *stats;			\
> > >>  		u64 start = sched_clock();			\
> > >> -		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> > >> +		ret = prog->chain_calls ?			\
> > >> +			do_chain_calls(prog, ctx) :			\
> > >> +			 (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\  
> > >
> > > I thought you agreed on 'no performance regressions' rule?  
> > 
> > As I wrote in the cover letter I could not measurable a performance
> > impact from this, even with the simplest possible XDP program (where
> > program setup time has the largest impact).
> > 
> > This was the performance before/after patch (also in the cover letter):
> > 
> > Before patch (XDP DROP program):  31.5 Mpps
> > After patch (XDP DROP program):   32.0 Mpps
> > 
> > So actually this *increases* performance ;)
> > (Or rather, the difference is within the measurement uncertainty on my
> > system).  
> 
> I have hard time believing such numbers.

Don't look at this as +/- 500 Kpps.  Instead, you have to realize that the
performance difference in ns (nanoseconds) is only 0.5 ns (0.496 ns):

 (1/31.5*1000)-(1/32.0*1000) = 0.4960 ns

This "half-a-nanosec" is below the measurement uncertainty of any
system. My system have approx 2-3 ns measurement variance, which I'm
proud of.  At these speeds (32Mpps) this e.g. 3 ns variance would
result in -2.8 Mpps (29.2Mpps).
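
Spelling the 3 ns example out with the same arithmetic:

 1000/32.0 Mpps  = 31.25 ns per packet
 31.25 ns + 3 ns = 34.25 ns per packet
 1000/34.25 ns   = ~29.2 Mpps  (an apparent drop of ~2.8 Mpps)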


The change Toke did in BPF_PROG_RUN does not introduce any measurable
performance change, at least on high-end Intel CPUs.  This DOES satisfy
the 'no performance regressions' rule.  You can dislike the solution for
other reasons ;-)
Alexei Starovoitov Oct. 9, 2019, 5:57 p.m. UTC | #6
On Wed, Oct 9, 2019 at 3:20 AM Jesper Dangaard Brouer <brouer@redhat.com> wrote:
>
>
> The change Toke did in BPF_PROG_RUN does not introduce any measurable
> performance change, as least on high-end Intel CPUs.  This DOES satisfy
> 'no performance regressions' rule.

It adds an extra load and branch in the critical path of every program.
Including classic socket filters.
Not being able to measure the overhead in a microbenchmark
doesn't mean that overhead is not added.
Will 10 such branches be measurable?
What if they're not?
Then everyone will say: "It's not measurable in my setup, hence
I'm adding these branches".
The TCP stack is even harder to measure. Yet Eric will rightfully nack patches
that add such things when an alternative solution is possible.
Alexei Starovoitov Oct. 10, 2019, 4:41 a.m. UTC | #7
On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > Please implement proper indirect calls and jumps.
> 
> I am still not convinced this will actually solve our problem; but OK, I
> can give it a shot.

If you're not convinced let's talk about it first.

Indirect calls are a building block for debugpoints.
Let's not call them tracepoints, because Linus banned any discussion
that includes that name.
Debugpoints are a way for a BPF program to insert points in its
code that let an external facility do tracing and debugging.

void (*debugpoint1)(struct xdp_buff *, int code);
void (*debugpoint2)(struct xdp_buff *);
void (*debugpoint3)(int len);

int bpf_prog(struct xdp_buff *ctx)
{
    // let's parse the packet
    if (debugpoint3)
        debugpoint3(ctx->data_end - ctx->data);

    if (condition) {
        // deciding to drop this packet
        if (debugpoint1)
            debugpoint1(ctx, XDP_DROP);
        return XDP_DROP;
    }
    if (some other condition) {
        // lets pass it to the stack
        if (debugpoint2)
            debugpoint2(ctx);
        return XDP_PASS;
    }
    return XDP_PASS; /* default action if nothing matched */
}

In normal operation nothing is being called.
The execution cost to the program is a load plus a branch.
But since the program is annotated with BTF, an external tool,
like bpftool, can load another program and populate the
debugpointN pointers in the original program to trace its
execution.
Essentially it's live debugging (tracing) of cooperative
BPF programs that added debugpoints to their code.

Obviously indirect calls can be used for a ton of other things,
including proper chaining of progs, but I'm convinced that
you don't need chaining to solve your problem.
You need debugging.
If you disagree please explain _your_ problem again.
Saying that fb katran is a use case for chaining is, hrm, not correct.
Toke Høiland-Jørgensen Oct. 14, 2019, 12:35 p.m. UTC | #8
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > Please implement proper indirect calls and jumps.
>> 
>> I am still not convinced this will actually solve our problem; but OK, I
>> can give it a shot.
>
> If you're not convinced let's talk about it first.
>
> Indirect calls is a building block for debugpoints.
> Let's not call them tracepoints, because Linus banned any discusion
> that includes that name.
> The debugpoints is a way for BPF program to insert points in its
> code to let external facility to do tracing and debugging.
>
> void (*debugpoint1)(struct xdp_buff *, int code);
> void (*debugpoint2)(struct xdp_buff *);
> void (*debugpoint3)(int len);

So how would these work? Similar to global variables (i.e., the loader
creates a single-entry PROG_ARRAY map for each one)? Presumably with
some BTF to validate the argument types?

So what would it take to actually support this? It doesn't quite sound
trivial to add?
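
For reference, a minimal sketch of the single-entry PROG_ARRAY idea being
asked about here, using only APIs that exist today (old-style map definition
from bpf_helpers.h; map and section names are made up). Note that
bpf_tail_call() does not return on success, so this only approximates the
proposed indirect calls:

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* one single-entry program array per "debugpoint" */
struct bpf_map_def SEC("maps") debugpoint1 = {
    .type        = BPF_MAP_TYPE_PROG_ARRAY,
    .key_size    = sizeof(__u32),
    .value_size  = sizeof(__u32),
    .max_entries = 1,
};

SEC("xdp")
int xdp_main(struct xdp_md *ctx)
{
    /* if slot 0 has been populated by an external tool, jump to it;
     * otherwise this is a no-op and we fall through */
    bpf_tail_call(ctx, &debugpoint1, 0);

    return XDP_PASS;
}

char _license[] SEC("license") = "GPL";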

> Essentially it's live debugging (tracing) of cooperative bpf programs
> that added debugpoints to their code.

Yup, certainly not disputing that this would be useful for debugging;
although it'll probably be a while before its use becomes widespread
enough that it'll be a reliable tool for people deploying XDP programs...

> Obviously indirect calls can be used for a ton of other things
> including proper chaing of progs, but I'm convinced that
> you don't need chaining to solve your problem.
> You need debugging.

Debugging is certainly also an area that I want to improve. However, I
think that focusing on debugging as the driver for chaining programs was
a mistake on my part; rudimentary debugging (using a tool such as
xdpdump) is something that falls out of program chaining, but it's not
the main driver for it.

> If you disagree please explain _your_ problem again.
> Saying that fb katran is a use case for chaining is, hrm, not correct.

I never said Katran was the driver for this. I just used Katran as one
of the "prior art" examples for my "how are people solving running
multiple programs on the same interface" survey.

What I want to achieve is simply the ability to run multiple independent
XDP programs on the same interface, without having to put any
constraints on the programs themselves. I'm not disputing that this is
*possible* to do completely in userspace, I just don't believe the
resulting solution will be very good. Proper kernel support for indirect
calls (or just "tail calls that return") may change that; but in any
case I think I need to go write some userspace code to have some more
concrete examples to discuss from. So we can come back to the
particulars once I've done that :)

-Toke
John Fastabend Oct. 14, 2019, 5:08 p.m. UTC | #9
Toke Høiland-Jørgensen wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >> 
> >> > Please implement proper indirect calls and jumps.
> >> 
> >> I am still not convinced this will actually solve our problem; but OK, I
> >> can give it a shot.
> >
> > If you're not convinced let's talk about it first.
> >
> > Indirect calls is a building block for debugpoints.
> > Let's not call them tracepoints, because Linus banned any discusion
> > that includes that name.
> > The debugpoints is a way for BPF program to insert points in its
> > code to let external facility to do tracing and debugging.
> >
> > void (*debugpoint1)(struct xdp_buff *, int code);
> > void (*debugpoint2)(struct xdp_buff *);
> > void (*debugpoint3)(int len);

I was considering some basic static linking from the libbpf side. Something
like:

  bpf_object__link_programs(struct bpf_object *obj1, struct bpf_object *obj2);

This way you could just 'link' in debugpoint{1,2,3} from libbpf before
loading? This would be useful on my side for adding/removing features
and handling different kernel versions. So more generally useful IMO.

We can manage this now, but it's a bit ugly; the above seems nicer to me.
It's also not quite as daunting as getting llvm-lld working, although that
would also be worthwhile.
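
A hypothetical usage sketch of the above; bpf_object__link_programs() does
not exist in libbpf, and the object file names are made up:

struct bpf_object *main_obj  = bpf_object__open("xdp_main.o");
struct bpf_object *debug_obj = bpf_object__open("xdp_debugpoints.o");

/* resolve the debugpoint{1,2,3} references in main_obj against the
 * programs defined in debug_obj, before loading the combined object */
bpf_object__link_programs(main_obj, debug_obj);

bpf_object__load(main_obj);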

> 
> So how would these work? Similar to global variables (i.e., the loader
> creates a single-entry PROG_ARRAY map for each one)? Presumably with
> some BTF to validate the argument types?
> 
> So what would it take to actually support this? It doesn't quite sound
> trivial to add?
> 
> > Essentially it's live debugging (tracing) of cooperative bpf programs
> > that added debugpoints to their code.
> 
> Yup, certainly not disputing that this would be useful for debugging;
> although it'll probably be a while before its use becomes widespread
> enough that it'll be a reliable tool for people deploying XDP programs...
> 

I guess linking would be a bit different than tracing. Both seem
useful.

> > Obviously indirect calls can be used for a ton of other things
> > including proper chaing of progs, but I'm convinced that
> > you don't need chaining to solve your problem.
> > You need debugging.
> 
> Debugging is certainly also an area that I want to improve. However, I
> think that focusing on debugging as the driver for chaining programs was
> a mistake on my part; rudimentary debugging (using a tool such as
> xdpdump) is something that falls out of program chaining, but it's not
> the main driver for it.
> 
> > If you disagree please explain _your_ problem again.
> > Saying that fb katran is a use case for chaining is, hrm, not correct.
> 
> I never said Katran was the driver for this. I just used Katran as one
> of the "prior art" examples for my "how are people solving running
> multiple programs on the same interface" survey.
> 
> What I want to achieve is simply the ability to run multiple independent
> XDP programs on the same interface, without having to put any
> constraints on the programs themselves. I'm not disputing that this is
> *possible* to do completely in userspace, I just don't believe the
> resulting solution will be very good. Proper kernel support for indirect
> calls (or just "tail calls that return") may change that; but in any
> case I think I need to go write some userspace code to have some more
> concrete examples to discuss from. So we can come back to the
> particulars once I've done that :)

I was imagining that, because you have to develop some sort of coordination
anyway, by using linking you could enforce call signatures, which would allow
you to drop in any XDP program at a call site as long as it matches the
signature.

> 
> -Toke
Toke Høiland-Jørgensen Oct. 14, 2019, 6:48 p.m. UTC | #10
John Fastabend <john.fastabend@gmail.com> writes:

> Toke Høiland-Jørgensen wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
>> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> >> 
>> >> > Please implement proper indirect calls and jumps.
>> >> 
>> >> I am still not convinced this will actually solve our problem; but OK, I
>> >> can give it a shot.
>> >
>> > If you're not convinced let's talk about it first.
>> >
>> > Indirect calls is a building block for debugpoints.
>> > Let's not call them tracepoints, because Linus banned any discusion
>> > that includes that name.
>> > The debugpoints is a way for BPF program to insert points in its
>> > code to let external facility to do tracing and debugging.
>> >
>> > void (*debugpoint1)(struct xdp_buff *, int code);
>> > void (*debugpoint2)(struct xdp_buff *);
>> > void (*debugpoint3)(int len);
>
> I was considering some basic static linking from libbpf side. Something
> like,
>
>   bpf_object__link_programs(struct bpf_object *obj1, struct bpf_object *obj2);
>
> This way you could just 'link' in debugpoint{1,2,3} from libbpf before
> loading? This would be useful on my side for adding/removing features
> and handling different kernel versions. So more generally useful IMO.

So that will end up with a single monolithic BPF program being loaded
(from the kernel PoV), right? That won't do; we want to be able to go
back to the component programs, and manipulate them as separate kernel
objects.

-Toke
Edward Cree Oct. 15, 2019, 4:30 p.m. UTC | #11
On 14/10/2019 19:48, Toke Høiland-Jørgensen wrote:
> So that will end up with a single monolithic BPF program being loaded
> (from the kernel PoV), right? That won't do; we want to be able to go
> back to the component programs, and manipulate them as separate kernel
> objects.
Why's that?  (Since it also applies to the static-linking PoC I'm
 putting together.)  What do you gain by having the components be
 kernel-visible?
(Bad analogy time: the kernel doesn't care about the .o files you
 linked together to make a userspace executable; any debugging you
 do on that relies on other userspace infrastructure — loading
 symbol tables into a debugger — to interpret things.)

-Ed
Toke Høiland-Jørgensen Oct. 15, 2019, 4:42 p.m. UTC | #12
Edward Cree <ecree@solarflare.com> writes:

> On 14/10/2019 19:48, Toke Høiland-Jørgensen wrote:
>> So that will end up with a single monolithic BPF program being loaded
>> (from the kernel PoV), right? That won't do; we want to be able to go
>> back to the component programs, and manipulate them as separate kernel
>> objects.
> Why's that? (Since it also applies to the static-linking PoC I'm
> putting together.) What do you gain by having the components be
> kernel-visible?

Because then userspace will have to keep state to be able to answer
questions like "show me the list of programs that are currently loaded
(and their call chain)", or do operations like "insert this program into
the call chain at position X".

We already keep all this state in the kernel, so replicating it all in
userspace means both a lot of extra work to implement that
functionality, and having to deal with the inevitable fallout when the
userspace and kernel space state get out of sync...

-Toke
Edward Cree Oct. 15, 2019, 6:33 p.m. UTC | #13
On 15/10/2019 17:42, Toke Høiland-Jørgensen wrote:
> Edward Cree <ecree@solarflare.com> writes:
>> On 14/10/2019 19:48, Toke Høiland-Jørgensen wrote:
>>> So that will end up with a single monolithic BPF program being loaded
>>> (from the kernel PoV), right? That won't do; we want to be able to go
>>> back to the component programs, and manipulate them as separate kernel
>>> objects.
>> Why's that? (Since it also applies to the static-linking PoC I'm
>> putting together.) What do you gain by having the components be
>> kernel-visible?
> Because then userspace will have to keep state to be able to answer
> questions like "show me the list of programs that are currently loaded
> (and their call chain)", or do operations like "insert this program into
> the call chain at position X".
Userspace keeps state for stuff all the time.  We call them "daemons" ;)
Now you might have arguments for why putting a given piece of state in
 userspace is a bad idea — there's a reason why not everything is a
 microkernel — but those arguments need to be made.

> We already keep all this state in the kernel,
The kernel keeps the state of "current (monolithic) BPF program loaded
 (against each hook)".  Prior to this patch series, the kernel does
 *not* keep any state on what that BPF program was made of (except in
 the sense of BTF debuginfos, which a linker could combine appropriately).

So if we _don't_ add your chained-programs functionality into the kernel,
 and then _do_ implement userspace linking, then there isn't any
 duplicated functionality or even duplicated state — the userland state
 is "what are my components and what's the linker invocation that glues
 them together", the kernel state is "here is one monolithic BPF blob,
 along with a BTF blob to debug it".  The kernel knows nothing of the
 former, and userspace doesn't store (but knows how to recreate) the
 latter.

(That said, proper dynamic linking is better than static linking OR chain
 calls, because it gives us the full flexibility of linking while giving
 you your 'subprogs as kernel objects & kernel state'.  But I think we'll
 need to prototype things with static linking first so that we can be
 sure of the linker semantics we want, before we try to put a new dynamic
 linker in the kernel.)

-Ed
Alexei Starovoitov Oct. 16, 2019, 2:28 a.m. UTC | #14
On Mon, Oct 14, 2019 at 02:35:45PM +0200, Toke Høiland-Jørgensen wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >> 
> >> > Please implement proper indirect calls and jumps.
> >> 
> >> I am still not convinced this will actually solve our problem; but OK, I
> >> can give it a shot.
> >
> > If you're not convinced let's talk about it first.
> >
> > Indirect calls is a building block for debugpoints.
> > Let's not call them tracepoints, because Linus banned any discusion
> > that includes that name.
> > The debugpoints is a way for BPF program to insert points in its
> > code to let external facility to do tracing and debugging.
> >
> > void (*debugpoint1)(struct xdp_buff *, int code);
> > void (*debugpoint2)(struct xdp_buff *);
> > void (*debugpoint3)(int len);
> 
> So how would these work? Similar to global variables (i.e., the loader
> creates a single-entry PROG_ARRAY map for each one)? Presumably with
> some BTF to validate the argument types?
> 
> So what would it take to actually support this? It doesn't quite sound
> trivial to add?

Depends on definition of 'trivial' :)
The kernel has the luxury of waiting until a clean solution is implemented
instead of resorting to hacks.

> > Essentially it's live debugging (tracing) of cooperative bpf programs
> > that added debugpoints to their code.
> 
> Yup, certainly not disputing that this would be useful for debugging;
> although it'll probably be a while before its use becomes widespread
> enough that it'll be a reliable tool for people deploying XDP programs...

same for any new api.

> > Obviously indirect calls can be used for a ton of other things
> > including proper chaing of progs, but I'm convinced that
> > you don't need chaining to solve your problem.
> > You need debugging.
> 
> Debugging is certainly also an area that I want to improve. However, I
> think that focusing on debugging as the driver for chaining programs was
> a mistake on my part; rudimentary debugging (using a tool such as
> xdpdump) is something that falls out of program chaining, but it's not
> the main driver for it.

xdpdump can already be done the way I suggested, without adding new kernel
code, and it will work on old-ish kernels. Aside from XDP itself,
the other requirement is to have the get_fd_by_id sys_bpf command.
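
For reference, a rough sketch of that building block: walking the loaded
programs and grabbing an fd for each one via the bpf(2) syscall (error
handling mostly omitted):

#include <linux/bpf.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static void walk_progs(void)
{
    union bpf_attr attr;
    __u32 id = 0;

    for (;;) {
        memset(&attr, 0, sizeof(attr));
        attr.start_id = id;
        if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
            break;
        id = attr.next_id;

        memset(&attr, 0, sizeof(attr));
        attr.prog_id = id;
        int fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
        if (fd < 0)
            continue;
        /* ... inspect the program, e.g. via BPF_OBJ_GET_INFO_BY_FD ... */
        close(fd);
    }
}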

> > If you disagree please explain _your_ problem again.
> > Saying that fb katran is a use case for chaining is, hrm, not correct.
> 
> I never said Katran was the driver for this. I just used Katran as one
> of the "prior art" examples for my "how are people solving running
> multiple programs on the same interface" survey.

and they solved it. that's the point.

> What I want to achieve is simply the ability to run multiple independent
> XDP programs on the same interface, without having to put any
> constraints on the programs themselves. I'm not disputing that this is
> *possible* to do completely in userspace, I just don't believe the
> resulting solution will be very good.

What makes me uneasy about the whole push for program chaining
is that tc cls_bpf supported multiple independent programs from day one.
Yet it doesn't help to run two firewalls hooked into tc ingress.
Similarly, cgroup-bpf had a ton of discussions on a proper multi-prog API.
Everyone was eventually convinced that it's flexible and generic.
Yet people who started to use it complain that it's missing features
to make it truly usable in production.
Tracing is the only bit where multi-prog works, because the kernel
always runs all programs there.
If we could use PROG_RUN_ARRAY for XDP, that could have been a solution.
But we cannot. Return codes matter for XDP.
Jesper Dangaard Brouer Oct. 16, 2019, 8:27 a.m. UTC | #15
On Tue, 15 Oct 2019 19:28:51 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Mon, Oct 14, 2019 at 02:35:45PM +0200, Toke Høiland-Jørgensen wrote:
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >   
> > > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:  
> > >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > >>   
> > >> > Please implement proper indirect calls and jumps.  
> > >> 
> > >> I am still not convinced this will actually solve our problem; but OK, I
> > >> can give it a shot.  
> > >
> > > If you're not convinced let's talk about it first.
> > >
> > > Indirect calls is a building block for debugpoints.
> > > Let's not call them tracepoints, because Linus banned any discusion
> > > that includes that name.
> > > The debugpoints is a way for BPF program to insert points in its
> > > code to let external facility to do tracing and debugging.
> > >
> > > void (*debugpoint1)(struct xdp_buff *, int code);
> > > void (*debugpoint2)(struct xdp_buff *);
> > > void (*debugpoint3)(int len);  
> > 
> > So how would these work? Similar to global variables (i.e., the loader
> > creates a single-entry PROG_ARRAY map for each one)? Presumably with
> > some BTF to validate the argument types?
> > 
> > So what would it take to actually support this? It doesn't quite sound
> > trivial to add?  
> 
> Depends on definition of 'trivial' :)
> The kernel has a luxury of waiting until clean solution is implemented
> instead of resorting to hacks.
> 
> > > Essentially it's live debugging (tracing) of cooperative bpf programs
> > > that added debugpoints to their code.  
> > 
> > Yup, certainly not disputing that this would be useful for debugging;
> > although it'll probably be a while before its use becomes widespread
> > enough that it'll be a reliable tool for people deploying XDP programs...  
> 
> same for any new api.
> 
> > > Obviously indirect calls can be used for a ton of other things
> > > including proper chaing of progs, but I'm convinced that
> > > you don't need chaining to solve your problem.
> > > You need debugging.  
> > 
> > Debugging is certainly also an area that I want to improve. However, I
> > think that focusing on debugging as the driver for chaining programs was
> > a mistake on my part; rudimentary debugging (using a tool such as
> > xdpdump) is something that falls out of program chaining, but it's not
> > the main driver for it.  
> 
> xdpdump can be done already the way I suggested without adding new
> kernel code and it will work on old-ish kernels. Aside from xdp itself
> the other requirement is to have get_fd_by_id sys_bpf command.

You only demonstrated that we can hook xdpdump in BEFORE an existing XDP
program, without modifying the XDP program.  I'm much more interested in
running xdpdump AFTER an existing XDP program (without modifying it),
and very importantly I want to know the XDP return codes in my xdpdump.

That said, with your proposal of "proper indirect calls for BPF", the
xdpdump-AFTER case will be easy to implement.

Maybe we should not focus on the xdpdump use-case, because it might be
better solved by simply adding a tracepoint that has access to the
xdp_buff.


> > > If you disagree please explain _your_ problem again.
> > > Saying that fb katran is a use case for chaining is, hrm, not correct.  
> > 
> > I never said Katran was the driver for this. I just used Katran as one
> > of the "prior art" examples for my "how are people solving running
> > multiple programs on the same interface" survey.  
> 
> and they solved it. that's the point.
> 
> > What I want to achieve is simply the ability to run multiple independent
> > XDP programs on the same interface, without having to put any
> > constraints on the programs themselves. I'm not disputing that this is
> > *possible* to do completely in userspace, I just don't believe the
> > resulting solution will be very good.  
> 
> What makes me uneasy about the whole push for program chaining
> is that tc cls_bpf supported multiple independent programs from day one.
> Yet it doesn't help to run two firewalls hooked into tc ingress.

I do understand your concerns.

Let me explain why I believe TC cls_bpf multiple independent programs
have not seen much usage.

First of all, the TC tool is notoriously difficult to use and configure (I
admit, I struggle with this myself every single time). (The TC layer
has some amazing features, like hash-based lookup, that never get used
due to this).

Second, the multiple "independent programs" are actually not
independent, because the currently running program must return
TC_ACT_UNSPEC to allow the next bpf-prog to run.  Thus, it is not really
usable.


> Similarly cgroup-bpf had a ton discussions on proper multi-prog api.
> Everyone was eventually convinced that it's flexible and generic.
> Yet people who started to use it complain that it's missing features
> to make it truly usable in production.

I've not looked at the cgroup-bpf multi-prog API; I guess we should, to
understand why this failed.

> Tracing is the only bit where multi-prog works.
> Because kernel always runs all programs there.

This is an important insight ("the kernel always runs all programs").  A key
part of Toke's design with chain-calling is that the kernel always
runs all the XDP/BPF-progs in the chain, regardless of the XDP return
value.  The next program in the chain needs info about the previous
BPF-prog's return value, but it can choose to override it.

> If we could use PROG_RUN_ARRAY for XDP that could have been a solution.
> But we cannot. Return codes matter for XDP.

The proposal from Toke is to allow the next-chain BPF program to override
the previous BPF-prog's return value.  This part of the design is, I must
admit, also the only option due to tail-calls.  But I do think it
makes sense, because even if XDP_DROP is returned, I can install
another XDP-prog that does XDP_REDIRECT out another interface to an
analyzer box, or into an AF_XDP based dump tool.
Daniel Borkmann Oct. 16, 2019, 10:35 a.m. UTC | #16
On Wed, Oct 16, 2019 at 10:27:12AM +0200, Jesper Dangaard Brouer wrote:
> On Tue, 15 Oct 2019 19:28:51 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > On Mon, Oct 14, 2019 at 02:35:45PM +0200, Toke Høiland-Jørgensen wrote:
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > > > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:  
> > > >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
[...]
> > > > If you disagree please explain _your_ problem again.
> > > > Saying that fb katran is a use case for chaining is, hrm, not correct.  
> > > 
> > > I never said Katran was the driver for this. I just used Katran as one
> > > of the "prior art" examples for my "how are people solving running
> > > multiple programs on the same interface" survey.  
> > 
> > and they solved it. that's the point.
> > 
> > > What I want to achieve is simply the ability to run multiple independent
> > > XDP programs on the same interface, without having to put any
> > > constraints on the programs themselves. I'm not disputing that this is
> > > *possible* to do completely in userspace, I just don't believe the
> > > resulting solution will be very good.  
> > 
> > What makes me uneasy about the whole push for program chaining
> > is that tc cls_bpf supported multiple independent programs from day one.
> > Yet it doesn't help to run two firewalls hooked into tc ingress.
> 
> I do understand your concerns.
> 
> Let me explain why I believe TC cls_bpf multiple independent programs
> have not seen much usage.
> 
> First of all the TC-tool is notorious difficult to use and configure (I
> admit, I struggle with this myself every single time). (The TC layer
> have some amazing features, like hash based lookup, that never get used
> due to this).

We do use cls_bpf heavily in Cilium, but I don't necessarily agree on
the "notoriously difficult to use" aspect (at least for tc + BPF): i) this
is abstracted away from the /user/ entirely, to the point that it is an
implementation detail he doesn't need to know about; ii) these days most
access to these hooks is done programmatically. If this is a worry, then
let's simply add a cls_bpf pendant for APIs like bpf_set_link_xdp_fd() we
have in libbpf, where you only pass in the ifindex, direction (ingress/egress)
and priority of the program, so that underneath it sets up a clsact qdisc
with a cls_bpf instance that makes the whole thing foolproof, e.g.:

  int bpf_set_link_tc_fd(int ifindex, int fd, enum bpf_tc_dir dir,
                         __u32 priority, __u32 flags);

The flags could be similar to XDP: 0 or xxx_FLAGS_UPDATE_IF_NOEXIST and
xxx_FLAGS_HW_MODE. The problem that might be easy to miss via the tc cmdline
tool is that when you don't specify an explicit prio/handle upon tc replace,
it auto-allocates one and keeps adding new programs instead of
replacing the old ones, but this quirk can be solved via an API like the above.
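
A hypothetical call using that signature; the enum value and flag name below
are made up along the lines of the xxx_FLAGS_* placeholders:

  int attach_tc_ingress(int ifindex, int prog_fd)
  {
          /* attach at priority 1 on ingress; fail if a program is
           * already attached at that slot */
          return bpf_set_link_tc_fd(ifindex, prog_fd, BPF_TC_DIR_INGRESS,
                                    1 /* priority */,
                                    TC_FLAGS_UPDATE_IF_NOEXIST);
  }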

> Second, the multiple "independent programs", are actually not
> independent, because the current running program must return
> TC_ACT_UNSPEC to allow next bpf-prog to run.  Thus, it is not really
> usable.

I'd argue that unless the only thing you do in your debugging program is
to introspect (read-only) the packet at the current point, you'd run into
a similar coordination issue; meaning, the "independent programs" approach
works for simple cases where you only have an ACCEPT and DROP policy, such
that you could run through all the programs and have precedence on DROP.

But once you have conflicting policies with regards to how these programs
mangle and redirect packets, how would you handle these? I'd argue it's
a non-trivial task to outsource if /admins/ are supposed to do manual
order adjustments and more importantly to troubleshoot issues due to
them. Debugging hooks would potentially make that easier by avoiding
recompilation, but it's more of a developer task.

Often times orchestration tools i) assume they just own the data path
to reduce complexity in an already complex system, and ii) also keep
'refreshing' their setup. One random example of the latter is k8s'
kube-proxy, which reinstalls its iptables rules every x sec, in order to
make sure there was no manual messing around and to keep the data path
eventually consistent with the daemon view (if they got borked). How
would you make the loader aware of daemons automatically refreshing/
reconfiguring their BPF progs in the situation where admins changed
the pipeline, adding a similar handle as tc has, so that whoever does
the 'chain' assembly knows which one to update?

> > Similarly cgroup-bpf had a ton discussions on proper multi-prog api.
> > Everyone was eventually convinced that it's flexible and generic.
> > Yet people who started to use it complain that it's missing features
> > to make it truly usable in production.
> 
> I've not looked at the cgroup-bpf multi-prog API, I guess we should to
> understand why this failed.
> 
> > Tracing is the only bit where multi-prog works.
> > Because kernel always runs all programs there.
> 
> This is important insight ("kernel always runs all programs").  A key
> part of Toke's design with chain-calling, is that the kernel always
> runs all the XDP/BPF-progs in the chain. Regardless of the XDP return
> value.  The next program in the chain, need info about previous
> BPF-prog return value, but it can choose to override this.
> 
> > If we could use PROG_RUN_ARRAY for XDP that could have been a solution.
> > But we cannot. Return codes matter for XDP.
> 
> The proposal from Toke, is to allow next-chain BPF-program can override
> the prev BPF-prog return value.  This part of the design, which I must
> admit is also the only option due to tail-calls.  But I do think it
> makes sense, because even if XDP_DROP is returned, then I can install
> another XDP-prog that does XDP_REDIRECT out another interface to an
> analyzer box, or into an AF_XDP based dump tool.

Thanks,
Daniel
Toke Høiland-Jørgensen Oct. 16, 2019, 11:16 a.m. UTC | #17
Daniel Borkmann <daniel@iogearbox.net> writes:
> We do use cls_bpf heavily in Cilium, but I don't necessarily agree on
> the notorious difficult to use aspect (at least for tc + BPF): i) this
> is abstracted away from the /user/ entirely to the point that this is an
> implementation detail he doesn't need to know about, ii) these days most
> access to these hooks is done programmatically, if this is a worry, then
> lets simply add a cls_bpf pendant for APIs like bpf_set_link_xdp_fd() we
> have in libbpf where you only pass in ifindex, direction (ingress/egress)
> and priority of the program so that underneath it sets up cls_act qdisc
> with a cls_bpf instance that makes the whole thing foolproof, e.g.:
>
>   int bpf_set_link_tc_fd(int ifindex, int fd, enum bpf_tc_dir dir,
>                          __u32 priority, __u32 flags);

Basically, what I'm trying to achieve with XDP chain calls is to be able
to do something similar to this, but for XDP programs. Just with the
added ability to also select on return code...

>> Second, the multiple "independent programs", are actually not
>> independent, because the current running program must return
>> TC_ACT_UNSPEC to allow next bpf-prog to run.  Thus, it is not really
>> usable.
>
> I'd argue that unless the only thing you do in your debugging program is
> to introspect (read-only) the packet at the current point, you'd run into
> a similar coordination issue, meaning, the "independent programs" works
> for simple cases where you only have ACCEPT and DROP policy, such that
> you could run through all the programs and have precedence on DROP.
>
> But once you have conflicting policies with regards to how these programs
> mangle and redirect packets, how would you handle these?

I imagine that in most relevant cases this can be handled by convention;
the most obvious convention being "chain call on XDP_PASS". But still
allowing the admin to override this if they know what they are doing.

> I'd argue it's a non-trivial task to outsource if /admins/ are
> supposed to do manual order adjustments and more importantly to
> troubleshoot issues due to them. Potentially debugging hooks would
> make that easier to avoid recompilation, but it's more of a developer
> task.

Sure, in the general case this could become arbitrarily complex; but I
think that the feature can still be useful.

> Often times orchestration tools i) assume they just own the data path
> to reduce complexity in an already complex system and ii) also keep
> 'refreshing' their setup. One random example for the latter is k8s'
> kube-proxy that reinstalls its iptables rules every x sec, in order to
> make sure there was no manual messing around and to keep the data path
> eventually consistent with the daemon view (if they got borked).

This is actually the reason I want the kernel state to be the source of
truth (instead of keeping state in a userspace daemon). If the kernel
keeps the state it can enforce consistency, whereas a userspace daemon
has to be able to deal with things in the kernel changing underneath
it...

> How would you make the loader aware of daemons automatically
> refreshing/ reconfiguring their BPF progs in the situation where
> admins changed the pipeline, adding similar handle as tc so whoever
> does the 'chain' assembly know which one to update?

My idea for this was to implement atomic updates in the form of a
"cmpxchg" type of operation. I.e., we'd add a parameter to the syscall
where userspace could say "I want to install this program in place of
this one", and if that "old program" value doesn't match the current
state of the kernel, it can be rejected, atomically.
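
Purely as a sketch of those semantics (the function name below is invented
for illustration; nothing like it exists yet):

/* Hypothetical atomic-replace attach: if expected_old_fd matches the program
 * currently attached to ifindex, atomically swap in new_prog_fd; otherwise
 * fail, so the caller can re-read kernel state and retry. */
int xdp_attach_replace(int ifindex, int new_prog_fd, int expected_old_fd,
                       __u32 flags);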

-Toke
Toke Høiland-Jørgensen Oct. 16, 2019, 1:51 p.m. UTC | #18
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Mon, Oct 14, 2019 at 02:35:45PM +0200, Toke Høiland-Jørgensen wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
>> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> >> 
>> >> > Please implement proper indirect calls and jumps.
>> >> 
>> >> I am still not convinced this will actually solve our problem; but OK, I
>> >> can give it a shot.
>> >
>> > If you're not convinced let's talk about it first.
>> >
>> > Indirect calls is a building block for debugpoints.
>> > Let's not call them tracepoints, because Linus banned any discusion
>> > that includes that name.
>> > The debugpoints is a way for BPF program to insert points in its
>> > code to let external facility to do tracing and debugging.
>> >
>> > void (*debugpoint1)(struct xdp_buff *, int code);
>> > void (*debugpoint2)(struct xdp_buff *);
>> > void (*debugpoint3)(int len);
>> 
>> So how would these work? Similar to global variables (i.e., the loader
>> creates a single-entry PROG_ARRAY map for each one)? Presumably with
>> some BTF to validate the argument types?
>> 
>> So what would it take to actually support this? It doesn't quite sound
>> trivial to add?
>
> Depends on definition of 'trivial' :)

Well, I don't know... :)

> The kernel has a luxury of waiting until clean solution is implemented
> instead of resorting to hacks.

It would be helpful if you could give an opinion on what specific
features are missing in the kernel to support these indirect calls. A
few high-level sentences is fine (e.g., "the verifier needs to be able
to do X, and llvm/libbpf needs to have support for Y")... I'm trying to
gauge whether this is something it would even make sense for me to poke
into, or if I'm better off waiting for someone who actually knows what
they are doing to work on this :)

>> > If you disagree please explain _your_ problem again.
>> > Saying that fb katran is a use case for chaining is, hrm, not correct.
>> 
>> I never said Katran was the driver for this. I just used Katran as one
>> of the "prior art" examples for my "how are people solving running
>> multiple programs on the same interface" survey.
>
> and they solved it. that's the point.

Yes, in a way that's specific to Katran. This whole thing actually
started out as an effort to make something that's (a) generically useful
so everyone doesn't have to re-invent their own way of doing it, and (b)
interoperable in the case where there is no direct coordination between
the program authors.

The ability to chain a program per return code came out of (a), and the
need to not have to modify all programs involved came out of (b).

>> What I want to achieve is simply the ability to run multiple independent
>> XDP programs on the same interface, without having to put any
>> constraints on the programs themselves. I'm not disputing that this is
>> *possible* to do completely in userspace, I just don't believe the
>> resulting solution will be very good.
>
> What makes me uneasy about the whole push for program chaining
> is that tc cls_bpf supported multiple independent programs from day one.
> Yet it doesn't help to run two firewalls hooked into tc ingress.
> Similarly cgroup-bpf had a ton discussions on proper multi-prog api.
> Everyone was eventually convinced that it's flexible and generic.
> Yet people who started to use it complain that it's missing features
> to make it truly usable in production.

I'll go look at the cgroup-bpf API as well... Do you have any references
to those complaints?

-Toke
Toke Høiland-Jørgensen Oct. 17, 2019, 12:11 p.m. UTC | #19
Edward Cree <ecree@solarflare.com> writes:

> On 15/10/2019 17:42, Toke Høiland-Jørgensen wrote:
>> Edward Cree <ecree@solarflare.com> writes:
>>> On 14/10/2019 19:48, Toke Høiland-Jørgensen wrote:
>>>> So that will end up with a single monolithic BPF program being loaded
>>>> (from the kernel PoV), right? That won't do; we want to be able to go
>>>> back to the component programs, and manipulate them as separate kernel
>>>> objects.
>>> Why's that? (Since it also applies to the static-linking PoC I'm
>>> putting together.) What do you gain by having the components be
>>> kernel-visible?
>> Because then userspace will have to keep state to be able to answer
>> questions like "show me the list of programs that are currently loaded
>> (and their call chain)", or do operations like "insert this program into
>> the call chain at position X".
> Userspace keeps state for stuff all the time.  We call them "daemons" ;)
> Now you might have arguments for why putting a given piece of state in
>  userspace is a bad idea — there's a reason why not everything is a
>  microkernel — but those arguments need to be made.
>
>> We already keep all this state in the kernel,
> The kernel keeps the state of "current (monolithic) BPF program loaded
>  (against each hook)".  Prior to this patch series, the kernel does
>  *not* keep any state on what that BPF program was made of (except in
>  the sense of BTF debuginfos, which a linker could combine appropriately).
>
> So if we _don't_ add your chained-programs functionality into the kernel,
>  and then _do_ implement userspace linking, then there isn't any
>  duplicated functionality or even duplicated state — the userland state
>  is "what are my components and what's the linker invocation that glues
>  them together", the kernel state is "here is one monolithic BPF blob,
>  along with a BTF blob to debug it".  The kernel knows nothing of the
>  former, and userspace doesn't store (but knows how to recreate) the
>  latter.

I think there's a conceptual disconnect here in how we view what an XDP
program is. In my mind, an XDP program is a stand-alone entity tied to a
particular application; not a library function that can just be inserted
into another program. Thus, what you're proposing sounds to me like the
equivalent of saying "we don't want to do process management in the
kernel; the init process should just link in all the programs userspace
wants to run". Which is technically possible; but that doesn't make it a
good idea.

Setting aside that for a moment; the reason I don't think this belongs
in userspace is that putting it there would carry a complexity cost that
is higher than having it in the kernel. Specifically, if we do implement
an 'xdpd' daemon to handle all this, that would mean that we:

- Introduce a new, separate code base that we'll have to write, support
  and manage updates to.

- Add a new dependency to using XDP (now you not only need the kernel
  and libraries, you'll also need the daemon).

- Have to duplicate or wrap functionality currently found in the kernel;
  at least:
  
    - Keeping track of which XDP programs are loaded and attached to
      each interface (as well as the "new state" of their attachment
      order).

    - Some kind of interface with the verifier; if an app does
      xdpd_rpc_load(prog), how is the verifier result going to get back
      to the caller?

- Have to deal with state synchronisation issues (how does xdpd handle
  kernel state changing from underneath it?).


While these are issues that are (probably) all solvable, I think the
cost of solving them is far higher than putting the support into the
kernel. Which is why I think kernel support is the best solution :)

-Toke
Alexei Starovoitov Oct. 19, 2019, 8:09 p.m. UTC | #20
On Wed, Oct 16, 2019 at 03:51:52PM +0200, Toke Høiland-Jørgensen wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Mon, Oct 14, 2019 at 02:35:45PM +0200, Toke Høiland-Jørgensen wrote:
> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >> 
> >> > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
> >> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >> >> 
> >> >> > Please implement proper indirect calls and jumps.
> >> >> 
> >> >> I am still not convinced this will actually solve our problem; but OK, I
> >> >> can give it a shot.
> >> >
> >> > If you're not convinced let's talk about it first.
> >> >
> >> > Indirect calls is a building block for debugpoints.
> >> > Let's not call them tracepoints, because Linus banned any discussion
> >> > that includes that name.
> >> > The debugpoints is a way for BPF program to insert points in its
> >> > code to let external facility to do tracing and debugging.
> >> >
> >> > void (*debugpoint1)(struct xdp_buff *, int code);
> >> > void (*debugpoint2)(struct xdp_buff *);
> >> > void (*debugpoint3)(int len);
> >> 
> >> So how would these work? Similar to global variables (i.e., the loader
> >> creates a single-entry PROG_ARRAY map for each one)? Presumably with
> >> some BTF to validate the argument types?
> >> 
> >> So what would it take to actually support this? It doesn't quite sound
> >> trivial to add?
> >
> > Depends on definition of 'trivial' :)
> 
> Well, I don't know... :)
> 
> > The kernel has a luxury of waiting until clean solution is implemented
> > instead of resorting to hacks.
> 
> It would be helpful if you could give an opinion on what specific
> features are missing in the kernel to support these indirect calls. A
> few high-level sentences is fine (e.g., "the verifier needs to be able
> to do X, and llvm/libbpf needs to have support for Y")... I'm trying to
> gauge whether this is something it would even make sense for me to poke
> into, or if I'm better off waiting for someone who actually knows what
> they are doing to work on this :)

I have to reveal a secret first...
llvm supports indirect calls since 2017 ;)

It can compile the following:
struct trace_kfree_skb {
	struct sk_buff *skb;
	void *location;
};

typedef void (*fn)(struct sk_buff *skb);
static fn func;

SEC("tp_btf/kfree_skb")
int trace_kfree_skb(struct trace_kfree_skb *ctx)
{
	struct sk_buff *skb = ctx->skb;
	fn f = *(volatile fn *)&func;

	if (f)
		f(skb);
	return 0;
}

into proper BPF assembly:
; 	struct sk_buff *skb = ctx->skb;
       0:	79 11 00 00 00 00 00 00	r1 = *(u64 *)(r1 + 0)
; 	fn f = *(volatile fn *)&func;
       1:	18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00	r2 = 0 ll
       3:	79 22 00 00 00 00 00 00	r2 = *(u64 *)(r2 + 0)
; 	if (f)
       4:	15 02 01 00 00 00 00 00	if r2 == 0 goto +1 <LBB0_2>
; 		f(skb);
       5:	8d 00 00 00 02 00 00 00	callx 2
0000000000000030 LBB0_2:
; 	return 0;
       6:	b7 00 00 00 00 00 00 00	r0 = 0
       7:	95 00 00 00 00 00 00 00	exit

Indirect call is encoded as JMP|CALL|X
Normal call is JMP|CALL|K

What's left to do is to teach the verifier to parse BTF of global data.
Then teach it to recognize that r2 at insn 1 is PTR_TO_BTF_ID
where btf_id is DATASEC '.bss'
Then load r2+0 is also PTR_TO_BTF_ID where btf_id is VAR 'func'.
New bool flag to reg_state is needed to tell whether if(rX==NULL) check
was completed.
Then at insn 5 the verifier will see that R2 is PTR_TO_BTF_ID and !NULL
and it's a pointer to a function.
Depending on function prototype the verifier would need to check that
R1's type match to arg1 of func proto.
For simplicity we don't need to deal with pointers to stack,
pointers to map, etc. Only PTR_TO_BTF_ID where btf_id is a kernel
data structure or scalar is enough to get a lot of mileage out of
this indirect call feature.
That's mostly it.

Few other safety checks would be needed to make sure that writes
into 'r2+0' are also of correct type.
We also need partial map_update bpf_sys command to populate
function pointer with another bpf program that has matching
function proto.

I think it's not trivial verifier work, but not hard either.
I'm happy to do it as soon as I find time to work on it.
Toke Høiland-Jørgensen Oct. 20, 2019, 10:58 a.m. UTC | #21
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Wed, Oct 16, 2019 at 03:51:52PM +0200, Toke Høiland-Jørgensen wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > On Mon, Oct 14, 2019 at 02:35:45PM +0200, Toke Høiland-Jørgensen wrote:
>> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> >> 
>> >> > On Wed, Oct 09, 2019 at 10:03:43AM +0200, Toke Høiland-Jørgensen wrote:
>> >> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> >> >> 
>> >> >> > Please implement proper indirect calls and jumps.
>> >> >> 
>> >> >> I am still not convinced this will actually solve our problem; but OK, I
>> >> >> can give it a shot.
>> >> >
>> >> > If you're not convinced let's talk about it first.
>> >> >
>> >> > Indirect calls is a building block for debugpoints.
> >> >> > Let's not call them tracepoints, because Linus banned any discussion
>> >> > that includes that name.
>> >> > The debugpoints is a way for BPF program to insert points in its
>> >> > code to let external facility to do tracing and debugging.
>> >> >
>> >> > void (*debugpoint1)(struct xdp_buff *, int code);
>> >> > void (*debugpoint2)(struct xdp_buff *);
>> >> > void (*debugpoint3)(int len);
>> >> 
>> >> So how would these work? Similar to global variables (i.e., the loader
>> >> creates a single-entry PROG_ARRAY map for each one)? Presumably with
>> >> some BTF to validate the argument types?
>> >> 
>> >> So what would it take to actually support this? It doesn't quite sound
>> >> trivial to add?
>> >
>> > Depends on definition of 'trivial' :)
>> 
>> Well, I don't know... :)
>> 
>> > The kernel has a luxury of waiting until clean solution is implemented
>> > instead of resorting to hacks.
>> 
>> It would be helpful if you could give an opinion on what specific
>> features are missing in the kernel to support these indirect calls. A
>> few high-level sentences is fine (e.g., "the verifier needs to be able
>> to do X, and llvm/libbpf needs to have support for Y")... I'm trying to
>> gauge whether this is something it would even make sense for me to poke
>> into, or if I'm better off waiting for someone who actually knows what
>> they are doing to work on this :)
>
> I have to reveal a secret first...
> llvm supports indirect calls since 2017 ;)
>
> It can compile the following:
> struct trace_kfree_skb {
> 	struct sk_buff *skb;
> 	void *location;
> };
>
> typedef void (*fn)(struct sk_buff *skb);
> static fn func;
>
> SEC("tp_btf/kfree_skb")
> int trace_kfree_skb(struct trace_kfree_skb *ctx)
> {
> 	struct sk_buff *skb = ctx->skb;
> 	fn f = *(volatile fn *)&func;
>
> 	if (f)
> 		f(skb);
> 	return 0;
> }
>
> into proper BPF assembly:
> ; 	struct sk_buff *skb = ctx->skb;
>        0:	79 11 00 00 00 00 00 00	r1 = *(u64 *)(r1 + 0)
> ; 	fn f = *(volatile fn *)&func;
>        1:	18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00	r2 = 0 ll
>        3:	79 22 00 00 00 00 00 00	r2 = *(u64 *)(r2 + 0)
> ; 	if (f)
>        4:	15 02 01 00 00 00 00 00	if r2 == 0 goto +1 <LBB0_2>
> ; 		f(skb);
>        5:	8d 00 00 00 02 00 00 00	callx 2
> 0000000000000030 LBB0_2:
> ; 	return 0;
>        6:	b7 00 00 00 00 00 00 00	r0 = 0
>        7:	95 00 00 00 00 00 00 00	exit
>
> Indirect call is encoded as JMP|CALL|X
> Normal call is JMP|CALL|K

Right, cool! So this would be handled like regular (data) global
variables, with libbpf populating a map to store the value behind the
scenes? 

> What's left to do is to teach the verifier to parse BTF of global data.
> Then teach it to recognize that r2 at insn 1 is PTR_TO_BTF_ID
> where btf_id is DATASEC '.bss'
> Then load r2+0 is also PTR_TO_BTF_ID where btf_id is VAR 'func'.
> New bool flag to reg_state is needed to tell whether if(rX==NULL) check
> was completed.
> Then at insn 5 the verifier will see that R2 is PTR_TO_BTF_ID and !NULL
> and it's a pointer to a function.
> Depending on function prototype the verifier would need to check that
> R1's type match to arg1 of func proto.
> For simplicity we don't need to deal with pointers to stack,
> pointers to map, etc. Only PTR_TO_BTF_ID where btf_id is a kernel
> data structure or scalar is enough to get a lot of mileage out of
> this indirect call feature.

OK, so this means that explicit map lookups could be added later? I.e.,
this:

int trace_kfree_skb(struct trace_kfree_skb *ctx)
{
	struct sk_buff *skb = ctx->skb;
	u32 key = 0;
	fn f;

	f = bpf_map_lookup_elem(&fpointer_map, &key);
	if (f)
		f(skb);
	return 0;
}

would need some more work, right?

> That's mostly it.
>
> Few other safety checks would be needed to make sure that writes
> into 'r2+0' are also of correct type.
> We also need partial map_update bpf_sys command to populate
> function pointer with another bpf program that has matching
> function proto.
>
> I think it's not trivial verifier work, but not hard either.
> I'm happy to do it as soon as I find time to work on it.

Great! I think it's probably more productive for everyone involved if I
just wait for you to get around to this, rather than try my own hand at
this. I'll go hash out the userspace management semantics of chain calls
in the meantime (using my kernel support patch), since that will surely
be needed anyway.

-Toke
Edward Cree Oct. 21, 2019, 11:51 p.m. UTC | #22
On 15/10/2019 19:33, Edward Cree wrote:
> But I think we'll
>  need to prototype things with static linking first so that we can be
>  sure of the linker semantics we want, before we try to put a new dynamic
>  linker in the kernel.
For anyone wanting to follow my progress on this, the first-draft eBPF
 linker 'ebld.py' can now be found at
 https://github.com/solarflarecom/ebpf_asm/tree/linker

It's able to resolve inter-object calls (at least in a _really simple_ test
 I did, where prog A was "call pass_fn; exit" and prog B was "pass_fn:
 ld r0, XDP_PASS; exit"), but I haven't got as far as feeding the resulting
 object file to the kernel (no obvious reason that shouldn't work, I just
 haven't tried it yet).
What it _doesn't_ do yet is deal with BTF — it just silently discards any
 BTF or BTF.ext sections in the input object files.  I'll need to re-read
 the BTF spec to see what's changed there since last I was involved (I hope
 the spec has been kept up to date as BTF has evolved...!)

But with the basic linker there, I think a prototype daemon is probably a
 higher priority than getting fully-featured with BTF, so that's what I
 plan to do next.

-Ed

PS: Yes, it's in Python.  I started out in C, and quickly backed myself
 into a corner trying to keep the data structures simple.  Having first-
 class dictionaries Really Helps.
Edward Cree Oct. 22, 2019, 5:27 p.m. UTC | #23
On 17/10/2019 13:11, Toke Høiland-Jørgensen wrote:
> I think there's a conceptual disconnect here in how we view what an XDP
> program is. In my mind, an XDP program is a stand-alone entity tied to a
> particular application; not a library function that can just be inserted
> into another program.
To me, an XDP (or any other eBPF) program is a function that is already
 being 'inserted into another program', namely, the kernel.  It's a
 function that's being wired up to a hook in the kernel.  Which isn't
 so different to wiring it up to a hook in a function that's wired up to
 a hook in the kernel (which is what my proposal effectively does).

> Setting aside that for a moment; the reason I don't think this belongs
> in userspace is that putting it there would carry a complexity cost that
> is higher than having it in the kernel.
Complexity in the kernel is more expensive than in userland.  There are
 several reasons for this, such as:
* The kernel's reliability requirements are stricter — a daemon that
  crashes can be restarted, a kernel that crashes ruins your day.
* Userland has libraries available for many common tasks that can't be
  used in the kernel.
* Anything ABI-visible (which this would be) has to be kept forever even
  if it turns out to be a Bad Idea™, because We Do Not Break Userspace™.
The last of these is the big one, and means that wherever possible the
 proper course is to prototype functionality in userspace, and then once
 the ABI is solid and known-useful, it can move to the kernel if there's
 an advantage to doing so (typically performance).  Yes, that means
 applications may have to change twice (though hopefully just a matter
 of building against a new libbpf), but the old applications can be kept
 working (by keeping the daemon around on such systems).

> Specifically, if we do implement
> an 'xdpd' daemon to handle all this, that would mean that we:
>
> - Introduce a new, separate code base that we'll have to write, support
>   and manage updates to.
Separation is a good thing.  Whichever way we do this, we have to write
 some new code.  Having that code _outside_ the kernel tree helps to keep
 our layers separate.  Chain calling is a layering violation!

> - Add a new dependency to using XDP (now you not only need the kernel
>   and libraries, you'll also need the daemon).
You'll need *a* daemon.  You won't be tied to a specific implementation.
And if you're just developing, you won't even need that — you can still
 bind a prog directly to the device if you have the ackles — so it's
 only for application deployment that it's needed.  By the time you're
 at the point of deploying an application that people are going to be
 installing with "yum install myFirewall", you have the whole package
 manager dependency resolution system to deal with the daemon.

> - Have to duplicate or wrap functionality currently found in the kernel;
>   at least:
>   
>     - Keeping track of which XDP programs are loaded and attached to
>       each interface
There's already an API to query this.  You would probably want an atomic
 cmpxchg operation, so that you can detect if someone else is fiddling
 with XDP and scream noisy warnings.

> (as well as the "new state" of their attachment order).
That won't be duplicate, because it won't be in the kernel.  The kernel
 will only ever see one blob and it doesn't know or care how userland
 assembled it.

>     - Some kind of interface with the verifier; if an app does
>       xdpd_rpc_load(prog), how is the verifier result going to get back
>       to the caller?
The daemon will get the verifier log back when it tries to update the
 program; it might want to do a bit of translation before passing it on,
 but an RPC call can definitely return errors to the caller.
In the Ideal World of kernel dynamic linking, of course, each app prog
 gets submitted to the verifier by the app to create a floating function
 in the kernel that's not bound to any XDP hook (app gets its verifier
 responses at this point) and then the app just sends an fd for that
 function to the daemon; at that point any verifier errors after linking
 are the fault of the daemon and its master program.  Thus the Ideal
 World doesn't need any kind of translation of verifier output to make
 it match up with individual app's program.

> - Have to deal with state synchronisation issues (how does xdpd handle
>   kernel state changing from underneath it?).
The cmpxchg I mentioned above would help with that.

> While these are issues that are (probably) all solvable, I think the
> cost of solving them is far higher than putting the support into the
> kernel. Which is why I think kernel support is the best solution :)
See my remarks above about kernel ABIs.
Also, chain calling and the synchronisation dance between apps still
 looks needlessly complex and fragile to me — it's like you're having
 the kernel there to be the central point of control and then not
 actually having a central point of control after all.  (But if chain
 calling does turn out to be the right API, well, the daemon can
 always implement that!)

-Ed
Toke Høiland-Jørgensen Oct. 22, 2019, 6:07 p.m. UTC | #24
Edward Cree <ecree@solarflare.com> writes:

> On 17/10/2019 13:11, Toke Høiland-Jørgensen wrote:
>> I think there's a conceptual disconnect here in how we view what an XDP
>> program is. In my mind, an XDP program is a stand-alone entity tied to a
>> particular application; not a library function that can just be inserted
>> into another program.

> To me, an XDP (or any other eBPF) program is a function that is already
>  being 'inserted into another program', namely, the kernel.  It's a
>  function that's being wired up to a hook in the kernel.  Which isn't
>  so different to wiring it up to a hook in a function that's wired up to
>  a hook in the kernel (which is what my proposal effectively does).

Yes, see: Different mental models leading to different notions of what
is the natural way to do things :)

>> Setting aside that for a moment; the reason I don't think this belongs
>> in userspace is that putting it there would carry a complexity cost that
>> is higher than having it in the kernel.
> Complexity in the kernel is more expensive than in userland.  There are
>  several reasons for this, such as:
> * The kernel's reliability requirements are stricter — a daemon that
>   crashes can be restarted, a kernel that crashes ruins your day.
> * Userland has libraries available for many common tasks that can't be
>   used in the kernel.
> * Anything ABI-visible (which this would be) has to be kept forever even
>   if it turns out to be a Bad Idea™, because We Do Not Break
> Userspace™.

To me, the first and last of those are actually arguments for putting
this into the kernel (from the consuming application's PoV). Surely we
want something that's reliable and well-supported? ;)

> The last of these is the big one, and means that wherever possible the
> proper course is to prototype functionality in userspace, and then
> once the ABI is solid and known-useful, it can move to the kernel if
> there's an advantage to doing so

To me, the prototyping was the tail call-based stuff Facebook and
Cloudflare has been doing, and this is an attempt to synthesise that
into something that we can actually agree to support as part of the XDP
feature set.

>> - Add a new dependency to using XDP (now you not only need the kernel
>>   and libraries, you'll also need the daemon).
> You'll need *a* daemon. You won't be tied to a specific
> implementation.

But the point is that I *want* this to be a specific implementation; or
rather, a specific interface. I.e., I want this to be an interface that
people can rely on being available, rather than have a proliferation of
slightly different ways to achieve this that are subtly incompatible.

>>     - Keeping track of which XDP programs are loaded and attached to
>>       each interface
> There's already an API to query this.

There's an API, but the daemon still has to deal with it.

> You would probably want an atomic cmpxchg operation, so that you can
> detect if someone else is fiddling with XDP and scream noisy warnings.

Yup, probably going to do some sort of atomic program replace API in any
case :)

>>     - Some kind of interface with the verifier; if an app does
>>       xdpd_rpc_load(prog), how is the verifier result going to get back
>>       to the caller?
> The daemon will get the verifier log back when it tries to update the
> program; it might want to do a bit of translation before passing it
> on, but an RPC call can definitely return errors to the caller.

I wasn't implying that it was impossible to report errors over RPC.
I was saying that your "a bit of translation" is not as trivial as you
make it out to be...


> In the Ideal World of kernel dynamic linking, of course, each app prog
> gets submitted to the verifier by the app to create a floating function
> in the kernel that's not bound to any XDP hook (app gets its verifier
> responses at this point)

I believe this is what Alexei means by "indirect calls". That is
different, though, because it implies that each program lives as a
separate object in the kernel - and so it might actually work. What you
were talking about (until this paragraph) was something that was
entirely in userspace, and all the kernel sees is a blob of the eBPF
equivalent of `cat *.so > my_composite_prog.so`.

-Toke
Alexei Starovoitov Oct. 25, 2019, 4:30 p.m. UTC | #25
On Sun, Oct 20, 2019 at 3:58 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> Great! I think it's probably more productive for everyone involved if I
> just wait for you to get around to this, rather than try my own hand at
> this. I'll go hash out the userspace management semantics of chain calls
> in the meantime (using my kernel support patch), since that will surely
> be needed anyway.

No problem. Indirect calls are next on my todo list.
Shouldn't take long.

Right now I'm hacking accelerated kretprobes for tracing.
I've shared first rough draft of patches with few folks and they
quickly pointed out that the same logic can be used to
create arbitrary call chains.
To make kretprobe+bpf fast I'm replacing:
  call foo
with
  prologue
  capture args
  call foo
  capture return
  call bpf_prog
  epilogue
That "call foo" can be anything. Including another bpf prog.
Patches are too rough for public review.
I hope to get them cleaned up by the end of next week.

Jesper's request to capture both struct xdp_md and the return
value after prog returns will be possible soon.
xdpdump before and after xdp prog.. here I come :)
Toke Høiland-Jørgensen Oct. 27, 2019, 12:15 p.m. UTC | #26
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Sun, Oct 20, 2019 at 3:58 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> Great! I think it's probably more productive for everyone involved if I
>> just wait for you to get around to this, rather than try my own hand at
>> this. I'll go hash out the userspace management semantics of chain calls
>> in the meantime (using my kernel support patch), since that will surely
>> be needed anyway.
>
> No problem. Indirect calls are next on my todo list.
> Shouldn't take long.

Awesome!

> Right now I'm hacking accelerated kretprobes for tracing.
> I've shared first rough draft of patches with few folks and they
> quickly pointed out that the same logic can be used to
> create arbitrary call chains.
> To make kretprobe+bpf fast I'm replacing:
>   call foo
> with
>   prologue
>   capture args
>   call foo
>   capture return
>   call bpf_prog
>   epilogue
> That "call foo" can be anything. Including another bpf prog.
> Patches are too rough for public review.
> I hope to get them cleaned up by the end of next week.
>
> Jesper's request to capture both struct xdp_md and the return
> value after prog returns will be possible soon.
> xdpdump before and after xdp prog.. here I come :)

Sounds promising! Looking forward to seeing where this is going :)

-Toke
Alexei Starovoitov Nov. 12, 2019, 2:51 a.m. UTC | #27
On Tue, Oct 22, 2019 at 08:07:42PM +0200, Toke Høiland-Jørgensen wrote:
> 
> I believe this is what Alexei means by "indirect calls". That is
> different, though, because it implies that each program lives as a
> separate object in the kernel - and so it might actually work. What you
> were talking about (until this paragraph) was something that was
> entirely in userspace, and all the kernel sees is a blob of the eBPF
> equivalent of `cat *.so > my_composite_prog.so`.

So I've looked at indirect calls and realized that they're _indirect_ calls.
The retpoline overhead will be around, so a solution has to work without them.
I still think they're necessary for all sorts of things, but priority shifted.

I think what Ed is proposing with static linking is the best generic solution.
The chaining policy doesn't belong in the kernel. A user space can express the
chaining logic in the form of BPF program. Static linking achieves that. There
could be a 'root' bpf program (let's call it rootlet.o) that looks like:
int xdp_firewall_placeholder1(struct xdp_md *ctx)
{
   return XDP_PASS;
}
int xdp_firewall_placeholder2(struct xdp_md *ctx)
{
   return XDP_PASS;
}
int xdp_load_balancer_placeholder1(struct xdp_md *ctx)
{
   return XDP_PASS;
}
int main_xdp_prog(struct xdp_md *ctx)
{
   int ret;

   ret = xdp_firewall_placeholder1(ctx);
   switch (ret) {
   case XDP_PASS: break;
   case XDP_DROP: return XDP_DROP;
   case XDP_TX: case XDP_REDIRECT:
      /* buggy firewall */
      bpf_perf_event_output(ctx,...);
   default: break; /* or whatever else */
   }
   
   ret = xdp_firewall_placeholder2(ctx);
   switch (ret) {
   case XDP_PASS: break;
   case XDP_DROP: return XDP_DROP;
   default: break;
   }

   ret = xdp_load_balancer_placeholder1(ctx);
   switch (ret) {
   case XDP_PASS: break;
   case XDP_DROP: return XDP_DROP;
   case XDP_TX: return XDP_TX;
   case XDP_REDIRECT: return XDP_REDIRECT;
   default: break; /* or whatever else */
   }
   return XDP_PASS;
}

When firewall1.rpm is installed it needs to use either a central daemon or
common library (let's call it libxdp.so) that takes care of orchestration. The
library would need to keep a state somewhere (like local file or a database).
The state will include rootlet.o and new firewall1.o that wants to be linked
into the existing program chain. When firewall2.rpm gets installed it calls the
same libxdp.so functions that operate on shared state. libxdp.so needs to link
firewall1.o + firewall2.o + rootlet.o into one program and attach it to netdev.
This is static linking. The existing kernel infrastructure already supports
such model and I think it's enough for a lot of use cases. In particular fb's
firewall+katran XDP style will fit right in. But bpf_tail_calls are
incompatible with bpf2bpf calls that static linking will use and I think
Cloudflare folks expressed interest in using them for some reason even within a
single firewall? So we need to improve the model a bit.

We can introduce dynamic linking. The second part of 'BPF trampoline' patches
allows tracing programs to attach to other BPF programs. The idea of dynamic
linking is to replace a program or subprogram instead of attaching to it.
The firewall1.rpm application will still use libxdp.so, but instead of statically
linking it will ask kernel to replace a subprogram rootlet_fd +
btf_id_of_xdp_firewall_placeholder1 with new firewall1.o. The same interface is
used for attaching tracing prog to networking prog. Initially I plan to keep
the verifier job simple and allow replacing xdp-equivalent subprogram with xdp
program. Meaning that subprogram (in above case xdp_firewall_placeholder1)
needs to have exactly one argument and it has to be 'struct xdp_md *'. Then
during the loading of firewall1.o the verifier wouldn't need to re-verify the
whole thing. BTF type matching that the verifier is doing as part of 'BPF
trampoline' series will be reused for this purpose. Longer term I'd like to
allow more than one argument while preserving partial verification model.
The rootlet.o calls into firewall1.o directly. So no retpoline to worry about
and firewall1.o can use bpf_tail_call() if it wants so. That tail_call will
still return back to rootlet.o which will make policy decision. This rootlet.o
can be automatically generated by libxdp.so. If in the future we figure out how
to do two load-balancers libxdp.so will be able to accommodate that new policy.
This firewall1.o can be developed and tested independently of other xdp
programs. The key gotcha here is that the verifier needs to allow more than 512
bytes of stack usage for the rootlet.o. I think that's acceptable.

In the future indirect calls will allow rootlet.o to be cleaner:
typedef int (*ptr_to_xdp_prog)(struct xdp_md *ctx);
ptr_to_xdp_prog prog_array[100];
int main_xdp_prog(struct xdp_md *ctx)
{
   int ret, i;

   for (i = 0; i < 100; i++) {
       ret = prog_array[i](ctx);
       switch (ret) {
       case XDP_PASS: break;
       case XDP_DROP: return XDP_DROP;
       ..
   }
}
but they're indirect calls and retpoline. Hence lower priority atm.

Thoughts?
Toke Høiland-Jørgensen Nov. 12, 2019, 4:20 p.m. UTC | #28
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Tue, Oct 22, 2019 at 08:07:42PM +0200, Toke Høiland-Jørgensen wrote:
>> 
>> I believe this is what Alexei means by "indirect calls". That is
>> different, though, because it implies that each program lives as a
>> separate object in the kernel - and so it might actually work. What you
>> were talking about (until this paragraph) was something that was
>> entirely in userspace, and all the kernel sees is a blob of the eBPF
>> equivalent of `cat *.so > my_composite_prog.so`.
>
> So I've looked at indirect calls and realized that they're _indirect_ calls.
> The retpoline overhead will be around, so a solution has to work without them.
> I still think they're necessary for all sorts of things, but priority shifted.
>
> I think what Ed is proposing with static linking is the best generic solution.
> The chaining policy doesn't belong in the kernel. A user space can express the
> chaining logic in the form of BPF program. Static linking achieves that. There
> could be a 'root' bpf program (let's call it rootlet.o) that looks like:
> int xdp_firewall_placeholder1(struct xdp_md *ctx)
> {
>    return XDP_PASS;
> }
> int xdp_firewall_placeholder2(struct xdp_md *ctx)
> {
>    return XDP_PASS;
> }
> int xdp_load_balancer_placeholder1(struct xdp_md *ctx)
> {
>    return XDP_PASS;
> }
> int main_xdp_prog(struct xdp_md *ctx)
> {
>    int ret;
>
>    ret = xdp_firewall_placeholder1(ctx);
>    switch (ret) {
>    case XDP_PASS: break;
>    case XDP_DROP: return XDP_DROP;
>    case XDP_TX: case XDP_REDIRECT:
>       /* buggy firewall */
>       bpf_perf_event_output(ctx,...);
>    default: break; /* or whatever else */
>    }
>    
>    ret = xdp_firewall_placeholder2(ctx);
>    switch (ret) {
>    case XDP_PASS: break;
>    case XDP_DROP: return XDP_DROP;
>    default: break;
>    }
>
>    ret = xdp_load_balancer_placeholder1(ctx);
>    switch (ret) {
>    case XDP_PASS: break;
>    case XDP_DROP: return XDP_DROP;
>    case XDP_TX: return XDP_TX;
>    case XDP_REDIRECT: return XDP_REDIRECT;
>    default: break; /* or whatever else */
>    }
>    return XDP_PASS;
> }
>
> When firewall1.rpm is installed it needs to use either a central daemon or
> common library (let's call it libxdp.so) that takes care of orchestration. The
> library would need to keep a state somewhere (like local file or a database).
> The state will include rootlet.o and new firewall1.o that wants to be linked
> into the existing program chain. When firewall2.rpm gets installed it calls the
> same libxdp.so functions that operate on shared state. libxdp.so needs to link
> firewall1.o + firewall2.o + rootlet.o into one program and attach it to netdev.
> This is static linking. The existing kernel infrastructure already supports
> such model and I think it's enough for a lot of use cases. In particular fb's
> firewall+katran XDP style will fit right in. But bpf_tail_calls are
> incompatible with bpf2bpf calls that static linking will use and I think
> Cloudflare folks expressed interest in using them for some reason even within a
> single firewall? So we need to improve the model a bit.
>
> We can introduce dynamic linking. The second part of 'BPF trampoline'
> patches allows tracing programs to attach to other BPF programs. The
> idea of dynamic linking is to replace a program or subprogram instead
> of attaching to it. The firewall1.rpm application will still use
> libxdp.so, but instead of statically linking it will ask kernel to
> replace a subprogram rootlet_fd + btf_id_of_xdp_firewall_placeholder1
> with new firewall1.o. The same interface is used for attaching tracing
> prog to networking prog.

Hmm, let's see if I'm understanding you correctly. In this model, to
attach program #2 (assuming the first one is already loaded on an
interface), userspace would need to do something like:

old_fd = get_xdp_fd(eth0);
new_fd = load_bpf("newprog.o"); // verifies newprog.o
proglet = load_bpf("xdp-run-2-progs.o"); // or dynamically generate this
replace_subprog(proglet, 0, old_fd); // similar to map FD replacement?
replace_subprog(proglet, 1, new_fd);
proglet_fd = load_bpf(proglet); // verifier reuses sub-fd prog verdicts

bpf_tracing_prog_attach(old_fd, proglet_fd, FLAG_REPLACE);


So the two component programs would still exist as kernel objects,
right? And the trampolines would keep individual stats for each one (if
BPF stats are enabled)? Could userspace also extract the prog IDs being
referenced by the "glue" proglet? Similar to how bpftool shows which map
IDs a program refers to today?

What about attaching a third program? Would that work by recursion (as
above, but with the old proglet as old_fd), or should the library build
a whole new sequence from the component programs?

Finally, what happens if someone were to try to attach a retprobe to
one of the component programs? Could it be possible to do that even
while program is being run from proglet dispatch? That way we can still
debug an individual XDP program even though it's run as part of a chain.

> Initially I plan to keep the verifier job simple and allow replacing
> xdp-equivalent subprogram with xdp program. Meaning that subprogram
> (in above case xdp_firewall_placeholder1) needs to have exactly one
> argument and it has to be 'struct xdp_md *'.

That's fine.

> Then during the loading of firewall1.o the verifier wouldn't need to
> re-verify the whole thing. BTF type matching that the verifier is
> doing as part of 'BPF trampoline' series will be reused for this
> purpose. Longer term I'd like to allow more than one argument while
> preserving partial verification model. The rootlet.o calls into
> firewall1.o directly. So no retpoline to worry about and firewall1.o
> can use bpf_tail_call() if it wants so. That tail_call will still
> return back to rootlet.o which will make policy decision. This
> rootlet.o can be automatically generated by libxdp.so.

Sounds reasonable. Any reason libxdp.so couldn't be part of libbpf?

> If in the future we figure out how to do two load-balancers libxdp.so
> will be able to accommodate that new policy.

Yeah, it would be cool if we could move things across CPUs; like with
cpumap, but executing another XDP program on the target CPU.

> This firewall1.o can be developed and tested independently of other
> xdp programs. The key gotcha here is that the verifier needs to allow
> more than 512 bytes of stack usage for the rootlet.o. I think that's
> acceptable.

Right, cool.

> In the future indirect calls will allow rootlet.o to be cleaner:
> typedef int (*ptr_to_xdp_prog)(struct xdp_md *ctx);
> ptr_to_xdp_prog prog_array[100];
> int main_xdp_prog(struct xdp_md *ctx)
> {
>    int ret, i;
>
>    for (i = 0; i < 100; i++) {
>        ret = prog_array[i](ctx);
>        switch (ret) {
>        case XDP_PASS: break;
>        case XDP_DROP: return XDP_DROP;
>        ..
>    }
> }
> but they're indirect calls and retpoline. Hence lower priority atm.

Yes, this was what I was envisioning when you first said 'indirect
calls'. This would be wonderfully flexible... But a shame about the
indirect calls, performance-wise.

-Toke
Edward Cree Nov. 12, 2019, 4:32 p.m. UTC | #29
On 12/11/2019 02:51, Alexei Starovoitov wrote:
> There
> could be a 'root' bpf program (let's call it rootlet.o) that looks like:
This looks a lot like what I had in mind...
> We can introduce dynamic linking. The second part of 'BPF trampoline' patches
> allows tracing programs to attach to other BPF programs. The idea of dynamic
> linking is to replace a program or subprogram instead of attaching to it.
... as does this, particularly the "partial verification" / verify a subprog
 as a separate unit with its own contract.
> The rootlet.o calls into firewall1.o directly. So no retpoline to worry about
> and firewall1.o can use bpf_tail_call() if it wants so. That tail_call will
> still return back to rootlet.o
Yep, that's a really nice gain that comes out of partial verification.

+1 to this whole proposal.

-Ed
Alexei Starovoitov Nov. 12, 2019, 7:52 p.m. UTC | #30
On Tue, Nov 12, 2019 at 05:20:07PM +0100, Toke Høiland-Jørgensen wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Tue, Oct 22, 2019 at 08:07:42PM +0200, Toke Høiland-Jørgensen wrote:
> >> 
> >> I believe this is what Alexei means by "indirect calls". That is
> >> different, though, because it implies that each program lives as a
> >> separate object in the kernel - and so it might actually work. What you
> >> were talking about (until this paragraph) was something that was
> >> entirely in userspace, and all the kernel sees is a blob of the eBPF
> >> equivalent of `cat *.so > my_composite_prog.so`.
> >
> > So I've looked at indirect calls and realized that they're _indirect_ calls.
> > The retpoline overhead will be around, so a solution has to work without them.
> > I still think they're necessary for all sorts of things, but priority shifted.
> >
> > I think what Ed is proposing with static linking is the best generic solution.
> > The chaining policy doesn't belong in the kernel. A user space can express the
> > chaining logic in the form of BPF program. Static linking achieves that. There
> > could be a 'root' bpf program (let's call it rootlet.o) that looks like:
> > int xdp_firewall_placeholder1(struct xdp_md *ctx)
> > {
> >    return XDP_PASS;
> > }
> > int xdp_firewall_placeholder2(struct xdp_md *ctx)
> > {
> >    return XDP_PASS;
> > }
> > int xdp_load_balancer_placeholder1(struct xdp_md *ctx)
> > {
> >    return XDP_PASS;
> > }
> > int main_xdp_prog(struct xdp_md *ctx)
> > {
> >    int ret;
> >
> >    ret = xdp_firewall_placeholder1(ctx);
> >    switch (ret) {
> >    case XDP_PASS: break;
> >    case XDP_DROP: return XDP_DROP;
> >    case XDP_TX: case XDP_REDIRECT:
> >       /* buggy firewall */
> >       bpf_perf_event_output(ctx,...);
> >    default: break; /* or whatever else */
> >    }
> >    
> >    ret = xdp_firewall_placeholder2(ctx);
> >    switch (ret) {
> >    case XDP_PASS: break;
> >    case XDP_DROP: return XDP_DROP;
> >    default: break;
> >    }
> >
> >    ret = xdp_load_balancer_placeholder1(ctx);
> >    switch (ret) {
> >    case XDP_PASS: break;
> >    case XDP_DROP: return XDP_DROP;
> >    case XDP_TX: return XDP_TX;
> >    case XDP_REDIRECT: return XDP_REDIRECT;
> >    default: break; /* or whatever else */
> >    }
> >    return XDP_PASS;
> > }
> >
> > When firewall1.rpm is installed it needs to use either a central daemon or
> > common library (let's call it libxdp.so) that takes care of orchestration. The
> > library would need to keep a state somewhere (like local file or a database).
> > The state will include rootlet.o and new firewall1.o that wants to be linked
> > into the existing program chain. When firewall2.rpm gets installed it calls the
> > same libxdp.so functions that operate on shared state. libxdp.so needs to link
> > firewall1.o + firewall2.o + rootlet.o into one program and attach it to netdev.
> > This is static linking. The existing kernel infrastructure already supports
> > such model and I think it's enough for a lot of use cases. In particular fb's
> > firewall+katran XDP style will fit right in. But bpf_tail_calls are
> > incompatible with bpf2bpf calls that static linking will use and I think
> > Cloudflare folks expressed interest in using them for some reason even within a
> > single firewall? So we need to improve the model a bit.
> >
> > We can introduce dynamic linking. The second part of 'BPF trampoline'
> > patches allows tracing programs to attach to other BPF programs. The
> > idea of dynamic linking is to replace a program or subprogram instead
> > of attaching to it. The firewall1.rpm application will still use
> > libxdp.so, but instead of statically linking it will ask kernel to
> > replace a subprogram rootlet_fd + btf_id_of_xdp_firewall_placeholder1
> > with new firewall1.o. The same interface is used for attaching tracing
> > prog to networking prog.
> 
> Hmm, let's see if I'm understanding you correctly. In this model, to
> attach program #2 (assuming the first one is already loaded on an
> interface), userspace would need to do something like:
> 
> old_fd = get_xdp_fd(eth0);
> new_fd = load_bpf("newprog.o"); // verifies newprog.o
> proglet = load_bpf("xdp-run-2-progs.o"); // or dynamically generate this
> replace_subprog(proglet, 0, old_fd); // similar to map FD replacement?
> replace_subprog(proglet, 1, new_fd);
> proglet_fd = load_bpf(proglet); // verifier reuses sub-fd prog verdicts

I think long term the set of features supported by static and dynamic linking
should be the same. Partial verification should be available regardless of
whether the kernel performs dynamic linking or libbpf statically links multiple .o
together. The only visible difference between dynamic and static should be that dynamic
linking links to already loaded programs that could be executing, whereas the static
counterpart links a set of .o. At that point libbpf may see that some 'extern
int prog(...);' referenced in one .o cannot be resolved from another .o. In
such case libbpf will try to link it dynamically with progs running in the
kernel. We haven't yet defined what 'extern' keyword in the program means.
There are preliminary patches for llvm to support extern variables. Extern
functions are not done yet. We have to walk before we run. With dynamic linking
I'm proposing an api for the kernel that I think will work regardless of how
libbpf and llvm decide to define the meaning of 'extern'.
There are two parts to dynamic linking:
1. replace a prog or subprog in the kernel with new prog.
2. from new prog call a prog/subprog that is already loaded.

First case is useful to chain multiple progs together. Second is needed to
reuse already present progs instead of loading duplicated coode. Imagine the
following scenario:
subprog1:
  int bpf_packet_parser(struct xdp_md *ctx, struct flow_keys *key)
  { /* parses the packet and populates flow_keys */ }
subprog2:
  int __weak xdp_firewall_placeholder(struct xdp_md *ctx)
  { return XDP_PASS; }
main prog:
  int rootlet(struct xdp_md *ctx)
  { ret = xdp_firewall_placeholder(ctx);
    switch (ret) { case XDP_PASS: break;...
  }

New xdp program might want to replace xdp_firewall_placeholder and it might
also want to reuse existing code. It would want to call bpf_packet_parser()
subprog. I'm proposing that a pair of (prog_fd, btf_id of subprog) to be used
in both cases. To replace the program the new prog needs to specify (prog_fd of
rootlet, btf_id of xdp_firewall_placeholder) at load time via
attr->attach_prog_fd, attr->attach_btf_id fields that I'm adding as part of
trampoline patches. To call bpf_packet_parser() from the new prog the call
instruction inside the new prog needs to be annotated with (prog_fd of rootlet,
btf_id of bpf_packet_parser). That is similar to how the maps are referenced
from the program. I think such interface is nicely symmetrical and should work
regardless of how libbpf decides to find 'foo' after seeing 'extern int foo()'.
Similarly static linking should be able to do exactly the same linking, but
completely in user space. I think libbpf should be able to statically link
new_prog.o that calls bpf_packet_parser() with rootlet.o and adjust a call in
rootlet.o into new xdp_firewall_placeholder. Conceptually both static and
dynamic linking for BPF programs should look very similar to how traditional C
programs do it. libbpf will do the magic of specifying (prog_fd, btf_id) in
case dynamic is necessary. Otherwise libbpf will do static linking and adjust
offsets in call instructions. libbpf is already doing call insn adjustment for
subprograms within single .o. Support for static linking in libbpf will be
straightforward to do as soon as we define the meaning of 'extern' and add llvm
support.

On the kernel side the first part of dynamic linking (replacing the prog or
subprog) is a relatively simple follow-up to the trampoline patches that build most
of type checking and connecting apis. The second part of dynamic linking is a
bit more work. Every subprog and prog needs to be verified independently,
otherwise the whole thing won't scale beyond simple programs. I would like to
build a consensus on the long term plan for dynamic and static linking before
implementing the first step.

Back to your question of how fw2 will get loaded.. I'm thinking the following:
1. Static linking:
  obj = bpf_object__open("rootlet.o", "fw1.o", "fw2.o");
  // libbpf adjusts call offsets and links into single loadable bpf_object
  bpf_object__load(obj);
  bpf_set_link_xdp_fd()
No kernel changes are necessary to support program chaining via static linking.

2. Dynamic linking:
  // assuming libxdp.so manages eth0
  rootlet_fd = get_xdp_fd(eth0);
  subprog_btf_id = libbpf_find_prog_btf_id("name_of_placeholder", rootlet_fd);
  //                  ^ this function is in patch 16/18 of trampoline
  attr.attach_prog_fd = rootlet_fd;
  attr.attach_btf_id = subprog_btf_id;
  // pair (prog_fd, btf_id) needs to be specified at load time
  obj = bpf_object__open("fw2.o", attr);
  bpf_object__load(obj);
  prog = bpf_object__find_program_by_title(obj);
  link = bpf_program__replace(prog); // similar to bpf_program__attach_trace()
  // no extra arguments during 'replace'.
  // Target (prog_fd, btf_id) already known to the kernel and verified

> So the two component programs would still exist as kernel objects,
> right? 

yes. Both fw1.o and fw2.o will be loaded and running instead of placeholders.

> And the trampolines would keep individual stats for each one (if
> BPF stats are enabled)? 

In case of dynamic linking both fw1.o and fw2.o will be seen as individual
programs from 'bpftool p s' point of view. And both will have individual stats.

> Could userspace also extract the prog IDs being
> referenced by the "glue" proglet? 

Not sure I follow. Both fw1.o and fw2.o will have their own prog ids.
fw1_prog->aux->linked_prog == rootlet_prog
fw2_prog->aux->linked_prog == rootlet_prog
Unloading and detaching fw1.o will make the kernel switch back to the placeholder
subprog in rootlet_prog. I believe rootlet_prog should not keep a list of progs
that attached to it (or replaced its subprogs) to avoid a circular dependency.
Due to that, detaching rootlet_prog from the netdev will stop the flow of packets
into fw1.o, but refcnt of rootlet_prog will not go to zero, so it will stay in
memory until both fw1.o and fw2.o detach from rootlet.o.

> What about attaching a third program? Would that work by recursion (as
> above, but with the old proglet as old_fd), or should the library build
> a whole new sequence from the component programs?

This choice is up to libxdp.so. It can have a number of placeholders
ready to be replaced by new progs. Or it can re-generate rootlet.o
every time new fwX.o comes along. Short term I would start development
with auto-generated rootlet.o and static linking done by libbpf
while the policy and rootlet are done by libxdp.so, since this work
doesn't depend on any kernel changes. Long term auto-generation
can stay in libxdp.so if it turns out to be sufficient.

> Finally, what happens if someone were to try to attach a retprobe to
> one of the component programs? Could it be possible to do that even
> while program is being run from proglet dispatch? That way we can still
> debug an individual XDP program even though it's run as part of a chain.

Right. The fentry/fexit tracing is orthogonal to static/dynamic linking.
It will be available for all prog types after trampoline patches land.
See fexit_bpf2bpf.c example in the last 18/18 patch.
We will be able to debug XDP program regardless whether it's a rootlet
or a subprogram. Doesn't matter whether linking was static or dynamic.
With fentry/fexit we will be able to do different stats too.
Right now bpf program stats are limited to cycles and I resisted a lot
of pressure to add more hard coded stats. With fentry/fexit we can
collect arbitrary counters per program. Like number of L1-cache misses
or number of TLB misses in a given XDP prog.

> Sounds reasonable. Any reason libxdp.so couldn't be part of libbpf?

libxdp.so is a policy specifier while libbpf is a tool. It makes more sense for
them to be separate. libbpf has strong api compatibility guarantees. While I
don't think anyone knows at this point how libxdp api should look and it will
take some time for it to mature.

> 
> > If in the future we figure out how to do two load-balancers libxdp.so
> > will be able to accommodate that new policy.
> 
> Yeah, it would be cool if we could move things across CPUs; like with
> cpumap, but executing another XDP program on the target CPU.

You mean like different load balancers for different nic rx queues?
Hmm. I guess that's possible. Another reason to keep libxdp policy
flexible at early stages of development.
Edward Cree Nov. 12, 2019, 9:25 p.m. UTC | #31
On 12/11/2019 19:52, Alexei Starovoitov wrote:
> We haven't yet defined what 'extern' keyword in the program means.
> There are preliminary patches for llvm to support extern variables. Extern
> functions are not done yet. We have to walk before we run. With dynamic linking
> I'm proposing an api for the kernel that I think will work regardless of how
> libbpf and llvm decide to define the meaning of 'extern'.
Fwiw the 'natural' C way of doing it would be that for any extern symbol in
 the C file, the ELF file gets a symbol entry with st_shndx=SHN_UNDEF, and
 code in .text that uses that symbol gets relocation entries.  That's (AIUI)
 how it works on 'normal' architectures, and that's what my ebld linker
 understands; when it sees a definition in another file for that symbol
 (matched just by the symbol name) it applies all the relocations of the
 symbol to the appropriate progbits.
I don't really see what else you could define 'extern' to mean.
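
For illustration, in the usual ELF model a fragment like this (names invented
here, using the same SEC() convention as the examples elsewhere in the thread)
compiles to an undefined symbol for pass_fn plus a relocation on the call
instruction, which the linker can later resolve against whichever object
defines pass_fn:

extern int pass_fn(struct xdp_md *ctx);	/* emitted as an SHN_UNDEF symbol */

SEC("xdp")
int caller(struct xdp_md *ctx)
{
	return pass_fn(ctx);		/* call insn carries a relocation entry */
}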

> Partial verification should be available regardless of
> whether kernel performs dynamic linking or libbpf staticly links multiple .o
> together.
It's not clear to me how partial verification would work for statically
 linked programs — could you elaborate on this?

-Ed
Alexei Starovoitov Nov. 12, 2019, 11:18 p.m. UTC | #32
On Tue, Nov 12, 2019 at 09:25:06PM +0000, Edward Cree wrote:
> On 12/11/2019 19:52, Alexei Starovoitov wrote:
> > We haven't yet defined what 'extern' keyword in the program means.
> > There are a preliminary patches for llvm to support extern variables. Extern
> > functions are not done yet. We have to walk before we run. With dynamic linking
> > I'm proposing an api for the kernel that I think will work regardless of how
> > libbpf and llvm decide to define the meaning of 'extern'.
> Fwiw the 'natural' C way of doing it would be that for any extern symbol in
>  the C file, the ELF file gets a symbol entry with st_shndx=SHN_UNDEF, and
>  code in .text that uses that symbol gets relocation entries.  That's (AIUI)
>  how it works on 'normal' architectures, and that's what my ebld linker
>  understands; when it sees a definition in another file for that symbol
>  (matched just by the symbol name) it applies all the relocations of the
>  symbol to the appropriate progbits.
> I don't really see what else you could define 'extern' to mean.

That's exactly the problem with standard 'extern'. ELF preserves the name only.
There is no type. I think size is there, but linkers ignore it. There is also
no way to place extern into a section. Currently SEC("..") is a standard way to
annotate bpf programs. I think reliable 'extern' has to have more than just
name. 'extern int foo;' can be a reference to 'int foo;' in another BPF ELF
file, or it can be a reference to 'int foo;' in already loaded BPF prog, or it
can be a reference to 'int foo;' inside the kernel itself, or it can be a
reference to pseudo variable that libbpf should replace. For example 'extern
int kernel_version;' or 'extern int CONFIG_HZ;' would be useful extern-like
variables that program might want to use. Disambiguating by name is probably
not enough. We can define an order of resolution: libbpf will search in other
.o first, then in loaded bpf progs, then in the kernel, and if all that
fails it will resolve things like 'extern int CONFIG_HZ' on its own. It feels
fragile though. I think we need to be able to specify something like section to
extern variables and functions. That would be a compiler extension.

> > Partial verification should be available regardless of
> > whether kernel performs dynamic linking or libbpf staticly links multiple .o
> > together.
> It's not clear to me how partial verification would work for statically
>  linked programs — could you elaborate on this?

I was imagining that the verifier will do per-function verification
of a program with sub-programs instead of analyzing from the root.
Today the verifier is essentially told: Here is an XDP program. Check that it's
valid with any valid context. Now imagine the verifier sees a function:
int foo(struct xdp_md *ctx);
The verifier can check that the function is safe when 'ctx' is valid. There is
no program type for the verifier to deal with. Such 'foo' is an abstract
program. It receives a valid pointer to xdp_md and can call any helper that
accepts xdp_md.

Applying the same logic for two arguments:
int foo(struct xdp_md *arg1, struct __sk_buff *arg2);
The verifier can check that such a function is safe when both arg1 and arg2 are
valid pointers. This is not a realistic function. It illustrates the idea that
programs/functions can be abstract. Valid pointers of different types are
received and can be further passed into different helpers when types match.

The next step is to extend this thought process to integers.
int foo(struct xdp_md *arg1, int arg2);
The verifier can check that the program is valid for any valid arg1 and
arg2 = mark_reg_unbounded().

Support for pointers to non-context structs is a bit harder. That needs
more thinking. It should be possible to support:
int foo(struct xdp_md *arg1, char *arg2, bpf_size_t arg3);
The verifier will validate that arg2 array is accessed in [0, arg3] range.

I think that will cover most of the use cases. It won't be possible to support
arbitrary passing of pointers the way current bpf2bpf calls do, but I
think it's an acceptable limitation for partial verification.
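
To make the 'abstract program' idea a bit more concrete, a function like the
one below is the kind of thing per-function verification would have to accept
for any valid ctx and a fully unbounded second argument (purely illustrative,
not from the series):

   #include <linux/bpf.h>

   /* must be safe for any value of 'len': the function has to bound it
    * itself before using it for packet access
    */
   int pkt_at_least(struct xdp_md *ctx, int len)
   {
           void *data     = (void *)(long)ctx->data;
           void *data_end = (void *)(long)ctx->data_end;

           if (len < 0 || len > 1500)
                   return 0;
           return data + len <= data_end;
   }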
John Fastabend Nov. 12, 2019, 11:25 p.m. UTC | #33
Alexei Starovoitov wrote:
> On Tue, Nov 12, 2019 at 05:20:07PM +0100, Toke Høiland-Jørgensen wrote:
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > 
> > > On Tue, Oct 22, 2019 at 08:07:42PM +0200, Toke Høiland-Jørgensen wrote:
> > >> 
> > >
> > > I think what Ed is proposing with static linking is the best generic solution.
> > > The chaining policy doesn't belong in the kernel. A user space can express the
> > > chaining logic in the form of BPF program. Static linking achieves that. There
> > > could be a 'root' bpf program (let's call it rootlet.o) that looks like:
> > > int xdp_firewall_placeholder1(struct xdp_md *ctx)

+1, static linking seems the clear winner here to me.

[...]

> 
> I think long term the set of features supported by static and dynamic linking
> should be the same. Partial verification should be available regardless of
> whether kernel performs dynamic linking or libbpf staticly links multiple .o

same question as Ed so we can follow up there. How does static linking help
verification?

> together. The only visible difference dynamic vs static should be that dynamic
> linking links to already loaded programs that could be executing whereas static
> counterpart links a set of .o. At that point libbpf may see that some 'extern
> int prog(...);' referenced in one .o cannot be resolved from another .o. In
> such case libbpf will try to link it dynamically with progs running in the
> kernel. We haven't yet defined what 'extern' keyword in the program means.
> There are a preliminary patches for llvm to support extern variables. Extern
> functions are not done yet. We have to walk before we run. With dynamic linking
> I'm proposing an api for the kernel that I think will work regardless of how
> libbpf and llvm decide to define the meaning of 'extern'.
> There are two parts to dynamic linking:
> 1. replace a prog or subprog in the kernel with new prog.
> 2. from new prog call a prog/subprog that is already loaded.
> 
> First case is useful to chain multiple progs together. Second is needed to
> reuse already present progs instead of loading duplicated coode. Imagine the
> following scenario:
> subprog1:
>   int bpf_packet_parser(struct xdp_md *ctx, struct flow_keys *key)
>   { /* parses the packet and populates flow_keys */ }
> subprog2:
>   int __weak xdp_firewall_placeholder(struct xdp_md *ctx)
>   { return XDP_PASS; }
> main prog:
>   int rootlet(struct xdp_md *ctx)
>   { ret = xdp_firewall_placeholder(ctx);
>     switch (ret) { case XDP_PASS: break;...
>   }
> 
> New xdp program might want to replace xdp_firewall_placeholder and it might
> also want to reuse existing code. It would want to call bpf_packet_parser()
> subprog. I'm proposing that a pair of (prog_fd, btf_id of subprog) to be used
> in both cases. To replace the program the new prog needs to specify (prog_fd of
> rootlet, btf_id of xdp_firewall_placeholder) at load time via
> attr->attach_prog_fd, attr->attach_btf_id fields that I'm adding as part of
> trampoline patches. To call bpf_packet_parser() from the new prog the call
> instruction inside the new prog needs to be annotated with (prog_fd of rootlet,
> btf_id of bpf_packet_parser). That is similar to how the maps are referenced
> from the program. I think such interface is nicely symmetrical and should work
> regardless of how libbpf decides to find 'foo' after seeing 'extern int foo()'.
> Similarly static linking should be able to do exactly the same linking, but
> completely in user space. I think libbpf should be able to statically link
> new_prog.o that calls bpf_packet_parser() with rootlet.o and adjust a call in
> rootlet.o into new xdp_firewall_placeholder. Conceptually both static and
> dynamic linking for BPF programs should look very similar to how traditional C
> programs do it. libbpf will do the magic of specifying (prog_fd, btf_id) in
> case dynamic is necessary. Otherwise libbpf will do static linking and adjust
> offsets in call instructions. libbpf is already doing call insn adjustment for
> subprograms within single .o. Support for static linking in libbpf will be
> straightforward to do as soon as we define the meaning of 'extern' and add llvm
> support.
> 
> On the kernel side the first part of dynamic linking (replacing the prog or
> subprog) is a relativly simple follow up to trampoline patches that build most
> of type checking and connecting apis. The second part of dynamic linking is a
> bit more work. Every subprog and prog need to be verified independently
> otherwise the whole thing won't scale beyond simple programs. I would like to
> build a consensus on the long term plan for dynamic and static linking before
> implementing the first step.

Another use case for dynamic linking (long-term) is to reduce verification
time. By and large many of the programs we (Cilium) load have large
blocks of repeated code. Currently, verification happens on the entire program,
but it would be great to front-load the verification by pre-loading a set of
library helpers that are validated at load time and that generate a call contract:
a set of preconditions needed to satisfy verification and a set of postconditions
upon return. Then when we attach a program (kprobe or networking) the latency
can be reduced significantly. I think it likely looks similar (the same?) as
below with a slightly different flow from (2.).

   // first load shared libraries
   obj = bpf_object__open("fw2.o", attr)
   bpf_object__load(obj); // load and verify object could pin/unpin as well
  
   // inform open hook of symbols with attributes
   attr.attach_subprog[0] = libbpf_find__obj_btf_id("foo", obj);
   attr.attach_subprog[1] = libbpf_find__obj_btf_id("bar", obj);

   // open and load main program which will now use "foo", "bar"
   main = bpf_object__open("rootlet.o", attr)
   bpf_object__load(main)

   // finally attach it, load it into xdp, load into tc, etc  ...
   prog = bpf_object__find_program_by_title(main, "main")
   bpf_program__attach_*(prog);

> 
> 2. Dynamic linking:
>   // assuming libxdp.so manages eth0
>   rootlet_fd = get_xdp_fd(eth0);
>   subprog_btf_id = libbpf_find_prog_btf_id("name_of_placeholder", roolet_fd);
>   //                  ^ this function is in patch 16/18 of trampoline
>   attr.attach_prog_fd = roolet_fd;
>   attr.attach_btf_id = subprog_btf_id;
>   // pair (prog_fd, btf_id) needs to be specified at load time
>   obj = bpf_object__open("fw2.o", attr);
>   bpf_object__load(obj);
>   prog = bpf_object__find_program_by_title(obj);
>   link = bpf_program__replace(prog); // similar to bpf_program__attach_trace()
>   // no extra arguments during 'replace'.
>   // Target (prog_fd, btf_id) already known to the kernel and verified
> 

I'll post the code I talked about at LPC that generates pre/post conditions
as a possible example. Although initially the pre/post conditions could be
simple, e.g. arg1 is 'struct xdp_md *' and on return register/stack state
is unknown. Basically no change from what's proposed here but with a path
to arbitrary calls.

> 
> Back to your question of how fw2 will get loaded.. I'm thinking the following:
> 1. Static linking:
>   obj = bpf_object__open("rootlet.o", "fw1.o", "fw2.o");
>   // libbpf adjusts call offsets and links into single loadable bpf_object
>   bpf_object__load(obj);
>   bpf_set_link_xdp_fd()
> No kernel changes are necessary to support program chaining via static linking.

The nice thing here is that it will work on any recent kernel.

> 
> 2. Dynamic linking:
>   // assuming libxdp.so manages eth0
>   rootlet_fd = get_xdp_fd(eth0);
>   subprog_btf_id = libbpf_find_prog_btf_id("name_of_placeholder", roolet_fd);
>   //                  ^ this function is in patch 16/18 of trampoline
>   attr.attach_prog_fd = roolet_fd;
>   attr.attach_btf_id = subprog_btf_id;
>   // pair (prog_fd, btf_id) needs to be specified at load time
>   obj = bpf_object__open("fw2.o", attr);
>   bpf_object__load(obj);
>   prog = bpf_object__find_program_by_title(obj);
>   link = bpf_program__replace(prog); // similar to bpf_program__attach_trace()
>   // no extra arguments during 'replace'.
>   // Target (prog_fd, btf_id) already known to the kernel and verified
> 
> > So the two component programs would still exist as kernel objects,
> > right? 
> 
> yes. Both fw1.o and fw2.o will be loaded and running instead of placeholders.
> 
> > And the trampolines would keep individual stats for each one (if
> > BPF stats are enabled)? 
> 
> In case of dynamic linking both fw1.o and fw2.o will be seen as individual
> programs from 'bpftool p s' point of view. And both will have individual stats.
> 
> > Could userspace also extract the prog IDs being
> > referenced by the "glue" proglet? 
> 
> Not sure I follow. Both fw1.o and fw2.o will have their own prog ids.
> fw1_prog->aux->linked_prog == rootlet_prog
> fw2_prog->aux->linked_prog == rootlet_prog
> Unloading and detaching fw1.o will make kernel to switch back to placeholder
> subprog in roolet_prog. I believe roolet_prog should not keep a list of progs
> that attached to it (or replaced its subprogs) to avoid circular dependency.
> Due to that detaching roolet_prog from netdev will stop the flow of packets
> into fw1.o, but refcnt of rootlet_prog will not go to zero, so it will stay in
> memory until both fw1.o and fw2.o detach from rootlet.o.

I would expect detaching the rootlet prog from the netdev to _detach_ the rootlet
program. The fact that it happens to be linked to two other objects shouldn't
impact this IMO.

The first transition is clear,

  detaching fw1.o -> rootlet runs placeholder

But second case

  detach rootlet -> ???

Both fw1.o and fw2.o in this case are object files with their own lifetimes
which can be pinned/unpinned, etc. I see it makes the implementation a bit
more difficult, but it should be doable.

> 
> > What about attaching a third program? Would that work by recursion (as
> > above, but with the old proglet as old_fd), or should the library build
> > a whole new sequence from the component programs?
> 
> This choice is up to libxdp.so. It can have a number of placeholders
> ready to be replaced by new progs. Or it can re-generate rootlet.o
> every time new fwX.o comes along. Short term I would start development
> with auto-generated roolet.o and static linking done by libbpf
> while the policy and roolet are done by libxdp.so, since this work
> doesn't depend on any kernel changes. Long term auto-generation
> can stay in libxdp.so if it turns out to be sufficient.

Agreed. Otherwise, who/what would have enough information to figure this
out? But we should allow for multiple levels of dynamic linking. Example:

   // first load shared libraries
   obj1 = bpf_object__open("shared1.o", attr)
   bpf_object__load(obj1); // load and verify object could pin/unpin as well

   // second shared lib using first libs
   attr.attach_subprog[0] = libbpf_find__obj_btf_id("foo", obj1);
   obj2 = bpf_object__open("shared2.o", attr)
   bpf_object__load(obj2); // load and verify object could pin/unpin as well
  
   // open and load main program which will now use "bar" in obj2.
   attr.attach_subprog[0] = libbpf_find__obj_btf_id("bar", obj2);
   main = bpf_object__open("rootlet.o", attr)
   bpf_object__load(main)

   // now we have a main object with main -> obj2 -> obj1

We can unload obj1, but its refcnt will not go to zero because of
obj2. However, unloading main is OK.

Is the circular dependency case you are worried about the following, then?

   load obj2             obj2(refcnt 1)
   load obj1->obj2       obj2(refcnt 2) obj1(refcnt 1)
   load obj3->obj1->obj2 obj2(refcnt 3) obj1(refcnt 2) obj3(refcnt 1)
            ->obj2

If you unload obj3 it will have to walk the chain of objects and dereference
them. Is this the circular dependency?
 
> 
> > Finally, what happens if someone where to try to attach a retprobe to
> > one of the component programs? Could it be possible to do that even
> > while program is being run from proglet dispatch? That way we can still
> > debug an individual XDP program even though it's run as part of a chain.
> 
> Right. The fentry/fexit tracing is orthogonal to static/dynamic linking.
> It will be available for all prog types after trampoline patches land.
> See fexit_bpf2bpf.c example in the last 18/18 patch.
> We will be able to debug XDP program regardless whether it's a rootlet
> or a subprogram. Doesn't matter whether linking was static or dynamic.
> With fentry/fexit we will be able to do different stats too.
> Right now bpf program stats are limited to cycles and I resisted a lot
> of pressure to add more hard coded stats. With fentry/fexit we can
> collect arbitrary counters per program. Like number of L1-cache misses
> or number of TLB misses in a given XDP prog.
> 
> > Sounds reasonable. Any reason libxdp.so couldn't be part of libbpf?
> 
> libxdp.so is a policy specifier while libbpf is a tool. It makes more sense for
> them to be separate. libbpf has strong api compatibility guarantees. While I
> don't think anyone knows at this point how libxdp api should look and it will
> take some time for it to mature.

Agree, separate makes more sense to me. It seems you may end up with a policy
language and rpm packaging details, etc. in this; probably not libbpf.
It could be in ./tools/lib/xdp though.

> 
> > 
> > > If in the future we figure out how to do two load-balancers libxdp.so
> > > will be able to accommodate that new policy.
> > 
> > Yeah, it would be cool if we could move things across CPUs; like with
> > cpumap, but executing another XDP program on the target CPU.
> 
> You mean like different load balancers for different nic rx queues?
> Hmm. I guess that's possible. Another reason to keep libxdp policy
> flexible at early stages of development.
>
Alexei Starovoitov Nov. 13, 2019, 12:21 a.m. UTC | #34
On Tue, Nov 12, 2019 at 03:25:02PM -0800, John Fastabend wrote:
> 
> same question as Ed so we can follow up there. How does static linking help
> verification?

pls see my reply to Ed.

> > together. The only visible difference dynamic vs static should be that dynamic
> > linking links to already loaded programs that could be executing whereas static
> > counterpart links a set of .o. At that point libbpf may see that some 'extern
> > int prog(...);' referenced in one .o cannot be resolved from another .o. In
> > such case libbpf will try to link it dynamically with progs running in the
> > kernel. We haven't yet defined what 'extern' keyword in the program means.
> > There are a preliminary patches for llvm to support extern variables. Extern
> > functions are not done yet. We have to walk before we run. With dynamic linking
> > I'm proposing an api for the kernel that I think will work regardless of how
> > libbpf and llvm decide to define the meaning of 'extern'.
> > There are two parts to dynamic linking:
> > 1. replace a prog or subprog in the kernel with new prog.
> > 2. from new prog call a prog/subprog that is already loaded.
> > 
> > First case is useful to chain multiple progs together. Second is needed to
> > reuse already present progs instead of loading duplicated coode. Imagine the
> > following scenario:
> > subprog1:
> >   int bpf_packet_parser(struct xdp_md *ctx, struct flow_keys *key)
> >   { /* parses the packet and populates flow_keys */ }
> > subprog2:
> >   int __weak xdp_firewall_placeholder(struct xdp_md *ctx)
> >   { return XDP_PASS; }
> > main prog:
> >   int rootlet(struct xdp_md *ctx)
> >   { ret = xdp_firewall_placeholder(ctx);
> >     switch (ret) { case XDP_PASS: break;...
> >   }
> > 
> > New xdp program might want to replace xdp_firewall_placeholder and it might
> > also want to reuse existing code. It would want to call bpf_packet_parser()
> > subprog. I'm proposing that a pair of (prog_fd, btf_id of subprog) to be used
> > in both cases. To replace the program the new prog needs to specify (prog_fd of
> > rootlet, btf_id of xdp_firewall_placeholder) at load time via
> > attr->attach_prog_fd, attr->attach_btf_id fields that I'm adding as part of
> > trampoline patches. To call bpf_packet_parser() from the new prog the call
> > instruction inside the new prog needs to be annotated with (prog_fd of rootlet,
> > btf_id of bpf_packet_parser). That is similar to how the maps are referenced
> > from the program. I think such interface is nicely symmetrical and should work
> > regardless of how libbpf decides to find 'foo' after seeing 'extern int foo()'.
> > Similarly static linking should be able to do exactly the same linking, but
> > completely in user space. I think libbpf should be able to statically link
> > new_prog.o that calls bpf_packet_parser() with rootlet.o and adjust a call in
> > rootlet.o into new xdp_firewall_placeholder. Conceptually both static and
> > dynamic linking for BPF programs should look very similar to how traditional C
> > programs do it. libbpf will do the magic of specifying (prog_fd, btf_id) in
> > case dynamic is necessary. Otherwise libbpf will do static linking and adjust
> > offsets in call instructions. libbpf is already doing call insn adjustment for
> > subprograms within single .o. Support for static linking in libbpf will be
> > straightforward to do as soon as we define the meaning of 'extern' and add llvm
> > support.
> > 
> > On the kernel side the first part of dynamic linking (replacing the prog or
> > subprog) is a relativly simple follow up to trampoline patches that build most
> > of type checking and connecting apis. The second part of dynamic linking is a
> > bit more work. Every subprog and prog need to be verified independently
> > otherwise the whole thing won't scale beyond simple programs. I would like to
> > build a consensus on the long term plan for dynamic and static linking before
> > implementing the first step.
> 
> Another use case for dynamic linking (long-term) is to reduce verification
> time. By and large many of the programs we (Cilium) load have large
> blocks of repeated code. Currently, verification happens on the entire program
> but would be great to front-load the verification by pre-loading a set of
> library helpers that are validated at load time and generate a call contract
> or set of preconditions needed to satisfy verification and set of postconditions
> upon return. Then when we attach a program (kprobe or networking) the latency
> can be reduced significantly. I think it likely looks similar (the same?) as
> below with a slightly different flow from (2.).
> 
>    // first load shared libraries
>    obj = bpf_object__open("fw2.o", attr)
>    bpf_object__load(obj); // load and verify object could pin/unpin as well
>   
>    // inform open hook of symbols with attributes
>    attr.attach_subprog[0] = libbpf_find__obj_btf_id("foo", obj);
>    attr.attach_subprog[1] = libbpf_find__obj_btf_id("bar", obj);
> 
>    // open and load main program which will now use "foo", "bar"
>    main = bpf_object__open("rootlet.o", attr)
>    bpf_object__load(main)

That sounds very similar to my 2nd case of dynamic linking above, no?
Except that you propose to make it explicit by populating attach_subprog[0|1]?
I think the amount of explicit libbpf api calls should be brought to a minimum.
The .c program should have enough information about what it wants to link to
and what it exposes to the outside. imo standard C style should work. If a BPF
function is 'static' it cannot be used as a helper by another BPF program. If it's
non-static (global) then it can be. The main headache to resolve is the scope
of the search libbpf has to do. I think from the libbpf api side it should look like:
   obj = bpf_object__open("helpers.o", attr);
   bpf_object__load(obj);
   // kernel loads and verifies all funcs in helpers.o. They are not attached
   // to anything yet.

   main = bpf_object__open("main.o", attr);
   // libbpf needs to figure out what main.o needs to link to.
   // If libbpf can do static linking it should, otherwise it will link
   // into funcs that were loaded as part of helpers.o
   // libbpf will adjust call insns in main.o with
   // (prog_fd_of_helpers, btf_id_of_that_helper_func)
   bpf_object__load(main);
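
To make the static-vs-global rule concrete, the two objects in that flow
might look roughly like this once the 'extern' semantics and llvm support
discussed in this thread exist (all names and the toy policy are made up):

   /* helpers.o: 'static' functions stay private to this object, non-static
    * (global) functions are what other objects can link against.
    */
   #include <linux/bpf.h>

   static int clamp_verdict(int verdict)      /* private, not linkable */
   {
           return verdict == XDP_PASS ? XDP_PASS : XDP_DROP;
   }

   int check_pkt(struct xdp_md *ctx)          /* global: callable from main.o */
   {
           void *data     = (void *)(long)ctx->data;
           void *data_end = (void *)(long)ctx->data_end;

           /* toy policy: drop anything without a full Ethernet header */
           if (data + 14 > data_end)
                   return clamp_verdict(XDP_DROP);
           return clamp_verdict(XDP_PASS);
   }

   /* main.o: declares the helper as extern; libbpf links it statically
    * against helpers.o if it can, otherwise against the already-loaded
    * helpers program as described above.
    */
   #include <linux/bpf.h>
   #include <bpf/bpf_helpers.h>

   extern int check_pkt(struct xdp_md *ctx);

   SEC("xdp")
   int main_prog(struct xdp_md *ctx)
   {
           return check_pkt(ctx);
   }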


> > Not sure I follow. Both fw1.o and fw2.o will have their own prog ids.
> > fw1_prog->aux->linked_prog == rootlet_prog
> > fw2_prog->aux->linked_prog == rootlet_prog
> > Unloading and detaching fw1.o will make kernel to switch back to placeholder
> > subprog in roolet_prog. I believe roolet_prog should not keep a list of progs
> > that attached to it (or replaced its subprogs) to avoid circular dependency.
> > Due to that detaching roolet_prog from netdev will stop the flow of packets
> > into fw1.o, but refcnt of rootlet_prog will not go to zero, so it will stay in
> > memory until both fw1.o and fw2.o detach from rootlet.o.
> 
> I would expect detaching rootlet prog from netdev to _detach_ the rootlet
> program. Because it happens to be linked to two other object shouldn't
> impact this IMO.
> 
> The first transition is clear,
> 
>   detaching fw1.o -> rootlet runs placeholder
> 
> But second case
> 
>   detach rootlet -> ???
> 
> Both fw1.o and fw2.o in this case are object files with their own lifetime
> which can be pinned/unpinned, etc. I see it makes the implementation a bit
> more difficult but should be doable.

Detaching the rootlet is simply that: detaching the rootlet. It's no longer executing,
hence fw1 doesn't see packets anymore. Both rootlet.o and fw1.o stay
in kernel memory, because libxdp.so didn't unload fw1.o yet and fw1
keeps rootlet.o alive in memory, because it tracks it as a dependency.
When the refcnt drops to zero everywhere, all of them get unloaded and the memory is freed.
The same thing happens because fw1.o and fw2.o each did bpf_prog_get(rootlet.o) once.

> > > What about attaching a third program? Would that work by recursion (as
> > > above, but with the old proglet as old_fd), or should the library build
> > > a whole new sequence from the component programs?
> > 
> > This choice is up to libxdp.so. It can have a number of placeholders
> > ready to be replaced by new progs. Or it can re-generate rootlet.o
> > every time new fwX.o comes along. Short term I would start development
> > with auto-generated roolet.o and static linking done by libbpf
> > while the policy and roolet are done by libxdp.so, since this work
> > doesn't depend on any kernel changes. Long term auto-generation
> > can stay in libxdp.so if it turns out to be sufficient.
> 
> Agreed. Otherwise who/what would have enough information to figure this
> out. But we should allow for multiple levels of dynamic linking. Example:
> 
>    // first load shared libraries
>    obj1 = bpf_object__open("shared1.o", attr)
>    bpf_object__load(obj1); // load and verify object could pin/unpin as well
> 
>    // second shared lib using first libs
>    attr.attach_subprog[0] = libbpf_find__obj_btf_id("foo", obj1);

multi level certainly should be possible.
I think 'multi level' is probably not an entirely accurate name.
It's the graph of progs and funcs that exists in the kernel and on disk
that libbpf needs to link to.
It can have any number of dependencies and levels.
Hence my proposal above to keep only a uni-directional dependency
when replacing/attaching.

> If you unload obj3 it will have to walk the chain of objects and dereference
> them. Is this the circular dependency?

By circular dependency I meant the case of fw1 and fw2 attaching to rootlet.
I'm proposing that fw1 remembers that it attached to rootlet, and fw2 remembers
that it attached to the same rootlet, but rootlet doesn't track what attached to
it. Otherwise there are bi-directional links and linked lists everywhere.
John Fastabend Nov. 13, 2019, 5:33 a.m. UTC | #35
Alexei Starovoitov wrote:
> On Tue, Nov 12, 2019 at 03:25:02PM -0800, John Fastabend wrote:
> > 
> > same question as Ed so we can follow up there. How does static linking help
> > verification?
> 
> pls see my reply to Ed.


Great, will read, thanks.

> > > together. The only visible difference dynamic vs static should be that dynamic
> > > linking links to already loaded programs that could be executing whereas static
> > > counterpart links a set of .o. At that point libbpf may see that some 'extern
> > > int prog(...);' referenced in one .o cannot be resolved from another .o. In
> > > such case libbpf will try to link it dynamically with progs running in the
> > > kernel. We haven't yet defined what 'extern' keyword in the program means.
> > > There are a preliminary patches for llvm to support extern variables. Extern
> > > functions are not done yet. We have to walk before we run. With dynamic linking
> > > I'm proposing an api for the kernel that I think will work regardless of how
> > > libbpf and llvm decide to define the meaning of 'extern'.
> > > There are two parts to dynamic linking:
> > > 1. replace a prog or subprog in the kernel with new prog.
> > > 2. from new prog call a prog/subprog that is already loaded.
> > > 
> > > First case is useful to chain multiple progs together. Second is needed to
> > > reuse already present progs instead of loading duplicated coode. Imagine the
> > > following scenario:
> > > subprog1:
> > >   int bpf_packet_parser(struct xdp_md *ctx, struct flow_keys *key)
> > >   { /* parses the packet and populates flow_keys */ }
> > > subprog2:
> > >   int __weak xdp_firewall_placeholder(struct xdp_md *ctx)
> > >   { return XDP_PASS; }
> > > main prog:
> > >   int rootlet(struct xdp_md *ctx)
> > >   { ret = xdp_firewall_placeholder(ctx);
> > >     switch (ret) { case XDP_PASS: break;...
> > >   }
> > > 
> > > New xdp program might want to replace xdp_firewall_placeholder and it might
> > > also want to reuse existing code. It would want to call bpf_packet_parser()
> > > subprog. I'm proposing that a pair of (prog_fd, btf_id of subprog) to be used
> > > in both cases. To replace the program the new prog needs to specify (prog_fd of
> > > rootlet, btf_id of xdp_firewall_placeholder) at load time via
> > > attr->attach_prog_fd, attr->attach_btf_id fields that I'm adding as part of
> > > trampoline patches. To call bpf_packet_parser() from the new prog the call
> > > instruction inside the new prog needs to be annotated with (prog_fd of rootlet,
> > > btf_id of bpf_packet_parser). That is similar to how the maps are referenced
> > > from the program. I think such interface is nicely symmetrical and should work
> > > regardless of how libbpf decides to find 'foo' after seeing 'extern int foo()'.
> > > Similarly static linking should be able to do exactly the same linking, but
> > > completely in user space. I think libbpf should be able to statically link
> > > new_prog.o that calls bpf_packet_parser() with rootlet.o and adjust a call in
> > > rootlet.o into new xdp_firewall_placeholder. Conceptually both static and
> > > dynamic linking for BPF programs should look very similar to how traditional C
> > > programs do it. libbpf will do the magic of specifying (prog_fd, btf_id) in
> > > case dynamic is necessary. Otherwise libbpf will do static linking and adjust
> > > offsets in call instructions. libbpf is already doing call insn adjustment for
> > > subprograms within single .o. Support for static linking in libbpf will be
> > > straightforward to do as soon as we define the meaning of 'extern' and add llvm
> > > support.
> > > 
> > > On the kernel side the first part of dynamic linking (replacing the prog or
> > > subprog) is a relativly simple follow up to trampoline patches that build most
> > > of type checking and connecting apis. The second part of dynamic linking is a
> > > bit more work. Every subprog and prog need to be verified independently
> > > otherwise the whole thing won't scale beyond simple programs. I would like to
> > > build a consensus on the long term plan for dynamic and static linking before
> > > implementing the first step.
> > 
> > Another use case for dynamic linking (long-term) is to reduce verification
> > time. By and large many of the programs we (Cilium) load have large
> > blocks of repeated code. Currently, verification happens on the entire program
> > but would be great to front-load the verification by pre-loading a set of
> > library helpers that are validated at load time and generate a call contract
> > or set of preconditions needed to satisfy verification and set of postconditions
> > upon return. Then when we attach a program (kprobe or networking) the latency
> > can be reduced significantly. I think it likely looks similar (the same?) as
> > below with a slightly different flow from (2.).
> > 
> >    // first load shared libraries
> >    obj = bpf_object__open("fw2.o", attr)
> >    bpf_object__load(obj); // load and verify object could pin/unpin as well
> >   
> >    // inform open hook of symbols with attributes
> >    attr.attach_subprog[0] = libbpf_find__obj_btf_id("foo", obj);
> >    attr.attach_subprog[1] = libbpf_find__obj_btf_id("bar", obj);
> > 
> >    // open and load main program which will now use "foo", "bar"
> >    main = bpf_object__open("rootlet.o", attr)
> >    bpf_object__load(main)
> 
> That sounds very similar to my 2nd case of dynamic linking above, no?

I think it is the same but the order is reversed. First load all the
libraries and then the program that uses them vs loading xdp_main followed
by overwriting the functions. The reason is to reduce the verification
time of the main program because we would have already verified all the
libraries it is using. See below, but I believe the proposed API works
for this case?

> Except that you propose to make it explicit by populating attach_subprog[0|1] ?

I don't actually care if it's explicit. That could be an internal detail of
libbpf. But if it's not exposed, it seems the programmer has to tell libbpf
somehow how to resolve the extern functions? It has to know somehow
to look in obj1, obj2 when loading rootlet.o in my case. So attach_prog_fd
and attach_btf_id would work similarly, but I think we still need to be
able to specify multiples of these if the order is reversed. Or alternatively
the kernel maintains the table per your additional example below.

Above you had this example,

> 2. Dynamic linking:
>  // assuming libxdp.so manages eth0
>  rootlet_fd = get_xdp_fd(eth0);
>  subprog_btf_id = libbpf_find_prog_btf_id("name_of_placeholder", roolet_fd);
>  //                  ^ this function is in patch 16/18 of trampoline
>  attr.attach_prog_fd = roolet_fd;
>  attr.attach_btf_id = subprog_btf_id;

In my case rootlet_fd hasn't been created yet, so we can't use attach_prog_fd here.

>  // pair (prog_fd, btf_id) needs to be specified at load time
>  obj = bpf_object__open("fw2.o", attr);
>  bpf_object__load(obj);
>  prog = bpf_object__find_program_by_title(obj);
>  link = bpf_program__replace(prog); // similar to bpf_program__attach_trace()
>  // no extra arguments during 'replace'.
>  // Target (prog_fd, btf_id) already known to the kernel and verified

In addition to the above flow, something like this to load libraries first should
also work?

   // here fw2 is a library its never attached to anything but can be
   // used to pull functions from
   obj = bpf_object__open("fw2.o", attr);
   bpf_object__load(obj);
   prog = bpf_object__find_program_by_title(obj);
   subprog_btf_id0 = libbpf_find_obj_btf_id("name of function", obj);
   subprog_btf_id1 = libbpf_find_obj_btf_id("name of function", obj);

   // all pairs of (prog_fd, btf_id) need to be specified at load time
   attr.attach[0].prog_fd = fw2_fd;
   attr.attach[0].btf_id = subprog_btf_id0;
   attr.attach[1].prog_fd = fw2_fd;
   attr.attach[1].btf_id = subprog_btf_id1;
   obj = bpf_object__open("rootlet.o", attr)
   bpf_object__load(obj)
   prog = bpf_object__find_program_by_title(obj);
   link = bpf_program__replace(prog);
   // attach rootlet.o at this point with subprog_btf_id

I think this is exactly the same, just in a different order? Alternatively, to
avoid the array of attach[], bpf_program__replace() could take the fd/id
directly, like

   link = bpf_program__replace(prog, fw2, subprog_btf_id0)
   prog = bpf_program__from_link(link)
   link = bpf_program__replace(prog, fw2, subprog_btf_id1)

I slightly prefer using multiple replace() calls; it feels somewhat similar
to the for-each-map loops I use today to replace map fds. If multiple replace()
calls are allowed, it seems that it would work in the original example and in the
case where fw1.o and fw2.o are loaded first.

Was that more clear?

> I think the amount of explicit libbpf api calls should be brought to minimum.

Sounds good to me. It's on my TODO list to see if some of the map handling
parts we have make sense in a libbpf context. For example, we have
routines to do map fd replacement which, as noted above, feel
similar to replacing functions.

> The .c program should have enough information about what it wants to link to
> and what it exposes to outside. imo standard C style should work. If BPF
> function is 'static' it cannot be used as helper by other BPF program. If it's
> non-static (global) then it can be. The main headache to resolve is the scope
> of search for libbpf to do. I think from libbpf api side it should look like:
>
>    obj = bpf_object__open("helpers.o", attr);
>    bpf_object__load(obj);
>    // kernel loads and verifies all funcs in helpers.o. They are not attached
>    // to anything yet.
> 
>    main = bpf_object__open("main.o", attr);
>    // libbpf needs to figure out what main.o needs to link to.
>    // If libbpf can do static linking it should, otherwise it will link
>    // into funcs that were loaded as part of helpers.o
>    // libbpf will adjust call insns in main.o with
>    // (prog_fd_of_helpers, btf_id_of_that_helper_func)
>    bpf_object__load(main);

I think this is better than my proposal above. I didn't just delete my
example because I think if we allow multiple replace() calls then we get
extra flexibility without really creating much extra complexity. The
open/load helpers.o case would be the normal flow for using dynamic
linking, and if folks want to replace/remove functions they can use the above.

One question: in this case, if helpers.o is removed after main, what happens?
Will main.o also be removed because it will no longer have helpers.o, or will
it somehow keep a reference to helpers.o, which would break the
uni-directional graph below? FWIW it makes sense to me to just tear the
entire thing down. That is a clear rule user space can follow and organize
as needed, and it also avoids any strange case where things get stuck with ref
counts.

So I think you have two APIs: the case where the kernel handles it, and
the other case where the user uses replace() calls to do the dynamic linking
as in the original example.

> 
> 
> > > Not sure I follow. Both fw1.o and fw2.o will have their own prog ids.
> > > fw1_prog->aux->linked_prog == rootlet_prog
> > > fw2_prog->aux->linked_prog == rootlet_prog
> > > Unloading and detaching fw1.o will make kernel to switch back to placeholder
> > > subprog in roolet_prog. I believe roolet_prog should not keep a list of progs
> > > that attached to it (or replaced its subprogs) to avoid circular dependency.
> > > Due to that detaching roolet_prog from netdev will stop the flow of packets
> > > into fw1.o, but refcnt of rootlet_prog will not go to zero, so it will stay in
> > > memory until both fw1.o and fw2.o detach from rootlet.o.
> > 
> > I would expect detaching rootlet prog from netdev to _detach_ the rootlet
> > program. Because it happens to be linked to two other object shouldn't
> > impact this IMO.
> > 
> > The first transition is clear,
> > 
> >   detaching fw1.o -> rootlet runs placeholder
> > 
> > But second case
> > 
> >   detach rootlet -> ???
> > 
> > Both fw1.o and fw2.o in this case are object files with their own lifetime
> > which can be pinned/unpinned, etc. I see it makes the implementation a bit
> > more difficult but should be doable.
> 
> detach rootlet - is simply detaching rootlet. It's no longer executing.
> Hence fw1 doesn't see packets anymore. Both rootlet.o and fw1.o stay
> in kernel memory, because libxdp.so didn't unload fw1.o yet and fw1
> keeps rootlet.o alive in memory, because it tracks it as dependency.
> When refcnt drops to zero everywhere all of them gets unloaded and memory freed.
> Same thing when fw1.o and fw2.o did bpf_prog_get(rootlet.o) once for each.

OK, this makes sense now, thanks.

> 
> > > > What about attaching a third program? Would that work by recursion (as
> > > > above, but with the old proglet as old_fd), or should the library build
> > > > a whole new sequence from the component programs?
> > > 
> > > This choice is up to libxdp.so. It can have a number of placeholders
> > > ready to be replaced by new progs. Or it can re-generate rootlet.o
> > > every time new fwX.o comes along. Short term I would start development
> > > with auto-generated roolet.o and static linking done by libbpf
> > > while the policy and roolet are done by libxdp.so, since this work
> > > doesn't depend on any kernel changes. Long term auto-generation
> > > can stay in libxdp.so if it turns out to be sufficient.
> > 
> > Agreed. Otherwise who/what would have enough information to figure this
> > out. But we should allow for multiple levels of dynamic linking. Example:
> > 
> >    // first load shared libraries
> >    obj1 = bpf_object__open("shared1.o", attr)
> >    bpf_object__load(obj1); // load and verify object could pin/unpin as well
> > 
> >    // second shared lib using first libs
> >    attr.attach_subprog[0] = libbpf_find__obj_btf_id("foo", obj1);
> 
> multi level certainly should be possible.
> I think 'multi level' is probably not entirely accurate name.
> It's the graph of progs and funcs that exists in the kernel and on disk
> that libbpf needs to link to.
> It can have any number of dependencies and levels.
> Hence my proposal above to keep only uni-directional dependency
> when replacing/attaching.

I'm convinced, +1.

> 
> > If you unload obj3 it will have to walk the chain of objects and dereference
> > them. Is this the circular dependency?
> 
> By circular dependency I meant the case of fw1 and fw2 attaching to rootlet.
> I'm proposing that fw1 remembers that it attached to rootlet, and fw2 remembers
> that it attached to the same rootlet, but rootlet doesn't track what attached to
> it. Otherwise there are bi-directional links and link-lists everywhere.
> 

Got it. And if you don't have a weak func for this to allow fw1 to be removed, just
remove the rootlet as well.
Edward Cree Nov. 13, 2019, 6:30 p.m. UTC | #36
On 12/11/2019 23:18, Alexei Starovoitov wrote:
> On Tue, Nov 12, 2019 at 09:25:06PM +0000, Edward Cree wrote:
>> Fwiw the 'natural' C way of doing it would be that for any extern symbol in
>>  the C file, the ELF file gets a symbol entry with st_shndx=SHN_UNDEF, and
>>  code in .text that uses that symbol gets relocation entries.  That's (AIUI)
>>  how it works on 'normal' architectures, and that's what my ebld linker
>>  understands; when it sees a definition in another file for that symbol
>>  (matched just by the symbol name) it applies all the relocations of the
>>  symbol to the appropriate progbits.
>> I don't really see what else you could define 'extern' to mean.
> That's exactly the problem with standard 'extern'. ELF preserves the name only.
> There is no type.
But if you have BTFs, then you can look up each symbol's type at link time and
 check that they match.  Point being that the BTF is for validation (and
 therefore strictly optional at this point) rather than using it to identify a
 symbol.  Trouble with the latter is that BTF ids get renumbered on linking, so
 any references to them have to change, whereas symbol names stay the same.

> There is also
> no way to place extern into a section. Currently SEC("..") is a standard way to
> annotate bpf programs.
While the symbol itself doesn't have a section, each _use_ of the symbol has a
 reloc, and the SHT_REL[A] in which that reloc resides has a sh_info specifying
 "the section header index of the section to which the relocation applies."  So
 can't that be used if symbol visibility needs to depend on section?  Tbh I
 can't exactly see why externs need placing in a section in the first place.

> I think reliable 'extern' has to have more than just
> name. 'extern int foo;' can be a reference to 'int foo;' in another BPF ELF
> file, or it can be a reference to 'int foo;' in already loaded BPF prog, or it
> can be a reference to 'int foo;' inside the kernel itself, or it can be a
> reference to pseudo variable that libbpf should replace. For example 'extern
> int kernel_version;' or 'extern int CONFIG_HZ;' would be useful extern-like
> variables that program might want to use. Disambiguating by name is probably
> not enough. We can define an order of resolution. libbpf will search in other
> .o first, then will search in loaded bpf progs, than in kernel, and if all
> fails than will resolve things like 'extern int CONFIG_HZ' on its own. It feels
> fragile though.
It sounds perfectly reasonable and not fragile to me.  The main alternative
 I see, about equally good, is to not allow defining symbols that are already
 (non-weakly) defined; so if a bpf prog tries to globally declare "int CONFIG_HZ"
 or "int netif_receive_skb(struct sk_buff *skb)" then it gets rejected.

> I think we need to be able to specify something like section to
> extern variables and functions.
It seems unnecessary to have the user code specify this.  Another bad
 analogy: in userland C code you don't have to annotate the function protos in
 your header files to say whether they come from another .o file, a random
 library or the libc.  You just declare "a function called this exists somewhere
 and we'll find it at link time".

> I was imagining that the verifier will do per-function verification
> of program with sub-programs instead of analyzing from root.
Ah I see.  Yes, that's a very attractive design.

If we make it from a sufficiently generic idea of pre/postconditions, then it
 could also be useful for e.g. loop bodies (user-supplied annotations that allow
 us to walk the body only once instead of N times); then a function call just
 gets standard pre/postconditions generated from its argument types if the user
 didn't specify something else.

That would then also support things like:
> The next step is to extend this thought process to integers.
> int foo(struct xdp_md *arg1, int arg2);
> The verifier can check that the program is valid for any valid arg1 and
> arg2 = mark_reg_unbounded().
... this but arg2 isn't unbounded.
However, it might be difficult to do this without exposing details of the
 verifier into the ABI.  Still, if we can it sounds like it would make John
 quite happy too.  And of course it doesn't need to have the user annotations
 from the beginning, it can start out as just the kernel generating pre/post
 conditions internally.

-Ed
Andrii Nakryiko Nov. 13, 2019, 6:51 p.m. UTC | #37
On Wed, Nov 13, 2019 at 10:30 AM Edward Cree <ecree@solarflare.com> wrote:
>
> On 12/11/2019 23:18, Alexei Starovoitov wrote:
> > On Tue, Nov 12, 2019 at 09:25:06PM +0000, Edward Cree wrote:
> >> Fwiw the 'natural' C way of doing it would be that for any extern symbol in
> >>  the C file, the ELF file gets a symbol entry with st_shndx=SHN_UNDEF, and
> >>  code in .text that uses that symbol gets relocation entries.  That's (AIUI)
> >>  how it works on 'normal' architectures, and that's what my ebld linker
> >>  understands; when it sees a definition in another file for that symbol
> >>  (matched just by the symbol name) it applies all the relocations of the
> >>  symbol to the appropriate progbits.
> >> I don't really see what else you could define 'extern' to mean.
> > That's exactly the problem with standard 'extern'. ELF preserves the name only.
> > There is no type.
> But if you have BTFs, then you can look up each symbol's type at link time and
>  check that they match.  Point being that the BTF is for validation (and

There is no BTF for extern right now, though. Not even expected size
(for extern variables); it's always zero. So until we have BTF emitted
with type information for externs (either variables or funcs), there is
no "expected type"; it's just the name of a symbol.

>  therefore strictly optional at this point) rather than using it to identify a
>  symbol.  Trouble with the latter is that BTF ids get renumbered on linking, so
>  any references to them have to change, whereas symbol names stay the same.
>
> > There is also
> > no way to place extern into a section. Currently SEC("..") is a standard way to
> > annotate bpf programs.
> While the symbol itself doesn't have a section, each _use_ of the symbol has a
>  reloc, and the SHT_REL[A] in which that reloc resides has a sh_info specifying
>  "the section header index of the section to which the relocation applies."  So
>  can't that be used if symbol visibility needs to depend on section?  Tbh I
>  can't exactly see why externs need placing in a section in the first place.

That reloc section is a section in which the **code** that uses the extern
is located. It's always going to be a program or sub-program section, so
it's not that useful.

With section names for externs, we can use them as a namespace of
sorts, specifying whether the symbol has to come from the kernel itself,
from some named BPF object (where the name comes from the section name as well), or
from libbpf itself. This can be specified if the symbol name is ambiguous
and defined in multiple places, or always, just for explicitness.

>
> > I think reliable 'extern' has to have more than just
> > name. 'extern int foo;' can be a reference to 'int foo;' in another BPF ELF
> > file, or it can be a reference to 'int foo;' in already loaded BPF prog, or it
> > can be a reference to 'int foo;' inside the kernel itself, or it can be a
> > reference to pseudo variable that libbpf should replace. For example 'extern
> > int kernel_version;' or 'extern int CONFIG_HZ;' would be useful extern-like
> > variables that program might want to use. Disambiguating by name is probably
> > not enough. We can define an order of resolution. libbpf will search in other
> > .o first, then will search in loaded bpf progs, than in kernel, and if all
> > fails than will resolve things like 'extern int CONFIG_HZ' on its own. It feels
> > fragile though.
> It sounds perfectly reasonable and not fragile to me.  The main alternative
>  I see, about equally good, is to not allow defining symbols that are already
>  (non-weakly) defined; so if a bpf prog tries to globally declare "int CONFIG_HZ"
>  or "int netif_receive_skb(struct sk_buff *skb)" then it gets rejected.
>
> > I think we need to be able to specify something like section to
> > extern variables and functions.
> It seems unnecessary to have the user code specify this.  Another a bad
>  analogy: in userland C code you don't have to annotate the function protos in
>  your header files to say whether they come from another .o file, a random
>  library or the libc.  You just declare "a function called this exists somewhere
>  and we'll find it at link time".

With userspace linking, you control which libraries you are trying to
resolve against. Here it might be against any loaded BPF object in a
system. One way around this would be to specify object names to check
(or kernel) programmatically to libbpf on open/load, but it would be
good to be able to do that declaratively as well, IMO.

>
> > I was imagining that the verifier will do per-function verification
> > of program with sub-programs instead of analyzing from root.
> Ah I see.  Yes, that's a very attractive design.
>
> If we make it from a sufficiently generic idea of pre/postconditions, then it
>  could also be useful for e.g. loop bodies (user-supplied annotations that allow
>  us to walk the body only once instead of N times); then a function call just
>  gets standard pre/postconditions generated from its argument types if the user
>  didn't specify something else.
>
> That would then also support things like:
> > The next step is to extend this thought process to integers.
> > int foo(struct xdp_md *arg1, int arg2);
> > The verifier can check that the program is valid for any valid arg1 and
> > arg2 = mark_reg_unbounded().
> ... this but arg2 isn't unbounded.
> However, it might be difficult to do this without exposing details of the
>  verifier into the ABI.  Still, if we can it sounds like it would make John
>  quite happy too.  And of course it doesn't need to have the user annotations
>  from the beginning, it can start out as just the kernel generating pre/post
>  conditions internally.
>
> -Ed
Toke Høiland-Jørgensen Nov. 14, 2019, 3:41 p.m. UTC | #38
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

[...]

> Back to your question of how fw2 will get loaded.. I'm thinking the following:
> 1. Static linking:
>   obj = bpf_object__open("rootlet.o", "fw1.o", "fw2.o");
>   // libbpf adjusts call offsets and links into single loadable bpf_object
>   bpf_object__load(obj);
>   bpf_set_link_xdp_fd()
> No kernel changes are necessary to support program chaining via static linking.
>
> 2. Dynamic linking:
>   // assuming libxdp.so manages eth0
>   rootlet_fd = get_xdp_fd(eth0);
>   subprog_btf_id = libbpf_find_prog_btf_id("name_of_placeholder", roolet_fd);
>   //                  ^ this function is in patch 16/18 of trampoline
>   attr.attach_prog_fd = roolet_fd;
>   attr.attach_btf_id = subprog_btf_id;
>   // pair (prog_fd, btf_id) needs to be specified at load time
>   obj = bpf_object__open("fw2.o", attr);
>   bpf_object__load(obj);
>   prog = bpf_object__find_program_by_title(obj);
>   link = bpf_program__replace(prog); // similar to bpf_program__attach_trace()
>   // no extra arguments during 'replace'.
>   // Target (prog_fd, btf_id) already known to the kernel and verified

OK, this makes sense.

>> So the two component programs would still exist as kernel objects,
>> right? 
>
> yes. Both fw1.o and fw2.o will be loaded and running instead of placeholders.
>
>> And the trampolines would keep individual stats for each one (if
>> BPF stats are enabled)? 
>
> In case of dynamic linking both fw1.o and fw2.o will be seen as individual
> programs from 'bpftool p s' point of view. And both will have
> individual stats.

Right, this is important, and I think it's where my skepticism about
static linking comes from. With static linking, each XDP program will be
"reduced" to a subprog instead of a full stand-alone program. Which
means that its execution will be different depending on whether it is
just attached directly to an interface, or if it's been linked with a
rootlet before loading.

I'll admit I don't know enough about how subprograms actually work to
know if it's a *meaningful* difference, so I guess I'll go play around
with it. If nothing else, experimenting with static linking can be a way
to hash out the semantics until dynamic linking lands.

>> Could userspace also extract the prog IDs being
>> referenced by the "glue" proglet? 
>
> Not sure I follow. Both fw1.o and fw2.o will have their own prog ids.
> fw1_prog->aux->linked_prog == rootlet_prog
> fw2_prog->aux->linked_prog == rootlet_prog
> Unloading and detaching fw1.o will make kernel to switch back to placeholder
> subprog in roolet_prog. I believe roolet_prog should not keep a list of progs
> that attached to it (or replaced its subprogs) to avoid circular
> dependency.

Well I did mean the link in the other direction. But thinking about it
some more, I don't think it really matters. The important bit is that
userspace can answer the question "given that rootlet ID X is currently
attached on eth0, which two program IDs Y and Z will actually run on
that interface?". And if there's a link in the other direction, it could
just iterate over all loaded programs in the kernel to find them, so
that is OK; as long as we can also tell in which "slot" in the rootlet a
given program is currently attached.

> Due to that detaching roolet_prog from netdev will stop the flow of
> packets into fw1.o, but refcnt of rootlet_prog will not go to zero, so
> it will stay in memory until both fw1.o and fw2.o detach from
> rootlet.o.

OK, that is probably fine. I think we should teach most utilities to
deal with this anyway; in particular, iproute2 should know about
multi-progs (i.e., link against libxdp).

>> What about attaching a third program? Would that work by recursion (as
>> above, but with the old proglet as old_fd), or should the library build
>> a whole new sequence from the component programs?
>
> This choice is up to libxdp.so. It can have a number of placeholders
> ready to be replaced by new progs. Or it can re-generate rootlet.o
> every time a new fwX.o comes along. Short term I would start development
> with an auto-generated rootlet.o and static linking done by libbpf,
> while the policy and rootlet are handled by libxdp.so, since this work
> doesn't depend on any kernel changes. Long term, auto-generation
> can stay in libxdp.so if it turns out to be sufficient.

Yes, as I said above, this sounds like it's at least a start.

>> Finally, what happens if someone were to try to attach a retprobe to
>> one of the component programs? Could it be possible to do that even
>> while the program is being run from proglet dispatch? That way we can still
>> debug an individual XDP program even though it's run as part of a chain.
>
> Right. The fentry/fexit tracing is orthogonal to static/dynamic linking.
> It will be available for all prog types after trampoline patches land.
> See fexit_bpf2bpf.c example in the last 18/18 patch.
> We will be able to debug an XDP program regardless of whether it's a rootlet
> or a subprogram. Doesn't matter whether linking was static or dynamic.

OK, that's great, and certainly resolved one point of skepticism :)

> With fentry/fexit we will be able to do different stats too.
> Right now bpf program stats are limited to cycles and I resisted a lot
> of pressure to add more hard coded stats. With fentry/fexit we can
> collect arbitrary counters per program. Like number of L1-cache misses
> or number of TLB misses in a given XDP prog.

Yeah, that makes a lot of sense, of course. Great!

>> Sounds reasonable. Any reason libxdp.so couldn't be part of libbpf?
>
> libxdp.so is a policy specifier while libbpf is a tool. It makes more
> sense for them to be separate. libbpf has strong api compatibility
> guarantees, while I don't think anyone knows at this point how the libxdp
> api should look, and it will take some time for it to mature.

Well, we'd want libxdp to have the same strong API guarantees,
eventually. Which would be a reason to just include it in libbpf. But
sure, I wasn't suggesting to do this from the get-go; we can start out
with something separate and decide later when/if it makes sense to
integrate. As long as libbpf can do the heavy lifting on the actual
linking that is fine with me.

-Toke
Alexei Starovoitov Nov. 15, 2019, 1:50 a.m. UTC | #39
On Tue, Nov 12, 2019 at 09:33:18PM -0800, John Fastabend wrote:
> 
> In addition to above flow something like this to load libraries first should
> also work?
> 
>    // here fw2 is a library its never attached to anything but can be
>    // used to pull functions from
>    obj = bpf_object__open("fw2.o", attr);
>    bpf_object__load(obj);
>    prog = bpf_object__find_program_by_title(obj);
>    subprog_btf_id0 = libbpf_find_obj_btf_id("name of function", obj);
>    subprog_btf_id1 = libbpf_find_obj_btf_id("name of function", obj);
> 
>    // all pairs of (prog_fd, btf_id) need to be specified at load time
>    attr.attach[0].prog_fd = fw2_fd;
>    attr.attach[0].btf_id = subprog_btf_id0;
>    attr.attach[1].prog_fd = fw2_fd;
>    attr.attach[1].btf_id = subprog_btf_id1;
>    obj = bpf_object__open("rootlet.o", attr)
>    bpf_object__load(obj)
>    prog = bpf_object__find_program_by_title(obj);
>    link = bpf_program__replace(prog);
>    // attach rootlet.o at this point with subprog_btf_id

The point I'm arguing is that these:
   attr.attach[0].prog_fd = fw2_fd;
   attr.attach[0].btf_id = subprog_btf_id0;
   attr.attach[1].prog_fd = fw2_fd;
   attr.attach[1].btf_id = subprog_btf_id1;
should not be part of libbpf api. Instead libbpf should be able to adjust
relocations inside the program. You're proposing to do linking via explicit
calls; I'm saying such linking should be declarative. libbpf should be able to
derive the intent from the program and patch calls.

Example:
helpers.o:
int foo(struct xdp_md *ctx, int var) {...}
int bar(int *array, bpf_size_t size) {...}
obj = bpf_object__open("helpers.o", attr)
bpf_object__load(obj);
// load and verify helpers. 'foo' and 'bar' are not attachable to anything.
// These two programs don't have program type.
// The kernel loaded and verified them.
main_prog.o:
int foo(struct xdp_md *ctx, int var);
int bar(int *array, bpf_size_t size);
int main_prog(struct xdp_md *ctx)
{
  int ar[5], ret;
  ret = foo(ctx, 1) + bar(ar, 5);
  return ret;
}
// 'foo' and 'bar' are extern functions from main_prog pov.
obj = bpf_object__open("main_prog.o", attr)
bpf_object__load(obj);
// libbpf finds foo/bar in the kernel and adjusts two call instructions inside
// main_prog to point to prog_fd+btf_id
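
Spelled out as C source with the usual SEC() conventions it could look
roughly like below. Treat it as a sketch: how externs get expressed is
exactly what's being designed in this thread, the header names are just what
current selftests use, and I substituted __u64 for the bpf_size_t
placeholder above.

  /* helpers.c -> helpers.o: plain functions with BTF, no program type */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  int foo(struct xdp_md *ctx, int var)
  {
      return var > 2 ? 0 : var;              /* placeholder body */
  }

  int bar(int *array, __u64 size)
  {
      return array && size ? (int)size : 0;  /* placeholder body */
  }

  /* main_prog.c -> main_prog.o: foo/bar are extern from its pov; libbpf
   * resolves them to prog_fd+btf_id (dynamic) or 'call pc+N' (static) */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  extern int foo(struct xdp_md *ctx, int var);
  extern int bar(int *array, __u64 size);

  SEC("xdp")
  int main_prog(struct xdp_md *ctx)
  {
      int ar[5] = {};
      int ret;

      ret = foo(ctx, 1) + bar(ar, 5);
      return ret;
  }

  char _license[] SEC("license") = "GPL";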

That is the second use case of dynamic linking I've been talking about earlier. The
same thing should be possible to do with static linking. Then libbpf will
adjust calls inside main_prog to be 'call pc+123' and 'foo' and 'bar' will
become traditional bpf subprograms. main_prog() has single 'struct xdp_md *'
argument. It is normal attachable XDP program.

Loading main_prog.o first and then loading helpers.o should be possible as
well. The verifier needs BTF of extern 'foo' and 'bar' symbols to be able to
verify main_prog() independently. For example to check that main_prog() is
passing correct ctx into foo(). That is the main difference vs traditional
dynamic linking. I think we all agree that we want bpf programs to be verified
independently. To do that the verifier needs to have BTF (function prototypes)
of extern symbols. One can argue that it's not necessary and helpers.o can be
loaded first. I don't think that will work in all cases. There could be many
dependencies, with helpers1.o calling another helpers2.o and so on, and then
there is no good load order in which calling an extern foo() can be avoided.

This thread is getting long :) and sounds like we're converging. I'm thinking
to combine everything we've discussed so far into dynamic/static linking doc.
Alexei Starovoitov Nov. 15, 2019, 2:13 a.m. UTC | #40
On Wed, Nov 13, 2019 at 06:30:04PM +0000, Edward Cree wrote:
> > There is also
> > no way to place extern into a section. Currently SEC("..") is a standard way to
> > annotate bpf programs.
> While the symbol itself doesn't have a section, each _use_ of the symbol has a
>  reloc, and the SHT_REL[A] in which that reloc resides has a sh_info specifying
>  "the section header index of the section to which the relocation applies."  So
>  can't that be used if symbol visibility needs to depend on section?  Tbh I
>  can't exactly see why externs need placing in a section in the first place.

I think a section for extern can give a scope of search and make libbpf decisions
more predictable? Maybe we can live without it for now, but we need BTF of
extern symbols. See my example in reply to John.

> > I think we need to be able to specify something like section to
> > extern variables and functions.
> It seems unnecessary to have the user code specify this.  Another bad
>  analogy: in userland C code you don't have to annotate the function protos in
>  your header files to say whether they come from another .o file, a random
>  library or the libc.  You just declare "a function called this exists somewhere
>  and we'll find it at link time".

yeah. good analogy.

> > I was imagining that the verifier will do per-function verification
> > of program with sub-programs instead of analyzing from root.
> Ah I see.  Yes, that's a very attractive design.
> 
> If we make it from a sufficiently generic idea of pre/postconditions, then it
>  could also be useful for e.g. loop bodies (user-supplied annotations that allow
>  us to walk the body only once instead of N times); then a function call just
>  gets standard pre/postconditions generated from its argument types if the user
>  didn't specify something else.

Regarding pre/post conditions: I think we have them already. These conditions
are the function prototypes. Instead of making the verifier figure out the
conditions it's simpler to use the function prototypes. If the program author
says that the argument to the function is 'struct xdp_md *' the verifier will
check that the function is safe when such a pointer is passed into it. Then to
verify the callsite the verifier only needs to check that what is passed into
such a function matches the type. I think it's easy to see when such a type is
a context, like 'struct __sk_buff *'. But the idea applies to a pointer to int
too. I believe you were arguing that in some cases a pre-condition like
tnum_range(0-2) instead of tnum_unknown would be useful. Maybe. I think it will
simplify the verifier logic quite a bit if we avoid going fine grained.
Say we have a function:
int foo(struct __sk_buff *skb, int arg)
{
   if (arg > 2)
      return 0;
   // do safe stuff with skb depending whether arg is 0, 1, or 2.
}
That first 'if' is enough to turn pre-conditions into 'any skb' and 'any arg'.
That is exactly what BTF says about this function.
Lorenz Bauer Nov. 15, 2019, 11:48 a.m. UTC | #41
On Tue, 12 Nov 2019 at 02:51, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> This is static linking. The existing kernel infrastructure already supports
> such model and I think it's enough for a lot of use cases. In particular fb's
> firewall+katran XDP style will fit right in. But bpf_tail_calls are
> incompatible with bpf2bpf calls that static linking will use and I think
> cloudflare folks expressed the interest to use them for some reason even within
> single firewall ? so we need to improve the model a bit.

We have several components that we'd like to keep (logically) separate. At a high
level, our rootlet would look like this:

  sample_packets(ctx);
  if (ddos_mitigate(ctx) != XDP_PASS) {
     return XDP_DROP;
  }
  return l4lb(ctx);

I think we could statically link ddos_mitigate() together from multiple
separate .o. It depends on how complicated our rules become. Maybe we'd use
dynamic linking, to reduce the overhead of re-verification.

The rootlet would use dynamic linking, to be able to debug / inspect sampling,
ddos mitigation and the l4lb separately. Combined with the ability to hook
arbitrary BPF programs at entry / exit we could probably get rid of our
tail_call use. I don't think you have to change the model for us to fit into it.
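
To make that concrete, the rootlet above as a stand-alone XDP object could
look something like this. All names are ours, and the extern subprogram
calls obviously depend on the linking support discussed in this thread:

  /* rootlet.c: dispatch only; the three callees are placeholders to be
   * statically or dynamically linked in later */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  extern int sample_packets(struct xdp_md *ctx);
  extern int ddos_mitigate(struct xdp_md *ctx);
  extern int l4lb(struct xdp_md *ctx);

  SEC("xdp")
  int rootlet(struct xdp_md *ctx)
  {
      sample_packets(ctx);
      if (ddos_mitigate(ctx) != XDP_PASS)
          return XDP_DROP;
      return l4lb(ctx);
  }

  char _license[] SEC("license") = "GPL";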

> We can introduce dynamic linking. The second part of 'BPF trampoline' patches
> allows tracing programs to attach to other BPF programs. The idea of dynamic
> linking is to replace a program or subprogram instead of attaching to it.

Reading the rest of the thread, I'm on board with type 2 of dynamic linking
(load-time linking?). However, type 1 (run-time linking) I'm not so sure about.
Specifically, the callee holding onto the caller instead of vice versa.

Picking up your rootlet and fw1 example: fw1 holds the refcount on rootlet.
This means user space needs to hold the refcount on fw1 to make sure
the override is kept. This in turn means either: hold on to the file descriptor
or pin the program into a bpffs. The former implies a persistent process,
which doesn't work for tc. The latter makes lifetime management of fw1 hard:
there is no way to have the kernel automatically deallocate it when it is no
longer needed, aka when the rootlet refcount reaches zero. It also overloads close()
to automatically detach the replaced / overridden BPF, which is contrary to how
other BPF hook points work.

I'd much prefer if the API didn't require attach_prog_fd and id at load time,
and rather had an explicit replace_sub_prog(prog_fd, btf_id, sub_prog_fd).

> [...] This rootlet.o
> can be automatically generated by libxdp.so. If in the future we figure out how
> to do two load-balancers libxdp.so will be able to accommodate that new policy.
> This firewall1.o can be developed and tested independently of other xdp
> programs. The key gotcha here is that the verifier needs to allow more than 512
> stack usage for the rootlet.o. I think that's acceptable.

How would the verifier know which programs are allowed to have larger stacks?

Lorenz
John Fastabend Nov. 15, 2019, 4:39 p.m. UTC | #42
Alexei Starovoitov wrote:
> On Tue, Nov 12, 2019 at 09:33:18PM -0800, John Fastabend wrote:
> > 
> > In addition to above flow something like this to load libraries first should
> > also work?
> > 
> >    // here fw2 is a library its never attached to anything but can be
> >    // used to pull functions from
> >    obj = bpf_object__open("fw2.o", attr);
> >    bpf_object__load(obj);
> >    prog = bpf_object__find_program_by_title(obj);
> >    subprog_btf_id0 = libbpf_find_obj_btf_id("name of function", obj);
> >    subprog_btf_id1 = libbpf_find_obj_btf_id("name of function", obj);
> > 
> >    // all pairs of (prog_fd, btf_id) need to be specified at load time
> >    attr.attach[0].prog_fd = fw2_fd;
> >    attr.attach[0].btf_id = subprog_btf_id0;
> >    attr.attach[1].prog_fd = fw2_fd;
> >    attr.attach[1].btf_id = subprog_btf_id1;
> >    obj = bpf_object__open("rootlet.o", attr)
> >    bpf_object__load(obj)
> >    prog = bpf_object__find_program_by_title(obj);
> >    link = bpf_program__replace(prog);
> >    // attach rootlet.o at this point with subprog_btf_id
> 
> The point I'm arguing that these:
>    attr.attach[0].prog_fd = fw2_fd;
>    attr.attach[0].btf_id = subprog_btf_id0;
>    attr.attach[1].prog_fd = fw2_fd;
>    attr.attach[1].btf_id = subprog_btf_id1;
> should not be part of libbpf api. Instead libbpf should be able to adjust
> relocations inside the program. You're proposing to do linking via explicit
> calls, I'm saying such linking should be declarative. libbpf should be able to
> derive the intent from the program and patch calls.

Agree, seems cleaner. So when libs are loaded we create an in-kernel table
of functions/symbols, and when the main program is loaded we search for the funcs.

> 
> Example:
> helpers.o:
> int foo(struct xdp_md *ctx, int var) {...}
> int bar(int *array, bpf_size_t size) {...}
> obj = bpf_object__open("helpers.o", attr)
> bpf_object__load(obj);
> // load and verify helpers. 'foo' and 'bar' are not attachable to anything.
> // These two programs don't have program type.
> // The kernel loaded and verified them.
> main_prog.o:
> int foo(struct xdp_md *ctx, int var);
> int bar(int *array, bpf_size_t size);
> int main_prog(struct xdp_md *ctx) 
> { 
>   int ar[5], ret;
>   ret = foo(ctx, 1) + bar(ar, 5);
>   return ret;
> }
> // 'foo' and 'bar' are extern functions from main_prog pov.
> obj = bpf_object__open("main_prog.o", attr)
> bpf_object__load(obj);
> // libbpf finds foo/bar in the kernel and adjusts two call instructions inside
> // main_prog to point to prog_fd+btf_id
> 
> That is the second use case of dynamic linking I've been talking earlier. The

This solves my use case.

> same thing should be possible to do with static linking. Then libbpf will
> adjust calls inside main_prog to be 'call pc+123' and 'foo' and 'bar' will
> become traditional bpf subprograms. main_prog() has single 'struct xdp_md *'
> argument. It is normal attachable XDP program.
> 
> Loading main_prog.o first and then loading helpers.o should be possible as
> well. The verifier needs BTF of extern 'foo' and 'bar' symbols to be able to
> verify main_prog() independently. For example to check that main_prog() is
> passing correct ctx into foo(). That is the main difference vs traditional
> dynamic linking. I think we all agree that we want bpf programs to be verified
> independently. To do that the verifier needs to have BTF (function prototypes)
> of extern symbols. One can argue that it's not necessary and helpers.o can be
> loaded first. I don't think that will work in all cases. There could be many
> dependencies between helpers1.o calling another helpers2.o and so on and there
> will be no good order where calling extern foo() can be avoided.

Agree, it's a bit unclear what we would do with a __weak symbol if a helper.o
is loaded after main.o with that symbol. I think you keep using the __weak
symbol instead of automatically reloading it, so you would need some explicit
logic in libbpf to make that happen if the user wants it.

> 
> This thread is getting long :) and sounds like we're converging. I'm thinking
> to combine everything we've discussed so far into dynamic/static linking doc.
> 

Yep, seems we are close now. FWIW I just backed into a case where at least
static linking is going to be needed, so this moves it up on my list because
I'll be using it as soon as it's available.
John Fastabend Nov. 15, 2019, 4:56 p.m. UTC | #43
Alexei Starovoitov wrote:
> On Wed, Nov 13, 2019 at 06:30:04PM +0000, Edward Cree wrote:
> > > There is also
> > > no way to place extern into a section. Currently SEC("..") is a standard way to
> > > annotate bpf programs.
> > While the symbol itself doesn't have a section, each _use_ of the symbol has a
> >  reloc, and the SHT_REL[A] in which that reloc resides has a sh_info specifying
> >  "the section header index of the section to which the relocation applies."  So
> >  can't that be used if symbol visibility needs to depend on section?  Tbh I
> >  can't exactly see why externs need placing in a section in the first place.
> 
> I think section for extern can give a scope of search and make libbpf decisions
> more predictable? Maybe we can live without it for now, but we need BTF of
> extern symbols. See my example in reply to John.
> 
> > > I think we need to be able to specify something like section to
> > > extern variables and functions.
> > It seems unnecessary to have the user code specify this.  Another bad
> >  analogy: in userland C code you don't have to annotate the function protos in
> >  your header files to say whether they come from another .o file, a random
> >  library or the libc.  You just declare "a function called this exists somewhere
> >  and we'll find it at link time".
> 
> yeah. good analogy.
> 
> > > I was imagining that the verifier will do per-function verification
> > > of program with sub-programs instead of analyzing from root.
> > Ah I see.  Yes, that's a very attractive design.
> > 
> > If we make it from a sufficiently generic idea of pre/postconditions, then it
> >  could also be useful for e.g. loop bodies (user-supplied annotations that allow
> >  us to walk the body only once instead of N times); then a function call just
> >  gets standard pre/postconditions generated from its argument types if the user
> >  didn't specify something else.
> 
> regarding pre/post conditions.
> I think we have them already. These conditions are the function prototypes.
> Instead of making the verifier figuring the conditions it's simpler to use
> function prototypes instead. If program author is saying that argument to the
> function is 'struct xdp_md *' the verifier will check that the function is safe
> when such pointer is passed into it. Then to verify the callsite the verifier
> only need to check that what is passed into such function matches the type. I
> think it's easy to see when such type is context. Like 'struct __sk_buff *'.
> But the idea applies to pointer to int too. I believe you were arguing that
> instead of tnum_unknown there could be cases with tnum_range(0-2) as
> pre-condition is useful. Maybe. I think it will simplify the verifier logic
> quite a bit if we avoid going fine grain.

I was thinking we may want tnum_range(0-2) conditions and also min/max lengths
of arrays, buffers etc.; otherwise it might be tricky to convince the verifier
it's safe to write into an array. I at least am already hitting some limits here
with helper calls that I would like to resolve at some point. We had to use
some inline 'asm goto' logic to convince clang to generate code that the
verifier would accept. Perhaps some of this can be resolved on the LLVM backend
side by inserting checks as needed. Also, adding compiler barriers and if/else
bounds everywhere is a bit unnatural. I expect post conditions will
be important for the same reason; returning a pointer into a buffer without
bounds is going to make it difficult to use in the caller program.

But(!) let's walk first: implementing BTF based conditions and linking is
a huge step and doesn't preclude doing more advanced things next.
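
For concreteness, the kind of dance I mean looks roughly like this; names
are made up, and the empty asm is just an optimization barrier so clang
keeps the bounded value where the verifier can see it (not our exact code):

  #include <linux/types.h>

  #define BUF_LEN 64

  static int write_one(char *buf, __u32 idx)
  {
      if (idx >= BUF_LEN)
          return -1;
      /* barrier: stop clang from transforming the check/access into
       * something the verifier can't follow */
      asm volatile("" : "+r"(idx));
      buf[idx] = 0;
      return 0;
  }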

> Say we have a function:
> int foo(struct __sk_buff *skb, int arg)
> {
>    if (arg > 2)
>       return 0;
>    // do safe stuff with skb depending whether arg is 0, 1, or 2.
> }
> That first 'if' is enough to turn pre-conditions into 'any skb' and 'any arg'.
> That is exactly what BTF says about this function.
> 

But this would probably break,

 char *foo(struct __sk_buff *skb, char *buffer)
 {
      int i;

      for (i = 0; i < BUFFER_LENGTH; i++) {
             buffer[i] = blah;
      }
      return &buffer[i];
 }

Let's deal with that later though.
Alexei Starovoitov Nov. 15, 2019, 11:02 p.m. UTC | #44
On Fri, Nov 15, 2019 at 11:48:42AM +0000, Lorenz Bauer wrote:
> On Tue, 12 Nov 2019 at 02:51, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > This is static linking. The existing kernel infrastructure already supports
> > such model and I think it's enough for a lot of use cases. In particular fb's
> > firewall+katran XDP style will fit right in. But bpf_tail_calls are
> > incompatible with bpf2bpf calls that static linking will use and I think
> > cloudflare folks expressed the interest to use them for some reason even within
> > single firewall ? so we need to improve the model a bit.
> 
> We have several components that we'd like to keep (logically) separate. At a high
> level, our rootlet would look like this:
> 
>   sample_packets(ctx);
>   if (ddos_mitigate(ctx) != XDP_PASS) {
>      return XDP_DROP;
>   }
>   return l4lb(ctx);
> 
> I think we could statically link ddos_mitigate() together from
> multiple separate .o.
> It depends on how complicated our rules become. Maybe we'd use dynamic linking,
> to reduce the overhead of re-verification.
> 
> The rootlet would use dynamic linking, to be able to debug / inspect
> sampling, ddos
> mitigation and the l4lb separately. Combined with the ability to hook
> arbitrary BPF
> programs at entry / exit we could probably get rid of our tail_call
> use. I don't think
> you have to change the model for us to fit into it.
> 
> > We can introduce dynamic linking. The second part of 'BPF trampoline' patches
> > allows tracing programs to attach to other BPF programs. The idea of dynamic
> > linking is to replace a program or subprogram instead of attaching to it.
> 
> Reading the rest of the thread, I'm on board with type 2 of dynamic linking
> (load time linking?) However, type 1 (run time linking) I'm not so sure about.
> Specifically, the callee holding onto the caller instead of vice versa.
> 
> Picking up your rootlet and fw1 example: fw1 holds the refcount on rootlet.
> This means user space needs to hold the refcount on fw1 to make sure
> the override is kept. This in turn means either: hold on to the file descriptor
> or pin the program into a bpffs. The former implies a persistent process,
> which doesn't work for tc. 

Pinning the program alone is not enough.
The folks have requested a feature to pin raw_tp_fd into bpffs.
I'm very much in favor of that. imo all other 'link fd' should be pinnable.
Then auto-detach won't happen on close() of such 'link fd'.

> The latter makes  lifetime management of fw1 hard:
> there is no way to have the kernel automatically deallocate it when it no longer
> needed, aka when the rootlet refcount reaches zero. 

hmm. Not sure I follow. See below.

> It also overloads close()
> to automatically detach the replaced / overridden BPF, which is contrary to how
> other BPF hook points work.

imo all bpf attach api-s that are not FD-based are fragile and error prone
operationally. We've seen people adding a ton of TC ingress progs because of
bugs. Then there were issues with the rootlet being removed due to bugs. The issues
with overriding wrong entries in prog_array. When multiple teams are working on
common infra, having globally visible and unprotected state is dangerous. imo
XDP and TC have to move to FD based api. When application holds the 'link fd'
that attached program will not be detached by anything else accidentally.
The operation 'attach rootlet XDP prog to netdev eth0' should return FD.
While that FD is held by application (or pinned in bpffs) nothing should be
able to override XDP prog on that eth0. We don't have such api yet, but I think
it's necessary. Same thing with replacing rootlet's placeholder subprogram with
fw1. When fw1's application links fw1 prog into rootlet nothing should be able
to break that attachment. But if fw1 application crashes that fw1 prog will be
auto-detached from rootlet. The admin can ssh into the box and kill fw1. The
packets will flow into rootlet and will flow into dummy placeholder. No
cleanups to worry about.

In case of fentry/fexit that 'link fd' protects the connection between
tracer prog and tracee prog. Whether tracer has a link to tracee or vice
versa doesn't matter. That's kernel internal implementation. The
uni-directional link avoids circular issues.

> I'd much prefer if the API didn't require attach_prog_fd and id at
> load time, and
> rather have an explicit replace_sub_prog(prog_fd, btf_id, sub_prog_fd).

The verifier has to see the target prog and its full BTF at load time. The
fentry prog needs target prog's BTF. XDP replacement prog needs target prog's
BTF too. So prog_fd+btf_id has to be passed at load time. I think
tgt_prog->refcnt++ should be done at load time too. The ugly alternative would
be to do tgt_prog->refcnt++ before verification. Then after verification
duplicate tgt_prog BTF, store it somewhere, and do tgt_prog->refcnt--. Then
later during attach/replace compare the saved BTF with tgt_prog's BTF. That's imo a
ton of unnecessary work for the kernel.

> > [...] This rootlet.o
> > can be automatically generated by libxdp.so. If in the future we figure out how
> > to do two load-balancers libxdp.so will be able to accommodate that new policy.
> > This firewall1.o can be developed and tested independently of other xdp
> > programs. The key gotcha here is that the verifier needs to allow more than 512
> > stack usage for the rootlet.o. I think that's acceptable.
> 
> How would the verifier know which programs are allowed to have larger stacks?

Excellent question. I was thinking that every individually verified unit will
be under the existing 512 limitation. The composition needs to check for cycles in
the attach graph and for maximum stack. The maximum stack will have some limit.
That's minor. The more interesting problem to solve is how to check for cycles.
Lorenz Bauer Nov. 18, 2019, 1:29 p.m. UTC | #45
On Fri, 15 Nov 2019 at 23:02, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> imo all bpf attach api-s that are not FD-based are fragile and error prone
> operationally. We've seen people adding a ton of TC ingress progs because of
> bugs. Then there were issues with rootlet being removed due to bugs. The issues
> with overriding wrong entries in prog_array. When multiple teams working on
> common infra having globally visible and unprotected state is dangerous. imo
> XDP and TC have to move to FD based api. When application holds the 'link fd'
> that attached program will not be detached by anything else accidentally.
> The operation 'attach rootlet XDP prog to netdev eth0' should return FD.
> While that FD is held by application (or pinned in bpffs) nothing should be
> able to override XDP prog on that eth0. We don't have such api yet, but I think
> it's necessary.

Ok, I wasn't aware you're planning this. Having a separate fd for the link
resolves my concerns, since now the lifetime of the program and the link are
independent.

Re: the rootlet example, the API would be load (with attach_prog_fd) followed by
override / attach (without arguments?) which then returns a "link fd"?
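
I.e. roughly something like this, in the same pseudo-code style as earlier
in the thread. Everything here is hypothetical API: bpf_program__replace()
is from the trampoline series under review, pinning the returned link is the
behaviour proposed above, and attr, rootlet_fd and placeholder_btf_id are as
in your earlier example.

  struct bpf_object *obj;
  struct bpf_program *prog;
  struct bpf_link *link;

  attr.attach_prog_fd = rootlet_fd;            /* target rootlet */
  attr.attach_btf_id  = placeholder_btf_id;    /* placeholder to replace */
  obj = bpf_object__open("fw1.o", attr);       /* pseudo, as earlier */
  bpf_object__load(obj);
  prog = bpf_object__find_program_by_title(obj, "xdp/fw1");

  link = bpf_program__replace(prog);           /* this is the "link fd" */
  bpf_link__pin(link, "/sys/fs/bpf/fw1_link"); /* optional: outlive the process */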

> Same thing with replacing rootlet's placeholder subprogram with
> fw1. When fw1's application links fw1 prog into rootlet nothing should be able
> to break that attachment. But if fw1 application crashes that fw1 prog will be
> auto-detached from rootlet. The admin can ssh into the box and kill fw1. The
> packets will flow into rootlet and will flow into dummy placeholder. No
> cleanups to worry about.

Nice!

> > I'd much prefer if the API didn't require attach_prog_fd and id at
> > load time, and
> > rather have an explicit replace_sub_prog(prog_fd, btf_id, sub_prog_fd).
>
> The verifier has to see the target prog and its full BTF at load time. The
> fentry prog needs target prog's BTF. XDP replacement prog needs target prog's
> BTF too. So prog_fd+btf_id has to be passed at load time. I think
> tgt_prog->refcnt++ should be done at load time too. The ugly alternative would
> be to do tgt_prog->refcnt++ before verification. Then after verification
> duplicate tgt_prog BTF, store it somewhere, and do tgt_prog->refcnt--. Then
> later during attach/replace compare saved BTF with tgt_prog's BTF. That's imo a
> ton of unnecessary work for the kernel.

I've not looked at the fentry patch set, so I don't understand the technical
reasons why having prog_fd at load time is necessary; I'm just not a fan of the
implied UAPI. I'll take a look, hopefully I'll understand the trade-off better
afterwards.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b9d22338606..13e5f38cf5c6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -365,6 +365,8 @@  struct bpf_prog_stats {
 	struct u64_stats_sync syncp;
 };
 
+#define BPF_NUM_CHAIN_SLOTS 8
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -383,6 +385,7 @@  struct bpf_prog_aux {
 	struct list_head ksym_lnode;
 	const struct bpf_prog_ops *ops;
 	struct bpf_map **used_maps;
+	struct bpf_prog *chain_progs[BPF_NUM_CHAIN_SLOTS];
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2ce57645f3cd..3d1e4991e61d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -21,6 +21,7 @@ 
 #include <linux/kallsyms.h>
 #include <linux/if_vlan.h>
 #include <linux/vmalloc.h>
+#include <linux/nospec.h>
 
 #include <net/sch_generic.h>
 
@@ -528,6 +529,7 @@  struct bpf_prog {
 				is_func:1,	/* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
 				has_callchain_buf:1, /* callchain buffer allocated? */
+				chain_calls:1, /* should this use the chain_call wrapper */
 				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
@@ -551,6 +553,30 @@  struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
+#define BPF_MAX_CHAIN_CALLS 32
+static __always_inline unsigned int do_chain_calls(const struct bpf_prog *prog,
+						   const void *ctx)
+{
+	int i = BPF_MAX_CHAIN_CALLS;
+	int idx;
+	u32 ret;
+
+	do {
+		ret = (*(prog)->bpf_func)(ctx, prog->insnsi);
+
+		if (ret + 1 >= BPF_NUM_CHAIN_SLOTS) {
+			prog = prog->aux->chain_progs[0];
+			continue;
+		}
+		idx = ret + 1;
+		idx = array_index_nospec(idx, BPF_NUM_CHAIN_SLOTS);
+
+		prog = prog->aux->chain_progs[idx] ?: prog->aux->chain_progs[0];
+	} while (prog && --i);
+
+	return ret;
+}
+
 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
 #define BPF_PROG_RUN(prog, ctx)	({				\
@@ -559,14 +585,18 @@  DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
 		struct bpf_prog_stats *stats;			\
 		u64 start = sched_clock();			\
-		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+		ret = prog->chain_calls ?			\
+			do_chain_calls(prog, ctx) :			\
+			 (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
 		stats = this_cpu_ptr(prog->aux->stats);		\
 		u64_stats_update_begin(&stats->syncp);		\
 		stats->cnt++;					\
 		stats->nsecs += sched_clock() - start;		\
 		u64_stats_update_end(&stats->syncp);		\
 	} else {						\
-		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+		ret = prog->chain_calls ?				\
+			do_chain_calls(prog, ctx) :			\
+			 (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
 	}							\
 	ret; })
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77c6be96d676..1ce80a227be3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -288,6 +288,12 @@  enum bpf_attach_type {
 /* The verifier internal test flag. Behavior is undefined */
 #define BPF_F_TEST_STATE_FREQ	(1U << 3)
 
+/* Whether to enable chain call logic at program execution. If set, the program
+ * execution logic will check for and jump to chain call programs configured
+ * with the BPF_PROG_CHAIN_* commands to the bpf syscall.
+ */
+#define BPF_F_CHAIN_CALLS	(1U << 4)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * two extensions:
  *
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 66088a9e9b9e..5dfe3585bc5d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -254,6 +254,12 @@  struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 void __bpf_prog_free(struct bpf_prog *fp)
 {
 	if (fp->aux) {
+		int i;
+
+		for (i = 0; i < BPF_NUM_CHAIN_SLOTS; i++)
+			if (fp->aux->chain_progs[i])
+				bpf_prog_put(fp->aux->chain_progs[i]);
+
 		free_percpu(fp->aux->stats);
 		kfree(fp->aux);
 	}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 82eabd4e38ad..b8a203a05881 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1630,7 +1630,8 @@  static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
 				 BPF_F_ANY_ALIGNMENT |
 				 BPF_F_TEST_STATE_FREQ |
-				 BPF_F_TEST_RND_HI32))
+				 BPF_F_TEST_RND_HI32 |
+				 BPF_F_CHAIN_CALLS))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -1665,6 +1666,7 @@  static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 		return -ENOMEM;
 
 	prog->expected_attach_type = attr->expected_attach_type;
+	prog->chain_calls = !!(attr->prog_flags & BPF_F_CHAIN_CALLS);
 
 	prog->aux->offload_requested = !!attr->prog_ifindex;