
[RFC,bpf-next,05/16] bpf: create file or anonymous dumpers

Message ID 20200408232526.2675664-1-yhs@fb.com
State RFC
Delegated to: BPF Maintainers
Series bpf: implement bpf based dumping of kernel data structures

Commit Message

Yonghong Song April 8, 2020, 11:25 p.m. UTC
Given a loaded dumper bpf program, which already
knows which target it should bind to, there are
two ways to create a dumper:
  - a file based dumper under the hierarchy of
    /sys/kernel/bpfdump/, which users can
    "cat" to print out the output.
  - an anonymous dumper from which a user application
    can "read" the dumping output.

For a file based dumper, the BPF_OBJ_PIN syscall interface
is used. For an anonymous dumper, the BPF_PROG_ATTACH
syscall interface is used.
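
For illustration, a rough user-space sketch of the file based creation
path, using only the attr fields visible in this patch's uapi diff (it
needs the patched headers; prog_fd and the helper name are made up for
the example, and the anonymous BPF_PROG_ATTACH path is not shown since
its attr layout does not appear in this excerpt):

#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/bpf.h>

/* hypothetical helper, for illustration only: pin an already loaded
 * dumper program as a cat-able file under /sys/kernel/bpfdump/<target>/
 */
static int create_file_dumper(int prog_fd, const char *name)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.bpf_fd = prog_fd;		/* loaded dumper program */
	attr.dumper_name = (uint64_t)(unsigned long)name;	/* a name, not a full path */
	attr.file_flags = BPF_F_DUMP;	/* new flag added by this patch */

	return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}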

To make it easy for the target seq_ops->show() to get the
bpf program, dumper creation increases the
target-provided seq_file private data size
so that the bpf program pointer is also stored in the seq_file
private data.

Further, a seq_num, which represents how many times
bpf_dump_get_prog() has been called, is also
available to the target seq_ops->show().
Such information can be used to, e.g., print a
banner before printing out the actual data.

Note that the seq_num does not represent the number
of unique kernel objects the bpf program has
seen, but it should be a good approximation.
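
For illustration, a dumper program could use seq_num roughly as below
(the program context and the bpf_seq_printf() helper are introduced
elsewhere in this series; the argument layout here is only indicative):

	/* illustrative sketch: emit a banner on the first ->show() of a session */
	if (seq_num == 0)
		bpf_seq_printf(seq, banner_fmt, sizeof(banner_fmt), NULL, 0);

	/* then emit one line for the current object */
	bpf_seq_printf(seq, row_fmt, sizeof(row_fmt), &row, sizeof(row));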

A target feature, BPF_DUMP_SEQ_NET_PRIVATE,
is implemented; it is specifically useful for
net based dumpers. It sets the net namespace
to the current process's net namespace.
This avoids changing existing net seq_ops
in order to retrieve the net namespace from
the seq_file pointer.

For open dumper files, anonymous or not, the
fdinfo will show the target and prog_id associated
with that file descriptor. For the dumper file itself,
a kernel interface to retrieve the prog_id will be
provided in one of the later patches.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |   5 +
 include/uapi/linux/bpf.h       |   6 +-
 kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           |  11 +-
 tools/include/uapi/linux/bpf.h |   6 +-
 5 files changed, 362 insertions(+), 4 deletions(-)

Comments

Alexei Starovoitov April 10, 2020, 3 a.m. UTC | #1
On Wed, Apr 08, 2020 at 04:25:26PM -0700, Yonghong Song wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 0f1cbed446c1..b51d56fc77f9 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -354,6 +354,7 @@ enum {
>  /* Flags for accessing BPF object from syscall side. */
>  	BPF_F_RDONLY		= (1U << 3),
>  	BPF_F_WRONLY		= (1U << 4),
> +	BPF_F_DUMP		= (1U << 5),
...
>  static int bpf_obj_pin(const union bpf_attr *attr)
>  {
> -	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
> +	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_DUMP)
>  		return -EINVAL;
>  
> +	if (attr->file_flags == BPF_F_DUMP)
> +		return bpf_dump_create(attr->bpf_fd,
> +				       u64_to_user_ptr(attr->dumper_name));
> +
>  	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
>  }

I think the kernel can be a bit smarter here. There is no need for user space
to pass the BPF_F_DUMP flag to the kernel just to differentiate the pinning.
Can the prog attach type be used instead?
Yonghong Song April 10, 2020, 6:09 a.m. UTC | #2
On 4/9/20 8:00 PM, Alexei Starovoitov wrote:
> On Wed, Apr 08, 2020 at 04:25:26PM -0700, Yonghong Song wrote:
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 0f1cbed446c1..b51d56fc77f9 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -354,6 +354,7 @@ enum {
>>   /* Flags for accessing BPF object from syscall side. */
>>   	BPF_F_RDONLY		= (1U << 3),
>>   	BPF_F_WRONLY		= (1U << 4),
>> +	BPF_F_DUMP		= (1U << 5),
> ...
>>   static int bpf_obj_pin(const union bpf_attr *attr)
>>   {
>> -	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
>> +	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_DUMP)
>>   		return -EINVAL;
>>   
>> +	if (attr->file_flags == BPF_F_DUMP)
>> +		return bpf_dump_create(attr->bpf_fd,
>> +				       u64_to_user_ptr(attr->dumper_name));
>> +
>>   	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
>>   }
> 
> I think kernel can be a bit smarter here. There is no need for user space
> to pass BPF_F_DUMP flag to kernel just to differentiate the pinning.
> Can prog attach type be used instead?

Good point. Yes, no need for BPF_F_DUMP, expected_attach_type
is enough. Will change.
Yonghong Song April 10, 2020, 10:42 p.m. UTC | #3
On 4/9/20 8:00 PM, Alexei Starovoitov wrote:
> On Wed, Apr 08, 2020 at 04:25:26PM -0700, Yonghong Song wrote:
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 0f1cbed446c1..b51d56fc77f9 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -354,6 +354,7 @@ enum {
>>   /* Flags for accessing BPF object from syscall side. */
>>   	BPF_F_RDONLY		= (1U << 3),
>>   	BPF_F_WRONLY		= (1U << 4),
>> +	BPF_F_DUMP		= (1U << 5),
> ...
>>   static int bpf_obj_pin(const union bpf_attr *attr)
>>   {
>> -	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
>> +	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_DUMP)
>>   		return -EINVAL;
>>   
>> +	if (attr->file_flags == BPF_F_DUMP)
>> +		return bpf_dump_create(attr->bpf_fd,
>> +				       u64_to_user_ptr(attr->dumper_name));
>> +
>>   	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
>>   }
> 
> I think kernel can be a bit smarter here. There is no need for user space
> to pass BPF_F_DUMP flag to kernel just to differentiate the pinning.
> Can prog attach type be used instead?

Thinking about it again, I think a flag is still useful.
Suppose that we have the following scenario:
   - the current directory /sys/fs/bpf/
   - user says pin a tracing/dump (target task) prog to "p1"

It is not really clear whether the user wants to pin to
    /sys/fs/bpf/p1
or user wants to pin to
    /sys/kernel/bpfdump/task/p1

unless we say that a tracing/dump program cannot be pinned
to /sys/fs/bpf, which seems an unnecessary restriction.

What do you think?
Andrii Nakryiko April 10, 2020, 10:51 p.m. UTC | #4
On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Given a loaded dumper bpf program, which already
> knows which target it should bind to, there
> two ways to create a dumper:
>   - a file based dumper under hierarchy of
>     /sys/kernel/bpfdump/ which uses can
>     "cat" to print out the output.
>   - an anonymous dumper which user application
>     can "read" the dumping output.
>
> For file based dumper, BPF_OBJ_PIN syscall interface
> is used. For anonymous dumper, BPF_PROG_ATTACH
> syscall interface is used.
>
> To facilitate target seq_ops->show() to get the
> bpf program easily, dumper creation increased
> the target-provided seq_file private data size
> so bpf program pointer is also stored in seq_file
> private data.
>
> Further, a seq_num which represents how many
> bpf_dump_get_prog() has been called is also
> available to the target seq_ops->show().
> Such information can be used to e.g., print
> banner before printing out actual data.
>
> Note the seq_num does not represent the num
> of unique kernel objects the bpf program has
> seen. But it should be a good approximate.
>
> A target feature BPF_DUMP_SEQ_NET_PRIVATE
> is implemented specifically useful for
> net based dumpers. It sets net namespace
> as the current process net namespace.
> This avoids changing existing net seq_ops
> in order to retrieve net namespace from
> the seq_file pointer.
>
> For open dumper files, anonymous or not, the
> fdinfo will show the target and prog_id associated
> with that file descriptor. For dumper file itself,
> a kernel interface will be provided to retrieve the
> prog_id in one of the later patches.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |   5 +
>  include/uapi/linux/bpf.h       |   6 +-
>  kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
>  kernel/bpf/syscall.c           |  11 +-
>  tools/include/uapi/linux/bpf.h |   6 +-
>  5 files changed, 362 insertions(+), 4 deletions(-)
>

[...]

>
> +struct dumper_inode_info {
> +       struct bpfdump_target_info *tinfo;
> +       struct bpf_prog *prog;
> +};
> +
> +struct dumper_info {
> +       struct list_head list;
> +       /* file to identify an anon dumper,
> +        * dentry to identify a file dumper.
> +        */
> +       union {
> +               struct file *file;
> +               struct dentry *dentry;
> +       };
> +       struct bpfdump_target_info *tinfo;
> +       struct bpf_prog *prog;
> +};

This is essentially a bpf_link. Why not do it as a bpf_link from the
get go? Instead of having all this duplication for anonymous and
pinned dumpers, it would always be a bpf_link-based dumper, but for
the pinned ones the bpf_link itself is going to be pinned. You also get a
benefit of being able to list all dumpers through existing bpf_link
API (also see my RFC patches with bpf_link_prime/bpf_link_settle,
which makes using bpf_link safe and simple).

[...]

> +
> +static void anon_dumper_show_fdinfo(struct seq_file *m, struct file *filp)
> +{
> +       struct dumper_info *dinfo;
> +
> +       mutex_lock(&anon_dumpers.dumper_mutex);
> +       list_for_each_entry(dinfo, &anon_dumpers.dumpers, list) {

this (and a few other places where you search in a loop) would also be
simplified, because struct file* would point to bpf_dumper_link, which
then would have a pointer to bpf_prog, dentry (if pinned), etc. No
searching at all.

> +               if (dinfo->file == filp) {
> +                       seq_printf(m, "target:\t%s\n"
> +                                     "prog_id:\t%u\n",
> +                                  dinfo->tinfo->target,
> +                                  dinfo->prog->aux->id);
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&anon_dumpers.dumper_mutex);
> +}
> +
> +#endif
> +

[...]
Andrii Nakryiko April 10, 2020, 10:53 p.m. UTC | #5
On Fri, Apr 10, 2020 at 3:43 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 4/9/20 8:00 PM, Alexei Starovoitov wrote:
> > On Wed, Apr 08, 2020 at 04:25:26PM -0700, Yonghong Song wrote:
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 0f1cbed446c1..b51d56fc77f9 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -354,6 +354,7 @@ enum {
> >>   /* Flags for accessing BPF object from syscall side. */
> >>      BPF_F_RDONLY            = (1U << 3),
> >>      BPF_F_WRONLY            = (1U << 4),
> >> +    BPF_F_DUMP              = (1U << 5),
> > ...
> >>   static int bpf_obj_pin(const union bpf_attr *attr)
> >>   {
> >> -    if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
> >> +    if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_DUMP)
> >>              return -EINVAL;
> >>
> >> +    if (attr->file_flags == BPF_F_DUMP)
> >> +            return bpf_dump_create(attr->bpf_fd,
> >> +                                   u64_to_user_ptr(attr->dumper_name));
> >> +
> >>      return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
> >>   }
> >
> > I think kernel can be a bit smarter here. There is no need for user space
> > to pass BPF_F_DUMP flag to kernel just to differentiate the pinning.
> > Can prog attach type be used instead?
>
> Think again. I think a flag is still useful.
> Suppose that we have the following scenario:
>    - the current directory /sys/fs/bpf/
>    - user says pin a tracing/dump (target task) prog to "p1"
>
> It is not really clear whether user wants to pin to
>     /sys/fs/bpf/p1
> or user wants to pin to
>     /sys/kernel/bpfdump/task/p1
>
> unless we say that a tracing/dump program cannot pin
> to /sys/fs/bpf which seems unnecessary restriction.
>
> What do you think?

Instead of special-casing dumper_name, can we require specifying full
path, and then check whether it is in BPF FS vs BPFDUMP FS? If the
latter, additionally check that it is in the right sub-directory
matching its intended target type.

But honestly, just doing everything within BPF FS starts to seem
cleaner at this point...
Andrii Nakryiko April 10, 2020, 11:25 p.m. UTC | #6
On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Given a loaded dumper bpf program, which already
> knows which target it should bind to, there
> two ways to create a dumper:
>   - a file based dumper under hierarchy of
>     /sys/kernel/bpfdump/ which uses can
>     "cat" to print out the output.
>   - an anonymous dumper which user application
>     can "read" the dumping output.
>
> For file based dumper, BPF_OBJ_PIN syscall interface
> is used. For anonymous dumper, BPF_PROG_ATTACH
> syscall interface is used.
>
> To facilitate target seq_ops->show() to get the
> bpf program easily, dumper creation increased
> the target-provided seq_file private data size
> so bpf program pointer is also stored in seq_file
> private data.
>
> Further, a seq_num which represents how many
> bpf_dump_get_prog() has been called is also
> available to the target seq_ops->show().
> Such information can be used to e.g., print
> banner before printing out actual data.

So I looked up seq_operations struct and did a very cursory read of
fs/seq_file.c and seq_file documentation, so I might be completely off
here.

start() is called before iteration begins, stop() is called after
iteration ends. Would it be a better and more user-friendly interface
to have two extra calls to the BPF program, say with a NULL input element,
but with an extra enum/flag that specifies that this is the START or END of
the iteration, in addition to seq_num?

Also, right now it's impossible to write stateful dumpers that do any
kind of stats calculation, because it's impossible to determine when an
iteration has restarted (it starts from the very beginning, not from the
last element). It's impossible to just remember the last processed
seq_num, because the BPF program might be called for a new "session" in
parallel with the old one.

So it seems like a few things would be useful:

1. end flag for post-aggregation and/or footer printing (seq_num == 0
is providing similar means for start flag).
2. Some sort of "session id", so that bpfdumper can maintain
per-session intermediate state. Plus with this it would be possible to
detect restarts (if there is some state for the same session and
seq_num == 0, this is restart).

It seems like it might be a bit more flexible to, instead of providing
the seq_file * pointer directly, actually provide a bpfdumper_context
struct, which would have seq_file * as one of its fields, the others being
session_id and start/stop flags.

A bit unstructured thoughts, but what do you think?

>
> Note the seq_num does not represent the num
> of unique kernel objects the bpf program has
> seen. But it should be a good approximate.
>
> A target feature BPF_DUMP_SEQ_NET_PRIVATE
> is implemented specifically useful for
> net based dumpers. It sets net namespace
> as the current process net namespace.
> This avoids changing existing net seq_ops
> in order to retrieve net namespace from
> the seq_file pointer.
>
> For open dumper files, anonymous or not, the
> fdinfo will show the target and prog_id associated
> with that file descriptor. For dumper file itself,
> a kernel interface will be provided to retrieve the
> prog_id in one of the later patches.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |   5 +
>  include/uapi/linux/bpf.h       |   6 +-
>  kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
>  kernel/bpf/syscall.c           |  11 +-
>  tools/include/uapi/linux/bpf.h |   6 +-
>  5 files changed, 362 insertions(+), 4 deletions(-)
>

[...]
Yonghong Song April 10, 2020, 11:41 p.m. UTC | #7
On 4/10/20 3:51 PM, Andrii Nakryiko wrote:
> On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Given a loaded dumper bpf program, which already
>> knows which target it should bind to, there
>> two ways to create a dumper:
>>    - a file based dumper under hierarchy of
>>      /sys/kernel/bpfdump/ which uses can
>>      "cat" to print out the output.
>>    - an anonymous dumper which user application
>>      can "read" the dumping output.
>>
>> For file based dumper, BPF_OBJ_PIN syscall interface
>> is used. For anonymous dumper, BPF_PROG_ATTACH
>> syscall interface is used.
>>
>> To facilitate target seq_ops->show() to get the
>> bpf program easily, dumper creation increased
>> the target-provided seq_file private data size
>> so bpf program pointer is also stored in seq_file
>> private data.
>>
>> Further, a seq_num which represents how many
>> bpf_dump_get_prog() has been called is also
>> available to the target seq_ops->show().
>> Such information can be used to e.g., print
>> banner before printing out actual data.
>>
>> Note the seq_num does not represent the num
>> of unique kernel objects the bpf program has
>> seen. But it should be a good approximate.
>>
>> A target feature BPF_DUMP_SEQ_NET_PRIVATE
>> is implemented specifically useful for
>> net based dumpers. It sets net namespace
>> as the current process net namespace.
>> This avoids changing existing net seq_ops
>> in order to retrieve net namespace from
>> the seq_file pointer.
>>
>> For open dumper files, anonymous or not, the
>> fdinfo will show the target and prog_id associated
>> with that file descriptor. For dumper file itself,
>> a kernel interface will be provided to retrieve the
>> prog_id in one of the later patches.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |   5 +
>>   include/uapi/linux/bpf.h       |   6 +-
>>   kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
>>   kernel/bpf/syscall.c           |  11 +-
>>   tools/include/uapi/linux/bpf.h |   6 +-
>>   5 files changed, 362 insertions(+), 4 deletions(-)
>>
> 
> [...]
> 
>>
>> +struct dumper_inode_info {
>> +       struct bpfdump_target_info *tinfo;
>> +       struct bpf_prog *prog;
>> +};
>> +
>> +struct dumper_info {
>> +       struct list_head list;
>> +       /* file to identify an anon dumper,
>> +        * dentry to identify a file dumper.
>> +        */
>> +       union {
>> +               struct file *file;
>> +               struct dentry *dentry;
>> +       };
>> +       struct bpfdump_target_info *tinfo;
>> +       struct bpf_prog *prog;
>> +};
> 
> This is essentially a bpf_link. Why not do it as a bpf_link from the
> get go? Instead of having all this duplication for anonymous and

This is a good question. Maybe part of bpf_link can be reused and
I will have to implement the rest. I will check.

> pinned dumpers, it would always be a bpf_link-based dumper, but for
> those pinned bpf_link itself is going to be pinned. You also get a
> benefit of being able to list all dumpers through existing bpf_link
> API (also see my RFC patches with bpf_link_prime/bpf_link_settle,
> which makes using bpf_link safe and simple).

Agreed. An alternative is to use BPF_OBJ_GET_INFO_BY_FD to query an individual
dumper, as the directory tree walk can easily be done in user space.


> 
> [...]
> 
>> +
>> +static void anon_dumper_show_fdinfo(struct seq_file *m, struct file *filp)
>> +{
>> +       struct dumper_info *dinfo;
>> +
>> +       mutex_lock(&anon_dumpers.dumper_mutex);
>> +       list_for_each_entry(dinfo, &anon_dumpers.dumpers, list) {
> 
> this (and few other places where you search in a loop) would also be
> simplified, because struct file* would point to bpf_dumper_link, which
> then would have a pointer to bpf_prog, dentry (if pinned), etc. No
> searching at all.

There is a reason for this. The same as bpf_link, bpfdump already has
the full information about the file, inode, etc.
The file's private_data actually points to the seq_file. The seq_file private
data is used by the target. That is exactly why we have this
mapping to keep track. bpf_link won't help here.

> 
>> +               if (dinfo->file == filp) {
>> +                       seq_printf(m, "target:\t%s\n"
>> +                                     "prog_id:\t%u\n",
>> +                                  dinfo->tinfo->target,
>> +                                  dinfo->prog->aux->id);
>> +                       break;
>> +               }
>> +       }
>> +       mutex_unlock(&anon_dumpers.dumper_mutex);
>> +}
>> +
>> +#endif
>> +
> 
> [...]
>
Yonghong Song April 10, 2020, 11:47 p.m. UTC | #8
On 4/10/20 3:53 PM, Andrii Nakryiko wrote:
> On Fri, Apr 10, 2020 at 3:43 PM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 4/9/20 8:00 PM, Alexei Starovoitov wrote:
>>> On Wed, Apr 08, 2020 at 04:25:26PM -0700, Yonghong Song wrote:
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index 0f1cbed446c1..b51d56fc77f9 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -354,6 +354,7 @@ enum {
>>>>    /* Flags for accessing BPF object from syscall side. */
>>>>       BPF_F_RDONLY            = (1U << 3),
>>>>       BPF_F_WRONLY            = (1U << 4),
>>>> +    BPF_F_DUMP              = (1U << 5),
>>> ...
>>>>    static int bpf_obj_pin(const union bpf_attr *attr)
>>>>    {
>>>> -    if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
>>>> +    if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_DUMP)
>>>>               return -EINVAL;
>>>>
>>>> +    if (attr->file_flags == BPF_F_DUMP)
>>>> +            return bpf_dump_create(attr->bpf_fd,
>>>> +                                   u64_to_user_ptr(attr->dumper_name));
>>>> +
>>>>       return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
>>>>    }
>>>
>>> I think kernel can be a bit smarter here. There is no need for user space
>>> to pass BPF_F_DUMP flag to kernel just to differentiate the pinning.
>>> Can prog attach type be used instead?
>>
>> Think again. I think a flag is still useful.
>> Suppose that we have the following scenario:
>>     - the current directory /sys/fs/bpf/
>>     - user says pin a tracing/dump (target task) prog to "p1"
>>
>> It is not really clear whether user wants to pin to
>>      /sys/fs/bpf/p1
>> or user wants to pin to
>>      /sys/kernel/bpfdump/task/p1
>>
>> unless we say that a tracing/dump program cannot pin
>> to /sys/fs/bpf which seems unnecessary restriction.
>>
>> What do you think?
> 
> Instead of special-casing dumper_name, can we require specifying full
> path, and then check whether it is in BPF FS vs BPFDUMP FS? If the
> latter, additionally check that it is in the right sub-directory
> matching its intended target type.

We could. I just think specifying full path for bpfdump is not necessary 
since it is a single user mount...

> 
> But honestly, just doing everything within BPF FS starts to seem
> cleaner at this point...

bpffs is multi-mount, which is not a perfect fit for bpfdump;
considering mounting inside a namespace, etc., all dumpers would be gone.
Yonghong Song April 11, 2020, 12:23 a.m. UTC | #9
On 4/10/20 4:25 PM, Andrii Nakryiko wrote:
> On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Given a loaded dumper bpf program, which already
>> knows which target it should bind to, there
>> two ways to create a dumper:
>>    - a file based dumper under hierarchy of
>>      /sys/kernel/bpfdump/ which uses can
>>      "cat" to print out the output.
>>    - an anonymous dumper which user application
>>      can "read" the dumping output.
>>
>> For file based dumper, BPF_OBJ_PIN syscall interface
>> is used. For anonymous dumper, BPF_PROG_ATTACH
>> syscall interface is used.
>>
>> To facilitate target seq_ops->show() to get the
>> bpf program easily, dumper creation increased
>> the target-provided seq_file private data size
>> so bpf program pointer is also stored in seq_file
>> private data.
>>
>> Further, a seq_num which represents how many
>> bpf_dump_get_prog() has been called is also
>> available to the target seq_ops->show().
>> Such information can be used to e.g., print
>> banner before printing out actual data.
> 
> So I looked up seq_operations struct and did a very cursory read of
> fs/seq_file.c and seq_file documentation, so I might be completely off
> here.
> 
> start() is called before iteration begins, stop() is called after
> iteration ends. Would it be a bit better and user-friendly interface
> to have to extra calls to BPF program, say with NULL input element,
> but with extra enum/flag that specifies that this is a START or END of
> iteration, in addition to seq_num?

The current design always passes a valid object (task, file, netlink_sock,
fib6_info). That is, access to fields of those data structures won't
cause runtime exceptions.

Therefore, with the existing seq_ops implementation for ipv6_route
and netlink, etc, we don't have END information. We can get START
information though.

> 
> Also, right now it's impossible to write stateful dumpers that do any
> kind of stats calculation, because it's impossible to determine when
> iteration restarted (it starts from the very beginning, not from the
> last element). It's impossible to just rememebr last processed
> seq_num, because BPF program might be called for a new "session" in
> parallel with the old one.

Theoretically, a session end can be detected by checking the return
value of the last bpf_seq_printf() or bpf_seq_write(). If it indicates
an overflow, that means the session ended.

Or the bpfdump infrastructure can help do this work and provide a
session id.

> 
> So it seems like few things would be useful:
> 
> 1. end flag for post-aggregation and/or footer printing (seq_num == 0
> is providing similar means for start flag).

The end flag is a problem. We could, say, hijack next() or stop() so we
can detect the end, but passing a NULL pointer as the object
to the bpf program may be problematic without verifier enforcement,
as it may cause a lot of exceptions... Although all these exceptions
will be silenced by the bpf infra, I am still not sure whether this
is acceptable or not.

> 2. Some sort of "session id", so that bpfdumper can maintain
> per-session intermediate state. Plus with this it would be possible to
> detect restarts (if there is some state for the same session and
> seq_num == 0, this is restart).

I guess we can do this.

> 
> It seems like it might be a bit more flexible to, instead of providing
> seq_file * pointer directly, actually provide a bpfdumper_context
> struct, which would have seq_file * as one of fields, other being
> session_id and start/stop flags.

As you mentioned, if we have more seq_file-related fields to pass
to the bpf program, then yes, grouping them into a structure makes sense.

> 
> A bit unstructured thoughts, but what do you think?
> 
>>
>> Note the seq_num does not represent the num
>> of unique kernel objects the bpf program has
>> seen. But it should be a good approximate.
>>
>> A target feature BPF_DUMP_SEQ_NET_PRIVATE
>> is implemented specifically useful for
>> net based dumpers. It sets net namespace
>> as the current process net namespace.
>> This avoids changing existing net seq_ops
>> in order to retrieve net namespace from
>> the seq_file pointer.
>>
>> For open dumper files, anonymous or not, the
>> fdinfo will show the target and prog_id associated
>> with that file descriptor. For dumper file itself,
>> a kernel interface will be provided to retrieve the
>> prog_id in one of the later patches.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |   5 +
>>   include/uapi/linux/bpf.h       |   6 +-
>>   kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
>>   kernel/bpf/syscall.c           |  11 +-
>>   tools/include/uapi/linux/bpf.h |   6 +-
>>   5 files changed, 362 insertions(+), 4 deletions(-)
>>
> 
> [...]
>
Alexei Starovoitov April 11, 2020, 11:11 p.m. UTC | #10
On Fri, Apr 10, 2020 at 04:47:36PM -0700, Yonghong Song wrote:
> > 
> > Instead of special-casing dumper_name, can we require specifying full
> > path, and then check whether it is in BPF FS vs BPFDUMP FS? If the
> > latter, additionally check that it is in the right sub-directory
> > matching its intended target type.
> 
> We could. I just think specifying full path for bpfdump is not necessary
> since it is a single user mount...
> 
> > 
> > But honestly, just doing everything within BPF FS starts to seem
> > cleaner at this point...
> 
> bpffs is multi mount, which is not a perfect fit for bpfdump,
> considering mounting inside namespace, etc, all dumpers are gone.

As Yonghong pointed out, reusing bpffs for dumpers doesn't look possible
from an implementation perspective.
Even if it was possible, the files in such a mix-and-match file system
would be of different kinds with different semantics. I think that
will lead to a mediocre user experience when file 'foo' is cat-able
with nice human output, but file 'bar' isn't cat-able at all because
it's just a pinned map. imo having all dumpers in one fixed location
in /sys/kernel/bpfdump makes them easy to discover for folks who might
not even know what bpf is.
For example when I'm trying to learn some new area of the kernel I might go
poke around /proc and /sys directory looking for a file name that could be
interesting to 'cat'. This is how I discovered /sys/kernel/slab/ :)
I think keeping all dumpers in /sys/kernel/bpfdump/ will make them
similarly discoverable.

re: f_dump flag...
Maybe it's a sign that pinning is not the right name for such an operation?
If the kernel cannot distinguish pinning a dumper prog into bpffs as a vanilla
pinning operation vs pinning into bpfdumpfs to make it cat-able, then something
isn't right about the api. Either it needs to be a new bpf syscall command (like
install_dumper_in_dumpfs) or reuse the pinning command, but make libbpf specify the
full path. From the bpf prog point of view it may still specify only the final
name, but libbpf can prepend the /sys/kernel/bpfdump/.../. Maybe there is a
third option. An extra flag for pinning just doesn't look right. What if we do
another specialized file system later? It would need yet another flag to pin
there?
Alexei Starovoitov April 11, 2020, 11:17 p.m. UTC | #11
On Fri, Apr 10, 2020 at 05:23:30PM -0700, Yonghong Song wrote:
> > 
> > So it seems like few things would be useful:
> > 
> > 1. end flag for post-aggregation and/or footer printing (seq_num == 0
> > is providing similar means for start flag).
> 
> the end flag is a problem. We could say hijack next or stop so we
> can detect the end, but passing a NULL pointer as the object
> to the bpf program may be problematic without verifier enforcement
> as it may cause a lot of exceptions... Although all these exception
> will be silenced by bpf infra, but still not sure whether this
> is acceptable or not.

I don't like passing NULL there just to indicate something to a program.
It's not too horrible to support from the verifier side, but NULL is only
one such flag. What is it supposed to indicate? That the dumper prog
is just starting? Or ending? Let's pass (void *)1 and (void *)2?
I'm not a fan of such inband signaling.
imo it's cleaner and simpler when that object pointer is always valid.

> > 2. Some sort of "session id", so that bpfdumper can maintain
> > per-session intermediate state. Plus with this it would be possible to
> > detect restarts (if there is some state for the same session and
> > seq_num == 0, this is restart).
> 
> I guess we can do this.

Beyond seq_num, passing a session_id is a good idea. Though I don't quite see
the use case where you'd need the bpfdumper prog to be stateful, it doesn't hurt.
Yonghong Song April 12, 2020, 6:51 a.m. UTC | #12
On 4/11/20 4:11 PM, Alexei Starovoitov wrote:
> On Fri, Apr 10, 2020 at 04:47:36PM -0700, Yonghong Song wrote:
>>>
>>> Instead of special-casing dumper_name, can we require specifying full
>>> path, and then check whether it is in BPF FS vs BPFDUMP FS? If the
>>> latter, additionally check that it is in the right sub-directory
>>> matching its intended target type.
>>
>> We could. I just think specifying full path for bpfdump is not necessary
>> since it is a single user mount...
>>
>>>
>>> But honestly, just doing everything within BPF FS starts to seem
>>> cleaner at this point...
>>
>> bpffs is multi mount, which is not a perfect fit for bpfdump,
>> considering mounting inside namespace, etc, all dumpers are gone.
> 
> As Yonghong pointed out reusing bpffs for dumpers doesn't look possible
> from implementation perspective.
> Even if it was possible the files in such mix-and-match file system
> would be of different kinds with different semantics. I think that
> will lead to mediocre user experience when file 'foo' is cat-able
> with nice human output, but file 'bar' isn't cat-able at all because
> it's just a pinned map. imo having all dumpers in one fixed location
> in /sys/kernel/bpfdump makes it easy to discover for folks who might
> not even know what bpf is.
> For example when I'm trying to learn some new area of the kernel I might go
> poke around /proc and /sys directory looking for a file name that could be
> interesting to 'cat'. This is how I discovered /sys/kernel/slab/ :)
> I think keeping all dumpers in /sys/kernel/bpfdump/ will make them
> similarly discoverable.
> 
> re: f_dump flag...
> May be it's a sign that pinning is not the right name for such operation?
> If kernel cannot distinguish pinning dumper prog into bpffs as a vanilla
> pinning operation vs pinning into bpfdumpfs to make it cat-able then something
> isn't right about api. Either it needs to be a new bpf syscall command (like
> install_dumper_in_dumpfs) or reuse pinning command, but make libbpf specify the
> full path. From bpf prog point of view it may still specify only the final
> name, but libbpf can prepend the /sys/kernel/bpfdump/.../. May be there is a
> third option. Extra flag for pinning just doesn't look right. What if we do
> another specialized file system later? It would need yet another flag to pin
> there?

For the 2nd option,
    - the user still just specifies the dumper name, and
    - bpftool will prepend /sys/kernel/bpfdump/...
this should work. In this case, the kernel API
to create a bpf dumper will be
    BPF_OBJ_PIN with a file path
and this is fine, with only the following annoyance.
Suppose somehow:
    - bpfdump is mounted at /sys/kernel/bpfdump and somewhere else say
      /root/tmp/bpfdump/
      [
        I checked do_mount in namespace.c, and did not find a flag
        to prevent multiple mounting; maybe I missed something. I would be
        glad if somebody knows and can let me know.
      ]
    - the user calls BPF_OBJ_PIN with the path /root/tmp/bpfdump/task/my_task.
    - But actually the file will also appear in
      /sys/kernel/bpfdump/task/my_task.
There is a little confusion here based on the kernel API.
That is exactly why I supplied only the filename. Conceptually, it
will then be clear that the dumper will appear in all mount points.

Maybe a new bpf subcommand is warranted,
maybe BPF_DUMPER_INSTALL?
Andrii Nakryiko April 13, 2020, 7:45 p.m. UTC | #13
On Fri, Apr 10, 2020 at 4:41 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 4/10/20 3:51 PM, Andrii Nakryiko wrote:
> > On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Given a loaded dumper bpf program, which already
> >> knows which target it should bind to, there
> >> two ways to create a dumper:
> >>    - a file based dumper under hierarchy of
> >>      /sys/kernel/bpfdump/ which uses can
> >>      "cat" to print out the output.
> >>    - an anonymous dumper which user application
> >>      can "read" the dumping output.
> >>
> >> For file based dumper, BPF_OBJ_PIN syscall interface
> >> is used. For anonymous dumper, BPF_PROG_ATTACH
> >> syscall interface is used.
> >>
> >> To facilitate target seq_ops->show() to get the
> >> bpf program easily, dumper creation increased
> >> the target-provided seq_file private data size
> >> so bpf program pointer is also stored in seq_file
> >> private data.
> >>
> >> Further, a seq_num which represents how many
> >> bpf_dump_get_prog() has been called is also
> >> available to the target seq_ops->show().
> >> Such information can be used to e.g., print
> >> banner before printing out actual data.
> >>
> >> Note the seq_num does not represent the num
> >> of unique kernel objects the bpf program has
> >> seen. But it should be a good approximate.
> >>
> >> A target feature BPF_DUMP_SEQ_NET_PRIVATE
> >> is implemented specifically useful for
> >> net based dumpers. It sets net namespace
> >> as the current process net namespace.
> >> This avoids changing existing net seq_ops
> >> in order to retrieve net namespace from
> >> the seq_file pointer.
> >>
> >> For open dumper files, anonymous or not, the
> >> fdinfo will show the target and prog_id associated
> >> with that file descriptor. For dumper file itself,
> >> a kernel interface will be provided to retrieve the
> >> prog_id in one of the later patches.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/linux/bpf.h            |   5 +
> >>   include/uapi/linux/bpf.h       |   6 +-
> >>   kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
> >>   kernel/bpf/syscall.c           |  11 +-
> >>   tools/include/uapi/linux/bpf.h |   6 +-
> >>   5 files changed, 362 insertions(+), 4 deletions(-)
> >>
> >
> > [...]
> >
> >>
> >> +struct dumper_inode_info {
> >> +       struct bpfdump_target_info *tinfo;
> >> +       struct bpf_prog *prog;
> >> +};
> >> +
> >> +struct dumper_info {
> >> +       struct list_head list;
> >> +       /* file to identify an anon dumper,
> >> +        * dentry to identify a file dumper.
> >> +        */
> >> +       union {
> >> +               struct file *file;
> >> +               struct dentry *dentry;
> >> +       };
> >> +       struct bpfdump_target_info *tinfo;
> >> +       struct bpf_prog *prog;
> >> +};
> >
> > This is essentially a bpf_link. Why not do it as a bpf_link from the
> > get go? Instead of having all this duplication for anonymous and
>
> This is a good question. Maybe part of bpf-link can be used and
> I have to implement others. I will check.
>
> > pinned dumpers, it would always be a bpf_link-based dumper, but for
> > those pinned bpf_link itself is going to be pinned. You also get a
> > benefit of being able to list all dumpers through existing bpf_link
> > API (also see my RFC patches with bpf_link_prime/bpf_link_settle,
> > which makes using bpf_link safe and simple).
>
> Agree. Alternative is to use BPF_OBJ_GET_INFO_BY_FD to query individual
> dumper as directory tree walk can be easily done at user space.

But BPF_OBJ_GET_INFO_BY_FD won't work well for anonymous dumpers,
because it's not so easy to iterate over them (possible, but not
easy)?

>
>
> >
> > [...]
> >
> >> +
> >> +static void anon_dumper_show_fdinfo(struct seq_file *m, struct file *filp)
> >> +{
> >> +       struct dumper_info *dinfo;
> >> +
> >> +       mutex_lock(&anon_dumpers.dumper_mutex);
> >> +       list_for_each_entry(dinfo, &anon_dumpers.dumpers, list) {
> >
> > this (and few other places where you search in a loop) would also be
> > simplified, because struct file* would point to bpf_dumper_link, which
> > then would have a pointer to bpf_prog, dentry (if pinned), etc. No
> > searching at all.
>
> This is a reason for this. the same as bpflink, bpfdump already has
> the full information about file, inode, etc.

I think (if I understand what you are saying), this is my point. What
you have in struct dumper_info is already a custom bpf_link. You are
just missing a `struct bpf_link link;` field there and plugging it into
the overall bpf_link infrastructure (bpf_link__init + bpf_link__prime +
bpf_link__settle, from my RFC) to gain the benefits of the bpf_link infra.
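
A minimal sketch of what that could look like, reusing the fields from
struct dumper_info above (this is an assumption about the suggested
rework, not code from the patch):

struct bpf_dumper_link {
	struct bpf_link link;               /* carries the bpf_prog and common link infra */
	struct bpfdump_target_info *tinfo;  /* dumper target */
	struct dentry *dentry;              /* set only when pinned in bpfdumpfs */
};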


> The file private_data actually points to seq_file. The seq_file private
> data is used in the target. That is exactly why we try to have this
> mapping to keep track. bpf_link won't help here.

I need to go and re-read all the code again carefully to see who stores
what in their private_data fields...

>
> >
> >> +               if (dinfo->file == filp) {
> >> +                       seq_printf(m, "target:\t%s\n"
> >> +                                     "prog_id:\t%u\n",
> >> +                                  dinfo->tinfo->target,
> >> +                                  dinfo->prog->aux->id);
> >> +                       break;
> >> +               }
> >> +       }
> >> +       mutex_unlock(&anon_dumpers.dumper_mutex);
> >> +}
> >> +
> >> +#endif
> >> +
> >
> > [...]
> >
Andrii Nakryiko April 13, 2020, 7:59 p.m. UTC | #14
On Fri, Apr 10, 2020 at 5:23 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 4/10/20 4:25 PM, Andrii Nakryiko wrote:
> > On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Given a loaded dumper bpf program, which already
> >> knows which target it should bind to, there
> >> two ways to create a dumper:
> >>    - a file based dumper under hierarchy of
> >>      /sys/kernel/bpfdump/ which uses can
> >>      "cat" to print out the output.
> >>    - an anonymous dumper which user application
> >>      can "read" the dumping output.
> >>
> >> For file based dumper, BPF_OBJ_PIN syscall interface
> >> is used. For anonymous dumper, BPF_PROG_ATTACH
> >> syscall interface is used.
> >>
> >> To facilitate target seq_ops->show() to get the
> >> bpf program easily, dumper creation increased
> >> the target-provided seq_file private data size
> >> so bpf program pointer is also stored in seq_file
> >> private data.
> >>
> >> Further, a seq_num which represents how many
> >> bpf_dump_get_prog() has been called is also
> >> available to the target seq_ops->show().
> >> Such information can be used to e.g., print
> >> banner before printing out actual data.
> >
> > So I looked up seq_operations struct and did a very cursory read of
> > fs/seq_file.c and seq_file documentation, so I might be completely off
> > here.
> >
> > start() is called before iteration begins, stop() is called after
> > iteration ends. Would it be a bit better and user-friendly interface
> > to have to extra calls to BPF program, say with NULL input element,
> > but with extra enum/flag that specifies that this is a START or END of
> > iteration, in addition to seq_num?
>
> The current design always pass a valid object (task, file, netlink_sock,
> fib6_info). That is, access to fields to those data structure won't
> cause runtime exceptions.
>
> Therefore, with the existing seq_ops implementation for ipv6_route
> and netlink, etc, we don't have END information. We can get START
> information though.

Right, I understand this about the current implementation, because it
calls the BPF program from show(). But I also noticed stop(), which
is called after the iteration ends.

>
> >
> > Also, right now it's impossible to write stateful dumpers that do any
> > kind of stats calculation, because it's impossible to determine when
> > iteration restarted (it starts from the very beginning, not from the
> > last element). It's impossible to just rememebr last processed
> > seq_num, because BPF program might be called for a new "session" in
> > parallel with the old one.
>
> Theoretically, session end can be detected by checking the return
> value of last bpf_seq_printf() or bpf_seq_write(). If it indicates
> an overflow, that means session end.

That's not what I meant by session end. If there is an overflow, the
session is going to restart from the start (but it's still the same
session, we just get a bigger output buffer).

>
> Or bpfdump infrastructure can help do this work to provide
> session id.

Well, come to think of it, the seq_file pointer itself is unique per
session, so that one can be used as a session id, is that right?

>
> >
> > So it seems like few things would be useful:
> >
> > 1. end flag for post-aggregation and/or footer printing (seq_num == 0
> > is providing similar means for start flag).
>
> the end flag is a problem. We could say hijack next or stop so we
> can detect the end, but passing a NULL pointer as the object
> to the bpf program may be problematic without verifier enforcement
> as it may cause a lot of exceptions... Although all these exception
> will be silenced by bpf infra, but still not sure whether this
> is acceptable or not.

Right, the verifier will need to know that the item can be a valid pointer or
NULL. It's not perfect, but it's not too big of a deal for the user to check
for NULL at the very beginning.

What I'm aiming for with this end flag is the ability for the BPF program to
collect data during show() calls, and then at the end get an extra call
that gives it the ability to post-aggregate this data and emit some sort of
summary into the seq_file. Think about printing out summary stats across
all tasks (e.g., p50 of run queue latency, or something like that). In
that case, I need to iterate all tasks, I don't need to emit anything
for any individual task, but I need to produce an aggregation and
output it after the last task was iterated. Right now it's impossible to
do, but it seems like an extremely powerful and useful feature. drgn
could utilize this to speed up its scripts. There are plenty of tools
that would like a frequent but cheap view into the internals of
the system, which currently is implemented through netlink (taskstats)
or procfs, both quite expensive if polled every second.

Anonymous bpfdump, though, is going to be much cheaper, because a lot
of aggregation can happen in the kernel and only minimal output at the
end will be read by user-space.

>
> > 2. Some sort of "session id", so that bpfdumper can maintain
> > per-session intermediate state. Plus with this it would be possible to
> > detect restarts (if there is some state for the same session and
> > seq_num == 0, this is restart).
>
> I guess we can do this.

See above, probably using seq_file pointer is good enough.

>
> >
> > It seems like it might be a bit more flexible to, instead of providing
> > seq_file * pointer directly, actually provide a bpfdumper_context
> > struct, which would have seq_file * as one of fields, other being
> > session_id and start/stop flags.
>
> As you mentioned, if we have more fields related to seq_file passing
> to bpf program, yes, grouping them into a structure makes sense.
>
> >
> > A bit unstructured thoughts, but what do you think?
> >
> >>
> >> Note the seq_num does not represent the num
> >> of unique kernel objects the bpf program has
> >> seen. But it should be a good approximate.
> >>
> >> A target feature BPF_DUMP_SEQ_NET_PRIVATE
> >> is implemented specifically useful for
> >> net based dumpers. It sets net namespace
> >> as the current process net namespace.
> >> This avoids changing existing net seq_ops
> >> in order to retrieve net namespace from
> >> the seq_file pointer.
> >>
> >> For open dumper files, anonymous or not, the
> >> fdinfo will show the target and prog_id associated
> >> with that file descriptor. For dumper file itself,
> >> a kernel interface will be provided to retrieve the
> >> prog_id in one of the later patches.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/linux/bpf.h            |   5 +
> >>   include/uapi/linux/bpf.h       |   6 +-
> >>   kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
> >>   kernel/bpf/syscall.c           |  11 +-
> >>   tools/include/uapi/linux/bpf.h |   6 +-
> >>   5 files changed, 362 insertions(+), 4 deletions(-)
> >>
> >
> > [...]
> >
Andrii Nakryiko April 13, 2020, 8:48 p.m. UTC | #15
On Sat, Apr 11, 2020 at 4:11 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Apr 10, 2020 at 04:47:36PM -0700, Yonghong Song wrote:
> > >
> > > Instead of special-casing dumper_name, can we require specifying full
> > > path, and then check whether it is in BPF FS vs BPFDUMP FS? If the
> > > latter, additionally check that it is in the right sub-directory
> > > matching its intended target type.
> >
> > We could. I just think specifying full path for bpfdump is not necessary
> > since it is a single user mount...
> >
> > >
> > > But honestly, just doing everything within BPF FS starts to seem
> > > cleaner at this point...
> >
> > bpffs is multi mount, which is not a perfect fit for bpfdump,
> > considering mounting inside namespace, etc, all dumpers are gone.
>
> As Yonghong pointed out reusing bpffs for dumpers doesn't look possible
> from implementation perspective.
> Even if it was possible the files in such mix-and-match file system
> would be of different kinds with different semantics. I think that
> will lead to mediocre user experience when file 'foo' is cat-able
> with nice human output, but file 'bar' isn't cat-able at all because
> it's just a pinned map. imo having all dumpers in one fixed location
> in /sys/kernel/bpfdump makes it easy to discover for folks who might
> not even know what bpf is.

I agree about the importance of discoverability, but bpffs will typically
be mounted as /sys/fs/bpf/ as well, so it's just as discoverable at
/sys/fs/bpf/bpfdump. But I'm not too fixated on unifying bpffs and
bpfdumpfs, it's just that bpfdumpfs feels a bit too single-purpose.

> For example when I'm trying to learn some new area of the kernel I might go
> poke around /proc and /sys directory looking for a file name that could be
> interesting to 'cat'. This is how I discovered /sys/kernel/slab/ :)
> I think keeping all dumpers in /sys/kernel/bpfdump/ will make them
> similarly discoverable.
>
> re: f_dump flag...
> May be it's a sign that pinning is not the right name for such operation?
> If kernel cannot distinguish pinning dumper prog into bpffs as a vanilla
> pinning operation vs pinning into bpfdumpfs to make it cat-able then something
> isn't right about api. Either it needs to be a new bpf syscall command (like
> install_dumper_in_dumpfs) or reuse pinning command, but make libbpf specify the
> full path. From bpf prog point of view it may still specify only the final
> name, but libbpf can prepend the /sys/kernel/bpfdump/.../. May be there is a
> third option. Extra flag for pinning just doesn't look right. What if we do
> another specialized file system later? It would need yet another flag to pin
> there?

I agree about specifying the full path from the libbpf side. But the section
definition shouldn't include the /sys/fs/bpfdump part, so a program would be
defined as:

SEC("dump/task/file")
int prog(...) { }

And libbpf by default will concatenate that with /sys/fs/bpfdump, but it
should probably also provide a way to override the prefix with a custom
value provided by users.
Andrii Nakryiko April 13, 2020, 9:04 p.m. UTC | #16
On Sat, Apr 11, 2020 at 4:17 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Apr 10, 2020 at 05:23:30PM -0700, Yonghong Song wrote:
> > >
> > > So it seems like few things would be useful:
> > >
> > > 1. end flag for post-aggregation and/or footer printing (seq_num == 0
> > > is providing similar means for start flag).
> >
> > the end flag is a problem. We could say hijack next or stop so we
> > can detect the end, but passing a NULL pointer as the object
> > to the bpf program may be problematic without verifier enforcement
> > as it may cause a lot of exceptions... Although all these exception
> > will be silenced by bpf infra, but still not sure whether this
> > is acceptable or not.
>
> I don't like passing NULL there just to indicate something to a program.
> It's not too horrible to support from verifier side, but NULL is only
> one such flag. What does it suppose to indicate? That dumper prog
> is just starting? or ending? Let's pass (void*)1, and (void *)2 ?
> I'm not a fan of such inband signaling.
> imo it's cleaner and simpler when that object pointer is always valid.

I'm not proposing to pass fake pointers. I proposed to have bpfdump
context instead. E.g., one way to do this would be something like:

struct bpf_dump_context {
  struct seq_file *seq;
  u64 seq_num;
  int flags; /* 0 | BPF_DUMP_START | BPF_DUMP_END */
};

int prog(struct bpf_dump_context *ctx, struct netlink_sock *sk) {
  if (ctx->flags & BPF_DUMP_END) {
    /* emit summary */
    return 0;
  }

  /* sk must be not null here. */
}


This is one way. We can make it simpler by saying that sk == NULL is
always the end of aggregation for a given seq_file; then we won't need flags
and will require an explicit `if (!sk)` check. I don't know what's the
best way, but what I'm advocating for is to have a way for the BPF program
to know that processing is finished and it's time to emit a summary. See
my other reply in this thread with example use cases.
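
For comparison, a sketch of that simpler NULL-terminated variant (same
illustrative context type as the snippet above):

int prog(struct bpf_dump_context *ctx, struct netlink_sock *sk) {
  if (!sk) {
    /* end of aggregation for this seq_file: emit the summary */
    return 0;
  }

  /* normal per-object processing; sk is guaranteed valid here */
  return 0;
}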


>
> > > 2. Some sort of "session id", so that bpfdumper can maintain
> > > per-session intermediate state. Plus with this it would be possible to
> > > detect restarts (if there is some state for the same session and
> > > seq_num == 0, this is restart).
> >
> > I guess we can do this.
>
> beyond seq_num passing session_id is a good idea. Though I don't quite see
> the use case where you'd need bpfdumper prog to be stateful, but doesn't hurt.

State per session seems most useful, so session id + hashmap solves
it. If we do sk_local storage per seq_file, that might be enough as
well, I guess...

Examples are any kind of summary stats across all sockets/tasks/etc.
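
A rough sketch of such per-session state, assuming the seq_file pointer
(or some session id derived from it) is made available to the program
(map layout and names are illustrative only):

#include <linux/types.h>
#include <bpf/bpf_helpers.h>

struct session_state {
	__u64 nr_tasks;
	__u64 total_runq_lat_ns;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, __u64);		/* session id, e.g. the seq_file pointer */
	__type(value, struct session_state);
} sessions SEC(".maps");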

Another interesting use case: produce a map from process ID (tgid) to
bpf_maps, bpf_progs, bpf_links (or sockets, or whatever kind of file
we need). You'd need an FD/file -> kernel object map and then a kernel
object -> tgid map. I think there are many useful use cases beyond the
"one line per object" output cases that inspired bpfdump in the first
place.
Andrii Nakryiko April 14, 2020, 5:56 a.m. UTC | #17
On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Given a loaded dumper bpf program, which already
> knows which target it should bind to, there
> two ways to create a dumper:
>   - a file based dumper under hierarchy of
>     /sys/kernel/bpfdump/ which uses can
>     "cat" to print out the output.
>   - an anonymous dumper which user application
>     can "read" the dumping output.
>
> For file based dumper, BPF_OBJ_PIN syscall interface
> is used. For anonymous dumper, BPF_PROG_ATTACH
> syscall interface is used.

We discussed this offline with Yonghong a bit, but I thought I'd put
my thoughts about this in writing for completeness. To me, it seems
like the most consistent way to do both anonymous and named dumpers is
through the following steps:

1. BPF_PROG_LOAD to load/verify the program; that creates a program FD.
2. LINK_CREATE using that program FD and a direntry FD. This creates a
dumper bpf_link (bpf_dumper_link) and returns an anonymous link FD. If the link
FD is closed, the dumper program is detached and the dumper is destroyed
(unless pinned in bpffs), just like with any other bpf_link.
3. At this point bpf_dumper_link can be treated like a factory of
seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
illustration purposes) command, that accepts dumper link FD and
returns a new seq_file FD, which can be read() normally (or, e.g.,
cat'ed from shell).
4. Additionally, this anonymous bpf_link can be pinned/mounted in
bpfdumpfs. We can do it as BPF_OBJ_PIN or as a separate command. Once
pinned at, e.g., /sys/fs/bpfdump/task/my_dumper, just opening that
file is equivalent to BPF_DUMPER_OPEN_FILE and will create a new
seq_file that can be read() independently from other seq_files opened
against the same dumper. Pinning bpfdumpfs entry also bumps refcnt of
bpf_link itself, so even if process that created link dies, bpf dumper
stays attached until its bpfdumpfs entry is deleted.

Apart from the duality of BPF_DUMPER_OPEN_FILE and open()'ing a bpfdumpfs
file, it seems pretty consistent and follows the safe-by-default
auto-cleanup of an anonymous link, unless pinned in bpfdumpfs (one can
still pin the bpf_link in bpffs, but it can't be open()'ed the same way;
it just protects the BPF program from being cleaned up).

Out of all schemes I could come up with, this one seems most unified
and nicely fits into bpf_link infra. Thoughts?
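
In rough user-space pseudo-code (all command and helper names below are
placeholders for the numbered steps above, not an existing API):

	prog_fd = bpf_prog_load(...);                    /* 1. load/verify          */
	link_fd = bpf_link_create(prog_fd, dir_fd, ...); /* 2. dumper bpf_link      */
	dump_fd = bpf_dumper_open_file(link_fd);         /* 3. new seq_file fd      */
	read(dump_fd, buf, sizeof(buf));                 /*    read() the dump      */
	/* 4. optionally pin link_fd at /sys/fs/bpfdump/task/my_dumper;
	 *    open()'ing that path later yields an independent seq_file fd
	 */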

>
> To facilitate target seq_ops->show() to get the
> bpf program easily, dumper creation increased
> the target-provided seq_file private data size
> so bpf program pointer is also stored in seq_file
> private data.
>
> Further, a seq_num which represents how many
> bpf_dump_get_prog() has been called is also
> available to the target seq_ops->show().
> Such information can be used to e.g., print
> banner before printing out actual data.
>
> Note the seq_num does not represent the num
> of unique kernel objects the bpf program has
> seen. But it should be a good approximate.
>
> A target feature BPF_DUMP_SEQ_NET_PRIVATE
> is implemented specifically useful for
> net based dumpers. It sets net namespace
> as the current process net namespace.
> This avoids changing existing net seq_ops
> in order to retrieve net namespace from
> the seq_file pointer.
>
> For open dumper files, anonymous or not, the
> fdinfo will show the target and prog_id associated
> with that file descriptor. For dumper file itself,
> a kernel interface will be provided to retrieve the
> prog_id in one of the later patches.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |   5 +
>  include/uapi/linux/bpf.h       |   6 +-
>  kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
>  kernel/bpf/syscall.c           |  11 +-
>  tools/include/uapi/linux/bpf.h |   6 +-
>  5 files changed, 362 insertions(+), 4 deletions(-)
>

[...]
Yonghong Song April 14, 2020, 11:59 p.m. UTC | #18
On 4/13/20 10:56 PM, Andrii Nakryiko wrote:
> On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Given a loaded dumper bpf program, which already
>> knows which target it should bind to, there
>> two ways to create a dumper:
>>    - a file based dumper under hierarchy of
>>      /sys/kernel/bpfdump/ which uses can
>>      "cat" to print out the output.
>>    - an anonymous dumper which user application
>>      can "read" the dumping output.
>>
>> For file based dumper, BPF_OBJ_PIN syscall interface
>> is used. For anonymous dumper, BPF_PROG_ATTACH
>> syscall interface is used.
> 
> We discussed this offline with Yonghong a bit, but I thought I'd put
> my thoughts about this in writing for completeness. To me, it seems
> like the most consistent way to do both anonymous and named dumpers is
> through the following steps:

The main motivation for me to use bpf_link is to enumerate
anonymous bpf dumpers by using the idr based link_query mechanism from one
of Andrii's previous RFC patches, so I do not need to re-invent the wheel.

But it looks like there are some difficulties:

> 
> 1. BPF_PROG_LOAD to load/verify program, that created program FD.
> 2. LINK_CREATE using that program FD and direntry FD. This creates
> dumper bpf_link (bpf_dumper_link), returns anonymous link FD. If link

The bpf dump program already has the target information as part of
the verification process, so it does not need a directory FD.
LINK_CREATE is probably not a good fit here.

A bpf dump program is kind of similar to a fentry/fexit program,
where after successful program loading, the program already knows
where to attach.

Looking at the kernel code, for a fentry/fexit program, at the
raw_tracepoint_open syscall the trampoline is installed and the bpf
program actually starts being called.

So, ideally, if we want to use kernel bpf_link, we want to
return a cat-able bpf_link because ultimately we want to query
file descriptors which actually 'read' bpf program outputs.

Current bpf_link is not cat-able.
I tried to hack it by manipulating fops and other stuff; it may work,
but it looks ugly. Or we could create a bpf_catable_link and build an
infrastructure around that? Not sure whether it is worthwhile for this
one-off thing (bpfdump).

Or to query anonymous bpf dumpers, I can just write a bpf dump program
to go through all fd's to find out.

BTW, my current approach (in my private branch),
anonymous dumper:
    bpf_raw_tracepoint_open(NULL, prog) -> cat-able fd
file dumper:
    bpf_obj_pin(prog, path)  -> a cat-able file
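
A minimal userspace sketch of that approach with the existing libbpf
wrappers (the pin path is only an example; the kernel side is the one
described above, not anything upstream):

  #include <bpf/bpf.h>

  /* prog_fd: an already loaded bpfdump program */
  static int create_dumpers(int prog_fd)
  {
          /* anonymous dumper: the returned fd can be read() to get the dump;
           * the caller would read() it and eventually close() it
           */
          int anon_fd = bpf_raw_tracepoint_open(NULL, prog_fd);

          if (anon_fd < 0)
                  return anon_fd;

          /* file dumper: creates a cat-able file under the bpfdump hierarchy */
          return bpf_obj_pin(prog_fd, "/sys/kernel/bpfdump/task/my_dumper");
  }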

If you consider the program itself to be a link, this is like what is
described below in 3 and 4.


> FD is closed, dumper program is detached and dumper is destroyed
> (unless pinned in bpffs, just like with any other bpf_link.
> 3. At this point bpf_dumper_link can be treated like a factory of
> seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
> illustration purposes) command, that accepts dumper link FD and
> returns a new seq_file FD, which can be read() normally (or, e.g.,
> cat'ed from shell).

In this case, link_query may not be accurate if a bpf_dumper_link
is created but there is no corresponding bpf_dumper_open_file. What we
really need is to iterate through all dumper seq_file FDs.

> 4. Additionally, this anonymous bpf_link can be pinned/mounted in
> bpfdumpfs. We can do it as BPF_OBJ_PIN or as a separate command. Once
> pinned at, e.g., /sys/fs/bpfdump/task/my_dumper, just opening that
> file is equivalent to BPF_DUMPER_OPEN_FILE and will create a new
> seq_file that can be read() independently from other seq_files opened
> against the same dumper. Pinning bpfdumpfs entry also bumps refcnt of
> bpf_link itself, so even if process that created link dies, bpf dumper
> stays attached until its bpfdumpfs entry is deleted.
> 
> Apart from BPF_DUMPER_OPEN_FILE and open()'ing bpfdumpfs file duality,
> it seems pretty consistent and follows safe-by-default auto-cleanup of
> anonymous link, unless pinned in bpfdumpfs (or one can still pin
> bpf_link in bpffs, but it can't be open()'ed the same way, it just
> preserves BPF program from being cleaned up).
> 
> Out of all schemes I could come up with, this one seems most unified
> and nicely fits into bpf_link infra. Thoughts?
> 
>>
>> To facilitate target seq_ops->show() to get the
>> bpf program easily, dumper creation increased
>> the target-provided seq_file private data size
>> so bpf program pointer is also stored in seq_file
>> private data.
>>
>> Further, a seq_num which represents how many
>> bpf_dump_get_prog() has been called is also
>> available to the target seq_ops->show().
>> Such information can be used to e.g., print
>> banner before printing out actual data.
>>
>> Note the seq_num does not represent the num
>> of unique kernel objects the bpf program has
>> seen. But it should be a good approximate.
>>
>> A target feature BPF_DUMP_SEQ_NET_PRIVATE
>> is implemented specifically useful for
>> net based dumpers. It sets net namespace
>> as the current process net namespace.
>> This avoids changing existing net seq_ops
>> in order to retrieve net namespace from
>> the seq_file pointer.
>>
>> For open dumper files, anonymous or not, the
>> fdinfo will show the target and prog_id associated
>> with that file descriptor. For dumper file itself,
>> a kernel interface will be provided to retrieve the
>> prog_id in one of the later patches.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |   5 +
>>   include/uapi/linux/bpf.h       |   6 +-
>>   kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
>>   kernel/bpf/syscall.c           |  11 +-
>>   tools/include/uapi/linux/bpf.h |   6 +-
>>   5 files changed, 362 insertions(+), 4 deletions(-)
>>
> 
> [...]
>
Andrii Nakryiko April 15, 2020, 4:45 a.m. UTC | #19
On Tue, Apr 14, 2020 at 4:59 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 4/13/20 10:56 PM, Andrii Nakryiko wrote:
> > On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Given a loaded dumper bpf program, which already
> >> knows which target it should bind to, there
> >> two ways to create a dumper:
> >>    - a file based dumper under hierarchy of
> >>      /sys/kernel/bpfdump/ which uses can
> >>      "cat" to print out the output.
> >>    - an anonymous dumper which user application
> >>      can "read" the dumping output.
> >>
> >> For file based dumper, BPF_OBJ_PIN syscall interface
> >> is used. For anonymous dumper, BPF_PROG_ATTACH
> >> syscall interface is used.
> >
> > We discussed this offline with Yonghong a bit, but I thought I'd put
> > my thoughts about this in writing for completeness. To me, it seems
> > like the most consistent way to do both anonymous and named dumpers is
> > through the following steps:
>
> The main motivation for me to use bpf_link is to enumerate
> anonymous bpf dumpers by using idr based link_query mechanism in one
> of previous Andrii's RFC patch so I do not need to re-invent the wheel.
>
> But looks like there are some difficulties:
>
> >
> > 1. BPF_PROG_LOAD to load/verify program, that created program FD.
> > 2. LINK_CREATE using that program FD and direntry FD. This creates
> > dumper bpf_link (bpf_dumper_link), returns anonymous link FD. If link
>
> bpf dump program already have the target information as part of
> verification propose, so it does not need directory FD.
> LINK_CREATE probably not a good fit here.
>
> bpf dump program is kind similar to fentry/fexit program,
> where after successful program loading, the program will know
> where to attach trampoline.
>
> Looking at kernel code, for fentry/fexit program, at raw_tracepoint_open
> syscall, the trampoline will be installed and actually bpf program will
> be called.
>

The direntry FD doesn't have to be specified at attach time; I forgot that
it is already provided during load. That wasn't a requirement or a
critical part. I think if we already had a LINK_CREATE command, we'd
never have had to create the RAW_TRACEPOINT_OPEN one; all of them could be
the same command.

> So, ideally, if we want to use kernel bpf_link, we want to
> return a cat-able bpf_link because ultimately we want to query
> file descriptors which actually 'read' bpf program outputs.
>
> Current bpf_link is not cat-able.

Let's be precise here. By cat-able you mean that you'd like to just
start issuing read() calls and get output of bpfdump program, is that
right? Wouldn't that mean that you can read output just once? So it
won't be possible to create anonymous dumper and periodically get
up-to-date output. User would need to call RAW_TRACEPOINT_OPEN every
single time it would need to do a dump. I guess that would work, but
I'm not seeing why it has to be that way.

What I proposed above was that once you create a bpf_link, you can use
that same bpf_link to open many seq_files, each with its own FD, which
can be read() independently of each other. This behavior would be
consistent with named bpfdumper, which can produce many independent
seq_files with each new open() syscall, but all from exactly the same
attached bpfdumper.

> I try to hack by manipulating fops and other stuff, it may work,
> but looks ugly. Or we create a bpf_catable_link and build an
> infrastructure around that? Not sure whether it is worthwhile for this
> one-off thing (bpfdump)?
>
> Or to query anonymous bpf dumpers, I can just write a bpf dump program
> to go through all fd's to find out.
>
> BTW, my current approach (in my private branch),
> anonymous dumper:
>     bpf_raw_tracepoint_open(NULL, prog) -> cat-able fd

So just to re-iterate. If my understanding is correct, this cat-able
fd is a single seq_file. If you want to dump it again, you would call
bpf_raw_tracepoint_open() again?

> file dumper:
>     bpf_obj_pin(prog, path)  -> a cat-able file

While in this case, you'd open() as many times as you need and get new
cat-able fd for each of those calls.

>
> If you consider program itself is a link, this is like what
> described below in 3 and 4.

Program is not a link. Same as cgroup BPF program attached somewhere
to a cgroup is not a link. Because that BPF program can be attached to
multiple cgroups or even under multiple attach types to the same
cgroup. Same here, the same dumper can be "attached" in bpfdumpfs multiple
times, and each instance of attachment is a link, but it's still the
same program.

>
>
> > FD is closed, dumper program is detached and dumper is destroyed
> > (unless pinned in bpffs, just like with any other bpf_link.
> > 3. At this point bpf_dumper_link can be treated like a factory of
> > seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
> > illustration purposes) command, that accepts dumper link FD and
> > returns a new seq_file FD, which can be read() normally (or, e.g.,
> > cat'ed from shell).
>
> In this case, link_query may not be accurate if a bpf_dumper_link
> is created but no corresponding bpf_dumper_open_file. What we really
> need to iterate through all dumper seq_file FDs.

If the goal is to iterate all the open seq_files (i.e., bpfdump active
sessions), then bpf_link is clearly not the right approach. But I
thought we are talking about iterating all the bpfdump programs
attachments, not **sessions**, in which case bpf_link is exactly the
right approach.


>
> > 4. Additionally, this anonymous bpf_link can be pinned/mounted in
> > bpfdumpfs. We can do it as BPF_OBJ_PIN or as a separate command. Once
> > pinned at, e.g., /sys/fs/bpfdump/task/my_dumper, just opening that
> > file is equivalent to BPF_DUMPER_OPEN_FILE and will create a new
> > seq_file that can be read() independently from other seq_files opened
> > against the same dumper. Pinning bpfdumpfs entry also bumps refcnt of
> > bpf_link itself, so even if process that created link dies, bpf dumper
> > stays attached until its bpfdumpfs entry is deleted.
> >
> > Apart from BPF_DUMPER_OPEN_FILE and open()'ing bpfdumpfs file duality,
> > it seems pretty consistent and follows safe-by-default auto-cleanup of
> > anonymous link, unless pinned in bpfdumpfs (or one can still pin
> > bpf_link in bpffs, but it can't be open()'ed the same way, it just
> > preserves BPF program from being cleaned up).
> >
> > Out of all schemes I could come up with, this one seems most unified
> > and nicely fits into bpf_link infra. Thoughts?
> >
> >>
> >> To facilitate target seq_ops->show() to get the
> >> bpf program easily, dumper creation increased
> >> the target-provided seq_file private data size
> >> so bpf program pointer is also stored in seq_file
> >> private data.
> >>
> >> Further, a seq_num which represents how many
> >> bpf_dump_get_prog() has been called is also
> >> available to the target seq_ops->show().
> >> Such information can be used to e.g., print
> >> banner before printing out actual data.
> >>
> >> Note the seq_num does not represent the num
> >> of unique kernel objects the bpf program has
> >> seen. But it should be a good approximate.
> >>
> >> A target feature BPF_DUMP_SEQ_NET_PRIVATE
> >> is implemented specifically useful for
> >> net based dumpers. It sets net namespace
> >> as the current process net namespace.
> >> This avoids changing existing net seq_ops
> >> in order to retrieve net namespace from
> >> the seq_file pointer.
> >>
> >> For open dumper files, anonymous or not, the
> >> fdinfo will show the target and prog_id associated
> >> with that file descriptor. For dumper file itself,
> >> a kernel interface will be provided to retrieve the
> >> prog_id in one of the later patches.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/linux/bpf.h            |   5 +
> >>   include/uapi/linux/bpf.h       |   6 +-
> >>   kernel/bpf/dump.c              | 338 ++++++++++++++++++++++++++++++++-
> >>   kernel/bpf/syscall.c           |  11 +-
> >>   tools/include/uapi/linux/bpf.h |   6 +-
> >>   5 files changed, 362 insertions(+), 4 deletions(-)
> >>
> >
> > [...]
> >
Alexei Starovoitov April 15, 2020, 4:46 p.m. UTC | #20
On Tue, Apr 14, 2020 at 09:45:08PM -0700, Andrii Nakryiko wrote:
> >
> > > FD is closed, dumper program is detached and dumper is destroyed
> > > (unless pinned in bpffs, just like with any other bpf_link.
> > > 3. At this point bpf_dumper_link can be treated like a factory of
> > > seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
> > > illustration purposes) command, that accepts dumper link FD and
> > > returns a new seq_file FD, which can be read() normally (or, e.g.,
> > > cat'ed from shell).
> >
> > In this case, link_query may not be accurate if a bpf_dumper_link
> > is created but no corresponding bpf_dumper_open_file. What we really
> > need to iterate through all dumper seq_file FDs.
> 
> If the goal is to iterate all the open seq_files (i.e., bpfdump active
> sessions), then bpf_link is clearly not the right approach. But I
> thought we are talking about iterating all the bpfdump programs
> attachments, not **sessions**, in which case bpf_link is exactly the
> right approach.

That's an important point. What is the pinned /sys/kernel/bpfdump/tasks/foo ?
Every time 'cat' opens it a new seq_file is created with new FD, right ?
Reading of that file can take infinite amount of time, since 'cat' can be
paused in the middle.
I think we're dealing with several different kinds of objects here.
1. "template" of seq_file that is seen with 'ls' in /sys/kernel/bpfdump/
2. given instance of seq_file after "template" was open
3. bpfdumper program
4. and now links. One bpf_link from seq_file template to bpf prog and
  many other bpf_links from actual seq_file kernel object to bpf prog.
  I think both kinds of links need to be iteratable via get_next_id.

At the same time I don't think 1 and 2 are links.
read-ing link FD should not trigger program execution. link is the connecting
abstraction. It shouldn't be used to trigger anything. It's static.
Otherwise read-ing cgroup-bpf link would need to trigger cgroup bpf prog too.
FD that points to actual seq_file is the one that should be triggering
iteration of kernel objects and corresponding execution of linked prog.
That FD can be anon_inode returned from raw_tp_open (or something else)
or FD from open("/sys/kernel/bpfdump/foo").

The more I think about all the objects involved the more it feels that the
whole process should consist of three steps (instead of two).
1. load bpfdump prog
2. create seq_file-template in /sys/kernel/bpfdump/
   (not sure which api should do that)
3. use bpf_link_create api to attach bpfdumper prog to that seq_file-template

Then when the file is opened a new bpf_link is created for that reading session.
At the same time both kinds of links (to the template and to the seq_file) should be
iteratable for observability reasons, but get_fd_from_id on them should probably
be disallowed, since holding such an FD to these special links by another process
has odd semantics.

Similarly for anon seq_file it should be three step process as well:
1. load bpfdump prog
2. create anon seq_file (api is tbd) that returns FD
3. use bpf_link_create to attach prog to seq_file FD

Maybe it's all overkill. These are just my thoughts so far.
Andrii Nakryiko April 16, 2020, 1:48 a.m. UTC | #21
On Wed, Apr 15, 2020 at 9:46 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Apr 14, 2020 at 09:45:08PM -0700, Andrii Nakryiko wrote:
> > >
> > > > FD is closed, dumper program is detached and dumper is destroyed
> > > > (unless pinned in bpffs, just like with any other bpf_link.
> > > > 3. At this point bpf_dumper_link can be treated like a factory of
> > > > seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
> > > > illustration purposes) command, that accepts dumper link FD and
> > > > returns a new seq_file FD, which can be read() normally (or, e.g.,
> > > > cat'ed from shell).
> > >
> > > In this case, link_query may not be accurate if a bpf_dumper_link
> > > is created but no corresponding bpf_dumper_open_file. What we really
> > > need to iterate through all dumper seq_file FDs.
> >
> > If the goal is to iterate all the open seq_files (i.e., bpfdump active
> > sessions), then bpf_link is clearly not the right approach. But I
> > thought we are talking about iterating all the bpfdump programs
> > attachments, not **sessions**, in which case bpf_link is exactly the
> > right approach.
>
> That's an important point. What is the pinned /sys/kernel/bpfdump/tasks/foo ?

Assuming it's not a rhetorical question, foo is a pinned bpf_dumper
link (in my interpretation of all this).

> Every time 'cat' opens it a new seq_file is created with new FD, right ?

yes

> Reading of that file can take infinite amount of time, since 'cat' can be
> paused in the middle.

yep, correct (though most use cases are probably going to be very short-lived)

> I think we're dealing with several different kinds of objects here.
> 1. "template" of seq_file that is seen with 'ls' in /sys/kernel/bpfdump/

Let's clarify here again, because this can be interpreted differently.

Are you talking about, e.g., /sys/fs/bpfdump/task directory that
defines what class of items should be iterated? Or you are talking
about named dumper: /sys/fs/bpfdump/task/my_dumper?

If the former, I agree that it's not a link. If the latter, then
that's what we've been so far calling "a named bpfdumper". Which is
what I argue is a link, pinned in bpfdumpfs (*not bpffs*).

UPD: reading further, seems like it's some third interpretation, so
please clarify.

> 2. given instance of seq_file after "template" was open

Right, corresponding to "bpfdump session" (has its own unique session_id).

> 3. bpfdumper program

Yep, BPF_PROG_LOAD returns FD to verified bpfdumper program.

> 4. and now links. One bpf_link from seq_file template to bpf prog and

So I guess "seq_file template" is /sys/kernel/bpfdump/tasks direntry
itself, which has to be specified as FD during BPF_PROG_LOAD, is that
right? If yes, I agree, "seq_file template" + attached bpf_prog is a
link.

>   many other bpf_links from actual seq_file kernel object to bpf prog.

I think this one is not a link at all. It's a bpfdumper session. For
me this is the equivalent of a single BPF program invocation on a cgroup due
to a single packet. I understand that in this case it's multiple BPF
program invocations, so it's not exactly 1:1, but if we had an easy
way to do iteration from inside BPF program over all, say, tasks, that
would be one BPF program invocation with a loop inside. So to me one
seq_file session is analogous to a single BPF program execution (or,
say one hardware event triggering one execution of perf_event BPF
program).

>   I think both kinds of links need to be iteratable via get_next_id.
>
> At the same time I don't think 1 and 2 are links.
> read-ing link FD should not trigger program execution. link is the connecting
> abstraction. It shouldn't be used to trigger anything. It's static.
> Otherwise read-ing cgroup-bpf link would need to trigger cgroup bpf prog too.
> FD that points to actual seq_file is the one that should be triggering
> iteration of kernel objects and corresponding execution of linked prog.

Yep, I agree totally, reading the bpf_link FD directly as if it were a
seq_file seems weird and would support only a single read.

> That FD can be anon_inode returned from raw_tp_open (or something else)

raw_tp_open currently always returns bpf_link FDs, so if this suddenly
returns readable seq_file instead, that would be weird, IMO.


> or FD from open("/sys/kernel/bpfdump/foo").

Agreed.

>
> The more I think about all the objects involved the more it feels that the
> whole process should consist of three steps (instead of two).
> 1. load bpfdump prog
> 2. create seq_file-template in /sys/kernel/bpfdump/
>    (not sure which api should do that)

Hm... ok, I think seq_file-template means something else entirely.
It's not an attached BPF program, but also not a /sys/fs/bpfdump/task
"provider". What is it and what is its purpose? Also, how is it
cleaned up if application crashes between creating "seq_file-template"
and attaching BPF program to it?

> 3. use bpf_link_create api to attach bpfdumper prog to that seq_file-template
>
> Then when the file is opened a new bpf_link is created for that reading session.
> At the same time both kinds of links (to teamplte and to seq_file) should be
> iteratable for observability reasons, but get_fd_from_id on them should probably
> be disallowed, since holding such FD to these special links by other process
> has odd semantics.

This special get_fd_from_id handling for bpfdumper links (in your
interpretation) looks like a sign that using bpf_link to represent a
specific bpfdumper session is not the right design.

As for observability of bpfdumper sessions, I think using a bpfdump
program + the task/file provider will give a good way to do this,
actually, with no need to maintain a separate IDR just for bpfdumper
sessions.

>
> Similarly for anon seq_file it should be three step process as well:
> 1. load bpfdump prog
> 2. create anon seq_file (api is tbd) that returns FD
> 3. use bpf_link_create to attach prog to seq_file FD
>
> May be it's all overkill. These are just my thoughts so far.

Just to contrast, in a condensed form, what I was proposing:

For named dumper:
1. load bpfdump prog
2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
bpf_link anon FD back
3. pin link in bpfdumpfs (e.g., /sys/fs/bpfdump/task/my_dumper)
4. each open() of /sys/fs/bpfdump/task/my_dumper produces new
bpfdumper session/seq_file

For anon dumper:
1. load bpfdump prog
2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
bpf_link anon FD back
3. give bpf_link FD to some new API (say, BPF_DUMP_NEW_SESSION or
whatever name) to create seq_file/bpfdumper session, which will create
FD that can be read(). One can do that many times, each time getting
its own bpfdumper session.

The first two steps are exactly the same, as they should be, because
a named/anon dumper is still the same dumper. Note also that we can use
the bpf_link FD of a named dumper and the BPF_DUMP_NEW_SESSION command to also
create sessions, which further underlines that the only difference
between a named and an anon dumper is the bpfdumpfs direntry that allows
creating a new seq_file/session by doing a normal open(), instead of
BPF's BPF_DUMP_NEW_SESSION.

Named vs anon dumper is like "named" vs "anon" bpf_link -- we don't
even talk in those terms about bpf_link, because the only difference
is pinned direntry in a special FS, really.
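
As a usage sketch for the named-dumper path: once the link is pinned as,
say, /sys/fs/bpfdump/task/my_dumper, every open() below creates an
independent session that can be read() to completion:

  #include <fcntl.h>
  #include <unistd.h>

  static void dump_once(void)
  {
          char buf[4096];
          ssize_t n;
          /* each open() produces a new seq_file/bpfdump session */
          int fd = open("/sys/fs/bpfdump/task/my_dumper", O_RDONLY);

          if (fd < 0)
                  return;
          while ((n = read(fd, buf, sizeof(buf))) > 0)
                  write(STDOUT_FILENO, buf, n);
          close(fd);
  }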
Yonghong Song April 16, 2020, 7:15 a.m. UTC | #22
On 4/15/20 6:48 PM, Andrii Nakryiko wrote:
> On Wed, Apr 15, 2020 at 9:46 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>>
>> On Tue, Apr 14, 2020 at 09:45:08PM -0700, Andrii Nakryiko wrote:
>>>>
>>>>> FD is closed, dumper program is detached and dumper is destroyed
>>>>> (unless pinned in bpffs, just like with any other bpf_link.
>>>>> 3. At this point bpf_dumper_link can be treated like a factory of
>>>>> seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
>>>>> illustration purposes) command, that accepts dumper link FD and
>>>>> returns a new seq_file FD, which can be read() normally (or, e.g.,
>>>>> cat'ed from shell).
>>>>
>>>> In this case, link_query may not be accurate if a bpf_dumper_link
>>>> is created but no corresponding bpf_dumper_open_file. What we really
>>>> need to iterate through all dumper seq_file FDs.
>>>
>>> If the goal is to iterate all the open seq_files (i.e., bpfdump active
>>> sessions), then bpf_link is clearly not the right approach. But I
>>> thought we are talking about iterating all the bpfdump programs
>>> attachments, not **sessions**, in which case bpf_link is exactly the
>>> right approach.
>>
>> That's an important point. What is the pinned /sys/kernel/bpfdump/tasks/foo ?
> 
> Assuming it's not a rhetorical question, foo is a pinned bpf_dumper
> link (in my interpretation of all this).
> 
>> Every time 'cat' opens it a new seq_file is created with new FD, right ?
> 
> yes
> 
>> Reading of that file can take infinite amount of time, since 'cat' can be
>> paused in the middle.
> 
> yep, correct (though most use case probably going to be very short-lived)
> 
>> I think we're dealing with several different kinds of objects here.
>> 1. "template" of seq_file that is seen with 'ls' in /sys/kernel/bpfdump/
> 
> Let's clarify here again, because this can be interpreted differently.
> 
> Are you talking about, e.g., /sys/fs/bpfdump/task directory that
> defines what class of items should be iterated? Or you are talking
> about named dumper: /sys/fs/bpfdump/task/my_dumper?
> 
> If the former, I agree that it's not a link. If the latter, then
> that's what we've been so far calling "a named bpfdumper". Which is
> what I argue is a link, pinned in bpfdumpfs (*not bpffs*).
> 
> UPD: reading further, seems like it's some third interpretation, so
> please clarify.
> 
>> 2. given instance of seq_file after "template" was open
> 
> Right, corresponding to "bpfdump session" (has its own unique session_id).
> 
>> 3. bpfdumper program
> 
> Yep, BPF_PROG_LOAD returns FD to verified bpfdumper program.
> 
>> 4. and now links. One bpf_link from seq_file template to bpf prog and
> 
> So I guess "seq_file template" is /sys/kernel/bpfdump/tasks direntry
> itself, which has to be specified as FD during BPF_PROG_LOAD, is that
> right? If yes, I agree, "seq_file template" + attached bpf_prog is a
> link.
> 
>>    many other bpf_links from actual seq_file kernel object to bpf prog.
> 
> I think this one is not a link at all. It's a bpfdumper session. For
> me this is equivalent of a single BPF program invocation on cgroup due
> to a single packet. I understand that in this case it's multiple BPF
> program invocations, so it's not exactly 1:1, but if we had an easy
> way to do iteration from inside BPF program over all, say, tasks, that
> would be one BPF program invocation with a loop inside. So to me one
> seq_file session is analogous to a single BPF program execution (or,
> say one hardware event triggering one execution of perf_event BPF
> program).
> 
>>    I think both kinds of links need to be iteratable via get_next_id.
>>
>> At the same time I don't think 1 and 2 are links.
>> read-ing link FD should not trigger program execution. link is the connecting
>> abstraction. It shouldn't be used to trigger anything. It's static.
>> Otherwise read-ing cgroup-bpf link would need to trigger cgroup bpf prog too.
>> FD that points to actual seq_file is the one that should be triggering
>> iteration of kernel objects and corresponding execution of linked prog.
> 
> Yep, I agree totally, reading bpf_link FD directly as if it was
> seq_file seems weird and would support only a single time to read.
> 
>> That FD can be anon_inode returned from raw_tp_open (or something else)
> 
> raw_tp_open currently always returns bpf_link FDs, so if this suddenly
> returns readable seq_file instead, that would be weird, IMO.
> 
> 
>> or FD from open("/sys/kernel/bpfdump/foo").
> 
> Agreed.
> 
>>
>> The more I think about all the objects involved the more it feels that the
>> whole process should consist of three steps (instead of two).
>> 1. load bpfdump prog
>> 2. create seq_file-template in /sys/kernel/bpfdump/
>>     (not sure which api should do that)
> 
> Hm... ok, I think seq_file-template means something else entirely.
> It's not an attached BPF program, but also not a /sys/fs/bpfdump/task
> "provider". What is it and what is its purpose? Also, how is it
> cleaned up if application crashes between creating "seq_file-template"
> and attaching BPF program to it?
> 
>> 3. use bpf_link_create api to attach bpfdumper prog to that seq_file-template
>>
>> Then when the file is opened a new bpf_link is created for that reading session.
>> At the same time both kinds of links (to teamplte and to seq_file) should be
>> iteratable for observability reasons, but get_fd_from_id on them should probably
>> be disallowed, since holding such FD to these special links by other process
>> has odd semantics.
> 
> This special get_fd_from_id handling for bpfdumper links (in your
> interpretation) looks like a sign that using bpf_link to represent a
> specific bpfdumper session is not the right design.
> 
> As for obserabilitiy of bpfdumper sessions, I think using bpfdump
> program + task/file provider will give a good way to do this,
> actually, with no need to maintain a separate IDR just for bpfdumper
> sessions.
> 
>>
>> Similarly for anon seq_file it should be three step process as well:
>> 1. load bpfdump prog
>> 2. create anon seq_file (api is tbd) that returns FD
>> 3. use bpf_link_create to attach prog to seq_file FD
>>
>> May be it's all overkill. These are just my thoughts so far.
> 
> Just to contrast, in a condensed form, what I was proposing:
> 
> For named dumper:
> 1. load bpfdump prog
> 2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
> bpf_link anon FD back

I actually tried to prototype this earlier today.
For an existing tracing program (non-raw-tracepoint, e.g., fentry/fexit),
when raw_tracepoint_open() is called,
bpf_trampoline_link_prog() is called, the trampoline is actually
updated with the bpf program and the program starts running. You can hold
this link_fd in the user application or pin it to /sys/fs/bpf.

That is what I referred to in my previous email about whether
we can have a 'cat'-able link or not. But it looks pretty hard.

Alternatively, we could still return a bpf_link.
The only thing the bpf_link does is hold a reference count for the bpf_prog
and nothing else. Later on, we can use this bpf_link to pin a dumper
or open an anonymous seq_file.

But the bpf_link just holds a reference for the prog and nothing more,
which is why I mentioned I am not 100% sure whether bpf_link is needed,
as I could achieve the same thing with the bpf_prog. Further,
it does not provide the ability to query open files (a bpf program
for the task/file target should be able to do that).

But if, for API consistency, we prefer raw_tracepoint_open() to
return a link fd, I can still do it, I guess. Or maybe link_query is
still useful in some way.
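
For reference, a minimal sketch of such a dumper link (name illustrative),
whose only job would be to keep the program alive:

  /* illustrative only; the base bpf_link already carries the prog reference */
  struct bpf_dumper_link {
          struct bpf_link link;
  };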


> 3. pin link in bpfdumpfs (e.g., /sys/fs/bpfdump/task/my_dumper)
> 4. each open() of /sys/fs/bpfdump/task/my_dumper produces new
> bpfdumper session/seq_file
> 
> For anon dumper:
> 1. load bpfdump prog
> 2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
> bpf_link anon FD back
> 3. give bpf_link FD to some new API (say, BPF_DUMP_NEW_SESSION or
> whatever name) to create seq_file/bpfdumper session, which will create
> FD that can be read(). One can do that many times, each time getting
> its own bpfdumper session.
> 
> First two steps are exactly the same, as it should be, because
> named/anon dumper is still the same dumper. Note also that we can use
> bpf_link FD of named dumper and BPF_DUMP_NEW_SESSION command to also
> create sessions, which further underlines that the only difference
> between named and anon dumper is this bpfdumpfs direntry that allows
> to create new seq_file/session by doing normal open(), instead of
> BPF's BPF_DUMP_NEW_SESSION.
> 
> Named vs anon dumper is like "named" vs "anon" bpf_link -- we don't
> even talk in those terms about bpf_link, because the only difference
> is pinned direntry in a special FS, really.
>
Alexei Starovoitov April 16, 2020, 5:04 p.m. UTC | #23
On Wed, Apr 15, 2020 at 06:48:13PM -0700, Andrii Nakryiko wrote:
> On Wed, Apr 15, 2020 at 9:46 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Apr 14, 2020 at 09:45:08PM -0700, Andrii Nakryiko wrote:
> > > >
> > > > > FD is closed, dumper program is detached and dumper is destroyed
> > > > > (unless pinned in bpffs, just like with any other bpf_link.
> > > > > 3. At this point bpf_dumper_link can be treated like a factory of
> > > > > seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
> > > > > illustration purposes) command, that accepts dumper link FD and
> > > > > returns a new seq_file FD, which can be read() normally (or, e.g.,
> > > > > cat'ed from shell).
> > > >
> > > > In this case, link_query may not be accurate if a bpf_dumper_link
> > > > is created but no corresponding bpf_dumper_open_file. What we really
> > > > need to iterate through all dumper seq_file FDs.
> > >
> > > If the goal is to iterate all the open seq_files (i.e., bpfdump active
> > > sessions), then bpf_link is clearly not the right approach. But I
> > > thought we are talking about iterating all the bpfdump programs
> > > attachments, not **sessions**, in which case bpf_link is exactly the
> > > right approach.
> >
> > That's an important point. What is the pinned /sys/kernel/bpfdump/tasks/foo ?
> 
> Assuming it's not a rhetorical question, foo is a pinned bpf_dumper
> link (in my interpretation of all this).

It wasn't a rhetorical question and your answer is different from mine :)
It's not a link. It's a template of seq_file. It's the same as
$ stat /proc/net/ipv6_route
  File: ‘/proc/net/ipv6_route’
  Size: 0         	Blocks: 0          IO Block: 1024   regular empty file

> > Every time 'cat' opens it a new seq_file is created with new FD, right ?
> 
> yes
> 
> > Reading of that file can take infinite amount of time, since 'cat' can be
> > paused in the middle.
> 
> yep, correct (though most use case probably going to be very short-lived)
> 
> > I think we're dealing with several different kinds of objects here.
> > 1. "template" of seq_file that is seen with 'ls' in /sys/kernel/bpfdump/
> 
> Let's clarify here again, because this can be interpreted differently.
> 
> Are you talking about, e.g., /sys/fs/bpfdump/task directory that
> defines what class of items should be iterated? Or you are talking
> about named dumper: /sys/fs/bpfdump/task/my_dumper?

the latter.

> 
> If the former, I agree that it's not a link. If the latter, then
> that's what we've been so far calling "a named bpfdumper". Which is
> what I argue is a link, pinned in bpfdumpfs (*not bpffs*).

It cannot be a link, since a link is only a connection between
a kernel object and a bpf prog,
whereas the seq_file is such a kernel object.

> 
> For named dumper:
> 1. load bpfdump prog
> 2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
> bpf_link anon FD back
> 3. pin link in bpfdumpfs (e.g., /sys/fs/bpfdump/task/my_dumper)
> 4. each open() of /sys/fs/bpfdump/task/my_dumper produces new
> bpfdumper session/seq_file
> 
> For anon dumper:
> 1. load bpfdump prog
> 2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
> bpf_link anon FD back
> 3. give bpf_link FD to some new API (say, BPF_DUMP_NEW_SESSION or
> whatever name) to create seq_file/bpfdumper session, which will create
> FD that can be read(). One can do that many times, each time getting
> its own bpfdumper session.

I slept on it and still fundamentally disagree that seq_file + bpf_prog
is a derivative of link. Or in OO terms, it's not a child class of bpf_link.
seq_file is its own class that should contain bpf_link as one of its
members, but it shouldn't be derived from 'class bpf_link'.

In that sense Yonghong's proposed APIs (raw_tp_open to create an anon seq_file+prog
and obj_pin to create a template of a named seq_file+prog) are the best fit.
Implementation-wise, his 'struct extra_priv_data' needs to include a
'struct bpf_link' instead of a 'struct bpf_prog *prog;' directly.

So every time 'cat' opens a named seq_file there is a bpf_link registered in the IDR.
An anon seq_file should have another bpf_link as well.

My earlier suggestion to disallow get_fd_from_id for such links is wrong.
It's fine to get an FD to such a link, but it shouldn't prevent destruction
of the seq_file. 'cat' will close the named seq_file and the 'struct extra_priv_data'
class should do link_put. If some other process did get_fd_from_id then such a link
will become dangling, just like removal of a netdev makes xdp links dangle.
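
In code, the extra_priv_data point above amounts to roughly this sketch
(field names follow the emails and the patch description, not final code):

  /* sketch of the bpfdump seq_file private data per the suggestion above */
  struct extra_priv_data {
          struct bpf_link link;   /* was: struct bpf_prog *prog; */
          /* ... the rest of the patch's private data (e.g. seq_num) ... */
  };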
Andrii Nakryiko April 16, 2020, 7:35 p.m. UTC | #24
On Thu, Apr 16, 2020 at 10:04 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Apr 15, 2020 at 06:48:13PM -0700, Andrii Nakryiko wrote:
> > On Wed, Apr 15, 2020 at 9:46 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Apr 14, 2020 at 09:45:08PM -0700, Andrii Nakryiko wrote:
> > > > >
> > > > > > FD is closed, dumper program is detached and dumper is destroyed
> > > > > > (unless pinned in bpffs, just like with any other bpf_link.
> > > > > > 3. At this point bpf_dumper_link can be treated like a factory of
> > > > > > seq_files. We can add a new BPF_DUMPER_OPEN_FILE (all names are for
> > > > > > illustration purposes) command, that accepts dumper link FD and
> > > > > > returns a new seq_file FD, which can be read() normally (or, e.g.,
> > > > > > cat'ed from shell).
> > > > >
> > > > > In this case, link_query may not be accurate if a bpf_dumper_link
> > > > > is created but no corresponding bpf_dumper_open_file. What we really
> > > > > need to iterate through all dumper seq_file FDs.
> > > >
> > > > If the goal is to iterate all the open seq_files (i.e., bpfdump active
> > > > sessions), then bpf_link is clearly not the right approach. But I
> > > > thought we are talking about iterating all the bpfdump programs
> > > > attachments, not **sessions**, in which case bpf_link is exactly the
> > > > right approach.
> > >
> > > That's an important point. What is the pinned /sys/kernel/bpfdump/tasks/foo ?
> >
> > Assuming it's not a rhetorical question, foo is a pinned bpf_dumper
> > link (in my interpretation of all this).
>
> It wasn't rhetorical question and your answer is differrent from mine :)
> It's not a link. It's a template of seq_file. It's the same as
> $ stat /proc/net/ipv6_route
>   File: ‘/proc/net/ipv6_route’
>   Size: 0               Blocks: 0          IO Block: 1024   regular empty file

I don't see a contradiction. Pinning bpfdumper link in bpfdumpfs will
create a direntry and corresponding inode. That inode's i_private
field will contain a pointer to that link. When that direntry is
open()'ed, seq_file is going to be created. That seq_file will
probably need to take refcnt on underlying bpf_link and store it in
its private data. I was *not* implying that
/sys/kernel/bpfdump/tasks/foo is same as bpf_link pinned in bpffs,
which you can restore by doing BPF_OBJ_GET. It's more of a "backed by
bpf_link", if that helps to clarify.

But in your terminology, bpfdumper bpf_link *is* "a template of
seq_file", that I agree.

>
> > > Every time 'cat' opens it a new seq_file is created with new FD, right ?
> >
> > yes
> >
> > > Reading of that file can take infinite amount of time, since 'cat' can be
> > > paused in the middle.
> >
> > yep, correct (though most use case probably going to be very short-lived)
> >
> > > I think we're dealing with several different kinds of objects here.
> > > 1. "template" of seq_file that is seen with 'ls' in /sys/kernel/bpfdump/
> >
> > Let's clarify here again, because this can be interpreted differently.
> >
> > Are you talking about, e.g., /sys/fs/bpfdump/task directory that
> > defines what class of items should be iterated? Or you are talking
> > about named dumper: /sys/fs/bpfdump/task/my_dumper?
>
> the latter.
>
> >
> > If the former, I agree that it's not a link. If the latter, then
> > that's what we've been so far calling "a named bpfdumper". Which is
> > what I argue is a link, pinned in bpfdumpfs (*not bpffs*).
>
> It cannot be a link, since link is only a connection between
> kernel object and bpf prog.
> Whereas seq_file is such kernel object.

Not sure, but maybe that's where the disconnect is? A seq_file instance
is a derivative of bpf_prog + bpfdump provider. That coupling of bpf_prog
and provider is a link to me. That bpf_link can then be used to "produce"
many independent seq_files.

I do agree that link is a connection between prog and kernel object,
but I argue that "kernel object" in this case is bpfdumper provider
(e.g., what is backing /sys/fs/bpfdump/task), not any specific
seq_file.

>
> >
> > For named dumper:
> > 1. load bpfdump prog
> > 2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
> > bpf_link anon FD back
> > 3. pin link in bpfdumpfs (e.g., /sys/fs/bpfdump/task/my_dumper)
> > 4. each open() of /sys/fs/bpfdump/task/my_dumper produces new
> > bpfdumper session/seq_file
> >
> > For anon dumper:
> > 1. load bpfdump prog
> > 2. attach prog to bpfdump "provider" (/sys/fs/bpfdump/task), get
> > bpf_link anon FD back
> > 3. give bpf_link FD to some new API (say, BPF_DUMP_NEW_SESSION or
> > whatever name) to create seq_file/bpfdumper session, which will create
> > FD that can be read(). One can do that many times, each time getting
> > its own bpfdumper session.
>
> I slept on it and still fundamentally disagree that seq_file + bpf_prog
> is a derivative of link. Or in OoO terms it's not a child class of bpf_link.
> seq_file is its own class that should contain bpf_link as one of its
> members, but it shouldn't be derived from 'class bpf_link'.

Referring to inheritance here doesn't seem necessary or helpful, I'd
rather not confuse and complicate all this further.

bpfdump provider/target + bpf_prog = bpf_link. bpf_link is "a factory"
of seq_files. That's it, no inheritance.


>
> In that sense Yonghong proposed api (raw_tp_open to create anon seq_file+prog
> and obj_pin to create a template of named seq_file+prog) are the best fit.
> Implementation wise his 'struct extra_priv_data' needs to include
> 'struct bpf_link' instead of 'struct bpf_prog *prog;' directly.
>
> So evertime 'cat' opens named seq_file there is bpf_link registered in IDR.
> Anon seq_file should have another bpf_link as well.

So that's where I disagree and don't see the point of having all those
short-lived bpf_links. cat opening seq_file doesn't create a bpf_link,
it creates a seq_file. If we want to associate some ID with it, it's
fine, but it's not a bpf_link ID (in my opinion, of course).

>
> My earlier suggestion to disallow get_fd_from_id for such links is wrong.
> It's fine to get an FD to such link, but it shouldn't prevent destruction

This is again some custom limitation and implementation, which again
I think is a sign that this is not the ideal design. And now that we'll
have a bpfdumper for iterating task/file, I also don't think that
everything needs an ID to be "iterable" anymore.


> of seq_file. 'cat' will close named seq_file and 'struct extra_priv_data' class
> should do link_put. If some other process did get_fd_from_id then such link will
> become dangling. Just like removal of netdev will make dangling xdp links.
Alexei Starovoitov April 16, 2020, 11:18 p.m. UTC | #25
On Thu, Apr 16, 2020 at 12:35:07PM -0700, Andrii Nakryiko wrote:
> >
> > I slept on it and still fundamentally disagree that seq_file + bpf_prog
> > is a derivative of link. Or in OoO terms it's not a child class of bpf_link.
> > seq_file is its own class that should contain bpf_link as one of its
> > members, but it shouldn't be derived from 'class bpf_link'.
> 
> Referring to inheritance here doesn't seem necessary or helpful, I'd
> rather not confuse and complicate all this further.
> 
> bpfdump provider/target + bpf_prog = bpf_link. bpf_link is "a factory"
> of seq_files. That's it, no inheritance.

named seq_file in bpfdumpfs does indeed look like "factory" pattern.
And yes, there is no inheritance between named seq_file and given seq_file after open().

> > In that sense Yonghong proposed api (raw_tp_open to create anon seq_file+prog
> > and obj_pin to create a template of named seq_file+prog) are the best fit.
> > Implementation wise his 'struct extra_priv_data' needs to include
> > 'struct bpf_link' instead of 'struct bpf_prog *prog;' directly.
> >
> > So evertime 'cat' opens named seq_file there is bpf_link registered in IDR.
> > Anon seq_file should have another bpf_link as well.
> 
> So that's where I disagree and don't see the point of having all those
> short-lived bpf_links. cat opening seq_file doesn't create a bpf_link,
> it creates a seq_file. If we want to associate some ID with it, it's
> fine, but it's not a bpf_link ID (in my opinion, of course).

I thought we're on the same page with the definition of bpf_link ;)
Let's recap. To make it easier I'll keep using object oriented analogy
since I think it's the most appropriate to internalize all the concepts.
- first what is file descriptor? It's nothing but std::shared_ptr<> to some kernel object.
- then there is a key class == struct bpf_link
- for raw tracepoints raw_tp_open() returns an FD to child class of bpf_link
  which is 'struct bpf_raw_tp_link'.
  In other words it returns std::shared_ptr<struct bpf_raw_tp_link>.
- for fentry/fexit/freplace/lsm raw_tp_open() returns an FD to a different child
  class of bpf_link which is "struct bpf_tracing_link".
  This is std::shared_ptr<struct bpf_tracing_link>.
- for cgroup-bpf progs bpf_link_create() returns an FD to child class of bpf_link
  which is 'struct bpf_cgroup_link'.
  This is std::shared_ptr<struct bpf_cgroup_link>.

In all those cases three different shared pointers are seen as file descriptors
from the process pov but they point to different children of bpf_link base class.
link_update() is a method of base class bpf_link and it has to work for
all children classes.
Similarly your future get_obj_info_by_fd() from any of these three shared pointers
will return information specific to that child class.
In all those cases one link attaches one program to one kernel object.
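
Roughly, in kernel struct terms (layouts abbreviated from the bpf_link code
of that time):

  struct bpf_link {
          atomic64_t refcnt;
          const struct bpf_link_ops *ops;
          struct bpf_prog *prog;
  };

  struct bpf_raw_tp_link {
          struct bpf_link link;            /* base class */
          struct bpf_raw_event_map *btp;   /* the raw tracepoint attached to */
  };

  struct bpf_cgroup_link {
          struct bpf_link link;            /* base class */
          struct cgroup *cgroup;           /* the cgroup attached to */
          enum bpf_attach_type type;
  };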

Now back to bpfdumpfs.
In Yonghong's latest patches raw_tp_open() returns an FD that is a pointer
to a seq_file. This is an existing kernel base class. It has its own seq_operations
virtual methods that are defined for bpfdumpfs_seq_file, which is a child class
of seq_file that keeps the start/stop/next methods as-is and overrides the show()
method to be able to call the bpf prog for every iteratable kernel object.

What you're proposing is to make the bpfdump_seq_file class a child of two
base classes (seq_file and bpf_link), whereas I'm saying that it should be
a child of seq_file only, since the bpf_link methods do not apply to it.
For example, there is no sensible behavior for link_update() on such a dual-parent object.

In my proposal bpfdump_seq_file class keeps cat-ability and all methods of seq_file
and no extra methods from bpf_link that don't belong in seq_file.
But I'm arguing that bpfdump_seq_file class should have a member bpf_link
instead of simply holding bpf_prog via refcnt.
Let's call this child class of bpf_link the bpf_seq_file_link class. Having
bpf_seq_file_link as a member would mean that such a link is discoverable via the IDR;
the user process can get an FD to it and can do get_obj_info_by_fd().
The information returned for such a link will be a pair (bpfdump_prog, bpfdump_seq_file).
Meaning that at any given time 'bpftool link show' will show where every bpf
prog in the system is attached.
Say named bpfdump_seq_file exists in /sys/kernel/bpfdump/tasks/foo.
No one is doing a 'cat' on it yet.
"bpftool link show" will show one link which is a pair (bpfdump_prog, "tasks/foo").
Now two humans are doing 'cat' of that file.
The bpfdump_prog refcnt is now 3 and there are two additional seq_files created
by the kernel when the user said open("/sys/kernel/bpfdump/tasks/foo").
If these two humans are slow somebody could have done "rm /sys/kernel/bpfdump/tasks/foo"
and that bpfdump_seq_file and its member bpf_seq_file_link would be gone,
but the two other bpfdump_seq_file-s are still active and they are different.
"bpftool link show" should be showing two pairs (bpfdump_prog, seq_file_A) and
(bpfdump_prog, seq_file_B).
The users could have been in different pid namespaces. What seq_file_A is
iterating could be completely different from seq_file_B, but I think it's
useful for admin to know where all bpf progs in the system are attached and
what kind of things are triggering them.
Andrii Nakryiko April 17, 2020, 5:11 a.m. UTC | #26
On Thu, Apr 16, 2020 at 4:18 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Apr 16, 2020 at 12:35:07PM -0700, Andrii Nakryiko wrote:
> > >
> > > I slept on it and still fundamentally disagree that seq_file + bpf_prog
> > > is a derivative of link. Or in OoO terms it's not a child class of bpf_link.
> > > seq_file is its own class that should contain bpf_link as one of its
> > > members, but it shouldn't be derived from 'class bpf_link'.
> >
> > Referring to inheritance here doesn't seem necessary or helpful, I'd
> > rather not confuse and complicate all this further.
> >
> > bpfdump provider/target + bpf_prog = bpf_link. bpf_link is "a factory"
> > of seq_files. That's it, no inheritance.
>
> named seq_file in bpfdumpfs does indeed look like "factory" pattern.
> And yes, there is no inheritance between named seq_file and given seq_file after open().
>
> > > In that sense Yonghong proposed api (raw_tp_open to create anon seq_file+prog
> > > and obj_pin to create a template of named seq_file+prog) are the best fit.
> > > Implementation wise his 'struct extra_priv_data' needs to include
> > > 'struct bpf_link' instead of 'struct bpf_prog *prog;' directly.
> > >
> > > So evertime 'cat' opens named seq_file there is bpf_link registered in IDR.
> > > Anon seq_file should have another bpf_link as well.
> >
> > So that's where I disagree and don't see the point of having all those
> > short-lived bpf_links. cat opening seq_file doesn't create a bpf_link,
> > it creates a seq_file. If we want to associate some ID with it, it's
> > fine, but it's not a bpf_link ID (in my opinion, of course).
>
> I thought we're on the same page with the definition of bpf_link ;)
> Let's recap. To make it easier I'll keep using object oriented analogy
> since I think it's the most appropriate to internalize all the concepts.
> - first what is file descriptor? It's nothing but std::shared_ptr<> to some kernel object.

I agree overall, but if I may be 100% pedantic, the FD and kernel object
topology can be quite a bit more complicated:

FD ---> struct file --(private_data)----> kernel object
     /                                 /
FD --                                 /
                                     /
FD ---> struct file --(private_data)/

I'll refer to this a bit further down.

> - then there is a key class == struct bpf_link
> - for raw tracepoints raw_tp_open() returns an FD to child class of bpf_link
>   which is 'struct bpf_raw_tp_link'.
>   In other words it returns std::shared_ptr<struct bpf_raw_tp_link>.
> - for fentry/fexit/freplace/lsm raw_tp_open() returns an FD to a different child
>   class of bpf_link which is "struct bpf_tracing_link".
>   This is std::share_ptr<struct bpf_trace_link>.
> - for cgroup-bpf progs bpf_link_create() returns an FD to child class of bpf_link
>   which is 'struct bpf_cgroup_link'.
>   This is std::share_ptr<struct bpf_cgroup_link>.
>
> In all those cases three different shared pointers are seen as file descriptors
> from the process pov but they point to different children of bpf_link base class.
> link_update() is a method of base class bpf_link and it has to work for
> all children classes.
> Similarly your future get_obj_info_by_fd() from any of these three shared pointers
> will return information specific to that child class.
> In all those cases one link attaches one program to one kernel object.
>

Thank you for a nice recap! :)

> Now back to bpfdumpfs.
> In the latest Yonghong's patches raw_tp_open() returns an FD that is a pointer
> to seq_file. This is existing kernel base class. It has its own seq_operations
> virtual methods that are defined for bpfdumpfs_seq_file which is a child class
> of seq_file that keeps start/stop/next methods as-is and overrides show()
> method to be able to call bpf prog for every iteratable kernel object.
>
> What you're proposing is to make bpfdump_seq_file class to be a child of two
> base classes (seq_file and bpf_link) whereas I'm saying that it should be
> a child of seq_file only, since bpf_link methods do not apply to it.
> Like there is no sensible behavior for link_update() on such dual parent object.
>
> In my proposal bpfdump_seq_file class keeps cat-ability and all methods of seq_file
> and no extra methods from bpf_link that don't belong in seq_file.
> But I'm arguing that bpfdump_seq_file class should have a member bpf_link
> instead of simply holding bpf_prog via refcnt.
> Let's call this child class of bpf_link the bpf_seq_file_link class. Having
> bpf_seq_file_link as member would mean that such link is discoverable via IDR,
> the user process can get an FD to it and can do get_obj_info_by_fd().
> The information returned for such link will be a pair (bpfdump_prog, bpfdump_seq_file).
> Meaning that at any given time 'bpftool link show' will show where every bpf
> prog in the system is attached to.
> Say named bpfdump_seq_file exists in /sys/kernel/bpfdump/tasks/foo.
> No one is doing a 'cat' on it yet.
> "bpftool link show" will show one link which is a pair (bpfdump_prog, "tasks/foo").
> Now two humans are doing 'cat' of that file.
> The bpfdump_prog refcnt is now 3 and there are two additional seq_files created
> by the kernel when user said open("/sys/kernel/bpfdump/tasks/foo").
> If these two humans are slow somebody could have done "rm /sys/kernel/bpfdump/tasks/foo"
> and that bpfdump_seq_file and it's member bpf_seq_file_link would be gone,
> but two other bpdump_seq_file-s are still active and they are different.
> "bpftool link show" should be showing two pairs (bpfdump_prog, seq_file_A) and
> (bpfdump_prog, seq_file_B).
> The users could have been in different pid namespaces. What seq_file_A is
> iterating could be completely different from seq_file_B, but I think it's
> useful for admin to know where all bpf progs in the system are attached and
> what kind of things are triggering them.

How exactly bpf_link is implemented for bpfdumper is not all that
important to me. It can be a separate struct, a field, a pointer to a
separate struct -- not that different.

I didn't mean for this thread to be just another endless discussion,
so I'll try to wrap it up in this email. I really like the bpfdumper idea
and its usability overall. Getting a call for the end of iteration is a big deal
and I'm glad I got at least that :)

But let me try to just point out a few things you proposed above that I
disagree with on the high level, as well as provide a few supporting
points for the scheme I proposed previously. If all that is not
convincing, I rest my case and I won't object to bpfdumper going in in
any form, as long as I can use it anonymously with an extra call at the
end to do post-aggregation.

So, first. I do not see the point of treating each instance of seq_file
as if it were a new bpf_link:
1. It's a bit like saying that each inherited cgroup bpf program in
effective_prog_array should have a new bpf_link created. That's not how
it's done for cgroups, and I think for a good reason.
2. Further, each seq_file, when created from the "seq_file template",
should take a refcnt on the bpf_prog, not the bpf_link, because seq_file
expects the bpf_prog itself to stay exactly the same throughout the
entire iteration process. bpf_link, on the other hand, allows in-place
updates of the bpf_prog, which could potentially ruin seq_file
iteration. I know we can disable that, but it just feels like an
arbitrary restriction.
3. Suppose each seq_file is/has a bpf_link and one can iterate over each
active seq_file (what I've been calling a session, but whatever). What
kind of user-facing info can you get from get_obj_info? prog_id,
prog_tag, provider ID/name (e.g., /sys/kernel/bpfdump/task). Is that
useful? Yes! Is that enough to do anything actionable? No! You'd
immediately need to know the PIDs of all processes that have an FD open
to that seq_file (and, see the diagram above, there could be many
processes with many FDs for the same seq_file). bpf_link doesn't know
all the PIDs. So it's this generic "who has this file opened" problem
all over again, which I'm already pretty tired of talking about :)
Except now we have at least 3 ways to answer such questions: iterating
procfs+fdinfo, drgn scripts, and now also a bpfdump program for the
task/file provider.

So even if you can enumerate active bpfdump seq_files in the system,
you still need extra info and have to iterate over task/file items to be
able to do anything about it. Which is one of the reasons I think
auto-creating bpf_links for each seq_file is useless and will just
pollute the view of the system (from a bpf_link standpoint).

Now, second. Getting back to what I proposed with the 3-4 step process
(load --> attach (create_link) --> (pin in bpfdumpfs + open() |
BPF_NEW_DUMP_SESSION)). I realize now that attach might seem
superficial here, because it doesn't provide any extra information (the
FD of the provider was specified at prog load time). It does feel a bit
weird, but:

1. It's not that weird, because fentry/fexit/freplace and tp_btf also
don't provide anything extra: all the info was specified at load time.
2. This attach step is a good point to provide some sort of
"parametrization" to narrow down the behavior of providers. I'll give two
examples that I think are going to be very useful and that we'll
eventually add support for in one way or another.

Example A. task/file provider. Instead of iterating over all tasks,
the simplest extension would be to specify **one** specific task PID
to iterate all files of. Attach time would be the place to specify
this PID. We don't need to know the PID at load time, because that
doesn't change anything about BPF program validation and the verifier
just doesn't need to know it. So now, once attached, a bpf_link is
created that can be pinned in bpfdumpfs, or BPF_NEW_DUMP_SESSION can be
used to create potentially many seq_files (e.g., polling every second)
to see all open files of a specific task. We can keep generalizing to,
say, all tasks in a given cgroup. All of that can be implemented by
filtering inside the BPF program, of course, but having a narrower scope
from the beginning could save tons of time and resources.
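
To make that last point concrete, here is a rough sketch of the pure
in-program filtering fallback. This is just my illustration: the
task/file provider doesn't exist yet, so the context signature, the
section name and the bpf_seq_printf() usage (the helper comes from this
series) are all my guesses, not something the patches define:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* assumed context: one invocation per (task, file) pair */
SEC("dump/task/file")
int BPF_PROG(dump_task_file, struct seq_file *seq,
             struct task_struct *task, struct file *file)
{
        /* baked into the program; with attach-time parametrization this
         * would come from the bpf_link and the kernel would skip all
         * other tasks for us
         */
        const int interesting_pid = 1234;
        static const char fmt[] = "pid %d has a file open\n";
        __u64 pid;

        if (task->tgid != interesting_pid)
                return 0;               /* filter out everything else */

        pid = task->tgid;
        bpf_seq_printf(seq, fmt, sizeof(fmt), &pid, sizeof(pid));
        return 0;
}

char LICENSE[] SEC("license") = "GPL";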

Example B. Iterating BPF map items. We already have a bpf_map provider;
the next one could be bpf_map/items, which would call the BPF program
for each key/value pair, something like:

int BPF_PROG(for_each_map_kv, struct seq_file *seq, struct bpf_map *map,
             void *key, size_t key_size, void *value, size_t value_size)
{
    ...
}

Now, once you have that, a natural next desire is to say "only dump
items of the map with ID 123", instead of iterating over all BPF maps in
the system. That map ID could be specified at attachment time, when
the bpf_link with these parameters is going to be created. Again, at load
time the BPF verifier doesn't need to know the specific BPF map we are
going to iterate, if we stick to generic key/value blob semantics.
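
And again, just to spell out the in-program fallback: reusing the
signature above, filtering by map ID inside the program would be roughly
the following (the map->id access and the "return 0 to silently skip"
convention are my assumptions, nothing final):

int BPF_PROG(for_each_map_kv, struct seq_file *seq, struct bpf_map *map,
             void *key, size_t key_size, void *value, size_t value_size)
{
        /* the interesting map ID is hard-coded; with an attach-time
         * parameter the kernel would never even call us for other maps
         */
        if (!map || map->id != 123)
                return 0;

        /* ... bpf_seq_printf() the key/value pair for map #123 only ... */
        return 0;
}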

So with such possibilities considered, I hope having an explicit
LINK_CREATE step starts making much more sense. This, plus not having
to distinguish between named and anonymous dumpers (just like we don't
distinguish a pinned, i.e. "named", bpf_link from an anonymous one),
still makes me believe that this is the better approach.

But alas, my goal here is to bring different perspectives, not to
obstruct or delay progress. So I'm going to spend some more time
reviewing v2 and will provide feedback on relevant patches, but if my
arguments were not convincing, I'm fine with that. I managed to
convince you guys that an "anonymous" bpfdumper without bpfdumpfs pinning
and a post-aggregation callback are good things and I'm happy about
that already. Can't get 100% of what I want, right? :)
Yonghong Song April 19, 2020, 6:11 a.m. UTC | #27
On 4/16/20 10:11 PM, Andrii Nakryiko wrote:
> On Thu, Apr 16, 2020 at 4:18 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>>
>> On Thu, Apr 16, 2020 at 12:35:07PM -0700, Andrii Nakryiko wrote:
>>>>
>>>> I slept on it and still fundamentally disagree that seq_file + bpf_prog
>>>> is a derivative of link. Or in OO terms it's not a child class of bpf_link.
>>>> seq_file is its own class that should contain bpf_link as one of its
>>>> members, but it shouldn't be derived from 'class bpf_link'.
>>>
>>> Referring to inheritance here doesn't seem necessary or helpful, I'd
>>> rather not confuse and complicate all this further.
>>>
>>> bpfdump provider/target + bpf_prog = bpf_link. bpf_link is "a factory"
>>> of seq_files. That's it, no inheritance.
>>
>> named seq_file in bpfdumpfs does indeed look like "factory" pattern.
>> And yes, there is no inheritance between named seq_file and given seq_file after open().
>>
>>>> In that sense Yonghong proposed api (raw_tp_open to create anon seq_file+prog
>>>> and obj_pin to create a template of named seq_file+prog) are the best fit.
>>>> Implementation wise his 'struct extra_priv_data' needs to include
>>>> 'struct bpf_link' instead of 'struct bpf_prog *prog;' directly.
>>>>
>>>> So every time 'cat' opens named seq_file there is bpf_link registered in IDR.
>>>> Anon seq_file should have another bpf_link as well.
>>>
>>> So that's where I disagree and don't see the point of having all those
>>> short-lived bpf_links. cat opening seq_file doesn't create a bpf_link,
>>> it creates a seq_file. If we want to associate some ID with it, it's
>>> fine, but it's not a bpf_link ID (in my opinion, of course).
>>
>> I thought we're on the same page with the definition of bpf_link ;)
>> Let's recap. To make it easier I'll keep using object oriented analogy
>> since I think it's the most appropriate to internalize all the concepts.
>> - first what is file descriptor? It's nothing but std::shared_ptr<> to some kernel object.
> 
> I agree overall, but if I may be 100% pedantic, FD and kernel objects
> topology can be quite a bit more complicated:
> 
> FD ---> struct file --(private_data)----> kernel object
>       /                                 /
> FD --                                 /
>                                       /
> FD ---> struct file --(private_data)/
> 
> I'll refer to this a bit further down.
> 
>> - then there is a key class == struct bpf_link
>> - for raw tracepoints raw_tp_open() returns an FD to child class of bpf_link
>>    which is 'struct bpf_raw_tp_link'.
>>    In other words it returns std::shared_ptr<struct bpf_raw_tp_link>.
>> - for fentry/fexit/freplace/lsm raw_tp_open() returns an FD to a different child
>>    class of bpf_link which is "struct bpf_tracing_link".
>>    This is std::shared_ptr<struct bpf_tracing_link>.
>> - for cgroup-bpf progs bpf_link_create() returns an FD to child class of bpf_link
>>    which is 'struct bpf_cgroup_link'.
>>    This is std::shared_ptr<struct bpf_cgroup_link>.
>>
>> In all those cases three different shared pointers are seen as file descriptors
>> from the process pov but they point to different children of bpf_link base class.
>> link_update() is a method of base class bpf_link and it has to work for
>> all children classes.
>> Similarly your future get_obj_info_by_fd() from any of these three shared pointers
>> will return information specific to that child class.
>> In all those cases one link attaches one program to one kernel object.
>>
> 
> Thank you for a nice recap! :)
> 
>> Now back to bpfdumpfs.
>> In the latest Yonghong's patches raw_tp_open() returns an FD that is a pointer
>> to seq_file. This is existing kernel base class. It has its own seq_operations
>> virtual methods that are defined for bpfdumpfs_seq_file which is a child class
>> of seq_file that keeps start/stop/next methods as-is and overrides show()
>> method to be able to call bpf prog for every iterable kernel object.
>>
>> What you're proposing is to make bpfdump_seq_file class to be a child of two
>> base classes (seq_file and bpf_link) whereas I'm saying that it should be
>> a child of seq_file only, since bpf_link methods do not apply to it.
>> Like there is no sensible behavior for link_update() on such dual parent object.
>>
>> In my proposal bpfdump_seq_file class keeps cat-ability and all methods of seq_file
>> and no extra methods from bpf_link that don't belong in seq_file.
>> But I'm arguing that bpfdump_seq_file class should have a member bpf_link
>> instead of simply holding bpf_prog via refcnt.
>> Let's call this child class of bpf_link the bpf_seq_file_link class. Having
>> bpf_seq_file_link as member would mean that such link is discoverable via IDR,
>> the user process can get an FD to it and can do get_obj_info_by_fd().
>> The information returned for such link will be a pair (bpfdump_prog, bpfdump_seq_file).
>> Meaning that at any given time 'bpftool link show' will show where every bpf
>> prog in the system is attached to.
>> Say named bpfdump_seq_file exists in /sys/kernel/bpfdump/tasks/foo.
>> No one is doing a 'cat' on it yet.
>> "bpftool link show" will show one link which is a pair (bpfdump_prog, "tasks/foo").
>> Now two humans are doing 'cat' of that file.
>> The bpfdump_prog refcnt is now 3 and there are two additional seq_files created
>> by the kernel when user said open("/sys/kernel/bpfdump/tasks/foo").
>> If these two humans are slow somebody could have done "rm /sys/kernel/bpfdump/tasks/foo"
>> and that bpfdump_seq_file and its member bpf_seq_file_link would be gone,
>> but two other bpfdump_seq_file-s are still active and they are different.
>> "bpftool link show" should be showing two pairs (bpfdump_prog, seq_file_A) and
>> (bpfdump_prog, seq_file_B).
>> The users could have been in different pid namespaces. What seq_file_A is
>> iterating could be completely different from seq_file_B, but I think it's
>> useful for admin to know where all bpf progs in the system are attached and
>> what kind of things are triggering them.
> 
> How exactly bpf_link is implemented for bpfdumper is not all that
> important to me. It can be a separate struct, a field, a pointer to a
> separate struct -- not that different.
> 
> I didn't mean for this thread to be just another endless discussion,
> so I'll try to wrap it up in this email. I really like bpfdumper idea
> and usability overall. Getting call for end of iteration is a big deal
> and I'm glad I got at least that :)
> 
> But let me try to point out a few things you proposed above that I
> disagree with on a high level, as well as provide a few supporting
> points for the scheme I proposed previously. If all that is not
> convincing, I rest my case and I won't object to bpfdumper to go in in
> any form, as long as I can use it anonymously with extra call at the
> end to do post-aggregation.
> 
> So, first. I do not see the point of treating each instance of seq_file
> as if it were a new bpf_link:
> 1. It's a bit like saying that each inherited cgroup bpf program in
> effective_prog_array should have a new bpf_link created. That's not how
> it's done for cgroups, and I think for a good reason.
> 2. Further, each seq_file, when created from "seq_file template",
> should take a refcnt on bpf_prog, not bpf_link. Because seq_file
> expects bpf_prog itself to be exactly the same throughout entire
> iteration process. Bpf_link, on the other hand, allows in-place
> update of bpf_program, which would ruin seq_file iteration,
> potentially. I know we can disable that, but it just feels like
> arbitrary restrictions.
> 3. Suppose each seq_file is/has bpf_link and one can iterate over each
> active seq_file (what I've been calling a session, but whatever). What
> kind of user-facing info can you get from get_obj_info? prog_id,
> prog_tag, provider ID/name (e.g., /sys/kernel/bpfdump/task). Is that
> useful? Yes! Is that enough to do anything actionable? No! Immediately
> you'd need to know PIDs of all processes that have FD open to that
> seq_file (and see diagram above, there could be many processes with
> many FDs for the same seq_file). bpf_link doesn't know all PIDs. So
> it's this generic "who has this file opened" problem all over again,
> which I'm already pretty tired of talking about :) Except now we have at
> least 3 ways to answer such questions: iterate procfs+fdinfo, drgn
> scripts, now also bpfdump program for task/file provider.
> 
> So even if you can enumerate active bpfdump seq_files in the system,
> you still need extra info and iterate over task/file items to be able
> to do anything about that. Which is one of the reasons I think
> auto-creating bpf_links for each seq_file is useless and will just
> pollute the view of the system (from bpf_link standpoint).
> 
> Now, second. Getting back to what I proposed with the 3-4 step process (load
> --> attach (create_link) --> (pin in bpfdumpfs + open() |
> BPF_NEW_DUMP_SESSION)). I realize now that attach might seem
> superficial here, because it doesn't provide any extra information (FD
> of provider was specified at prog load time). It does feel a bit
> weird, but:
> 
> 1. It's not that weird, because fentry/fexit/freplace and tp_btf also
> don't provide anything extra: all the info was specified at load time.
> 2. This attach step is a good point to provide some sort of
> "parametrization" to narrow down behavior of providers. I'll give two
> examples that I think are going to be very useful and we'll eventually
> add support for them in one way or another.
> 
> Example A. task/file provider. Instead of iterating over all tasks,
> the simplest extension would be to specify **one** specific task PID
> to iterate all files of. Attach time would be the place to specify
> this PID. We don't need to know the PID at load time, because that doesn't
> change anything about BPF program validation and the verifier just doesn't
> need to know. So now, once attached, a bpf_link is created that can be
> pinned in bpfdumpfs or BPF_NEW_DUMP_SESSION can be used to create
> potentially many seq_files (e.g., poll every second) to see all open
> files from a specific task. We can keep generalizing to, say, having
> all tasks in a given cgroup. All that can be implemented by filtering
> out inside BPF program, of course, but having narrower scope from the
> beginning could save tons of time and resources.
> 
> Example B. Iterating BPF map items. We already have bpf_map provider,
> next could be bpf_map/items, which would call BPF program for each
> key/value pair, something like:
> 
> int BPF_PROG(for_each_map_kv, struct seq_file *seq, struct bpf_map *map,
>               void *key, size_t key_size, void *value, size_t value_size)
> {
>      ...
> }
> 
> Now, once you have that, a natural next desire is to say "only dump
> items of map with ID 123", instead of iterating over all BPF maps in
> the system. That map ID could be specified at attachment time, when
> the bpf_link with these parameters is going to be created. Again, at load
> time BPF verifier doesn't need to know specific BPF map we are going
> to iterate, if we stick to generic key/value blobs semantics.

Thanks for bringing up this use case. I had not thought about this carefully
before, just assuming bpf filtering, even for second-level data structures,
should be enough for most cases. But I do agree that in certain cases this
is not good enough, e.g., when every map has millions of elements and you
only want to scan through a particular map id.

But I think fixed parameterization at the kernel interface might not be good
enough. For example,
     - we want to filter only the files for a given pid:
       the pid is passed to the kernel
     - we want to filter only the files for tasks in a particular cgroup:
       the cgroup id is passed to the kernel, and the target needs to check
       whether a particular task belongs to this cgroup
     - a hypothetical case:
       suppose you want to traverse the nh_list for a certain route
       with src1 and dst1;
       src1 and dst1 need to be passed to the kernel and the target.

Maybe a bpf based filter is a good choice here.

For a dumper program prog3 at target foo1/foo2/foo3,
two filter programs can exist:
    prog1: target foo1
    prog2: target foo1/foo2
For prog1/prog2, a return value of 1 means skip that object and 0 means
do not skip.

For the dump program prog3, a return value of 1 means stop the dump and
0 means do not stop.

Note that I did not put any further restrictions on prog1/prog2; they
can use bpf_seq_printf() or any other tracing prog helpers.

So when creating a dumper (anonymous or file based), multiple bpf programs
*can* be present:
    - all programs must be in the same hierarchy:
      foo1/, foo1/foo3 are good
      foo1/, bar1/ will be rejected
    - each hierarchy level can have only 0 or 1 program
    - the program at the deepest level is the one doing the dump;
      all programs at earlier levels, if present, are filter programs.
      If a filter program does not exist for a particular level,
      assume one that always returns "do not skip".
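
To illustrate with a rough sketch (the foo* targets and their context
arguments are made up here, only the return value convention matters):

/* filter program attached to target foo1:
 * return 1 - skip this foo1 object, return 0 - do not skip
 */
int BPF_PROG(filter_foo1, struct seq_file *seq, struct foo1 *obj)
{
        /* e.g., only interested in one particular object */
        return obj->some_field != 42;
}

/* dumper program attached to target foo1/foo2/foo3:
 * return 1 - stop the dump, return 0 - keep dumping
 */
int BPF_PROG(dump_foo3, struct seq_file *seq, struct foo3 *obj)
{
        /* bpf_seq_printf() the object here */
        return 0;
}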

I have not thought about the kernel API yet. Not 100% sure whether
LINK_CREATE is the right choice here.

Any thoughts?

> 
> So with such possibilities considered, I hope having an explicit
> LINK_CREATE step starts making much more sense. This, plus not having
> to distinguish between named and anonymous dumpers (just like we don't
> distinguish pinned, i.e. "named", bpf_link from anonymous one), makes
> me still believe that this is a better approach.
> 
> But alas, my goal here is to bring different perspectives, not to
> obstruct or delay progress. So I'm going to spend some more time
> reviewing v2 and will provide feedback on relevant patches, but if my
> arguments were not convincing, I'm fine with that. I managed to
> convince you guys that "anonymous" bpfdumper without bpfdumpfs pinning
> and post-aggregation callback are a good thing and I'm happy about
> that already. Can't get 100% of what I want, right? :)
>
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 44268d36d901..8171e01ff4be 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1110,10 +1110,15 @@  struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+#define BPF_DUMP_SEQ_NET_PRIVATE	BIT(0)
+
 int bpf_dump_reg_target(const char *target, const char *target_proto,
 			const struct seq_operations *seq_ops,
 			u32 seq_priv_size, u32 target_feature);
 int bpf_dump_set_target_info(u32 target_fd, struct bpf_prog *prog);
+int bpf_dump_create(u32 prog_fd, const char __user *dumper_name);
+struct bpf_prog *bpf_dump_get_prog(struct seq_file *seq, u32 priv_data_size,
+				   u64 *seq_num);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0f1cbed446c1..b51d56fc77f9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -354,6 +354,7 @@  enum {
 /* Flags for accessing BPF object from syscall side. */
 	BPF_F_RDONLY		= (1U << 3),
 	BPF_F_WRONLY		= (1U << 4),
+	BPF_F_DUMP		= (1U << 5),
 
 /* Flag for stack_map, store build_id+offset instead of pointer */
 	BPF_F_STACK_BUILD_ID	= (1U << 5),
@@ -481,7 +482,10 @@  union bpf_attr {
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
-		__aligned_u64	pathname;
+		union {
+			__aligned_u64	pathname;
+			__aligned_u64	dumper_name;
+		};
 		__u32		bpf_fd;
 		__u32		file_flags;
 	};
diff --git a/kernel/bpf/dump.c b/kernel/bpf/dump.c
index 1091affe8b3f..ac6856abb711 100644
--- a/kernel/bpf/dump.c
+++ b/kernel/bpf/dump.c
@@ -30,22 +30,173 @@  struct bpfdump_targets {
 	struct mutex dumper_mutex;
 };
 
+struct dumper_inode_info {
+	struct bpfdump_target_info *tinfo;
+	struct bpf_prog *prog;
+};
+
+struct dumper_info {
+	struct list_head list;
+	/* file to identify an anon dumper,
+	 * dentry to identify a file dumper.
+	 */
+	union {
+		struct file *file;
+		struct dentry *dentry;
+	};
+	struct bpfdump_target_info *tinfo;
+	struct bpf_prog *prog;
+};
+
+struct dumpers {
+	struct list_head dumpers;
+	struct mutex dumper_mutex;
+};
+
+struct extra_priv_data {
+	struct bpf_prog *prog;
+	u64 seq_num;
+};
+
 /* registered dump targets */
 static struct bpfdump_targets dump_targets;
 
 static struct dentry *bpfdump_dentry;
 
+static struct dumpers anon_dumpers, file_dumpers;
+
+static const struct file_operations bpf_dumper_ops;
+static const struct inode_operations bpf_dir_iops;
+
+static struct dentry *bpfdump_add_file(const char *name, struct dentry *parent,
+				       const struct file_operations *f_ops,
+				       void *data);
 static struct dentry *bpfdump_add_dir(const char *name, struct dentry *parent,
 				      const struct inode_operations *i_ops,
 				      void *data);
 static int __bpfdump_init(void);
 
+static u32 get_total_priv_dsize(u32 old_size)
+{
+	return roundup(old_size, 8) + sizeof(struct extra_priv_data);
+}
+
+static void *get_extra_priv_dptr(void *old_ptr, u32 old_size)
+{
+	return old_ptr + roundup(old_size, 8);
+}
+
+#ifdef CONFIG_PROC_FS
+static void dumper_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	struct dumper_inode_info *i_info = filp->f_inode->i_private;
+
+	seq_printf(m, "target:\t%s\n"
+		      "prog_id:\t%u\n",
+		   i_info->tinfo->target,
+		   i_info->prog->aux->id);
+}
+
+static void anon_dumper_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	struct dumper_info *dinfo;
+
+	mutex_lock(&anon_dumpers.dumper_mutex);
+	list_for_each_entry(dinfo, &anon_dumpers.dumpers, list) {
+		if (dinfo->file == filp) {
+			seq_printf(m, "target:\t%s\n"
+				      "prog_id:\t%u\n",
+				   dinfo->tinfo->target,
+				   dinfo->prog->aux->id);
+			break;
+		}
+	}
+	mutex_unlock(&anon_dumpers.dumper_mutex);
+}
+
+#endif
+
+static void process_target_feature(u32 feature, void *priv_data)
+{
+	/* use the current net namespace */
+	if (feature & BPF_DUMP_SEQ_NET_PRIVATE)
+		set_seq_net_private((struct seq_net_private *)priv_data,
+				    current->nsproxy->net_ns);
+}
+
+static int dumper_open(struct inode *inode, struct file *file)
+{
+	struct dumper_inode_info *i_info = inode->i_private;
+	struct extra_priv_data *extra_data;
+	u32 old_priv_size, total_priv_size;
+	void *priv_data;
+
+	old_priv_size = i_info->tinfo->seq_priv_size;
+	total_priv_size = get_total_priv_dsize(old_priv_size);
+	priv_data = __seq_open_private(file, i_info->tinfo->seq_ops,
+				       total_priv_size);
+	if (!priv_data)
+		return -ENOMEM;
+
+	process_target_feature(i_info->tinfo->target_feature, priv_data);
+
+	extra_data = get_extra_priv_dptr(priv_data, old_priv_size);
+	extra_data->prog = i_info->prog;
+	extra_data->seq_num = 0;
+
+	return 0;
+}
+
+static int anon_dumper_release(struct inode *inode, struct file *file)
+{
+	struct dumper_info *dinfo;
+
+	/* release the bpf program */
+	mutex_lock(&anon_dumpers.dumper_mutex);
+	list_for_each_entry(dinfo, &anon_dumpers.dumpers, list) {
+		if (dinfo->file == file) {
+			bpf_prog_put(dinfo->prog);
+			list_del(&dinfo->list);
+			break;
+		}
+	}
+	mutex_unlock(&anon_dumpers.dumper_mutex);
+
+	return seq_release_private(inode, file);
+}
+
+static int dumper_release(struct inode *inode, struct file *file)
+{
+	return seq_release_private(inode, file);
+}
+
 static int dumper_unlink(struct inode *dir, struct dentry *dentry)
 {
-	kfree(d_inode(dentry)->i_private);
+	struct dumper_inode_info *i_info = d_inode(dentry)->i_private;
+
+	bpf_prog_put(i_info->prog);
+	kfree(i_info);
+
 	return simple_unlink(dir, dentry);
 }
 
+static const struct file_operations bpf_dumper_ops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= dumper_show_fdinfo,
+#endif
+	.open		= dumper_open,
+	.read		= seq_read,
+	.release	= dumper_release,
+};
+
+static const struct file_operations anon_bpf_dumper_ops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= anon_dumper_show_fdinfo,
+#endif
+	.read		= seq_read,
+	.release	= anon_dumper_release,
+};
+
 static const struct inode_operations bpf_dir_iops = {
 	.lookup		= simple_lookup,
 	.unlink		= dumper_unlink,
@@ -88,6 +239,179 @@  int bpf_dump_set_target_info(u32 target_fd, struct bpf_prog *prog)
 	return err;
 }
 
+static int create_anon_dumper(struct bpfdump_target_info *tinfo,
+			      struct bpf_prog *prog)
+{
+	struct extra_priv_data *extra_data;
+	u32 old_priv_size, total_priv_size;
+	struct dumper_info *dinfo;
+	struct file *file;
+	int err, anon_fd;
+	void *priv_data;
+	struct fd fd;
+
+	anon_fd = anon_inode_getfd("bpf-dumper", &anon_bpf_dumper_ops,
+				   NULL, O_CLOEXEC);
+	if (anon_fd < 0)
+		return anon_fd;
+
+	/* setup seq_file for anon dumper */
+	fd = fdget(anon_fd);
+	file = fd.file;
+
+	dinfo = kmalloc(sizeof(*dinfo), GFP_KERNEL);
+	if (!dinfo) {
+		err = -ENOMEM;
+		goto free_fd;
+	}
+
+	old_priv_size = tinfo->seq_priv_size;
+	total_priv_size = get_total_priv_dsize(old_priv_size);
+
+	priv_data = __seq_open_private(file, tinfo->seq_ops,
+				       total_priv_size);
+	if (!priv_data) {
+		err = -ENOMEM;
+		goto free_dinfo;
+	}
+
+	dinfo->file = file;
+	dinfo->tinfo = tinfo;
+	dinfo->prog = prog;
+
+	mutex_lock(&anon_dumpers.dumper_mutex);
+	list_add(&dinfo->list, &anon_dumpers.dumpers);
+	mutex_unlock(&anon_dumpers.dumper_mutex);
+
+	process_target_feature(tinfo->target_feature, priv_data);
+
+	extra_data = get_extra_priv_dptr(priv_data, old_priv_size);
+	extra_data->prog = prog;
+	extra_data->seq_num = 0;
+
+	fdput(fd);
+	return anon_fd;
+
+free_dinfo:
+	kfree(dinfo);
+free_fd:
+	fdput(fd);
+	return err;
+}
+
+static int create_dumper(struct bpfdump_target_info *tinfo,
+			 const char __user *dumper_name,
+			 struct bpf_prog *prog)
+{
+	struct dumper_inode_info *i_info;
+	struct dumper_info *dinfo;
+	struct dentry *dentry;
+	const char *dname;
+	int err = 0;
+
+	i_info = kmalloc(sizeof(*i_info), GFP_KERNEL);
+	if (!i_info)
+		return -ENOMEM;
+
+	i_info->tinfo = tinfo;
+	i_info->prog = prog;
+
+	dinfo = kmalloc(sizeof(*dinfo), GFP_KERNEL);
+	if (!dinfo) {
+		err = -ENOMEM;
+		goto free_i_info;
+	}
+
+	dname = strndup_user(dumper_name, PATH_MAX);
+	if (!dname) {
+		err = -ENOMEM;
+		goto free_dinfo;
+	}
+
+	dentry = bpfdump_add_file(dname, tinfo->dir_dentry,
+				  &bpf_dumper_ops, i_info);
+	kfree(dname);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto free_dinfo;
+	}
+
+	dinfo->dentry = dentry;
+	dinfo->tinfo = tinfo;
+	dinfo->prog = prog;
+
+	mutex_lock(&file_dumpers.dumper_mutex);
+	list_add(&dinfo->list, &file_dumpers.dumpers);
+	mutex_unlock(&file_dumpers.dumper_mutex);
+
+	return 0;
+
+free_dinfo:
+	kfree(dinfo);
+free_i_info:
+	kfree(i_info);
+	return err;
+}
+
+int bpf_dump_create(u32 prog_fd, const char __user *dumper_name)
+{
+	struct bpfdump_target_info *tinfo;
+	const char *target;
+	struct bpf_prog *prog;
+	bool existed = false;
+	int err = 0;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	target = prog->aux->dump_target;
+	if (!target) {
+		err = -EINVAL;
+		goto free_prog;
+	}
+
+	mutex_lock(&dump_targets.dumper_mutex);
+	list_for_each_entry(tinfo, &dump_targets.dumpers, list) {
+		if (strcmp(tinfo->target, target) == 0) {
+			existed = true;
+			break;
+		}
+	}
+	mutex_unlock(&dump_targets.dumper_mutex);
+
+	if (!existed) {
+		err = -EINVAL;
+		goto free_prog;
+	}
+
+	err = dumper_name ? create_dumper(tinfo, dumper_name, prog)
+			  : create_anon_dumper(tinfo, prog);
+	if (err < 0)
+		goto free_prog;
+
+	return err;
+
+free_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
+struct bpf_prog *bpf_dump_get_prog(struct seq_file *seq, u32 priv_data_size,
+	u64 *seq_num)
+{
+	struct extra_priv_data *extra_data;
+
+	if (seq->file->f_op != &bpf_dumper_ops &&
+	    seq->file->f_op != &anon_bpf_dumper_ops)
+		return NULL;
+
+	extra_data = get_extra_priv_dptr(seq->private, priv_data_size);
+	*seq_num = extra_data->seq_num++;
+
+	return extra_data->prog;
+}
+
 int bpf_dump_reg_target(const char *target,
 			const char *target_proto,
 			const struct seq_operations *seq_ops,
@@ -211,6 +535,14 @@  bpfdump_create_dentry(const char *name, umode_t mode, struct dentry *parent,
 	return dentry;
 }
 
+static struct dentry *
+bpfdump_add_file(const char *name, struct dentry *parent,
+		 const struct file_operations *f_ops, void *data)
+{
+	return bpfdump_create_dentry(name, S_IFREG | 0444, parent,
+				     data, NULL, f_ops);
+}
+
 static struct dentry *
 bpfdump_add_dir(const char *name, struct dentry *parent,
 		const struct inode_operations *i_ops, void *data)
@@ -290,6 +622,10 @@  static int __bpfdump_init(void)
 
 	INIT_LIST_HEAD(&dump_targets.dumpers);
 	mutex_init(&dump_targets.dumper_mutex);
+	INIT_LIST_HEAD(&anon_dumpers.dumpers);
+	mutex_init(&anon_dumpers.dumper_mutex);
+	INIT_LIST_HEAD(&file_dumpers.dumpers);
+	mutex_init(&file_dumpers.dumper_mutex);
 	return 0;
 
 remove_mount:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 41005dee8957..b5e4f18cc633 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2173,9 +2173,13 @@  static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 static int bpf_obj_pin(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
+	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_DUMP)
 		return -EINVAL;
 
+	if (attr->file_flags == BPF_F_DUMP)
+		return bpf_dump_create(attr->bpf_fd,
+				       u64_to_user_ptr(attr->dumper_name));
+
 	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
 }
 
@@ -2605,6 +2609,8 @@  attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+	case BPF_TRACE_DUMP:
+		return BPF_PROG_TYPE_TRACING;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -2663,6 +2669,9 @@  static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_SOCK_OPS:
 		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 		break;
+	case BPF_PROG_TYPE_TRACING:
+		ret = bpf_dump_create(attr->attach_bpf_fd, (void __user *)NULL);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0f1cbed446c1..b51d56fc77f9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -354,6 +354,7 @@  enum {
 /* Flags for accessing BPF object from syscall side. */
 	BPF_F_RDONLY		= (1U << 3),
 	BPF_F_WRONLY		= (1U << 4),
+	BPF_F_DUMP		= (1U << 5),
 
 /* Flag for stack_map, store build_id+offset instead of pointer */
 	BPF_F_STACK_BUILD_ID	= (1U << 5),
@@ -481,7 +482,10 @@  union bpf_attr {
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
-		__aligned_u64	pathname;
+		union {
+			__aligned_u64	pathname;
+			__aligned_u64	dumper_name;
+		};
 		__u32		bpf_fd;
 		__u32		file_flags;
 	};