[bpf-next,v1,07/19] bpf: create anonymous bpf iterator

Message ID 20200427201242.2995160-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Series bpf: implement bpf iterator for kernel data

Commit Message

Yonghong Song April 27, 2020, 8:12 p.m. UTC
A new bpf command BPF_ITER_CREATE is added.

The anonymous bpf iterator is seq_file based.
The seq_file private data are referenced by targets.
The bpf_iter infrastructure allocates additional space
at seq_file->private after the space used by targets
to store some meta data, e.g.,
  prog:       prog to run
  session_id: a unique id for each opened seq_file
  seq_num:    how many times bpf programs are queried in this session
  has_last:   indicate whether or not bpf_prog has been called after
              all valid objects have been processed

A map between file and prog/link is established to help
fops->release(). When fops->release() is called, the bpf program
cannot be located from just the inode and file, since the target
seq_priv_size is not available. This map helps retrieve the prog
whose reference count needs to be decremented.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |   3 +
 include/uapi/linux/bpf.h       |   6 ++
 kernel/bpf/bpf_iter.c          | 162 ++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           |  27 ++++++
 tools/include/uapi/linux/bpf.h |   6 ++
 5 files changed, 203 insertions(+), 1 deletion(-)
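
For orientation, here is a minimal user-space sketch of how the new
command is meant to be driven: attach an iterator program to get a
link fd, create an iterator fd from it with BPF_ITER_CREATE, then
read() the seq_file-produced text. This assumes the uapi additions
from this patch are present in the installed headers; the helper name
is made up.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Hypothetical helper: create an anonymous iterator fd from a
 * bpf_link fd via the new BPF_ITER_CREATE command.
 */
static int bpf_iter_create_fd(int link_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	attr.iter_create.flags = 0;	/* must be zero in this version */

	return syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
}

/* Typical use: read() drives the target's iteration and the attached
 * bpf program formats the output, e.g.
 *
 *	char buf[4096];
 *	int iter_fd = bpf_iter_create_fd(link_fd);
 *
 *	while (read(iter_fd, buf, sizeof(buf)) > 0)
 *		;
 *	close(iter_fd);
 */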

Comments

Martin KaFai Lau April 29, 2020, 5:39 a.m. UTC | #1
On Mon, Apr 27, 2020 at 01:12:42PM -0700, Yonghong Song wrote:
> A new bpf command BPF_ITER_CREATE is added.
> 
> The anonymous bpf iterator is seq_file based.
> The seq_file private data are referenced by targets.
> The bpf_iter infrastructure allocates additional space
> at seq_file->private after the space used by targets
> to store some meta data, e.g.,
>   prog:       prog to run
>   session_id: a unique id for each opened seq_file
>   seq_num:    how many times bpf programs are queried in this session
>   has_last:   indicate whether or not bpf_prog has been called after
>               all valid objects have been processed
> 
> A map between file and prog/link is established to help
> fops->release(). When fops->release() is called, the bpf program
> cannot be located from just the inode and file, since the target
> seq_priv_size is not available. This map helps retrieve the prog
> whose reference count needs to be decremented.
How about putting "struct extra_priv_data" at the beginning of
the seq_file's private store instead, since its size is known?
seq->private can then point to the first aligned byte after
sizeof(struct extra_priv_data).
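
A rough sketch of that layout (struct and field names here are
illustrative, not from the patch): the fixed-size metadata sits first,
and seq->private points just past it, so the bpf_iter code can always
find its metadata without knowing the target's seq_priv_size.

struct bpf_iter_priv {
	struct extra_priv_data meta;	/* fixed size, known to bpf_iter */
	u8 target_priv[] __aligned(8);	/* target's seq_priv_size bytes */
};

/* seq->private = priv->target_priv, so targets are unchanged, and
 * bpf_iter recovers its metadata with:
 *
 *	priv = container_of(seq->private, struct bpf_iter_priv,
 *			    target_priv);
 */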

[ ... ]

> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index fc1ce5ee5c3f..1f4e778d1814 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c

[ ... ]

> @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false;
>  /* protect bpf_iter_link.link->prog upddate */
>  static struct mutex bpf_iter_mutex;
>  
> +/* Since at anon seq_file release function, the prog cannot
> + * be retrieved since target seq_priv_size is not available.
> + * Keep a list of <anon_file, prog> mapping, so that
> + * at file release stage, the prog can be released properly.
> + */
> +static struct list_head anon_iter_info;
> +static struct mutex anon_iter_info_mutex;
> +
> +/* incremented on every opened seq_file */
> +static atomic64_t session_id;
> +
> +static u32 get_total_priv_dsize(u32 old_size)
> +{
> +	return roundup(old_size, 8) + sizeof(struct extra_priv_data);
> +}
> +
> +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size)
> +{
> +	return old_ptr + roundup(old_size, 8);
> +}
> +
> +static int anon_iter_release(struct inode *inode, struct file *file)
> +{
> +	struct anon_file_prog_assoc *finfo;
> +
> +	mutex_lock(&anon_iter_info_mutex);
> +	list_for_each_entry(finfo, &anon_iter_info, list) {
> +		if (finfo->file == file) {
> +			bpf_prog_put(finfo->prog);
> +			list_del(&finfo->list);
> +			kfree(finfo);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&anon_iter_info_mutex);
> +
> +	return seq_release_private(inode, file);
> +}
> +
> +static const struct file_operations anon_bpf_iter_fops = {
> +	.read		= seq_read,
> +	.release	= anon_iter_release,
> +};
> +
>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  {
>  	struct bpf_iter_target_info *tinfo;

[ ... ]

> @@ -150,3 +223,90 @@ int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
>  	mutex_unlock(&bpf_iter_mutex);
>  	return ret;
>  }
> +
> +static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo,
> +			  struct bpf_prog *prog)
> +{
> +	struct extra_priv_data *extra_data;
> +
> +	if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE)
> +		set_seq_net_private((struct seq_net_private *)priv_data,
> +				    current->nsproxy->net_ns);
> +
> +	extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size);
> +	extra_data->session_id = atomic64_add_return(1, &session_id);
> +	extra_data->prog = prog;
> +	extra_data->seq_num = 0;
> +	extra_data->has_last = false;
> +}
> +
> +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
> +{
> +	struct anon_file_prog_assoc *finfo;
> +	struct bpf_iter_target_info *tinfo;
> +	struct bpf_prog *prog;
> +	u32 total_priv_dsize;
> +	void *priv_data;
> +
> +	finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN);
> +	if (!finfo)
> +		return -ENOMEM;
> +
> +	mutex_lock(&bpf_iter_mutex);
> +	prog = link->link.prog;
> +	bpf_prog_inc(prog);
> +	mutex_unlock(&bpf_iter_mutex);
> +
> +	tinfo = link->tinfo;
> +	total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size);
> +	priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize);
> +	if (!priv_data) {
> +		bpf_prog_sub(prog, 1);
Could prog's refcnt go to 0 here?  If yes, bpf_prog_put() should be used.
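
For reference, the substitution would look like this (a sketch,
assuming the reference taken by bpf_prog_inc() above can indeed be
the last one):

	/* unlike bpf_prog_sub(), bpf_prog_put() frees the prog when
	 * the last reference is dropped
	 */
	bpf_prog_put(prog);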

> +		kfree(finfo);
> +		return -ENOMEM;
> +	}
> +
> +	init_seq_file(priv_data, tinfo, prog);
> +
> +	finfo->file = file;
> +	finfo->prog = prog;
> +
> +	mutex_lock(&anon_iter_info_mutex);
> +	list_add(&finfo->list, &anon_iter_info);
> +	mutex_unlock(&anon_iter_info_mutex);
> +	return 0;
> +}
> +
Andrii Nakryiko April 29, 2020, 6:56 a.m. UTC | #2
On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote:
>

[...]

>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4fc39d9b5cd0..0f0cafc65a04 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>  int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>  int bpf_obj_get_user(const char __user *pathname, int flags);
>
> +#define BPF_DUMP_SEQ_NET_PRIVATE       BIT(0)
> +
>  struct bpf_iter_reg {
>         const char *target;
>         const char *target_func_name;
> @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
>  int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
>  int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
>                           struct bpf_prog *new_prog);
> +int bpf_iter_new_fd(struct bpf_link *link);
>
>  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
>  int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f39b9fec37ab..576651110d16 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -113,6 +113,7 @@ enum bpf_cmd {
>         BPF_MAP_DELETE_BATCH,
>         BPF_LINK_CREATE,
>         BPF_LINK_UPDATE,
> +       BPF_ITER_CREATE,
>  };
>
>  enum bpf_map_type {
> @@ -590,6 +591,11 @@ union bpf_attr {
>                 __u32           old_prog_fd;
>         } link_update;
>
> +       struct { /* struct used by BPF_ITER_CREATE command */
> +               __u32           link_fd;
> +               __u32           flags;
> +       } iter_create;
> +
>  } __attribute__((aligned(8)));
>
>  /* The description below is an attempt at providing documentation to eBPF
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index fc1ce5ee5c3f..1f4e778d1814 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -2,6 +2,7 @@
>  /* Copyright (c) 2020 Facebook */
>
>  #include <linux/fs.h>
> +#include <linux/anon_inodes.h>
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>
> @@ -19,6 +20,19 @@ struct bpf_iter_link {
>         struct bpf_iter_target_info *tinfo;
>  };
>
> +struct extra_priv_data {
> +       struct bpf_prog *prog;
> +       u64 session_id;
> +       u64 seq_num;
> +       bool has_last;
> +};
> +
> +struct anon_file_prog_assoc {
> +       struct list_head list;
> +       struct file *file;
> +       struct bpf_prog *prog;
> +};
> +
>  static struct list_head targets;
>  static struct mutex targets_mutex;
>  static bool bpf_iter_inited = false;
> @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false;
>  /* protect bpf_iter_link.link->prog upddate */
>  static struct mutex bpf_iter_mutex;
>
> +/* Since at anon seq_file release function, the prog cannot
> + * be retrieved since target seq_priv_size is not available.
> + * Keep a list of <anon_file, prog> mapping, so that
> + * at file release stage, the prog can be released properly.
> + */
> +static struct list_head anon_iter_info;
> +static struct mutex anon_iter_info_mutex;
> +
> +/* incremented on every opened seq_file */
> +static atomic64_t session_id;
> +
> +static u32 get_total_priv_dsize(u32 old_size)
> +{
> +       return roundup(old_size, 8) + sizeof(struct extra_priv_data);
> +}
> +
> +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size)
> +{
> +       return old_ptr + roundup(old_size, 8);
> +}
> +
> +static int anon_iter_release(struct inode *inode, struct file *file)
> +{
> +       struct anon_file_prog_assoc *finfo;
> +
> +       mutex_lock(&anon_iter_info_mutex);
> +       list_for_each_entry(finfo, &anon_iter_info, list) {
> +               if (finfo->file == file) {

I'll look at this and other patches more thoroughly tomorrow with a
clear head, but this iteration to find anon_file_prog_assoc is really
unfortunate.

I think the problem is that you are allowing seq_file infrastructure
to call directly into target implementation of seq_operations without
intercepting them. If you change that and put whatever extra info is
necessary into seq_file->private in front of target's private state,
then you shouldn't need this, right?

This would also make each target's logic a bit simpler, because you can:
- centralize creation and initialization of bpf_iter_meta (session_id,
seq, seq_num will be set up once in this generic code);
- centralize loff_t pos increments;
- extract and centralize the bpf_iter_get_prog() call in the show()
implementation as well.

I think with that each target's logic will be simpler and you won't
need to maintain anon_file_prog_assocs.

Are there complications I'm missing?
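
To make the interception idea concrete, here is a sketch of what such
a wrapper could look like. Everything below is illustrative rather
than code from the series: the struct layout, the stashed target ops,
and the fix-up of seq->private around the delegated call are all
assumptions.

struct bpf_iter_priv {
	struct extra_priv_data meta;
	const struct seq_operations *target_ops; /* assumed stashed here */
	u8 target_priv[] __aligned(8);
};

static int bpf_iter_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_priv *priv = seq->private;
	int ret;

	/* centralized metadata handling (seq_num bump, prog lookup)
	 * would live here instead of in every target
	 */
	seq->private = priv->target_priv;	/* what the target expects */
	ret = priv->target_ops->show(seq, v);
	seq->private = priv;			/* restore for bpf_iter */
	return ret;
}

/* start/next/stop would be wrapped the same way, with stop() also
 * running the bpf program once after the last object.
 */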

[...]
Yonghong Song April 29, 2020, 7:06 a.m. UTC | #3
On 4/28/20 11:56 PM, Andrii Nakryiko wrote:
> On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote:

[...]

>> +static int anon_iter_release(struct inode *inode, struct file *file)
>> +{
>> +       struct anon_file_prog_assoc *finfo;
>> +
>> +       mutex_lock(&anon_iter_info_mutex);
>> +       list_for_each_entry(finfo, &anon_iter_info, list) {
>> +               if (finfo->file == file) {
> 
> I'll look at this and other patches more thoroughly tomorrow with a
> clear head, but this iteration to find anon_file_prog_assoc is really
> unfortunate.
> 
> I think the problem is that you are allowing seq_file infrastructure
> to call directly into target implementation of seq_operations without
> intercepting them. If you change that and put whatever extra info is
> necessary into seq_file->private in front of target's private state,
> then you shouldn't need this, right?

Yes. This is true. The idea is to minimize the target change.
But maybe this is not a good goal by itself.

You are right, if I intercept all seq_ops(), I do not need the
above change; I can tailor the seq_file private_data right before
calling the target one and restore it after the target call.

Originally I only had one interception, show(); now I have
stop() too, to call bpf at the end of iteration. Maybe I can
intercept all four, I think. This way, I can also get rid
of the target feature.

[...]
Andrii Nakryiko April 29, 2020, 6:16 p.m. UTC | #4
On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 4/28/20 11:56 PM, Andrii Nakryiko wrote:
> > On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote:

[...]

> >> +static int anon_iter_release(struct inode *inode, struct file *file)
> >> +{
> >> +       struct anon_file_prog_assoc *finfo;
> >> +
> >> +       mutex_lock(&anon_iter_info_mutex);
> >> +       list_for_each_entry(finfo, &anon_iter_info, list) {
> >> +               if (finfo->file == file) {
> >
> > I'll look at this and other patches more thoroughly tomorrow with a
> > clear head, but this iteration to find anon_file_prog_assoc is really
> > unfortunate.
> >
> > I think the problem is that you are allowing seq_file infrastructure
> > to call directly into target implementation of seq_operations without
> > intercepting them. If you change that and put whatever extra info is
> > necessary into seq_file->private in front of target's private state,
> > then you shouldn't need this, right?
>
> Yes. This is true. The idea is to minimize the target change.
> But maybe this is not a good goal by itself.
>
> You are right, if I intercept all seq_ops(), I do not need the
> above change; I can tailor the seq_file private_data right before
> calling the target one and restore it after the target call.
>
> Originally I only had one interception, show(); now I have
> stop() too, to call bpf at the end of iteration. Maybe I can
> intercept all four, I think. This way, I can also get rid
> of the target feature.

If the main goal is to minimize target changes and keep them exactly
seq_operations implementations, then one easier way to get access
to our own metadata in seq_file->private is to set it to point
**after** our metadata, but before the target's metadata. Roughly in
pseudo code:

struct bpf_iter_seq_file_meta {} __attribute__((aligned(8)));

void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) +
                     target_private_size, GFP_KERNEL);
seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta);


Then to recover bpf_iter_seq_file_meta:

struct bpf_iter_seq_file_meta *meta = seq_file->private - sizeof(*meta);

/* voila! */

This doesn't have the benefit of making targets simpler, but will
require no changes to them at all. Plus fewer indirect calls, so less
performance penalty.
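
Under that scheme, the release path shows the payoff: the prog can be
recovered from seq->private alone, so the global <file, prog> list
disappears. A sketch under the same assumptions (in particular, that
the meta struct holds the prog pointer):

static int anon_iter_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct bpf_iter_seq_file_meta *meta;

	meta = seq->private - sizeof(*meta);
	bpf_prog_put(meta->prog);

	/* the allocation starts at meta, not at seq->private, so free
	 * it here instead of via seq_release_private(), which would
	 * kfree() the wrong pointer
	 */
	kfree(meta);
	return seq_release(inode, file);
}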

[...]
Martin KaFai Lau April 29, 2020, 6:46 p.m. UTC | #5
On Wed, Apr 29, 2020 at 11:16:35AM -0700, Andrii Nakryiko wrote:
> On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote:

[...]

> If the main goal is to minimize target changes and keep them exactly
> seq_operations implementations, then one easier way to get access
> to our own metadata in seq_file->private is to set it to point
> **after** our metadata, but before the target's metadata. Roughly in
> pseudo code:
>
> struct bpf_iter_seq_file_meta {} __attribute__((aligned(8)));
>
> void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) +
>                      target_private_size, GFP_KERNEL);
> seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta);
I have suggested the same thing earlier.  Good to know that we think alike ;)

Maybe put them in a struct such that container_of() etc. can be used:

struct bpf_iter_private {
	struct extra_priv_data iter_private;
	u8 target_private[] __aligned(8);
};

[...]
Yonghong Song April 29, 2020, 7:20 p.m. UTC | #6
On 4/29/20 11:46 AM, Martin KaFai Lau wrote:
> On Wed, Apr 29, 2020 at 11:16:35AM -0700, Andrii Nakryiko wrote:
>> On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote:

[...]

> I have suggested the same thing earlier.  Good to know that we think alike ;)
>
> Maybe put them in a struct such that container_of() etc. can be used:
>
> struct bpf_iter_private {
> 	struct extra_priv_data iter_private;
> 	u8 target_private[] __aligned(8);
> };

This should work, but I need to intercept all seq_ops() operations,
because the target expects the private data to be `target_private` only.
Let me experiment to find the best way to do this.

[...]
Andrii Nakryiko April 29, 2020, 7:39 p.m. UTC | #7
On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote:
>

[...]

>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  {
>         struct bpf_iter_target_info *tinfo;
> @@ -37,6 +95,8 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>                 INIT_LIST_HEAD(&targets);
>                 mutex_init(&targets_mutex);
>                 mutex_init(&bpf_iter_mutex);
> +               INIT_LIST_HEAD(&anon_iter_info);
> +               mutex_init(&anon_iter_info_mutex);
>                 bpf_iter_inited = true;
>         }
>
> @@ -61,7 +121,20 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
>                                    u64 *session_id, u64 *seq_num, bool is_last)

Instead of passing many pointers (session_id, seq_num), would it be
better to just pass a bpf_iter_meta *?
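
For example, a single context struct could carry everything (a
hypothetical shape, assembled from the fields discussed in this
thread):

struct bpf_iter_meta {
	struct seq_file *seq;
	u64 session_id;
	u64 seq_num;
};

struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
				   struct bpf_iter_meta *meta, bool is_last);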

>  {
> -       return NULL;
> +       struct extra_priv_data *extra_data;
> +
> +       if (seq->file->f_op != &anon_bpf_iter_fops)
> +               return NULL;
> +
> +       extra_data = get_extra_priv_dptr(seq->private, priv_data_size);
> +       if (extra_data->has_last)
> +               return NULL;
> +
> +       *session_id = extra_data->session_id;
> +       *seq_num = extra_data->seq_num++;
> +       extra_data->has_last = is_last;
> +
> +       return extra_data->prog;
>  }
>
>  int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
> @@ -150,3 +223,90 @@ int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
>         mutex_unlock(&bpf_iter_mutex);
>         return ret;
>  }
> +
> +static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo,
> +                         struct bpf_prog *prog)
> +{
> +       struct extra_priv_data *extra_data;
> +
> +       if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE)
> +               set_seq_net_private((struct seq_net_private *)priv_data,
> +                                   current->nsproxy->net_ns);
> +
> +       extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size);
> +       extra_data->session_id = atomic64_add_return(1, &session_id);
> +       extra_data->prog = prog;
> +       extra_data->seq_num = 0;
> +       extra_data->has_last = false;
> +}
> +
> +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
> +{
> +       struct anon_file_prog_assoc *finfo;
> +       struct bpf_iter_target_info *tinfo;
> +       struct bpf_prog *prog;
> +       u32 total_priv_dsize;
> +       void *priv_data;
> +
> +       finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN);
> +       if (!finfo)
> +               return -ENOMEM;
> +
> +       mutex_lock(&bpf_iter_mutex);
> +       prog = link->link.prog;
> +       bpf_prog_inc(prog);
> +       mutex_unlock(&bpf_iter_mutex);
> +
> +       tinfo = link->tinfo;
> +       total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size);
> +       priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize);
> +       if (!priv_data) {
> +               bpf_prog_sub(prog, 1);
> +               kfree(finfo);
> +               return -ENOMEM;
> +       }
> +
> +       init_seq_file(priv_data, tinfo, prog);
> +
> +       finfo->file = file;
> +       finfo->prog = prog;
> +
> +       mutex_lock(&anon_iter_info_mutex);
> +       list_add(&finfo->list, &anon_iter_info);
> +       mutex_unlock(&anon_iter_info_mutex);
> +       return 0;
> +}
> +
> +int bpf_iter_new_fd(struct bpf_link *link)
> +{
> +       struct file *file;
> +       int err, fd;
> +
> +       if (link->ops != &bpf_iter_link_lops)
> +               return -EINVAL;
> +
> +       fd = get_unused_fd_flags(O_CLOEXEC);
> +       if (fd < 0)
> +               return fd;
> +
> +       file = anon_inode_getfile("bpf_iter", &anon_bpf_iter_fops,
> +                                 NULL, O_CLOEXEC);

Shouldn't this anon file be readable and have O_RDONLY flag as well?

> +       if (IS_ERR(file)) {
> +               err = PTR_ERR(file);
> +               goto free_fd;
> +       }
> +
> +       err = prepare_seq_file(file,
> +                              container_of(link, struct bpf_iter_link, link));
> +       if (err)
> +               goto free_file;
> +
> +       fd_install(fd, file);
> +       return fd;
> +
> +free_file:
> +       fput(file);
> +free_fd:
> +       put_unused_fd(fd);
> +       return err;
> +}
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index b7af4f006f2e..458f7000887a 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3696,6 +3696,30 @@ static int link_update(union bpf_attr *attr)
>         return ret;
>  }
>
> +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
> +
> +static int bpf_iter_create(union bpf_attr *attr)
> +{
> +       struct bpf_link *link;
> +       int err;
> +
> +       if (CHECK_ATTR(BPF_ITER_CREATE))
> +               return -EINVAL;
> +
> +       if (attr->iter_create.flags)
> +               return -EINVAL;
> +
> +       link = bpf_link_get_from_fd(attr->iter_create.link_fd);
> +       if (IS_ERR(link))
> +               return PTR_ERR(link);
> +
> +       err = bpf_iter_new_fd(link);
> +       if (err < 0)
> +               bpf_link_put(link);

bpf_iter_new_fd() doesn't take a refcnt on the link, so you need to put it
regardless of success or error.
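
I.e., something like this (a sketch of the fixed flow; in this
version the new fd pins the prog itself, so the link reference can be
dropped unconditionally):

	err = bpf_iter_new_fd(link);
	bpf_link_put(link);	/* the fd holds its own prog reference */

	return err;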

> +
> +       return err;
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>  {
>         union bpf_attr attr;
> @@ -3813,6 +3837,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_LINK_UPDATE:
>                 err = link_update(&attr);
>                 break;
> +       case BPF_ITER_CREATE:
> +               err = bpf_iter_create(&attr);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;

[...]
Martin KaFai Lau April 29, 2020, 8:50 p.m. UTC | #8
On Wed, Apr 29, 2020 at 12:20:05PM -0700, Yonghong Song wrote:
> 
> 
> On 4/29/20 11:46 AM, Martin KaFai Lau wrote:
> > On Wed, Apr 29, 2020 at 11:16:35AM -0700, Andrii Nakryiko wrote:
> > > On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote:
> > > > 
> > > > 
> > > > 
> > > > On 4/28/20 11:56 PM, Andrii Nakryiko wrote:
> > > > > On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote:
> > > > > > 
> > > > > > A new bpf command BPF_ITER_CREATE is added.
> > > > > > 
> > > > > > The anonymous bpf iterator is seq_file based.
> > > > > > The seq_file private data are referenced by targets.
> > > > > > The bpf_iter infrastructure allocated additional space
> > > > > > at seq_file->private after the space used by targets
> > > > > > to store some meta data, e.g.,
> > > > > >     prog:       prog to run
> > > > > >     session_id: an unique id for each opened seq_file
> > > > > >     seq_num:    how many times bpf programs are queried in this session
> > > > > >     has_last:   indicate whether or not bpf_prog has been called after
> > > > > >                 all valid objects have been processed
> > > > > > 
> > > > > > A map between file and prog/link is established to help
> > > > > > fops->release(). When fops->release() is called, just based on
> > > > > > inode and file, bpf program cannot be located since target
> > > > > > seq_priv_size not available. This map helps retrieve the prog
> > > > > > whose reference count needs to be decremented.
> > > > > > 
> > > > > > Signed-off-by: Yonghong Song <yhs@fb.com>
> > > > > > ---
> > > > > >    include/linux/bpf.h            |   3 +
> > > > > >    include/uapi/linux/bpf.h       |   6 ++
> > > > > >    kernel/bpf/bpf_iter.c          | 162 ++++++++++++++++++++++++++++++++-
> > > > > >    kernel/bpf/syscall.c           |  27 ++++++
> > > > > >    tools/include/uapi/linux/bpf.h |   6 ++
> > > > > >    5 files changed, 203 insertions(+), 1 deletion(-)
> > > > > > 
> > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > index 4fc39d9b5cd0..0f0cafc65a04 100644
> > > > > > --- a/include/linux/bpf.h
> > > > > > +++ b/include/linux/bpf.h
> > > > > > @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
> > > > > >    int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
> > > > > >    int bpf_obj_get_user(const char __user *pathname, int flags);
> > > > > > 
> > > > > > +#define BPF_DUMP_SEQ_NET_PRIVATE       BIT(0)
> > > > > > +
> > > > > >    struct bpf_iter_reg {
> > > > > >           const char *target;
> > > > > >           const char *target_func_name;
> > > > > > @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
> > > > > >    int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
> > > > > > [ ... ]
> > > > > > 
> > > > > > +static int anon_iter_release(struct inode *inode, struct file *file)
> > > > > > +{
> > > > > > +       struct anon_file_prog_assoc *finfo;
> > > > > > +
> > > > > > +       mutex_lock(&anon_iter_info_mutex);
> > > > > > +       list_for_each_entry(finfo, &anon_iter_info, list) {
> > > > > > +               if (finfo->file == file) {
> > > > > 
> > > > > I'll look at this and other patches more thoroughly tomorrow with
> > > > > a clear head, but this iteration to find anon_file_prog_assoc is
> > > > > really unfortunate.
> > > > > 
> > > > > I think the problem is that you are allowing the seq_file
> > > > > infrastructure to call directly into the target implementation of
> > > > > seq_operations without intercepting them. If you change that and put
> > > > > whatever extra info is necessary into seq_file->private in front of
> > > > > the target's private state, then you shouldn't need this, right?
> > > > 
> > > > Yes. This is true. The idea is to minimize the target changes.
> > > > But maybe this is not a good goal by itself.
> > > > 
> > > > You are right: if I intercept all seq_ops(), I do not need the
> > > > above change. I can tailor the seq_file private_data right before
> > > > calling the target one and restore it after the target call.
> > > > 
> > > > Originally I had only one interception, show(); now I have
> > > > stop() too, to call bpf at the end of the iteration. Maybe I can
> > > > intercept all four, I think. This way, I can also get rid
> > > > of the target feature.
> > > 
> > > If the main goal is to minimize target changes and make them exactly
> > > a seq_operations implementation, then one easier way to get easy access
> > > to our own metadata in seq_file->private is to set it to point
> > > **after** our metadata, but before the target's metadata. Roughly in
> > > pseudo code:
> > > 
> > > struct bpf_iter_seq_file_meta {} __attribute__((aligned(8)));
> > > 
> > > void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) +
> > > target_private_size);
> > > seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta);
> > I have suggested the same thing earlier.  Good to know that we think alike ;)
> > 
> > Maybe put them in a struct such that container_of...etc. can be used:
> > struct bpf_iter_private {
> > 	struct extra_priv_data iter_private;
> > 	u8 target_private[] __aligned(8);
> > };
> 
> This should work, but we need to intercept all seq_ops() operations
> because the target expects the private data to be `target_private` only.
> Let me experiment to find the best way to do this.
As long as "seq_file->private = bpf_iter_private->target_private;" as
Andrii also suggested, the existing seq_ops() should not have to be
changed or needed to be intercepted because they are only
accessing it through seq_file->private.

The bpf_iter logic should be the only one needed to access the
bpf_iter_private->iter_private and it can be obtained by, e.g.
"container_of(seq_file->private, struct bpf_iter_private, target_private)"

> 
> > 
> > > 
> > > 
> > > Then to recover bpf_iter_seq_file_meta:
> > > 
> > > struct bpf_iter_seq_file_meta *meta = seq_file->private - sizeof(*meta);
> > > 
> > > /* voila! */
> > > 
> > > This doesn't have the benefit of making targets simpler, but it will
> > > require no changes to them at all. Plus fewer indirect calls, so less
> > > performance penalty.
> > >
Yonghong Song April 29, 2020, 8:54 p.m. UTC | #9
On 4/29/20 1:50 PM, Martin KaFai Lau wrote:
> [ ... ]
> 
> As long as "seq_file->private = bpf_iter_private->target_private;" as
> Andrii also suggested, the existing seq_ops() should not have to be
> changed or needed to be intercepted because they are only
> accessing it through seq_file->private.
> 
> The bpf_iter logic should be the only one needed to access the
> bpf_iter_private->iter_private and it can be obtained by, e.g.
> "container_of(seq_file->private, struct bpf_iter_private, target_private)"

Thanks for the explanation! I never thought of using container_of
magic here. It indeed works very nicely.

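Note that the patch as posted uses the opposite layout: the extra metadata is
appended after the target's private area, with get_total_priv_dsize() and
get_extra_priv_dptr() doing the padding arithmetic. Below is a small
standalone sketch of that arithmetic; the 20-byte target size is an arbitrary
example and roundup() is open-coded here to mirror the kernel macro.

#include <stdint.h>
#include <stdio.h>

/* userspace stand-in for the kernel's roundup() */
#define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

struct extra_priv_data {
	void *prog;
	uint64_t session_id;
	uint64_t seq_num;
	_Bool has_last;
};

int main(void)
{
	uint32_t seq_priv_size = 20;	/* example target private size */

	/* mirrors get_total_priv_dsize(): pad the target area to 8 bytes,
	 * then append the metadata */
	uint32_t pad = roundup(seq_priv_size, 8);
	uint32_t total = pad + (uint32_t)sizeof(struct extra_priv_data);

	/* mirrors get_extra_priv_dptr(): the metadata starts at the
	 * padded offset */
	printf("target: %u bytes, metadata at offset %u, total %u bytes\n",
	       seq_priv_size, pad, total);
	return 0;
}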
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fc39d9b5cd0..0f0cafc65a04 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1112,6 +1112,8 @@  struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+#define BPF_DUMP_SEQ_NET_PRIVATE	BIT(0)
+
 struct bpf_iter_reg {
 	const char *target;
 	const char *target_func_name;
@@ -1133,6 +1135,7 @@  int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
 			  struct bpf_prog *new_prog);
+int bpf_iter_new_fd(struct bpf_link *link);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f39b9fec37ab..576651110d16 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -113,6 +113,7 @@  enum bpf_cmd {
 	BPF_MAP_DELETE_BATCH,
 	BPF_LINK_CREATE,
 	BPF_LINK_UPDATE,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -590,6 +591,11 @@  union bpf_attr {
 		__u32		old_prog_fd;
 	} link_update;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index fc1ce5ee5c3f..1f4e778d1814 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -2,6 +2,7 @@ 
 /* Copyright (c) 2020 Facebook */
 
 #include <linux/fs.h>
+#include <linux/anon_inodes.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
 
@@ -19,6 +20,19 @@  struct bpf_iter_link {
 	struct bpf_iter_target_info *tinfo;
 };
 
+struct extra_priv_data {
+	struct bpf_prog *prog;
+	u64 session_id;
+	u64 seq_num;
+	bool has_last;
+};
+
+struct anon_file_prog_assoc {
+	struct list_head list;
+	struct file *file;
+	struct bpf_prog *prog;
+};
+
 static struct list_head targets;
 static struct mutex targets_mutex;
 static bool bpf_iter_inited = false;
@@ -26,6 +40,50 @@  static bool bpf_iter_inited = false;
 /* protect bpf_iter_link.link->prog update */
 static struct mutex bpf_iter_mutex;
 
+/* At the anon seq_file release function, the prog cannot
+ * be retrieved because the target seq_priv_size is not available.
+ * Keep a list of <anon_file, prog> mappings, so that
+ * at file release stage, the prog can be released properly.
+ */
+static struct list_head anon_iter_info;
+static struct mutex anon_iter_info_mutex;
+
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
+static u32 get_total_priv_dsize(u32 old_size)
+{
+	return roundup(old_size, 8) + sizeof(struct extra_priv_data);
+}
+
+static void *get_extra_priv_dptr(void *old_ptr, u32 old_size)
+{
+	return old_ptr + roundup(old_size, 8);
+}
+
+static int anon_iter_release(struct inode *inode, struct file *file)
+{
+	struct anon_file_prog_assoc *finfo;
+
+	mutex_lock(&anon_iter_info_mutex);
+	list_for_each_entry(finfo, &anon_iter_info, list) {
+		if (finfo->file == file) {
+			bpf_prog_put(finfo->prog);
+			list_del(&finfo->list);
+			kfree(finfo);
+			break;
+		}
+	}
+	mutex_unlock(&anon_iter_info_mutex);
+
+	return seq_release_private(inode, file);
+}
+
+static const struct file_operations anon_bpf_iter_fops = {
+	.read		= seq_read,
+	.release	= anon_iter_release,
+};
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
@@ -37,6 +95,8 @@  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 		INIT_LIST_HEAD(&targets);
 		mutex_init(&targets_mutex);
 		mutex_init(&bpf_iter_mutex);
+		INIT_LIST_HEAD(&anon_iter_info);
+		mutex_init(&anon_iter_info_mutex);
 		bpf_iter_inited = true;
 	}
 
@@ -61,7 +121,20 @@  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
 				   u64 *session_id, u64 *seq_num, bool is_last)
 {
-	return NULL;
+	struct extra_priv_data *extra_data;
+
+	if (seq->file->f_op != &anon_bpf_iter_fops)
+		return NULL;
+
+	extra_data = get_extra_priv_dptr(seq->private, priv_data_size);
+	if (extra_data->has_last)
+		return NULL;
+
+	*session_id = extra_data->session_id;
+	*seq_num = extra_data->seq_num++;
+	extra_data->has_last = is_last;
+
+	return extra_data->prog;
 }
 
 int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
@@ -150,3 +223,90 @@  int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
 	mutex_unlock(&bpf_iter_mutex);
 	return ret;
 }
+
+static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo,
+			  struct bpf_prog *prog)
+{
+	struct extra_priv_data *extra_data;
+
+	if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE)
+		set_seq_net_private((struct seq_net_private *)priv_data,
+				    current->nsproxy->net_ns);
+
+	extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size);
+	extra_data->session_id = atomic64_add_return(1, &session_id);
+	extra_data->prog = prog;
+	extra_data->seq_num = 0;
+	extra_data->has_last = false;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+{
+	struct anon_file_prog_assoc *finfo;
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_prog *prog;
+	u32 total_priv_dsize;
+	void *priv_data;
+
+	finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN);
+	if (!finfo)
+		return -ENOMEM;
+
+	mutex_lock(&bpf_iter_mutex);
+	prog = link->link.prog;
+	bpf_prog_inc(prog);
+	mutex_unlock(&bpf_iter_mutex);
+
+	tinfo = link->tinfo;
+	total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size);
+	priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize);
+	if (!priv_data) {
+		bpf_prog_sub(prog, 1);
+		kfree(finfo);
+		return -ENOMEM;
+	}
+
+	init_seq_file(priv_data, tinfo, prog);
+
+	finfo->file = file;
+	finfo->prog = prog;
+
+	mutex_lock(&anon_iter_info_mutex);
+	list_add(&finfo->list, &anon_iter_info);
+	mutex_unlock(&anon_iter_info_mutex);
+	return 0;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+	struct file *file;
+	int err, fd;
+
+	if (link->ops != &bpf_iter_link_lops)
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("bpf_iter", &anon_bpf_iter_fops,
+				  NULL, O_CLOEXEC);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto free_fd;
+	}
+
+	err = prepare_seq_file(file,
+			       container_of(link, struct bpf_iter_link, link));
+	if (err)
+		goto free_file;
+
+	fd_install(fd, file);
+	return fd;
+
+free_file:
+	fput(file);
+free_fd:
+	put_unused_fd(fd);
+	return err;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b7af4f006f2e..458f7000887a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3696,6 +3696,30 @@  static int link_update(union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+	struct bpf_link *link;
+	int err;
+
+	if (CHECK_ATTR(BPF_ITER_CREATE))
+		return -EINVAL;
+
+	if (attr->iter_create.flags)
+		return -EINVAL;
+
+	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+
+	err = bpf_iter_new_fd(link);
+	if (err < 0)
+		bpf_link_put(link);
+
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -3813,6 +3837,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_LINK_UPDATE:
 		err = link_update(&attr);
 		break;
+	case BPF_ITER_CREATE:
+		err = bpf_iter_create(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f39b9fec37ab..576651110d16 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -113,6 +113,7 @@  enum bpf_cmd {
 	BPF_MAP_DELETE_BATCH,
 	BPF_LINK_CREATE,
 	BPF_LINK_UPDATE,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -590,6 +591,11 @@  union bpf_attr {
 		__u32		old_prog_fd;
 	} link_update;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
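For completeness, a minimal userspace sketch of how the new command would be
driven, assuming the patched uapi header above is in the include path. The
link_fd here is assumed to come from an earlier bpf_iter link attach (see the
other patches in this series); reading the returned fd is what steps the
seq_file and runs the bpf program.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>	/* patched header providing BPF_ITER_CREATE */

static int bpf_iter_create_fd(int link_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	attr.iter_create.flags = 0;	/* must be zero, per bpf_iter_create() */

	return syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
}

int dump_iter(int link_fd)
{
	char buf[4096];
	ssize_t n;
	int iter_fd = bpf_iter_create_fd(link_fd);

	if (iter_fd < 0)
		return -errno;

	/* each read() steps the iterator and invokes the bpf prog */
	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(iter_fd);
	return n < 0 ? -errno : 0;
}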