Message ID | 20200427201242.2995160-1-yhs@fb.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | bpf: implement bpf iterator for kernel data | expand |
On Mon, Apr 27, 2020 at 01:12:42PM -0700, Yonghong Song wrote: > A new bpf command BPF_ITER_CREATE is added. > > The anonymous bpf iterator is seq_file based. > The seq_file private data are referenced by targets. > The bpf_iter infrastructure allocated additional space > at seq_file->private after the space used by targets > to store some meta data, e.g., > prog: prog to run > session_id: an unique id for each opened seq_file > seq_num: how many times bpf programs are queried in this session > has_last: indicate whether or not bpf_prog has been called after > all valid objects have been processed > > A map between file and prog/link is established to help > fops->release(). When fops->release() is called, just based on > inode and file, bpf program cannot be located since target > seq_priv_size not available. This map helps retrieve the prog > whose reference count needs to be decremented. How about putting "struct extra_priv_data" at the beginning of the seq_file's private store instead since its size is known. seq->private can point to an aligned byte after +sizeof(struct extra_priv_data). [ ... ] > diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > index fc1ce5ee5c3f..1f4e778d1814 100644 > --- a/kernel/bpf/bpf_iter.c > +++ b/kernel/bpf/bpf_iter.c [ ... ] > @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; > /* protect bpf_iter_link.link->prog upddate */ > static struct mutex bpf_iter_mutex; > > +/* Since at anon seq_file release function, the prog cannot > + * be retrieved since target seq_priv_size is not available. > + * Keep a list of <anon_file, prog> mapping, so that > + * at file release stage, the prog can be released properly. 
> + */ > +static struct list_head anon_iter_info; > +static struct mutex anon_iter_info_mutex; > + > +/* incremented on every opened seq_file */ > +static atomic64_t session_id; > + > +static u32 get_total_priv_dsize(u32 old_size) > +{ > + return roundup(old_size, 8) + sizeof(struct extra_priv_data); > +} > + > +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) > +{ > + return old_ptr + roundup(old_size, 8); > +} > + > +static int anon_iter_release(struct inode *inode, struct file *file) > +{ > + struct anon_file_prog_assoc *finfo; > + > + mutex_lock(&anon_iter_info_mutex); > + list_for_each_entry(finfo, &anon_iter_info, list) { > + if (finfo->file == file) { > + bpf_prog_put(finfo->prog); > + list_del(&finfo->list); > + kfree(finfo); > + break; > + } > + } > + mutex_unlock(&anon_iter_info_mutex); > + > + return seq_release_private(inode, file); > +} > + > +static const struct file_operations anon_bpf_iter_fops = { > + .read = seq_read, > + .release = anon_iter_release, > +}; > + > int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > { > struct bpf_iter_target_info *tinfo; [ ... 
] > @@ -150,3 +223,90 @@ int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > mutex_unlock(&bpf_iter_mutex); > return ret; > } > + > +static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo, > + struct bpf_prog *prog) > +{ > + struct extra_priv_data *extra_data; > + > + if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE) > + set_seq_net_private((struct seq_net_private *)priv_data, > + current->nsproxy->net_ns); > + > + extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size); > + extra_data->session_id = atomic64_add_return(1, &session_id); > + extra_data->prog = prog; > + extra_data->seq_num = 0; > + extra_data->has_last = false; > +} > + > +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) > +{ > + struct anon_file_prog_assoc *finfo; > + struct bpf_iter_target_info *tinfo; > + struct bpf_prog *prog; > + u32 total_priv_dsize; > + void *priv_data; > + > + finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN); > + if (!finfo) > + return -ENOMEM; > + > + mutex_lock(&bpf_iter_mutex); > + prog = link->link.prog; > + bpf_prog_inc(prog); > + mutex_unlock(&bpf_iter_mutex); > + > + tinfo = link->tinfo; > + total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size); > + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); > + if (!priv_data) { > + bpf_prog_sub(prog, 1); Could prog's refcnt go 0 here? If yes, bpf_prog_put() should be used. > + kfree(finfo); > + return -ENOMEM; > + } > + > + init_seq_file(priv_data, tinfo, prog); > + > + finfo->file = file; > + finfo->prog = prog; > + > + mutex_lock(&anon_iter_info_mutex); > + list_add(&finfo->list, &anon_iter_info); > + mutex_unlock(&anon_iter_info_mutex); > + return 0; > +} > +
On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: > > A new bpf command BPF_ITER_CREATE is added. > > The anonymous bpf iterator is seq_file based. > The seq_file private data are referenced by targets. > The bpf_iter infrastructure allocated additional space > at seq_file->private after the space used by targets > to store some meta data, e.g., > prog: prog to run > session_id: an unique id for each opened seq_file > seq_num: how many times bpf programs are queried in this session > has_last: indicate whether or not bpf_prog has been called after > all valid objects have been processed > > A map between file and prog/link is established to help > fops->release(). When fops->release() is called, just based on > inode and file, bpf program cannot be located since target > seq_priv_size not available. This map helps retrieve the prog > whose reference count needs to be decremented. > > Signed-off-by: Yonghong Song <yhs@fb.com> > --- > include/linux/bpf.h | 3 + > include/uapi/linux/bpf.h | 6 ++ > kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- > kernel/bpf/syscall.c | 27 ++++++ > tools/include/uapi/linux/bpf.h | 6 ++ > 5 files changed, 203 insertions(+), 1 deletion(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 4fc39d9b5cd0..0f0cafc65a04 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); > int bpf_obj_pin_user(u32 ufd, const char __user *pathname); > int bpf_obj_get_user(const char __user *pathname, int flags); > > +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) > + > struct bpf_iter_reg { > const char *target; > const char *target_func_name; > @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); > int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > struct bpf_prog *new_prog); > +int bpf_iter_new_fd(struct 
bpf_link *link); > > int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); > int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index f39b9fec37ab..576651110d16 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -113,6 +113,7 @@ enum bpf_cmd { > BPF_MAP_DELETE_BATCH, > BPF_LINK_CREATE, > BPF_LINK_UPDATE, > + BPF_ITER_CREATE, > }; > > enum bpf_map_type { > @@ -590,6 +591,11 @@ union bpf_attr { > __u32 old_prog_fd; > } link_update; > > + struct { /* struct used by BPF_ITER_CREATE command */ > + __u32 link_fd; > + __u32 flags; > + } iter_create; > + > } __attribute__((aligned(8))); > > /* The description below is an attempt at providing documentation to eBPF > diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > index fc1ce5ee5c3f..1f4e778d1814 100644 > --- a/kernel/bpf/bpf_iter.c > +++ b/kernel/bpf/bpf_iter.c > @@ -2,6 +2,7 @@ > /* Copyright (c) 2020 Facebook */ > > #include <linux/fs.h> > +#include <linux/anon_inodes.h> > #include <linux/filter.h> > #include <linux/bpf.h> > > @@ -19,6 +20,19 @@ struct bpf_iter_link { > struct bpf_iter_target_info *tinfo; > }; > > +struct extra_priv_data { > + struct bpf_prog *prog; > + u64 session_id; > + u64 seq_num; > + bool has_last; > +}; > + > +struct anon_file_prog_assoc { > + struct list_head list; > + struct file *file; > + struct bpf_prog *prog; > +}; > + > static struct list_head targets; > static struct mutex targets_mutex; > static bool bpf_iter_inited = false; > @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; > /* protect bpf_iter_link.link->prog upddate */ > static struct mutex bpf_iter_mutex; > > +/* Since at anon seq_file release function, the prog cannot > + * be retrieved since target seq_priv_size is not available. > + * Keep a list of <anon_file, prog> mapping, so that > + * at file release stage, the prog can be released properly. 
> + */ > +static struct list_head anon_iter_info; > +static struct mutex anon_iter_info_mutex; > + > +/* incremented on every opened seq_file */ > +static atomic64_t session_id; > + > +static u32 get_total_priv_dsize(u32 old_size) > +{ > + return roundup(old_size, 8) + sizeof(struct extra_priv_data); > +} > + > +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) > +{ > + return old_ptr + roundup(old_size, 8); > +} > + > +static int anon_iter_release(struct inode *inode, struct file *file) > +{ > + struct anon_file_prog_assoc *finfo; > + > + mutex_lock(&anon_iter_info_mutex); > + list_for_each_entry(finfo, &anon_iter_info, list) { > + if (finfo->file == file) { I'll look at this and other patches more thoroughly tomorrow with clear head, but this iteration to find anon_file_prog_assoc is really unfortunate. I think the problem is that you are allowing seq_file infrastructure to call directly into target implementation of seq_operations without intercepting them. If you change that and put whatever extra info is necessary into seq_file->private in front of target's private state, then you shouldn't need this, right? This would also make each target's logic a bit simpler because you can: - centralize creation and initialization of bpf_iter_meta (session_id, seq, seq_num will be set up once in this generic code); - loff_t pos increments; - you can extract and centralize bpf_iter_get_prog() call in show() implementation as well. I think with that each target's logic will be simpler and you won't need to maintain anon_file_prog_assocs. Are there complications I'm missing? [...]
On 4/28/20 11:56 PM, Andrii Nakryiko wrote: > On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: >> >> A new bpf command BPF_ITER_CREATE is added. >> >> The anonymous bpf iterator is seq_file based. >> The seq_file private data are referenced by targets. >> The bpf_iter infrastructure allocated additional space >> at seq_file->private after the space used by targets >> to store some meta data, e.g., >> prog: prog to run >> session_id: an unique id for each opened seq_file >> seq_num: how many times bpf programs are queried in this session >> has_last: indicate whether or not bpf_prog has been called after >> all valid objects have been processed >> >> A map between file and prog/link is established to help >> fops->release(). When fops->release() is called, just based on >> inode and file, bpf program cannot be located since target >> seq_priv_size not available. This map helps retrieve the prog >> whose reference count needs to be decremented. >> >> Signed-off-by: Yonghong Song <yhs@fb.com> >> --- >> include/linux/bpf.h | 3 + >> include/uapi/linux/bpf.h | 6 ++ >> kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- >> kernel/bpf/syscall.c | 27 ++++++ >> tools/include/uapi/linux/bpf.h | 6 ++ >> 5 files changed, 203 insertions(+), 1 deletion(-) >> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index 4fc39d9b5cd0..0f0cafc65a04 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); >> int bpf_obj_pin_user(u32 ufd, const char __user *pathname); >> int bpf_obj_get_user(const char __user *pathname, int flags); >> >> +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) >> + >> struct bpf_iter_reg { >> const char *target; >> const char *target_func_name; >> @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); >> int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); >> int bpf_iter_link_replace(struct bpf_link *link, 
struct bpf_prog *old_prog, >> struct bpf_prog *new_prog); >> +int bpf_iter_new_fd(struct bpf_link *link); >> >> int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); >> int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index f39b9fec37ab..576651110d16 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -113,6 +113,7 @@ enum bpf_cmd { >> BPF_MAP_DELETE_BATCH, >> BPF_LINK_CREATE, >> BPF_LINK_UPDATE, >> + BPF_ITER_CREATE, >> }; >> >> enum bpf_map_type { >> @@ -590,6 +591,11 @@ union bpf_attr { >> __u32 old_prog_fd; >> } link_update; >> >> + struct { /* struct used by BPF_ITER_CREATE command */ >> + __u32 link_fd; >> + __u32 flags; >> + } iter_create; >> + >> } __attribute__((aligned(8))); >> >> /* The description below is an attempt at providing documentation to eBPF >> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c >> index fc1ce5ee5c3f..1f4e778d1814 100644 >> --- a/kernel/bpf/bpf_iter.c >> +++ b/kernel/bpf/bpf_iter.c >> @@ -2,6 +2,7 @@ >> /* Copyright (c) 2020 Facebook */ >> >> #include <linux/fs.h> >> +#include <linux/anon_inodes.h> >> #include <linux/filter.h> >> #include <linux/bpf.h> >> >> @@ -19,6 +20,19 @@ struct bpf_iter_link { >> struct bpf_iter_target_info *tinfo; >> }; >> >> +struct extra_priv_data { >> + struct bpf_prog *prog; >> + u64 session_id; >> + u64 seq_num; >> + bool has_last; >> +}; >> + >> +struct anon_file_prog_assoc { >> + struct list_head list; >> + struct file *file; >> + struct bpf_prog *prog; >> +}; >> + >> static struct list_head targets; >> static struct mutex targets_mutex; >> static bool bpf_iter_inited = false; >> @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; >> /* protect bpf_iter_link.link->prog upddate */ >> static struct mutex bpf_iter_mutex; >> >> +/* Since at anon seq_file release function, the prog cannot >> + * be retrieved since target seq_priv_size is not available. 
>> + * Keep a list of <anon_file, prog> mapping, so that >> + * at file release stage, the prog can be released properly. >> + */ >> +static struct list_head anon_iter_info; >> +static struct mutex anon_iter_info_mutex; >> + >> +/* incremented on every opened seq_file */ >> +static atomic64_t session_id; >> + >> +static u32 get_total_priv_dsize(u32 old_size) >> +{ >> + return roundup(old_size, 8) + sizeof(struct extra_priv_data); >> +} >> + >> +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) >> +{ >> + return old_ptr + roundup(old_size, 8); >> +} >> + >> +static int anon_iter_release(struct inode *inode, struct file *file) >> +{ >> + struct anon_file_prog_assoc *finfo; >> + >> + mutex_lock(&anon_iter_info_mutex); >> + list_for_each_entry(finfo, &anon_iter_info, list) { >> + if (finfo->file == file) { > > I'll look at this and other patches more thoroughly tomorrow with > clear head, but this iteration to find anon_file_prog_assoc is really > unfortunate. > > I think the problem is that you are allowing seq_file infrastructure > to call directly into target implementation of seq_operations without > intercepting them. If you change that and put whatever extra info is > necessary into seq_file->private in front of target's private state, > then you shouldn't need this, right? Yes. This is true. The idea is to minimize the target change. But maybe this is not a good goal by itself. You are right, if I intercept all seq_ops(), I do not need the above change, I can tailor seq_file private_data right before calling target one and restore after the target call. Originally I only have one interception, show(), now I have stop() too to call bpf at the end of iteration. Maybe I can intercept all four, I think. This way, I can also get rid of the target feature. 
> > This would also make each target's logic a bit simpler because you can: > - centralize creation and initialization of bpf_iter_meta (session_id, > seq, seq_num will be set up once in this generic code); > - loff_t pos increments; > - you can extract and centralize bpf_iter_get_prog() call in show() > implementation as well. > > I think with that each target's logic will be simpler and you won't > need to maintain anon_file_prog_assocs. > > Are there complications I'm missing? > > [...] >
On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote: > > > > On 4/28/20 11:56 PM, Andrii Nakryiko wrote: > > On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: > >> > >> A new bpf command BPF_ITER_CREATE is added. > >> > >> The anonymous bpf iterator is seq_file based. > >> The seq_file private data are referenced by targets. > >> The bpf_iter infrastructure allocated additional space > >> at seq_file->private after the space used by targets > >> to store some meta data, e.g., > >> prog: prog to run > >> session_id: an unique id for each opened seq_file > >> seq_num: how many times bpf programs are queried in this session > >> has_last: indicate whether or not bpf_prog has been called after > >> all valid objects have been processed > >> > >> A map between file and prog/link is established to help > >> fops->release(). When fops->release() is called, just based on > >> inode and file, bpf program cannot be located since target > >> seq_priv_size not available. This map helps retrieve the prog > >> whose reference count needs to be decremented. 
> >> > >> Signed-off-by: Yonghong Song <yhs@fb.com> > >> --- > >> include/linux/bpf.h | 3 + > >> include/uapi/linux/bpf.h | 6 ++ > >> kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- > >> kernel/bpf/syscall.c | 27 ++++++ > >> tools/include/uapi/linux/bpf.h | 6 ++ > >> 5 files changed, 203 insertions(+), 1 deletion(-) > >> > >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h > >> index 4fc39d9b5cd0..0f0cafc65a04 100644 > >> --- a/include/linux/bpf.h > >> +++ b/include/linux/bpf.h > >> @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); > >> int bpf_obj_pin_user(u32 ufd, const char __user *pathname); > >> int bpf_obj_get_user(const char __user *pathname, int flags); > >> > >> +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) > >> + > >> struct bpf_iter_reg { > >> const char *target; > >> const char *target_func_name; > >> @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); > >> int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > >> int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > >> struct bpf_prog *new_prog); > >> +int bpf_iter_new_fd(struct bpf_link *link); > >> > >> int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); > >> int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > >> index f39b9fec37ab..576651110d16 100644 > >> --- a/include/uapi/linux/bpf.h > >> +++ b/include/uapi/linux/bpf.h > >> @@ -113,6 +113,7 @@ enum bpf_cmd { > >> BPF_MAP_DELETE_BATCH, > >> BPF_LINK_CREATE, > >> BPF_LINK_UPDATE, > >> + BPF_ITER_CREATE, > >> }; > >> > >> enum bpf_map_type { > >> @@ -590,6 +591,11 @@ union bpf_attr { > >> __u32 old_prog_fd; > >> } link_update; > >> > >> + struct { /* struct used by BPF_ITER_CREATE command */ > >> + __u32 link_fd; > >> + __u32 flags; > >> + } iter_create; > >> + > >> } __attribute__((aligned(8))); > >> > >> /* The description below 
is an attempt at providing documentation to eBPF > >> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > >> index fc1ce5ee5c3f..1f4e778d1814 100644 > >> --- a/kernel/bpf/bpf_iter.c > >> +++ b/kernel/bpf/bpf_iter.c > >> @@ -2,6 +2,7 @@ > >> /* Copyright (c) 2020 Facebook */ > >> > >> #include <linux/fs.h> > >> +#include <linux/anon_inodes.h> > >> #include <linux/filter.h> > >> #include <linux/bpf.h> > >> > >> @@ -19,6 +20,19 @@ struct bpf_iter_link { > >> struct bpf_iter_target_info *tinfo; > >> }; > >> > >> +struct extra_priv_data { > >> + struct bpf_prog *prog; > >> + u64 session_id; > >> + u64 seq_num; > >> + bool has_last; > >> +}; > >> + > >> +struct anon_file_prog_assoc { > >> + struct list_head list; > >> + struct file *file; > >> + struct bpf_prog *prog; > >> +}; > >> + > >> static struct list_head targets; > >> static struct mutex targets_mutex; > >> static bool bpf_iter_inited = false; > >> @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; > >> /* protect bpf_iter_link.link->prog upddate */ > >> static struct mutex bpf_iter_mutex; > >> > >> +/* Since at anon seq_file release function, the prog cannot > >> + * be retrieved since target seq_priv_size is not available. > >> + * Keep a list of <anon_file, prog> mapping, so that > >> + * at file release stage, the prog can be released properly. 
> >> + */ > >> +static struct list_head anon_iter_info; > >> +static struct mutex anon_iter_info_mutex; > >> + > >> +/* incremented on every opened seq_file */ > >> +static atomic64_t session_id; > >> + > >> +static u32 get_total_priv_dsize(u32 old_size) > >> +{ > >> + return roundup(old_size, 8) + sizeof(struct extra_priv_data); > >> +} > >> + > >> +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) > >> +{ > >> + return old_ptr + roundup(old_size, 8); > >> +} > >> + > >> +static int anon_iter_release(struct inode *inode, struct file *file) > >> +{ > >> + struct anon_file_prog_assoc *finfo; > >> + > >> + mutex_lock(&anon_iter_info_mutex); > >> + list_for_each_entry(finfo, &anon_iter_info, list) { > >> + if (finfo->file == file) { > > > > I'll look at this and other patches more thoroughly tomorrow with > > clear head, but this iteration to find anon_file_prog_assoc is really > > unfortunate. > > > > I think the problem is that you are allowing seq_file infrastructure > > to call directly into target implementation of seq_operations without > > intercepting them. If you change that and put whatever extra info is > > necessary into seq_file->private in front of target's private state, > > then you shouldn't need this, right? > > Yes. This is true. The idea is to minimize the target change. > But maybe this is not a good goal by itself. > > You are right, if I intercept all seq_ops(), I do not need the > above change, I can tailor seq_file private_data right before > calling target one and restore after the target call. > > Originally I only have one interception, show(), now I have > stop() too to call bpf at the end of iteration. Maybe I can > interpret all four, I think. This way, I can also get ride > of target feature. 
If the main goal is to minimize target changes and make them exactly seq_operations implementation, then one easier way to get easy access to our own metadata in seq_file->private is to set it to point **after** our metadata, but before target's metadata. Roughly in pseudo code: struct bpf_iter_seq_file_meta {} __attribute((aligned(8))); void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) + target_private_size); seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta); Then to recover bpf_iter_seq_file_meta: struct bpf_iter_seq_file_meta *meta = seq_file->private - sizeof(*meta); /* voila! */ This doesn't have a benefit of making targets simpler, but will require no changes to them at all. Plus less indirect calls, so less performance penalty. > > > > > This would also make each target's logic a bit simpler because you can: > > - centralize creation and initialization of bpf_iter_meta (session_id, > > seq, seq_num will be set up once in this generic code); > > - loff_t pos increments; > > - you can extract and centralize bpf_iter_get_prog() call in show() > > implementation as well. > > > > I think with that each target's logic will be simpler and you won't > > need to maintain anon_file_prog_assocs. > > > > Are there complications I'm missing? > > > > [...] > >
On Wed, Apr 29, 2020 at 11:16:35AM -0700, Andrii Nakryiko wrote: > On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote: > > > > > > > > On 4/28/20 11:56 PM, Andrii Nakryiko wrote: > > > On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: > > >> > > >> A new bpf command BPF_ITER_CREATE is added. > > >> > > >> The anonymous bpf iterator is seq_file based. > > >> The seq_file private data are referenced by targets. > > >> The bpf_iter infrastructure allocated additional space > > >> at seq_file->private after the space used by targets > > >> to store some meta data, e.g., > > >> prog: prog to run > > >> session_id: an unique id for each opened seq_file > > >> seq_num: how many times bpf programs are queried in this session > > >> has_last: indicate whether or not bpf_prog has been called after > > >> all valid objects have been processed > > >> > > >> A map between file and prog/link is established to help > > >> fops->release(). When fops->release() is called, just based on > > >> inode and file, bpf program cannot be located since target > > >> seq_priv_size not available. This map helps retrieve the prog > > >> whose reference count needs to be decremented. 
> > >> > > >> Signed-off-by: Yonghong Song <yhs@fb.com> > > >> --- > > >> include/linux/bpf.h | 3 + > > >> include/uapi/linux/bpf.h | 6 ++ > > >> kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- > > >> kernel/bpf/syscall.c | 27 ++++++ > > >> tools/include/uapi/linux/bpf.h | 6 ++ > > >> 5 files changed, 203 insertions(+), 1 deletion(-) > > >> > > >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > >> index 4fc39d9b5cd0..0f0cafc65a04 100644 > > >> --- a/include/linux/bpf.h > > >> +++ b/include/linux/bpf.h > > >> @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); > > >> int bpf_obj_pin_user(u32 ufd, const char __user *pathname); > > >> int bpf_obj_get_user(const char __user *pathname, int flags); > > >> > > >> +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) > > >> + > > >> struct bpf_iter_reg { > > >> const char *target; > > >> const char *target_func_name; > > >> @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); > > >> int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > > >> int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > > >> struct bpf_prog *new_prog); > > >> +int bpf_iter_new_fd(struct bpf_link *link); > > >> > > >> int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); > > >> int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); > > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > >> index f39b9fec37ab..576651110d16 100644 > > >> --- a/include/uapi/linux/bpf.h > > >> +++ b/include/uapi/linux/bpf.h > > >> @@ -113,6 +113,7 @@ enum bpf_cmd { > > >> BPF_MAP_DELETE_BATCH, > > >> BPF_LINK_CREATE, > > >> BPF_LINK_UPDATE, > > >> + BPF_ITER_CREATE, > > >> }; > > >> > > >> enum bpf_map_type { > > >> @@ -590,6 +591,11 @@ union bpf_attr { > > >> __u32 old_prog_fd; > > >> } link_update; > > >> > > >> + struct { /* struct used by BPF_ITER_CREATE command */ > > >> + __u32 link_fd; > > >> + __u32 flags; 
> > >> + } iter_create; > > >> + > > >> } __attribute__((aligned(8))); > > >> > > >> /* The description below is an attempt at providing documentation to eBPF > > >> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > > >> index fc1ce5ee5c3f..1f4e778d1814 100644 > > >> --- a/kernel/bpf/bpf_iter.c > > >> +++ b/kernel/bpf/bpf_iter.c > > >> @@ -2,6 +2,7 @@ > > >> /* Copyright (c) 2020 Facebook */ > > >> > > >> #include <linux/fs.h> > > >> +#include <linux/anon_inodes.h> > > >> #include <linux/filter.h> > > >> #include <linux/bpf.h> > > >> > > >> @@ -19,6 +20,19 @@ struct bpf_iter_link { > > >> struct bpf_iter_target_info *tinfo; > > >> }; > > >> > > >> +struct extra_priv_data { > > >> + struct bpf_prog *prog; > > >> + u64 session_id; > > >> + u64 seq_num; > > >> + bool has_last; > > >> +}; > > >> + > > >> +struct anon_file_prog_assoc { > > >> + struct list_head list; > > >> + struct file *file; > > >> + struct bpf_prog *prog; > > >> +}; > > >> + > > >> static struct list_head targets; > > >> static struct mutex targets_mutex; > > >> static bool bpf_iter_inited = false; > > >> @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; > > >> /* protect bpf_iter_link.link->prog upddate */ > > >> static struct mutex bpf_iter_mutex; > > >> > > >> +/* Since at anon seq_file release function, the prog cannot > > >> + * be retrieved since target seq_priv_size is not available. > > >> + * Keep a list of <anon_file, prog> mapping, so that > > >> + * at file release stage, the prog can be released properly. 
> > >> + */ > > >> +static struct list_head anon_iter_info; > > >> +static struct mutex anon_iter_info_mutex; > > >> + > > >> +/* incremented on every opened seq_file */ > > >> +static atomic64_t session_id; > > >> + > > >> +static u32 get_total_priv_dsize(u32 old_size) > > >> +{ > > >> + return roundup(old_size, 8) + sizeof(struct extra_priv_data); > > >> +} > > >> + > > >> +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) > > >> +{ > > >> + return old_ptr + roundup(old_size, 8); > > >> +} > > >> + > > >> +static int anon_iter_release(struct inode *inode, struct file *file) > > >> +{ > > >> + struct anon_file_prog_assoc *finfo; > > >> + > > >> + mutex_lock(&anon_iter_info_mutex); > > >> + list_for_each_entry(finfo, &anon_iter_info, list) { > > >> + if (finfo->file == file) { > > > > > > I'll look at this and other patches more thoroughly tomorrow with > > > clear head, but this iteration to find anon_file_prog_assoc is really > > > unfortunate. > > > > > > I think the problem is that you are allowing seq_file infrastructure > > > to call directly into target implementation of seq_operations without > > > intercepting them. If you change that and put whatever extra info is > > > necessary into seq_file->private in front of target's private state, > > > then you shouldn't need this, right? > > > > Yes. This is true. The idea is to minimize the target change. > > But maybe this is not a good goal by itself. > > > > You are right, if I intercept all seq_ops(), I do not need the > > above change, I can tailor seq_file private_data right before > > calling target one and restore after the target call. > > > > Originally I only have one interception, show(), now I have > > stop() too to call bpf at the end of iteration. Maybe I can > > interpret all four, I think. This way, I can also get ride > > of target feature. 
> > If the main goal is to minimize target changes and make them exactly > seq_operations implementation, then one easier way to get easy access > to our own metadata in seq_file->private is to set it to point > **after** our metadata, but before target's metadata. Roughly in > pseudo code: > > struct bpf_iter_seq_file_meta {} __attribute((aligned(8))); > > void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) + > target_private_size); > seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta); I have suggested the same thing earlier. Good to know that we think alike ;) Maybe put them in a struct such that container_of() etc. can be used: struct bpf_iter_private { struct extra_priv_data iter_private; u8 target_private[] __aligned(8); }; > > > Then to recover bpf_iter_seq_file_meta: > > struct bpf_iter_seq_file_meta *meta = seq_file->private - sizeof(*meta); > > /* voila! */ > > This doesn't have a benefit of making targets simpler, but will > require no changes to them at all. Plus less indirect calls, so less > performance penalty. >
On 4/29/20 11:46 AM, Martin KaFai Lau wrote: > On Wed, Apr 29, 2020 at 11:16:35AM -0700, Andrii Nakryiko wrote: >> On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote: >>> >>> >>> >>> On 4/28/20 11:56 PM, Andrii Nakryiko wrote: >>>> On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: >>>>> >>>>> A new bpf command BPF_ITER_CREATE is added. >>>>> >>>>> The anonymous bpf iterator is seq_file based. >>>>> The seq_file private data are referenced by targets. >>>>> The bpf_iter infrastructure allocated additional space >>>>> at seq_file->private after the space used by targets >>>>> to store some meta data, e.g., >>>>> prog: prog to run >>>>> session_id: an unique id for each opened seq_file >>>>> seq_num: how many times bpf programs are queried in this session >>>>> has_last: indicate whether or not bpf_prog has been called after >>>>> all valid objects have been processed >>>>> >>>>> A map between file and prog/link is established to help >>>>> fops->release(). When fops->release() is called, just based on >>>>> inode and file, bpf program cannot be located since target >>>>> seq_priv_size not available. This map helps retrieve the prog >>>>> whose reference count needs to be decremented. 
>>>>> >>>>> Signed-off-by: Yonghong Song <yhs@fb.com> >>>>> --- >>>>> include/linux/bpf.h | 3 + >>>>> include/uapi/linux/bpf.h | 6 ++ >>>>> kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- >>>>> kernel/bpf/syscall.c | 27 ++++++ >>>>> tools/include/uapi/linux/bpf.h | 6 ++ >>>>> 5 files changed, 203 insertions(+), 1 deletion(-) >>>>> >>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>>>> index 4fc39d9b5cd0..0f0cafc65a04 100644 >>>>> --- a/include/linux/bpf.h >>>>> +++ b/include/linux/bpf.h >>>>> @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); >>>>> int bpf_obj_pin_user(u32 ufd, const char __user *pathname); >>>>> int bpf_obj_get_user(const char __user *pathname, int flags); >>>>> >>>>> +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) >>>>> + >>>>> struct bpf_iter_reg { >>>>> const char *target; >>>>> const char *target_func_name; >>>>> @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); >>>>> int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); >>>>> int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, >>>>> struct bpf_prog *new_prog); >>>>> +int bpf_iter_new_fd(struct bpf_link *link); >>>>> >>>>> int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); >>>>> int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); >>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>>>> index f39b9fec37ab..576651110d16 100644 >>>>> --- a/include/uapi/linux/bpf.h >>>>> +++ b/include/uapi/linux/bpf.h >>>>> @@ -113,6 +113,7 @@ enum bpf_cmd { >>>>> BPF_MAP_DELETE_BATCH, >>>>> BPF_LINK_CREATE, >>>>> BPF_LINK_UPDATE, >>>>> + BPF_ITER_CREATE, >>>>> }; >>>>> >>>>> enum bpf_map_type { >>>>> @@ -590,6 +591,11 @@ union bpf_attr { >>>>> __u32 old_prog_fd; >>>>> } link_update; >>>>> >>>>> + struct { /* struct used by BPF_ITER_CREATE command */ >>>>> + __u32 link_fd; >>>>> + __u32 flags; >>>>> + } iter_create; >>>>> + >>>>> } 
__attribute__((aligned(8))); >>>>> >>>>> /* The description below is an attempt at providing documentation to eBPF >>>>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c >>>>> index fc1ce5ee5c3f..1f4e778d1814 100644 >>>>> --- a/kernel/bpf/bpf_iter.c >>>>> +++ b/kernel/bpf/bpf_iter.c >>>>> @@ -2,6 +2,7 @@ >>>>> /* Copyright (c) 2020 Facebook */ >>>>> >>>>> #include <linux/fs.h> >>>>> +#include <linux/anon_inodes.h> >>>>> #include <linux/filter.h> >>>>> #include <linux/bpf.h> >>>>> >>>>> @@ -19,6 +20,19 @@ struct bpf_iter_link { >>>>> struct bpf_iter_target_info *tinfo; >>>>> }; >>>>> >>>>> +struct extra_priv_data { >>>>> + struct bpf_prog *prog; >>>>> + u64 session_id; >>>>> + u64 seq_num; >>>>> + bool has_last; >>>>> +}; >>>>> + >>>>> +struct anon_file_prog_assoc { >>>>> + struct list_head list; >>>>> + struct file *file; >>>>> + struct bpf_prog *prog; >>>>> +}; >>>>> + >>>>> static struct list_head targets; >>>>> static struct mutex targets_mutex; >>>>> static bool bpf_iter_inited = false; >>>>> @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; >>>>> /* protect bpf_iter_link.link->prog upddate */ >>>>> static struct mutex bpf_iter_mutex; >>>>> >>>>> +/* Since at anon seq_file release function, the prog cannot >>>>> + * be retrieved since target seq_priv_size is not available. >>>>> + * Keep a list of <anon_file, prog> mapping, so that >>>>> + * at file release stage, the prog can be released properly. 
>>>>> + */ >>>>> +static struct list_head anon_iter_info; >>>>> +static struct mutex anon_iter_info_mutex; >>>>> + >>>>> +/* incremented on every opened seq_file */ >>>>> +static atomic64_t session_id; >>>>> + >>>>> +static u32 get_total_priv_dsize(u32 old_size) >>>>> +{ >>>>> + return roundup(old_size, 8) + sizeof(struct extra_priv_data); >>>>> +} >>>>> + >>>>> +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) >>>>> +{ >>>>> + return old_ptr + roundup(old_size, 8); >>>>> +} >>>>> + >>>>> +static int anon_iter_release(struct inode *inode, struct file *file) >>>>> +{ >>>>> + struct anon_file_prog_assoc *finfo; >>>>> + >>>>> + mutex_lock(&anon_iter_info_mutex); >>>>> + list_for_each_entry(finfo, &anon_iter_info, list) { >>>>> + if (finfo->file == file) { >>>> >>>> I'll look at this and other patches more thoroughly tomorrow with >>>> clear head, but this iteration to find anon_file_prog_assoc is really >>>> unfortunate. >>>> >>>> I think the problem is that you are allowing seq_file infrastructure >>>> to call directly into target implementation of seq_operations without >>>> intercepting them. If you change that and put whatever extra info is >>>> necessary into seq_file->private in front of target's private state, >>>> then you shouldn't need this, right? >>> >>> Yes. This is true. The idea is to minimize the target change. >>> But maybe this is not a good goal by itself. >>> >>> You are right, if I intercept all seq_ops(), I do not need the >>> above change, I can tailor seq_file private_data right before >>> calling target one and restore after the target call. >>> >>> Originally I only have one interception, show(), now I have >>> stop() too to call bpf at the end of iteration. Maybe I can >>> interpret all four, I think. This way, I can also get ride >>> of target feature. 
>> If the main goal is to minimize target changes and make them exactly >> seq_operations implementation, then one easier way to get easy access >> to our own metadata in seq_file->private is to set it to point >> **after** our metadata, but before target's metadata. Roughly in >> pseudo code: >> >> struct bpf_iter_seq_file_meta {} __attribute((aligned(8))); >> >> void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) + >> target_private_size); >> seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta); > I have suggested the same thing earlier. Good to know that we think alike ;) > > May be put them in a struct such that container_of...etc can be used: > struct bpf_iter_private { > struct extra_priv_data iter_private; > u8 target_private[] __aligned(8); > }; This should work, but we would need to intercept all seq_ops() operations because the target expects its private data to be `target_private` only. Let me experiment to find the best way to do this. > >> >> >> Then to recover bpf_iter_Seq_file_meta: >> >> struct bpf_iter_seq_file_meta *meta = seq_file->private - sizeof(*meta); >> >> /* voila! */ >> >> This doesn't have a benefit of making targets simpler, but will >> require no changes to them at all. Plus less indirect calls, so less >> performance penalty. >>
On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: > > A new bpf command BPF_ITER_CREATE is added. > > The anonymous bpf iterator is seq_file based. > The seq_file private data are referenced by targets. > The bpf_iter infrastructure allocated additional space > at seq_file->private after the space used by targets > to store some meta data, e.g., > prog: prog to run > session_id: an unique id for each opened seq_file > seq_num: how many times bpf programs are queried in this session > has_last: indicate whether or not bpf_prog has been called after > all valid objects have been processed > > A map between file and prog/link is established to help > fops->release(). When fops->release() is called, just based on > inode and file, bpf program cannot be located since target > seq_priv_size not available. This map helps retrieve the prog > whose reference count needs to be decremented. > > Signed-off-by: Yonghong Song <yhs@fb.com> > --- > include/linux/bpf.h | 3 + > include/uapi/linux/bpf.h | 6 ++ > kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- > kernel/bpf/syscall.c | 27 ++++++ > tools/include/uapi/linux/bpf.h | 6 ++ > 5 files changed, 203 insertions(+), 1 deletion(-) > [...] > int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > { > struct bpf_iter_target_info *tinfo; > @@ -37,6 +95,8 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > INIT_LIST_HEAD(&targets); > mutex_init(&targets_mutex); > mutex_init(&bpf_iter_mutex); > + INIT_LIST_HEAD(&anon_iter_info); > + mutex_init(&anon_iter_info_mutex); > bpf_iter_inited = true; > } > > @@ -61,7 +121,20 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size, > u64 *session_id, u64 *seq_num, bool is_last) instead of passing many pointers (session_id, seq_num), would it be better to just pass bpf_iter_meta * instead? 
> { > - return NULL; > + struct extra_priv_data *extra_data; > + > + if (seq->file->f_op != &anon_bpf_iter_fops) > + return NULL; > + > + extra_data = get_extra_priv_dptr(seq->private, priv_data_size); > + if (extra_data->has_last) > + return NULL; > + > + *session_id = extra_data->session_id; > + *seq_num = extra_data->seq_num++; > + extra_data->has_last = is_last; > + > + return extra_data->prog; > } > > int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) > @@ -150,3 +223,90 @@ int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > mutex_unlock(&bpf_iter_mutex); > return ret; > } > + > +static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo, > + struct bpf_prog *prog) > +{ > + struct extra_priv_data *extra_data; > + > + if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE) > + set_seq_net_private((struct seq_net_private *)priv_data, > + current->nsproxy->net_ns); > + > + extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size); > + extra_data->session_id = atomic64_add_return(1, &session_id); > + extra_data->prog = prog; > + extra_data->seq_num = 0; > + extra_data->has_last = false; > +} > + > +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) > +{ > + struct anon_file_prog_assoc *finfo; > + struct bpf_iter_target_info *tinfo; > + struct bpf_prog *prog; > + u32 total_priv_dsize; > + void *priv_data; > + > + finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN); > + if (!finfo) > + return -ENOMEM; > + > + mutex_lock(&bpf_iter_mutex); > + prog = link->link.prog; > + bpf_prog_inc(prog); > + mutex_unlock(&bpf_iter_mutex); > + > + tinfo = link->tinfo; > + total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size); > + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); > + if (!priv_data) { > + bpf_prog_sub(prog, 1); > + kfree(finfo); > + return -ENOMEM; > + } > + > + init_seq_file(priv_data, tinfo, prog); > + > + finfo->file = file; > + finfo->prog = prog; > 
+ > + mutex_lock(&anon_iter_info_mutex); > + list_add(&finfo->list, &anon_iter_info); > + mutex_unlock(&anon_iter_info_mutex); > + return 0; > +} > + > +int bpf_iter_new_fd(struct bpf_link *link) > +{ > + struct file *file; > + int err, fd; > + > + if (link->ops != &bpf_iter_link_lops) > + return -EINVAL; > + > + fd = get_unused_fd_flags(O_CLOEXEC); > + if (fd < 0) > + return fd; > + > + file = anon_inode_getfile("bpf_iter", &anon_bpf_iter_fops, > + NULL, O_CLOEXEC); Shouldn't this anon file be readable and have O_RDONLY flag as well? > + if (IS_ERR(file)) { > + err = PTR_ERR(file); > + goto free_fd; > + } > + > + err = prepare_seq_file(file, > + container_of(link, struct bpf_iter_link, link)); > + if (err) > + goto free_file; > + > + fd_install(fd, file); > + return fd; > + > +free_file: > + fput(file); > +free_fd: > + put_unused_fd(fd); > + return err; > +} > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index b7af4f006f2e..458f7000887a 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -3696,6 +3696,30 @@ static int link_update(union bpf_attr *attr) > return ret; > } > > +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags > + > +static int bpf_iter_create(union bpf_attr *attr) > +{ > + struct bpf_link *link; > + int err; > + > + if (CHECK_ATTR(BPF_ITER_CREATE)) > + return -EINVAL; > + > + if (attr->iter_create.flags) > + return -EINVAL; > + > + link = bpf_link_get_from_fd(attr->iter_create.link_fd); > + if (IS_ERR(link)) > + return PTR_ERR(link); > + > + err = bpf_iter_new_fd(link); > + if (err < 0) > + bpf_link_put(link); bpf_iter_new_fd() doesn't take a refcnt on link, so you need to put it regardless of success or error > + > + return err; > +} > + > SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) > { > union bpf_attr attr; > @@ -3813,6 +3837,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > case BPF_LINK_UPDATE: > err = link_update(&attr); > break; > + 
case BPF_ITER_CREATE: > + err = bpf_iter_create(&attr); > + break; > default: > err = -EINVAL; > break; [...]
On Wed, Apr 29, 2020 at 12:20:05PM -0700, Yonghong Song wrote: > > > On 4/29/20 11:46 AM, Martin KaFai Lau wrote: > > On Wed, Apr 29, 2020 at 11:16:35AM -0700, Andrii Nakryiko wrote: > > > On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote: > > > > > > > > > > > > > > > > On 4/28/20 11:56 PM, Andrii Nakryiko wrote: > > > > > On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: > > > > > > > > > > > > A new bpf command BPF_ITER_CREATE is added. > > > > > > > > > > > > The anonymous bpf iterator is seq_file based. > > > > > > The seq_file private data are referenced by targets. > > > > > > The bpf_iter infrastructure allocated additional space > > > > > > at seq_file->private after the space used by targets > > > > > > to store some meta data, e.g., > > > > > > prog: prog to run > > > > > > session_id: an unique id for each opened seq_file > > > > > > seq_num: how many times bpf programs are queried in this session > > > > > > has_last: indicate whether or not bpf_prog has been called after > > > > > > all valid objects have been processed > > > > > > > > > > > > A map between file and prog/link is established to help > > > > > > fops->release(). When fops->release() is called, just based on > > > > > > inode and file, bpf program cannot be located since target > > > > > > seq_priv_size not available. This map helps retrieve the prog > > > > > > whose reference count needs to be decremented. 
> > > > > > > > > > > > Signed-off-by: Yonghong Song <yhs@fb.com> > > > > > > --- > > > > > > include/linux/bpf.h | 3 + > > > > > > include/uapi/linux/bpf.h | 6 ++ > > > > > > kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- > > > > > > kernel/bpf/syscall.c | 27 ++++++ > > > > > > tools/include/uapi/linux/bpf.h | 6 ++ > > > > > > 5 files changed, 203 insertions(+), 1 deletion(-) > > > > > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > > > index 4fc39d9b5cd0..0f0cafc65a04 100644 > > > > > > --- a/include/linux/bpf.h > > > > > > +++ b/include/linux/bpf.h > > > > > > @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); > > > > > > int bpf_obj_pin_user(u32 ufd, const char __user *pathname); > > > > > > int bpf_obj_get_user(const char __user *pathname, int flags); > > > > > > > > > > > > +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) > > > > > > + > > > > > > struct bpf_iter_reg { > > > > > > const char *target; > > > > > > const char *target_func_name; > > > > > > @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); > > > > > > int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > > > > > > int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > > > > > > struct bpf_prog *new_prog); > > > > > > +int bpf_iter_new_fd(struct bpf_link *link); > > > > > > > > > > > > int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); > > > > > > int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); > > > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > > > > > index f39b9fec37ab..576651110d16 100644 > > > > > > --- a/include/uapi/linux/bpf.h > > > > > > +++ b/include/uapi/linux/bpf.h > > > > > > @@ -113,6 +113,7 @@ enum bpf_cmd { > > > > > > BPF_MAP_DELETE_BATCH, > > > > > > BPF_LINK_CREATE, > > > > > > BPF_LINK_UPDATE, > > > > > > + BPF_ITER_CREATE, > > > > > > }; > > > > > > > > > > > > enum 
bpf_map_type { > > > > > > @@ -590,6 +591,11 @@ union bpf_attr { > > > > > > __u32 old_prog_fd; > > > > > > } link_update; > > > > > > > > > > > > + struct { /* struct used by BPF_ITER_CREATE command */ > > > > > > + __u32 link_fd; > > > > > > + __u32 flags; > > > > > > + } iter_create; > > > > > > + > > > > > > } __attribute__((aligned(8))); > > > > > > > > > > > > /* The description below is an attempt at providing documentation to eBPF > > > > > > diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > > > > > > index fc1ce5ee5c3f..1f4e778d1814 100644 > > > > > > --- a/kernel/bpf/bpf_iter.c > > > > > > +++ b/kernel/bpf/bpf_iter.c > > > > > > @@ -2,6 +2,7 @@ > > > > > > /* Copyright (c) 2020 Facebook */ > > > > > > > > > > > > #include <linux/fs.h> > > > > > > +#include <linux/anon_inodes.h> > > > > > > #include <linux/filter.h> > > > > > > #include <linux/bpf.h> > > > > > > > > > > > > @@ -19,6 +20,19 @@ struct bpf_iter_link { > > > > > > struct bpf_iter_target_info *tinfo; > > > > > > }; > > > > > > > > > > > > +struct extra_priv_data { > > > > > > + struct bpf_prog *prog; > > > > > > + u64 session_id; > > > > > > + u64 seq_num; > > > > > > + bool has_last; > > > > > > +}; > > > > > > + > > > > > > +struct anon_file_prog_assoc { > > > > > > + struct list_head list; > > > > > > + struct file *file; > > > > > > + struct bpf_prog *prog; > > > > > > +}; > > > > > > + > > > > > > static struct list_head targets; > > > > > > static struct mutex targets_mutex; > > > > > > static bool bpf_iter_inited = false; > > > > > > @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; > > > > > > /* protect bpf_iter_link.link->prog upddate */ > > > > > > static struct mutex bpf_iter_mutex; > > > > > > > > > > > > +/* Since at anon seq_file release function, the prog cannot > > > > > > + * be retrieved since target seq_priv_size is not available. 
> > > > > > + * Keep a list of <anon_file, prog> mapping, so that > > > > > > + * at file release stage, the prog can be released properly. > > > > > > + */ > > > > > > +static struct list_head anon_iter_info; > > > > > > +static struct mutex anon_iter_info_mutex; > > > > > > + > > > > > > +/* incremented on every opened seq_file */ > > > > > > +static atomic64_t session_id; > > > > > > + > > > > > > +static u32 get_total_priv_dsize(u32 old_size) > > > > > > +{ > > > > > > + return roundup(old_size, 8) + sizeof(struct extra_priv_data); > > > > > > +} > > > > > > + > > > > > > +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) > > > > > > +{ > > > > > > + return old_ptr + roundup(old_size, 8); > > > > > > +} > > > > > > + > > > > > > +static int anon_iter_release(struct inode *inode, struct file *file) > > > > > > +{ > > > > > > + struct anon_file_prog_assoc *finfo; > > > > > > + > > > > > > + mutex_lock(&anon_iter_info_mutex); > > > > > > + list_for_each_entry(finfo, &anon_iter_info, list) { > > > > > > + if (finfo->file == file) { > > > > > > > > > > I'll look at this and other patches more thoroughly tomorrow with > > > > > clear head, but this iteration to find anon_file_prog_assoc is really > > > > > unfortunate. > > > > > > > > > > I think the problem is that you are allowing seq_file infrastructure > > > > > to call directly into target implementation of seq_operations without > > > > > intercepting them. If you change that and put whatever extra info is > > > > > necessary into seq_file->private in front of target's private state, > > > > > then you shouldn't need this, right? > > > > > > > > Yes. This is true. The idea is to minimize the target change. > > > > But maybe this is not a good goal by itself. > > > > > > > > You are right, if I intercept all seq_ops(), I do not need the > > > > above change, I can tailor seq_file private_data right before > > > > calling target one and restore after the target call. 
> > > > > > > > Originally I only have one interception, show(), now I have > > > > stop() too to call bpf at the end of iteration. Maybe I can > > > > interpret all four, I think. This way, I can also get ride > > > > of target feature. > > > > > > If the main goal is to minimize target changes and make them exactly > > > seq_operations implementation, then one easier way to get easy access > > > to our own metadata in seq_file->private is to set it to point > > > **after** our metadata, but before target's metadata. Roughly in > > > pseudo code: > > > > > > struct bpf_iter_seq_file_meta {} __attribute((aligned(8))); > > > > > > void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) + > > > target_private_size); > > > seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta); > > I have suggested the same thing earlier. Good to know that we think alike ;) > > > > May be put them in a struct such that container_of...etc can be used: > > struct bpf_iter_private { > > struct extra_priv_data iter_private; > > u8 target_private[] __aligned(8); > > }; > > This should work, but need to intercept all seq_ops() operations > because target expects private data is `target_private` only. > Let me experiment what is the best way to do this. As long as "seq_file->private = bpf_iter_private->target_private;" as Andrii also suggested, the existing seq_ops() should not have to be changed or needed to be intercepted because they are only accessing it through seq_file->private. The bpf_iter logic should be the only one needed to access the bpf_iter_private->iter_private and it can be obtained by, e.g. "container_of(seq_file->private, struct bpf_iter_private, target_private)" > > > > > > > > > > > > Then to recover bpf_iter_Seq_file_meta: > > > > > > struct bpf_iter_seq_file_meta *meta = seq_file->private - sizeof(*meta); > > > > > > /* voila! */ > > > > > > This doesn't have a benefit of making targets simpler, but will > > > require no changes to them at all. 
Plus less indirect calls, so less > > > performance penalty. > > >
On 4/29/20 1:50 PM, Martin KaFai Lau wrote: > On Wed, Apr 29, 2020 at 12:20:05PM -0700, Yonghong Song wrote: >> >> >> On 4/29/20 11:46 AM, Martin KaFai Lau wrote: >>> On Wed, Apr 29, 2020 at 11:16:35AM -0700, Andrii Nakryiko wrote: >>>> On Wed, Apr 29, 2020 at 12:07 AM Yonghong Song <yhs@fb.com> wrote: >>>>> >>>>> >>>>> >>>>> On 4/28/20 11:56 PM, Andrii Nakryiko wrote: >>>>>> On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@fb.com> wrote: >>>>>>> >>>>>>> A new bpf command BPF_ITER_CREATE is added. >>>>>>> >>>>>>> The anonymous bpf iterator is seq_file based. >>>>>>> The seq_file private data are referenced by targets. >>>>>>> The bpf_iter infrastructure allocated additional space >>>>>>> at seq_file->private after the space used by targets >>>>>>> to store some meta data, e.g., >>>>>>> prog: prog to run >>>>>>> session_id: an unique id for each opened seq_file >>>>>>> seq_num: how many times bpf programs are queried in this session >>>>>>> has_last: indicate whether or not bpf_prog has been called after >>>>>>> all valid objects have been processed >>>>>>> >>>>>>> A map between file and prog/link is established to help >>>>>>> fops->release(). When fops->release() is called, just based on >>>>>>> inode and file, bpf program cannot be located since target >>>>>>> seq_priv_size not available. This map helps retrieve the prog >>>>>>> whose reference count needs to be decremented. 
>>>>>>> >>>>>>> Signed-off-by: Yonghong Song <yhs@fb.com> >>>>>>> --- >>>>>>> include/linux/bpf.h | 3 + >>>>>>> include/uapi/linux/bpf.h | 6 ++ >>>>>>> kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- >>>>>>> kernel/bpf/syscall.c | 27 ++++++ >>>>>>> tools/include/uapi/linux/bpf.h | 6 ++ >>>>>>> 5 files changed, 203 insertions(+), 1 deletion(-) >>>>>>> >>>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>>>>>> index 4fc39d9b5cd0..0f0cafc65a04 100644 >>>>>>> --- a/include/linux/bpf.h >>>>>>> +++ b/include/linux/bpf.h >>>>>>> @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); >>>>>>> int bpf_obj_pin_user(u32 ufd, const char __user *pathname); >>>>>>> int bpf_obj_get_user(const char __user *pathname, int flags); >>>>>>> >>>>>>> +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) >>>>>>> + >>>>>>> struct bpf_iter_reg { >>>>>>> const char *target; >>>>>>> const char *target_func_name; >>>>>>> @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); >>>>>>> int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); >>>>>>> int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, >>>>>>> struct bpf_prog *new_prog); >>>>>>> +int bpf_iter_new_fd(struct bpf_link *link); >>>>>>> >>>>>>> int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); >>>>>>> int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); >>>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>>>>>> index f39b9fec37ab..576651110d16 100644 >>>>>>> --- a/include/uapi/linux/bpf.h >>>>>>> +++ b/include/uapi/linux/bpf.h >>>>>>> @@ -113,6 +113,7 @@ enum bpf_cmd { >>>>>>> BPF_MAP_DELETE_BATCH, >>>>>>> BPF_LINK_CREATE, >>>>>>> BPF_LINK_UPDATE, >>>>>>> + BPF_ITER_CREATE, >>>>>>> }; >>>>>>> >>>>>>> enum bpf_map_type { >>>>>>> @@ -590,6 +591,11 @@ union bpf_attr { >>>>>>> __u32 old_prog_fd; >>>>>>> } link_update; >>>>>>> >>>>>>> + struct { /* struct used by BPF_ITER_CREATE command 
*/ >>>>>>> + __u32 link_fd; >>>>>>> + __u32 flags; >>>>>>> + } iter_create; >>>>>>> + >>>>>>> } __attribute__((aligned(8))); >>>>>>> >>>>>>> /* The description below is an attempt at providing documentation to eBPF >>>>>>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c >>>>>>> index fc1ce5ee5c3f..1f4e778d1814 100644 >>>>>>> --- a/kernel/bpf/bpf_iter.c >>>>>>> +++ b/kernel/bpf/bpf_iter.c >>>>>>> @@ -2,6 +2,7 @@ >>>>>>> /* Copyright (c) 2020 Facebook */ >>>>>>> >>>>>>> #include <linux/fs.h> >>>>>>> +#include <linux/anon_inodes.h> >>>>>>> #include <linux/filter.h> >>>>>>> #include <linux/bpf.h> >>>>>>> >>>>>>> @@ -19,6 +20,19 @@ struct bpf_iter_link { >>>>>>> struct bpf_iter_target_info *tinfo; >>>>>>> }; >>>>>>> >>>>>>> +struct extra_priv_data { >>>>>>> + struct bpf_prog *prog; >>>>>>> + u64 session_id; >>>>>>> + u64 seq_num; >>>>>>> + bool has_last; >>>>>>> +}; >>>>>>> + >>>>>>> +struct anon_file_prog_assoc { >>>>>>> + struct list_head list; >>>>>>> + struct file *file; >>>>>>> + struct bpf_prog *prog; >>>>>>> +}; >>>>>>> + >>>>>>> static struct list_head targets; >>>>>>> static struct mutex targets_mutex; >>>>>>> static bool bpf_iter_inited = false; >>>>>>> @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; >>>>>>> /* protect bpf_iter_link.link->prog upddate */ >>>>>>> static struct mutex bpf_iter_mutex; >>>>>>> >>>>>>> +/* Since at anon seq_file release function, the prog cannot >>>>>>> + * be retrieved since target seq_priv_size is not available. >>>>>>> + * Keep a list of <anon_file, prog> mapping, so that >>>>>>> + * at file release stage, the prog can be released properly. 
>>>>>>> + */ >>>>>>> +static struct list_head anon_iter_info; >>>>>>> +static struct mutex anon_iter_info_mutex; >>>>>>> + >>>>>>> +/* incremented on every opened seq_file */ >>>>>>> +static atomic64_t session_id; >>>>>>> + >>>>>>> +static u32 get_total_priv_dsize(u32 old_size) >>>>>>> +{ >>>>>>> + return roundup(old_size, 8) + sizeof(struct extra_priv_data); >>>>>>> +} >>>>>>> + >>>>>>> +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) >>>>>>> +{ >>>>>>> + return old_ptr + roundup(old_size, 8); >>>>>>> +} >>>>>>> + >>>>>>> +static int anon_iter_release(struct inode *inode, struct file *file) >>>>>>> +{ >>>>>>> + struct anon_file_prog_assoc *finfo; >>>>>>> + >>>>>>> + mutex_lock(&anon_iter_info_mutex); >>>>>>> + list_for_each_entry(finfo, &anon_iter_info, list) { >>>>>>> + if (finfo->file == file) { >>>>>> >>>>>> I'll look at this and other patches more thoroughly tomorrow with >>>>>> clear head, but this iteration to find anon_file_prog_assoc is really >>>>>> unfortunate. >>>>>> >>>>>> I think the problem is that you are allowing seq_file infrastructure >>>>>> to call directly into target implementation of seq_operations without >>>>>> intercepting them. If you change that and put whatever extra info is >>>>>> necessary into seq_file->private in front of target's private state, >>>>>> then you shouldn't need this, right? >>>>> >>>>> Yes. This is true. The idea is to minimize the target change. >>>>> But maybe this is not a good goal by itself. >>>>> >>>>> You are right, if I intercept all seq_ops(), I do not need the >>>>> above change, I can tailor seq_file private_data right before >>>>> calling target one and restore after the target call. >>>>> >>>>> Originally I only have one interception, show(), now I have >>>>> stop() too to call bpf at the end of iteration. Maybe I can >>>>> interpret all four, I think. This way, I can also get ride >>>>> of target feature. 
>>>> >>>> If the main goal is to minimize target changes and make them exactly >>>> seq_operations implementation, then one easier way to get easy access >>>> to our own metadata in seq_file->private is to set it to point >>>> **after** our metadata, but before target's metadata. Roughly in >>>> pseudo code: >>>> >>>> struct bpf_iter_seq_file_meta {} __attribute((aligned(8))); >>>> >>>> void *meta = kmalloc(sizeof(struct bpf_iter_seq_file_meta) + >>>> target_private_size); >>>> seq_file->private = meta + sizeof(struct bpf_iter_seq_file_meta); >>> I have suggested the same thing earlier. Good to know that we think alike ;) >>> >>> May be put them in a struct such that container_of...etc can be used: >>> struct bpf_iter_private { >>> struct extra_priv_data iter_private; >>> u8 target_private[] __aligned(8); >>> }; >> >> This should work, but need to intercept all seq_ops() operations >> because target expects private data is `target_private` only. >> Let me experiment what is the best way to do this. > As long as "seq_file->private = bpf_iter_private->target_private;" as > Andrii also suggested, the existing seq_ops() should not have to be > changed or needed to be intercepted because they are only > accessing it through seq_file->private. > > The bpf_iter logic should be the only one needed to access the > bpf_iter_private->iter_private and it can be obtained by, e.g. > "container_of(seq_file->private, struct bpf_iter_private, target_private)" Thanks for the explanation! I never thought of using container_of magic here. It indeed works very nicely. > >> >>> >>>> >>>> >>>> Then to recover bpf_iter_Seq_file_meta: >>>> >>>> struct bpf_iter_seq_file_meta *meta = seq_file->private - sizeof(*meta); >>>> >>>> /* voila! */ >>>> >>>> This doesn't have a benefit of making targets simpler, but will >>>> require no changes to them at all. Plus less indirect calls, so less >>>> performance penalty. >>>>
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fc39d9b5cd0..0f0cafc65a04 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1112,6 +1112,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) + struct bpf_iter_reg { const char *target; const char *target_func_name; @@ -1133,6 +1135,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, struct bpf_prog *new_prog); +int bpf_iter_new_fd(struct bpf_link *link); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f39b9fec37ab..576651110d16 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,7 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -590,6 +591,11 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index fc1ce5ee5c3f..1f4e778d1814 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include <linux/fs.h> +#include <linux/anon_inodes.h> #include <linux/filter.h> #include <linux/bpf.h> @@ -19,6 +20,19 @@ struct bpf_iter_link { struct bpf_iter_target_info *tinfo; }; +struct extra_priv_data { + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool has_last; +}; + +struct 
anon_file_prog_assoc { + struct list_head list; + struct file *file; + struct bpf_prog *prog; +}; + static struct list_head targets; static struct mutex targets_mutex; static bool bpf_iter_inited = false; @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; /* protect bpf_iter_link.link->prog upddate */ static struct mutex bpf_iter_mutex; +/* Since at anon seq_file release function, the prog cannot + * be retrieved since target seq_priv_size is not available. + * Keep a list of <anon_file, prog> mapping, so that + * at file release stage, the prog can be released properly. + */ +static struct list_head anon_iter_info; +static struct mutex anon_iter_info_mutex; + +/* incremented on every opened seq_file */ +static atomic64_t session_id; + +static u32 get_total_priv_dsize(u32 old_size) +{ + return roundup(old_size, 8) + sizeof(struct extra_priv_data); +} + +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) +{ + return old_ptr + roundup(old_size, 8); +} + +static int anon_iter_release(struct inode *inode, struct file *file) +{ + struct anon_file_prog_assoc *finfo; + + mutex_lock(&anon_iter_info_mutex); + list_for_each_entry(finfo, &anon_iter_info, list) { + if (finfo->file == file) { + bpf_prog_put(finfo->prog); + list_del(&finfo->list); + kfree(finfo); + break; + } + } + mutex_unlock(&anon_iter_info_mutex); + + return seq_release_private(inode, file); +} + +static const struct file_operations anon_bpf_iter_fops = { + .read = seq_read, + .release = anon_iter_release, +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -37,6 +95,8 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) INIT_LIST_HEAD(&targets); mutex_init(&targets_mutex); mutex_init(&bpf_iter_mutex); + INIT_LIST_HEAD(&anon_iter_info); + mutex_init(&anon_iter_info_mutex); bpf_iter_inited = true; } @@ -61,7 +121,20 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 
priv_data_size, u64 *session_id, u64 *seq_num, bool is_last) { - return NULL; + struct extra_priv_data *extra_data; + + if (seq->file->f_op != &anon_bpf_iter_fops) + return NULL; + + extra_data = get_extra_priv_dptr(seq->private, priv_data_size); + if (extra_data->has_last) + return NULL; + + *session_id = extra_data->session_id; + *seq_num = extra_data->seq_num++; + extra_data->has_last = is_last; + + return extra_data->prog; } int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) @@ -150,3 +223,90 @@ int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, mutex_unlock(&bpf_iter_mutex); return ret; } + +static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + struct extra_priv_data *extra_data; + + if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE) + set_seq_net_private((struct seq_net_private *)priv_data, + current->nsproxy->net_ns); + + extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size); + extra_data->session_id = atomic64_add_return(1, &session_id); + extra_data->prog = prog; + extra_data->seq_num = 0; + extra_data->has_last = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct anon_file_prog_assoc *finfo; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + void *priv_data; + + finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN); + if (!finfo) + return -ENOMEM; + + mutex_lock(&bpf_iter_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&bpf_iter_mutex); + + tinfo = link->tinfo; + total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size); + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + if (!priv_data) { + bpf_prog_sub(prog, 1); + kfree(finfo); + return -ENOMEM; + } + + init_seq_file(priv_data, tinfo, prog); + + finfo->file = file; + finfo->prog = prog; + + mutex_lock(&anon_iter_info_mutex); + list_add(&finfo->list, &anon_iter_info); 
+ mutex_unlock(&anon_iter_info_mutex); + return 0; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &anon_bpf_iter_fops, + NULL, O_CLOEXEC); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b7af4f006f2e..458f7000887a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3696,6 +3696,30 @@ static int link_update(union bpf_attr *attr) return ret; } +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + if (err < 0) + bpf_link_put(link); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3813,6 +3837,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_UPDATE: err = link_update(&attr); break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f39b9fec37ab..576651110d16 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -113,6 +113,7 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_ITER_CREATE, }; enum 
bpf_map_type { @@ -590,6 +591,11 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF
A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data are referenced by targets. The bpf_iter infrastructure allocates additional space at seq_file->private after the space used by targets to store some meta data, e.g., prog: prog to run session_id: a unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session has_last: indicate whether or not bpf_prog has been called after all valid objects have been processed A map between file and prog/link is established to help fops->release(). When fops->release() is called, just based on inode and file, bpf program cannot be located since target seq_priv_size is not available. This map helps retrieve the prog whose reference count needs to be decremented. Signed-off-by: Yonghong Song <yhs@fb.com> --- include/linux/bpf.h | 3 + include/uapi/linux/bpf.h | 6 ++ kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 27 ++++++ tools/include/uapi/linux/bpf.h | 6 ++ 5 files changed, 203 insertions(+), 1 deletion(-)