Message ID | 20200427201244.2995241-1-yhs@fb.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | bpf: implement bpf iterator for kernel data | expand |
On Mon, Apr 27, 2020 at 1:18 PM Yonghong Song <yhs@fb.com> wrote: > > A new obj type BPF_TYPE_ITER is added to bpffs. > To produce a file bpf iterator, the fd must be > corresponding to a link_fd assocciated with a > trace/iter program. When the pinned file is > opened, a seq_file will be generated. > > Signed-off-by: Yonghong Song <yhs@fb.com> > --- > include/linux/bpf.h | 3 +++ > kernel/bpf/bpf_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++- > kernel/bpf/inode.c | 28 +++++++++++++++++++++++++ > kernel/bpf/syscall.c | 2 +- > 4 files changed, 79 insertions(+), 2 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 0f0cafc65a04..601b3299b7e4 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1021,6 +1021,8 @@ static inline void bpf_enable_instrumentation(void) > > extern const struct file_operations bpf_map_fops; > extern const struct file_operations bpf_prog_fops; > +extern const struct file_operations bpf_link_fops; > +extern const struct file_operations bpffs_iter_fops; > > #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ > extern const struct bpf_prog_ops _name ## _prog_ops; \ > @@ -1136,6 +1138,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > struct bpf_prog *new_prog); > int bpf_iter_new_fd(struct bpf_link *link); > +void *bpf_iter_get_from_fd(u32 ufd); > > int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); > int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); > diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > index 1f4e778d1814..f5e933236996 100644 > --- a/kernel/bpf/bpf_iter.c > +++ b/kernel/bpf/bpf_iter.c > @@ -123,7 +123,8 @@ struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size, > { > struct extra_priv_data *extra_data; > > - if (seq->file->f_op != &anon_bpf_iter_fops) > + if (seq->file->f_op != 
&anon_bpf_iter_fops && > + seq->file->f_op != &bpffs_iter_fops) Do we really need anon_bpf_iter_fops and bpffs_iter_fops? Seems like the only difference is bpffs_iter_open. Could it be implemented as part of anon_bpf_iter_ops as well? Seems like open() is never called for anon_inode_file, so it should work for both? > return NULL; > > extra_data = get_extra_priv_dptr(seq->private, priv_data_size); > @@ -310,3 +311,48 @@ int bpf_iter_new_fd(struct bpf_link *link) > put_unused_fd(fd); > return err; > } > + > +static int bpffs_iter_open(struct inode *inode, struct file *file) > +{ > + struct bpf_iter_link *link = inode->i_private; > + > + return prepare_seq_file(file, link); > +} > + > +static int bpffs_iter_release(struct inode *inode, struct file *file) > +{ > + return anon_iter_release(inode, file); > +} > + > +const struct file_operations bpffs_iter_fops = { > + .open = bpffs_iter_open, > + .read = seq_read, > + .release = bpffs_iter_release, > +}; > + > +void *bpf_iter_get_from_fd(u32 ufd) return struct bpf_iter_link * here, given this is specific constructor for bpf_iter_link? > +{ > + struct bpf_link *link; > + struct bpf_prog *prog; > + struct fd f; > + > + f = fdget(ufd); > + if (!f.file) > + return ERR_PTR(-EBADF); > + if (f.file->f_op != &bpf_link_fops) { > + link = ERR_PTR(-EINVAL); > + goto out; > + } > + > + link = f.file->private_data; > + prog = link->prog; > + if (prog->expected_attach_type != BPF_TRACE_ITER) { > + link = ERR_PTR(-EINVAL); > + goto out; > + } > + > + bpf_link_inc(link); > +out: > + fdput(f); > + return link; > +} > diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c > index 95087d9f4ed3..de4493983a37 100644 > --- a/kernel/bpf/inode.c > +++ b/kernel/bpf/inode.c > @@ -26,6 +26,7 @@ enum bpf_type { > BPF_TYPE_PROG, > BPF_TYPE_MAP, > BPF_TYPE_LINK, > + BPF_TYPE_ITER, Adding ITER as an alternative type of pinned object to BPF_TYPE_LINK seems undesirable. 
We can allow opening bpf_iter's seq_file by doing the same trick as is done for bpf_maps, supporting seq_show (see bpf_mkmap() and bpf_map_support_seq_show()). Do you think we can do the same here? If we later see that more kinds of links would want to allow direct open() to create a file with some output from BPF program, we can generalize this as part of bpf_link infrastructure. For now having a custom check similar to bpf_map's seems sufficient. What do you think? > }; > > static void *bpf_any_get(void *raw, enum bpf_type type) > @@ -38,6 +39,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) > bpf_map_inc_with_uref(raw); > break; > case BPF_TYPE_LINK: > + case BPF_TYPE_ITER: > bpf_link_inc(raw); > break; > default: > @@ -58,6 +60,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) > bpf_map_put_with_uref(raw); > break; > case BPF_TYPE_LINK: > + case BPF_TYPE_ITER: > bpf_link_put(raw); > break; > default: > @@ -82,6 +85,15 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) > return raw; > } > [...]
On 4/29/20 1:40 PM, Andrii Nakryiko wrote: > On Mon, Apr 27, 2020 at 1:18 PM Yonghong Song <yhs@fb.com> wrote: >> >> A new obj type BPF_TYPE_ITER is added to bpffs. >> To produce a file bpf iterator, the fd must be >> corresponding to a link_fd assocciated with a >> trace/iter program. When the pinned file is >> opened, a seq_file will be generated. >> >> Signed-off-by: Yonghong Song <yhs@fb.com> >> --- >> include/linux/bpf.h | 3 +++ >> kernel/bpf/bpf_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++- >> kernel/bpf/inode.c | 28 +++++++++++++++++++++++++ >> kernel/bpf/syscall.c | 2 +- >> 4 files changed, 79 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index 0f0cafc65a04..601b3299b7e4 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -1021,6 +1021,8 @@ static inline void bpf_enable_instrumentation(void) >> >> extern const struct file_operations bpf_map_fops; >> extern const struct file_operations bpf_prog_fops; >> +extern const struct file_operations bpf_link_fops; >> +extern const struct file_operations bpffs_iter_fops; >> >> #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ >> extern const struct bpf_prog_ops _name ## _prog_ops; \ >> @@ -1136,6 +1138,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); >> int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, >> struct bpf_prog *new_prog); >> int bpf_iter_new_fd(struct bpf_link *link); >> +void *bpf_iter_get_from_fd(u32 ufd); >> >> int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); >> int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); >> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c >> index 1f4e778d1814..f5e933236996 100644 >> --- a/kernel/bpf/bpf_iter.c >> +++ b/kernel/bpf/bpf_iter.c >> @@ -123,7 +123,8 @@ struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size, >> { >> struct extra_priv_data 
*extra_data; >> >> - if (seq->file->f_op != &anon_bpf_iter_fops) >> + if (seq->file->f_op != &anon_bpf_iter_fops && >> + seq->file->f_op != &bpffs_iter_fops) > > Do we really need anon_bpf_iter_fops and bpffs_iter_fops? Seems like > the only difference is bpffs_iter_open. Could it be implemented as > part of anon_bpf_iter_ops as well? Seems like open() is never called > for anon_inode_file, so it should work for both? Yes, open() will not be used for anon_bpf_iter. I used two file_operations just for this reason. But I guess, I can just use one. It won't hurt. > >> return NULL; >> >> extra_data = get_extra_priv_dptr(seq->private, priv_data_size); >> @@ -310,3 +311,48 @@ int bpf_iter_new_fd(struct bpf_link *link) >> put_unused_fd(fd); >> return err; >> } >> + >> +static int bpffs_iter_open(struct inode *inode, struct file *file) >> +{ >> + struct bpf_iter_link *link = inode->i_private; >> + >> + return prepare_seq_file(file, link); >> +} >> + >> +static int bpffs_iter_release(struct inode *inode, struct file *file) >> +{ >> + return anon_iter_release(inode, file); >> +} >> + >> +const struct file_operations bpffs_iter_fops = { >> + .open = bpffs_iter_open, >> + .read = seq_read, >> + .release = bpffs_iter_release, >> +}; >> + >> +void *bpf_iter_get_from_fd(u32 ufd) > > return struct bpf_iter_link * here, given this is specific constructor > for bpf_iter_link? 
> >> +{ >> + struct bpf_link *link; >> + struct bpf_prog *prog; >> + struct fd f; >> + >> + f = fdget(ufd); >> + if (!f.file) >> + return ERR_PTR(-EBADF); >> + if (f.file->f_op != &bpf_link_fops) { >> + link = ERR_PTR(-EINVAL); >> + goto out; >> + } >> + >> + link = f.file->private_data; >> + prog = link->prog; >> + if (prog->expected_attach_type != BPF_TRACE_ITER) { >> + link = ERR_PTR(-EINVAL); >> + goto out; >> + } >> + >> + bpf_link_inc(link); >> +out: >> + fdput(f); >> + return link; >> +} >> diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c >> index 95087d9f4ed3..de4493983a37 100644 >> --- a/kernel/bpf/inode.c >> +++ b/kernel/bpf/inode.c >> @@ -26,6 +26,7 @@ enum bpf_type { >> BPF_TYPE_PROG, >> BPF_TYPE_MAP, >> BPF_TYPE_LINK, >> + BPF_TYPE_ITER, > > Adding ITER as an alternative type of pinned object to BPF_TYPE_LINK > seems undesirable. We can allow opening bpf_iter's seq_file by doing > the same trick as is done for bpf_maps, supporting seq_show (see > bpf_mkmap() and bpf_map_support_seq_show()). Do you think we can do > the same here? If we later see that more kinds of links would want to > allow direct open() to create a file with some output from BPF > program, we can generalize this as part of bpf_link infrastructure. > For now having a custom check similar to bpf_map's seems sufficient. > > What do you think? Sounds good. Will use the mechanism similar to bpf_map. 
> >> }; >> >> static void *bpf_any_get(void *raw, enum bpf_type type) >> @@ -38,6 +39,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) >> bpf_map_inc_with_uref(raw); >> break; >> case BPF_TYPE_LINK: >> + case BPF_TYPE_ITER: >> bpf_link_inc(raw); >> break; >> default: >> @@ -58,6 +60,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) >> bpf_map_put_with_uref(raw); >> break; >> case BPF_TYPE_LINK: >> + case BPF_TYPE_ITER: >> bpf_link_put(raw); >> break; >> default: >> @@ -82,6 +85,15 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) >> return raw; >> } >> > > [...] >
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f0cafc65a04..601b3299b7e4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1021,6 +1021,8 @@ static inline void bpf_enable_instrumentation(void) extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; +extern const struct file_operations bpf_link_fops; +extern const struct file_operations bpffs_iter_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ @@ -1136,6 +1138,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, struct bpf_prog *new_prog); int bpf_iter_new_fd(struct bpf_link *link); +void *bpf_iter_get_from_fd(u32 ufd); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 1f4e778d1814..f5e933236996 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -123,7 +123,8 @@ struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size, { struct extra_priv_data *extra_data; - if (seq->file->f_op != &anon_bpf_iter_fops) + if (seq->file->f_op != &anon_bpf_iter_fops && + seq->file->f_op != &bpffs_iter_fops) return NULL; extra_data = get_extra_priv_dptr(seq->private, priv_data_size); @@ -310,3 +311,48 @@ int bpf_iter_new_fd(struct bpf_link *link) put_unused_fd(fd); return err; } + +static int bpffs_iter_open(struct inode *inode, struct file *file) +{ + struct bpf_iter_link *link = inode->i_private; + + return prepare_seq_file(file, link); +} + +static int bpffs_iter_release(struct inode *inode, struct file *file) +{ + return anon_iter_release(inode, file); +} + +const struct file_operations bpffs_iter_fops = { + .open = bpffs_iter_open, + .read = seq_read, + .release = bpffs_iter_release, +}; + 
+void *bpf_iter_get_from_fd(u32 ufd) +{ + struct bpf_link *link; + struct bpf_prog *prog; + struct fd f; + + f = fdget(ufd); + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_link_fops) { + link = ERR_PTR(-EINVAL); + goto out; + } + + link = f.file->private_data; + prog = link->prog; + if (prog->expected_attach_type != BPF_TRACE_ITER) { + link = ERR_PTR(-EINVAL); + goto out; + } + + bpf_link_inc(link); +out: + fdput(f); + return link; +} diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 95087d9f4ed3..de4493983a37 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -26,6 +26,7 @@ enum bpf_type { BPF_TYPE_PROG, BPF_TYPE_MAP, BPF_TYPE_LINK, + BPF_TYPE_ITER, }; static void *bpf_any_get(void *raw, enum bpf_type type) @@ -38,6 +39,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) bpf_map_inc_with_uref(raw); break; case BPF_TYPE_LINK: + case BPF_TYPE_ITER: bpf_link_inc(raw); break; default: @@ -58,6 +60,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) bpf_map_put_with_uref(raw); break; case BPF_TYPE_LINK: + case BPF_TYPE_ITER: bpf_link_put(raw); break; default: @@ -82,6 +85,15 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) return raw; } + /* check bpf_iter before bpf_link as + * ufd is also a link. 
+ */ + raw = bpf_iter_get_from_fd(ufd); + if (!IS_ERR(raw)) { + *type = BPF_TYPE_ITER; + return raw; + } + raw = bpf_link_get_from_fd(ufd); if (!IS_ERR(raw)) { *type = BPF_TYPE_LINK; @@ -96,6 +108,7 @@ static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; static const struct inode_operations bpf_link_iops = { }; +static const struct inode_operations bpf_iter_iops = { }; static struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -135,6 +148,8 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) *type = BPF_TYPE_MAP; else if (inode->i_op == &bpf_link_iops) *type = BPF_TYPE_LINK; + else if (inode->i_op == &bpf_iter_iops) + *type = BPF_TYPE_ITER; else return -EACCES; @@ -362,6 +377,12 @@ static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) &bpffs_obj_fops); } +static int bpf_mkiter(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_iter_iops, + &bpffs_iter_fops); +} + static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { @@ -441,6 +462,9 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, case BPF_TYPE_LINK: ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); break; + case BPF_TYPE_ITER: + ret = vfs_mkobj(dentry, mode, bpf_mkiter, raw); + break; default: ret = -EPERM; } @@ -519,6 +543,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) ret = bpf_map_new_fd(raw, f_flags); else if (type == BPF_TYPE_LINK) ret = bpf_link_new_fd(raw); + else if (type == BPF_TYPE_ITER) + ret = bpf_iter_new_fd(raw); else return -ENOENT; @@ -538,6 +564,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type return ERR_PTR(-EINVAL); if (inode->i_op == &bpf_link_iops) return ERR_PTR(-EINVAL); + if (inode->i_op == &bpf_iter_iops) + return ERR_PTR(-EINVAL); if (inode->i_op != 
&bpf_prog_iops) return ERR_PTR(-EACCES); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 458f7000887a..e9ca5fbe8723 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2285,7 +2285,7 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) } #endif -static const struct file_operations bpf_link_fops = { +const struct file_operations bpf_link_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_link_show_fdinfo, #endif
A new obj type BPF_TYPE_ITER is added to bpffs. To produce a file bpf iterator, the fd must correspond to a link_fd associated with a trace/iter program. When the pinned file is opened, a seq_file will be generated. Signed-off-by: Yonghong Song <yhs@fb.com> --- include/linux/bpf.h | 3 +++ kernel/bpf/bpf_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/inode.c | 28 +++++++++++++++++++++++++ kernel/bpf/syscall.c | 2 +- 4 files changed, 79 insertions(+), 2 deletions(-)