diff mbox series

[bpf-next,v1,08/19] bpf: create file bpf iterator

Message ID 20200427201244.2995241-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: implement bpf iterator for kernel data | expand

Commit Message

Yonghong Song April 27, 2020, 8:12 p.m. UTC
A new obj type BPF_TYPE_ITER is added to bpffs.
To produce a file bpf iterator, the fd must be
corresponding to a link_fd associated with a
trace/iter program. When the pinned file is
opened, a seq_file will be generated.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  3 +++
 kernel/bpf/bpf_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/inode.c    | 28 +++++++++++++++++++++++++
 kernel/bpf/syscall.c  |  2 +-
 4 files changed, 79 insertions(+), 2 deletions(-)

Comments

Andrii Nakryiko April 29, 2020, 8:40 p.m. UTC | #1
On Mon, Apr 27, 2020 at 1:18 PM Yonghong Song <yhs@fb.com> wrote:
>
> A new obj type BPF_TYPE_ITER is added to bpffs.
> To produce a file bpf iterator, the fd must be
> corresponding to a link_fd associated with a
> trace/iter program. When the pinned file is
> opened, a seq_file will be generated.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h   |  3 +++
>  kernel/bpf/bpf_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++-
>  kernel/bpf/inode.c    | 28 +++++++++++++++++++++++++
>  kernel/bpf/syscall.c  |  2 +-
>  4 files changed, 79 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0f0cafc65a04..601b3299b7e4 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1021,6 +1021,8 @@ static inline void bpf_enable_instrumentation(void)
>
>  extern const struct file_operations bpf_map_fops;
>  extern const struct file_operations bpf_prog_fops;
> +extern const struct file_operations bpf_link_fops;
> +extern const struct file_operations bpffs_iter_fops;
>
>  #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
>         extern const struct bpf_prog_ops _name ## _prog_ops; \
> @@ -1136,6 +1138,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
>  int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
>                           struct bpf_prog *new_prog);
>  int bpf_iter_new_fd(struct bpf_link *link);
> +void *bpf_iter_get_from_fd(u32 ufd);
>
>  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
>  int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 1f4e778d1814..f5e933236996 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -123,7 +123,8 @@ struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
>  {
>         struct extra_priv_data *extra_data;
>
> -       if (seq->file->f_op != &anon_bpf_iter_fops)
> +       if (seq->file->f_op != &anon_bpf_iter_fops &&
> +           seq->file->f_op != &bpffs_iter_fops)

Do we really need anon_bpf_iter_fops and bpffs_iter_fops? Seems like
the only difference is bpffs_iter_open. Could it be implemented as
part of anon_bpf_iter_fops as well? Seems like open() is never called
for anon_inode_file, so it should work for both?

>                 return NULL;
>
>         extra_data = get_extra_priv_dptr(seq->private, priv_data_size);
> @@ -310,3 +311,48 @@ int bpf_iter_new_fd(struct bpf_link *link)
>         put_unused_fd(fd);
>         return err;
>  }
> +
> +static int bpffs_iter_open(struct inode *inode, struct file *file)
> +{
> +       struct bpf_iter_link *link = inode->i_private;
> +
> +       return prepare_seq_file(file, link);
> +}
> +
> +static int bpffs_iter_release(struct inode *inode, struct file *file)
> +{
> +       return anon_iter_release(inode, file);
> +}
> +
> +const struct file_operations bpffs_iter_fops = {
> +       .open           = bpffs_iter_open,
> +       .read           = seq_read,
> +       .release        = bpffs_iter_release,
> +};
> +
> +void *bpf_iter_get_from_fd(u32 ufd)

return struct bpf_iter_link * here, given this is specific constructor
for bpf_iter_link?

> +{
> +       struct bpf_link *link;
> +       struct bpf_prog *prog;
> +       struct fd f;
> +
> +       f = fdget(ufd);
> +       if (!f.file)
> +               return ERR_PTR(-EBADF);
> +       if (f.file->f_op != &bpf_link_fops) {
> +               link = ERR_PTR(-EINVAL);
> +               goto out;
> +       }
> +
> +       link = f.file->private_data;
> +       prog = link->prog;
> +       if (prog->expected_attach_type != BPF_TRACE_ITER) {
> +               link = ERR_PTR(-EINVAL);
> +               goto out;
> +       }
> +
> +       bpf_link_inc(link);
> +out:
> +       fdput(f);
> +       return link;
> +}
> diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
> index 95087d9f4ed3..de4493983a37 100644
> --- a/kernel/bpf/inode.c
> +++ b/kernel/bpf/inode.c
> @@ -26,6 +26,7 @@ enum bpf_type {
>         BPF_TYPE_PROG,
>         BPF_TYPE_MAP,
>         BPF_TYPE_LINK,
> +       BPF_TYPE_ITER,

Adding ITER as an alternative type of pinned object to BPF_TYPE_LINK
seems undesirable. We can allow opening bpf_iter's seq_file by doing
the same trick as is done for bpf_maps, supporting seq_show (see
bpf_mkmap() and bpf_map_support_seq_show()). Do you think we can do
the same here? If we later see that more kinds of links would want to
allow direct open() to create a file with some output from BPF
program, we can generalize this as part of bpf_link infrastructure.
For now having a custom check similar to bpf_map's seems sufficient.

What do you think?

>  };
>
>  static void *bpf_any_get(void *raw, enum bpf_type type)
> @@ -38,6 +39,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
>                 bpf_map_inc_with_uref(raw);
>                 break;
>         case BPF_TYPE_LINK:
> +       case BPF_TYPE_ITER:
>                 bpf_link_inc(raw);
>                 break;
>         default:
> @@ -58,6 +60,7 @@ static void bpf_any_put(void *raw, enum bpf_type type)
>                 bpf_map_put_with_uref(raw);
>                 break;
>         case BPF_TYPE_LINK:
> +       case BPF_TYPE_ITER:
>                 bpf_link_put(raw);
>                 break;
>         default:
> @@ -82,6 +85,15 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
>                 return raw;
>         }
>

[...]
Yonghong Song April 30, 2020, 6:02 p.m. UTC | #2
On 4/29/20 1:40 PM, Andrii Nakryiko wrote:
> On Mon, Apr 27, 2020 at 1:18 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> A new obj type BPF_TYPE_ITER is added to bpffs.
>> To produce a file bpf iterator, the fd must be
>> corresponding to a link_fd associated with a
>> trace/iter program. When the pinned file is
>> opened, a seq_file will be generated.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h   |  3 +++
>>   kernel/bpf/bpf_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++-
>>   kernel/bpf/inode.c    | 28 +++++++++++++++++++++++++
>>   kernel/bpf/syscall.c  |  2 +-
>>   4 files changed, 79 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 0f0cafc65a04..601b3299b7e4 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1021,6 +1021,8 @@ static inline void bpf_enable_instrumentation(void)
>>
>>   extern const struct file_operations bpf_map_fops;
>>   extern const struct file_operations bpf_prog_fops;
>> +extern const struct file_operations bpf_link_fops;
>> +extern const struct file_operations bpffs_iter_fops;
>>
>>   #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
>>          extern const struct bpf_prog_ops _name ## _prog_ops; \
>> @@ -1136,6 +1138,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
>>   int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
>>                            struct bpf_prog *new_prog);
>>   int bpf_iter_new_fd(struct bpf_link *link);
>> +void *bpf_iter_get_from_fd(u32 ufd);
>>
>>   int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
>>   int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
>> index 1f4e778d1814..f5e933236996 100644
>> --- a/kernel/bpf/bpf_iter.c
>> +++ b/kernel/bpf/bpf_iter.c
>> @@ -123,7 +123,8 @@ struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
>>   {
>>          struct extra_priv_data *extra_data;
>>
>> -       if (seq->file->f_op != &anon_bpf_iter_fops)
>> +       if (seq->file->f_op != &anon_bpf_iter_fops &&
>> +           seq->file->f_op != &bpffs_iter_fops)
> 
> Do we really need anon_bpf_iter_fops and bpffs_iter_fops? Seems like
> the only difference is bpffs_iter_open. Could it be implemented as
> part of anon_bpf_iter_fops as well? Seems like open() is never called
> for anon_inode_file, so it should work for both?

Yes, open() will not be used for anon_bpf_iter. I used two
file_operations just for this reason. But I guess, I can
just use one. It won't hurt.

> 
>>                  return NULL;
>>
>>          extra_data = get_extra_priv_dptr(seq->private, priv_data_size);
>> @@ -310,3 +311,48 @@ int bpf_iter_new_fd(struct bpf_link *link)
>>          put_unused_fd(fd);
>>          return err;
>>   }
>> +
>> +static int bpffs_iter_open(struct inode *inode, struct file *file)
>> +{
>> +       struct bpf_iter_link *link = inode->i_private;
>> +
>> +       return prepare_seq_file(file, link);
>> +}
>> +
>> +static int bpffs_iter_release(struct inode *inode, struct file *file)
>> +{
>> +       return anon_iter_release(inode, file);
>> +}
>> +
>> +const struct file_operations bpffs_iter_fops = {
>> +       .open           = bpffs_iter_open,
>> +       .read           = seq_read,
>> +       .release        = bpffs_iter_release,
>> +};
>> +
>> +void *bpf_iter_get_from_fd(u32 ufd)
> 
> return struct bpf_iter_link * here, given this is specific constructor
> for bpf_iter_link?
> 
>> +{
>> +       struct bpf_link *link;
>> +       struct bpf_prog *prog;
>> +       struct fd f;
>> +
>> +       f = fdget(ufd);
>> +       if (!f.file)
>> +               return ERR_PTR(-EBADF);
>> +       if (f.file->f_op != &bpf_link_fops) {
>> +               link = ERR_PTR(-EINVAL);
>> +               goto out;
>> +       }
>> +
>> +       link = f.file->private_data;
>> +       prog = link->prog;
>> +       if (prog->expected_attach_type != BPF_TRACE_ITER) {
>> +               link = ERR_PTR(-EINVAL);
>> +               goto out;
>> +       }
>> +
>> +       bpf_link_inc(link);
>> +out:
>> +       fdput(f);
>> +       return link;
>> +}
>> diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
>> index 95087d9f4ed3..de4493983a37 100644
>> --- a/kernel/bpf/inode.c
>> +++ b/kernel/bpf/inode.c
>> @@ -26,6 +26,7 @@ enum bpf_type {
>>          BPF_TYPE_PROG,
>>          BPF_TYPE_MAP,
>>          BPF_TYPE_LINK,
>> +       BPF_TYPE_ITER,
> 
> Adding ITER as an alternative type of pinned object to BPF_TYPE_LINK
> seems undesirable. We can allow opening bpf_iter's seq_file by doing
> the same trick as is done for bpf_maps, supporting seq_show (see
> bpf_mkmap() and bpf_map_support_seq_show()). Do you think we can do
> the same here? If we later see that more kinds of links would want to
> allow direct open() to create a file with some output from BPF
> program, we can generalize this as part of bpf_link infrastructure.
> For now having a custom check similar to bpf_map's seems sufficient.
> 
> What do you think?

Sounds good. Will use the mechanism similar to bpf_map.

> 
>>   };
>>
>>   static void *bpf_any_get(void *raw, enum bpf_type type)
>> @@ -38,6 +39,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
>>                  bpf_map_inc_with_uref(raw);
>>                  break;
>>          case BPF_TYPE_LINK:
>> +       case BPF_TYPE_ITER:
>>                  bpf_link_inc(raw);
>>                  break;
>>          default:
>> @@ -58,6 +60,7 @@ static void bpf_any_put(void *raw, enum bpf_type type)
>>                  bpf_map_put_with_uref(raw);
>>                  break;
>>          case BPF_TYPE_LINK:
>> +       case BPF_TYPE_ITER:
>>                  bpf_link_put(raw);
>>                  break;
>>          default:
>> @@ -82,6 +85,15 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
>>                  return raw;
>>          }
>>
> 
> [...]
>
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0f0cafc65a04..601b3299b7e4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1021,6 +1021,8 @@  static inline void bpf_enable_instrumentation(void)
 
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
+extern const struct file_operations bpf_link_fops;
+extern const struct file_operations bpffs_iter_fops;
 
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
@@ -1136,6 +1138,7 @@  int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog,
 			  struct bpf_prog *new_prog);
 int bpf_iter_new_fd(struct bpf_link *link);
+void *bpf_iter_get_from_fd(u32 ufd);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 1f4e778d1814..f5e933236996 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -123,7 +123,8 @@  struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
 {
 	struct extra_priv_data *extra_data;
 
-	if (seq->file->f_op != &anon_bpf_iter_fops)
+	if (seq->file->f_op != &anon_bpf_iter_fops &&
+	    seq->file->f_op != &bpffs_iter_fops)
 		return NULL;
 
 	extra_data = get_extra_priv_dptr(seq->private, priv_data_size);
@@ -310,3 +311,48 @@  int bpf_iter_new_fd(struct bpf_link *link)
 	put_unused_fd(fd);
 	return err;
 }
+
+static int bpffs_iter_open(struct inode *inode, struct file *file)
+{
+	struct bpf_iter_link *link = inode->i_private;
+
+	return prepare_seq_file(file, link);
+}
+
+static int bpffs_iter_release(struct inode *inode, struct file *file)
+{
+	return anon_iter_release(inode, file);
+}
+
+const struct file_operations bpffs_iter_fops = {
+	.open		= bpffs_iter_open,
+	.read		= seq_read,
+	.release	= bpffs_iter_release,
+};
+
+void *bpf_iter_get_from_fd(u32 ufd)
+{
+	struct bpf_link *link;
+	struct bpf_prog *prog;
+	struct fd f;
+
+	f = fdget(ufd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+	if (f.file->f_op != &bpf_link_fops) {
+		link = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	link = f.file->private_data;
+	prog = link->prog;
+	if (prog->expected_attach_type != BPF_TRACE_ITER) {
+		link = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	bpf_link_inc(link);
+out:
+	fdput(f);
+	return link;
+}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 95087d9f4ed3..de4493983a37 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -26,6 +26,7 @@  enum bpf_type {
 	BPF_TYPE_PROG,
 	BPF_TYPE_MAP,
 	BPF_TYPE_LINK,
+	BPF_TYPE_ITER,
 };
 
 static void *bpf_any_get(void *raw, enum bpf_type type)
@@ -38,6 +39,7 @@  static void *bpf_any_get(void *raw, enum bpf_type type)
 		bpf_map_inc_with_uref(raw);
 		break;
 	case BPF_TYPE_LINK:
+	case BPF_TYPE_ITER:
 		bpf_link_inc(raw);
 		break;
 	default:
@@ -58,6 +60,7 @@  static void bpf_any_put(void *raw, enum bpf_type type)
 		bpf_map_put_with_uref(raw);
 		break;
 	case BPF_TYPE_LINK:
+	case BPF_TYPE_ITER:
 		bpf_link_put(raw);
 		break;
 	default:
@@ -82,6 +85,15 @@  static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
 		return raw;
 	}
 
+	/* check bpf_iter before bpf_link as
+	 * ufd is also a link.
+	 */
+	raw = bpf_iter_get_from_fd(ufd);
+	if (!IS_ERR(raw)) {
+		*type = BPF_TYPE_ITER;
+		return raw;
+	}
+
 	raw = bpf_link_get_from_fd(ufd);
 	if (!IS_ERR(raw)) {
 		*type = BPF_TYPE_LINK;
@@ -96,6 +108,7 @@  static const struct inode_operations bpf_dir_iops;
 static const struct inode_operations bpf_prog_iops = { };
 static const struct inode_operations bpf_map_iops  = { };
 static const struct inode_operations bpf_link_iops  = { };
+static const struct inode_operations bpf_iter_iops  = { };
 
 static struct inode *bpf_get_inode(struct super_block *sb,
 				   const struct inode *dir,
@@ -135,6 +148,8 @@  static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
 		*type = BPF_TYPE_MAP;
 	else if (inode->i_op == &bpf_link_iops)
 		*type = BPF_TYPE_LINK;
+	else if (inode->i_op == &bpf_iter_iops)
+		*type = BPF_TYPE_ITER;
 	else
 		return -EACCES;
 
@@ -362,6 +377,12 @@  static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
 			     &bpffs_obj_fops);
 }
 
+static int bpf_mkiter(struct dentry *dentry, umode_t mode, void *arg)
+{
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_iter_iops,
+			     &bpffs_iter_fops);
+}
+
 static struct dentry *
 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 {
@@ -441,6 +462,9 @@  static int bpf_obj_do_pin(const char __user *pathname, void *raw,
 	case BPF_TYPE_LINK:
 		ret = vfs_mkobj(dentry, mode, bpf_mklink, raw);
 		break;
+	case BPF_TYPE_ITER:
+		ret = vfs_mkobj(dentry, mode, bpf_mkiter, raw);
+		break;
 	default:
 		ret = -EPERM;
 	}
@@ -519,6 +543,8 @@  int bpf_obj_get_user(const char __user *pathname, int flags)
 		ret = bpf_map_new_fd(raw, f_flags);
 	else if (type == BPF_TYPE_LINK)
 		ret = bpf_link_new_fd(raw);
+	else if (type == BPF_TYPE_ITER)
+		ret = bpf_iter_new_fd(raw);
 	else
 		return -ENOENT;
 
@@ -538,6 +564,8 @@  static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type
 		return ERR_PTR(-EINVAL);
 	if (inode->i_op == &bpf_link_iops)
 		return ERR_PTR(-EINVAL);
+	if (inode->i_op == &bpf_iter_iops)
+		return ERR_PTR(-EINVAL);
 	if (inode->i_op != &bpf_prog_iops)
 		return ERR_PTR(-EACCES);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 458f7000887a..e9ca5fbe8723 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2285,7 +2285,7 @@  static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
-static const struct file_operations bpf_link_fops = {
+const struct file_operations bpf_link_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_link_show_fdinfo,
 #endif