diff mbox series

[bpf-next,v2,06/20] bpf: create anonymous bpf iterator

Message ID 20200504062553.2047848-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: implement bpf iterator for kernel data | expand

Commit Message

Yonghong Song May 4, 2020, 6:25 a.m. UTC
A new bpf command BPF_ITER_CREATE is added.

The anonymous bpf iterator is seq_file based.
The seq_file private data are referenced by targets.
The bpf_iter infrastructure allocates additional space
at seq_file->private before the space used by targets
to store some meta data, e.g.,
  prog:       prog to run
  session_id: a unique id for each opened seq_file
  seq_num:    how many times bpf programs are queried in this session
  do_stop:    an internal state to decide whether bpf program
              should be called in seq_ops->stop() or not

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |   1 +
 include/uapi/linux/bpf.h       |   6 ++
 kernel/bpf/bpf_iter.c          | 128 +++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  26 +++++++
 tools/include/uapi/linux/bpf.h |   6 ++
 5 files changed, 167 insertions(+)

Comments

Andrii Nakryiko May 5, 2020, 8:11 p.m. UTC | #1
On Sun, May 3, 2020 at 11:29 PM Yonghong Song <yhs@fb.com> wrote:
>
> A new bpf command BPF_ITER_CREATE is added.
>
> The anonymous bpf iterator is seq_file based.
> The seq_file private data are referenced by targets.
> The bpf_iter infrastructure allocates additional space
> at seq_file->private before the space used by targets
> to store some meta data, e.g.,
>   prog:       prog to run
>   session_id: a unique id for each opened seq_file
>   seq_num:    how many times bpf programs are queried in this session
>   do_stop:    an internal state to decide whether bpf program
>               should be called in seq_ops->stop() or not
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |   1 +
>  include/uapi/linux/bpf.h       |   6 ++
>  kernel/bpf/bpf_iter.c          | 128 +++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c           |  26 +++++++
>  tools/include/uapi/linux/bpf.h |   6 ++
>  5 files changed, 167 insertions(+)
>

[...]

>  /* The description below is an attempt at providing documentation to eBPF
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 2674c9cbc3dc..2a9f939be6e6 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -2,6 +2,7 @@
>  /* Copyright (c) 2020 Facebook */
>
>  #include <linux/fs.h>
> +#include <linux/anon_inodes.h>
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>
> @@ -20,12 +21,26 @@ struct bpf_iter_link {
>         struct bpf_iter_target_info *tinfo;
>  };
>
> +struct bpf_iter_priv_data {
> +       struct {

nit: anon struct seems unnecessary here? is it just for visual grouping?

> +               struct bpf_iter_target_info *tinfo;
> +               struct bpf_prog *prog;
> +               u64 session_id;
> +               u64 seq_num;
> +               u64 do_stop;
> +       };
> +       u8 target_private[] __aligned(8);
> +};
> +
>  static struct list_head targets = LIST_HEAD_INIT(targets);
>  static DEFINE_MUTEX(targets_mutex);
>
>  /* protect bpf_iter_link changes */
>  static DEFINE_MUTEX(link_mutex);
>
> +/* incremented on every opened seq_file */
> +static atomic64_t session_id;
> +
>  /* bpf_seq_read, a customized and simpler version for bpf iterator.
>   * no_llseek is assumed for this file.
>   * The following are differences from seq_read():
> @@ -154,6 +169,31 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>         goto Done;
>  }
>
> +static int iter_release(struct inode *inode, struct file *file)
> +{
> +       struct bpf_iter_priv_data *iter_priv;
> +       void *file_priv = file->private_data;
> +       struct seq_file *seq;
> +
> +       seq = file_priv;


seq might be NULL, if anon_inode_getfile succeeded, but then
prepare_seq_file failed, so you need to handle that.

Also, file_priv is redundant, assign to seq directly from file->private_data?

> +       iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
> +                                target_private);
> +
> +       if (iter_priv->tinfo->fini_seq_private)
> +               iter_priv->tinfo->fini_seq_private(seq->private);
> +
> +       bpf_prog_put(iter_priv->prog);
> +       seq->private = iter_priv;
> +
> +       return seq_release_private(inode, file);
> +}
> +
> +static const struct file_operations bpf_iter_fops = {
> +       .llseek         = no_llseek,
> +       .read           = bpf_seq_read,
> +       .release        = iter_release,
> +};
> +
>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  {
>         struct bpf_iter_target_info *tinfo;
> @@ -289,3 +329,91 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>
>         return bpf_link_settle(&link_primer);
>  }
> +
> +static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
> +                         struct bpf_iter_target_info *tinfo,
> +                         struct bpf_prog *prog)
> +{
> +       priv_data->tinfo = tinfo;
> +       priv_data->prog = prog;
> +       priv_data->session_id = atomic64_add_return(1, &session_id);

nit: atomic64_inc_return?

> +       priv_data->seq_num = 0;
> +       priv_data->do_stop = 0;
> +}
> +

[...]
Yonghong Song May 5, 2020, 8:28 p.m. UTC | #2
On 5/5/20 1:11 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:29 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> A new bpf command BPF_ITER_CREATE is added.
>>
>> The anonymous bpf iterator is seq_file based.
>> The seq_file private data are referenced by targets.
>> The bpf_iter infrastructure allocates additional space
>> at seq_file->private before the space used by targets
>> to store some meta data, e.g.,
>>    prog:       prog to run
>>    session_id: a unique id for each opened seq_file
>>    seq_num:    how many times bpf programs are queried in this session
>>    do_stop:    an internal state to decide whether bpf program
>>                should be called in seq_ops->stop() or not
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |   1 +
>>   include/uapi/linux/bpf.h       |   6 ++
>>   kernel/bpf/bpf_iter.c          | 128 +++++++++++++++++++++++++++++++++
>>   kernel/bpf/syscall.c           |  26 +++++++
>>   tools/include/uapi/linux/bpf.h |   6 ++
>>   5 files changed, 167 insertions(+)
>>
> 
> [...]
> 
>>   /* The description below is an attempt at providing documentation to eBPF
>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
>> index 2674c9cbc3dc..2a9f939be6e6 100644
>> --- a/kernel/bpf/bpf_iter.c
>> +++ b/kernel/bpf/bpf_iter.c
>> @@ -2,6 +2,7 @@
>>   /* Copyright (c) 2020 Facebook */
>>
>>   #include <linux/fs.h>
>> +#include <linux/anon_inodes.h>
>>   #include <linux/filter.h>
>>   #include <linux/bpf.h>
>>
>> @@ -20,12 +21,26 @@ struct bpf_iter_link {
>>          struct bpf_iter_target_info *tinfo;
>>   };
>>
>> +struct bpf_iter_priv_data {
>> +       struct {
> 
> nit: anon struct seems unnecessary here? is it just for visual grouping?

Yes, this is just for visual grouping. Not 100% sure whether this
is needed or not.

> 
>> +               struct bpf_iter_target_info *tinfo;
>> +               struct bpf_prog *prog;
>> +               u64 session_id;
>> +               u64 seq_num;
>> +               u64 do_stop;
>> +       };
>> +       u8 target_private[] __aligned(8);
>> +};
>> +
>>   static struct list_head targets = LIST_HEAD_INIT(targets);
>>   static DEFINE_MUTEX(targets_mutex);
>>
>>   /* protect bpf_iter_link changes */
>>   static DEFINE_MUTEX(link_mutex);
>>
>> +/* incremented on every opened seq_file */
>> +static atomic64_t session_id;
>> +
>>   /* bpf_seq_read, a customized and simpler version for bpf iterator.
>>    * no_llseek is assumed for this file.
>>    * The following are differences from seq_read():
>> @@ -154,6 +169,31 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>>          goto Done;
>>   }
>>
>> +static int iter_release(struct inode *inode, struct file *file)
>> +{
>> +       struct bpf_iter_priv_data *iter_priv;
>> +       void *file_priv = file->private_data;
>> +       struct seq_file *seq;
>> +
>> +       seq = file_priv;
> 
> 
> seq might be NULL, if anon_inode_getfile succeeded, but then
> prepare_seq_file failed, so you need to handle that.

Thanks for catching this. Missed this case.

> 
> Also, file_priv is redundant, assign to seq directly from file->private_data?

Ack.

> 
>> +       iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
>> +                                target_private);
>> +
>> +       if (iter_priv->tinfo->fini_seq_private)
>> +               iter_priv->tinfo->fini_seq_private(seq->private);
>> +
>> +       bpf_prog_put(iter_priv->prog);
>> +       seq->private = iter_priv;
>> +
>> +       return seq_release_private(inode, file);
>> +}
>> +
>> +static const struct file_operations bpf_iter_fops = {
>> +       .llseek         = no_llseek,
>> +       .read           = bpf_seq_read,
>> +       .release        = iter_release,
>> +};
>> +
>>   int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>>   {
>>          struct bpf_iter_target_info *tinfo;
>> @@ -289,3 +329,91 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>>
>>          return bpf_link_settle(&link_primer);
>>   }
>> +
>> +static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
>> +                         struct bpf_iter_target_info *tinfo,
>> +                         struct bpf_prog *prog)
>> +{
>> +       priv_data->tinfo = tinfo;
>> +       priv_data->prog = prog;
>> +       priv_data->session_id = atomic64_add_return(1, &session_id);
> 
> nit: atomic64_inc_return?

Ack.

> 
>> +       priv_data->seq_num = 0;
>> +       priv_data->do_stop = 0;
>> +}
>> +
> 
> [...]
>
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8621ad080b24..9108d1a9b934 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1143,6 +1143,7 @@  struct bpf_iter_reg {
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int bpf_iter_new_fd(struct bpf_link *link);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2bf33979f9ae..97ceb0f2e539 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@  enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -614,6 +615,11 @@  union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2674c9cbc3dc..2a9f939be6e6 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -2,6 +2,7 @@ 
 /* Copyright (c) 2020 Facebook */
 
 #include <linux/fs.h>
+#include <linux/anon_inodes.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
 
@@ -20,12 +21,26 @@  struct bpf_iter_link {
 	struct bpf_iter_target_info *tinfo;
 };
 
+struct bpf_iter_priv_data {
+	struct {
+		struct bpf_iter_target_info *tinfo;
+		struct bpf_prog *prog;
+		u64 session_id;
+		u64 seq_num;
+		u64 do_stop;
+	};
+	u8 target_private[] __aligned(8);
+};
+
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
 /* protect bpf_iter_link changes */
 static DEFINE_MUTEX(link_mutex);
 
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -154,6 +169,31 @@  static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	goto Done;
 }
 
+static int iter_release(struct inode *inode, struct file *file)
+{
+	struct bpf_iter_priv_data *iter_priv;
+	void *file_priv = file->private_data;
+	struct seq_file *seq;
+
+	seq = file_priv;
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+
+	if (iter_priv->tinfo->fini_seq_private)
+		iter_priv->tinfo->fini_seq_private(seq->private);
+
+	bpf_prog_put(iter_priv->prog);
+	seq->private = iter_priv;
+
+	return seq_release_private(inode, file);
+}
+
+static const struct file_operations bpf_iter_fops = {
+	.llseek		= no_llseek,
+	.read		= bpf_seq_read,
+	.release	= iter_release,
+};
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
@@ -289,3 +329,91 @@  int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	return bpf_link_settle(&link_primer);
 }
+
+static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
+			  struct bpf_iter_target_info *tinfo,
+			  struct bpf_prog *prog)
+{
+	priv_data->tinfo = tinfo;
+	priv_data->prog = prog;
+	priv_data->session_id = atomic64_add_return(1, &session_id);
+	priv_data->seq_num = 0;
+	priv_data->do_stop = 0;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+{
+	struct bpf_iter_priv_data *priv_data;
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_prog *prog;
+	u32 total_priv_dsize;
+	struct seq_file *seq;
+	int err = 0;
+
+	mutex_lock(&link_mutex);
+	prog = link->link.prog;
+	bpf_prog_inc(prog);
+	mutex_unlock(&link_mutex);
+
+	tinfo = link->tinfo;
+	total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
+			   tinfo->seq_priv_size;
+	priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize);
+	if (!priv_data) {
+		err = -ENOMEM;
+		goto release_prog;
+	}
+
+	if (tinfo->init_seq_private) {
+		err = tinfo->init_seq_private(priv_data->target_private);
+		if (err)
+			goto release_seq_file;
+	}
+
+	init_seq_meta(priv_data, tinfo, prog);
+	seq = file->private_data;
+	seq->private = priv_data->target_private;
+
+	return 0;
+
+release_seq_file:
+	seq_release_private(file->f_inode, file);
+release_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+	struct file *file;
+	unsigned int flags;
+	int err, fd;
+
+	if (link->ops != &bpf_iter_link_lops)
+		return -EINVAL;
+
+	flags = O_RDONLY | O_CLOEXEC;
+	fd = get_unused_fd_flags(flags);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto free_fd;
+	}
+
+	err = prepare_seq_file(file,
+			       container_of(link, struct bpf_iter_link, link));
+	if (err)
+		goto free_file;
+
+	fd_install(fd, file);
+	return fd;
+
+free_file:
+	fput(file);
+free_fd:
+	put_unused_fd(fd);
+	return err;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6ffe2d8fb6c7..a293e88ee01a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3941,6 +3941,29 @@  static int bpf_enable_stats(union bpf_attr *attr)
 	return -EINVAL;
 }
 
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+	struct bpf_link *link;
+	int err;
+
+	if (CHECK_ATTR(BPF_ITER_CREATE))
+		return -EINVAL;
+
+	if (attr->iter_create.flags)
+		return -EINVAL;
+
+	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+
+	err = bpf_iter_new_fd(link);
+	bpf_link_put(link);
+
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -4068,6 +4091,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_ENABLE_STATS:
 		err = bpf_enable_stats(&attr);
 		break;
+	case BPF_ITER_CREATE:
+		err = bpf_iter_create(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2bf33979f9ae..97ceb0f2e539 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@  enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -614,6 +615,11 @@  union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF