diff mbox series

[bpf-next,v1,03/19] bpf: add bpf_map iterator

Message ID 20200427201237.2994794-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: implement bpf iterator for kernel data | expand

Commit Message

Yonghong Song April 27, 2020, 8:12 p.m. UTC
The bpf_map iterator is implemented.
The bpf program is called at seq_ops show() and stop() functions.
bpf_iter_get_prog() will retrieve bpf program and other
parameters during seq_file object traversal. In show() function,
bpf program will traverse every valid object, and in stop()
function, bpf program will be called one more time after all
objects are traversed.

The first member of the bpf context contains the meta data, namely,
the seq_file, session_id and seq_num. Here, the session_id is
a unique id for one specific seq_file session. The seq_num is
the number of bpf prog invocations in the current session.
The bpf_iter_get_prog(), which will be implemented in subsequent
patches, will have more information on how meta data are computed.

The second member of the bpf context is a struct bpf_map pointer,
which bpf program can examine.

The target implementation also provided the structure definition
for bpf program and the function definition for verifier to
verify the bpf program. Specifically for bpf_map iterator,
the structure is "bpf_iter__bpf_map" andd the function is
"__bpf_iter__bpf_map".

More targets will be implemented later, all of which will include
the following, similar to bpf_map iterator:
  - seq_ops() implementation
  - function definition for verifier to verify the bpf program
  - seq_file private data size
  - additional target feature

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  10 ++++
 kernel/bpf/Makefile   |   2 +-
 kernel/bpf/bpf_iter.c |  19 ++++++++
 kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c  |  13 +++++
 5 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/map_iter.c

Comments

Martin KaFai Lau April 29, 2020, 12:37 a.m. UTC | #1
On Mon, Apr 27, 2020 at 01:12:37PM -0700, Yonghong Song wrote:
> The bpf_map iterator is implemented.
> The bpf program is called at seq_ops show() and stop() functions.
> bpf_iter_get_prog() will retrieve bpf program and other
> parameters during seq_file object traversal. In show() function,
> bpf program will traverse every valid object, and in stop()
> function, bpf program will be called one more time after all
> objects are traversed.
> 
> The first member of the bpf context contains the meta data, namely,
> the seq_file, session_id and seq_num. Here, the session_id is
> a unique id for one specific seq_file session. The seq_num is
> the number of bpf prog invocations in the current session.
> The bpf_iter_get_prog(), which will be implemented in subsequent
> patches, will have more information on how meta data are computed.
> 
> The second member of the bpf context is a struct bpf_map pointer,
> which bpf program can examine.
> 
> The target implementation also provided the structure definition
> for bpf program and the function definition for verifier to
> verify the bpf program. Specifically for bpf_map iterator,
> the structure is "bpf_iter__bpf_map" andd the function is
> "__bpf_iter__bpf_map".
> 
> More targets will be implemented later, all of which will include
> the following, similar to bpf_map iterator:
>   - seq_ops() implementation
>   - function definition for verifier to verify the bpf program
>   - seq_file private data size
>   - additional target feature
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h   |  10 ++++
>  kernel/bpf/Makefile   |   2 +-
>  kernel/bpf/bpf_iter.c |  19 ++++++++
>  kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c  |  13 +++++
>  5 files changed, 150 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/map_iter.c
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5e56abc1e2f1..4ac8d61f7c3e 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1078,6 +1078,7 @@ int  generic_map_update_batch(struct bpf_map *map,
>  int  generic_map_delete_batch(struct bpf_map *map,
>  			      const union bpf_attr *attr,
>  			      union bpf_attr __user *uattr);
> +struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
>  
>  extern int sysctl_unprivileged_bpf_disabled;
>  
> @@ -1118,7 +1119,16 @@ struct bpf_iter_reg {
>  	u32 target_feature;
>  };
>  
> +struct bpf_iter_meta {
> +	__bpf_md_ptr(struct seq_file *, seq);
> +	u64 session_id;
> +	u64 seq_num;
> +};
> +
>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
> +struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
> +				   u64 *session_id, u64 *seq_num, bool is_last);
> +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
>  
>  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
>  int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 6a8b0febd3f6..b2b5eefc5254 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -2,7 +2,7 @@
>  obj-y := core.o
>  CFLAGS_core.o += $(call cc-disable-warning, override-init)
>  
> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 1115b978607a..284c95587803 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -48,3 +48,22 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  
>  	return 0;
>  }
> +
> +struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
> +				   u64 *session_id, u64 *seq_num, bool is_last)
> +{
> +	return NULL;
Can this patch be moved after this function is implemented?

> +}
> +
> +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
> +{
> +	int ret;
> +
> +	migrate_disable();
> +	rcu_read_lock();
> +	ret = BPF_PROG_RUN(prog, ctx);
> +	rcu_read_unlock();
> +	migrate_enable();
> +
> +	return ret;
> +}
> diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
> new file mode 100644
> index 000000000000..bb3ad4c3bde5
> --- /dev/null
> +++ b/kernel/bpf/map_iter.c
> @@ -0,0 +1,107 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2020 Facebook */
> +#include <linux/bpf.h>
> +#include <linux/fs.h>
> +#include <linux/filter.h>
> +#include <linux/kernel.h>
> +
> +struct bpf_iter_seq_map_info {
> +	struct bpf_map *map;
> +	u32 id;
> +};
> +
> +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct bpf_iter_seq_map_info *info = seq->private;
> +	struct bpf_map *map;
> +	u32 id = info->id;
> +
> +	map = bpf_map_get_curr_or_next(&id);
> +	if (IS_ERR_OR_NULL(map))
> +		return NULL;
> +
> +	++*pos;
Does pos always need to be incremented here?

> +	info->map = map;
> +	info->id = id;
> +	return map;
> +}
> +
> +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_iter_seq_map_info *info = seq->private;
> +	struct bpf_map *map;
> +
> +	++*pos;
> +	++info->id;
> +	map = bpf_map_get_curr_or_next(&info->id);
> +	if (IS_ERR_OR_NULL(map))
> +		return NULL;
> +
> +	bpf_map_put(info->map);
> +	info->map = map;
> +	return map;
> +}
> +
> +struct bpf_iter__bpf_map {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct bpf_map *, map);
> +};
> +
> +int __init __bpf_iter__bpf_map(struct bpf_iter_meta *meta, struct bpf_map *map)
> +{
> +	return 0;
> +}
> +
> +static int bpf_map_seq_show(struct seq_file *seq, void *v)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__bpf_map ctx;
> +	struct bpf_prog *prog;
> +	int ret = 0;
> +
> +	ctx.meta = &meta;
> +	ctx.map = v;
> +	meta.seq = seq;
> +	prog = bpf_iter_get_prog(seq, sizeof(struct bpf_iter_seq_map_info),
> +				 &meta.session_id, &meta.seq_num,
> +				 v == (void *)0);
From looking at seq_file.c, when will show() be called with "v == NULL"?

> +	if (prog)
> +		ret = bpf_iter_run_prog(prog, &ctx);
> +
> +	return ret == 0 ? 0 : -EINVAL;
The verifier change in patch 4 should have ensured that prog
can only return 0?

> +}
> +
> +static void bpf_map_seq_stop(struct seq_file *seq, void *v)
> +{
> +	struct bpf_iter_seq_map_info *info = seq->private;
> +
> +	if (!v)
> +		bpf_map_seq_show(seq, v);
> +
> +	if (info->map) {
> +		bpf_map_put(info->map);
> +		info->map = NULL;
> +	}
> +}
> +
> +static const struct seq_operations bpf_map_seq_ops = {
> +	.start	= bpf_map_seq_start,
> +	.next	= bpf_map_seq_next,
> +	.stop	= bpf_map_seq_stop,
> +	.show	= bpf_map_seq_show,
> +};
> +
> +static int __init bpf_map_iter_init(void)
> +{
> +	struct bpf_iter_reg reg_info = {
> +		.target			= "bpf_map",
> +		.target_func_name	= "__bpf_iter__bpf_map",
> +		.seq_ops		= &bpf_map_seq_ops,
> +		.seq_priv_size		= sizeof(struct bpf_iter_seq_map_info),
> +		.target_feature		= 0,
> +	};
> +
> +	return bpf_iter_reg_target(&reg_info);
> +}
> +
> +late_initcall(bpf_map_iter_init);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 7626b8024471..022187640943 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2800,6 +2800,19 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
>  	return err;
>  }
>  
> +struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
> +{
> +	struct bpf_map *map;
> +
> +	spin_lock_bh(&map_idr_lock);
> +	map = idr_get_next(&map_idr, id);
> +	if (map)
> +		map = __bpf_map_inc_not_zero(map, false);
nit. For the !map case, set "map = ERR_PTR(-ENOENT)" so that
the _OR_NULL() test is not needed.  It will be more consistent
with other error checking codes in syscall.c.

> +	spin_unlock_bh(&map_idr_lock);
> +
> +	return map;
> +}
> +
>  #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
>  
>  struct bpf_prog *bpf_prog_by_id(u32 id)
> -- 
> 2.24.1
>
Alexei Starovoitov April 29, 2020, 12:48 a.m. UTC | #2
On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>> +	prog = bpf_iter_get_prog(seq, sizeof(struct bpf_iter_seq_map_info),
>> +				 &meta.session_id, &meta.seq_num,
>> +				 v == (void *)0);
>  From looking at seq_file.c, when will show() be called with "v == NULL"?
> 

that v == NULL here and the whole verifier change just to allow NULL...
may be use seq_num as an indicator of the last elem instead?
Like seq_num with upper bit set to indicate that it's last?
Yonghong Song April 29, 2020, 1:02 a.m. UTC | #3
On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
> On Mon, Apr 27, 2020 at 01:12:37PM -0700, Yonghong Song wrote:
>> The bpf_map iterator is implemented.
>> The bpf program is called at seq_ops show() and stop() functions.
>> bpf_iter_get_prog() will retrieve bpf program and other
>> parameters during seq_file object traversal. In show() function,
>> bpf program will traverse every valid object, and in stop()
>> function, bpf program will be called one more time after all
>> objects are traversed.
>>
>> The first member of the bpf context contains the meta data, namely,
>> the seq_file, session_id and seq_num. Here, the session_id is
>> a unique id for one specific seq_file session. The seq_num is
>> the number of bpf prog invocations in the current session.
>> The bpf_iter_get_prog(), which will be implemented in subsequent
>> patches, will have more information on how meta data are computed.
>>
>> The second member of the bpf context is a struct bpf_map pointer,
>> which bpf program can examine.
>>
>> The target implementation also provided the structure definition
>> for bpf program and the function definition for verifier to
>> verify the bpf program. Specifically for bpf_map iterator,
>> the structure is "bpf_iter__bpf_map" andd the function is
>> "__bpf_iter__bpf_map".
>>
>> More targets will be implemented later, all of which will include
>> the following, similar to bpf_map iterator:
>>    - seq_ops() implementation
>>    - function definition for verifier to verify the bpf program
>>    - seq_file private data size
>>    - additional target feature
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h   |  10 ++++
>>   kernel/bpf/Makefile   |   2 +-
>>   kernel/bpf/bpf_iter.c |  19 ++++++++
>>   kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++
>>   kernel/bpf/syscall.c  |  13 +++++
>>   5 files changed, 150 insertions(+), 1 deletion(-)
>>   create mode 100644 kernel/bpf/map_iter.c
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 5e56abc1e2f1..4ac8d61f7c3e 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1078,6 +1078,7 @@ int  generic_map_update_batch(struct bpf_map *map,
>>   int  generic_map_delete_batch(struct bpf_map *map,
>>   			      const union bpf_attr *attr,
>>   			      union bpf_attr __user *uattr);
>> +struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
>>   
>>   extern int sysctl_unprivileged_bpf_disabled;
>>   
>> @@ -1118,7 +1119,16 @@ struct bpf_iter_reg {
>>   	u32 target_feature;
>>   };
>>   
>> +struct bpf_iter_meta {
>> +	__bpf_md_ptr(struct seq_file *, seq);
>> +	u64 session_id;
>> +	u64 seq_num;
>> +};
>> +
>>   int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
>> +struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
>> +				   u64 *session_id, u64 *seq_num, bool is_last);
>> +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
>>   
>>   int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
>>   int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>> index 6a8b0febd3f6..b2b5eefc5254 100644
>> --- a/kernel/bpf/Makefile
>> +++ b/kernel/bpf/Makefile
>> @@ -2,7 +2,7 @@
>>   obj-y := core.o
>>   CFLAGS_core.o += $(call cc-disable-warning, override-init)
>>   
>> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
>> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
>>   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>>   obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>>   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
>> index 1115b978607a..284c95587803 100644
>> --- a/kernel/bpf/bpf_iter.c
>> +++ b/kernel/bpf/bpf_iter.c
>> @@ -48,3 +48,22 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>>   
>>   	return 0;
>>   }
>> +
>> +struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
>> +				   u64 *session_id, u64 *seq_num, bool is_last)
>> +{
>> +	return NULL;
> Can this patch be moved after this function is implemented?

I tried to have an example on how regristration looks like,
so I put bpf_map iterator implementation patch immediately
after the bpf_iter_reg_target() patch. Unfortunately, I make
the iterator implementation complete and compiler can pass,
I need this function() to be implemented in the above.

I guess I can delay this patch until I can properly
implement it, just like my RFC v2.

> 
>> +}
>> +
>> +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
>> +{
>> +	int ret;
>> +
>> +	migrate_disable();
>> +	rcu_read_lock();
>> +	ret = BPF_PROG_RUN(prog, ctx);
>> +	rcu_read_unlock();
>> +	migrate_enable();
>> +
>> +	return ret;
>> +}
>> diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
>> new file mode 100644
>> index 000000000000..bb3ad4c3bde5
>> --- /dev/null
>> +++ b/kernel/bpf/map_iter.c
>> @@ -0,0 +1,107 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/* Copyright (c) 2020 Facebook */
>> +#include <linux/bpf.h>
>> +#include <linux/fs.h>
>> +#include <linux/filter.h>
>> +#include <linux/kernel.h>
>> +
>> +struct bpf_iter_seq_map_info {
>> +	struct bpf_map *map;
>> +	u32 id;
>> +};
>> +
>> +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
>> +{
>> +	struct bpf_iter_seq_map_info *info = seq->private;
>> +	struct bpf_map *map;
>> +	u32 id = info->id;
>> +
>> +	map = bpf_map_get_curr_or_next(&id);
>> +	if (IS_ERR_OR_NULL(map))
>> +		return NULL;
>> +
>> +	++*pos;
> Does pos always need to be incremented here?

Yes, I skipped passing SEQ_START_TOKEN to show(). Put it another way,
bpf program won't be called for SEQ_START_TOKEN, so I did a shortcut here.

> 
>> +	info->map = map;
>> +	info->id = id;
>> +	return map;
>> +}
>> +
>> +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>> +{
>> +	struct bpf_iter_seq_map_info *info = seq->private;
>> +	struct bpf_map *map;
>> +
>> +	++*pos;
>> +	++info->id;
>> +	map = bpf_map_get_curr_or_next(&info->id);
>> +	if (IS_ERR_OR_NULL(map))
>> +		return NULL;
>> +
>> +	bpf_map_put(info->map);
>> +	info->map = map;
>> +	return map;
>> +}
>> +
>> +struct bpf_iter__bpf_map {
>> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
>> +	__bpf_md_ptr(struct bpf_map *, map);
>> +};
>> +
>> +int __init __bpf_iter__bpf_map(struct bpf_iter_meta *meta, struct bpf_map *map)
>> +{
>> +	return 0;
>> +}
>> +
>> +static int bpf_map_seq_show(struct seq_file *seq, void *v)
>> +{
>> +	struct bpf_iter_meta meta;
>> +	struct bpf_iter__bpf_map ctx;
>> +	struct bpf_prog *prog;
>> +	int ret = 0;
>> +
>> +	ctx.meta = &meta;
>> +	ctx.map = v;
>> +	meta.seq = seq;
>> +	prog = bpf_iter_get_prog(seq, sizeof(struct bpf_iter_seq_map_info),
>> +				 &meta.session_id, &meta.seq_num,
>> +				 v == (void *)0);
>  From looking at seq_file.c, when will show() be called with "v == NULL"?

In the stop() function.

> 
>> +	if (prog)
>> +		ret = bpf_iter_run_prog(prog, &ctx);
>> +
>> +	return ret == 0 ? 0 : -EINVAL;
> The verifier change in patch 4 should have ensured that prog
> can only return 0?

Yes. I forgot to update this after last minutes I added verifier
enforcement. I can do
	if (prog)
		bpf_iter_run_prog(prog, &ctx);

	return 0;

> 
>> +}
>> +
>> +static void bpf_map_seq_stop(struct seq_file *seq, void *v)
>> +{
>> +	struct bpf_iter_seq_map_info *info = seq->private;
>> +
>> +	if (!v)
>> +		bpf_map_seq_show(seq, v);

bpf program for NULL object is called here.

>> +
>> +	if (info->map) {
>> +		bpf_map_put(info->map);
>> +		info->map = NULL;
>> +	}
>> +}
>> +
>> +static const struct seq_operations bpf_map_seq_ops = {
>> +	.start	= bpf_map_seq_start,
>> +	.next	= bpf_map_seq_next,
>> +	.stop	= bpf_map_seq_stop,
>> +	.show	= bpf_map_seq_show,
>> +};
>> +
>> +static int __init bpf_map_iter_init(void)
>> +{
>> +	struct bpf_iter_reg reg_info = {
>> +		.target			= "bpf_map",
>> +		.target_func_name	= "__bpf_iter__bpf_map",
>> +		.seq_ops		= &bpf_map_seq_ops,
>> +		.seq_priv_size		= sizeof(struct bpf_iter_seq_map_info),
>> +		.target_feature		= 0,
>> +	};
>> +
>> +	return bpf_iter_reg_target(&reg_info);
>> +}
>> +
>> +late_initcall(bpf_map_iter_init);
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 7626b8024471..022187640943 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -2800,6 +2800,19 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
>>   	return err;
>>   }
>>   
>> +struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
>> +{
>> +	struct bpf_map *map;
>> +
>> +	spin_lock_bh(&map_idr_lock);
>> +	map = idr_get_next(&map_idr, id);
>> +	if (map)
>> +		map = __bpf_map_inc_not_zero(map, false);
> nit. For the !map case, set "map = ERR_PTR(-ENOENT)" so that
> the _OR_NULL() test is not needed.  It will be more consistent
> with other error checking codes in syscall.c.

Good point, will do that.

> 
>> +	spin_unlock_bh(&map_idr_lock);
>> +
>> +	return map;
>> +}
>> +
>>   #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
>>   
>>   struct bpf_prog *bpf_prog_by_id(u32 id)
>> -- 
>> 2.24.1
>>
Yonghong Song April 29, 2020, 1:15 a.m. UTC | #4
On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct bpf_iter_seq_map_info),
>>> +                 &meta.session_id, &meta.seq_num,
>>> +                 v == (void *)0);
>>  From looking at seq_file.c, when will show() be called with "v == NULL"?
>>
> 
> that v == NULL here and the whole verifier change just to allow NULL...
> may be use seq_num as an indicator of the last elem instead?
> Like seq_num with upper bit set to indicate that it's last?

We could. But then verifier won't have an easy way to verify that.
For example, the above is expected:

      int prog(struct bpf_map *map, u64 seq_num) {
         if (seq_num >> 63)
           return 0;
         ... map->id ...
         ... map->user_cnt ...
      }

But if user writes

      int prog(struct bpf_map *map, u64 seq_num) {
          ... map->id ...
          ... map->user_cnt ...
      }

verifier won't be easy to conclude inproper map pointer tracing
here and in the above map->id, map->user_cnt will cause
exceptions and they will silently get value 0.

I do have another potential use case for this ptr_to_btf_id_or_null,
e.g., for tcp6, instead of pointer casting, I could have bpf_prog
like
     int prog(..., struct tcp6_sock *tcp_sk,
              struct timewait_sock *tw_sk, struct request_sock *req_sk) {
         if (tcp_sk) { /* dump tcp_sk ... */ }
         if (tw_sk) { /* dump tw_sk ... */ }
         if (req_sk) { /* dump req_sk ... */ }
     }
The kernel infrastructure will ensure at any time only one
of tcp_sk/tw_sk/req_sk is valid and the other two is NULL.
Alexei Starovoitov April 29, 2020, 2:44 a.m. UTC | #5
On 4/28/20 6:15 PM, Yonghong Song wrote:
> 
> 
> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct 
>>>> bpf_iter_seq_map_info),
>>>> +                 &meta.session_id, &meta.seq_num,
>>>> +                 v == (void *)0);
>>>  From looking at seq_file.c, when will show() be called with "v == 
>>> NULL"?
>>>
>>
>> that v == NULL here and the whole verifier change just to allow NULL...
>> may be use seq_num as an indicator of the last elem instead?
>> Like seq_num with upper bit set to indicate that it's last?
> 
> We could. But then verifier won't have an easy way to verify that.
> For example, the above is expected:
> 
>       int prog(struct bpf_map *map, u64 seq_num) {
>          if (seq_num >> 63)
>            return 0;
>          ... map->id ...
>          ... map->user_cnt ...
>       }
> 
> But if user writes
> 
>       int prog(struct bpf_map *map, u64 seq_num) {
>           ... map->id ...
>           ... map->user_cnt ...
>       }
> 
> verifier won't be easy to conclude inproper map pointer tracing
> here and in the above map->id, map->user_cnt will cause
> exceptions and they will silently get value 0.

I mean always pass valid object pointer into the prog.
In above case 'map' will always be valid.
Consider prog that iterating all map elements.
It's weird that the prog would always need to do
if (map == 0)
   goto out;
even if it doesn't care about finding last.
All progs would have to have such extra 'if'.
If we always pass valid object than there is no need
for such extra checks inside the prog.
First and last element can be indicated via seq_num
or via another flag or via helper call like is_this_last_elem()
or something.
Yonghong Song April 29, 2020, 5:09 a.m. UTC | #6
On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>
>>
>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct 
>>>>> bpf_iter_seq_map_info),
>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>> +                 v == (void *)0);
>>>>  From looking at seq_file.c, when will show() be called with "v == 
>>>> NULL"?
>>>>
>>>
>>> that v == NULL here and the whole verifier change just to allow NULL...
>>> may be use seq_num as an indicator of the last elem instead?
>>> Like seq_num with upper bit set to indicate that it's last?
>>
>> We could. But then verifier won't have an easy way to verify that.
>> For example, the above is expected:
>>
>>       int prog(struct bpf_map *map, u64 seq_num) {
>>          if (seq_num >> 63)
>>            return 0;
>>          ... map->id ...
>>          ... map->user_cnt ...
>>       }
>>
>> But if user writes
>>
>>       int prog(struct bpf_map *map, u64 seq_num) {
>>           ... map->id ...
>>           ... map->user_cnt ...
>>       }
>>
>> verifier won't be easy to conclude inproper map pointer tracing
>> here and in the above map->id, map->user_cnt will cause
>> exceptions and they will silently get value 0.
> 
> I mean always pass valid object pointer into the prog.
> In above case 'map' will always be valid.
> Consider prog that iterating all map elements.
> It's weird that the prog would always need to do
> if (map == 0)
>    goto out;
> even if it doesn't care about finding last.
> All progs would have to have such extra 'if'.
> If we always pass valid object than there is no need
> for such extra checks inside the prog.
> First and last element can be indicated via seq_num
> or via another flag or via helper call like is_this_last_elem()
> or something.

Okay, I see what you mean now. Basically this means
seq_ops->next() should try to get/maintain next two elements,
otherwise, we won't know whether the one in seq_ops->show()
is the last or not. We could do it in newly implemented
iterator bpf_map/task/task_file. Let me check how I could
make existing seq_ops (ipv6_route/netlink) works with
minimum changes.
Andrii Nakryiko April 29, 2020, 6:04 a.m. UTC | #7
On Mon, Apr 27, 2020 at 1:18 PM Yonghong Song <yhs@fb.com> wrote:
>
> The bpf_map iterator is implemented.
> The bpf program is called at seq_ops show() and stop() functions.
> bpf_iter_get_prog() will retrieve bpf program and other
> parameters during seq_file object traversal. In show() function,
> bpf program will traverse every valid object, and in stop()
> function, bpf program will be called one more time after all
> objects are traversed.
>
> The first member of the bpf context contains the meta data, namely,
> the seq_file, session_id and seq_num. Here, the session_id is
> a unique id for one specific seq_file session. The seq_num is
> the number of bpf prog invocations in the current session.
> The bpf_iter_get_prog(), which will be implemented in subsequent
> patches, will have more information on how meta data are computed.
>
> The second member of the bpf context is a struct bpf_map pointer,
> which bpf program can examine.
>
> The target implementation also provided the structure definition
> for bpf program and the function definition for verifier to
> verify the bpf program. Specifically for bpf_map iterator,
> the structure is "bpf_iter__bpf_map" andd the function is
> "__bpf_iter__bpf_map".
>
> More targets will be implemented later, all of which will include
> the following, similar to bpf_map iterator:
>   - seq_ops() implementation
>   - function definition for verifier to verify the bpf program
>   - seq_file private data size
>   - additional target feature
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h   |  10 ++++
>  kernel/bpf/Makefile   |   2 +-
>  kernel/bpf/bpf_iter.c |  19 ++++++++
>  kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c  |  13 +++++
>  5 files changed, 150 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/map_iter.c
>

[...]

> +static int __init bpf_map_iter_init(void)
> +{
> +       struct bpf_iter_reg reg_info = {
> +               .target                 = "bpf_map",
> +               .target_func_name       = "__bpf_iter__bpf_map",

I wonder if it would be better instead of strings to use a pointer to
a function here. It would preserve __bpf_iter__bpf_map function
without __init, plus it's hard to mistype the name accidentally. In
bpf_iter_reg_target() one would just need to find function in kallsyms
by function address and extract it's name.

Or that would be too much trouble?

> +               .seq_ops                = &bpf_map_seq_ops,
> +               .seq_priv_size          = sizeof(struct bpf_iter_seq_map_info),
> +               .target_feature         = 0,
> +       };
> +
> +       return bpf_iter_reg_target(&reg_info);
> +}
> +
> +late_initcall(bpf_map_iter_init);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 7626b8024471..022187640943 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2800,6 +2800,19 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
>         return err;
>  }
>
> +struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
> +{
> +       struct bpf_map *map;
> +
> +       spin_lock_bh(&map_idr_lock);
> +       map = idr_get_next(&map_idr, id);
> +       if (map)
> +               map = __bpf_map_inc_not_zero(map, false);

When __bpf_map_inc_not_zero return ENOENT, it doesn't mean there are
no more BPF maps, it just means that the current one we got was
already released (or in the process of being released). I think you
need to retry with id+1 in such case, otherwise your iteration might
end prematurely.

> +       spin_unlock_bh(&map_idr_lock);
> +
> +       return map;
> +}
> +
>  #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
>
>  struct bpf_prog *bpf_prog_by_id(u32 id)
> --
> 2.24.1
>
Andrii Nakryiko April 29, 2020, 6:08 a.m. UTC | #8
On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
> > On 4/28/20 6:15 PM, Yonghong Song wrote:
> >>
> >>
> >> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
> >>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
> >>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
> >>>>> bpf_iter_seq_map_info),
> >>>>> +                 &meta.session_id, &meta.seq_num,
> >>>>> +                 v == (void *)0);
> >>>>  From looking at seq_file.c, when will show() be called with "v ==
> >>>> NULL"?
> >>>>
> >>>
> >>> that v == NULL here and the whole verifier change just to allow NULL...
> >>> may be use seq_num as an indicator of the last elem instead?
> >>> Like seq_num with upper bit set to indicate that it's last?
> >>
> >> We could. But then verifier won't have an easy way to verify that.
> >> For example, the above is expected:
> >>
> >>       int prog(struct bpf_map *map, u64 seq_num) {
> >>          if (seq_num >> 63)
> >>            return 0;
> >>          ... map->id ...
> >>          ... map->user_cnt ...
> >>       }
> >>
> >> But if user writes
> >>
> >>       int prog(struct bpf_map *map, u64 seq_num) {
> >>           ... map->id ...
> >>           ... map->user_cnt ...
> >>       }
> >>
> >> verifier won't be easy to conclude inproper map pointer tracing
> >> here and in the above map->id, map->user_cnt will cause
> >> exceptions and they will silently get value 0.
> >
> > I mean always pass valid object pointer into the prog.
> > In above case 'map' will always be valid.
> > Consider prog that iterating all map elements.
> > It's weird that the prog would always need to do
> > if (map == 0)
> >    goto out;
> > even if it doesn't care about finding last.
> > All progs would have to have such extra 'if'.
> > If we always pass valid object than there is no need
> > for such extra checks inside the prog.
> > First and last element can be indicated via seq_num
> > or via another flag or via helper call like is_this_last_elem()
> > or something.
>
> Okay, I see what you mean now. Basically this means
> seq_ops->next() should try to get/maintain next two elements,

What about the case when there are no elements to iterate to begin
with? In that case, we still need to call bpf_prog for (empty)
post-aggregation, but we have no valid element... For bpf_map
iteration we could have fake empty bpf_map that would be passed, but
I'm not sure it's applicable for any time of object (e.g., having a
fake task_struct is probably quite a bit more problematic?)...

> otherwise, we won't know whether the one in seq_ops->show()
> is the last or not. We could do it in newly implemented
> iterator bpf_map/task/task_file. Let me check how I could
> make existing seq_ops (ipv6_route/netlink) works with
> minimum changes.
Yonghong Song April 29, 2020, 6:20 a.m. UTC | #9
On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>>>
>>>>
>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
>>>>>>> bpf_iter_seq_map_info),
>>>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>>>> +                 v == (void *)0);
>>>>>>   From looking at seq_file.c, when will show() be called with "v ==
>>>>>> NULL"?
>>>>>>
>>>>>
>>>>> that v == NULL here and the whole verifier change just to allow NULL...
>>>>> may be use seq_num as an indicator of the last elem instead?
>>>>> Like seq_num with upper bit set to indicate that it's last?
>>>>
>>>> We could. But then verifier won't have an easy way to verify that.
>>>> For example, the above is expected:
>>>>
>>>>        int prog(struct bpf_map *map, u64 seq_num) {
>>>>           if (seq_num >> 63)
>>>>             return 0;
>>>>           ... map->id ...
>>>>           ... map->user_cnt ...
>>>>        }
>>>>
>>>> But if user writes
>>>>
>>>>        int prog(struct bpf_map *map, u64 seq_num) {
>>>>            ... map->id ...
>>>>            ... map->user_cnt ...
>>>>        }
>>>>
>>>> verifier won't be easy to conclude inproper map pointer tracing
>>>> here and in the above map->id, map->user_cnt will cause
>>>> exceptions and they will silently get value 0.
>>>
>>> I mean always pass valid object pointer into the prog.
>>> In above case 'map' will always be valid.
>>> Consider prog that iterating all map elements.
>>> It's weird that the prog would always need to do
>>> if (map == 0)
>>>     goto out;
>>> even if it doesn't care about finding last.
>>> All progs would have to have such extra 'if'.
>>> If we always pass valid object than there is no need
>>> for such extra checks inside the prog.
>>> First and last element can be indicated via seq_num
>>> or via another flag or via helper call like is_this_last_elem()
>>> or something.
>>
>> Okay, I see what you mean now. Basically this means
>> seq_ops->next() should try to get/maintain next two elements,
> 
> What about the case when there are no elements to iterate to begin
> with? In that case, we still need to call bpf_prog for (empty)
> post-aggregation, but we have no valid element... For bpf_map
> iteration we could have fake empty bpf_map that would be passed, but
> I'm not sure it's applicable for any time of object (e.g., having a
> fake task_struct is probably quite a bit more problematic?)...

Oh, yes, thanks for reminding me of this. I put a call to
bpf_prog in seq_ops->stop() especially to handle no object
case. In that case, seq_ops->start() will return NULL,
seq_ops->next() won't be called, and then seq_ops->stop()
is called. My earlier attempt tries to hook with next()
and then find it not working in all cases.

> 
>> otherwise, we won't know whether the one in seq_ops->show()
>> is the last or not. We could do it in newly implemented
>> iterator bpf_map/task/task_file. Let me check how I could
>> make existing seq_ops (ipv6_route/netlink) works with
>> minimum changes.
Alexei Starovoitov April 29, 2020, 6:30 a.m. UTC | #10
On 4/28/20 11:20 PM, Yonghong Song wrote:
> 
> 
> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>>>
>>>
>>>
>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
>>>>>>>> bpf_iter_seq_map_info),
>>>>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>>>>> +                 v == (void *)0);
>>>>>>>   From looking at seq_file.c, when will show() be called with "v ==
>>>>>>> NULL"?
>>>>>>>
>>>>>>
>>>>>> that v == NULL here and the whole verifier change just to allow 
>>>>>> NULL...
>>>>>> may be use seq_num as an indicator of the last elem instead?
>>>>>> Like seq_num with upper bit set to indicate that it's last?
>>>>>
>>>>> We could. But then verifier won't have an easy way to verify that.
>>>>> For example, the above is expected:
>>>>>
>>>>>        int prog(struct bpf_map *map, u64 seq_num) {
>>>>>           if (seq_num >> 63)
>>>>>             return 0;
>>>>>           ... map->id ...
>>>>>           ... map->user_cnt ...
>>>>>        }
>>>>>
>>>>> But if user writes
>>>>>
>>>>>        int prog(struct bpf_map *map, u64 seq_num) {
>>>>>            ... map->id ...
>>>>>            ... map->user_cnt ...
>>>>>        }
>>>>>
>>>>> verifier won't be easy to conclude inproper map pointer tracing
>>>>> here and in the above map->id, map->user_cnt will cause
>>>>> exceptions and they will silently get value 0.
>>>>
>>>> I mean always pass valid object pointer into the prog.
>>>> In above case 'map' will always be valid.
>>>> Consider prog that iterating all map elements.
>>>> It's weird that the prog would always need to do
>>>> if (map == 0)
>>>>     goto out;
>>>> even if it doesn't care about finding last.
>>>> All progs would have to have such extra 'if'.
>>>> If we always pass valid object than there is no need
>>>> for such extra checks inside the prog.
>>>> First and last element can be indicated via seq_num
>>>> or via another flag or via helper call like is_this_last_elem()
>>>> or something.
>>>
>>> Okay, I see what you mean now. Basically this means
>>> seq_ops->next() should try to get/maintain next two elements,
>>
>> What about the case when there are no elements to iterate to begin
>> with? In that case, we still need to call bpf_prog for (empty)
>> post-aggregation, but we have no valid element... For bpf_map
>> iteration we could have fake empty bpf_map that would be passed, but
>> I'm not sure it's applicable for any time of object (e.g., having a
>> fake task_struct is probably quite a bit more problematic?)...
> 
> Oh, yes, thanks for reminding me of this. I put a call to
> bpf_prog in seq_ops->stop() especially to handle no object
> case. In that case, seq_ops->start() will return NULL,
> seq_ops->next() won't be called, and then seq_ops->stop()
> is called. My earlier attempt tries to hook with next()
> and then find it not working in all cases.

wait a sec. seq_ops->stop() is not the end.
With lseek of seq_file it can be called multiple times.
What's the point calling bpf prog with NULL then?
Martin KaFai Lau April 29, 2020, 6:34 a.m. UTC | #11
On Tue, Apr 28, 2020 at 11:20:30PM -0700, Yonghong Song wrote:
> 
> 
> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
> > On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
> > > 
> > > 
> > > 
> > > On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
> > > > On 4/28/20 6:15 PM, Yonghong Song wrote:
> > > > > 
> > > > > 
> > > > > On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
> > > > > > On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
> > > > > > > > +    prog = bpf_iter_get_prog(seq, sizeof(struct
> > > > > > > > bpf_iter_seq_map_info),
> > > > > > > > +                 &meta.session_id, &meta.seq_num,
> > > > > > > > +                 v == (void *)0);
> > > > > > >   From looking at seq_file.c, when will show() be called with "v ==
> > > > > > > NULL"?
> > > > > > > 
> > > > > > 
> > > > > > that v == NULL here and the whole verifier change just to allow NULL...
> > > > > > may be use seq_num as an indicator of the last elem instead?
> > > > > > Like seq_num with upper bit set to indicate that it's last?
> > > > > 
> > > > > We could. But then verifier won't have an easy way to verify that.
> > > > > For example, the above is expected:
> > > > > 
> > > > >        int prog(struct bpf_map *map, u64 seq_num) {
> > > > >           if (seq_num >> 63)
> > > > >             return 0;
> > > > >           ... map->id ...
> > > > >           ... map->user_cnt ...
> > > > >        }
> > > > > 
> > > > > But if user writes
> > > > > 
> > > > >        int prog(struct bpf_map *map, u64 seq_num) {
> > > > >            ... map->id ...
> > > > >            ... map->user_cnt ...
> > > > >        }
> > > > > 
> > > > > verifier won't be easy to conclude inproper map pointer tracing
> > > > > here and in the above map->id, map->user_cnt will cause
> > > > > exceptions and they will silently get value 0.
> > > > 
> > > > I mean always pass valid object pointer into the prog.
> > > > In above case 'map' will always be valid.
> > > > Consider prog that iterating all map elements.
> > > > It's weird that the prog would always need to do
> > > > if (map == 0)
> > > >     goto out;
> > > > even if it doesn't care about finding last.
> > > > All progs would have to have such extra 'if'.
> > > > If we always pass valid object than there is no need
> > > > for such extra checks inside the prog.
> > > > First and last element can be indicated via seq_num
> > > > or via another flag or via helper call like is_this_last_elem()
> > > > or something.
> > > 
> > > Okay, I see what you mean now. Basically this means
> > > seq_ops->next() should try to get/maintain next two elements,
> > 
> > What about the case when there are no elements to iterate to begin
> > with? In that case, we still need to call bpf_prog for (empty)
> > post-aggregation, but we have no valid element... For bpf_map
> > iteration we could have fake empty bpf_map that would be passed, but
> > I'm not sure it's applicable for any time of object (e.g., having a
> > fake task_struct is probably quite a bit more problematic?)...
> 
> Oh, yes, thanks for reminding me of this. I put a call to
> bpf_prog in seq_ops->stop() especially to handle no object
> case. In that case, seq_ops->start() will return NULL,
> seq_ops->next() won't be called, and then seq_ops->stop()
> is called. My earlier attempt tries to hook with next()
> and then find it not working in all cases.
> 
> > 
> > > otherwise, we won't know whether the one in seq_ops->show()
> > > is the last or not. 
I think "show()" is convoluted with "stop()/eof()".  Could "stop()/eof()"
be its own separate (and optional) bpf_prog which only does "stop()/eof()"?

> > > We could do it in newly implemented
> > > iterator bpf_map/task/task_file. Let me check how I could
> > > make existing seq_ops (ipv6_route/netlink) works with
> > > minimum changes.
Andrii Nakryiko April 29, 2020, 6:40 a.m. UTC | #12
On Tue, Apr 28, 2020 at 11:30 PM Alexei Starovoitov <ast@fb.com> wrote:
>
> On 4/28/20 11:20 PM, Yonghong Song wrote:
> >
> >
> > On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
> >> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
> >>>
> >>>
> >>>
> >>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
> >>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
> >>>>>
> >>>>>
> >>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
> >>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
> >>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
> >>>>>>>> bpf_iter_seq_map_info),
> >>>>>>>> +                 &meta.session_id, &meta.seq_num,
> >>>>>>>> +                 v == (void *)0);
> >>>>>>>   From looking at seq_file.c, when will show() be called with "v ==
> >>>>>>> NULL"?
> >>>>>>>
> >>>>>>
> >>>>>> that v == NULL here and the whole verifier change just to allow
> >>>>>> NULL...
> >>>>>> may be use seq_num as an indicator of the last elem instead?
> >>>>>> Like seq_num with upper bit set to indicate that it's last?
> >>>>>
> >>>>> We could. But then verifier won't have an easy way to verify that.
> >>>>> For example, the above is expected:
> >>>>>
> >>>>>        int prog(struct bpf_map *map, u64 seq_num) {
> >>>>>           if (seq_num >> 63)
> >>>>>             return 0;
> >>>>>           ... map->id ...
> >>>>>           ... map->user_cnt ...
> >>>>>        }
> >>>>>
> >>>>> But if user writes
> >>>>>
> >>>>>        int prog(struct bpf_map *map, u64 seq_num) {
> >>>>>            ... map->id ...
> >>>>>            ... map->user_cnt ...
> >>>>>        }
> >>>>>
> >>>>> verifier won't be easy to conclude inproper map pointer tracing
> >>>>> here and in the above map->id, map->user_cnt will cause
> >>>>> exceptions and they will silently get value 0.
> >>>>
> >>>> I mean always pass valid object pointer into the prog.
> >>>> In above case 'map' will always be valid.
> >>>> Consider prog that iterating all map elements.
> >>>> It's weird that the prog would always need to do
> >>>> if (map == 0)
> >>>>     goto out;
> >>>> even if it doesn't care about finding last.
> >>>> All progs would have to have such extra 'if'.
> >>>> If we always pass valid object than there is no need
> >>>> for such extra checks inside the prog.
> >>>> First and last element can be indicated via seq_num
> >>>> or via another flag or via helper call like is_this_last_elem()
> >>>> or something.
> >>>
> >>> Okay, I see what you mean now. Basically this means
> >>> seq_ops->next() should try to get/maintain next two elements,
> >>
> >> What about the case when there are no elements to iterate to begin
> >> with? In that case, we still need to call bpf_prog for (empty)
> >> post-aggregation, but we have no valid element... For bpf_map
> >> iteration we could have fake empty bpf_map that would be passed, but
> >> I'm not sure it's applicable for any time of object (e.g., having a
> >> fake task_struct is probably quite a bit more problematic?)...
> >
> > Oh, yes, thanks for reminding me of this. I put a call to
> > bpf_prog in seq_ops->stop() especially to handle no object
> > case. In that case, seq_ops->start() will return NULL,
> > seq_ops->next() won't be called, and then seq_ops->stop()
> > is called. My earlier attempt tries to hook with next()
> > and then find it not working in all cases.
>
> wait a sec. seq_ops->stop() is not the end.
> With lseek of seq_file it can be called multiple times.

We don't allow seeking on seq_file created from bpf_iter_link, so
there should be no lseek'ing?

> What's the point calling bpf prog with NULL then?

To know that iteration has ended, even if there were 0 elements to
iterate. 0, 1 or N doesn't matter, we might still need to do some
final actions (e.g., submit or print summary).
Yonghong Song April 29, 2020, 6:44 a.m. UTC | #13
On 4/28/20 11:40 PM, Andrii Nakryiko wrote:
> On Tue, Apr 28, 2020 at 11:30 PM Alexei Starovoitov <ast@fb.com> wrote:
>>
>> On 4/28/20 11:20 PM, Yonghong Song wrote:
>>>
>>>
>>> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
>>>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
>>>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
>>>>>>>>>> bpf_iter_seq_map_info),
>>>>>>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>>>>>>> +                 v == (void *)0);
>>>>>>>>>    From looking at seq_file.c, when will show() be called with "v ==
>>>>>>>>> NULL"?
>>>>>>>>>
>>>>>>>>
>>>>>>>> that v == NULL here and the whole verifier change just to allow
>>>>>>>> NULL...
>>>>>>>> may be use seq_num as an indicator of the last elem instead?
>>>>>>>> Like seq_num with upper bit set to indicate that it's last?
>>>>>>>
>>>>>>> We could. But then verifier won't have an easy way to verify that.
>>>>>>> For example, the above is expected:
>>>>>>>
>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>            if (seq_num >> 63)
>>>>>>>              return 0;
>>>>>>>            ... map->id ...
>>>>>>>            ... map->user_cnt ...
>>>>>>>         }
>>>>>>>
>>>>>>> But if user writes
>>>>>>>
>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>             ... map->id ...
>>>>>>>             ... map->user_cnt ...
>>>>>>>         }
>>>>>>>
>>>>>>> verifier won't be easy to conclude inproper map pointer tracing
>>>>>>> here and in the above map->id, map->user_cnt will cause
>>>>>>> exceptions and they will silently get value 0.
>>>>>>
>>>>>> I mean always pass valid object pointer into the prog.
>>>>>> In above case 'map' will always be valid.
>>>>>> Consider prog that iterating all map elements.
>>>>>> It's weird that the prog would always need to do
>>>>>> if (map == 0)
>>>>>>      goto out;
>>>>>> even if it doesn't care about finding last.
>>>>>> All progs would have to have such extra 'if'.
>>>>>> If we always pass valid object than there is no need
>>>>>> for such extra checks inside the prog.
>>>>>> First and last element can be indicated via seq_num
>>>>>> or via another flag or via helper call like is_this_last_elem()
>>>>>> or something.
>>>>>
>>>>> Okay, I see what you mean now. Basically this means
>>>>> seq_ops->next() should try to get/maintain next two elements,
>>>>
>>>> What about the case when there are no elements to iterate to begin
>>>> with? In that case, we still need to call bpf_prog for (empty)
>>>> post-aggregation, but we have no valid element... For bpf_map
>>>> iteration we could have fake empty bpf_map that would be passed, but
>>>> I'm not sure it's applicable for any time of object (e.g., having a
>>>> fake task_struct is probably quite a bit more problematic?)...
>>>
>>> Oh, yes, thanks for reminding me of this. I put a call to
>>> bpf_prog in seq_ops->stop() especially to handle no object
>>> case. In that case, seq_ops->start() will return NULL,
>>> seq_ops->next() won't be called, and then seq_ops->stop()
>>> is called. My earlier attempt tries to hook with next()
>>> and then find it not working in all cases.
>>
>> wait a sec. seq_ops->stop() is not the end.
>> With lseek of seq_file it can be called multiple times.

Yes, I have taken care of this. when the object is NULL,
bpf program will be called. When the object is NULL again,
it won't be called. The private data remembers it has
been called with NULL.

> 
> We don't allow seeking on seq_file created from bpf_iter_link, so
> there should be no lseek'ing?
> 
>> What's the point calling bpf prog with NULL then?
> 
> To know that iteration has ended, even if there were 0 elements to
> iterate. 0, 1 or N doesn't matter, we might still need to do some
> final actions (e.g., submit or print summary).
>
Yonghong Song April 29, 2020, 6:51 a.m. UTC | #14
On 4/28/20 11:34 PM, Martin KaFai Lau wrote:
> On Tue, Apr 28, 2020 at 11:20:30PM -0700, Yonghong Song wrote:
>>
>>
>> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
>>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>>>>
>>>>
>>>>
>>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
>>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>>>>>
>>>>>>
>>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
>>>>>>>>> bpf_iter_seq_map_info),
>>>>>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>>>>>> +                 v == (void *)0);
>>>>>>>>    From looking at seq_file.c, when will show() be called with "v ==
>>>>>>>> NULL"?
>>>>>>>>
>>>>>>>
>>>>>>> that v == NULL here and the whole verifier change just to allow NULL...
>>>>>>> may be use seq_num as an indicator of the last elem instead?
>>>>>>> Like seq_num with upper bit set to indicate that it's last?
>>>>>>
>>>>>> We could. But then verifier won't have an easy way to verify that.
>>>>>> For example, the above is expected:
>>>>>>
>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>            if (seq_num >> 63)
>>>>>>              return 0;
>>>>>>            ... map->id ...
>>>>>>            ... map->user_cnt ...
>>>>>>         }
>>>>>>
>>>>>> But if user writes
>>>>>>
>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>             ... map->id ...
>>>>>>             ... map->user_cnt ...
>>>>>>         }
>>>>>>
>>>>>> verifier won't be easy to conclude inproper map pointer tracing
>>>>>> here and in the above map->id, map->user_cnt will cause
>>>>>> exceptions and they will silently get value 0.
>>>>>
>>>>> I mean always pass valid object pointer into the prog.
>>>>> In above case 'map' will always be valid.
>>>>> Consider prog that iterating all map elements.
>>>>> It's weird that the prog would always need to do
>>>>> if (map == 0)
>>>>>      goto out;
>>>>> even if it doesn't care about finding last.
>>>>> All progs would have to have such extra 'if'.
>>>>> If we always pass valid object than there is no need
>>>>> for such extra checks inside the prog.
>>>>> First and last element can be indicated via seq_num
>>>>> or via another flag or via helper call like is_this_last_elem()
>>>>> or something.
>>>>
>>>> Okay, I see what you mean now. Basically this means
>>>> seq_ops->next() should try to get/maintain next two elements,
>>>
>>> What about the case when there are no elements to iterate to begin
>>> with? In that case, we still need to call bpf_prog for (empty)
>>> post-aggregation, but we have no valid element... For bpf_map
>>> iteration we could have fake empty bpf_map that would be passed, but
>>> I'm not sure it's applicable for any time of object (e.g., having a
>>> fake task_struct is probably quite a bit more problematic?)...
>>
>> Oh, yes, thanks for reminding me of this. I put a call to
>> bpf_prog in seq_ops->stop() especially to handle no object
>> case. In that case, seq_ops->start() will return NULL,
>> seq_ops->next() won't be called, and then seq_ops->stop()
>> is called. My earlier attempt tries to hook with next()
>> and then find it not working in all cases.
>>
>>>
>>>> otherwise, we won't know whether the one in seq_ops->show()
>>>> is the last or not.
> I think "show()" is convoluted with "stop()/eof()".  Could "stop()/eof()"
> be its own separate (and optional) bpf_prog which only does "stop()/eof()"?

I thought this before. But user need to write a program instead of
a simple "if" condition in the main program...

> 
>>>> We could do it in newly implemented
>>>> iterator bpf_map/task/task_file. Let me check how I could
>>>> make existing seq_ops (ipv6_route/netlink) works with
>>>> minimum changes.
Alexei Starovoitov April 29, 2020, 3:34 p.m. UTC | #15
On 4/28/20 11:44 PM, Yonghong Song wrote:
> 
> 
> On 4/28/20 11:40 PM, Andrii Nakryiko wrote:
>> On Tue, Apr 28, 2020 at 11:30 PM Alexei Starovoitov <ast@fb.com> wrote:
>>>
>>> On 4/28/20 11:20 PM, Yonghong Song wrote:
>>>>
>>>>
>>>> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
>>>>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
>>>>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>>>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
>>>>>>>>>>> bpf_iter_seq_map_info),
>>>>>>>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>>>>>>>> +                 v == (void *)0);
>>>>>>>>>>    From looking at seq_file.c, when will show() be called with 
>>>>>>>>>> "v ==
>>>>>>>>>> NULL"?
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> that v == NULL here and the whole verifier change just to allow
>>>>>>>>> NULL...
>>>>>>>>> may be use seq_num as an indicator of the last elem instead?
>>>>>>>>> Like seq_num with upper bit set to indicate that it's last?
>>>>>>>>
>>>>>>>> We could. But then verifier won't have an easy way to verify that.
>>>>>>>> For example, the above is expected:
>>>>>>>>
>>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>>            if (seq_num >> 63)
>>>>>>>>              return 0;
>>>>>>>>            ... map->id ...
>>>>>>>>            ... map->user_cnt ...
>>>>>>>>         }
>>>>>>>>
>>>>>>>> But if user writes
>>>>>>>>
>>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>>             ... map->id ...
>>>>>>>>             ... map->user_cnt ...
>>>>>>>>         }
>>>>>>>>
>>>>>>>> verifier won't be easy to conclude inproper map pointer tracing
>>>>>>>> here and in the above map->id, map->user_cnt will cause
>>>>>>>> exceptions and they will silently get value 0.
>>>>>>>
>>>>>>> I mean always pass valid object pointer into the prog.
>>>>>>> In above case 'map' will always be valid.
>>>>>>> Consider prog that iterating all map elements.
>>>>>>> It's weird that the prog would always need to do
>>>>>>> if (map == 0)
>>>>>>>      goto out;
>>>>>>> even if it doesn't care about finding last.
>>>>>>> All progs would have to have such extra 'if'.
>>>>>>> If we always pass valid object than there is no need
>>>>>>> for such extra checks inside the prog.
>>>>>>> First and last element can be indicated via seq_num
>>>>>>> or via another flag or via helper call like is_this_last_elem()
>>>>>>> or something.
>>>>>>
>>>>>> Okay, I see what you mean now. Basically this means
>>>>>> seq_ops->next() should try to get/maintain next two elements,
>>>>>
>>>>> What about the case when there are no elements to iterate to begin
>>>>> with? In that case, we still need to call bpf_prog for (empty)
>>>>> post-aggregation, but we have no valid element... For bpf_map
>>>>> iteration we could have fake empty bpf_map that would be passed, but
>>>>> I'm not sure it's applicable for any time of object (e.g., having a
>>>>> fake task_struct is probably quite a bit more problematic?)...
>>>>
>>>> Oh, yes, thanks for reminding me of this. I put a call to
>>>> bpf_prog in seq_ops->stop() especially to handle no object
>>>> case. In that case, seq_ops->start() will return NULL,
>>>> seq_ops->next() won't be called, and then seq_ops->stop()
>>>> is called. My earlier attempt tries to hook with next()
>>>> and then find it not working in all cases.
>>>
>>> wait a sec. seq_ops->stop() is not the end.
>>> With lseek of seq_file it can be called multiple times.
> 
> Yes, I have taken care of this. when the object is NULL,
> bpf program will be called. When the object is NULL again,
> it won't be called. The private data remembers it has
> been called with NULL.

Even without lseek stop() will be called multiple times.
If I read seq_file.c correctly it will be called before
every copy_to_user(). Which means that for a lot of text
(or if read() is done with small buffer) there will be
plenty of start,show,show,stop sequences.
Yonghong Song April 29, 2020, 6:14 p.m. UTC | #16
On 4/29/20 8:34 AM, Alexei Starovoitov wrote:
> On 4/28/20 11:44 PM, Yonghong Song wrote:
>>
>>
>> On 4/28/20 11:40 PM, Andrii Nakryiko wrote:
>>> On Tue, Apr 28, 2020 at 11:30 PM Alexei Starovoitov <ast@fb.com> wrote:
>>>>
>>>> On 4/28/20 11:20 PM, Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
>>>>>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
>>>>>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>>>>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
>>>>>>>>>>>> bpf_iter_seq_map_info),
>>>>>>>>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>>>>>>>>> +                 v == (void *)0);
>>>>>>>>>>>    From looking at seq_file.c, when will show() be called 
>>>>>>>>>>> with "v ==
>>>>>>>>>>> NULL"?
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> that v == NULL here and the whole verifier change just to allow
>>>>>>>>>> NULL...
>>>>>>>>>> may be use seq_num as an indicator of the last elem instead?
>>>>>>>>>> Like seq_num with upper bit set to indicate that it's last?
>>>>>>>>>
>>>>>>>>> We could. But then verifier won't have an easy way to verify that.
>>>>>>>>> For example, the above is expected:
>>>>>>>>>
>>>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>>>            if (seq_num >> 63)
>>>>>>>>>              return 0;
>>>>>>>>>            ... map->id ...
>>>>>>>>>            ... map->user_cnt ...
>>>>>>>>>         }
>>>>>>>>>
>>>>>>>>> But if user writes
>>>>>>>>>
>>>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>>>             ... map->id ...
>>>>>>>>>             ... map->user_cnt ...
>>>>>>>>>         }
>>>>>>>>>
>>>>>>>>> verifier won't be easy to conclude inproper map pointer tracing
>>>>>>>>> here and in the above map->id, map->user_cnt will cause
>>>>>>>>> exceptions and they will silently get value 0.
>>>>>>>>
>>>>>>>> I mean always pass valid object pointer into the prog.
>>>>>>>> In above case 'map' will always be valid.
>>>>>>>> Consider prog that iterating all map elements.
>>>>>>>> It's weird that the prog would always need to do
>>>>>>>> if (map == 0)
>>>>>>>>      goto out;
>>>>>>>> even if it doesn't care about finding last.
>>>>>>>> All progs would have to have such extra 'if'.
>>>>>>>> If we always pass valid object than there is no need
>>>>>>>> for such extra checks inside the prog.
>>>>>>>> First and last element can be indicated via seq_num
>>>>>>>> or via another flag or via helper call like is_this_last_elem()
>>>>>>>> or something.
>>>>>>>
>>>>>>> Okay, I see what you mean now. Basically this means
>>>>>>> seq_ops->next() should try to get/maintain next two elements,
>>>>>>
>>>>>> What about the case when there are no elements to iterate to begin
>>>>>> with? In that case, we still need to call bpf_prog for (empty)
>>>>>> post-aggregation, but we have no valid element... For bpf_map
>>>>>> iteration we could have fake empty bpf_map that would be passed, but
>>>>>> I'm not sure it's applicable for any time of object (e.g., having a
>>>>>> fake task_struct is probably quite a bit more problematic?)...
>>>>>
>>>>> Oh, yes, thanks for reminding me of this. I put a call to
>>>>> bpf_prog in seq_ops->stop() especially to handle no object
>>>>> case. In that case, seq_ops->start() will return NULL,
>>>>> seq_ops->next() won't be called, and then seq_ops->stop()
>>>>> is called. My earlier attempt tries to hook with next()
>>>>> and then find it not working in all cases.
>>>>
>>>> wait a sec. seq_ops->stop() is not the end.
>>>> With lseek of seq_file it can be called multiple times.
>>
>> Yes, I have taken care of this. when the object is NULL,
>> bpf program will be called. When the object is NULL again,
>> it won't be called. The private data remembers it has
>> been called with NULL.
> 
> Even without lseek stop() will be called multiple times.
> If I read seq_file.c correctly it will be called before
> every copy_to_user(). Which means that for a lot of text
> (or if read() is done with small buffer) there will be
> plenty of start,show,show,stop sequences.

That is true, this may cause revisit the same object if the object
still exists return start() called again. I followed similar
practice with ipv6_route(), trying to looking up the same
object at start() and only advanced right before next().
Andrii Nakryiko April 29, 2020, 7:19 p.m. UTC | #17
On Wed, Apr 29, 2020 at 8:34 AM Alexei Starovoitov <ast@fb.com> wrote:
>
> On 4/28/20 11:44 PM, Yonghong Song wrote:
> >
> >
> > On 4/28/20 11:40 PM, Andrii Nakryiko wrote:
> >> On Tue, Apr 28, 2020 at 11:30 PM Alexei Starovoitov <ast@fb.com> wrote:
> >>>
> >>> On 4/28/20 11:20 PM, Yonghong Song wrote:
> >>>>
> >>>>
> >>>> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
> >>>>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
> >>>>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
> >>>>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
> >>>>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
> >>>>>>>>>>> bpf_iter_seq_map_info),
> >>>>>>>>>>> +                 &meta.session_id, &meta.seq_num,
> >>>>>>>>>>> +                 v == (void *)0);
> >>>>>>>>>>    From looking at seq_file.c, when will show() be called with
> >>>>>>>>>> "v ==
> >>>>>>>>>> NULL"?
> >>>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> that v == NULL here and the whole verifier change just to allow
> >>>>>>>>> NULL...
> >>>>>>>>> may be use seq_num as an indicator of the last elem instead?
> >>>>>>>>> Like seq_num with upper bit set to indicate that it's last?
> >>>>>>>>
> >>>>>>>> We could. But then verifier won't have an easy way to verify that.
> >>>>>>>> For example, the above is expected:
> >>>>>>>>
> >>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
> >>>>>>>>            if (seq_num >> 63)
> >>>>>>>>              return 0;
> >>>>>>>>            ... map->id ...
> >>>>>>>>            ... map->user_cnt ...
> >>>>>>>>         }
> >>>>>>>>
> >>>>>>>> But if user writes
> >>>>>>>>
> >>>>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
> >>>>>>>>             ... map->id ...
> >>>>>>>>             ... map->user_cnt ...
> >>>>>>>>         }
> >>>>>>>>
> >>>>>>>> verifier won't be easy to conclude inproper map pointer tracing
> >>>>>>>> here and in the above map->id, map->user_cnt will cause
> >>>>>>>> exceptions and they will silently get value 0.
> >>>>>>>
> >>>>>>> I mean always pass valid object pointer into the prog.
> >>>>>>> In above case 'map' will always be valid.
> >>>>>>> Consider prog that iterating all map elements.
> >>>>>>> It's weird that the prog would always need to do
> >>>>>>> if (map == 0)
> >>>>>>>      goto out;
> >>>>>>> even if it doesn't care about finding last.
> >>>>>>> All progs would have to have such extra 'if'.
> >>>>>>> If we always pass valid object than there is no need
> >>>>>>> for such extra checks inside the prog.
> >>>>>>> First and last element can be indicated via seq_num
> >>>>>>> or via another flag or via helper call like is_this_last_elem()
> >>>>>>> or something.
> >>>>>>
> >>>>>> Okay, I see what you mean now. Basically this means
> >>>>>> seq_ops->next() should try to get/maintain next two elements,
> >>>>>
> >>>>> What about the case when there are no elements to iterate to begin
> >>>>> with? In that case, we still need to call bpf_prog for (empty)
> >>>>> post-aggregation, but we have no valid element... For bpf_map
> >>>>> iteration we could have fake empty bpf_map that would be passed, but
> >>>>> I'm not sure it's applicable for any time of object (e.g., having a
> >>>>> fake task_struct is probably quite a bit more problematic?)...
> >>>>
> >>>> Oh, yes, thanks for reminding me of this. I put a call to
> >>>> bpf_prog in seq_ops->stop() especially to handle no object
> >>>> case. In that case, seq_ops->start() will return NULL,
> >>>> seq_ops->next() won't be called, and then seq_ops->stop()
> >>>> is called. My earlier attempt tries to hook with next()
> >>>> and then find it not working in all cases.
> >>>
> >>> wait a sec. seq_ops->stop() is not the end.
> >>> With lseek of seq_file it can be called multiple times.
> >
> > Yes, I have taken care of this. when the object is NULL,
> > bpf program will be called. When the object is NULL again,
> > it won't be called. The private data remembers it has
> > been called with NULL.
>
> Even without lseek stop() will be called multiple times.
> If I read seq_file.c correctly it will be called before
> every copy_to_user(). Which means that for a lot of text
> (or if read() is done with small buffer) there will be
> plenty of start,show,show,stop sequences.


Right start/stop can be called multiple times, but seems like there
are clear indicators of beginning of iteration and end of iteration:
- start() with seq_num == 0 is start of iteration (can be called
multiple times, if first element overflows buffer);
- stop() with p == NULL is end of iteration (seems like can be called
multiple times as well, if user keeps read()'ing after iteration
completed).

There is another problem with stop(), though. If BPF program will
attempt to output anything during stop(), that output will be just
discarded. Not great. Especially if that output overflows and we need
to re-allocate buffer.

We are trying to use seq_file just to reuse 140 lines of code in
seq_read(), which is no magic, just a simple double buffer and retry
piece of logic. We don't need lseek and traverse, we don't need all
the escaping stuff. I think bpf_iter implementation would be much
simpler if bpf_iter had better control over iteration. Then this whole
"end of iteration" behavior would be crystal clear. Should we maybe
reconsider again?

I understand we want to re-use networking iteration code, but we can
still do that with custom implementation of seq_read, because we are
still using struct seq_file and follow its semantics. The change would
be to allow stop(NULL) (or any stop() call for that matter) to perform
output (and handle retry and buffer re-allocation). Or, alternatively,
coupled with seq_operations intercept proposal in patch #7 discussion,
we can add extra method (e.g., finish()) that would be called after
all elements are traversed and will allow to emit extra stuff. We can
do that (implement finish()) in seq_read, as well, if that's going to
fly ok with seq_file maintainers, of course.
Andrii Nakryiko April 29, 2020, 7:25 p.m. UTC | #18
On Tue, Apr 28, 2020 at 11:51 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 4/28/20 11:34 PM, Martin KaFai Lau wrote:
> > On Tue, Apr 28, 2020 at 11:20:30PM -0700, Yonghong Song wrote:
> >>
> >>
> >> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
> >>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
> >>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
> >>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
> >>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
> >>>>>>>>> bpf_iter_seq_map_info),
> >>>>>>>>> +                 &meta.session_id, &meta.seq_num,
> >>>>>>>>> +                 v == (void *)0);
> >>>>>>>>    From looking at seq_file.c, when will show() be called with "v ==
> >>>>>>>> NULL"?
> >>>>>>>>
> >>>>>>>
> >>>>>>> that v == NULL here and the whole verifier change just to allow NULL...
> >>>>>>> may be use seq_num as an indicator of the last elem instead?
> >>>>>>> Like seq_num with upper bit set to indicate that it's last?
> >>>>>>
> >>>>>> We could. But then verifier won't have an easy way to verify that.
> >>>>>> For example, the above is expected:
> >>>>>>
> >>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
> >>>>>>            if (seq_num >> 63)
> >>>>>>              return 0;
> >>>>>>            ... map->id ...
> >>>>>>            ... map->user_cnt ...
> >>>>>>         }
> >>>>>>
> >>>>>> But if user writes
> >>>>>>
> >>>>>>         int prog(struct bpf_map *map, u64 seq_num) {
> >>>>>>             ... map->id ...
> >>>>>>             ... map->user_cnt ...
> >>>>>>         }
> >>>>>>
> >>>>>> verifier won't be easy to conclude inproper map pointer tracing
> >>>>>> here and in the above map->id, map->user_cnt will cause
> >>>>>> exceptions and they will silently get value 0.
> >>>>>
> >>>>> I mean always pass valid object pointer into the prog.
> >>>>> In above case 'map' will always be valid.
> >>>>> Consider prog that iterating all map elements.
> >>>>> It's weird that the prog would always need to do
> >>>>> if (map == 0)
> >>>>>      goto out;
> >>>>> even if it doesn't care about finding last.
> >>>>> All progs would have to have such extra 'if'.
> >>>>> If we always pass valid object than there is no need
> >>>>> for such extra checks inside the prog.
> >>>>> First and last element can be indicated via seq_num
> >>>>> or via another flag or via helper call like is_this_last_elem()
> >>>>> or something.
> >>>>
> >>>> Okay, I see what you mean now. Basically this means
> >>>> seq_ops->next() should try to get/maintain next two elements,
> >>>
> >>> What about the case when there are no elements to iterate to begin
> >>> with? In that case, we still need to call bpf_prog for (empty)
> >>> post-aggregation, but we have no valid element... For bpf_map
> >>> iteration we could have fake empty bpf_map that would be passed, but
> >>> I'm not sure it's applicable for any time of object (e.g., having a
> >>> fake task_struct is probably quite a bit more problematic?)...
> >>
> >> Oh, yes, thanks for reminding me of this. I put a call to
> >> bpf_prog in seq_ops->stop() especially to handle no object
> >> case. In that case, seq_ops->start() will return NULL,
> >> seq_ops->next() won't be called, and then seq_ops->stop()
> >> is called. My earlier attempt tries to hook with next()
> >> and then find it not working in all cases.
> >>
> >>>
> >>>> otherwise, we won't know whether the one in seq_ops->show()
> >>>> is the last or not.
> > I think "show()" is convoluted with "stop()/eof()".  Could "stop()/eof()"
> > be its own separate (and optional) bpf_prog which only does "stop()/eof()"?
>
> I thought this before. But user need to write a program instead of
> a simple "if" condition in the main program...
>

I agree with Yonghong, requiring user to check for null is pretty
trivial and verifier can give very clear error message if user didn't
check.
The PTR_TO_BTF_ID_OR_NULL seems useful in general as well, it's an
optional typed input arguments and might be useful in other
situations. Verifier changes don't seem excessive as well.

Having two coupled BPF programs to do single iteration becomes awkward
to manage, will complicate kernel interface (e.g., special variants of
LINK_CREATE and LINK_UPDATE) and libbpf implementation. It's also
going to be harder to replace them atomically. I think overall cons
outweight pros.

As one way to maybe simplify it for users a bit, we can make this
post-aggregation call optional with extra flag on BPF_PROG_LOAD.
Unless extra flag is specified, input arguments can stay PTR_TO_BTF_ID
and we'll just get non-NULL inputs and no "end of iteration" call.
With extra flags, inputs become PTR_TO_BTF_ID_OR_NULL and one extra
call at the end.



> >
> >>>> We could do it in newly implemented
> >>>> iterator bpf_map/task/task_file. Let me check how I could
> >>>> make existing seq_ops (ipv6_route/netlink) works with
> >>>> minimum changes.
Yonghong Song April 29, 2020, 8:15 p.m. UTC | #19
On 4/29/20 12:19 PM, Andrii Nakryiko wrote:
> On Wed, Apr 29, 2020 at 8:34 AM Alexei Starovoitov <ast@fb.com> wrote:
>>
>> On 4/28/20 11:44 PM, Yonghong Song wrote:
>>>
>>>
>>> On 4/28/20 11:40 PM, Andrii Nakryiko wrote:
>>>> On Tue, Apr 28, 2020 at 11:30 PM Alexei Starovoitov <ast@fb.com> wrote:
>>>>>
>>>>> On 4/28/20 11:20 PM, Yonghong Song wrote:
>>>>>>
>>>>>>
>>>>>> On 4/28/20 11:08 PM, Andrii Nakryiko wrote:
>>>>>>> On Tue, Apr 28, 2020 at 10:10 PM Yonghong Song <yhs@fb.com> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 4/28/20 7:44 PM, Alexei Starovoitov wrote:
>>>>>>>>> On 4/28/20 6:15 PM, Yonghong Song wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 4/28/20 5:48 PM, Alexei Starovoitov wrote:
>>>>>>>>>>> On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
>>>>>>>>>>>>> +    prog = bpf_iter_get_prog(seq, sizeof(struct
>>>>>>>>>>>>> bpf_iter_seq_map_info),
>>>>>>>>>>>>> +                 &meta.session_id, &meta.seq_num,
>>>>>>>>>>>>> +                 v == (void *)0);
>>>>>>>>>>>>     From looking at seq_file.c, when will show() be called with
>>>>>>>>>>>> "v ==
>>>>>>>>>>>> NULL"?
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> that v == NULL here and the whole verifier change just to allow
>>>>>>>>>>> NULL...
>>>>>>>>>>> may be use seq_num as an indicator of the last elem instead?
>>>>>>>>>>> Like seq_num with upper bit set to indicate that it's last?
>>>>>>>>>>
>>>>>>>>>> We could. But then verifier won't have an easy way to verify that.
>>>>>>>>>> For example, the above is expected:
>>>>>>>>>>
>>>>>>>>>>          int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>>>>             if (seq_num >> 63)
>>>>>>>>>>               return 0;
>>>>>>>>>>             ... map->id ...
>>>>>>>>>>             ... map->user_cnt ...
>>>>>>>>>>          }
>>>>>>>>>>
>>>>>>>>>> But if user writes
>>>>>>>>>>
>>>>>>>>>>          int prog(struct bpf_map *map, u64 seq_num) {
>>>>>>>>>>              ... map->id ...
>>>>>>>>>>              ... map->user_cnt ...
>>>>>>>>>>          }
>>>>>>>>>>
>>>>>>>>>> verifier won't be easy to conclude inproper map pointer tracing
>>>>>>>>>> here and in the above map->id, map->user_cnt will cause
>>>>>>>>>> exceptions and they will silently get value 0.
>>>>>>>>>
>>>>>>>>> I mean always pass valid object pointer into the prog.
>>>>>>>>> In above case 'map' will always be valid.
>>>>>>>>> Consider prog that iterating all map elements.
>>>>>>>>> It's weird that the prog would always need to do
>>>>>>>>> if (map == 0)
>>>>>>>>>       goto out;
>>>>>>>>> even if it doesn't care about finding last.
>>>>>>>>> All progs would have to have such extra 'if'.
>>>>>>>>> If we always pass valid object than there is no need
>>>>>>>>> for such extra checks inside the prog.
>>>>>>>>> First and last element can be indicated via seq_num
>>>>>>>>> or via another flag or via helper call like is_this_last_elem()
>>>>>>>>> or something.
>>>>>>>>
>>>>>>>> Okay, I see what you mean now. Basically this means
>>>>>>>> seq_ops->next() should try to get/maintain next two elements,
>>>>>>>
>>>>>>> What about the case when there are no elements to iterate to begin
>>>>>>> with? In that case, we still need to call bpf_prog for (empty)
>>>>>>> post-aggregation, but we have no valid element... For bpf_map
>>>>>>> iteration we could have fake empty bpf_map that would be passed, but
>>>>>>> I'm not sure it's applicable for any time of object (e.g., having a
>>>>>>> fake task_struct is probably quite a bit more problematic?)...
>>>>>>
>>>>>> Oh, yes, thanks for reminding me of this. I put a call to
>>>>>> bpf_prog in seq_ops->stop() especially to handle no object
>>>>>> case. In that case, seq_ops->start() will return NULL,
>>>>>> seq_ops->next() won't be called, and then seq_ops->stop()
>>>>>> is called. My earlier attempt tries to hook with next()
>>>>>> and then find it not working in all cases.
>>>>>
>>>>> wait a sec. seq_ops->stop() is not the end.
>>>>> With lseek of seq_file it can be called multiple times.
>>>
>>> Yes, I have taken care of this. when the object is NULL,
>>> bpf program will be called. When the object is NULL again,
>>> it won't be called. The private data remembers it has
>>> been called with NULL.
>>
>> Even without lseek stop() will be called multiple times.
>> If I read seq_file.c correctly it will be called before
>> every copy_to_user(). Which means that for a lot of text
>> (or if read() is done with small buffer) there will be
>> plenty of start,show,show,stop sequences.
> 
> 
> Right start/stop can be called multiple times, but seems like there
> are clear indicators of beginning of iteration and end of iteration:
> - start() with seq_num == 0 is start of iteration (can be called
> multiple times, if first element overflows buffer);
> - stop() with p == NULL is end of iteration (seems like can be called
> multiple times as well, if user keeps read()'ing after iteration
> completed).
> 
> There is another problem with stop(), though. If BPF program will
> attempt to output anything during stop(), that output will be just
> discarded. Not great. Especially if that output overflows and we need

The stop() output will not be discarded in the following cases:
    - regular show() objects overflow and stop() BPF program not called
    - regular show() objects not overflow, which means iteration is done,
      and stop() BPF program does not overflow.

The stop() seq_file output will be discarded if
    - regular show() objects not overflow and stop() BPF program output
      overflows.
    - no objects to iterate, BPF program got called, but its seq_file
      write/printf will be discarded.

Two options here:
   - implement Alexei suggestion to look ahead two elements to
     always having valid object and indicating the last element
     with a special flag.
   - Per Andrii's suggestion below to implement new way or to
     tweak seq_file() a little bit to resolve the above cases
     where stop() seq_file outputs being discarded.

Will try to experiment with both above options...


> to re-allocate buffer.
> 
> We are trying to use seq_file just to reuse 140 lines of code in
> seq_read(), which is no magic, just a simple double buffer and retry
> piece of logic. We don't need lseek and traverse, we don't need all
> the escaping stuff. I think bpf_iter implementation would be much
> simpler if bpf_iter had better control over iteration. Then this whole
> "end of iteration" behavior would be crystal clear. Should we maybe
> reconsider again?
> 
> I understand we want to re-use networking iteration code, but we can
> still do that with custom implementation of seq_read, because we are
> still using struct seq_file and follow its semantics. The change would
> be to allow stop(NULL) (or any stop() call for that matter) to perform
> output (and handle retry and buffer re-allocation). Or, alternatively,
> coupled with seq_operations intercept proposal in patch #7 discussion,
> we can add extra method (e.g., finish()) that would be called after
> all elements are traversed and will allow to emit extra stuff. We can
> do that (implement finish()) in seq_read, as well, if that's going to
> fly ok with seq_file maintainers, of course.
>
Alexei Starovoitov April 30, 2020, 3:06 a.m. UTC | #20
On 4/29/20 1:15 PM, Yonghong Song wrote:
>>>
>>> Even without lseek stop() will be called multiple times.
>>> If I read seq_file.c correctly it will be called before
>>> every copy_to_user(). Which means that for a lot of text
>>> (or if read() is done with small buffer) there will be
>>> plenty of start,show,show,stop sequences.
>>
>>
>> Right start/stop can be called multiple times, but seems like there
>> are clear indicators of beginning of iteration and end of iteration:
>> - start() with seq_num == 0 is start of iteration (can be called
>> multiple times, if first element overflows buffer);
>> - stop() with p == NULL is end of iteration (seems like can be called
>> multiple times as well, if user keeps read()'ing after iteration
>> completed).
>>
>> There is another problem with stop(), though. If BPF program will
>> attempt to output anything during stop(), that output will be just
>> discarded. Not great. Especially if that output overflows and we need
> 
> The stop() output will not be discarded in the following cases:
>     - regular show() objects overflow and stop() BPF program not called
>     - regular show() objects not overflow, which means iteration is done,
>       and stop() BPF program does not overflow.
> 
> The stop() seq_file output will be discarded if
>     - regular show() objects not overflow and stop() BPF program output
>       overflows.
>     - no objects to iterate, BPF program got called, but its seq_file
>       write/printf will be discarded.
> 
> Two options here:
>    - implement Alexei suggestion to look ahead two elements to
>      always having valid object and indicating the last element
>      with a special flag.
>    - Per Andrii's suggestion below to implement new way or to
>      tweak seq_file() a little bit to resolve the above cases
>      where stop() seq_file outputs being discarded.
> 
> Will try to experiment with both above options...
> 
> 
>> to re-allocate buffer.
>>
>> We are trying to use seq_file just to reuse 140 lines of code in
>> seq_read(), which is no magic, just a simple double buffer and retry
>> piece of logic. We don't need lseek and traverse, we don't need all
>> the escaping stuff. I think bpf_iter implementation would be much
>> simpler if bpf_iter had better control over iteration. Then this whole
>> "end of iteration" behavior would be crystal clear. Should we maybe
>> reconsider again?

That's what I was advocating for some time now.

I think seq_file is barely usable as a /proc extension and completely
unusable for iterating.
All the discussions in the last few weeks are pointing out that
majority of use cases are in the iterating space instead of dumping.
Dumping human readable strings as unstable /proc extension is
a small subset. So I think we shouldn't use fs/seq_file.c.
The dance around double op->next() or introducing op->finish()
into seq_ops looks like fifth wheel to the car.
I think bpf_iter semantics and bpf prog logic would be much simpler
and easier to understand if op->read method was re-implemented
for the purpose of iterating the objects.
I mean seq_op->start/next/stop can be reused as-is to iterate
existing kernel objects like sockets, but seq_read() will not be
used. We should explicitly disable lseek and write on our
cat-able files and use new bpf_seq_read() as .read op.
This specialized bpf_seq_read() will still do a sequences of
start/show/show/stop for every copy_to_user, but we don't need to
add finish() to seq_op and hack existing seq_read().
We also will be able to provide precise seq_num into the program
instead of approximation.
bpf_seq_read wouldn't need to deal with ppos and traverse.
It wouldn't need fancy m->size<<=1 retries.
It can allocate fixed PAGE_SIZE and be done with it.
It's fine to restrict bpf progs to not dump more than 4k
chracters per object.
And we can call bpf_iter prog exactly once per element.
Plenty of pros and no real cons.
Yonghong Song April 30, 2020, 4:01 a.m. UTC | #21
On 4/29/20 8:06 PM, Alexei Starovoitov wrote:
> On 4/29/20 1:15 PM, Yonghong Song wrote:
>>>>
>>>> Even without lseek stop() will be called multiple times.
>>>> If I read seq_file.c correctly it will be called before
>>>> every copy_to_user(). Which means that for a lot of text
>>>> (or if read() is done with small buffer) there will be
>>>> plenty of start,show,show,stop sequences.
>>>
>>>
>>> Right start/stop can be called multiple times, but seems like there
>>> are clear indicators of beginning of iteration and end of iteration:
>>> - start() with seq_num == 0 is start of iteration (can be called
>>> multiple times, if first element overflows buffer);
>>> - stop() with p == NULL is end of iteration (seems like can be called
>>> multiple times as well, if user keeps read()'ing after iteration
>>> completed).
>>>
>>> There is another problem with stop(), though. If BPF program will
>>> attempt to output anything during stop(), that output will be just
>>> discarded. Not great. Especially if that output overflows and we need
>>
>> The stop() output will not be discarded in the following cases:
>>     - regular show() objects overflow and stop() BPF program not called
>>     - regular show() objects not overflow, which means iteration is done,
>>       and stop() BPF program does not overflow.
>>
>> The stop() seq_file output will be discarded if
>>     - regular show() objects not overflow and stop() BPF program output
>>       overflows.
>>     - no objects to iterate, BPF program got called, but its seq_file
>>       write/printf will be discarded.
>>
>> Two options here:
>>    - implement Alexei suggestion to look ahead two elements to
>>      always having valid object and indicating the last element
>>      with a special flag.
>>    - Per Andrii's suggestion below to implement new way or to
>>      tweak seq_file() a little bit to resolve the above cases
>>      where stop() seq_file outputs being discarded.
>>
>> Will try to experiment with both above options...
>>
>>
>>> to re-allocate buffer.
>>>
>>> We are trying to use seq_file just to reuse 140 lines of code in
>>> seq_read(), which is no magic, just a simple double buffer and retry
>>> piece of logic. We don't need lseek and traverse, we don't need all
>>> the escaping stuff. I think bpf_iter implementation would be much
>>> simpler if bpf_iter had better control over iteration. Then this whole
>>> "end of iteration" behavior would be crystal clear. Should we maybe
>>> reconsider again?
> 
> That's what I was advocating for some time now.
> 
> I think seq_file is barely usable as a /proc extension and completely
> unusable for iterating.
> All the discussions in the last few weeks are pointing out that
> majority of use cases are in the iterating space instead of dumping.
> Dumping human readable strings as unstable /proc extension is
> a small subset. So I think we shouldn't use fs/seq_file.c.
> The dance around double op->next() or introducing op->finish()
> into seq_ops looks like fifth wheel to the car.
> I think bpf_iter semantics and bpf prog logic would be much simpler
> and easier to understand if op->read method was re-implemented
> for the purpose of iterating the objects.
> I mean seq_op->start/next/stop can be reused as-is to iterate
> existing kernel objects like sockets, but seq_read() will not be
> used. We should explicitly disable lseek and write on our
> cat-able files and use new bpf_seq_read() as .read op.
> This specialized bpf_seq_read() will still do a sequences of
> start/show/show/stop for every copy_to_user, but we don't need to
> add finish() to seq_op and hack existing seq_read().
> We also will be able to provide precise seq_num into the program
> instead of approximation.
> bpf_seq_read wouldn't need to deal with ppos and traverse.
> It wouldn't need fancy m->size<<=1 retries.
> It can allocate fixed PAGE_SIZE and be done with it.
> It's fine to restrict bpf progs to not dump more than 4k
> chracters per object.
> And we can call bpf_iter prog exactly once per element.
> Plenty of pros and no real cons.

This may indeed be simpler and scalarable since it is specific for our 
use case compared to ouble next with tweaking the
existing target (ipv6_route, netlink, etc.). Will explore
this way.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5e56abc1e2f1..4ac8d61f7c3e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1078,6 +1078,7 @@  int  generic_map_update_batch(struct bpf_map *map,
 int  generic_map_delete_batch(struct bpf_map *map,
 			      const union bpf_attr *attr,
 			      union bpf_attr __user *uattr);
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
@@ -1118,7 +1119,16 @@  struct bpf_iter_reg {
 	u32 target_feature;
 };
 
+struct bpf_iter_meta {
+	__bpf_md_ptr(struct seq_file *, seq);
+	u64 session_id;
+	u64 seq_num;
+};
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
+struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
+				   u64 *session_id, u64 *seq_num, bool is_last);
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a8b0febd3f6..b2b5eefc5254 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@ 
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 1115b978607a..284c95587803 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -48,3 +48,22 @@  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 
 	return 0;
 }
+
+struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
+				   u64 *session_id, u64 *seq_num, bool is_last)
+{
+	return NULL;
+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+	int ret;
+
+	migrate_disable();
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+	migrate_enable();
+
+	return ret;
+}
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
new file mode 100644
index 000000000000..bb3ad4c3bde5
--- /dev/null
+++ b/kernel/bpf/map_iter.c
@@ -0,0 +1,107 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+
+struct bpf_iter_seq_map_info {
+	struct bpf_map *map;
+	u32 id;
+};
+
+static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+	u32 id = info->id;
+
+	map = bpf_map_get_curr_or_next(&id);
+	if (IS_ERR_OR_NULL(map))
+		return NULL;
+
+	++*pos;
+	info->map = map;
+	info->id = id;
+	return map;
+}
+
+static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+
+	++*pos;
+	++info->id;
+	map = bpf_map_get_curr_or_next(&info->id);
+	if (IS_ERR_OR_NULL(map))
+		return NULL;
+
+	bpf_map_put(info->map);
+	info->map = map;
+	return map;
+}
+
+struct bpf_iter__bpf_map {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_map *, map);
+};
+
+int __init __bpf_iter__bpf_map(struct bpf_iter_meta *meta, struct bpf_map *map)
+{
+	return 0;
+}
+
+static int bpf_map_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__bpf_map ctx;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ctx.meta = &meta;
+	ctx.map = v;
+	meta.seq = seq;
+	prog = bpf_iter_get_prog(seq, sizeof(struct bpf_iter_seq_map_info),
+				 &meta.session_id, &meta.seq_num,
+				 v == (void *)0);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret == 0 ? 0 : -EINVAL;
+}
+
+static void bpf_map_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+
+	if (!v)
+		bpf_map_seq_show(seq, v);
+
+	if (info->map) {
+		bpf_map_put(info->map);
+		info->map = NULL;
+	}
+}
+
+static const struct seq_operations bpf_map_seq_ops = {
+	.start	= bpf_map_seq_start,
+	.next	= bpf_map_seq_next,
+	.stop	= bpf_map_seq_stop,
+	.show	= bpf_map_seq_show,
+};
+
+static int __init bpf_map_iter_init(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "bpf_map",
+		.target_func_name	= "__bpf_iter__bpf_map",
+		.seq_ops		= &bpf_map_seq_ops,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_map_info),
+		.target_feature		= 0,
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+
+late_initcall(bpf_map_iter_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7626b8024471..022187640943 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2800,6 +2800,19 @@  static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	return err;
 }
 
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+	struct bpf_map *map;
+
+	spin_lock_bh(&map_idr_lock);
+	map = idr_get_next(&map_idr, id);
+	if (map)
+		map = __bpf_map_inc_not_zero(map, false);
+	spin_unlock_bh(&map_idr_lock);
+
+	return map;
+}
+
 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
 
 struct bpf_prog *bpf_prog_by_id(u32 id)