
[bpf-next,v2,02/20] bpf: allow loading of a bpf_iter program

Message ID: 20200504062548.2047454-1-yhs@fb.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: bpf: implement bpf iterator for kernel data

Commit Message

Yonghong Song May 4, 2020, 6:25 a.m. UTC
A bpf_iter program is a tracing program with attach type
BPF_TRACE_ITER. The load attribute
  attach_btf_id
is used by the verifier to check the program against a particular
kernel function representing a target, e.g., __bpf_iter__bpf_map
for the bpf_map target, implemented later in this series.

The program return value must be 0 or 1 for now.
  0 : successful; a potential seq_file buffer overflow
      is handled by the seq_file reader.
  1 : request to restart the same object

In the future, other return values may be used for filtering or
terminating the iterator.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  3 +++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/bpf_iter.c          | 30 ++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 21 +++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 5 files changed, 56 insertions(+)
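
For illustration, a bpf_iter program following this return-value
convention might look like the sketch below. This is hypothetical:
the "iter/bpf_map" section name, the struct bpf_iter__bpf_map context
type and the BPF_SEQ_PRINTF helper are only introduced by later
patches in this series and are assumed here.

	/* Sketch of a bpf_iter program; the context type and the
	 * seq_file print helper are assumptions based on later
	 * patches in this series.
	 */
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	SEC("iter/bpf_map")
	int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
	{
		struct seq_file *seq = ctx->meta->seq;
		struct bpf_map *map = ctx->map;

		/* a NULL object signals the end of the iteration */
		if (!map)
			return 0;

		BPF_SEQ_PRINTF(seq, "map id: %u\n", map->id);
		return 0; /* 0: done with this object; 1: retry it */
	}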

Comments

Andrii Nakryiko May 5, 2020, 9:29 p.m. UTC | #1
On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> A bpf_iter program is a tracing program with attach type
> BPF_TRACE_ITER. The load attribute
>   attach_btf_id
> is used by the verifier to check the program against a particular
> kernel function representing a target, e.g., __bpf_iter__bpf_map
> for the bpf_map target, implemented later in this series.
>
> The program return value must be 0 or 1 for now.
>   0 : successful; a potential seq_file buffer overflow
>       is handled by the seq_file reader.
>   1 : request to restart the same object

This bit is interesting. Is the idea that if a BPF program also wants to
send something over, say, perf_buffer, but fails, it can "request" the
same execution again? I wonder if a typical libc fread() implementation
would handle EAGAIN properly; it seems more geared towards
non-blocking I/O?

On the other hand, following the start/show/next logic of seq_file
iteration, requesting to skip an element seems useful. It would allow
(in some cases) "speculatively" generating output, then at some point
realizing that this is not an element we actually want in the output
and requesting that the output be ignored.

Don't know how useful the latter is going to be in practice, but just
something to keep in mind for the future, I guess...

>
> In the future, other return values may be used for filtering or
> terminating the iterator.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |  3 +++
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/bpf_iter.c          | 30 ++++++++++++++++++++++++++++++
>  kernel/bpf/verifier.c          | 21 +++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  5 files changed, 56 insertions(+)
>

[...]


> +
> +bool bpf_iter_prog_supported(struct bpf_prog *prog)
> +{
> +       const char *attach_fname = prog->aux->attach_func_name;
> +       u32 prog_btf_id = prog->aux->attach_btf_id;
> +       const char *prefix = BPF_ITER_FUNC_PREFIX;
> +       struct bpf_iter_target_info *tinfo;
> +       int prefix_len = strlen(prefix);
> +       bool supported = false;
> +
> +       if (strncmp(attach_fname, prefix, prefix_len))
> +               return false;
> +
> +       mutex_lock(&targets_mutex);
> +       list_for_each_entry(tinfo, &targets, list) {
> +               if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
> +                       supported = true;
> +                       break;
> +               }
> +               if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
> +                       tinfo->btf_id = prog->aux->attach_btf_id;

This target_info->btf_id caching here is a bit subtle and easy to
miss; it would be nice to have a code comment calling this out
explicitly.
Thanks!

> +                       supported = true;
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&targets_mutex);
> +
> +       return supported;
> +}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 70ad009577f8..d725ff7d11db 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env)
>                         return 0;
>                 range = tnum_const(0);
>                 break;
> +       case BPF_PROG_TYPE_TRACING:
> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
> +                       return 0;

Commit message mentions enforcing [0, 1], shouldn't it be done here?


> +               break;
>         default:
>                 return 0;
>         }

[...]
Yonghong Song May 6, 2020, 12:07 a.m. UTC | #2
On 5/5/20 2:29 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> A bpf_iter program is a tracing program with attach type
>> BPF_TRACE_ITER. The load attribute
>>    attach_btf_id
>> is used by the verifier to check the program against a particular
>> kernel function representing a target, e.g., __bpf_iter__bpf_map
>> for the bpf_map target, implemented later in this series.
>>
>> The program return value must be 0 or 1 for now.
>>    0 : successful; a potential seq_file buffer overflow
>>        is handled by the seq_file reader.
>>    1 : request to restart the same object
> 
> This bit is interesting. Is the idea that if a BPF program also wants to
> send something over, say, perf_buffer, but fails, it can "request" the
> same execution again? I wonder if a typical libc fread() implementation

Yes. bpf_seq_read() can handle this the same as any other
retry request. The following is the current mapping.
    bpf program return 0   ---> seq_ops->show() return 0
    bpf program return 1   ---> seq_ops->show() return -EAGAIN
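
A simplified sketch of where that translation might live (names here
are hypothetical, not the exact code in this series):

	static int bpf_iter_seq_show(struct seq_file *seq, void *v)
	{
		/* hypothetical helpers to fetch the attached program
		 * and build its context
		 */
		struct bpf_prog *prog = get_attached_iter_prog(seq);
		struct bpf_iter_ctx ctx = { .seq = seq, .obj = v };
		int ret;

		ret = BPF_PROG_RUN(prog, &ctx);
		/* prog returned 0: done with this object;
		 * prog returned 1: ask seq_file to retry the same object
		 */
		return ret == 0 ? 0 : -EAGAIN;
	}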

> would handle EAGAIN properly; it seems more geared towards
> non-blocking I/O?

The current patch set does not have a test for a bpf program
returning 1. Will add one in the next version.

> 
> On the other hand, following the start/show/next logic of seq_file
> iteration, requesting to skip an element seems useful. It would allow
> (in some cases) "speculatively" generating output, then at some point
> realizing that this is not an element we actually want in the output
> and requesting that the output be ignored.
> 
> Don't know how useful the latter is going to be in practice, but just
> something to keep in mind for the future, I guess...
> 
>>
>> In the future, other return values may be used for filtering or
>> terminating the iterator.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |  3 +++
>>   include/uapi/linux/bpf.h       |  1 +
>>   kernel/bpf/bpf_iter.c          | 30 ++++++++++++++++++++++++++++++
>>   kernel/bpf/verifier.c          | 21 +++++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   5 files changed, 56 insertions(+)
>>
> 
> [...]
> 
> 
>> +
>> +bool bpf_iter_prog_supported(struct bpf_prog *prog)
>> +{
>> +       const char *attach_fname = prog->aux->attach_func_name;
>> +       u32 prog_btf_id = prog->aux->attach_btf_id;
>> +       const char *prefix = BPF_ITER_FUNC_PREFIX;
>> +       struct bpf_iter_target_info *tinfo;
>> +       int prefix_len = strlen(prefix);
>> +       bool supported = false;
>> +
>> +       if (strncmp(attach_fname, prefix, prefix_len))
>> +               return false;
>> +
>> +       mutex_lock(&targets_mutex);
>> +       list_for_each_entry(tinfo, &targets, list) {
>> +               if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
>> +                       supported = true;
>> +                       break;
>> +               }
>> +               if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
>> +                       tinfo->btf_id = prog->aux->attach_btf_id;
> 
> This target_info->btf_id caching here is a bit subtle and easy to
> miss; it would be nice to have a code comment calling this out
> explicitly.

Will do.
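
Maybe something along these lines (suggested wording only):

		if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
			/* Match found by target name; cache the target's
			 * BTF id so later lookups for the same target can
			 * compare btf_id directly instead of comparing
			 * strings.
			 */
			tinfo->btf_id = prog->aux->attach_btf_id;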

> Thanks!
> 
>> +                       supported = true;
>> +                       break;
>> +               }
>> +       }
>> +       mutex_unlock(&targets_mutex);
>> +
>> +       return supported;
>> +}
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 70ad009577f8..d725ff7d11db 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env)
>>                          return 0;
>>                  range = tnum_const(0);
>>                  break;
>> +       case BPF_PROG_TYPE_TRACING:
>> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
>> +                       return 0;
> 
> Commit message mentions enforcing [0, 1], shouldn't it be done here?

The default range is [0, 1], hence no explicit assignment is needed here.

static int check_return_code(struct bpf_verifier_env *env)
{
         struct tnum enforce_attach_type_range = tnum_unknown;
         const struct bpf_prog *prog = env->prog;
         struct bpf_reg_state *reg;
         struct tnum range = tnum_range(0, 1);
......

> 
> 
>> +               break;
>>          default:
>>                  return 0;
>>          }
> 
> [...]
>

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 597b37c4e1c6..cd385c36a172 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1127,6 +1127,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+#define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
+
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
 
@@ -1139,6 +1141,7 @@ struct bpf_iter_reg {
 };
 
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
+bool bpf_iter_prog_supported(struct bpf_prog *prog);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b3643e27e264..047b19fe716e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index ed930a0470e9..c1fae67a1452 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -12,6 +12,7 @@ struct bpf_iter_target_info {
 	bpf_iter_init_seq_priv_t init_seq_private;
 	bpf_iter_fini_seq_priv_t fini_seq_private;
 	u32 seq_priv_size;
+	u32 btf_id;
 };
 
 static struct list_head targets = LIST_HEAD_INIT(targets);
@@ -38,3 +39,32 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 
 	return 0;
 }
+
+bool bpf_iter_prog_supported(struct bpf_prog *prog)
+{
+	const char *attach_fname = prog->aux->attach_func_name;
+	u32 prog_btf_id = prog->aux->attach_btf_id;
+	const char *prefix = BPF_ITER_FUNC_PREFIX;
+	struct bpf_iter_target_info *tinfo;
+	int prefix_len = strlen(prefix);
+	bool supported = false;
+
+	if (strncmp(attach_fname, prefix, prefix_len))
+		return false;
+
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
+			supported = true;
+			break;
+		}
+		if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
+			tinfo->btf_id = prog->aux->attach_btf_id;
+			supported = true;
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+
+	return supported;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 70ad009577f8..d725ff7d11db 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env)
 			return 0;
 		range = tnum_const(0);
 		break;
+	case BPF_PROG_TYPE_TRACING:
+		if (env->prog->expected_attach_type != BPF_TRACE_ITER)
+			return 0;
+		break;
 	default:
 		return 0;
 	}
@@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	struct bpf_prog *tgt_prog = prog->aux->linked_prog;
 	u32 btf_id = prog->aux->attach_btf_id;
 	const char prefix[] = "btf_trace_";
+	struct btf_func_model fmodel;
 	int ret = 0, subprog = -1, i;
 	struct bpf_trampoline *tr;
 	const struct btf_type *t;
@@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		prog->aux->attach_func_proto = t;
 		prog->aux->attach_btf_trace = true;
 		return 0;
+	case BPF_TRACE_ITER:
+		if (!btf_type_is_func(t)) {
+			verbose(env, "attach_btf_id %u is not a function\n",
+				btf_id);
+			return -EINVAL;
+		}
+		t = btf_type_by_id(btf, t->type);
+		if (!btf_type_is_func_proto(t))
+			return -EINVAL;
+		prog->aux->attach_func_name = tname;
+		prog->aux->attach_func_proto = t;
+		if (!bpf_iter_prog_supported(prog))
+			return -EINVAL;
+		ret = btf_distill_func_proto(&env->log, btf, t,
+					     tname, &fmodel);
+		return ret;
 	default:
 		if (!prog_extension)
 			return -EINVAL;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b3643e27e264..047b19fe716e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -218,6 +218,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
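
For reference, loading such a program from user space only requires
setting the new attach type and the target function's BTF id in the
BPF_PROG_LOAD attributes. A minimal sketch using the raw syscall (the
compiled instructions and the btf_id, resolved from vmlinux BTF, are
assumed inputs here):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* insns/insn_cnt: the compiled bpf_iter program;
	 * btf_id: id of e.g. __bpf_iter__bpf_map in vmlinux BTF.
	 */
	static int load_iter_prog(const struct bpf_insn *insns,
				  __u32 insn_cnt, __u32 btf_id)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_TRACING;
		attr.expected_attach_type = BPF_TRACE_ITER;
		attr.attach_btf_id = btf_id;
		attr.insns = (__u64)(unsigned long)insns;
		attr.insn_cnt = insn_cnt;
		attr.license = (__u64)(unsigned long)"GPL";

		return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	}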