diff mbox series

[bpf-next,v2,03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE

Message ID 20200504062549.2047531-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: implement bpf iterator for kernel data | expand

Commit Message

Yonghong Song May 4, 2020, 6:25 a.m. UTC
Given a bpf program, the step to create an anonymous bpf iterator is:
  - create a bpf_iter_link, which combines bpf program and the target.
    In the future, there could be more information recorded in the link.
    A link_fd will be returned to the user space.
  - create an anonymous bpf iterator with the given link_fd.

The bpf_iter_link can be pinned to bpffs mount file system to
create a file based bpf iterator as well.

The benefit to use of bpf_iter_link:
  - using bpf link simplifies design and implementation as bpf link
    is used for other tracing bpf programs.
  - for file based bpf iterator, bpf_iter_link provides a standard
    way to replace underlying bpf programs.
  - for both anonymous and free based iterators, bpf link query
    capability can be leveraged.

The patch added support of tracing/iter programs for BPF_LINK_CREATE.
A new link type BPF_LINK_TYPE_ITER is added to facilitate link
querying. Currently, only prog_id is needed, so there is no
additional in-kernel show_fdinfo() and fill_link_info() hook
is needed for BPF_LINK_TYPE_ITER link.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  1 +
 include/linux/bpf_types.h      |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           | 14 ++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 80 insertions(+)

Comments

Andrii Nakryiko May 5, 2020, 9:30 p.m. UTC | #1
On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Given a bpf program, the step to create an anonymous bpf iterator is:
>   - create a bpf_iter_link, which combines bpf program and the target.
>     In the future, there could be more information recorded in the link.
>     A link_fd will be returned to the user space.
>   - create an anonymous bpf iterator with the given link_fd.
>
> The bpf_iter_link can be pinned to bpffs mount file system to
> create a file based bpf iterator as well.
>
> The benefit to use of bpf_iter_link:
>   - using bpf link simplifies design and implementation as bpf link
>     is used for other tracing bpf programs.
>   - for file based bpf iterator, bpf_iter_link provides a standard
>     way to replace underlying bpf programs.
>   - for both anonymous and free based iterators, bpf link query
>     capability can be leveraged.
>
> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
> querying. Currently, only prog_id is needed, so there is no
> additional in-kernel show_fdinfo() and fill_link_info() hook
> is needed for BPF_LINK_TYPE_ITER link.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM. See small nit about __GFP_NOWARN.

Acked-by: Andrii Nakryiko <andriin@fb.com>


>  include/linux/bpf.h            |  1 +
>  include/linux/bpf_types.h      |  1 +
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c           | 14 ++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  6 files changed, 80 insertions(+)
>

[...]

> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
> +{
> +       struct bpf_link_primer link_primer;
> +       struct bpf_iter_target_info *tinfo;
> +       struct bpf_iter_link *link;
> +       bool existed = false;
> +       u32 prog_btf_id;
> +       int err;
> +
> +       if (attr->link_create.target_fd || attr->link_create.flags)
> +               return -EINVAL;
> +
> +       prog_btf_id = prog->aux->attach_btf_id;
> +       mutex_lock(&targets_mutex);
> +       list_for_each_entry(tinfo, &targets, list) {
> +               if (tinfo->btf_id == prog_btf_id) {
> +                       existed = true;
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&targets_mutex);
> +       if (!existed)
> +               return -ENOENT;
> +
> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);

nit: all existing link implementation don't specify __GFP_NOWARN,
wonder if bpf_iter_link should be special?

> +       if (!link)
> +               return -ENOMEM;
> +
> +       bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
> +       link->tinfo = tinfo;
> +
> +       err  = bpf_link_prime(&link->link, &link_primer);
> +       if (err) {
> +               kfree(link);
> +               return err;
> +       }
> +
> +       return bpf_link_settle(&link_primer);
> +}

[...]
Yonghong Song May 6, 2020, 12:14 a.m. UTC | #2
On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Given a bpf program, the step to create an anonymous bpf iterator is:
>>    - create a bpf_iter_link, which combines bpf program and the target.
>>      In the future, there could be more information recorded in the link.
>>      A link_fd will be returned to the user space.
>>    - create an anonymous bpf iterator with the given link_fd.
>>
>> The bpf_iter_link can be pinned to bpffs mount file system to
>> create a file based bpf iterator as well.
>>
>> The benefit to use of bpf_iter_link:
>>    - using bpf link simplifies design and implementation as bpf link
>>      is used for other tracing bpf programs.
>>    - for file based bpf iterator, bpf_iter_link provides a standard
>>      way to replace underlying bpf programs.
>>    - for both anonymous and free based iterators, bpf link query
>>      capability can be leveraged.
>>
>> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
>> querying. Currently, only prog_id is needed, so there is no
>> additional in-kernel show_fdinfo() and fill_link_info() hook
>> is needed for BPF_LINK_TYPE_ITER link.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> LGTM. See small nit about __GFP_NOWARN.
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
> 
>>   include/linux/bpf.h            |  1 +
>>   include/linux/bpf_types.h      |  1 +
>>   include/uapi/linux/bpf.h       |  1 +
>>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>   kernel/bpf/syscall.c           | 14 ++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   6 files changed, 80 insertions(+)
>>
> 
> [...]
> 
>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>> +{
>> +       struct bpf_link_primer link_primer;
>> +       struct bpf_iter_target_info *tinfo;
>> +       struct bpf_iter_link *link;
>> +       bool existed = false;
>> +       u32 prog_btf_id;
>> +       int err;
>> +
>> +       if (attr->link_create.target_fd || attr->link_create.flags)
>> +               return -EINVAL;
>> +
>> +       prog_btf_id = prog->aux->attach_btf_id;
>> +       mutex_lock(&targets_mutex);
>> +       list_for_each_entry(tinfo, &targets, list) {
>> +               if (tinfo->btf_id == prog_btf_id) {
>> +                       existed = true;
>> +                       break;
>> +               }
>> +       }
>> +       mutex_unlock(&targets_mutex);
>> +       if (!existed)
>> +               return -ENOENT;
>> +
>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
> 
> nit: all existing link implementation don't specify __GFP_NOWARN,
> wonder if bpf_iter_link should be special?

Nothing special. Just feel __GFP_NOWARN is the right thing to do to 
avoid pollute dmesg since -ENOMEM is returned to user space. But in
reality, unlike some key/value allocation where the size could be huge
and __GFP_NOWARN might be more useful, here, sizeof(*link) is fixed
and small, __GFP_NOWARN probably not that useful.

Will drop it.

> 
>> +       if (!link)
>> +               return -ENOMEM;
>> +
>> +       bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
>> +       link->tinfo = tinfo;
>> +
>> +       err  = bpf_link_prime(&link->link, &link_primer);
>> +       if (err) {
>> +               kfree(link);
>> +               return err;
>> +       }
>> +
>> +       return bpf_link_settle(&link_primer);
>> +}
> 
> [...]
>
Alexei Starovoitov May 6, 2020, 12:54 a.m. UTC | #3
On 5/5/20 5:14 PM, Yonghong Song wrote:
> 
> 
> On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
>> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>>
>>> Given a bpf program, the step to create an anonymous bpf iterator is:
>>>    - create a bpf_iter_link, which combines bpf program and the target.
>>>      In the future, there could be more information recorded in the 
>>> link.
>>>      A link_fd will be returned to the user space.
>>>    - create an anonymous bpf iterator with the given link_fd.
>>>
>>> The bpf_iter_link can be pinned to bpffs mount file system to
>>> create a file based bpf iterator as well.
>>>
>>> The benefit to use of bpf_iter_link:
>>>    - using bpf link simplifies design and implementation as bpf link
>>>      is used for other tracing bpf programs.
>>>    - for file based bpf iterator, bpf_iter_link provides a standard
>>>      way to replace underlying bpf programs.
>>>    - for both anonymous and free based iterators, bpf link query
>>>      capability can be leveraged.
>>>
>>> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
>>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
>>> querying. Currently, only prog_id is needed, so there is no
>>> additional in-kernel show_fdinfo() and fill_link_info() hook
>>> is needed for BPF_LINK_TYPE_ITER link.
>>>
>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>> ---
>>
>> LGTM. See small nit about __GFP_NOWARN.
>>
>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>>
>>
>>>   include/linux/bpf.h            |  1 +
>>>   include/linux/bpf_types.h      |  1 +
>>>   include/uapi/linux/bpf.h       |  1 +
>>>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>>   kernel/bpf/syscall.c           | 14 ++++++++
>>>   tools/include/uapi/linux/bpf.h |  1 +
>>>   6 files changed, 80 insertions(+)
>>>
>>
>> [...]
>>
>>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog 
>>> *prog)
>>> +{
>>> +       struct bpf_link_primer link_primer;
>>> +       struct bpf_iter_target_info *tinfo;
>>> +       struct bpf_iter_link *link;
>>> +       bool existed = false;
>>> +       u32 prog_btf_id;
>>> +       int err;
>>> +
>>> +       if (attr->link_create.target_fd || attr->link_create.flags)
>>> +               return -EINVAL;
>>> +
>>> +       prog_btf_id = prog->aux->attach_btf_id;
>>> +       mutex_lock(&targets_mutex);
>>> +       list_for_each_entry(tinfo, &targets, list) {
>>> +               if (tinfo->btf_id == prog_btf_id) {
>>> +                       existed = true;
>>> +                       break;
>>> +               }
>>> +       }
>>> +       mutex_unlock(&targets_mutex);
>>> +       if (!existed)
>>> +               return -ENOENT;
>>> +
>>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
>>
>> nit: all existing link implementation don't specify __GFP_NOWARN,
>> wonder if bpf_iter_link should be special?
> 
> Nothing special. Just feel __GFP_NOWARN is the right thing to do to 
> avoid pollute dmesg since -ENOMEM is returned to user space. But in
> reality, unlike some key/value allocation where the size could be huge
> and __GFP_NOWARN might be more useful, here, sizeof(*link) is fixed
> and small, __GFP_NOWARN probably not that useful.
> 
> Will drop it.

actually all existing user space driven allocation have nowarn.
If we missed it in other link allocs we should probably add it.
Andrii Nakryiko May 6, 2020, 3:09 a.m. UTC | #4
On Tue, May 5, 2020 at 5:54 PM Alexei Starovoitov <ast@fb.com> wrote:
>
> On 5/5/20 5:14 PM, Yonghong Song wrote:
> >
> >
> > On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
> >> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>>
> >>> Given a bpf program, the step to create an anonymous bpf iterator is:
> >>>    - create a bpf_iter_link, which combines bpf program and the target.
> >>>      In the future, there could be more information recorded in the
> >>> link.
> >>>      A link_fd will be returned to the user space.
> >>>    - create an anonymous bpf iterator with the given link_fd.
> >>>
> >>> The bpf_iter_link can be pinned to bpffs mount file system to
> >>> create a file based bpf iterator as well.
> >>>
> >>> The benefit to use of bpf_iter_link:
> >>>    - using bpf link simplifies design and implementation as bpf link
> >>>      is used for other tracing bpf programs.
> >>>    - for file based bpf iterator, bpf_iter_link provides a standard
> >>>      way to replace underlying bpf programs.
> >>>    - for both anonymous and free based iterators, bpf link query
> >>>      capability can be leveraged.
> >>>
> >>> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
> >>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
> >>> querying. Currently, only prog_id is needed, so there is no
> >>> additional in-kernel show_fdinfo() and fill_link_info() hook
> >>> is needed for BPF_LINK_TYPE_ITER link.
> >>>
> >>> Signed-off-by: Yonghong Song <yhs@fb.com>
> >>> ---
> >>
> >> LGTM. See small nit about __GFP_NOWARN.
> >>
> >> Acked-by: Andrii Nakryiko <andriin@fb.com>
> >>
> >>
> >>>   include/linux/bpf.h            |  1 +
> >>>   include/linux/bpf_types.h      |  1 +
> >>>   include/uapi/linux/bpf.h       |  1 +
> >>>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
> >>>   kernel/bpf/syscall.c           | 14 ++++++++
> >>>   tools/include/uapi/linux/bpf.h |  1 +
> >>>   6 files changed, 80 insertions(+)
> >>>
> >>
> >> [...]
> >>
> >>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog
> >>> *prog)
> >>> +{
> >>> +       struct bpf_link_primer link_primer;
> >>> +       struct bpf_iter_target_info *tinfo;
> >>> +       struct bpf_iter_link *link;
> >>> +       bool existed = false;
> >>> +       u32 prog_btf_id;
> >>> +       int err;
> >>> +
> >>> +       if (attr->link_create.target_fd || attr->link_create.flags)
> >>> +               return -EINVAL;
> >>> +
> >>> +       prog_btf_id = prog->aux->attach_btf_id;
> >>> +       mutex_lock(&targets_mutex);
> >>> +       list_for_each_entry(tinfo, &targets, list) {
> >>> +               if (tinfo->btf_id == prog_btf_id) {
> >>> +                       existed = true;
> >>> +                       break;
> >>> +               }
> >>> +       }
> >>> +       mutex_unlock(&targets_mutex);
> >>> +       if (!existed)
> >>> +               return -ENOENT;
> >>> +
> >>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
> >>
> >> nit: all existing link implementation don't specify __GFP_NOWARN,
> >> wonder if bpf_iter_link should be special?
> >
> > Nothing special. Just feel __GFP_NOWARN is the right thing to do to
> > avoid pollute dmesg since -ENOMEM is returned to user space. But in
> > reality, unlike some key/value allocation where the size could be huge
> > and __GFP_NOWARN might be more useful, here, sizeof(*link) is fixed
> > and small, __GFP_NOWARN probably not that useful.
> >
> > Will drop it.
>
> actually all existing user space driven allocation have nowarn.

Can you define "user space driven"? I understand why for map, map key,
map value, program we want to do that, because it's way too easy for
user-space to specify huge sizes and allocation is proportional to
that size. But in this case links are fixed-sized objects, same as
struct file and struct inode. From BPF world, for instance, there is
struct bpf_prog_list, which is created when user is attaching BPF
program to cgroup, so it is user-space driven in similar sense. Yet we
allocate it without __GFP_NOWARN.

> If we missed it in other link allocs we should probably add it.

Before bpf_link was formalized, raw_tracepoint_open was creating
struct bpf_raw_tracepoint, without NOWARN.
Alexei Starovoitov May 6, 2020, 6:08 p.m. UTC | #5
On 5/5/20 8:09 PM, Andrii Nakryiko wrote:
> On Tue, May 5, 2020 at 5:54 PM Alexei Starovoitov <ast@fb.com> wrote:
>>
>> On 5/5/20 5:14 PM, Yonghong Song wrote:
>>>
>>>
>>> On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
>>>> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>>>>
>>>>> Given a bpf program, the step to create an anonymous bpf iterator is:
>>>>>     - create a bpf_iter_link, which combines bpf program and the target.
>>>>>       In the future, there could be more information recorded in the
>>>>> link.
>>>>>       A link_fd will be returned to the user space.
>>>>>     - create an anonymous bpf iterator with the given link_fd.
>>>>>
>>>>> The bpf_iter_link can be pinned to bpffs mount file system to
>>>>> create a file based bpf iterator as well.
>>>>>
>>>>> The benefit to use of bpf_iter_link:
>>>>>     - using bpf link simplifies design and implementation as bpf link
>>>>>       is used for other tracing bpf programs.
>>>>>     - for file based bpf iterator, bpf_iter_link provides a standard
>>>>>       way to replace underlying bpf programs.
>>>>>     - for both anonymous and free based iterators, bpf link query
>>>>>       capability can be leveraged.
>>>>>
>>>>> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
>>>>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
>>>>> querying. Currently, only prog_id is needed, so there is no
>>>>> additional in-kernel show_fdinfo() and fill_link_info() hook
>>>>> is needed for BPF_LINK_TYPE_ITER link.
>>>>>
>>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>>> ---
>>>>
>>>> LGTM. See small nit about __GFP_NOWARN.
>>>>
>>>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>>>>
>>>>
>>>>>    include/linux/bpf.h            |  1 +
>>>>>    include/linux/bpf_types.h      |  1 +
>>>>>    include/uapi/linux/bpf.h       |  1 +
>>>>>    kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>>>>    kernel/bpf/syscall.c           | 14 ++++++++
>>>>>    tools/include/uapi/linux/bpf.h |  1 +
>>>>>    6 files changed, 80 insertions(+)
>>>>>
>>>>
>>>> [...]
>>>>
>>>>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog
>>>>> *prog)
>>>>> +{
>>>>> +       struct bpf_link_primer link_primer;
>>>>> +       struct bpf_iter_target_info *tinfo;
>>>>> +       struct bpf_iter_link *link;
>>>>> +       bool existed = false;
>>>>> +       u32 prog_btf_id;
>>>>> +       int err;
>>>>> +
>>>>> +       if (attr->link_create.target_fd || attr->link_create.flags)
>>>>> +               return -EINVAL;
>>>>> +
>>>>> +       prog_btf_id = prog->aux->attach_btf_id;
>>>>> +       mutex_lock(&targets_mutex);
>>>>> +       list_for_each_entry(tinfo, &targets, list) {
>>>>> +               if (tinfo->btf_id == prog_btf_id) {
>>>>> +                       existed = true;
>>>>> +                       break;
>>>>> +               }
>>>>> +       }
>>>>> +       mutex_unlock(&targets_mutex);
>>>>> +       if (!existed)
>>>>> +               return -ENOENT;
>>>>> +
>>>>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
>>>>
>>>> nit: all existing link implementation don't specify __GFP_NOWARN,
>>>> wonder if bpf_iter_link should be special?
>>>
>>> Nothing special. Just feel __GFP_NOWARN is the right thing to do to
>>> avoid pollute dmesg since -ENOMEM is returned to user space. But in
>>> reality, unlike some key/value allocation where the size could be huge
>>> and __GFP_NOWARN might be more useful, here, sizeof(*link) is fixed
>>> and small, __GFP_NOWARN probably not that useful.
>>>
>>> Will drop it.
>>
>> actually all existing user space driven allocation have nowarn.
> 
> Can you define "user space driven"? I understand why for map, map key,
> map value, program we want to do that, because it's way too easy for
> user-space to specify huge sizes and allocation is proportional to
> that size. But in this case links are fixed-sized objects, same as
> struct file and struct inode. From BPF world, for instance, there is
> struct bpf_prog_list, which is created when user is attaching BPF
> program to cgroup, so it is user-space driven in similar sense. Yet we
> allocate it without __GFP_NOWARN.

For tiny objects it doesn't really matter. If slab cannot allocate
another single page the system is in bad shape and warn is good
to have in most cases, but when it's user driven like here that
warn won't help kernel developers debug ooms. Most likely NIC driver
is spamming page alloc warn at this point.
In this particular case bpf_iter arguments will likely grow
and struct will grow too, but probably not the point of kmalloc_large,
so it's really fine which ever way.
Personally I would keep nowarn here.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd385c36a172..8621ad080b24 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1142,6 +1142,7 @@  struct bpf_iter_reg {
 
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 8345cdf553b8..29d22752fc87 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -124,3 +124,4 @@  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
 #ifdef CONFIG_CGROUP_BPF
 BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
 #endif
+BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 047b19fe716e..2bf33979f9ae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -229,6 +229,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index c1fae67a1452..cc0e205fb8c5 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -15,6 +15,11 @@  struct bpf_iter_target_info {
 	u32 btf_id;
 };
 
+struct bpf_iter_link {
+	struct bpf_link link;
+	struct bpf_iter_target_info *tinfo;
+};
+
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
@@ -68,3 +73,60 @@  bool bpf_iter_prog_supported(struct bpf_prog *prog)
 
 	return supported;
 }
+
+static void bpf_iter_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_iter_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_iter_link *iter_link =
+		container_of(link, struct bpf_iter_link, link);
+
+	kfree(iter_link);
+}
+
+static const struct bpf_link_ops bpf_iter_link_lops = {
+	.release = bpf_iter_link_release,
+	.dealloc = bpf_iter_link_dealloc,
+};
+
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_iter_link *link;
+	bool existed = false;
+	u32 prog_btf_id;
+	int err;
+
+	if (attr->link_create.target_fd || attr->link_create.flags)
+		return -EINVAL;
+
+	prog_btf_id = prog->aux->attach_btf_id;
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (tinfo->btf_id == prog_btf_id) {
+			existed = true;
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+	if (!existed)
+		return -ENOENT;
+
+	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+	link->tinfo = tinfo;
+
+	err  = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		return err;
+	}
+
+	return bpf_link_settle(&link_primer);
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bb1ab7da6103..6ffe2d8fb6c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2729,6 +2729,8 @@  attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+	case BPF_TRACE_ITER:
+		return BPF_PROG_TYPE_TRACING;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3729,6 +3731,15 @@  static int bpf_map_do_batch(const union bpf_attr *attr,
 	return err;
 }
 
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	if (attr->link_create.attach_type == BPF_TRACE_ITER &&
+	    prog->expected_attach_type == BPF_TRACE_ITER)
+		return bpf_iter_link_attach(attr, prog);
+
+	return -EINVAL;
+}
+
 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags
 static int link_create(union bpf_attr *attr)
 {
@@ -3765,6 +3776,9 @@  static int link_create(union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_TRACING:
+		ret = tracing_bpf_link_attach(attr, prog);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 047b19fe716e..2bf33979f9ae 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -229,6 +229,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };