diff mbox series

[bpf-next,2/5] bpf: add main_thread_only customization for task/task_file iterators

Message ID 20200827000620.2711963-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: add main_thread_only customization for task/task_file iterators | expand

Commit Message

Yonghong Song Aug. 27, 2020, 12:06 a.m. UTC
Currently, task and task_file by default iterates through
all tasks. For task_file, by default, all files from all tasks
will be traversed.

But for a user process, the file_table is shared by all threads
of that process. So traversing the main thread per process should
be enough to traverse all files and this can save a lot of cpu
time if some process has large number of threads and each thread
has lots of open files.

This patch implemented a customization for task/task_file iterator,
permitting to traverse only the kernel task where its pid equal
to tgid in the kernel. This includes some kernel threads, and
main threads of user processes. This will solve the above potential
performance issue for task_file. This customization may be useful
for task iterator too if only traversing main threads is enough.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  3 ++-
 include/uapi/linux/bpf.h       |  5 ++++
 kernel/bpf/task_iter.c         | 46 +++++++++++++++++++++++-----------
 tools/include/uapi/linux/bpf.h |  5 ++++
 4 files changed, 43 insertions(+), 16 deletions(-)

Comments

Andrii Nakryiko Aug. 27, 2020, 5:07 a.m. UTC | #1
On Wed, Aug 26, 2020 at 5:07 PM Yonghong Song <yhs@fb.com> wrote:
>
> Currently, task and task_file by default iterates through
> all tasks. For task_file, by default, all files from all tasks
> will be traversed.
>
> But for a user process, the file_table is shared by all threads
> of that process. So traversing the main thread per process should
> be enough to traverse all files and this can save a lot of cpu
> time if some process has large number of threads and each thread
> has lots of open files.
>
> This patch implemented a customization for task/task_file iterator,
> permitting to traverse only the kernel task where its pid equal
> to tgid in the kernel. This includes some kernel threads, and
> main threads of user processes. This will solve the above potential
> performance issue for task_file. This customization may be useful
> for task iterator too if only traversing main threads is enough.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |  3 ++-
>  include/uapi/linux/bpf.h       |  5 ++++
>  kernel/bpf/task_iter.c         | 46 +++++++++++++++++++++++-----------
>  tools/include/uapi/linux/bpf.h |  5 ++++
>  4 files changed, 43 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index a6131d95e31e..058eb9b0ba78 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1220,7 +1220,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>         int __init bpf_iter_ ## target(args) { return 0; }
>
>  struct bpf_iter_aux_info {
> -       struct bpf_map *map;
> +       struct bpf_map *map;    /* for iterator traversing map elements */
> +       bool main_thread_only;  /* for task/task_file iterator */

As a user of task_file iterator I'd hate to make this decision,
honestly, especially if I can't prove that all processes share the
same file table (I think clone() syscall allows to do weird
combinations like that, right?). It does make sense for a task/
iterator, though, if I need to iterate a user-space process (group of
tasks). So can we do this:

1a. Either add a new bpf_iter type process/ (or in kernel lingo
task_group/) to iterate only main threads (group_leader in kernel
lingo);
1b. Or make this main_thread_only an option for only a task/ iterator
(and maybe call it "group_leader_only" or something to reflect kernel
terminology?)

2. For task/file iterator, still iterate all tasks, but if the task's
files struct is the same as group_leader's files struct, then go to
the next one. This should eliminate tons of duplication of iterating
the same files over and over. It would still iterate a bunch of tasks,
but compared to the number of files that's generally a much smaller
number, so should still give sizable savings. I don't think we need an
extra option for this, tbh, this behavior was my intuitive
expectation, so I don't think you'll be breaking any sane user of this
iterator.

Disclaimer: I haven't got a chance to look through kernel code much,
so I'm sorry if what I'm proposing is something that is impossible to
implement or extremely hard/unreliable. But thought I'll put this idea
out there before we decide on this.

WDYT?

[...]
Yonghong Song Aug. 27, 2020, 6:09 p.m. UTC | #2
On 8/26/20 10:07 PM, Andrii Nakryiko wrote:
> On Wed, Aug 26, 2020 at 5:07 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Currently, task and task_file by default iterates through
>> all tasks. For task_file, by default, all files from all tasks
>> will be traversed.
>>
>> But for a user process, the file_table is shared by all threads
>> of that process. So traversing the main thread per process should
>> be enough to traverse all files and this can save a lot of cpu
>> time if some process has large number of threads and each thread
>> has lots of open files.
>>
>> This patch implemented a customization for task/task_file iterator,
>> permitting to traverse only the kernel task where its pid equal
>> to tgid in the kernel. This includes some kernel threads, and
>> main threads of user processes. This will solve the above potential
>> performance issue for task_file. This customization may be useful
>> for task iterator too if only traversing main threads is enough.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |  3 ++-
>>   include/uapi/linux/bpf.h       |  5 ++++
>>   kernel/bpf/task_iter.c         | 46 +++++++++++++++++++++++-----------
>>   tools/include/uapi/linux/bpf.h |  5 ++++
>>   4 files changed, 43 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index a6131d95e31e..058eb9b0ba78 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1220,7 +1220,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>>          int __init bpf_iter_ ## target(args) { return 0; }
>>
>>   struct bpf_iter_aux_info {
>> -       struct bpf_map *map;
>> +       struct bpf_map *map;    /* for iterator traversing map elements */
>> +       bool main_thread_only;  /* for task/task_file iterator */
> 
> As a user of task_file iterator I'd hate to make this decision,
> honestly, especially if I can't prove that all processes share the
> same file table (I think clone() syscall allows to do weird
> combinations like that, right?). It does make sense for a task/

Right. the clone() syscall permits different kinds of sharing,
sharing address space and sharing files are separated. It is possible
that address space is shared and files are not shared. That is
why I want to add flags for task_file so that if user knows
they do not have cases where address space shared and files not
shared, they can use main_thread_only to improve performance.

> iterator, though, if I need to iterate a user-space process (group of
> tasks). So can we do this:
> 
> 1a. Either add a new bpf_iter type process/ (or in kernel lingo
> task_group/) to iterate only main threads (group_leader in kernel
> lingo);
> 1b. Or make this main_thread_only an option for only a task/ iterator
> (and maybe call it "group_leader_only" or something to reflect kernel
> terminology?)

The following is the kernel pid_type definition,

enum pid_type
{
         PIDTYPE_PID,
         PIDTYPE_TGID,
         PIDTYPE_PGID,
         PIDTYPE_SID,
         PIDTYPE_MAX,
};

Right now, task iterator is traversed following
PIDTYPE_PID, i.e., every tasks in the system.

To iterate through main thread, we need to traverse following
PIDTYPE_TGID.

In the future, it is possible, people want to traverse
following PIDTYPE_PGID (process group) or PIDTYPE_SID (session).

Or we might have other customization, e.g., cgroup_id, which can
be filtered in the bpf program, but in-kernel filtering can
definitely improve performance.

So I prefer 1b.

Yes, naming is hard.
The name "main_thread_only" is mostly from userspace perspective.
"group_leader_only" might not be good as it may be confused with
possible future process group.

> 
> 2. For task/file iterator, still iterate all tasks, but if the task's
> files struct is the same as group_leader's files struct, then go to
> the next one. This should eliminate tons of duplication of iterating
> the same files over and over. It would still iterate a bunch of tasks,
> but compared to the number of files that's generally a much smaller
> number, so should still give sizable savings. I don't think we need an
> extra option for this, tbh, this behavior was my intuitive
> expectation, so I don't think you'll be breaking any sane user of this
> iterator.

What you suggested makes sense. for task_file iterator, we only promise
to visit all files from all tasks. We can do necessary filtering to 
remove duplicates in the kernel and did not break promise.
I will remove customization from task_file iterator.

> 
> Disclaimer: I haven't got a chance to look through kernel code much,
> so I'm sorry if what I'm proposing is something that is impossible to
> implement or extremely hard/unreliable. But thought I'll put this idea
> out there before we decide on this.
> 
> WDYT?
> 
> [...]
>
Andrii Nakryiko Sept. 2, 2020, 12:34 a.m. UTC | #3
On Thu, Aug 27, 2020 at 11:09 AM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 8/26/20 10:07 PM, Andrii Nakryiko wrote:
> > On Wed, Aug 26, 2020 at 5:07 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Currently, task and task_file by default iterates through
> >> all tasks. For task_file, by default, all files from all tasks
> >> will be traversed.
> >>
> >> But for a user process, the file_table is shared by all threads
> >> of that process. So traversing the main thread per process should
> >> be enough to traverse all files and this can save a lot of cpu
> >> time if some process has large number of threads and each thread
> >> has lots of open files.
> >>
> >> This patch implemented a customization for task/task_file iterator,
> >> permitting to traverse only the kernel task where its pid equal
> >> to tgid in the kernel. This includes some kernel threads, and
> >> main threads of user processes. This will solve the above potential
> >> performance issue for task_file. This customization may be useful
> >> for task iterator too if only traversing main threads is enough.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/linux/bpf.h            |  3 ++-
> >>   include/uapi/linux/bpf.h       |  5 ++++
> >>   kernel/bpf/task_iter.c         | 46 +++++++++++++++++++++++-----------
> >>   tools/include/uapi/linux/bpf.h |  5 ++++
> >>   4 files changed, 43 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index a6131d95e31e..058eb9b0ba78 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -1220,7 +1220,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> >>          int __init bpf_iter_ ## target(args) { return 0; }
> >>
> >>   struct bpf_iter_aux_info {
> >> -       struct bpf_map *map;
> >> +       struct bpf_map *map;    /* for iterator traversing map elements */
> >> +       bool main_thread_only;  /* for task/task_file iterator */
> >
> > As a user of task_file iterator I'd hate to make this decision,
> > honestly, especially if I can't prove that all processes share the
> > same file table (I think clone() syscall allows to do weird
> > combinations like that, right?). It does make sense for a task/
>
> Right. the clone() syscall permits different kinds of sharing,
> sharing address space and sharing files are separated. It is possible
> that address space is shared and files are not shared. That is
> why I want to add flags for task_file so that if user knows
> they do not have cases where address space shared and files not
> shared, they can use main_thread_only to improve performance.

The problem with such options is that for a lot of users it won't be
clear at all when and if those options can be used. E.g., when I
imagine myself building some generic tool utilizing task_file bpf_iter
that is supposed to be run on any server, how do I know if it's safe
to specify "main_thread_only"? I can't guarantee that. So I'll be
forced to either risk it or fallback to default and unnecessarily slow
behavior.

This is different for task/ bpf_iter, though, so I support that for
task/ only. But you've already done that split, so thank you! :)

>
> > iterator, though, if I need to iterate a user-space process (group of
> > tasks). So can we do this:
> >
> > 1a. Either add a new bpf_iter type process/ (or in kernel lingo
> > task_group/) to iterate only main threads (group_leader in kernel
> > lingo);
> > 1b. Or make this main_thread_only an option for only a task/ iterator
> > (and maybe call it "group_leader_only" or something to reflect kernel
> > terminology?)
>
> The following is the kernel pid_type definition,
>
> enum pid_type
> {
>          PIDTYPE_PID,
>          PIDTYPE_TGID,
>          PIDTYPE_PGID,
>          PIDTYPE_SID,
>          PIDTYPE_MAX,
> };
>
> Right now, task iterator is traversed following
> PIDTYPE_PID, i.e., every tasks in the system.
>
> To iterate through main thread, we need to traverse following
> PIDTYPE_TGID.
>
> In the future, it is possible, people want to traverse
> following PIDTYPE_PGID (process group) or PIDTYPE_SID (session).
>
> Or we might have other customization, e.g., cgroup_id, which can
> be filtered in the bpf program, but in-kernel filtering can
> definitely improve performance.
>
> So I prefer 1b.

Sounds good, but let's use a proper enum, not a set of bit fields. We
can support all 4 from the get go. There is no need to wait for the
actual use case to appear, as the iteration semantics is pretty clear.

>
> Yes, naming is hard.
> The name "main_thread_only" is mostly from userspace perspective.
> "group_leader_only" might not be good as it may be confused with
> possible future process group.
>
> >
> > 2. For task/file iterator, still iterate all tasks, but if the task's
> > files struct is the same as group_leader's files struct, then go to
> > the next one. This should eliminate tons of duplication of iterating
> > the same files over and over. It would still iterate a bunch of tasks,
> > but compared to the number of files that's generally a much smaller
> > number, so should still give sizable savings. I don't think we need an
> > extra option for this, tbh, this behavior was my intuitive
> > expectation, so I don't think you'll be breaking any sane user of this
> > iterator.
>
> What you suggested makes sense. for task_file iterator, we only promise
> to visit all files from all tasks. We can do necessary filtering to
> remove duplicates in the kernel and did not break promise.
> I will remove customization from task_file iterator.

Thanks for doing that!

>
> >
> > Disclaimer: I haven't got a chance to look through kernel code much,
> > so I'm sorry if what I'm proposing is something that is impossible to
> > implement or extremely hard/unreliable. But thought I'll put this idea
> > out there before we decide on this.
> >
> > WDYT?
> >
> > [...]
> >
Yonghong Song Sept. 2, 2020, 2:23 a.m. UTC | #4
On 9/1/20 5:34 PM, Andrii Nakryiko wrote:
> On Thu, Aug 27, 2020 at 11:09 AM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 8/26/20 10:07 PM, Andrii Nakryiko wrote:
>>> On Wed, Aug 26, 2020 at 5:07 PM Yonghong Song <yhs@fb.com> wrote:
>>>>
>>>> Currently, task and task_file by default iterates through
>>>> all tasks. For task_file, by default, all files from all tasks
>>>> will be traversed.
>>>>
>>>> But for a user process, the file_table is shared by all threads
>>>> of that process. So traversing the main thread per process should
>>>> be enough to traverse all files and this can save a lot of cpu
>>>> time if some process has large number of threads and each thread
>>>> has lots of open files.
>>>>
>>>> This patch implemented a customization for task/task_file iterator,
>>>> permitting to traverse only the kernel task where its pid equal
>>>> to tgid in the kernel. This includes some kernel threads, and
>>>> main threads of user processes. This will solve the above potential
>>>> performance issue for task_file. This customization may be useful
>>>> for task iterator too if only traversing main threads is enough.
>>>>
>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>> ---
>>>>    include/linux/bpf.h            |  3 ++-
>>>>    include/uapi/linux/bpf.h       |  5 ++++
>>>>    kernel/bpf/task_iter.c         | 46 +++++++++++++++++++++++-----------
>>>>    tools/include/uapi/linux/bpf.h |  5 ++++
>>>>    4 files changed, 43 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index a6131d95e31e..058eb9b0ba78 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -1220,7 +1220,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>>>>           int __init bpf_iter_ ## target(args) { return 0; }
>>>>
>>>>    struct bpf_iter_aux_info {
>>>> -       struct bpf_map *map;
>>>> +       struct bpf_map *map;    /* for iterator traversing map elements */
>>>> +       bool main_thread_only;  /* for task/task_file iterator */
>>>
>>> As a user of task_file iterator I'd hate to make this decision,
>>> honestly, especially if I can't prove that all processes share the
>>> same file table (I think clone() syscall allows to do weird
>>> combinations like that, right?). It does make sense for a task/
>>
>> Right. the clone() syscall permits different kinds of sharing,
>> sharing address space and sharing files are separated. It is possible
>> that address space is shared and files are not shared. That is
>> why I want to add flags for task_file so that if user knows
>> they do not have cases where address space shared and files not
>> shared, they can use main_thread_only to improve performance.
> 
> The problem with such options is that for a lot of users it won't be
> clear at all when and if those options can be used. E.g., when I
> imagine myself building some generic tool utilizing task_file bpf_iter
> that is supposed to be run on any server, how do I know if it's safe
> to specify "main_thread_only"? I can't guarantee that. So I'll be
> forced to either risk it or fallback to default and unnecessarily slow
> behavior.
> 
> This is different for task/ bpf_iter, though, so I support that for
> task/ only. But you've already done that split, so thank you! :)
> 
>>
>>> iterator, though, if I need to iterate a user-space process (group of
>>> tasks). So can we do this:
>>>
>>> 1a. Either add a new bpf_iter type process/ (or in kernel lingo
>>> task_group/) to iterate only main threads (group_leader in kernel
>>> lingo);
>>> 1b. Or make this main_thread_only an option for only a task/ iterator
>>> (and maybe call it "group_leader_only" or something to reflect kernel
>>> terminology?)
>>
>> The following is the kernel pid_type definition,
>>
>> enum pid_type
>> {
>>           PIDTYPE_PID,
>>           PIDTYPE_TGID,
>>           PIDTYPE_PGID,
>>           PIDTYPE_SID,
>>           PIDTYPE_MAX,
>> };
>>
>> Right now, task iterator is traversed following
>> PIDTYPE_PID, i.e., every tasks in the system.
>>
>> To iterate through main thread, we need to traverse following
>> PIDTYPE_TGID.
>>
>> In the future, it is possible, people want to traverse
>> following PIDTYPE_PGID (process group) or PIDTYPE_SID (session).
>>
>> Or we might have other customization, e.g., cgroup_id, which can
>> be filtered in the bpf program, but in-kernel filtering can
>> definitely improve performance.
>>
>> So I prefer 1b.
> 
> Sounds good, but let's use a proper enum, not a set of bit fields. We
> can support all 4 from the get go. There is no need to wait for the
> actual use case to appear, as the iteration semantics is pretty clear.

Agree. enum is better than individual bit fields esp. considering they
are mutually exclusive. Will design the interface along this line
when time comes to implement those customization. Thanks!

> 
>>
>> Yes, naming is hard.
>> The name "main_thread_only" is mostly from userspace perspective.
>> "group_leader_only" might not be good as it may be confused with
>> possible future process group.
>>
>>>
>>> 2. For task/file iterator, still iterate all tasks, but if the task's
>>> files struct is the same as group_leader's files struct, then go to
>>> the next one. This should eliminate tons of duplication of iterating
>>> the same files over and over. It would still iterate a bunch of tasks,
>>> but compared to the number of files that's generally a much smaller
>>> number, so should still give sizable savings. I don't think we need an
>>> extra option for this, tbh, this behavior was my intuitive
>>> expectation, so I don't think you'll be breaking any sane user of this
>>> iterator.
>>
>> What you suggested makes sense. for task_file iterator, we only promise
>> to visit all files from all tasks. We can do necessary filtering to
>> remove duplicates in the kernel and did not break promise.
>> I will remove customization from task_file iterator.
> 
> Thanks for doing that!
> 
>>
>>>
>>> Disclaimer: I haven't got a chance to look through kernel code much,
>>> so I'm sorry if what I'm proposing is something that is impossible to
>>> implement or extremely hard/unreliable. But thought I'll put this idea
>>> out there before we decide on this.
>>>
>>> WDYT?
>>>
>>> [...]
>>>
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a6131d95e31e..058eb9b0ba78 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1220,7 +1220,8 @@  int bpf_obj_get_user(const char __user *pathname, int flags);
 	int __init bpf_iter_ ## target(args) { return 0; }
 
 struct bpf_iter_aux_info {
-	struct bpf_map *map;
+	struct bpf_map *map;	/* for iterator traversing map elements */
+	bool main_thread_only;	/* for task/task_file iterator */
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ef7af384f5ee..af5c600bf673 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -85,6 +85,11 @@  union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+
+	struct {
+		__u32	main_thread_only:1;
+		__u32	:31;
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 232df29793e9..362bf2dda63a 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -11,19 +11,22 @@ 
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
+	bool main_thread_only;
 };
 
 struct bpf_iter_seq_task_info {
 	/* The first field must be struct bpf_iter_seq_task_common.
-	 * this is assumed by {init, fini}_seq_pidns() callback functions.
+	 * this is assumed by {init, fini}_seq_task_common() callback functions.
 	 */
 	struct bpf_iter_seq_task_common common;
 	u32 tid;
 };
 
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
-					     u32 *tid)
+static struct task_struct *task_seq_get_next(
+	struct bpf_iter_seq_task_common *task_common, u32 *tid)
 {
+	bool main_thread_only = task_common->main_thread_only;
+	struct pid_namespace *ns = task_common->ns;
 	struct task_struct *task = NULL;
 	struct pid *pid;
 
@@ -31,7 +34,10 @@  static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
 retry:
 	pid = idr_get_next(&ns->idr, tid);
 	if (pid) {
-		task = get_pid_task(pid, PIDTYPE_PID);
+		if (main_thread_only)
+			task = get_pid_task(pid, PIDTYPE_TGID);
+		else
+			task = get_pid_task(pid, PIDTYPE_PID);
 		if (!task) {
 			++*tid;
 			goto retry;
@@ -47,7 +53,7 @@  static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_task_info *info = seq->private;
 	struct task_struct *task;
 
-	task = task_seq_get_next(info->common.ns, &info->tid);
+	task = task_seq_get_next(&info->common, &info->tid);
 	if (!task)
 		return NULL;
 
@@ -64,7 +70,7 @@  static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	++*pos;
 	++info->tid;
 	put_task_struct((struct task_struct *)v);
-	task = task_seq_get_next(info->common.ns, &info->tid);
+	task = task_seq_get_next(&info->common, &info->tid);
 	if (!task)
 		return NULL;
 
@@ -118,7 +124,7 @@  static const struct seq_operations task_seq_ops = {
 
 struct bpf_iter_seq_task_file_info {
 	/* The first field must be struct bpf_iter_seq_task_common.
-	 * this is assumed by {init, fini}_seq_pidns() callback functions.
+	 * this is assumed by {init, fini}_seq_task_common() callback functions.
 	 */
 	struct bpf_iter_seq_task_common common;
 	struct task_struct *task;
@@ -131,7 +137,6 @@  static struct file *
 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
 		       struct task_struct **task, struct files_struct **fstruct)
 {
-	struct pid_namespace *ns = info->common.ns;
 	u32 curr_tid = info->tid, max_fds;
 	struct files_struct *curr_files;
 	struct task_struct *curr_task;
@@ -147,7 +152,7 @@  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
 		curr_files = *fstruct;
 		curr_fd = info->fd;
 	} else {
-		curr_task = task_seq_get_next(ns, &curr_tid);
+		curr_task = task_seq_get_next(&info->common, &curr_tid);
 		if (!curr_task)
 			return NULL;
 
@@ -293,15 +298,16 @@  static void task_file_seq_stop(struct seq_file *seq, void *v)
 	}
 }
 
-static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
+static int init_seq_task_common(void *priv_data, struct bpf_iter_aux_info *aux)
 {
 	struct bpf_iter_seq_task_common *common = priv_data;
 
 	common->ns = get_pid_ns(task_active_pid_ns(current));
+	common->main_thread_only = aux->main_thread_only;
 	return 0;
 }
 
-static void fini_seq_pidns(void *priv_data)
+static void fini_seq_task_common(void *priv_data)
 {
 	struct bpf_iter_seq_task_common *common = priv_data;
 
@@ -315,19 +321,28 @@  static const struct seq_operations task_file_seq_ops = {
 	.show	= task_file_seq_show,
 };
 
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+				union bpf_iter_link_info *linfo,
+				struct bpf_iter_aux_info *aux)
+{
+	aux->main_thread_only = linfo->task.main_thread_only;
+	return 0;
+}
+
 BTF_ID_LIST(btf_task_file_ids)
 BTF_ID(struct, task_struct)
 BTF_ID(struct, file)
 
 static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_ops		= &task_seq_ops,
-	.init_seq_private	= init_seq_pidns,
-	.fini_seq_private	= fini_seq_pidns,
+	.init_seq_private	= init_seq_task_common,
+	.fini_seq_private	= fini_seq_task_common,
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
 };
 
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
+	.attach_target		= bpf_iter_attach_task,
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__task, task),
@@ -338,13 +353,14 @@  static struct bpf_iter_reg task_reg_info = {
 
 static const struct bpf_iter_seq_info task_file_seq_info = {
 	.seq_ops		= &task_file_seq_ops,
-	.init_seq_private	= init_seq_pidns,
-	.fini_seq_private	= fini_seq_pidns,
+	.init_seq_private	= init_seq_task_common,
+	.fini_seq_private	= fini_seq_task_common,
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
 };
 
 static struct bpf_iter_reg task_file_reg_info = {
 	.target			= "task_file",
+	.attach_target		= bpf_iter_attach_task,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__task_file, task),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ef7af384f5ee..af5c600bf673 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -85,6 +85,11 @@  union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+
+	struct {
+		__u32	main_thread_only:1;
+		__u32	:31;
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for details. */