diff mbox series

[bpf-next,v2,11/20] bpf: add task and task/file iterator targets

Message ID 20200504062559.2048228-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: implement bpf iterator for kernel data | expand

Commit Message

Yonghong Song May 4, 2020, 6:25 a.m. UTC
Only the tasks belonging to "current" pid namespace
are enumerated.

For task/file target, the bpf program will have access to
  struct task_struct *task
  u32 fd
  struct file *file
where fd/file is an open file for the task.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/Makefile    |   2 +-
 kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 337 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/task_iter.c

Comments

Andrii Nakryiko May 6, 2020, 7:30 a.m. UTC | #1
On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>
> Only the tasks belonging to "current" pid namespace
> are enumerated.
>
> For task/file target, the bpf program will have access to
>   struct task_struct *task
>   u32 fd
>   struct file *file
> where fd/file is an open file for the task.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

I might be missing some subtleties with task refcounting for task_file
iterator, asked few questions below...

>  kernel/bpf/Makefile    |   2 +-
>  kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 337 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/task_iter.c
>
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index b2b5eefc5254..37b2d8620153 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -2,7 +2,7 @@
>  obj-y := core.o
>  CFLAGS_core.o += $(call cc-disable-warning, override-init)
>
> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> new file mode 100644
> index 000000000000..1ca258f6e9f4
> --- /dev/null
> +++ b/kernel/bpf/task_iter.c
> @@ -0,0 +1,336 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2020 Facebook */
> +
> +#include <linux/init.h>
> +#include <linux/namei.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/fs.h>
> +#include <linux/fdtable.h>
> +#include <linux/filter.h>
> +
> +struct bpf_iter_seq_task_common {
> +       struct pid_namespace *ns;
> +};
> +
> +struct bpf_iter_seq_task_info {
> +       struct bpf_iter_seq_task_common common;

you have comment below in init_seq_pidns() that common is supposed to
be the very first field, but I think it's more important and
appropriate here, so that whoever adds anything here knows that order
of field is important.

> +       struct task_struct *task;
> +       u32 id;
> +};
> +

[...]

> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
> +{
> +       struct bpf_iter_meta meta;
> +       struct bpf_iter__task ctx;
> +       struct bpf_prog *prog;
> +       int ret = 0;
> +
> +       meta.seq = seq;
> +       prog = bpf_iter_get_info(&meta, in_stop);
> +       if (prog) {


nit: `if (!prog) return 0;` here would reduce nesting level below

> +               meta.seq = seq;
> +               ctx.meta = &meta;
> +               ctx.task = v;
> +               ret = bpf_iter_run_prog(prog, &ctx);
> +       }
> +
> +       return 0;

return **ret**; ?

> +}
> +

[...]

> +
> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
> +                                          int *fd, struct task_struct **task,
> +                                          struct files_struct **fstruct)
> +{
> +       struct files_struct *files;
> +       struct task_struct *tk;
> +       u32 sid = *id;
> +       int sfd;
> +
> +       /* If this function returns a non-NULL file object,
> +        * it held a reference to the files_struct and file.
> +        * Otherwise, it does not hold any reference.
> +        */
> +again:
> +       if (*fstruct) {
> +               files = *fstruct;
> +               sfd = *fd;
> +       } else {
> +               tk = task_seq_get_next(ns, &sid);
> +               if (!tk)
> +                       return NULL;
> +
> +               files = get_files_struct(tk);
> +               put_task_struct(tk);

task is put here, but is still used below.. is there some additional
hidden refcounting involved?

> +               if (!files) {
> +                       sid = ++(*id);
> +                       *fd = 0;
> +                       goto again;
> +               }
> +               *fstruct = files;
> +               *task = tk;
> +               if (sid == *id) {
> +                       sfd = *fd;
> +               } else {
> +                       *id = sid;
> +                       sfd = 0;
> +               }
> +       }
> +
> +       rcu_read_lock();
> +       for (; sfd < files_fdtable(files)->max_fds; sfd++) {

files_fdtable does rcu_dereference on each iteration, would it be
better to just cache files_fdtable(files)->max_fds into local
variable? It's unlikely that there will be many iterations, but
still...

> +               struct file *f;
> +
> +               f = fcheck_files(files, sfd);
> +               if (!f)
> +                       continue;
> +               *fd = sfd;
> +               get_file(f);
> +               rcu_read_unlock();
> +               return f;
> +       }
> +
> +       /* the current task is done, go to the next task */
> +       rcu_read_unlock();
> +       put_files_struct(files);
> +       *fstruct = NULL;

*task = NULL; for completeness?

> +       sid = ++(*id);
> +       *fd = 0;
> +       goto again;
> +}
> +
> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +       struct bpf_iter_seq_task_file_info *info = seq->private;
> +       struct files_struct *files = NULL;
> +       struct task_struct *task = NULL;
> +       struct file *file;
> +       u32 id = info->id;
> +       int fd = info->fd;
> +
> +       file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
> +       if (!file) {
> +               info->files = NULL;

what about info->task here?

> +               return NULL;
> +       }
> +
> +       ++*pos;
> +       info->id = id;
> +       info->fd = fd;
> +       info->task = task;
> +       info->files = files;
> +
> +       return file;
> +}
> +

[...]

> +
> +struct bpf_iter__task_file {
> +       __bpf_md_ptr(struct bpf_iter_meta *, meta);
> +       __bpf_md_ptr(struct task_struct *, task);
> +       u32 fd;

nit: sort of works by accident (due to all other field being 8-byte
aligned pointers), shall we add __attribute__((aligned(8)))?

> +       __bpf_md_ptr(struct file *, file);
> +};
> +

[...]
Yonghong Song May 6, 2020, 6:24 p.m. UTC | #2
On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Only the tasks belonging to "current" pid namespace
>> are enumerated.
>>
>> For task/file target, the bpf program will have access to
>>    struct task_struct *task
>>    u32 fd
>>    struct file *file
>> where fd/file is an open file for the task.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> I might be missing some subtleties with task refcounting for task_file
> iterator, asked few questions below...
> 
>>   kernel/bpf/Makefile    |   2 +-
>>   kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 337 insertions(+), 1 deletion(-)
>>   create mode 100644 kernel/bpf/task_iter.c
>>
>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>> index b2b5eefc5254..37b2d8620153 100644
>> --- a/kernel/bpf/Makefile
>> +++ b/kernel/bpf/Makefile
>> @@ -2,7 +2,7 @@
>>   obj-y := core.o
>>   CFLAGS_core.o += $(call cc-disable-warning, override-init)
>>
>> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
>> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
>>   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>>   obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>>   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> new file mode 100644
>> index 000000000000..1ca258f6e9f4
>> --- /dev/null
>> +++ b/kernel/bpf/task_iter.c
>> @@ -0,0 +1,336 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/* Copyright (c) 2020 Facebook */
>> +
>> +#include <linux/init.h>
>> +#include <linux/namei.h>
>> +#include <linux/pid_namespace.h>
>> +#include <linux/fs.h>
>> +#include <linux/fdtable.h>
>> +#include <linux/filter.h>
>> +
>> +struct bpf_iter_seq_task_common {
>> +       struct pid_namespace *ns;
>> +};
>> +
>> +struct bpf_iter_seq_task_info {
>> +       struct bpf_iter_seq_task_common common;
> 
> you have comment below in init_seq_pidns() that common is supposed to
> be the very first field, but I think it's more important and
> appropriate here, so that whoever adds anything here knows that order
> of field is important.

I can move the comments here.

> 
>> +       struct task_struct *task;
>> +       u32 id;
>> +};
>> +
> 
> [...]
> 
>> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
>> +{
>> +       struct bpf_iter_meta meta;
>> +       struct bpf_iter__task ctx;
>> +       struct bpf_prog *prog;
>> +       int ret = 0;
>> +
>> +       meta.seq = seq;
>> +       prog = bpf_iter_get_info(&meta, in_stop);
>> +       if (prog) {
> 
> 
> nit: `if (!prog) return 0;` here would reduce nesting level below
> 
>> +               meta.seq = seq;
>> +               ctx.meta = &meta;
>> +               ctx.task = v;
>> +               ret = bpf_iter_run_prog(prog, &ctx);
>> +       }
>> +
>> +       return 0;
> 
> return **ret**; ?

It should return "ret". In task_file show() code is similar but correct.
I can do early return with !prog too although we do not have
deep nesting level yet.

> 
>> +}
>> +
> 
> [...]
> 
>> +
>> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
>> +                                          int *fd, struct task_struct **task,
>> +                                          struct files_struct **fstruct)
>> +{
>> +       struct files_struct *files;
>> +       struct task_struct *tk;
>> +       u32 sid = *id;
>> +       int sfd;
>> +
>> +       /* If this function returns a non-NULL file object,
>> +        * it held a reference to the files_struct and file.
>> +        * Otherwise, it does not hold any reference.
>> +        */
>> +again:
>> +       if (*fstruct) {
>> +               files = *fstruct;
>> +               sfd = *fd;
>> +       } else {
>> +               tk = task_seq_get_next(ns, &sid);
>> +               if (!tk)
>> +                       return NULL;
>> +
>> +               files = get_files_struct(tk);
>> +               put_task_struct(tk);
> 
> task is put here, but is still used below.. is there some additional
> hidden refcounting involved?

Good question. I had an impression that we take a reference count
for task->files so task should not go away. But reading linux
code again, I do not have sufficient evidence to back my claim.
So I will reference count task as well, e.g., do not put_task_struct()
until all files are done here.

> 
>> +               if (!files) {
>> +                       sid = ++(*id);
>> +                       *fd = 0;
>> +                       goto again;
>> +               }
>> +               *fstruct = files;
>> +               *task = tk;
>> +               if (sid == *id) {
>> +                       sfd = *fd;
>> +               } else {
>> +                       *id = sid;
>> +                       sfd = 0;
>> +               }
>> +       }
>> +
>> +       rcu_read_lock();
>> +       for (; sfd < files_fdtable(files)->max_fds; sfd++) {
> 
> files_fdtable does rcu_dereference on each iteration, would it be
> better to just cache files_fdtable(files)->max_fds into local
> variable? It's unlikely that there will be many iterations, but
> still...

I borrowed code from fs/proc/fd.c. But I can certainly change it to avoid
repeated reading max_fds as suggested.

> 
>> +               struct file *f;
>> +
>> +               f = fcheck_files(files, sfd);
>> +               if (!f)
>> +                       continue;
>> +               *fd = sfd;
>> +               get_file(f);
>> +               rcu_read_unlock();
>> +               return f;
>> +       }
>> +
>> +       /* the current task is done, go to the next task */
>> +       rcu_read_unlock();
>> +       put_files_struct(files);
>> +       *fstruct = NULL;
> 
> *task = NULL; for completeness?

if *fstruct == NULL, will try to get next task, so *task = NULL
is unnecessary, but I can add it, won't hurt and possibly make
it easy to understand.

> 
>> +       sid = ++(*id);
>> +       *fd = 0;
>> +       goto again;
>> +}
>> +
>> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
>> +{
>> +       struct bpf_iter_seq_task_file_info *info = seq->private;
>> +       struct files_struct *files = NULL;
>> +       struct task_struct *task = NULL;
>> +       struct file *file;
>> +       u32 id = info->id;
>> +       int fd = info->fd;
>> +
>> +       file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
>> +       if (!file) {
>> +               info->files = NULL;
> 
> what about info->task here?

info->files == NULL indicates the end of iteration, info->task will not 
be checked any more. But I guess, I can assign NULL to task as well to
avoid confusion.

> 
>> +               return NULL;
>> +       }
>> +
>> +       ++*pos;
>> +       info->id = id;
>> +       info->fd = fd;
>> +       info->task = task;
>> +       info->files = files;
>> +
>> +       return file;
>> +}
>> +
> 
> [...]
> 
>> +
>> +struct bpf_iter__task_file {
>> +       __bpf_md_ptr(struct bpf_iter_meta *, meta);
>> +       __bpf_md_ptr(struct task_struct *, task);
>> +       u32 fd;
> 
> nit: sort of works by accident (due to all other field being 8-byte
> aligned pointers), shall we add __attribute__((aligned(8)))?

This is what I thought as well. It should work. But I think
adding aligned(8) won't hurt to express the intention. Will add it.

> 
>> +       __bpf_md_ptr(struct file *, file);
>> +};
>> +
> 
> [...]
>
Andrii Nakryiko May 6, 2020, 8:51 p.m. UTC | #3
On Wed, May 6, 2020 at 11:24 AM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
> > On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Only the tasks belonging to "current" pid namespace
> >> are enumerated.
> >>
> >> For task/file target, the bpf program will have access to
> >>    struct task_struct *task
> >>    u32 fd
> >>    struct file *file
> >> where fd/file is an open file for the task.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >
> > I might be missing some subtleties with task refcounting for task_file
> > iterator, asked few questions below...
> >
> >>   kernel/bpf/Makefile    |   2 +-
> >>   kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
> >>   2 files changed, 337 insertions(+), 1 deletion(-)
> >>   create mode 100644 kernel/bpf/task_iter.c
> >>
> >> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> >> index b2b5eefc5254..37b2d8620153 100644
> >> --- a/kernel/bpf/Makefile
> >> +++ b/kernel/bpf/Makefile
> >> @@ -2,7 +2,7 @@
> >>   obj-y := core.o
> >>   CFLAGS_core.o += $(call cc-disable-warning, override-init)
> >>
> >> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
> >> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> new file mode 100644
> >> index 000000000000..1ca258f6e9f4
> >> --- /dev/null
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -0,0 +1,336 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +/* Copyright (c) 2020 Facebook */
> >> +
> >> +#include <linux/init.h>
> >> +#include <linux/namei.h>
> >> +#include <linux/pid_namespace.h>
> >> +#include <linux/fs.h>
> >> +#include <linux/fdtable.h>
> >> +#include <linux/filter.h>
> >> +
> >> +struct bpf_iter_seq_task_common {
> >> +       struct pid_namespace *ns;
> >> +};
> >> +
> >> +struct bpf_iter_seq_task_info {
> >> +       struct bpf_iter_seq_task_common common;
> >
> > you have comment below in init_seq_pidns() that common is supposed to
> > be the very first field, but I think it's more important and
> > appropriate here, so that whoever adds anything here knows that order
> > of field is important.
>
> I can move the comments here.
>
> >
> >> +       struct task_struct *task;
> >> +       u32 id;
> >> +};
> >> +
> >
> > [...]
> >
> >> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
> >> +{
> >> +       struct bpf_iter_meta meta;
> >> +       struct bpf_iter__task ctx;
> >> +       struct bpf_prog *prog;
> >> +       int ret = 0;
> >> +
> >> +       meta.seq = seq;
> >> +       prog = bpf_iter_get_info(&meta, in_stop);
> >> +       if (prog) {
> >
> >
> > nit: `if (!prog) return 0;` here would reduce nesting level below
> >
> >> +               meta.seq = seq;
> >> +               ctx.meta = &meta;
> >> +               ctx.task = v;
> >> +               ret = bpf_iter_run_prog(prog, &ctx);
> >> +       }
> >> +
> >> +       return 0;
> >
> > return **ret**; ?
>
> It should return "ret". In task_file show() code is similar but correct.
> I can do early return with !prog too although we do not have
> deep nesting level yet.
>
> >
> >> +}
> >> +
> >
> > [...]
> >
> >> +
> >> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
> >> +                                          int *fd, struct task_struct **task,
> >> +                                          struct files_struct **fstruct)
> >> +{
> >> +       struct files_struct *files;
> >> +       struct task_struct *tk;
> >> +       u32 sid = *id;
> >> +       int sfd;
> >> +
> >> +       /* If this function returns a non-NULL file object,
> >> +        * it held a reference to the files_struct and file.
> >> +        * Otherwise, it does not hold any reference.
> >> +        */
> >> +again:
> >> +       if (*fstruct) {
> >> +               files = *fstruct;
> >> +               sfd = *fd;
> >> +       } else {
> >> +               tk = task_seq_get_next(ns, &sid);
> >> +               if (!tk)
> >> +                       return NULL;
> >> +
> >> +               files = get_files_struct(tk);
> >> +               put_task_struct(tk);
> >
> > task is put here, but is still used below.. is there some additional
> > hidden refcounting involved?
>
> Good question. I had an impression that we take a reference count
> for task->files so task should not go away. But reading linux
> code again, I do not have sufficient evidence to back my claim.
> So I will reference count task as well, e.g., do not put_task_struct()
> until all files are done here.

All threads within the process share files table. So some threads
might exit, but files will stay, which is why task_struct and
files_struct have separate refcounting, and having refcount on files
doesn't guarantee any particular task will stay alive for long enough.
So I think we need to refcount both files and task in this case.
Reading source code of copy_files() in kernel/fork.c (CLONE_FILES
flags just bumps refcnt on old process' files_struct), seems to
confirm this as well.

>
> >
> >> +               if (!files) {
> >> +                       sid = ++(*id);
> >> +                       *fd = 0;
> >> +                       goto again;
> >> +               }
> >> +               *fstruct = files;
> >> +               *task = tk;
> >> +               if (sid == *id) {
> >> +                       sfd = *fd;
> >> +               } else {
> >> +                       *id = sid;
> >> +                       sfd = 0;
> >> +               }
> >> +       }
> >> +
> >> +       rcu_read_lock();
> >> +       for (; sfd < files_fdtable(files)->max_fds; sfd++) {
> >
> > files_fdtable does rcu_dereference on each iteration, would it be
> > better to just cache files_fdtable(files)->max_fds into local
> > variable? It's unlikely that there will be many iterations, but
> > still...
>
> I borrowed code from fs/proc/fd.c. But I can certainly change it to avoid
> repeated reading max_fds as suggested.
>
> >
> >> +               struct file *f;
> >> +
> >> +               f = fcheck_files(files, sfd);
> >> +               if (!f)
> >> +                       continue;
> >> +               *fd = sfd;
> >> +               get_file(f);
> >> +               rcu_read_unlock();
> >> +               return f;
> >> +       }
> >> +
> >> +       /* the current task is done, go to the next task */
> >> +       rcu_read_unlock();
> >> +       put_files_struct(files);
> >> +       *fstruct = NULL;
> >
> > *task = NULL; for completeness?
>
> if *fstruct == NULL, will try to get next task, so *task = NULL
> is unnecessary, but I can add it, won't hurt and possibly make
> it easy to understand.
>
> >
> >> +       sid = ++(*id);
> >> +       *fd = 0;
> >> +       goto again;
> >> +}
> >> +
> >> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
> >> +{
> >> +       struct bpf_iter_seq_task_file_info *info = seq->private;
> >> +       struct files_struct *files = NULL;
> >> +       struct task_struct *task = NULL;
> >> +       struct file *file;
> >> +       u32 id = info->id;
> >> +       int fd = info->fd;
> >> +
> >> +       file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
> >> +       if (!file) {
> >> +               info->files = NULL;
> >
> > what about info->task here?
>
> info->files == NULL indicates the end of iteration, info->task will not
> be checked any more. But I guess, I can assign NULL to task as well to
> avoid confusion.
>
> >
> >> +               return NULL;
> >> +       }
> >> +
> >> +       ++*pos;
> >> +       info->id = id;
> >> +       info->fd = fd;
> >> +       info->task = task;
> >> +       info->files = files;
> >> +
> >> +       return file;
> >> +}
> >> +
> >
> > [...]
> >
> >> +
> >> +struct bpf_iter__task_file {
> >> +       __bpf_md_ptr(struct bpf_iter_meta *, meta);
> >> +       __bpf_md_ptr(struct task_struct *, task);
> >> +       u32 fd;
> >
> > nit: sort of works by accident (due to all other field being 8-byte
> > aligned pointers), shall we add __attribute__((aligned(8)))?
>
> This is what I thought as well. It should work. But I think
> adding aligned(8) won't hurt to express the intention. Will add it.
>
> >
> >> +       __bpf_md_ptr(struct file *, file);
> >> +};
> >> +
> >
> > [...]
> >
Yonghong Song May 6, 2020, 9:20 p.m. UTC | #4
On 5/6/20 1:51 PM, Andrii Nakryiko wrote:
> On Wed, May 6, 2020 at 11:24 AM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
>>> On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>>>>
>>>> Only the tasks belonging to "current" pid namespace
>>>> are enumerated.
>>>>
>>>> For task/file target, the bpf program will have access to
>>>>     struct task_struct *task
>>>>     u32 fd
>>>>     struct file *file
>>>> where fd/file is an open file for the task.
>>>>
>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>> ---
>>>
>>> I might be missing some subtleties with task refcounting for task_file
>>> iterator, asked few questions below...
>>>
>>>>    kernel/bpf/Makefile    |   2 +-
>>>>    kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
>>>>    2 files changed, 337 insertions(+), 1 deletion(-)
>>>>    create mode 100644 kernel/bpf/task_iter.c
>>>>
>>>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>>>> index b2b5eefc5254..37b2d8620153 100644
>>>> --- a/kernel/bpf/Makefile
>>>> +++ b/kernel/bpf/Makefile
>>>> @@ -2,7 +2,7 @@
>>>>    obj-y := core.o
>>>>    CFLAGS_core.o += $(call cc-disable-warning, override-init)
>>>>
>>>> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
>>>> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
>>>>    obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>>>>    obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>>>>    obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>>>> new file mode 100644
>>>> index 000000000000..1ca258f6e9f4
>>>> --- /dev/null
>>>> +++ b/kernel/bpf/task_iter.c
>>>> @@ -0,0 +1,336 @@
>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>> +/* Copyright (c) 2020 Facebook */
>>>> +
>>>> +#include <linux/init.h>
>>>> +#include <linux/namei.h>
>>>> +#include <linux/pid_namespace.h>
>>>> +#include <linux/fs.h>
>>>> +#include <linux/fdtable.h>
>>>> +#include <linux/filter.h>
>>>> +
>>>> +struct bpf_iter_seq_task_common {
>>>> +       struct pid_namespace *ns;
>>>> +};
>>>> +
>>>> +struct bpf_iter_seq_task_info {
>>>> +       struct bpf_iter_seq_task_common common;
>>>
>>> you have comment below in init_seq_pidns() that common is supposed to
>>> be the very first field, but I think it's more important and
>>> appropriate here, so that whoever adds anything here knows that order
>>> of field is important.
>>
>> I can move the comments here.
>>
>>>
>>>> +       struct task_struct *task;
>>>> +       u32 id;
>>>> +};
>>>> +
>>>
>>> [...]
>>>
>>>> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
>>>> +{
>>>> +       struct bpf_iter_meta meta;
>>>> +       struct bpf_iter__task ctx;
>>>> +       struct bpf_prog *prog;
>>>> +       int ret = 0;
>>>> +
>>>> +       meta.seq = seq;
>>>> +       prog = bpf_iter_get_info(&meta, in_stop);
>>>> +       if (prog) {
>>>
>>>
>>> nit: `if (!prog) return 0;` here would reduce nesting level below
>>>
>>>> +               meta.seq = seq;
>>>> +               ctx.meta = &meta;
>>>> +               ctx.task = v;
>>>> +               ret = bpf_iter_run_prog(prog, &ctx);
>>>> +       }
>>>> +
>>>> +       return 0;
>>>
>>> return **ret**; ?
>>
>> It should return "ret". In task_file show() code is similar but correct.
>> I can do early return with !prog too although we do not have
>> deep nesting level yet.
>>
>>>
>>>> +}
>>>> +
>>>
>>> [...]
>>>
>>>> +
>>>> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
>>>> +                                          int *fd, struct task_struct **task,
>>>> +                                          struct files_struct **fstruct)
>>>> +{
>>>> +       struct files_struct *files;
>>>> +       struct task_struct *tk;
>>>> +       u32 sid = *id;
>>>> +       int sfd;
>>>> +
>>>> +       /* If this function returns a non-NULL file object,
>>>> +        * it held a reference to the files_struct and file.
>>>> +        * Otherwise, it does not hold any reference.
>>>> +        */
>>>> +again:
>>>> +       if (*fstruct) {
>>>> +               files = *fstruct;
>>>> +               sfd = *fd;
>>>> +       } else {
>>>> +               tk = task_seq_get_next(ns, &sid);
>>>> +               if (!tk)
>>>> +                       return NULL;
>>>> +
>>>> +               files = get_files_struct(tk);
>>>> +               put_task_struct(tk);
>>>
>>> task is put here, but is still used below.. is there some additional
>>> hidden refcounting involved?
>>
>> Good question. I had an impression that we take a reference count
>> for task->files so task should not go away. But reading linux
>> code again, I do not have sufficient evidence to back my claim.
>> So I will reference count task as well, e.g., do not put_task_struct()
>> until all files are done here.
> 
> All threads within the process share files table. So some threads
> might exit, but files will stay, which is why task_struct and
> files_struct have separate refcounting, and having refcount on files
> doesn't guarantee any particular task will stay alive for long enough.
> So I think we need to refcount both files and task in this case.
> Reading source code of copy_files() in kernel/fork.c (CLONE_FILES
> flags just bumps refcnt on old process' files_struct), seems to
> confirm this as well.

Just checked the code. It does look like files are shared among
threads (tasks). So yes, in this case, reference counting of
both the task and the files table is needed.

> 
>>
>>>
>>>> +               if (!files) {
>>>> +                       sid = ++(*id);
>>>> +                       *fd = 0;
>>>> +                       goto again;
>>>> +               }
>>>> +               *fstruct = files;
>>>> +               *task = tk;
>>>> +               if (sid == *id) {
>>>> +                       sfd = *fd;
>>>> +               } else {
>>>> +                       *id = sid;
>>>> +                       sfd = 0;
>>>> +               }
>>>> +       }
>>>> +
>>>> +       rcu_read_lock();
[...]
diff mbox series

Patch

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index b2b5eefc5254..37b2d8620153 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@ 
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..1ca258f6e9f4
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,336 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+/* Seq_file private data common to the "task" and "task_file" iterator
+ * targets.  It must be the FIRST member of both per-target structs so
+ * init_seq_pidns()/fini_seq_pidns() can operate on either via a plain
+ * pointer cast (see the comment above init_seq_pidns() below).
+ */
+struct bpf_iter_seq_task_common {
+	struct pid_namespace *ns;	/* pinned by get_pid_ns() at open time */
+};
+
+/* Per-seq_file state for the "task" target. */
+struct bpf_iter_seq_task_info {
+	struct bpf_iter_seq_task_common common;	/* must stay first */
+	struct task_struct *task;	/* current task, reference held; NULL when idle */
+	u32 id;				/* pid idr cursor within common.ns */
+};
+
+/* Find the task whose pid in @ns is the smallest one >= *@id.
+ * idr_get_next() advances *@id to the slot actually found.  On success a
+ * task reference is returned (caller must put_task_struct()); NULL means
+ * no further pids exist in the namespace.
+ */
+static struct task_struct *task_seq_get_next(struct pid_namespace *ns, u32 *id)
+{
+	struct task_struct *found = NULL;
+	struct pid *next_pid;
+
+	/* The idr and the pid->task link are only stable under RCU. */
+	rcu_read_lock();
+	next_pid = idr_get_next(&ns->idr, id);
+	if (next_pid)
+		found = get_pid_task(next_pid, PIDTYPE_PID);
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* seq_operations->start: resume iteration from the pid cursor saved in
+ * the seq_file private data.  Returns the next task (reference held, also
+ * cached in info->task) or NULL when the namespace is exhausted.
+ */
+static void *task_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	u32 next_id = info->id;
+	struct task_struct *next_task;
+
+	next_task = task_seq_get_next(info->common.ns, &next_id);
+	if (!next_task)
+		return NULL;
+
+	/* Remember where we are so a later start() can resume here. */
+	info->task = next_task;
+	info->id = next_id;
+	++*pos;
+
+	return next_task;
+}
+
+/* seq_operations->next: advance past the current pid and pin the next
+ * task.  The previous task's reference is dropped only after the new one
+ * is safely held; on NULL return, stop() releases info->task.
+ */
+static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *next_task;
+
+	++*pos;
+	/* Skip the pid we just showed before searching again. */
+	++info->id;
+	next_task = task_seq_get_next(info->common.ns, &info->id);
+	if (!next_task)
+		return NULL;
+
+	put_task_struct(info->task);
+	info->task = next_task;
+	return next_task;
+}
+
+/* Context handed to the attached BPF program for each visited task. */
+struct bpf_iter__task {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+};
+
+DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
+
+/* Run the attached BPF program for one task, or for the end-of-iteration
+ * callback when @in_stop is true.  Returns the program's verdict so that
+ * a nonzero result can abort the dump, matching __task_file_seq_show().
+ *
+ * Fixes: the original returned 0 unconditionally, silently discarding
+ * bpf_iter_run_prog()'s return value, and set meta.seq twice.
+ */
+static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__task ctx;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog) {
+		ctx.meta = &meta;
+		ctx.task = v;
+		ret = bpf_iter_run_prog(prog, &ctx);
+	}
+
+	return ret;
+}
+
+/* seq_operations->show: run the BPF program for the current task. */
+static int task_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_seq_show(seq, v, false);
+}
+
+/* seq_operations->stop: on a finished iteration (v == NULL) give the
+ * program one last callback with in_stop set, then release the task
+ * reference pinned by start()/next(), if any.
+ */
+static void task_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *held;
+
+	if (!v)
+		__task_seq_show(seq, v, true);
+
+	held = info->task;
+	if (held) {
+		info->task = NULL;
+		put_task_struct(held);
+	}
+}
+
+/* seq_file operations backing the "task" iterator target. */
+static const struct seq_operations task_seq_ops = {
+	.start	= task_seq_start,
+	.next	= task_seq_next,
+	.stop	= task_seq_stop,
+	.show	= task_seq_show,
+};
+
+/* Per-seq_file state for the "task_file" target; the cursor is the
+ * (task id, fd) pair.
+ */
+struct bpf_iter_seq_task_file_info {
+	struct bpf_iter_seq_task_common common;	/* must stay first */
+	struct task_struct *task;	/* current task; NOTE(review): no ref held, see get_next */
+	struct files_struct *files;	/* current files table, reference held; NULL when idle */
+	u32 id;				/* pid idr cursor within common.ns */
+	u32 fd;				/* fd cursor within the current task's fd table */
+};
+
+/* Advance the (task id, fd) cursor to the next open file in the pid
+ * namespace.  On success the returned file and *fstruct each carry a
+ * reference; *id/*fd/*task are updated to the new position.
+ */
+static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
+					   int *fd, struct task_struct **task,
+					   struct files_struct **fstruct)
+{
+	struct files_struct *files;
+	struct task_struct *tk;
+	u32 sid = *id;
+	int sfd;
+
+	/* If this function returns a non-NULL file object,
+	 * it held a reference to the files_struct and file.
+	 * Otherwise, it does not hold any reference.
+	 */
+again:
+	if (*fstruct) {
+		/* Still walking the fd table pinned on a previous call. */
+		files = *fstruct;
+		sfd = *fd;
+	} else {
+		/* Find the task at or after sid in the namespace. */
+		tk = task_seq_get_next(ns, &sid);
+		if (!tk)
+			return NULL;
+
+		files = get_files_struct(tk);
+		/* NOTE(review): this may drop the LAST reference to tk, yet
+		 * tk is published via *task below and later handed to the
+		 * BPF program by show().  Task and files_struct refcounts
+		 * are independent (threads sharing the files table may
+		 * exit), so a task reference must be held until the
+		 * iterator moves past this task -- confirmed in the review
+		 * thread above; fix requires matching puts in next()/stop().
+		 */
+		put_task_struct(tk);
+		if (!files) {
+			sid = ++(*id);
+			*fd = 0;
+			goto again;
+		}
+		*fstruct = files;
+		*task = tk;
+		/* Same task id as the caller's cursor: resume at *fd;
+		 * otherwise we moved to a new task, start from fd 0.
+		 */
+		if (sid == *id) {
+			sfd = *fd;
+		} else {
+			*id = sid;
+			sfd = 0;
+		}
+	}
+
+	/* Scan the fd table under RCU for the next open descriptor. */
+	rcu_read_lock();
+	for (; sfd < files_fdtable(files)->max_fds; sfd++) {
+		struct file *f;
+
+		f = fcheck_files(files, sfd);
+		if (!f)
+			continue;
+		*fd = sfd;
+		get_file(f);
+		rcu_read_unlock();
+		return f;
+	}
+
+	/* the current task is done, go to the next task */
+	rcu_read_unlock();
+	put_files_struct(files);
+	*fstruct = NULL;
+	sid = ++(*id);
+	*fd = 0;
+	goto again;
+}
+
+/* seq_operations->start: resume from the (task id, fd) cursor saved in
+ * the private data.  Returns the next open file (reference held) or NULL
+ * when iteration is done; info->files is cleared on NULL so stop() knows
+ * there is nothing to release.
+ */
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct task_struct *task = NULL;
+	struct files_struct *files = NULL;
+	u32 id = info->id;
+	int fd = info->fd;
+	struct file *file;
+
+	file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
+	if (!file) {
+		info->files = NULL;
+		return NULL;
+	}
+
+	/* Persist the new cursor and the pinned objects. */
+	info->id = id;
+	info->fd = fd;
+	info->task = task;
+	info->files = files;
+	++*pos;
+
+	return file;
+}
+
+/* seq_operations->next: drop the file just shown, advance the fd cursor,
+ * and fetch the next open file (possibly from a later task).
+ */
+static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct files_struct *files = info->files;
+	struct task_struct *task = info->task;
+	struct file *file;
+
+	++*pos;
+	/* Skip the fd we just showed. */
+	++info->fd;
+	/* Release the reference taken by get_next() on the previous file. */
+	fput((struct file *)v);
+	file = task_file_seq_get_next(info->common.ns, &info->id, &info->fd,
+				      &task, &files);
+	if (!file) {
+		/* NOTE(review): info->task is left stale here while
+		 * info->files is cleared; stop() only keys off info->files,
+		 * and currently no task reference is held anywhere -- see
+		 * the note in task_file_seq_get_next().
+		 */
+		info->files = NULL;
+		return NULL;
+	}
+
+	info->task = task;
+	info->files = files;
+
+	return file;
+}
+
+/* Context handed to the attached BPF program for each (task, fd, file)
+ * triple.
+ */
+struct bpf_iter__task_file {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+	u32 fd;
+	__bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
+		     struct task_struct *task, u32 fd,
+		     struct file *file)
+
+/* Run the attached BPF program for one (task, fd, file) triple, or for
+ * the end-of-iteration callback when @in_stop is true.  Returns the
+ * program's verdict (nonzero aborts the dump).
+ */
+static int __task_file_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct bpf_iter__task_file ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.task = info->task;
+	ctx.fd = info->fd;
+	ctx.file = v;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+/* seq_operations->show: run the BPF program for the current file. */
+static int task_file_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_file_seq_show(seq, v, false);
+}
+
+/* seq_operations->stop: run the final in_stop callback when the
+ * iteration completed (v == NULL), otherwise drop the reference on the
+ * file being shown; then release the pinned files_struct.
+ */
+static void task_file_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+
+	if (!v)
+		__task_file_seq_show(seq, v, true);
+	else if (!IS_ERR(v))
+		fput((struct file *)v);
+
+	/* NOTE(review): no put_task_struct() here -- consistent with
+	 * get_next() dropping the task ref immediately, but once get_next()
+	 * is fixed to pin the task, a matching put is needed here too.
+	 */
+	if (info->files) {
+		put_files_struct(info->files);
+		info->files = NULL;
+	}
+}
+
+/* The first field of task/task_file private data is
+ * struct bpf_iter_seq_task_common.
+ */
+/* Pin the pid namespace of the task that opened the iterator; all
+ * enumeration is confined to this namespace.  Always succeeds.
+ */
+static int init_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	common->ns = get_pid_ns(task_active_pid_ns(current));
+	return 0;
+}
+
+/* Release the pid namespace reference taken by init_seq_pidns(). */
+static void fini_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	put_pid_ns(common->ns);
+}
+
+/* seq_file operations backing the "task_file" iterator target. */
+static const struct seq_operations task_file_seq_ops = {
+	.start	= task_file_seq_start,
+	.next	= task_file_seq_next,
+	.stop	= task_file_seq_stop,
+	.show	= task_file_seq_show,
+};
+
+/* Register the "task" and "task_file" iterator targets at late_initcall
+ * time.  Registration order matters only for error reporting: "task" is
+ * registered first, and a failure there skips "task_file".
+ */
+static int __init task_iter_init(void)
+{
+	struct bpf_iter_reg task_reg_info = {
+		.target			= "task",
+		.seq_ops		= &task_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
+	};
+	struct bpf_iter_reg task_file_reg_info = {
+		.target			= "task_file",
+		.seq_ops		= &task_file_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
+	};
+	int err;
+
+	err = bpf_iter_reg_target(&task_reg_info);
+	if (err)
+		return err;
+
+	return bpf_iter_reg_target(&task_file_reg_info);
+}
+late_initcall(task_iter_init);