Message ID | 20200504062559.2048228-1-yhs@fb.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | bpf: implement bpf iterator for kernel data | expand |
On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote: > > Only the tasks belonging to "current" pid namespace > are enumerated. > > For task/file target, the bpf program will have access to > struct task_struct *task > u32 fd > struct file *file > where fd/file is an open file for the task. > > Signed-off-by: Yonghong Song <yhs@fb.com> > --- I might be missing some subtleties with task refcounting for task_file iterator, asked few questions below... > kernel/bpf/Makefile | 2 +- > kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 337 insertions(+), 1 deletion(-) > create mode 100644 kernel/bpf/task_iter.c > > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > index b2b5eefc5254..37b2d8620153 100644 > --- a/kernel/bpf/Makefile > +++ b/kernel/bpf/Makefile > @@ -2,7 +2,7 @@ > obj-y := core.o > CFLAGS_core.o += $(call cc-disable-warning, override-init) > > -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o > +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o > obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o > obj-$(CONFIG_BPF_SYSCALL) += disasm.o > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > new file mode 100644 > index 000000000000..1ca258f6e9f4 > --- /dev/null > +++ b/kernel/bpf/task_iter.c > @@ -0,0 +1,336 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright (c) 2020 Facebook */ > + > +#include <linux/init.h> > +#include <linux/namei.h> > +#include <linux/pid_namespace.h> > +#include <linux/fs.h> > +#include <linux/fdtable.h> > +#include <linux/filter.h> > + > +struct bpf_iter_seq_task_common { > + struct pid_namespace *ns; > +}; > + > +struct bpf_iter_seq_task_info { > + struct bpf_iter_seq_task_common common; you have comment below in init_seq_pidns() 
that common is supposed to be the very first field, but I think it's more important and appropriate here, so that whoever adds anything here knows that order of field is important. > + struct task_struct *task; > + u32 id; > +}; > + [...] > +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop) > +{ > + struct bpf_iter_meta meta; > + struct bpf_iter__task ctx; > + struct bpf_prog *prog; > + int ret = 0; > + > + meta.seq = seq; > + prog = bpf_iter_get_info(&meta, in_stop); > + if (prog) { nit: `if (!prog) return 0;` here would reduce nesting level below > + meta.seq = seq; > + ctx.meta = &meta; > + ctx.task = v; > + ret = bpf_iter_run_prog(prog, &ctx); > + } > + > + return 0; return **ret**; ? > +} > + [...] > + > +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id, > + int *fd, struct task_struct **task, > + struct files_struct **fstruct) > +{ > + struct files_struct *files; > + struct task_struct *tk; > + u32 sid = *id; > + int sfd; > + > + /* If this function returns a non-NULL file object, > + * it held a reference to the files_struct and file. > + * Otherwise, it does not hold any reference. > + */ > +again: > + if (*fstruct) { > + files = *fstruct; > + sfd = *fd; > + } else { > + tk = task_seq_get_next(ns, &sid); > + if (!tk) > + return NULL; > + > + files = get_files_struct(tk); > + put_task_struct(tk); task is put here, but is still used below.. is there some additional hidden refcounting involved? > + if (!files) { > + sid = ++(*id); > + *fd = 0; > + goto again; > + } > + *fstruct = files; > + *task = tk; > + if (sid == *id) { > + sfd = *fd; > + } else { > + *id = sid; > + sfd = 0; > + } > + } > + > + rcu_read_lock(); > + for (; sfd < files_fdtable(files)->max_fds; sfd++) { files_fdtable does rcu_dereference on each iteration, would it be better to just cache files_fdtable(files)->max_fds into local variable? It's unlikely that there will be many iterations, but still... 
> + struct file *f; > + > + f = fcheck_files(files, sfd); > + if (!f) > + continue; > + *fd = sfd; > + get_file(f); > + rcu_read_unlock(); > + return f; > + } > + > + /* the current task is done, go to the next task */ > + rcu_read_unlock(); > + put_files_struct(files); > + *fstruct = NULL; *task = NULL; for completeness? > + sid = ++(*id); > + *fd = 0; > + goto again; > +} > + > +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) > +{ > + struct bpf_iter_seq_task_file_info *info = seq->private; > + struct files_struct *files = NULL; > + struct task_struct *task = NULL; > + struct file *file; > + u32 id = info->id; > + int fd = info->fd; > + > + file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files); > + if (!file) { > + info->files = NULL; what about info->task here? > + return NULL; > + } > + > + ++*pos; > + info->id = id; > + info->fd = fd; > + info->task = task; > + info->files = files; > + > + return file; > +} > + [...] > + > +struct bpf_iter__task_file { > + __bpf_md_ptr(struct bpf_iter_meta *, meta); > + __bpf_md_ptr(struct task_struct *, task); > + u32 fd; nit: sort of works by accident (due to all other field being 8-byte aligned pointers), shall we add __attribute__((aligned(8)))? > + __bpf_md_ptr(struct file *, file); > +}; > + [...]
On 5/6/20 12:30 AM, Andrii Nakryiko wrote: > On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote: >> >> Only the tasks belonging to "current" pid namespace >> are enumerated. >> >> For task/file target, the bpf program will have access to >> struct task_struct *task >> u32 fd >> struct file *file >> where fd/file is an open file for the task. >> >> Signed-off-by: Yonghong Song <yhs@fb.com> >> --- > > I might be missing some subtleties with task refcounting for task_file > iterator, asked few questions below... > >> kernel/bpf/Makefile | 2 +- >> kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 337 insertions(+), 1 deletion(-) >> create mode 100644 kernel/bpf/task_iter.c >> >> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile >> index b2b5eefc5254..37b2d8620153 100644 >> --- a/kernel/bpf/Makefile >> +++ b/kernel/bpf/Makefile >> @@ -2,7 +2,7 @@ >> obj-y := core.o >> CFLAGS_core.o += $(call cc-disable-warning, override-init) >> >> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o >> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o >> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o >> obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o >> obj-$(CONFIG_BPF_SYSCALL) += disasm.o >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c >> new file mode 100644 >> index 000000000000..1ca258f6e9f4 >> --- /dev/null >> +++ b/kernel/bpf/task_iter.c >> @@ -0,0 +1,336 @@ >> +// SPDX-License-Identifier: GPL-2.0-only >> +/* Copyright (c) 2020 Facebook */ >> + >> +#include <linux/init.h> >> +#include <linux/namei.h> >> +#include <linux/pid_namespace.h> >> +#include <linux/fs.h> >> +#include <linux/fdtable.h> >> +#include <linux/filter.h> >> + >> +struct bpf_iter_seq_task_common { >> + struct pid_namespace *ns; >> +}; >> + >> +struct 
bpf_iter_seq_task_info { >> + struct bpf_iter_seq_task_common common; > > you have comment below in init_seq_pidns() that common is supposed to > be the very first field, but I think it's more important and > appropriate here, so that whoever adds anything here knows that order > of field is important. I can move the comments here. > >> + struct task_struct *task; >> + u32 id; >> +}; >> + > > [...] > >> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop) >> +{ >> + struct bpf_iter_meta meta; >> + struct bpf_iter__task ctx; >> + struct bpf_prog *prog; >> + int ret = 0; >> + >> + meta.seq = seq; >> + prog = bpf_iter_get_info(&meta, in_stop); >> + if (prog) { > > > nit: `if (!prog) return 0;` here would reduce nesting level below > >> + meta.seq = seq; >> + ctx.meta = &meta; >> + ctx.task = v; >> + ret = bpf_iter_run_prog(prog, &ctx); >> + } >> + >> + return 0; > > return **ret**; ? It should return "ret". In task_file show() code is similar but correct. I can do early return with !prog too although we do not have deep nesting level yet. > >> +} >> + > > [...] > >> + >> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id, >> + int *fd, struct task_struct **task, >> + struct files_struct **fstruct) >> +{ >> + struct files_struct *files; >> + struct task_struct *tk; >> + u32 sid = *id; >> + int sfd; >> + >> + /* If this function returns a non-NULL file object, >> + * it held a reference to the files_struct and file. >> + * Otherwise, it does not hold any reference. >> + */ >> +again: >> + if (*fstruct) { >> + files = *fstruct; >> + sfd = *fd; >> + } else { >> + tk = task_seq_get_next(ns, &sid); >> + if (!tk) >> + return NULL; >> + >> + files = get_files_struct(tk); >> + put_task_struct(tk); > > task is put here, but is still used below.. is there some additional > hidden refcounting involved? Good question. I had an impression that we take a reference count for task->files so task should not go away. 
But reading linux code again, I do not have sufficient evidence to back my claim. So I will reference count task as well, e.g., do not put_task_struct() until all files are done here. > >> + if (!files) { >> + sid = ++(*id); >> + *fd = 0; >> + goto again; >> + } >> + *fstruct = files; >> + *task = tk; >> + if (sid == *id) { >> + sfd = *fd; >> + } else { >> + *id = sid; >> + sfd = 0; >> + } >> + } >> + >> + rcu_read_lock(); >> + for (; sfd < files_fdtable(files)->max_fds; sfd++) { > > files_fdtable does rcu_dereference on each iteration, would it be > better to just cache files_fdtable(files)->max_fds into local > variable? It's unlikely that there will be many iterations, but > still... I borrowed code from fs/proc/fd.c. But I can certainly to avoid repeated reading max_fds as suggested. > >> + struct file *f; >> + >> + f = fcheck_files(files, sfd); >> + if (!f) >> + continue; >> + *fd = sfd; >> + get_file(f); >> + rcu_read_unlock(); >> + return f; >> + } >> + >> + /* the current task is done, go to the next task */ >> + rcu_read_unlock(); >> + put_files_struct(files); >> + *fstruct = NULL; > > *task = NULL; for completeness? if *fstruct == NULL, will try to get next task, so *task = NULL is unnecessary, but I can add it, won't hurt and possibly make it easy to understand. > >> + sid = ++(*id); >> + *fd = 0; >> + goto again; >> +} >> + >> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) >> +{ >> + struct bpf_iter_seq_task_file_info *info = seq->private; >> + struct files_struct *files = NULL; >> + struct task_struct *task = NULL; >> + struct file *file; >> + u32 id = info->id; >> + int fd = info->fd; >> + >> + file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files); >> + if (!file) { >> + info->files = NULL; > > what about info->task here? info->files == NULL indicates the end of iteration, info->task will not be checked any more. But I guess, I can assign NULL to task as well to avoid confusion. 
> >> + return NULL; >> + } >> + >> + ++*pos; >> + info->id = id; >> + info->fd = fd; >> + info->task = task; >> + info->files = files; >> + >> + return file; >> +} >> + > > [...] > >> + >> +struct bpf_iter__task_file { >> + __bpf_md_ptr(struct bpf_iter_meta *, meta); >> + __bpf_md_ptr(struct task_struct *, task); >> + u32 fd; > > nit: sort of works by accident (due to all other field being 8-byte > aligned pointers), shall we add __attribute__((aligned(8)))? This is what I thought as well. It should work. But I think add aligned(8) wont' hurt to expresss the intention.. Will add it. > >> + __bpf_md_ptr(struct file *, file); >> +}; >> + > > [...] >
On Wed, May 6, 2020 at 11:24 AM Yonghong Song <yhs@fb.com> wrote: > > > > On 5/6/20 12:30 AM, Andrii Nakryiko wrote: > > On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote: > >> > >> Only the tasks belonging to "current" pid namespace > >> are enumerated. > >> > >> For task/file target, the bpf program will have access to > >> struct task_struct *task > >> u32 fd > >> struct file *file > >> where fd/file is an open file for the task. > >> > >> Signed-off-by: Yonghong Song <yhs@fb.com> > >> --- > > > > I might be missing some subtleties with task refcounting for task_file > > iterator, asked few questions below... > > > >> kernel/bpf/Makefile | 2 +- > >> kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++ > >> 2 files changed, 337 insertions(+), 1 deletion(-) > >> create mode 100644 kernel/bpf/task_iter.c > >> > >> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > >> index b2b5eefc5254..37b2d8620153 100644 > >> --- a/kernel/bpf/Makefile > >> +++ b/kernel/bpf/Makefile > >> @@ -2,7 +2,7 @@ > >> obj-y := core.o > >> CFLAGS_core.o += $(call cc-disable-warning, override-init) > >> > >> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o > >> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o > >> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o > >> obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o > >> obj-$(CONFIG_BPF_SYSCALL) += disasm.o > >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > >> new file mode 100644 > >> index 000000000000..1ca258f6e9f4 > >> --- /dev/null > >> +++ b/kernel/bpf/task_iter.c > >> @@ -0,0 +1,336 @@ > >> +// SPDX-License-Identifier: GPL-2.0-only > >> +/* Copyright (c) 2020 Facebook */ > >> + > >> +#include <linux/init.h> > >> +#include <linux/namei.h> > >> +#include <linux/pid_namespace.h> > >> +#include 
<linux/fs.h> > >> +#include <linux/fdtable.h> > >> +#include <linux/filter.h> > >> + > >> +struct bpf_iter_seq_task_common { > >> + struct pid_namespace *ns; > >> +}; > >> + > >> +struct bpf_iter_seq_task_info { > >> + struct bpf_iter_seq_task_common common; > > > > you have comment below in init_seq_pidns() that common is supposed to > > be the very first field, but I think it's more important and > > appropriate here, so that whoever adds anything here knows that order > > of field is important. > > I can move the comments here. > > > > >> + struct task_struct *task; > >> + u32 id; > >> +}; > >> + > > > > [...] > > > >> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop) > >> +{ > >> + struct bpf_iter_meta meta; > >> + struct bpf_iter__task ctx; > >> + struct bpf_prog *prog; > >> + int ret = 0; > >> + > >> + meta.seq = seq; > >> + prog = bpf_iter_get_info(&meta, in_stop); > >> + if (prog) { > > > > > > nit: `if (!prog) return 0;` here would reduce nesting level below > > > >> + meta.seq = seq; > >> + ctx.meta = &meta; > >> + ctx.task = v; > >> + ret = bpf_iter_run_prog(prog, &ctx); > >> + } > >> + > >> + return 0; > > > > return **ret**; ? > > It should return "ret". In task_file show() code is similar but correct. > I can do early return with !prog too although we do not have > deep nesting level yet. > > > > >> +} > >> + > > > > [...] > > > >> + > >> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id, > >> + int *fd, struct task_struct **task, > >> + struct files_struct **fstruct) > >> +{ > >> + struct files_struct *files; > >> + struct task_struct *tk; > >> + u32 sid = *id; > >> + int sfd; > >> + > >> + /* If this function returns a non-NULL file object, > >> + * it held a reference to the files_struct and file. > >> + * Otherwise, it does not hold any reference. 
> >> + */ > >> +again: > >> + if (*fstruct) { > >> + files = *fstruct; > >> + sfd = *fd; > >> + } else { > >> + tk = task_seq_get_next(ns, &sid); > >> + if (!tk) > >> + return NULL; > >> + > >> + files = get_files_struct(tk); > >> + put_task_struct(tk); > > > > task is put here, but is still used below.. is there some additional > > hidden refcounting involved? > > Good question. I had an impression that we take a reference count > for task->files so task should not go away. But reading linux > code again, I do not have sufficient evidence to back my claim. > So I will reference count task as well, e.g., do not put_task_struct() > until all files are done here. All threads within the process share files table. So some threads might exit, but files will stay, which is why task_struct and files_struct have separate refcounting, and having refcount on files doesn't guarantee any particular task will stay alive for long enough. So I think we need to refcount both files and task in this case. Reading source code of copy_files() in kernel/fork.c (CLONE_FILES flags just bumps refcnt on old process' files_struct), seems to confirm this as well. > > > > >> + if (!files) { > >> + sid = ++(*id); > >> + *fd = 0; > >> + goto again; > >> + } > >> + *fstruct = files; > >> + *task = tk; > >> + if (sid == *id) { > >> + sfd = *fd; > >> + } else { > >> + *id = sid; > >> + sfd = 0; > >> + } > >> + } > >> + > >> + rcu_read_lock(); > >> + for (; sfd < files_fdtable(files)->max_fds; sfd++) { > > > > files_fdtable does rcu_dereference on each iteration, would it be > > better to just cache files_fdtable(files)->max_fds into local > > variable? It's unlikely that there will be many iterations, but > > still... > > I borrowed code from fs/proc/fd.c. But I can certainly to avoid > repeated reading max_fds as suggested. 
> > > > >> + struct file *f; > >> + > >> + f = fcheck_files(files, sfd); > >> + if (!f) > >> + continue; > >> + *fd = sfd; > >> + get_file(f); > >> + rcu_read_unlock(); > >> + return f; > >> + } > >> + > >> + /* the current task is done, go to the next task */ > >> + rcu_read_unlock(); > >> + put_files_struct(files); > >> + *fstruct = NULL; > > > > *task = NULL; for completeness? > > if *fstruct == NULL, will try to get next task, so *task = NULL > is unnecessary, but I can add it, won't hurt and possibly make > it easy to understand. > > > > >> + sid = ++(*id); > >> + *fd = 0; > >> + goto again; > >> +} > >> + > >> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) > >> +{ > >> + struct bpf_iter_seq_task_file_info *info = seq->private; > >> + struct files_struct *files = NULL; > >> + struct task_struct *task = NULL; > >> + struct file *file; > >> + u32 id = info->id; > >> + int fd = info->fd; > >> + > >> + file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files); > >> + if (!file) { > >> + info->files = NULL; > > > > what about info->task here? > > info->files == NULL indicates the end of iteration, info->task will not > be checked any more. But I guess, I can assign NULL to task as well to > avoid confusion. > > > > >> + return NULL; > >> + } > >> + > >> + ++*pos; > >> + info->id = id; > >> + info->fd = fd; > >> + info->task = task; > >> + info->files = files; > >> + > >> + return file; > >> +} > >> + > > > > [...] > > > >> + > >> +struct bpf_iter__task_file { > >> + __bpf_md_ptr(struct bpf_iter_meta *, meta); > >> + __bpf_md_ptr(struct task_struct *, task); > >> + u32 fd; > > > > nit: sort of works by accident (due to all other field being 8-byte > > aligned pointers), shall we add __attribute__((aligned(8)))? > > This is what I thought as well. It should work. But I think > add aligned(8) wont' hurt to expresss the intention.. Will add it. > > > > >> + __bpf_md_ptr(struct file *, file); > >> +}; > >> + > > > > [...] > >
On 5/6/20 1:51 PM, Andrii Nakryiko wrote: > On Wed, May 6, 2020 at 11:24 AM Yonghong Song <yhs@fb.com> wrote: >> >> >> >> On 5/6/20 12:30 AM, Andrii Nakryiko wrote: >>> On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote: >>>> >>>> Only the tasks belonging to "current" pid namespace >>>> are enumerated. >>>> >>>> For task/file target, the bpf program will have access to >>>> struct task_struct *task >>>> u32 fd >>>> struct file *file >>>> where fd/file is an open file for the task. >>>> >>>> Signed-off-by: Yonghong Song <yhs@fb.com> >>>> --- >>> >>> I might be missing some subtleties with task refcounting for task_file >>> iterator, asked few questions below... >>> >>>> kernel/bpf/Makefile | 2 +- >>>> kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++ >>>> 2 files changed, 337 insertions(+), 1 deletion(-) >>>> create mode 100644 kernel/bpf/task_iter.c >>>> >>>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile >>>> index b2b5eefc5254..37b2d8620153 100644 >>>> --- a/kernel/bpf/Makefile >>>> +++ b/kernel/bpf/Makefile >>>> @@ -2,7 +2,7 @@ >>>> obj-y := core.o >>>> CFLAGS_core.o += $(call cc-disable-warning, override-init) >>>> >>>> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o >>>> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o >>>> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o >>>> obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o >>>> obj-$(CONFIG_BPF_SYSCALL) += disasm.o >>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c >>>> new file mode 100644 >>>> index 000000000000..1ca258f6e9f4 >>>> --- /dev/null >>>> +++ b/kernel/bpf/task_iter.c >>>> @@ -0,0 +1,336 @@ >>>> +// SPDX-License-Identifier: GPL-2.0-only >>>> +/* Copyright (c) 2020 Facebook */ >>>> + >>>> +#include <linux/init.h> >>>> +#include <linux/namei.h> >>>> +#include 
<linux/pid_namespace.h> >>>> +#include <linux/fs.h> >>>> +#include <linux/fdtable.h> >>>> +#include <linux/filter.h> >>>> + >>>> +struct bpf_iter_seq_task_common { >>>> + struct pid_namespace *ns; >>>> +}; >>>> + >>>> +struct bpf_iter_seq_task_info { >>>> + struct bpf_iter_seq_task_common common; >>> >>> you have comment below in init_seq_pidns() that common is supposed to >>> be the very first field, but I think it's more important and >>> appropriate here, so that whoever adds anything here knows that order >>> of field is important. >> >> I can move the comments here. >> >>> >>>> + struct task_struct *task; >>>> + u32 id; >>>> +}; >>>> + >>> >>> [...] >>> >>>> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop) >>>> +{ >>>> + struct bpf_iter_meta meta; >>>> + struct bpf_iter__task ctx; >>>> + struct bpf_prog *prog; >>>> + int ret = 0; >>>> + >>>> + meta.seq = seq; >>>> + prog = bpf_iter_get_info(&meta, in_stop); >>>> + if (prog) { >>> >>> >>> nit: `if (!prog) return 0;` here would reduce nesting level below >>> >>>> + meta.seq = seq; >>>> + ctx.meta = &meta; >>>> + ctx.task = v; >>>> + ret = bpf_iter_run_prog(prog, &ctx); >>>> + } >>>> + >>>> + return 0; >>> >>> return **ret**; ? >> >> It should return "ret". In task_file show() code is similar but correct. >> I can do early return with !prog too although we do not have >> deep nesting level yet. >> >>> >>>> +} >>>> + >>> >>> [...] >>> >>>> + >>>> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id, >>>> + int *fd, struct task_struct **task, >>>> + struct files_struct **fstruct) >>>> +{ >>>> + struct files_struct *files; >>>> + struct task_struct *tk; >>>> + u32 sid = *id; >>>> + int sfd; >>>> + >>>> + /* If this function returns a non-NULL file object, >>>> + * it held a reference to the files_struct and file. >>>> + * Otherwise, it does not hold any reference. 
>>>> + */ >>>> +again: >>>> + if (*fstruct) { >>>> + files = *fstruct; >>>> + sfd = *fd; >>>> + } else { >>>> + tk = task_seq_get_next(ns, &sid); >>>> + if (!tk) >>>> + return NULL; >>>> + >>>> + files = get_files_struct(tk); >>>> + put_task_struct(tk); >>> >>> task is put here, but is still used below.. is there some additional >>> hidden refcounting involved? >> >> Good question. I had an impression that we take a reference count >> for task->files so task should not go away. But reading linux >> code again, I do not have sufficient evidence to back my claim. >> So I will reference count task as well, e.g., do not put_task_struct() >> until all files are done here. > > All threads within the process share files table. So some threads > might exit, but files will stay, which is why task_struct and > files_struct have separate refcounting, and having refcount on files > doesn't guarantee any particular task will stay alive for long enough. > So I think we need to refcount both files and task in this case. > Reading source code of copy_files() in kernel/fork.c (CLONE_FILES > flags just bumps refcnt on old process' files_struct), seems to > confirm this as well. Just checked the code. It does look like files are shared among threads (tasks). So yes, in this case, reference counting to both task and file_table needed. > >> >>> >>>> + if (!files) { >>>> + sid = ++(*id); >>>> + *fd = 0; >>>> + goto again; >>>> + } >>>> + *fstruct = files; >>>> + *task = tk; >>>> + if (sid == *id) { >>>> + sfd = *fd; >>>> + } else { >>>> + *id = sid; >>>> + sfd = 0; >>>> + } >>>> + } >>>> + >>>> + rcu_read_lock(); [...]
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index b2b5eefc5254..37b2d8620153 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..1ca258f6e9f4
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+/* The private data of both the task and task_file iterators must
+ * start with this struct: init_seq_pidns() and fini_seq_pidns()
+ * cast the private data to it.
+ */
+struct bpf_iter_seq_task_common {
+	struct pid_namespace *ns;
+};
+
+struct bpf_iter_seq_task_info {
+	/* The first field must be struct bpf_iter_seq_task_common. */
+	struct bpf_iter_seq_task_common common;
+	struct task_struct *task;
+	u32 id;
+};
+
+/* Find the next task in @ns with pid id >= *id, taking a reference on it. */
+static struct task_struct *task_seq_get_next(struct pid_namespace *ns, u32 *id)
+{
+	struct task_struct *task = NULL;
+	struct pid *pid;
+
+	rcu_read_lock();
+	pid = idr_get_next(&ns->idr, id);
+	if (pid)
+		task = get_pid_task(pid, PIDTYPE_PID);
+	rcu_read_unlock();
+
+	return task;
+}
+
+static void *task_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *task;
+	u32 id = info->id;
+
+	task = task_seq_get_next(info->common.ns, &id);
+	if (!task)
+		return NULL;
+
+	++*pos;
+	info->task = task;
+	info->id = id;
+
+	return task;
+}
+
+static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *task;
+
+	++*pos;
+	++info->id;
+	task = task_seq_get_next(info->common.ns, &info->id);
+	if (!task)
+		return NULL;
+
+	put_task_struct(info->task);
+	info->task = task;
+	return task;
+}
+
+struct bpf_iter__task {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+};
+
+DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
+
+static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__task ctx;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	meta.seq = seq;
+	ctx.meta = &meta;
+	ctx.task = v;
+	/* was "return 0": propagate the bpf program's return value */
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_seq_show(seq, v, false);
+}
+
+static void task_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+
+	if (!v)
+		__task_seq_show(seq, v, true);
+
+	if (info->task) {
+		put_task_struct(info->task);
+		info->task = NULL;
+	}
+}
+
+static const struct seq_operations task_seq_ops = {
+	.start	= task_seq_start,
+	.next	= task_seq_next,
+	.stop	= task_seq_stop,
+	.show	= task_seq_show,
+};
+
+struct bpf_iter_seq_task_file_info {
+	/* The first field must be struct bpf_iter_seq_task_common. */
+	struct bpf_iter_seq_task_common common;
+	struct task_struct *task;
+	struct files_struct *files;
+	u32 id;
+	u32 fd;
+};
+
+static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
+					   int *fd, struct task_struct **task,
+					   struct files_struct **fstruct)
+{
+	struct files_struct *files;
+	struct task_struct *tk;
+	unsigned int max_fds;
+	u32 sid = *id;
+	int sfd;
+
+	/* If this function returns a non-NULL file object, it held
+	 * references to the task, the files_struct and the file.
+	 * Otherwise, it does not hold any reference.
+	 */
+again:
+	if (*fstruct) {
+		files = *fstruct;
+		sfd = *fd;
+	} else {
+		tk = task_seq_get_next(ns, &sid);
+		if (!tk)
+			return NULL;
+
+		files = get_files_struct(tk);
+		if (!files) {
+			put_task_struct(tk);
+			sid = ++(*id);
+			*fd = 0;
+			goto again;
+		}
+		/* Keep the task reference: the files table is shared
+		 * among threads (CLONE_FILES), so holding files alone
+		 * does not pin the task.  Both references are dropped
+		 * below once this task's fds are exhausted, or in
+		 * task_file_seq_stop().
+		 */
+		*fstruct = files;
+		*task = tk;
+		if (sid == *id) {
+			sfd = *fd;
+		} else {
+			*id = sid;
+			sfd = 0;
+		}
+	}
+
+	rcu_read_lock();
+	/* read max_fds once instead of rcu_dereference()ing per iteration */
+	max_fds = files_fdtable(files)->max_fds;
+	for (; sfd < max_fds; sfd++) {
+		struct file *f;
+
+		f = fcheck_files(files, sfd);
+		if (!f)
+			continue;
+		*fd = sfd;
+		get_file(f);
+		rcu_read_unlock();
+		return f;
+	}
+
+	/* the current task is done, go to the next task */
+	rcu_read_unlock();
+	put_files_struct(files);
+	put_task_struct(*task);
+	*fstruct = NULL;
+	*task = NULL;
+	sid = ++(*id);
+	*fd = 0;
+	goto again;
+}
+
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct files_struct *files = NULL;
+	struct task_struct *task = NULL;
+	struct file *file;
+	u32 id = info->id;
+	int fd = info->fd;
+
+	file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
+	if (!file) {
+		info->files = NULL;
+		info->task = NULL;
+		return NULL;
+	}
+
+	++*pos;
+	info->id = id;
+	info->fd = fd;
+	info->task = task;
+	info->files = files;
+
+	return file;
+}
+
+static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct files_struct *files = info->files;
+	struct task_struct *task = info->task;
+	struct file *file;
+
+	++*pos;
+	++info->fd;
+	fput((struct file *)v);
+	file = task_file_seq_get_next(info->common.ns, &info->id, &info->fd,
+				      &task, &files);
+	if (!file) {
+		info->files = NULL;
+		info->task = NULL;
+		return NULL;
+	}
+
+	info->task = task;
+	info->files = files;
+
+	return file;
+}
+
+struct bpf_iter__task_file {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+	/* 8-byte alignment is relied upon by bpf programs; make it
+	 * explicit instead of depending on the neighboring pointers.
+	 */
+	u32 fd __attribute__((aligned(8)));
+	__bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
+		     struct task_struct *task, u32 fd,
+		     struct file *file)
+
+static int __task_file_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct bpf_iter__task_file ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.task = info->task;
+	ctx.fd = info->fd;
+	ctx.file = v;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_file_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_file_seq_show(seq, v, false);
+}
+
+static void task_file_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+
+	if (!v)
+		__task_file_seq_show(seq, v, true);
+	else if (!IS_ERR(v))
+		fput((struct file *)v);
+
+	if (info->files) {
+		/* drop the references taken in task_file_seq_get_next() */
+		put_task_struct(info->task);
+		info->task = NULL;
+		put_files_struct(info->files);
+		info->files = NULL;
+	}
+}
+
+/* Both iterators pin the "current" pid namespace at attach time. */
+static int init_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	common->ns = get_pid_ns(task_active_pid_ns(current));
+	return 0;
+}
+
+static void fini_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	put_pid_ns(common->ns);
+}
+
+static const struct seq_operations task_file_seq_ops = {
+	.start	= task_file_seq_start,
+	.next	= task_file_seq_next,
+	.stop	= task_file_seq_stop,
+	.show	= task_file_seq_show,
+};
+
+static int __init task_iter_init(void)
+{
+	struct bpf_iter_reg task_file_reg_info = {
+		.target			= "task_file",
+		.seq_ops		= &task_file_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
+	};
+	struct bpf_iter_reg task_reg_info = {
+		.target			= "task",
+		.seq_ops		= &task_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
+	};
+	int ret;
+
+	ret = bpf_iter_reg_target(&task_reg_info);
+	if (ret)
+		return ret;
+
+	return bpf_iter_reg_target(&task_file_reg_info);
+}
+late_initcall(task_iter_init);
Only the tasks belonging to "current" pid namespace are enumerated. For task/file target, the bpf program will have access to struct task_struct *task u32 fd struct file *file where fd/file is an open file for the task. Signed-off-by: Yonghong Song <yhs@fb.com> --- kernel/bpf/Makefile | 2 +- kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/task_iter.c