diff mbox series

[bpf-next,1/6] bpf: introduce BPF_PROG_TYPE_FILE_FILTER

Message ID 20181004025750.498303-2-ast@kernel.org
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series bpf: introduce BPF_CGROUP_FILE_OPEN | expand

Commit Message

Alexei Starovoitov Oct. 4, 2018, 2:57 a.m. UTC
Similar to networking sandboxing programs and cgroup-v2 based hooks
(BPF_CGROUP_INET_[INGRESS|EGRESS,] BPF_CGROUP_INET[4|6]_[BIND|CONNECT], etc)
introduce basic per-container sandboxing for file access via
new BPF_PROG_TYPE_FILE_FILTER program type that attaches after
security_file_open() LSM hook and works as additional file_open filter.
The new cgroup bpf hook is called BPF_CGROUP_FILE_OPEN.

Just like other cgroup-bpf programs new BPF_PROG_TYPE_FILE_FILTER type
is only available to root.

This program type has access to single argument 'struct bpf_file_info'
that contains standard sys_stat fields:
struct bpf_file_info {
        __u64 inode;
        __u32 dev_major;
        __u32 dev_minor;
        __u32 fs_magic;
        __u32 mnt_id;
        __u32 nlink;
        __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
        __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
};
Other file attributes can be added in the future to the end of this struct
without breaking bpf programs.

For debugging introduce bpf_get_file_path() helper that returns
NUL-terminated full path of the file. It should never be used for sandboxing.

Use cases:
- disallow certain FS types within containers (fs_magic == CGROUP2_SUPER_MAGIC)
- restrict permissions in particular mount (mnt_id == X && (flags & O_RDWR))
- disallow access to hard linked sensitive files (nlink > 1 && mode == 0700)
- disallow access to world writeable files (mode == 0..7)
- disallow access to given set of files (dev_major == X && dev_minor == Y && inode == Z)

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h |  10 +++
 include/linux/bpf_types.h  |   1 +
 include/uapi/linux/bpf.h   |  28 +++++-
 kernel/bpf/cgroup.c        | 171 +++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       |   7 ++
 5 files changed, 216 insertions(+), 1 deletion(-)

Comments

Roman Gushchin Oct. 4, 2018, 7:41 p.m. UTC | #1
On Wed, Oct 03, 2018 at 07:57:45PM -0700, Alexei Starovoitov wrote:
> Similar to networking sandboxing programs and cgroup-v2 based hooks
> (BPF_CGROUP_INET_[INGRESS|EGRESS,] BPF_CGROUP_INET[4|6]_[BIND|CONNECT], etc)
> introduce basic per-container sandboxing for file access via
> new BPF_PROG_TYPE_FILE_FILTER program type that attaches after
> security_file_open() LSM hook and works as additional file_open filter.
> The new cgroup bpf hook is called BPF_CGROUP_FILE_OPEN.
> 
> Just like other cgroup-bpf programs new BPF_PROG_TYPE_FILE_FILTER type
> is only available to root.
> 
> This program type has access to single argument 'struct bpf_file_info'
> that contains standard sys_stat fields:
> struct bpf_file_info {
>         __u64 inode;
>         __u32 dev_major;
>         __u32 dev_minor;
>         __u32 fs_magic;
>         __u32 mnt_id;
>         __u32 nlink;
>         __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
>         __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
> };

It's probably nice to have file uid/gid as well.

Thanks!
Andy Lutomirski Oct. 4, 2018, 7:51 p.m. UTC | #2
On Thu, Oct 4, 2018 at 12:41 PM Roman Gushchin <guro@fb.com> wrote:
>
> On Wed, Oct 03, 2018 at 07:57:45PM -0700, Alexei Starovoitov wrote:
> > Similar to networking sandboxing programs and cgroup-v2 based hooks
> > (BPF_CGROUP_INET_[INGRESS|EGRESS,] BPF_CGROUP_INET[4|6]_[BIND|CONNECT], etc)
> > introduce basic per-container sandboxing for file access via
> > new BPF_PROG_TYPE_FILE_FILTER program type that attaches after
> > security_file_open() LSM hook and works as additional file_open filter.
> > The new cgroup bpf hook is called BPF_CGROUP_FILE_OPEN.
> >
> > Just like other cgroup-bpf programs new BPF_PROG_TYPE_FILE_FILTER type
> > is only available to root.
> >
> > This program type has access to single argument 'struct bpf_file_info'
> > that contains standard sys_stat fields:
> > struct bpf_file_info {
> >         __u64 inode;
> >         __u32 dev_major;
> >         __u32 dev_minor;
> >         __u32 fs_magic;
> >         __u32 mnt_id;
> >         __u32 nlink;
> >         __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
> >         __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
> > };
>
> It's probably nice to have file uid/gid as well.

And an indication of which mount namespace we're looking at.

--Andy
Alexei Starovoitov Oct. 4, 2018, 10:23 p.m. UTC | #3
On Thu, Oct 04, 2018 at 12:51:00PM -0700, Andy Lutomirski wrote:
> On Thu, Oct 4, 2018 at 12:41 PM Roman Gushchin <guro@fb.com> wrote:
> >
> > On Wed, Oct 03, 2018 at 07:57:45PM -0700, Alexei Starovoitov wrote:
> > > Similar to networking sandboxing programs and cgroup-v2 based hooks
> > > (BPF_CGROUP_INET_[INGRESS|EGRESS,] BPF_CGROUP_INET[4|6]_[BIND|CONNECT], etc)
> > > introduce basic per-container sandboxing for file access via
> > > new BPF_PROG_TYPE_FILE_FILTER program type that attaches after
> > > security_file_open() LSM hook and works as additional file_open filter.
> > > The new cgroup bpf hook is called BPF_CGROUP_FILE_OPEN.
> > >
> > > Just like other cgroup-bpf programs new BPF_PROG_TYPE_FILE_FILTER type
> > > is only available to root.
> > >
> > > This program type has access to single argument 'struct bpf_file_info'
> > > that contains standard sys_stat fields:
> > > struct bpf_file_info {
> > >         __u64 inode;
> > >         __u32 dev_major;
> > >         __u32 dev_minor;
> > >         __u32 fs_magic;
> > >         __u32 mnt_id;
> > >         __u32 nlink;
> > >         __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
> > >         __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
> > > };
> >
> > It's probably nice to have file uid/gid as well.
> 
> And an indication of which mount namespace we're looking at.

Both certainly can be added in the future without breaking progs.
I didn't want to add too much all at once.
For file uid/gid I prototyped 
bpf_get_statx(struct bpf_file_info *file, int flags, int mask, struct statx *sx, int size);
helper that calls normal statx underneath.
But it's not fast, since sizeof(struct statx) == 256 and it has to be fully
inited by the helper or by the bpf prog (since bpf doesn't allow uninited memory anywhere).
Then I thought about going back to older sys_stat helper (without x), since structs
are smaller, but that didn't look as good either. So I've decied to table it for now
and get the basic support via 'struct bpf_file_info' first.
Then extend it later via new fields and new helpers.
Al Viro Oct. 5, 2018, 4:46 a.m. UTC | #4
On Wed, Oct 03, 2018 at 07:57:45PM -0700, Alexei Starovoitov wrote:
 
> @@ -15,6 +15,7 @@
>  #include <linux/bpf.h>
>  #include <linux/bpf-cgroup.h>
>  #include <net/sock.h>
> +#include <../fs/mount.h>

No.

> +	struct file *file = NULL;
> +	struct inode *inode;
> +	struct super_block *sb;
> +	struct mount *mnt;

Fuck, no.

> +	case offsetof(struct bpf_file_info, mnt_id):
> +		/* dst = real_mount(file->f_path.mnt)->mnt_id */
> +		mnt = real_mount(LD_1(file->f_path.mnt));
> +		LD_n(mnt->mnt_id);

NAK.  Anything in struct mount is private to just a couple of
files in fs/*.c.  Don't do that.  And keep in mind that internal
details can and will be changed at zero notice, so be careful
with adding such stuff.

Another problem is your direct poking in ->i_ino.  It's not
something directly exposed to userland at the moment and it should
not become such.  Filesystem has every right to have ->getattr()
set ->ino (== ->st_ino value) in whichever way it likes; the same
goes for ->dev.
Alexei Starovoitov Oct. 5, 2018, 10:05 p.m. UTC | #5
On Fri, Oct 05, 2018 at 05:46:59AM +0100, Al Viro wrote:
> On Wed, Oct 03, 2018 at 07:57:45PM -0700, Alexei Starovoitov wrote:
>  
> > @@ -15,6 +15,7 @@
> >  #include <linux/bpf.h>
> >  #include <linux/bpf-cgroup.h>
> >  #include <net/sock.h>
> > +#include <../fs/mount.h>
> 
> No.

I've considered splitting cgroup_file_filter_ctx_access() into
separate .c inside fs/ directory, but it felt odd to move just that
function whereas the rest of the bpf bits are in kernel/bpf/
only to avoid doing this "../fs/" hack.
How about calling this new file fs/bpf_file_filter.c ?

> > +	struct file *file = NULL;
> > +	struct inode *inode;
> > +	struct super_block *sb;
> > +	struct mount *mnt;
> 
> Fuck, no.
> 
> > +	case offsetof(struct bpf_file_info, mnt_id):
> > +		/* dst = real_mount(file->f_path.mnt)->mnt_id */
> > +		mnt = real_mount(LD_1(file->f_path.mnt));
> > +		LD_n(mnt->mnt_id);
> 
> NAK.  Anything in struct mount is private to just a couple of
> files in fs/*.c.  Don't do that.  And keep in mind that internal
> details can and will be changed at zero notice, so be careful
> with adding such stuff.

yes. The internal details of file and inode structs can and will change.
Just like all the sk_buff internals that are exposed to bpf
via the same context rewriting mechanism.
See commit 9bac3d6d548e ("bpf: allow extended BPF programs access skb fields")
that exposed first 4 fields out of sk_buff to bpf progs via
'struct __sk_buff'.
Other fields were exposed later. Some of them are not even from sk_buff.
For networking programs 'struct __sk_buff' is the context.
For this new file_filter programs 'struct bpf_file_info' is the context.
That's the only thing bpf side can see.

> Another problem is your direct poking in ->i_ino.  It's not
> something directly exposed to userland at the moment and it should
> not become such.

The patch is not making i_ino directly exposed.
Only 'struct bpf_file_info' is exposed to user space / bpf programs.

In 'struct bpf_file_info' the field is called '__u64 inode'.
That is the abi.
The kernel side stays with 'unsigned long i_ino;' field.
Its size is different on 32/64 architectures, but to bpf progs
it's always seen as '__u64 inode'.
If in the future the kernel decides to change 'i_ino' to be u64,
nothing will change on bpf side.

> Filesystem has every right to have ->getattr()
> set ->ino (== ->st_ino value) in whichever way it likes; 

Essentially what cgroup_file_filter_ctx_access() is doing
is the same as generic_fillattr() + cp_statx() work, but via
on the fly rewriting of bpf instructions during
loading of bpf program.

See my other reply to Andy where I argued that certain
fields like uid/gid probably don't make sense to expose
to bpf via ctx rewriter mechanism and instead they can be
done via new bpf_get_file_statx() helper.
That's future work.

> the same
> goes for ->dev.

Notice how single kernel field file->f_inode->i_sb->s_dev
is exposed to bpf via two fields dev_major and dev_minor
inside 'struct bpf_file_info'.
For dev_minor the ctx rewriting is done as:
+       case offsetof(struct bpf_file_info, dev_minor):
+               /* dst = file->f_inode->i_sb->s_dev */
+               inode = LD_1(file->f_inode);
+               sb = LD_n(inode->i_sb);
+               LD_n(sb->s_dev);
+               *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, MINORMASK);
+               break;

If the last AND instruction was not there, the whole 's_dev' field
would have been exposed to bpf side and that would be questionable
design choice, since s_dev has kernel internal encoding of major/minor.
Instead the AND instruction is masking minor bits.
So above ctx rewriting for 'dev_minor' field is equivalent
to line
  stat->dev = inode->i_sb->s_dev;
from generic_fillattr()
plus another line
  tmp.stx_dev_minor = MINOR(stat->dev);
from cp_statx()

This way the kernel internal field 's_dev' is split into
two dev_major/dev_minor bpf visible fields.

The end result is that no new kernel information is exposed.
The same information is seen via statx.
Andy Lutomirski Oct. 5, 2018, 10:09 p.m. UTC | #6
On Fri, Oct 5, 2018 at 3:05 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Oct 05, 2018 at 05:46:59AM +0100, Al Viro wrote:
>
> > Another problem is your direct poking in ->i_ino.  It's not
> > something directly exposed to userland at the moment and it should
> > not become such.
>
> The patch is not making i_ino directly exposed.
> Only 'struct bpf_file_info' is exposed to user space / bpf programs.

I think Al is saying that the valie of i_ino is not something that
user code is permitted to know regardless of how you format it because
it may or may not actually match the value returned by stat().
Another way of saying that is that your patch is digging into an
internal data structure and is doing it wrong.

--Andy
Alexei Starovoitov Oct. 5, 2018, 10:27 p.m. UTC | #7
On Fri, Oct 05, 2018 at 03:09:20PM -0700, Andy Lutomirski wrote:
> On Fri, Oct 5, 2018 at 3:05 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Oct 05, 2018 at 05:46:59AM +0100, Al Viro wrote:
> >
> > > Another problem is your direct poking in ->i_ino.  It's not
> > > something directly exposed to userland at the moment and it should
> > > not become such.
> >
> > The patch is not making i_ino directly exposed.
> > Only 'struct bpf_file_info' is exposed to user space / bpf programs.
> 
> I think Al is saying that the valie of i_ino is not something that
> user code is permitted to know regardless of how you format it because
> it may or may not actually match the value returned by stat().
> Another way of saying that is that your patch is digging into an
> internal data structure and is doing it wrong.

several fs implementation I've looked at just do generic_fillattr()
for these fields. Are you saying some FS don't use inode->i_ino at all?
And it's bogus, hence shouldn't be read?
Al Viro Oct. 5, 2018, 11:47 p.m. UTC | #8
On Fri, Oct 05, 2018 at 03:27:54PM -0700, Alexei Starovoitov wrote:
> On Fri, Oct 05, 2018 at 03:09:20PM -0700, Andy Lutomirski wrote:
> > On Fri, Oct 5, 2018 at 3:05 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, Oct 05, 2018 at 05:46:59AM +0100, Al Viro wrote:
> > >
> > > > Another problem is your direct poking in ->i_ino.  It's not
> > > > something directly exposed to userland at the moment and it should
> > > > not become such.
> > >
> > > The patch is not making i_ino directly exposed.
> > > Only 'struct bpf_file_info' is exposed to user space / bpf programs.
> > 
> > I think Al is saying that the valie of i_ino is not something that
> > user code is permitted to know regardless of how you format it because
> > it may or may not actually match the value returned by stat().
> > Another way of saying that is that your patch is digging into an
> > internal data structure and is doing it wrong.
> 
> several fs implementation I've looked at just do generic_fillattr()
> for these fields. Are you saying some FS don't use inode->i_ino at all?
> And it's bogus, hence shouldn't be read?

Bloody wonderful...  "Several instances use a library function and do not
override that part of its results; ergo, let's assume that all of them
do the same".

generic_fillattr() is a library function.  In a lot of cases this is all
->getattr() instance needs to do.  And yes, use of ->i_dev and ->i_ino
to intialize ->st_dev and ->st_ino happens to be the default.  However,
this is entirely up to the filesystem in question.  These fields are
fs-private; whether to use them for stat(2) (or anything userland-visible,
really) or to calculate some other value is up to individual filesystem.

FWIW, finding which instances do that is as simple as
grep -n '[-]>[[:space:]]*ino[[:space:]]*=' `git grep -l -w generic_fillattr`
on the plausible theory that ->getattr() instances will be using that helper
at least for some of the fields.  Discarding fs/stat.c, where generic_fillattr()
itself lives, we are left with
fs/ceph/inode.c:558:            if (realm->ino == ci->i_vino.ino)
fs/ceph/inode.c:2268:           stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
fs/cifs/inode.c:2067:   stat->ino = CIFS_I(inode)->uniqueid;
fs/fat/file.c:410:              stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode);
fs/fuse/dir.c:866:      stat->ino = attr->ino;
fs/fuse/dir.c:954:              stat->ino = fi->orig_ino;
fs/nfs/inode.c:841:     stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
Trivial examination shows that all matches except the first one *are* in ->getattr()
instances of the filesystems in question or are called from such.

And no, it's not that each of those filesystems does not use inode->i_ino at all.
There's a bunch of library helpers in fs/*.c that happen to use the value filesystem
has seen fit to store there.  Whether to use those helpers or not, what to store in
that field, etc. is, again, entirely up to the filesystem in question.
generic_fillattr() is one of those, that's all there is to it.

That's precisely why I really do not like the idea of hooks poking in the internals
of kernel data structures.  Especially since not even "it's not visible outside of
a subsystem-internal header" appears to slow you down.

The same goes for tracepoints, etc. - turning random details of implementation into
a carved in stone ABI is actively harmful.

NAK.

PS: If anything, visibility to hooks should be opt-in.  Sure, we can start actively hiding
the things, but that's a winless arms race  - even if we bloody went and encrypted the
private stuff, you'd still be able to pull decryption key from where it would be accessed
by legitimate users, etc.  Нахуй нам это надо?
Alexei Starovoitov Oct. 6, 2018, 12:22 a.m. UTC | #9
On Sat, Oct 06, 2018 at 12:47:06AM +0100, Al Viro wrote:
> 
> And no, it's not that each of those filesystems does not use inode->i_ino at all.
> There's a bunch of library helpers in fs/*.c that happen to use the value filesystem
> has seen fit to store there.  Whether to use those helpers or not, what to store in
> that field, etc. is, again, entirely up to the filesystem in question.
> generic_fillattr() is one of those, that's all there is to it.

makes sense.

> That's precisely why I really do not like the idea of hooks poking in the internals
> of kernel data structures.  Especially since not even "it's not visible outside of
> a subsystem-internal header" appears to slow you down.

that's why there is a code review to get the patches in shape that
don't expose kernel internals by _accident_.

> The same goes for tracepoints, etc. - turning random details of implementation into
> a carved in stone ABI is actively harmful.

we're not talking about tracepoints here.

> PS: If anything, visibility to hooks should be opt-in.

not sure how do you expect 'opt-in' to be imlemented.
As far as exposing inode and dev I agree that was a wrong approach.
Going to switch to struct file_handle instead.
Doing statx for every file_open is probably too slow for practical use.

> Sure, we can start actively hiding
> the things, but that's a winless arms race  - even if we bloody went and encrypted the
> private stuff, you'd still be able to pull decryption key from where it would be accessed
> by legitimate users, etc.  Нахуй нам это надо?

why do this ?
the use cases are explained in commit log.

Back to internals exposure.
Do you see an issue in the following:
struct bpf_file_info {
        __u32 fs_magic; // file->f_inode->i_sb->s_magic
        __u32 mnt_id;   // real_mount(file->f_path.mnt)->mnt_id
        __u32 nlink;    // file->f_inode->i_nlink
        __u32 mode;     // file->f_inode->i_mode
        __u32 flags;    // file->f_flags
};
and separate helper that returns struct file_handle ?
and ctx rewriting moved to fs/bpf_file_filter.c ?
Jann Horn Oct. 8, 2018, 12:56 a.m. UTC | #10
+cc kernel-hardening because this is related to sandboxing
+cc Mickaël Salaün because this looks related to his Landlock proposal

On Mon, Oct 8, 2018 at 2:30 AM Alexei Starovoitov <ast@kernel.org> wrote:
> Similar to networking sandboxing programs and cgroup-v2 based hooks
> (BPF_CGROUP_INET_[INGRESS|EGRESS,] BPF_CGROUP_INET[4|6]_[BIND|CONNECT], etc)
> introduce basic per-container sandboxing for file access via
> new BPF_PROG_TYPE_FILE_FILTER program type that attaches after
> security_file_open() LSM hook and works as additional file_open filter.
> The new cgroup bpf hook is called BPF_CGROUP_FILE_OPEN.

Why do_dentry_open() specifically, and nothing else? If you want to
filter open, wouldn't you also want to filter a bunch of other
filesystem operations with LSM hooks, like rename, unlink, truncate
and so on? Landlock benefits there from re-using the existing security
hooks.

> Just like other cgroup-bpf programs new BPF_PROG_TYPE_FILE_FILTER type
> is only available to root.
>
> This program type has access to single argument 'struct bpf_file_info'
> that contains standard sys_stat fields:
> struct bpf_file_info {
>         __u64 inode;
>         __u32 dev_major;
>         __u32 dev_minor;
>         __u32 fs_magic;
>         __u32 mnt_id;
>         __u32 nlink;
>         __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
>         __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
> };
> Other file attributes can be added in the future to the end of this struct
> without breaking bpf programs.
>
> For debugging introduce bpf_get_file_path() helper that returns
> NUL-terminated full path of the file. It should never be used for sandboxing.
>
> Use cases:
> - disallow certain FS types within containers (fs_magic == CGROUP2_SUPER_MAGIC)
> - restrict permissions in particular mount (mnt_id == X && (flags & O_RDWR))
> - disallow access to hard linked sensitive files (nlink > 1 && mode == 0700)
> - disallow access to world writeable files (mode == 0..7)
> - disallow access to given set of files (dev_major == X && dev_minor == Y && inode == Z)

That last one sounds weird. It doesn't work if you want to ban access
to a whole directory at once. And in general, highly specific
blocklists make me nervous, because if you add anything new and forget
to put it on the list, you have a problem.

> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf-cgroup.h |  10 +++
>  include/linux/bpf_types.h  |   1 +
>  include/uapi/linux/bpf.h   |  28 +++++-
>  kernel/bpf/cgroup.c        | 171 +++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c       |   7 ++
>  5 files changed, 216 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 588dd5f0bd85..766f0223c222 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -109,6 +109,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
>  int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
>                                       short access, enum bpf_attach_type type);
>
> +int __cgroup_bpf_file_filter(struct file *file, enum bpf_attach_type type);
> +
>  static inline enum bpf_cgroup_storage_type cgroup_storage_type(
>         struct bpf_map *map)
>  {
> @@ -253,6 +255,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
>                                                                               \
>         __ret;                                                                \
>  })
> +#define BPF_CGROUP_RUN_PROG_FILE_FILTER(file)                               \
> +({                                                                           \
> +       int __ret = 0;                                                        \
> +       if (cgroup_bpf_enabled)                                               \
> +               __ret = __cgroup_bpf_file_filter(file, BPF_CGROUP_FILE_OPEN); \
> +       __ret;                                                                \
> +})
>  int cgroup_bpf_prog_attach(const union bpf_attr *attr,
>                            enum bpf_prog_type ptype, struct bpf_prog *prog);
>  int cgroup_bpf_prog_detach(const union bpf_attr *attr,
> @@ -321,6 +330,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>  #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_FILE_FILTER(file) ({ 0; })
>
>  #define for_each_cgroup_storage_type(stype) for (; false; )
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 5432f4c9f50e..f182b2e37b94 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -33,6 +33,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
>  #ifdef CONFIG_INET
>  BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
>  #endif
> +BPF_PROG_TYPE(BPF_PROG_TYPE_FILE_FILTER, file_filter)
>
>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f9187b41dff6..c0df8dd99edc 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -154,6 +154,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_LIRC_MODE2,
>         BPF_PROG_TYPE_SK_REUSEPORT,
>         BPF_PROG_TYPE_FLOW_DISSECTOR,
> +       BPF_PROG_TYPE_FILE_FILTER,
>  };
>
>  enum bpf_attach_type {
> @@ -175,6 +176,7 @@ enum bpf_attach_type {
>         BPF_CGROUP_UDP6_SENDMSG,
>         BPF_LIRC_MODE2,
>         BPF_FLOW_DISSECTOR,
> +       BPF_CGROUP_FILE_OPEN,
>         __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -2215,6 +2217,18 @@ union bpf_attr {
>   *             pointer that was returned from bpf_sk_lookup_xxx\ ().
>   *     Return
>   *             0 on success, or a negative error in case of failure.
> + *
> + * int bpf_get_file_path(struct bpf_file_info *file, char *buf, u32 size_of_buf)
> + *     Description
> + *             Reconstruct the full path of *file* and store it into *buf* of
> + *             *size_of_buf*. The *size_of_buf* must be strictly positive.
> + *             On success, the helper makes sure that the *buf* is NUL-terminated.
> + *             On failure, it is filled with string "(error)".
> + *             This helper should only be used for debugging.
> + *             'char *path' should never be used for permission checks.
> + *     Return
> + *             0 on success, or a negative error in case of failure.
> + *
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -2303,7 +2317,8 @@ union bpf_attr {
>         FN(skb_ancestor_cgroup_id),     \
>         FN(sk_lookup_tcp),              \
>         FN(sk_lookup_udp),              \
> -       FN(sk_release),
> +       FN(sk_release),                 \
> +       FN(get_file_path),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2896,4 +2911,15 @@ struct bpf_flow_keys {
>         };
>  };
>
> +struct bpf_file_info {
> +       __u64 inode;
> +       __u32 dev_major;
> +       __u32 dev_minor;
> +       __u32 fs_magic;
> +       __u32 mnt_id;
> +       __u32 nlink;
> +       __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
> +       __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
> +};
[...]
Alexei Starovoitov Oct. 8, 2018, 2:22 a.m. UTC | #11
On Mon, Oct 08, 2018 at 02:56:15AM +0200, Jann Horn wrote:
> +cc kernel-hardening because this is related to sandboxing
> +cc Mickaël Salaün because this looks related to his Landlock proposal

It may seem that this work overlaps with landlock, but the goals
are different. Landlock is LSM based to act as _security_ framework
with end goal being available to unpriv users.
While this cgroup-bpf hook is expected to stay root only,
since we're trying to restrict what containers can do in a trusted environment.
'sandboxing' is overloaded word.
Sandboxing for security and sandboxing of trusted root are different.

> On Mon, Oct 8, 2018 at 2:30 AM Alexei Starovoitov <ast@kernel.org> wrote:
> > Similar to networking sandboxing programs and cgroup-v2 based hooks
> > (BPF_CGROUP_INET_[INGRESS|EGRESS,] BPF_CGROUP_INET[4|6]_[BIND|CONNECT], etc)
> > introduce basic per-container sandboxing for file access via
> > new BPF_PROG_TYPE_FILE_FILTER program type that attaches after
> > security_file_open() LSM hook and works as additional file_open filter.
> > The new cgroup bpf hook is called BPF_CGROUP_FILE_OPEN.
> 
> Why do_dentry_open() specifically, and nothing else? If you want to
> filter open, wouldn't you also want to filter a bunch of other
> filesystem operations with LSM hooks, like rename, unlink, truncate
> and so on? Landlock benefits there from re-using the existing security
> hooks.

It may make sense to extend in the future, but we don't have clear
user cases for rename/unlink/truncate at this point.

As you can see the amount of pushback even for basic file access is high.
Hence I don't think the landlock can be upstreamed in the current form,
since it touches VFS layer a lot more than this patch.
It's intrusive to LSM, and adds new concepts to BPF as well.
This work fits into existing BPF machinery and minimally intrusive to VFS.
I hope we can find a common ground with Al regarding what file
access primitives are exposed to BPF side.
Once we agree on that the landlock can piggy back on this work
and extend it to all file-based LSM hooks.
The first step for everyone interested in bpf-based 'sandboxing'
is to figure out VFS<->BPF interface.
If you or Mickael have suggestions on what bpf progs should and should not
see at these hooks, it's a good time to discuss.
I believe the fields proposed are the obvious minimum.

> > Just like other cgroup-bpf programs new BPF_PROG_TYPE_FILE_FILTER type
> > is only available to root.
> >
> > This program type has access to single argument 'struct bpf_file_info'
> > that contains standard sys_stat fields:
> > struct bpf_file_info {
> >         __u64 inode;
> >         __u32 dev_major;
> >         __u32 dev_minor;
> >         __u32 fs_magic;
> >         __u32 mnt_id;
> >         __u32 nlink;
> >         __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
> >         __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
> > };
> > Other file attributes can be added in the future to the end of this struct
> > without breaking bpf programs.
> >
> > For debugging introduce bpf_get_file_path() helper that returns
> > NUL-terminated full path of the file. It should never be used for sandboxing.
> >
> > Use cases:
> > - disallow certain FS types within containers (fs_magic == CGROUP2_SUPER_MAGIC)
> > - restrict permissions in particular mount (mnt_id == X && (flags & O_RDWR))
> > - disallow access to hard linked sensitive files (nlink > 1 && mode == 0700)
> > - disallow access to world writeable files (mode == 0..7)
> > - disallow access to given set of files (dev_major == X && dev_minor == Y && inode == Z)
> 
> That last one sounds weird. It doesn't work if you want to ban access
> to a whole directory at once. And in general, highly specific
> blocklists make me nervous, because if you add anything new and forget
> to put it on the list, you have a problem.

In the upcoming V2 of the patches the direct exposure to dev and inode will be removed.
And instead the opaque 'struct file_handle' will be available to bpf progs.
The use case is indeed to restrict access to specific blacklist of files.
The user space will collect the set of files via sys_name_to_handle_at(),
then store the fhandles in bpf map, and bpf prog will consult the map
to deny the access.
It's not a replacement for ACLs, directory permissions, etc. The use case
is to prevent trusted containers messing up the environment.
The continuous integration system needs to run some containers (and tests inside them)
with root privs. When these jobs mess up the system the subsequent jobs may incorrectly
fail. We believe that this cgroup based container enforcement will solve this use case
and similar other use cases when containers are trusted, but could be buggy when
it comes to file access.

To recap what I'm implementing in V2:
1.
struct bpf_file_info {
        __u32 fs_magic; // file->f_inode->i_sb->s_magic
        __u32 mnt_id;   // real_mount(file->f_path.mnt)->mnt_id
        __u32 nlink;    // file->f_inode->i_nlink
        __u32 mode;     // file->f_inode->i_mode
        __u32 flags;    // file->f_flags
};
I double checked what VFS layer does with above fields and I think
there is no additional user space exposure will be made when such
fields are seen by bpf progs.
But since I'm not a VFS expert, I'd like Al to confirm.

2.
bpf_get_file_handle(struct bpf_file_info *ctx, struct file_handle *fh, int fh_size);
helper that bpf prog will use to obtain fh of the file about to be open.

3.
bpf_get_file_statx(struct bpf_file_info *ctx, struct statx *sx, int size, int flags);
Though struct statx is 256 bytes, and the helper would have to touch all bytes
I couldn't figure out the faster way to get to inode/dev/uid of the given file
that will work on all underlying FSes.

Thoughts?
Mickaël Salaün Oct. 8, 2018, 9:06 a.m. UTC | #12
On 10/8/18 04:22, Alexei Starovoitov wrote:
> On Mon, Oct 08, 2018 at 02:56:15AM +0200, Jann Horn wrote:
>> +cc kernel-hardening because this is related to sandboxing
>> +cc Mickaël Salaün because this looks related to his Landlock proposal

Thanks for the CC. Because this patch series is an access-control
mechanism, it will undoubtedly interest LSM folks as well.

Here is the full thread:
https://lore.kernel.org/lkml/20181004025750.498303-1-ast@kernel.org/

> 
> It may seem that this work overlaps with landlock, but the goals
> are different. Landlock is LSM based to act as _security_ framework
> with end goal being available to unpriv users.

Right, one of the end goal of Landlock is to make most of its
access-control features available to unprivileged processes. However,
some of them may require higher privileges, but we should minimize them
as much as possible to not end up with the SUID-binary (or
CAP_SYS_ADMIN) syndrome.

> While this cgroup-bpf hook is expected to stay root only,
> since we're trying to restrict what containers can do in a trusted environment.
> 'sandboxing' is overloaded word.
> Sandboxing for security and sandboxing of trusted root are different.

Sure we must treat them in a different manner but we must take care of
which capability this sandboxing mechanism really need, and hopefully
not root-like ones, especially for a container.

As a reminder, using cgroups to sandbox processes is definitely a use
case for Landlock, as well as the process hierarchy one (seccomp-like).
I implemented sandboxing with cgroups in a previous patch series, but
put it aside for now to minimize the number of patches.

> 
>> On Mon, Oct 8, 2018 at 2:30 AM Alexei Starovoitov <ast@kernel.org> wrote:
>>> Similar to networking sandboxing programs and cgroup-v2 based hooks
>>> (BPF_CGROUP_INET_[INGRESS|EGRESS,] BPF_CGROUP_INET[4|6]_[BIND|CONNECT], etc)
>>> introduce basic per-container sandboxing for file access via
>>> new BPF_PROG_TYPE_FILE_FILTER program type that attaches after
>>> security_file_open() LSM hook and works as additional file_open filter.
>>> The new cgroup bpf hook is called BPF_CGROUP_FILE_OPEN.
>>
>> Why do_dentry_open() specifically, and nothing else? If you want to
>> filter open, wouldn't you also want to filter a bunch of other
>> filesystem operations with LSM hooks, like rename, unlink, truncate
>> and so on? Landlock benefits there from re-using the existing security
>> hooks.
> 
> It may make sense to extend in the future, but we don't have clear
> user cases for rename/unlink/truncate at this point.

I guess the users who ask for the current features will come with new
ones, which will move towards the same goal as Landlock.

> 
> As you can see the amount of pushback even for basic file access is high.
> Hence I don't think the landlock can be upstreamed in the current form,
> since it touches VFS layer a lot more than this patch.

That is not true for the main part. The only patch from Landlock
touching the FS subsystem is to add a new LSM hook. This was NAKed by Al
but I guess mostly because of misunderstanding. I'll send a new specific
patch to the FS subsystem soon to properly explain the impact on path
lookup and why it doesn't expose more that userspace have already access.

Regarding the upstreaming concern, if I remember correctly, the net/BPF
and the LSM maintainers are (mostly ?) OK. The feedback I got (you
included) was good.

I can definitely create a lighter version of Landlock with your use
cases in mind. There is already all the necessary building blocks: the
BPF (un-)privileged handling, the file checks (see the FS_PICK subtype)
and the cgroups handling. I may even be easier to upstream (and answer
Casey's request) to make the patch series smaller.

> It's intrusive to LSM, and adds new concepts to BPF as well.
> This work fits into existing BPF machinery and minimally intrusive to VFS.
> I hope we can find a common ground with Al regarding what file
> access primitives are exposed to BPF side.
> Once we agree on that the landlock can piggy back on this work
> and extend it to all file-based LSM hooks.

Can you explain your thought about piggy backing? If your patch is
included, Landlock will still need to add new BPF prog types, and will
still be an LSM (which is designed to handle access-control).

> The first step for everyone interested in bpf-based 'sandboxing'
> is to figure out VFS<->BPF interface.
> If you or Mickael have suggestions on what bpf progs should and should not
> see at these hooks, it's a good time to discuss.
> I believe the fields proposed are the obvious minimum.

In a nutshell, Landlock rely on an inode map, which contains references
to inodes. These references/handles can then be used to "compare" with a
requested inode. This way, we can limit the checks to what is already
available to the process which loaded the map. I suggest to not use
these hardcoded fields but use a BFP helper with a file handle as
argument and dedicated flags, as I did in a previous version of Landlock
(which I planned to upstream as a new future feature).

> 
>>> Just like other cgroup-bpf programs new BPF_PROG_TYPE_FILE_FILTER type
>>> is only available to root.
>>>
>>> This program type has access to single argument 'struct bpf_file_info'
>>> that contains standard sys_stat fields:
>>> struct bpf_file_info {
>>>         __u64 inode;
>>>         __u32 dev_major;
>>>         __u32 dev_minor;
>>>         __u32 fs_magic;
>>>         __u32 mnt_id;
>>>         __u32 nlink;
>>>         __u32 mode;     /* file mode S_ISDIR, S_ISLNK, 0755, etc */
>>>         __u32 flags;    /* open flags O_RDWR, O_CREAT, etc */
>>> };
>>> Other file attributes can be added in the future to the end of this struct
>>> without breaking bpf programs.
>>>
>>> For debugging introduce bpf_get_file_path() helper that returns
>>> NUL-terminated full path of the file. It should never be used for sandboxing.

Well, I'm convinced it *will* be used for sandboxing. Once a feature is
available, it is too late to forbid users to use it the way they want,
even if you warn them about security issues…

>>>
>>> Use cases:
>>> - disallow certain FS types within containers (fs_magic == CGROUP2_SUPER_MAGIC)
>>> - restrict permissions in particular mount (mnt_id == X && (flags & O_RDWR))
>>> - disallow access to hard linked sensitive files (nlink > 1 && mode == 0700)
>>> - disallow access to world writeable files (mode == 0..7)
>>> - disallow access to given set of files (dev_major == X && dev_minor == Y && inode == Z)

These use cases are interesting and should help to harden containers.
They are definitely in the scope of what Landlock can do.

However, I would like to see which capabilities are needed to access
these properties.

>>
>> That last one sounds weird. It doesn't work if you want to ban access
>> to a whole directory at once. And in general, highly specific
>> blocklists make me nervous, because if you add anything new and forget
>> to put it on the list, you have a problem.
> 
> In the upcoming V2 of the patches the direct exposure to dev and inode will be removed.
> And instead the opaque 'struct file_handle' will be available to bpf progs.
> The use case is indeed to restrict access to specific blacklist of files.
> The user space will collect the set of files via sys_name_to_handle_at(),
> then store the fhandles in bpf map, and bpf prog will consult the map
> to deny the access.
> It's not a replacement for ACLs, directory permissions, etc. The use case
> is to prevent trusted containers messing up the environment.
> The continuous integration system needs to run some containers (and tests inside them)
> with root privs. When these jobs mess up the system the subsequent jobs may incorrectly
> fail. We believe that this cgroup based container enforcement will solve this use case
> and similar other use cases when containers are trusted, but could be buggy when
> it comes to file access.
> 
> To recap what I'm implementing in V2:
> 1.
> struct bpf_file_info {
>         __u32 fs_magic; // file->f_inode->i_sb->s_magic
>         __u32 mnt_id;   // real_mount(file->f_path.mnt)->mnt_id
>         __u32 nlink;    // file->f_inode->i_nlink
>         __u32 mode;     // file->f_inode->i_mode
>         __u32 flags;    // file->f_flags
> };
> I double checked what VFS layer does with above fields and I think
> there is no additional user space exposure will be made when such
> fields are seen by bpf progs.
> But since I'm not a VFS expert, I'd like Al to confirm.
> 
> 2.
> bpf_get_file_handle(struct bpf_file_info *ctx, struct file_handle *fh, int fh_size);
> helper that bpf prog will use to obtain fh of the file about to be open.

This looks like Landlock. :)

> 
> 3.
> bpf_get_file_statx(struct bpf_file_info *ctx, struct statx *sx, int size, int flags);
> Though struct statx is 256 bytes, and the helper would have to touch all bytes
> I couldn't figure out the faster way to get to inode/dev/uid of the given file
> that will work on all underlying FSes.

This looks like a previous helper I implemented, but tried to avoid
because of security issues (mostly for unpriv processes).

> 
> Thoughts?
> 
>
diff mbox series

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 588dd5f0bd85..766f0223c222 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -109,6 +109,8 @@  int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
+int __cgroup_bpf_file_filter(struct file *file, enum bpf_attach_type type);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -253,6 +255,13 @@  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 									      \
 	__ret;								      \
 })
+#define BPF_CGROUP_RUN_PROG_FILE_FILTER(file)				     \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_file_filter(file, BPF_CGROUP_FILE_OPEN); \
+	__ret;								      \
+})
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog);
 int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -321,6 +330,7 @@  static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_FILE_FILTER(file) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5432f4c9f50e..f182b2e37b94 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -33,6 +33,7 @@  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_FILE_FILTER, file_filter)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..c0df8dd99edc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -154,6 +154,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_LIRC_MODE2,
 	BPF_PROG_TYPE_SK_REUSEPORT,
 	BPF_PROG_TYPE_FLOW_DISSECTOR,
+	BPF_PROG_TYPE_FILE_FILTER,
 };
 
 enum bpf_attach_type {
@@ -175,6 +176,7 @@  enum bpf_attach_type {
 	BPF_CGROUP_UDP6_SENDMSG,
 	BPF_LIRC_MODE2,
 	BPF_FLOW_DISSECTOR,
+	BPF_CGROUP_FILE_OPEN,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2215,6 +2217,18 @@  union bpf_attr {
  *		pointer that was returned from bpf_sk_lookup_xxx\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_file_path(struct bpf_file_info *file, char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Reconstruct the full path of *file* and store it into *buf* of
+ * 		*size_of_buf*. The *size_of_buf* must be strictly positive.
+ * 		On success, the helper makes sure that the *buf* is NUL-terminated.
+ * 		On failure, it is filled with string "(error)".
+ * 		This helper should only be used for debugging.
+ * 		'char *path' should never be used for permission checks.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2303,7 +2317,8 @@  union bpf_attr {
 	FN(skb_ancestor_cgroup_id),	\
 	FN(sk_lookup_tcp),		\
 	FN(sk_lookup_udp),		\
-	FN(sk_release),
+	FN(sk_release),			\
+	FN(get_file_path),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2896,4 +2911,15 @@  struct bpf_flow_keys {
 	};
 };
 
+struct bpf_file_info {
+	__u64 inode;
+	__u32 dev_major;
+	__u32 dev_minor;
+	__u32 fs_magic;
+	__u32 mnt_id;
+	__u32 nlink;
+	__u32 mode;	/* file mode S_ISDIR, S_ISLNK, 0755, etc */
+	__u32 flags;	/* open flags O_RDWR, O_CREAT, etc */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 00f6ed2e4f9a..38d0b4aa83ea 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,6 +15,7 @@ 
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
+#include <../fs/mount.h>
 
 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
@@ -754,3 +755,173 @@  const struct bpf_verifier_ops cg_dev_verifier_ops = {
 	.get_func_proto		= cgroup_dev_func_proto,
 	.is_valid_access	= cgroup_dev_is_valid_access,
 };
+
+int __cgroup_bpf_file_filter(struct file *file, enum bpf_attach_type type)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], file, BPF_PROG_RUN);
+	rcu_read_unlock();
+
+	return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_file_filter);
+
+BPF_CALL_3(bpf_get_file_path, struct file *, file, char *, buf, u64, size)
+{
+	char *p = file_path(file, buf, size);
+	int len;
+
+	if (IS_ERR(p)) {
+		strncpy(buf, "(error)", size);
+		return PTR_ERR(p);
+	}
+	len = buf + size - p;
+	memmove(buf, p, len);
+	memset(buf + len, 0, size - len);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_get_file_path_proto = {
+	.func		= bpf_get_file_path,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+cgroup_file_filter_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_get_file_path:
+		return &bpf_get_file_path_proto;
+	default:
+		return cgroup_dev_func_proto(func_id, prog);
+	}
+}
+
+static bool cgroup_file_filter_is_valid_access(int off, int size,
+					       enum bpf_access_type type,
+					       const struct bpf_prog *prog,
+					       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off + size > sizeof(struct bpf_file_info) ||
+	    off % size || type != BPF_READ)
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_file_info, fs_magic):
+	case offsetof(struct bpf_file_info, mnt_id):
+	case offsetof(struct bpf_file_info, dev_major):
+	case offsetof(struct bpf_file_info, dev_minor):
+	case offsetof(struct bpf_file_info, nlink):
+	case offsetof(struct bpf_file_info, mode):
+	case offsetof(struct bpf_file_info, flags):
+		return size == size_default;
+
+	case offsetof(struct bpf_file_info, inode):
+		return size == sizeof(__u64);
+
+	default:
+		if (size != size_default)
+			return false;
+	}
+	return true;
+}
+
+#define LD_1(F) ({					\
+	typeof(F) val = 0;				\
+	*insn++ = BPF_LDX_MEM(BPF_SIZEOF(val),		\
+			      si->dst_reg, si->src_reg,	\
+			      ((size_t)&F));		\
+	*target_size = sizeof(val);			\
+	val;						\
+	})
+
+#define LD_n(F) ({					\
+	typeof(F) val = 0;				\
+	*insn++ = BPF_LDX_MEM(BPF_SIZEOF(val),		\
+			      si->dst_reg, si->dst_reg,	\
+			      ((size_t)&F));		\
+	*target_size = sizeof(val);			\
+	val;						\
+	})
+
+static u32 cgroup_file_filter_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
+					 struct bpf_insn *insn_buf,
+					 struct bpf_prog *prog,
+					 u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+	struct file *file = NULL;
+	struct inode *inode;
+	struct super_block *sb;
+	struct mount *mnt;
+
+	switch (si->off) {
+	case offsetof(struct bpf_file_info, fs_magic):
+		/* dst = file->f_inode->i_sb->s_magic */
+		inode = LD_1(file->f_inode);
+		sb = LD_n(inode->i_sb);
+		LD_n(sb->s_magic);
+		break;
+	case offsetof(struct bpf_file_info, dev_major):
+		/* dst = file->f_inode->i_sb->s_dev */
+		inode = LD_1(file->f_inode);
+		sb = LD_n(inode->i_sb);
+		LD_n(sb->s_dev);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, MINORBITS);
+		break;
+	case offsetof(struct bpf_file_info, dev_minor):
+		/* dst = file->f_inode->i_sb->s_dev */
+		inode = LD_1(file->f_inode);
+		sb = LD_n(inode->i_sb);
+		LD_n(sb->s_dev);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, MINORMASK);
+		break;
+	case offsetof(struct bpf_file_info, inode):
+		/* dst = file->f_inode->i_ino */
+		inode = LD_1(file->f_inode);
+		LD_n(inode->i_ino);
+		break;
+	case offsetof(struct bpf_file_info, mode):
+		/* dst = file->f_inode->i_mode */
+		inode = LD_1(file->f_inode);
+		LD_n(inode->i_mode);
+		break;
+	case offsetof(struct bpf_file_info, nlink):
+		/* dst = file->f_inode->i_nlink */
+		inode = LD_1(file->f_inode);
+		LD_n(inode->i_nlink);
+		break;
+	case offsetof(struct bpf_file_info, flags):
+		/* dst = file->f_flags */
+		LD_1(file->f_flags);
+		break;
+	case offsetof(struct bpf_file_info, mnt_id):
+		/* dst = real_mount(file->f_path.mnt)->mnt_id */
+		mnt = real_mount(LD_1(file->f_path.mnt));
+		LD_n(mnt->mnt_id);
+		break;
+	}
+	return insn - insn_buf;
+}
+#undef LD_1
+#undef LD_n
+
+const struct bpf_prog_ops file_filter_prog_ops = {
+};
+
+const struct bpf_verifier_ops file_filter_verifier_ops = {
+	.get_func_proto		= cgroup_file_filter_proto,
+	.is_valid_access	= cgroup_file_filter_is_valid_access,
+	.convert_ctx_access	= cgroup_file_filter_ctx_access
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5742df21598c..7b0ffb8d7063 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1630,6 +1630,9 @@  static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_FLOW_DISSECTOR:
 		ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
 		break;
+	case BPF_CGROUP_FILE_OPEN:
+		ptype = BPF_PROG_TYPE_FILE_FILTER;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1699,6 +1702,9 @@  static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_DEVICE:
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
+	case BPF_CGROUP_FILE_OPEN:
+		ptype = BPF_PROG_TYPE_FILE_FILTER;
+		break;
 	case BPF_SK_MSG_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
 	case BPF_SK_SKB_STREAM_PARSER:
@@ -1741,6 +1747,7 @@  static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_UDP6_SENDMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
+	case BPF_CGROUP_FILE_OPEN:
 		break;
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);