diff mbox series

[v2,bpf-next,4/6] bpf: implement bpf_prog replacement for an active bpf_cgroup_link

Message ID 20200325065746.640559-5-andriin@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series Add support for cgroup bpf_link | expand

Commit Message

Andrii Nakryiko March 25, 2020, 6:57 a.m. UTC
Add new operation (LINK_UPDATE), which allows to replace active bpf_prog from
under given bpf_link. Currently this is only supported for bpf_cgroup_link,
but will be extended to other kinds of bpf_links in follow-up patches.

For bpf_cgroup_link, implemented functionality matches existing semantics for
direct bpf_prog attachment (including BPF_F_REPLACE flag). User can either
unconditionally set new bpf_prog regardless of which bpf_prog is currently
active under given bpf_link, or, optionally, can specify expected active
bpf_prog. If active bpf_prog doesn't match expected one, no changes are
performed, old bpf_link stays intact and attached, operation returns
a failure.

cgroup_bpf_replace() operation is resolving race between auto-detachment and
bpf_prog update in the same fashion as it's done for bpf_link detachment,
except in this case update has no way of succeeding because of target cgroup
marked as dying. So in this case error is returned.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 include/linux/bpf-cgroup.h | 11 ++++++
 include/uapi/linux/bpf.h   | 12 ++++++
 kernel/bpf/cgroup.c        | 80 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       | 52 +++++++++++++++++++++++++
 kernel/cgroup/cgroup.c     | 27 +++++++++++++
 5 files changed, 182 insertions(+)

Comments

kernel test robot March 25, 2020, 10:57 p.m. UTC | #1
Hi Andrii,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]
[cannot apply to bpf/master cgroup/for-next v5.6-rc7 next-20200325]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Andrii-Nakryiko/Add-support-for-cgroup-bpf_link/20200326-055942
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.5.0-5) 7.5.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/cgroup-defs.h:22:0,
                    from include/linux/cgroup.h:28,
                    from include/linux/memcontrol.h:13,
                    from include/linux/swap.h:9,
                    from include/linux/suspend.h:5,
                    from arch/x86/kernel/asm-offsets.c:13:
>> include/linux/bpf-cgroup.h:380:45: warning: 'struct bpf_link' declared inside parameter list will not be visible outside of this definition or declaration
    static inline int cgroup_bpf_replace(struct bpf_link *link,
                                                ^~~~~~~~
--
   In file included from include/linux/cgroup-defs.h:22:0,
                    from include/linux/cgroup.h:28,
                    from include/linux/memcontrol.h:13,
                    from include/linux/swap.h:9,
                    from include/linux/suspend.h:5,
                    from arch/x86/kernel/asm-offsets.c:13:
>> include/linux/bpf-cgroup.h:380:45: warning: 'struct bpf_link' declared inside parameter list will not be visible outside of this definition or declaration
    static inline int cgroup_bpf_replace(struct bpf_link *link,
                                                ^~~~~~~~
   20 real  6 user  8 sys  71.33% cpu 	make prepare

vim +380 include/linux/bpf-cgroup.h

   379	
 > 380	static inline int cgroup_bpf_replace(struct bpf_link *link,
   381					     struct bpf_prog *old_prog,
   382					     struct bpf_prog *new_prog)
   383	{
   384		return -EINVAL;
   385	}
   386	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot March 26, 2020, 12:45 a.m. UTC | #2
Hi Andrii,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]
[cannot apply to bpf/master cgroup/for-next v5.6-rc7 next-20200325]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Andrii-Nakryiko/Add-support-for-cgroup-bpf_link/20200326-055942
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: mips-fuloong2e_defconfig (attached as .config)
compiler: mips64el-linux-gcc (GCC) 5.5.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=5.5.0 make.cross ARCH=mips 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/cgroup-defs.h:22:0,
                    from include/linux/cgroup.h:28,
                    from include/linux/memcontrol.h:13,
                    from include/linux/swap.h:9,
                    from include/linux/suspend.h:5,
                    from arch/mips/kernel/asm-offsets.c:17:
>> include/linux/bpf-cgroup.h:382:17: warning: 'struct bpf_link' declared inside parameter list
             struct bpf_prog *new_prog)
                    ^
>> include/linux/bpf-cgroup.h:382:17: warning: its scope is only this definition or declaration, which is probably not what you want
--
   In file included from include/linux/cgroup-defs.h:22:0,
                    from include/linux/cgroup.h:28,
                    from include/linux/memcontrol.h:13,
                    from include/linux/swap.h:9,
                    from include/linux/suspend.h:5,
                    from arch/mips/kernel/asm-offsets.c:17:
>> include/linux/bpf-cgroup.h:382:17: warning: 'struct bpf_link' declared inside parameter list
             struct bpf_prog *new_prog)
                    ^
>> include/linux/bpf-cgroup.h:382:17: warning: its scope is only this definition or declaration, which is probably not what you want
   20 real  6 user  9 sys  75.80% cpu 	make prepare

vim +382 include/linux/bpf-cgroup.h

   379	
   380	static inline int cgroup_bpf_replace(struct bpf_link *link,
   381					     struct bpf_prog *old_prog,
 > 382					     struct bpf_prog *new_prog)
   383	{
   384		return -EINVAL;
   385	}
   386	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Andrii Nakryiko March 26, 2020, 1:28 a.m. UTC | #3
On Wed, Mar 25, 2020 at 3:58 PM kbuild test robot <lkp@intel.com> wrote:
>
> Hi Andrii,
>
> I love your patch! Perhaps something to improve:
>
> [auto build test WARNING on bpf-next/master]
> [cannot apply to bpf/master cgroup/for-next v5.6-rc7 next-20200325]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
>
> url:    https://github.com/0day-ci/linux/commits/Andrii-Nakryiko/Add-support-for-cgroup-bpf_link/20200326-055942
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> config: i386-tinyconfig (attached as .config)
> compiler: gcc-7 (Debian 7.5.0-5) 7.5.0
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=i386
>
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot <lkp@intel.com>
>
> All warnings (new ones prefixed by >>):
>
>    In file included from include/linux/cgroup-defs.h:22:0,
>                     from include/linux/cgroup.h:28,
>                     from include/linux/memcontrol.h:13,
>                     from include/linux/swap.h:9,
>                     from include/linux/suspend.h:5,
>                     from arch/x86/kernel/asm-offsets.c:13:
> >> include/linux/bpf-cgroup.h:380:45: warning: 'struct bpf_link' declared inside parameter list will not be visible outside of this definition or declaration
>     static inline int cgroup_bpf_replace(struct bpf_link *link,
>                                                 ^~~~~~~~
> --
>    In file included from include/linux/cgroup-defs.h:22:0,
>                     from include/linux/cgroup.h:28,
>                     from include/linux/memcontrol.h:13,
>                     from include/linux/swap.h:9,
>                     from include/linux/suspend.h:5,
>                     from arch/x86/kernel/asm-offsets.c:13:
> >> include/linux/bpf-cgroup.h:380:45: warning: 'struct bpf_link' declared inside parameter list will not be visible outside of this definition or declaration
>     static inline int cgroup_bpf_replace(struct bpf_link *link,
>                                                 ^~~~~~~~
>    20 real  6 user  8 sys  71.33% cpu   make prepare
>
> vim +380 include/linux/bpf-cgroup.h
>
>    379

It's a matter of forward declaring struct bpf_link here. Will fix in
v3, but I'll wait a bit for any feedback before updating.

>  > 380  static inline int cgroup_bpf_replace(struct bpf_link *link,
>    381                                       struct bpf_prog *old_prog,
>    382                                       struct bpf_prog *new_prog)
>    383  {
>    384          return -EINVAL;
>    385  }
>    386
>
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Alexei Starovoitov March 26, 2020, 11:35 p.m. UTC | #4
On Tue, Mar 24, 2020 at 11:57:44PM -0700, Andrii Nakryiko wrote:
>  
> +/* Swap updated BPF program for given link in effective program arrays across
> + * all descendant cgroups. This function is guaranteed to succeed.
> + */
> +static void replace_effective_prog(struct cgroup *cgrp,
> +				   enum bpf_attach_type type,
> +				   struct bpf_cgroup_link *link)
> +{
> +	struct bpf_prog_array_item *item;
> +	struct cgroup_subsys_state *css;
> +	struct bpf_prog_array *progs;
> +	struct bpf_prog_list *pl;
> +	struct list_head *head;
> +	struct cgroup *cg;
> +	int pos;
> +
> +	css_for_each_descendant_pre(css, &cgrp->self) {
> +		struct cgroup *desc = container_of(css, struct cgroup, self);
> +
> +		if (percpu_ref_is_zero(&desc->bpf.refcnt))
> +			continue;
> +
> +		/* found position of link in effective progs array */
> +		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
> +			if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
> +				continue;
> +
> +			head = &cg->bpf.progs[type];
> +			list_for_each_entry(pl, head, node) {
> +				if (!prog_list_prog(pl))
> +					continue;
> +				if (pl->link == link)
> +					goto found;
> +				pos++;
> +			}
> +		}
> +found:
> +		BUG_ON(!cg);
> +		progs = rcu_dereference_protected(
> +				desc->bpf.effective[type],
> +				lockdep_is_held(&cgroup_mutex));
> +		item = &progs->items[pos];
> +		WRITE_ONCE(item->prog, link->link.prog);
> +	}
> +}
> +
> +/**
> + * __cgroup_bpf_replace() - Replace link's program and propagate the change
> + *                          to descendants
> + * @cgrp: The cgroup which descendants to traverse
> + * @link: A link for which to replace BPF program
> + * @type: Type of attach operation
> + *
> + * Must be called with cgroup_mutex held.
> + */
> +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> +			 struct bpf_prog *new_prog)
> +{
> +	struct list_head *progs = &cgrp->bpf.progs[link->type];
> +	struct bpf_prog *old_prog;
> +	struct bpf_prog_list *pl;
> +	bool found = false;
> +
> +	if (link->link.prog->type != new_prog->type)
> +		return -EINVAL;
> +
> +	list_for_each_entry(pl, progs, node) {
> +		if (pl->link == link) {
> +			found = true;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		return -ENOENT;
> +
> +	old_prog = xchg(&link->link.prog, new_prog);
> +	replace_effective_prog(cgrp, link->type, link);

I think with 'found = true' in this function you're assuming that it will be
found in replace_effective_prog() ? I don't think that's the case.
Try to create bpf_link with BPF_F_ALLOW_OVERRIDE, override it in a child cgroup
with another link and then try to LINK_UPDATE the former. The link is there,
but the prog is not executing and it's not in effective array. What LINK_UPDATE
suppose to do? I guess it should succeed?
Even trickier that the prog will be in effective array in some of
css_for_each_descendant_pre() and not in others. This cgroup attach semantics
were convoluted from the day one. Apparently people use all three variants now,
but I wouldn't bet that everyone understands it.
Hence my proposal to support F_ALLOW_MULTI for links only. At least initially.
It's so much simpler to explain. And owning bpf_link will guarantee that the
prog is executing (unless cgroup is removed and sockets are closed). I guess
default (no-override) is acceptable to bpf_link as well and in that sense it
will be very similar to XDP with single prog attached. So I think I can live
with default and ALLOW_MULTI for now. But we should probably redesign
overriding capabilities. Folks need to attach multiple progs to a given cgroup
and disallow all progs in children. Currently it's not possible to do, since
MULTI in the parent allows at least one (default, override or multi) in the
children. bpf_link inheriting this logic won't help to solve this use case. It
feels that link should stay as multi only and override or not in the children
should be a separate property. Probably not related to link at all. It fits
better as a cgroup permission.

Anyhow I'm going to apply patches 1 and 2, since they are good cleanup
regardless of what we decide here.
Andrii Nakryiko March 26, 2020, 11:59 p.m. UTC | #5
On Thu, Mar 26, 2020 at 4:35 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 24, 2020 at 11:57:44PM -0700, Andrii Nakryiko wrote:
> >
> > +/* Swap updated BPF program for given link in effective program arrays across
> > + * all descendant cgroups. This function is guaranteed to succeed.
> > + */
> > +static void replace_effective_prog(struct cgroup *cgrp,
> > +                                enum bpf_attach_type type,
> > +                                struct bpf_cgroup_link *link)
> > +{
> > +     struct bpf_prog_array_item *item;
> > +     struct cgroup_subsys_state *css;
> > +     struct bpf_prog_array *progs;
> > +     struct bpf_prog_list *pl;
> > +     struct list_head *head;
> > +     struct cgroup *cg;
> > +     int pos;
> > +
> > +     css_for_each_descendant_pre(css, &cgrp->self) {
> > +             struct cgroup *desc = container_of(css, struct cgroup, self);
> > +
> > +             if (percpu_ref_is_zero(&desc->bpf.refcnt))
> > +                     continue;
> > +
> > +             /* found position of link in effective progs array */
> > +             for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
> > +                     if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
> > +                             continue;
> > +
> > +                     head = &cg->bpf.progs[type];
> > +                     list_for_each_entry(pl, head, node) {
> > +                             if (!prog_list_prog(pl))
> > +                                     continue;
> > +                             if (pl->link == link)
> > +                                     goto found;
> > +                             pos++;
> > +                     }
> > +             }
> > +found:
> > +             BUG_ON(!cg);
> > +             progs = rcu_dereference_protected(
> > +                             desc->bpf.effective[type],
> > +                             lockdep_is_held(&cgroup_mutex));
> > +             item = &progs->items[pos];
> > +             WRITE_ONCE(item->prog, link->link.prog);
> > +     }
> > +}
> > +
> > +/**
> > + * __cgroup_bpf_replace() - Replace link's program and propagate the change
> > + *                          to descendants
> > + * @cgrp: The cgroup which descendants to traverse
> > + * @link: A link for which to replace BPF program
> > + * @type: Type of attach operation
> > + *
> > + * Must be called with cgroup_mutex held.
> > + */
> > +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> > +                      struct bpf_prog *new_prog)
> > +{
> > +     struct list_head *progs = &cgrp->bpf.progs[link->type];
> > +     struct bpf_prog *old_prog;
> > +     struct bpf_prog_list *pl;
> > +     bool found = false;
> > +
> > +     if (link->link.prog->type != new_prog->type)
> > +             return -EINVAL;
> > +
> > +     list_for_each_entry(pl, progs, node) {
> > +             if (pl->link == link) {
> > +                     found = true;
> > +                     break;
> > +             }
> > +     }
> > +     if (!found)
> > +             return -ENOENT;
> > +
> > +     old_prog = xchg(&link->link.prog, new_prog);
> > +     replace_effective_prog(cgrp, link->type, link);
>
> I think with 'found = true' in this function you're assuming that it will be
> found in replace_effective_prog() ? I don't think that's the case.
> Try to create bpf_link with BPF_F_ALLOW_OVERRIDE, override it in a child cgroup
> with another link and then try to LINK_UPDATE the former. The link is there,
> but the prog is not executing and it's not in effective array. What LINK_UPDATE
> suppose to do? I guess it should succeed?

Yes, this is a great catch! I should have used ALLOW_OVERRIDE at the
root cgroup level in my selftest, it would catch it immediately.

BUG_ON(!cg) in replace_effective_prog() is too aggressive, if I
replace it with `if (!cg) continue;` it will handle this as well.

> Even trickier that the prog will be in effective array in some of
> css_for_each_descendant_pre() and not in others. This cgroup attach semantics
> were convoluted from the day one. Apparently people use all three variants now,
> but I wouldn't bet that everyone understands it.

Agree about convoluted logic, spent enormous time understanding and
modifying it :)

But apart from BUG_ON(!cg) problem, everything works because each
level of hierarchy is treated independently in
replace_effective_prog(). Search for attached link on each level is
reset and performed anew. If found - we replace program, if not - must
be ALLOW_OVERRIDE case, i.e., actually overridden link.

> Hence my proposal to support F_ALLOW_MULTI for links only. At least initially.
> It's so much simpler to explain. And owning bpf_link will guarantee that the
> prog is executing (unless cgroup is removed and sockets are closed). I guess
> default (no-override) is acceptable to bpf_link as well and in that sense it
> will be very similar to XDP with single prog attached. So I think I can live
> with default and ALLOW_MULTI for now. But we should probably redesign
> overriding capabilities. Folks need to attach multiple progs to a given cgroup
> and disallow all progs in children. Currently it's not possible to do, since
> MULTI in the parent allows at least one (default, override or multi) in the
> children. bpf_link inheriting this logic won't help to solve this use case. It
> feels that link should stay as multi only and override or not in the children
> should be a separate property. Probably not related to link at all. It fits
> better as a cgroup permission.

Yeah, we had a brief discussion with Andrey on mailing list. Not sure
what the solution looks like, but it should be orthogonal to link/prog
attachment operation, probably.

If you insist and Andrey is ok with dropping ALLOW_OVERRIDE, it's
easy. But fixing the logic to handle it is also easy. So are we sure
about supporting 2 out of 3 existing modes? :)

>
> Anyhow I'm going to apply patches 1 and 2, since they are good cleanup
> regardless of what we decide here.

Thanks, will rebase on top of bpf-next master for v3.
Alexei Starovoitov March 27, 2020, 12:34 a.m. UTC | #6
On Thu, Mar 26, 2020 at 04:59:06PM -0700, Andrii Nakryiko wrote:
> On Thu, Mar 26, 2020 at 4:35 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Mar 24, 2020 at 11:57:44PM -0700, Andrii Nakryiko wrote:
> > >
> > > +/* Swap updated BPF program for given link in effective program arrays across
> > > + * all descendant cgroups. This function is guaranteed to succeed.
> > > + */
> > > +static void replace_effective_prog(struct cgroup *cgrp,
> > > +                                enum bpf_attach_type type,
> > > +                                struct bpf_cgroup_link *link)
> > > +{
> > > +     struct bpf_prog_array_item *item;
> > > +     struct cgroup_subsys_state *css;
> > > +     struct bpf_prog_array *progs;
> > > +     struct bpf_prog_list *pl;
> > > +     struct list_head *head;
> > > +     struct cgroup *cg;
> > > +     int pos;
> > > +
> > > +     css_for_each_descendant_pre(css, &cgrp->self) {
> > > +             struct cgroup *desc = container_of(css, struct cgroup, self);
> > > +
> > > +             if (percpu_ref_is_zero(&desc->bpf.refcnt))
> > > +                     continue;
> > > +
> > > +             /* found position of link in effective progs array */
> > > +             for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
> > > +                     if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
> > > +                             continue;
> > > +
> > > +                     head = &cg->bpf.progs[type];
> > > +                     list_for_each_entry(pl, head, node) {
> > > +                             if (!prog_list_prog(pl))
> > > +                                     continue;
> > > +                             if (pl->link == link)
> > > +                                     goto found;
> > > +                             pos++;
> > > +                     }
> > > +             }
> > > +found:
> > > +             BUG_ON(!cg);
> > > +             progs = rcu_dereference_protected(
> > > +                             desc->bpf.effective[type],
> > > +                             lockdep_is_held(&cgroup_mutex));
> > > +             item = &progs->items[pos];
> > > +             WRITE_ONCE(item->prog, link->link.prog);
> > > +     }
> > > +}
> > > +
> > > +/**
> > > + * __cgroup_bpf_replace() - Replace link's program and propagate the change
> > > + *                          to descendants
> > > + * @cgrp: The cgroup which descendants to traverse
> > > + * @link: A link for which to replace BPF program
> > > + * @type: Type of attach operation
> > > + *
> > > + * Must be called with cgroup_mutex held.
> > > + */
> > > +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> > > +                      struct bpf_prog *new_prog)
> > > +{
> > > +     struct list_head *progs = &cgrp->bpf.progs[link->type];
> > > +     struct bpf_prog *old_prog;
> > > +     struct bpf_prog_list *pl;
> > > +     bool found = false;
> > > +
> > > +     if (link->link.prog->type != new_prog->type)
> > > +             return -EINVAL;
> > > +
> > > +     list_for_each_entry(pl, progs, node) {
> > > +             if (pl->link == link) {
> > > +                     found = true;
> > > +                     break;
> > > +             }
> > > +     }
> > > +     if (!found)
> > > +             return -ENOENT;
> > > +
> > > +     old_prog = xchg(&link->link.prog, new_prog);
> > > +     replace_effective_prog(cgrp, link->type, link);
> >
> > I think with 'found = true' in this function you're assuming that it will be
> > found in replace_effective_prog() ? I don't think that's the case.
> > Try to create bpf_link with BPF_F_ALLOW_OVERRIDE, override it in a child cgroup
> > with another link and then try to LINK_UPDATE the former. The link is there,
> > but the prog is not executing and it's not in effective array. What LINK_UPDATE
> > suppose to do? I guess it should succeed?
> 
> Yes, this is a great catch! I should have used ALLOW_OVERRIDE at the
> root cgroup level in my selftest, it would catch it immediately.
> 
> BUG_ON(!cg) in replace_effective_prog() is too aggressive, if I
> replace it with `if (!cg) continue;` it will handle this as well.
> 
> > Even trickier that the prog will be in effective array in some of
> > css_for_each_descendant_pre() and not in others. This cgroup attach semantics
> > were convoluted from the day one. Apparently people use all three variants now,
> > but I wouldn't bet that everyone understands it.
> 
> Agree about convoluted logic, spent enormous time understanding and
> modifying it :)
> 
> But apart from BUG_ON(!cg) problem, everything works because each
> level of hierarchy is treated independently in
> replace_effective_prog(). Search for attached link on each level is
> reset and performed anew. If found - we replace program, if not - must
> be ALLOW_OVERRIDE case, i.e., actually overridden link.
> 
> > Hence my proposal to support F_ALLOW_MULTI for links only. At least initially.
> > It's so much simpler to explain. And owning bpf_link will guarantee that the
> > prog is executing (unless cgroup is removed and sockets are closed). I guess
> > default (no-override) is acceptable to bpf_link as well and in that sense it
> > will be very similar to XDP with single prog attached. So I think I can live
> > with default and ALLOW_MULTI for now. But we should probably redesign
> > overriding capabilities. Folks need to attach multiple progs to a given cgroup
> > and disallow all progs in children. Currently it's not possible to do, since
> > MULTI in the parent allows at least one (default, override or multi) in the
> > children. bpf_link inheriting this logic won't help to solve this use case. It
> > feels that link should stay as multi only and override or not in the children
> > should be a separate property. Probably not related to link at all. It fits
> > better as a cgroup permission.
> 
> Yeah, we had a brief discussion with Andrey on mailing list. Not sure
> what the solution looks like, but it should be orthogonal to link/prog
> attachment operation, probably.
> 
> If you insist and Andrey is ok with dropping ALLOW_OVERRIDE, it's
> easy. But fixing the logic to handle it is also easy. So are we sure
> about supporting 2 out of 3 existing modes? :)

I wasn't clear enough. My preference is only multi for bpf_link with a concrete
plan how cgroup permissions can do no-override, selective override, and
whatever else container folks need.
I can imagine somebody may want to attach bind/connect at outer container level
and disallow this specific attach_type for children while allowing other
cgroup-bpf prog types in inner containers. There is no way to do so now and
flags for bpf_link is not the answer either.

> Thanks, will rebase on top of bpf-next master for v3.

please wait with repost until this discussion settles.
Andrii Nakryiko March 27, 2020, 12:55 a.m. UTC | #7
On Thu, Mar 26, 2020 at 5:34 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 26, 2020 at 04:59:06PM -0700, Andrii Nakryiko wrote:
> > On Thu, Mar 26, 2020 at 4:35 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Mar 24, 2020 at 11:57:44PM -0700, Andrii Nakryiko wrote:
> > > >
> > > > +/* Swap updated BPF program for given link in effective program arrays across
> > > > + * all descendant cgroups. This function is guaranteed to succeed.
> > > > + */
> > > > +static void replace_effective_prog(struct cgroup *cgrp,
> > > > +                                enum bpf_attach_type type,
> > > > +                                struct bpf_cgroup_link *link)
> > > > +{
> > > > +     struct bpf_prog_array_item *item;
> > > > +     struct cgroup_subsys_state *css;
> > > > +     struct bpf_prog_array *progs;
> > > > +     struct bpf_prog_list *pl;
> > > > +     struct list_head *head;
> > > > +     struct cgroup *cg;
> > > > +     int pos;
> > > > +
> > > > +     css_for_each_descendant_pre(css, &cgrp->self) {
> > > > +             struct cgroup *desc = container_of(css, struct cgroup, self);
> > > > +
> > > > +             if (percpu_ref_is_zero(&desc->bpf.refcnt))
> > > > +                     continue;
> > > > +
> > > > +             /* found position of link in effective progs array */
> > > > +             for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
> > > > +                     if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
> > > > +                             continue;
> > > > +
> > > > +                     head = &cg->bpf.progs[type];
> > > > +                     list_for_each_entry(pl, head, node) {
> > > > +                             if (!prog_list_prog(pl))
> > > > +                                     continue;
> > > > +                             if (pl->link == link)
> > > > +                                     goto found;
> > > > +                             pos++;
> > > > +                     }
> > > > +             }
> > > > +found:
> > > > +             BUG_ON(!cg);
> > > > +             progs = rcu_dereference_protected(
> > > > +                             desc->bpf.effective[type],
> > > > +                             lockdep_is_held(&cgroup_mutex));
> > > > +             item = &progs->items[pos];
> > > > +             WRITE_ONCE(item->prog, link->link.prog);
> > > > +     }
> > > > +}
> > > > +
> > > > +/**
> > > > + * __cgroup_bpf_replace() - Replace link's program and propagate the change
> > > > + *                          to descendants
> > > > + * @cgrp: The cgroup which descendants to traverse
> > > > + * @link: A link for which to replace BPF program
> > > > + * @type: Type of attach operation
> > > > + *
> > > > + * Must be called with cgroup_mutex held.
> > > > + */
> > > > +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> > > > +                      struct bpf_prog *new_prog)
> > > > +{
> > > > +     struct list_head *progs = &cgrp->bpf.progs[link->type];
> > > > +     struct bpf_prog *old_prog;
> > > > +     struct bpf_prog_list *pl;
> > > > +     bool found = false;
> > > > +
> > > > +     if (link->link.prog->type != new_prog->type)
> > > > +             return -EINVAL;
> > > > +
> > > > +     list_for_each_entry(pl, progs, node) {
> > > > +             if (pl->link == link) {
> > > > +                     found = true;
> > > > +                     break;
> > > > +             }
> > > > +     }
> > > > +     if (!found)
> > > > +             return -ENOENT;
> > > > +
> > > > +     old_prog = xchg(&link->link.prog, new_prog);
> > > > +     replace_effective_prog(cgrp, link->type, link);
> > >
> > > I think with 'found = true' in this function you're assuming that it will be
> > > found in replace_effective_prog() ? I don't think that's the case.
> > > Try to create bpf_link with BPF_F_ALLOW_OVERRIDE, override it in a child cgroup
> > > with another link and then try to LINK_UPDATE the former. The link is there,
> > > but the prog is not executing and it's not in effective array. What LINK_UPDATE
> > > suppose to do? I guess it should succeed?
> >
> > Yes, this is a great catch! I should have used ALLOW_OVERRIDE at the
> > root cgroup level in my selftest, it would catch it immediately.
> >
> > BUG_ON(!cg) in replace_effective_prog() is too aggressive, if I
> > replace it with `if (!cg) continue;` it will handle this as well.
> >
> > > Even trickier that the prog will be in effective array in some of
> > > css_for_each_descendant_pre() and not in others. This cgroup attach semantics
> > > were convoluted from the day one. Apparently people use all three variants now,
> > > but I wouldn't bet that everyone understands it.
> >
> > Agree about convoluted logic, spent enormous time understanding and
> > modifying it :)
> >
> > But apart from BUG_ON(!cg) problem, everything works because each
> > level of hierarchy is treated independently in
> > replace_effective_prog(). Search for attached link on each level is
> > reset and performed anew. If found - we replace program, if not - must
> > be ALLOW_OVERRIDE case, i.e., actually overridden link.
> >
> > > Hence my proposal to support F_ALLOW_MULTI for links only. At least initially.
> > > It's so much simpler to explain. And owning bpf_link will guarantee that the
> > > prog is executing (unless cgroup is removed and sockets are closed). I guess
> > > default (no-override) is acceptable to bpf_link as well and in that sense it
> > > will be very similar to XDP with single prog attached. So I think I can live
> > > with default and ALLOW_MULTI for now. But we should probably redesign
> > > overriding capabilities. Folks need to attach multiple progs to a given cgroup
> > > and disallow all progs in children. Currently it's not possible to do, since
> > > MULTI in the parent allows at least one (default, override or multi) in the
> > > children. bpf_link inheriting this logic won't help to solve this use case. It
> > > feels that link should stay as multi only and override or not in the children
> > > should be a separate property. Probably not related to link at all. It fits
> > > better as a cgroup permission.
> >
> > Yeah, we had a brief discussion with Andrey on mailing list. Not sure
> > what the solution looks like, but it should be orthogonal to link/prog
> > attachment operation, probably.
> >
> > If you insist and Andrey is ok with dropping ALLOW_OVERRIDE, it's
> > easy. But fixing the logic to handle it is also easy. So are we sure
> > about supporting 2 out of 3 existing modes? :)
>
> I wasn't clear enough. My preference is only multi for bpf_link with a concrete

Ah, ok, it certainly read as default + multi should be supported.
Alright, I'll drop NONE and OVERRIDE mode (so back to initial
version).

> plan how cgroup permissions can do no-override, selective override, and
> whatever else container folks need.
> I can imagine somebody may want to attach bind/connect at outer container level
> and disallow this specific attach_type for children while allowing other
> cgroup-bpf prog types in inner containers. There is no way to do so now and
> flags for bpf_link is not the answer either.
>
> > Thanks, will rebase on top of bpf-next master for v3.
>
> please wait with repost until this discussion settles.

Sure, will do.
diff mbox series

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d2d969669564..a8d78efd3cea 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -100,6 +100,8 @@  int __cgroup_bpf_attach(struct cgroup *cgrp,
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			struct bpf_cgroup_link *link,
 			enum bpf_attach_type type);
+int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
+			 struct bpf_prog *new_prog);
 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		       union bpf_attr __user *uattr);
 
@@ -110,6 +112,8 @@  int cgroup_bpf_attach(struct cgroup *cgrp,
 		      u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type);
+int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
+		       struct bpf_prog *new_prog);
 int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		     union bpf_attr __user *uattr);
 
@@ -373,6 +377,13 @@  static inline int cgroup_bpf_link_attach(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
+static inline int cgroup_bpf_replace(struct bpf_link *link,
+				     struct bpf_prog *old_prog,
+				     struct bpf_prog *new_prog)
+{
+	return -EINVAL;
+}
+
 static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 					union bpf_attr __user *uattr)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 948ebbfd401b..d7583483fca5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@  enum bpf_cmd {
 	BPF_MAP_UPDATE_BATCH,
 	BPF_MAP_DELETE_BATCH,
 	BPF_LINK_CREATE,
+	BPF_LINK_UPDATE,
 };
 
 enum bpf_map_type {
@@ -575,6 +576,17 @@  union bpf_attr {
 		__u32		attach_type;	/* attach type */
 		__u32		flags;		/* extra flags */
 	} link_create;
+
+	struct { /* struct used by BPF_LINK_UPDATE command */
+		__u32		link_fd;	/* link fd */
+		/* new program fd to update link with */
+		__u32		new_prog_fd;
+		__u32		flags;		/* extra flags */
+		/* expected link's program fd; is specified only if
+		 * BPF_F_REPLACE flag is set in flags */
+		__u32		old_prog_fd;
+	} link_update;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c5cedc8c3428..2c70e2c95cb7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -506,6 +506,86 @@  int __cgroup_bpf_attach(struct cgroup *cgrp,
 	return err;
 }
 
+/* Swap updated BPF program for given link in effective program arrays across
+ * all descendant cgroups. This function is guaranteed to succeed.
+ */
+static void replace_effective_prog(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_cgroup_link *link)
+{
+	struct bpf_prog_array_item *item;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_array *progs;
+	struct bpf_prog_list *pl;
+	struct list_head *head;
+	struct cgroup *cg;
+	int pos;
+
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		if (percpu_ref_is_zero(&desc->bpf.refcnt))
+			continue;
+
+		/* found position of link in effective progs array */
+		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+			if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+				continue;
+
+			head = &cg->bpf.progs[type];
+			list_for_each_entry(pl, head, node) {
+				if (!prog_list_prog(pl))
+					continue;
+				if (pl->link == link)
+					goto found;
+				pos++;
+			}
+		}
+found:
+		BUG_ON(!cg);
+		progs = rcu_dereference_protected(
+				desc->bpf.effective[type],
+				lockdep_is_held(&cgroup_mutex));
+		item = &progs->items[pos];
+		WRITE_ONCE(item->prog, link->link.prog);
+	}
+}
+
+/**
+ * __cgroup_bpf_replace() - Replace link's program and propagate the change
+ *                          to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @link: A link for which to replace BPF program
+ * @type: Type of attach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
+			 struct bpf_prog *new_prog)
+{
+	struct list_head *progs = &cgrp->bpf.progs[link->type];
+	struct bpf_prog *old_prog;
+	struct bpf_prog_list *pl;
+	bool found = false;
+
+	if (link->link.prog->type != new_prog->type)
+		return -EINVAL;
+
+	list_for_each_entry(pl, progs, node) {
+		if (pl->link == link) {
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+		return -ENOENT;
+
+	old_prog = xchg(&link->link.prog, new_prog);
+	replace_effective_prog(cgrp, link->type, link);
+	bpf_prog_put(old_prog);
+	return 0;
+}
+
 static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
 					       struct bpf_prog *prog,
 					       struct bpf_cgroup_link *link,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 638ec8b54741..a52426e1e0df 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3572,6 +3572,55 @@  static int link_create(union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
+
+static int link_update(union bpf_attr *attr)
+{
+	struct bpf_prog *old_prog = NULL, *new_prog;
+	struct bpf_link *link;
+	u32 flags;
+	int ret;
+
+	if (CHECK_ATTR(BPF_LINK_UPDATE))
+		return -EINVAL;
+
+	flags = attr->link_update.flags;
+	if (flags & ~BPF_F_REPLACE)
+		return -EINVAL;
+
+	link = bpf_link_get_from_fd(attr->link_update.link_fd);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+
+	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
+	if (IS_ERR(new_prog))
+		return PTR_ERR(new_prog);
+
+	if (flags & BPF_F_REPLACE) {
+		old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
+		if (IS_ERR(old_prog)) {
+			ret = PTR_ERR(old_prog);
+			old_prog = NULL;
+			goto out_put_progs;
+		}
+	}
+
+#ifdef CONFIG_CGROUP_BPF
+	if (link->ops == &bpf_cgroup_link_lops) {
+		ret = cgroup_bpf_replace(link, old_prog, new_prog);
+		goto out_put_progs;
+	}
+#endif
+	ret = -EINVAL;
+
+out_put_progs:
+	if (old_prog)
+		bpf_prog_put(old_prog);
+	if (ret)
+		bpf_prog_put(new_prog);
+	return ret;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -3685,6 +3734,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_LINK_CREATE:
 		err = link_create(&attr);
 		break;
+	case BPF_LINK_UPDATE:
+		err = link_update(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 219624fba9ba..915dda3f7f19 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6317,6 +6317,33 @@  int cgroup_bpf_attach(struct cgroup *cgrp,
 	return ret;
 }
 
+int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
+		       struct bpf_prog *new_prog)
+{
+	struct bpf_cgroup_link *cg_link;
+	int ret;
+
+	if (link->ops != &bpf_cgroup_link_lops)
+		return -EINVAL;
+
+	cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+	mutex_lock(&cgroup_mutex);
+	/* link might have been auto-released by dying cgroup, so fail */
+	if (!cg_link->cgroup) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	if (old_prog && link->prog != old_prog) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
+out_unlock:
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type)
 {