diff mbox series

[bpf-next,4/6] bpf: implement bpf_prog replacement for an active bpf_cgroup_link

Message ID 20200320203615.1519013-5-andriin@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series Add support for cgroup bpf_link | expand

Commit Message

Andrii Nakryiko March 20, 2020, 8:36 p.m. UTC
Add new operation (LINK_UPDATE), which allows to replace active bpf_prog from
under given bpf_link. Currently this is only supported for bpf_cgroup_link,
but will be extended to other kinds of bpf_links in follow-up patches.

For bpf_cgroup_link, implemented functionality matches existing semantics for
direct bpf_prog attachment (including BPF_F_REPLACE flag). User can either
unconditionally set new bpf_prog regardless of which bpf_prog is currently
active under given bpf_link, or, optionally, can specify expected active
bpf_prog. If active bpf_prog doesn't match expected one, operation is a noop
and returns a failure.

cgroup_bpf_replace() operation is resolving race between auto-detachment and
bpf_prog update in the same fashion as it's done for bpf_link detachment,
except in this case update has no way of succeeding because of target cgroup
marked as dying. So in this case error is returned.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 include/linux/bpf-cgroup.h |  4 ++
 include/uapi/linux/bpf.h   | 12 ++++++
 kernel/bpf/cgroup.c        | 77 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       | 60 +++++++++++++++++++++++++++++
 kernel/cgroup/cgroup.c     | 21 +++++++++++
 5 files changed, 174 insertions(+)

Comments

kernel test robot March 21, 2020, 12:33 a.m. UTC | #1
Hi Andrii,

I love your patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]
[cannot apply to bpf/master cgroup/for-next v5.6-rc6 next-20200320]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Andrii-Nakryiko/Add-support-for-cgroup-bpf_link/20200321-045848
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-sun3_defconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 9.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=9.2.0 make.cross ARCH=m68k 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/kernel.h:11,
                    from include/linux/list.h:9,
                    from include/linux/timer.h:5,
                    from include/linux/workqueue.h:9,
                    from include/linux/bpf.h:9,
                    from kernel/bpf/syscall.c:4:
   kernel/bpf/syscall.c: In function 'link_update':
>> include/linux/kernel.h:994:51: error: dereferencing pointer to incomplete type 'struct bpf_cgroup_link'
     994 |  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
         |                                                   ^~
   include/linux/compiler.h:330:9: note: in definition of macro '__compiletime_assert'
     330 |   if (!(condition))     \
         |         ^~~~~~~~~
   include/linux/compiler.h:350:2: note: in expansion of macro '_compiletime_assert'
     350 |  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
         |  ^~~~~~~~~~~~~~~~~~~
   include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert'
      39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
         |                                     ^~~~~~~~~~~~~~~~~~
   include/linux/kernel.h:994:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
     994 |  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
         |  ^~~~~~~~~~~~~~~~
   include/linux/kernel.h:994:20: note: in expansion of macro '__same_type'
     994 |  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
         |                    ^~~~~~~~~~~
>> kernel/bpf/syscall.c:3612:13: note: in expansion of macro 'container_of'
    3612 |   cg_link = container_of(link, struct bpf_cgroup_link, link);
         |             ^~~~~~~~~~~~
   In file included from <command-line>:
>> include/linux/compiler_types.h:129:35: error: invalid use of undefined type 'struct bpf_cgroup_link'
     129 | #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
         |                                   ^~~~~~~~~~~~~~~~~~
   include/linux/stddef.h:17:32: note: in expansion of macro '__compiler_offsetof'
      17 | #define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
         |                                ^~~~~~~~~~~~~~~~~~~
   include/linux/kernel.h:997:21: note: in expansion of macro 'offsetof'
     997 |  ((type *)(__mptr - offsetof(type, member))); })
         |                     ^~~~~~~~
>> kernel/bpf/syscall.c:3612:13: note: in expansion of macro 'container_of'
    3612 |   cg_link = container_of(link, struct bpf_cgroup_link, link);
         |             ^~~~~~~~~~~~
>> kernel/bpf/syscall.c:3618:9: error: implicit declaration of function 'cgroup_bpf_replace'; did you mean 'cgroup_bpf_offline'? [-Werror=implicit-function-declaration]
    3618 |   ret = cgroup_bpf_replace(cg_link->cgroup, cg_link,
         |         ^~~~~~~~~~~~~~~~~~
         |         cgroup_bpf_offline
   cc1: some warnings being treated as errors

vim +994 include/linux/kernel.h

cf14f27f82af78 Alexei Starovoitov 2018-03-28  984  
^1da177e4c3f41 Linus Torvalds     2005-04-16  985  /**
^1da177e4c3f41 Linus Torvalds     2005-04-16  986   * container_of - cast a member of a structure out to the containing structure
^1da177e4c3f41 Linus Torvalds     2005-04-16  987   * @ptr:	the pointer to the member.
^1da177e4c3f41 Linus Torvalds     2005-04-16  988   * @type:	the type of the container struct this is embedded in.
^1da177e4c3f41 Linus Torvalds     2005-04-16  989   * @member:	the name of the member within the struct.
^1da177e4c3f41 Linus Torvalds     2005-04-16  990   *
^1da177e4c3f41 Linus Torvalds     2005-04-16  991   */
^1da177e4c3f41 Linus Torvalds     2005-04-16  992  #define container_of(ptr, type, member) ({				\
c7acec713d14c6 Ian Abbott         2017-07-12  993  	void *__mptr = (void *)(ptr);					\
c7acec713d14c6 Ian Abbott         2017-07-12 @994  	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
c7acec713d14c6 Ian Abbott         2017-07-12  995  			 !__same_type(*(ptr), void),			\
c7acec713d14c6 Ian Abbott         2017-07-12  996  			 "pointer type mismatch in container_of()");	\
c7acec713d14c6 Ian Abbott         2017-07-12  997  	((type *)(__mptr - offsetof(type, member))); })
^1da177e4c3f41 Linus Torvalds     2005-04-16  998  

:::::: The code at line 994 was first introduced by commit
:::::: c7acec713d14c6ce8a20154f9dfda258d6bcad3b kernel.h: handle pointers to arrays better in container_of()

:::::: TO: Ian Abbott <abbotti@mev.co.uk>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Andrii Nakryiko March 21, 2020, 12:58 a.m. UTC | #2
On Fri, Mar 20, 2020 at 5:34 PM kbuild test robot <lkp@intel.com> wrote:
>
> Hi Andrii,
>
> I love your patch! Yet something to improve:
>
> [auto build test ERROR on bpf-next/master]
> [cannot apply to bpf/master cgroup/for-next v5.6-rc6 next-20200320]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
>
> url:    https://github.com/0day-ci/linux/commits/Andrii-Nakryiko/Add-support-for-cgroup-bpf_link/20200321-045848
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> config: m68k-sun3_defconfig (attached as .config)
> compiler: m68k-linux-gcc (GCC) 9.2.0
> reproduce:
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # save the attached .config to linux build tree
>         GCC_VERSION=9.2.0 make.cross ARCH=m68k
>
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot <lkp@intel.com>
>
> All error/warnings (new ones prefixed by >>):
>
>    In file included from include/linux/kernel.h:11,
>                     from include/linux/list.h:9,
>                     from include/linux/timer.h:5,
>                     from include/linux/workqueue.h:9,
>                     from include/linux/bpf.h:9,
>                     from kernel/bpf/syscall.c:4:
>    kernel/bpf/syscall.c: In function 'link_update':
> >> include/linux/kernel.h:994:51: error: dereferencing pointer to incomplete type 'struct bpf_cgroup_link'
>      994 |  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
>          |                                                   ^~
>    include/linux/compiler.h:330:9: note: in definition of macro '__compiletime_assert'
>      330 |   if (!(condition))     \
>          |         ^~~~~~~~~
>    include/linux/compiler.h:350:2: note: in expansion of macro '_compiletime_assert'
>      350 |  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
>          |  ^~~~~~~~~~~~~~~~~~~
>    include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert'
>       39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
>          |                                     ^~~~~~~~~~~~~~~~~~
>    include/linux/kernel.h:994:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
>      994 |  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
>          |  ^~~~~~~~~~~~~~~~
>    include/linux/kernel.h:994:20: note: in expansion of macro '__same_type'
>      994 |  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
>          |                    ^~~~~~~~~~~
> >> kernel/bpf/syscall.c:3612:13: note: in expansion of macro 'container_of'
>     3612 |   cg_link = container_of(link, struct bpf_cgroup_link, link);
>          |             ^~~~~~~~~~~~
>    In file included from <command-line>:
> >> include/linux/compiler_types.h:129:35: error: invalid use of undefined type 'struct bpf_cgroup_link'
>      129 | #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
>          |                                   ^~~~~~~~~~~~~~~~~~
>    include/linux/stddef.h:17:32: note: in expansion of macro '__compiler_offsetof'
>       17 | #define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
>          |                                ^~~~~~~~~~~~~~~~~~~
>    include/linux/kernel.h:997:21: note: in expansion of macro 'offsetof'
>      997 |  ((type *)(__mptr - offsetof(type, member))); })
>          |                     ^~~~~~~~
> >> kernel/bpf/syscall.c:3612:13: note: in expansion of macro 'container_of'
>     3612 |   cg_link = container_of(link, struct bpf_cgroup_link, link);
>          |             ^~~~~~~~~~~~
> >> kernel/bpf/syscall.c:3618:9: error: implicit declaration of function 'cgroup_bpf_replace'; did you mean 'cgroup_bpf_offline'? [-Werror=implicit-function-declaration]
>     3618 |   ret = cgroup_bpf_replace(cg_link->cgroup, cg_link,
>          |         ^~~~~~~~~~~~~~~~~~
>          |         cgroup_bpf_offline
>    cc1: some warnings being treated as errors
>

Forgot to stub out cgroup_bpf_replace(), will fix in v2.


> vim +994 include/linux/kernel.h
>
> cf14f27f82af78 Alexei Starovoitov 2018-03-28  984
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  985  /**
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  986   * container_of - cast a member of a structure out to the containing structure
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  987   * @ptr:     the pointer to the member.
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  988   * @type:    the type of the container struct this is embedded in.
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  989   * @member:  the name of the member within the struct.
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  990   *
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  991   */
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  992  #define container_of(ptr, type, member) ({                           \
> c7acec713d14c6 Ian Abbott         2017-07-12  993       void *__mptr = (void *)(ptr);                                   \
> c7acec713d14c6 Ian Abbott         2017-07-12 @994       BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&   \
> c7acec713d14c6 Ian Abbott         2017-07-12  995                        !__same_type(*(ptr), void),                    \
> c7acec713d14c6 Ian Abbott         2017-07-12  996                        "pointer type mismatch in container_of()");    \
> c7acec713d14c6 Ian Abbott         2017-07-12  997       ((type *)(__mptr - offsetof(type, member))); })
> ^1da177e4c3f41 Linus Torvalds     2005-04-16  998
>
> :::::: The code at line 994 was first introduced by commit
> :::::: c7acec713d14c6ce8a20154f9dfda258d6bcad3b kernel.h: handle pointers to arrays better in container_of()
>
> :::::: TO: Ian Abbott <abbotti@mev.co.uk>
> :::::: CC: Linus Torvalds <torvalds@linux-foundation.org>
>
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot March 21, 2020, 1:28 a.m. UTC | #3
Hi Andrii,

I love your patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]
[cannot apply to bpf/master cgroup/for-next v5.6-rc6 next-20200320]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Andrii-Nakryiko/Add-support-for-cgroup-bpf_link/20200321-045848
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: mips-64r6el_defconfig (attached as .config)
compiler: mips64el-linux-gcc (GCC) 5.5.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=5.5.0 make.cross ARCH=mips 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from include/linux/kernel.h:11:0,
                    from include/linux/list.h:9,
                    from include/linux/timer.h:5,
                    from include/linux/workqueue.h:9,
                    from include/linux/bpf.h:9,
                    from kernel/bpf/syscall.c:4:
   kernel/bpf/syscall.c: In function 'link_update':
   include/linux/kernel.h:994:51: error: dereferencing pointer to incomplete type 'struct bpf_cgroup_link'
     BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
                                                      ^
   include/linux/compiler.h:330:9: note: in definition of macro '__compiletime_assert'
      if (!(condition))     \
            ^
   include/linux/compiler.h:350:2: note: in expansion of macro '_compiletime_assert'
     _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
     ^
   include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert'
    #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
                                        ^
   include/linux/kernel.h:994:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
     BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
     ^
   include/linux/kernel.h:994:20: note: in expansion of macro '__same_type'
     BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
                       ^
   kernel/bpf/syscall.c:3612:13: note: in expansion of macro 'container_of'
      cg_link = container_of(link, struct bpf_cgroup_link, link);
                ^
   In file included from <command-line>:0:0:
>> kernel/bpf/syscall.c:3612:39: error: invalid use of undefined type 'struct bpf_cgroup_link'
      cg_link = container_of(link, struct bpf_cgroup_link, link);
                                          ^
   include/linux/compiler_types.h:129:54: note: in definition of macro '__compiler_offsetof'
    #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
                                                         ^
   include/linux/kernel.h:997:21: note: in expansion of macro 'offsetof'
     ((type *)(__mptr - offsetof(type, member))); })
                        ^
   kernel/bpf/syscall.c:3612:13: note: in expansion of macro 'container_of'
      cg_link = container_of(link, struct bpf_cgroup_link, link);
                ^
>> kernel/bpf/syscall.c:3618:9: error: implicit declaration of function 'cgroup_bpf_replace' [-Werror=implicit-function-declaration]
      ret = cgroup_bpf_replace(cg_link->cgroup, cg_link,
            ^
   cc1: some warnings being treated as errors

vim +3612 kernel/bpf/syscall.c

  3576	
  3577	static int link_update(union bpf_attr *attr)
  3578	{
  3579		struct bpf_prog *old_prog = NULL, *new_prog;
  3580		enum bpf_prog_type ptype;
  3581		struct bpf_link *link;
  3582		u32 flags;
  3583		int ret;
  3584	
  3585		if (CHECK_ATTR(BPF_LINK_UPDATE))
  3586			return -EINVAL;
  3587	
  3588		flags = attr->link_update.flags;
  3589		if (flags & ~BPF_F_REPLACE)
  3590			return -EINVAL;
  3591	
  3592		link = bpf_link_get_from_fd(attr->link_update.link_fd);
  3593		if (IS_ERR(link))
  3594			return PTR_ERR(link);
  3595	
  3596		new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
  3597		if (IS_ERR(new_prog))
  3598			return PTR_ERR(new_prog);
  3599	
  3600		if (flags & BPF_F_REPLACE) {
  3601			old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
  3602			if (IS_ERR(old_prog)) {
  3603				ret = PTR_ERR(old_prog);
  3604				old_prog = NULL;
  3605				goto out_put_progs;
  3606			}
  3607		}
  3608	
  3609		if (link->ops == &bpf_cgroup_link_lops) {
  3610			struct bpf_cgroup_link *cg_link;
  3611	
> 3612			cg_link = container_of(link, struct bpf_cgroup_link, link);
  3613			ptype = attach_type_to_prog_type(cg_link->type);
  3614			if (ptype != new_prog->type) {
  3615				ret = -EINVAL;
  3616				goto out_put_progs;
  3617			}
> 3618			ret = cgroup_bpf_replace(cg_link->cgroup, cg_link,
  3619						 old_prog, new_prog);
  3620		} else {
  3621			ret = -EINVAL;
  3622		}
  3623	
  3624	out_put_progs:
  3625		if (old_prog)
  3626			bpf_prog_put(old_prog);
  3627		if (ret)
  3628			bpf_prog_put(new_prog);
  3629		return ret;
  3630	}
  3631	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Toke Høiland-Jørgensen March 23, 2020, 10:58 a.m. UTC | #4
Andrii Nakryiko <andriin@fb.com> writes:

> Add new operation (LINK_UPDATE), which allows to replace active bpf_prog from
> under given bpf_link. Currently this is only supported for bpf_cgroup_link,
> but will be extended to other kinds of bpf_links in follow-up patches.
>
> For bpf_cgroup_link, implemented functionality matches existing semantics for
> direct bpf_prog attachment (including BPF_F_REPLACE flag). User can either
> unconditionally set new bpf_prog regardless of which bpf_prog is currently
> active under given bpf_link, or, optionally, can specify expected active
> bpf_prog. If active bpf_prog doesn't match expected one, operation is a noop
> and returns a failure.

Nit: I'd consider a 'noop' to be something that succeeds, so that last
sentence is a contradiction. Maybe "If active bpf_prog doesn't match
expected one, the kernel will abort the operation and return a failure."?

> cgroup_bpf_replace() operation is resolving race between auto-detachment and
> bpf_prog update in the same fashion as it's done for bpf_link detachment,
> except in this case update has no way of succeeding because of target cgroup
> marked as dying. So in this case error is returned.
>
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> ---
>  include/linux/bpf-cgroup.h |  4 ++
>  include/uapi/linux/bpf.h   | 12 ++++++
>  kernel/bpf/cgroup.c        | 77 ++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c       | 60 +++++++++++++++++++++++++++++
>  kernel/cgroup/cgroup.c     | 21 +++++++++++
>  5 files changed, 174 insertions(+)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index ab95824a1d99..5735d8bfd69e 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -98,6 +98,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
>  int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>  			struct bpf_cgroup_link *link,
>  			enum bpf_attach_type type);
> +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> +			 struct bpf_prog *new_prog);
>  int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>  		       union bpf_attr __user *uattr);
>  
> @@ -108,6 +110,8 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
>  		      u32 flags);
>  int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>  		      enum bpf_attach_type type);
> +int cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> +		       struct bpf_prog *old_prog, struct bpf_prog *new_prog);
>  int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>  		     union bpf_attr __user *uattr);
>  
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index fad9f79bb8f1..fa944093f9fc 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -112,6 +112,7 @@ enum bpf_cmd {
>  	BPF_MAP_UPDATE_BATCH,
>  	BPF_MAP_DELETE_BATCH,
>  	BPF_LINK_CREATE,
> +	BPF_LINK_UPDATE,
>  };

I feel like there's a BPF_LINK_QUERY operation missing here? Otherwise,
how is userspace supposed to discover which program is currently
attached to a link?

>  enum bpf_map_type {
> @@ -574,6 +575,17 @@ union bpf_attr {
>  		__u32		target_fd;	/* object to attach to */
>  		__u32		attach_type;	/* attach type */
>  	} link_create;
> +
> +	struct { /* struct used by BPF_LINK_UPDATE command */
> +		__u32		link_fd;	/* link fd */
> +		/* new program fd to update link with */
> +		__u32		new_prog_fd;
> +		__u32		flags;		/* extra flags */
> +		/* expected link's program fd; is specified only if
> +		 * BPF_F_REPLACE flag is set in flags */
> +		__u32		old_prog_fd;
> +	} link_update;
> +
>  } __attribute__((aligned(8)));
>  
>  /* The description below is an attempt at providing documentation to eBPF
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index b960e8633f23..b9f4971336f3 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -501,6 +501,83 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
>  	return err;
>  }
>  
> +/* Swap updated BPF program for given link in effective program arrays across
> + * all descendant cgroups. This function is guaranteed to succeed.
> + */
> +static void replace_effective_prog(struct cgroup *cgrp,
> +				   enum bpf_attach_type type,
> +				   struct bpf_cgroup_link *link)
> +{
> +	struct bpf_prog_array_item *item;
> +	struct cgroup_subsys_state *css;
> +	struct bpf_prog_array *progs;
> +	struct bpf_prog_list *pl;
> +	struct list_head *head;
> +	struct cgroup *cg;
> +	int pos;
> +
> +	css_for_each_descendant_pre(css, &cgrp->self) {
> +		struct cgroup *desc = container_of(css, struct cgroup, self);
> +
> +		if (percpu_ref_is_zero(&desc->bpf.refcnt))
> +			continue;
> +
> +		/* found position of link in effective progs array */
> +		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
> +			if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
> +				continue;
> +
> +			head = &cg->bpf.progs[type];
> +			list_for_each_entry(pl, head, node) {
> +				if (!prog_list_prog(pl))
> +					continue;
> +				if (pl->link == link)
> +					goto found;
> +				pos++;
> +			}
> +		}
> +found:
> +		BUG_ON(!cg);
> +		progs = rcu_dereference_protected(
> +				desc->bpf.effective[type],
> +				lockdep_is_held(&cgroup_mutex));
> +		item = &progs->items[pos];
> +		WRITE_ONCE(item->prog, link->link.prog);
> +	}
> +}
> +
> +/**
> + * __cgroup_bpf_replace() - Replace link's program and propagate the change
> + *                          to descendants
> + * @cgrp: The cgroup which descendants to traverse
> + * @link: A link for which to replace BPF program
> + * @type: Type of attach operation
> + *
> + * Must be called with cgroup_mutex held.
> + */
> +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> +			 struct bpf_prog *new_prog)
> +{
> +	struct list_head *progs = &cgrp->bpf.progs[link->type];
> +	struct bpf_prog *old_prog;
> +	struct bpf_prog_list *pl;
> +	bool found = false;
> +
> +	list_for_each_entry(pl, progs, node) {
> +		if (pl->link == link) {
> +			found = true;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		return -ENOENT;
> +
> +	old_prog = xchg(&link->link.prog, new_prog);
> +	replace_effective_prog(cgrp, link->type, link);
> +	bpf_prog_put(old_prog);
> +	return 0;
> +}
> +
>  static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
>  					       struct bpf_prog *prog,
>  					       struct bpf_cgroup_link *link,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index f6e7d32a2632..1ff7aaa2c727 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3572,6 +3572,63 @@ static int link_create(union bpf_attr *attr)
>  	return ret;
>  }
>  
> +#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
> +
> +static int link_update(union bpf_attr *attr)
> +{
> +	struct bpf_prog *old_prog = NULL, *new_prog;
> +	enum bpf_prog_type ptype;
> +	struct bpf_link *link;
> +	u32 flags;
> +	int ret;
> +
> +	if (CHECK_ATTR(BPF_LINK_UPDATE))
> +		return -EINVAL;
> +
> +	flags = attr->link_update.flags;
> +	if (flags & ~BPF_F_REPLACE)
> +		return -EINVAL;
> +
> +	link = bpf_link_get_from_fd(attr->link_update.link_fd);
> +	if (IS_ERR(link))
> +		return PTR_ERR(link);
> +
> +	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
> +	if (IS_ERR(new_prog))
> +		return PTR_ERR(new_prog);
> +
> +	if (flags & BPF_F_REPLACE) {
> +		old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
> +		if (IS_ERR(old_prog)) {
> +			ret = PTR_ERR(old_prog);
> +			old_prog = NULL;
> +			goto out_put_progs;
> +		}
> +	}

Shouldn't the default be to require an old FD and do atomic update, but
provide a flag (BPF_F_CLOBBER?) to opt-in to unconditional replace
behaviour? Since the unconditional replace is inherently racy I don't
think it should be the default; in fact, I'm not sure if it should be
allowed at all?

> +	if (link->ops == &bpf_cgroup_link_lops) {
> +		struct bpf_cgroup_link *cg_link;
> +
> +		cg_link = container_of(link, struct bpf_cgroup_link, link);
> +		ptype = attach_type_to_prog_type(cg_link->type);
> +		if (ptype != new_prog->type) {
> +			ret = -EINVAL;
> +			goto out_put_progs;
> +		}
> +		ret = cgroup_bpf_replace(cg_link->cgroup, cg_link,
> +					 old_prog, new_prog);
> +	} else {
> +		ret = -EINVAL;
> +	}
> +
> +out_put_progs:
> +	if (old_prog)
> +		bpf_prog_put(old_prog);
> +	if (ret)
> +		bpf_prog_put(new_prog);
> +	return ret;
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>  {
>  	union bpf_attr attr = {};
> @@ -3685,6 +3742,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>  	case BPF_LINK_CREATE:
>  		err = link_create(&attr);
>  		break;
> +	case BPF_LINK_UPDATE:
> +		err = link_update(&attr);
> +		break;
>  	default:
>  		err = -EINVAL;
>  		break;
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 219624fba9ba..d4787fccf183 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6317,6 +6317,27 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
>  	return ret;
>  }
>  
> +int cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> +		       struct bpf_prog *old_prog, struct bpf_prog *new_prog)
> +{
> +	int ret;
> +
> +	mutex_lock(&cgroup_mutex);
> +	/* link might have been auto-released by dying cgroup, so fail */
> +	if (!link->cgroup) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +	if (old_prog && link->link.prog != old_prog) {
> +		ret = -EPERM;
> +		goto out_unlock;
> +	}
> +	ret = __cgroup_bpf_replace(cgrp, link, new_prog);
> +out_unlock:
> +	mutex_unlock(&cgroup_mutex);
> +	return ret;
> +}
> +
>  int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>  		      enum bpf_attach_type type)
>  {
> -- 
> 2.17.1
Andrii Nakryiko March 23, 2020, 5:55 p.m. UTC | #5
On Mon, Mar 23, 2020 at 3:58 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andriin@fb.com> writes:
>
> > Add new operation (LINK_UPDATE), which allows to replace active bpf_prog from
> > under given bpf_link. Currently this is only supported for bpf_cgroup_link,
> > but will be extended to other kinds of bpf_links in follow-up patches.
> >
> > For bpf_cgroup_link, implemented functionality matches existing semantics for
> > direct bpf_prog attachment (including BPF_F_REPLACE flag). User can either
> > unconditionally set new bpf_prog regardless of which bpf_prog is currently
> > active under given bpf_link, or, optionally, can specify expected active
> > bpf_prog. If active bpf_prog doesn't match expected one, operation is a noop
> > and returns a failure.
>
> Nit: I'd consider a 'noop' to be something that succeeds, so that last
> sentence is a contradiction. Maybe "If active bpf_prog doesn't match
> expected one, the kernel will abort the operation and return a failure."?

Heh, for me "noop" (no operation) means no changes done, it doesn't
mean that syscall itself is successful. But I'll change the wording to
be less confusing.

>
> > cgroup_bpf_replace() operation is resolving race between auto-detachment and
> > bpf_prog update in the same fashion as it's done for bpf_link detachment,
> > except in this case update has no way of succeeding because of target cgroup
> > marked as dying. So in this case error is returned.
> >
> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> > ---
> >  include/linux/bpf-cgroup.h |  4 ++
> >  include/uapi/linux/bpf.h   | 12 ++++++
> >  kernel/bpf/cgroup.c        | 77 ++++++++++++++++++++++++++++++++++++++
> >  kernel/bpf/syscall.c       | 60 +++++++++++++++++++++++++++++
> >  kernel/cgroup/cgroup.c     | 21 +++++++++++
> >  5 files changed, 174 insertions(+)
> >
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index ab95824a1d99..5735d8bfd69e 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -98,6 +98,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
> >  int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> >                       struct bpf_cgroup_link *link,
> >                       enum bpf_attach_type type);
> > +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> > +                      struct bpf_prog *new_prog);
> >  int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> >                      union bpf_attr __user *uattr);
> >
> > @@ -108,6 +110,8 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
> >                     u32 flags);
> >  int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> >                     enum bpf_attach_type type);
> > +int cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> > +                    struct bpf_prog *old_prog, struct bpf_prog *new_prog);
> >  int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> >                    union bpf_attr __user *uattr);
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index fad9f79bb8f1..fa944093f9fc 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -112,6 +112,7 @@ enum bpf_cmd {
> >       BPF_MAP_UPDATE_BATCH,
> >       BPF_MAP_DELETE_BATCH,
> >       BPF_LINK_CREATE,
> > +     BPF_LINK_UPDATE,
> >  };
>
> I feel like there's a BPF_LINK_QUERY operation missing here? Otherwise,
> how is userspace supposed to discover which program is currently
> attached to a link?

Probably, but it should return not just attached BPF program, but also
whatever target it is attached to (e.g., cgroup, ifindex, perf event
fd, etc). But we'll need to design it properly, so I didn't do
implementation yet.

>
> >  enum bpf_map_type {
> > @@ -574,6 +575,17 @@ union bpf_attr {
> >               __u32           target_fd;      /* object to attach to */
> >               __u32           attach_type;    /* attach type */
> >       } link_create;
> > +
> > +     struct { /* struct used by BPF_LINK_UPDATE command */
> > +             __u32           link_fd;        /* link fd */
> > +             /* new program fd to update link with */
> > +             __u32           new_prog_fd;
> > +             __u32           flags;          /* extra flags */
> > +             /* expected link's program fd; is specified only if
> > +              * BPF_F_REPLACE flag is set in flags */
> > +             __u32           old_prog_fd;
> > +     } link_update;
> > +
> >  } __attribute__((aligned(8)));
> >
> >  /* The description below is an attempt at providing documentation to eBPF
> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > index b960e8633f23..b9f4971336f3 100644
> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -501,6 +501,83 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
> >       return err;
> >  }
> >
> > +/* Swap updated BPF program for given link in effective program arrays across
> > + * all descendant cgroups. This function is guaranteed to succeed.
> > + */
> > +static void replace_effective_prog(struct cgroup *cgrp,
> > +                                enum bpf_attach_type type,
> > +                                struct bpf_cgroup_link *link)
> > +{
> > +     struct bpf_prog_array_item *item;
> > +     struct cgroup_subsys_state *css;
> > +     struct bpf_prog_array *progs;
> > +     struct bpf_prog_list *pl;
> > +     struct list_head *head;
> > +     struct cgroup *cg;
> > +     int pos;
> > +
> > +     css_for_each_descendant_pre(css, &cgrp->self) {
> > +             struct cgroup *desc = container_of(css, struct cgroup, self);
> > +
> > +             if (percpu_ref_is_zero(&desc->bpf.refcnt))
> > +                     continue;
> > +
> > +             /* found position of link in effective progs array */
> > +             for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
> > +                     if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
> > +                             continue;
> > +
> > +                     head = &cg->bpf.progs[type];
> > +                     list_for_each_entry(pl, head, node) {
> > +                             if (!prog_list_prog(pl))
> > +                                     continue;
> > +                             if (pl->link == link)
> > +                                     goto found;
> > +                             pos++;
> > +                     }
> > +             }
> > +found:
> > +             BUG_ON(!cg);
> > +             progs = rcu_dereference_protected(
> > +                             desc->bpf.effective[type],
> > +                             lockdep_is_held(&cgroup_mutex));
> > +             item = &progs->items[pos];
> > +             WRITE_ONCE(item->prog, link->link.prog);
> > +     }
> > +}
> > +
> > +/**
> > + * __cgroup_bpf_replace() - Replace link's program and propagate the change
> > + *                          to descendants
> > + * @cgrp: The cgroup which descendants to traverse
> > + * @link: A link for which to replace BPF program
> > + * @type: Type of attach operation
> > + *
> > + * Must be called with cgroup_mutex held.
> > + */
> > +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> > +                      struct bpf_prog *new_prog)
> > +{
> > +     struct list_head *progs = &cgrp->bpf.progs[link->type];
> > +     struct bpf_prog *old_prog;
> > +     struct bpf_prog_list *pl;
> > +     bool found = false;
> > +
> > +     list_for_each_entry(pl, progs, node) {
> > +             if (pl->link == link) {
> > +                     found = true;
> > +                     break;
> > +             }
> > +     }
> > +     if (!found)
> > +             return -ENOENT;
> > +
> > +     old_prog = xchg(&link->link.prog, new_prog);
> > +     replace_effective_prog(cgrp, link->type, link);
> > +     bpf_prog_put(old_prog);
> > +     return 0;
> > +}
> > +
> >  static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
> >                                              struct bpf_prog *prog,
> >                                              struct bpf_cgroup_link *link,
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index f6e7d32a2632..1ff7aaa2c727 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -3572,6 +3572,63 @@ static int link_create(union bpf_attr *attr)
> >       return ret;
> >  }
> >
> > +#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
> > +
> > +static int link_update(union bpf_attr *attr)
> > +{
> > +     struct bpf_prog *old_prog = NULL, *new_prog;
> > +     enum bpf_prog_type ptype;
> > +     struct bpf_link *link;
> > +     u32 flags;
> > +     int ret;
> > +
> > +     if (CHECK_ATTR(BPF_LINK_UPDATE))
> > +             return -EINVAL;
> > +
> > +     flags = attr->link_update.flags;
> > +     if (flags & ~BPF_F_REPLACE)
> > +             return -EINVAL;
> > +
> > +     link = bpf_link_get_from_fd(attr->link_update.link_fd);
> > +     if (IS_ERR(link))
> > +             return PTR_ERR(link);
> > +
> > +     new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
> > +     if (IS_ERR(new_prog))
> > +             return PTR_ERR(new_prog);
> > +
> > +     if (flags & BPF_F_REPLACE) {
> > +             old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
> > +             if (IS_ERR(old_prog)) {
> > +                     ret = PTR_ERR(old_prog);
> > +                     old_prog = NULL;
> > +                     goto out_put_progs;
> > +             }
> > +     }
>
> Shouldn't the default be to require an old FD and do atomic update, but
> provide a flag (BPF_F_CLOBBER?) to opt-in to unconditional replace
> behaviour? Since the unconditional replace is inherently racy I don't
> think it should be the default; in fact, I'm not sure if it should be
> allowed at all?

I don't feel strongly either way, but I expect majority of use cases
to use non-pinned bpf_link with just FD open by an application, where
application knows that no one else can update program from under link.
In which case setting new BPF program won't be racy.

>
> > +     if (link->ops == &bpf_cgroup_link_lops) {
> > +             struct bpf_cgroup_link *cg_link;
> > +
> > +             cg_link = container_of(link, struct bpf_cgroup_link, link);
> > +             ptype = attach_type_to_prog_type(cg_link->type);
> > +             if (ptype != new_prog->type) {
> > +                     ret = -EINVAL;
> > +                     goto out_put_progs;
> > +             }
> > +             ret = cgroup_bpf_replace(cg_link->cgroup, cg_link,
> > +                                      old_prog, new_prog);
> > +     } else {
> > +             ret = -EINVAL;
> > +     }
> > +
> > +out_put_progs:
> > +     if (old_prog)
> > +             bpf_prog_put(old_prog);
> > +     if (ret)
> > +             bpf_prog_put(new_prog);
> > +     return ret;
> > +}
> > +
> >  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
> >  {
> >       union bpf_attr attr = {};
> > @@ -3685,6 +3742,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> >       case BPF_LINK_CREATE:
> >               err = link_create(&attr);
> >               break;
> > +     case BPF_LINK_UPDATE:
> > +             err = link_update(&attr);
> > +             break;
> >       default:
> >               err = -EINVAL;
> >               break;
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index 219624fba9ba..d4787fccf183 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -6317,6 +6317,27 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
> >       return ret;
> >  }
> >
> > +int cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
> > +                    struct bpf_prog *old_prog, struct bpf_prog *new_prog)
> > +{
> > +     int ret;
> > +
> > +     mutex_lock(&cgroup_mutex);
> > +     /* link might have been auto-released by dying cgroup, so fail */
> > +     if (!link->cgroup) {
> > +             ret = -EINVAL;
> > +             goto out_unlock;
> > +     }
> > +     if (old_prog && link->link.prog != old_prog) {
> > +             ret = -EPERM;
> > +             goto out_unlock;
> > +     }
> > +     ret = __cgroup_bpf_replace(cgrp, link, new_prog);
> > +out_unlock:
> > +     mutex_unlock(&cgroup_mutex);
> > +     return ret;
> > +}
> > +
> >  int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> >                     enum bpf_attach_type type)
> >  {
> > --
> > 2.17.1
>
Toke Høiland-Jørgensen March 23, 2020, 7:29 p.m. UTC | #6
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Mon, Mar 23, 2020 at 3:58 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Andrii Nakryiko <andriin@fb.com> writes:
>>
>> > Add new operation (LINK_UPDATE), which allows to replace active bpf_prog from
>> > under given bpf_link. Currently this is only supported for bpf_cgroup_link,
>> > but will be extended to other kinds of bpf_links in follow-up patches.
>> >
>> > For bpf_cgroup_link, implemented functionality matches existing semantics for
>> > direct bpf_prog attachment (including BPF_F_REPLACE flag). User can either
>> > unconditionally set new bpf_prog regardless of which bpf_prog is currently
>> > active under given bpf_link, or, optionally, can specify expected active
>> > bpf_prog. If active bpf_prog doesn't match expected one, operation is a noop
>> > and returns a failure.
>>
>> Nit: I'd consider a 'noop' to be something that succeeds, so that last
>> sentence is a contradiction. Maybe "If active bpf_prog doesn't match
>> expected one, the kernel will abort the operation and return a failure."?
>
> Heh, for me "noop" (no operation) means no changes done, it doesn't
> mean that syscall itself is successful. But I'll change the wording to
> be less confusing.

Cool.

>>
>> > cgroup_bpf_replace() operation is resolving race between auto-detachment and
>> > bpf_prog update in the same fashion as it's done for bpf_link detachment,
>> > except in this case update has no way of succeeding because of target cgroup
>> > marked as dying. So in this case error is returned.
>> >
>> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
>> > ---
>> >  include/linux/bpf-cgroup.h |  4 ++
>> >  include/uapi/linux/bpf.h   | 12 ++++++
>> >  kernel/bpf/cgroup.c        | 77 ++++++++++++++++++++++++++++++++++++++
>> >  kernel/bpf/syscall.c       | 60 +++++++++++++++++++++++++++++
>> >  kernel/cgroup/cgroup.c     | 21 +++++++++++
>> >  5 files changed, 174 insertions(+)
>> >
>> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
>> > index ab95824a1d99..5735d8bfd69e 100644
>> > --- a/include/linux/bpf-cgroup.h
>> > +++ b/include/linux/bpf-cgroup.h
>> > @@ -98,6 +98,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
>> >  int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>> >                       struct bpf_cgroup_link *link,
>> >                       enum bpf_attach_type type);
>> > +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
>> > +                      struct bpf_prog *new_prog);
>> >  int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>> >                      union bpf_attr __user *uattr);
>> >
>> > @@ -108,6 +110,8 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
>> >                     u32 flags);
>> >  int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>> >                     enum bpf_attach_type type);
>> > +int cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
>> > +                    struct bpf_prog *old_prog, struct bpf_prog *new_prog);
>> >  int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>> >                    union bpf_attr __user *uattr);
>> >
>> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> > index fad9f79bb8f1..fa944093f9fc 100644
>> > --- a/include/uapi/linux/bpf.h
>> > +++ b/include/uapi/linux/bpf.h
>> > @@ -112,6 +112,7 @@ enum bpf_cmd {
>> >       BPF_MAP_UPDATE_BATCH,
>> >       BPF_MAP_DELETE_BATCH,
>> >       BPF_LINK_CREATE,
>> > +     BPF_LINK_UPDATE,
>> >  };
>>
>> I feel like there's a BPF_LINK_QUERY operation missing here? Otherwise,
>> how is userspace supposed to discover which program is currently
>> attached to a link?
>
> Probably, but it should return not just attached BPF program, but also
> whatever target it is attached to (e.g., cgroup, ifindex, perf event
> fd, etc).

Yes, sure.

> But we'll need to design it properly, so I didn't do implementation
> yet.

Right, OK.

>> >  enum bpf_map_type {
>> > @@ -574,6 +575,17 @@ union bpf_attr {
>> >               __u32           target_fd;      /* object to attach to */
>> >               __u32           attach_type;    /* attach type */
>> >       } link_create;
>> > +
>> > +     struct { /* struct used by BPF_LINK_UPDATE command */
>> > +             __u32           link_fd;        /* link fd */
>> > +             /* new program fd to update link with */
>> > +             __u32           new_prog_fd;
>> > +             __u32           flags;          /* extra flags */
>> > +             /* expected link's program fd; is specified only if
>> > +              * BPF_F_REPLACE flag is set in flags */
>> > +             __u32           old_prog_fd;
>> > +     } link_update;
>> > +
>> >  } __attribute__((aligned(8)));
>> >
>> >  /* The description below is an attempt at providing documentation to eBPF
>> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> > index b960e8633f23..b9f4971336f3 100644
>> > --- a/kernel/bpf/cgroup.c
>> > +++ b/kernel/bpf/cgroup.c
>> > @@ -501,6 +501,83 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
>> >       return err;
>> >  }
>> >
>> > +/* Swap updated BPF program for given link in effective program arrays across
>> > + * all descendant cgroups. This function is guaranteed to succeed.
>> > + */
>> > +static void replace_effective_prog(struct cgroup *cgrp,
>> > +                                enum bpf_attach_type type,
>> > +                                struct bpf_cgroup_link *link)
>> > +{
>> > +     struct bpf_prog_array_item *item;
>> > +     struct cgroup_subsys_state *css;
>> > +     struct bpf_prog_array *progs;
>> > +     struct bpf_prog_list *pl;
>> > +     struct list_head *head;
>> > +     struct cgroup *cg;
>> > +     int pos;
>> > +
>> > +     css_for_each_descendant_pre(css, &cgrp->self) {
>> > +             struct cgroup *desc = container_of(css, struct cgroup, self);
>> > +
>> > +             if (percpu_ref_is_zero(&desc->bpf.refcnt))
>> > +                     continue;
>> > +
>> > +             /* found position of link in effective progs array */
>> > +             for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
>> > +                     if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
>> > +                             continue;
>> > +
>> > +                     head = &cg->bpf.progs[type];
>> > +                     list_for_each_entry(pl, head, node) {
>> > +                             if (!prog_list_prog(pl))
>> > +                                     continue;
>> > +                             if (pl->link == link)
>> > +                                     goto found;
>> > +                             pos++;
>> > +                     }
>> > +             }
>> > +found:
>> > +             BUG_ON(!cg);
>> > +             progs = rcu_dereference_protected(
>> > +                             desc->bpf.effective[type],
>> > +                             lockdep_is_held(&cgroup_mutex));
>> > +             item = &progs->items[pos];
>> > +             WRITE_ONCE(item->prog, link->link.prog);
>> > +     }
>> > +}
>> > +
>> > +/**
>> > + * __cgroup_bpf_replace() - Replace link's program and propagate the change
>> > + *                          to descendants
>> > + * @cgrp: The cgroup which descendants to traverse
>> > + * @link: A link for which to replace BPF program
>> > + * @type: Type of attach operation
>> > + *
>> > + * Must be called with cgroup_mutex held.
>> > + */
>> > +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
>> > +                      struct bpf_prog *new_prog)
>> > +{
>> > +     struct list_head *progs = &cgrp->bpf.progs[link->type];
>> > +     struct bpf_prog *old_prog;
>> > +     struct bpf_prog_list *pl;
>> > +     bool found = false;
>> > +
>> > +     list_for_each_entry(pl, progs, node) {
>> > +             if (pl->link == link) {
>> > +                     found = true;
>> > +                     break;
>> > +             }
>> > +     }
>> > +     if (!found)
>> > +             return -ENOENT;
>> > +
>> > +     old_prog = xchg(&link->link.prog, new_prog);
>> > +     replace_effective_prog(cgrp, link->type, link);
>> > +     bpf_prog_put(old_prog);
>> > +     return 0;
>> > +}
>> > +
>> >  static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
>> >                                              struct bpf_prog *prog,
>> >                                              struct bpf_cgroup_link *link,
>> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> > index f6e7d32a2632..1ff7aaa2c727 100644
>> > --- a/kernel/bpf/syscall.c
>> > +++ b/kernel/bpf/syscall.c
>> > @@ -3572,6 +3572,63 @@ static int link_create(union bpf_attr *attr)
>> >       return ret;
>> >  }
>> >
>> > +#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
>> > +
>> > +static int link_update(union bpf_attr *attr)
>> > +{
>> > +     struct bpf_prog *old_prog = NULL, *new_prog;
>> > +     enum bpf_prog_type ptype;
>> > +     struct bpf_link *link;
>> > +     u32 flags;
>> > +     int ret;
>> > +
>> > +     if (CHECK_ATTR(BPF_LINK_UPDATE))
>> > +             return -EINVAL;
>> > +
>> > +     flags = attr->link_update.flags;
>> > +     if (flags & ~BPF_F_REPLACE)
>> > +             return -EINVAL;
>> > +
>> > +     link = bpf_link_get_from_fd(attr->link_update.link_fd);
>> > +     if (IS_ERR(link))
>> > +             return PTR_ERR(link);
>> > +
>> > +     new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
>> > +     if (IS_ERR(new_prog))
>> > +             return PTR_ERR(new_prog);
>> > +
>> > +     if (flags & BPF_F_REPLACE) {
>> > +             old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
>> > +             if (IS_ERR(old_prog)) {
>> > +                     ret = PTR_ERR(old_prog);
>> > +                     old_prog = NULL;
>> > +                     goto out_put_progs;
>> > +             }
>> > +     }
>>
>> Shouldn't the default be to require an old FD and do atomic update, but
>> provide a flag (BPF_F_CLOBBER?) to opt-in to unconditional replace
>> behaviour? Since the unconditional replace is inherently racy I don't
>> think it should be the default; in fact, I'm not sure if it should be
>> allowed at all?
>
> I don't feel strongly either way, but I expect majority of use cases
> to use non-pinned bpf_link with just FD open by an application, where
> application knows that no one else can update program from under link.
> In which case setting new BPF program won't be racy.

Ah. As you've probably noticed, my mental model is somewhat biased
towards utilities that exit after invocation, so didn't think about that :)

Still, even in the case of such a long-running application, it would
still know which fd was already attached, so it could still supply it,
even though there's no possibility of a race. Especially if we abstract
that behind the bpf_link data structure in libbpf.

-Toke
diff mbox series

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ab95824a1d99..5735d8bfd69e 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -98,6 +98,8 @@  int __cgroup_bpf_attach(struct cgroup *cgrp,
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			struct bpf_cgroup_link *link,
 			enum bpf_attach_type type);
+int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
+			 struct bpf_prog *new_prog);
 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		       union bpf_attr __user *uattr);
 
@@ -108,6 +110,8 @@  int cgroup_bpf_attach(struct cgroup *cgrp,
 		      u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type);
+int cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
+		       struct bpf_prog *old_prog, struct bpf_prog *new_prog);
 int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		     union bpf_attr __user *uattr);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fad9f79bb8f1..fa944093f9fc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@  enum bpf_cmd {
 	BPF_MAP_UPDATE_BATCH,
 	BPF_MAP_DELETE_BATCH,
 	BPF_LINK_CREATE,
+	BPF_LINK_UPDATE,
 };
 
 enum bpf_map_type {
@@ -574,6 +575,17 @@  union bpf_attr {
 		__u32		target_fd;	/* object to attach to */
 		__u32		attach_type;	/* attach type */
 	} link_create;
+
+	struct { /* struct used by BPF_LINK_UPDATE command */
+		__u32		link_fd;	/* link fd */
+		/* new program fd to update link with */
+		__u32		new_prog_fd;
+		__u32		flags;		/* extra flags */
+		/* expected link's program fd; is specified only if
+		 * BPF_F_REPLACE flag is set in flags */
+		__u32		old_prog_fd;
+	} link_update;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b960e8633f23..b9f4971336f3 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -501,6 +501,83 @@  int __cgroup_bpf_attach(struct cgroup *cgrp,
 	return err;
 }
 
+/* Swap updated BPF program for given link in effective program arrays across
+ * all descendant cgroups. This function is guaranteed to succeed.
+ */
+static void replace_effective_prog(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_cgroup_link *link)
+{
+	struct bpf_prog_array_item *item;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_array *progs;
+	struct bpf_prog_list *pl;
+	struct list_head *head;
+	struct cgroup *cg;
+	int pos;
+
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		if (percpu_ref_is_zero(&desc->bpf.refcnt))
+			continue;
+
+		/* found position of link in effective progs array */
+		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+			if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+				continue;
+
+			head = &cg->bpf.progs[type];
+			list_for_each_entry(pl, head, node) {
+				if (!prog_list_prog(pl))
+					continue;
+				if (pl->link == link)
+					goto found;
+				pos++;
+			}
+		}
+found:
+		BUG_ON(!cg);
+		progs = rcu_dereference_protected(
+				desc->bpf.effective[type],
+				lockdep_is_held(&cgroup_mutex));
+		item = &progs->items[pos];
+		WRITE_ONCE(item->prog, link->link.prog);
+	}
+}
+
+/**
+ * __cgroup_bpf_replace() - Replace link's program and propagate the change
+ *                          to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @link: A link for which to replace BPF program
+ * @type: Type of attach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
+			 struct bpf_prog *new_prog)
+{
+	struct list_head *progs = &cgrp->bpf.progs[link->type];
+	struct bpf_prog *old_prog;
+	struct bpf_prog_list *pl;
+	bool found = false;
+
+	list_for_each_entry(pl, progs, node) {
+		if (pl->link == link) {
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+		return -ENOENT;
+
+	old_prog = xchg(&link->link.prog, new_prog);
+	replace_effective_prog(cgrp, link->type, link);
+	bpf_prog_put(old_prog);
+	return 0;
+}
+
 static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
 					       struct bpf_prog *prog,
 					       struct bpf_cgroup_link *link,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f6e7d32a2632..1ff7aaa2c727 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3572,6 +3572,63 @@  static int link_create(union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
+
+static int link_update(union bpf_attr *attr)
+{
+	struct bpf_prog *old_prog = NULL, *new_prog;
+	enum bpf_prog_type ptype;
+	struct bpf_link *link;
+	u32 flags;
+	int ret;
+
+	if (CHECK_ATTR(BPF_LINK_UPDATE))
+		return -EINVAL;
+
+	flags = attr->link_update.flags;
+	if (flags & ~BPF_F_REPLACE)
+		return -EINVAL;
+
+	link = bpf_link_get_from_fd(attr->link_update.link_fd);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+
+	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
+	if (IS_ERR(new_prog))
+		return PTR_ERR(new_prog);
+
+	if (flags & BPF_F_REPLACE) {
+		old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
+		if (IS_ERR(old_prog)) {
+			ret = PTR_ERR(old_prog);
+			old_prog = NULL;
+			goto out_put_progs;
+		}
+	}
+
+	if (link->ops == &bpf_cgroup_link_lops) {
+		struct bpf_cgroup_link *cg_link;
+
+		cg_link = container_of(link, struct bpf_cgroup_link, link);
+		ptype = attach_type_to_prog_type(cg_link->type);
+		if (ptype != new_prog->type) {
+			ret = -EINVAL;
+			goto out_put_progs;
+		}
+		ret = cgroup_bpf_replace(cg_link->cgroup, cg_link,
+					 old_prog, new_prog);
+	} else {
+		ret = -EINVAL;
+	}
+
+out_put_progs:
+	if (old_prog)
+		bpf_prog_put(old_prog);
+	if (ret)
+		bpf_prog_put(new_prog);
+	return ret;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -3685,6 +3742,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_LINK_CREATE:
 		err = link_create(&attr);
 		break;
+	case BPF_LINK_UPDATE:
+		err = link_update(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 219624fba9ba..d4787fccf183 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6317,6 +6317,27 @@  int cgroup_bpf_attach(struct cgroup *cgrp,
 	return ret;
 }
 
+int cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
+		       struct bpf_prog *old_prog, struct bpf_prog *new_prog)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	/* link might have been auto-released by dying cgroup, so fail */
+	if (!link->cgroup) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	if (old_prog && link->link.prog != old_prog) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+	ret = __cgroup_bpf_replace(cgrp, link, new_prog);
+out_unlock:
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type)
 {