
[RFC,bpf-next] bpf: allow users to opt out of releasing objects when urefs are gone

Message ID 20190121201021.30785-1-jakub.kicinski@netronome.com
State RFC
Delegated to: BPF Maintainers
Series [RFC,bpf-next] bpf: allow users to opt out of releasing objects when urefs are gone

Commit Message

Jakub Kicinski Jan. 21, 2019, 8:10 p.m. UTC
Commit c9da161c6517 ("bpf: fix clearing on persistent program array maps")
added protection against dependency loops between programs and maps leading
to zombie objects that would never be freed.  FD maps are flushed when the
last user reference is gone.

This is confusing to users of bpftool, as updates to tail call maps have
no effect unless those maps are pinned (or held open by another entity).

Today, flushing should not be a requirement for a privileged user, as root
has access to the GET_FD_BY_ID commands and can therefore manually flush
the maps.  Add a flag to opt out of automatic flushing.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
Hi!

I wonder what the reactions to this (untested) patch would be?  I don't
really care strongly, but perhaps others experienced a similar issue?
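
For illustration, a minimal (untested) sketch of creating a tail call map
with the proposed flag via the raw bpf() syscall; BPF_F_FD_MAP_NO_UREF_FLUSH
and its value (1U << 7) exist only with this patch applied, everything else
is the existing map creation flow:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

#ifndef BPF_F_FD_MAP_NO_UREF_FLUSH
#define BPF_F_FD_MAP_NO_UREF_FLUSH	(1U << 7)	/* added by this patch */
#endif

static int create_noflush_prog_array(unsigned int max_entries)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_PROG_ARRAY;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(__u32);	/* prog fd */
	attr.max_entries = max_entries;
	/* Requires CAP_SYS_ADMIN with this patch applied. */
	attr.map_flags   = BPF_F_FD_MAP_NO_UREF_FLUSH;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
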
---
 include/uapi/linux/bpf.h |  3 +++
 kernel/bpf/arraymap.c    | 13 ++++++++++---
 kernel/bpf/syscall.c     |  9 ++++++++-
 net/core/sock_map.c      |  3 ++-
 4 files changed, 23 insertions(+), 5 deletions(-)

Comments

Daniel Borkmann Jan. 24, 2019, 11:17 a.m. UTC | #1
On 01/21/2019 09:10 PM, Jakub Kicinski wrote:
> Commit c9da161c6517 ("bpf: fix clearing on persistent program array maps")
> added protection against dependency loops between programs and maps leading
> to zombie objects that would never be freed.  FD maps are flushed when the
> last user reference is gone.
> 
> This is confusing to users of bpftool, as updates to tail call maps have
> no effect unless those maps are pinned (or held open by another entity).

It would probably make sense to only allow updates via the pinned map path
in the case of a tail call map (and e.g. not by id)?
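
For context, the pinned-path flow is roughly the following (a sketch using
libbpf's low-level wrappers with a made-up pin path; the bpffs pin is what
holds the uref, so the update is not flushed away afterwards):

#include <unistd.h>
#include <bpf/bpf.h>

static int update_tail_call_slot(__u32 slot, int prog_fd)
{
	int map_fd, err;

	/* The pin keeps a user reference on the map alive. */
	map_fd = bpf_obj_get("/sys/fs/bpf/tc/globals/jmp_table");
	if (map_fd < 0)
		return map_fd;

	err = bpf_map_update_elem(map_fd, &slot, &prog_fd, BPF_ANY);
	close(map_fd);
	return err;
}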

> Today, flushing should not be a requirement for a privileged user, as root
> has access to the GET_FD_BY_ID commands and can therefore manually flush
> the maps.  Add a flag to opt out of automatic flushing.
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
> ---
> Hi!
> 
> I wonder what the reactions to this (untested) patch would be?  I don't
> really care strongly, but perhaps others experienced a similar issue?

My main worry with this is that it might lead to hard-to-debug memory
consumption issues: for example, once apps start to make wider use of
this flag (and, even worse, load maps with BPF progs holding circular
references), those maps won't be cleaned up on detach and will keep
holding a potentially huge amount of kernel memory. It's true that
similar issues can happen when a long-running app leaks a tail call
map fd or the map is a stale leftover node in bpf fs, but in those
cases user space still has better tooling for tracing back the
correlation (e.g. fds, file access to the pinned entry), and there is
a guaranteed flush once the urefs are fully dropped. Without that,
scanning through the GET_FD_BY_ID range to find a stale tail call map,
determine its potential origin, and break up the dependencies seems
much harder (or at least the proper tooling for it is missing today).
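
For illustration, the kind of GET_FD_BY_ID scan meant above would be roughly
the following (a root-only sketch using libbpf's low-level wrappers; it can
enumerate prog array maps, but it still cannot tell which of them are stale
or who left them behind):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <linux/bpf.h>

static void scan_prog_array_maps(void)
{
	struct bpf_map_info info;
	__u32 id = 0, len;
	int fd;

	/* Walk all map ids currently known to the kernel. */
	while (!bpf_map_get_next_id(id, &id)) {
		fd = bpf_map_get_fd_by_id(id);
		if (fd < 0)
			continue;

		memset(&info, 0, sizeof(info));
		len = sizeof(info);
		if (!bpf_obj_get_info_by_fd(fd, &info, &len) &&
		    info.type == BPF_MAP_TYPE_PROG_ARRAY)
			printf("prog array map id %u (%s)\n",
			       info.id, info.name);
		close(fd);
	}
}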

> ---
>  include/uapi/linux/bpf.h |  3 +++
>  kernel/bpf/arraymap.c    | 13 ++++++++++---
>  kernel/bpf/syscall.c     |  9 ++++++++-
>  net/core/sock_map.c      |  3 ++-
>  4 files changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 91c43884f295..771f5cd957a7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -291,6 +291,9 @@ enum bpf_attach_type {
>  /* Zero-initialize hash function seed. This should only be used for testing. */
>  #define BPF_F_ZERO_SEED		(1U << 6)
>  
> +/* Don't flush fd map when last reference is gone */
> +#define BPF_F_FD_MAP_NO_UREF_FLUSH	(1U << 7)
> +
>  /* flags for BPF_PROG_QUERY */
>  #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
>  
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index 25632a75d630..ca8a4504c612 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -23,6 +23,8 @@
>  
>  #define ARRAY_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
> +#define FD_ARRAY_CREATE_FLAG_MASK \
> +	(ARRAY_CREATE_FLAG_MASK | BPF_F_FD_MAP_NO_UREF_FLUSH)
>  
>  static void bpf_array_free_percpu(struct bpf_array *array)
>  {
> @@ -54,7 +56,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
>  }
>  
>  /* Called from syscall */
> -int array_map_alloc_check(union bpf_attr *attr)
> +static int __array_map_alloc_check(union bpf_attr *attr, u32 allowed_flags)
>  {
>  	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
>  	int numa_node = bpf_map_attr_numa_node(attr);
> @@ -62,7 +64,7 @@ int array_map_alloc_check(union bpf_attr *attr)
>  	/* check sanity of attributes */
>  	if (attr->max_entries == 0 || attr->key_size != 4 ||
>  	    attr->value_size == 0 ||
> -	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
> +	    attr->map_flags & ~allowed_flags ||
>  	    (percpu && numa_node != NUMA_NO_NODE))
>  		return -EINVAL;
>  
> @@ -75,6 +77,11 @@ int array_map_alloc_check(union bpf_attr *attr)
>  	return 0;
>  }
>  
> +int array_map_alloc_check(union bpf_attr *attr)
> +{
> +	return __array_map_alloc_check(attr, ARRAY_CREATE_FLAG_MASK);
> +}
> +
>  static struct bpf_map *array_map_alloc(union bpf_attr *attr)
>  {
>  	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
> @@ -431,7 +438,7 @@ static int fd_array_map_alloc_check(union bpf_attr *attr)
>  	/* only file descriptors can be stored in this type of map */
>  	if (attr->value_size != sizeof(u32))
>  		return -EINVAL;
> -	return array_map_alloc_check(attr);
> +	return __array_map_alloc_check(attr, FD_ARRAY_CREATE_FLAG_MASK);
>  }
>  
>  static void fd_array_map_free(struct bpf_map *map)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index b155cd17c1bd..42f10766f62e 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -123,6 +123,12 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
>  		err = ops->map_alloc_check(attr);
>  		if (err)
>  			return ERR_PTR(err);
> +		/* Unprivileged users can't GET_FD_BY_ID so they always
> +		 * need the flush protection.
> +		 */
> +		if ((attr->map_flags & BPF_F_FD_MAP_NO_UREF_FLUSH) &&
> +		    !capable(CAP_SYS_ADMIN))
> +			return ERR_PTR(-EPERM);
>  	}
>  	if (attr->map_ifindex)
>  		ops = &bpf_map_offload_ops;
> @@ -293,7 +299,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
>  static void bpf_map_put_uref(struct bpf_map *map)
>  {
>  	if (atomic_dec_and_test(&map->usercnt)) {
> -		if (map->ops->map_release_uref)
> +		if (map->ops->map_release_uref &&
> +		    !(map->map_flags & BPF_F_FD_MAP_NO_UREF_FLUSH))
>  			map->ops->map_release_uref(map);
>  	}
>  }
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index be6092ac69f8..ea5a16d88bec 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -19,7 +19,8 @@ struct bpf_stab {
>  };
>  
>  #define SOCK_CREATE_FLAG_MASK				\
> -	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
> +	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
> +	 BPF_F_FD_MAP_NO_UREF_FLUSH)
>  
>  static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
>  {
>
Jakub Kicinski Jan. 24, 2019, 7:28 p.m. UTC | #2
On Thu, 24 Jan 2019 12:17:27 +0100, Daniel Borkmann wrote:
> On 01/21/2019 09:10 PM, Jakub Kicinski wrote:
> > Commit c9da161c6517 ("bpf: fix clearing on persistent program array maps")
> > added protection against dependency loops between programs and maps leading
> > to zombie objects that would never be freed.  FD maps are flushed when the
> > last user reference is gone.
> > 
> > This is confusing to users of bpftool, as updates to tail call maps have
> > no effect unless those maps are pinned (or held open by another entity).
> 
> It would probably make sense to only allow updates via the pinned map path
> in the case of a tail call map (and e.g. not by id)?

Hm, yes, or at least print a warning.

> > Today, flushing should not be a requirement for a privileged user, as root
> > has access to the GET_FD_BY_ID commands and can therefore manually flush
> > the maps.  Add a flag to opt out of automatic flushing.
> > 
> > Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> > Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
> > ---
> > Hi!
> > 
> > I wonder what the reactions to this (untested) patch would be?  I don't
> > really care strongly, but perhaps others experienced a similar issue?  
> 
> My main worry with this is that it might lead to hard-to-debug memory
> consumption issues: for example, once apps start to make wider use of
> this flag (and, even worse, load maps with BPF progs holding circular
> references), those maps won't be cleaned up on detach and will keep
> holding a potentially huge amount of kernel memory. It's true that
> similar issues can happen when a long-running app leaks a tail call
> map fd or the map is a stale leftover node in bpf fs, but in those
> cases user space still has better tooling for tracing back the
> correlation (e.g. fds, file access to the pinned entry), and there is
> a guaranteed flush once the urefs are fully dropped. Without that,
> scanning through the GET_FD_BY_ID range to find a stale tail call map,
> determine its potential origin, and break up the dependencies seems
> much harder (or at least the proper tooling for it is missing today).

Makes sense, I'll toss the patch with a clear conscience :)

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91c43884f295..771f5cd957a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -291,6 +291,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED		(1U << 6)
 
+/* Don't flush fd map when last reference is gone */
+#define BPF_F_FD_MAP_NO_UREF_FLUSH	(1U << 7)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..ca8a4504c612 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -23,6 +23,8 @@
 
 #define ARRAY_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define FD_ARRAY_CREATE_FLAG_MASK \
+	(ARRAY_CREATE_FLAG_MASK | BPF_F_FD_MAP_NO_UREF_FLUSH)
 
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
@@ -54,7 +56,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-int array_map_alloc_check(union bpf_attr *attr)
+static int __array_map_alloc_check(union bpf_attr *attr, u32 allowed_flags)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
@@ -62,7 +64,7 @@ int array_map_alloc_check(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size == 0 ||
-	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
+	    attr->map_flags & ~allowed_flags ||
 	    (percpu && numa_node != NUMA_NO_NODE))
 		return -EINVAL;
 
@@ -75,6 +77,11 @@ int array_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }
 
+int array_map_alloc_check(union bpf_attr *attr)
+{
+	return __array_map_alloc_check(attr, ARRAY_CREATE_FLAG_MASK);
+}
+
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
@@ -431,7 +438,7 @@ static int fd_array_map_alloc_check(union bpf_attr *attr)
 	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
 		return -EINVAL;
-	return array_map_alloc_check(attr);
+	return __array_map_alloc_check(attr, FD_ARRAY_CREATE_FLAG_MASK);
 }
 
 static void fd_array_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..42f10766f62e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -123,6 +123,12 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 		err = ops->map_alloc_check(attr);
 		if (err)
 			return ERR_PTR(err);
+		/* Unprivileged users can't GET_FD_BY_ID so they always
+		 * need the flush protection.
+		 */
+		if ((attr->map_flags & BPF_F_FD_MAP_NO_UREF_FLUSH) &&
+		    !capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
 	}
 	if (attr->map_ifindex)
 		ops = &bpf_map_offload_ops;
@@ -293,7 +299,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
 static void bpf_map_put_uref(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->usercnt)) {
-		if (map->ops->map_release_uref)
+		if (map->ops->map_release_uref &&
+		    !(map->map_flags & BPF_F_FD_MAP_NO_UREF_FLUSH))
 			map->ops->map_release_uref(map);
 	}
 }
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..ea5a16d88bec 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -19,7 +19,8 @@ struct bpf_stab {
 };
 
 #define SOCK_CREATE_FLAG_MASK				\
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+	 BPF_F_FD_MAP_NO_UREF_FLUSH)
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 {