diff mbox series

[v3] Add BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES bpf(2) command

Message ID 20180729205835.34850-1-dancol@google.com
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series [v3] Add BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES bpf(2) command | expand

Commit Message

Daniel Colascione July 29, 2018, 8:58 p.m. UTC
BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all
references to maps active at the instant the
BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is
issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the
expiration of map references obtained by BPF programs from other maps.

The purpose of this command is to provide a means for userspace to
replace a BPF map with another, newer version, then ensure that no
component is still using the "old" map before manipulating the "old"
map in some way.

Signed-off-by: Daniel Colascione <dancol@google.com>
---
 include/uapi/linux/bpf.h | 14 ++++++++++++++
 kernel/bpf/syscall.c     | 13 +++++++++++++
 2 files changed, 27 insertions(+)

Comments

Daniel Borkmann July 30, 2018, 10:04 a.m. UTC | #1
On 07/29/2018 10:58 PM, Daniel Colascione wrote:
> BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all
> references to maps active at the instant the
> BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is
> issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the
> expiration of map references obtained by BPF programs from other maps.
> 
> The purpose of this command is to provide a means for userspace to
> replace a BPF map with another, newer version, then ensure that no
> component is still using the "old" map before manipulating the "old"
> map in some way.
> 
> Signed-off-by: Daniel Colascione <dancol@google.com>
> ---
>  include/uapi/linux/bpf.h | 14 ++++++++++++++
>  kernel/bpf/syscall.c     | 13 +++++++++++++
>  2 files changed, 27 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index b7db3261c62d..ca3cfca76edc 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -75,6 +75,19 @@ struct bpf_lpm_trie_key {
>  	__u8	data[0];	/* Arbitrary size */
>  };
>  
> +/* BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all
> + * references to maps active at the instant the
> + * BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is
> + * issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the
> + * expiration of map references obtained by BPF programs from
> + * other maps.
> + *
> + * The purpose of this command is to provide a means for userspace to
> + * replace a BPF map with another, newer version, then ensure that no
> + * component is still using the "old" map before manipulating the
> + * "old" map in some way.
> + */
> +
>  /* BPF syscall commands, see bpf(2) man-page for details. */
>  enum bpf_cmd {
>  	BPF_MAP_CREATE,
> @@ -98,6 +111,7 @@ enum bpf_cmd {
>  	BPF_BTF_LOAD,
>  	BPF_BTF_GET_FD_BY_ID,
>  	BPF_TASK_FD_QUERY,
> +	BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES,
>  };
>  
>  enum bpf_map_type {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a31a1ba0f8ea..bc9a0713f47d 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2274,6 +2274,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>  	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
>  		return -EPERM;
>  
> +	if (cmd == BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES) {
> +		if (uattr != NULL || size != 0)
> +			return -EINVAL;
> +		err = security_bpf(cmd, NULL, 0);
> +		if (err < 0)
> +			return err;
> +		/* BPF programs always enter a critical section while
> +		 * they have a map reference outstanding.
> +		 */
> +		synchronize_rcu();
> +		return 0;
> +	}
> +
>  	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
>  	if (err)
>  		return err;
> 

Hmm, I don't think such UAPI as above is future-proof. In case we would want
a similar mechanism in future for other maps, we would need a whole new bpf
command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
the underlying map may not even be a map-to-map. Additionally, we don't have
any map object at hand in the above, so we couldn't make any finer grained
decisions either. Something like below would be more suitable and leaves room
for extending this further in future.

Thanks,
Daniel

From 8dfea71b73fa0d402633b76f78c106e82a7a5007 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 30 Jul 2018 11:47:37 +0200
Subject: [PATCH] sync map refs

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/arraymap.c    |  1 +
 kernel/bpf/hashtab.c     |  1 +
 kernel/bpf/map_in_map.c  |  6 ++++++
 kernel/bpf/map_in_map.h  |  1 +
 kernel/bpf/syscall.c     | 24 ++++++++++++++++++++++++
 7 files changed, 35 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b5ad95..7b51f86 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -34,6 +34,7 @@ struct bpf_map_ops {
 	void (*map_free)(struct bpf_map *map);
 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
 	void (*map_release_uref)(struct bpf_map *map);
+	int (*map_sync_refs)(struct bpf_map *map);

 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8701139..e6ec1de 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -98,6 +98,7 @@ enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_MAP_SYNC_REFS,
 };

 enum bpf_map_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 544e58f..ddaf42a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -748,5 +748,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+	.map_sync_refs = bpf_map_sync_refs,
 	.map_gen_lookup = array_of_map_gen_lookup,
 };
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 513d9df..05380ea 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1407,5 +1407,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+	.map_sync_refs = bpf_map_sync_refs,
 	.map_gen_lookup = htab_of_map_gen_lookup,
 };
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 1da5746..698a50f 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -96,6 +96,12 @@ void bpf_map_fd_put_ptr(void *ptr)
 	bpf_map_put(ptr);
 }

+int bpf_map_sync_refs(struct bpf_map *map)
+{
+	synchronize_rcu();
+	return 0;
+}
+
 u32 bpf_map_fd_sys_lookup_elem(void *ptr)
 {
 	return ((struct bpf_map *)ptr)->id;
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 6183db9..ac02456 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,6 +19,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
 void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
 			 int ufd);
 void bpf_map_fd_put_ptr(void *ptr);
+int bpf_map_sync_refs(struct bpf_map *map);
 u32 bpf_map_fd_sys_lookup_elem(void *ptr);

 #endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba..b1286cc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -896,6 +896,27 @@ static int map_get_next_key(union bpf_attr *attr)
 	return err;
 }

+#define BPF_MAP_SYNC_REFS_LAST_FIELD map_fd
+
+static int map_sync_refs(union bpf_attr *attr)
+{
+	int err = -ENOTSUPP, ufd = attr->map_fd;
+	struct bpf_map *map;
+	struct fd f;
+
+	if (CHECK_ATTR(BPF_MAP_SYNC_REFS))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	if (map->ops->map_sync_refs)
+		err = map->ops->map_sync_refs(map);
+	fdput(f);
+	return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
 	[_id] = & _name ## _prog_ops,
@@ -2303,6 +2324,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_GET_NEXT_KEY:
 		err = map_get_next_key(&attr);
 		break;
+	case BPF_MAP_SYNC_REFS:
+		err = map_sync_refs(&attr);
+		break;
 	case BPF_PROG_LOAD:
 		err = bpf_prog_load(&attr);
 		break;
Daniel Colascione July 30, 2018, 10:25 a.m. UTC | #2
On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> Hmm, I don't think such UAPI as above is future-proof. In case we would want
> a similar mechanism in future for other maps, we would need a whole new bpf
> command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
> the underlying map may not even be a map-to-map. Additionally, we don't have
> any map object at hand in the above, so we couldn't make any finer grained
> decisions either. Something like below would be more suitable and leaves room
> for extending this further in future.

YAGNI. Your proposed mechanism doesn't add anything under the current
implementation. It's also not clear how a map-specific synchronization
command is supposed to work in cases where we swap multiple map
references. Do we synchronize_rcu multiple times? Why would we impose
that inefficiency just for the sake of some non-specific future
extensibility? Add some kind of batching layer? The current approach
works for the anticipated use cases.

While my preference is not to talk about map-to-maps at all in the
user API and instead spec the thing as talking about map references in
general, I'd rather have something that talks about
references-to-maps-acquired-from-maps than this interface.
Jakub Kicinski July 31, 2018, 12:26 a.m. UTC | #3
On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote:
> On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > Hmm, I don't think such UAPI as above is future-proof. In case we would want
> > a similar mechanism in future for other maps, we would need a whole new bpf
> > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
> > the underlying map may not even be a map-to-map. Additionally, we don't have
> > any map object at hand in the above, so we couldn't make any finer grained
> > decisions either. Something like below would be more suitable and leaves room
> > for extending this further in future.  
> 
> YAGNI.  Your proposed mechanism doesn't add anything under the current
> implementation. 

FWIW in case of HW offload targeting a particular map may allow users
to avoid a potentially slow sync with all the devices on the system.

> It's also not clear how a map-specific synchronization
> command is supposed to work in cases where we swap multiple map
> references. Do we synchronize_rcu multiple times? Why would we impose
> that inefficiency just for the sake of some non-specific future
> extensibility? Add some kind of batching layer? The current approach
> works for the anticipated use cases.
> 
> While my preference is not to talk about map-to-maps at all in the
> user API and instead spec the thing as talking about map references in
> general, I'd rather have something that talks about
> references-to-maps-acquired-from-maps than this interface.
Daniel Colascione July 31, 2018, 12:33 a.m. UTC | #4
On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote:
> > On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > > Hmm, I don't think such UAPI as above is future-proof. In case we would want
> > > a similar mechanism in future for other maps, we would need a whole new bpf
> > > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
> > > the underlying map may not even be a map-to-map. Additionally, we don't have
> > > any map object at hand in the above, so we couldn't make any finer grained
> > > decisions either. Something like below would be more suitable and leaves room
> > > for extending this further in future.
> >
> > YAGNI.  Your proposed mechanism doesn't add anything under the current
> > implementation.
>
> FWIW in case of HW offload targeting a particular map may allow users
> to avoid a potentially slow sync with all the devices on the system.


Sure. But such a thing doesn't exist right now (right?), and we can
add that more-efficient-in-that-one-case BPF interface when it lands.
I'd rather keep things simple for now.
Jakub Kicinski July 31, 2018, 12:45 a.m. UTC | #5
On Mon, 30 Jul 2018 17:33:39 -0700, Daniel Colascione wrote:
> On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski wrote:
> > On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote:  
> > > On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:  
> > > > Hmm, I don't think such UAPI as above is future-proof. In case we would want
> > > > a similar mechanism in future for other maps, we would need a whole new bpf
> > > > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
> > > > the underlying map may not even be a map-to-map. Additionally, we don't have
> > > > any map object at hand in the above, so we couldn't make any finer grained
> > > > decisions either. Something like below would be more suitable and leaves room
> > > > for extending this further in future.  
> > >
> > > YAGNI.  Your proposed mechanism doesn't add anything under the current
> > > implementation.  
> >
> > FWIW in case of HW offload targeting a particular map may allow users
> > to avoid a potentially slow sync with all the devices on the system.  
> 
> Sure. But such a thing doesn't exist right now (right?), and we can
> add that more-efficient-in-that-one-case BPF interface when it lands.
> I'd rather keep things simple for now.

You mean map-in-map offload doesn't exist today?  True, but it's on the
roadmap for Netronome.

Tangentially it would be really useful for us to have a "the map has
actually been freed" notification/barrier.  We have complaints of users
creating huge maps and then trying to free and recreate them quickly,
and the creation part failing with -ENOMEM, because the free from a
workqueue didn't run, yet :(  It'd probably require a different API to
solve than what's discussed here, but since we're talking about syncing
things I thought I'd put it out there...
Daniel Colascione July 31, 2018, 12:50 a.m. UTC | #6
On Mon, Jul 30, 2018 at 5:45 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> On Mon, 30 Jul 2018 17:33:39 -0700, Daniel Colascione wrote:
>> On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski wrote:
>> > On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote:
>> > > On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> > > > Hmm, I don't think such UAPI as above is future-proof. In case we would want
>> > > > a similar mechanism in future for other maps, we would need a whole new bpf
>> > > > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
>> > > > the underlying map may not even be a map-to-map. Additionally, we don't have
>> > > > any map object at hand in the above, so we couldn't make any finer grained
>> > > > decisions either. Something like below would be more suitable and leaves room
>> > > > for extending this further in future.
>> > >
>> > > YAGNI.  Your proposed mechanism doesn't add anything under the current
>> > > implementation.
>> >
>> > FWIW in case of HW offload targeting a particular map may allow users
>> > to avoid a potentially slow sync with all the devices on the system.
>>
>> Sure. But such a thing doesn't exist right now (right?), and we can
>> add that more-efficient-in-that-one-case BPF interface when it lands.
>> I'd rather keep things simple for now.
>
> You mean map-in-map offload doesn't exist today?  True, but it's on the
> roadmap for Netronome.

Sounds cool. I'd still wait until map-in-map offload lands until
adding the map-specific API though.

> Tangentially it would be really useful for us to have a "the map has
> actually been freed" notification/barrier.  We have complaints of users
> creating huge maps and then trying to free and recreate them quickly,
> and the creation part failing with -ENOMEM, because the free from a
> workqueue didn't run, yet :(  It'd probably require a different API to
> solve than what's discussed here, but since we're talking about syncing
> things I thought I'd put it out there...

Yeah. What you're talking about is what I meant upthread when I
briefly mentioned a "refcount draining approach" --- you'd block until
all references except your own to a map disappeared.
Jakub Kicinski July 31, 2018, 1:14 a.m. UTC | #7
On Mon, 30 Jul 2018 17:50:15 -0700, Daniel Colascione wrote:
> > Tangentially it would be really useful for us to have a "the map has
> > actually been freed" notification/barrier.  We have complaints of users
> > creating huge maps and then trying to free and recreate them quickly,
> > and the creation part failing with -ENOMEM, because the free from a
> > workqueue didn't run, yet :(  It'd probably require a different API to
> > solve than what's discussed here, but since we're talking about syncing
> > things I thought I'd put it out there...  
> 
> Yeah. What you're talking about is what I meant upthread when I
> briefly mentioned a "refcount draining approach" --- you'd block until
> all references except your own to a map disappeared.

I don't think so.  The ref count goes to 0, work gets scheduled to call
free, work runs, free gets called, device memory gets freed.  Now the
memory can be reused.  As long as there are any refs we can't free
memory, unless we want to make the semantics really ugly.

But as I said, only superficially related to this patch.  Perhaps
solution will naturally fall out of an API to query the device
capabilities and resources, if we ever get there.
Daniel Borkmann July 31, 2018, 8:34 a.m. UTC | #8
On 07/31/2018 02:33 AM, Daniel Colascione wrote:
> On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski
> <jakub.kicinski@netronome.com> wrote:
>> On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote:
>>> On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> Hmm, I don't think such UAPI as above is future-proof. In case we would want
>>>> a similar mechanism in future for other maps, we would need a whole new bpf
>>>> command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
>>>> the underlying map may not even be a map-to-map. Additionally, we don't have
>>>> any map object at hand in the above, so we couldn't make any finer grained
>>>> decisions either. Something like below would be more suitable and leaves room
>>>> for extending this further in future.
>>>
>>> YAGNI.  Your proposed mechanism doesn't add anything under the current
>>> implementation.
>>
>> FWIW in case of HW offload targeting a particular map may allow users
>> to avoid a potentially slow sync with all the devices on the system.
> 
> Sure. But such a thing doesn't exist right now (right?), and we can
> add that more-efficient-in-that-one-case BPF interface when it lands.
> I'd rather keep things simple for now.

I don't see a reason why that is even more complicated. An API command name
such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and
exposes specific map details (here: map-in-map) into the UAPI whereas it
should reside within a specific implementation instead similar to other ops
we have for maps. If in future other maps would be added that would have
similar mechanisms of inner objects they return to the BPF program, we'll
be adding yet another command just for this. Also, union bpf_attr is extensible,
e.g. additional members could be added in future whenever needed for this
subcommand instead of forcing it to NULL as done here. E.g. having a high-level
one like bpf(BPF_MAP_WAIT, { .map_fd = fd }) could later on also cover the
use-case from Jakub by adding an 'event' member into union bpf_attr where we
could add other map specific wakeups for user space while the current default
of 0 would be BPF_MAP_WAIT_PENDING_REFS (or the like). All I'm saying is to
keep it generic so it can be extended later.
Daniel Colascione July 31, 2018, 9:36 a.m. UTC | #9
On Tue, Jul 31, 2018 at 1:34 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 07/31/2018 02:33 AM, Daniel Colascione wrote:
>> On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski
>> <jakub.kicinski@netronome.com> wrote:
>>> On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote:
>>>> On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>> Hmm, I don't think such UAPI as above is future-proof. In case we would want
>>>>> a similar mechanism in future for other maps, we would need a whole new bpf
>>>>> command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though
>>>>> the underlying map may not even be a map-to-map. Additionally, we don't have
>>>>> any map object at hand in the above, so we couldn't make any finer grained
>>>>> decisions either. Something like below would be more suitable and leaves room
>>>>> for extending this further in future.
>>>>
>>>> YAGNI.  Your proposed mechanism doesn't add anything under the current
>>>> implementation.
>>>
>>> FWIW in case of HW offload targeting a particular map may allow users
>>> to avoid a potentially slow sync with all the devices on the system.
>>
>> Sure. But such a thing doesn't exist right now (right?), and we can
>> add that more-efficient-in-that-one-case BPF interface when it lands.
>> I'd rather keep things simple for now.
>
> I don't see a reason why that is even more complicated.

Both the API and the implementation are much more complicated in the
per-map ops version: just look at the patch size. The size argument
isn't necessarily a dealbreaker, but I still don't see what the extra
code size and complexity is buying.

> An API command name
> such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and
> exposes specific map details (here: map-in-map) into the UAPI whereas it
> should reside within a specific implementation instead similar to other ops
> we have for maps.

But synchronize isn't conceptually a command that applies to a
specific map. It waits on all references. Did you address my point
about your proposed map-specific interface requiring redundant
synchronize_rcu calls in the case where we swap multiple maps and want
to wait for all the references to drain? Under my proposal, you'd just
BPF_SYNCHRONIZE_WHATEVER and call schedule_rcu once. Under your
proposal, we'd make it a per-map operation, so we'd issue one
synchronize_rcu per map.

> If in future other maps would be added that would have
> similar mechanisms of inner objects they return to the BPF program, we'll
> be adding yet another command just for this.

And that's why my personal preference is to just calling this thing
BPF_SYNCHRONIZE, which I'd define to wait for all such "inner
objects". Alexei is the one who asked for the very specific naming, I
believe.

Anyway, we have a very simple patch that we could apply today. It
addresses a real need, and it doesn't preclude adding something more
specific later, when we know we need it. Besides, it's not as if
adding a BPF command is particularly expensive.

> Also, union bpf_attr is extensible,
> e.g. additional members could be added in future whenever needed for this
> subcommand instead of forcing it to NULL as done here.

We fail with EINVAL when attr != NULL now, which means that we can
safely accept a non-NULL attr-based subcommand later without breaking
anyone. The interface is already extensible.

> All I'm saying is to
> keep it generic so it can be extended later.

Sure, but no more extensible than it has to be. Prematurely-added
extension points tend to cause trouble later.
Alexei Starovoitov Aug. 10, 2018, 10:29 p.m. UTC | #10
On Thu, Aug 09, 2018 at 10:17:50AM -0700, Daniel Colascione wrote:
> Ping.

sorry for delay. have been off grid. let's continue in the other thread for full context
Alexei Starovoitov Aug. 10, 2018, 10:52 p.m. UTC | #11
On Tue, Jul 31, 2018 at 02:36:39AM -0700, Daniel Colascione wrote:
> 
> > An API command name
> > such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and
> > exposes specific map details (here: map-in-map) into the UAPI whereas it
> > should reside within a specific implementation instead similar to other ops
> > we have for maps.
> 
> But synchronize isn't conceptually a command that applies to a
> specific map. It waits on all references. Did you address my point
> about your proposed map-specific interface requiring redundant
> synchronize_rcu calls in the case where we swap multiple maps and want
> to wait for all the references to drain? Under my proposal, you'd just
> BPF_SYNCHRONIZE_WHATEVER and call schedule_rcu once. Under your
> proposal, we'd make it a per-map operation, so we'd issue one
> synchronize_rcu per map.

optimizing for multi-map sync sounds like premature optimization.
In general I'd prefer DanielB proposal to make the sync logic map and fd specific,
but before we argue about implementation further let's agree on
the problem we're trying to solve.
I believe the only issue being discussed is user space doesn't know
when it's ok to start draining the inner map when it was replaced
by bpf_map_update syscall command with another map, right?
If we agree on that, should bpf_map_update handle it then?
Wouldn't it be much easier to understand and use from user pov?
No new commands to learn. map_update syscall replaced the map
and old map is no longer accessed by the program via this given map-in-map.
But if the replaced map is used directly or it sits in some other
map-in-map slot the progs can still access it.

My issue with DanielC SYNC cmd that it exposes implementation details
and introduces complex 'synchronization' semantics. To majority of
the users it won't be obvious what is being synchronized.

My issue with DanielB WAIT_REF map_fd cmd that it needs to wait for all refs
to this map to be dropped. I think combination of usercnt and refcnt
can answer that, but feels dangerous to sleep potentially forever
in a syscall until all prog->map references are gone, though such
cmd is useful beyond map-in-map use case.
Daniel Colascione Aug. 14, 2018, 8:37 p.m. UTC | #12
On Fri, Aug 10, 2018 at 3:52 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Tue, Jul 31, 2018 at 02:36:39AM -0700, Daniel Colascione wrote:
>>
>> > An API command name
>> > such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and
>> > exposes specific map details (here: map-in-map) into the UAPI whereas it
>> > should reside within a specific implementation instead similar to other ops
>> > we have for maps.
>>
>> But synchronize isn't conceptually a command that applies to a
>> specific map. It waits on all references. Did you address my point
>> about your proposed map-specific interface requiring redundant
>> synchronize_rcu calls in the case where we swap multiple maps and want
>> to wait for all the references to drain? Under my proposal, you'd just
>> BPF_SYNCHRONIZE_WHATEVER and call schedule_rcu once. Under your
>> proposal, we'd make it a per-map operation, so we'd issue one
>> synchronize_rcu per map.
>
> optimizing for multi-map sync sounds like premature optimization.

Maybe, but the per-map proposal is less efficient *and* more
complicated! I don't want to spend more code just to go slower.

> I believe the only issue being discussed is user space doesn't know
> when it's ok to start draining the inner map when it was replaced
> by bpf_map_update syscall command with another map, right?

Yes.

> If we agree on that, should bpf_map_update handle it then?
> Wouldn't it be much easier to understand and use from user pov?
> No new commands to learn. map_update syscall replaced the map
> and old map is no longer accessed by the program via this given map-in-map.

Maybe with a new BPF_SYNCHRONIZE flag for BPF_MAP_UPDATE_ELEM and
BPF_MAP_DELETE_ELEM. Otherwise, it seems wrong to make every user of
these commands pay for synchronization that only a few will need.

> But if the replaced map is used directly or it sits in some other
> map-in-map slot the progs can still access it.
>
> My issue with DanielC SYNC cmd that it exposes implementation details

What implementation details? The command semantics are defined
entirely in terms of existing user-visible primitives.

> and introduces complex 'synchronization' semantics. To majority of
> the users it won't be obvious what is being synchronized.
>
> My issue with DanielB WAIT_REF map_fd cmd that it needs to wait for all refs
> to this map to be dropped. I think combination of usercnt and refcnt
> can answer that, but feels dangerous to sleep potentially forever
> in a syscall until all prog->map references are gone, though such
> cmd is useful beyond map-in-map use case.

In what scenarios?

In any case, can we submit _something_?
Alexei Starovoitov Aug. 16, 2018, 4:01 a.m. UTC | #13
On Tue, Aug 14, 2018 at 01:37:12PM -0700, Daniel Colascione wrote:
> 
> > If we agree on that, should bpf_map_update handle it then?
> > Wouldn't it be much easier to understand and use from user pov?
> > No new commands to learn. map_update syscall replaced the map
> > and old map is no longer accessed by the program via this given map-in-map.
> 
> Maybe with a new BPF_SYNCHRONIZE flag for BPF_MAP_UPDATE_ELEM and
> BPF_MAP_DELETE_ELEM. Otherwise, it seems wrong to make every user of
> these commands pay for synchronization that only a few will need.

I don't think extra flag is needed. Extra sync_rcu() for map-in-map
is useful for all users. I would consider it a bugfix,
since users that examine deleted map have this race today
and removing the race is always a good thing especially since the cost
is small.
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b7db3261c62d..ca3cfca76edc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -75,6 +75,19 @@  struct bpf_lpm_trie_key {
 	__u8	data[0];	/* Arbitrary size */
 };
 
+/* BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all
+ * references to maps active at the instant the
+ * BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is
+ * issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the
+ * expiration of map references obtained by BPF programs from
+ * other maps.
+ *
+ * The purpose of this command is to provide a means for userspace to
+ * replace a BPF map with another, newer version, then ensure that no
+ * component is still using the "old" map before manipulating the
+ * "old" map in some way.
+ */
+
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
@@ -98,6 +111,7 @@  enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES,
 };
 
 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba0f8ea..bc9a0713f47d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2274,6 +2274,19 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (cmd == BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES) {
+		if (uattr != NULL || size != 0)
+			return -EINVAL;
+		err = security_bpf(cmd, NULL, 0);
+		if (err < 0)
+			return err;
+		/* BPF programs always enter a critical section while
+		 * they have a map reference outstanding.
+		 */
+		synchronize_rcu();
+		return 0;
+	}
+
 	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
 	if (err)
 		return err;