diff mbox

[v12,net-next,03/11] bpf: add lookup/update/delete/iterate methods to BPF maps

Message ID 1410808721-27493-4-git-send-email-ast@plumgrid.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Alexei Starovoitov Sept. 15, 2014, 7:18 p.m. UTC
'maps' is a generic storage of different types for sharing data between kernel
and userspace.

The maps are accessed from user space via BPF syscall, which has commands:

- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error

- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error

- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error

- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key

- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key

- close(fd) deletes the map

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/linux/bpf.h      |    8 ++
 include/uapi/linux/bpf.h |   38 ++++++++
 kernel/bpf/syscall.c     |  229 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 275 insertions(+)

Comments

David Miller Sept. 16, 2014, 7:16 p.m. UTC | #1
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 15 Sep 2014 12:18:33 -0700

> @@ -83,6 +112,15 @@ union bpf_attr {
>  		__u32	value_size;	/* size of value in bytes */
>  		__u32	max_entries;	/* max number of entries in a map */
>  	};
> +
> +	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
> +		int map_fd;
> +		void __user *key;
> +		union {
> +			void __user *value;
> +			void __user *next_key;
> +		};
> +	};
>  };
>  
>  #endif /* _UAPI__LINUX_BPF_H__ */

Depending upon the processor ABI, this change can increase the
alignment requirements of union bpf_attr.  So the structure is not
compatible between patch #1 and patch #3 here.

Also, you haven't implemented any compat layer whatsoever for the
necessary translations.  This happens because you are using pointers
which are different sized between 32-bit and 64-bit ABIs.

I would suggest you use instead something like "aligned_u64" since
these are just arbitrary userland cookies and using "aligned_u64"
vs. "u64" will make it so that you don't have to deal with the 64-bit
type alignment differences between x86-32 and x86-64 while writing the
compat wrappers (if any).
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 16, 2014, 8:44 p.m. UTC | #2
On Tue, Sep 16, 2014 at 12:16 PM, David Miller <davem@davemloft.net> wrote:
> From: Alexei Starovoitov <ast@plumgrid.com>
> Date: Mon, 15 Sep 2014 12:18:33 -0700
>
>> @@ -83,6 +112,15 @@ union bpf_attr {
>>               __u32   value_size;     /* size of value in bytes */
>>               __u32   max_entries;    /* max number of entries in a map */
>>       };
>> +
>> +     struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
>> +             int map_fd;
>> +             void __user *key;
>> +             union {
>> +                     void __user *value;
>> +                     void __user *next_key;
>> +             };
>> +     };
>>  };
>>
>>  #endif /* _UAPI__LINUX_BPF_H__ */
>
> Depending upon the processor ABI, this change can increase the
> alignment requirements of union bpf_attr.  So the structure is not
> compatible between patch #1 and patch #3 here.

the union indeed changes alignment from patch #1 to #3,
but, imo, it is not a problem, since kernel does:
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(attr, uattr, size) != 0)
and then proceeds with further bpf_attr validation, so even
if user space alignment is 4 and user assumes contents
from patch 1, but kernel alignment is 8 and contents from
patch 3, it is still ok. Backwards compatibility is preserved.

> Also, you haven't implemented any compat layer whatsoever for the
> necessary translations.  This happens because you are using pointers
> which are different sized between 32-bit and 64-bit ABIs.

I mentioned it in the cover letter:
- implemented and tested compat support (not part of this set)

I'll roll compat layer into the set to make it less confusing.

> I would suggest you use instead something like "aligned_u64" since
> these are just arbitrary userland cookies and using "aligned_u64"
> vs. "u64" will make it so that you don't have to deal with the 64-bit
> type alignment differences between x86-32 and x86-64 while writing the
> compat wrappers (if any).

I haven't thought of 'aligned_u64' for this case.
For counters and masks it would be perfect, but here user is
passing real pointers to key and value, so they have to
be 'void __user *', otherwise user would need to type cast
them, which I want to avoid.

>> +     struct { /* anonymous struct used by BPF_PROG_LOAD command */
>> +             __u32 prog_type;        /* one of enum bpf_prog_type */
>> +             __u32 insn_cnt;
>> +             const struct bpf_insn __user *insns;
>> +             const char __user *license;
>> +     };
>> +};
>
> Again, these need syscall compat handling.  You may want to redesign
> the types so that perhaps no compat layer translations will be needed.

'foo __user *' pointers vs 'aligned_u64'... It's a trade off.
I can make all pointer fields to be u64 and then, indeed,
no compat layer will be needed, but user would need to
do ugly type casts. I think compat layer is a better option.
It's simple enough. I'll roll it into the set and respin.

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 16, 2014, 8:54 p.m. UTC | #3
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 16 Sep 2014 13:44:12 -0700

> the union indeed changes alignment from patch #1 to #3,
> but, imo, it is not a problem, since kernel does:

It changes the alignment of the datastructures in userspace.

> I haven't thought of 'aligned_u64' for this case.
> For counters and masks it would be perfect, but here user is
> passing real pointers to key and value, so they have to
> be 'void __user *', otherwise user would need to type cast
> them, which I want to avoid.

The cost of the compat layer must be considered and weighted
against this cast, which I think is really no big deal.

> I think compat layer is a better option.

It's overhead you'll have to support forever, I think you should
reconsider.

All of the "ugly casting" will be hidden, or can be hidden, in the
syscall wrappers and/or interfaces in userspace.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 16, 2014, 9:23 p.m. UTC | #4
On Tue, Sep 16, 2014 at 1:54 PM, David Miller <davem@davemloft.net> wrote:
>
> The cost of the compat layer must be considered and weighted
> against this cast, which I think is really no big deal.
>
>> I think compat layer is a better option.
>
> It's overhead you'll have to support forever, I think you should
> reconsider.
>
> All of the "ugly casting" will be hidden, or can be hidden, in the
> syscall wrappers and/or interfaces in userspace.

ahh, ok. I thought you're strongly against any type of casts.
In such case I can get rid of 'union bpf_attr' as well and simply
define a struct per command, then syscall will look like:
sys_bpf(int cmd, void __user *attr, unsigned int size);
and uapi/linux/bpf.h will have:
struct bpf_prog_load_attr { /* for BPF_PROG_LOAD cmd */
    __u32 prog_type;
    __u32 insn_cnt;
    __aligned_u64 insns;
    __aligned_u64 license;
    __u32 log_level;
    __u32 log_size;
    __aligned_u64 log_buf;
};
and similar for other commands.
no compat layer and type checking will be done
by syscall wrappers. Ok?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 16, 2014, 9:49 p.m. UTC | #5
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 16 Sep 2014 14:23:27 -0700

> no compat layer and type checking will be done
> by syscall wrappers. Ok?

Why are you against using strong typing just for everything other
than the user pointer blobs?

I don't understand the resistence to my suggestion to just use
aligned_u64 instead of "void __user *" in the union members.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 16, 2014, 10:06 p.m. UTC | #6
On Tue, Sep 16, 2014 at 2:49 PM, David Miller <davem@davemloft.net> wrote:
> From: Alexei Starovoitov <ast@plumgrid.com>
> Date: Tue, 16 Sep 2014 14:23:27 -0700
>
>> no compat layer and type checking will be done
>> by syscall wrappers. Ok?
>
> Why are you against using strong typing just for everything other
> than the user pointer blobs?
>
> I don't understand the resistence to my suggestion to just use
> aligned_u64 instead of "void __user *" in the union members.

All these different variants are ok to me. There are pros and
cons to either approach. I'm not against strong typing.
I just thought it would be cleaner not to use 'union' and
was asking for opinion. That's all. Sure, I will keep 'union'
and only change pointers to __aligned_u64.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 48014a71f0fe..2887f3f9da59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -9,6 +9,7 @@ 
 
 #include <uapi/linux/bpf.h>
 #include <linux/workqueue.h>
+#include <linux/file.h>
 
 struct bpf_map;
 
@@ -17,6 +18,12 @@  struct bpf_map_ops {
 	/* funcs callable from userspace (via syscall) */
 	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
 	void (*map_free)(struct bpf_map *);
+	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+
+	/* funcs callable from userspace and from eBPF programs */
+	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
+	int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+	int (*map_delete_elem)(struct bpf_map *map, void *key);
 };
 
 struct bpf_map {
@@ -37,5 +44,6 @@  struct bpf_map_type_list {
 
 void bpf_register_map_type(struct bpf_map_type_list *tl);
 void bpf_map_put(struct bpf_map *map);
+struct bpf_map *bpf_map_get(struct fd f);
 
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ecc8fa48a2a4..d4ba8bb66bd4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -70,6 +70,35 @@  enum bpf_cmd {
 	 * map is deleted when fd is closed
 	 */
 	BPF_MAP_CREATE,
+
+	/* lookup key in a given map
+	 * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->value
+	 * returns zero and stores found elem into value
+	 * or negative error
+	 */
+	BPF_MAP_LOOKUP_ELEM,
+
+	/* create or update key/value pair in a given map
+	 * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->value
+	 * returns zero or negative error
+	 */
+	BPF_MAP_UPDATE_ELEM,
+
+	/* find and delete elem by key in a given map
+	 * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key
+	 * returns zero or negative error
+	 */
+	BPF_MAP_DELETE_ELEM,
+
+	/* lookup key in a given map and return next key
+	 * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->next_key
+	 * returns zero and stores next key or negative error
+	 */
+	BPF_MAP_GET_NEXT_KEY,
 };
 
 enum bpf_map_type {
@@ -83,6 +112,15 @@  union bpf_attr {
 		__u32	value_size;	/* size of value in bytes */
 		__u32	max_entries;	/* max number of entries in a map */
 	};
+
+	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+		int map_fd;
+		void __user *key;
+		union {
+			void __user *value;
+			void __user *next_key;
+		};
+	};
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 80082ec5a5fc..fdf7bd79cf76 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@ 
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
+#include <linux/file.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -111,6 +112,222 @@  free_map:
 	return err;
 }
 
+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+	struct bpf_map *map;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &bpf_map_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	map = f.file->private_data;
+
+	return map;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	void __user *uvalue = attr->value;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *value;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ESRCH;
+	rcu_read_lock();
+	value = map->ops->map_lookup_elem(map, key);
+	if (!value)
+		goto err_unlock;
+
+	err = -EFAULT;
+	if (copy_to_user(uvalue, value, map->value_size) != 0)
+		goto err_unlock;
+
+	err = 0;
+
+err_unlock:
+	rcu_read_unlock();
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+
+static int map_update_elem(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	void __user *uvalue = attr->value;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *value;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	value = kmalloc(map->value_size, GFP_USER);
+	if (!value)
+		goto free_key;
+
+	err = -EFAULT;
+	if (copy_from_user(value, uvalue, map->value_size) != 0)
+		goto free_value;
+
+	/* eBPF program that use maps are running under rcu_read_lock(),
+	 * therefore all map accessors rely on this fact, so do the same here
+	 */
+	rcu_read_lock();
+	err = map->ops->map_update_elem(map, key, value);
+	rcu_read_unlock();
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	rcu_read_lock();
+	err = map->ops->map_delete_elem(map, key);
+	rcu_read_unlock();
+
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	void __user *unext_key = attr->next_key;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *next_key;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	next_key = kmalloc(map->key_size, GFP_USER);
+	if (!next_key)
+		goto free_key;
+
+	rcu_read_lock();
+	err = map->ops->map_get_next_key(map, key, next_key);
+	rcu_read_unlock();
+	if (err)
+		goto free_next_key;
+
+	err = -EFAULT;
+	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+		goto free_next_key;
+
+	err = 0;
+
+free_next_key:
+	kfree(next_key);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr *attr;
@@ -140,6 +357,18 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_CREATE:
 		err = map_create(attr);
 		break;
+	case BPF_MAP_LOOKUP_ELEM:
+		err = map_lookup_elem(attr);
+		break;
+	case BPF_MAP_UPDATE_ELEM:
+		err = map_update_elem(attr);
+		break;
+	case BPF_MAP_DELETE_ELEM:
+		err = map_delete_elem(attr);
+		break;
+	case BPF_MAP_GET_NEXT_KEY:
+		err = map_get_next_key(attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;