[bpf-next,3/6] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall

Message ID: 153902586650.8888.7406217713259884259.stgit@kernel
State: Changes Requested, archived
Delegated to: BPF Maintainers
Series: Implement queue/stack maps

Commit Message

Mauricio Vasquez Oct. 8, 2018, 7:11 p.m. UTC
The following patch implements bpf queue/stack maps that
provide the peek/pop/push functions.  There is no direct
relationship between those functions and the existing map
syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added.
It maps to the pop operation in the queue/stack maps and is
still to be implemented in other kinds of maps.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
---
 include/linux/bpf.h      |    1 +
 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/syscall.c     |   81 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)
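
For illustration, a minimal userspace sketch of exercising the new
command through the bpf(2) syscall.  The bpf_lookup_and_delete_elem()
wrapper below is hypothetical (no libbpf helper exists for it at this
point) and assumes headers from a kernel with this patch applied:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical wrapper, not part of libbpf: pop the element stored
 * under @key into @value and delete it from the map.  @value must
 * point to a buffer of at least map->value_size bytes.
 */
static int bpf_lookup_and_delete_elem(int map_fd, const void *key, void *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)value;

	/* 0 on success; -1 with errno set otherwise, e.g. ENOENT if no
	 * element is found, or errno 524 (ENOTSUPP) if the map type does
	 * not implement the operation.
	 */
	return syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM,
		       &attr, sizeof(attr));
}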

Comments

Song Liu Oct. 9, 2018, 1:13 a.m. UTC | #1
On Mon, Oct 8, 2018 at 12:12 PM Mauricio Vasquez B
<mauricio.vasquez@polito.it> wrote:
>
> The following patch implements bpf queue/stack maps that
> provide the peek/pop/push functions.  There is no direct
> relationship between those functions and the existing map
> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added.
> It maps to the pop operation in the queue/stack maps and is
> still to be implemented in other kinds of maps.

Do we need this system call for other maps (non-stack/queue)?
If not, maybe we can just call it POP, and only implement it for
stack and queue?

>
> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
> ---
>  include/linux/bpf.h      |    1 +
>  include/uapi/linux/bpf.h |    1 +
>  kernel/bpf/syscall.c     |   81 ++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 027697b6a22f..98c7eeb6d138 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -39,6 +39,7 @@ struct bpf_map_ops {
>         void *(*map_lookup_elem)(struct bpf_map *map, void *key);
>         int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
>         int (*map_delete_elem)(struct bpf_map *map, void *key);
> +       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
>
>         /* funcs called by prog_array and perf_event_array map */
>         void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f9187b41dff6..3bb94aa2d408 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -103,6 +103,7 @@ enum bpf_cmd {
>         BPF_BTF_LOAD,
>         BPF_BTF_GET_FD_BY_ID,
>         BPF_TASK_FD_QUERY,
> +       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>  };
>
>  enum bpf_map_type {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index eb75e8af73ff..c33d9303f72f 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -975,6 +975,84 @@ static int map_get_next_key(union bpf_attr *attr)
>         return err;
>  }
>
> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
> +
> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
> +{
> +       void __user *ukey = u64_to_user_ptr(attr->key);
> +       void __user *uvalue = u64_to_user_ptr(attr->value);
> +       int ufd = attr->map_fd;
> +       struct bpf_map *map;
> +       void *key, *value, *ptr;
> +       u32 value_size;
> +       struct fd f;
> +       int err;
> +
> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
> +               return -EINVAL;
> +
> +       f = fdget(ufd);
> +       map = __bpf_map_get(f);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);
> +
> +       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
> +               err = -EPERM;
> +               goto err_put;
> +       }
> +
> +       key = __bpf_copy_key(ukey, map->key_size);
> +       if (IS_ERR(key)) {
> +               err = PTR_ERR(key);
> +               goto err_put;
> +       }
> +
> +       value_size = map->value_size;
> +
> +       err = -ENOMEM;
> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> +       if (!value)
> +               goto free_key;
> +
> +       err = -EFAULT;
> +       if (copy_from_user(value, uvalue, value_size) != 0)
> +               goto free_value;
> +
> +       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
> +        * inside bpf map update or delete otherwise deadlocks are possible
> +        */
> +       preempt_disable();
> +       __this_cpu_inc(bpf_prog_active);
> +       if (!map->ops->map_lookup_and_delete_elem) {
Why do we have this check here? Shouldn't it be checked much earlier?
If we really need it here, we at least need to add the following:

__this_cpu_dec(bpf_prog_active);
preempt_enable();

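That is, a sketch of the balanced error branch (all identifiers as in
the patch):

	if (!map->ops->map_lookup_and_delete_elem) {
		err = -ENOTSUPP;
		__this_cpu_dec(bpf_prog_active);
		preempt_enable();
		goto free_value;
	}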

> +               err = -ENOTSUPP;
> +               goto free_value;
> +       }
> +       rcu_read_lock();
> +       ptr = map->ops->map_lookup_and_delete_elem(map, key);
> +       if (ptr)
> +               memcpy(value, ptr, value_size);
> +       rcu_read_unlock();
> +       err = ptr ? 0 : -ENOENT;
> +       __this_cpu_dec(bpf_prog_active);
> +       preempt_enable();
> +
> +       if (err)
> +               goto free_value;
> +
> +       if (copy_to_user(uvalue, value, value_size) != 0)
> +               goto free_value;
> +
> +       err = 0;
> +
> +free_value:
> +       kfree(value);
> +free_key:
> +       kfree(key);
> +err_put:
> +       fdput(f);
> +       return err;
> +}
> +
>  static const struct bpf_prog_ops * const bpf_prog_types[] = {
>  #define BPF_PROG_TYPE(_id, _name) \
>         [_id] = & _name ## _prog_ops,
> @@ -2448,6 +2526,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_TASK_FD_QUERY:
>                 err = bpf_task_fd_query(&attr, uattr);
>                 break;
> +       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
> +               err = map_lookup_and_delete_elem(&attr);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;
>
Mauricio Vasquez Oct. 9, 2018, 12:56 p.m. UTC | #2
On 10/08/2018 08:13 PM, Song Liu wrote:
> On Mon, Oct 8, 2018 at 12:12 PM Mauricio Vasquez B
> <mauricio.vasquez@polito.it> wrote:
>> The following patch implements bpf queue/stack maps that
>> provide the peek/pop/push functions.  There is no direct
>> relationship between those functions and the existing map
>> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added.
>> It maps to the pop operation in the queue/stack maps and is
>> still to be implemented in other kinds of maps.
> Do we need this system call for other maps (non-stack/queue)?
> If not, maybe we can just call it POP, and only implement it for
> stack and queue?
>
Yes, this system call could also benefit other maps.  The first idea was 
to add pop/push/peek system calls as well, but then Alexei realized it 
was too specific to queue/stack maps and we decided to go ahead with 
this solution, which is more general.

>> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
>> ---
>>   include/linux/bpf.h      |    1 +
>>   include/uapi/linux/bpf.h |    1 +
>>   kernel/bpf/syscall.c     |   81 ++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 83 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 027697b6a22f..98c7eeb6d138 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -39,6 +39,7 @@ struct bpf_map_ops {
>>          void *(*map_lookup_elem)(struct bpf_map *map, void *key);
>>          int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
>>          int (*map_delete_elem)(struct bpf_map *map, void *key);
>> +       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
>>
>>          /* funcs called by prog_array and perf_event_array map */
>>          void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index f9187b41dff6..3bb94aa2d408 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -103,6 +103,7 @@ enum bpf_cmd {
>>          BPF_BTF_LOAD,
>>          BPF_BTF_GET_FD_BY_ID,
>>          BPF_TASK_FD_QUERY,
>> +       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>>   };
>>
>>   enum bpf_map_type {
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index eb75e8af73ff..c33d9303f72f 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -975,6 +975,84 @@ static int map_get_next_key(union bpf_attr *attr)
>>          return err;
>>   }
>>
>> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
>> +
>> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
>> +{
>> +       void __user *ukey = u64_to_user_ptr(attr->key);
>> +       void __user *uvalue = u64_to_user_ptr(attr->value);
>> +       int ufd = attr->map_fd;
>> +       struct bpf_map *map;
>> +       void *key, *value, *ptr;
>> +       u32 value_size;
>> +       struct fd f;
>> +       int err;
>> +
>> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
>> +               return -EINVAL;
>> +
>> +       f = fdget(ufd);
>> +       map = __bpf_map_get(f);
>> +       if (IS_ERR(map))
>> +               return PTR_ERR(map);
>> +
>> +       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
>> +               err = -EPERM;
>> +               goto err_put;
>> +       }
>> +
>> +       key = __bpf_copy_key(ukey, map->key_size);
>> +       if (IS_ERR(key)) {
>> +               err = PTR_ERR(key);
>> +               goto err_put;
>> +       }
>> +
>> +       value_size = map->value_size;
>> +
>> +       err = -ENOMEM;
>> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
>> +       if (!value)
>> +               goto free_key;
>> +
>> +       err = -EFAULT;
>> +       if (copy_from_user(value, uvalue, value_size) != 0)
>> +               goto free_value;
>> +
>> +       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
>> +        * inside bpf map update or delete otherwise deadlocks are possible
>> +        */
>> +       preempt_disable();
>> +       __this_cpu_inc(bpf_prog_active);
>> +       if (!map->ops->map_lookup_and_delete_elem) {
> Why do we have this check here? Shouldn't it be checked much earlier?
> If we really need it here, we at least need to add the following:
In this particular patch the check could be done much earlier, but the 
next patch needs it in this position, so I leave it here to avoid 
moving it around in the next patch.

> __this_cpu_dec(bpf_prog_active);
> preempt_enable();
You are right, I missed that. Will fix for next version.
>
>
>> +               err = -ENOTSUPP;
>> +               goto free_value;
>> +       }
>> +       rcu_read_lock();
>> +       ptr = map->ops->map_lookup_and_delete_elem(map, key);
>> +       if (ptr)
>> +               memcpy(value, ptr, value_size);
>> +       rcu_read_unlock();
>> +       err = ptr ? 0 : -ENOENT;
>> +       __this_cpu_dec(bpf_prog_active);
>> +       preempt_enable();
>> +
>> +       if (err)
>> +               goto free_value;
>> +
>> +       if (copy_to_user(uvalue, value, value_size) != 0)
>> +               goto free_value;
>> +
>> +       err = 0;
>> +
>> +free_value:
>> +       kfree(value);
>> +free_key:
>> +       kfree(key);
>> +err_put:
>> +       fdput(f);
>> +       return err;
>> +}
>> +
>>   static const struct bpf_prog_ops * const bpf_prog_types[] = {
>>   #define BPF_PROG_TYPE(_id, _name) \
>>          [_id] = & _name ## _prog_ops,
>> @@ -2448,6 +2526,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>>          case BPF_TASK_FD_QUERY:
>>                  err = bpf_task_fd_query(&attr, uattr);
>>                  break;
>> +       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
>> +               err = map_lookup_and_delete_elem(&attr);
>> +               break;
>>          default:
>>                  err = -EINVAL;
>>                  break;
>>
Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 027697b6a22f..98c7eeb6d138 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@  struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+	void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
 
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..3bb94aa2d408 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@  enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb75e8af73ff..c33d9303f72f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -975,6 +975,84 @@  static int map_get_next_key(union bpf_attr *attr)
 	return err;
 }
 
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_user_ptr(attr->key);
+	void __user *uvalue = u64_to_user_ptr(attr->value);
+	int ufd = attr->map_fd;
+	struct bpf_map *map;
+	void *key, *value, *ptr;
+	u32 value_size;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	key = __bpf_copy_key(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto err_put;
+	}
+
+	value_size = map->value_size;
+
+	err = -ENOMEM;
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	err = -EFAULT;
+	if (copy_from_user(value, uvalue, value_size) != 0)
+		goto free_value;
+
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+	 * inside bpf map update or delete otherwise deadlocks are possible
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	if (!map->ops->map_lookup_and_delete_elem) {
+		err = -ENOTSUPP;
+		goto free_value;
+	}
+	rcu_read_lock();
+	ptr = map->ops->map_lookup_and_delete_elem(map, key);
+	if (ptr)
+		memcpy(value, ptr, value_size);
+	rcu_read_unlock();
+	err = ptr ? 0 : -ENOENT;
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+
+	if (err)
+		goto free_value;
+
+	if (copy_to_user(uvalue, value, value_size) != 0)
+		goto free_value;
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
 	[_id] = & _name ## _prog_ops,
@@ -2448,6 +2526,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_TASK_FD_QUERY:
 		err = bpf_task_fd_query(&attr, uattr);
 		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+		err = map_lookup_and_delete_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;