[bpf-next,v2,3/7] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall

Message ID 153918037098.8915.14965935582566410782.stgit@kernel
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series
  • Implement queue/stack maps
Related show

Commit Message

Mauricio Vasquez Oct. 10, 2018, 2:06 p.m.
The following patch implements a bpf queue/stack maps that
provides the peek/pop/push functions.  There is not a direct
relationship between those functions and the current maps
syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added,
this is mapped to the pop operation in the queue/stack maps
and it is still to implement in other kind of maps.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
---
 include/linux/bpf.h      |    1 +
 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/syscall.c     |   82 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)

Comments

Song Liu Oct. 10, 2018, 4:48 p.m. | #1
On Wed, Oct 10, 2018 at 7:06 AM Mauricio Vasquez B
<mauricio.vasquez@polito.it> wrote:
>
> The following patch implements a bpf queue/stack maps that
> provides the peek/pop/push functions.  There is not a direct
> relationship between those functions and the current maps
> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added,
> this is mapped to the pop operation in the queue/stack maps
> and it is still to implement in other kind of maps.
>
> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
> ---
>  include/linux/bpf.h      |    1 +
>  include/uapi/linux/bpf.h |    1 +
>  kernel/bpf/syscall.c     |   82 ++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 84 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 9b558713447f..5793f0c7fbb5 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -39,6 +39,7 @@ struct bpf_map_ops {
>         void *(*map_lookup_elem)(struct bpf_map *map, void *key);
>         int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
>         int (*map_delete_elem)(struct bpf_map *map, void *key);
> +       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
>
>         /* funcs called by prog_array and perf_event_array map */
>         void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f9187b41dff6..3bb94aa2d408 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -103,6 +103,7 @@ enum bpf_cmd {
>         BPF_BTF_LOAD,
>         BPF_BTF_GET_FD_BY_ID,
>         BPF_TASK_FD_QUERY,
> +       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>  };
>
>  enum bpf_map_type {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index f36c080ad356..6907d661dea5 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -980,6 +980,85 @@ static int map_get_next_key(union bpf_attr *attr)
>         return err;
>  }
>
> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
> +
> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
> +{
> +       void __user *ukey = u64_to_user_ptr(attr->key);
> +       void __user *uvalue = u64_to_user_ptr(attr->value);
> +       int ufd = attr->map_fd;
> +       struct bpf_map *map;
> +       void *key, *value, *ptr;
> +       u32 value_size;
> +       struct fd f;
> +       int err;
> +
> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
> +               return -EINVAL;
> +
> +       f = fdget(ufd);
> +       map = __bpf_map_get(f);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);
> +
> +       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
> +               err = -EPERM;
> +               goto err_put;
> +       }
> +
> +       key = __bpf_copy_key(ukey, map->key_size);
> +       if (IS_ERR(key)) {
> +               err = PTR_ERR(key);
> +               goto err_put;
> +       }
> +
> +       value_size = map->value_size;
> +
> +       err = -ENOMEM;
> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> +       if (!value)
> +               goto free_key;
> +
> +       err = -EFAULT;
> +       if (copy_from_user(value, uvalue, value_size) != 0)
> +               goto free_value;
> +
> +       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
> +        * inside bpf map update or delete otherwise deadlocks are possible
> +        */
> +       preempt_disable();
> +       __this_cpu_inc(bpf_prog_active);
> +       if (map->ops->map_lookup_and_delete_elem) {
> +               rcu_read_lock();
> +               ptr = map->ops->map_lookup_and_delete_elem(map, key);
> +               if (ptr)
> +                       memcpy(value, ptr, value_size);
I think we are exposed to race condition with push and pop in parallel.
map_lookup_and_delete_elem() only updates the head/tail, so it gives
no protection for the buffer pointed by ptr.

Thanks,
Song

> +               rcu_read_unlock();
> +               err = ptr ? 0 : -ENOENT;
> +       } else {
> +               err = -ENOTSUPP;
> +       }
> +
> +       __this_cpu_dec(bpf_prog_active);
> +       preempt_enable();
> +
> +       if (err)
> +               goto free_value;
> +
> +       if (copy_to_user(uvalue, value, value_size) != 0)
> +               goto free_value;
> +
> +       err = 0;
> +
> +free_value:
> +       kfree(value);
> +free_key:
> +       kfree(key);
> +err_put:
> +       fdput(f);
> +       return err;
> +}
> +
>  static const struct bpf_prog_ops * const bpf_prog_types[] = {
>  #define BPF_PROG_TYPE(_id, _name) \
>         [_id] = & _name ## _prog_ops,
> @@ -2453,6 +2532,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_TASK_FD_QUERY:
>                 err = bpf_task_fd_query(&attr, uattr);
>                 break;
> +       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
> +               err = map_lookup_and_delete_elem(&attr);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;
>
Mauricio Vasquez Oct. 10, 2018, 5:47 p.m. | #2
On 10/10/2018 11:48 AM, Song Liu wrote:
> On Wed, Oct 10, 2018 at 7:06 AM Mauricio Vasquez B
> <mauricio.vasquez@polito.it> wrote:
>> The following patch implements a bpf queue/stack maps that
>> provides the peek/pop/push functions.  There is not a direct
>> relationship between those functions and the current maps
>> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added,
>> this is mapped to the pop operation in the queue/stack maps
>> and it is still to implement in other kind of maps.
>>
>> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
>> ---
>>   include/linux/bpf.h      |    1 +
>>   include/uapi/linux/bpf.h |    1 +
>>   kernel/bpf/syscall.c     |   82 ++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 84 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 9b558713447f..5793f0c7fbb5 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -39,6 +39,7 @@ struct bpf_map_ops {
>>          void *(*map_lookup_elem)(struct bpf_map *map, void *key);
>>          int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
>>          int (*map_delete_elem)(struct bpf_map *map, void *key);
>> +       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
>>
>>          /* funcs called by prog_array and perf_event_array map */
>>          void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index f9187b41dff6..3bb94aa2d408 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -103,6 +103,7 @@ enum bpf_cmd {
>>          BPF_BTF_LOAD,
>>          BPF_BTF_GET_FD_BY_ID,
>>          BPF_TASK_FD_QUERY,
>> +       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>>   };
>>
>>   enum bpf_map_type {
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index f36c080ad356..6907d661dea5 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -980,6 +980,85 @@ static int map_get_next_key(union bpf_attr *attr)
>>          return err;
>>   }
>>
>> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
>> +
>> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
>> +{
>> +       void __user *ukey = u64_to_user_ptr(attr->key);
>> +       void __user *uvalue = u64_to_user_ptr(attr->value);
>> +       int ufd = attr->map_fd;
>> +       struct bpf_map *map;
>> +       void *key, *value, *ptr;
>> +       u32 value_size;
>> +       struct fd f;
>> +       int err;
>> +
>> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
>> +               return -EINVAL;
>> +
>> +       f = fdget(ufd);
>> +       map = __bpf_map_get(f);
>> +       if (IS_ERR(map))
>> +               return PTR_ERR(map);
>> +
>> +       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
>> +               err = -EPERM;
>> +               goto err_put;
>> +       }
>> +
>> +       key = __bpf_copy_key(ukey, map->key_size);
>> +       if (IS_ERR(key)) {
>> +               err = PTR_ERR(key);
>> +               goto err_put;
>> +       }
>> +
>> +       value_size = map->value_size;
>> +
>> +       err = -ENOMEM;
>> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
>> +       if (!value)
>> +               goto free_key;
>> +
>> +       err = -EFAULT;
>> +       if (copy_from_user(value, uvalue, value_size) != 0)
>> +               goto free_value;
>> +
>> +       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
>> +        * inside bpf map update or delete otherwise deadlocks are possible
>> +        */
>> +       preempt_disable();
>> +       __this_cpu_inc(bpf_prog_active);
>> +       if (map->ops->map_lookup_and_delete_elem) {
>> +               rcu_read_lock();
>> +               ptr = map->ops->map_lookup_and_delete_elem(map, key);
>> +               if (ptr)
>> +                       memcpy(value, ptr, value_size);
> I think we are exposed to race condition with push and pop in parallel.
> map_lookup_and_delete_elem() only updates the head/tail, so it gives
> no protection for the buffer pointed by ptr.

queue/stack maps does not use this 'ptr', the pop operation directly 
copies the value into the buffer allocated in map_lookup_and_delete_elem().
The copy from the queue/stack buffer into 'value' and the head/tail 
update are protected by a spinlock in the queue/stack maps implementation.

On the other hand, future implementation of map_lookup_and_delete 
operation in other kind of maps should guarantee that the return ptr is 
rcu protected.

Does it make sense to you?

Thanks,
Mauricio.
> Thanks,
> Song
>
>> +               rcu_read_unlock();
>> +               err = ptr ? 0 : -ENOENT;
>> +       } else {
>> +               err = -ENOTSUPP;
>> +       }
>> +
>> +       __this_cpu_dec(bpf_prog_active);
>> +       preempt_enable();
>> +
>> +       if (err)
>> +               goto free_value;
>> +
>> +       if (copy_to_user(uvalue, value, value_size) != 0)
>> +               goto free_value;
>> +
>> +       err = 0;
>> +
>> +free_value:
>> +       kfree(value);
>> +free_key:
>> +       kfree(key);
>> +err_put:
>> +       fdput(f);
>> +       return err;
>> +}
>> +
>>   static const struct bpf_prog_ops * const bpf_prog_types[] = {
>>   #define BPF_PROG_TYPE(_id, _name) \
>>          [_id] = & _name ## _prog_ops,
>> @@ -2453,6 +2532,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>>          case BPF_TASK_FD_QUERY:
>>                  err = bpf_task_fd_query(&attr, uattr);
>>                  break;
>> +       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
>> +               err = map_lookup_and_delete_elem(&attr);
>> +               break;
>>          default:
>>                  err = -EINVAL;
>>                  break;
>>
Song Liu Oct. 10, 2018, 10:34 p.m. | #3
On Wed, Oct 10, 2018 at 10:48 AM Mauricio Vasquez
<mauricio.vasquez@polito.it> wrote:
>
>
>
> On 10/10/2018 11:48 AM, Song Liu wrote:
> > On Wed, Oct 10, 2018 at 7:06 AM Mauricio Vasquez B
> > <mauricio.vasquez@polito.it> wrote:
> >> The following patch implements a bpf queue/stack maps that
> >> provides the peek/pop/push functions.  There is not a direct
> >> relationship between those functions and the current maps
> >> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added,
> >> this is mapped to the pop operation in the queue/stack maps
> >> and it is still to implement in other kind of maps.
> >>
> >> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
> >> ---
> >>   include/linux/bpf.h      |    1 +
> >>   include/uapi/linux/bpf.h |    1 +
> >>   kernel/bpf/syscall.c     |   82 ++++++++++++++++++++++++++++++++++++++++++++++
> >>   3 files changed, 84 insertions(+)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index 9b558713447f..5793f0c7fbb5 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -39,6 +39,7 @@ struct bpf_map_ops {
> >>          void *(*map_lookup_elem)(struct bpf_map *map, void *key);
> >>          int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
> >>          int (*map_delete_elem)(struct bpf_map *map, void *key);
> >> +       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
> >>
> >>          /* funcs called by prog_array and perf_event_array map */
> >>          void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index f9187b41dff6..3bb94aa2d408 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -103,6 +103,7 @@ enum bpf_cmd {
> >>          BPF_BTF_LOAD,
> >>          BPF_BTF_GET_FD_BY_ID,
> >>          BPF_TASK_FD_QUERY,
> >> +       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
> >>   };
> >>
> >>   enum bpf_map_type {
> >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> >> index f36c080ad356..6907d661dea5 100644
> >> --- a/kernel/bpf/syscall.c
> >> +++ b/kernel/bpf/syscall.c
> >> @@ -980,6 +980,85 @@ static int map_get_next_key(union bpf_attr *attr)
> >>          return err;
> >>   }
> >>
> >> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
> >> +
> >> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
> >> +{
> >> +       void __user *ukey = u64_to_user_ptr(attr->key);
> >> +       void __user *uvalue = u64_to_user_ptr(attr->value);
> >> +       int ufd = attr->map_fd;
> >> +       struct bpf_map *map;
> >> +       void *key, *value, *ptr;
> >> +       u32 value_size;
> >> +       struct fd f;
> >> +       int err;
> >> +
> >> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
> >> +               return -EINVAL;
> >> +
> >> +       f = fdget(ufd);
> >> +       map = __bpf_map_get(f);
> >> +       if (IS_ERR(map))
> >> +               return PTR_ERR(map);
> >> +
> >> +       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
> >> +               err = -EPERM;
> >> +               goto err_put;
> >> +       }
> >> +
> >> +       key = __bpf_copy_key(ukey, map->key_size);
> >> +       if (IS_ERR(key)) {
> >> +               err = PTR_ERR(key);
> >> +               goto err_put;
> >> +       }
> >> +
> >> +       value_size = map->value_size;
> >> +
> >> +       err = -ENOMEM;
> >> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> >> +       if (!value)
> >> +               goto free_key;
> >> +
> >> +       err = -EFAULT;
> >> +       if (copy_from_user(value, uvalue, value_size) != 0)
> >> +               goto free_value;
> >> +
> >> +       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
> >> +        * inside bpf map update or delete otherwise deadlocks are possible
> >> +        */
> >> +       preempt_disable();
> >> +       __this_cpu_inc(bpf_prog_active);
> >> +       if (map->ops->map_lookup_and_delete_elem) {
> >> +               rcu_read_lock();
> >> +               ptr = map->ops->map_lookup_and_delete_elem(map, key);
> >> +               if (ptr)
> >> +                       memcpy(value, ptr, value_size);
> > I think we are exposed to race condition with push and pop in parallel.
> > map_lookup_and_delete_elem() only updates the head/tail, so it gives
> > no protection for the buffer pointed by ptr.
>
> queue/stack maps does not use this 'ptr', the pop operation directly
> copies the value into the buffer allocated in map_lookup_and_delete_elem().
> The copy from the queue/stack buffer into 'value' and the head/tail
> update are protected by a spinlock in the queue/stack maps implementation.
>
> On the other hand, future implementation of map_lookup_and_delete
> operation in other kind of maps should guarantee that the return ptr is
> rcu protected.
>
> Does it make sense to you?

I reread the other patch, and found it does NOT use the following logic for
queue and stack:

               rcu_read_lock();
               ptr = map->ops->map_lookup_and_delete_elem(map, key);
               if (ptr)
                       memcpy(value, ptr, value_size);

I guess this part is not used at all? Can we just remove it?

Thanks,
Song






> >
> >> +               rcu_read_unlock();
> >> +               err = ptr ? 0 : -ENOENT;
> >> +       } else {
> >> +               err = -ENOTSUPP;
> >> +       }
> >> +
> >> +       __this_cpu_dec(bpf_prog_active);
> >> +       preempt_enable();
> >> +
> >> +       if (err)
> >> +               goto free_value;
> >> +
> >> +       if (copy_to_user(uvalue, value, value_size) != 0)
> >> +               goto free_value;
> >> +
> >> +       err = 0;
> >> +
> >> +free_value:
> >> +       kfree(value);
> >> +free_key:
> >> +       kfree(key);
> >> +err_put:
> >> +       fdput(f);
> >> +       return err;
> >> +}
> >> +
> >>   static const struct bpf_prog_ops * const bpf_prog_types[] = {
> >>   #define BPF_PROG_TYPE(_id, _name) \
> >>          [_id] = & _name ## _prog_ops,
> >> @@ -2453,6 +2532,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> >>          case BPF_TASK_FD_QUERY:
> >>                  err = bpf_task_fd_query(&attr, uattr);
> >>                  break;
> >> +       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
> >> +               err = map_lookup_and_delete_elem(&attr);
> >> +               break;
> >>          default:
> >>                  err = -EINVAL;
> >>                  break;
> >>
>
Mauricio Vasquez Oct. 10, 2018, 10:50 p.m. | #4
On 10/10/2018 05:34 PM, Song Liu wrote:
> On Wed, Oct 10, 2018 at 10:48 AM Mauricio Vasquez
> <mauricio.vasquez@polito.it> wrote:
>>
>>
>> On 10/10/2018 11:48 AM, Song Liu wrote:
>>> On Wed, Oct 10, 2018 at 7:06 AM Mauricio Vasquez B
>>> <mauricio.vasquez@polito.it> wrote:
>>>> The following patch implements a bpf queue/stack maps that
>>>> provides the peek/pop/push functions.  There is not a direct
>>>> relationship between those functions and the current maps
>>>> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added,
>>>> this is mapped to the pop operation in the queue/stack maps
>>>> and it is still to implement in other kind of maps.
>>>>
>>>> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
>>>> ---
>>>>    include/linux/bpf.h      |    1 +
>>>>    include/uapi/linux/bpf.h |    1 +
>>>>    kernel/bpf/syscall.c     |   82 ++++++++++++++++++++++++++++++++++++++++++++++
>>>>    3 files changed, 84 insertions(+)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index 9b558713447f..5793f0c7fbb5 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -39,6 +39,7 @@ struct bpf_map_ops {
>>>>           void *(*map_lookup_elem)(struct bpf_map *map, void *key);
>>>>           int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
>>>>           int (*map_delete_elem)(struct bpf_map *map, void *key);
>>>> +       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
>>>>
>>>>           /* funcs called by prog_array and perf_event_array map */
>>>>           void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index f9187b41dff6..3bb94aa2d408 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -103,6 +103,7 @@ enum bpf_cmd {
>>>>           BPF_BTF_LOAD,
>>>>           BPF_BTF_GET_FD_BY_ID,
>>>>           BPF_TASK_FD_QUERY,
>>>> +       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>>>>    };
>>>>
>>>>    enum bpf_map_type {
>>>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>>>> index f36c080ad356..6907d661dea5 100644
>>>> --- a/kernel/bpf/syscall.c
>>>> +++ b/kernel/bpf/syscall.c
>>>> @@ -980,6 +980,85 @@ static int map_get_next_key(union bpf_attr *attr)
>>>>           return err;
>>>>    }
>>>>
>>>> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
>>>> +
>>>> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
>>>> +{
>>>> +       void __user *ukey = u64_to_user_ptr(attr->key);
>>>> +       void __user *uvalue = u64_to_user_ptr(attr->value);
>>>> +       int ufd = attr->map_fd;
>>>> +       struct bpf_map *map;
>>>> +       void *key, *value, *ptr;
>>>> +       u32 value_size;
>>>> +       struct fd f;
>>>> +       int err;
>>>> +
>>>> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
>>>> +               return -EINVAL;
>>>> +
>>>> +       f = fdget(ufd);
>>>> +       map = __bpf_map_get(f);
>>>> +       if (IS_ERR(map))
>>>> +               return PTR_ERR(map);
>>>> +
>>>> +       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
>>>> +               err = -EPERM;
>>>> +               goto err_put;
>>>> +       }
>>>> +
>>>> +       key = __bpf_copy_key(ukey, map->key_size);
>>>> +       if (IS_ERR(key)) {
>>>> +               err = PTR_ERR(key);
>>>> +               goto err_put;
>>>> +       }
>>>> +
>>>> +       value_size = map->value_size;
>>>> +
>>>> +       err = -ENOMEM;
>>>> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
>>>> +       if (!value)
>>>> +               goto free_key;
>>>> +
>>>> +       err = -EFAULT;
>>>> +       if (copy_from_user(value, uvalue, value_size) != 0)
>>>> +               goto free_value;
>>>> +
>>>> +       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
>>>> +        * inside bpf map update or delete otherwise deadlocks are possible
>>>> +        */
>>>> +       preempt_disable();
>>>> +       __this_cpu_inc(bpf_prog_active);
>>>> +       if (map->ops->map_lookup_and_delete_elem) {
>>>> +               rcu_read_lock();
>>>> +               ptr = map->ops->map_lookup_and_delete_elem(map, key);
>>>> +               if (ptr)
>>>> +                       memcpy(value, ptr, value_size);
>>> I think we are exposed to race condition with push and pop in parallel.
>>> map_lookup_and_delete_elem() only updates the head/tail, so it gives
>>> no protection for the buffer pointed by ptr.
>> queue/stack maps does not use this 'ptr', the pop operation directly
>> copies the value into the buffer allocated in map_lookup_and_delete_elem().
>> The copy from the queue/stack buffer into 'value' and the head/tail
>> update are protected by a spinlock in the queue/stack maps implementation.
>>
>> On the other hand, future implementation of map_lookup_and_delete
>> operation in other kind of maps should guarantee that the return ptr is
>> rcu protected.
>>
>> Does it make sense to you?
> I reread the other patch, and found it does NOT use the following logic for
> queue and stack:
>
>                 rcu_read_lock();
>                 ptr = map->ops->map_lookup_and_delete_elem(map, key);
>                 if (ptr)
>                         memcpy(value, ptr, value_size);
>
> I guess this part is not used at all? Can we just remove it?
>
> Thanks,
> Song

This is the base code for map_lookup_and_delete support, it is not used 
in queue/stack maps.

I think we can leave it there, so when somebody implements 
lookup_and_delete for other maps doesn't have to care about implementing 
also this.
>
>
>
>
>
>>>> +               rcu_read_unlock();
>>>> +               err = ptr ? 0 : -ENOENT;
>>>> +       } else {
>>>> +               err = -ENOTSUPP;
>>>> +       }
>>>> +
>>>> +       __this_cpu_dec(bpf_prog_active);
>>>> +       preempt_enable();
>>>> +
>>>> +       if (err)
>>>> +               goto free_value;
>>>> +
>>>> +       if (copy_to_user(uvalue, value, value_size) != 0)
>>>> +               goto free_value;
>>>> +
>>>> +       err = 0;
>>>> +
>>>> +free_value:
>>>> +       kfree(value);
>>>> +free_key:
>>>> +       kfree(key);
>>>> +err_put:
>>>> +       fdput(f);
>>>> +       return err;
>>>> +}
>>>> +
>>>>    static const struct bpf_prog_ops * const bpf_prog_types[] = {
>>>>    #define BPF_PROG_TYPE(_id, _name) \
>>>>           [_id] = & _name ## _prog_ops,
>>>> @@ -2453,6 +2532,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>>>>           case BPF_TASK_FD_QUERY:
>>>>                   err = bpf_task_fd_query(&attr, uattr);
>>>>                   break;
>>>> +       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
>>>> +               err = map_lookup_and_delete_elem(&attr);
>>>> +               break;
>>>>           default:
>>>>                   err = -EINVAL;
>>>>                   break;
>>>>
Alexei Starovoitov Oct. 11, 2018, 11:51 p.m. | #5
On Wed, Oct 10, 2018 at 05:50:01PM -0500, Mauricio Vasquez wrote:
> > > Does it make sense to you?
> > I reread the other patch, and found it does NOT use the following logic for
> > queue and stack:
> > 
> >                 rcu_read_lock();
> >                 ptr = map->ops->map_lookup_and_delete_elem(map, key);
> >                 if (ptr)
> >                         memcpy(value, ptr, value_size);
> > 
> > I guess this part is not used at all? Can we just remove it?
> > 
> > Thanks,
> > Song
> 
> This is the base code for map_lookup_and_delete support, it is not used in
> queue/stack maps.
> 
> I think we can leave it there, so when somebody implements lookup_and_delete
> for other maps doesn't have to care about implementing also this.

The code looks useful to me, but I also agree with Song. And in the kernel
we don't typically add 'almost dead code'.
May be provide an implementation of the lookup_and_delete for hash map
so it's actually used ?
imo it would be a useful feature to have for hash map and will clarify
the intent of it for all other maps and for stack/queue in particular.
Mauricio Vasquez Oct. 16, 2018, 9:16 p.m. | #6
On 10/11/2018 06:51 PM, Alexei Starovoitov wrote:
> On Wed, Oct 10, 2018 at 05:50:01PM -0500, Mauricio Vasquez wrote:
>>>> Does it make sense to you?
>>> I reread the other patch, and found it does NOT use the following logic for
>>> queue and stack:
>>>
>>>                  rcu_read_lock();
>>>                  ptr = map->ops->map_lookup_and_delete_elem(map, key);
>>>                  if (ptr)
>>>                          memcpy(value, ptr, value_size);
>>>
>>> I guess this part is not used at all? Can we just remove it?
>>>
>>> Thanks,
>>> Song
>> This is the base code for map_lookup_and_delete support, it is not used in
>> queue/stack maps.
>>
>> I think we can leave it there, so when somebody implements lookup_and_delete
>> for other maps doesn't have to care about implementing also this.
> The code looks useful to me, but I also agree with Song. And in the kernel
> we don't typically add 'almost dead code'.
> May be provide an implementation of the lookup_and_delete for hash map
> so it's actually used ?

I haven't written any code but I think there is a potential problem here.
Current lookup_and_delete returns a pointer to the element, hence 
deletion of the element should be done using call_rcu to guarantee this 
is valid after returning.
In the hashtab, the deletion only uses call_rcu when there is not 
prealloc, otherwise the element is pushed on the list of free elements 
immediately.
If we move the logic to push elements into the free list under a 
call_rcu invocation, it could happen that the free list is empty because 
the call_rcu is still pending (a similar issue we had with the 
queue/stack maps when they used a pass by reference API).

There is another way to implement it without this issue, in syscall.c:
l = ops->lookup(key);
memcpy(l, some_buffer)
ops->delete(key)

The point here is that the lookup_and_delete operation is not being used 
at all, so, is lookup + delete = lookup_and_delete?, can it be generalized?
If this is true, then what is the sense of having lookup_and_delete 
syscall?,


> imo it would be a useful feature to have for hash map and will clarify
> the intent of it for all other maps and for stack/queue in particular.
>
>
Alexei Starovoitov Oct. 16, 2018, 11:20 p.m. | #7
On Tue, Oct 16, 2018 at 04:16:39PM -0500, Mauricio Vasquez wrote:
> 
> 
> On 10/11/2018 06:51 PM, Alexei Starovoitov wrote:
> > On Wed, Oct 10, 2018 at 05:50:01PM -0500, Mauricio Vasquez wrote:
> > > > > Does it make sense to you?
> > > > I reread the other patch, and found it does NOT use the following logic for
> > > > queue and stack:
> > > > 
> > > >                  rcu_read_lock();
> > > >                  ptr = map->ops->map_lookup_and_delete_elem(map, key);
> > > >                  if (ptr)
> > > >                          memcpy(value, ptr, value_size);
> > > > 
> > > > I guess this part is not used at all? Can we just remove it?
> > > > 
> > > > Thanks,
> > > > Song
> > > This is the base code for map_lookup_and_delete support, it is not used in
> > > queue/stack maps.
> > > 
> > > I think we can leave it there, so when somebody implements lookup_and_delete
> > > for other maps doesn't have to care about implementing also this.
> > The code looks useful to me, but I also agree with Song. And in the kernel
> > we don't typically add 'almost dead code'.
> > May be provide an implementation of the lookup_and_delete for hash map
> > so it's actually used ?
> 
> I haven't written any code but I think there is a potential problem here.
> Current lookup_and_delete returns a pointer to the element, hence deletion
> of the element should be done using call_rcu to guarantee this is valid
> after returning.
> In the hashtab, the deletion only uses call_rcu when there is not prealloc,
> otherwise the element is pushed on the list of free elements immediately.
> If we move the logic to push elements into the free list under a call_rcu
> invocation, it could happen that the free list is empty because the call_rcu
> is still pending (a similar issue we had with the queue/stack maps when they
> used a pass by reference API).
> 
> There is another way to implement it without this issue, in syscall.c:
> l = ops->lookup(key);
> memcpy(l, some_buffer)
> ops->delete(key)
> 
> The point here is that the lookup_and_delete operation is not being used at
> all, so, is lookup + delete = lookup_and_delete?, can it be generalized?
> If this is true, then what is the sense of having lookup_and_delete
> syscall?,

I though of lookup_and_delete command as atomic operation.
Only in such case it would make sense.
Otherwise there is no point in having additional cmd.
In case of hash map the implementation would be similar to delete:
  raw_spin_lock_irqsave(&b->lock, flags);
  l = lookup_elem_raw(head, hash, key, key_size);
  if (l) {
          hlist_nulls_del_rcu(&l->hash_node);
          bpf_long_memcpy(); // into temp kernel area
          free_htab_elem(htab, l);
          ret = 0;
  }
  raw_spin_unlock_irqrestore(&b->lock, flags);
  copy_to_user();

there is a chance that some other cpu is doing lookup in parallel
and may be modifying value, so bpf_long_mempcy() isn't fully atomic.
But bpf side is written together with user space side,
so above almost-atomic lookup_and_delete is usable instead
of lookup and then delete which races too much.

Having said that I think it's fine to defer this new ndo for now
and leave lookup_and_delete syscall cmd for stack/queue map only.
Mauricio Vasquez Oct. 17, 2018, 2:29 a.m. | #8
On 10/16/2018 06:20 PM, Alexei Starovoitov wrote:
> On Tue, Oct 16, 2018 at 04:16:39PM -0500, Mauricio Vasquez wrote:
>>
>> On 10/11/2018 06:51 PM, Alexei Starovoitov wrote:
>>> On Wed, Oct 10, 2018 at 05:50:01PM -0500, Mauricio Vasquez wrote:
>>>>>> Does it make sense to you?
>>>>> I reread the other patch, and found it does NOT use the following logic for
>>>>> queue and stack:
>>>>>
>>>>>                   rcu_read_lock();
>>>>>                   ptr = map->ops->map_lookup_and_delete_elem(map, key);
>>>>>                   if (ptr)
>>>>>                           memcpy(value, ptr, value_size);
>>>>>
>>>>> I guess this part is not used at all? Can we just remove it?
>>>>>
>>>>> Thanks,
>>>>> Song
>>>> This is the base code for map_lookup_and_delete support, it is not used in
>>>> queue/stack maps.
>>>>
>>>> I think we can leave it there, so when somebody implements lookup_and_delete
>>>> for other maps doesn't have to care about implementing also this.
>>> The code looks useful to me, but I also agree with Song. And in the kernel
>>> we don't typically add 'almost dead code'.
>>> May be provide an implementation of the lookup_and_delete for hash map
>>> so it's actually used ?
>> I haven't written any code but I think there is a potential problem here.
>> Current lookup_and_delete returns a pointer to the element, hence deletion
>> of the element should be done using call_rcu to guarantee this is valid
>> after returning.
>> In the hashtab, the deletion only uses call_rcu when there is not prealloc,
>> otherwise the element is pushed on the list of free elements immediately.
>> If we move the logic to push elements into the free list under a call_rcu
>> invocation, it could happen that the free list is empty because the call_rcu
>> is still pending (a similar issue we had with the queue/stack maps when they
>> used a pass by reference API).
>>
>> There is another way to implement it without this issue, in syscall.c:
>> l = ops->lookup(key);
>> memcpy(l, some_buffer)
>> ops->delete(key)
>>
>> The point here is that the lookup_and_delete operation is not being used at
>> all, so, is lookup + delete = lookup_and_delete?, can it be generalized?
>> If this is true, then what is the sense of having lookup_and_delete
>> syscall?,
> I though of lookup_and_delete command as atomic operation.
> Only in such case it would make sense.
> Otherwise there is no point in having additional cmd.
> In case of hash map the implementation would be similar to delete:
>    raw_spin_lock_irqsave(&b->lock, flags);
>    l = lookup_elem_raw(head, hash, key, key_size);
>    if (l) {
>            hlist_nulls_del_rcu(&l->hash_node);
>            bpf_long_memcpy(); // into temp kernel area
>            free_htab_elem(htab, l);
>            ret = 0;
>    }
>    raw_spin_unlock_irqrestore(&b->lock, flags);
>    copy_to_user();

Well, this is new approach, currently operations have no enough info to 
perform the copy_to_user directly, btw, is there any technical reason 
why a double mem copy is done? (from the map value into a temp kernel 
buffer and then to userspace?)

>
> there is a chance that some other cpu is doing lookup in parallel
> and may be modifying value, so bpf_long_mempcy() isn't fully atomic.

I think we already have that case, if an eBPF program is updating the 
map, a lookup from userspace could see a partially updated value.
> But bpf side is written together with user space side,
> so above almost-atomic lookup_and_delete is usable instead
> of lookup and then delete which races too much.
>
> Having said that I think it's fine to defer this new ndo for now
> and leave lookup_and_delete syscall cmd for stack/queue map only.
>
I agree, just a question, should we remove the "almost dead code" or 
leave it there?

Thanks,
Mauricio.

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9b558713447f..5793f0c7fbb5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@  struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+	void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
 
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..3bb94aa2d408 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@  enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f36c080ad356..6907d661dea5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -980,6 +980,85 @@  static int map_get_next_key(union bpf_attr *attr)
 	return err;
 }
 
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_user_ptr(attr->key);
+	void __user *uvalue = u64_to_user_ptr(attr->value);
+	int ufd = attr->map_fd;
+	struct bpf_map *map;
+	void *key, *value, *ptr;
+	u32 value_size;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	key = __bpf_copy_key(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto err_put;
+	}
+
+	value_size = map->value_size;
+
+	err = -ENOMEM;
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	err = -EFAULT;
+	if (copy_from_user(value, uvalue, value_size) != 0)
+		goto free_value;
+
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+	 * inside bpf map update or delete otherwise deadlocks are possible
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	if (map->ops->map_lookup_and_delete_elem) {
+		rcu_read_lock();
+		ptr = map->ops->map_lookup_and_delete_elem(map, key);
+		if (ptr)
+			memcpy(value, ptr, value_size);
+		rcu_read_unlock();
+		err = ptr ? 0 : -ENOENT;
+	} else {
+		err = -ENOTSUPP;
+	}
+
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+
+	if (err)
+		goto free_value;
+
+	if (copy_to_user(uvalue, value, value_size) != 0)
+		goto free_value;
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
 	[_id] = & _name ## _prog_ops,
@@ -2453,6 +2532,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_TASK_FD_QUERY:
 		err = bpf_task_fd_query(&attr, uattr);
 		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+		err = map_lookup_and_delete_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;