
[RFC,bpf-next,v2,2/6] bpf: add BPF_MAP_DUMP command to access more than one entry per call

Message ID 20190627202417.33370-3-brianvv@google.com
State RFC
Delegated to: BPF Maintainers

Commit Message

Brian Vazquez June 27, 2019, 8:24 p.m. UTC
This introduces a new command to retrieve a variable number of entries
from a bpf map, wrapping the existing bpf methods map_get_next_key and
map_lookup_elem.

Note that map_dump doesn't guarantee that reading the entire table is
consistent, since this function is always racing with kernel and user
code; the same behaviour applies when the entire table is walked using
the current interfaces, map_get_next_key + map_lookup_elem. It is also
important to note that when a locked map is provided, the dump is
consistent only for one entry at a time, meaning that the returned buf
as a whole might or might not be consistent.
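
A minimal userspace sketch of the intended call pattern (hypothetical
helper, assuming uapi headers with this patch applied; the caller seeds
prev_key for the next call with the key of the last entry it received):

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/bpf.h>

/* Sketch only: dump one chunk of entries. Returns 0 on success;
 * *buf_len is in/out: capacity of buf in, bytes actually copied out.
 */
static int map_dump_chunk(int map_fd, const void *prev_key,
			  void *buf, __u32 *buf_len)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.dump.map_fd = map_fd;
	/* NULL on the first call: start from the beginning of the map */
	attr.dump.prev_key = (__u64)(unsigned long)prev_key;
	attr.dump.buf = (__u64)(unsigned long)buf;
	attr.dump.buf_len = (__u64)(unsigned long)buf_len;

	return syscall(__NR_bpf, BPF_MAP_DUMP, &attr, sizeof(attr));
}

Entries are laid out as key followed by value, so the key of the last
returned entry sits at buf + *buf_len - (key_size + value_size); the
caller passes a pointer to it as prev_key on the next call and repeats
until the map is exhausted.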

Suggested-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Brian Vazquez <brianvv@google.com>
---
 include/uapi/linux/bpf.h |   9 ++++
 kernel/bpf/syscall.c     | 108 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

Comments

Alexei Starovoitov June 27, 2019, 10:12 p.m. UTC | #1
On Thu, Jun 27, 2019 at 01:24:13PM -0700, Brian Vazquez wrote:
> This introduces a new command to retrieve a variable number of entries
> from a bpf map, wrapping the existing bpf methods map_get_next_key and
> map_lookup_elem.
> 
> Note that map_dump doesn't guarantee that reading the entire table is
> consistent, since this function is always racing with kernel and user
> code; the same behaviour applies when the entire table is walked using
> the current interfaces, map_get_next_key + map_lookup_elem. It is also
> important to note that when a locked map is provided, the dump is
> consistent only for one entry at a time, meaning that the returned buf
> as a whole might or might not be consistent.

Please explain the api behavior and corner cases in the commit log
or in code comments.

Would it make sense to return last key back into prev_key,
so that next map_dump command doesn't need to copy it from the
buffer?

> Suggested-by: Stanislav Fomichev <sdf@google.com>
> Signed-off-by: Brian Vazquez <brianvv@google.com>
> ---
>  include/uapi/linux/bpf.h |   9 ++++
>  kernel/bpf/syscall.c     | 108 +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 117 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index b077507efa3f3..1d753958874df 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -106,6 +106,7 @@ enum bpf_cmd {
>  	BPF_TASK_FD_QUERY,
>  	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>  	BPF_MAP_FREEZE,
> +	BPF_MAP_DUMP,
>  };
>  
>  enum bpf_map_type {
> @@ -385,6 +386,14 @@ union bpf_attr {
>  		__u64		flags;
>  	};
>  
> +	struct { /* struct used by BPF_MAP_DUMP command */
> +		__u32		map_fd;
> +		__aligned_u64	prev_key;
> +		__aligned_u64	buf;
> +		__aligned_u64	buf_len; /* input/output: len of buf */
> +		__u64		flags;
> +	} dump;
> +
>  	struct { /* anonymous struct used by BPF_PROG_LOAD command */
>  		__u32		prog_type;	/* one of enum bpf_prog_type */
>  		__u32		insn_cnt;
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a1823a50f9be0..7653346b5cfd1 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1097,6 +1097,111 @@ static int map_get_next_key(union bpf_attr *attr)
>  	return err;
>  }
>  
> +/* last field in 'union bpf_attr' used by this command */
> +#define BPF_MAP_DUMP_LAST_FIELD dump.buf_len
> +
> +static int map_dump(union bpf_attr *attr)
> +{
> +	void __user *ukey = u64_to_user_ptr(attr->dump.prev_key);
> +	void __user *ubuf = u64_to_user_ptr(attr->dump.buf);
> +	u32 __user *ubuf_len = u64_to_user_ptr(attr->dump.buf_len);
> +	int ufd = attr->dump.map_fd;
> +	struct bpf_map *map;
> +	void *buf, *prev_key, *key, *value;
> +	u32 value_size, elem_size, buf_len, cp_len;
> +	struct fd f;
> +	int err;
> +
> +	if (CHECK_ATTR(BPF_MAP_DUMP))
> +		return -EINVAL;
> +
> +	attr->flags = 0;
> +	if (attr->dump.flags & ~BPF_F_LOCK)
> +		return -EINVAL;
> +
> +	f = fdget(ufd);
> +	map = __bpf_map_get(f);
> +	if (IS_ERR(map))
> +		return PTR_ERR(map);
> +	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
> +		err = -EPERM;
> +		goto err_put;
> +	}
> +
> +	if ((attr->dump.flags & BPF_F_LOCK) &&
> +	    !map_value_has_spin_lock(map)) {
> +		err = -EINVAL;
> +		goto err_put;
> +	}
> +
> +	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
> +	    map->map_type == BPF_MAP_TYPE_STACK) {
> +		err = -ENOTSUPP;
> +		goto err_put;
> +	}
> +
> +	value_size = bpf_map_value_size(map);
> +
> +	err = get_user(buf_len, ubuf_len);
> +	if (err)
> +		goto err_put;
> +
> +	elem_size = map->key_size + value_size;
> +	if (buf_len < elem_size) {
> +		err = -EINVAL;
> +		goto err_put;
> +	}
> +
> +	if (ukey) {
> +		prev_key = __bpf_copy_key(ukey, map->key_size);
> +		if (IS_ERR(prev_key)) {
> +			err = PTR_ERR(prev_key);
> +			goto err_put;
> +		}
> +	} else {
> +		prev_key = NULL;
> +	}
> +
> +	err = -ENOMEM;
> +	buf = kmalloc(elem_size, GFP_USER | __GFP_NOWARN);
> +	if (!buf)
> +		goto err_put;
> +
> +	key = buf;
> +	value = key + map->key_size;
> +	for (cp_len = 0;  cp_len + elem_size <= buf_len ; cp_len += elem_size) {

checkpatch.pl please.

> +next:
> +		if (signal_pending(current)) {
> +			err = -EINTR;
> +			break;
> +		}
> +
> +		rcu_read_lock();
> +		err = map->ops->map_get_next_key(map, prev_key, key);
> +		rcu_read_unlock();
> +
> +		if (err)
> +			break;

should probably be only for the ENOENT case?
and other errors should be returned to the user?

> +
> +		if (bpf_map_copy_value(map, key, value, attr->dump.flags))
> +			goto next;

only for ENOENT as well?
and instead of goto use continue and move cp_len+= to the end after prev_key=key?

> +
> +		if (copy_to_user(ubuf + cp_len, buf, elem_size))
> +			break;

return error to user?

> +
> +		prev_key = key;
> +	}
> +
> +	if (cp_len)
> +		err = 0;

this will mask any of the above errors if at least one element was copied.

> +	if (copy_to_user(ubuf_len, &cp_len, sizeof(cp_len)))
> +		err = -EFAULT;
> +	kfree(buf);
> +err_put:
> +	fdput(f);
> +	return err;
> +}
> +
>  #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
>  
>  static int map_lookup_and_delete_elem(union bpf_attr *attr)
> @@ -2891,6 +2996,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>  	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
>  		err = map_lookup_and_delete_elem(&attr);
>  		break;
> +	case BPF_MAP_DUMP:
> +		err = map_dump(&attr);
> +		break;
>  	default:
>  		err = -EINVAL;
>  		break;
> -- 
> 2.22.0.410.gd8fdbe21b5-goog
>
Brian Vazquez June 28, 2019, 5:49 p.m. UTC | #2
> Please explain the api behavior and corner cases in the commit log
> or in code comments.

Ack, will prepare a new version adding those.

> Would it make sense to return last key back into prev_key,
> so that next map_dump command doesn't need to copy it from the
> buffer?

Actually that's a good idea.
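
On the caller side that could look roughly like this (purely a sketch,
assuming the hypothetical convention that prev_key always points to a
key_size buffer which the kernel both reads the resume position from
and writes the last dumped key back into):

	while (syscall(__NR_bpf, BPF_MAP_DUMP, &attr, sizeof(attr)) == 0) {
		handle_entries(buf, len);   /* made-up consumer */
		len = sizeof(buf);          /* reset capacity for next chunk */
		/* no need to dig the last key out of buf: the kernel
		 * already updated the buffer behind attr.dump.prev_key
		 */
	}

We'd need some other way to say "start from the beginning", though,
since today that's signalled by prev_key == NULL.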


> checkpatch.pl please.

I did use the script and it didn't complain; are you seeing something?

> > +next:
> > +             if (signal_pending(current)) {
> > +                     err = -EINTR;
> > +                     break;
> > +             }
> > +
> > +             rcu_read_lock();
> > +             err = map->ops->map_get_next_key(map, prev_key, key);
> > +             rcu_read_unlock();
> > +
> > +             if (err)
> > +                     break;
>
> should probably be only for the ENOENT case?

Yes, this makes sense.

> and other errors should be returned to the user?

And what if the error happens when we have already copied some
entries? The current behavior masks the error to 0 if we copied at
least one element.

>
> > +
> > +             if (bpf_map_copy_value(map, key, value, attr->dump.flags))
> > +                     goto next;
>
> only for ENOENT as well?
> and instead of goto use continue and move cp_len+= to the end after prev_key=key?

Right

>
> > +
> > +             if (copy_to_user(ubuf + cp_len, buf, elem_size))
> > +                     break;
>
> return error to user?
>
> > +
> > +             prev_key = key;
> > +     }
> > +
> > +     if (cp_len)
> > +             err = 0;
>
> this will mask any of the above errors if at least one element was copied.

So in general, if we have already copied some elements and then hit an
error, we should return that error and maybe set cp_len to 0 to
'invalidate' the data that was already copied?
Yes, I think that sounds like the correct thing to do; what do you think?
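
Putting those suggestions together, the reworked loop could look
roughly like this (an untested sketch; here the error is returned
alongside the partial cp_len instead of zeroing it, but either
convention would stop the masking):

	for (cp_len = 0; cp_len + elem_size <= buf_len;) {
		if (signal_pending(current)) {
			err = -EINTR;
			break;
		}

		rcu_read_lock();
		err = map->ops->map_get_next_key(map, prev_key, key);
		rcu_read_unlock();
		if (err)
			break;  /* -ENOENT: end of map; others: real errors */

		err = bpf_map_copy_value(map, key, value, attr->dump.flags);
		if (err == -ENOENT) {
			/* element deleted while we walked: skip it */
			prev_key = key;
			continue;
		}
		if (err)
			break;

		if (copy_to_user(ubuf + cp_len, buf, elem_size)) {
			err = -EFAULT;
			break;
		}

		prev_key = key;
		cp_len += elem_size;
	}

	/* reaching the end of the map after copying something is success;
	 * any other error is reported even after a partial copy, and
	 * cp_len tells the user how much of the buffer is valid
	 */
	if (err == -ENOENT && cp_len)
		err = 0;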