
[v2,net-next,3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key

Message ID 1452586852-1575604-4-git-send-email-kafai@fb.com
State Deferred, archived
Delegated to: David Miller

Commit Message

Martin KaFai Lau Jan. 12, 2016, 8:20 a.m. UTC
Add map_lookup_percpu_elem() syscall to look up the value
for a particular CPU.

The user is expected to loop through all CPU values by:
for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {
	if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu))
		total_value += value;
}

* If the cpu is offline, errno == ENXIO
* If the key is deleted before the iteration is done,
  errno == ENOENT.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

Comments

Eric Dumazet Jan. 13, 2016, 2:24 a.m. UTC | #1
On Tue, 2016-01-12 at 00:20 -0800, Martin KaFai Lau wrote:
> Add map_lookup_percpu_elem() syscall to look up the value
> for a particular CPU.

...

> +static int map_lookup_percpu_elem(union bpf_attr *attr)
> +{
> +	void __user *ukey = u64_to_ptr(attr->key);
> +	void __user *uvalue = u64_to_ptr(attr->value);
> +	u32 __user ucpu = attr->cpu;
> +	int ufd = attr->map_fd;
> +	struct percpu_map_value_info value_info;
> +	struct bpf_map *map;
> +	void *key, *value;
> +	struct fd f;
> +	int err;
> +
> +	if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
> +	    ucpu >= num_possible_cpus())
> +		return -EINVAL;

Argh, a wrong num_possible_cpus() call.

You probably want to use    (ucpu >= nr_cpu_ids)

hint: you can have hosts with 2 possible cpus, CPU0 and CPU2

So the userland loop in your changelog is wrong.
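
For illustration, a loop that copes with a sparse possible mask might
look like this (a sketch only; bpf_map_lookup_percpu_elem() is the
wrapper from the changelog, and max_possible_cpu() is a hypothetical
helper that parses the upper bound out of
/sys/devices/system/cpu/possible, e.g. "0,2" -> 2):

long total_value = 0;
int cpu, max_cpu = max_possible_cpu();

for (cpu = 0; cpu <= max_cpu; cpu++) {
	if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu))
		total_value += value;
	else if (errno != ENXIO)	/* ENXIO: offline cpu or hole in the mask */
		break;			/* e.g. ENOENT: key was deleted */
}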
Ming Lei Jan. 13, 2016, 2:42 a.m. UTC | #2
On Tue, Jan 12, 2016 at 4:20 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> Add map_lookup_percpu_elem() syscall to look up the value
> for a particular CPU.
>
> The user is expected to loop through all CPU values by:
> for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {
>         if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu))

Thinking about it further, the above way cannot work well: the iteration
may take a long time to collect the value from each CPU, and by the end
the accumulated value has been stale for a long time.

A syscall is often a preemption point.

>                 total_value += value;
> }
>
> * If the cpu is offline, errno == ENXIO
> * If the key is deleted before the iteration is done,
>   errno == ENOENT.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/uapi/linux/bpf.h |  2 ++
>  kernel/bpf/syscall.c     | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 95 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 43ae40c..2c73c09 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -73,6 +73,7 @@ enum bpf_cmd {
>         BPF_PROG_LOAD,
>         BPF_OBJ_PIN,
>         BPF_OBJ_GET,
> +       BPF_MAP_LOOKUP_PERCPU_ELEM,
>  };
>
>  enum bpf_map_type {
> @@ -115,6 +116,7 @@ union bpf_attr {
>                         __aligned_u64 next_key;
>                 };
>                 __u64           flags;
> +               __u32           cpu;
>         };
>
>         struct { /* anonymous struct used by BPF_PROG_LOAD command */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 6373970..ba1172b 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -289,6 +289,96 @@ err_put:
>         return err;
>  }
>
> +/* last field in 'union bpf_attr' used by this command */
> +#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu
> +
> +struct percpu_map_value_info {
> +       struct bpf_map *map;
> +       void *key;
> +       void *value;
> +       bool found;
> +};
> +
> +static void percpu_map_lookup_value(void *i)
> +{
> +       struct percpu_map_value_info *info = (struct percpu_map_value_info *)i;
> +       struct bpf_map *map = info->map;
> +       void *ptr;
> +
> +       rcu_read_lock();
> +       ptr = map->ops->map_lookup_elem(map, info->key);
> +       if (ptr) {
> +               memcpy(info->value, ptr, map->value_size);
> +               info->found = true;
> +       } else {
> +               info->found = false;
> +       }
> +       rcu_read_unlock();
> +}
> +
> +static int map_lookup_percpu_elem(union bpf_attr *attr)
> +{
> +       void __user *ukey = u64_to_ptr(attr->key);
> +       void __user *uvalue = u64_to_ptr(attr->value);
> +       u32 __user ucpu = attr->cpu;
> +       int ufd = attr->map_fd;
> +       struct percpu_map_value_info value_info;
> +       struct bpf_map *map;
> +       void *key, *value;
> +       struct fd f;
> +       int err;
> +
> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
> +           ucpu >= num_possible_cpus())
> +               return -EINVAL;
> +
> +       f = fdget(ufd);
> +       map = __bpf_map_get(f);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);
> +
> +       err = -ENOMEM;
> +       key = kmalloc(map->key_size, GFP_USER);

The kmalloc may trigger memory reclaim and take dozens of seconds
or more.

> +       if (!key)
> +               goto err_put;
> +
> +       err = -EFAULT;
> +       if (copy_from_user(key, ukey, map->key_size) != 0)
> +               goto free_key;
> +
> +       err = -ENOMEM;
> +       value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);

Same as above.

> +       if (!value)
> +               goto free_key;
> +
> +       value_info.map = map;
> +       value_info.key = key;
> +       value_info.value = value;
> +
> +       err = smp_call_function_single(ucpu, percpu_map_lookup_value,
> +                                      &value_info, 1);

It is slow too.

> +       if (err)
> +               goto free_value;
> +
> +       err = -ENOENT;
> +       if (!value_info.found)
> +               goto free_value;
> +
> +       err = -EFAULT;
> +       if (copy_to_user(uvalue, value, map->value_size) != 0)
> +               goto free_value;

A page fault may be triggered.

> +
> +       err = 0;
> +
> +free_value:
> +       kfree(value);
> +free_key:
> +       kfree(key);
> +err_put:
> +       fdput(f);
> +       return err;
> +}
> +
>  #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
>
>  static int map_update_elem(union bpf_attr *attr)
> @@ -792,6 +882,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_OBJ_GET:
>                 err = bpf_obj_get(&attr);
>                 break;
> +       case BPF_MAP_LOOKUP_PERCPU_ELEM:
> +               err = map_lookup_percpu_elem(&attr);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;

So I don't think it is good to retrieve the value from one CPU per
system call and accumulate them at the end in userspace.

One approach I thought of is to define a function (or something like it)

 handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)

in the bpf kernel code for collecting the value from each cpu and
accumulating it into 'val_total'; in most situations the function can
be implemented without a loop. The kernel can call this function
directly, and the total value can be returned to userspace by one
single syscall.
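
As a rough sketch (hypothetical, assuming a u64 counter value and a
new prog type whose program the kernel runs once per possible CPU):

/* hypothetical accumulator: called by the kernel for each possible
 * CPU; a plain counter needs no loop at all */
static void handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
{
	*(u64 *)val_total += *(u64 *)val_cpu;
}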

Alexei and anyone, could you comment on this draft idea for
percpu map?

Thanks,
Ming Lei
Alexei Starovoitov Jan. 13, 2016, 5:23 a.m. UTC | #3
On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
> 
> So I don't think it is good to retrieve the value from one CPU per
> system call and accumulate them at the end in userspace.
> 
> One approach I thought of is to define a function (or something like it)
> 
>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
> 
> in the bpf kernel code for collecting the value from each cpu and
> accumulating it into 'val_total'; in most situations the function can
> be implemented without a loop. The kernel can call this function
> directly, and the total value can be returned to userspace by one
> single syscall.
> 
> Alexei and anyone, could you comment on this draft idea for
> percpu map?

I'm not sure how you expect user space to specify such a callback.
The kernel cannot execute user code.
Also syscall/malloc/etc is noise compared to the ipi, and it
will still be there, so
for (all cpus) { syscall+ipi; } will have the same speed.
I think in this use case the overhead of the ipi is justified,
since user space needs to read accurate numbers; otherwise
the whole per-cpu approach is not very useful. One can just use
a normal hash map and do a normal increment. All cpus will race
and the counter may contain complete garbage, but in some
cases such rough counters are actually good enough.
Here the per-cpu hash gives fast performance and _accurate_
numbers to userspace.
Having said that, if you see a way to avoid the ipi and still
get correct numbers to user space, it would be great.
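
For reference, the racy pattern with a normal hash map would be a bpf
program doing roughly the following ('counters' stands for some assumed
hash map with u64 values):

	u64 *val = bpf_map_lookup_elem(&counters, &key);

	if (val)
		(*val)++;	/* non-atomic read-modify-write: concurrent
				 * cpus can lose increments */

hence the possible garbage under contention.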
Ming Lei Jan. 13, 2016, 3:43 p.m. UTC | #4
On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
>>
>> So I don't think it is good to retrieve the value from one CPU per
>> system call and accumulate them at the end in userspace.
>>
>> One approach I thought of is to define a function (or something like it)
>>
>>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
>>
>> in the bpf kernel code for collecting the value from each cpu and
>> accumulating it into 'val_total'; in most situations the function can
>> be implemented without a loop. The kernel can call this function
>> directly, and the total value can be returned to userspace by one
>> single syscall.
>>
>> Alexei and anyone, could you comment on this draft idea for
>> percpu map?
>
> I'm not sure how you expect user space to specify such a callback.
> The kernel cannot execute user code.

I mean the above callback function can be built into bpf code and then
run from the kernel after loading, as in the packet filter case with
tcpdump; maybe one new prog type is needed. It is doable in theory. I
need to investigate a bit to understand how it can be called from the
kernel, and it might be OK to call it via kprobe, but that is not
elegant just for accumulating the value from each CPU.

> Also syscall/malloc/etc is noise compared to the ipi, and it
> will still be there, so
> for (all cpus) { syscall+ipi; } will have the same speed.

In the syscall path there are lots of slow things, and in the end the
accumulated value is often stale, may not represent an accurate number
at any time, and can be thought of as invalid.

> I think in this use case the overhead of the ipi is justified,
> since user space needs to read accurate numbers; otherwise
> the whole per-cpu approach is not very useful. One can just use
> a normal hash map and do a normal increment. All cpus will race
> and the counter may contain complete garbage, but in some
> cases such rough counters are actually good enough.
> Here the per-cpu hash gives fast performance and _accurate_
> numbers to userspace.

IMO the 'accurate' value on each CPU at different times
doesn't matter; from the user's view, what they care about is
the accurate/final number accumulated from all CPUs at the
specified time.

> Having said that, if you see a way to avoid the ipi and still
> get correct numbers to user space, it would be great.

As I mentioned in another thread, the current non-percpu map has the
same problem between the lookup-element syscall and updating the
element in an eBPF prog.
Alexei Starovoitov Jan. 14, 2016, 1:24 a.m. UTC | #5
On Wed, Jan 13, 2016 at 11:43:50PM +0800, Ming Lei wrote:
> On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
> >>
> >> So I don't think it is good to retrieve the value from one CPU per
> >> system call and accumulate them at the end in userspace.
> >>
> >> One approach I thought of is to define a function (or something like it)
> >>
> >>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
> >>
> >> in the bpf kernel code for collecting the value from each cpu and
> >> accumulating it into 'val_total'; in most situations the function can
> >> be implemented without a loop. The kernel can call this function
> >> directly, and the total value can be returned to userspace by one
> >> single syscall.
> >>
> >> Alexei and anyone, could you comment on this draft idea for
> >> percpu map?
> >
> > I'm not sure how you expect user space to specify such a callback.
> > The kernel cannot execute user code.
> 
> I mean the above callback function can be built into bpf code and then
> run from the kernel after loading, as in the packet filter case with
> tcpdump; maybe one new prog type is needed. It is doable in theory. I
> need to investigate a bit to understand how it can be called from the
> kernel, and it might be OK to call it via kprobe, but that is not
> elegant just for accumulating the value from each CPU.

that would be a total overkill.

> > Also syscall/malloc/etc is noise compared to the ipi, and it
> > will still be there, so
> > for (all cpus) { syscall+ipi; } will have the same speed.
> 
> In the syscall path there are lots of slow things, and in the end the
> accumulated value is often stale, may not represent an accurate number
> at any time, and can be thought of as invalid.

no. stale != invalid.
Some analytics/monitor applications are good with ballpark numbers,
and for them a regular hash map with a non-atomic increment is good
enough, but others need accurate numbers, even if they are seconds stale.
Ming Lei Jan. 14, 2016, 3:23 a.m. UTC | #6
On Thu, Jan 14, 2016 at 9:24 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Wed, Jan 13, 2016 at 11:43:50PM +0800, Ming Lei wrote:
>> On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> > On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
>> >>
>> >> So I don't think it is good to retrieve the value from one CPU per
>> >> system call and accumulate them at the end in userspace.
>> >>
>> >> One approach I thought of is to define a function (or something like it)
>> >>
>> >>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
>> >>
>> >> in the bpf kernel code for collecting the value from each cpu and
>> >> accumulating it into 'val_total'; in most situations the function can
>> >> be implemented without a loop. The kernel can call this function
>> >> directly, and the total value can be returned to userspace by one
>> >> single syscall.
>> >>
>> >> Alexei and anyone, could you comment on this draft idea for
>> >> percpu map?
>> >
>> > I'm not sure how you expect user space to specify such a callback.
>> > The kernel cannot execute user code.
>>
>> I mean the above callback function can be built into bpf code and then
>> run from the kernel after loading, as in the packet filter case with
>> tcpdump; maybe one new prog type is needed. It is doable in theory. I
>> need to investigate a bit to understand how it can be called from the
>> kernel, and it might be OK to call it via kprobe, but that is not
>> elegant just for accumulating the value from each CPU.
>
> that would be a total overkill.

With the current bpf framework it isn't difficult to implement the
prog type for percpu, and it is still reasonable, because accumulating
values from each CPU is one policy and should be defined/provided
by the bpf code in the case of a percpu map.

>
>> > Also syscall/malloc/etc is noise compared to the ipi, and it
>> > will still be there, so
>> > for (all cpus) { syscall+ipi; } will have the same speed.
>>
>> In the syscall path there are lots of slow things, and in the end the
>> accumulated value is often stale, may not represent an accurate number
>> at any time, and can be thought of as invalid.
>
> no. stale != invalid.
> Some analytics/monitor applications are good with ballpark numbers,
> and for them a regular hash map with a non-atomic increment is good
> enough, but others need accurate numbers, even if they are seconds stale.

Let me make it clear with the following case of reading the packet
length on each CPU.

time: --------T0--------T1----------T2---------T3-------T4---------
CPU0:         E0(0) E0(1) ... E0(2M)
CPU1:                   E1(0) E1(2) ... E1(1M)
CPU2:                               E2(0) ... E2(10K)
CPU3:                                          E3(0) ... E3(1K)

              R0        R1          R2         R3
                                                        A4

1) suppose the system has 4 cpu cores

2) there is one single event (suppose it is packets received) we are
interested in, and if the event happens on CPU(i) we will add the
received packet's length into the value of CPU(i)

3) E0(i) represents the ith event that happened on CPU0 from T0, which
will be recorded into the value of CPU(0); E1(i) represents the ith
event on CPU(1) from T1, ....

4) R0 represents reading the value of CPU0 at time T0, and R1 represents
reading the value of CPU1 at time T1, .....

5) A4 represents accumulating all percpu values into one total value at
time T4, so value(A4) = value(R0) + value(R1) + value(R2) + value(R3).

6) if we use a syscall to implement Ri (i=1...3), the period between T(i)
and T(i+1) can become quite big, for example dozens of seconds, so the
accumulated value in A4 can't represent the actual/correct value (counter)
at any time between T0 and T4. The value is actually wrong: all the events
in the above diagram (E0(0)~E0(2M), E1(0)~E1(1M), E2(0)~E2(10K),
E3(0)~E3(1K)) aren't counted at all, i.e. roughly 3M events are missed in
this example, and the missed number can be quite huge.

So does the value obtained by A4 make sense for the user?

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ae40c..2c73c09 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,7 @@  enum bpf_cmd {
 	BPF_PROG_LOAD,
 	BPF_OBJ_PIN,
 	BPF_OBJ_GET,
+	BPF_MAP_LOOKUP_PERCPU_ELEM,
 };
 
 enum bpf_map_type {
@@ -115,6 +116,7 @@  union bpf_attr {
 			__aligned_u64 next_key;
 		};
 		__u64		flags;
+		__u32		cpu;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6373970..ba1172b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -289,6 +289,96 @@  err_put:
 	return err;
 }
 
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu
+
+struct percpu_map_value_info {
+	struct bpf_map *map;
+	void *key;
+	void *value;
+	bool found;
+};
+
+static void percpu_map_lookup_value(void *i)
+{
+	struct percpu_map_value_info *info = (struct percpu_map_value_info *)i;
+	struct bpf_map *map = info->map;
+	void *ptr;
+
+	rcu_read_lock();
+	ptr = map->ops->map_lookup_elem(map, info->key);
+	if (ptr) {
+		memcpy(info->value, ptr, map->value_size);
+		info->found = true;
+	} else {
+		info->found = false;
+	}
+	rcu_read_unlock();
+}
+
+static int map_lookup_percpu_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_ptr(attr->key);
+	void __user *uvalue = u64_to_ptr(attr->value);
+	u32 __user ucpu = attr->cpu;
+	int ufd = attr->map_fd;
+	struct percpu_map_value_info value_info;
+	struct bpf_map *map;
+	void *key, *value;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
+	    ucpu >= num_possible_cpus())
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	value_info.map = map;
+	value_info.key = key;
+	value_info.value = value;
+
+	err = smp_call_function_single(ucpu, percpu_map_lookup_value,
+				       &value_info, 1);
+	if (err)
+		goto free_value;
+
+	err = -ENOENT;
+	if (!value_info.found)
+		goto free_value;
+
+	err = -EFAULT;
+	if (copy_to_user(uvalue, value, map->value_size) != 0)
+		goto free_value;
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
@@ -792,6 +882,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
+	case BPF_MAP_LOOKUP_PERCPU_ELEM:
+		err = map_lookup_percpu_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;