Message ID | 1452586852-1575604-4-git-send-email-kafai@fb.com |
---|---|
State | Deferred, archived |
Delegated to: | David Miller |
Headers | show |
On Tue, 2016-01-12 at 00:20 -0800, Martin KaFai Lau wrote: > Add map_lookup_percpu_elem() syscall to lookup value > of a particular CPU. ... > +static int map_lookup_percpu_elem(union bpf_attr *attr) > +{ > + void __user *ukey = u64_to_ptr(attr->key); > + void __user *uvalue = u64_to_ptr(attr->value); > + u32 __user ucpu = attr->cpu; > + int ufd = attr->map_fd; > + struct percpu_map_value_info value_info; > + struct bpf_map *map; > + void *key, *value; > + struct fd f; > + int err; > + > + if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) || > + ucpu >= num_possible_cpus()) > + return -EINVAL; Argh, num_possible_cpus() is the wrong call here. You probably want to use (ucpu >= nr_cpu_ids) instead. Hint: you can have hosts with 2 possible CPUs that are numbered CPU0 and CPU2, so the userland loop in your changelog is wrong as well.
On Tue, Jan 12, 2016 at 4:20 PM, Martin KaFai Lau <kafai@fb.com> wrote: > Add map_lookup_percpu_elem() syscall to lookup value > of a particular CPU. > > The user is expected to loop through all CPU values by: > for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) { > if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu)) Thinking of further, the above way can not work well, the iteration may takes long time to collect the value from each CPU, and finally the accumulated value has been stale for long. Syscall is often the preempt point. > total_value += value; > } > > * If the cpu is offline, errno == ENXIO > * If the key is deleted before the iteration is done, > errno == ENOENT. > > Signed-off-by: Martin KaFai Lau <kafai@fb.com> > --- > include/uapi/linux/bpf.h | 2 ++ > kernel/bpf/syscall.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 95 insertions(+) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 43ae40c..2c73c09 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -73,6 +73,7 @@ enum bpf_cmd { > BPF_PROG_LOAD, > BPF_OBJ_PIN, > BPF_OBJ_GET, > + BPF_MAP_LOOKUP_PERCPU_ELEM, > }; > > enum bpf_map_type { > @@ -115,6 +116,7 @@ union bpf_attr { > __aligned_u64 next_key; > }; > __u64 flags; > + __u32 cpu; > }; > > struct { /* anonymous struct used by BPF_PROG_LOAD command */ > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 6373970..ba1172b 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -289,6 +289,96 @@ err_put: > return err; > } > > +/* last field in 'union bpf_attr' used by this command */ > +#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu > + > +struct percpu_map_value_info { > + struct bpf_map *map; > + void *key; > + void *value; > + bool found; > +}; > + > +static void percpu_map_lookup_value(void *i) > +{ > + struct percpu_map_value_info *info = (struct percpu_map_value_info *)i; > + struct bpf_map *map = info->map; > + void *ptr; 
> + > + rcu_read_lock(); > + ptr = map->ops->map_lookup_elem(map, info->key); > + if (ptr) { > + memcpy(info->value, ptr, map->value_size); > + info->found = true; > + } else { > + info->found = false; > + } > + rcu_read_unlock(); > +} > + > +static int map_lookup_percpu_elem(union bpf_attr *attr) > +{ > + void __user *ukey = u64_to_ptr(attr->key); > + void __user *uvalue = u64_to_ptr(attr->value); > + u32 __user ucpu = attr->cpu; > + int ufd = attr->map_fd; > + struct percpu_map_value_info value_info; > + struct bpf_map *map; > + void *key, *value; > + struct fd f; > + int err; > + > + if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) || > + ucpu >= num_possible_cpus()) > + return -EINVAL; > + > + f = fdget(ufd); > + map = __bpf_map_get(f); > + if (IS_ERR(map)) > + return PTR_ERR(map); > + > + err = -ENOMEM; > + key = kmalloc(map->key_size, GFP_USER); The kmalloc may trigger memory reclaim and take dozens of seconds or more. > + if (!key) > + goto err_put; > + > + err = -EFAULT; > + if (copy_from_user(key, ukey, map->key_size) != 0) > + goto free_key; > + > + err = -ENOMEM; > + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); Same with above. > + if (!value) > + goto free_key; > + > + value_info.map = map; > + value_info.key = key; > + value_info.value = value; > + > + err = smp_call_function_single(ucpu, percpu_map_lookup_value, > + &value_info, 1); It is slow too. > + if (err) > + goto free_value; > + > + err = -ENOENT; > + if (!value_info.found) > + goto free_value; > + > + err = -EFAULT; > + if (copy_to_user(uvalue, value, map->value_size) != 0) > + goto free_value; page fault may be triggered. 
> + > + err = 0; > + > +free_value: > + kfree(value); > +free_key: > + kfree(key); > +err_put: > + fdput(f); > + return err; > +} > + > #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags > > static int map_update_elem(union bpf_attr *attr) > @@ -792,6 +882,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > case BPF_OBJ_GET: > err = bpf_obj_get(&attr); > break; > + case BPF_MAP_LOOKUP_PERCPU_ELEM: > + err = map_lookup_percpu_elem(&attr); > + break; > default: > err = -EINVAL; > break; So I don't think it is good to retrieve value from one CPU via one single system call, and accumulate them finally in userspace. One approach I thought of is to define the function(or sort of) handle_cpu_value(u32 cpu, void *val_cpu, void *val_total) in bpf kernel code for collecting value from each cpu and accumulating them into 'val_total', and most of situations, the function can be implemented without loop most of situations. kernel can call this function directly, and the total value can be return to userspace by one single syscall. Alexei and anyone, could you comment on this draft idea for perpcu map? Thanks, Ming Lei
On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote: > > So I don't think it is good to retrieve value from one CPU via one > single system call, and accumulate them finally in userspace. > > One approach I thought of is to define the function(or sort of) > > handle_cpu_value(u32 cpu, void *val_cpu, void *val_total) > > in bpf kernel code for collecting value from each cpu and > accumulating them into 'val_total', and most of situations, the > function can be implemented without loop most of situations. > kernel can call this function directly, and the total value can be > return to userspace by one single syscall. > > Alexei and anyone, could you comment on this draft idea for > perpcu map? I'm not sure how you expect user space to specify such callback. Kernel cannot execute user code. Also syscall/malloc/etc is a noise comparing to ipi and it will still be there, so for(all cpus) { syscall+ipi;} will have the same speed. I think in this use case the overhead of ipi is justified, since user space needs to read accurate numbers otherwise the whole per-cpu is not very useful. One can just use normal hash map and do normal increment. All cpus will race and the counter may contain complete garbage, but in some cases such rough counters are actually good enough. Here per-cpu hash gives fast performance and _accurate_ numbers to userspace. Having said that if you see a way to avoid ipi and still get correct numbers to user space, it would be great.
On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote: >> >> So I don't think it is good to retrieve value from one CPU via one >> single system call, and accumulate them finally in userspace. >> >> One approach I thought of is to define the function(or sort of) >> >> handle_cpu_value(u32 cpu, void *val_cpu, void *val_total) >> >> in bpf kernel code for collecting value from each cpu and >> accumulating them into 'val_total', and most of situations, the >> function can be implemented without loop most of situations. >> kernel can call this function directly, and the total value can be >> return to userspace by one single syscall. >> >> Alexei and anyone, could you comment on this draft idea for >> perpcu map? > > I'm not sure how you expect user space to specify such callback. > Kernel cannot execute user code. I mean the above callback function can be built into bpf code and then run from kernel after loading like in packet filter case by tcpdump, maybe one new prog type is needed. It is doable in theroy. I need to investigate a bit to understand how it can be called from kernel, and it might be OK to call it via kprobe, but not elegent just for accumulating value from each CPU. > Also syscall/malloc/etc is a noise comparing to ipi and it > will still be there, so > for(all cpus) { syscall+ipi;} will have the same speed. In the syscall path, lots of slow things, and finally the accumulated value is often stale and may not reprensent accurate number at any time, and can be thought as invalid. > I think in this use case the overhead of ipi is justified, > since user space needs to read accurate numbers otherwise > the whole per-cpu is not very useful. One can just use > normal hash map and do normal increment. All cpus will race > and the counter may contain complete garbage, but in some > cases such rough counters are actually good enough. 
> Here per-cpu hash gives fast performance and _accurate_ > numbers to userspace. IMO the 'accurate' value on each CPU at different times doesn't matter, and from user view, they do care the accurate/final number accumulated from all CPUs at the specified time. > Having said that if you see a way to avoid ipi and still > get correct numbers to user space, it would be great. As I mentioned in another thread, the current non-percpu map has the same problem too between lookup element syscall and updating element in eBPF prog.
On Wed, Jan 13, 2016 at 11:43:50PM +0800, Ming Lei wrote: > On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov > <alexei.starovoitov@gmail.com> wrote: > > On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote: > >> > >> So I don't think it is good to retrieve value from one CPU via one > >> single system call, and accumulate them finally in userspace. > >> > >> One approach I thought of is to define the function(or sort of) > >> > >> handle_cpu_value(u32 cpu, void *val_cpu, void *val_total) > >> > >> in bpf kernel code for collecting value from each cpu and > >> accumulating them into 'val_total', and most of situations, the > >> function can be implemented without loop most of situations. > >> kernel can call this function directly, and the total value can be > >> return to userspace by one single syscall. > >> > >> Alexei and anyone, could you comment on this draft idea for > >> perpcu map? > > > > I'm not sure how you expect user space to specify such callback. > > Kernel cannot execute user code. > > I mean the above callback function can be built into bpf code and then > run from kernel after loading like in packet filter case by tcpdump, maybe > one new prog type is needed. It is doable in theroy. I need to investigate > a bit to understand how it can be called from kernel, and it might be OK > to call it via kprobe, but not elegent just for accumulating value from each > CPU. that would be a total overkill. > > Also syscall/malloc/etc is a noise comparing to ipi and it > > will still be there, so > > for(all cpus) { syscall+ipi;} will have the same speed. > > In the syscall path, lots of slow things, and finally the accumulated > value is often stale and may not reprensent accurate number at any > time, and can be thought as invalid. no. stale != invalid. Some analytics/monitor applications are good with ball park numbers and for them regular hash map with non-atomic increment is good enough, but others need accurate numbers. 
Even though they may be seconds stale.
On Thu, Jan 14, 2016 at 9:24 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Wed, Jan 13, 2016 at 11:43:50PM +0800, Ming Lei wrote: >> On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov >> <alexei.starovoitov@gmail.com> wrote: >> > On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote: >> >> >> >> So I don't think it is good to retrieve value from one CPU via one >> >> single system call, and accumulate them finally in userspace. >> >> >> >> One approach I thought of is to define the function(or sort of) >> >> >> >> handle_cpu_value(u32 cpu, void *val_cpu, void *val_total) >> >> >> >> in bpf kernel code for collecting value from each cpu and >> >> accumulating them into 'val_total', and most of situations, the >> >> function can be implemented without loop most of situations. >> >> kernel can call this function directly, and the total value can be >> >> return to userspace by one single syscall. >> >> >> >> Alexei and anyone, could you comment on this draft idea for >> >> perpcu map? >> > >> > I'm not sure how you expect user space to specify such callback. >> > Kernel cannot execute user code. >> >> I mean the above callback function can be built into bpf code and then >> run from kernel after loading like in packet filter case by tcpdump, maybe >> one new prog type is needed. It is doable in theroy. I need to investigate >> a bit to understand how it can be called from kernel, and it might be OK >> to call it via kprobe, but not elegent just for accumulating value from each >> CPU. > > that would be a total overkill. With current bpf framework, it isn't difficult to implement the prog type for percpu, and it is still reasonable because accumulating values from each CPU is one policy and should have been defined/provide from bpf code in case of percpu map. > >> > Also syscall/malloc/etc is a noise comparing to ipi and it >> > will still be there, so >> > for(all cpus) { syscall+ipi;} will have the same speed. 
>> >> In the syscall path, lots of slow things, and finally the accumulated >> value is often stale and may not reprensent accurate number at any >> time, and can be thought as invalid. > > no. stale != invalid. > Some analytics/monitor applications are good with ball park numbers > and for them regular hash map with non-atomic increment is good enough, > but others need accurate numbers. Even though they may be seconds stale. Let me make it clear: the following is one case for reading packet length on each CPU. time: --------T0--------T1----------T2---------T3-------T4----------------- CPU0: E0(0) E0(1) .... E0(2M) CPU1: E1(0) E1(2) .... E1(1M) CPU2: E2(0) .... E2(10K) CPU3: E3(0)...E3(1K) R0 R1 R2 R3 A4 1) suppose the system has 4 cpu cores 2) there is one single event (suppose it is packets received) we are interested in, and if the event happens on CPU(i) we will add the received packet's length into the value of CPU(i) 3) E0(i) represents the ith event happened on CPU0 from T0, which will be recorded into the value of CPU(0); E1(i) represents the ith event on CPU(1) from T1, .... 4) R0 represents reading the value of CPU0 at time T0, and R1 represents reading the value of CPU1 at time T1, ..... 5) A4 represents accumulating all percpu values into one total value at time T4, suppose value(A4) = value(R0) + value(R1) + value(R2) + value(R3). 6) if we use a syscall to implement Ri (i=1...3), the period between T(i) and T(i+1) can become quite big, for example dozens of seconds, so the accumulated value in A4 can't represent the actual/correct value (counter) at any time between T0 and T4, and the value is wrong actually, and all events in the above diagram (E0(0)~E0(2M), E1(0)~E1(1M), E2(0) .... E2(10K), ...) aren't counted at all, and the missed number can be quite huge. So does the value got by A4 make sense for the user?
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ae40c..2c73c09 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,6 +73,7 @@ enum bpf_cmd { BPF_PROG_LOAD, BPF_OBJ_PIN, BPF_OBJ_GET, + BPF_MAP_LOOKUP_PERCPU_ELEM, }; enum bpf_map_type { @@ -115,6 +116,7 @@ union bpf_attr { __aligned_u64 next_key; }; __u64 flags; + __u32 cpu; }; struct { /* anonymous struct used by BPF_PROG_LOAD command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6373970..ba1172b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -289,6 +289,96 @@ err_put: return err; } +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu + +struct percpu_map_value_info { + struct bpf_map *map; + void *key; + void *value; + bool found; +}; + +static void percpu_map_lookup_value(void *i) +{ + struct percpu_map_value_info *info = (struct percpu_map_value_info *)i; + struct bpf_map *map = info->map; + void *ptr; + + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, info->key); + if (ptr) { + memcpy(info->value, ptr, map->value_size); + info->found = true; + } else { + info->found = false; + } + rcu_read_unlock(); +} + +static int map_lookup_percpu_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + u32 __user ucpu = attr->cpu; + int ufd = attr->map_fd; + struct percpu_map_value_info value_info; + struct bpf_map *map; + void *key, *value; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) || + ucpu >= num_possible_cpus()) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + if 
(!value) + goto free_key; + + value_info.map = map; + value_info.key = key; + value_info.value = value; + + err = smp_call_function_single(ucpu, percpu_map_lookup_value, + &value_info, 1); + if (err) + goto free_value; + + err = -ENOENT; + if (!value_info.found) + goto free_value; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto free_value; + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr) @@ -792,6 +882,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + case BPF_MAP_LOOKUP_PERCPU_ELEM: + err = map_lookup_percpu_elem(&attr); + break; default: err = -EINVAL; break;
Add map_lookup_percpu_elem() syscall to lookup value of a particular CPU. The user is expected to loop through all CPU values by: for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) { if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu)) total_value += value; } * If the cpu is offline, errno == ENXIO * If the key is deleted before the iteration is done, errno == ENOENT. Signed-off-by: Martin KaFai Lau <kafai@fb.com> --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+)