Message ID: 20191025071842.7724-2-bjorn.topel@gmail.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: xsk: XSKMAP lookup improvements
On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> Prior to this commit, the array storing XDP socket instances was
> allocated separately from the XSKMAP. Now, we store the sockets
> as a flexible array member in a similar fashion as the arraymap. Doing
> so, we do less pointer chasing in the lookup.
>
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Thanks for the re-spin.

> diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> index 82a1ffe15dfa..a83e92fe2971 100644
> --- a/kernel/bpf/xskmap.c
> +++ b/kernel/bpf/xskmap.c
>
> @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
>  	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
>  		return ERR_PTR(-EINVAL);
>
> -	m = kzalloc(sizeof(*m), GFP_USER);
> -	if (!m)
> +	numa_node = bpf_map_attr_numa_node(attr);
> +	size = struct_size(m, xsk_map, attr->max_entries);
> +	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());

Now, we didn't use array_size() previously because the sum here may
overflow.

We could use __ab_c_size() here, the name is probably too ugly to use
directly and IDK what we'd have to name such an accumulation helper...

So maybe just make cost and size a u64 and we should be in the clear.

> +	err = bpf_map_charge_init(&mem, cost);
> +	if (err < 0)
> +		return ERR_PTR(err);
> +
> +	m = bpf_map_area_alloc(size, numa_node);
> +	if (!m) {
> +		bpf_map_charge_finish(&mem);
>  		return ERR_PTR(-ENOMEM);
> +	}
>
>  	bpf_map_init_from_attr(&m->map, attr);
> +	bpf_map_charge_move(&m->map.memory, &mem);
>  	spin_lock_init(&m->lock);
>
> -	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
> -	cost += sizeof(struct list_head) * num_possible_cpus();
> -
> -	/* Notice returns -EPERM on if map size is larger than memlock limit */
> -	err = bpf_map_charge_init(&m->map.memory, cost);
> -	if (err)
> -		goto free_m;
> -
> -	err = -ENOMEM;
> -
>  	m->flush_list = alloc_percpu(struct list_head);
> -	if (!m->flush_list)
> -		goto free_charge;
> +	if (!m->flush_list) {
> +		bpf_map_charge_finish(&m->map.memory);
> +		bpf_map_area_free(m);
> +		return ERR_PTR(-ENOMEM);
> +	}
>
>  	for_each_possible_cpu(cpu)
>  		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
>
> -	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
> -					sizeof(struct xdp_sock *),
> -					m->map.numa_node);
> -	if (!m->xsk_map)
> -		goto free_percpu;
>  	return &m->map;
> -
> -free_percpu:
> -	free_percpu(m->flush_list);
> -free_charge:
> -	bpf_map_charge_finish(&m->map.memory);
> -free_m:
> -	kfree(m);
> -	return ERR_PTR(err);
>  }
>
>  static void xsk_map_free(struct bpf_map *map)
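For context on the overflow Jakub is pointing at: array_size() saturates
at SIZE_MAX, but the plain addition in front of it can still wrap a
32-bit size_t. One way to make the check explicit with a helper that
already existed at the time is check_add_overflow() from
<linux/overflow.h>; a minimal sketch against the patch's variables
(illustrative only, not something posted in this thread):

    size = struct_size(m, xsk_map, attr->max_entries);
    /* check_add_overflow() returns true if the sum wrapped */
    if (check_add_overflow(size,
                           array_size(sizeof(*m->flush_list),
                                      num_possible_cpus()),
                           &cost))
            return ERR_PTR(-E2BIG);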
On Mon, 28 Oct 2019 at 18:55, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:
> > From: Björn Töpel <bjorn.topel@intel.com>
> >
> > Prior to this commit, the array storing XDP socket instances was
> > allocated separately from the XSKMAP. Now, we store the sockets
> > as a flexible array member in a similar fashion as the arraymap. Doing
> > so, we do less pointer chasing in the lookup.
> >
> > Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>
> Thanks for the re-spin.
>
> > diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> > index 82a1ffe15dfa..a83e92fe2971 100644
> > --- a/kernel/bpf/xskmap.c
> > +++ b/kernel/bpf/xskmap.c
> >
> > @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
> >  	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
> >  		return ERR_PTR(-EINVAL);
> >
> > -	m = kzalloc(sizeof(*m), GFP_USER);
> > -	if (!m)
> > +	numa_node = bpf_map_attr_numa_node(attr);
> > +	size = struct_size(m, xsk_map, attr->max_entries);
> > +	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
>
> Now, we didn't use array_size() previously because the sum here may
> overflow.
>
> We could use __ab_c_size() here, the name is probably too ugly to use
> directly and IDK what we'd have to name such an accumulation helper...
>
> So maybe just make cost and size a u64 and we should be in the clear.
>

Hmm, but both:

  int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
  void *bpf_map_area_alloc(size_t size, int numa_node);

pass size as size_t, so casting to u64 doesn't really help on 32-bit
systems, right?

Wdyt about simply adding:

  if (cost < size)
  	return ERR_PTR(-EINVAL);

after the cost calculation for explicit overflow checking?

So, if size's struct_size overflows, the allocation will fail.
And we'll catch the cost overflow with the if-statement, no?

Another option is changing the size_t in bpf_map_... to u64. Maybe
that's better, since arraymap and devmap use u64 for cost/size.


Björn

> > +	err = bpf_map_charge_init(&mem, cost);
> > +	if (err < 0)
> > +		return ERR_PTR(err);
> > +
> > +	m = bpf_map_area_alloc(size, numa_node);
> > +	if (!m) {
> > +		bpf_map_charge_finish(&mem);
> >  		return ERR_PTR(-ENOMEM);
> > +	}
> >
> >  	bpf_map_init_from_attr(&m->map, attr);
> > +	bpf_map_charge_move(&m->map.memory, &mem);
> >  	spin_lock_init(&m->lock);
> >
> > -	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
> > -	cost += sizeof(struct list_head) * num_possible_cpus();
> > -
> > -	/* Notice returns -EPERM on if map size is larger than memlock limit */
> > -	err = bpf_map_charge_init(&m->map.memory, cost);
> > -	if (err)
> > -		goto free_m;
> > -
> > -	err = -ENOMEM;
> > -
> >  	m->flush_list = alloc_percpu(struct list_head);
> > -	if (!m->flush_list)
> > -		goto free_charge;
> > +	if (!m->flush_list) {
> > +		bpf_map_charge_finish(&m->map.memory);
> > +		bpf_map_area_free(m);
> > +		return ERR_PTR(-ENOMEM);
> > +	}
> >
> >  	for_each_possible_cpu(cpu)
> >  		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
> >
> > -	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
> > -					sizeof(struct xdp_sock *),
> > -					m->map.numa_node);
> > -	if (!m->xsk_map)
> > -		goto free_percpu;
> >  	return &m->map;
> > -
> > -free_percpu:
> > -	free_percpu(m->flush_list);
> > -free_charge:
> > -	bpf_map_charge_finish(&m->map.memory);
> > -free_m:
> > -	kfree(m);
> > -	return ERR_PTR(err);
> >  }
> >
> >  static void xsk_map_free(struct bpf_map *map)
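To make Björn's truncation point concrete: on a 32-bit kernel, size_t
is 32 bits wide, so a cost computed correctly in 64 bits is silently
narrowed at the call boundary. A toy illustration with made-up numbers
(not from the thread):

    u64 cost = 0x100000100ULL;       /* > 4 GiB, no wrap in the u64 math */
    bpf_map_charge_init(&mem, cost); /* size_t parameter truncates this to
                                      * 0x100 on 32-bit, so only 256 bytes
                                      * would be charged */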
On Mon, 28 Oct 2019 23:11:50 +0100, Björn Töpel wrote:
> On Mon, 28 Oct 2019 at 18:55, Jakub Kicinski
> <jakub.kicinski@netronome.com> wrote:
> >
> > On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:
> > > From: Björn Töpel <bjorn.topel@intel.com>
> > >
> > > Prior to this commit, the array storing XDP socket instances was
> > > allocated separately from the XSKMAP. Now, we store the sockets
> > > as a flexible array member in a similar fashion as the arraymap. Doing
> > > so, we do less pointer chasing in the lookup.
> > >
> > > Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> >
> > Thanks for the re-spin.
> >
> > > diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> > > index 82a1ffe15dfa..a83e92fe2971 100644
> > > --- a/kernel/bpf/xskmap.c
> > > +++ b/kernel/bpf/xskmap.c
> >
> > > @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
> > >  	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
> > >  		return ERR_PTR(-EINVAL);
> > >
> > > -	m = kzalloc(sizeof(*m), GFP_USER);
> > > -	if (!m)
> > > +	numa_node = bpf_map_attr_numa_node(attr);
> > > +	size = struct_size(m, xsk_map, attr->max_entries);
> > > +	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
> >
> > Now, we didn't use array_size() previously because the sum here may
> > overflow.
> >
> > We could use __ab_c_size() here, the name is probably too ugly to use
> > directly and IDK what we'd have to name such an accumulation helper...
> >
> > So maybe just make cost and size a u64 and we should be in the clear.
> >
>
> Hmm, but both:
>
>   int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
>   void *bpf_map_area_alloc(size_t size, int numa_node);
>
> pass size as size_t, so casting to u64 doesn't really help on 32-bit
> systems, right?

Yup :( IOW looks like the overflows will not be caught on 32bit
machines in all existing code that does the (u64) cast. I hope
I'm wrong there.

> Wdyt about simply adding:
>
>   if (cost < size)
>   	return ERR_PTR(-EINVAL);
>
> after the cost calculation for explicit overflow checking?

We'd need that for all users of these helpers. Could it perhaps make
sense to pass "alloc_size" and "extra_cost" as separate size_t to
bpf_map_charge_init() and then we can do the overflow checking there,
centrally?

We can probably get rid of all the u64 casting too at that point,
and use standard overflow helpers, yuppie :)

> So, if size's struct_size overflows, the allocation will fail.
> And we'll catch the cost overflow with the if-statement, no?
>
> Another option is changing the size_t in bpf_map_... to u64. Maybe
> that's better, since arraymap and devmap use u64 for cost/size.
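A rough sketch of the centralized variant Jakub floats here; the
two-argument signature, the parameter names, and the -E2BIG return are
assumptions for illustration, not an API proposed verbatim in the
thread:

    int bpf_map_charge_init(struct bpf_map_memory *mem,
                            size_t alloc_size, size_t extra_cost)
    {
            size_t cost;

            /* one overflow check shared by every map type */
            if (check_add_overflow(alloc_size, extra_cost, &cost))
                    return -E2BIG;
            /* ... memlock accounting against cost, as today ... */
            return 0;
    }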
On Mon, 28 Oct 2019 at 23:26, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Mon, 28 Oct 2019 23:11:50 +0100, Björn Töpel wrote:
> > On Mon, 28 Oct 2019 at 18:55, Jakub Kicinski
> > <jakub.kicinski@netronome.com> wrote:
> > >
> > > On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:
> > > > From: Björn Töpel <bjorn.topel@intel.com>
> > > >
> > > > Prior to this commit, the array storing XDP socket instances was
> > > > allocated separately from the XSKMAP. Now, we store the sockets
> > > > as a flexible array member in a similar fashion as the arraymap. Doing
> > > > so, we do less pointer chasing in the lookup.
> > > >
> > > > Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> > >
> > > Thanks for the re-spin.
> > >
> > > > diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> > > > index 82a1ffe15dfa..a83e92fe2971 100644
> > > > --- a/kernel/bpf/xskmap.c
> > > > +++ b/kernel/bpf/xskmap.c
> > >
> > > > @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
> > > >  	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
> > > >  		return ERR_PTR(-EINVAL);
> > > >
> > > > -	m = kzalloc(sizeof(*m), GFP_USER);
> > > > -	if (!m)
> > > > +	numa_node = bpf_map_attr_numa_node(attr);
> > > > +	size = struct_size(m, xsk_map, attr->max_entries);
> > > > +	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
> > >
> > > Now, we didn't use array_size() previously because the sum here may
> > > overflow.
> > >
> > > We could use __ab_c_size() here, the name is probably too ugly to use
> > > directly and IDK what we'd have to name such an accumulation helper...
> > >
> > > So maybe just make cost and size a u64 and we should be in the clear.
> > >
> >
> > Hmm, but both:
> >
> >   int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
> >   void *bpf_map_area_alloc(size_t size, int numa_node);
> >
> > pass size as size_t, so casting to u64 doesn't really help on 32-bit
> > systems, right?
>
> Yup :( IOW looks like the overflows will not be caught on 32bit
> machines in all existing code that does the (u64) cast. I hope
> I'm wrong there.
>
> > Wdyt about simply adding:
> >
> >   if (cost < size)
> >   	return ERR_PTR(-EINVAL);
> >
> > after the cost calculation for explicit overflow checking?
>
> We'd need that for all users of these helpers. Could it perhaps make
> sense to pass "alloc_size" and "extra_cost" as separate size_t to
> bpf_map_charge_init() and then we can do the overflow checking there,
> centrally?
>

The cost/size calculations seem to vary a bit from map to map, so I
don't know about the extra size_t arguments... but all of them do use
u64 for cost and explicit casting, in favor of u32 overflow checks.

Changing bpf_map_charge_init()/bpf_map_area_alloc() size to u64 would
be the smallest change, together with a 64-to-32 overflow check in
those functions.

> We can probably get rid of all the u64 casting too at that point,
> and use standard overflow helpers, yuppie :)
>

Yeah, that's the other path, but more churn (check_add_overflow() in
every map). Preferred path?

> > So, if size's struct_size overflows, the allocation will fail.
> > And we'll catch the cost overflow with the if-statement, no?
> >
> > Another option is changing the size_t in bpf_map_... to u64. Maybe
> > that's better, since arraymap and devmap use u64 for cost/size.
On Mon, Oct 28, 2019 at 11:21 PM Björn Töpel <bjorn.topel@gmail.com> wrote:
>
> Changing bpf_map_charge_init()/bpf_map_area_alloc() size to u64 would
> be the smallest change, together with a 64-to-32 overflow check in
> those functions.

+1. bpf_map_charge_init() already has such a check.
Changing them to u64 is probably a good idea.
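The agreed direction would look roughly as follows; a sketch assuming
the helpers keep their existing names, with the SIZE_MAX guard standing
in for the "64-to-32 overflow check" Björn mentions:

    void *bpf_map_area_alloc(u64 size, int numa_node)
    {
            /* On 32-bit, refuse sizes that do not fit in size_t
             * instead of silently truncating them at the call.
             */
            if (size >= SIZE_MAX)
                    return NULL;
            /* ... kmalloc()/vmalloc() fallback logic as before ... */
    }

bpf_map_charge_init() would take u64 the same way, so callers can keep
doing their cost arithmetic in 64 bits without casts.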
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 82a1ffe15dfa..a83e92fe2971 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -11,9 +11,9 @@
 
 struct xsk_map {
 	struct bpf_map map;
-	struct xdp_sock **xsk_map;
 	struct list_head __percpu *flush_list;
 	spinlock_t lock; /* Synchronize map updates */
+	struct xdp_sock *xsk_map[];
 };
 
 int xsk_map_inc(struct xsk_map *map)
@@ -80,9 +80,10 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,
 
 static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 {
+	struct bpf_map_memory mem;
+	int cpu, err, numa_node;
 	struct xsk_map *m;
-	int cpu, err;
-	u64 cost;
+	size_t cost, size;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
 		return ERR_PTR(-EINVAL);
 
-	m = kzalloc(sizeof(*m), GFP_USER);
-	if (!m)
+	numa_node = bpf_map_attr_numa_node(attr);
+	size = struct_size(m, xsk_map, attr->max_entries);
+	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
+
+	err = bpf_map_charge_init(&mem, cost);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	m = bpf_map_area_alloc(size, numa_node);
+	if (!m) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	bpf_map_init_from_attr(&m->map, attr);
+	bpf_map_charge_move(&m->map.memory, &mem);
 	spin_lock_init(&m->lock);
 
-	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
-	cost += sizeof(struct list_head) * num_possible_cpus();
-
-	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_charge_init(&m->map.memory, cost);
-	if (err)
-		goto free_m;
-
-	err = -ENOMEM;
-
 	m->flush_list = alloc_percpu(struct list_head);
-	if (!m->flush_list)
-		goto free_charge;
+	if (!m->flush_list) {
+		bpf_map_charge_finish(&m->map.memory);
+		bpf_map_area_free(m);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	for_each_possible_cpu(cpu)
 		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
 
-	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
-					sizeof(struct xdp_sock *),
-					m->map.numa_node);
-	if (!m->xsk_map)
-		goto free_percpu;
 	return &m->map;
-
-free_percpu:
-	free_percpu(m->flush_list);
-free_charge:
-	bpf_map_charge_finish(&m->map.memory);
-free_m:
-	kfree(m);
-	return ERR_PTR(err);
 }
 
 static void xsk_map_free(struct bpf_map *map)
@@ -139,8 +131,7 @@ static void xsk_map_free(struct bpf_map *map)
 	bpf_clear_redirect_map(map);
 	synchronize_net();
 	free_percpu(m->flush_list);
-	bpf_map_area_free(m->xsk_map);
-	kfree(m);
+	bpf_map_area_free(m);
 }
 
 static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
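For reference, the pointer-chasing reduction the commit message claims
shows up in the lookup path: with the flexible array member, indexing no
longer has to load the separate m->xsk_map pointer first. A simplified
sketch of a lookup against the new layout (modeled on the kernel's
__xsk_map_lookup_elem(), trimmed here for illustration):

    static struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
    {
            struct xsk_map *m = container_of(map, struct xsk_map, map);

            if (key >= map->max_entries)
                    return NULL;

            /* xsk_map[] now lives inside *m, so this is a single
             * load instead of two dependent ones.
             */
            return READ_ONCE(m->xsk_map[key]);
    }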