
[bpf-next,v2,1/2] xsk: store struct xdp_sock as a flexible array member of the XSKMAP

Message ID: 20191025071842.7724-2-bjorn.topel@gmail.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: xsk: XSKMAP lookup improvements

Commit Message

Björn Töpel Oct. 25, 2019, 7:18 a.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

Prior to this commit, the XDP socket instances were stored in a
separately allocated array referenced from the XSKMAP. Now, we store
the sockets as a flexible array member of the XSKMAP, in a similar
fashion to the arraymap. Doing so, we do less pointer chasing in the
lookup.
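
For illustration (not part of the patch itself), the C source of the
lookup stays the same, but the new layout needs one less dependent
load:

  xs = READ_ONCE(m->xsk_map[key]);
  /* old: load the m->xsk_map pointer, then load the entry          */
  /* new: xsk_map[] sits at the end of struct xsk_map, so the entry */
  /*      is loaded directly at an offset from m                    */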

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 kernel/bpf/xskmap.c | 55 +++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

Comments

Jakub Kicinski Oct. 28, 2019, 5:55 p.m. UTC | #1
On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> Prior to this commit, the XDP socket instances were stored in a
> separately allocated array referenced from the XSKMAP. Now, we store
> the sockets as a flexible array member of the XSKMAP, in a similar
> fashion to the arraymap. Doing so, we do less pointer chasing in the
> lookup.
> 
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Thanks for the re-spin.

> diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> index 82a1ffe15dfa..a83e92fe2971 100644
> --- a/kernel/bpf/xskmap.c
> +++ b/kernel/bpf/xskmap.c

> @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
>  	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
>  		return ERR_PTR(-EINVAL);
>  
> -	m = kzalloc(sizeof(*m), GFP_USER);
> -	if (!m)
> +	numa_node = bpf_map_attr_numa_node(attr);
> +	size = struct_size(m, xsk_map, attr->max_entries);
> +	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());

Now we didn't use array_size() previously because the sum here may
overflow.

We could use __ab_c_size() here, the name is probably too ugly to use
directly and IDK what we'd have to name such an accumulation helper...

So maybe just make cost and size a u64 and we should be in the clear.
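
To spell the concern out (rough sketch, not a proposed change):
struct_size() and array_size() each saturate to SIZE_MAX on overflow,
but the plain addition between them can still wrap, in particular with
a 32-bit size_t:

  size = struct_size(m, xsk_map, attr->max_entries);   /* may be near SIZE_MAX */
  cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
  /* if the addition wraps, bpf_map_charge_init() is asked to charge
   * a tiny, bogus cost instead of failing
   */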

> +	err = bpf_map_charge_init(&mem, cost);
> +	if (err < 0)
> +		return ERR_PTR(err);
> +
> +	m = bpf_map_area_alloc(size, numa_node);
> +	if (!m) {
> +		bpf_map_charge_finish(&mem);
>  		return ERR_PTR(-ENOMEM);
> +	}
>  
>  	bpf_map_init_from_attr(&m->map, attr);
> +	bpf_map_charge_move(&m->map.memory, &mem);
>  	spin_lock_init(&m->lock);
>  
> -	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
> -	cost += sizeof(struct list_head) * num_possible_cpus();
> -
> -	/* Notice returns -EPERM on if map size is larger than memlock limit */
> -	err = bpf_map_charge_init(&m->map.memory, cost);
> -	if (err)
> -		goto free_m;
> -
> -	err = -ENOMEM;
> -
>  	m->flush_list = alloc_percpu(struct list_head);
> -	if (!m->flush_list)
> -		goto free_charge;
> +	if (!m->flush_list) {
> +		bpf_map_charge_finish(&m->map.memory);
> +		bpf_map_area_free(m);
> +		return ERR_PTR(-ENOMEM);
> +	}
>  
>  	for_each_possible_cpu(cpu)
>  		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
>  
> -	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
> -					sizeof(struct xdp_sock *),
> -					m->map.numa_node);
> -	if (!m->xsk_map)
> -		goto free_percpu;
>  	return &m->map;
> -
> -free_percpu:
> -	free_percpu(m->flush_list);
> -free_charge:
> -	bpf_map_charge_finish(&m->map.memory);
> -free_m:
> -	kfree(m);
> -	return ERR_PTR(err);
>  }
>  
>  static void xsk_map_free(struct bpf_map *map)
Björn Töpel Oct. 28, 2019, 10:11 p.m. UTC | #2
On Mon, 28 Oct 2019 at 18:55, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:
> > From: Björn Töpel <bjorn.topel@intel.com>
> >
> > Prior to this commit, the XDP socket instances were stored in a
> > separately allocated array referenced from the XSKMAP. Now, we store
> > the sockets as a flexible array member of the XSKMAP, in a similar
> > fashion to the arraymap. Doing so, we do less pointer chasing in the
> > lookup.
> >
> > Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>
> Thanks for the re-spin.
>
> > diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> > index 82a1ffe15dfa..a83e92fe2971 100644
> > --- a/kernel/bpf/xskmap.c
> > +++ b/kernel/bpf/xskmap.c
>
> > @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
> >           attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
> >               return ERR_PTR(-EINVAL);
> >
> > -     m = kzalloc(sizeof(*m), GFP_USER);
> > -     if (!m)
> > +     numa_node = bpf_map_attr_numa_node(attr);
> > +     size = struct_size(m, xsk_map, attr->max_entries);
> > +     cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
>
> Now we didn't use array_size() previously because the sum here may
> overflow.
>
> We could use __ab_c_size() here, the name is probably too ugly to use
> directly and IDK what we'd have to name such an accumulation helper...
>
> So maybe just make cost and size a u64 and we should be in the clear.
>

Hmm, but both:
  int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
  void *bpf_map_area_alloc(size_t size, int numa_node);
pass size as size_t, so casting to u64 doesn't really help on 32-bit
systems, right?

Wdyt about simply adding:
  if (cost < size)
    return ERR_PTR(-EINVAL)
after the cost calculation for explicit overflow checking?

So, if size's struct_size overflows, the allocation will fail.
And we'll catch the cost overflow with the if-statement, no?
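
In context, that would look roughly like (sketch only):

  size = struct_size(m, xsk_map, attr->max_entries);
  cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
  if (cost < size)        /* the size_t addition wrapped */
          return ERR_PTR(-EINVAL);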

Another option is changing the size_t in bpf_map_... to u64. Maybe
that's better, since arraymap and devmap use u64 for cost/size.


Björn

> > +     err = bpf_map_charge_init(&mem, cost);
> > +     if (err < 0)
> > +             return ERR_PTR(err);
> > +
> > +     m = bpf_map_area_alloc(size, numa_node);
> > +     if (!m) {
> > +             bpf_map_charge_finish(&mem);
> >               return ERR_PTR(-ENOMEM);
> > +     }
> >
> >       bpf_map_init_from_attr(&m->map, attr);
> > +     bpf_map_charge_move(&m->map.memory, &mem);
> >       spin_lock_init(&m->lock);
> >
> > -     cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
> > -     cost += sizeof(struct list_head) * num_possible_cpus();
> > -
> > -     /* Notice returns -EPERM on if map size is larger than memlock limit */
> > -     err = bpf_map_charge_init(&m->map.memory, cost);
> > -     if (err)
> > -             goto free_m;
> > -
> > -     err = -ENOMEM;
> > -
> >       m->flush_list = alloc_percpu(struct list_head);
> > -     if (!m->flush_list)
> > -             goto free_charge;
> > +     if (!m->flush_list) {
> > +             bpf_map_charge_finish(&m->map.memory);
> > +             bpf_map_area_free(m);
> > +             return ERR_PTR(-ENOMEM);
> > +     }
> >
> >       for_each_possible_cpu(cpu)
> >               INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
> >
> > -     m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
> > -                                     sizeof(struct xdp_sock *),
> > -                                     m->map.numa_node);
> > -     if (!m->xsk_map)
> > -             goto free_percpu;
> >       return &m->map;
> > -
> > -free_percpu:
> > -     free_percpu(m->flush_list);
> > -free_charge:
> > -     bpf_map_charge_finish(&m->map.memory);
> > -free_m:
> > -     kfree(m);
> > -     return ERR_PTR(err);
> >  }
> >
> >  static void xsk_map_free(struct bpf_map *map)
Jakub Kicinski Oct. 28, 2019, 10:26 p.m. UTC | #3
On Mon, 28 Oct 2019 23:11:50 +0100, Björn Töpel wrote:
> On Mon, 28 Oct 2019 at 18:55, Jakub Kicinski
> <jakub.kicinski@netronome.com> wrote:
> >
> > On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:  
> > > From: Björn Töpel <bjorn.topel@intel.com>
> > >
> > > Prior to this commit, the XDP socket instances were stored in a
> > > separately allocated array referenced from the XSKMAP. Now, we store
> > > the sockets as a flexible array member of the XSKMAP, in a similar
> > > fashion to the arraymap. Doing so, we do less pointer chasing in the
> > > lookup.
> > >
> > > Signed-off-by: Björn Töpel <bjorn.topel@intel.com>  
> >
> > Thanks for the re-spin.
> >  
> > > diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> > > index 82a1ffe15dfa..a83e92fe2971 100644
> > > --- a/kernel/bpf/xskmap.c
> > > +++ b/kernel/bpf/xskmap.c  
> >  
> > > @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
> > >           attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
> > >               return ERR_PTR(-EINVAL);
> > >
> > > -     m = kzalloc(sizeof(*m), GFP_USER);
> > > -     if (!m)
> > > +     numa_node = bpf_map_attr_numa_node(attr);
> > > +     size = struct_size(m, xsk_map, attr->max_entries);
> > > +     cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());  
> >
> > Now we didn't use array_size() previously because the sum here may
> > overflow.
> >
> > We could use __ab_c_size() here, the name is probably too ugly to use
> > directly and IDK what we'd have to name such an accumulation helper...
> >
> > So maybe just make cost and size a u64 and we should be in the clear.
> >  
> 
> Hmm, but both:
>   int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
>   void *bpf_map_area_alloc(size_t size, int numa_node);
> pass size as size_t, so casting to u64 doesn't really help on 32-bit
> systems, right?

Yup :( IOW looks like the overflows will not be caught on 32bit
machines in all existing code that does the (u64) cast. I hope 
I'm wrong there.

> Wdyt about simply adding:
>   if (cost < size)
>     return ERR_PTR(-EINVAL)
> after the cost calculation for explicit overflow checking?

We'd need that for all users of these helpers. Could it perhaps make
sense to pass "alloc_size" and "extra_cost" as separate size_t to
bpf_map_charge_init() and then we can do the overflow checking there,
centrally?

We can probably get rid of all the u64 casting too at that point,
and use standard overflow helpers, yuppie :)
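
Something like this, perhaps (signature is hypothetical, just to show
the idea):

  int bpf_map_charge_init(struct bpf_map_memory *mem, size_t alloc_size,
                          size_t extra_cost)
  {
          size_t cost;

          if (check_add_overflow(alloc_size, extra_cost, &cost))
                  return -E2BIG;
          /* ... existing memlock accounting on 'cost' ... */
  }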

> So, if size's struct_size overflows, the allocation will fail.
> And we'll catch the cost overflow with the if-statement, no?
> 
> Another option is changing the size_t in bpf_map_... to u64. Maybe
> > that's better, since arraymap and devmap use u64 for cost/size.
Björn Töpel Oct. 29, 2019, 6:20 a.m. UTC | #4
On Mon, 28 Oct 2019 at 23:26, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Mon, 28 Oct 2019 23:11:50 +0100, Björn Töpel wrote:
> > On Mon, 28 Oct 2019 at 18:55, Jakub Kicinski
> > <jakub.kicinski@netronome.com> wrote:
> > >
> > > On Fri, 25 Oct 2019 09:18:40 +0200, Björn Töpel wrote:
> > > > From: Björn Töpel <bjorn.topel@intel.com>
> > > >
> > > > Prior to this commit, the XDP socket instances were stored in a
> > > > separately allocated array referenced from the XSKMAP. Now, we store
> > > > the sockets as a flexible array member of the XSKMAP, in a similar
> > > > fashion to the arraymap. Doing so, we do less pointer chasing in the
> > > > lookup.
> > > >
> > > > Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> > >
> > > Thanks for the re-spin.
> > >
> > > > diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
> > > > index 82a1ffe15dfa..a83e92fe2971 100644
> > > > --- a/kernel/bpf/xskmap.c
> > > > +++ b/kernel/bpf/xskmap.c
> > >
> > > > @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
> > > >           attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
> > > >               return ERR_PTR(-EINVAL);
> > > >
> > > > -     m = kzalloc(sizeof(*m), GFP_USER);
> > > > -     if (!m)
> > > > +     numa_node = bpf_map_attr_numa_node(attr);
> > > > +     size = struct_size(m, xsk_map, attr->max_entries);
> > > > +     cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
> > >
> > > Now we didn't use array_size() previously because the sum here may
> > > overflow.
> > >
> > > We could use __ab_c_size() here, the name is probably too ugly to use
> > > directly and IDK what we'd have to name such an accumulation helper...
> > >
> > > So maybe just make cost and size a u64 and we should be in the clear.
> > >
> >
> > Hmm, but both:
> >   int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
> >   void *bpf_map_area_alloc(size_t size, int numa_node);
> > pass size as size_t, so casting to u64 doesn't really help on 32-bit
> > systems, right?
>
> Yup :( IOW looks like the overflows will not be caught on 32bit
> machines in all existing code that does the (u64) cast. I hope
> I'm wrong there.
>
> > Wdyt about simply adding:
> >   if (cost < size)
> >     return ERR_PTR(-EINVAL)
> > after the cost calculation for explicit overflow checking?
>
> We'd need that for all users of these helpers. Could it perhaps make
> sense to pass "alloc_size" and "extra_cost" as separate size_t to
> bpf_map_charge_init() and then we can do the overflow checking there,
> centrally?
>

The cost/size calculations seem to vary a bit from map to map, so I
don't know about the extra size_t arguments... but all of them do use
u64 for cost with explicit casting, rather than u32 overflow checks.
Changing bpf_map_charge_init()/bpf_map_area_alloc() size to u64 would
be the smallest change, together with a 64-to-32 overflow check in
those functions.
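
Roughly (sketch, not a tested patch):

  void *bpf_map_area_alloc(u64 size, int numa_node)
  {
          /* reject sizes that cannot be represented as size_t */
          if (size >= SIZE_MAX)
                  return NULL;
          /* ... existing kmalloc/__vmalloc fallback using (size_t)size ... */
  }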

> We can probably get rid of all the u64 casting too at that point,
> and use standard overflow helpers, yuppie :)
>

Yeah, that's the other path, but more churn (check_add_overflow() in every map).

Preferred path?

> > So, if size's struct_size overflows, the allocation will fail.
> > And we'll catch the cost overflow with the if-statement, no?
> >
> > Another option is changing the size_t in bpf_map_... to u64. Maybe
> > that's better, since arraymap and devmap use u64 for cost/size.
Alexei Starovoitov Oct. 29, 2019, 1:44 p.m. UTC | #5
On Mon, Oct 28, 2019 at 11:21 PM Björn Töpel <bjorn.topel@gmail.com> wrote:
> Changing bpf_map_charge_init()/bpf_map_area_alloc() size to u64 would
> be the smallest change, together with a 64-to-32 overflow check in
> those functions.

+1. bpf_map_charge_init() already has such a check.
Changing them to u64 is probably a good idea.

Patch

diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 82a1ffe15dfa..a83e92fe2971 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -11,9 +11,9 @@ 
 
 struct xsk_map {
 	struct bpf_map map;
-	struct xdp_sock **xsk_map;
 	struct list_head __percpu *flush_list;
 	spinlock_t lock; /* Synchronize map updates */
+	struct xdp_sock *xsk_map[];
 };
 
 int xsk_map_inc(struct xsk_map *map)
@@ -80,9 +80,10 @@  static void xsk_map_sock_delete(struct xdp_sock *xs,
 
 static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 {
+	struct bpf_map_memory mem;
+	int cpu, err, numa_node;
 	struct xsk_map *m;
-	int cpu, err;
-	u64 cost;
+	size_t cost, size;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -92,44 +93,35 @@  static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
 		return ERR_PTR(-EINVAL);
 
-	m = kzalloc(sizeof(*m), GFP_USER);
-	if (!m)
+	numa_node = bpf_map_attr_numa_node(attr);
+	size = struct_size(m, xsk_map, attr->max_entries);
+	cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
+
+	err = bpf_map_charge_init(&mem, cost);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	m = bpf_map_area_alloc(size, numa_node);
+	if (!m) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	bpf_map_init_from_attr(&m->map, attr);
+	bpf_map_charge_move(&m->map.memory, &mem);
 	spin_lock_init(&m->lock);
 
-	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
-	cost += sizeof(struct list_head) * num_possible_cpus();
-
-	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_charge_init(&m->map.memory, cost);
-	if (err)
-		goto free_m;
-
-	err = -ENOMEM;
-
 	m->flush_list = alloc_percpu(struct list_head);
-	if (!m->flush_list)
-		goto free_charge;
+	if (!m->flush_list) {
+		bpf_map_charge_finish(&m->map.memory);
+		bpf_map_area_free(m);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	for_each_possible_cpu(cpu)
 		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
 
-	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
-					sizeof(struct xdp_sock *),
-					m->map.numa_node);
-	if (!m->xsk_map)
-		goto free_percpu;
 	return &m->map;
-
-free_percpu:
-	free_percpu(m->flush_list);
-free_charge:
-	bpf_map_charge_finish(&m->map.memory);
-free_m:
-	kfree(m);
-	return ERR_PTR(err);
 }
 
 static void xsk_map_free(struct bpf_map *map)
@@ -139,8 +131,7 @@  static void xsk_map_free(struct bpf_map *map)
 	bpf_clear_redirect_map(map);
 	synchronize_net();
 	free_percpu(m->flush_list);
-	bpf_map_area_free(m->xsk_map);
-	kfree(m);
+	bpf_map_area_free(m);
 }
 
 static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)