
[net-next,v3,1/3] xdp: Refactor devmap code in preparation for subsequent additions

Message ID: 155144955040.28287.1075106871059918653.stgit@alrua-x1
State: Changes Requested
Delegated to: BPF Maintainers
Series: xdp: Use a default map for xdp_redirect helper

Commit Message

Toke Høiland-Jørgensen March 1, 2019, 2:12 p.m. UTC
The subsequent commits introducing default maps and a hash-based ifindex
devmap require a bit of refactoring of the devmap code. Perform this first
so the subsequent commits become easier to read.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 kernel/bpf/devmap.c |  177 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 109 insertions(+), 68 deletions(-)

Comments

Jakub Kicinski March 2, 2019, 1:08 a.m. UTC | #1
On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> The subsequent commits introducing default maps and a hash-based ifindex
> devmap require a bit of refactoring of the devmap code. Perform this first
> so the subsequent commits become easier to read.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>

> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 191b79948424..1037fc08c504 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -75,6 +75,7 @@ struct bpf_dtab {
>  	struct bpf_dtab_netdev **netdev_map;
>  	unsigned long __percpu *flush_needed;
>  	struct list_head list;
> +	struct rcu_head rcu;

I think the RCU change may warrant a separate commit or at least a
mention in the commit message ;)

>  };
>  
>  static DEFINE_SPINLOCK(dev_map_lock);
> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>  	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>  }
>  
> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> +			    bool check_memlock)
>  {
> -	struct bpf_dtab *dtab;
> -	int err = -EINVAL;
>  	u64 cost;
> -
> -	if (!capable(CAP_NET_ADMIN))
> -		return ERR_PTR(-EPERM);
> -
> -	/* check sanity of attributes */
> -	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> -		return ERR_PTR(-EINVAL);

perhaps consider moving these to a map_alloc_check callback?

> -	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> -	if (!dtab)
> -		return ERR_PTR(-ENOMEM);
> +	int err;
>  
>  	bpf_map_init_from_attr(&dtab->map, attr);
>  
> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>  	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>  	if (cost >= U32_MAX - PAGE_SIZE)
> -		goto free_dtab;
> +		return -EINVAL;
>  
>  	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>  
> -	/* if map size is larger than memlock limit, reject it early */
> -	err = bpf_map_precharge_memlock(dtab->map.pages);
> -	if (err)
> -		goto free_dtab;
> -
> -	err = -ENOMEM;
> +	if (check_memlock) {
> +		/* if map size is larger than memlock limit, reject it early */
> +		err = bpf_map_precharge_memlock(dtab->map.pages);
> +		if (err)
> +			return -EINVAL;
> +	}
>  
>  	/* A per cpu bitfield with a bit per possible net device */
>  	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>  						__alignof__(unsigned long),
>  						GFP_KERNEL | __GFP_NOWARN);
>  	if (!dtab->flush_needed)
> -		goto free_dtab;
> +		goto err_alloc;
>  
>  	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>  					      sizeof(struct bpf_dtab_netdev *),
>  					      dtab->map.numa_node);
>  	if (!dtab->netdev_map)
> -		goto free_dtab;
> +		goto err_map;
>  
> -	spin_lock(&dev_map_lock);
> -	list_add_tail_rcu(&dtab->list, &dev_map_list);
> -	spin_unlock(&dev_map_lock);
> +	return 0;
>  
> -	return &dtab->map;
> -free_dtab:
> + err_map:

The label should name the first action.  So it was correct, you're
making it less good :)  Also why the space?

>  	free_percpu(dtab->flush_needed);
> -	kfree(dtab);
> -	return ERR_PTR(err);
> + err_alloc:

and no need for this one

> +	return -ENOMEM;
>  }
>  
> -static void dev_map_free(struct bpf_map *map)
> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  {
> -	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> -	int i, cpu;
> +	struct bpf_dtab *dtab;
> +	int err;
>  
> -	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> -	 * so the programs (can be more than one that used this map) were
> -	 * disconnected from events. Wait for outstanding critical sections in
> -	 * these programs to complete. The rcu critical section only guarantees
> -	 * no further reads against netdev_map. It does __not__ ensure pending
> -	 * flush operations (if any) are complete.
> -	 */
> +	if (!capable(CAP_NET_ADMIN))
> +		return ERR_PTR(-EPERM);
> +
> +	/* check sanity of attributes */
> +	if (attr->max_entries == 0 || attr->key_size != 4 ||
> +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> +		return ERR_PTR(-EINVAL);
> +
> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> +	if (!dtab)
> +		return ERR_PTR(-ENOMEM);
> +
> +	err = dev_map_init_map(dtab, attr, true);
> +	if (err) {
> +		kfree(dtab);
> +		return ERR_PTR(err);
> +	}
>  
>  	spin_lock(&dev_map_lock);
> -	list_del_rcu(&dtab->list);
> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>  	spin_unlock(&dev_map_lock);
>  
> -	bpf_clear_redirect_map(map);
> -	synchronize_rcu();
> +	return &dtab->map;
> +}
> +
> +static void __dev_map_free(struct rcu_head *rcu)
> +{
> +	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
> +	int i, cpu;
>  
>  	/* To ensure all pending flush operations have completed wait for flush
>  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)

There is a cond_resched() here, I don't think you can call
cond_resched() from an RCU callback.

>  	kfree(dtab);
>  }
>  
> +static void dev_map_free(struct bpf_map *map)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +
> +	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> +	 * so the programs (can be more than one that used this map) were
> +	 * disconnected from events. Wait for outstanding critical sections in
> +	 * these programs to complete. The rcu critical section only guarantees
> +	 * no further reads against netdev_map. It does __not__ ensure pending
> +	 * flush operations (if any) are complete.
> +	 */
> +
> +	spin_lock(&dev_map_lock);
> +	list_del_rcu(&dtab->list);
> +	spin_unlock(&dev_map_lock);
> +
> +	bpf_clear_redirect_map(map);
> +	call_rcu(&dtab->rcu, __dev_map_free);
> +}
> +
>  static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
Toke Høiland-Jørgensen March 4, 2019, 12:47 p.m. UTC | #2
Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
>> The subsequent commits introducing default maps and a hash-based ifindex
>> devmap require a bit of refactoring of the devmap code. Perform this first
>> so the subsequent commits become easier to read.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>
>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> index 191b79948424..1037fc08c504 100644
>> --- a/kernel/bpf/devmap.c
>> +++ b/kernel/bpf/devmap.c
>> @@ -75,6 +75,7 @@ struct bpf_dtab {
>>  	struct bpf_dtab_netdev **netdev_map;
>>  	unsigned long __percpu *flush_needed;
>>  	struct list_head list;
>> +	struct rcu_head rcu;
>
> I think the RCU change may warrant a separate commit or at least a
> mention in the commit message ;)

Heh, fair point.

>>  };
>>  
>>  static DEFINE_SPINLOCK(dev_map_lock);
>> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>>  	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>>  }
>>  
>> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>> +			    bool check_memlock)
>>  {
>> -	struct bpf_dtab *dtab;
>> -	int err = -EINVAL;
>>  	u64 cost;
>> -
>> -	if (!capable(CAP_NET_ADMIN))
>> -		return ERR_PTR(-EPERM);
>> -
>> -	/* check sanity of attributes */
>> -	if (attr->max_entries == 0 || attr->key_size != 4 ||
>> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>> -		return ERR_PTR(-EINVAL);
>
> perhaps consider moving these to a map_alloc_check callback?

The reason I kept them here is that these checks are only needed
strictly when the parameters come from userspace. In the other
allocation paths, the attrs are hard-coded to valid values, so the only
thing having the check would do is guard against future changes to one
part of the file being out of sync with the other part. I can certainly
add the check, it just seemed a bit excessive...
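
Purely for illustration, a minimal sketch of what such a map_alloc_check
callback could look like if the checks were moved there (the function name
and the .map_alloc_check hookup are my assumptions, not something in this
patch):

static int dev_map_alloc_check(union bpf_attr *attr)
{
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* same attribute sanity checks as before, now run once in the
	 * generic map-creation path before ->map_alloc()
	 */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
		return -EINVAL;

	return 0;
}

/* wired up through the ops table, e.g.:
 *
 *	const struct bpf_map_ops dev_map_ops = {
 *		.map_alloc_check = dev_map_alloc_check,
 *		.map_alloc	 = dev_map_alloc,
 *		...
 *	};
 */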

>> -	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>> -	if (!dtab)
>> -		return ERR_PTR(-ENOMEM);
>> +	int err;
>>  
>>  	bpf_map_init_from_attr(&dtab->map, attr);
>>  
>> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>>  	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>>  	if (cost >= U32_MAX - PAGE_SIZE)
>> -		goto free_dtab;
>> +		return -EINVAL;
>>  
>>  	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>>  
>> -	/* if map size is larger than memlock limit, reject it early */
>> -	err = bpf_map_precharge_memlock(dtab->map.pages);
>> -	if (err)
>> -		goto free_dtab;
>> -
>> -	err = -ENOMEM;
>> +	if (check_memlock) {
>> +		/* if map size is larger than memlock limit, reject it early */
>> +		err = bpf_map_precharge_memlock(dtab->map.pages);
>> +		if (err)
>> +			return -EINVAL;
>> +	}
>>  
>>  	/* A per cpu bitfield with a bit per possible net device */
>>  	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>>  						__alignof__(unsigned long),
>>  						GFP_KERNEL | __GFP_NOWARN);
>>  	if (!dtab->flush_needed)
>> -		goto free_dtab;
>> +		goto err_alloc;
>>  
>>  	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>>  					      sizeof(struct bpf_dtab_netdev *),
>>  					      dtab->map.numa_node);
>>  	if (!dtab->netdev_map)
>> -		goto free_dtab;
>> +		goto err_map;
>>  
>> -	spin_lock(&dev_map_lock);
>> -	list_add_tail_rcu(&dtab->list, &dev_map_list);
>> -	spin_unlock(&dev_map_lock);
>> +	return 0;
>>  
>> -	return &dtab->map;
>> -free_dtab:
>> + err_map:
>
> The label should name the first action.  So it was correct, you're
> making it less good :)

Heh, yeah, guess you're right.

> Also why the space?

Because the mighty GNU ordained it (i.e., Emacs added those). Will fix.

>>  	free_percpu(dtab->flush_needed);
>> -	kfree(dtab);
>> -	return ERR_PTR(err);
>> + err_alloc:
>
> and no need for this one

Right, guess I can just rely on the NULL check in the free functions;
would make things simpler in patch 3 as well :)
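
Something like this, maybe (sketch only, relying on free_percpu() and
bpf_map_area_free()/kvfree() being no-ops for NULL):

	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
						__alignof__(unsigned long),
						GFP_KERNEL | __GFP_NOWARN);

	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
					      sizeof(struct bpf_dtab_netdev *),
					      dtab->map.numa_node);
	if (!dtab->flush_needed || !dtab->netdev_map)
		goto free_all;

	return 0;

free_all:
	/* both free helpers tolerate NULL, so a single label suffices */
	free_percpu(dtab->flush_needed);
	bpf_map_area_free(dtab->netdev_map);
	return -ENOMEM;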

>> +	return -ENOMEM;
>>  }
>>  
>> -static void dev_map_free(struct bpf_map *map)
>> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>  {
>> -	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> -	int i, cpu;
>> +	struct bpf_dtab *dtab;
>> +	int err;
>>  
>> -	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
>> -	 * so the programs (can be more than one that used this map) were
>> -	 * disconnected from events. Wait for outstanding critical sections in
>> -	 * these programs to complete. The rcu critical section only guarantees
>> -	 * no further reads against netdev_map. It does __not__ ensure pending
>> -	 * flush operations (if any) are complete.
>> -	 */
>> +	if (!capable(CAP_NET_ADMIN))
>> +		return ERR_PTR(-EPERM);
>> +
>> +	/* check sanity of attributes */
>> +	if (attr->max_entries == 0 || attr->key_size != 4 ||
>> +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>> +	if (!dtab)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	err = dev_map_init_map(dtab, attr, true);
>> +	if (err) {
>> +		kfree(dtab);
>> +		return ERR_PTR(err);
>> +	}
>>  
>>  	spin_lock(&dev_map_lock);
>> -	list_del_rcu(&dtab->list);
>> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>>  	spin_unlock(&dev_map_lock);
>>  
>> -	bpf_clear_redirect_map(map);
>> -	synchronize_rcu();
>> +	return &dtab->map;
>> +}
>> +
>> +static void __dev_map_free(struct rcu_head *rcu)
>> +{
>> +	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
>> +	int i, cpu;
>>  
>>  	/* To ensure all pending flush operations have completed wait for flush
>>  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
>> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)
>
> There is a cond_resched() here, I don't think you can call
> cond_resched() from an RCU callback.

Hmm, bugger. I thought I could use call_rcu() as semantically equivalent
(if slightly slower) to calling synchronize_rcu() inside the function.

In an earlier version I had a namespace enter/exit notifier in devmap.c
as well, to react to new namespaces. And that notifier has a comment
about avoiding calls to synchronize_rcu(). But since this version
doesn't actually need that, maybe I can just keep using direct calls and
synchronize_rcu() and avoid the callback? I'm a bit worried about adding
both synchronize_rcu() and cond_resched() as a possible side effect of
every call to bpf_prog_put(), though; so maybe it's better to move the
cleanup somewhere it's actually safe to call cond_resched(); what would
that be, a workqueue?

-Toke
Jakub Kicinski March 4, 2019, 5:08 p.m. UTC | #3
On Mon, 04 Mar 2019 13:47:47 +0100, Toke Høiland-Jørgensen wrote:
> In an earlier version I had a namespace enter/exit notifier in devmap.c
> as well, to react to new namespaces. And that notifier has a comment
> about avoiding calls to synchronize_rcu(). But since this version
> doesn't actually need that, maybe I can just keep using direct calls and
> synchronize_rcu() and avoid the callback? I'm a bit worried about adding
> both synchronize_rcu() and cond_resched() as a possible side effect of
> every call to bpf_prog_put(), though; so maybe it's better to move the
> cleanup somewhere it's actually safe to call cond_resched(); what would
> that be, a workqueue?

Workqueue would be my go to.
Toke Høiland-Jørgensen March 4, 2019, 5:37 p.m. UTC | #4
Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Mon, 04 Mar 2019 13:47:47 +0100, Toke Høiland-Jørgensen wrote:
>> In an earlier version I had a namespace enter/exit notifier in devmap.c
>> as well, to react to new namespaces. And that notifier has a comment
>> about avoiding calls to synchronize_rcu(). But since this version
>> doesn't actually need that, maybe I can just keep using direct calls and
>> synchronize_rcu() and avoid the callback? I'm a bit worried about adding
>> both synchronize_rcu() and cond_resched() as a possible side effect of
>> every call to bpf_prog_put(), though; so maybe it's better to move the
>> cleanup somewhere it's actually safe to call cond_resched(); what would
>> that be, a workqueue?
>
> Workqueue would be my go to.

Gotcha! Thanks :)
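
Something like this, perhaps (rough sketch only; the field and function
names are placeholders I'm making up here, and the rest of the free logic
stays as in the current patch):

struct bpf_dtab {
	...
	struct work_struct free_work;	/* instead of the rcu_head */
};

static void __dev_map_free_work(struct work_struct *work)
{
	struct bpf_dtab *dtab = container_of(work, struct bpf_dtab, free_work);

	/* process context, so sleeping is fine: synchronize_rcu(), the
	 * flush-bit wait loop and cond_resched() can all stay as they are
	 */
	synchronize_rcu();

	/* ... wait for flush bits, free_percpu(), bpf_map_area_free() ... */

	kfree(dtab);
}

static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	bpf_clear_redirect_map(map);

	INIT_WORK(&dtab->free_work, __dev_map_free_work);
	schedule_work(&dtab->free_work);
}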

-Toke

Patch

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..1037fc08c504 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -75,6 +75,7 @@  struct bpf_dtab {
 	struct bpf_dtab_netdev **netdev_map;
 	unsigned long __percpu *flush_needed;
 	struct list_head list;
+	struct rcu_head rcu;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
@@ -85,23 +86,11 @@  static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
+			    bool check_memlock)
 {
-	struct bpf_dtab *dtab;
-	int err = -EINVAL;
 	u64 cost;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	dtab = kzalloc(sizeof(*dtab), GFP_USER);
-	if (!dtab)
-		return ERR_PTR(-ENOMEM);
+	int err;
 
 	bpf_map_init_from_attr(&dtab->map, attr);
 
@@ -109,60 +98,72 @@  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_dtab;
+		return -EINVAL;
 
 	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.pages);
-	if (err)
-		goto free_dtab;
-
-	err = -ENOMEM;
+	if (check_memlock) {
+		/* if map size is larger than memlock limit, reject it early */
+		err = bpf_map_precharge_memlock(dtab->map.pages);
+		if (err)
+			return -EINVAL;
+	}
 
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
 						__alignof__(unsigned long),
 						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
-		goto free_dtab;
+		goto err_alloc;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
-		goto free_dtab;
+		goto err_map;
 
-	spin_lock(&dev_map_lock);
-	list_add_tail_rcu(&dtab->list, &dev_map_list);
-	spin_unlock(&dev_map_lock);
+	return 0;
 
-	return &dtab->map;
-free_dtab:
+ err_map:
 	free_percpu(dtab->flush_needed);
-	kfree(dtab);
-	return ERR_PTR(err);
+ err_alloc:
+	return -ENOMEM;
 }
 
-static void dev_map_free(struct bpf_map *map)
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	int i, cpu;
+	struct bpf_dtab *dtab;
+	int err;
 
-	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete. The rcu critical section only guarantees
-	 * no further reads against netdev_map. It does __not__ ensure pending
-	 * flush operations (if any) are complete.
-	 */
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	dtab = kzalloc(sizeof(*dtab), GFP_USER);
+	if (!dtab)
+		return ERR_PTR(-ENOMEM);
+
+	err = dev_map_init_map(dtab, attr, true);
+	if (err) {
+		kfree(dtab);
+		return ERR_PTR(err);
+	}
 
 	spin_lock(&dev_map_lock);
-	list_del_rcu(&dtab->list);
+	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
-	bpf_clear_redirect_map(map);
-	synchronize_rcu();
+	return &dtab->map;
+}
+
+static void __dev_map_free(struct rcu_head *rcu)
+{
+	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
+	int i, cpu;
 
 	/* To ensure all pending flush operations have completed wait for flush
 	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
@@ -192,6 +193,26 @@  static void dev_map_free(struct bpf_map *map)
 	kfree(dtab);
 }
 
+static void dev_map_free(struct bpf_map *map)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further reads against netdev_map. It does __not__ ensure pending
+	 * flush operations (if any) are complete.
+	 */
+
+	spin_lock(&dev_map_lock);
+	list_del_rcu(&dtab->list);
+	spin_unlock(&dev_map_lock);
+
+	bpf_clear_redirect_map(map);
+	call_rcu(&dtab->rcu, __dev_map_free);
+}
+
 static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -429,12 +450,42 @@  static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+						    struct bpf_dtab *dtab,
+						    u32 ifindex,
+						    unsigned int bit)
 {
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct net *net = current->nsproxy->net_ns;
 	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+	struct bpf_dtab_netdev *dev;
+
+	dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+					sizeof(void *), gfp);
+	if (!dev->bulkq) {
+		kfree(dev);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dev->dev = dev_get_by_index(net, ifindex);
+	if (!dev->dev) {
+		free_percpu(dev->bulkq);
+		kfree(dev);
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev->bit = bit;
+	dev->dtab = dtab;
+
+	return dev;
+}
+
+static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
+				 void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -449,26 +500,9 @@  static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
-		if (!dev)
-			return -ENOMEM;
-
-		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
-						sizeof(void *), gfp);
-		if (!dev->bulkq) {
-			kfree(dev);
-			return -ENOMEM;
-		}
-
-		dev->dev = dev_get_by_index(net, ifindex);
-		if (!dev->dev) {
-			free_percpu(dev->bulkq);
-			kfree(dev);
-			return -EINVAL;
-		}
-
-		dev->bit = i;
-		dev->dtab = dtab;
+		dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
 	}
 
 	/* Use call_rcu() here to ensure rcu critical sections have completed
@@ -482,6 +516,13 @@  static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	return __dev_map_update_elem(current->nsproxy->net_ns,
+				     map, key, value, map_flags);
+}
+
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,