diff mbox

[net-next,11/12] net: add notifier hooks for devmap bpf map

Message ID 20170717163002.24315.38734.stgit@john-Precision-Tower-5810
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

John Fastabend July 17, 2017, 4:30 p.m. UTC
The BPF map devmap holds a refcnt on the net_device structure when
it is in the map. We need to do this to ensure on driver unload we
don't lose a dev reference.

However, its not very convenient to have to manually unload the map
when destroying a net device so add notifier handlers to do the cleanup
automatically. But this creates a race between update/destroy BPF
syscall and programs and the unregister netdev hook.

Unfortunately, the best I could come up with is either to live with
requiring manual removal of net devices from the map before removing
the net device OR to add a mutex in devmap to ensure the map is not
modified while we are removing a device. The fallout also requires
that BPF programs no longer update/delete the map from the BPF program
side because the mutex may sleep and this can not be done from inside
an rcu critical section.  This is not a real problem though because I
have not come up with any use cases where this is actually useful in
practice. If/when we come up with a compelling user for this we may
need to revisit this.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/filter.h |    2 +
 kernel/bpf/devmap.c    |   73 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c  |    2 +
 3 files changed, 75 insertions(+), 2 deletions(-)

Comments

Levin, Alexander (Sasha Levin) July 30, 2017, 1:28 p.m. UTC | #1
On Mon, Jul 17, 2017 at 09:30:02AM -0700, John Fastabend wrote:
>@@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
> 	 * Remembering the driver side flush operation will happen before the
> 	 * net device is removed.
> 	 */
>+	mutex_lock(&dev_map_list_mutex);
> 	old_dev = xchg(&dtab->netdev_map[i], dev);
> 	if (old_dev)
> 		call_rcu(&old_dev->rcu, __dev_map_entry_free);
>+	mutex_unlock(&dev_map_list_mutex);
>
> 	return 0;
> }

This function gets called under rcu critical section, where we can't grab mutexes:

BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747
in_atomic(): 1, irqs_disabled(): 0, pid: 16315, name: syz-executor1
1 lock held by syz-executor1/16315:
 #0:  (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] map_delete_elem kernel/bpf/syscall.c:577 [inline]
 #0:  (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline]
 #0:  (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SyS_bpf+0x1d32/0x4ba0 kernel/bpf/syscall.c:1388
Preemption disabled at:
[<ffffffff8c363bd1>] map_delete_elem kernel/bpf/syscall.c:582 [inline]
[<ffffffff8c363bd1>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline]
[<ffffffff8c363bd1>] SyS_bpf+0x1d41/0x4ba0 kernel/bpf/syscall.c:1388
CPU: 2 PID: 16315 Comm: syz-executor1 Not tainted 4.13.0-rc2-next-20170727 #235
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x11d/0x1e5 lib/dump_stack.c:52
 ___might_sleep+0x3cc/0x520 kernel/sched/core.c:6001
 __might_sleep+0x95/0x190 kernel/sched/core.c:5954
 __mutex_lock_common kernel/locking/mutex.c:747 [inline]
 __mutex_lock+0x146/0x19b0 kernel/locking/mutex.c:893
 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
 dev_map_delete_elem+0x82/0x110 kernel/bpf/devmap.c:325
 map_delete_elem kernel/bpf/syscall.c:585 [inline]
 SYSC_bpf kernel/bpf/syscall.c:1427 [inline]
 SyS_bpf+0x1deb/0x4ba0 kernel/bpf/syscall.c:1388
 do_syscall_64+0x26a/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x452309
RSP: 002b:00007f8d83d66c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000452309
RDX: 0000000000000010 RSI: 0000000020007000 RDI: 0000000000000003
RBP: 0000000000000270 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b85e4
R13: 00000000ffffffff R14: 0000000000000003 R15: 0000000020007000
Daniel Borkmann July 31, 2017, 8:55 a.m. UTC | #2
On 07/30/2017 03:28 PM, Levin, Alexander (Sasha Levin) wrote:
> On Mon, Jul 17, 2017 at 09:30:02AM -0700, John Fastabend wrote:
>> @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>> 	 * Remembering the driver side flush operation will happen before the
>> 	 * net device is removed.
>> 	 */
>> +	mutex_lock(&dev_map_list_mutex);
>> 	old_dev = xchg(&dtab->netdev_map[i], dev);
>> 	if (old_dev)
>> 		call_rcu(&old_dev->rcu, __dev_map_entry_free);
>> +	mutex_unlock(&dev_map_list_mutex);
>>
>> 	return 0;
>> }
>
> This function gets called under rcu critical section, where we can't grab mutexes:

Agree, same goes for the delete callback that mutex is not allowed
in this context. If I recall, this was for the devmap netdev notifier
in order to check whether we need to purge dev entries from the map,
so that the device can be unregistered gracefully. Given that devmap
ops like update/delete are only allowed from user space, we could
look into whether this map type actually needs to hold RCU at all
here, or other option is to try and get rid of the mutex altogether.
John, could you take a look for a fix?

Thanks a lot,
Daniel

> BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747
> in_atomic(): 1, irqs_disabled(): 0, pid: 16315, name: syz-executor1
> 1 lock held by syz-executor1/16315:
>   #0:  (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] map_delete_elem kernel/bpf/syscall.c:577 [inline]
>   #0:  (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline]
>   #0:  (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SyS_bpf+0x1d32/0x4ba0 kernel/bpf/syscall.c:1388
> Preemption disabled at:
> [<ffffffff8c363bd1>] map_delete_elem kernel/bpf/syscall.c:582 [inline]
> [<ffffffff8c363bd1>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline]
> [<ffffffff8c363bd1>] SyS_bpf+0x1d41/0x4ba0 kernel/bpf/syscall.c:1388
> CPU: 2 PID: 16315 Comm: syz-executor1 Not tainted 4.13.0-rc2-next-20170727 #235
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014
> Call Trace:
>   __dump_stack lib/dump_stack.c:16 [inline]
>   dump_stack+0x11d/0x1e5 lib/dump_stack.c:52
>   ___might_sleep+0x3cc/0x520 kernel/sched/core.c:6001
>   __might_sleep+0x95/0x190 kernel/sched/core.c:5954
>   __mutex_lock_common kernel/locking/mutex.c:747 [inline]
>   __mutex_lock+0x146/0x19b0 kernel/locking/mutex.c:893
>   mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
>   dev_map_delete_elem+0x82/0x110 kernel/bpf/devmap.c:325
>   map_delete_elem kernel/bpf/syscall.c:585 [inline]
>   SYSC_bpf kernel/bpf/syscall.c:1427 [inline]
>   SyS_bpf+0x1deb/0x4ba0 kernel/bpf/syscall.c:1388
>   do_syscall_64+0x26a/0x800 arch/x86/entry/common.c:287
>   entry_SYSCALL64_slow_path+0x25/0x25
> RIP: 0033:0x452309
> RSP: 002b:00007f8d83d66c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000141
> RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000452309
> RDX: 0000000000000010 RSI: 0000000020007000 RDI: 0000000000000003
> RBP: 0000000000000270 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b85e4
> R13: 00000000ffffffff R14: 0000000000000003 R15: 0000000020007000
>
John Fastabend July 31, 2017, 2:47 p.m. UTC | #3
On 07/31/2017 01:55 AM, Daniel Borkmann wrote:
> On 07/30/2017 03:28 PM, Levin, Alexander (Sasha Levin) wrote:
>> On Mon, Jul 17, 2017 at 09:30:02AM -0700, John Fastabend wrote:
>>> @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>>>      * Remembering the driver side flush operation will happen before the
>>>      * net device is removed.
>>>      */
>>> +    mutex_lock(&dev_map_list_mutex);
>>>     old_dev = xchg(&dtab->netdev_map[i], dev);
>>>     if (old_dev)
>>>         call_rcu(&old_dev->rcu, __dev_map_entry_free);
>>> +    mutex_unlock(&dev_map_list_mutex);
>>>
>>>     return 0;
>>> }
>>
>> This function gets called under rcu critical section, where we can't grab mutexes:
> 
> Agree, same goes for the delete callback that mutex is not allowed
> in this context. If I recall, this was for the devmap netdev notifier
> in order to check whether we need to purge dev entries from the map,
> so that the device can be unregistered gracefully. Given that devmap
> ops like update/delete are only allowed from user space, we could
> look into whether this map type actually needs to hold RCU at all
> here, or other option is to try and get rid of the mutex altogether.
> John, could you take a look for a fix?
> 
> Thanks a lot,
> Daniel
> 

I'll work up a fix today/tomorrow. Thanks.
diff mbox

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3323ee9..d19ed3c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -716,7 +716,7 @@  struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
  * same cpu context. Further for best results no more than a single map
  * for the do_redirect/do_flush pair should be used. This limitation is
  * because we only track one map and force a flush when the map changes.
- * This does not appear to be a real limiation for existing software.
+ * This does not appear to be a real limitation for existing software.
  */
 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb);
 int xdp_do_redirect(struct net_device *dev,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index b2ef04a..899364d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -34,6 +34,17 @@ 
  * netdev_map consistent in this case. From the devmap side BPF programs
  * calling into these operations are the same as multiple user space threads
  * making system calls.
+ *
+ * Finally, any of the above may race with a netdev_unregister notifier. The
+ * unregister notifier must search for net devices in the map structure that
+ * contain a reference to the net device and remove them. This is a two step
+ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
+ * check to see if the ifindex is the same as the net_device being removed.
+ * Unfortunately, the xchg() operations do not protect against this. To avoid
+ * potentially removing incorrect objects the dev_map_list_mutex protects
+ * conflicting netdev unregister and BPF syscall operations. Updates and
+ * deletes from a BPF program (done in rcu critical section) are blocked
+ * because of this mutex.
  */
 #include <linux/bpf.h>
 #include <linux/jhash.h>
@@ -54,8 +65,12 @@  struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
 	unsigned long int __percpu *flush_needed;
+	struct list_head list;
 };
 
+static DEFINE_MUTEX(dev_map_list_mutex);
+static LIST_HEAD(dev_map_list);
+
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_dtab *dtab;
@@ -112,6 +127,9 @@  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (!dtab->netdev_map)
 		goto free_dtab;
 
+	mutex_lock(&dev_map_list_mutex);
+	list_add_tail(&dtab->list, &dev_map_list);
+	mutex_unlock(&dev_map_list_mutex);
 	return &dtab->map;
 
 free_dtab:
@@ -146,6 +164,11 @@  static void dev_map_free(struct bpf_map *map)
 			cpu_relax();
 	}
 
+	/* Although we should no longer have datapath or bpf syscall operations
+	 * at this point we we can still race with netdev notifier, hence the
+	 * lock.
+	 */
+	mutex_lock(&dev_map_list_mutex);
 	for (i = 0; i < dtab->map.max_entries; i++) {
 		struct bpf_dtab_netdev *dev;
 
@@ -160,6 +183,8 @@  static void dev_map_free(struct bpf_map *map)
 	/* At this point bpf program is detached and all pending operations
 	 * _must_ be complete
 	 */
+	list_del(&dtab->list);
+	mutex_unlock(&dev_map_list_mutex);
 	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
@@ -296,9 +321,11 @@  static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	 * the driver tear down ensures all soft irqs are complete before
 	 * removing the net device in the case of dev_put equals zero.
 	 */
+	mutex_lock(&dev_map_list_mutex);
 	old_dev = xchg(&dtab->netdev_map[k], NULL);
 	if (old_dev)
 		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+	mutex_unlock(&dev_map_list_mutex);
 	return 0;
 }
 
@@ -341,9 +368,11 @@  static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	 * Remembering the driver side flush operation will happen before the
 	 * net device is removed.
 	 */
+	mutex_lock(&dev_map_list_mutex);
 	old_dev = xchg(&dtab->netdev_map[i], dev);
 	if (old_dev)
 		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+	mutex_unlock(&dev_map_list_mutex);
 
 	return 0;
 }
@@ -356,3 +385,47 @@  static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	.map_update_elem = dev_map_update_elem,
 	.map_delete_elem = dev_map_delete_elem,
 };
+
+static int dev_map_notification(struct notifier_block *notifier,
+				ulong event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct bpf_dtab *dtab;
+	int i;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		mutex_lock(&dev_map_list_mutex);
+		list_for_each_entry(dtab, &dev_map_list, list) {
+			for (i = 0; i < dtab->map.max_entries; i++) {
+				struct bpf_dtab_netdev *dev;
+
+				dev = dtab->netdev_map[i];
+				if (!dev ||
+				    dev->dev->ifindex != netdev->ifindex)
+					continue;
+				dev = xchg(&dtab->netdev_map[i], NULL);
+				if (dev)
+					call_rcu(&dev->rcu,
+						 __dev_map_entry_free);
+			}
+		}
+		mutex_unlock(&dev_map_list_mutex);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dev_map_notifier = {
+	.notifier_call = dev_map_notification,
+};
+
+static int __init dev_map_init(void)
+{
+	register_netdevice_notifier(&dev_map_notifier);
+	return 0;
+}
+
+subsys_initcall(dev_map_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df05d65..ebe9b38 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1281,7 +1281,7 @@  static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 	 * for now.
 	 */
 	case BPF_MAP_TYPE_DEVMAP:
-		if (func_id == BPF_FUNC_map_lookup_elem)
+		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS: