Message ID | 20170717163002.24315.38734.stgit@john-Precision-Tower-5810 |
---|---|
State | Accepted, archived |
Delegated to: | David Miller |
Headers | show |
On Mon, Jul 17, 2017 at 09:30:02AM -0700, John Fastabend wrote: >@@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, > * Remembering the driver side flush operation will happen before the > * net device is removed. > */ >+ mutex_lock(&dev_map_list_mutex); > old_dev = xchg(&dtab->netdev_map[i], dev); > if (old_dev) > call_rcu(&old_dev->rcu, __dev_map_entry_free); >+ mutex_unlock(&dev_map_list_mutex); > > return 0; > } This function gets called under rcu critical section, where we can't grab mutexes: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747 in_atomic(): 1, irqs_disabled(): 0, pid: 16315, name: syz-executor1 1 lock held by syz-executor1/16315: #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] map_delete_elem kernel/bpf/syscall.c:577 [inline] #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SyS_bpf+0x1d32/0x4ba0 kernel/bpf/syscall.c:1388 Preemption disabled at: [<ffffffff8c363bd1>] map_delete_elem kernel/bpf/syscall.c:582 [inline] [<ffffffff8c363bd1>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] [<ffffffff8c363bd1>] SyS_bpf+0x1d41/0x4ba0 kernel/bpf/syscall.c:1388 CPU: 2 PID: 16315 Comm: syz-executor1 Not tainted 4.13.0-rc2-next-20170727 #235 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x11d/0x1e5 lib/dump_stack.c:52 ___might_sleep+0x3cc/0x520 kernel/sched/core.c:6001 __might_sleep+0x95/0x190 kernel/sched/core.c:5954 __mutex_lock_common kernel/locking/mutex.c:747 [inline] __mutex_lock+0x146/0x19b0 kernel/locking/mutex.c:893 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908 dev_map_delete_elem+0x82/0x110 kernel/bpf/devmap.c:325 map_delete_elem kernel/bpf/syscall.c:585 [inline] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] SyS_bpf+0x1deb/0x4ba0 kernel/bpf/syscall.c:1388 
do_syscall_64+0x26a/0x800 arch/x86/entry/common.c:287 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x452309 RSP: 002b:00007f8d83d66c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000452309 RDX: 0000000000000010 RSI: 0000000020007000 RDI: 0000000000000003 RBP: 0000000000000270 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b85e4 R13: 00000000ffffffff R14: 0000000000000003 R15: 0000000020007000
On 07/30/2017 03:28 PM, Levin, Alexander (Sasha Levin) wrote: > On Mon, Jul 17, 2017 at 09:30:02AM -0700, John Fastabend wrote: >> @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, >> * Remembering the driver side flush operation will happen before the >> * net device is removed. >> */ >> + mutex_lock(&dev_map_list_mutex); >> old_dev = xchg(&dtab->netdev_map[i], dev); >> if (old_dev) >> call_rcu(&old_dev->rcu, __dev_map_entry_free); >> + mutex_unlock(&dev_map_list_mutex); >> >> return 0; >> } > > This function gets called under rcu critical section, where we can't grab mutexes: Agree, same goes for the delete callback that mutex is not allowed in this context. If I recall, this was for the devmap netdev notifier in order to check whether we need to purge dev entries from the map, so that the device can be unregistered gracefully. Given that devmap ops like update/delete are only allowed from user space, we could look into whether this map type actually needs to hold RCU at all here, or other option is to try and get rid of the mutex altogether. John, could you take a look for a fix? 
Thanks a lot, Daniel > BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747 > in_atomic(): 1, irqs_disabled(): 0, pid: 16315, name: syz-executor1 > 1 lock held by syz-executor1/16315: > #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] map_delete_elem kernel/bpf/syscall.c:577 [inline] > #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] > #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SyS_bpf+0x1d32/0x4ba0 kernel/bpf/syscall.c:1388 > Preemption disabled at: > [<ffffffff8c363bd1>] map_delete_elem kernel/bpf/syscall.c:582 [inline] > [<ffffffff8c363bd1>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] > [<ffffffff8c363bd1>] SyS_bpf+0x1d41/0x4ba0 kernel/bpf/syscall.c:1388 > CPU: 2 PID: 16315 Comm: syz-executor1 Not tainted 4.13.0-rc2-next-20170727 #235 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014 > Call Trace: > __dump_stack lib/dump_stack.c:16 [inline] > dump_stack+0x11d/0x1e5 lib/dump_stack.c:52 > ___might_sleep+0x3cc/0x520 kernel/sched/core.c:6001 > __might_sleep+0x95/0x190 kernel/sched/core.c:5954 > __mutex_lock_common kernel/locking/mutex.c:747 [inline] > __mutex_lock+0x146/0x19b0 kernel/locking/mutex.c:893 > mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908 > dev_map_delete_elem+0x82/0x110 kernel/bpf/devmap.c:325 > map_delete_elem kernel/bpf/syscall.c:585 [inline] > SYSC_bpf kernel/bpf/syscall.c:1427 [inline] > SyS_bpf+0x1deb/0x4ba0 kernel/bpf/syscall.c:1388 > do_syscall_64+0x26a/0x800 arch/x86/entry/common.c:287 > entry_SYSCALL64_slow_path+0x25/0x25 > RIP: 0033:0x452309 > RSP: 002b:00007f8d83d66c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000141 > RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000452309 > RDX: 0000000000000010 RSI: 0000000020007000 RDI: 0000000000000003 > RBP: 0000000000000270 R08: 0000000000000000 R09: 0000000000000000 > R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b85e4 > R13: 00000000ffffffff 
R14: 0000000000000003 R15: 0000000020007000 >
On 07/31/2017 01:55 AM, Daniel Borkmann wrote: > On 07/30/2017 03:28 PM, Levin, Alexander (Sasha Levin) wrote: >> On Mon, Jul 17, 2017 at 09:30:02AM -0700, John Fastabend wrote: >>> @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, >>> * Remembering the driver side flush operation will happen before the >>> * net device is removed. >>> */ >>> + mutex_lock(&dev_map_list_mutex); >>> old_dev = xchg(&dtab->netdev_map[i], dev); >>> if (old_dev) >>> call_rcu(&old_dev->rcu, __dev_map_entry_free); >>> + mutex_unlock(&dev_map_list_mutex); >>> >>> return 0; >>> } >> >> This function gets called under rcu critical section, where we can't grab mutexes: > > Agree, same goes for the delete callback that mutex is not allowed > in this context. If I recall, this was for the devmap netdev notifier > in order to check whether we need to purge dev entries from the map, > so that the device can be unregistered gracefully. Given that devmap > ops like update/delete are only allowed from user space, we could > look into whether this map type actually needs to hold RCU at all > here, or other option is to try and get rid of the mutex altogether. > John, could you take a look for a fix? > > Thanks a lot, > Daniel > I'll work up a fix today/tomorrow. Thanks.
diff --git a/include/linux/filter.h b/include/linux/filter.h index 3323ee9..d19ed3c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -716,7 +716,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. - * This does not appear to be a real limiation for existing software. + * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b2ef04a..899364d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -34,6 +34,17 @@ * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. + * + * Finally, any of the above may race with a netdev_unregister notifier. The + * unregister notifier must search for net devices in the map structure that + * contain a reference to the net device and remove them. This is a two step + * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) + * check to see if the ifindex is the same as the net_device being removed. + * Unfortunately, the xchg() operations do not protect against this. To avoid + * potentially removing incorrect objects the dev_map_list_mutex protects + * conflicting netdev unregister and BPF syscall operations. Updates and + * deletes from a BPF program (done in rcu critical section) are blocked + * because of this mutex. 
*/ #include <linux/bpf.h> #include <linux/jhash.h> @@ -54,8 +65,12 @@ struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; unsigned long int __percpu *flush_needed; + struct list_head list; }; +static DEFINE_MUTEX(dev_map_list_mutex); +static LIST_HEAD(dev_map_list); + static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; @@ -112,6 +127,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_dtab; + mutex_lock(&dev_map_list_mutex); + list_add_tail(&dtab->list, &dev_map_list); + mutex_unlock(&dev_map_list_mutex); return &dtab->map; free_dtab: @@ -146,6 +164,11 @@ static void dev_map_free(struct bpf_map *map) cpu_relax(); } + /* Although we should no longer have datapath or bpf syscall operations + * at this point we can still race with netdev notifier, hence the + * lock. + */ + mutex_lock(&dev_map_list_mutex); for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -160,6 +183,8 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ + list_del(&dtab->list); + mutex_unlock(&dev_map_list_mutex); free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); @@ -296,9 +321,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) * the driver tear down ensures all soft irqs are complete before * removing the net device in the case of dev_put equals zero. */ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, * Remembering the driver side flush operation will happen before the * net device is removed. 
*/ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[i], dev); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -356,3 +385,47 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, }; + +static int dev_map_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dtab *dtab; + int i; + + switch (event) { + case NETDEV_UNREGISTER: + mutex_lock(&dev_map_list_mutex); + list_for_each_entry(dtab, &dev_map_list, list) { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev || + dev->dev->ifindex != netdev->ifindex) + continue; + dev = xchg(&dtab->netdev_map[i], NULL); + if (dev) + call_rcu(&dev->rcu, + __dev_map_entry_free); + } + } + mutex_unlock(&dev_map_list_mutex); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block dev_map_notifier = { + .notifier_call = dev_map_notification, +}; + +static int __init dev_map_init(void) +{ + register_netdevice_notifier(&dev_map_notifier); + return 0; +} + +subsys_initcall(dev_map_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df05d65..ebe9b38 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1281,7 +1281,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * for now. */ case BPF_MAP_TYPE_DEVMAP: - if (func_id == BPF_FUNC_map_lookup_elem) + if (func_id != BPF_FUNC_redirect_map) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: